xen-4.4.0/0000775000175000017500000000000012307547100010436 5ustar smbsmbxen-4.4.0/xen/0000775000175000017500000000000012307313555011235 5ustar smbsmbxen-4.4.0/xen/Makefile0000664000175000017500000001614512307313555012704 0ustar smbsmb# This is the correct place to edit the build version. # All other places this is stored (eg. compile.h) should be autogenerated. export XEN_VERSION = 4 export XEN_SUBVERSION = 4 export XEN_EXTRAVERSION ?= .0$(XEN_VENDORVERSION) export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) -include xen-version export XEN_WHOAMI ?= $(USER) export XEN_DOMAIN ?= $(shell ([ -x /bin/dnsdomainname ] && /bin/dnsdomainname) || ([ -x /bin/domainname ] && /bin/domainname || echo [unknown])) export BASEDIR := $(CURDIR) export XEN_ROOT := $(BASEDIR)/.. EFI_MOUNTPOINT ?= /boot/efi .PHONY: default default: build .PHONY: dist dist: install .PHONY: build install uninstall clean distclean cscope TAGS tags MAP gtags build install uninstall debug clean distclean cscope TAGS tags MAP gtags:: ifneq ($(XEN_TARGET_ARCH),x86_32) $(MAKE) -f Rules.mk _$@ else echo "*** Xen x86/32 target no longer supported!" endif .PHONY: _build _build: $(TARGET)$(CONFIG_XEN_INSTALL_SUFFIX) .PHONY: _install _install: D=$(DESTDIR) _install: T=$(notdir $(TARGET)) _install: Z=$(CONFIG_XEN_INSTALL_SUFFIX) _install: $(TARGET)$(CONFIG_XEN_INSTALL_SUFFIX) [ -d $(D)/boot ] || $(INSTALL_DIR) $(D)/boot $(INSTALL_DATA) $(TARGET)$(Z) $(D)/boot/$(T)-$(XEN_FULLVERSION)$(Z) ln -f -s $(T)-$(XEN_FULLVERSION)$(Z) $(D)/boot/$(T)-$(XEN_VERSION).$(XEN_SUBVERSION)$(Z) ln -f -s $(T)-$(XEN_FULLVERSION)$(Z) $(D)/boot/$(T)-$(XEN_VERSION)$(Z) ln -f -s $(T)-$(XEN_FULLVERSION)$(Z) $(D)/boot/$(T)$(Z) $(INSTALL_DATA) $(TARGET)-syms $(D)/boot/$(T)-syms-$(XEN_FULLVERSION) if [ -r $(TARGET).efi -a -n '$(EFI_DIR)' ]; then \ [ -d $(D)$(EFI_DIR) ] || $(INSTALL_DIR) $(D)$(EFI_DIR); \ $(INSTALL_DATA) $(TARGET).efi $(D)$(EFI_DIR)/$(T)-$(XEN_FULLVERSION).efi; \ ln -sf $(T)-$(XEN_FULLVERSION).efi $(D)$(EFI_DIR)/$(T)-$(XEN_VERSION).$(XEN_SUBVERSION).efi; \ ln -sf $(T)-$(XEN_FULLVERSION).efi $(D)$(EFI_DIR)/$(T)-$(XEN_VERSION).efi; \ ln -sf $(T)-$(XEN_FULLVERSION).efi $(D)$(EFI_DIR)/$(T).efi; \ if [ -n '$(EFI_MOUNTPOINT)' -a -n '$(EFI_VENDOR)' ]; then \ $(INSTALL_DATA) $(TARGET).efi $(D)$(EFI_MOUNTPOINT)/efi/$(EFI_VENDOR)/$(T)-$(XEN_FULLVERSION).efi; \ elif [ "$(D)" = "$(patsubst $(shell cd $(XEN_ROOT) && pwd)/%,%,$(D))" ]; then \ echo 'EFI installation only partially done (EFI_VENDOR not set)' >&2; \ fi; \ fi .PHONY: _uninstall _uninstall: D=$(DESTDIR) _uninstall: T=$(notdir $(TARGET)) _uninstall: Z=$(CONFIG_XEN_INSTALL_SUFFIX) _uninstall: rm -f $(D)/boot/$(T)-$(XEN_FULLVERSION)$(Z) rm -f $(D)/boot/$(T)-$(XEN_VERSION).$(XEN_SUBVERSION)$(Z) rm -f $(D)/boot/$(T)-$(XEN_VERSION)$(Z) rm -f $(D)/boot/$(T)$(Z) rm -f $(D)/boot/$(T)-syms-$(XEN_FULLVERSION) rm -f $(D)$(EFI_DIR)/$(T)-$(XEN_FULLVERSION).efi rm -f $(D)$(EFI_DIR)/$(T)-$(XEN_VERSION).$(XEN_SUBVERSION).efi rm -f $(D)$(EFI_DIR)/$(T)-$(XEN_VERSION).efi rm -f $(D)$(EFI_DIR)/$(T).efi rm -f $(D)$(EFI_MOUNTPOINT)/efi/$(EFI_VENDOR)/$(T)-$(XEN_FULLVERSION).efi .PHONY: _debug _debug: objdump -D -S $(TARGET)-syms > $(TARGET).s .PHONY: _clean _clean: delete-unfresh-files $(MAKE) -C tools clean $(MAKE) -f $(BASEDIR)/Rules.mk -C include clean $(MAKE) -f $(BASEDIR)/Rules.mk -C common clean $(MAKE) -f $(BASEDIR)/Rules.mk -C drivers clean $(MAKE) -f $(BASEDIR)/Rules.mk -C xsm clean $(MAKE) -f $(BASEDIR)/Rules.mk -C crypto clean $(MAKE) -f $(BASEDIR)/Rules.mk -C arch/$(TARGET_ARCH) clean rm -f 
include/asm *.o $(TARGET) $(TARGET).gz $(TARGET)-syms *~ core rm -f include/asm-*/asm-offsets.h rm -f .banner .PHONY: _distclean _distclean: clean rm -f tags TAGS cscope.files cscope.in.out cscope.out cscope.po.out GTAGS GPATH GRTAGS GSYMS $(TARGET).gz: $(TARGET) gzip -f -9 < $< > $@.new mv $@.new $@ $(TARGET): delete-unfresh-files $(MAKE) -C tools $(MAKE) -f $(BASEDIR)/Rules.mk include/xen/compile.h [ -e include/asm ] || ln -sf asm-$(TARGET_ARCH) include/asm $(MAKE) -f $(BASEDIR)/Rules.mk -C include $(MAKE) -f $(BASEDIR)/Rules.mk -C arch/$(TARGET_ARCH) asm-offsets.s $(MAKE) -f $(BASEDIR)/Rules.mk include/asm-$(TARGET_ARCH)/asm-offsets.h $(MAKE) -f $(BASEDIR)/Rules.mk -C arch/$(TARGET_ARCH) $(TARGET) # drivers/char/console.o contains static banner/compile info. Blow it away. # Don't refresh these files during e.g., 'sudo make install' .PHONY: delete-unfresh-files delete-unfresh-files: @if [ ! -r include/xen/compile.h -o -O include/xen/compile.h ]; then \ rm -f include/xen/compile.h; \ fi .banner: Makefile @if which figlet >/dev/null 2>&1 ; then \ echo " Xen $(XEN_FULLVERSION)" | figlet -f tools/xen.flf > $@.tmp; \ else \ echo " Xen $(XEN_FULLVERSION)" > $@.tmp; \ fi @mv -f $@.tmp $@ # compile.h contains dynamic build info. Rebuilt on every 'make' invocation. include/xen/compile.h: include/xen/compile.h.in .banner @sed -e 's/@@date@@/$(shell LC_ALL=C date)/g' \ -e 's/@@time@@/$(shell LC_ALL=C date +%T)/g' \ -e 's/@@whoami@@/$(XEN_WHOAMI)/g' \ -e 's/@@domain@@/$(XEN_DOMAIN)/g' \ -e 's/@@hostname@@/$(shell hostname)/g' \ -e 's!@@compiler@@!$(shell $(CC) $(CFLAGS) --version 2>&1 | head -1)!g' \ -e 's/@@version@@/$(XEN_VERSION)/g' \ -e 's/@@subversion@@/$(XEN_SUBVERSION)/g' \ -e 's/@@extraversion@@/$(XEN_EXTRAVERSION)/g' \ -e 's!@@changeset@@!$(shell tools/scmversion $(XEN_ROOT) || echo "unavailable")!g' \ < include/xen/compile.h.in > $@.new @cat .banner @$(PYTHON) tools/fig-to-oct.py < .banner >> $@.new @mv -f $@.new $@ include/asm-$(TARGET_ARCH)/asm-offsets.h: arch/$(TARGET_ARCH)/asm-offsets.s @(set -e; \ echo "/*"; \ echo " * DO NOT MODIFY."; \ echo " *"; \ echo " * This file was auto-generated from $<"; \ echo " *"; \ echo " */"; \ echo ""; \ echo "#ifndef __ASM_OFFSETS_H__"; \ echo "#define __ASM_OFFSETS_H__"; \ echo ""; \ sed -ne "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}"; \ echo ""; \ echo "#endif") <$< >$@ SUBDIRS = xsm arch/$(TARGET_ARCH) common drivers define all_sources ( find include/asm-$(TARGET_ARCH) -name '*.h' -print; \ find include -name 'asm-*' -prune -o -name '*.h' -print; \ find $(SUBDIRS) -name '*.[chS]' -print ) endef define set_exuberant_flags exuberant_flags=`$1 --version 2>/dev/null | (grep -iq exuberant && \ echo "-I __initdata,__exitdata,__acquires,__releases \ -I EXPORT_SYMBOL,EXPORT_SYMBOL_GPL \ --extra=+f --c-kinds=+px") || true` endef .PHONY: xenversion xenversion: @echo $(XEN_FULLVERSION) .PHONY: _TAGS _TAGS: set -e; rm -f TAGS; \ $(call set_exuberant_flags,etags); \ $(all_sources) | xargs etags $$exuberant_flags -a .PHONY: _tags _tags: set -e; rm -f tags; \ $(call set_exuberant_flags,ctags); \ $(all_sources) | xargs ctags $$exuberant_flags -a .PHONY: _gtags _gtags: set -e; rm -f GTAGS GSYMS GPATH GRTAGS $(all_sources) | gtags -f - .PHONY: _cscope _cscope: $(all_sources) > cscope.files cscope -k -b -q .PHONY: _MAP _MAP: $(NM) -n $(TARGET)-syms | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' > System.map .PHONY: FORCE FORCE: %.o %.i %.s: %.c FORCE $(MAKE) -f $(BASEDIR)/Rules.mk -C $(*D) $(@F) 
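# Example invocations (illustrative only; they simply exercise the targets
# defined above, assuming they are run from the top of an unpacked xen-4.4.0
# source tree):
#   make -C xen xenversion               # prints $(XEN_FULLVERSION), e.g. 4.4.0
#   make -C xen build                    # builds $(TARGET)$(CONFIG_XEN_INSTALL_SUFFIX)
#   make -C xen install DESTDIR=/tmp/x   # stages xen-$(XEN_FULLVERSION)$(Z) under DESTDIR/boot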
%.o %.s: %.S FORCE $(MAKE) -f $(BASEDIR)/Rules.mk -C $(*D) $(@F) %/: FORCE $(MAKE) -f $(BASEDIR)/Rules.mk -C $* built_in.o built_in_bin.o xen-4.4.0/xen/crypto/0000775000175000017500000000000012307313555012555 5ustar smbsmbxen-4.4.0/xen/crypto/Makefile0000664000175000017500000000004412307313555014213 0ustar smbsmbobj-y += rijndael.o obj-y += vmac.o xen-4.4.0/xen/crypto/vmac.c0000664000175000017500000012022312307313555013647 0ustar smbsmb/* -------------------------------------------------------------------------- * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai. * This implementation is hereby placed in the public domain. * The authors offer no warranty. Use at your own risk. * Please send bug reports to the authors. * Last modified: 17 APR 08, 1700 PDT * ----------------------------------------------------------------------- */ /* start for Xen */ #include #include #include #include #include #define UINT64_C(x) x##ULL /* end for Xen */ /* Enable code tuned for 64-bit registers; otherwise tuned for 32-bit */ #ifndef VMAC_ARCH_64 #define VMAC_ARCH_64 (__x86_64__ || __ppc64__ || _M_X64) #endif /* Enable code tuned for Intel SSE2 instruction set */ #if ((__SSE2__ || (_M_IX86_FP >= 2)) && ( ! VMAC_ARCH_64)) #define VMAC_USE_SSE2 1 #include #endif /* Native word reads. Update (or define via compiler) if incorrect */ #ifndef VMAC_ARCH_BIG_ENDIAN /* Assume big-endian unless on the list */ #define VMAC_ARCH_BIG_ENDIAN \ (!(__x86_64__ || __i386__ || _M_IX86 || \ _M_X64 || __ARMEL__ || __MIPSEL__)) #endif /* ----------------------------------------------------------------------- */ /* Constants and masks */ const uint64_t p64 = UINT64_C(0xfffffffffffffeff); /* 2^64 - 257 prime */ const uint64_t m62 = UINT64_C(0x3fffffffffffffff); /* 62-bit mask */ const uint64_t m63 = UINT64_C(0x7fffffffffffffff); /* 63-bit mask */ const uint64_t m64 = UINT64_C(0xffffffffffffffff); /* 64-bit mask */ const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff); /* Poly key mask */ /* ----------------------------------------------------------------------- * * The following routines are used in this implementation. They are * written via macros to simulate zero-overhead call-by-reference. * All have default implementations for when they are not defined in an * architecture-specific manner.
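 *
 * Illustrative sketch of how the macros listed below compose (th, tl, rh,
 * rl, m0, k0, m1, k1 are placeholder uint64_t values; this mirrors how
 * nh_16() further down accumulates one NH term):
 *
 *     MUL64(th, tl, m0 + k0, m1 + k1);    full 128-bit product into (th:tl)
 *     ADD128(rh, rl, th, tl);             (rh:rl) += (th:tl), carry included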
* * MUL64: 64x64->128-bit multiplication * PMUL64: assumes top bits cleared on inputs * ADD128: 128x128->128-bit addition * GET_REVERSED_64: load and byte-reverse 64-bit word * ----------------------------------------------------------------------- */ /* ----------------------------------------------------------------------- */ #if (__GNUC__ && (__x86_64__ || __amd64__)) /* ----------------------------------------------------------------------- */ #define ADD128(rh,rl,ih,il) \ asm ("addq %3, %1 \n\t" \ "adcq %2, %0" \ : "+r"(rh),"+r"(rl) \ : "r"(ih),"r"(il) : "cc"); #define MUL64(rh,rl,i1,i2) \ asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "r"(i2) : "cc") #define PMUL64 MUL64 #define GET_REVERSED_64(p) \ ({uint64_t x; \ asm ("bswapq %0" : "=r" (x) : "0"(*(uint64_t *)(p))); x;}) /* ----------------------------------------------------------------------- */ #elif (__GNUC__ && __i386__) /* ----------------------------------------------------------------------- */ #define GET_REVERSED_64(p) \ ({ uint64_t x; \ uint32_t *tp = (uint32_t *)(p); \ asm ("bswap %%edx\n\t" \ "bswap %%eax" \ : "=A"(x) \ : "a"(tp[1]), "d"(tp[0])); \ x; }) /* ----------------------------------------------------------------------- */ #elif (__GNUC__ && __ppc64__) /* ----------------------------------------------------------------------- */ #define ADD128(rh,rl,ih,il) \ asm volatile ( "addc %1, %1, %3 \n\t" \ "adde %0, %0, %2" \ : "+r"(rh),"+r"(rl) \ : "r"(ih),"r"(il)); #define MUL64(rh,rl,i1,i2) \ { uint64_t _i1 = (i1), _i2 = (i2); \ rl = _i1 * _i2; \ asm volatile ("mulhdu %0, %1, %2" : "=r" (rh) : "r" (_i1), "r" (_i2));\ } #define PMUL64 MUL64 #define GET_REVERSED_64(p) \ ({ uint32_t hi, lo, *_p = (uint32_t *)(p); \ asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) ); \ asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) ); \ ((uint64_t)hi << 32) | (uint64_t)lo; } ) /* ----------------------------------------------------------------------- */ #elif (__GNUC__ && (__ppc__ || __PPC__)) /* ----------------------------------------------------------------------- */ #define GET_REVERSED_64(p) \ ({ uint32_t hi, lo, *_p = (uint32_t *)(p); \ asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) ); \ asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) ); \ ((uint64_t)hi << 32) | (uint64_t)lo; } ) /* ----------------------------------------------------------------------- */ #elif (__GNUC__ && (__ARMEL__ || __ARM__)) /* ----------------------------------------------------------------------- */ #define bswap32(v) \ ({ uint32_t tmp,out; \ asm volatile( \ "eor %1, %2, %2, ror #16\n" \ "bic %1, %1, #0x00ff0000\n" \ "mov %0, %2, ror #8\n" \ "eor %0, %0, %1, lsr #8" \ : "=r" (out), "=&r" (tmp) \ : "r" (v)); \ out;}) /* ----------------------------------------------------------------------- */ #elif _MSC_VER /* ----------------------------------------------------------------------- */ #include #if (_M_IA64 || _M_X64) && \ (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000) #define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh)); #pragma intrinsic(_umul128) #define PMUL64 MUL64 #endif /* MSVC uses add, adc in this version */ #define ADD128(rh,rl,ih,il) \ { uint64_t _il = (il); \ (rl) += (_il); \ (rh) += (ih) + ((rl) < (_il)); \ } #if _MSC_VER >= 1300 #define GET_REVERSED_64(p) _byteswap_uint64(*(uint64_t *)(p)) #pragma intrinsic(_byteswap_uint64) #endif #if _MSC_VER >= 1400 && \ (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000) #define MUL32(i1,i2) 
(__emulu((uint32_t)(i1),(uint32_t)(i2))) #pragma intrinsic(__emulu) #endif /* ----------------------------------------------------------------------- */ #endif /* ----------------------------------------------------------------------- */ #if __GNUC__ #define ALIGN(n) __attribute__ ((aligned(n))) #define NOINLINE __attribute__ ((noinline)) #define FASTCALL #elif _MSC_VER #define ALIGN(n) __declspec(align(n)) #define NOINLINE __declspec(noinline) #define FASTCALL __fastcall #else #define ALIGN(n) #define NOINLINE #define FASTCALL #endif /* ----------------------------------------------------------------------- */ /* Default implementations, if not defined above */ /* ----------------------------------------------------------------------- */ #ifndef ADD128 #define ADD128(rh,rl,ih,il) \ { uint64_t _il = (il); \ (rl) += (_il); \ if ((rl) < (_il)) (rh)++; \ (rh) += (ih); \ } #endif #ifndef MUL32 #define MUL32(i1,i2) ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2)) #endif #ifndef PMUL64 /* rh may not be same as i1 or i2 */ #define PMUL64(rh,rl,i1,i2) /* Assumes m doesn't overflow */ \ { uint64_t _i1 = (i1), _i2 = (i2); \ uint64_t m = MUL32(_i1,_i2>>32) + MUL32(_i1>>32,_i2); \ rh = MUL32(_i1>>32,_i2>>32); \ rl = MUL32(_i1,_i2); \ ADD128(rh,rl,(m >> 32),(m << 32)); \ } #endif #ifndef MUL64 #define MUL64(rh,rl,i1,i2) \ { uint64_t _i1 = (i1), _i2 = (i2); \ uint64_t m1= MUL32(_i1,_i2>>32); \ uint64_t m2= MUL32(_i1>>32,_i2); \ rh = MUL32(_i1>>32,_i2>>32); \ rl = MUL32(_i1,_i2); \ ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \ ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \ } #endif #ifndef GET_REVERSED_64 #ifndef bswap64 #ifndef bswap32 #define bswap32(x) \ ({ uint32_t bsx = (x); \ ((((bsx) & 0xff000000u) >> 24) | (((bsx) & 0x00ff0000u) >> 8) | \ (((bsx) & 0x0000ff00u) << 8) | (((bsx) & 0x000000ffu) << 24)); }) #endif #define bswap64(x) \ ({ union { uint64_t ll; uint32_t l[2]; } w, r; \ w.ll = (x); \ r.l[0] = bswap32 (w.l[1]); \ r.l[1] = bswap32 (w.l[0]); \ r.ll; }) #endif #define GET_REVERSED_64(p) bswap64(*(uint64_t *)(p)) #endif /* ----------------------------------------------------------------------- */ #if (VMAC_PREFER_BIG_ENDIAN) # define get64PE get64BE #else # define get64PE get64LE #endif #if (VMAC_ARCH_BIG_ENDIAN) # define get64BE(ptr) (*(uint64_t *)(ptr)) # define get64LE(ptr) GET_REVERSED_64(ptr) #else /* assume little-endian */ # define get64BE(ptr) GET_REVERSED_64(ptr) # define get64LE(ptr) (*(uint64_t *)(ptr)) #endif /* --------------------------------------------------------------------- * * For highest performance the L1 NH and L2 polynomial hashes should be * carefully implemented to take advantage of one's target architechture. * Here these two hash functions are defined multiple time; once for * 64-bit architectures, once for 32-bit SSE2 architectures, and once * for the rest (32-bit) architectures. * For each, nh_16 *must* be defined (works on multiples of 16 bytes). * Optionally, nh_vmac_nhbytes can be defined (for multiples of * VMAC_NHBYTES), and nh_16_2 and nh_vmac_nhbytes_2 (versions that do two * NH computations at once). 
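 *
 * Informally, restating the code below: for 64-bit message words m[] and
 * key words k[], nh_16 computes
 *     sum over even i of (m[i] + k[i]) * (m[i+1] + k[i+1])  mod 2^128,
 * where the additions wrap mod 2^64, and poly_step then folds each such
 * 128-bit block hash into a polynomial accumulator modulo 2^127 - 1
 * (reduction is only partial here; l3hash performs the final reduction).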
* --------------------------------------------------------------------- */ /* ----------------------------------------------------------------------- */ #if VMAC_ARCH_64 /* ----------------------------------------------------------------------- */ #define nh_16(mp, kp, nw, rh, rl) \ { int i; uint64_t th, tl; \ rh = rl = 0; \ for (i = 0; i < nw; i+= 2) { \ MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\ ADD128(rh,rl,th,tl); \ } \ } #define nh_16_2(mp, kp, nw, rh, rl, rh1, rl1) \ { int i; uint64_t th, tl; \ rh1 = rl1 = rh = rl = 0; \ for (i = 0; i < nw; i+= 2) { \ MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\ ADD128(rh,rl,th,tl); \ MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\ ADD128(rh1,rl1,th,tl); \ } \ } #if (VMAC_NHBYTES >= 64) /* These versions do 64-bytes of message at a time */ #define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \ { int i; uint64_t th, tl; \ rh = rl = 0; \ for (i = 0; i < nw; i+= 8) { \ MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\ ADD128(rh,rl,th,tl); \ MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\ ADD128(rh,rl,th,tl); \ MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\ ADD128(rh,rl,th,tl); \ MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\ ADD128(rh,rl,th,tl); \ } \ } #define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh1, rl1) \ { int i; uint64_t th, tl; \ rh1 = rl1 = rh = rl = 0; \ for (i = 0; i < nw; i+= 8) { \ MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\ ADD128(rh,rl,th,tl); \ MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\ ADD128(rh1,rl1,th,tl); \ MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\ ADD128(rh,rl,th,tl); \ MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+4],get64PE((mp)+i+3)+(kp)[i+5]);\ ADD128(rh1,rl1,th,tl); \ MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\ ADD128(rh,rl,th,tl); \ MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+6],get64PE((mp)+i+5)+(kp)[i+7]);\ ADD128(rh1,rl1,th,tl); \ MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\ ADD128(rh,rl,th,tl); \ MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+8],get64PE((mp)+i+7)+(kp)[i+9]);\ ADD128(rh1,rl1,th,tl); \ } \ } #endif #define poly_step(ah, al, kh, kl, mh, ml) \ { uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z=0; \ /* compute ab*cd, put bd into result registers */ \ PMUL64(t3h,t3l,al,kh); \ PMUL64(t2h,t2l,ah,kl); \ PMUL64(t1h,t1l,ah,2*kh); \ PMUL64(ah,al,al,kl); \ /* add 2 * ac to result */ \ ADD128(ah,al,t1h,t1l); \ /* add together ad + bc */ \ ADD128(t2h,t2l,t3h,t3l); \ /* now (ah,al), (t2l,2*t2h) need summing */ \ /* first add the high registers, carrying into t2h */ \ ADD128(t2h,ah,z,t2l); \ /* double t2h and add top bit of ah */ \ t2h = 2 * t2h + (ah >> 63); \ ah &= m63; \ /* now add the low registers */ \ ADD128(ah,al,mh,ml); \ ADD128(ah,al,z,t2h); \ } /* ----------------------------------------------------------------------- */ #elif VMAC_USE_SSE2 /* ----------------------------------------------------------------------- */ // macros from Crypto++ for sharing inline assembly code between MSVC and GNU C #if defined(__GNUC__) // define these in two steps to allow arguments to be expanded #define GNU_AS2(x, y) #x ", " #y ";" #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";" #define GNU_ASL(x) "\n" #x ":" #define GNU_ASJ(x, y, z) #x " " #y #z ";" #define AS2(x, y) GNU_AS2(x, y) #define AS3(x, y, z) GNU_AS3(x, y, z) #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b 
"*16+" #c "*4+" #d ";" #define ASL(x) GNU_ASL(x) #define ASJ(x, y, z) GNU_ASJ(x, y, z) #else #define AS2(x, y) __asm {x, y} #define AS3(x, y, z) __asm {x, y, z} #define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)} #define ASL(x) __asm {label##x:} #define ASJ(x, y, z) __asm {x label##y} #endif static void NOINLINE nh_16_func(const uint64_t *mp, const uint64_t *kp, size_t nw, uint64_t *rh, uint64_t *rl) { // This assembly version, using MMX registers, is just as fast as the // intrinsics version (which uses XMM registers) on the Intel Core 2, // but is much faster on the Pentium 4. In order to schedule multiplies // as early as possible, the loop interleaves operations for the current // block and the next block. To mask out high 32-bits, we use "movd" // to move the lower 32-bits to the stack and then back. Surprisingly, // this is faster than any other method. #ifdef __GNUC__ __asm__ __volatile__ ( ".intel_syntax noprefix;" #else AS2( mov esi, mp) AS2( mov edi, kp) AS2( mov ecx, nw) AS2( mov eax, rl) AS2( mov edx, rh) #endif AS2( sub esp, 12) AS2( movq mm6, [esi]) AS2( paddq mm6, [edi]) AS2( movq mm5, [esi+8]) AS2( paddq mm5, [edi+8]) AS2( add esi, 16) AS2( add edi, 16) AS2( movq mm4, mm6) ASS( pshufw mm2, mm6, 1, 0, 3, 2) AS2( pmuludq mm6, mm5) ASS( pshufw mm3, mm5, 1, 0, 3, 2) AS2( pmuludq mm5, mm2) AS2( pmuludq mm2, mm3) AS2( pmuludq mm3, mm4) AS2( pxor mm7, mm7) AS2( movd [esp], mm6) AS2( psrlq mm6, 32) AS2( movd [esp+4], mm5) AS2( psrlq mm5, 32) AS2( sub ecx, 2) ASJ( jz, 1, f) ASL(0) AS2( movq mm0, [esi]) AS2( paddq mm0, [edi]) AS2( movq mm1, [esi+8]) AS2( paddq mm1, [edi+8]) AS2( add esi, 16) AS2( add edi, 16) AS2( movq mm4, mm0) AS2( paddq mm5, mm2) ASS( pshufw mm2, mm0, 1, 0, 3, 2) AS2( pmuludq mm0, mm1) AS2( movd [esp+8], mm3) AS2( psrlq mm3, 32) AS2( paddq mm5, mm3) ASS( pshufw mm3, mm1, 1, 0, 3, 2) AS2( pmuludq mm1, mm2) AS2( pmuludq mm2, mm3) AS2( pmuludq mm3, mm4) AS2( movd mm4, [esp]) AS2( paddq mm7, mm4) AS2( movd mm4, [esp+4]) AS2( paddq mm6, mm4) AS2( movd mm4, [esp+8]) AS2( paddq mm6, mm4) AS2( movd [esp], mm0) AS2( psrlq mm0, 32) AS2( paddq mm6, mm0) AS2( movd [esp+4], mm1) AS2( psrlq mm1, 32) AS2( paddq mm5, mm1) AS2( sub ecx, 2) ASJ( jnz, 0, b) ASL(1) AS2( paddq mm5, mm2) AS2( movd [esp+8], mm3) AS2( psrlq mm3, 32) AS2( paddq mm5, mm3) AS2( movd mm4, [esp]) AS2( paddq mm7, mm4) AS2( movd mm4, [esp+4]) AS2( paddq mm6, mm4) AS2( movd mm4, [esp+8]) AS2( paddq mm6, mm4) ASS( pshufw mm0, mm7, 3, 2, 1, 0) AS2( psrlq mm7, 32) AS2( paddq mm6, mm7) AS2( punpckldq mm0, mm6) AS2( psrlq mm6, 32) AS2( paddq mm5, mm6) AS2( movq [eax], mm0) AS2( movq [edx], mm5) AS2( add esp, 12) #ifdef __GNUC__ ".att_syntax prefix;" : : "S" (mp), "D" (kp), "c" (nw), "a" (rl), "d" (rh) : "memory", "cc" ); #endif } #define nh_16(mp, kp, nw, rh, rl) nh_16_func(mp, kp, nw, &(rh), &(rl)); static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh, const uint64_t *kl, const uint64_t *mh, const uint64_t *ml) { // This code tries to schedule the multiplies as early as possible to overcome // the long latencies on the Pentium 4. It also minimizes "movq" instructions // which are very expensive on the P4. 
#define a0 [eax+0] #define a1 [eax+4] #define a2 [ebx+0] #define a3 [ebx+4] #define k0 [ecx+0] #define k1 [ecx+4] #define k2 [edx+0] #define k3 [edx+4] #ifdef __GNUC__ uint32_t temp; __asm__ __volatile__ ( "mov %%ebx, %0;" "mov %1, %%ebx;" ".intel_syntax noprefix;" #else AS2( mov ebx, ahi) AS2( mov edx, kh) AS2( mov eax, alo) AS2( mov ecx, kl) AS2( mov esi, mh) AS2( mov edi, ml) #endif AS2( movd mm0, a3) AS2( movq mm4, mm0) AS2( pmuludq mm0, k3) // a3*k3 AS2( movd mm1, a0) AS2( pmuludq mm1, k2) // a0*k2 AS2( movd mm2, a1) AS2( movd mm6, k1) AS2( pmuludq mm2, mm6) // a1*k1 AS2( movd mm3, a2) AS2( movq mm5, mm3) AS2( movd mm7, k0) AS2( pmuludq mm3, mm7) // a2*k0 AS2( pmuludq mm4, mm7) // a3*k0 AS2( pmuludq mm5, mm6) // a2*k1 AS2( psllq mm0, 1) AS2( paddq mm0, [esi]) AS2( paddq mm0, mm1) AS2( movd mm1, a1) AS2( paddq mm4, mm5) AS2( movq mm5, mm1) AS2( pmuludq mm1, k2) // a1*k2 AS2( paddq mm0, mm2) AS2( movd mm2, a0) AS2( paddq mm0, mm3) AS2( movq mm3, mm2) AS2( pmuludq mm2, k3) // a0*k3 AS2( pmuludq mm3, mm7) // a0*k0 AS2( movd esi, mm0) AS2( psrlq mm0, 32) AS2( pmuludq mm7, mm5) // a1*k0 AS2( pmuludq mm5, k3) // a1*k3 AS2( paddq mm0, mm1) AS2( movd mm1, a2) AS2( pmuludq mm1, k2) // a2*k2 AS2( paddq mm0, mm2) AS2( paddq mm0, mm4) AS2( movq mm4, mm0) AS2( movd mm2, a3) AS2( pmuludq mm2, mm6) // a3*k1 AS2( pmuludq mm6, a0) // a0*k1 AS2( psrlq mm0, 31) AS2( paddq mm0, mm3) AS2( movd mm3, [edi]) AS2( paddq mm0, mm3) AS2( movd mm3, a2) AS2( pmuludq mm3, k3) // a2*k3 AS2( paddq mm5, mm1) AS2( movd mm1, a3) AS2( pmuludq mm1, k2) // a3*k2 AS2( paddq mm5, mm2) AS2( movd mm2, [edi+4]) AS2( psllq mm5, 1) AS2( paddq mm0, mm5) AS2( movq mm5, mm0) AS2( psllq mm4, 33) AS2( psrlq mm0, 32) AS2( paddq mm6, mm7) AS2( movd mm7, esi) AS2( paddq mm0, mm6) AS2( paddq mm0, mm2) AS2( paddq mm3, mm1) AS2( psllq mm3, 1) AS2( paddq mm0, mm3) AS2( psrlq mm4, 1) AS2( punpckldq mm5, mm0) AS2( psrlq mm0, 32) AS2( por mm4, mm7) AS2( paddq mm0, mm4) AS2( movq a0, mm5) AS2( movq a2, mm0) #ifdef __GNUC__ ".att_syntax prefix;" "mov %0, %%ebx;" : "=m" (temp) : "m" (ahi), "D" (ml), "d" (kh), "a" (alo), "S" (mh), "c" (kl) : "memory", "cc" ); #endif #undef a0 #undef a1 #undef a2 #undef a3 #undef k0 #undef k1 #undef k2 #undef k3 } #define poly_step(ah, al, kh, kl, mh, ml) \ poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml)) /* ----------------------------------------------------------------------- */ #else /* not VMAC_ARCH_64 and not SSE2 */ /* ----------------------------------------------------------------------- */ #ifndef nh_16 #define nh_16(mp, kp, nw, rh, rl) \ { uint64_t t1,t2,m1,m2,t; \ int i; \ rh = rl = t = 0; \ for (i = 0; i < nw; i+=2) { \ t1 = get64PE(mp+i) + kp[i]; \ t2 = get64PE(mp+i+1) + kp[i+1]; \ m2 = MUL32(t1 >> 32, t2); \ m1 = MUL32(t1, t2 >> 32); \ ADD128(rh,rl,MUL32(t1 >> 32,t2 >> 32),MUL32(t1,t2)); \ rh += (uint64_t)(uint32_t)(m1 >> 32) + (uint32_t)(m2 >> 32); \ t += (uint64_t)(uint32_t)m1 + (uint32_t)m2; \ } \ ADD128(rh,rl,(t >> 32),(t << 32)); \ } #endif static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh, const uint64_t *kl, const uint64_t *mh, const uint64_t *ml) { #if VMAC_ARCH_BIG_ENDIAN #define INDEX_HIGH 0 #define INDEX_LOW 1 #else #define INDEX_HIGH 1 #define INDEX_LOW 0 #endif #define a0 *(((uint32_t*)alo)+INDEX_LOW) #define a1 *(((uint32_t*)alo)+INDEX_HIGH) #define a2 *(((uint32_t*)ahi)+INDEX_LOW) #define a3 *(((uint32_t*)ahi)+INDEX_HIGH) #define k0 *(((uint32_t*)kl)+INDEX_LOW) #define k1 *(((uint32_t*)kl)+INDEX_HIGH) #define k2 *(((uint32_t*)kh)+INDEX_LOW) #define k3 
*(((uint32_t*)kh)+INDEX_HIGH) uint64_t p, q, t; uint32_t t2; p = MUL32(a3, k3); p += p; p += *(uint64_t *)mh; p += MUL32(a0, k2); p += MUL32(a1, k1); p += MUL32(a2, k0); t = (uint32_t)(p); p >>= 32; p += MUL32(a0, k3); p += MUL32(a1, k2); p += MUL32(a2, k1); p += MUL32(a3, k0); t |= ((uint64_t)((uint32_t)p & 0x7fffffff)) << 32; p >>= 31; p += (uint64_t)(((uint32_t*)ml)[INDEX_LOW]); p += MUL32(a0, k0); q = MUL32(a1, k3); q += MUL32(a2, k2); q += MUL32(a3, k1); q += q; p += q; t2 = (uint32_t)(p); p >>= 32; p += (uint64_t)(((uint32_t*)ml)[INDEX_HIGH]); p += MUL32(a0, k1); p += MUL32(a1, k0); q = MUL32(a2, k3); q += MUL32(a3, k2); q += q; p += q; *(uint64_t *)(alo) = (p << 32) | t2; p >>= 32; *(uint64_t *)(ahi) = p + t; #undef a0 #undef a1 #undef a2 #undef a3 #undef k0 #undef k1 #undef k2 #undef k3 } #define poly_step(ah, al, kh, kl, mh, ml) \ poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml)) /* ----------------------------------------------------------------------- */ #endif /* end of specialized NH and poly definitions */ /* ----------------------------------------------------------------------- */ /* At least nh_16 is defined. Defined others as needed here */ #ifndef nh_16_2 #define nh_16_2(mp, kp, nw, rh, rl, rh2, rl2) \ nh_16(mp, kp, nw, rh, rl); \ nh_16(mp, ((kp)+2), nw, rh2, rl2); #endif #ifndef nh_vmac_nhbytes #define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \ nh_16(mp, kp, nw, rh, rl) #endif #ifndef nh_vmac_nhbytes_2 #define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh2, rl2) \ nh_vmac_nhbytes(mp, kp, nw, rh, rl); \ nh_vmac_nhbytes(mp, ((kp)+2), nw, rh2, rl2); #endif /* ----------------------------------------------------------------------- */ static void vhash_abort(vmac_ctx_t *ctx) { ctx->polytmp[0] = ctx->polykey[0] ; ctx->polytmp[1] = ctx->polykey[1] ; #if (VMAC_TAG_LEN == 128) ctx->polytmp[2] = ctx->polykey[2] ; ctx->polytmp[3] = ctx->polykey[3] ; #endif ctx->first_block_processed = 0; } /* ----------------------------------------------------------------------- */ static uint64_t l3hash(uint64_t p1, uint64_t p2, uint64_t k1, uint64_t k2, uint64_t len) { uint64_t rh, rl, t, z=0; /* fully reduce (p1,p2)+(len,0) mod p127 */ t = p1 >> 63; p1 &= m63; ADD128(p1, p2, len, t); /* At this point, (p1,p2) is at most 2^127+(len<<64) */ t = (p1 > m63) + ((p1 == m63) && (p2 == m64)); ADD128(p1, p2, z, t); p1 &= m63; /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */ t = p1 + (p2 >> 32); t += (t >> 32); t += (uint32_t)t > 0xfffffffeu; p1 += (t >> 32); p2 += (p1 << 32); /* compute (p1+k1)%p64 and (p2+k2)%p64 */ p1 += k1; p1 += (0 - (p1 < k1)) & 257; p2 += k2; p2 += (0 - (p2 < k2)) & 257; /* compute (p1+k1)*(p2+k2)%p64 */ MUL64(rh, rl, p1, p2); t = rh >> 56; ADD128(t, rl, z, rh); rh <<= 8; ADD128(t, rl, z, rh); t += t << 8; rl += t; rl += (0 - (rl < t)) & 257; rl += (0 - (rl > p64-1)) & 257; return rl; } /* ----------------------------------------------------------------------- */ void vhash_update(unsigned char *m, unsigned int mbytes, /* Pos multiple of VMAC_NHBYTES */ vmac_ctx_t *ctx) { uint64_t rh, rl, *mptr; const uint64_t *kptr = (uint64_t *)ctx->nhkey; int i; uint64_t ch, cl; uint64_t pkh = ctx->polykey[0]; uint64_t pkl = ctx->polykey[1]; #if (VMAC_TAG_LEN == 128) uint64_t ch2, cl2, rh2, rl2; uint64_t pkh2 = ctx->polykey[2]; uint64_t pkl2 = ctx->polykey[3]; #endif mptr = (uint64_t *)m; i = mbytes / VMAC_NHBYTES; /* Must be non-zero */ ch = ctx->polytmp[0]; cl = ctx->polytmp[1]; #if (VMAC_TAG_LEN == 128) ch2 = ctx->polytmp[2]; cl2 = ctx->polytmp[3]; #endif if ( ! 
ctx->first_block_processed) { ctx->first_block_processed = 1; #if (VMAC_TAG_LEN == 64) nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl); #else nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2); rh2 &= m62; ADD128(ch2,cl2,rh2,rl2); #endif rh &= m62; ADD128(ch,cl,rh,rl); mptr += (VMAC_NHBYTES/sizeof(uint64_t)); i--; } while (i--) { #if (VMAC_TAG_LEN == 64) nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl); #else nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2); rh2 &= m62; poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2); #endif rh &= m62; poly_step(ch,cl,pkh,pkl,rh,rl); mptr += (VMAC_NHBYTES/sizeof(uint64_t)); } ctx->polytmp[0] = ch; ctx->polytmp[1] = cl; #if (VMAC_TAG_LEN == 128) ctx->polytmp[2] = ch2; ctx->polytmp[3] = cl2; #endif #if VMAC_USE_SSE2 _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */ #endif } /* ----------------------------------------------------------------------- */ uint64_t vhash(unsigned char m[], unsigned int mbytes, uint64_t *tagl, vmac_ctx_t *ctx) { uint64_t rh, rl, *mptr; const uint64_t *kptr = (uint64_t *)ctx->nhkey; int i, remaining; uint64_t ch, cl; uint64_t pkh = ctx->polykey[0]; uint64_t pkl = ctx->polykey[1]; #if (VMAC_TAG_LEN == 128) uint64_t ch2, cl2, rh2, rl2; uint64_t pkh2 = ctx->polykey[2]; uint64_t pkl2 = ctx->polykey[3]; #endif mptr = (uint64_t *)m; i = mbytes / VMAC_NHBYTES; remaining = mbytes % VMAC_NHBYTES; if (ctx->first_block_processed) { ch = ctx->polytmp[0]; cl = ctx->polytmp[1]; #if (VMAC_TAG_LEN == 128) ch2 = ctx->polytmp[2]; cl2 = ctx->polytmp[3]; #endif } else if (i) { #if (VMAC_TAG_LEN == 64) nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl); #else nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,ch,cl,ch2,cl2); ch2 &= m62; ADD128(ch2,cl2,pkh2,pkl2); #endif ch &= m62; ADD128(ch,cl,pkh,pkl); mptr += (VMAC_NHBYTES/sizeof(uint64_t)); i--; } else if (remaining) { #if (VMAC_TAG_LEN == 64) nh_16(mptr,kptr,2*((remaining+15)/16),ch,cl); #else nh_16_2(mptr,kptr,2*((remaining+15)/16),ch,cl,ch2,cl2); ch2 &= m62; ADD128(ch2,cl2,pkh2,pkl2); #endif ch &= m62; ADD128(ch,cl,pkh,pkl); mptr += (VMAC_NHBYTES/sizeof(uint64_t)); goto do_l3; } else /* Empty String */ { ch = pkh; cl = pkl; #if (VMAC_TAG_LEN == 128) ch2 = pkh2; cl2 = pkl2; #endif goto do_l3; } while (i--) { #if (VMAC_TAG_LEN == 64) nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl); #else nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2); rh2 &= m62; poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2); #endif rh &= m62; poly_step(ch,cl,pkh,pkl,rh,rl); mptr += (VMAC_NHBYTES/sizeof(uint64_t)); } if (remaining) { #if (VMAC_TAG_LEN == 64) nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl); #else nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2); rh2 &= m62; poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2); #endif rh &= m62; poly_step(ch,cl,pkh,pkl,rh,rl); } do_l3: #if VMAC_USE_SSE2 _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */ #endif vhash_abort(ctx); remaining *= 8; #if (VMAC_TAG_LEN == 128) *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining); #endif return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining); } /* ----------------------------------------------------------------------- */ uint64_t vmac(unsigned char m[], unsigned int mbytes, unsigned char n[16], uint64_t *tagl, vmac_ctx_t *ctx) { #if (VMAC_TAG_LEN == 64) uint64_t *in_n, *out_p; uint64_t p, h; int i; #if VMAC_CACHE_NONCES in_n = ctx->cached_nonce; out_p = ctx->cached_aes; #else uint64_t tmp[2]; in_n = out_p = tmp; #endif i = n[15] & 1; #if VMAC_CACHE_NONCES if ((*(uint64_t *)(n+8) != in_n[1]) || (*(uint64_t *)(n 
) != in_n[0])) { #endif in_n[0] = *(uint64_t *)(n ); in_n[1] = *(uint64_t *)(n+8); ((unsigned char *)in_n)[15] &= 0xFE; aes_encryption(in_n, out_p, &ctx->cipher_key); #if VMAC_CACHE_NONCES ((unsigned char *)in_n)[15] |= (unsigned char)(1-i); } #endif p = get64BE(out_p + i); h = vhash(m, mbytes, (uint64_t *)0, ctx); return p + h; #else uint64_t tmp[2]; uint64_t th,tl; aes_encryption(n, (unsigned char *)tmp, &ctx->cipher_key); th = vhash(m, mbytes, &tl, ctx); th += get64BE(tmp); *tagl = tl + get64BE(tmp+1); return th; #endif } /* ----------------------------------------------------------------------- */ void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx) { uint64_t in[2] = {0}, out[2]; unsigned i; aes_key_setup(user_key, &ctx->cipher_key); /* Fill nh key */ ((unsigned char *)in)[0] = 0x80; for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) { aes_encryption((unsigned char *)in, (unsigned char *)out, &ctx->cipher_key); ctx->nhkey[i ] = get64BE(out); ctx->nhkey[i+1] = get64BE(out+1); ((unsigned char *)in)[15] += 1; } /* Fill poly key */ ((unsigned char *)in)[0] = 0xC0; in[1] = 0; for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) { aes_encryption((unsigned char *)in, (unsigned char *)out, &ctx->cipher_key); ctx->polytmp[i ] = ctx->polykey[i ] = get64BE(out) & mpoly; ctx->polytmp[i+1] = ctx->polykey[i+1] = get64BE(out+1) & mpoly; ((unsigned char *)in)[15] += 1; } /* Fill ip key */ ((unsigned char *)in)[0] = 0xE0; in[1] = 0; for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) { do { aes_encryption((unsigned char *)in, (unsigned char *)out, &ctx->cipher_key); ctx->l3key[i ] = get64BE(out); ctx->l3key[i+1] = get64BE(out+1); ((unsigned char *)in)[15] += 1; } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64); } /* Invalidate nonce/aes cache and reset other elements */ #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES) ctx->cached_nonce[0] = (uint64_t)-1; /* Ensure illegal nonce */ ctx->cached_nonce[1] = (uint64_t)0; /* Ensure illegal nonce */ #endif ctx->first_block_processed = 0; } /* ----------------------------------------------------------------------- */ #if VMAC_RUN_TESTS #include #include #include #include unsigned prime(void) /* Wake variable speed cpu, get rough speed estimate */ { volatile uint64_t i; volatile uint64_t j=1; unsigned cnt=0; volatile clock_t ticks = clock(); do { for (i = 0; i < 500000; i++) { uint64_t x = get64PE(&j); j = x * x + (uint64_t)ticks; } cnt++; } while (clock() - ticks < (CLOCKS_PER_SEC/2)); return cnt; /* cnt is millions of iterations per second */ } int main(void) { ALIGN(16) vmac_ctx_t ctx, ctx_aio, ctx_inc1, ctx_inc2; uint64_t res, tagl; void *p; unsigned char *m; ALIGN(4) unsigned char key[] = "abcdefghijklmnop"; ALIGN(4) unsigned char nonce[] = "\0\0\0\0\0\0\0\0bcdefghi"; unsigned int vector_lengths[] = {0,3,48,300,3000000}; #if (VMAC_TAG_LEN == 64) ALIGN(4) char *should_be[] = {"2576BE1C56D8B81B","2D376CF5B1813CE5", "E8421F61D573D298","4492DF6C5CAC1BBE", "09BA597DD7601113"}; #else ALIGN(4) char *should_be[] = {"472766C70F74ED23481D6D7DE4E80DAC", "4EE815A06A1D71EDD36FC75D51188A42", "09F2C80C8E1007A0C12FAE19FE4504AE", "66438817154850C61D8A412164803BCB", "2B6B02288FFC461B75485DE893C629DC"}; #endif unsigned speed_lengths[] = {16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; unsigned i, j, *speed_iters; clock_t ticks; double cpb; const unsigned int buf_len = 3 * (1 << 20); j = prime(); i = sizeof(speed_lengths)/sizeof(speed_lengths[0]); speed_iters = (unsigned *)malloc(i*sizeof(speed_iters[0])); speed_iters[i-1] = j * (1 << 12); while (--i) speed_iters[i-1] = (unsigned)(1.3 
* speed_iters[i]); /* Initialize context and message buffer, all 16-byte aligned */ p = malloc(buf_len + 32); m = (unsigned char *)(((size_t)p + 16) & ~((size_t)15)); memset(m, 0, buf_len + 16); vmac_set_key(key, &ctx); /* Test incremental and all-in-one interfaces for correctness */ vmac_set_key(key, &ctx_aio); vmac_set_key(key, &ctx_inc1); vmac_set_key(key, &ctx_inc2); /* for (i = 0; i <= 512; i++) { vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1); tagh = vmac(m+(i/VMAC_NHBYTES)*VMAC_NHBYTES, i%VMAC_NHBYTES, nonce, &tagl, &ctx); vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1); for (j = 0; j < vector_lengths[i]; j++) m[j] = (unsigned char)('a'+j%3); } */ /* Generate vectors */ for (i = 0; i < sizeof(vector_lengths)/sizeof(unsigned int); i++) { for (j = 0; j < vector_lengths[i]; j++) m[j] = (unsigned char)('a'+j%3); res = vmac(m, vector_lengths[i], nonce, &tagl, &ctx); #if (VMAC_TAG_LEN == 64) printf("\'abc\' * %7u: %016llX Should be: %s\n", vector_lengths[i]/3,res,should_be[i]); #else printf("\'abc\' * %7u: %016llX%016llX\nShould be : %s\n", vector_lengths[i]/3,res,tagl,should_be[i]); #endif } /* Speed test */ for (i = 0; i < sizeof(speed_lengths)/sizeof(unsigned int); i++) { ticks = clock(); for (j = 0; j < speed_iters[i]; j++) { #if HASH_ONLY res = vhash(m, speed_lengths[i], &tagl, &ctx); #else res = vmac(m, speed_lengths[i], nonce, &tagl, &ctx); nonce[7]++; #endif } ticks = clock() - ticks; cpb = ((ticks*VMAC_HZ)/ ((double)CLOCKS_PER_SEC*speed_lengths[i]*speed_iters[i])); printf("%4u bytes, %2.2f cpb\n", speed_lengths[i], cpb); } return 1; } #endif xen-4.4.0/xen/crypto/rijndael.c0000664000175000017500000016373212307313555014525 0ustar smbsmb/* $OpenBSD: rijndael.c,v 1.19 2008/06/09 07:49:45 djm Exp $ */ /** * rijndael-alg-fst.c * * @version 3.0 (December 2000) * * Optimised ANSI C code for the Rijndael cipher (now AES) * * @author Vincent Rijmen * @author Antoon Bosselaers * @author Paulo Barreto * * This code is hereby placed in the public domain. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
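 *
 * Informal note on the tables that follow (a sketch, not a specification;
 * the actual round code appears later in this file): each Te/Td entry folds
 * the (inverse) S-box and (Inv)MixColumns into one 32-bit word, so a single
 * encryption round step looks roughly like
 *     t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^
 *          Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[0];
 * while Te4/Td4 provide the plain (inverse) S-box lookups for the last round.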
*/ /* start for Xen */ #include #include #include #include #include /* end for Xen */ #undef FULL_UNROLL /* Te0[x] = S [x].[02, 01, 01, 03]; Te1[x] = S [x].[03, 02, 01, 01]; Te2[x] = S [x].[01, 03, 02, 01]; Te3[x] = S [x].[01, 01, 03, 02]; Te4[x] = S [x].[01, 01, 01, 01]; Td0[x] = Si[x].[0e, 09, 0d, 0b]; Td1[x] = Si[x].[0b, 0e, 09, 0d]; Td2[x] = Si[x].[0d, 0b, 0e, 09]; Td3[x] = Si[x].[09, 0d, 0b, 0e]; Td4[x] = Si[x].[01, 01, 01, 01]; */ static const u32 Te0[256] = { 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU, 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U, 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU, 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU, 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U, 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU, 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU, 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU, 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU, 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU, 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U, 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU, 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU, 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U, 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU, 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU, 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU, 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU, 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU, 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U, 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU, 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU, 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU, 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU, 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U, 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U, 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U, 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U, 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU, 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U, 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U, 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU, 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU, 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U, 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U, 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U, 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU, 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U, 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU, 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U, 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU, 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U, 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U, 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU, 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U, 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U, 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U, 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U, 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U, 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U, 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U, 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U, 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU, 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U, 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U, 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U, 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U, 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U, 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U, 0x87cece49U, 
0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU, 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U, 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U, 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U, 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU, }; static const u32 Te1[256] = { 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU, 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U, 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU, 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U, 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU, 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U, 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU, 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U, 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U, 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU, 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U, 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U, 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U, 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU, 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U, 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U, 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU, 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U, 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U, 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U, 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU, 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU, 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U, 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU, 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU, 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U, 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU, 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U, 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU, 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U, 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U, 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U, 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU, 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U, 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU, 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U, 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU, 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U, 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U, 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU, 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU, 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU, 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U, 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U, 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU, 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U, 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU, 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U, 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU, 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U, 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU, 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU, 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U, 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU, 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U, 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU, 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U, 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U, 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U, 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU, 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU, 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U, 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 
0x111e0f0fU, 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U, }; static const u32 Te2[256] = { 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU, 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U, 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU, 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U, 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU, 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U, 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU, 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U, 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U, 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU, 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U, 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U, 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U, 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU, 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U, 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U, 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU, 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U, 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U, 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U, 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU, 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU, 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U, 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU, 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU, 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U, 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU, 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U, 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU, 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U, 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U, 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U, 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU, 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U, 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU, 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U, 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU, 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U, 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U, 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU, 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU, 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU, 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U, 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U, 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU, 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U, 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU, 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U, 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU, 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U, 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU, 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU, 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U, 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU, 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U, 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU, 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U, 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U, 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U, 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU, 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU, 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U, 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU, 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U, }; static const u32 Te3[256] = { 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U, 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 
0xc5c55491U, 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U, 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU, 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU, 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU, 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U, 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU, 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU, 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U, 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U, 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU, 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU, 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU, 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU, 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU, 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U, 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU, 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU, 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U, 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U, 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U, 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U, 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U, 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU, 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U, 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU, 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU, 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U, 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U, 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U, 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU, 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U, 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU, 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU, 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U, 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U, 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU, 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U, 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU, 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U, 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U, 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U, 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U, 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU, 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U, 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU, 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U, 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU, 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U, 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU, 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU, 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU, 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU, 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U, 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U, 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U, 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U, 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U, 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U, 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU, 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U, 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU, 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU, }; static const u32 Te4[256] = { 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU, 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U, 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU, 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U, 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU, 0xfafafafaU, 
0x59595959U, 0x47474747U, 0xf0f0f0f0U, 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU, 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U, 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U, 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU, 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U, 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U, 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U, 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU, 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U, 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U, 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU, 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U, 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U, 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U, 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU, 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU, 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U, 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU, 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU, 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U, 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU, 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U, 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU, 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U, 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U, 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U, 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU, 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U, 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU, 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U, 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU, 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U, 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U, 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU, 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU, 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU, 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U, 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U, 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU, 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U, 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU, 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U, 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU, 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U, 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU, 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU, 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U, 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU, 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U, 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU, 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U, 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U, 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U, 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU, 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU, 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U, 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU, 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U, }; static const u32 Td0[256] = { 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U, 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U, 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U, 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU, 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U, 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U, 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU, 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U, 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 
0x27b971ddU, 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U, 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U, 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U, 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U, 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU, 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U, 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU, 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U, 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU, 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U, 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U, 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U, 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU, 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U, 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU, 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U, 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU, 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U, 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU, 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU, 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U, 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU, 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U, 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU, 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U, 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U, 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U, 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU, 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U, 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U, 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU, 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U, 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U, 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U, 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U, 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U, 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU, 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U, 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U, 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U, 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U, 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U, 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU, 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU, 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU, 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU, 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U, 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U, 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU, 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU, 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U, 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU, 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U, 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U, 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U, }; static const u32 Td1[256] = { 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU, 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U, 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU, 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U, 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U, 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U, 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U, 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U, 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U, 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU, 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU, 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU, 0x58704868U, 
0x198f45fdU, 0x8794de6cU, 0xb7527bf8U, 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU, 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U, 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U, 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U, 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU, 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU, 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U, 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU, 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U, 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU, 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU, 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U, 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U, 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U, 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU, 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U, 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU, 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U, 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U, 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U, 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU, 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U, 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U, 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U, 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U, 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U, 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U, 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU, 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU, 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U, 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU, 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U, 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU, 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU, 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U, 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU, 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U, 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U, 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U, 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U, 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U, 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U, 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U, 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU, 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U, 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U, 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU, 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U, 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U, 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U, 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U, }; static const u32 Td2[256] = { 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U, 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U, 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U, 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U, 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU, 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U, 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U, 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U, 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U, 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU, 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U, 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U, 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU, 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U, 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U, 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 
0x825ced16U, 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U, 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U, 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U, 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU, 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U, 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U, 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U, 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U, 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U, 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU, 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU, 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U, 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU, 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U, 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU, 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU, 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU, 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU, 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U, 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U, 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U, 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U, 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U, 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U, 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U, 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU, 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU, 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U, 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U, 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU, 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU, 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U, 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U, 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U, 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U, 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U, 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U, 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U, 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU, 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U, 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U, 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U, 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U, 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U, 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U, 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU, 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U, 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U, }; static const u32 Td3[256] = { 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU, 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU, 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U, 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U, 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU, 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU, 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U, 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU, 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U, 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU, 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U, 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U, 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U, 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U, 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U, 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU, 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU, 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U, 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U, 0x83ec390bU, 
0x60efaa40U, 0x719f065eU, 0x6e1051bdU, 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU, 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U, 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U, 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U, 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U, 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU, 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U, 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U, 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU, 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU, 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U, 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U, 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U, 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU, 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U, 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U, 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U, 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U, 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U, 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U, 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U, 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU, 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U, 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U, 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU, 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU, 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U, 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU, 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U, 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U, 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U, 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U, 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U, 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U, 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU, 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU, 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU, 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU, 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U, 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U, 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U, 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU, 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U, 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U, }; static const u32 Td4[256] = { 0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U, 0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U, 0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU, 0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU, 0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U, 0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U, 0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U, 0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU, 0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U, 0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU, 0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU, 0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU, 0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U, 0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U, 0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U, 0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U, 0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U, 0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U, 0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU, 0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U, 0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U, 0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU, 0x5e5e5e5eU, 0x15151515U, 0x46464646U, 
0x57575757U, 0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U, 0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U, 0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU, 0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U, 0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U, 0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU, 0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U, 0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U, 0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU, 0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U, 0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU, 0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU, 0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U, 0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U, 0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U, 0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U, 0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU, 0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U, 0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U, 0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU, 0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU, 0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU, 0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U, 0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU, 0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U, 0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U, 0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U, 0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U, 0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU, 0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U, 0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU, 0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU, 0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU, 0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU, 0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U, 0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU, 0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U, 0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU, 0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U, 0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U, 0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU, }; static const u32 rcon[] = { 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ }; #define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3])) #define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); } /** * Expand the cipher key into the encryption key schedule. * * @return the number of rounds for the given cipher key size. 
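 *
 * A minimal usage sketch (illustrative only; "key" below is assumed to
 * point at keyBits/8 raw key bytes and is not a name defined in this
 * file): the caller supplies rk[] with room for 4*(Nr + 1) words --
 * 44, 52 or 60 u32s for 128-, 192- and 256-bit keys -- and gets back
 * Nr (10, 12 or 14), or 0 for an unsupported key size.
 *
 *   u32 rk[4 * (14 + 1)];                        // big enough for any Nr
 *   int nr = rijndaelKeySetupEnc(rk, key, 128);  // nr == 10 on success
 *
 * GETU32()/PUTU32() above pack and unpack these 32-bit words in
 * big-endian byte order, which is the byte order assumed by the tables
 * and round functions throughout this file.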
*/ int rijndaelKeySetupEnc(u32 rk[/*4*(Nr + 1)*/], const u8 cipherKey[], int keyBits) { int i = 0; u32 temp; rk[0] = GETU32(cipherKey ); rk[1] = GETU32(cipherKey + 4); rk[2] = GETU32(cipherKey + 8); rk[3] = GETU32(cipherKey + 12); if (keyBits == 128) { for (;;) { temp = rk[3]; rk[4] = rk[0] ^ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ (Te4[(temp ) & 0xff] & 0x0000ff00) ^ (Te4[(temp >> 24) ] & 0x000000ff) ^ rcon[i]; rk[5] = rk[1] ^ rk[4]; rk[6] = rk[2] ^ rk[5]; rk[7] = rk[3] ^ rk[6]; if (++i == 10) { return 10; } rk += 4; } } rk[4] = GETU32(cipherKey + 16); rk[5] = GETU32(cipherKey + 20); if (keyBits == 192) { for (;;) { temp = rk[ 5]; rk[ 6] = rk[ 0] ^ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ (Te4[(temp ) & 0xff] & 0x0000ff00) ^ (Te4[(temp >> 24) ] & 0x000000ff) ^ rcon[i]; rk[ 7] = rk[ 1] ^ rk[ 6]; rk[ 8] = rk[ 2] ^ rk[ 7]; rk[ 9] = rk[ 3] ^ rk[ 8]; if (++i == 8) { return 12; } rk[10] = rk[ 4] ^ rk[ 9]; rk[11] = rk[ 5] ^ rk[10]; rk += 6; } } rk[6] = GETU32(cipherKey + 24); rk[7] = GETU32(cipherKey + 28); if (keyBits == 256) { for (;;) { temp = rk[ 7]; rk[ 8] = rk[ 0] ^ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ (Te4[(temp ) & 0xff] & 0x0000ff00) ^ (Te4[(temp >> 24) ] & 0x000000ff) ^ rcon[i]; rk[ 9] = rk[ 1] ^ rk[ 8]; rk[10] = rk[ 2] ^ rk[ 9]; rk[11] = rk[ 3] ^ rk[10]; if (++i == 7) { return 14; } temp = rk[11]; rk[12] = rk[ 4] ^ (Te4[(temp >> 24) ] & 0xff000000) ^ (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^ (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^ (Te4[(temp ) & 0xff] & 0x000000ff); rk[13] = rk[ 5] ^ rk[12]; rk[14] = rk[ 6] ^ rk[13]; rk[15] = rk[ 7] ^ rk[14]; rk += 8; } } return 0; } #ifdef NEED_RIJNDAEL_DECRYPT /** * Expand the cipher key into the decryption key schedule. * * @return the number of rounds for the given cipher key size. 
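 *
 * Illustrative sketch only (reusing the hypothetical "key" pointer from
 * the comment on rijndaelKeySetupEnc above): dk[] needs the same
 * 4*(Nr + 1) words, and the returned round count must equal the one
 * reported by rijndaelKeySetupEnc() for the same key, as the wrapper
 * further below checks.
 *
 *   u32 dk[4 * (14 + 1)];
 *   int nr = rijndaelKeySetupDec(dk, key, 256);  // nr == 14 on success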
*/ int rijndaelKeySetupDec(u32 rk[/*4*(Nr + 1)*/], const u8 cipherKey[], int keyBits) { int Nr, i, j; u32 temp; /* expand the cipher key: */ Nr = rijndaelKeySetupEnc(rk, cipherKey, keyBits); /* invert the order of the round keys: */ for (i = 0, j = 4*Nr; i < j; i += 4, j -= 4) { temp = rk[i ]; rk[i ] = rk[j ]; rk[j ] = temp; temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp; temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp; temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp; } /* apply the inverse MixColumn transform to all round keys but the first and the last: */ for (i = 1; i < Nr; i++) { rk += 4; rk[0] = Td0[Te4[(rk[0] >> 24) ] & 0xff] ^ Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^ Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^ Td3[Te4[(rk[0] ) & 0xff] & 0xff]; rk[1] = Td0[Te4[(rk[1] >> 24) ] & 0xff] ^ Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^ Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^ Td3[Te4[(rk[1] ) & 0xff] & 0xff]; rk[2] = Td0[Te4[(rk[2] >> 24) ] & 0xff] ^ Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^ Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^ Td3[Te4[(rk[2] ) & 0xff] & 0xff]; rk[3] = Td0[Te4[(rk[3] >> 24) ] & 0xff] ^ Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^ Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^ Td3[Te4[(rk[3] ) & 0xff] & 0xff]; } return Nr; } #endif /* NEED_RIJNDAEL_DECRYPT */ void rijndaelEncrypt(const u32 rk[/*4*(Nr + 1)*/], int Nr, const u8 pt[16], u8 ct[16]) { u32 s0, s1, s2, s3, t0, t1, t2, t3; #ifndef FULL_UNROLL int r; #endif /* ?FULL_UNROLL */ /* * map byte array block to cipher state * and add initial round key: */ s0 = GETU32(pt ) ^ rk[0]; s1 = GETU32(pt + 4) ^ rk[1]; s2 = GETU32(pt + 8) ^ rk[2]; s3 = GETU32(pt + 12) ^ rk[3]; #ifdef FULL_UNROLL /* round 1: */ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4]; t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5]; t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6]; t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7]; /* round 2: */ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8]; s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9]; s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10]; s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11]; /* round 3: */ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12]; t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13]; t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14]; t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15]; /* round 4: */ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16]; s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17]; s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18]; s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19]; /* round 5: */ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20]; t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21]; t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 
0xff] ^ rk[22]; t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23]; /* round 6: */ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24]; s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25]; s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26]; s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27]; /* round 7: */ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28]; t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29]; t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30]; t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31]; /* round 8: */ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32]; s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33]; s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34]; s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35]; /* round 9: */ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36]; t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37]; t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38]; t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39]; if (Nr > 10) { /* round 10: */ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[40]; s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[41]; s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[42]; s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[43]; /* round 11: */ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[44]; t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[45]; t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[46]; t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[47]; if (Nr > 12) { /* round 12: */ s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[48]; s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[49]; s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[50]; s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[51]; /* round 13: */ t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[52]; t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[53]; t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[54]; t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[55]; } } rk += Nr << 2; #else /* !FULL_UNROLL */ /* * Nr - 1 full rounds: */ r = Nr >> 1; for (;;) { t0 = Te0[(s0 >> 24) ] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff] ^ Te3[(s3 ) & 0xff] ^ rk[4]; t1 = Te0[(s1 >> 24) ] ^ Te1[(s2 >> 16) & 0xff] ^ 
Te2[(s3 >> 8) & 0xff] ^ Te3[(s0 ) & 0xff] ^ rk[5]; t2 = Te0[(s2 >> 24) ] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[(s1 ) & 0xff] ^ rk[6]; t3 = Te0[(s3 >> 24) ] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[(s2 ) & 0xff] ^ rk[7]; rk += 8; if (--r == 0) { break; } s0 = Te0[(t0 >> 24) ] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[(t3 ) & 0xff] ^ rk[0]; s1 = Te0[(t1 >> 24) ] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff] ^ Te3[(t0 ) & 0xff] ^ rk[1]; s2 = Te0[(t2 >> 24) ] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >> 8) & 0xff] ^ Te3[(t1 ) & 0xff] ^ rk[2]; s3 = Te0[(t3 >> 24) ] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >> 8) & 0xff] ^ Te3[(t2 ) & 0xff] ^ rk[3]; } #endif /* ?FULL_UNROLL */ /* * apply last round and * map cipher state to byte array block: */ s0 = (Te4[(t0 >> 24) ] & 0xff000000) ^ (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ (Te4[(t3 ) & 0xff] & 0x000000ff) ^ rk[0]; PUTU32(ct , s0); s1 = (Te4[(t1 >> 24) ] & 0xff000000) ^ (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ (Te4[(t0 ) & 0xff] & 0x000000ff) ^ rk[1]; PUTU32(ct + 4, s1); s2 = (Te4[(t2 >> 24) ] & 0xff000000) ^ (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ (Te4[(t1 ) & 0xff] & 0x000000ff) ^ rk[2]; PUTU32(ct + 8, s2); s3 = (Te4[(t3 >> 24) ] & 0xff000000) ^ (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ (Te4[(t2 ) & 0xff] & 0x000000ff) ^ rk[3]; PUTU32(ct + 12, s3); } #ifdef NEED_RIJNDAEL_DECRYPT static void rijndaelDecrypt(const u32 rk[/*4*(Nr + 1)*/], int Nr, const u8 ct[16], u8 pt[16]) { u32 s0, s1, s2, s3, t0, t1, t2, t3; #ifndef FULL_UNROLL int r; #endif /* ?FULL_UNROLL */ /* * map byte array block to cipher state * and add initial round key: */ s0 = GETU32(ct ) ^ rk[0]; s1 = GETU32(ct + 4) ^ rk[1]; s2 = GETU32(ct + 8) ^ rk[2]; s3 = GETU32(ct + 12) ^ rk[3]; #ifdef FULL_UNROLL /* round 1: */ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[ 4]; t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[ 5]; t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[ 6]; t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[ 7]; /* round 2: */ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[ 8]; s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[ 9]; s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[10]; s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[11]; /* round 3: */ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[12]; t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[13]; t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[14]; t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[15]; /* round 4: */ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[16]; s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[17]; s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[18]; s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[19]; /* round 5: */ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) 
& 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[20]; t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[21]; t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[22]; t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[23]; /* round 6: */ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[24]; s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[25]; s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[26]; s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[27]; /* round 7: */ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[28]; t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[29]; t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[30]; t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[31]; /* round 8: */ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[32]; s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[33]; s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[34]; s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[35]; /* round 9: */ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[36]; t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[37]; t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[38]; t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[39]; if (Nr > 10) { /* round 10: */ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[40]; s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[41]; s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[42]; s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[43]; /* round 11: */ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[44]; t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[45]; t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[46]; t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[47]; if (Nr > 12) { /* round 12: */ s0 = Td0[t0 >> 24] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[t1 & 0xff] ^ rk[48]; s1 = Td0[t1 >> 24] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[t2 & 0xff] ^ rk[49]; s2 = Td0[t2 >> 24] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[t3 & 0xff] ^ rk[50]; s3 = Td0[t3 >> 24] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[t0 & 0xff] ^ rk[51]; /* round 13: */ t0 = Td0[s0 >> 24] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[s1 & 0xff] ^ rk[52]; t1 = Td0[s1 >> 24] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[s2 & 0xff] ^ rk[53]; t2 = Td0[s2 >> 24] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[s3 & 0xff] ^ rk[54]; t3 = Td0[s3 >> 24] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[s0 & 0xff] ^ rk[55]; } } rk += Nr << 
2; #else /* !FULL_UNROLL */ /* * Nr - 1 full rounds: */ r = Nr >> 1; for (;;) { t0 = Td0[(s0 >> 24) ] ^ Td1[(s3 >> 16) & 0xff] ^ Td2[(s2 >> 8) & 0xff] ^ Td3[(s1 ) & 0xff] ^ rk[4]; t1 = Td0[(s1 >> 24) ] ^ Td1[(s0 >> 16) & 0xff] ^ Td2[(s3 >> 8) & 0xff] ^ Td3[(s2 ) & 0xff] ^ rk[5]; t2 = Td0[(s2 >> 24) ] ^ Td1[(s1 >> 16) & 0xff] ^ Td2[(s0 >> 8) & 0xff] ^ Td3[(s3 ) & 0xff] ^ rk[6]; t3 = Td0[(s3 >> 24) ] ^ Td1[(s2 >> 16) & 0xff] ^ Td2[(s1 >> 8) & 0xff] ^ Td3[(s0 ) & 0xff] ^ rk[7]; rk += 8; if (--r == 0) { break; } s0 = Td0[(t0 >> 24) ] ^ Td1[(t3 >> 16) & 0xff] ^ Td2[(t2 >> 8) & 0xff] ^ Td3[(t1 ) & 0xff] ^ rk[0]; s1 = Td0[(t1 >> 24) ] ^ Td1[(t0 >> 16) & 0xff] ^ Td2[(t3 >> 8) & 0xff] ^ Td3[(t2 ) & 0xff] ^ rk[1]; s2 = Td0[(t2 >> 24) ] ^ Td1[(t1 >> 16) & 0xff] ^ Td2[(t0 >> 8) & 0xff] ^ Td3[(t3 ) & 0xff] ^ rk[2]; s3 = Td0[(t3 >> 24) ] ^ Td1[(t2 >> 16) & 0xff] ^ Td2[(t1 >> 8) & 0xff] ^ Td3[(t0 ) & 0xff] ^ rk[3]; } #endif /* ?FULL_UNROLL */ /* * apply last round and * map cipher state to byte array block: */ s0 = (Td4[(t0 >> 24) ] & 0xff000000) ^ (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^ (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^ (Td4[(t1 ) & 0xff] & 0x000000ff) ^ rk[0]; PUTU32(pt , s0); s1 = (Td4[(t1 >> 24) ] & 0xff000000) ^ (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^ (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^ (Td4[(t2 ) & 0xff] & 0x000000ff) ^ rk[1]; PUTU32(pt + 4, s1); s2 = (Td4[(t2 >> 24) ] & 0xff000000) ^ (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^ (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^ (Td4[(t3 ) & 0xff] & 0x000000ff) ^ rk[2]; PUTU32(pt + 8, s2); s3 = (Td4[(t3 >> 24) ] & 0xff000000) ^ (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^ (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^ (Td4[(t0 ) & 0xff] & 0x000000ff) ^ rk[3]; PUTU32(pt + 12, s3); } #endif /* NEED_RIJNDAEL_DECRYPT */ #ifdef NEED_RIJNDAEL_WRAPPERS /* setup key context for encryption only */ int rijndael_set_key_enc_only(rijndael_ctx *ctx, const u_char *key, int bits) { int rounds; rounds = rijndaelKeySetupEnc(ctx->ek, key, bits); if (rounds == 0) return -1; ctx->Nr = rounds; ctx->enc_only = 1; return 0; } /* setup key context for both encryption and decryption */ int rijndael_set_key(rijndael_ctx *ctx, const u_char *key, int bits) { int rounds; rounds = rijndaelKeySetupEnc(ctx->ek, key, bits); if (rounds == 0) return -1; if (rijndaelKeySetupDec(ctx->dk, key, bits) != rounds) return -1; ctx->Nr = rounds; ctx->enc_only = 0; return 0; } void rijndael_decrypt(rijndael_ctx *ctx, const u_char *src, u_char *dst) { rijndaelDecrypt(ctx->dk, ctx->Nr, src, dst); } void rijndael_encrypt(rijndael_ctx *ctx, const u_char *src, u_char *dst) { rijndaelEncrypt(ctx->ek, ctx->Nr, src, dst); } #endif /* NEED_RIJNDAEL_WRAPPERS */ xen-4.4.0/xen/tools/0000775000175000017500000000000012307313555012375 5ustar smbsmbxen-4.4.0/xen/tools/Makefile0000664000175000017500000000025112307313555014033 0ustar smbsmb include $(XEN_ROOT)/Config.mk .PHONY: default default: $(MAKE) symbols .PHONY: clean clean: rm -f *.o symbols symbols: symbols.c $(HOSTCC) $(HOSTCFLAGS) -o $@ $< xen-4.4.0/xen/tools/fig-to-oct.py0000664000175000017500000000051312307313555014716 0ustar smbsmb#!/usr/bin/env python import sys chars_per_line = 18 chars_so_far = 0 sys.stdout.write('"') for char in sys.stdin.read(): sys.stdout.write("\\%03o" % ord(char)) chars_so_far = chars_so_far + 1 if chars_so_far == chars_per_line: chars_so_far = 0 sys.stdout.write('" \\\n"') sys.stdout.write('"\n') xen-4.4.0/xen/tools/compat-build-source.py0000775000175000017500000000166512307313555016640 0ustar smbsmb#!/usr/bin/env python import re,sys pats = [ 
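    # Each entry below is a [pattern, replacement] pair; re.subn() applies
    # them, in order, to every line read from stdin further down.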
[ r"^\s*#\s*include\s+", r"__InClUdE__ " ], [ r"^\s*#\s*ifdef (XEN_HAVE.*)\s+", r"__IfDeF__ \1" ], [ r"^\s*#\s*else /\* (XEN_HAVE.*) \*/\s+", r"__ElSe__" ], [ r"^\s*#\s*endif /\* (XEN_HAVE.*) \*/\s+", r"__EnDif__" ], [ r"^\s*#\s*define\s+([A-Z_]*_GUEST_HANDLE)", r"#define HIDE_\1" ], [ r"^\s*#\s*define\s+([a-z_]*_guest_handle)", r"#define hide_\1" ], [ r"XEN_GUEST_HANDLE(_[0-9A-Fa-f]+)?", r"COMPAT_HANDLE" ], ]; xlats = [] xlatf = open('xlat.lst', 'r') for line in xlatf.readlines(): match = re.subn(r"^\s*\?\s+(\w*)\s.*", r"\1", line.rstrip()) if match[1]: xlats.append(match[0]) xlatf.close() for line in sys.stdin.readlines(): for pat in pats: line = re.subn(pat[0], pat[1], line)[0] for xlat in xlats: line = re.subn(r"(struct|union)\s+(%s|xen_%s)\s+(\w)" % (xlat, xlat), r"\1 @KeeP@\2 \3", line.rstrip())[0] print line.rstrip() xen-4.4.0/xen/tools/scmversion0000775000175000017500000000345412307313555014521 0ustar smbsmb#!/bin/sh # # This scripts adds local version information from the version # control systems git and mercurial (hg). # # If something goes wrong, send a mail the kernel build mailinglist # (see MAINTAINERS) and CC Nico Schottelius # . # # Based on setlocalversion from Linux kernel # # usage() { echo "Usage: $0 [--save-scmversion] [srctree]" >&2 exit 1 } save_scm=false srctree=. if test "$1" = "--save-scmversion"; then save_scm=true shift fi if test $# -gt 0; then srctree=$1 shift fi if test $# -gt 0 -o ! -d "$srctree"; then usage fi scm_version() { if test -e .scmversion; then cat .scmversion return fi # Check for git and a git repo. if test -d .git && head=`git rev-parse --verify --short HEAD 2>/dev/null`; then date=`git show -s --pretty="%ad" HEAD` printf '%s %s%s' "$date" git: $head # Is this git on svn? if git config --get svn-remote.svn.url >/dev/null; then printf -- 'svn:%s' "`git svn find-rev $head`" fi # Update index only on r/w media [ -w . ] && git update-index --refresh --unmerged > /dev/null # Check for uncommitted changes if git diff-index --name-only HEAD | grep -qv "^scripts/package"; then printf '%s' -dirty fi # All done with git return fi # Check for mercurial and a mercurial repo. if test -d .hg && hgid=`hg id 2>/dev/null`; then id=`printf '%s' "$hgid" | sed 's/[+ ].*//'` date=`hg parents --template "{date|date}"` printf '%s %s%s' "$date" hg: "$id" # Are there uncommitted changes? # These are represented by + after the changeset id. 
case "$hgid" in *+|*+\ *) printf '%s' -dirty ;; esac # All done with mercurial return fi } cd $srctree # full scm version string res="$(scm_version)" if [ "$save_scm" = "true" ]; then echo $res > .scmversion fi echo "$res" xen-4.4.0/xen/tools/compat-build-header.py0000775000175000017500000000135612307313555016565 0ustar smbsmb#!/usr/bin/env python import re,sys pats = [ [ r"__InClUdE__(.*)", r"#include\1\n#pragma pack(4)" ], [ r"__IfDeF__ (XEN_HAVE.*)", r"#ifdef \1" ], [ r"__ElSe__", r"#else" ], [ r"__EnDif__", r"#endif" ], [ r"\"xen-compat.h\"", r"" ], [ r"(struct|union|enum)\s+(xen_?)?(\w)", r"\1 compat_\3" ], [ r"@KeeP@", r"" ], [ r"_t([^\w]|$)", r"_compat_t\1" ], [ r"(8|16|32|64)_compat_t([^\w]|$)", r"\1_t\2" ], [ r"(^|[^\w])xen_?(\w*)_compat_t([^\w]|$$)", r"\1compat_\2_t\3" ], [ r"(^|[^\w])XEN_?", r"\1COMPAT_" ], [ r"(^|[^\w])Xen_?", r"\1Compat_" ], [ r"(^|[^\w])long([^\w]|$$)", r"\1int\2" ] ]; for line in sys.stdin.readlines(): for pat in pats: line = re.subn(pat[0], pat[1], line)[0] print line.rstrip() xen-4.4.0/xen/tools/get-fields.sh0000664000175000017500000002207612307313555014763 0ustar smbsmbtest -n "$1" -a -n "$2" -a -n "$3" set -ef SED=sed if test -x /usr/xpg4/bin/sed; then SED=/usr/xpg4/bin/sed fi if test -z ${PYTHON}; then PYTHON=`/usr/bin/env python` fi if test -z ${PYTHON}; then echo "Python not found" exit 1 fi get_fields () { local level=1 aggr=0 name= fields= for token in $2 do case "$token" in struct|union) test $level != 1 || aggr=1 fields= name= ;; "{") level=$(expr $level + 1) ;; "}") level=$(expr $level - 1) if [ $level = 1 -a $name = $1 ] then echo "$fields }" return 0 fi ;; [a-zA-Z_]*) test $aggr = 0 -o -n "$name" || name="$token" ;; esac test $aggr = 0 || fields="$fields $token" done } get_typedefs () { local level=1 state= for token in $1 do case "$token" in typedef) test $level != 1 || state=1 ;; COMPAT_HANDLE\(*\)) test $level != 1 -o "$state" != 1 || state=2 ;; [\{\[]) level=$(expr $level + 1) ;; [\}\]]) level=$(expr $level - 1) ;; ";") test $level != 1 || state= ;; [a-zA-Z_]*) test $level != 1 -o "$state" != 2 || echo "$token" ;; esac done } build_enums () { local level=1 kind= fields= members= named= id= token for token in $2 do case "$token" in struct|union) test $level != 2 || fields=" " kind="$token;$kind" ;; "{") level=$(expr $level + 1) ;; "}") level=$(expr $level - 1) if [ $level = 1 ] then if [ "${kind%%;*}" = union ] then echo echo "enum XLAT_$1 {" for m in $members do echo " XLAT_${1}_$m," done echo "};" fi return 0 elif [ $level = 2 ] then named='?' fi ;; [a-zA-Z]*) id=$token if [ -n "$named" -a -n "${kind#*;}" ] then build_enums ${1}_$token "$fields" named='!' 
fi ;; ",") test $level != 2 || members="$members $id" ;; ";") test $level != 2 || members="$members $id" test -z "$named" || kind=${kind#*;} named= ;; esac test -z "$fields" || fields="$fields $token" done } handle_field () { if [ -z "$5" ] then echo " \\" if [ -z "$4" ] then echo -n "$1(_d_)->$3 = (_s_)->$3;" else echo -n "$1XLAT_${2}_HNDL_$(echo $3 | $SED 's,\.,_,g')(_d_, _s_);" fi elif [ -z "$(echo "$5" | $SED 's,[^{}],,g')" ] then local tag=$(echo "$5" | ${PYTHON} -c ' import re,sys for line in sys.stdin.readlines(): print re.subn(r"\s*(struct|union)\s+(compat_)?(\w+)\s.*", r"\3", line)[0].rstrip() ') echo " \\" echo -n "${1}XLAT_$tag(&(_d_)->$3, &(_s_)->$3);" else local level=1 kind= fields= id= array= arrlvl=1 array_type= type= token for token in $5 do case "$token" in struct|union) test $level != 2 || fields=" " if [ $level = 1 ] then kind=$token if [ $kind = union ] then echo " \\" echo -n "${1}switch ($(echo $3 | $SED 's,\.,_,g')) {" fi fi ;; "{") level=$(expr $level + 1) id= ;; "}") level=$(expr $level - 1) id= if [ $level = 1 -a $kind = union ] then echo " \\" echo -n "$1}" fi ;; "[") if [ $level != 2 -o $arrlvl != 1 ] then : elif [ -z "$array" ] then array=" " else array="$array;" fi arrlvl=$(expr $arrlvl + 1) ;; "]") arrlvl=$(expr $arrlvl - 1) ;; COMPAT_HANDLE\(*\)) if [ $level = 2 -a -z "$id" ] then type=${token#COMPAT_HANDLE?} type=${type%?} type=${type#compat_} fi ;; compat_domain_handle_t) if [ $level = 2 -a -z "$id" ] then array_type=$token fi ;; [a-zA-Z]*) if [ -z "$id" -a -z "$type" -a -z "$array_type" ] then for id in $typedefs do test $id != "$token" || type=$id done if [ -z "$type" ] then id=$token else id= fi else id=$token fi ;; [\,\;]) if [ $level = 2 -a -n "$(echo $id | $SED 's,^_pad[[:digit:]]*,,')" ] then if [ $kind = union ] then echo " \\" echo -n "${1}case XLAT_${2}_$(echo $3.$id | $SED 's,\.,_,g'):" handle_field "$1 " $2 $3.$id "$type" "$fields" elif [ -z "$array" -a -z "$array_type" ] then handle_field "$1" $2 $3.$id "$type" "$fields" elif [ -z "$array" ] then copy_array " " $3.$id else handle_array "$1" $2 $3.$id "${array#*;}" "$type" "$fields" fi test "$token" != ";" || fields= id= type= array= if [ $kind = union ] then echo " \\" echo -n "$1 break;" fi fi ;; *) if [ -n "$array" ] then array="$array $token" fi ;; esac test -z "$fields" || fields="$fields $token" done fi } copy_array () { echo " \\" echo "${1}if ((_d_)->$2 != (_s_)->$2) \\" echo -n "$1 memcpy((_d_)->$2, (_s_)->$2, sizeof((_d_)->$2));" } handle_array () { local i="i$(echo $4 | $SED 's,[^;], ,g' | wc -w | $SED 's,[[:space:]]*,,g')" echo " \\" echo "$1{ \\" echo "$1 unsigned int $i; \\" echo -n "$1 for ($i = 0; $i < "${4%%;*}"; ++$i) {" if [ "$4" = "${4#*;}" ] then handle_field "$1 " $2 $3[$i] "$5" "$6" else handle_array "$1 " $2 $3[$i] "${4#*;}" "$5" "$6" fi echo " \\" echo "$1 } \\" echo -n "$1}" } build_body () { echo echo -n "#define XLAT_$1(_d_, _s_) do {" local level=1 fields= id= array= arrlvl=1 array_type= type= token for token in $2 do case "$token" in struct|union) test $level != 2 || fields=" " ;; "{") level=$(expr $level + 1) id= ;; "}") level=$(expr $level - 1) id= ;; "[") if [ $level != 2 -o $arrlvl != 1 ] then : elif [ -z "$array" ] then array=" " else array="$array;" fi arrlvl=$(expr $arrlvl + 1) ;; "]") arrlvl=$(expr $arrlvl - 1) ;; COMPAT_HANDLE\(*\)) if [ $level = 2 -a -z "$id" ] then type=${token#COMPAT_HANDLE?} type=${type%?} type=${type#compat_} fi ;; compat_domain_handle_t) if [ $level = 2 -a -z "$id" ] then array_type=$token fi ;; [a-zA-Z_]*) if [ -n "$array" ] then 
array="$array $token" elif [ -z "$id" -a -z "$type" -a -z "$array_type" ] then for id in $typedefs do test $id != "$token" || type=$id done if [ -z "$type" ] then id=$token else id= fi else id=$token fi ;; [\,\;]) if [ $level = 2 -a -n "$(echo $id | $SED 's,^_pad[[:digit:]]*,,')" ] then if [ -z "$array" -a -z "$array_type" ] then handle_field " " $1 $id "$type" "$fields" elif [ -z "$array" ] then copy_array " " $id else handle_array " " $1 $id "${array#*;}" "$type" "$fields" fi test "$token" != ";" || fields= id= type= array= fi ;; *) if [ -n "$array" ] then array="$array $token" fi ;; esac test -z "$fields" || fields="$fields $token" done echo " \\" echo "} while (0)" } check_field () { if [ -z "$(echo "$4" | $SED 's,[^{}],,g')" ] then echo "; \\" local n=$(echo $3 | $SED 's,[^.], ,g' | wc -w | $SED 's,[[:space:]]*,,g') if [ -n "$4" ] then for n in $4 do case $n in struct|union) ;; [a-zA-Z_]*) echo -n " CHECK_${n#xen_}" break ;; *) echo "Malformed compound declaration: '$n'" >&2 exit 1 ;; esac done elif [ $n = 0 ] then echo -n " CHECK_FIELD_($1, $2, $3)" else echo -n " CHECK_SUBFIELD_${n}_($1, $2, $(echo $3 | $SED 's!\.!, !g'))" fi else local level=1 fields= id= token for token in $4 do case "$token" in struct|union) test $level != 2 || fields=" " ;; "{") level=$(expr $level + 1) id= ;; "}") level=$(expr $level - 1) id= ;; [a-zA-Z]*) id=$token ;; [\,\;]) if [ $level = 2 -a -n "$(echo $id | $SED 's,^_pad[[:digit:]]*,,')" ] then check_field $1 $2 $3.$id "$fields" test "$token" != ";" || fields= id= fi ;; esac test -z "$fields" || fields="$fields $token" done fi } build_check () { echo echo "#define CHECK_$1 \\" local level=1 fields= kind= id= arrlvl=1 token for token in $2 do case "$token" in struct|union) if [ $level = 1 ] then kind=$token echo -n " CHECK_SIZE_($kind, $1)" elif [ $level = 2 ] then fields=" " fi ;; "{") level=$(expr $level + 1) id= ;; "}") level=$(expr $level - 1) id= ;; "[") arrlvl=$(expr $arrlvl + 1) ;; "]") arrlvl=$(expr $arrlvl - 1) ;; [a-zA-Z_]*) test $level != 2 -o $arrlvl != 1 || id=$token ;; [\,\;]) if [ $level = 2 -a -n "$(echo $id | $SED 's,^_pad[[:digit:]]*,,')" ] then check_field $kind $1 $id "$fields" test "$token" != ";" || fields= id= fi ;; esac test -z "$fields" || fields="$fields $token" done echo "" } list="$($SED -e 's,^[[:space:]]#.*,,' -e 's!\([]\[,;:{}]\)! \1 !g' $3)" fields="$(get_fields $(echo $2 | $SED 's,^compat_xen,compat_,') "$list")" if [ -z "$fields" ] then echo "Fields of '$2' not found in '$3'" >&2 exit 1 fi name=${2#compat_} name=${name#xen} case "$1" in "!") typedefs="$(get_typedefs "$list")" build_enums $name "$fields" build_body $name "$fields" ;; "?") build_check $name "$fields" ;; *) echo "Invalid translation indicator: '$1'" >&2 exit 1 ;; esac xen-4.4.0/xen/tools/symbols.c0000664000175000017500000002700212307313555014232 0ustar smbsmb/* Generate assembler source containing symbol information * * Copyright 2002 by Kai Germaschewski * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. * * Usage: nm -n vmlinux | scripts/symbols [--all-symbols] > symbols.S * * ChangeLog: * * (25/Aug/2004) Paulo Marques * Changed the compression method from stem compression to "table lookup" * compression * * Table compression uses all the unused char codes on the symbols and * maps these to the most used substrings (tokens). 
For instance, it might * map char code 0xF7 to represent "write_" and then in every symbol where * "write_" appears it can be replaced by 0xF7, saving 5 bytes. * The used codes themselves are also placed in the table so that the * decompresion can work without "special cases". * Applied to kernel symbols, this usually produces a compression ratio * of about 50%. * */ #define _GNU_SOURCE #include #include #include #include #include #define KSYM_NAME_LEN 127 struct sym_entry { unsigned long long addr; unsigned int len; unsigned char *sym; }; static struct sym_entry *table; static unsigned int table_size, table_cnt; static unsigned long long _stext, _etext, _sinittext, _einittext, _sextratext, _eextratext; static int all_symbols = 0; static char symbol_prefix_char = '\0'; int token_profit[0x10000]; /* the table that holds the result of the compression */ unsigned char best_table[256][2]; unsigned char best_table_len[256]; static void usage(void) { fprintf(stderr, "Usage: symbols [--all-symbols] [--symbol-prefix=] < in.map > out.S\n"); exit(1); } /* * This ignores the intensely annoying "mapping symbols" found * in ARM ELF files: $a, $t and $d. */ static inline int is_arm_mapping_symbol(const char *str) { return str[0] == '$' && strchr("atd", str[1]) && (str[2] == '\0' || str[2] == '.'); } static int read_symbol(FILE *in, struct sym_entry *s) { char str[500]; char *sym, stype; int rc; rc = fscanf(in, "%llx %c %499s\n", &s->addr, &stype, str); if (rc != 3) { if (rc != EOF) { /* skip line */ if (fgets(str, 500, in) == NULL) return -1; /* must check fgets result */ } return -1; } sym = str; /* skip prefix char */ if (symbol_prefix_char && str[0] == symbol_prefix_char) sym++; /* Ignore most absolute/undefined (?) symbols. */ if (strcmp(sym, "_stext") == 0) _stext = s->addr; else if (strcmp(sym, "_etext") == 0) _etext = s->addr; else if (strcmp(sym, "_sinittext") == 0) _sinittext = s->addr; else if (strcmp(sym, "_einittext") == 0) _einittext = s->addr; else if (strcmp(sym, "_sextratext") == 0) _sextratext = s->addr; else if (strcmp(sym, "_eextratext") == 0) _eextratext = s->addr; else if (toupper((uint8_t)stype) == 'A') { /* Keep these useful absolute symbols */ if (strcmp(sym, "__gp")) return -1; } else if (toupper((uint8_t)stype) == 'U' || is_arm_mapping_symbol(sym)) return -1; /* exclude also MIPS ELF local symbols ($L123 instead of .L123) */ else if (str[0] == '$') return -1; /* include the type field in the symbol name, so that it gets * compressed together */ s->len = strlen(str) + 1; s->sym = malloc(s->len + 1); strcpy((char *)s->sym + 1, str); s->sym[0] = stype; return 0; } static int symbol_valid(struct sym_entry *s) { int offset = 1; /* skip prefix char */ if (symbol_prefix_char && *(s->sym + 1) == symbol_prefix_char) offset++; /* if --all-symbols is not specified, then symbols outside the text * and inittext sections are discarded */ if (!all_symbols) { if ((s->addr < _stext || s->addr > _etext) && (s->addr < _sinittext || s->addr > _einittext) && (s->addr < _sextratext || s->addr > _eextratext)) return 0; /* Corner case. Discard any symbols with the same value as * _etext _einittext or _eextratext; they can move between pass * 1 and 2 when the symbols data are added. If these symbols * move then they may get dropped in pass 2, which breaks the * symbols rules. 
*/ if ((s->addr == _etext && strcmp((char*)s->sym + offset, "_etext")) || (s->addr == _einittext && strcmp((char*)s->sym + offset, "_einittext")) || (s->addr == _eextratext && strcmp((char*)s->sym + offset, "_eextratext"))) return 0; } /* Exclude symbols which vary between passes. */ if (strstr((char *)s->sym + offset, "_compiled.")) return 0; return 1; } static void read_map(FILE *in) { while (!feof(in)) { if (table_cnt >= table_size) { table_size += 10000; table = realloc(table, sizeof(*table) * table_size); if (!table) { fprintf(stderr, "out of memory\n"); exit (1); } } if (read_symbol(in, &table[table_cnt]) == 0) table_cnt++; } } static void output_label(char *label) { if (symbol_prefix_char) printf(".globl %c%s\n", symbol_prefix_char, label); else printf(".globl %s\n", label); printf("\tALGN\n"); if (symbol_prefix_char) printf("%c%s:\n", symbol_prefix_char, label); else printf("%s:\n", label); } /* uncompress a compressed symbol. When this function is called, the best table * might still be compressed itself, so the function needs to be recursive */ static int expand_symbol(unsigned char *data, int len, char *result) { int c, rlen, total=0; while (len) { c = *data; /* if the table holds a single char that is the same as the one * we are looking for, then end the search */ if (best_table[c][0]==c && best_table_len[c]==1) { *result++ = c; total++; } else { /* if not, recurse and expand */ rlen = expand_symbol(best_table[c], best_table_len[c], result); total += rlen; result += rlen; } data++; len--; } *result=0; return total; } static void write_src(void) { unsigned int i, k, off; unsigned int best_idx[256]; unsigned int *markers; char buf[KSYM_NAME_LEN+1]; printf("#include \n"); printf("#include \n"); printf("#if BITS_PER_LONG == 64 && !defined(SYMBOLS_ORIGIN)\n"); printf("#define PTR .quad\n"); printf("#define ALGN .align 8\n"); printf("#else\n"); printf("#define PTR .long\n"); printf("#define ALGN .align 4\n"); printf("#endif\n"); printf("\t.section .rodata, \"a\"\n"); printf("#ifndef SYMBOLS_ORIGIN\n"); printf("#define SYMBOLS_ORIGIN 0\n"); output_label("symbols_addresses"); printf("#else\n"); output_label("symbols_offsets"); printf("#endif\n"); for (i = 0; i < table_cnt; i++) { printf("\tPTR\t%#llx - SYMBOLS_ORIGIN\n", table[i].addr); } printf("\n"); output_label("symbols_num_syms"); printf("\t.long\t%d\n", table_cnt); printf("\n"); /* table of offset markers, that give the offset in the compressed stream * every 256 symbols */ markers = (unsigned int *) malloc(sizeof(unsigned int) * ((table_cnt + 255) / 256)); output_label("symbols_names"); off = 0; for (i = 0; i < table_cnt; i++) { if ((i & 0xFF) == 0) markers[i >> 8] = off; printf("\t.byte 0x%02x", table[i].len); for (k = 0; k < table[i].len; k++) printf(", 0x%02x", table[i].sym[k]); printf("\n"); off += table[i].len + 1; } printf("\n"); output_label("symbols_markers"); for (i = 0; i < ((table_cnt + 255) >> 8); i++) printf("\t.long\t%d\n", markers[i]); printf("\n"); free(markers); output_label("symbols_token_table"); off = 0; for (i = 0; i < 256; i++) { best_idx[i] = off; expand_symbol(best_table[i], best_table_len[i], buf); printf("\t.asciz\t\"%s\"\n", buf); off += strlen(buf) + 1; } printf("\n"); output_label("symbols_token_index"); for (i = 0; i < 256; i++) printf("\t.short\t%d\n", best_idx[i]); printf("\n"); } /* table lookup compression functions */ /* count all the possible tokens in a symbol */ static void learn_symbol(unsigned char *symbol, int len) { int i; for (i = 0; i < len - 1; i++) token_profit[ symbol[i] + 
(symbol[i + 1] << 8) ]++; } /* decrease the count for all the possible tokens in a symbol */ static void forget_symbol(unsigned char *symbol, int len) { int i; for (i = 0; i < len - 1; i++) token_profit[ symbol[i] + (symbol[i + 1] << 8) ]--; } /* remove all the invalid symbols from the table and do the initial token count */ static void build_initial_tok_table(void) { unsigned int i, pos; pos = 0; for (i = 0; i < table_cnt; i++) { if ( symbol_valid(&table[i]) ) { if (pos != i) table[pos] = table[i]; learn_symbol(table[pos].sym, table[pos].len); pos++; } } table_cnt = pos; } static void *memmem_pvt(void *h, size_t hlen, void *n, size_t nlen) { char *p; for (p = h; (p - (char *)h) <= (long)(hlen - nlen); p++) if (!memcmp(p, n, nlen)) return p; return NULL; } /* replace a given token in all the valid symbols. Use the sampled symbols * to update the counts */ static void compress_symbols(unsigned char *str, int idx) { unsigned int i, len, size; unsigned char *p1, *p2; for (i = 0; i < table_cnt; i++) { len = table[i].len; p1 = table[i].sym; /* find the token on the symbol */ p2 = memmem_pvt(p1, len, str, 2); if (!p2) continue; /* decrease the counts for this symbol's tokens */ forget_symbol(table[i].sym, len); size = len; do { *p2 = idx; p2++; size -= (p2 - p1); memmove(p2, p2 + 1, size); p1 = p2; len--; if (size < 2) break; /* find the token on the symbol */ p2 = memmem_pvt(p1, size, str, 2); } while (p2); table[i].len = len; /* increase the counts for this symbol's new tokens */ learn_symbol(table[i].sym, len); } } /* search the token with the maximum profit */ static int find_best_token(void) { int i, best, bestprofit; bestprofit=-10000; best = 0; for (i = 0; i < 0x10000; i++) { if (token_profit[i] > bestprofit) { best = i; bestprofit = token_profit[i]; } } return best; } /* this is the core of the algorithm: calculate the "best" table */ static void optimize_result(void) { int i, best; /* using the '\0' symbol last allows compress_symbols to use standard * fast string functions */ for (i = 255; i >= 0; i--) { /* if this table slot is empty (it is not used by an actual * original char code */ if (!best_table_len[i]) { /* find the token with the breates profit value */ best = find_best_token(); /* place it in the "best" table */ best_table_len[i] = 2; best_table[i][0] = best & 0xFF; best_table[i][1] = (best >> 8) & 0xFF; /* replace this token in all the valid symbols */ compress_symbols(best_table[i], i); } } } /* start by placing the symbols that are actually used on the table */ static void insert_real_symbols_in_table(void) { unsigned int i, j, c; memset(best_table, 0, sizeof(best_table)); memset(best_table_len, 0, sizeof(best_table_len)); for (i = 0; i < table_cnt; i++) { for (j = 0; j < table[i].len; j++) { c = table[i].sym[j]; best_table[c][0]=c; best_table_len[c]=1; } } } static void optimize_token_table(void) { build_initial_tok_table(); insert_real_symbols_in_table(); /* When valid symbol is not registered, exit to error */ if (!table_cnt) { fprintf(stderr, "No valid symbol.\n"); exit(1); } optimize_result(); } int main(int argc, char **argv) { if (argc >= 2) { int i; for (i = 1; i < argc; i++) { if(strcmp(argv[i], "--all-symbols") == 0) all_symbols = 1; else if (strncmp(argv[i], "--symbol-prefix=", 16) == 0) { char *p = &argv[i][16]; /* skip quote */ if ((*p == '"' && *(p+2) == '"') || (*p == '\'' && *(p+2) == '\'')) p++; symbol_prefix_char = *p; } else usage(); } } else if (argc != 1) usage(); read_map(stdin); optimize_token_table(); write_src(); return 0; } 
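The symbols.c program above shrinks the symbol-name blob by mapping otherwise-unused byte codes to frequently occurring two-byte tokens, and expand_symbol() undoes that mapping recursively because a token's expansion may itself still contain tokens. Below is a minimal, self-contained sketch of just that decompression step; the table contents (byte codes 0xF7 and 0xF8 and the strings they stand for) are hypothetical, chosen to echo the "write_" example in the file's header comment, and the demo_* names do not appear anywhere in the archive.

#include <stdio.h>

static unsigned char demo_table[256][2];   /* token -> up to two bytes      */
static unsigned char demo_table_len[256];  /* how many of those bytes apply */

/* Recursive token expansion, modelled on expand_symbol() above. */
static int demo_expand(const unsigned char *data, int len, char *result)
{
	int total = 0;

	while (len--) {
		unsigned char c = *data++;

		if (demo_table_len[c] == 1 && demo_table[c][0] == c) {
			*result++ = c;           /* plain byte: copy it through */
			total++;
		} else {
			/* token: its expansion may contain further tokens */
			int rlen = demo_expand(demo_table[c],
					       demo_table_len[c], result);
			total += rlen;
			result += rlen;
		}
	}
	*result = '\0';
	return total;
}

int main(void)
{
	unsigned char sym[] = { 0xF8, 't', 'e' };  /* compressed "write" */
	char buf[64];
	int i;

	/* Every plain character expands to itself ... */
	for (i = 0; i < 256; i++) {
		demo_table[i][0] = (unsigned char)i;
		demo_table_len[i] = 1;
	}
	/* ... except the two hypothetical token slots. */
	demo_table[0xF7][0] = 'w';  demo_table[0xF7][1] = 'r';
	demo_table_len[0xF7] = 2;
	demo_table[0xF8][0] = 0xF7; demo_table[0xF8][1] = 'i';
	demo_table_len[0xF8] = 2;

	demo_expand(sym, (int)sizeof(sym), buf);
	printf("%s\n", buf);   /* prints "write" */
	return 0;
}

write_src() in symbols.c relies on the same recursion when it emits symbols_token_table, since by that point the best_table entries may themselves have been compressed.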
xen-4.4.0/xen/tools/xen.flf0000664000175000017500000006734212307313555013674 0ustar smbsmbflf2a$ 6 5 16 15 14 0 24463 229 Standard by Glenn Chappell & Ian Chai 3/93 -- based on Frank's .sig Includes ISO Latin-1 figlet release 2.1 -- 12 Aug 1994 Modified for figlet 2.2 by John Cowan to add Latin-{2,3,4,5} support (Unicode U+0100-017F). Permission is hereby given to modify this font, as long as the modifier's name is placed on a comment line. Modified by Paul Burton 12/96 to include new parameter supported by FIGlet and FIGWin. May also be slightly modified for better use of new full-width/kern/smush alternatives, but default output is NOT changed. Modified by Keir Fraser for Xen build system. $@ $@ $@ $@ $@ $@@ _ @ | |@ | |@ |_|@ (_)@ @@ _ _ @ ( | )@ V V @ $ @ $ @ @@ _ _ @ _| || |_ @ |_ .. _|@ |_ _|@ |_||_| @ @@ _ @ | | @ / __)@ \__ \@ ( /@ |_| @@ _ __@ (_)/ /@ / / @ / /_ @ /_/(_)@ @@ ___ @ ( _ ) @ / _ \/\@ | (_> <@ \___/\/@ @@ _ @ ( )@ |/ @ $ @ $ @ @@ __@ / /@ | | @ | | @ | | @ \_\@@ __ @ \ \ @ | |@ | |@ | |@ /_/ @@ @ __/\__@ \ /@ /_ _\@ \/ @ @@ @ _ @ _| |_ @ |_ _|@ |_| @ @@ @ @ @ _ @ ( )@ |/ @@ @ @ __ @ |__|@ $ @ @@ @ @ @ _ @ (_)@ @@ __@ / /@ / / @ / / @ /_/ @ @@ ___ @ / _ \ @ | | | |@ | |_| |@ \___/ @ @@ _ @ / |@ | |@ | |@ |_|@ @@ ____ @ |___ \ @ __) |@ / __/ @ |_____|@ @@ _____ @ |___ / @ |_ \ @ ___) |@ |____/ @ @@ _ _ @ | || | @ | || |_ @ |__ _|@ |_| @ @@ ____ @ | ___| @ |___ \ @ ___) |@ |____/ @ @@ __ @ / /_ @ | '_ \ @ | (_) |@ \___/ @ @@ _____ @ |___ |@ / / @ / / @ /_/ @ @@ ___ @ ( _ ) @ / _ \ @ | (_) |@ \___/ @ @@ ___ @ / _ \ @ | (_) |@ \__, |@ /_/ @ @@ @ _ @ (_)@ _ @ (_)@ @@ @ _ @ (_)@ _ @ ( )@ |/ @@ __@ / /@ / / @ \ \ @ \_\@ @@ @ _____ @ |_____|@ |_____|@ $ @ @@ __ @ \ \ @ \ \@ / /@ /_/ @ @@ ___ @ |__ \@ / /@ |_| @ (_) @ @@ ____ @ / __ \ @ / / _` |@ | | (_| |@ \ \__,_|@ \____/ @@ _ @ / \ @ / _ \ @ / ___ \ @ /_/ \_\@ @@ ____ @ | __ ) @ | _ \ @ | |_) |@ |____/ @ @@ ____ @ / ___|@ | | @ | |___ @ \____|@ @@ ____ @ | _ \ @ | | | |@ | |_| |@ |____/ @ @@ _____ @ | ____|@ | _| @ | |___ @ |_____|@ @@ _____ @ | ___|@ | |_ @ | _| @ |_| @ @@ ____ @ / ___|@ | | _ @ | |_| |@ \____|@ @@ _ _ @ | | | |@ | |_| |@ | _ |@ |_| |_|@ @@ ___ @ |_ _|@ | | @ | | @ |___|@ @@ _ @ | |@ _ | |@ | |_| |@ \___/ @ @@ _ __@ | |/ /@ | ' / @ | . 
\ @ |_|\_\@ @@ _ @ | | @ | | @ | |___ @ |_____|@ @@ __ __ @ | \/ |@ | |\/| |@ | | | |@ |_| |_|@ @@ _ _ @ | \ | |@ | \| |@ | |\ |@ |_| \_|@ @@ ___ @ / _ \ @ | | | |@ | |_| |@ \___/ @ @@ ____ @ | _ \ @ | |_) |@ | __/ @ |_| @ @@ ___ @ / _ \ @ | | | |@ | |_| |@ \__\_\@ @@ ____ @ | _ \ @ | |_) |@ | _ < @ |_| \_\@ @@ ____ @ / ___| @ \___ \ @ ___) |@ |____/ @ @@ _____ @ |_ _|@ | | @ | | @ |_| @ @@ _ _ @ | | | |@ | | | |@ | |_| |@ \___/ @ @@ __ __@ \ \ / /@ \ \ / / @ \ V / @ \_/ @ @@ __ __@ \ \ / /@ \ \ /\ / / @ \ V V / @ \_/\_/ @ @@ __ __@ \ \/ /@ \ / @ / \ @ /_/\_\@ @@ __ __@ \ \ / /@ \ V / @ | | @ |_| @ @@ _____@ |__ /@ / / @ / /_ @ /____|@ @@ __ @ | _|@ | | @ | | @ | | @ |__|@@ __ @ \ \ @ \ \ @ \ \ @ \_\@ @@ __ @ |_ |@ | |@ | |@ | |@ |__|@@ /\ @ |/\|@ $ @ $ @ $ @ @@ @ @ @ @ _____ @ |_____|@@ _ @ ( )@ \|@ $ @ $ @ @@ @ __ _ @ / _` |@ | (_| |@ \__,_|@ @@ _ @ | |__ @ | '_ \ @ | |_) |@ |_.__/ @ @@ @ ___ @ / __|@ | (__ @ \___|@ @@ _ @ __| |@ / _` |@ | (_| |@ \__,_|@ @@ @ ___ @ / _ \@ | __/@ \___|@ @@ __ @ / _|@ | |_ @ | _|@ |_| @ @@ @ __ _ @ / _` |@ | (_| |@ \__, |@ |___/ @@ _ @ | |__ @ | '_ \ @ | | | |@ |_| |_|@ @@ _ @ (_)@ | |@ | |@ |_|@ @@ _ @ (_)@ | |@ | |@ _/ |@ |__/ @@ _ @ | | __@ | |/ /@ | < @ |_|\_\@ @@ _ @ | |@ | |@ | |@ |_|@ @@ @ _ __ ___ @ | '_ ` _ \ @ | | | | | |@ |_| |_| |_|@ @@ @ _ __ @ | '_ \ @ | | | |@ |_| |_|@ @@ @ ___ @ / _ \ @ | (_) |@ \___/ @ @@ @ _ __ @ | '_ \ @ | |_) |@ | .__/ @ |_| @@ @ __ _ @ / _` |@ | (_| |@ \__, |@ |_|@@ @ _ __ @ | '__|@ | | @ |_| @ @@ @ ___ @ / __|@ \__ \@ |___/@ @@ _ @ | |_ @ | __|@ | |_ @ \__|@ @@ @ _ _ @ | | | |@ | |_| |@ \__,_|@ @@ @ __ __@ \ \ / /@ \ V / @ \_/ @ @@ @ __ __@ \ \ /\ / /@ \ V V / @ \_/\_/ @ @@ @ __ __@ \ \/ /@ > < @ /_/\_\@ @@ @ _ _ @ | | | |@ | |_| |@ \__, |@ |___/ @@ @ ____@ |_ /@ / / @ /___|@ @@ __@ / /@ | | @ < < @ | | @ \_\@@ _ @ | |@ | |@ | |@ | |@ |_|@@ __ @ \ \ @ | | @ > >@ | | @ /_/ @@ /\/|@ |/\/ @ $ @ $ @ $ @ @@ _ _ @ (_)_(_)@ /_\ @ / _ \ @ /_/ \_\@ @@ _ _ @ (_)_(_)@ / _ \ @ | |_| |@ \___/ @ @@ _ _ @ (_) (_)@ | | | |@ | |_| |@ \___/ @ @@ _ _ @ (_)_(_)@ / _` |@ | (_| |@ \__,_|@ @@ _ _ @ (_)_(_)@ / _ \ @ | (_) |@ \___/ @ @@ _ _ @ (_) (_)@ | | | |@ | |_| |@ \__,_|@ @@ ___ @ / _ \@ | |/ /@ | |\ \@ | ||_/@ |_| @@ 160 NO-BREAK SPACE $@ $@ $@ $@ $@ $@@ 161 INVERTED EXCLAMATION MARK _ @ (_)@ | |@ | |@ |_|@ @@ 162 CENT SIGN _ @ | | @ / __)@ | (__ @ \ )@ |_| @@ 163 POUND SIGN ___ @ / ,_\ @ _| |_ @ | |___ @ (_,____|@ @@ 164 CURRENCY SIGN /\___/\@ \ _ /@ | (_) |@ / ___ \@ \/ \/@ @@ 165 YEN SIGN __ __ @ \ V / @ |__ __|@ |__ __|@ |_| @ @@ 166 BROKEN BAR _ @ | |@ |_|@ _ @ | |@ |_|@@ 167 SECTION SIGN __ @ _/ _)@ / \ \ @ \ \\ \@ \ \_/@ (__/ @@ 168 DIAERESIS _ _ @ (_) (_)@ $ $ @ $ $ @ $ $ @ @@ 169 COPYRIGHT SIGN _____ @ / ___ \ @ / / __| \ @ | | (__ |@ \ \___| / @ \_____/ @@ 170 FEMININE ORDINAL INDICATOR __ _ @ / _` |@ \__,_|@ |____|@ $ @ @@ 171 LEFT-POINTING DOUBLE ANGLE QUOTATION MARK ____@ / / /@ / / / @ \ \ \ @ \_\_\@ @@ 172 NOT SIGN @ _____ @ |___ |@ |_|@ $ @ @@ 173 SOFT HYPHEN @ @ ____ @ |____|@ $ @ @@ 174 REGISTERED SIGN _____ @ / ___ \ @ / | _ \ \ @ | | / |@ \ |_|_\ / @ \_____/ @@ 175 MACRON _____ @ |_____|@ $ @ $ @ $ @ @@ 176 DEGREE SIGN __ @ / \ @ | () |@ \__/ @ $ @ @@ 177 PLUS-MINUS SIGN _ @ _| |_ @ |_ _|@ _|_|_ @ |_____|@ @@ 178 SUPERSCRIPT TWO ___ @ |_ )@ / / @ /___|@ $ @ @@ 179 SUPERSCRIPT THREE ____@ |__ /@ |_ \@ |___/@ $ @ @@ 180 ACUTE ACCENT __@ /_/@ $ @ $ @ $ @ @@ 181 MICRO SIGN @ _ _ @ | | | |@ | |_| |@ | ._,_|@ |_| @@ 182 PILCROW SIGN _____ @ / |@ | (| | |@ \__ | |@ |_|_|@ @@ 183 MIDDLE DOT @ _ @ (_)@ $ @ $ @ @@ 184 CEDILLA 
@ @ @ @ _ @ )_)@@ 185 SUPERSCRIPT ONE _ @ / |@ | |@ |_|@ $ @ @@ 186 MASCULINE ORDINAL INDICATOR ___ @ / _ \@ \___/@ |___|@ $ @ @@ 187 RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK ____ @ \ \ \ @ \ \ \@ / / /@ /_/_/ @ @@ 188 VULGAR FRACTION ONE QUARTER _ __ @ / | / / _ @ | |/ / | | @ |_/ /|_ _|@ /_/ |_| @ @@ 189 VULGAR FRACTION ONE HALF _ __ @ / | / /__ @ | |/ /_ )@ |_/ / / / @ /_/ /___|@ @@ 190 VULGAR FRACTION THREE QUARTERS ____ __ @ |__ / / / _ @ |_ \/ / | | @ |___/ /|_ _|@ /_/ |_| @ @@ 191 INVERTED QUESTION MARK _ @ (_) @ | | @ / /_ @ \___|@ @@ 192 LATIN CAPITAL LETTER A WITH GRAVE __ @ \_\ @ /_\ @ / _ \ @ /_/ \_\@ @@ 193 LATIN CAPITAL LETTER A WITH ACUTE __ @ /_/ @ /_\ @ / _ \ @ /_/ \_\@ @@ 194 LATIN CAPITAL LETTER A WITH CIRCUMFLEX //\ @ |/_\| @ /_\ @ / _ \ @ /_/ \_\@ @@ 195 LATIN CAPITAL LETTER A WITH TILDE /\/| @ |/\/ @ /_\ @ / _ \ @ /_/ \_\@ @@ 196 LATIN CAPITAL LETTER A WITH DIAERESIS _ _ @ (_)_(_)@ /_\ @ / _ \ @ /_/ \_\@ @@ 197 LATIN CAPITAL LETTER A WITH RING ABOVE _ @ (o) @ /_\ @ / _ \ @ /_/ \_\@ @@ 198 LATIN CAPITAL LETTER AE ______ @ / ____|@ / _ _| @ / __ |___ @ /_/ |_____|@ @@ 199 LATIN CAPITAL LETTER C WITH CEDILLA ____ @ / ___|@ | | @ | |___ @ \____|@ )_) @@ 200 LATIN CAPITAL LETTER E WITH GRAVE __ @ _\_\_ @ | ____|@ | _|_ @ |_____|@ @@ 201 LATIN CAPITAL LETTER E WITH ACUTE __ @ _/_/_ @ | ____|@ | _|_ @ |_____|@ @@ 202 LATIN CAPITAL LETTER E WITH CIRCUMFLEX //\ @ |/_\| @ | ____|@ | _|_ @ |_____|@ @@ 203 LATIN CAPITAL LETTER E WITH DIAERESIS _ _ @ (_)_(_)@ | ____|@ | _|_ @ |_____|@ @@ 204 LATIN CAPITAL LETTER I WITH GRAVE __ @ \_\ @ |_ _|@ | | @ |___|@ @@ 205 LATIN CAPITAL LETTER I WITH ACUTE __ @ /_/ @ |_ _|@ | | @ |___|@ @@ 206 LATIN CAPITAL LETTER I WITH CIRCUMFLEX //\ @ |/_\|@ |_ _|@ | | @ |___|@ @@ 207 LATIN CAPITAL LETTER I WITH DIAERESIS _ _ @ (_)_(_)@ |_ _| @ | | @ |___| @ @@ 208 LATIN CAPITAL LETTER ETH ____ @ | _ \ @ _| |_| |@ |__ __| |@ |____/ @ @@ 209 LATIN CAPITAL LETTER N WITH TILDE /\/|@ |/\/ @ | \| |@ | .` |@ |_|\_|@ @@ 210 LATIN CAPITAL LETTER O WITH GRAVE __ @ \_\ @ / _ \ @ | |_| |@ \___/ @ @@ 211 LATIN CAPITAL LETTER O WITH ACUTE __ @ /_/ @ / _ \ @ | |_| |@ \___/ @ @@ 212 LATIN CAPITAL LETTER O WITH CIRCUMFLEX //\ @ |/_\| @ / _ \ @ | |_| |@ \___/ @ @@ 213 LATIN CAPITAL LETTER O WITH TILDE /\/| @ |/\/ @ / _ \ @ | |_| |@ \___/ @ @@ 214 LATIN CAPITAL LETTER O WITH DIAERESIS _ _ @ (_)_(_)@ / _ \ @ | |_| |@ \___/ @ @@ 215 MULTIPLICATION SIGN @ @ /\/\@ > <@ \/\/@ @@ 216 LATIN CAPITAL LETTER O WITH STROKE ____ @ / _// @ | |// |@ | //| |@ //__/ @ @@ 217 LATIN CAPITAL LETTER U WITH GRAVE __ @ _\_\_ @ | | | |@ | |_| |@ \___/ @ @@ 218 LATIN CAPITAL LETTER U WITH ACUTE __ @ _/_/_ @ | | | |@ | |_| |@ \___/ @ @@ 219 LATIN CAPITAL LETTER U WITH CIRCUMFLEX //\ @ |/ \| @ | | | |@ | |_| |@ \___/ @ @@ 220 LATIN CAPITAL LETTER U WITH DIAERESIS _ _ @ (_) (_)@ | | | |@ | |_| |@ \___/ @ @@ 221 LATIN CAPITAL LETTER Y WITH ACUTE __ @ __/_/__@ \ \ / /@ \ V / @ |_| @ @@ 222 LATIN CAPITAL LETTER THORN _ @ | |___ @ | __ \@ | ___/@ |_| @ @@ 223 LATIN SMALL LETTER SHARP S ___ @ / _ \@ | |/ /@ | |\ \@ | ||_/@ |_| @@ 224 LATIN SMALL LETTER A WITH GRAVE __ @ \_\_ @ / _` |@ | (_| |@ \__,_|@ @@ 225 LATIN SMALL LETTER A WITH ACUTE __ @ /_/_ @ / _` |@ | (_| |@ \__,_|@ @@ 226 LATIN SMALL LETTER A WITH CIRCUMFLEX //\ @ |/_\| @ / _` |@ | (_| |@ \__,_|@ @@ 227 LATIN SMALL LETTER A WITH TILDE /\/| @ |/\/_ @ / _` |@ | (_| |@ \__,_|@ @@ 228 LATIN SMALL LETTER A WITH DIAERESIS _ _ @ (_)_(_)@ / _` |@ | (_| |@ \__,_|@ @@ 229 LATIN SMALL LETTER A WITH RING ABOVE __ @ (()) @ / _ '|@ | (_| |@ \__,_|@ @@ 230 
LATIN SMALL LETTER AE @ __ ____ @ / _` _ \@ | (_| __/@ \__,____|@ @@ 231 LATIN SMALL LETTER C WITH CEDILLA @ ___ @ / __|@ | (__ @ \___|@ )_) @@ 232 LATIN SMALL LETTER E WITH GRAVE __ @ \_\ @ / _ \@ | __/@ \___|@ @@ 233 LATIN SMALL LETTER E WITH ACUTE __ @ /_/ @ / _ \@ | __/@ \___|@ @@ 234 LATIN SMALL LETTER E WITH CIRCUMFLEX //\ @ |/_\|@ / _ \@ | __/@ \___|@ @@ 235 LATIN SMALL LETTER E WITH DIAERESIS _ _ @ (_)_(_)@ / _ \ @ | __/ @ \___| @ @@ 236 LATIN SMALL LETTER I WITH GRAVE __ @ \_\@ | |@ | |@ |_|@ @@ 237 LATIN SMALL LETTER I WITH ACUTE __@ /_/@ | |@ | |@ |_|@ @@ 238 LATIN SMALL LETTER I WITH CIRCUMFLEX //\ @ |/_\|@ | | @ | | @ |_| @ @@ 239 LATIN SMALL LETTER I WITH DIAERESIS _ _ @ (_)_(_)@ | | @ | | @ |_| @ @@ 240 LATIN SMALL LETTER ETH /\/\ @ > < @ _\/\ |@ / __` |@ \____/ @ @@ 241 LATIN SMALL LETTER N WITH TILDE /\/| @ |/\/ @ | '_ \ @ | | | |@ |_| |_|@ @@ 242 LATIN SMALL LETTER O WITH GRAVE __ @ \_\ @ / _ \ @ | (_) |@ \___/ @ @@ 243 LATIN SMALL LETTER O WITH ACUTE __ @ /_/ @ / _ \ @ | (_) |@ \___/ @ @@ 244 LATIN SMALL LETTER O WITH CIRCUMFLEX //\ @ |/_\| @ / _ \ @ | (_) |@ \___/ @ @@ 245 LATIN SMALL LETTER O WITH TILDE /\/| @ |/\/ @ / _ \ @ | (_) |@ \___/ @ @@ 246 LATIN SMALL LETTER O WITH DIAERESIS _ _ @ (_)_(_)@ / _ \ @ | (_) |@ \___/ @ @@ 247 DIVISION SIGN @ _ @ _(_)_ @ |_____|@ (_) @ @@ 248 LATIN SMALL LETTER O WITH STROKE @ ____ @ / _//\ @ | (//) |@ \//__/ @ @@ 249 LATIN SMALL LETTER U WITH GRAVE __ @ _\_\_ @ | | | |@ | |_| |@ \__,_|@ @@ 250 LATIN SMALL LETTER U WITH ACUTE __ @ _/_/_ @ | | | |@ | |_| |@ \__,_|@ @@ 251 LATIN SMALL LETTER U WITH CIRCUMFLEX //\ @ |/ \| @ | | | |@ | |_| |@ \__,_|@ @@ 252 LATIN SMALL LETTER U WITH DIAERESIS _ _ @ (_) (_)@ | | | |@ | |_| |@ \__,_|@ @@ 253 LATIN SMALL LETTER Y WITH ACUTE __ @ _/_/_ @ | | | |@ | |_| |@ \__, |@ |___/ @@ 254 LATIN SMALL LETTER THORN _ @ | |__ @ | '_ \ @ | |_) |@ | .__/ @ |_| @@ 255 LATIN SMALL LETTER Y WITH DIAERESIS _ _ @ (_) (_)@ | | | |@ | |_| |@ \__, |@ |___/ @@ 0x0100 LATIN CAPITAL LETTER A WITH MACRON ____ @ /___/ @ /_\ @ / _ \ @ /_/ \_\@ @@ 0x0101 LATIN SMALL LETTER A WITH MACRON ___ @ /_ _/@ / _` |@ | (_| |@ \__,_|@ @@ 0x0102 LATIN CAPITAL LETTER A WITH BREVE _ _ @ \\_// @ /_\ @ / _ \ @ /_/ \_\@ @@ 0x0103 LATIN SMALL LETTER A WITH BREVE \_/ @ ___ @ / _` |@ | (_| |@ \__,_|@ @@ 0x0104 LATIN CAPITAL LETTER A WITH OGONEK @ _ @ /_\ @ / _ \ @ /_/ \_\@ (_(@@ 0x0105 LATIN SMALL LETTER A WITH OGONEK @ __ _ @ / _` |@ | (_| |@ \__,_|@ (_(@@ 0x0106 LATIN CAPITAL LETTER C WITH ACUTE __ @ _/_/ @ / ___|@ | |___ @ \____|@ @@ 0x0107 LATIN SMALL LETTER C WITH ACUTE __ @ /__/@ / __|@ | (__ @ \___|@ @@ 0x0108 LATIN CAPITAL LETTER C WITH CIRCUMFLEX /\ @ _//\\@ / ___|@ | |___ @ \____|@ @@ 0x0109 LATIN SMALL LETTER C WITH CIRCUMFLEX /\ @ /_\ @ / __|@ | (__ @ \___|@ @@ 0x010A LATIN CAPITAL LETTER C WITH DOT ABOVE [] @ ____ @ / ___|@ | |___ @ \____|@ @@ 0x010B LATIN SMALL LETTER C WITH DOT ABOVE [] @ ___ @ / __|@ | (__ @ \___|@ @@ 0x010C LATIN CAPITAL LETTER C WITH CARON \\// @ _\/_ @ / ___|@ | |___ @ \____|@ @@ 0x010D LATIN SMALL LETTER C WITH CARON \\//@ _\/ @ / __|@ | (__ @ \___|@ @@ 0x010E LATIN CAPITAL LETTER D WITH CARON \\// @ __\/ @ | _ \ @ | |_| |@ |____/ @ @@ 0x010F LATIN SMALL LETTER D WITH CARON \/ _ @ __| |@ / _` |@ | (_| |@ \__,_|@ @@ 0x0110 LATIN CAPITAL LETTER D WITH STROKE ____ @ |_ __ \ @ /| |/ | |@ /|_|/_| |@ |_____/ @ @@ 0x0111 LATIN SMALL LETTER D WITH STROKE ---|@ __| |@ / _` |@ | (_| |@ \__,_|@ @@ 0x0112 LATIN CAPITAL LETTER E WITH MACRON ____ @ /___/ @ | ____|@ | _|_ @ |_____|@ @@ 0x0113 LATIN SMALL LETTER E WITH 
MACRON ____@ /_ _/@ / _ \ @ | __/ @ \___| @ @@ 0x0114 LATIN CAPITAL LETTER E WITH BREVE _ _ @ \\_// @ | ____|@ | _|_ @ |_____|@ @@ 0x0115 LATIN SMALL LETTER E WITH BREVE \\ //@ -- @ / _ \ @ | __/ @ \___| @ @@ 0x0116 LATIN CAPITAL LETTER E WITH DOT ABOVE [] @ _____ @ | ____|@ | _|_ @ |_____|@ @@ 0x0117 LATIN SMALL LETTER E WITH DOT ABOVE [] @ __ @ / _ \@ | __/@ \___|@ @@ 0x0118 LATIN CAPITAL LETTER E WITH OGONEK @ _____ @ | ____|@ | _|_ @ |_____|@ (__(@@ 0x0119 LATIN SMALL LETTER E WITH OGONEK @ ___ @ / _ \@ | __/@ \___|@ (_(@@ 0x011A LATIN CAPITAL LETTER E WITH CARON \\// @ __\/_ @ | ____|@ | _|_ @ |_____|@ @@ 0x011B LATIN SMALL LETTER E WITH CARON \\//@ \/ @ / _ \@ | __/@ \___|@ @@ 0x011C LATIN CAPITAL LETTER G WITH CIRCUMFLEX _/\_ @ / ___|@ | | _ @ | |_| |@ \____|@ @@ 0x011D LATIN SMALL LETTER G WITH CIRCUMFLEX /\ @ _/_ \@ / _` |@ | (_| |@ \__, |@ |___/ @@ 0x011E LATIN CAPITAL LETTER G WITH BREVE _\/_ @ / ___|@ | | _ @ | |_| |@ \____|@ @@ 0x011F LATIN SMALL LETTER G WITH BREVE \___/ @ __ _ @ / _` |@ | (_| |@ \__, |@ |___/ @@ 0x0120 LATIN CAPITAL LETTER G WITH DOT ABOVE _[]_ @ / ___|@ | | _ @ | |_| |@ \____|@ @@ 0x0121 LATIN SMALL LETTER G WITH DOT ABOVE [] @ __ _ @ / _` |@ | (_| |@ \__, |@ |___/ @@ 0x0122 LATIN CAPITAL LETTER G WITH CEDILLA ____ @ / ___|@ | | _ @ | |_| |@ \____|@ )__) @@ 0x0123 LATIN SMALL LETTER G WITH CEDILLA @ __ _ @ / _` |@ | (_| |@ \__, |@ |_))))@@ 0x0124 LATIN CAPITAL LETTER H WITH CIRCUMFLEX _/ \_ @ | / \ |@ | |_| |@ | _ |@ |_| |_|@ @@ 0x0125 LATIN SMALL LETTER H WITH CIRCUMFLEX _ /\ @ | |//\ @ | '_ \ @ | | | |@ |_| |_|@ @@ 0x0126 LATIN CAPITAL LETTER H WITH STROKE _ _ @ | |=| |@ | |_| |@ | _ |@ |_| |_|@ @@ 0x0127 LATIN SMALL LETTER H WITH STROKE _ @ |=|__ @ | '_ \ @ | | | |@ |_| |_|@ @@ 0x0128 LATIN CAPITAL LETTER I WITH TILDE /\//@ |_ _|@ | | @ | | @ |___|@ @@ 0x0129 LATIN SMALL LETTER I WITH TILDE @ /\/@ | |@ | |@ |_|@ @@ 0x012A LATIN CAPITAL LETTER I WITH MACRON /___/@ |_ _|@ | | @ | | @ |___|@ @@ 0x012B LATIN SMALL LETTER I WITH MACRON ____@ /___/@ | | @ | | @ |_| @ @@ 0x012C LATIN CAPITAL LETTER I WITH BREVE \__/@ |_ _|@ | | @ | | @ |___|@ @@ 0x012D LATIN SMALL LETTER I WITH BREVE @ \_/@ | |@ | |@ |_|@ @@ 0x012E LATIN CAPITAL LETTER I WITH OGONEK ___ @ |_ _|@ | | @ | | @ |___|@ (__(@@ 0x012F LATIN SMALL LETTER I WITH OGONEK _ @ (_) @ | | @ | | @ |_|_@ (_(@@ 0x0130 LATIN CAPITAL LETTER I WITH DOT ABOVE _[] @ |_ _|@ | | @ | | @ |___|@ @@ 0x0131 LATIN SMALL LETTER DOTLESS I @ _ @ | |@ | |@ |_|@ @@ 0x0132 LATIN CAPITAL LIGATURE IJ ___ _ @ |_ _|| |@ | | | |@ | |_| |@ |__|__/ @ @@ 0x0133 LATIN SMALL LIGATURE IJ _ _ @ (_) (_)@ | | | |@ | | | |@ |_|_/ |@ |__/ @@ 0x0134 LATIN CAPITAL LETTER J WITH CIRCUMFLEX /\ @ /_\|@ _ | | @ | |_| | @ \___/ @ @@ 0x0135 LATIN SMALL LETTER J WITH CIRCUMFLEX /\@ /_\@ | |@ | |@ _/ |@ |__/ @@ 0x0136 LATIN CAPITAL LETTER K WITH CEDILLA _ _ @ | |/ / @ | ' / @ | . 
\ @ |_|\_\ @ )__)@@ 0x0137 LATIN SMALL LETTER K WITH CEDILLA _ @ | | __@ | |/ /@ | < @ |_|\_\@ )_)@@ 0x0138 LATIN SMALL LETTER KRA @ _ __ @ | |/ \@ | < @ |_|\_\@ @@ 0x0139 LATIN CAPITAL LETTER L WITH ACUTE _ //@ | | // @ | | @ | |___ @ |_____|@ @@ 0x013A LATIN SMALL LETTER L WITH ACUTE //@ | |@ | |@ | |@ |_|@ @@ 0x013B LATIN CAPITAL LETTER L WITH CEDILLA _ @ | | @ | | @ | |___ @ |_____|@ )__)@@ 0x013C LATIN SMALL LETTER L WITH CEDILLA _ @ | | @ | | @ | | @ |_| @ )_)@@ 0x013D LATIN CAPITAL LETTER L WITH CARON _ \\//@ | | \/ @ | | @ | |___ @ |_____|@ @@ 0x013E LATIN SMALL LETTER L WITH CARON _ \\//@ | | \/ @ | | @ | | @ |_| @ @@ 0x013F LATIN CAPITAL LETTER L WITH MIDDLE DOT _ @ | | @ | | [] @ | |___ @ |_____|@ @@ 0x0140 LATIN SMALL LETTER L WITH MIDDLE DOT _ @ | | @ | | []@ | | @ |_| @ @@ 0x0141 LATIN CAPITAL LETTER L WITH STROKE __ @ | // @ |//| @ // |__ @ |_____|@ @@ 0x0142 LATIN SMALL LETTER L WITH STROKE _ @ | |@ |//@ //|@ |_|@ @@ 0x0143 LATIN CAPITAL LETTER N WITH ACUTE _/ /_ @ | \ | |@ | \| |@ | |\ |@ |_| \_|@ @@ 0x0144 LATIN SMALL LETTER N WITH ACUTE _ @ _ /_/ @ | '_ \ @ | | | |@ |_| |_|@ @@ 0x0145 LATIN CAPITAL LETTER N WITH CEDILLA _ _ @ | \ | |@ | \| |@ | |\ |@ |_| \_|@ )_) @@ 0x0146 LATIN SMALL LETTER N WITH CEDILLA @ _ __ @ | '_ \ @ | | | |@ |_| |_|@ )_) @@ 0x0147 LATIN CAPITAL LETTER N WITH CARON _\/ _ @ | \ | |@ | \| |@ | |\ |@ |_| \_|@ @@ 0x0148 LATIN SMALL LETTER N WITH CARON \\// @ _\/_ @ | '_ \ @ | | | |@ |_| |_|@ @@ 0x0149 LATIN SMALL LETTER N PRECEDED BY APOSTROPHE @ _ __ @ ( )| '_\ @ |/| | | |@ |_| |_|@ @@ 0x014A LATIN CAPITAL LETTER ENG _ _ @ | \ | |@ | \| |@ | |\ |@ |_| \ |@ )_)@@ 0x014B LATIN SMALL LETTER ENG _ __ @ | '_ \ @ | | | |@ |_| | |@ | |@ |__ @@ 0x014C LATIN CAPITAL LETTER O WITH MACRON ____ @ /_ _/ @ / _ \ @ | (_) |@ \___/ @ @@ 0x014D LATIN SMALL LETTER O WITH MACRON ____ @ /_ _/ @ / _ \ @ | (_) |@ \___/ @ @@ 0x014E LATIN CAPITAL LETTER O WITH BREVE \ / @ _-_ @ / _ \ @ | |_| |@ \___/ @ @@ 0x014F LATIN SMALL LETTER O WITH BREVE \ / @ _-_ @ / _ \ @ | |_| |@ \___/ @ @@ 0x0150 LATIN CAPITAL LETTER O WITH DOUBLE ACUTE ___ @ /_/_/@ / _ \ @ | |_| |@ \___/ @ @@ 0x0151 LATIN SMALL LETTER O WITH DOUBLE ACUTE ___ @ /_/_/@ / _ \ @ | |_| |@ \___/ @ @@ 0x0152 LATIN CAPITAL LIGATURE OE ___ ___ @ / _ \| __|@ | | | | | @ | |_| | |__@ \___/|____@ @@ 0x0153 LATIN SMALL LIGATURE OE @ ___ ___ @ / _ \ / _ \@ | (_) | __/@ \___/ \___|@ @@ 0x0154 LATIN CAPITAL LETTER R WITH ACUTE _/_/ @ | _ \ @ | |_) |@ | _ < @ |_| \_\@ @@ 0x0155 LATIN SMALL LETTER R WITH ACUTE __@ _ /_/@ | '__|@ | | @ |_| @ @@ 0x0156 LATIN CAPITAL LETTER R WITH CEDILLA ____ @ | _ \ @ | |_) |@ | _ < @ |_| \_\@ )_) @@ 0x0157 LATIN SMALL LETTER R WITH CEDILLA @ _ __ @ | '__|@ | | @ |_| @ )_) @@ 0x0158 LATIN CAPITAL LETTER R WITH CARON _\_/ @ | _ \ @ | |_) |@ | _ < @ |_| \_\@ @@ 0x0159 LATIN SMALL LETTER R WITH CARON \\// @ _\/_ @ | '__|@ | | @ |_| @ @@ 0x015A LATIN CAPITAL LETTER S WITH ACUTE _/_/ @ / ___| @ \___ \ @ ___) |@ |____/ @ @@ 0x015B LATIN SMALL LETTER S WITH ACUTE __@ _/_/@ / __|@ \__ \@ |___/@ @@ 0x015C LATIN CAPITAL LETTER S WITH CIRCUMFLEX _/\_ @ / ___| @ \___ \ @ ___) |@ |____/ @ @@ 0x015D LATIN SMALL LETTER S WITH CIRCUMFLEX @ /_\_@ / __|@ \__ \@ |___/@ @@ 0x015E LATIN CAPITAL LETTER S WITH CEDILLA ____ @ / ___| @ \___ \ @ ___) |@ |____/ @ )__)@@ 0x015F LATIN SMALL LETTER S WITH CEDILLA @ ___ @ / __|@ \__ \@ |___/@ )_)@@ 0x0160 LATIN CAPITAL LETTER S WITH CARON _\_/ @ / ___| @ \___ \ @ ___) |@ |____/ @ @@ 0x0161 LATIN SMALL LETTER S WITH CARON \\//@ _\/ @ / __|@ \__ \@ |___/@ @@ 0x0162 LATIN CAPITAL 
LETTER T WITH CEDILLA _____ @ |_ _|@ | | @ | | @ |_| @ )__)@@ 0x0163 LATIN SMALL LETTER T WITH CEDILLA _ @ | |_ @ | __|@ | |_ @ \__|@ )_)@@ 0x0164 LATIN CAPITAL LETTER T WITH CARON _____ @ |_ _|@ | | @ | | @ |_| @ @@ 0x0165 LATIN SMALL LETTER T WITH CARON \/ @ | |_ @ | __|@ | |_ @ \__|@ @@ 0x0166 LATIN CAPITAL LETTER T WITH STROKE _____ @ |_ _|@ | | @ -|-|- @ |_| @ @@ 0x0167 LATIN SMALL LETTER T WITH STROKE _ @ | |_ @ | __|@ |-|_ @ \__|@ @@ 0x0168 LATIN CAPITAL LETTER U WITH TILDE @ _/\/_ @ | | | |@ | |_| |@ \___/ @ @@ 0x0169 LATIN SMALL LETTER U WITH TILDE @ _/\/_ @ | | | |@ | |_| |@ \__,_|@ @@ 0x016A LATIN CAPITAL LETTER U WITH MACRON ____ @ /__ _/@ | | | |@ | |_| |@ \___/ @ @@ 0x016B LATIN SMALL LETTER U WITH MACRON ____ @ / _ /@ | | | |@ | |_| |@ \__,_|@ @@ 0x016C LATIN CAPITAL LETTER U WITH BREVE @ \_/_ @ | | | |@ | |_| |@ \____|@ @@ 0x016D LATIN SMALL LETTER U WITH BREVE @ \_/_ @ | | | |@ | |_| |@ \__,_|@ @@ 0x016E LATIN CAPITAL LETTER U WITH RING ABOVE O @ __ _ @ | | | |@ | |_| |@ \___/ @ @@ 0x016F LATIN SMALL LETTER U WITH RING ABOVE O @ __ __ @ | | | |@ | |_| |@ \__,_|@ @@ 0x0170 LATIN CAPITAL LETTER U WITH DOUBLE ACUTE -- --@ /_//_/@ | | | |@ | |_| |@ \___/ @ @@ 0x0171 LATIN SMALL LETTER U WITH DOUBLE ACUTE ____@ _/_/_/@ | | | |@ | |_| |@ \__,_|@ @@ 0x0172 LATIN CAPITAL LETTER U WITH OGONEK _ _ @ | | | |@ | | | |@ | |_| |@ \___/ @ (__(@@ 0x0173 LATIN SMALL LETTER U WITH OGONEK @ _ _ @ | | | |@ | |_| |@ \__,_|@ (_(@@ 0x0174 LATIN CAPITAL LETTER W WITH CIRCUMFLEX __ /\ __@ \ \ //\\/ /@ \ \ /\ / / @ \ V V / @ \_/\_/ @ @@ 0x0175 LATIN SMALL LETTER W WITH CIRCUMFLEX /\ @ __ //\\__@ \ \ /\ / /@ \ V V / @ \_/\_/ @ @@ 0x0176 LATIN CAPITAL LETTER Y WITH CIRCUMFLEX /\ @ __//\\ @ \ \ / /@ \ V / @ |_| @ @@ 0x0177 LATIN SMALL LETTER Y WITH CIRCUMFLEX /\ @ //\\ @ | | | |@ | |_| |@ \__, |@ |___/ @@ 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS [] []@ __ _@ \ \ / /@ \ V / @ |_| @ @@ 0x0179 LATIN CAPITAL LETTER Z WITH ACUTE __/_/@ |__ /@ / / @ / /_ @ /____|@ @@ 0x017A LATIN SMALL LETTER Z WITH ACUTE _ @ _/_/@ |_ /@ / / @ /___|@ @@ 0x017B LATIN CAPITAL LETTER Z WITH DOT ABOVE __[]_@ |__ /@ / / @ / /_ @ /____|@ @@ 0x017C LATIN SMALL LETTER Z WITH DOT ABOVE [] @ ____@ |_ /@ / / @ /___|@ @@ 0x017D LATIN CAPITAL LETTER Z WITH CARON _\_/_@ |__ /@ / / @ / /_ @ /____|@ @@ 0x017E LATIN SMALL LETTER Z WITH CARON \\//@ _\/_@ |_ /@ / / @ /___|@ @@ 0x017F LATIN SMALL LETTER LONG S __ @ / _|@ |-| | @ |-| | @ |_| @ @@ 0x02C7 CARON \\//@ \/ @ $@ $@ $@ $@@ 0x02D8 BREVE \\_//@ \_/ @ $@ $@ $@ $@@ 0x02D9 DOT ABOVE []@ $@ $@ $@ $@ $@@ 0x02DB OGONEK $@ $@ $@ $@ $@ )_) @@ 0x02DD DOUBLE ACUTE ACCENT _ _ @ /_/_/@ $@ $@ $@ $@@ xen-4.4.0/xen/arch/0000775000175000017500000000000012307313555012152 5ustar smbsmbxen-4.4.0/xen/arch/arm/0000775000175000017500000000000012307313555012731 5ustar smbsmbxen-4.4.0/xen/arch/arm/domctl.c0000664000175000017500000000270012307313555014356 0ustar smbsmb/****************************************************************************** * Arch-specific domctl.c * * Copyright (c) 2012, Citrix Systems */ #include #include #include #include #include #include #include long arch_do_domctl(struct xen_domctl *domctl, struct domain *d, XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) { switch ( domctl->cmd ) { case XEN_DOMCTL_cacheflush: { unsigned long s = domctl->u.cacheflush.start_pfn; unsigned long e = s + domctl->u.cacheflush.nr_pfns; if ( domctl->u.cacheflush.nr_pfns > (1U<user_regs; vcpu_regs_hyp_to_user(v, regs); ctxt->sctlr = v->arch.sctlr; ctxt->ttbr0 = v->arch.ttbr0; ctxt->ttbr1 = v->arch.ttbr1; 
ctxt->ttbcr = v->arch.ttbcr; if ( !test_bit(_VPF_down, &v->pause_flags) ) ctxt->flags |= VGCF_online; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/Makefile0000664000175000017500000000467312307313555014403 0ustar smbsmbsubdir-$(arm32) += arm32 subdir-$(arm64) += arm64 subdir-y += platforms obj-$(EARLY_PRINTK) += early_printk.o obj-y += cpu.o obj-y += domain.o obj-y += psci.o obj-y += vpsci.o obj-y += domctl.o obj-y += sysctl.o obj-y += domain_build.o obj-y += gic.o obj-y += io.o obj-y += irq.o obj-y += kernel.o obj-y += mm.o obj-y += p2m.o obj-y += percpu.o obj-y += guestcopy.o obj-y += physdev.o obj-y += platform.o obj-y += setup.o obj-y += time.o obj-y += smpboot.o obj-y += smp.o obj-y += shutdown.o obj-y += traps.o obj-y += vgic.o obj-y += vtimer.o obj-y += vuart.o obj-y += hvm.o obj-y += device.o obj-y += decode.o #obj-bin-y += ....o ifdef CONFIG_DTB_FILE obj-y += dtb.o AFLAGS += -DCONFIG_DTB_FILE=\"$(CONFIG_DTB_FILE)\" endif ALL_OBJS := $(TARGET_SUBARCH)/head.o $(ALL_OBJS) $(TARGET): $(TARGET)-syms $(TARGET).axf $(OBJCOPY) -O binary -S $< $@ $(TARGET).axf: $(TARGET)-syms # XXX: VE model loads by VMA so instead of # making a proper ELF we link with LMA == VMA and adjust crudely $(OBJCOPY) --change-addresses +0x80000000 $< $@ $(STRIP) $@ ifeq ($(lto),y) # Gather all LTO objects together prelink_lto.o: $(ALL_OBJS) $(LD_LTO) -r -o $@ $^ # Link it with all the binary objects prelink.o: $(patsubst %/built_in.o,%/built_in_bin.o,$(ALL_OBJS)) prelink_lto.o $(LD) $(LDFLAGS) -r -o $@ $^ else prelink.o: $(ALL_OBJS) $(LD) $(LDFLAGS) -r -o $@ $^ endif $(BASEDIR)/common/symbols-dummy.o: $(MAKE) -f $(BASEDIR)/Rules.mk -C $(BASEDIR)/common symbols-dummy.o $(TARGET)-syms: prelink.o xen.lds $(BASEDIR)/common/symbols-dummy.o $(LD) $(LDFLAGS) -T xen.lds -N prelink.o \ $(BASEDIR)/common/symbols-dummy.o -o $(@D)/.$(@F).0 $(NM) -n $(@D)/.$(@F).0 | $(BASEDIR)/tools/symbols >$(@D)/.$(@F).0.S $(MAKE) -f $(BASEDIR)/Rules.mk $(@D)/.$(@F).0.o $(LD) $(LDFLAGS) -T xen.lds -N prelink.o \ $(@D)/.$(@F).0.o -o $(@D)/.$(@F).1 $(NM) -n $(@D)/.$(@F).1 | $(BASEDIR)/tools/symbols >$(@D)/.$(@F).1.S $(MAKE) -f $(BASEDIR)/Rules.mk $(@D)/.$(@F).1.o $(LD) $(LDFLAGS) -T xen.lds -N prelink.o \ $(@D)/.$(@F).1.o -o $@ rm -f $(@D)/.$(@F).[0-9]* asm-offsets.s: $(TARGET_SUBARCH)/asm-offsets.c $(CC) $(filter-out -flto,$(CFLAGS)) -S -o $@ $< xen.lds: xen.lds.S $(CC) -P -E -Ui386 $(AFLAGS) -DXEN_PHYS_START=$(CONFIG_LOAD_ADDRESS) -o $@ $< sed -e 's/xen\.lds\.o:/xen\.lds:/g' <.xen.lds.d >.xen.lds.d.new mv -f .xen.lds.d.new .xen.lds.d dtb.o: $(CONFIG_DTB_FILE) .PHONY: clean clean:: rm -f asm-offsets.s xen.lds rm -f $(BASEDIR)/.xen-syms.[0-9]* rm -f $(TARGET).axf xen-4.4.0/xen/arch/arm/platforms/0000775000175000017500000000000012307313555014740 5ustar smbsmbxen-4.4.0/xen/arch/arm/platforms/Makefile0000664000175000017500000000027512307313555016404 0ustar smbsmbobj-y += vexpress.o obj-$(CONFIG_ARM_32) += exynos5.o obj-$(CONFIG_ARM_32) += midway.o obj-$(CONFIG_ARM_32) += omap5.o obj-$(CONFIG_ARM_32) += sunxi.o obj-$(CONFIG_ARM_64) += xgene-storm.o xen-4.4.0/xen/arch/arm/platforms/vexpress.c0000664000175000017500000001142512307313555016766 0ustar smbsmb/* * xen/arch/arm/platform_vexpress.c * * Versatile Express specific settings * * Stefano Stabellini * Copyright (c) 2013 Citrix Systems. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #define DCC_SHIFT 26 #define FUNCTION_SHIFT 20 #define SITE_SHIFT 16 #define POSITION_SHIFT 12 #define DEVICE_SHIFT 0 static inline int vexpress_ctrl_start(uint32_t *syscfg, int write, int function, int device) { int dcc = 0; /* DCC to access */ int site = 0; /* motherboard */ int position = 0; /* daughterboard */ uint32_t stat; /* set control register */ syscfg[V2M_SYS_CFGCTRL/4] = V2M_SYS_CFG_START | (write ? V2M_SYS_CFG_WRITE : 0) | (dcc << DCC_SHIFT) | (function << FUNCTION_SHIFT) | (site << SITE_SHIFT) | (position << POSITION_SHIFT) | (device << DEVICE_SHIFT); /* wait for complete flag to be set */ do { stat = syscfg[V2M_SYS_CFGSTAT/4]; dsb(); } while ( !(stat & V2M_SYS_CFG_COMPLETE) ); /* check error status and return error flag if set */ if ( stat & V2M_SYS_CFG_ERROR ) { printk(KERN_ERR "V2M SYS_CFGSTAT reported a configuration error\n"); return -1; } return 0; } int vexpress_syscfg(int write, int function, int device, uint32_t *data) { uint32_t *syscfg = (uint32_t *) FIXMAP_ADDR(FIXMAP_MISC); int ret = -1; set_fixmap(FIXMAP_MISC, V2M_SYS_MMIO_BASE >> PAGE_SHIFT, DEV_SHARED); if ( syscfg[V2M_SYS_CFGCTRL/4] & V2M_SYS_CFG_START ) goto out; /* clear the complete bit in the V2M_SYS_CFGSTAT status register */ syscfg[V2M_SYS_CFGSTAT/4] = 0; if ( write ) { /* write data */ syscfg[V2M_SYS_CFGDATA/4] = *data; if ( vexpress_ctrl_start(syscfg, write, function, device) < 0 ) goto out; } else { if ( vexpress_ctrl_start(syscfg, write, function, device) < 0 ) goto out; else /* read data */ *data = syscfg[V2M_SYS_CFGDATA/4]; } ret = 0; out: clear_fixmap(FIXMAP_MISC); return ret; } /* * TODO: Get base address from the device tree * See arm,vexpress-reset node */ static void vexpress_reset(void) { void __iomem *sp810; /* Use the SP810 system controller to force a reset */ sp810 = ioremap_nocache(SP810_ADDRESS, PAGE_SIZE); if ( !sp810 ) { dprintk(XENLOG_ERR, "Unable to map SP810\n"); return; } /* switch to slow mode */ writel(0x3, sp810); dsb(); isb(); /* writing any value to SCSYSSTAT reg will reset the system */ writel(0x1, sp810 + 4); dsb(); isb(); iounmap(sp810); } #ifdef CONFIG_ARM_32 static int __init vexpress_smp_init(void) { void __iomem *sysflags; sysflags = ioremap_nocache(V2M_SYS_MMIO_BASE, PAGE_SIZE); if ( !sysflags ) { dprintk(XENLOG_ERR, "Unable to map vexpress MMIO\n"); return -EFAULT; } printk("Set SYS_FLAGS to %"PRIpaddr" (%p)\n", __pa(init_secondary), init_secondary); writel(~0, sysflags + V2M_SYS_FLAGSCLR); writel(__pa(init_secondary), sysflags + V2M_SYS_FLAGSSET); iounmap(sysflags); return 0; } #endif static const char * const vexpress_dt_compat[] __initconst = { "arm,vexpress", NULL }; static const struct dt_device_match vexpress_blacklist_dev[] __initconst = { /* Cache Coherent Interconnect */ DT_MATCH_COMPATIBLE("arm,cci-400"), DT_MATCH_COMPATIBLE("arm,cci-400-pmu"), /* Video device * TODO: remove it once memreserve is handled properly by Xen */ DT_MATCH_COMPATIBLE("arm,hdlcd"), /* Hardware power management */ 
DT_MATCH_COMPATIBLE("arm,vexpress-reset"), DT_MATCH_COMPATIBLE("arm,vexpress-reboot"), DT_MATCH_COMPATIBLE("arm,vexpress-shutdown"), { /* sentinel */ }, }; PLATFORM_START(vexpress, "VERSATILE EXPRESS") .compatible = vexpress_dt_compat, #ifdef CONFIG_ARM_32 .smp_init = vexpress_smp_init, .cpu_up = cpu_up_send_sgi, #endif .reset = vexpress_reset, .blacklist_dev = vexpress_blacklist_dev, PLATFORM_END /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/platforms/exynos5.c0000664000175000017500000000656712307313555016534 0ustar smbsmb/* * xen/arch/arm/platforms/exynos5.c * * Exynos5 specific settings * * Julien Grall * Copyright (c) 2013 Linaro Limited. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include static int exynos5_init_time(void) { uint32_t reg; void __iomem *mct; BUILD_BUG_ON(EXYNOS5_MCT_G_TCON >= PAGE_SIZE); mct = ioremap_attr(EXYNOS5_MCT_BASE, PAGE_SIZE, PAGE_HYPERVISOR_NOCACHE); if ( !mct ) { dprintk(XENLOG_ERR, "Unable to map MCT\n"); return -ENOMEM; } /* Enable timer on Exynos 5250 should probably be done by u-boot */ reg = readl(mct + EXYNOS5_MCT_G_TCON); writel(reg | EXYNOS5_MCT_G_TCON_START, mct + EXYNOS5_MCT_G_TCON); iounmap(mct); return 0; } /* Additional mappings for dom0 (Not in the DTS) */ static int exynos5_specific_mapping(struct domain *d) { /* Map the chip ID */ map_mmio_regions(d, EXYNOS5_PA_CHIPID, EXYNOS5_PA_CHIPID + PAGE_SIZE - 1, EXYNOS5_PA_CHIPID); /* Map the PWM region */ map_mmio_regions(d, EXYNOS5_PA_TIMER, EXYNOS5_PA_TIMER + (PAGE_SIZE * 2) - 1, EXYNOS5_PA_TIMER); return 0; } static int __init exynos5_smp_init(void) { void __iomem *sysram; sysram = ioremap_nocache(S5P_PA_SYSRAM, PAGE_SIZE); if ( !sysram ) { dprintk(XENLOG_ERR, "Unable to map exynos5 MMIO\n"); return -EFAULT; } printk("Set SYSRAM to %"PRIpaddr" (%p)\n", __pa(init_secondary), init_secondary); writel(__pa(init_secondary), sysram); iounmap(sysram); return 0; } static void exynos5_reset(void) { void __iomem *pmu; BUILD_BUG_ON(EXYNOS5_SWRESET >= PAGE_SIZE); pmu = ioremap_nocache(EXYNOS5_PA_PMU, PAGE_SIZE); if ( !pmu ) { dprintk(XENLOG_ERR, "Unable to map PMU\n"); return; } writel(1, pmu + EXYNOS5_SWRESET); iounmap(pmu); } static const char * const exynos5_dt_compat[] __initconst = { "samsung,exynos5250", NULL }; static const struct dt_device_match exynos5_blacklist_dev[] __initconst = { /* Multi core Timer * TODO: this device set up IRQ to CPU 1 which is not yet handled by Xen. * This is result to random freeze. 
*/ DT_MATCH_COMPATIBLE("samsung,exynos4210-mct"), { /* sentinel */ }, }; PLATFORM_START(exynos5, "SAMSUNG EXYNOS5") .compatible = exynos5_dt_compat, .init_time = exynos5_init_time, .specific_mapping = exynos5_specific_mapping, .smp_init = exynos5_smp_init, .cpu_up = cpu_up_send_sgi, .reset = exynos5_reset, .blacklist_dev = exynos5_blacklist_dev, PLATFORM_END /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/platforms/midway.c0000664000175000017500000000321012307313555016372 0ustar smbsmb/* * xen/arch/arm/platforms/midway.c * * Calxeda Midway specific settings * * Andre Przywara * Copyright (c) 2013 Linaro Limited. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include static void midway_reset(void) { void __iomem *pmu; BUILD_BUG_ON((MW_SREG_PWR_REQ & PAGE_MASK) != (MW_SREG_A15_PWR_CTRL & PAGE_MASK)); pmu = ioremap_nocache(MW_SREG_PWR_REQ & PAGE_MASK, PAGE_SIZE); if ( !pmu ) { dprintk(XENLOG_ERR, "Unable to map PMU\n"); return; } writel(MW_PWR_HARD_RESET, pmu + (MW_SREG_PWR_REQ & ~PAGE_MASK)); writel(1, pmu + (MW_SREG_A15_PWR_CTRL & ~PAGE_MASK)); iounmap(pmu); } static const char * const midway_dt_compat[] __initconst = { "calxeda,ecx-2000", NULL }; PLATFORM_START(midway, "CALXEDA MIDWAY") .compatible = midway_dt_compat, .reset = midway_reset, .dom0_gnttab_start = 0xff800000, .dom0_gnttab_size = 0x20000, PLATFORM_END /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/platforms/xgene-storm.c0000664000175000017500000001113412307313555017354 0ustar smbsmb/* * xen/arch/arm/platforms/xgene-storm.c * * Applied Micro's X-Gene specific settings * * Pranavkumar Sawargaonkar * Anup Patel * Copyright (c) 2013 Applied Micro. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include /* XGENE RESET Specific defines */ #define XGENE_RESET_ADDR 0x17000014UL #define XGENE_RESET_SIZE 0x100 #define XGENE_RESET_MASK 0x1 /* Variables to save reset address of soc during platform initialization. 
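* Until the reset controller can be described in the device tree (see the
* TBD note in xgene_storm_init() below) they are filled in from the
* XGENE_RESET_* constants defined above.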
*/ static u64 reset_addr, reset_size; static u32 reset_mask; static bool reset_vals_valid = false; static uint32_t xgene_storm_quirks(void) { return PLATFORM_QUIRK_GIC_64K_STRIDE; } static int map_one_mmio(struct domain *d, const char *what, paddr_t start, paddr_t end) { int ret; printk("Additional MMIO %"PRIpaddr"-%"PRIpaddr" (%s)\n", start, end, what); ret = map_mmio_regions(d, start, end, start); if ( ret ) printk("Failed to map %s @ %"PRIpaddr" to dom%d\n", what, start, d->domain_id); return ret; } static int map_one_spi(struct domain *d, const char *what, unsigned int spi, unsigned int type) { struct dt_irq irq; int ret; irq.type = type; irq.irq = spi + 32; /* SPIs start at IRQ 32 */ printk("Additional IRQ %u (%s)\n", irq.irq, what); ret = gic_route_irq_to_guest(d, &irq, what); if ( ret ) printk("Failed to route %s to dom%d\n", what, d->domain_id); return ret; } /* * Xen does not currently support mapping MMIO regions and interrupt * for bus child devices (referenced via the "ranges" and * "interrupt-map" properties to domain 0). Instead for now map the * necessary resources manually. */ static int xgene_storm_specific_mapping(struct domain *d) { int ret; /* Map the PCIe bus resources */ ret = map_one_mmio(d, "PCI MEM REGION", 0xe000000000UL, 0xe010000000UL); if ( ret ) goto err; ret = map_one_mmio(d, "PCI IO REGION", 0xe080000000UL, 0xe080010000UL); if ( ret ) goto err; ret = map_one_mmio(d, "PCI CFG REGION", 0xe0d0000000UL, 0xe0d0200000UL); if ( ret ) goto err; ret = map_one_mmio(d, "PCI MSI REGION", 0xe010000000UL, 0xe010800000UL); if ( ret ) goto err; ret = map_one_spi(d, "PCI#INTA", 0xc2, DT_IRQ_TYPE_LEVEL_HIGH); if ( ret ) goto err; ret = map_one_spi(d, "PCI#INTB", 0xc3, DT_IRQ_TYPE_LEVEL_HIGH); if ( ret ) goto err; ret = map_one_spi(d, "PCI#INTC", 0xc4, DT_IRQ_TYPE_LEVEL_HIGH); if ( ret ) goto err; ret = map_one_spi(d, "PCI#INTD", 0xc5, DT_IRQ_TYPE_LEVEL_HIGH); if ( ret ) goto err; ret = 0; err: return ret; } static void xgene_storm_reset(void) { void __iomem *addr; if ( !reset_vals_valid ) { printk("XGENE: Invalid reset values, can not reset XGENE...\n"); return; } addr = ioremap_nocache(reset_addr, reset_size); if ( !addr ) { printk("XGENE: Unable to map xgene reset address, can not reset XGENE...\n"); return; } /* Write reset mask to base address */ writel(reset_mask, addr); iounmap(addr); } static int xgene_storm_init(void) { /* TBD: Once Linux side device tree bindings are finalized retrieve * these values from dts. */ reset_addr = XGENE_RESET_ADDR; reset_size = XGENE_RESET_SIZE; reset_mask = XGENE_RESET_MASK; reset_vals_valid = true; return 0; } static const char * const xgene_storm_dt_compat[] __initconst = { "apm,xgene-storm", NULL }; PLATFORM_START(xgene_storm, "APM X-GENE STORM") .compatible = xgene_storm_dt_compat, .init = xgene_storm_init, .reset = xgene_storm_reset, .quirks = xgene_storm_quirks, .specific_mapping = xgene_storm_specific_mapping, .dom0_evtchn_ppi = 24, .dom0_gnttab_start = 0x1f800000, .dom0_gnttab_size = 0x20000, PLATFORM_END /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/platforms/sunxi.c0000664000175000017500000000252612307313555016257 0ustar smbsmb/* * xen/arch/arm/platforms/sunxi.c * * SUNXI (AllWinner A20/A31) specific settings * * Copyright (c) 2013 Citrix Systems. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include static const char * const sunxi_dt_compat[] __initconst = { "allwinner,sun7i-a20", NULL }; static const struct dt_device_match sunxi_blacklist_dev[] __initconst = { /* * The UARTs share a page which runs the risk of mapping the Xen console * UART to dom0, so don't map any of them. */ DT_MATCH_COMPATIBLE("snps,dw-apb-uart"), { /* sentinel */ }, }; PLATFORM_START(sunxi, "Allwinner A20") .compatible = sunxi_dt_compat, .blacklist_dev = sunxi_blacklist_dev, .dom0_gnttab_start = 0x01d00000, .dom0_gnttab_size = 0x20000, PLATFORM_END /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/platforms/omap5.c0000664000175000017500000001170612307313555016132 0ustar smbsmb/* * xen/arch/arm/platforms/omap5.c * * OMAP5 specific settings * * Chen Baozi * Copyright (c) 2013 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include static uint16_t num_den[8][2] = { { 0, 0 }, /* not used */ { 26 * 64, 26 * 125 }, /* 12.0 Mhz */ { 2 * 768, 2 * 1625 }, /* 13.0 Mhz */ { 0, 0 }, /* not used */ { 130 * 8, 130 * 25 }, /* 19.2 Mhz */ { 2 * 384, 2 * 1625 }, /* 26.0 Mhz */ { 3 * 256, 3 * 1125 }, /* 27.0 Mhz */ { 130 * 4, 130 * 25 }, /* 38.4 Mhz */ }; /* * The realtime counter also called master counter, is a free-running * counter, which is related to real time. It produces the count used * by the CPU local timer peripherals in the MPU cluster. The timer counts * at a rate of 6.144 MHz. Because the device operates on different clocks * in different power modes, the master counter shifts operation between * clocks, adjusting the increment per clock in hardware accordingly to * maintain a constant count rate. 
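* For example, with a 19.2 MHz system clock the num_den[] table above selects
* num/den = (130*8)/(130*25) = 8/25, and 19.2 MHz * 8/25 = 6.144 MHz; each of
* the other supported input clocks reduces to the same 6.144 MHz count rate.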
*/ static int omap5_init_time(void) { void __iomem *ckgen_prm_base; void __iomem *rt_ct_base; unsigned int sys_clksel; unsigned int num, den, frac1, frac2; ckgen_prm_base = ioremap_attr(OMAP5_CKGEN_PRM_BASE, 0x20, PAGE_HYPERVISOR_NOCACHE); if ( !ckgen_prm_base ) { dprintk(XENLOG_ERR, "%s: PRM_BASE ioremap failed\n", __func__); return -ENOMEM; } sys_clksel = readl(ckgen_prm_base + OMAP5_CM_CLKSEL_SYS) & ~SYS_CLKSEL_MASK; iounmap(ckgen_prm_base); rt_ct_base = ioremap_attr(REALTIME_COUNTER_BASE, 0x20, PAGE_HYPERVISOR_NOCACHE); if ( !rt_ct_base ) { dprintk(XENLOG_ERR, "%s: REALTIME_COUNTER_BASE ioremap failed\n", __func__); return -ENOMEM; } frac1 = readl(rt_ct_base + INCREMENTER_NUMERATOR_OFFSET); num = frac1 & ~NUMERATOR_DENUMERATOR_MASK; if ( num_den[sys_clksel][0] != num ) { frac1 &= NUMERATOR_DENUMERATOR_MASK; frac1 |= num_den[sys_clksel][0]; } frac2 = readl(rt_ct_base + INCREMENTER_DENUMERATOR_RELOAD_OFFSET); den = frac2 & ~NUMERATOR_DENUMERATOR_MASK; if ( num_den[sys_clksel][1] != num ) { frac2 &= NUMERATOR_DENUMERATOR_MASK; frac2 |= num_den[sys_clksel][1]; } writel(frac1, rt_ct_base + INCREMENTER_NUMERATOR_OFFSET); writel(frac2 | PRM_FRAC_INCREMENTER_DENUMERATOR_RELOAD, rt_ct_base + INCREMENTER_DENUMERATOR_RELOAD_OFFSET); iounmap(rt_ct_base); return 0; } /* Additional mappings for dom0 (not in the DTS) */ static int omap5_specific_mapping(struct domain *d) { /* Map the PRM module */ map_mmio_regions(d, OMAP5_PRM_BASE, OMAP5_PRM_BASE + (PAGE_SIZE * 2) - 1, OMAP5_PRM_BASE); /* Map the PRM_MPU */ map_mmio_regions(d, OMAP5_PRCM_MPU_BASE, OMAP5_PRCM_MPU_BASE + PAGE_SIZE - 1, OMAP5_PRCM_MPU_BASE); /* Map the Wakeup Gen */ map_mmio_regions(d, OMAP5_WKUPGEN_BASE, OMAP5_WKUPGEN_BASE + PAGE_SIZE - 1, OMAP5_WKUPGEN_BASE); /* Map the on-chip SRAM */ map_mmio_regions(d, OMAP5_SRAM_PA, OMAP5_SRAM_PA + (PAGE_SIZE * 32) - 1, OMAP5_SRAM_PA); return 0; } static int __init omap5_smp_init(void) { void __iomem *wugen_base; wugen_base = ioremap_nocache(OMAP5_WKUPGEN_BASE, PAGE_SIZE); if ( !wugen_base ) { dprintk(XENLOG_ERR, "Unable to map omap5 MMIO\n"); return -EFAULT; } printk("Set AuxCoreBoot1 to %"PRIpaddr" (%p)\n", __pa(init_secondary), init_secondary); writel(__pa(init_secondary), wugen_base + OMAP_AUX_CORE_BOOT_1_OFFSET); printk("Set AuxCoreBoot0 to 0x20\n"); writel(0x20, wugen_base + OMAP_AUX_CORE_BOOT_0_OFFSET); iounmap(wugen_base); return 0; } static const char const *omap5_dt_compat[] __initconst = { "ti,omap5", NULL }; PLATFORM_START(omap5, "TI OMAP5") .compatible = omap5_dt_compat, .init_time = omap5_init_time, .specific_mapping = omap5_specific_mapping, .smp_init = omap5_smp_init, .cpu_up = cpu_up_send_sgi, .dom0_gnttab_start = 0x4b000000, .dom0_gnttab_size = 0x20000, PLATFORM_END /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm64/0000775000175000017500000000000012307313555013662 5ustar smbsmbxen-4.4.0/xen/arch/arm/arm64/domctl.c0000664000175000017500000000231412307313555015310 0ustar smbsmb/****************************************************************************** * Subarch-specific domctl.c * * Copyright (c) 2013, Citrix Systems */ #include #include #include #include #include #include #include static long switch_mode(struct domain *d, enum domain_type type) { if ( d == NULL ) return -EINVAL; if ( d->tot_pages != 0 ) return -EBUSY; if ( d->arch.type == type ) return 0; d->arch.type = type; return 0; } long subarch_do_domctl(struct xen_domctl *domctl, struct domain *d, XEN_GUEST_HANDLE_PARAM(xen_domctl_t) 
u_domctl) { switch ( domctl->cmd ) { case XEN_DOMCTL_set_address_size: switch ( domctl->u.address_size.size ) { case 32: return switch_mode(d, DOMAIN_PV32); case 64: return switch_mode(d, DOMAIN_PV64); default: return -EINVAL; } break; default: return -ENOSYS; } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm64/asm-offsets.c0000664000175000017500000000332312307313555016256 0ustar smbsmb/* * Generate definitions needed by assembly language modules. * This code generates raw asm output which is post-processed * to extract and format the required data. */ #define COMPILE_OFFSETS #include #include #include #include #include #include #define DEFINE(_sym, _val) \ __asm__ __volatile__ ( "\n->" #_sym " %0 " #_val : : "i" (_val) ) #define BLANK() \ __asm__ __volatile__ ( "\n->" : : ) #define OFFSET(_sym, _str, _mem) \ DEFINE(_sym, offsetof(_str, _mem)); void __dummy__(void) { OFFSET(UREGS_X0, struct cpu_user_regs, x0); OFFSET(UREGS_LR, struct cpu_user_regs, lr); OFFSET(UREGS_SP, struct cpu_user_regs, sp); OFFSET(UREGS_PC, struct cpu_user_regs, pc); OFFSET(UREGS_CPSR, struct cpu_user_regs, cpsr); OFFSET(UREGS_SPSR_el1, struct cpu_user_regs, spsr_el1); OFFSET(UREGS_SPSR_fiq, struct cpu_user_regs, spsr_fiq); OFFSET(UREGS_SPSR_irq, struct cpu_user_regs, spsr_irq); OFFSET(UREGS_SPSR_und, struct cpu_user_regs, spsr_und); OFFSET(UREGS_SPSR_abt, struct cpu_user_regs, spsr_abt); OFFSET(UREGS_SP_el0, struct cpu_user_regs, sp_el0); OFFSET(UREGS_SP_el1, struct cpu_user_regs, sp_el1); OFFSET(UREGS_ELR_el1, struct cpu_user_regs, elr_el1); OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, spsr_el1); DEFINE(UREGS_user_sizeof, sizeof(struct cpu_user_regs)); BLANK(); DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info)); OFFSET(VCPU_arch_saved_context, struct vcpu, arch.saved_context); BLANK(); OFFSET(INITINFO_stack, struct init_info, stack); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm64/Makefile0000664000175000017500000000023212307313555015317 0ustar smbsmbsubdir-y += lib obj-y += entry.o obj-y += traps.o obj-y += domain.o obj-y += vfp.o obj-y += smpboot.o obj-y += domctl.o obj-$(EARLY_PRINTK) += debug.o xen-4.4.0/xen/arch/arm/arm64/debug.S0000664000175000017500000000240212307313555015072 0ustar smbsmb/* * xen/arch/arm/arm64/debug.S * * Wrapper for early printk * * Julien Grall * Copyright (c) 2013 Linaro Limited. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #include #include #ifdef EARLY_PRINTK_INC #include EARLY_PRINTK_INC #endif /* Print a character on the UART - this function is called by C * x0: character to print */ GLOBAL(early_putch) ldr x15, =EARLY_UART_VIRTUAL_ADDRESS early_uart_ready x15, 1 early_uart_transmit x15, w0 ret /* Flush the UART - this function is called by C */ GLOBAL(early_flush) ldr x15, =EARLY_UART_VIRTUAL_ADDRESS /* x15 := VA UART base address */ early_uart_ready x15, 1 ret /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm64/domain.c0000664000175000017500000000317612307313555015304 0ustar smbsmb#include #include #include #include #include /* C(hyp,user), hyp is Xen internal name, user is user API name. */ #define ALLREGS \ C(x0,x0); C(x1,x1); C(x2,x2); C(x3,x3); \ C(x4,x4); C(x5,x5); C(x6,x6); C(x7,x7); \ C(x8,x8); C(x9,x9); C(x10,x10); C(x11,x11); \ C(x12,x12); C(x13,x13); C(x14,x14); C(x15,x15); \ C(x16,x16); C(x17,x17); C(x18,x18); C(x19,x19); \ C(x20,x20); C(x21,x21); C(x22,x22); C(x23,x23); \ C(x24,x24); C(x25,x25); C(x26,x26); C(x27,x27); \ C(x28,x28); C(fp,x29); C(lr,x30); C(pc,pc64); \ C(cpsr, cpsr); C(spsr_el1, spsr_el1) #define ALLREGS32 C(spsr_fiq, spsr_fiq); C(spsr_irq,spsr_irq); \ C(spsr_und,spsr_und); C(spsr_abt,spsr_abt) #define ALLREGS64 C(sp_el0,sp_el0); C(sp_el1,sp_el1); C(elr_el1,elr_el1) void vcpu_regs_hyp_to_user(const struct vcpu *vcpu, struct vcpu_guest_core_regs *regs) { #define C(hyp,user) regs->user = vcpu->arch.cpu_info->guest_cpu_user_regs.hyp ALLREGS; if ( is_pv32_domain(vcpu->domain) ) { ALLREGS32; } else { ALLREGS64; } #undef C } void vcpu_regs_user_to_hyp(struct vcpu *vcpu, const struct vcpu_guest_core_regs *regs) { #define C(hyp,user) vcpu->arch.cpu_info->guest_cpu_user_regs.hyp = regs->user ALLREGS; if ( is_pv32_domain(vcpu->domain) ) { ALLREGS32; } else { ALLREGS64; } #undef C } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm64/lib/0000775000175000017500000000000012307313555014430 5ustar smbsmbxen-4.4.0/xen/arch/arm/arm64/lib/Makefile0000664000175000017500000000004212307313555016064 0ustar smbsmbobj-y += bitops.o find_next_bit.o xen-4.4.0/xen/arch/arm/arm64/lib/bitops.S0000664000175000017500000000323112307313555016053 0ustar smbsmb/* * Based on linux/arch/arm64/lib/bitops.h which in turn is * Based on arch/arm/lib/bitops.h * * Copyright (C) 2013 ARM Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include /* * x0: bits 4:0 bit offset * bits 31:5 word offset * x1: address */ .macro bitop, name, instr ENTRY( \name ) and w3, w0, #31 // Get bit offset eor w0, w0, w3 // Clear low bits mov x2, #1 add x1, x1, x0, lsr #3 // Get word offset lsl x3, x2, x3 // Create mask 1: ldxr w2, [x1] \instr w2, w2, w3 stxr w0, w2, [x1] cbnz w0, 1b ret ENDPROC(\name ) .endm .macro testop, name, instr ENTRY( \name ) and w3, w0, #31 // Get bit offset eor w0, w0, w3 // Clear low bits mov x2, #1 add x1, x1, x0, lsr #3 // Get word offset lsl x4, x2, x3 // Create mask 1: ldaxr w2, [x1] lsr w0, w2, w3 // Save old value of bit \instr w2, w2, w4 // toggle bit stlxr w5, w2, [x1] cbnz w5, 1b and w0, w0, #1 3: ret ENDPROC(\name ) .endm /* * Atomic bit operations. */ bitop change_bit, eor bitop clear_bit, bic bitop set_bit, orr testop test_and_change_bit, eor testop test_and_clear_bit, bic testop test_and_set_bit, orr xen-4.4.0/xen/arch/arm/arm64/lib/find_next_bit.c0000664000175000017500000001460212307313555017413 0ustar smbsmb/* find_next_bit.c: fallback find next bit implementation * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #include #include #include #include #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) #ifndef find_next_bit /* * Find the next set bit in a memory region. */ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; if (offset >= size) return size; size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp &= (~0UL << offset); if (size < BITS_PER_LONG) goto found_first; if (tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG-1)) { if ((tmp = *(p++))) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = *p; found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. */ found_middle: return result + __ffs(tmp); } EXPORT_SYMBOL(find_next_bit); #endif #ifndef find_next_zero_bit /* * This implementation of find_{first,next}_zero_bit was stolen from * Linus' asm-alpha/bitops.h. */ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; if (offset >= size) return size; size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp |= ~0UL >> (BITS_PER_LONG - offset); if (size < BITS_PER_LONG) goto found_first; if (~tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG-1)) { if (~(tmp = *(p++))) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = *p; found_first: tmp |= ~0UL << size; if (tmp == ~0UL) /* Are any bits zero? */ return result + size; /* Nope. */ found_middle: return result + ffz(tmp); } EXPORT_SYMBOL(find_next_zero_bit); #endif #ifndef find_first_bit /* * Find the first set bit in a memory region. 
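* Returns the index of the first set bit, or 'size' if no bits are set.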
*/ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { const unsigned long *p = addr; unsigned long result = 0; unsigned long tmp; while (size & ~(BITS_PER_LONG-1)) { if ((tmp = *(p++))) goto found; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = (*p) & (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. */ found: return result + __ffs(tmp); } EXPORT_SYMBOL(find_first_bit); #endif #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. */ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { const unsigned long *p = addr; unsigned long result = 0; unsigned long tmp; while (size & ~(BITS_PER_LONG-1)) { if (~(tmp = *(p++))) goto found; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = (*p) | (~0UL << size); if (tmp == ~0UL) /* Are any bits zero? */ return result + size; /* Nope. */ found: return result + ffz(tmp); } EXPORT_SYMBOL(find_first_zero_bit); #endif #ifdef __BIG_ENDIAN /* include/linux/byteorder does not support "unsigned long" type */ static inline unsigned long ext2_swabp(const unsigned long * x) { #if BITS_PER_LONG == 64 return (unsigned long) __swab64p((u64 *) x); #elif BITS_PER_LONG == 32 return (unsigned long) __swab32p((u32 *) x); #else #error BITS_PER_LONG not defined #endif } /* include/linux/byteorder doesn't support "unsigned long" type */ static inline unsigned long ext2_swab(const unsigned long y) { #if BITS_PER_LONG == 64 return (unsigned long) __swab64((u64) y); #elif BITS_PER_LONG == 32 return (unsigned long) __swab32((u32) y); #else #error BITS_PER_LONG not defined #endif } #ifndef find_next_zero_bit_le unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr; unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; if (offset >= size) return size; p += BITOP_WORD(offset); size -= result; offset &= (BITS_PER_LONG - 1UL); if (offset) { tmp = ext2_swabp(p++); tmp |= (~0UL >> (BITS_PER_LONG - offset)); if (size < BITS_PER_LONG) goto found_first; if (~tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG - 1)) { if (~(tmp = *(p++))) goto found_middle_swap; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = ext2_swabp(p); found_first: tmp |= ~0UL << size; if (tmp == ~0UL) /* Are any bits zero? */ return result + size; /* Nope. Skip ffz */ found_middle: return result + ffz(tmp); found_middle_swap: return result + ffz(ext2_swab(tmp)); } EXPORT_SYMBOL(find_next_zero_bit_le); #endif #ifndef find_next_bit_le unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr; unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; if (offset >= size) return size; p += BITOP_WORD(offset); size -= result; offset &= (BITS_PER_LONG - 1UL); if (offset) { tmp = ext2_swabp(p++); tmp &= (~0UL << offset); if (size < BITS_PER_LONG) goto found_first; if (tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG - 1)) { tmp = *(p++); if (tmp) goto found_middle_swap; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = ext2_swabp(p); found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. 
*/ found_middle: return result + __ffs(tmp); found_middle_swap: return result + __ffs(ext2_swab(tmp)); } EXPORT_SYMBOL(find_next_bit_le); #endif #endif /* __BIG_ENDIAN */ xen-4.4.0/xen/arch/arm/arm64/entry.S0000664000175000017500000001752112307313555015155 0ustar smbsmb#include #include #include #include /* * Register aliases. */ lr .req x30 // link register /* * Stack pushing/popping (register pairs only). Equivalent to store decrement * before, load increment after. */ .macro push, xreg1, xreg2 stp \xreg1, \xreg2, [sp, #-16]! .endm .macro pop, xreg1, xreg2 ldp \xreg1, \xreg2, [sp], #16 .endm /* * Save/restore guest mode specific state, outer stack frame */ .macro entry_guest, compat add x21, sp, #UREGS_SPSR_el1 mrs x23, SPSR_el1 str x23, [x21] .if \compat == 0 /* Aarch64 mode */ add x21, sp, #UREGS_SP_el0 mrs x22, SP_el0 str x22, [x21] add x21, sp, #UREGS_SP_el1 mrs x22, SP_el1 mrs x23, ELR_el1 stp x22, x23, [x21] .else /* Aarch32 mode */ add x21, sp, #UREGS_SPSR_fiq mrs x22, SPSR_fiq mrs x23, SPSR_irq stp w22, w23, [x21] add x21, sp, #UREGS_SPSR_und mrs x22, SPSR_und mrs x23, SPSR_abt stp w22, w23, [x21] .endif .endm .macro exit_guest, compat add x21, sp, #UREGS_SPSR_el1 ldr x23, [x21] msr SPSR_el1, x23 .if \compat == 0 /* Aarch64 mode */ add x21, sp, #UREGS_SP_el0 ldr x22, [x21] msr SP_el0, x22 add x21, sp, #UREGS_SP_el1 ldp x22, x23, [x21] msr SP_el1, x22 msr ELR_el1, x23 .else /* Aarch32 mode */ add x21, sp, #UREGS_SPSR_fiq ldp w22, w23, [x21] msr SPSR_fiq, x22 msr SPSR_irq, x23 add x21, sp, #UREGS_SPSR_und ldp w22, w23, [x21] msr SPSR_und, x22 msr SPSR_abt, x23 .endif .endm /* * Save state on entry to hypervisor, restore on exit */ .macro entry, hyp, compat sub sp, sp, #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */ push x28, x29 push x26, x27 push x24, x25 push x22, x23 push x20, x21 push x18, x19 push x16, x17 push x14, x15 push x12, x13 push x10, x11 push x8, x9 push x6, x7 push x4, x5 push x2, x3 push x0, x1 .if \hyp == 1 /* Hypervisor mode */ add x21, sp, #UREGS_kernel_sizeof .else /* Guest mode */ entry_guest \compat mov x21, ~0 /* sp only valid for hyp frame XXX */ .endif stp lr, x21, [sp, #UREGS_LR] mrs x22, elr_el2 mrs x23, spsr_el2 stp x22, x23, [sp, #UREGS_PC] .endm .macro exit, hyp, compat .if \hyp == 0 /* Guest mode */ bl leave_hypervisor_tail /* Disables interrupts on return */ exit_guest \compat .endif b return_from_trap .endm /* * Bad Abort numbers *----------------- */ #define BAD_SYNC 0 #define BAD_IRQ 1 #define BAD_FIQ 2 #define BAD_ERROR 3 .macro invalid, reason mov x0, sp mov x1, #\reason b do_bad_mode .endm hyp_sync_invalid: entry hyp=1 invalid BAD_SYNC hyp_irq_invalid: entry hyp=1 invalid BAD_IRQ hyp_fiq_invalid: entry hyp=1 invalid BAD_FIQ hyp_error_invalid: entry hyp=1 invalid BAD_ERROR /* Traps taken in Current EL with SP_ELx */ hyp_sync: entry hyp=1 msr daifclr, #2 mov x0, sp bl do_trap_hypervisor exit hyp=1 hyp_irq: entry hyp=1 mov x0, sp bl do_trap_irq exit hyp=1 guest_sync: entry hyp=0, compat=0 msr daifclr, #2 mov x0, sp bl do_trap_hypervisor exit hyp=0, compat=0 guest_irq: entry hyp=0, compat=0 mov x0, sp bl do_trap_irq exit hyp=0, compat=0 guest_fiq_invalid: entry hyp=0, compat=0 invalid BAD_FIQ guest_error_invalid: entry hyp=0, compat=0 invalid BAD_ERROR guest_sync_compat: entry hyp=0, compat=1 msr daifclr, #2 mov x0, sp bl do_trap_hypervisor exit hyp=0, compat=1 guest_irq_compat: entry hyp=0, compat=1 mov x0, sp bl do_trap_irq exit hyp=0, compat=1 guest_fiq_invalid_compat: entry hyp=0, compat=1 invalid BAD_FIQ guest_error_invalid_compat: entry 
hyp=0, compat=1 invalid BAD_ERROR ENTRY(return_to_new_vcpu32) exit hyp=0, compat=1 ENTRY(return_to_new_vcpu64) exit hyp=0, compat=0 return_from_trap: msr daifset, #2 /* Mask interrupts */ ldp x21, x22, [sp, #UREGS_PC] // load ELR, SPSR pop x0, x1 pop x2, x3 pop x4, x5 pop x6, x7 pop x8, x9 msr elr_el2, x21 // set up the return data msr spsr_el2, x22 pop x10, x11 pop x12, x13 pop x14, x15 pop x16, x17 pop x18, x19 pop x20, x21 pop x22, x23 pop x24, x25 pop x26, x27 pop x28, x29 ldr lr, [sp], #(UREGS_SPSR_el1 - UREGS_LR) /* CPSR, PC, SP, LR */ eret /* * Exception vectors. */ .macro ventry label .align 7 b \label .endm .align 11 ENTRY(hyp_traps_vector) ventry hyp_sync_invalid // Synchronous EL2t ventry hyp_irq_invalid // IRQ EL2t ventry hyp_fiq_invalid // FIQ EL2t ventry hyp_error_invalid // Error EL2t ventry hyp_sync // Synchronous EL2h ventry hyp_irq // IRQ EL2h ventry hyp_fiq_invalid // FIQ EL2h ventry hyp_error_invalid // Error EL2h ventry guest_sync // Synchronous 64-bit EL0/EL1 ventry guest_irq // IRQ 64-bit EL0/EL1 ventry guest_fiq_invalid // FIQ 64-bit EL0/EL1 ventry guest_error_invalid // Error 64-bit EL0/EL1 ventry guest_sync_compat // Synchronous 32-bit EL0/EL1 ventry guest_irq_compat // IRQ 32-bit EL0/EL1 ventry guest_fiq_invalid_compat // FIQ 32-bit EL0/EL1 ventry guest_error_invalid_compat // Error 32-bit EL0/EL1 /* * struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next) * * x0 - prev * x1 - next * * Returns prev in x0 */ ENTRY(__context_switch) add x8, x0, #VCPU_arch_saved_context mov x9, sp stp x19, x20, [x8], #16 // store callee-saved registers stp x21, x22, [x8], #16 stp x23, x24, [x8], #16 stp x25, x26, [x8], #16 stp x27, x28, [x8], #16 stp x29, x9, [x8], #16 str lr, [x8] add x8, x1, #VCPU_arch_saved_context ldp x19, x20, [x8], #16 // restore callee-saved registers ldp x21, x22, [x8], #16 ldp x23, x24, [x8], #16 ldp x25, x26, [x8], #16 ldp x27, x28, [x8], #16 ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 ret /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm64/vfp.c0000664000175000017500000000444612307313555014631 0ustar smbsmb#include #include #include #include void vfp_save_state(struct vcpu *v) { if ( !cpu_has_fp ) return; asm volatile("stp q0, q1, [%1, #16 * 0]\n\t" "stp q2, q3, [%1, #16 * 2]\n\t" "stp q4, q5, [%1, #16 * 4]\n\t" "stp q6, q7, [%1, #16 * 6]\n\t" "stp q8, q9, [%1, #16 * 8]\n\t" "stp q10, q11, [%1, #16 * 10]\n\t" "stp q12, q13, [%1, #16 * 12]\n\t" "stp q14, q15, [%1, #16 * 14]\n\t" "stp q16, q17, [%1, #16 * 16]\n\t" "stp q18, q19, [%1, #16 * 18]\n\t" "stp q20, q21, [%1, #16 * 20]\n\t" "stp q22, q23, [%1, #16 * 22]\n\t" "stp q24, q25, [%1, #16 * 24]\n\t" "stp q26, q27, [%1, #16 * 26]\n\t" "stp q28, q29, [%1, #16 * 28]\n\t" "stp q30, q31, [%1, #16 * 30]\n\t" : "=Q" (*v->arch.vfp.fpregs) : "r" (v->arch.vfp.fpregs)); v->arch.vfp.fpsr = READ_SYSREG32(FPSR); v->arch.vfp.fpcr = READ_SYSREG32(FPCR); v->arch.vfp.fpexc32_el2 = READ_SYSREG32(FPEXC32_EL2); } void vfp_restore_state(struct vcpu *v) { if ( !cpu_has_fp ) return; asm volatile("ldp q0, q1, [%1, #16 * 0]\n\t" "ldp q2, q3, [%1, #16 * 2]\n\t" "ldp q4, q5, [%1, #16 * 4]\n\t" "ldp q6, q7, [%1, #16 * 6]\n\t" "ldp q8, q9, [%1, #16 * 8]\n\t" "ldp q10, q11, [%1, #16 * 10]\n\t" "ldp q12, q13, [%1, #16 * 12]\n\t" "ldp q14, q15, [%1, #16 * 14]\n\t" "ldp q16, q17, [%1, #16 * 16]\n\t" "ldp q18, q19, [%1, #16 * 18]\n\t" "ldp q20, q21, [%1, #16 * 20]\n\t" "ldp q22, q23, [%1, #16 * 22]\n\t" "ldp q24, q25, [%1, #16 * 24]\n\t" "ldp q26, q27, [%1, #16 * 26]\n\t" "ldp 
q28, q29, [%1, #16 * 28]\n\t" "ldp q30, q31, [%1, #16 * 30]\n\t" : : "Q" (*v->arch.vfp.fpregs), "r" (v->arch.vfp.fpregs)); WRITE_SYSREG32(v->arch.vfp.fpsr, FPSR); WRITE_SYSREG32(v->arch.vfp.fpcr, FPCR); WRITE_SYSREG32(v->arch.vfp.fpexc32_el2, FPEXC32_EL2); } xen-4.4.0/xen/arch/arm/arm64/debug-pl011.inc0000664000175000017500000000374712307313555016311 0ustar smbsmb/* * xen/arch/arm/arm64/debug-pl011.S * * PL011 specific debug code * * Copyright (c) 2013 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include /* PL011 UART initialization * xb: register which containts the UART base address * c: scratch register number */ .macro early_uart_init xb, c mov x\c, #(7372800 / EARLY_PRINTK_BAUD % 16) strh w\c, [\xb, #0x28] /* -> UARTFBRD (Baud divisor fraction) */ mov x\c, #(7372800 / EARLY_PRINTK_BAUD / 16) strh w\c, [\xb, #0x24] /* -> UARTIBRD (Baud divisor integer) */ mov x\c, #0x60 /* 8n1 */ str w\c, [\xb, #0x2C] /* -> UARTLCR_H (Line control) */ ldr x\c, =0x00000301 /* RXE | TXE | UARTEN */ str w\c, [\xb, #0x30] /* -> UARTCR (Control Register) */ .endm /* PL011 UART wait UART to be ready to transmit * xb: register which contains the UART base address * c: scratch register number */ .macro early_uart_ready xb, c 1: ldrh w\c, [\xb, #0x18] /* <- UARTFR (Flag register) */ tst w\c, #0x8 /* Check BUSY bit */ b.ne 1b /* Wait for the UART to be ready */ .endm /* PL011 UART transmit character * xb: register which contains the UART base address * wt: register which contains the character to transmit */ .macro early_uart_transmit xb, wt strb \wt, [\xb] /* -> UARTDR (Data Register) */ .endm /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm64/head.S0000664000175000017500000004332412307313555014715 0ustar smbsmb/* * xen/arch/arm/head.S * * Start-of-day code for an ARMv8. * * Ian Campbell * Copyright (c) 2012 Citrix Systems. * * Based on ARMv7-A head.S by * Tim Deegan * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
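 *
 * In outline, the code below: branches from the Linux-style image header
 * to real_start (secondary CPUs enter via init_secondary instead), meets
 * at common_start, derives the CPU id from MPIDR_EL1, optionally brings
 * up the early UART, builds boot page tables that contain both a 1:1
 * mapping of the load address and the XEN_VIRT_START mapping, enables the
 * MMU and D-cache, and finally sets up a stack and jumps into C via
 * start_xen (boot CPU) or start_secondary (secondaries).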
*/ #include #include #include #include #define PT_PT 0xe7f /* nG=1 AF=1 SH=10 AP=01 NS=1 ATTR=111 T=1 P=1 */ #define PT_MEM 0xe7d /* nG=1 AF=1 SH=10 AP=01 NS=1 ATTR=111 T=0 P=1 */ #define PT_DEV 0xe71 /* nG=1 AF=1 SH=10 AP=01 NS=1 ATTR=100 T=0 P=1 */ #define PT_DEV_L3 0xe73 /* nG=1 AF=1 SH=10 AP=01 NS=1 ATTR=100 T=1 P=1 */ #if (defined (EARLY_PRINTK)) && (defined (EARLY_PRINTK_INC)) #include EARLY_PRINTK_INC #endif /* * Common register usage in this file: * x0 - * x1 - * x2 - * x3 - * x4 - * x5 - * x6 - * x7 - * x8 - * x9 - * x10 - * x11 - * x12 - * x13 - * x14 - * x15 - * x16 - * x17 - * x18 - * x19 - paddr(start) * x20 - phys offset * x21 - DTB address (boot cpu only) * x22 - is_secondary_cpu * x23 - UART address * x24 - cpuid * x25 - * x26 - * x27 - * x28 - * x29 - * x30 - lr */ /* Macro to print a string to the UART, if there is one. * Clobbers x0-x3. */ #ifdef EARLY_PRINTK #define PRINT(_s) \ adr x0, 98f ; \ bl puts ; \ b 99f ; \ 98: .asciz _s ; \ .align 2 ; \ 99: #else /* EARLY_PRINTK */ #define PRINT(s) #endif /* !EARLY_PRINTK */ /*.aarch64*/ /* * Kernel startup entry point. * --------------------------- * * The requirements are: * MMU = off, D-cache = off, I-cache = on or off, * x0 = physical address to the FDT blob. * * This must be the very first address in the loaded image. * It should be linked at XEN_VIRT_START, and loaded at any * 2MB-aligned address. All of text+data+bss must fit in 2MB, * or the initial pagetable code below will need adjustment. */ .global start start: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ b real_start /* branch to kernel start, magic */ .long 0 /* reserved */ .quad 0 /* Image load offset from start of RAM */ .quad 0 /* reserved */ .quad 0 /* reserved */ .quad 0 /* reserved */ .quad 0 /* reserved */ .quad 0 /* reserved */ .byte 0x41 /* Magic number, "ARM\x64" */ .byte 0x52 .byte 0x4d .byte 0x64 .word 0 /* reserved */ real_start: msr DAIFSet, 0xf /* Disable all interrupts */ /* Save the bootloader arguments in less-clobberable registers */ mov x21, x0 /* x21 := DTB, physical address */ /* Find out where we are */ ldr x0, =start adr x19, start /* x19 := paddr (start) */ sub x20, x19, x0 /* x20 := phys-offset */ /* Using the DTB in the .dtb section? */ #ifdef CONFIG_DTB_FILE ldr x21, =_sdtb add x21, x21, x20 /* x21 := paddr(DTB) */ #endif mov x22, #0 /* x22 := is_secondary_cpu */ b common_start GLOBAL(init_secondary) msr DAIFSet, 0xf /* Disable all interrupts */ /* Find out where we are */ ldr x0, =start adr x19, start /* x19 := paddr (start) */ sub x20, x19, x0 /* x20 := phys-offset */ mov x22, #1 /* x22 := is_secondary_cpu */ common_start: mov x24, #0 /* x24 := CPU ID. Initialy zero until we * find that multiprocessor extensions are * present and the system is SMP */ mrs x0, mpidr_el1 tbz x0, _MPIDR_SMP, 1f /* Multiprocessor extension not supported? */ tbnz x0, _MPIDR_UP, 1f /* Uniprocessor system? 
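 *
 * If this test or the multiprocessor-extension test before it branches to
 * 1:, x24 is left at zero and the CPU is treated as CPU 0; otherwise the
 * affinity fields of MPIDR_EL1 (the bits kept by MPIDR_HWID_MASK) become
 * the CPU id. Secondary CPUs then wait in the wfe loop further down until
 * their id appears in smp_up_cpu.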
*/ mov x13, #(~MPIDR_HWID_MASK) bic x24, x0, x13 /* Mask out flags to get CPU ID */ 1: /* Non-boot CPUs wait here until __cpu_up is ready for them */ cbz x22, 1f ldr x0, =smp_up_cpu add x0, x0, x20 /* Apply physical offset */ dsb sy 2: ldr x1, [x0] cmp x1, x24 beq 1f wfe b 2b 1: #ifdef EARLY_PRINTK ldr x23, =EARLY_UART_BASE_ADDRESS /* x23 := UART base address */ cbnz x22, 1f bl init_uart /* Boot CPU sets up the UART too */ 1: PRINT("- CPU ") mov x0, x24 bl putn PRINT(" booting -\r\n") #endif PRINT("- Current EL ") mrs x4, CurrentEL mov x0, x4 bl putn PRINT(" -\r\n") /* Are we in EL2 */ cmp x4, #PSR_MODE_EL2t ccmp x4, #PSR_MODE_EL2h, #0x4, ne b.eq el2 /* Yes */ /* OK, we're boned. */ PRINT("- Xen must be entered in NS EL2 mode -\r\n" \ "- Please update the bootloader -\r\n") b fail el2: PRINT("- Xen starting at EL2 -\r\n") /* Zero BSS On the boot CPU to avoid nasty surprises */ cbnz x22, skip_bss PRINT("- Zero BSS -\r\n") ldr x0, =__bss_start /* Load start & end of bss */ ldr x1, =__bss_end add x0, x0, x20 /* Apply physical offset */ add x1, x1, x20 1: str xzr, [x0], #8 cmp x0, x1 b.lo 1b skip_bss: PRINT("- Setting up control registers -\r\n") /* XXXX call PROCINFO_cpu_init here */ /* Set up memory attribute type tables */ ldr x0, =MAIRVAL msr mair_el2, x0 /* Set up the HTCR: * PASize -- 40 bits / 1TB * Top byte is used * PT walks use Outer-Shareable accesses, * PT walks are write-back, write-allocate in both cache levels, * Full 64-bit address space goes through this table. */ ldr x0, =0x80822500 msr tcr_el2, x0 /* Set up the SCTLR_EL2: * Exceptions in LE ARM, * Low-latency IRQs disabled, * Write-implies-XN disabled (for now), * D-cache disabled (for now), * I-cache enabled, * Alignment checking enabled, * MMU translation disabled (for now). */ ldr x0, =(HSCTLR_BASE|SCTLR_A) msr SCTLR_EL2, x0 /* Rebuild the boot pagetable's first-level entries. The structure * is described in mm.c. * * After the CPU enables paging it will add the fixmap mapping * to these page tables, however this may clash with the 1:1 * mapping. So each CPU must rebuild the page tables here with * the 1:1 in place. */ /* Write Xen's PT's paddr into TTBR0_EL2 */ ldr x4, =boot_pgtable add x4, x4, x20 /* x4 := paddr (boot_pagetable) */ msr TTBR0_EL2, x4 /* Setup boot_pgtable: */ ldr x1, =boot_first add x1, x1, x20 /* x1 := paddr (boot_first) */ /* ... map boot_first in boot_pgtable[0] */ mov x3, #PT_PT /* x2 := table map of boot_first */ orr x2, x1, x3 /* + rights for linear PT */ str x2, [x4, #0] /* Map it in slot 0 */ /* ... map of paddr(start) in boot_pgtable */ lsr x1, x19, #39 /* Offset of base paddr in boot_pgtable */ cbz x1, 1f /* It's in slot 0, map in boot_first * or boot_second later on */ lsl x2, x1, #39 /* Base address for 512GB mapping */ mov x3, #PT_MEM /* x2 := Section mapping */ orr x2, x2, x3 lsl x1, x1, #3 /* x1 := Slot offset */ str x2, [x4, x1] /* Mapping of paddr(start)*/ 1: /* Setup boot_first: */ ldr x4, =boot_first /* Next level into boot_first */ add x4, x4, x20 /* x4 := paddr(boot_first) */ /* ... map boot_second in boot_first[0] */ ldr x1, =boot_second add x1, x1, x20 /* x1 := paddr(boot_second) */ mov x3, #PT_PT /* x2 := table map of boot_first */ orr x2, x1, x3 /* + rights for linear PT */ str x2, [x4, #0] /* Map it in slot 0 */ /* ... 
map of paddr(start) in boot_first */ lsr x2, x19, #30 /* x2 := Offset of base paddr in boot_first */ and x1, x2, 0x1ff /* x1 := Slot to use */ cbz x1, 1f /* It's in slot 0, map in boot_second */ lsl x2, x2, #30 /* Base address for 1GB mapping */ mov x3, #PT_MEM /* x2 := Section map */ orr x2, x2, x3 lsl x1, x1, #3 /* x1 := Slot offset */ str x2, [x4, x1] /* Create mapping of paddr(start)*/ 1: /* Setup boot_second: */ ldr x4, =boot_second add x4, x4, x20 /* x4 := paddr (boot_second) */ lsr x2, x19, #20 /* Base address for 2MB mapping */ lsl x2, x2, #20 mov x3, #PT_MEM /* x2 := Section map */ orr x2, x2, x3 /* ... map of vaddr(start) in boot_second */ ldr x1, =start lsr x1, x1, #18 /* Slot for vaddr(start) */ str x2, [x4, x1] /* Map vaddr(start) */ /* ... map of paddr(start) in boot_second */ lsr x1, x19, #30 /* Base paddr */ cbnz x1, 1f /* If paddr(start) is not in slot 0 * then the mapping was done in * boot_pgtable or boot_first above */ lsr x1, x19, #18 /* Slot for paddr(start) */ str x2, [x4, x1] /* Map Xen there */ 1: /* Defer fixmap and dtb mapping until after paging enabled, to * avoid them clashing with the 1:1 mapping. */ /* boot pagetable setup complete */ PRINT("- Turning on paging -\r\n") ldr x1, =paging /* Explicit vaddr, not RIP-relative */ mrs x0, SCTLR_EL2 orr x0, x0, #SCTLR_M /* Enable MMU */ orr x0, x0, #SCTLR_C /* Enable D-cache */ dsb sy /* Flush PTE writes and finish reads */ msr SCTLR_EL2, x0 /* now paging is enabled */ isb /* Now, flush the icache */ br x1 /* Get a proper vaddr into PC */ paging: /* Now we can install the fixmap and dtb mappings, since we * don't need the 1:1 map any more */ dsb sy #if defined(EARLY_PRINTK) /* Fixmap is only used by early printk */ /* Non-boot CPUs don't need to rebuild the fixmap itself, just * the mapping from boot_second to xen_fixmap */ cbnz x22, 1f /* Add UART to the fixmap table */ ldr x1, =xen_fixmap add x1, x1, x20 /* x1 := paddr (xen_fixmap) */ lsr x2, x23, #12 lsl x2, x2, #12 /* 4K aligned paddr of UART */ mov x3, #PT_DEV_L3 orr x2, x2, x3 /* x2 := 4K dev map including UART */ str x2, [x1, #(FIXMAP_CONSOLE*8)] /* Map it in the first fixmap's slot */ 1: /* Map fixmap into boot_second */ ldr x4, =boot_second /* x4 := vaddr (boot_second) */ ldr x2, =xen_fixmap add x2, x2, x20 /* x2 := paddr (xen_fixmap) */ mov x3, #PT_PT orr x2, x2, x3 /* x2 := table map of xen_fixmap */ ldr x1, =FIXMAP_ADDR(0) lsr x1, x1, #18 /* x1 := Slot for FIXMAP(0) */ str x2, [x4, x1] /* Map it in the fixmap's slot */ /* Use a virtual address to access the UART. */ ldr x23, =EARLY_UART_VIRTUAL_ADDRESS #endif /* Map the DTB in the boot misc slot */ cbnz x22, 1f /* Only on boot CPU */ lsr x2, x21, #21 lsl x2, x2, #21 /* x2 := 2MB-aligned paddr of DTB */ mov x3, #PT_MEM /* x2 := 2MB RAM incl. DTB */ orr x2, x2, x3 ldr x1, =BOOT_FDT_VIRT_START lsr x1, x1, #18 /* x4 := Slot for BOOT_FDT_VIRT_START */ str x2, [x4, x1] /* Map it in the early fdt slot */ dsb sy 1: PRINT("- Ready -\r\n") /* The boot CPU should go straight into C now */ cbz x22, launch /* Non-boot CPUs need to move on to the proper pagetables, which were * setup in init_secondary_pagetables. */ ldr x4, =init_ttbr /* VA of TTBR0_EL2 stashed by CPU 0 */ ldr x4, [x4] /* Actual value */ dsb sy msr TTBR0_EL2, x4 dsb sy isb tlbi alle2 dsb sy /* Ensure completion of TLB flush */ isb launch: ldr x0, =init_data add x0, x0, #INITINFO_stack /* Find the boot-time stack */ ldr x0, [x0] add x0, x0, #STACK_SIZE /* (which grows down from the top). 
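 *
 * The resulting layout is roughly (CPUINFO_sizeof being the generated
 * sizeof(struct cpu_info) constant):
 *
 *   stack + STACK_SIZE                   top of the per-CPU boot stack
 *   stack + STACK_SIZE - CPUINFO_sizeof  initial sp, with the cpu_info
 *                                        record sitting above it
 *   stack                                lowest stack address
 *
 * After that x0, x1 and x2 are loaded with the physical offset, the DTB
 * address and the CPU id before branching to start_xen or start_secondary.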
*/ sub x0, x0, #CPUINFO_sizeof /* Make room for CPU save record */ mov sp, x0 mov x0, x20 /* Marshal args: - phys_offset */ mov x1, x21 /* - FDT */ mov x2, x24 /* - CPU ID */ cbz x22, start_xen /* and disappear into the land of C */ b start_secondary /* (to the appropriate entry point) */ /* Fail-stop * r0: string explaining why */ fail: PRINT("- Boot failed -\r\n") 1: wfe b 1b /* Copy Xen to new location and switch TTBR * x0 ttbr * x1 source address * x2 destination address * x3 length * * Source and destination must be word aligned, length is rounded up * to a 16 byte boundary. * * MUST BE VERY CAREFUL when saving things to RAM over the copy */ ENTRY(relocate_xen) /* Copy 16 bytes at a time using: * x9: counter * x10: data * x11: data * x12: source * x13: destination */ mov x9, x3 mov x12, x1 mov x13, x2 1: ldp x10, x11, [x12], #16 stp x10, x11, [x13], #16 subs x9, x9, #16 bgt 1b /* Flush destination from dcache using: * x9: counter * x10: step * x11: vaddr */ dsb sy /* So the CPU issues all writes to the range */ mov x9, x3 ldr x10, =cacheline_bytes /* x10 := step */ ldr x10, [x10] mov x11, x2 1: dc cvac, x11 add x11, x11, x10 subs x9, x9, x10 bgt 1b dsb sy /* Ensure the flushes happen before * continuing */ isb /* Ensure synchronization with previous * changes to text */ tlbi alle2 /* Flush hypervisor TLB */ ic iallu /* Flush I-cache */ dsb sy /* Ensure completion of TLB flush */ isb msr TTBR0_EL2, x0 isb /* Ensure synchronization with previous * changes to text */ tlbi alle2 /* Flush hypervisor TLB */ ic iallu /* Flush I-cache */ dsb sy /* Ensure completion of TLB flush */ isb ret #ifdef EARLY_PRINTK /* Bring up the UART. * x23: Early UART base address * Clobbers x0-x1 */ init_uart: #ifdef EARLY_PRINTK_INIT_UART early_uart_init x23, 0 #endif adr x0, 1f b puts 1: .asciz "- UART enabled -\r\n" .align 4 /* Print early debug messages. * x0: Nul-terminated string to print. * x23: Early UART base address * Clobbers x0-x1 */ puts: early_uart_ready x23, 1 ldrb w1, [x0], #1 /* Load next char */ cbz w1, 1f /* Exit on nul */ early_uart_transmit x23, w1 b puts 1: ret /* Print a 32-bit number in hex. Specific to the PL011 UART. * x0: Number to print. * x23: Early UART base address * Clobbers x0-x3 */ putn: adr x1, hex mov x3, #8 1: early_uart_ready x23, 2 and x2, x0, #0xf0000000 /* Mask off the top nybble */ lsr x2, x2, #28 ldrb w2, [x1, x2] /* Convert to a char */ early_uart_transmit x23, w2 lsl x0, x0, #4 /* Roll it through one nybble at a time */ subs x3, x3, #1 b.ne 1b ret hex: .ascii "0123456789abcdef" .align 2 #else /* EARLY_PRINTK */ init_uart: .global early_puts early_puts: puts: putn: ret #endif /* EARLY_PRINTK */ /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm64/debug-8250.inc0000664000175000017500000000243012307313555016036 0ustar smbsmb/* * xen/arch/arm/arm64/debug-8250.inc * * 8250 specific debug code * * Copyright (c) 2013 Applied Micro. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
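 *
 * Note on the macros below: the UART_LSR and UART_THR offsets are scaled
 * by EARLY_UART_REG_SHIFT because many SoCs space their 8250-compatible
 * registers 2 or 4 bytes apart. early_uart_ready polls the THRE bit in
 * LSR until the transmitter can accept data, and early_uart_transmit then
 * writes the character to THR.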
*/ #include /* UART wait UART to be ready to transmit * xb: register which contains the UART base address * c: scratch register */ .macro early_uart_ready xb c 1: ldrb w\c, [\xb, #UART_LSR << EARLY_UART_REG_SHIFT] and w\c, w\c, #UART_LSR_THRE cmp w\c, #UART_LSR_THRE b.ne 1b .endm /* UART transmit character * xb: register which contains the UART base address * wt: register which contains the character to transmit */ .macro early_uart_transmit xb wt /* UART_THR transmit holding */ strb \wt, [\xb, #UART_THR << EARLY_UART_REG_SHIFT] .endm /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm64/smpboot.c0000664000175000017500000000455612307313555015523 0ustar smbsmb#include #include #include #include #include #include #include #include #include struct smp_enable_ops { int (*prepare_cpu)(int); }; static paddr_t cpu_release_addr[NR_CPUS]; static struct smp_enable_ops smp_enable_ops[NR_CPUS]; static int __init smp_spin_table_cpu_up(int cpu) { paddr_t __iomem *release; if (!cpu_release_addr[cpu]) { printk("CPU%d: No release addr\n", cpu); return -ENODEV; } release = ioremap_nocache(cpu_release_addr[cpu], 8); if ( !release ) { dprintk(XENLOG_ERR, "CPU%d: Unable to map release address\n", cpu); return -EFAULT; } release[0] = __pa(init_secondary); flush_xen_data_tlb_range_va((vaddr_t)release, sizeof(*release)); iounmap(release); sev(); return cpu_up_send_sgi(cpu); } static void __init smp_spin_table_init(int cpu, struct dt_device_node *dn) { if ( !dt_property_read_u64(dn, "cpu-release-addr", &cpu_release_addr[cpu]) ) { printk("CPU%d has no cpu-release-addr\n", cpu); return; } smp_enable_ops[cpu].prepare_cpu = smp_spin_table_cpu_up; } static int __init smp_psci_init(int cpu) { if ( !psci_available ) { printk("CPU%d asks for PSCI, but DTB has no PSCI node\n", cpu); return -ENODEV; } smp_enable_ops[cpu].prepare_cpu = call_psci_cpu_on; return 0; } int __init arch_smp_init(void) { /* Nothing */ return 0; } int __init arch_cpu_init(int cpu, struct dt_device_node *dn) { const char *enable_method; enable_method = dt_get_property(dn, "enable-method", NULL); if (!enable_method) { printk("CPU%d has no enable method\n", cpu); return -EINVAL; } if ( !strcmp(enable_method, "spin-table") ) smp_spin_table_init(cpu, dn); else if ( !strcmp(enable_method, "psci") ) return smp_psci_init(cpu); else { printk("CPU%d has unknown enable method \"%s\"\n", cpu, enable_method); return -EINVAL; } return 0; } int __init arch_cpu_up(int cpu) { if ( !smp_enable_ops[cpu].prepare_cpu ) return -ENODEV; return smp_enable_ops[cpu].prepare_cpu(cpu); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm64/traps.c0000664000175000017500000000247412307313555015166 0ustar smbsmb/* * xen/arch/arm/arm64/traps.c * * ARM AArch64 Specific Trap handlers * * Copyright (c) 2012 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
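 *
 * do_bad_mode() below is the target of the "invalid" vector stubs in
 * arm64/entry.S: the stub passes one of the BAD_* reason codes in x1,
 * which is used to index handler[], and the EL2 syndrome (ESR_EL2) is
 * printed before panicking. do_trap_serror() likewise panics, since
 * SError aborts are not handled.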
*/ #include #include #include #include #include asmlinkage void do_trap_serror(struct cpu_user_regs *regs) { panic("Unhandled serror trap"); } static const char *handler[]= { "Synchronous Abort", "IRQ", "FIQ", "Error" }; asmlinkage void do_bad_mode(struct cpu_user_regs *regs, int reason) { uint64_t esr = READ_SYSREG64(ESR_EL2); printk("Bad mode in %s handler detected, code 0x%08"PRIx64"\n", handler[reason], esr); local_irq_disable(); panic("bad mode"); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/smp.c0000664000175000017500000000120312307313555013670 0ustar smbsmb#include #include #include #include #include #include #include void flush_tlb_mask(const cpumask_t *mask) { /* No need to IPI other processors on ARM, the processor takes care of it. */ flush_tlb_all(); } void smp_send_event_check_mask(const cpumask_t *mask) { send_SGI_mask(mask, GIC_SGI_EVENT_CHECK); } void smp_send_call_function_mask(const cpumask_t *mask) { send_SGI_mask(mask, GIC_SGI_CALL_FUNCTION); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/decode.h0000664000175000017500000000234412307313555014330 0ustar smbsmb/* * xen/arch/arm/decode.h * * Instruction decoder * * Julien Grall * Copyright (C) 2013 Linaro Limited. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #ifndef __ARCH_ARM_DECODE_H_ #define __ARCH_ARM_DECODE_H_ #include #include /** * Decode an instruction from pc * /!\ This function is not intended to fully decode an instruction. It * considers that the instruction is valid. 
* * This function will get: * - The transfer register * - Sign bit * - Size */ int decode_instruction(const struct cpu_user_regs *regs, struct hsr_dabt *dabt); #endif /* __ARCH_ARM_DECODE_H_ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/percpu.c0000664000175000017500000000415012307313555014373 0ustar smbsmb#include #include #include #include #include #include unsigned long __per_cpu_offset[NR_CPUS]; #define INVALID_PERCPU_AREA (-(long)__per_cpu_start) #define PERCPU_ORDER (get_order_from_bytes(__per_cpu_data_end-__per_cpu_start)) void __init percpu_init_areas(void) { unsigned int cpu; for ( cpu = 1; cpu < NR_CPUS; cpu++ ) __per_cpu_offset[cpu] = INVALID_PERCPU_AREA; } static int init_percpu_area(unsigned int cpu) { char *p; if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA ) return -EBUSY; if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL ) return -ENOMEM; memset(p, 0, __per_cpu_data_end - __per_cpu_start); __per_cpu_offset[cpu] = p - __per_cpu_start; return 0; } struct free_info { unsigned int cpu; struct rcu_head rcu; }; static DEFINE_PER_CPU(struct free_info, free_info); static void _free_percpu_area(struct rcu_head *head) { struct free_info *info = container_of(head, struct free_info, rcu); unsigned int cpu = info->cpu; char *p = __per_cpu_start + __per_cpu_offset[cpu]; free_xenheap_pages(p, PERCPU_ORDER); __per_cpu_offset[cpu] = INVALID_PERCPU_AREA; } static void free_percpu_area(unsigned int cpu) { struct free_info *info = &per_cpu(free_info, cpu); info->cpu = cpu; call_rcu(&info->rcu, _free_percpu_area); } static int cpu_percpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; switch ( action ) { case CPU_UP_PREPARE: rc = init_percpu_area(cpu); break; case CPU_UP_CANCELED: case CPU_DEAD: free_percpu_area(cpu); break; default: break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_percpu_nfb = { .notifier_call = cpu_percpu_callback, .priority = 100 /* highest priority */ }; static int __init percpu_presmp_init(void) { register_cpu_notifier(&cpu_percpu_nfb); return 0; } presmp_initcall(percpu_presmp_init); xen-4.4.0/xen/arch/arm/irq.c0000664000175000017500000001225512307313555013675 0ustar smbsmb/* * xen/arch/arm/irq.c * * ARM Interrupt support * * Ian Campbell * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
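 *
 * The main entry point below is do_IRQ(): it acks the interrupt at the
 * GIC, injects guest-owned interrupts into the domain with
 * vgic_vcpu_inject_irq() (vcpu0 only for now, see the XXX), and runs the
 * registered action handler for Xen-owned interrupts, using the
 * IRQ_PENDING and IRQ_INPROGRESS flags so only one CPU services a given
 * irq at a time.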
*/ #include #include #include #include #include #include #include #include static void enable_none(struct irq_desc *irq) { } static unsigned int startup_none(struct irq_desc *irq) { return 0; } static void disable_none(struct irq_desc *irq) { } static void ack_none(struct irq_desc *irq) { printk("unexpected IRQ trap at irq %02x\n", irq->irq); } #define shutdown_none disable_none #define end_none enable_none hw_irq_controller no_irq_type = { .typename = "none", .startup = startup_none, .shutdown = shutdown_none, .enable = enable_none, .disable = disable_none, .ack = ack_none, .end = end_none }; int __init arch_init_one_irq_desc(struct irq_desc *desc) { return 0; } static int __init init_irq_data(void) { int irq; for (irq = NR_LOCAL_IRQS; irq < NR_IRQS; irq++) { struct irq_desc *desc = irq_to_desc(irq); init_one_irq_desc(desc); desc->irq = irq; desc->action = NULL; } return 0; } static int __cpuinit init_local_irq_data(void) { int irq; for (irq = 0; irq < NR_LOCAL_IRQS; irq++) { struct irq_desc *desc = irq_to_desc(irq); init_one_irq_desc(desc); desc->irq = irq; desc->action = NULL; } return 0; } void __init init_IRQ(void) { BUG_ON(init_local_irq_data() < 0); BUG_ON(init_irq_data() < 0); } void __cpuinit init_secondary_IRQ(void) { BUG_ON(init_local_irq_data() < 0); } int __init request_dt_irq(const struct dt_irq *irq, void (*handler)(int, void *, struct cpu_user_regs *), const char *devname, void *dev_id) { struct irqaction *action; int retval; /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). */ if (irq->irq >= nr_irqs) return -EINVAL; if (!handler) return -EINVAL; action = xmalloc(struct irqaction); if (!action) return -ENOMEM; action->handler = handler; action->name = devname; action->dev_id = dev_id; action->free_on_release = 1; retval = setup_dt_irq(irq, action); if (retval) xfree(action); return retval; } /* Dispatch an interrupt */ void do_IRQ(struct cpu_user_regs *regs, unsigned int irq, int is_fiq) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action = desc->action; /* TODO: perfc_incr(irqs); */ /* TODO: this_cpu(irq_count)++; */ irq_enter(); spin_lock(&desc->lock); desc->handler->ack(desc); if ( action == NULL ) { printk("Unknown %s %#3.3x\n", is_fiq ? "FIQ" : "IRQ", irq); goto out; } if ( desc->status & IRQ_GUEST ) { struct domain *d = action->dev_id; desc->handler->end(desc); desc->status |= IRQ_INPROGRESS; desc->arch.eoi_cpu = smp_processor_id(); /* XXX: inject irq into all guest vcpus */ vgic_vcpu_inject_irq(d->vcpu[0], irq, 0); goto out_no_end; } desc->status |= IRQ_PENDING; /* * Since we set PENDING, if another processor is handling a different * instance of this same irq, the other processor will take care of it. */ if ( desc->status & (IRQ_DISABLED | IRQ_INPROGRESS) ) goto out; desc->status |= IRQ_INPROGRESS; action = desc->action; while ( desc->status & IRQ_PENDING ) { desc->status &= ~IRQ_PENDING; spin_unlock_irq(&desc->lock); action->handler(irq, action->dev_id, regs); spin_lock_irq(&desc->lock); } desc->status &= ~IRQ_INPROGRESS; out: desc->handler->end(desc); out_no_end: spin_unlock(&desc->lock); irq_exit(); } /* * pirq event channels. We don't use these on ARM, instead we use the * features of the GIC to inject virtualised normal interrupts. 
*/ struct pirq *alloc_pirq_struct(struct domain *d) { return NULL; } /* * These are all unreachable given an alloc_pirq_struct * which returns NULL, all callers try to lookup struct pirq first * which will fail. */ int pirq_guest_bind(struct vcpu *v, struct pirq *pirq, int will_share) { BUG(); } void pirq_guest_unbind(struct domain *d, struct pirq *pirq) { BUG(); } void pirq_set_affinity(struct domain *d, int pirq, const cpumask_t *mask) { BUG(); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/cpu.c0000664000175000017500000000440312307313555013665 0ustar smbsmb/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include void __cpuinit identify_cpu(struct cpuinfo_arm *c) { c->midr.bits = READ_SYSREG32(MIDR_EL1); c->mpidr.bits = READ_SYSREG(MPIDR_EL1); #ifdef CONFIG_ARM_64 c->pfr64.bits[0] = READ_SYSREG64(ID_AA64PFR0_EL1); c->pfr64.bits[1] = READ_SYSREG64(ID_AA64PFR1_EL1); c->dbg64.bits[0] = READ_SYSREG64(ID_AA64DFR0_EL1); c->dbg64.bits[1] = READ_SYSREG64(ID_AA64DFR1_EL1); c->aux64.bits[0] = READ_SYSREG64(ID_AA64AFR0_EL1); c->aux64.bits[1] = READ_SYSREG64(ID_AA64AFR1_EL1); c->mm64.bits[0] = READ_SYSREG64(ID_AA64MMFR0_EL1); c->mm64.bits[1] = READ_SYSREG64(ID_AA64MMFR1_EL1); c->isa64.bits[0] = READ_SYSREG64(ID_AA64ISAR0_EL1); c->isa64.bits[1] = READ_SYSREG64(ID_AA64ISAR1_EL1); #endif c->pfr32.bits[0] = READ_SYSREG32(ID_PFR0_EL1); c->pfr32.bits[1] = READ_SYSREG32(ID_PFR1_EL1); c->dbg32.bits[0] = READ_SYSREG32(ID_DFR0_EL1); c->aux32.bits[0] = READ_SYSREG32(ID_AFR0_EL1); c->mm32.bits[0] = READ_SYSREG32(ID_MMFR0_EL1); c->mm32.bits[1] = READ_SYSREG32(ID_MMFR1_EL1); c->mm32.bits[2] = READ_SYSREG32(ID_MMFR2_EL1); c->mm32.bits[3] = READ_SYSREG32(ID_MMFR3_EL1); c->isa32.bits[0] = READ_SYSREG32(ID_ISAR0_EL1); c->isa32.bits[1] = READ_SYSREG32(ID_ISAR1_EL1); c->isa32.bits[2] = READ_SYSREG32(ID_ISAR2_EL1); c->isa32.bits[3] = READ_SYSREG32(ID_ISAR3_EL1); c->isa32.bits[4] = READ_SYSREG32(ID_ISAR4_EL1); c->isa32.bits[5] = READ_SYSREG32(ID_ISAR5_EL1); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/xen.lds.S0000664000175000017500000000747312307313555014443 0ustar smbsmb/* Excerpts written by Martin Mares */ /* Modified for i386/x86-64 Xen by Keir Fraser */ /* Modified for ARM Xen by Ian Campbell */ #include #include #include #include #undef ENTRY #undef ALIGN ENTRY(start) #if defined(__arm__) #define FORMAT arm #elif defined(__aarch64__) #define FORMAT aarch64 #endif OUTPUT_ARCH(FORMAT) PHDRS { text PT_LOAD /* XXX should be AT ( XEN_PHYS_START ) */ ; } SECTIONS { . = XEN_VIRT_START; _start = .; .text : /* XXX should be AT ( XEN_PHYS_START ) */ { _stext = .; /* Text section */ *(.text) *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ } :text = 0x9090 . = ALIGN(PAGE_SIZE); .rodata : { _srodata = .; /* Read-only data */ *(.rodata) *(.rodata.*) _erodata = .; /* End of read-only data */ } :text .data : { /* Data */ . 
= ALIGN(PAGE_SIZE); *(.data.page_aligned) *(.data) *(.data.rel) *(.data.rel.*) CONSTRUCTORS } :text . = ALIGN(SMP_CACHE_BYTES); .data.read_mostly : { /* Exception table */ __start___ex_table = .; *(.ex_table) __stop___ex_table = .; /* Pre-exception table */ __start___pre_ex_table = .; *(.ex_table.pre) __stop___pre_ex_table = .; *(.data.read_mostly) *(.data.rel.ro) *(.data.rel.ro.*) } :text #ifdef LOCK_PROFILE . = ALIGN(32); __lock_profile_start = .; .lockprofile.data : { *(.lockprofile.data) } :text __lock_profile_end = .; #endif . = ALIGN(8); .arch.info : { _splatform = .; *(.arch.info) _eplatform = .; } :text . = ALIGN(8); .dev.info : { _sdevice = .; *(.dev.info) _edevice = .; } :text . = ALIGN(PAGE_SIZE); /* Init code and data */ __init_begin = .; .init.text : { _sinittext = .; *(.init.text) _einittext = .; } :text . = ALIGN(PAGE_SIZE); .init.data : { *(.init.rodata) *(.init.rodata.str*) *(.init.data) *(.init.data.rel) *(.init.data.rel.*) . = ALIGN(8); __ctors_start = .; *(.init_array) __ctors_end = .; } :text . = ALIGN(32); .init.setup : { __setup_start = .; *(.init.setup) __setup_end = .; } :text .init.proc.info : { __proc_info_start = .; *(.init.proc.info) __proc_info_end = .; } :text .initcall.init : { __initcall_start = .; *(.initcallpresmp.init) __presmp_initcall_end = .; *(.initcall1.init) __initcall_end = .; } :text .xsm_initcall.init : { __xsm_initcall_start = .; *(.xsm_initcall.init) __xsm_initcall_end = .; } :text . = ALIGN(STACK_SIZE); __init_end = .; .bss : { /* BSS */ __bss_start = .; *(.bss.stack_aligned) . = ALIGN(PAGE_SIZE); *(.bss.page_aligned) *(.bss) . = ALIGN(SMP_CACHE_BYTES); __per_cpu_start = .; *(.bss.percpu) . = ALIGN(SMP_CACHE_BYTES); *(.bss.percpu.read_mostly) . = ALIGN(SMP_CACHE_BYTES); __per_cpu_data_end = .; __bss_end = .; } :text _end = . ; #ifdef CONFIG_DTB_FILE /* Section for the device tree blob (if any). */ _sdtb = .; .dtb : { *(.dtb) } :text #endif /* Sections to be discarded */ /DISCARD/ : { *(.exit.text) *(.exit.data) *(.exitcall.exit) *(.eh_frame) } /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } .stab.excl 0 : { *(.stab.excl) } .stab.exclstr 0 : { *(.stab.exclstr) } .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } } xen-4.4.0/xen/arch/arm/domain.c0000664000175000017500000004546412307313555014361 0ustar smbsmb/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
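 *
 * The context-switch path below works in two halves: ctxt_switch_from()
 * saves the outgoing vCPU's EL0/EL1 system registers plus timer, VFP and
 * GIC state, and ctxt_switch_to() restores them for the incoming vCPU,
 * keeping stage-2 translation (HCR_EL2.VM) off while VTTBR is switched
 * and picking HCR_EL2.RW according to whether the guest is 32- or 64-bit.
 * The callee-saved general-purpose registers and stack pointer are
 * swapped separately by __context_switch() in arm64/entry.S.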
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vtimer.h" #include "vuart.h" DEFINE_PER_CPU(struct vcpu *, curr_vcpu); void idle_loop(void) { for ( ; ; ) { if ( cpu_is_offline(smp_processor_id()) ) stop_cpu(); local_irq_disable(); if ( cpu_is_haltable(smp_processor_id()) ) { dsb(); wfi(); } local_irq_enable(); do_tasklet(); do_softirq(); } } static void ctxt_switch_from(struct vcpu *p) { /* CP 15 */ p->arch.csselr = READ_SYSREG(CSSELR_EL1); /* Control Registers */ p->arch.sctlr = READ_SYSREG(SCTLR_EL1); p->arch.cpacr = READ_SYSREG(CPACR_EL1); p->arch.contextidr = READ_SYSREG(CONTEXTIDR_EL1); p->arch.tpidr_el0 = READ_SYSREG(TPIDR_EL0); p->arch.tpidrro_el0 = READ_SYSREG(TPIDRRO_EL0); p->arch.tpidr_el1 = READ_SYSREG(TPIDR_EL1); /* Arch timer */ virt_timer_save(p); if ( is_pv32_domain(p->domain) && cpu_has_thumbee ) { p->arch.teecr = READ_SYSREG32(TEECR32_EL1); p->arch.teehbr = READ_SYSREG32(TEEHBR32_EL1); } #ifdef CONFIG_ARM_32 p->arch.joscr = READ_CP32(JOSCR); p->arch.jmcr = READ_CP32(JMCR); #endif isb(); /* MMU */ p->arch.vbar = READ_SYSREG(VBAR_EL1); p->arch.ttbcr = READ_SYSREG(TCR_EL1); p->arch.ttbr0 = READ_SYSREG64(TTBR0_EL1); p->arch.ttbr1 = READ_SYSREG64(TTBR1_EL1); if ( is_pv32_domain(p->domain) ) p->arch.dacr = READ_SYSREG(DACR32_EL2); p->arch.par = READ_SYSREG64(PAR_EL1); #if defined(CONFIG_ARM_32) p->arch.mair0 = READ_CP32(MAIR0); p->arch.mair1 = READ_CP32(MAIR1); p->arch.amair0 = READ_CP32(AMAIR0); p->arch.amair1 = READ_CP32(AMAIR1); #else p->arch.mair = READ_SYSREG64(MAIR_EL1); p->arch.amair = READ_SYSREG64(AMAIR_EL1); #endif /* Fault Status */ #if defined(CONFIG_ARM_32) p->arch.dfar = READ_CP32(DFAR); p->arch.ifar = READ_CP32(IFAR); p->arch.dfsr = READ_CP32(DFSR); #elif defined(CONFIG_ARM_64) p->arch.far = READ_SYSREG64(FAR_EL1); p->arch.esr = READ_SYSREG64(ESR_EL1); #endif if ( is_pv32_domain(p->domain) ) p->arch.ifsr = READ_SYSREG(IFSR32_EL2); p->arch.afsr0 = READ_SYSREG(AFSR0_EL1); p->arch.afsr1 = READ_SYSREG(AFSR1_EL1); /* XXX MPU */ /* VFP */ vfp_save_state(p); /* VGIC */ gic_save_state(p); isb(); context_saved(p); } static void ctxt_switch_to(struct vcpu *n) { register_t hcr; hcr = READ_SYSREG(HCR_EL2); WRITE_SYSREG(hcr & ~HCR_VM, HCR_EL2); isb(); p2m_load_VTTBR(n->domain); isb(); WRITE_SYSREG32(n->domain->arch.vpidr, VPIDR_EL2); WRITE_SYSREG(n->arch.vmpidr, VMPIDR_EL2); /* VGIC */ gic_restore_state(n); /* VFP */ vfp_restore_state(n); /* XXX MPU */ /* Fault Status */ #if defined(CONFIG_ARM_32) WRITE_CP32(n->arch.dfar, DFAR); WRITE_CP32(n->arch.ifar, IFAR); WRITE_CP32(n->arch.dfsr, DFSR); #elif defined(CONFIG_ARM_64) WRITE_SYSREG64(n->arch.far, FAR_EL1); WRITE_SYSREG64(n->arch.esr, ESR_EL1); #endif if ( is_pv32_domain(n->domain) ) WRITE_SYSREG(n->arch.ifsr, IFSR32_EL2); WRITE_SYSREG(n->arch.afsr0, AFSR0_EL1); WRITE_SYSREG(n->arch.afsr1, AFSR1_EL1); /* MMU */ WRITE_SYSREG(n->arch.vbar, VBAR_EL1); WRITE_SYSREG(n->arch.ttbcr, TCR_EL1); WRITE_SYSREG64(n->arch.ttbr0, TTBR0_EL1); WRITE_SYSREG64(n->arch.ttbr1, TTBR1_EL1); if ( is_pv32_domain(n->domain) ) WRITE_SYSREG(n->arch.dacr, DACR32_EL2); WRITE_SYSREG64(n->arch.par, PAR_EL1); #if defined(CONFIG_ARM_32) WRITE_CP32(n->arch.mair0, MAIR0); WRITE_CP32(n->arch.mair1, MAIR1); WRITE_CP32(n->arch.amair0, AMAIR0); WRITE_CP32(n->arch.amair1, AMAIR1); #elif defined(CONFIG_ARM_64) WRITE_SYSREG64(n->arch.mair, MAIR_EL1); WRITE_SYSREG64(n->arch.amair, AMAIR_EL1); #endif isb(); /* Control 
Registers */ WRITE_SYSREG(n->arch.sctlr, SCTLR_EL1); WRITE_SYSREG(n->arch.cpacr, CPACR_EL1); WRITE_SYSREG(n->arch.contextidr, CONTEXTIDR_EL1); WRITE_SYSREG(n->arch.tpidr_el0, TPIDR_EL0); WRITE_SYSREG(n->arch.tpidrro_el0, TPIDRRO_EL0); WRITE_SYSREG(n->arch.tpidr_el1, TPIDR_EL1); if ( is_pv32_domain(n->domain) && cpu_has_thumbee ) { WRITE_SYSREG32(n->arch.teecr, TEECR32_EL1); WRITE_SYSREG32(n->arch.teehbr, TEEHBR32_EL1); } #ifdef CONFIG_ARM_32 WRITE_CP32(n->arch.joscr, JOSCR); WRITE_CP32(n->arch.jmcr, JMCR); #endif isb(); /* CP 15 */ WRITE_SYSREG(n->arch.csselr, CSSELR_EL1); isb(); if ( is_pv32_domain(n->domain) ) hcr &= ~HCR_RW; else hcr |= HCR_RW; WRITE_SYSREG(hcr, HCR_EL2); isb(); /* This is could trigger an hardware interrupt from the virtual * timer. The interrupt needs to be injected into the guest. */ virt_timer_restore(n); } /* Update per-VCPU guest runstate shared memory area (if registered). */ static void update_runstate_area(struct vcpu *v) { if ( guest_handle_is_null(runstate_guest(v)) ) return; __copy_to_guest(runstate_guest(v), &v->runstate, 1); } static void schedule_tail(struct vcpu *prev) { ctxt_switch_from(prev); ctxt_switch_to(current); local_irq_enable(); if ( prev != current ) update_runstate_area(current); /* Ensure that the vcpu has an up-to-date time base. */ update_vcpu_system_time(current); } static void continue_new_vcpu(struct vcpu *prev) { schedule_tail(prev); if ( is_idle_vcpu(current) ) reset_stack_and_jump(idle_loop); else if is_pv32_domain(current->domain) /* check_wakeup_from_wait(); */ reset_stack_and_jump(return_to_new_vcpu32); else /* check_wakeup_from_wait(); */ reset_stack_and_jump(return_to_new_vcpu64); } void context_switch(struct vcpu *prev, struct vcpu *next) { ASSERT(local_irq_is_enabled()); ASSERT(prev != next); ASSERT(cpumask_empty(next->vcpu_dirty_cpumask)); if ( prev != next ) update_runstate_area(prev); local_irq_disable(); set_current(next); prev = __context_switch(prev, next); schedule_tail(prev); } void continue_running(struct vcpu *same) { /* Nothing to do */ } void sync_local_execstate(void) { /* Nothing to do -- no lazy switching */ } void sync_vcpu_execstate(struct vcpu *v) { /* Nothing to do -- no lazy switching */ } #define next_arg(fmt, args) ({ \ unsigned long __arg; \ switch ( *(fmt)++ ) \ { \ case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \ case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \ case 'h': __arg = (unsigned long)va_arg(args, void *); break; \ default: __arg = 0; BUG(); \ } \ __arg; \ }) void hypercall_cancel_continuation(void) { struct cpu_user_regs *regs = guest_cpu_user_regs(); struct mc_state *mcs = ¤t->mc_state; if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) { __clear_bit(_MCSF_call_preempted, &mcs->flags); } else { regs->pc += 4; /* undo re-execute 'hvc #XEN_HYPERCALL_TAG' */ } } unsigned long hypercall_create_continuation( unsigned int op, const char *format, ...) { struct mc_state *mcs = ¤t->mc_state; struct cpu_user_regs *regs; const char *p = format; unsigned long arg, rc; unsigned int i; va_list args; /* All hypercalls take at least one argument */ BUG_ON( !p || *p == '\0' ); va_start(args, format); if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) { BUG(); /* XXX multicalls not implemented yet. 
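 *
 * (For the non-multicall case below, "continuation" means re-arming the
 *  trap: the hypercall number is written back to r12 and pc is wound back
 *  by 4 so the 'hvc #XEN_HYPERCALL_TAG' instruction runs again when the
 *  vCPU is resumed. hypercall_cancel_continuation() above undoes this by
 *  adding the 4 back.)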
*/ __set_bit(_MCSF_call_preempted, &mcs->flags); for ( i = 0; *p != '\0'; i++ ) mcs->call.args[i] = next_arg(p, args); /* Return value gets written back to mcs->call.result */ rc = mcs->call.result; } else { regs = guest_cpu_user_regs(); regs->r12 = op; /* Ensure the hypercall trap instruction is re-executed. */ regs->pc -= 4; /* re-execute 'hvc #XEN_HYPERCALL_TAG' */ for ( i = 0; *p != '\0'; i++ ) { arg = next_arg(p, args); switch ( i ) { case 0: regs->r0 = arg; break; case 1: regs->r1 = arg; break; case 2: regs->r2 = arg; break; case 3: regs->r3 = arg; break; case 4: regs->r4 = arg; break; case 5: regs->r5 = arg; break; } } /* Return value gets written back to r0 */ rc = regs->r0; } va_end(args); return rc; } void startup_cpu_idle_loop(void) { struct vcpu *v = current; ASSERT(is_idle_vcpu(v)); /* TODO cpumask_set_cpu(v->processor, v->domain->domain_dirty_cpumask); cpumask_set_cpu(v->processor, v->vcpu_dirty_cpumask); */ reset_stack_and_jump(idle_loop); } struct domain *alloc_domain_struct(void) { struct domain *d; BUILD_BUG_ON(sizeof(*d) > PAGE_SIZE); d = alloc_xenheap_pages(0, 0); if ( d == NULL ) return NULL; clear_page(d); d->arch.grant_table_gpfn = xmalloc_array(xen_pfn_t, max_nr_grant_frames); return d; } void free_domain_struct(struct domain *d) { xfree(d->arch.grant_table_gpfn); free_xenheap_page(d); } void dump_pageframe_info(struct domain *d) { } struct vcpu *alloc_vcpu_struct(void) { struct vcpu *v; BUILD_BUG_ON(sizeof(*v) > PAGE_SIZE); v = alloc_xenheap_pages(0, 0); if ( v != NULL ) clear_page(v); return v; } void free_vcpu_struct(struct vcpu *v) { free_xenheap_page(v); } struct vcpu_guest_context *alloc_vcpu_guest_context(void) { return xmalloc(struct vcpu_guest_context); } void free_vcpu_guest_context(struct vcpu_guest_context *vgc) { xfree(vgc); } int vcpu_initialise(struct vcpu *v) { int rc = 0; BUILD_BUG_ON( sizeof(struct cpu_info) > STACK_SIZE ); v->arch.stack = alloc_xenheap_pages(STACK_ORDER, MEMF_node(vcpu_to_node(v))); if ( v->arch.stack == NULL ) return -ENOMEM; v->arch.cpu_info = (struct cpu_info *)(v->arch.stack + STACK_SIZE - sizeof(struct cpu_info)); memset(&v->arch.saved_context, 0, sizeof(v->arch.saved_context)); v->arch.saved_context.sp = (register_t)v->arch.cpu_info; v->arch.saved_context.pc = (register_t)continue_new_vcpu; /* Idle VCPUs don't need the rest of this setup */ if ( is_idle_vcpu(v) ) return rc; v->arch.sctlr = SCTLR_GUEST_INIT; /* * By default exposes an SMP system with AFF0 set to the VCPU ID * TODO: Handle multi-threading processor and cluster */ v->arch.vmpidr = MPIDR_SMP | (v->vcpu_id << MPIDR_AFF0_SHIFT); v->arch.actlr = READ_SYSREG32(ACTLR_EL1); /* XXX: Handle other than CA15 cpus */ if ( v->domain->max_vcpus > 1 ) v->arch.actlr |= ACTLR_CA15_SMP; else v->arch.actlr &= ~ACTLR_CA15_SMP; if ( (rc = vcpu_vgic_init(v)) != 0 ) return rc; if ( (rc = vcpu_vtimer_init(v)) != 0 ) return rc; return rc; } void vcpu_destroy(struct vcpu *v) { vcpu_timer_destroy(v); free_xenheap_pages(v->arch.stack, STACK_ORDER); } int arch_domain_create(struct domain *d, unsigned int domcr_flags) { int rc; d->arch.relmem = RELMEM_not_started; /* Idle domains do not need this setup */ if ( is_idle_domain(d) ) return 0; if ( (rc = p2m_init(d)) != 0 ) goto fail; rc = -ENOMEM; if ( (d->shared_info = alloc_xenheap_pages(0, 0)) == NULL ) goto fail; /* Default the virtual ID to match the physical */ d->arch.vpidr = boot_cpu_data.midr.bits; clear_page(d->shared_info); share_xen_page_with_guest( virt_to_page(d->shared_info), d, XENSHARE_writable); if ( (rc = p2m_alloc_table(d)) 
!= 0 ) goto fail; if ( (rc = gicv_setup(d)) != 0 ) goto fail; if ( (rc = domain_vgic_init(d)) != 0 ) goto fail; if ( (rc = vcpu_domain_init(d)) != 0 ) goto fail; if ( d->domain_id ) d->arch.evtchn_irq = GUEST_EVTCHN_PPI; else d->arch.evtchn_irq = platform_dom0_evtchn_ppi(); /* * Virtual UART is only used by linux early printk and decompress code. * Only use it for dom0 because the linux kernel may not support * multi-platform. */ if ( (d->domain_id == 0) && (rc = domain_vuart_init(d)) ) goto fail; return 0; fail: d->is_dying = DOMDYING_dead; arch_domain_destroy(d); return rc; } void arch_domain_destroy(struct domain *d) { p2m_teardown(d); domain_vgic_free(d); domain_vuart_free(d); free_xenheap_page(d->shared_info); } static int is_guest_pv32_psr(uint32_t psr) { switch (psr & PSR_MODE_MASK) { case PSR_MODE_USR: case PSR_MODE_FIQ: case PSR_MODE_IRQ: case PSR_MODE_SVC: case PSR_MODE_ABT: case PSR_MODE_UND: case PSR_MODE_SYS: return 1; case PSR_MODE_MON: case PSR_MODE_HYP: default: return 0; } } #ifdef CONFIG_ARM_64 static int is_guest_pv64_psr(uint32_t psr) { if ( psr & PSR_MODE_BIT ) return 0; switch (psr & PSR_MODE_MASK) { case PSR_MODE_EL1h: case PSR_MODE_EL1t: case PSR_MODE_EL0t: return 1; case PSR_MODE_EL3h: case PSR_MODE_EL3t: case PSR_MODE_EL2h: case PSR_MODE_EL2t: default: return 0; } } #endif /* * Initialise VCPU state. The context can be supplied by either the * toolstack (XEN_DOMCTL_setvcpucontext) or the guest * (VCPUOP_initialise) and therefore must be properly validated. */ int arch_set_info_guest( struct vcpu *v, vcpu_guest_context_u c) { struct vcpu_guest_context *ctxt = c.nat; struct vcpu_guest_core_regs *regs = &c.nat->user_regs; if ( is_pv32_domain(v->domain) ) { if ( !is_guest_pv32_psr(regs->cpsr) ) return -EINVAL; if ( regs->spsr_svc && !is_guest_pv32_psr(regs->spsr_svc) ) return -EINVAL; if ( regs->spsr_abt && !is_guest_pv32_psr(regs->spsr_abt) ) return -EINVAL; if ( regs->spsr_und && !is_guest_pv32_psr(regs->spsr_und) ) return -EINVAL; if ( regs->spsr_irq && !is_guest_pv32_psr(regs->spsr_irq) ) return -EINVAL; if ( regs->spsr_fiq && !is_guest_pv32_psr(regs->spsr_fiq) ) return -EINVAL; } #ifdef CONFIG_ARM_64 else { if ( !is_guest_pv64_psr(regs->cpsr) ) return -EINVAL; if ( regs->spsr_el1 && !is_guest_pv64_psr(regs->spsr_el1) ) return -EINVAL; } #endif vcpu_regs_user_to_hyp(v, regs); v->arch.sctlr = ctxt->sctlr; v->arch.ttbr0 = ctxt->ttbr0; v->arch.ttbr1 = ctxt->ttbr1; v->arch.ttbcr = ctxt->ttbcr; v->is_initialised = 1; if ( ctxt->flags & VGCF_online ) clear_bit(_VPF_down, &v->pause_flags); else set_bit(_VPF_down, &v->pause_flags); return 0; } int arch_vcpu_reset(struct vcpu *v) { vcpu_end_shutdown_deferral(v); return 0; } static int relinquish_memory(struct domain *d, struct page_list_head *list) { struct page_info *page, *tmp; int ret = 0; /* Use a recursive lock, as we may enter 'free_domheap_page'. */ spin_lock_recursive(&d->page_alloc_lock); page_list_for_each_safe( page, tmp, list ) { /* Grab a reference to the page so it won't disappear from under us. */ if ( unlikely(!get_page(page, d)) ) /* Couldn't get a reference -- someone is freeing this page. 
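 *
 * (On success the reference just taken keeps the page alive while its
 *  _PGC_allocated reference is dropped below; both references are then
 *  released so the page can be freed once no other user holds it.
 *  hypercall_preempt_check() lets this potentially long walk bail out
 *  with -EAGAIN and be restarted later.)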
*/ BUG(); if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); put_page(page); if ( hypercall_preempt_check() ) { ret = -EAGAIN; goto out; } } out: spin_unlock_recursive(&d->page_alloc_lock); return ret; } int domain_relinquish_resources(struct domain *d) { int ret = 0; switch ( d->arch.relmem ) { case RELMEM_not_started: d->arch.relmem = RELMEM_xen; /* Falltrough */ case RELMEM_xen: ret = relinquish_memory(d, &d->xenpage_list); if ( ret ) return ret; d->arch.relmem = RELMEM_page; /* Fallthrough */ case RELMEM_page: ret = relinquish_memory(d, &d->page_list); if ( ret ) return ret; d->arch.relmem = RELMEM_mapping; /* Fallthrough */ case RELMEM_mapping: ret = relinquish_p2m_mapping(d); if ( ret ) return ret; d->arch.relmem = RELMEM_done; /* Fallthrough */ case RELMEM_done: break; default: BUG(); } return 0; } void arch_dump_domain_info(struct domain *d) { struct vcpu *v; for_each_vcpu ( d, v ) { gic_dump_info(v); } } long do_arm_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) { switch ( cmd ) { case VCPUOP_register_vcpu_info: case VCPUOP_register_runstate_memory_area: return do_vcpu_op(cmd, vcpuid, arg); default: return -EINVAL; } } long arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg) { return -ENOSYS; } void arch_dump_vcpu_info(struct vcpu *v) { } void vcpu_mark_events_pending(struct vcpu *v) { int already_pending = test_and_set_bit( 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending)); if ( already_pending ) return; vgic_vcpu_inject_irq(v, v->domain->arch.evtchn_irq, 1); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/kernel.h0000664000175000017500000000210412307313555014357 0ustar smbsmb/* * Kernel image loading. * * Copyright (C) 2011 Citrix Systems, Inc. 
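 *
 * Rough usage (based on the declarations below): dom0 construction fills
 * in a struct kernel_info, calls kernel_prepare() to probe the image and
 * pick a loader (the zimage/elf union and the load hook), and then
 * kernel_load() to copy the kernel into the domain at the chosen
 * addresses.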
*/ #ifndef __ARCH_ARM_KERNEL_H__ #define __ARCH_ARM_KERNEL_H__ #include #include struct kernel_info { #ifdef CONFIG_ARM_64 enum domain_type type; #endif void *fdt; /* flat device tree */ paddr_t unassigned_mem; /* RAM not (yet) assigned to a bank */ struct dt_mem_info mem; paddr_t dtb_paddr; paddr_t entry; paddr_t initrd_paddr; void *kernel_img; unsigned kernel_order; union { struct { paddr_t kernel_addr; paddr_t load_addr; paddr_t len; } zimage; struct { struct elf_binary elf; struct elf_dom_parms parms; } elf; }; void (*load)(struct kernel_info *info); int load_attr; }; int kernel_prepare(struct kernel_info *info); void kernel_load(struct kernel_info *info); #endif /* #ifdef __ARCH_ARM_KERNEL_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/p2m.c0000664000175000017500000004057212307313555013603 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include /* First level P2M is 2 consecutive pages */ #define P2M_FIRST_ORDER 1 #define P2M_FIRST_ENTRIES (LPAE_ENTRIES<arch.p2m; lpae_t *first; printk("dom%d IPA 0x%"PRIpaddr"\n", d->domain_id, addr); if ( first_linear_offset(addr) > LPAE_ENTRIES ) { printk("Cannot dump addresses in second of first level pages...\n"); return; } printk("P2M @ %p mfn:0x%lx\n", p2m->first_level, page_to_mfn(p2m->first_level)); first = __map_domain_page(p2m->first_level); dump_pt_walk(first, addr); unmap_domain_page(first); } void p2m_load_VTTBR(struct domain *d) { if ( is_idle_domain(d) ) return; BUG_ON(!d->arch.vttbr); WRITE_SYSREG64(d->arch.vttbr, VTTBR_EL2); isb(); /* Ensure update is visible */ } static int p2m_first_level_index(paddr_t addr) { /* * 1st pages are concatenated so zeroeth offset gives us the * index of the 1st page */ return zeroeth_table_offset(addr); } /* * Map whichever of the first pages contain addr. The caller should * then use first_table_offset as an index. */ static lpae_t *p2m_map_first(struct p2m_domain *p2m, paddr_t addr) { struct page_info *page; if ( first_linear_offset(addr) >= P2M_FIRST_ENTRIES ) return NULL; page = p2m->first_level + p2m_first_level_index(addr); return __map_domain_page(page); } /* * Lookup the MFN corresponding to a domain's PFN. * * There are no processor functions to do a stage 2 only lookup therefore we * do a a software walk. 
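 *
 * With the 4K granule used here the IPA is decoded as follows (a worked
 * breakdown of the table offsets used below): bits [11:0] are the offset
 * inside the page, bits [20:12] index the third level, bits [29:21] the
 * second level, and bits [38:30] the first level; since two first-level
 * pages are concatenated (P2M_FIRST_ORDER), bit 39 selects which of the
 * two pages to map, giving a 40-bit / 1TB guest physical space.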
*/ paddr_t p2m_lookup(struct domain *d, paddr_t paddr, p2m_type_t *t) { struct p2m_domain *p2m = &d->arch.p2m; lpae_t pte, *first = NULL, *second = NULL, *third = NULL; paddr_t maddr = INVALID_PADDR; p2m_type_t _t; /* Allow t to be NULL */ t = t ?: &_t; *t = p2m_invalid; spin_lock(&p2m->lock); first = p2m_map_first(p2m, paddr); if ( !first ) goto err; pte = first[first_table_offset(paddr)]; if ( !pte.p2m.valid || !pte.p2m.table ) goto done; second = map_domain_page(pte.p2m.base); pte = second[second_table_offset(paddr)]; if ( !pte.p2m.valid || !pte.p2m.table ) goto done; third = map_domain_page(pte.p2m.base); pte = third[third_table_offset(paddr)]; /* This bit must be one in the level 3 entry */ if ( !pte.p2m.table ) pte.bits = 0; done: if ( pte.p2m.valid ) { ASSERT(pte.p2m.type != p2m_invalid); maddr = (pte.bits & PADDR_MASK & PAGE_MASK) | (paddr & ~PAGE_MASK); *t = pte.p2m.type; } if (third) unmap_domain_page(third); if (second) unmap_domain_page(second); if (first) unmap_domain_page(first); err: spin_unlock(&p2m->lock); return maddr; } int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn, unsigned int order) { return -ENOSYS; } int p2m_pod_decrease_reservation(struct domain *d, xen_pfn_t gpfn, unsigned int order) { return -ENOSYS; } static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr, p2m_type_t t) { paddr_t pa = ((paddr_t) mfn) << PAGE_SHIFT; /* xn and write bit will be defined in the switch */ lpae_t e = (lpae_t) { .p2m.af = 1, .p2m.sh = LPAE_SH_OUTER, .p2m.read = 1, .p2m.mattr = mattr, .p2m.table = 1, .p2m.valid = 1, .p2m.type = t, }; BUILD_BUG_ON(p2m_max_real_type > (1 << 4)); switch (t) { case p2m_ram_rw: e.p2m.xn = 0; e.p2m.write = 1; break; case p2m_ram_ro: e.p2m.xn = 0; e.p2m.write = 0; break; case p2m_map_foreign: case p2m_grant_map_rw: case p2m_mmio_direct: e.p2m.xn = 1; e.p2m.write = 1; break; case p2m_grant_map_ro: case p2m_invalid: e.p2m.xn = 1; e.p2m.write = 0; break; case p2m_max_real_type: BUG(); break; } ASSERT(!(pa & ~PAGE_MASK)); ASSERT(!(pa & ~PADDR_MASK)); e.bits |= pa; return e; } /* Allocate a new page table page and hook it in via the given entry */ static int p2m_create_table(struct domain *d, lpae_t *entry) { struct p2m_domain *p2m = &d->arch.p2m; struct page_info *page; void *p; lpae_t pte; BUG_ON(entry->p2m.valid); page = alloc_domheap_page(NULL, 0); if ( page == NULL ) return -ENOMEM; page_list_add(page, &p2m->pages); p = __map_domain_page(page); clear_page(p); unmap_domain_page(p); pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid); write_pte(entry, pte); return 0; } enum p2m_operation { INSERT, ALLOCATE, REMOVE, RELINQUISH, CACHEFLUSH, }; static int apply_p2m_changes(struct domain *d, enum p2m_operation op, paddr_t start_gpaddr, paddr_t end_gpaddr, paddr_t maddr, int mattr, p2m_type_t t) { int rc; struct p2m_domain *p2m = &d->arch.p2m; lpae_t *first = NULL, *second = NULL, *third = NULL; paddr_t addr; unsigned long cur_first_page = ~0, cur_first_offset = ~0, cur_second_offset = ~0; unsigned long count = 0; unsigned int flush = 0; bool_t populate = (op == INSERT || op == ALLOCATE); lpae_t pte; spin_lock(&p2m->lock); if ( d != current->domain ) p2m_load_VTTBR(d); addr = start_gpaddr; while ( addr < end_gpaddr ) { if ( cur_first_page != p2m_first_level_index(addr) ) { if ( first ) unmap_domain_page(first); first = p2m_map_first(p2m, addr); if ( !first ) { rc = -EINVAL; goto out; } cur_first_page = p2m_first_level_index(addr); } if ( !first[first_table_offset(addr)].p2m.valid ) { if ( !populate ) { addr 
= (addr + FIRST_SIZE) & FIRST_MASK; continue; } rc = p2m_create_table(d, &first[first_table_offset(addr)]); if ( rc < 0 ) { printk("p2m_populate_ram: L1 failed\n"); goto out; } } BUG_ON(!first[first_table_offset(addr)].p2m.valid); if ( cur_first_offset != first_table_offset(addr) ) { if (second) unmap_domain_page(second); second = map_domain_page(first[first_table_offset(addr)].p2m.base); cur_first_offset = first_table_offset(addr); } /* else: second already valid */ if ( !second[second_table_offset(addr)].p2m.valid ) { if ( !populate ) { addr = (addr + SECOND_SIZE) & SECOND_MASK; continue; } rc = p2m_create_table(d, &second[second_table_offset(addr)]); if ( rc < 0 ) { printk("p2m_populate_ram: L2 failed\n"); goto out; } } BUG_ON(!second[second_table_offset(addr)].p2m.valid); if ( cur_second_offset != second_table_offset(addr) ) { /* map third level */ if (third) unmap_domain_page(third); third = map_domain_page(second[second_table_offset(addr)].p2m.base); cur_second_offset = second_table_offset(addr); } pte = third[third_table_offset(addr)]; flush |= pte.p2m.valid; /* TODO: Handle other p2m type * * It's safe to do the put_page here because page_alloc will * flush the TLBs if the page is reallocated before the end of * this loop. */ if ( pte.p2m.valid && p2m_is_foreign(pte.p2m.type) ) { unsigned long mfn = pte.p2m.base; ASSERT(mfn_valid(mfn)); put_page(mfn_to_page(mfn)); } /* Allocate a new RAM page and attach */ switch (op) { case ALLOCATE: { struct page_info *page; ASSERT(!pte.p2m.valid); rc = -ENOMEM; page = alloc_domheap_page(d, 0); if ( page == NULL ) { printk("p2m_populate_ram: failed to allocate page\n"); goto out; } pte = mfn_to_p2m_entry(page_to_mfn(page), mattr, t); write_pte(&third[third_table_offset(addr)], pte); } break; case INSERT: { pte = mfn_to_p2m_entry(maddr >> PAGE_SHIFT, mattr, t); write_pte(&third[third_table_offset(addr)], pte); maddr += PAGE_SIZE; } break; case RELINQUISH: case REMOVE: { if ( !pte.p2m.valid ) { count++; break; } count += 0x10; memset(&pte, 0x00, sizeof(pte)); write_pte(&third[third_table_offset(addr)], pte); count++; } break; case CACHEFLUSH: { if ( !pte.p2m.valid || !p2m_is_ram(pte.p2m.type) ) break; flush_page_to_ram(pte.p2m.base); } break; } /* Preempt every 2MiB (mapped) or 32 MiB (unmapped) - arbitrary */ if ( op == RELINQUISH && count >= 0x2000 ) { if ( hypercall_preempt_check() ) { p2m->lowest_mapped_gfn = addr >> PAGE_SHIFT; rc = -EAGAIN; goto out; } count = 0; } /* Got the next page */ addr += PAGE_SIZE; } if ( flush ) { /* At the beginning of the function, Xen is updating VTTBR * with the domain where the mappings are created. In this * case it's only necessary to flush TLBs on every CPUs with * the current VMID (our domain). 
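 *
 * Note that the flush is deferred: "flush" is set whenever a previously
 * valid third-level entry was visited, and a single flush_tlb() is issued
 * after the loop rather than one per entry.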
*/ flush_tlb(); } if ( op == ALLOCATE || op == INSERT ) { unsigned long sgfn = paddr_to_pfn(start_gpaddr); unsigned long egfn = paddr_to_pfn(end_gpaddr); p2m->max_mapped_gfn = MAX(p2m->max_mapped_gfn, egfn); p2m->lowest_mapped_gfn = MIN(p2m->lowest_mapped_gfn, sgfn); } rc = 0; out: if (third) unmap_domain_page(third); if (second) unmap_domain_page(second); if (first) unmap_domain_page(first); if ( d != current->domain ) p2m_load_VTTBR(current->domain); spin_unlock(&p2m->lock); return rc; } int p2m_populate_ram(struct domain *d, paddr_t start, paddr_t end) { return apply_p2m_changes(d, ALLOCATE, start, end, 0, MATTR_MEM, p2m_ram_rw); } int map_mmio_regions(struct domain *d, paddr_t start_gaddr, paddr_t end_gaddr, paddr_t maddr) { return apply_p2m_changes(d, INSERT, start_gaddr, end_gaddr, maddr, MATTR_DEV, p2m_mmio_direct); } int guest_physmap_add_entry(struct domain *d, unsigned long gpfn, unsigned long mfn, unsigned long page_order, p2m_type_t t) { return apply_p2m_changes(d, INSERT, pfn_to_paddr(gpfn), pfn_to_paddr(gpfn + (1 << page_order)), pfn_to_paddr(mfn), MATTR_MEM, t); } void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn, unsigned int page_order) { apply_p2m_changes(d, REMOVE, pfn_to_paddr(gpfn), pfn_to_paddr(gpfn + (1<arch.p2m; struct page_info *page; void *p; page = alloc_domheap_pages(NULL, P2M_FIRST_ORDER, 0); if ( page == NULL ) return -ENOMEM; spin_lock(&p2m->lock); /* Clear both first level pages */ p = __map_domain_page(page); clear_page(p); unmap_domain_page(p); p = __map_domain_page(page + 1); clear_page(p); unmap_domain_page(p); p2m->first_level = page; d->arch.vttbr = page_to_maddr(p2m->first_level) | ((uint64_t)p2m->vmid&0xff)<<48; p2m_load_VTTBR(d); /* Make sure that all TLBs corresponding to the new VMID are flushed * before using it */ flush_tlb(); p2m_load_VTTBR(current->domain); spin_unlock(&p2m->lock); return 0; } #define MAX_VMID 256 #define INVALID_VMID 0 /* VMID 0 is reserved */ static spinlock_t vmid_alloc_lock = SPIN_LOCK_UNLOCKED; /* VTTBR_EL2 VMID field is 8 bits. Using a bitmap here limits us to * 256 concurrent domains. 
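 *
 * The allocated VMID ends up in bits [55:48] of VTTBR_EL2, matching the
 * ((uint64_t)p2m->vmid & 0xff) << 48 used when d->arch.vttbr is set up above.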
*/ static DECLARE_BITMAP(vmid_mask, MAX_VMID); void p2m_vmid_allocator_init(void) { set_bit(INVALID_VMID, vmid_mask); } static int p2m_alloc_vmid(struct domain *d) { struct p2m_domain *p2m = &d->arch.p2m; int rc, nr; spin_lock(&vmid_alloc_lock); nr = find_first_zero_bit(vmid_mask, MAX_VMID); ASSERT(nr != INVALID_VMID); if ( nr == MAX_VMID ) { rc = -EBUSY; printk(XENLOG_ERR "p2m.c: dom%d: VMID pool exhausted\n", d->domain_id); goto out; } set_bit(nr, vmid_mask); p2m->vmid = nr; rc = 0; out: spin_unlock(&vmid_alloc_lock); return rc; } static void p2m_free_vmid(struct domain *d) { struct p2m_domain *p2m = &d->arch.p2m; spin_lock(&vmid_alloc_lock); if ( p2m->vmid != INVALID_VMID ) clear_bit(p2m->vmid, vmid_mask); spin_unlock(&vmid_alloc_lock); } void p2m_teardown(struct domain *d) { struct p2m_domain *p2m = &d->arch.p2m; struct page_info *pg; spin_lock(&p2m->lock); while ( (pg = page_list_remove_head(&p2m->pages)) ) free_domheap_page(pg); free_domheap_pages(p2m->first_level, P2M_FIRST_ORDER); p2m->first_level = NULL; p2m_free_vmid(d); spin_unlock(&p2m->lock); } int p2m_init(struct domain *d) { struct p2m_domain *p2m = &d->arch.p2m; int rc = 0; spin_lock_init(&p2m->lock); INIT_PAGE_LIST_HEAD(&p2m->pages); spin_lock(&p2m->lock); p2m->vmid = INVALID_VMID; rc = p2m_alloc_vmid(d); if ( rc != 0 ) goto err; d->arch.vttbr = 0; p2m->first_level = NULL; p2m->max_mapped_gfn = 0; p2m->lowest_mapped_gfn = ULONG_MAX; err: spin_unlock(&p2m->lock); return rc; } int relinquish_p2m_mapping(struct domain *d) { struct p2m_domain *p2m = &d->arch.p2m; return apply_p2m_changes(d, RELINQUISH, pfn_to_paddr(p2m->lowest_mapped_gfn), pfn_to_paddr(p2m->max_mapped_gfn), pfn_to_paddr(INVALID_MFN), MATTR_MEM, p2m_invalid); } int p2m_cache_flush(struct domain *d, xen_pfn_t start_mfn, xen_pfn_t end_mfn) { struct p2m_domain *p2m = &d->arch.p2m; start_mfn = MAX(start_mfn, p2m->lowest_mapped_gfn); end_mfn = MIN(end_mfn, p2m->max_mapped_gfn); return apply_p2m_changes(d, CACHEFLUSH, pfn_to_paddr(start_mfn), pfn_to_paddr(end_mfn), pfn_to_paddr(INVALID_MFN), MATTR_MEM, p2m_invalid); } unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn) { paddr_t p = p2m_lookup(d, pfn_to_paddr(gpfn), NULL); return p >> PAGE_SHIFT; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/io.c0000664000175000017500000000245412307313555013511 0ustar smbsmb/* * xen/arch/arm/io.h * * ARM I/O handlers * * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include "io.h" static const struct mmio_handler *const mmio_handlers[] = { &vgic_distr_mmio_handler, &vuart_mmio_handler, }; #define MMIO_HANDLER_NR ARRAY_SIZE(mmio_handlers) int handle_mmio(mmio_info_t *info) { struct vcpu *v = current; int i; for ( i = 0; i < MMIO_HANDLER_NR; i++ ) if ( mmio_handlers[i]->check_handler(v, info->gpa) ) return info->dabt.write ? 
mmio_handlers[i]->write_handler(v, info) : mmio_handlers[i]->read_handler(v, info); return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/io.h0000664000175000017500000000257012307313555013515 0ustar smbsmb/* * xen/arch/arm/io.h * * ARM I/O handlers * * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #ifndef __ARCH_ARM_IO_H__ #define __ARCH_ARM_IO_H__ #include #include #include typedef struct { struct hsr_dabt dabt; vaddr_t gva; paddr_t gpa; } mmio_info_t; typedef int (*mmio_read_t)(struct vcpu *v, mmio_info_t *info); typedef int (*mmio_write_t)(struct vcpu *v, mmio_info_t *info); typedef int (*mmio_check_t)(struct vcpu *v, paddr_t addr); struct mmio_handler { mmio_check_t check_handler; mmio_read_t read_handler; mmio_write_t write_handler; }; extern const struct mmio_handler vgic_distr_mmio_handler; extern const struct mmio_handler vuart_mmio_handler; extern int handle_mmio(mmio_info_t *info); #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/vgic.c0000664000175000017500000005540112307313555014032 0ustar smbsmb/* * xen/arch/arm/vgic.c * * ARM Virtual Generic Interrupt Controller support * * Ian Campbell * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include "io.h" #include #define REG(n) (n/4) /* Number of ranks of interrupt registers for a domain */ #define DOMAIN_NR_RANKS(d) (((d)->arch.vgic.nr_lines+31)/32) /* * Rank containing GICD_ for GICD_ with * -bits-per-interrupt */ static inline int REG_RANK_NR(int b, uint32_t n) { switch ( b ) { case 8: return n >> 3; case 4: return n >> 2; case 2: return n >> 1; case 1: return n; default: BUG(); } } /* * Offset of GICD_ with its rank, for GICD_ with * -bits-per-interrupt. */ #define REG_RANK_INDEX(b, n) ((n) & ((b)-1)) /* * Returns rank corresponding to a GICD_ register for * GICD_ with -bits-per-interrupt. 
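 *
 * For example: with 1 bit per interrupt (ISENABLER/ICENABLER) word n maps
 * straight to rank n, each rank covering IRQs 32n..32n+31; with 8 bits per
 * interrupt (IPRIORITYR/ITARGETSR) eight consecutive words share one rank,
 * since REG_RANK_NR(8, n) == n >> 3.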
*/ static struct vgic_irq_rank *vgic_irq_rank(struct vcpu *v, int b, int n) { int rank = REG_RANK_NR(b, n); if ( rank == 0 ) return &v->arch.vgic.private_irqs; else if ( rank <= DOMAIN_NR_RANKS(v->domain) ) return &v->domain->arch.vgic.shared_irqs[rank - 1]; else return NULL; } int domain_vgic_init(struct domain *d) { int i; d->arch.vgic.ctlr = 0; /* Currently nr_lines in vgic and gic doesn't have the same meanings * Here nr_lines = number of SPIs */ if ( d->domain_id == 0 ) d->arch.vgic.nr_lines = gic_number_lines() - 32; else d->arch.vgic.nr_lines = 0; /* We don't need SPIs for the guest */ d->arch.vgic.shared_irqs = xzalloc_array(struct vgic_irq_rank, DOMAIN_NR_RANKS(d)); d->arch.vgic.pending_irqs = xzalloc_array(struct pending_irq, d->arch.vgic.nr_lines); for (i=0; iarch.vgic.nr_lines; i++) { INIT_LIST_HEAD(&d->arch.vgic.pending_irqs[i].inflight); INIT_LIST_HEAD(&d->arch.vgic.pending_irqs[i].lr_queue); } for (i=0; iarch.vgic.shared_irqs[i].lock); return 0; } void domain_vgic_free(struct domain *d) { xfree(d->arch.vgic.shared_irqs); xfree(d->arch.vgic.pending_irqs); } int vcpu_vgic_init(struct vcpu *v) { int i; memset(&v->arch.vgic.private_irqs, 0, sizeof(v->arch.vgic.private_irqs)); spin_lock_init(&v->arch.vgic.private_irqs.lock); memset(&v->arch.vgic.pending_irqs, 0, sizeof(v->arch.vgic.pending_irqs)); for (i = 0; i < 32; i++) { INIT_LIST_HEAD(&v->arch.vgic.pending_irqs[i].inflight); INIT_LIST_HEAD(&v->arch.vgic.pending_irqs[i].lr_queue); } /* For SGI and PPI the target is always this CPU */ for ( i = 0 ; i < 8 ; i++ ) v->arch.vgic.private_irqs.itargets[i] = (1<<(v->vcpu_id+0)) | (1<<(v->vcpu_id+8)) | (1<<(v->vcpu_id+16)) | (1<<(v->vcpu_id+24)); INIT_LIST_HEAD(&v->arch.vgic.inflight_irqs); INIT_LIST_HEAD(&v->arch.vgic.lr_pending); spin_lock_init(&v->arch.vgic.lock); return 0; } #define vgic_lock(v) spin_lock_irq(&(v)->domain->arch.vgic.lock) #define vgic_unlock(v) spin_unlock_irq(&(v)->domain->arch.vgic.lock) #define vgic_lock_rank(v, r) spin_lock(&(r)->lock) #define vgic_unlock_rank(v, r) spin_unlock(&(r)->lock) static uint32_t byte_read(uint32_t val, int sign, int offset) { int byte = offset & 0x3; val = val >> (8*byte); if ( sign && (val & 0x80) ) val |= 0xffffff00; else val &= 0x000000ff; return val; } static void byte_write(uint32_t *reg, uint32_t var, int offset) { int byte = offset & 0x3; var &= (0xff << (8*byte)); *reg &= ~(0xff << (8*byte)); *reg |= var; } static int vgic_distr_mmio_read(struct vcpu *v, mmio_info_t *info) { struct hsr_dabt dabt = info->dabt; struct cpu_user_regs *regs = guest_cpu_user_regs(); register_t *r = select_user_reg(regs, dabt.reg); struct vgic_irq_rank *rank; int offset = (int)(info->gpa - v->domain->arch.vgic.dbase); int gicd_reg = REG(offset); switch ( gicd_reg ) { case GICD_CTLR: if ( dabt.size != 2 ) goto bad_width; vgic_lock(v); *r = v->domain->arch.vgic.ctlr; vgic_unlock(v); return 1; case GICD_TYPER: if ( dabt.size != 2 ) goto bad_width; /* No secure world support for guests. */ vgic_lock(v); *r = ( (v->domain->max_vcpus<<5) & GICD_TYPE_CPUS ) |( ((v->domain->arch.vgic.nr_lines/32)) & GICD_TYPE_LINES ); vgic_unlock(v); return 1; case GICD_IIDR: if ( dabt.size != 2 ) goto bad_width; /* * XXX Do we need a JEP106 manufacturer ID? * Just use the physical h/w value for now */ *r = 0x0000043b; return 1; /* Implementation defined -- read as zero */ case REG(0x020) ... REG(0x03c): goto read_as_zero; case GICD_IGROUPR ... GICD_IGROUPRN: /* We do not implement security extensions for guests, read zero */ goto read_as_zero; case GICD_ISENABLER ... 
GICD_ISENABLERN: if ( dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ISENABLER); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = rank->ienable; vgic_unlock_rank(v, rank); return 1; case GICD_ICENABLER ... GICD_ICENABLERN: if ( dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ICENABLER); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = rank->ienable; vgic_unlock_rank(v, rank); return 1; case GICD_ISPENDR ... GICD_ISPENDRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ISPENDR); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = byte_read(rank->ipend, dabt.sign, offset); vgic_unlock_rank(v, rank); return 1; case GICD_ICPENDR ... GICD_ICPENDRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ICPENDR); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = byte_read(rank->ipend, dabt.sign, offset); vgic_unlock_rank(v, rank); return 1; case GICD_ISACTIVER ... GICD_ISACTIVERN: if ( dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ISACTIVER); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = rank->iactive; vgic_unlock_rank(v, rank); return 1; case GICD_ICACTIVER ... GICD_ICACTIVERN: if ( dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ICACTIVER); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = rank->iactive; vgic_unlock_rank(v, rank); return 1; case GICD_ITARGETSR ... GICD_ITARGETSRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ITARGETSR); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)]; if ( dabt.size == 0 ) *r = byte_read(*r, dabt.sign, offset); vgic_unlock_rank(v, rank); return 1; case GICD_IPRIORITYR ... GICD_IPRIORITYRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 8, gicd_reg - GICD_IPRIORITYR); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = rank->ipriority[REG_RANK_INDEX(8, gicd_reg - GICD_IPRIORITYR)]; if ( dabt.size == 0 ) *r = byte_read(*r, dabt.sign, offset); vgic_unlock_rank(v, rank); return 1; case GICD_ICFGR ... GICD_ICFGRN: if ( dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 2, gicd_reg - GICD_ICFGR); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = rank->icfg[REG_RANK_INDEX(2, gicd_reg - GICD_ICFGR)]; vgic_unlock_rank(v, rank); return 1; case GICD_NSACR ... GICD_NSACRN: /* We do not implement security extensions for guests, read zero */ goto read_as_zero; case GICD_SGIR: if ( dabt.size != 2 ) goto bad_width; /* Write only -- read unknown */ *r = 0xdeadbeef; return 1; case GICD_CPENDSGIR ... GICD_CPENDSGIRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_CPENDSGIR); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = byte_read(rank->pendsgi, dabt.sign, offset); vgic_unlock_rank(v, rank); return 1; case GICD_SPENDSGIR ... GICD_SPENDSGIRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_SPENDSGIR); if ( rank == NULL) goto read_as_zero; vgic_lock_rank(v, rank); *r = byte_read(rank->pendsgi, dabt.sign, offset); vgic_unlock_rank(v, rank); return 1; /* Implementation defined -- read as zero */ case REG(0xfd0) ... 
REG(0xfe4): goto read_as_zero; case GICD_ICPIDR2: if ( dabt.size != 2 ) goto bad_width; printk("vGICD: unhandled read from ICPIDR2\n"); return 0; /* Implementation defined -- read as zero */ case REG(0xfec) ... REG(0xffc): goto read_as_zero; /* Reserved -- read as zero */ case REG(0x00c) ... REG(0x01c): case REG(0x040) ... REG(0x07c): case REG(0x7fc): case REG(0xbfc): case REG(0xf04) ... REG(0xf0c): case REG(0xf30) ... REG(0xfcc): goto read_as_zero; default: printk("vGICD: unhandled read r%d offset %#08x\n", dabt.reg, offset); return 0; } bad_width: printk("vGICD: bad read width %d r%d offset %#08x\n", dabt.size, dabt.reg, offset); domain_crash_synchronous(); return 0; read_as_zero: if ( dabt.size != 2 ) goto bad_width; *r = 0; return 1; } static void vgic_disable_irqs(struct vcpu *v, uint32_t r, int n) { const unsigned long mask = r; struct pending_irq *p; unsigned int irq; int i = 0; while ( (i = find_next_bit(&mask, 32, i)) < 32 ) { irq = i + (32 * n); p = irq_to_pending(v, irq); clear_bit(GIC_IRQ_GUEST_ENABLED, &p->status); gic_remove_from_queues(v, irq); if ( p->desc != NULL ) p->desc->handler->disable(p->desc); i++; } } static void vgic_enable_irqs(struct vcpu *v, uint32_t r, int n) { const unsigned long mask = r; struct pending_irq *p; unsigned int irq; int i = 0; while ( (i = find_next_bit(&mask, 32, i)) < 32 ) { irq = i + (32 * n); p = irq_to_pending(v, irq); set_bit(GIC_IRQ_GUEST_ENABLED, &p->status); if ( !list_empty(&p->inflight) && !test_bit(GIC_IRQ_GUEST_VISIBLE, &p->status) ) gic_set_guest_irq(v, irq, GICH_LR_PENDING, p->priority); if ( p->desc != NULL ) p->desc->handler->enable(p->desc); i++; } } static inline int is_vcpu_running(struct domain *d, int vcpuid) { struct vcpu *v; if ( vcpuid >= d->max_vcpus ) return 0; v = d->vcpu[vcpuid]; if ( v == NULL ) return 0; if (test_bit(_VPF_down, &v->pause_flags) ) return 0; return 1; } static int vgic_to_sgi(struct vcpu *v, register_t sgir) { struct domain *d = v->domain; int virtual_irq; int filter; int vcpuid; int i; unsigned long vcpu_mask = 0; ASSERT(d->max_vcpus < 8*sizeof(vcpu_mask)); filter = (sgir & GICD_SGI_TARGET_LIST_MASK); virtual_irq = (sgir & GICD_SGI_INTID_MASK); ASSERT( virtual_irq < 16 ); switch ( filter ) { case GICD_SGI_TARGET_LIST: vcpu_mask = (sgir & GICD_SGI_TARGET_MASK) >> GICD_SGI_TARGET_SHIFT; break; case GICD_SGI_TARGET_OTHERS: for ( i = 0; i < d->max_vcpus; i++ ) { if ( i != current->vcpu_id && is_vcpu_running(d, i) ) set_bit(i, &vcpu_mask); } break; case GICD_SGI_TARGET_SELF: set_bit(current->vcpu_id, &vcpu_mask); break; default: gdprintk(XENLOG_WARNING, "vGICD: unhandled GICD_SGIR write %"PRIregister" with wrong TargetListFilter field\n", sgir); return 0; } for_each_set_bit( vcpuid, &vcpu_mask, d->max_vcpus ) { if ( !is_vcpu_running(d, vcpuid) ) { gdprintk(XENLOG_WARNING, "vGICD: GICD_SGIR write r=%"PRIregister" vcpu_mask=%lx, wrong CPUTargetList\n", sgir, vcpu_mask); continue; } vgic_vcpu_inject_irq(d->vcpu[vcpuid], virtual_irq, 1); } return 1; } static int vgic_distr_mmio_write(struct vcpu *v, mmio_info_t *info) { struct hsr_dabt dabt = info->dabt; struct cpu_user_regs *regs = guest_cpu_user_regs(); register_t *r = select_user_reg(regs, dabt.reg); struct vgic_irq_rank *rank; int offset = (int)(info->gpa - v->domain->arch.vgic.dbase); int gicd_reg = REG(offset); uint32_t tr; switch ( gicd_reg ) { case GICD_CTLR: if ( dabt.size != 2 ) goto bad_width; /* Ignore all but the enable bit */ v->domain->arch.vgic.ctlr = (*r) & GICD_CTL_ENABLE; return 1; /* R/O -- write ignored */ case GICD_TYPER: case 
GICD_IIDR: goto write_ignore; /* Implementation defined -- write ignored */ case REG(0x020) ... REG(0x03c): goto write_ignore; case GICD_IGROUPR ... GICD_IGROUPRN: /* We do not implement security extensions for guests, write ignore */ goto write_ignore; case GICD_ISENABLER ... GICD_ISENABLERN: if ( dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ISENABLER); if ( rank == NULL) goto write_ignore; vgic_lock_rank(v, rank); tr = rank->ienable; rank->ienable |= *r; vgic_unlock_rank(v, rank); vgic_enable_irqs(v, (*r) & (~tr), gicd_reg - GICD_ISENABLER); return 1; case GICD_ICENABLER ... GICD_ICENABLERN: if ( dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ICENABLER); if ( rank == NULL) goto write_ignore; vgic_lock_rank(v, rank); tr = rank->ienable; rank->ienable &= ~*r; vgic_unlock_rank(v, rank); vgic_disable_irqs(v, (*r) & tr, gicd_reg - GICD_ICENABLER); return 1; case GICD_ISPENDR ... GICD_ISPENDRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; printk("vGICD: unhandled %s write %#"PRIregister" to ISPENDR%d\n", dabt.size ? "word" : "byte", *r, gicd_reg - GICD_ISPENDR); return 0; case GICD_ICPENDR ... GICD_ICPENDRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; printk("vGICD: unhandled %s write %#"PRIregister" to ICPENDR%d\n", dabt.size ? "word" : "byte", *r, gicd_reg - GICD_ICPENDR); return 0; case GICD_ISACTIVER ... GICD_ISACTIVERN: if ( dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ISACTIVER); if ( rank == NULL) goto write_ignore; vgic_lock_rank(v, rank); rank->iactive &= ~*r; vgic_unlock_rank(v, rank); return 1; case GICD_ICACTIVER ... GICD_ICACTIVERN: if ( dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 1, gicd_reg - GICD_ICACTIVER); if ( rank == NULL) goto write_ignore; vgic_lock_rank(v, rank); rank->iactive &= ~*r; vgic_unlock_rank(v, rank); return 1; case GICD_ITARGETSR ... GICD_ITARGETSR + 7: /* SGI/PPI target is read only */ goto write_ignore; case GICD_ITARGETSR + 8 ... GICD_ITARGETSRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 8, gicd_reg - GICD_ITARGETSR); if ( rank == NULL) goto write_ignore; vgic_lock_rank(v, rank); if ( dabt.size == 2 ) rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)] = *r; else byte_write(&rank->itargets[REG_RANK_INDEX(8, gicd_reg - GICD_ITARGETSR)], *r, offset); vgic_unlock_rank(v, rank); return 1; case GICD_IPRIORITYR ... GICD_IPRIORITYRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 8, gicd_reg - GICD_IPRIORITYR); if ( rank == NULL) goto write_ignore; vgic_lock_rank(v, rank); if ( dabt.size == 2 ) rank->ipriority[REG_RANK_INDEX(8, gicd_reg - GICD_IPRIORITYR)] = *r; else byte_write(&rank->ipriority[REG_RANK_INDEX(8, gicd_reg - GICD_IPRIORITYR)], *r, offset); vgic_unlock_rank(v, rank); return 1; case GICD_ICFGR: /* SGIs */ goto write_ignore; case GICD_ICFGR + 1: /* PPIs */ /* It is implementation defined if these are writeable. We chose not */ goto write_ignore; case GICD_ICFGR + 2 ... GICD_ICFGRN: /* SPIs */ if ( dabt.size != 2 ) goto bad_width; rank = vgic_irq_rank(v, 2, gicd_reg - GICD_ICFGR); vgic_lock_rank(v, rank); if ( rank == NULL) goto write_ignore; rank->icfg[REG_RANK_INDEX(2, gicd_reg - GICD_ICFGR)] = *r; vgic_unlock_rank(v, rank); return 1; case GICD_NSACR ... 
GICD_NSACRN: /* We do not implement security extensions for guests, write ignore */ goto write_ignore; case GICD_SGIR: if ( dabt.size != 2 ) goto bad_width; return vgic_to_sgi(v, *r); case GICD_CPENDSGIR ... GICD_CPENDSGIRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; printk("vGICD: unhandled %s write %#"PRIregister" to ICPENDSGIR%d\n", dabt.size ? "word" : "byte", *r, gicd_reg - GICD_CPENDSGIR); return 0; case GICD_SPENDSGIR ... GICD_SPENDSGIRN: if ( dabt.size != 0 && dabt.size != 2 ) goto bad_width; printk("vGICD: unhandled %s write %#"PRIregister" to ISPENDSGIR%d\n", dabt.size ? "word" : "byte", *r, gicd_reg - GICD_SPENDSGIR); return 0; /* Implementation defined -- write ignored */ case REG(0xfd0) ... REG(0xfe4): goto write_ignore; /* R/O -- write ignore */ case GICD_ICPIDR2: goto write_ignore; /* Implementation defined -- write ignored */ case REG(0xfec) ... REG(0xffc): goto write_ignore; /* Reserved -- write ignored */ case REG(0x00c) ... REG(0x01c): case REG(0x040) ... REG(0x07c): case REG(0x7fc): case REG(0xbfc): case REG(0xf04) ... REG(0xf0c): case REG(0xf30) ... REG(0xfcc): goto write_ignore; default: printk("vGICD: unhandled write r%d=%"PRIregister" offset %#08x\n", dabt.reg, *r, offset); return 0; } bad_width: printk("vGICD: bad write width %d r%d=%"PRIregister" offset %#08x\n", dabt.size, dabt.reg, *r, offset); domain_crash_synchronous(); return 0; write_ignore: if ( dabt.size != 2 ) goto bad_width; return 1; } static int vgic_distr_mmio_check(struct vcpu *v, paddr_t addr) { struct domain *d = v->domain; return (addr >= (d->arch.vgic.dbase)) && (addr < (d->arch.vgic.dbase + PAGE_SIZE)); } const struct mmio_handler vgic_distr_mmio_handler = { .check_handler = vgic_distr_mmio_check, .read_handler = vgic_distr_mmio_read, .write_handler = vgic_distr_mmio_write, }; struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq) { struct pending_irq *n; /* Pending irqs allocation strategy: the first vgic.nr_lines irqs * are used for SPIs; the rests are used for per cpu irqs */ if ( irq < 32 ) n = &v->arch.vgic.pending_irqs[irq]; else n = &v->domain->arch.vgic.pending_irqs[irq - 32]; return n; } void vgic_clear_pending_irqs(struct vcpu *v) { struct pending_irq *p, *t; unsigned long flags; spin_lock_irqsave(&v->arch.vgic.lock, flags); list_for_each_entry_safe ( p, t, &v->arch.vgic.inflight_irqs, inflight ) list_del_init(&p->inflight); gic_clear_pending_irqs(v); spin_unlock_irqrestore(&v->arch.vgic.lock, flags); } void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq, int virtual) { int idx = irq >> 2, byte = irq & 0x3; uint8_t priority; struct vgic_irq_rank *rank = vgic_irq_rank(v, 8, idx); struct pending_irq *iter, *n = irq_to_pending(v, irq); unsigned long flags; bool_t running; spin_lock_irqsave(&v->arch.vgic.lock, flags); if ( !list_empty(&n->inflight) ) { if ( (irq != current->domain->arch.evtchn_irq) || (!test_bit(GIC_IRQ_GUEST_VISIBLE, &n->status)) ) set_bit(GIC_IRQ_GUEST_PENDING, &n->status); spin_unlock_irqrestore(&v->arch.vgic.lock, flags); return; } /* vcpu offline */ if ( test_bit(_VPF_down, &v->pause_flags) ) { spin_unlock_irqrestore(&v->arch.vgic.lock, flags); return; } priority = byte_read(rank->ipriority[REG_RANK_INDEX(8, idx)], 0, byte); n->irq = irq; set_bit(GIC_IRQ_GUEST_PENDING, &n->status); n->priority = priority; /* the irq is enabled */ if ( test_bit(GIC_IRQ_GUEST_ENABLED, &n->status) ) gic_set_guest_irq(v, irq, GICH_LR_PENDING, priority); list_for_each_entry ( iter, &v->arch.vgic.inflight_irqs, inflight ) { if ( iter->priority > priority ) 
{ list_add_tail(&n->inflight, &iter->inflight); goto out; } } list_add_tail(&n->inflight, &v->arch.vgic.inflight_irqs); out: spin_unlock_irqrestore(&v->arch.vgic.lock, flags); /* we have a new higher priority irq, inject it into the guest */ running = v->is_running; vcpu_unblock(v); if ( running && v != current ) smp_send_event_check_mask(cpumask_of(v->processor)); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/setup.c0000664000175000017500000005624712307313555014253 0ustar smbsmb/* * xen/arch/arm/setup.c * * Early bringup code for an ARMv7-A with virt extensions. * * Tim Deegan * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct cpuinfo_arm __read_mostly boot_cpu_data; static __used void init_done(void) { free_init_memory(); startup_cpu_idle_loop(); } static void __init init_idle_domain(void) { scheduler_init(); set_current(idle_vcpu[0]); /* TODO: setup_idle_pagetable(); */ } static const char * __initdata processor_implementers[] = { ['A'] = "ARM Limited", ['B'] = "Broadcom Corporation", ['D'] = "Digital Equipment Corp", ['M'] = "Motorola, Freescale Semiconductor Inc.", ['P'] = "Applied Micro", ['Q'] = "Qualcomm Inc.", ['V'] = "Marvell Semiconductor Inc.", ['i'] = "Intel Corporation", }; static void __init processor_id(void) { const char *implementer = "Unknown"; struct cpuinfo_arm *c = &boot_cpu_data; identify_cpu(c); current_cpu_data = *c; if ( c->midr.implementer < ARRAY_SIZE(processor_implementers) && processor_implementers[c->midr.implementer] ) implementer = processor_implementers[c->midr.implementer]; if ( c->midr.architecture != 0xf ) printk("Huh, cpu architecture %x, expected 0xf (defined by cpuid)\n", c->midr.architecture); printk("Processor: %08"PRIx32": \"%s\", variant: 0x%x, part 0x%03x, rev 0x%x\n", c->midr.bits, implementer, c->midr.variant, c->midr.part_number, c->midr.revision); #if defined(CONFIG_ARM_64) printk("64-bit Execution:\n"); printk(" Processor Features: %016"PRIx64" %016"PRIx64"\n", boot_cpu_data.pfr64.bits[0], boot_cpu_data.pfr64.bits[1]); printk(" Exception Levels: EL3:%s EL2:%s EL1:%s EL0:%s\n", cpu_has_el3_32 ? "64+32" : cpu_has_el3_64 ? "64" : "No", cpu_has_el2_32 ? "64+32" : cpu_has_el2_64 ? "64" : "No", cpu_has_el1_32 ? "64+32" : cpu_has_el1_64 ? "64" : "No", cpu_has_el0_32 ? "64+32" : cpu_has_el0_64 ? "64" : "No"); printk(" Extensions:%s%s\n", cpu_has_fp ? " FloatingPoint" : "", cpu_has_simd ? 
" AdvancedSIMD" : ""); printk(" Debug Features: %016"PRIx64" %016"PRIx64"\n", boot_cpu_data.dbg64.bits[0], boot_cpu_data.dbg64.bits[1]); printk(" Auxiliary Features: %016"PRIx64" %016"PRIx64"\n", boot_cpu_data.aux64.bits[0], boot_cpu_data.aux64.bits[1]); printk(" Memory Model Features: %016"PRIx64" %016"PRIx64"\n", boot_cpu_data.mm64.bits[0], boot_cpu_data.mm64.bits[1]); printk(" ISA Features: %016"PRIx64" %016"PRIx64"\n", boot_cpu_data.isa64.bits[0], boot_cpu_data.isa64.bits[1]); #endif /* * On AArch64 these refer to the capabilities when running in * AArch32 mode. */ if ( cpu_has_aarch32 ) { printk("32-bit Execution:\n"); printk(" Processor Features: %08"PRIx32":%08"PRIx32"\n", boot_cpu_data.pfr32.bits[0], boot_cpu_data.pfr32.bits[1]); printk(" Instruction Sets:%s%s%s%s%s\n", cpu_has_aarch32 ? " AArch32" : "", cpu_has_thumb ? " Thumb" : "", cpu_has_thumb2 ? " Thumb-2" : "", cpu_has_thumbee ? " ThumbEE" : "", cpu_has_jazelle ? " Jazelle" : ""); printk(" Extensions:%s%s\n", cpu_has_gentimer ? " GenericTimer" : "", cpu_has_security ? " Security" : ""); printk(" Debug Features: %08"PRIx32"\n", boot_cpu_data.dbg32.bits[0]); printk(" Auxiliary Features: %08"PRIx32"\n", boot_cpu_data.aux32.bits[0]); printk(" Memory Model Features: " "%08"PRIx32" %08"PRIx32" %08"PRIx32" %08"PRIx32"\n", boot_cpu_data.mm32.bits[0], boot_cpu_data.mm32.bits[1], boot_cpu_data.mm32.bits[2], boot_cpu_data.mm32.bits[3]); printk(" ISA Features: %08x %08x %08x %08x %08x %08x\n", boot_cpu_data.isa32.bits[0], boot_cpu_data.isa32.bits[1], boot_cpu_data.isa32.bits[2], boot_cpu_data.isa32.bits[3], boot_cpu_data.isa32.bits[4], boot_cpu_data.isa32.bits[5]); } else { printk("32-bit Execution: Unsupported\n"); } } static void dt_unreserved_regions(paddr_t s, paddr_t e, void (*cb)(paddr_t, paddr_t), int first) { int i, nr = fdt_num_mem_rsv(device_tree_flattened); for ( i = first; i < nr ; i++ ) { paddr_t r_s, r_e; if ( fdt_get_mem_rsv(device_tree_flattened, i, &r_s, &r_e ) < 0 ) /* If we can't read it, pretend it doesn't exist... */ continue; r_e += r_s; /* fdt_get_mem_rsc returns length */ if ( s < r_e && r_s < e ) { dt_unreserved_regions(r_e, e, cb, i+1); dt_unreserved_regions(s, r_s, cb, i+1); return; } } cb(s, e); } void __init discard_initial_modules(void) { struct dt_module_info *mi = &early_info.modules; int i; for ( i = MOD_DISCARD_FIRST; i <= mi->nr_mods; i++ ) { paddr_t s = mi->module[i].start; paddr_t e = s + PAGE_ALIGN(mi->module[i].size); dt_unreserved_regions(s, e, init_domheap_pages, 0); } mi->nr_mods = 0; remove_early_mappings(); } /* * Returns the end address of the highest region in the range s..e * with required size and alignment that does not conflict with the * modules from first_mod to nr_modules. * * For non-recursive callers first_mod should normally be 0 (all * modules and Xen itself) or 1 (all modules but not Xen). */ static paddr_t __init consider_modules(paddr_t s, paddr_t e, uint32_t size, paddr_t align, int first_mod) { const struct dt_module_info *mi = &early_info.modules; int i; int nr_rsvd; s = (s+align-1) & ~(align-1); e = e & ~(align-1); if ( s > e || e - s < size ) return 0; /* First check the boot modules */ for ( i = first_mod; i <= mi->nr_mods; i++ ) { paddr_t mod_s = mi->module[i].start; paddr_t mod_e = mod_s + mi->module[i].size; if ( s < mod_e && mod_s < e ) { mod_e = consider_modules(mod_e, e, size, align, i+1); if ( mod_e ) return mod_e; return consider_modules(s, mod_s, size, align, i+1); } } /* Now check any fdt reserved areas. 
*/ nr_rsvd = fdt_num_mem_rsv(device_tree_flattened); for ( ; i < mi->nr_mods + nr_rsvd; i++ ) { paddr_t mod_s, mod_e; if ( fdt_get_mem_rsv(device_tree_flattened, i - mi->nr_mods, &mod_s, &mod_e ) < 0 ) /* If we can't read it, pretend it doesn't exist... */ continue; /* fdt_get_mem_rsv returns length */ mod_e += mod_s; if ( s < mod_e && mod_s < e ) { mod_e = consider_modules(mod_e, e, size, align, i+1); if ( mod_e ) return mod_e; return consider_modules(s, mod_s, size, align, i+1); } } return e; } /* * Return the end of the non-module region starting at s. In other * words return s the start of the next modules after s. * * On input *end is the end of the region which should be considered * and it is updated to reflect the end of the module, clipped to the * end of the region if it would run over. */ static paddr_t __init next_module(paddr_t s, paddr_t *end) { struct dt_module_info *mi = &early_info.modules; paddr_t lowest = ~(paddr_t)0; int i; for ( i = 0; i <= mi->nr_mods; i++ ) { paddr_t mod_s = mi->module[i].start; paddr_t mod_e = mod_s + mi->module[i].size; if ( mod_s < s ) continue; if ( mod_s > lowest ) continue; if ( mod_s > *end ) continue; lowest = mod_s; *end = min(*end, mod_e); } return lowest; } /** * get_xen_paddr - get physical address to relocate Xen to * * Xen is relocated to as near to the top of RAM as possible and * aligned to a XEN_PADDR_ALIGN boundary. */ static paddr_t __init get_xen_paddr(void) { struct dt_mem_info *mi = &early_info.mem; paddr_t min_size; paddr_t paddr = 0, last_end; int i; min_size = (_end - _start + (XEN_PADDR_ALIGN-1)) & ~(XEN_PADDR_ALIGN-1); last_end = mi->bank[0].start; /* Find the highest bank with enough space. */ for ( i = 0; i < mi->nr_banks; i++ ) { const struct membank *bank = &mi->bank[i]; paddr_t s, e; /* We can only deal with contiguous memory at the moment */ if ( last_end != bank->start ) break; last_end = bank->start + bank->size; if ( bank->size >= min_size ) { e = consider_modules(bank->start, bank->start + bank->size, min_size, XEN_PADDR_ALIGN, 1); if ( !e ) continue; #ifdef CONFIG_ARM_32 /* Xen must be under 4GB */ if ( e > 0x100000000ULL ) e = 0x100000000ULL; if ( e < bank->start ) continue; #endif s = e - min_size; if ( s > paddr ) paddr = s; } } if ( !paddr ) early_panic("Not enough memory to relocate Xen"); early_printk("Placing Xen at 0x%"PRIpaddr"-0x%"PRIpaddr"\n", paddr, paddr + min_size); early_info.modules.module[MOD_XEN].start = paddr; early_info.modules.module[MOD_XEN].size = min_size; return paddr; } #ifdef CONFIG_ARM_32 static void __init setup_mm(unsigned long dtb_paddr, size_t dtb_size) { paddr_t ram_start, ram_end, ram_size; paddr_t contig_start, contig_end; paddr_t s, e; unsigned long ram_pages; unsigned long heap_pages, xenheap_pages, domheap_pages; unsigned long dtb_pages; unsigned long boot_mfn_start, boot_mfn_end; int i; void *fdt; if ( !early_info.mem.nr_banks ) early_panic("No memory bank"); /* * We are going to accumulate two regions here. * * The first is the bounds of the initial memory region which is * contiguous with the first bank. For simplicity the xenheap is * always allocated from this region. * * The second is the complete bounds of the regions containing RAM * (ie. from the lowest RAM address to the highest), which * includes any holes. * * We also track the number of actual RAM pages (i.e. not counting * the holes). 
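 *
 * In the loop below contig_start/contig_end track the first (contiguous)
 * region, ram_start/ram_end the overall RAM bounds including holes, and
 * ram_size (hence ram_pages) the amount of RAM actually present.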
*/ ram_size = early_info.mem.bank[0].size; contig_start = ram_start = early_info.mem.bank[0].start; contig_end = ram_end = ram_start + ram_size; for ( i = 1; i < early_info.mem.nr_banks; i++ ) { paddr_t bank_start = early_info.mem.bank[i].start; paddr_t bank_size = early_info.mem.bank[i].size; paddr_t bank_end = bank_start + bank_size; paddr_t new_ram_size = ram_size + bank_size; paddr_t new_ram_start = min(ram_start,bank_start); paddr_t new_ram_end = max(ram_end,bank_end); /* * If the new bank is contiguous with the initial contiguous * region then incorporate it into the contiguous region. * * Otherwise we allow non-contigious regions so long as at * least half of the total RAM region actually contains * RAM. We actually fudge this slightly and require that * adding the current bank does not cause us to violate this * restriction. * * This restriction ensures that the frametable (which is not * currently sparse) does not consume all available RAM. */ if ( bank_start == contig_end ) contig_end = bank_end; else if ( bank_end == contig_start ) contig_start = bank_start; else if ( 2 * new_ram_size < new_ram_end - new_ram_start ) /* Would create memory map which is too sparse, so stop here. */ break; ram_size = new_ram_size; ram_start = new_ram_start; ram_end = new_ram_end; } if ( i != early_info.mem.nr_banks ) { early_printk("WARNING: only using %d out of %d memory banks\n", i, early_info.mem.nr_banks); early_info.mem.nr_banks = i; } total_pages = ram_pages = ram_size >> PAGE_SHIFT; /* * Locate the xenheap using these constraints: * * - must be 32 MiB aligned * - must not include Xen itself or the boot modules * - must be at most 1/8 the total RAM in the system * - must be at least 128M * * We try to allocate the largest xenheap possible within these * constraints. */ heap_pages = ram_pages; xenheap_pages = (heap_pages/8 + 0x1fffUL) & ~0x1fffUL; xenheap_pages = max(xenheap_pages, 128UL<<(20-PAGE_SHIFT)); do { /* xenheap is always in the initial contiguous region */ e = consider_modules(contig_start, contig_end, pfn_to_paddr(xenheap_pages), 32<<20, 0); if ( e ) break; xenheap_pages >>= 1; } while ( xenheap_pages > 128<<(20-PAGE_SHIFT) ); if ( ! e ) early_panic("Not not enough space for xenheap"); domheap_pages = heap_pages - xenheap_pages; early_printk("Xen heap: %"PRIpaddr"-%"PRIpaddr" (%lu pages)\n", e - (pfn_to_paddr(xenheap_pages)), e, xenheap_pages); early_printk("Dom heap: %lu pages\n", domheap_pages); setup_xenheap_mappings((e >> PAGE_SHIFT) - xenheap_pages, xenheap_pages); /* * Need a single mapped page for populating bootmem_region_list * and enough mapped pages for copying the DTB. */ dtb_pages = (dtb_size + PAGE_SIZE-1) >> PAGE_SHIFT; boot_mfn_start = xenheap_mfn_end - dtb_pages - 1; boot_mfn_end = xenheap_mfn_end; init_boot_pages(pfn_to_paddr(boot_mfn_start), pfn_to_paddr(boot_mfn_end)); /* Copy the DTB. */ fdt = mfn_to_virt(alloc_boot_pages(dtb_pages, 1)); copy_from_paddr(fdt, dtb_paddr, dtb_size, BUFFERABLE); device_tree_flattened = fdt; /* Add non-xenheap memory */ for ( i = 0; i < early_info.mem.nr_banks; i++ ) { paddr_t bank_start = early_info.mem.bank[i].start; paddr_t bank_end = bank_start + early_info.mem.bank[i].size; s = bank_start; while ( s < bank_end ) { paddr_t n = bank_end; e = next_module(s, &n); if ( e == ~(paddr_t)0 ) { e = n = ram_end; } /* * Module in a RAM bank other than the one which we are * not dealing with here. 
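 * In other words, next_module() may have returned a module that lives in a
 * different (later) bank; in that case clip the free region at the end of
 * the bank currently being processed.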
*/ if ( e > bank_end ) e = bank_end; /* Avoid the xenheap */ if ( s < pfn_to_paddr(xenheap_mfn_start+xenheap_pages) && pfn_to_paddr(xenheap_mfn_start) < e ) { e = pfn_to_paddr(xenheap_mfn_start); n = pfn_to_paddr(xenheap_mfn_start+xenheap_pages); } dt_unreserved_regions(s, e, init_boot_pages, 0); s = n; } } /* Frame table covers all of RAM region, including holes */ setup_frametable_mappings(ram_start, ram_end); max_page = PFN_DOWN(ram_end); /* Add xenheap memory that was not already added to the boot allocator. */ init_xenheap_pages(pfn_to_paddr(xenheap_mfn_start), pfn_to_paddr(boot_mfn_start)); end_boot_allocator(); } #else /* CONFIG_ARM_64 */ static void __init setup_mm(unsigned long dtb_paddr, size_t dtb_size) { paddr_t ram_start = ~0; paddr_t ram_end = 0; paddr_t ram_size = 0; int bank; unsigned long dtb_pages; void *fdt; total_pages = 0; for ( bank = 0 ; bank < early_info.mem.nr_banks; bank++ ) { paddr_t bank_start = early_info.mem.bank[bank].start; paddr_t bank_size = early_info.mem.bank[bank].size; paddr_t bank_end = bank_start + bank_size; paddr_t s, e; paddr_t new_ram_size = ram_size + bank_size; paddr_t new_ram_start = min(ram_start,bank_start); paddr_t new_ram_end = max(ram_end,bank_end); /* * We allow non-contigious regions so long as at least half of * the total RAM region actually contains RAM. We actually * fudge this slightly and require that adding the current * bank does not cause us to violate this restriction. * * This restriction ensures that the frametable (which is not * currently sparse) does not consume all available RAM. */ if ( bank > 0 && 2 * new_ram_size < new_ram_end - new_ram_start ) /* Would create memory map which is too sparse, so stop here. */ break; ram_start = new_ram_start; ram_end = new_ram_end; ram_size = new_ram_size; setup_xenheap_mappings(bank_start>>PAGE_SHIFT, bank_size>>PAGE_SHIFT); s = bank_start; while ( s < bank_end ) { paddr_t n = bank_end; e = next_module(s, &n); if ( e == ~(paddr_t)0 ) { e = n = bank_end; } if ( e > bank_end ) e = bank_end; xenheap_mfn_end = e; dt_unreserved_regions(s, e, init_boot_pages, 0); s = n; } } if ( bank != early_info.mem.nr_banks ) { early_printk("WARNING: only using %d out of %d memory banks\n", bank, early_info.mem.nr_banks); early_info.mem.nr_banks = bank; } total_pages += ram_size >> PAGE_SHIFT; xenheap_virt_end = XENHEAP_VIRT_START + ram_end - ram_start; xenheap_mfn_start = ram_start >> PAGE_SHIFT; xenheap_mfn_end = ram_end >> PAGE_SHIFT; xenheap_max_mfn(xenheap_mfn_end); /* * Need enough mapped pages for copying the DTB. */ dtb_pages = (dtb_size + PAGE_SIZE-1) >> PAGE_SHIFT; /* Copy the DTB. */ fdt = mfn_to_virt(alloc_boot_pages(dtb_pages, 1)); copy_from_paddr(fdt, dtb_paddr, dtb_size, BUFFERABLE); device_tree_flattened = fdt; setup_frametable_mappings(ram_start, ram_end); max_page = PFN_DOWN(ram_end); end_boot_allocator(); } #endif size_t __read_mostly cacheline_bytes; /* Very early check of the CPU cache properties */ void __init setup_cache(void) { uint32_t ccsid; /* Read the cache size ID register for the level-0 data cache */ WRITE_SYSREG32(0, CSSELR_EL1); ccsid = READ_SYSREG32(CCSIDR_EL1); /* Low 3 bits are log2(cacheline size in words) - 2. 
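 * For example (illustrative value): a field value of 2 means lines of
 * 2^(2+2) = 16 words = 64 bytes, which is what 1U << (4 + 2) yields below.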
*/ cacheline_bytes = 1U << (4 + (ccsid & 0x7)); } /* C entry point for boot CPU */ void __init start_xen(unsigned long boot_phys_offset, unsigned long fdt_paddr, unsigned long cpuid) { size_t fdt_size; int cpus, i; const char *cmdline; setup_cache(); percpu_init_areas(); set_processor_id(0); /* needed early, for smp_processor_id() */ smp_clear_cpu_maps(); /* This is mapped by head.S */ device_tree_flattened = (void *)BOOT_FDT_VIRT_START + (fdt_paddr & ((1 << SECOND_SHIFT) - 1)); fdt_size = device_tree_early_init(device_tree_flattened, fdt_paddr); cmdline = device_tree_bootargs(device_tree_flattened); early_printk("Command line: %s\n", cmdline); cmdline_parse(cmdline); setup_pagetables(boot_phys_offset, get_xen_paddr()); setup_mm(fdt_paddr, fdt_size); vm_init(); dt_unflatten_host_device_tree(); dt_irq_xlate = gic_irq_xlate; dt_uart_init(); console_init_preirq(); system_state = SYS_STATE_boot; processor_id(); platform_init(); smp_init_cpus(); cpus = smp_get_max_cpus(); init_xen_time(); gic_init(); set_current((struct vcpu *)0xfffff000); /* debug sanity */ idle_vcpu[0] = current; init_traps(); setup_virt_paging(); p2m_vmid_allocator_init(); softirq_init(); tasklet_subsys_init(); init_IRQ(); gic_route_ppis(); gic_route_spis(); init_maintenance_interrupt(); init_timer_interrupt(); timer_init(); init_idle_domain(); rcu_init(); arch_init_memory(); local_irq_enable(); local_abort_enable(); smp_prepare_cpus(cpus); initialize_keytable(); console_init_postirq(); do_presmp_initcalls(); for_each_present_cpu ( i ) { if ( (num_online_cpus() < cpus) && !cpu_online(i) ) { int ret = cpu_up(i); if ( ret != 0 ) printk("Failed to bring up CPU %u (error %d)\n", i, ret); } } printk("Brought up %ld CPUs\n", (long)num_online_cpus()); /* TODO: smp_cpus_done(); */ do_initcalls(); /* Create initial domain 0. */ dom0 = domain_create(0, 0, 0); if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) ) panic("Error creating domain 0"); dom0->is_privileged = 1; dom0->target = NULL; if ( construct_dom0(dom0) != 0) panic("Could not set up DOM0 guest OS"); /* Scrub RAM that is still free and so may go to an unprivileged domain. */ scrub_heap_pages(); init_constructors(); console_endboot(); /* Hide UART from DOM0 if we're using it */ serial_endboot(); system_state = SYS_STATE_active; domain_unpause_by_systemcontroller(dom0); /* Switch on to the dynamically allocated stack for the idle vcpu * since the static one we're running on is about to be freed. */ memcpy(idle_vcpu[0]->arch.cpu_info, get_cpu_info(), sizeof(struct cpu_info)); switch_stack_and_jump(idle_vcpu[0]->arch.cpu_info, init_done); } void arch_get_xen_caps(xen_capabilities_info_t *info) { /* Interface name is always xen-3.0-* for Xen-3.x. */ int major = 3, minor = 0; char s[32]; (*info)[0] = '\0'; #ifdef CONFIG_ARM_64 snprintf(s, sizeof(s), "xen-%d.%d-aarch64 ", major, minor); safe_strcat(*info, s); #endif snprintf(s, sizeof(s), "xen-%d.%d-armv7l ", major, minor); safe_strcat(*info, s); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/kernel.c0000664000175000017500000002553212307313555014364 0ustar smbsmb/* * Kernel image loading. * * Copyright (C) 2011 Citrix Systems, Inc. 
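 *
 * (Added note) kernel_prepare() below tries, in order, a 64-bit zImage
 * (CONFIG_ARM_64 only), a 32-bit zImage and finally an ELF image; whichever
 * probe succeeds fills in struct kernel_info and selects the matching
 * loader for kernel_load().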
*/ #include #include #include #include #include #include #include #include #include #include #include "kernel.h" /* Store kernel in first 8M of flash */ #define KERNEL_FLASH_ADDRESS 0x00000000UL #define KERNEL_FLASH_SIZE 0x00800000UL #define ZIMAGE32_MAGIC_OFFSET 0x24 #define ZIMAGE32_START_OFFSET 0x28 #define ZIMAGE32_END_OFFSET 0x2c #define ZIMAGE32_HEADER_LEN 0x30 #define ZIMAGE32_MAGIC 0x016f2818 #define ZIMAGE64_MAGIC_V0 0x14000008 #define ZIMAGE64_MAGIC_V1 0x644d5241 /* "ARM\x64" */ struct minimal_dtb_header { uint32_t magic; uint32_t total_size; /* There are other fields but we don't use them yet. */ }; #define DTB_MAGIC 0xd00dfeed /** * copy_from_paddr - copy data from a physical address * @dst: destination virtual address * @paddr: source physical address * @len: length to copy */ void copy_from_paddr(void *dst, paddr_t paddr, unsigned long len, int attrindx) { void *src = (void *)FIXMAP_ADDR(FIXMAP_MISC); while (len) { paddr_t p; unsigned long l, s; p = paddr >> PAGE_SHIFT; s = paddr & (PAGE_SIZE-1); l = min(PAGE_SIZE - s, len); set_fixmap(FIXMAP_MISC, p, attrindx); memcpy(dst, src + s, l); clean_xen_dcache_va_range(dst, l); paddr += l; dst += l; len -= l; } clear_fixmap(FIXMAP_MISC); } static void place_modules(struct kernel_info *info, paddr_t kernel_start, paddr_t kernel_end) { /* Align DTB and initrd size to 2Mb. Linux only requires 4 byte alignment */ const paddr_t initrd_len = ROUNDUP(early_info.modules.module[MOD_INITRD].size, MB(2)); const paddr_t dtb_len = ROUNDUP(fdt_totalsize(info->fdt), MB(2)); const paddr_t total = initrd_len + dtb_len; /* Convenient */ const paddr_t mem_start = info->mem.bank[0].start; const paddr_t mem_size = info->mem.bank[0].size; const paddr_t mem_end = mem_start + mem_size; const paddr_t kernel_size = kernel_end - kernel_start; paddr_t addr; if ( total + kernel_size > mem_size ) panic("Not enough memory in the first bank for the dtb+initrd"); /* * DTB must be loaded such that it does not conflict with the * kernel decompressor. For 32-bit Linux Documentation/arm/Booting * recommends just after the 128MB boundary while for 64-bit Linux * the recommendation in Documentation/arm64/booting.txt is below * 512MB. * * If the bootloader provides an initrd, it will be loaded just * after the DTB. * * We try to place dtb+initrd at 128MB, (or, if we have less RAM, * as high as possible). If there is no space then fallback to * just after the kernel, if there is room, otherwise just before. 
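 *
 * For example (illustrative numbers): with a single 512MB bank at
 * 0x80000000 and a kernel ending at 0x80400000, the dtb+initrd are placed
 * at 0x88000000 (128MB into the bank) as long as they fit below the bank
 * end.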
*/ if ( kernel_end < MIN(mem_start + MB(128), mem_end - total) ) addr = MIN(mem_start + MB(128), mem_end - total); else if ( mem_end - ROUNDUP(kernel_end, MB(2)) >= total ) addr = ROUNDUP(kernel_end, MB(2)); else if ( kernel_start - mem_start >= total ) addr = kernel_start - total; else { panic("Unable to find suitable location for dtb+initrd"); return; } info->dtb_paddr = addr; info->initrd_paddr = info->dtb_paddr + dtb_len; } static void kernel_zimage_load(struct kernel_info *info) { paddr_t load_addr = info->zimage.load_addr; paddr_t paddr = info->zimage.kernel_addr; paddr_t attr = info->load_attr; paddr_t len = info->zimage.len; unsigned long offs; place_modules(info, load_addr, load_addr + len); printk("Loading zImage from %"PRIpaddr" to %"PRIpaddr"-%"PRIpaddr"\n", paddr, load_addr, load_addr + len); for ( offs = 0; offs < len; ) { int rc; paddr_t s, l, ma; void *dst; s = offs & ~PAGE_MASK; l = min(PAGE_SIZE - s, len); rc = gvirt_to_maddr(load_addr + offs, &ma); if ( rc ) { panic("Unable to map translate guest address"); return; } dst = map_domain_page(ma>>PAGE_SHIFT); copy_from_paddr(dst + s, paddr + offs, l, attr); unmap_domain_page(dst); offs += l; } } #ifdef CONFIG_ARM_64 /* * Check if the image is a 64-bit zImage and setup kernel_info */ static int kernel_try_zimage64_prepare(struct kernel_info *info, paddr_t addr, paddr_t size) { /* linux/Documentation/arm64/booting.txt */ struct { uint32_t magic0; uint32_t res0; uint64_t text_offset; /* Image load offset */ uint64_t res1; uint64_t res2; /* zImage V1 only from here */ uint64_t res3; uint64_t res4; uint64_t res5; uint32_t magic1; uint32_t res6; } zimage; uint64_t start, end; if ( size < sizeof(zimage) ) return -EINVAL; copy_from_paddr(&zimage, addr, sizeof(zimage), DEV_SHARED); if ( zimage.magic0 != ZIMAGE64_MAGIC_V0 && zimage.magic1 != ZIMAGE64_MAGIC_V1 ) return -EINVAL; /* Currently there is no length in the header, so just use the size */ start = 0; end = size; /* * Given the above this check is a bit pointless, but leave it * here in case someone adds a length field in the future. */ if ( (end - start) > size ) return -EINVAL; info->zimage.kernel_addr = addr; info->zimage.load_addr = info->mem.bank[0].start + zimage.text_offset; info->zimage.len = end - start; info->entry = info->zimage.load_addr; info->load = kernel_zimage_load; info->type = DOMAIN_PV64; return 0; } #endif /* * Check if the image is a 32-bit zImage and setup kernel_info */ static int kernel_try_zimage32_prepare(struct kernel_info *info, paddr_t addr, paddr_t size) { uint32_t zimage[ZIMAGE32_HEADER_LEN/4]; uint32_t start, end; struct minimal_dtb_header dtb_hdr; if ( size < ZIMAGE32_HEADER_LEN ) return -EINVAL; copy_from_paddr(zimage, addr, sizeof(zimage), DEV_SHARED); if (zimage[ZIMAGE32_MAGIC_OFFSET/4] != ZIMAGE32_MAGIC) return -EINVAL; start = zimage[ZIMAGE32_START_OFFSET/4]; end = zimage[ZIMAGE32_END_OFFSET/4]; if ( (end - start) > size ) return -EINVAL; /* * Check for an appended DTB. */ if ( addr + end - start + sizeof(dtb_hdr) <= size ) { copy_from_paddr(&dtb_hdr, addr + end - start, sizeof(dtb_hdr), DEV_SHARED); if (be32_to_cpu(dtb_hdr.magic) == DTB_MAGIC) { end += be32_to_cpu(dtb_hdr.total_size); if ( end > addr + size ) return -EINVAL; } } info->zimage.kernel_addr = addr; /* * If start is zero, the zImage is position independent, in this * case Documentation/arm/Booting recommends loading below 128MiB * and above 32MiB. Load it as high as possible within these * constraints, while also avoiding the DTB. 
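 *
 * For example (illustrative numbers): for a first bank at 0x80000000 of at
 * least 128MB, load_end is capped at 0x88000000, so an image whose "end"
 * offset is 8MB is loaded at 0x87800000 (already 2MB aligned).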
*/ if (start == 0) { paddr_t load_end; load_end = info->mem.bank[0].start + info->mem.bank[0].size; load_end = MIN(info->mem.bank[0].start + MB(128), load_end); info->zimage.load_addr = load_end - end; /* Align to 2MB */ info->zimage.load_addr &= ~((2 << 20) - 1); } else info->zimage.load_addr = start; info->zimage.len = end - start; info->entry = info->zimage.load_addr; info->load = kernel_zimage_load; #ifdef CONFIG_ARM_64 info->type = DOMAIN_PV32; #endif return 0; } static void kernel_elf_load(struct kernel_info *info) { place_modules(info, info->elf.parms.virt_kstart, info->elf.parms.virt_kend); printk("Loading ELF image into guest memory\n"); info->elf.elf.dest_base = (void*)(unsigned long)info->elf.parms.virt_kstart; info->elf.elf.dest_size = info->elf.parms.virt_kend - info->elf.parms.virt_kstart; elf_load_binary(&info->elf.elf); printk("Free temporary kernel buffer\n"); free_xenheap_pages(info->kernel_img, info->kernel_order); } static int kernel_try_elf_prepare(struct kernel_info *info, paddr_t addr, paddr_t size) { int rc; memset(&info->elf.elf, 0, sizeof(info->elf.elf)); info->kernel_order = get_order_from_bytes(size); info->kernel_img = alloc_xenheap_pages(info->kernel_order, 0); if ( info->kernel_img == NULL ) panic("Cannot allocate temporary buffer for kernel"); copy_from_paddr(info->kernel_img, addr, size, info->load_attr); if ( (rc = elf_init(&info->elf.elf, info->kernel_img, size )) != 0 ) goto err; #ifdef VERBOSE elf_set_verbose(&info->elf.elf); #endif elf_parse_binary(&info->elf.elf); if ( (rc = elf_xen_parse(&info->elf.elf, &info->elf.parms)) != 0 ) goto err; #ifdef CONFIG_ARM_64 if ( elf_32bit(&info->elf.elf) ) info->type = DOMAIN_PV32; else if ( elf_64bit(&info->elf.elf) ) info->type = DOMAIN_PV64; else { printk("Unknown ELF class\n"); rc = -EINVAL; goto err; } #endif /* * TODO: can the ELF header be used to find the physical address * to load the image to? Instead of assuming virt == phys. 
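 *
 * (Illustrative sketch only, not part of the original code: the
 * standard ELF program headers do carry a physical load address, so
 * one conceivable approach would be to walk the PT_LOAD entries and
 * honour their p_paddr/p_memsz fields, roughly:
 *
 *     for each program header ph with ph.p_type == PT_LOAD:
 *         copy ph.p_filesz bytes to ph.p_paddr and zero the
 *         remainder up to ph.p_memsz
 *
 * Whether the libelf helpers used above expose this conveniently is
 * left as an open question.)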
*/ info->entry = info->elf.parms.virt_entry; info->load = kernel_elf_load; if ( elf_check_broken(&info->elf.elf) ) printk("Xen: warning: ELF kernel broken: %s\n", elf_check_broken(&info->elf.elf)); return 0; err: if ( elf_check_broken(&info->elf.elf) ) printk("Xen: ELF kernel broken: %s\n", elf_check_broken(&info->elf.elf)); free_xenheap_pages(info->kernel_img, info->kernel_order); return rc; } int kernel_prepare(struct kernel_info *info) { int rc; paddr_t start, size; if ( early_info.modules.nr_mods < MOD_KERNEL ) { printk("No boot modules found, trying flash\n"); start = KERNEL_FLASH_ADDRESS; size = KERNEL_FLASH_SIZE; info->load_attr = DEV_SHARED; } else { printk("Loading kernel from boot module %d\n", MOD_KERNEL); start = early_info.modules.module[MOD_KERNEL].start; size = early_info.modules.module[MOD_KERNEL].size; info->load_attr = BUFFERABLE; } #ifdef CONFIG_ARM_64 rc = kernel_try_zimage64_prepare(info, start, size); if (rc < 0) #endif rc = kernel_try_zimage32_prepare(info, start, size); if (rc < 0) rc = kernel_try_elf_prepare(info, start, size); return rc; } void kernel_load(struct kernel_info *info) { info->load(info); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/hvm.c0000664000175000017500000000245112307313555013671 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg) { long rc = 0; switch ( op ) { case HVMOP_set_param: case HVMOP_get_param: { struct xen_hvm_param a; struct domain *d; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; if ( a.index >= HVM_NR_PARAMS ) return -EINVAL; d = rcu_lock_domain_by_any_id(a.domid); if ( d == NULL ) return -ESRCH; rc = xsm_hvm_param(XSM_TARGET, d, op); if ( rc ) goto param_fail; if ( op == HVMOP_set_param ) { d->arch.hvm_domain.params[a.index] = a.value; } else { a.value = d->arch.hvm_domain.params[a.index]; rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0; } param_fail: rcu_unlock_domain(d); break; } default: { printk("%s: Bad HVM op %ld.\n", __func__, op); rc = -ENOSYS; break; } } return rc; } xen-4.4.0/xen/arch/arm/decode.c0000664000175000017500000001010612307313555014316 0ustar smbsmb/* * xen/arch/arm/decode.c * * Instruction decoder * * Julien Grall * Copyright (C) 2013 Linaro Limited. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #include #include #include #include #include #include "decode.h" static void update_dabt(struct hsr_dabt *dabt, int reg, uint8_t size, bool_t sign) { dabt->reg = reg; dabt->size = size; dabt->sign = sign; } static int decode_thumb2(register_t pc, struct hsr_dabt *dabt, uint16_t hw1) { uint16_t hw2; uint16_t rt; if ( raw_copy_from_guest(&hw2, (void *__user)(pc + 2), sizeof (hw2)) ) return -EFAULT; rt = (hw2 >> 12) & 0x7; switch ( (hw1 >> 9) & 0xf ) { case 12: { bool_t sign = !!(hw1 & (1 << 8)); bool_t load = !!(hw1 & (1 << 4)); if ( (hw1 & 0x0110) == 0x0100 ) /* NEON instruction */ goto bad_thumb2; if ( (hw1 & 0x0070) == 0x0070 ) /* Undefined opcodes */ goto bad_thumb2; /* Store/Load single data item */ if ( rt == 15 ) /* XXX: Rt == 15 is only invalid for store instruction */ goto bad_thumb2; if ( !load && sign ) /* Store instruction doesn't support sign extension */ goto bad_thumb2; update_dabt(dabt, rt, (hw1 >> 5) & 3, sign); break; } default: goto bad_thumb2; } return 0; bad_thumb2: gdprintk(XENLOG_ERR, "unhandled THUMB2 instruction 0x%x%x\n", hw1, hw2); return 1; } static int decode_thumb(register_t pc, struct hsr_dabt *dabt) { uint16_t instr; if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) ) return -EFAULT; switch ( instr >> 12 ) { case 5: { /* Load/Store register */ uint16_t opB = (instr >> 9) & 0x7; int reg = instr & 7; switch ( opB & 0x3 ) { case 0: /* Non-signed word */ update_dabt(dabt, reg, 2, 0); break; case 1: /* Non-signed halfword */ update_dabt(dabt, reg, 1, 0); break; case 2: /* Non-signed byte */ update_dabt(dabt, reg, 0, 0); break; case 3: /* Signed byte */ update_dabt(dabt, reg, 0, 1); break; } break; } case 6: /* Load/Store word immediate offset */ update_dabt(dabt, instr & 7, 2, 0); break; case 7: /* Load/Store byte immediate offset */ update_dabt(dabt, instr & 7, 0, 0); break; case 8: /* Load/Store halfword immediate offset */ update_dabt(dabt, instr & 7, 1, 0); break; case 9: /* Load/Store word sp offset */ update_dabt(dabt, (instr >> 8) & 7, 2, 0); break; case 14: if ( instr & (1 << 11) ) return decode_thumb2(pc, dabt, instr); goto bad_thumb; case 15: return decode_thumb2(pc, dabt, instr); default: goto bad_thumb; } return 0; bad_thumb: gdprintk(XENLOG_ERR, "unhandled THUMB instruction 0x%x\n", instr); return 1; } int decode_instruction(const struct cpu_user_regs *regs, struct hsr_dabt *dabt) { if ( is_pv32_domain(current->domain) && regs->cpsr & PSR_THUMB ) return decode_thumb(regs->pc, dabt); /* TODO: Handle ARM instruction */ gdprintk(XENLOG_ERR, "unhandled ARM instruction\n"); return 1; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/domain_build.c0000664000175000017500000007246712307313555015543 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "kernel.h" static unsigned int __initdata opt_dom0_max_vcpus; integer_param("dom0_max_vcpus", opt_dom0_max_vcpus); int dom0_11_mapping = 1; #define DOM0_MEM_DEFAULT 0x8000000 /* 128 MiB */ static u64 __initdata dom0_mem = DOM0_MEM_DEFAULT; static void __init parse_dom0_mem(const char *s) { dom0_mem = parse_size_and_unit(s, &s); if ( dom0_mem == 0 ) dom0_mem = DOM0_MEM_DEFAULT; } custom_param("dom0_mem", parse_dom0_mem); //#define DEBUG_DT #ifdef DEBUG_DT # define DPRINT(fmt, args...) printk(XENLOG_DEBUG fmt, ##args) #else # define DPRINT(fmt, args...) 
do {} while ( 0 ) #endif /* * Amount of extra space required to dom0's device tree. No new nodes * are added (yet) but one terminating reserve map entry (16 bytes) is * added. */ #define DOM0_FDT_EXTRA_SIZE (128 + sizeof(struct fdt_reserve_entry)) struct vcpu *__init alloc_dom0_vcpu0(void) { if ( opt_dom0_max_vcpus == 0 ) opt_dom0_max_vcpus = num_online_cpus(); if ( opt_dom0_max_vcpus > MAX_VIRT_CPUS ) opt_dom0_max_vcpus = MAX_VIRT_CPUS; dom0->vcpu = xzalloc_array(struct vcpu *, opt_dom0_max_vcpus); if ( !dom0->vcpu ) return NULL; dom0->max_vcpus = opt_dom0_max_vcpus; return alloc_vcpu(dom0, 0, 0); } static void allocate_memory_11(struct domain *d, struct kernel_info *kinfo) { paddr_t start; paddr_t size; struct page_info *pg = NULL; unsigned int order = get_order_from_bytes(dom0_mem); int res; paddr_t spfn; unsigned int bits; for ( bits = PAGE_SHIFT + 1; bits < PADDR_BITS; bits++ ) { pg = alloc_domheap_pages(d, order, MEMF_bits(bits)); if ( pg != NULL ) break; } if ( !pg ) panic("Failed to allocate contiguous memory for dom0"); spfn = page_to_mfn(pg); start = pfn_to_paddr(spfn); size = pfn_to_paddr((1 << order)); // 1:1 mapping printk("Populate P2M %#"PRIx64"->%#"PRIx64" (1:1 mapping for dom0)\n", start, start + size); res = guest_physmap_add_page(d, spfn, spfn, order); if ( res ) panic("Unable to add pages in DOM0: %d", res); kinfo->mem.bank[0].start = start; kinfo->mem.bank[0].size = size; kinfo->mem.nr_banks = 1; kinfo->unassigned_mem -= size; } static void allocate_memory(struct domain *d, struct kernel_info *kinfo) { struct dt_device_node *memory = NULL; const void *reg; u32 reg_len, reg_size; unsigned int bank = 0; if ( dom0_11_mapping ) return allocate_memory_11(d, kinfo); while ( (memory = dt_find_node_by_type(memory, "memory")) ) { int l; DPRINT("memory node\n"); reg_size = dt_cells_to_size(dt_n_addr_cells(memory) + dt_n_size_cells(memory)); reg = dt_get_property(memory, "reg", ®_len); if ( reg == NULL ) panic("Memory node has no reg property"); for ( l = 0; kinfo->unassigned_mem > 0 && l + reg_size <= reg_len && kinfo->mem.nr_banks < NR_MEM_BANKS; l += reg_size ) { paddr_t start, size; if ( dt_device_get_address(memory, bank, &start, &size) ) panic("Unable to retrieve the bank %u for %s", bank, dt_node_full_name(memory)); if ( size > kinfo->unassigned_mem ) size = kinfo->unassigned_mem; printk("Populate P2M %#"PRIx64"->%#"PRIx64"\n", start, start + size); if ( p2m_populate_ram(d, start, start + size) < 0 ) panic("Failed to populate P2M"); kinfo->mem.bank[kinfo->mem.nr_banks].start = start; kinfo->mem.bank[kinfo->mem.nr_banks].size = size; kinfo->mem.nr_banks++; kinfo->unassigned_mem -= size; } } } static int write_properties(struct domain *d, struct kernel_info *kinfo, const struct dt_device_node *node) { const char *bootargs = NULL; const struct dt_property *prop; int res = 0; int had_dom0_bootargs = 0; if ( early_info.modules.nr_mods >= MOD_KERNEL && early_info.modules.module[MOD_KERNEL].cmdline[0] ) bootargs = &early_info.modules.module[MOD_KERNEL].cmdline[0]; dt_for_each_property_node (node, prop) { const void *prop_data = prop->value; void *new_data = NULL; u32 prop_len = prop->length; /* * In chosen node: * * * remember xen,dom0-bootargs if we don't already have * bootargs (from module #1, above). * * remove bootargs, xen,dom0-bootargs, xen,xen-bootargs, * linux,initrd-start and linux,initrd-end. 
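 *
 * For illustration only (all values hypothetical, and assuming no
 * command line came with the kernel module while an initrd module
 * was supplied), a host /chosen node such as
 *
 *     chosen {
 *         xen,xen-bootargs = "console=dtuart dtuart=serial0";
 *         xen,dom0-bootargs = "console=hvc0 root=/dev/mmcblk0p2";
 *         linux,initrd-start = <0x84000000>;
 *         linux,initrd-end = <0x84800000>;
 *     };
 *
 * ends up in the dom0 tree as
 *
 *     chosen {
 *         bootargs = "console=hvc0 root=/dev/mmcblk0p2";
 *         linux,initrd-start = <placeholder, fixed up at load time>;
 *         linux,initrd-end = <placeholder, fixed up at load time>;
 *     };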
*/ if ( dt_node_path_is_equal(node, "/chosen") ) { if ( dt_property_name_is_equal(prop, "xen,xen-bootargs") || dt_property_name_is_equal(prop, "linux,initrd-start") || dt_property_name_is_equal(prop, "linux,initrd-end") ) continue; if ( dt_property_name_is_equal(prop, "xen,dom0-bootargs") ) { had_dom0_bootargs = 1; bootargs = prop->value; continue; } if ( dt_property_name_is_equal(prop, "bootargs") ) { if ( !bootargs && !had_dom0_bootargs ) bootargs = prop->value; continue; } } res = fdt_property(kinfo->fdt, prop->name, prop_data, prop_len); xfree(new_data); if ( res ) return res; } if ( dt_node_path_is_equal(node, "/chosen") ) { if ( bootargs ) { res = fdt_property(kinfo->fdt, "bootargs", bootargs, strlen(bootargs) + 1); if ( res ) return res; } /* * If the bootloader provides an initrd, we must create a placeholder * for the initrd properties. The values will be replaced later. */ if ( early_info.modules.module[MOD_INITRD].size ) { u64 a = 0; res = fdt_property(kinfo->fdt, "linux,initrd-start", &a, sizeof(a)); if ( res ) return res; res = fdt_property(kinfo->fdt, "linux,initrd-end", &a, sizeof(a)); if ( res ) return res; } } return 0; } /* * Helper to write an interrupts with the GIC format * This code is assuming the irq is an PPI. */ typedef __be32 gic_interrupt_t[3]; static void set_interrupt_ppi(gic_interrupt_t interrupt, unsigned int irq, unsigned int cpumask, unsigned int level) { __be32 *cells = interrupt; BUG_ON(irq < 16 && irq >= 32); /* See linux Documentation/devictree/bindings/arm/gic.txt */ dt_set_cell(&cells, 1, 1); /* is a PPI */ dt_set_cell(&cells, 1, irq - 16); /* PPIs start at 16 */ dt_set_cell(&cells, 1, (cpumask << 8) | level); } /* * Helper to set interrupts for a node in the flat device tree. * It needs 2 property: * "interrupts": contains the list of interrupts * "interrupt-parent": link to the GIC */ static int fdt_property_interrupts(void *fdt, gic_interrupt_t *intr, unsigned num_irq) { int res; res = fdt_property(fdt, "interrupts", intr, sizeof (intr[0]) * num_irq); if ( res ) return res; res = fdt_property_cell(fdt, "interrupt-parent", dt_interrupt_controller->phandle); return res; } static int make_memory_node(const struct domain *d, void *fdt, const struct dt_device_node *parent, const struct kernel_info *kinfo) { int res, i; int reg_size = dt_n_addr_cells(parent) + dt_n_size_cells(parent); int nr_cells = reg_size*kinfo->mem.nr_banks; __be32 reg[nr_cells]; __be32 *cells; DPRINT("Create memory node (reg size %d, nr cells %d)\n", reg_size, nr_cells); /* ePAPR 3.4 */ res = fdt_begin_node(fdt, "memory"); if ( res ) return res; res = fdt_property_string(fdt, "device_type", "memory"); if ( res ) return res; cells = ®[0]; for ( i = 0 ; i < kinfo->mem.nr_banks; i++ ) { u64 start = kinfo->mem.bank[i].start; u64 size = kinfo->mem.bank[i].size; DPRINT(" Bank %d: %#"PRIx64"->%#"PRIx64"\n", i, start, start + size); dt_set_range(&cells, parent, start, size); } res = fdt_property(fdt, "reg", reg, sizeof(reg)); if ( res ) return res; res = fdt_end_node(fdt); return res; } static int make_hypervisor_node(struct domain *d, void *fdt, const struct dt_device_node *parent) { const char compat[] = "xen,xen-"__stringify(XEN_VERSION)"."__stringify(XEN_SUBVERSION)"\0" "xen,xen"; __be32 reg[4]; gic_interrupt_t intr; __be32 *cells; int res; int addrcells = dt_n_addr_cells(parent); int sizecells = dt_n_size_cells(parent); paddr_t gnttab_start, gnttab_size; DPRINT("Create hypervisor node\n"); /* * Sanity-check address sizes, since addresses and sizes which do * not take up exactly 4 or 8 
bytes are not supported. */ if ((addrcells != 1 && addrcells != 2) || (sizecells != 1 && sizecells != 2)) panic("Cannot cope with this size"); /* See linux Documentation/devicetree/bindings/arm/xen.txt */ res = fdt_begin_node(fdt, "hypervisor"); if ( res ) return res; /* Cannot use fdt_property_string due to embedded nulls */ res = fdt_property(fdt, "compatible", compat, sizeof(compat)); if ( res ) return res; platform_dom0_gnttab(&gnttab_start, &gnttab_size); DPRINT(" Grant table range: %#"PRIpaddr"-%#"PRIpaddr"\n", gnttab_start, gnttab_start + gnttab_size); /* reg 0 is grant table space */ cells = ®[0]; dt_set_range(&cells, parent, gnttab_start, gnttab_size); res = fdt_property(fdt, "reg", reg, dt_cells_to_size(addrcells + sizecells)); if ( res ) return res; /* * interrupts is evtchn upcall: * - Active-low level-sensitive * - All cpus * * TODO: Handle correctly the cpumask */ DPRINT(" Event channel interrupt to %u\n", d->arch.evtchn_irq); set_interrupt_ppi(intr, d->arch.evtchn_irq, 0xf, DT_IRQ_TYPE_LEVEL_LOW); res = fdt_property_interrupts(fdt, &intr, 1); if ( res ) return res; res = fdt_end_node(fdt); return res; } static int make_psci_node(void *fdt, const struct dt_device_node *parent) { int res; DPRINT("Create PSCI node\n"); /* See linux Documentation/devicetree/bindings/arm/psci.txt */ res = fdt_begin_node(fdt, "psci"); if ( res ) return res; res = fdt_property_string(fdt, "compatible", "arm,psci"); if ( res ) return res; res = fdt_property_string(fdt, "method", "hvc"); if ( res ) return res; res = fdt_property_cell(fdt, "cpu_off", PSCI_cpu_off); if ( res ) return res; res = fdt_property_cell(fdt, "cpu_on", PSCI_cpu_on); if ( res ) return res; res = fdt_end_node(fdt); return res; } static int make_cpus_node(const struct domain *d, void *fdt, const struct dt_device_node *parent) { int res; const struct dt_device_node *cpus = dt_find_node_by_path("/cpus"); const struct dt_device_node *npcpu; unsigned int cpu; const void *compatible = NULL; u32 len; /* Placeholder for cpu@ + a 32-bit number + \0 */ char buf[15]; u32 clock_frequency; bool_t clock_valid; DPRINT("Create cpus node\n"); if ( !cpus ) { dprintk(XENLOG_ERR, "Missing /cpus node in the device tree?\n"); return -ENOENT; } /* * Get the compatible property of CPUs from the device tree. * We are assuming that all CPUs are the same so we are just look * for the first one. 
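 *
 * For illustration (hypothetical values): with two dom0 VCPUs and a
 * host CPU node compatible with "arm,cortex-a15", the generated
 * node is roughly
 *
 *     cpus {
 *         #address-cells = <1>;
 *         #size-cells = <0>;
 *         cpu@0 { device_type = "cpu"; compatible = "arm,cortex-a15";
 *                 reg = <0>; };
 *         cpu@1 { device_type = "cpu"; compatible = "arm,cortex-a15";
 *                 reg = <1>; };
 *     };
 *
 * plus clock-frequency and enable-method = "psci" where applicable.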
* TODO: Handle compatible per VCPU */ dt_for_each_child_node(cpus, npcpu) { if ( dt_device_type_is_equal(npcpu, "cpu") ) { compatible = dt_get_property(npcpu, "compatible", &len); clock_valid = dt_property_read_u32(npcpu, "clock-frequency", &clock_frequency); break; } } if ( !compatible ) { dprintk(XENLOG_ERR, "Can't find cpu in the device tree?\n"); return -ENOENT; } /* See Linux Documentation/devicetree/booting-without-of.txt * section III.5.b */ res = fdt_begin_node(fdt, "cpus"); if ( res ) return res; res = fdt_property_cell(fdt, "#address-cells", 1); if ( res ) return res; res = fdt_property_cell(fdt, "#size-cells", 0); if ( res ) return res; for ( cpu = 0; cpu < d->max_vcpus; cpu++ ) { DPRINT("Create cpu@%u node\n", cpu); snprintf(buf, sizeof(buf), "cpu@%u", cpu); res = fdt_begin_node(fdt, buf); if ( res ) return res; res = fdt_property(fdt, "compatible", compatible, len); if ( res ) return res; res = fdt_property_string(fdt, "device_type", "cpu"); if ( res ) return res; res = fdt_property_cell(fdt, "reg", cpu); if ( res ) return res; if (clock_valid) { res = fdt_property_cell(fdt, "clock-frequency", clock_frequency); if ( res ) return res; } if ( is_pv64_domain(d) ) { res = fdt_property_string(fdt, "enable-method", "psci"); if ( res ) return res; } res = fdt_end_node(fdt); if ( res ) return res; } res = fdt_end_node(fdt); return res; } static int make_gic_node(const struct domain *d, void *fdt, const struct dt_device_node *node) { const struct dt_device_node *gic = dt_interrupt_controller; const void *compatible = NULL; u32 len; __be32 *new_cells, *tmp; int res = 0; /* * Xen currently supports only a single GIC. Discard any secondary * GIC entries. */ if ( node != dt_interrupt_controller ) { DPRINT(" Skipping (secondary GIC)\n"); return 0; } DPRINT("Create gic node\n"); compatible = dt_get_property(gic, "compatible", &len); if ( !compatible ) { dprintk(XENLOG_ERR, "Can't find compatible property for the gic node\n"); return -FDT_ERR_XEN(ENOENT); } res = fdt_begin_node(fdt, "interrupt-controller"); if ( res ) return res; res = fdt_property(fdt, "compatible", compatible, len); if ( res ) return res; res = fdt_property_cell(fdt, "#interrupt-cells", 3); if ( res ) return res; res = fdt_property(fdt, "interrupt-controller", NULL, 0); if ( res ) return res; len = dt_cells_to_size(dt_n_addr_cells(node) + dt_n_size_cells(node)); len *= 2; /* GIC has two memory regions: Distributor + CPU interface */ new_cells = xzalloc_bytes(len); if ( new_cells == NULL ) return -FDT_ERR_XEN(ENOMEM); tmp = new_cells; DPRINT(" Set Distributor Base 0x%"PRIpaddr"-0x%"PRIpaddr"\n", d->arch.vgic.dbase, d->arch.vgic.dbase + PAGE_SIZE - 1); dt_set_range(&tmp, node, d->arch.vgic.dbase, PAGE_SIZE); DPRINT(" Set Cpu Base 0x%"PRIpaddr"-0x%"PRIpaddr"\n", d->arch.vgic.cbase, d->arch.vgic.cbase + (PAGE_SIZE * 2) - 1); dt_set_range(&tmp, node, d->arch.vgic.cbase, PAGE_SIZE * 2); res = fdt_property(fdt, "reg", new_cells, len); xfree(new_cells); if ( res ) return res; /* * The value of the property "phandle" in the property "interrupts" * to know on which interrupt controller the interrupt is wired. 
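 *
 * For illustration (hypothetical value): if the host GIC carries
 * phandle = <0x1>, the same value is copied into the node created
 * here, so that interrupt-parent = <0x1> references written
 * elsewhere in the dom0 tree keep resolving to this
 * interrupt-controller node.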
*/ if ( gic->phandle ) { DPRINT(" Set phandle = 0x%x\n", gic->phandle); res = fdt_property_cell(fdt, "phandle", gic->phandle); if ( res ) return res; } res = fdt_end_node(fdt); return res; } static int make_timer_node(const struct domain *d, void *fdt, const struct dt_device_node *node) { static const struct dt_device_match timer_ids[] __initconst = { DT_MATCH_COMPATIBLE("arm,armv7-timer"), DT_MATCH_COMPATIBLE("arm,armv8-timer"), { /* sentinel */ }, }; struct dt_device_node *dev; u32 len; const void *compatible; int res; const struct dt_irq *irq; gic_interrupt_t intrs[3]; DPRINT("Create timer node\n"); dev = dt_find_matching_node(NULL, timer_ids); if ( !dev ) { dprintk(XENLOG_ERR, "Missing timer node in the device tree?\n"); return -FDT_ERR_XEN(ENOENT); } compatible = dt_get_property(dev, "compatible", &len); if ( !compatible ) { dprintk(XENLOG_ERR, "Can't find compatible property for timer node\n"); return -FDT_ERR_XEN(ENOENT); } res = fdt_begin_node(fdt, "timer"); if ( res ) return res; res = fdt_property(fdt, "compatible", compatible, len); if ( res ) return res; irq = timer_dt_irq(TIMER_PHYS_SECURE_PPI); DPRINT(" Secure interrupt %u\n", irq->irq); set_interrupt_ppi(intrs[0], irq->irq, 0xf, irq->type); irq = timer_dt_irq(TIMER_PHYS_NONSECURE_PPI); DPRINT(" Non secure interrupt %u\n", irq->irq); set_interrupt_ppi(intrs[1], irq->irq, 0xf, irq->type); irq = timer_dt_irq(TIMER_VIRT_PPI); DPRINT(" Virt interrupt %u\n", irq->irq); set_interrupt_ppi(intrs[2], irq->irq, 0xf, irq->type); res = fdt_property_interrupts(fdt, intrs, 3); if ( res ) return res; res = fdt_end_node(fdt); return res; } /* Map the device in the domain */ static int map_device(struct domain *d, const struct dt_device_node *dev) { unsigned int nirq; unsigned int naddr; unsigned int i; int res; struct dt_irq irq; struct dt_raw_irq rirq; u64 addr, size; nirq = dt_number_of_irq(dev); naddr = dt_number_of_address(dev); DPRINT("%s nirq = %d naddr = %u\n", dt_node_full_name(dev), nirq, naddr); /* Map IRQs */ for ( i = 0; i < nirq; i++ ) { res = dt_device_get_raw_irq(dev, i, &rirq); if ( res ) { printk(XENLOG_ERR "Unable to retrieve irq %u for %s\n", i, dt_node_full_name(dev)); return res; } /* * Don't map IRQ that have no physical meaning * ie: IRQ whose controller is not the GIC */ if ( rirq.controller != dt_interrupt_controller ) { DPRINT("irq %u not connected to primary controller." 
"Connected to %s\n", i, dt_node_full_name(rirq.controller)); continue; } res = dt_irq_translate(&rirq, &irq); if ( res ) { printk(XENLOG_ERR "Unable to translate irq %u for %s\n", i, dt_node_full_name(dev)); return res; } DPRINT("irq %u = %u type = 0x%x\n", i, irq.irq, irq.type); /* Don't check return because the IRQ can be use by multiple device */ gic_route_irq_to_guest(d, &irq, dt_node_name(dev)); } /* Map the address ranges */ for ( i = 0; i < naddr; i++ ) { res = dt_device_get_address(dev, i, &addr, &size); if ( res ) { printk(XENLOG_ERR "Unable to retrieve address %u for %s\n", i, dt_node_full_name(dev)); return res; } DPRINT("addr %u = 0x%"PRIx64" - 0x%"PRIx64"\n", i, addr, addr + size - 1); res = map_mmio_regions(d, addr & PAGE_MASK, PAGE_ALIGN(addr + size) - 1, addr & PAGE_MASK); if ( res ) { printk(XENLOG_ERR "Unable to map 0x%"PRIx64 " - 0x%"PRIx64" in dom0\n", addr & PAGE_MASK, PAGE_ALIGN(addr + size) - 1); return res; } } return 0; } static int handle_node(struct domain *d, struct kernel_info *kinfo, const struct dt_device_node *node) { static const struct dt_device_match skip_matches[] __initconst = { DT_MATCH_COMPATIBLE("xen,xen"), DT_MATCH_COMPATIBLE("xen,multiboot-module"), DT_MATCH_COMPATIBLE("arm,psci"), DT_MATCH_PATH("/cpus"), DT_MATCH_TYPE("memory"), { /* sentinel */ }, }; static const struct dt_device_match gic_matches[] __initconst = { DT_MATCH_GIC, { /* sentinel */ }, }; static const struct dt_device_match timer_matches[] __initconst = { DT_MATCH_TIMER, { /* sentinel */ }, }; const struct dt_device_node *child; int res; const char *name; const char *path; path = dt_node_full_name(node); DPRINT("handle %s\n", path); /* Skip theses nodes and the sub-nodes */ if ( dt_match_node(skip_matches, node) ) { DPRINT(" Skip it (matched)\n"); return 0; } if ( platform_device_is_blacklisted(node) ) { DPRINT(" Skip it (blacklisted)\n"); return 0; } /* Replace these nodes with our own. Note that the original may be * used_by DOMID_XEN so this check comes first. */ if ( dt_match_node(gic_matches, node) ) return make_gic_node(d, kinfo->fdt, node); if ( dt_match_node(timer_matches, node) ) return make_timer_node(d, kinfo->fdt, node); /* Skip nodes used by Xen */ if ( dt_device_used_by(node) == DOMID_XEN ) { DPRINT(" Skip it (used by Xen)\n"); return 0; } /* * Some device doesn't need to be mapped in Xen: * - Memory: the guest will see a different view of memory. It will * be allocated later. * - Disabled device: Linux is able to cope with status="disabled" * property. Therefore these device doesn't need to be mapped. This * solution can be use later for pass through. */ if ( !dt_device_type_is_equal(node, "memory") && dt_device_is_available(node) ) { res = map_device(d, node); if ( res ) return res; } /* * The property "name" is used to have a different name on older FDT * version. We want to keep the name retrieved during the tree * structure creation, that is store in the node path. */ name = strrchr(path, '/'); name = name ? 
name + 1 : path; res = fdt_begin_node(kinfo->fdt, name); if ( res ) return res; res = write_properties(d, kinfo, node); if ( res ) return res; for ( child = node->child; child != NULL; child = child->sibling ) { res = handle_node(d, kinfo, child); if ( res ) return res; } if ( node == dt_host ) { res = make_hypervisor_node(d, kinfo->fdt, node); if ( res ) return res; res = make_psci_node(kinfo->fdt, node); if ( res ) return res; res = make_cpus_node(d, kinfo->fdt, node); if ( res ) return res; res = make_memory_node(d, kinfo->fdt, node, kinfo); if ( res ) return res; } res = fdt_end_node(kinfo->fdt); return res; } static int prepare_dtb(struct domain *d, struct kernel_info *kinfo) { const void *fdt; int new_size; int ret; ASSERT(dt_host && (dt_host->sibling == NULL)); fdt = device_tree_flattened; new_size = fdt_totalsize(fdt) + DOM0_FDT_EXTRA_SIZE; kinfo->fdt = xmalloc_bytes(new_size); if ( kinfo->fdt == NULL ) return -ENOMEM; ret = fdt_create(kinfo->fdt, new_size); if ( ret < 0 ) goto err; fdt_finish_reservemap(kinfo->fdt); ret = handle_node(d, kinfo, dt_host); if ( ret ) goto err; ret = fdt_finish(kinfo->fdt); if ( ret < 0 ) goto err; return 0; err: printk("Device tree generation failed (%d).\n", ret); xfree(kinfo->fdt); return -EINVAL; } static void dtb_load(struct kernel_info *kinfo) { void * __user dtb_virt = (void * __user)(register_t)kinfo->dtb_paddr; unsigned long left; printk("Loading dom0 DTB to 0x%"PRIpaddr"-0x%"PRIpaddr"\n", kinfo->dtb_paddr, kinfo->dtb_paddr + fdt_totalsize(kinfo->fdt)); left = raw_copy_to_guest_flush_dcache(dtb_virt, kinfo->fdt, fdt_totalsize(kinfo->fdt)); if ( left != 0 ) panic("Unable to copy the DTB to dom0 memory (left = %lu bytes)", left); xfree(kinfo->fdt); } static void initrd_load(struct kernel_info *kinfo) { paddr_t load_addr = kinfo->initrd_paddr; paddr_t paddr = early_info.modules.module[MOD_INITRD].start; paddr_t len = early_info.modules.module[MOD_INITRD].size; unsigned long offs; int node; int res; __be32 val[2]; __be32 *cellp; if ( !len ) return; printk("Loading dom0 initrd from %"PRIpaddr" to 0x%"PRIpaddr"-0x%"PRIpaddr"\n", paddr, load_addr, load_addr + len); /* Fix up linux,initrd-start and linux,initrd-end in /chosen */ node = fdt_path_offset(kinfo->fdt, "/chosen"); if ( node < 0 ) panic("Cannot find the /chosen node"); cellp = (__be32 *)val; dt_set_cell(&cellp, ARRAY_SIZE(val), load_addr); res = fdt_setprop_inplace(kinfo->fdt, node, "linux,initrd-start", val, sizeof(val)); if ( res ) panic("Cannot fix up \"linux,initrd-start\" property"); cellp = (__be32 *)val; dt_set_cell(&cellp, ARRAY_SIZE(val), load_addr + len); res = fdt_setprop_inplace(kinfo->fdt, node, "linux,initrd-end", val, sizeof(val)); if ( res ) panic("Cannot fix up \"linux,initrd-end\" property"); for ( offs = 0; offs < len; ) { int rc; paddr_t s, l, ma; void *dst; s = offs & ~PAGE_MASK; l = min(PAGE_SIZE - s, len); rc = gvirt_to_maddr(load_addr + offs, &ma); if ( rc ) { panic("Unable to translate guest address"); return; } dst = map_domain_page(ma>>PAGE_SHIFT); copy_from_paddr(dst + s, paddr + offs, l, BUFFERABLE); unmap_domain_page(dst); offs += l; } } int construct_dom0(struct domain *d) { struct kernel_info kinfo = {}; int rc, i, cpu; struct vcpu *v = d->vcpu[0]; struct cpu_user_regs *regs = &v->arch.cpu_info->guest_cpu_user_regs; /* Sanity! 
*/ BUG_ON(d->domain_id != 0); BUG_ON(d->vcpu[0] == NULL); BUG_ON(v->is_initialised); printk("*** LOADING DOMAIN 0 ***\n"); d->max_pages = ~0U; kinfo.unassigned_mem = dom0_mem; allocate_memory(d, &kinfo); rc = kernel_prepare(&kinfo); if ( rc < 0 ) return rc; #ifdef CONFIG_ARM_64 d->arch.type = kinfo.type; #endif rc = prepare_dtb(d, &kinfo); if ( rc < 0 ) return rc; rc = platform_specific_mapping(d); if ( rc < 0 ) return rc; /* The following loads use the domain's p2m */ p2m_load_VTTBR(d); #ifdef CONFIG_ARM_64 d->arch.type = kinfo.type; if ( is_pv32_domain(d) ) WRITE_SYSREG(READ_SYSREG(HCR_EL2) & ~HCR_RW, HCR_EL2); else WRITE_SYSREG(READ_SYSREG(HCR_EL2) | HCR_RW, HCR_EL2); #endif /* * kernel_load will determine the placement of the initrd & fdt in * RAM, so call it first. */ kernel_load(&kinfo); /* initrd_load will fix up the fdt, so call it before dtb_load */ initrd_load(&kinfo); dtb_load(&kinfo); discard_initial_modules(); v->is_initialised = 1; clear_bit(_VPF_down, &v->pause_flags); memset(regs, 0, sizeof(*regs)); regs->pc = (register_t)kinfo.entry; if ( is_pv32_domain(d) ) { regs->cpsr = PSR_GUEST32_INIT; /* FROM LINUX head.S * * Kernel startup entry point. * --------------------------- * * This is normally called from the decompressor code. The requirements * are: MMU = off, D-cache = off, I-cache = dont care, r0 = 0, * r1 = machine nr, r2 = atags or dtb pointer. *... */ regs->r0 = 0; /* SBZ */ regs->r1 = 0xffffffff; /* We use DTB therefore no machine id */ regs->r2 = kinfo.dtb_paddr; } #ifdef CONFIG_ARM_64 else { regs->cpsr = PSR_GUEST64_INIT; /* From linux/Documentation/arm64/booting.txt */ regs->x0 = kinfo.dtb_paddr; regs->x1 = 0; /* Reserved for future use */ regs->x2 = 0; /* Reserved for future use */ regs->x3 = 0; /* Reserved for future use */ } #endif for ( i = 1, cpu = 0; i < d->max_vcpus; i++ ) { cpu = cpumask_cycle(cpu, &cpu_online_map); if ( alloc_vcpu(d, i, cpu) == NULL ) { printk("Failed to allocate dom0 vcpu %d on pcpu %d\n", i, cpu); break; } } return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/shutdown.c0000664000175000017500000000175412307313555014757 0ustar smbsmb#include #include #include #include #include #include #include static void raw_machine_reset(void) { platform_reset(); } static void halt_this_cpu(void *arg) { stop_cpu(); } void machine_halt(void) { watchdog_disable(); console_start_sync(); local_irq_enable(); smp_call_function(halt_this_cpu, NULL, 0); halt_this_cpu(NULL); } void machine_restart(unsigned int delay_millisecs) { int timeout = 10; local_irq_enable(); smp_call_function(halt_this_cpu, NULL, 0); local_irq_disable(); mdelay(delay_millisecs); /* Wait at most another 10ms for all other CPUs to go offline. 
*/ while ( (num_online_cpus() > 1) && (timeout-- > 0) ) mdelay(1); while ( 1 ) { raw_machine_reset(); mdelay(100); } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/0000775000175000017500000000000012307313555013655 5ustar smbsmbxen-4.4.0/xen/arch/arm/arm32/domctl.c0000664000175000017500000000140012307313555015276 0ustar smbsmb/****************************************************************************** * Subarch-specific domctl.c * * Copyright (c) 2013, Citrix Systems */ #include #include #include #include #include #include #include long subarch_do_domctl(struct xen_domctl *domctl, struct domain *d, XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) { switch ( domctl->cmd ) { case XEN_DOMCTL_set_address_size: return domctl->u.address_size.size == 32 ? 0 : -EINVAL; default: return -ENOSYS; } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/proc-v7.S0000664000175000017500000000362012307313555015277 0ustar smbsmb/* * xen/arch/arm/proc-v7.S * * rename from xen/arch/arm/proc-ca15.S * arm v7 specific initializations * * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include v7_init: /* Set up the SMP bit in ACTLR */ mrc CP32(r0, ACTLR) orr r0, r0, #(ACTLR_V7_SMP) /* enable SMP bit */ mcr CP32(r0, ACTLR) mov pc, lr .section ".init.proc.info", #alloc, #execinstr .type __v7_ca15mp_proc_info, #object __v7_ca15mp_proc_info: .long 0x410FC0F0 /* Cortex-A15 */ .long 0xFF0FFFF0 /* Mask */ .long v7_init .size __v7_ca15mp_proc_info, . - __v7_ca15mp_proc_info .section ".init.proc.info", #alloc, #execinstr .type __v7_ca7mp_proc_info, #object __v7_ca7mp_proc_info: .long 0x410FC070 /* Cortex-A7 */ .long 0xFF0FFFF0 /* Mask */ .long v7_init .size __v7_ca7mp_proc_info, . - __v7_ca7mp_proc_info .section ".init.proc.info", #alloc, #execinstr .type __v7_brahma15mp_proc_info, #object __v7_brahma15mp_proc_info: .long 0x420F00F2 /* Broadcom Brahma-B15 */ .long 0xFF0FFFFF /* Mask */ .long v7_init .size __v7_brahma15mp_proc_info, . - __v7_brahma15mp_proc_info /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/asm-offsets.c0000664000175000017500000000530712307313555016255 0ustar smbsmb/* * Generate definitions needed by assembly language modules. * This code generates raw asm output which is post-processed * to extract and format the required data. 
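 *
 * For illustration (the numeric offset is hypothetical): compiling
 * this file to assembly turns a line such as
 *
 *     OFFSET(UREGS_sp, struct cpu_user_regs, sp);
 *
 * into a marker line roughly of the form
 *
 *     ->UREGS_sp #52 offsetof(struct cpu_user_regs, sp)
 *
 * which the post-processing step then rewrites as
 *
 *     #define UREGS_sp 52
 *
 * in the generated offsets header that the assembly sources include.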
*/ #define COMPILE_OFFSETS #include #include #include #include #include #include #include #define DEFINE(_sym, _val) \ __asm__ __volatile__ ( "\n->" #_sym " %0 " #_val : : "i" (_val) ) #define BLANK() \ __asm__ __volatile__ ( "\n->" : : ) #define OFFSET(_sym, _str, _mem) \ DEFINE(_sym, offsetof(_str, _mem)); void __dummy__(void) { OFFSET(UREGS_sp, struct cpu_user_regs, sp); OFFSET(UREGS_lr, struct cpu_user_regs, lr); OFFSET(UREGS_pc, struct cpu_user_regs, pc); OFFSET(UREGS_cpsr, struct cpu_user_regs, cpsr); OFFSET(UREGS_LR_usr, struct cpu_user_regs, lr_usr); OFFSET(UREGS_SP_usr, struct cpu_user_regs, sp_usr); OFFSET(UREGS_SP_svc, struct cpu_user_regs, sp_svc); OFFSET(UREGS_LR_svc, struct cpu_user_regs, lr_svc); OFFSET(UREGS_SPSR_svc, struct cpu_user_regs, spsr_svc); OFFSET(UREGS_SP_abt, struct cpu_user_regs, sp_abt); OFFSET(UREGS_LR_abt, struct cpu_user_regs, lr_abt); OFFSET(UREGS_SPSR_abt, struct cpu_user_regs, spsr_abt); OFFSET(UREGS_SP_und, struct cpu_user_regs, sp_und); OFFSET(UREGS_LR_und, struct cpu_user_regs, lr_und); OFFSET(UREGS_SPSR_und, struct cpu_user_regs, spsr_und); OFFSET(UREGS_SP_irq, struct cpu_user_regs, sp_irq); OFFSET(UREGS_LR_irq, struct cpu_user_regs, lr_irq); OFFSET(UREGS_SPSR_irq, struct cpu_user_regs, spsr_irq); OFFSET(UREGS_SP_fiq, struct cpu_user_regs, sp_fiq); OFFSET(UREGS_LR_fiq, struct cpu_user_regs, lr_fiq); OFFSET(UREGS_SPSR_fiq, struct cpu_user_regs, spsr_fiq); OFFSET(UREGS_R8_fiq, struct cpu_user_regs, r8_fiq); OFFSET(UREGS_R9_fiq, struct cpu_user_regs, r9_fiq); OFFSET(UREGS_R10_fiq, struct cpu_user_regs, r10_fiq); OFFSET(UREGS_R11_fiq, struct cpu_user_regs, r11_fiq); OFFSET(UREGS_R12_fiq, struct cpu_user_regs, r12_fiq); OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, cpsr); DEFINE(UREGS_user_sizeof, sizeof(struct cpu_user_regs)); BLANK(); DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info)); OFFSET(VCPU_arch_saved_context, struct vcpu, arch.saved_context); BLANK(); DEFINE(PROCINFO_sizeof, sizeof(struct proc_info_list)); OFFSET(PROCINFO_cpu_val, struct proc_info_list, cpu_val); OFFSET(PROCINFO_cpu_mask, struct proc_info_list, cpu_mask); OFFSET(PROCINFO_cpu_init, struct proc_info_list, cpu_init); BLANK(); OFFSET(INITINFO_stack, struct init_info, stack); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/Makefile0000664000175000017500000000025512307313555015317 0ustar smbsmbsubdir-y += lib obj-y += entry.o obj-y += proc-v7.o obj-y += traps.o obj-y += domain.o obj-y += vfp.o obj-y += smpboot.o obj-y += domctl.o obj-$(EARLY_PRINTK) += debug.o xen-4.4.0/xen/arch/arm/arm32/debug.S0000664000175000017500000000246212307313555015073 0ustar smbsmb/* * xen/arch/arm/arm32/debug.S * * Wrapper for early printk * * Julien Grall * Copyright (c) 2013 Linaro Limited. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #include #include #ifdef EARLY_PRINTK_INC #include EARLY_PRINTK_INC #endif /* Print a character on the UART - this function is called by C * r0: character to print */ GLOBAL(early_putch) ldr r1, =EARLY_UART_VIRTUAL_ADDRESS /* r1 := VA UART base address */ early_uart_ready r1, r2 early_uart_transmit r1, r0 mov pc, lr /* Flush the UART - this function is called by C */ GLOBAL(early_flush) ldr r1, =EARLY_UART_VIRTUAL_ADDRESS /* r1 := VA UART base address */ early_uart_ready r1, r2 mov pc, lr /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/debug-exynos4210.inc0000664000175000017500000000512612307313555017274 0ustar smbsmb/* * xen/arch/arm/arm32/debug-exynos4210.inc * * Exynos 5 specific debug code * * Copyright (c) 2013 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include /* Exynos 5 UART initialization * rb: register which contains the UART base address * rc: scratch register 1 * rd: scratch register 2 */ .macro early_uart_init rb rc rd /* init clock */ ldr \rc, =0x10020000 /* select MPLL (800MHz) source clock */ ldr \rd, [\rc, #0x250] and \rd, \rd, #(~(0xf<<8)) orr \rd, \rd, #(0x6<<8) str \rd, [\rc, #0x250] /* ratio 800/(7+1) */ ldr \rd, [\rc, #0x558] and \rd, \rd, #(~(0xf<<8)) orr \rd, \rd, #(0x7<<8) str \rd, [\rc, #0x558] mov \rc, #(100000000 / EARLY_PRINTK_BAUD % 16) str \rc, [\rb, #UFRACVAL] /* -> UFRACVAL (Baud divisor fraction) */ mov \rc, #(100000000 / EARLY_PRINTK_BAUD / 16 - 1) str \rc, [\rb, #UBRDIV] /* -> UBRDIV (Baud divisor integer) */ mov \rc, #3 /* 8n1 */ str \rc, [\rb, #ULCON] /* -> (Line control) */ ldr \rc, =UCON_TX_IRQ /* TX IRQMODE */ str \rc, [\rb, #UCON] /* -> (Control Register) */ mov \rc, #0x0 str \rc, [\rb, #UFCON] /* disable FIFO */ mov \rc, #0x0 str \rc, [\rb, #UMCON] /* no auto flow control */ .endm /* Exynos 5 UART wait UART to be ready to transmit * rb: register which contains the UART base address * rc: scratch register */ .macro early_uart_ready rb rc 1: ldr \rc, [\rb, #UTRSTAT] /* <- UTRSTAT (Flag register) */ tst \rc, #UTRSTAT_TXFE /* Check BUSY bit */ beq 1b /* Wait for the UART to be ready */ .endm /* Exynos 5 UART transmit character * rb: register which contains the UART base address * rt: register which contains the character to transmit */ .macro early_uart_transmit rb rt str \rt, [\rb, #UTXH] /* -> UTXH (Data Register) */ .endm /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/domain.c0000664000175000017500000000274112307313555015274 0ustar smbsmb#include #include #include #include #include /* C(hyp,user), hyp is Xen internal name, user is user API name. 
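 *
 * For example, inside vcpu_regs_hyp_to_user() the entry
 * C(r0,r0_usr) expands to
 *
 *     regs->r0_usr = vcpu->arch.cpu_info->guest_cpu_user_regs.r0;
 *
 * and the second #define below reverses the direction for
 * vcpu_regs_user_to_hyp().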
*/ #define ALLREGS \ C(r0,r0_usr); C(r1,r1_usr); C(r2,r2_usr); C(r3,r3_usr); \ C(r4,r4_usr); C(r5,r5_usr); C(r6,r6_usr); C(r7,r7_usr); \ C(r8,r8_usr); C(r9,r9_usr); C(r10,r10_usr); C(r11,r11_usr); \ C(r12,r12_usr); \ C(sp_usr,sp_usr); \ C(lr,lr_usr); \ C(spsr_irq,spsr_irq); C(lr_irq,lr_irq); C(sp_irq,sp_irq); \ C(spsr_svc,spsr_svc); C(lr_svc,lr_svc); C(sp_svc,sp_svc); \ C(spsr_abt,spsr_abt); C(lr_abt,lr_abt); C(sp_abt,sp_abt); \ C(spsr_und,spsr_und); C(lr_und,lr_und); C(sp_und,sp_und); \ C(spsr_fiq,spsr_fiq); C(sp_fiq,sp_fiq); C(sp_fiq,sp_fiq); \ C(r8_fiq,r8_fiq); C(r9_fiq,r9_fiq); \ C(r10_fiq,r10_fiq); C(r11_fiq,r11_fiq); C(r12_fiq,r12_fiq); \ C(pc,pc32); \ C(cpsr,cpsr) void vcpu_regs_hyp_to_user(const struct vcpu *vcpu, struct vcpu_guest_core_regs *regs) { #define C(hyp,user) regs->user = vcpu->arch.cpu_info->guest_cpu_user_regs.hyp ALLREGS; #undef C } void vcpu_regs_user_to_hyp(struct vcpu *vcpu, const struct vcpu_guest_core_regs *regs) { #define C(hyp,user) vcpu->arch.cpu_info->guest_cpu_user_regs.hyp = regs->user ALLREGS; #undef C } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/lib/0000775000175000017500000000000012307313555014423 5ustar smbsmbxen-4.4.0/xen/arch/arm/arm32/lib/memmove.S0000664000175000017500000000775712307313555016234 0ustar smbsmb/* * linux/arch/arm/lib/memmove.S * * Author: Nicolas Pitre * Created: Sep 28, 2005 * Copyright: (C) MontaVista Software Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include #include "assembler.h" .text /* * Prototype: void *memmove(void *dest, const void *src, size_t n); * * Note: * * If the memory regions don't overlap, we simply branch to memcpy which is * normally a bit faster. Otherwise the copy is done going downwards. This * is a transposition of the code from copy_template.S but with the copy * occurring in the opposite direction. */ ENTRY(memmove) subs ip, r0, r1 cmphi r2, ip bls memcpy stmfd sp!, {r0, r4, lr} add r1, r1, r2 add r0, r0, r2 subs r2, r2, #4 blt 8f ands ip, r0, #3 PLD( pld [r1, #-4] ) bne 9f ands ip, r1, #3 bne 10f 1: subs r2, r2, #(28) stmfd sp!, {r5 - r8} blt 5f CALGN( ands ip, r0, #31 ) CALGN( sbcnes r4, ip, r2 ) @ C is always set here CALGN( bcs 2f ) CALGN( adr r4, 6f ) CALGN( subs r2, r2, ip ) @ C is set here CALGN( rsb ip, ip, #32 ) CALGN( add pc, r4, ip ) PLD( pld [r1, #-4] ) 2: PLD( subs r2, r2, #96 ) PLD( pld [r1, #-32] ) PLD( blt 4f ) PLD( pld [r1, #-64] ) PLD( pld [r1, #-96] ) 3: PLD( pld [r1, #-128] ) 4: ldmdb r1!, {r3, r4, r5, r6, r7, r8, ip, lr} subs r2, r2, #32 stmdb r0!, {r3, r4, r5, r6, r7, r8, ip, lr} bge 3b PLD( cmn r2, #96 ) PLD( bge 4b ) 5: ands ip, r2, #28 rsb ip, ip, #32 addne pc, pc, ip @ C is always clear here b 7f 6: W(nop) W(ldr) r3, [r1, #-4]! W(ldr) r4, [r1, #-4]! W(ldr) r5, [r1, #-4]! W(ldr) r6, [r1, #-4]! W(ldr) r7, [r1, #-4]! W(ldr) r8, [r1, #-4]! W(ldr) lr, [r1, #-4]! add pc, pc, ip nop W(nop) W(str) r3, [r0, #-4]! W(str) r4, [r0, #-4]! W(str) r5, [r0, #-4]! W(str) r6, [r0, #-4]! W(str) r7, [r0, #-4]! W(str) r8, [r0, #-4]! W(str) lr, [r0, #-4]! CALGN( bcs 2b ) 7: ldmfd sp!, {r5 - r8} 8: movs r2, r2, lsl #31 ldrneb r3, [r1, #-1]! ldrcsb r4, [r1, #-1]! ldrcsb ip, [r1, #-1] strneb r3, [r0, #-1]! strcsb r4, [r0, #-1]! strcsb ip, [r0, #-1] ldmfd sp!, {r0, r4, pc} 9: cmp ip, #2 ldrgtb r3, [r1, #-1]! ldrgeb r4, [r1, #-1]! ldrb lr, [r1, #-1]! 
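	@ The conditional byte loads above fetched the 1-3 bytes needed to
	@ word-align the destination for the backwards copy; the matching
	@ conditional byte stores follow.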
strgtb r3, [r0, #-1]! strgeb r4, [r0, #-1]! subs r2, r2, ip strb lr, [r0, #-1]! blt 8b ands ip, r1, #3 beq 1b 10: bic r1, r1, #3 cmp ip, #2 ldr r3, [r1, #0] beq 17f blt 18f .macro backward_copy_shift push pull subs r2, r2, #28 blt 14f CALGN( ands ip, r0, #31 ) CALGN( sbcnes r4, ip, r2 ) @ C is always set here CALGN( subcc r2, r2, ip ) CALGN( bcc 15f ) 11: stmfd sp!, {r5 - r9} PLD( pld [r1, #-4] ) PLD( subs r2, r2, #96 ) PLD( pld [r1, #-32] ) PLD( blt 13f ) PLD( pld [r1, #-64] ) PLD( pld [r1, #-96] ) 12: PLD( pld [r1, #-128] ) 13: ldmdb r1!, {r7, r8, r9, ip} mov lr, r3, push #\push subs r2, r2, #32 ldmdb r1!, {r3, r4, r5, r6} orr lr, lr, ip, pull #\pull mov ip, ip, push #\push orr ip, ip, r9, pull #\pull mov r9, r9, push #\push orr r9, r9, r8, pull #\pull mov r8, r8, push #\push orr r8, r8, r7, pull #\pull mov r7, r7, push #\push orr r7, r7, r6, pull #\pull mov r6, r6, push #\push orr r6, r6, r5, pull #\pull mov r5, r5, push #\push orr r5, r5, r4, pull #\pull mov r4, r4, push #\push orr r4, r4, r3, pull #\pull stmdb r0!, {r4 - r9, ip, lr} bge 12b PLD( cmn r2, #96 ) PLD( bge 13b ) ldmfd sp!, {r5 - r9} 14: ands ip, r2, #28 beq 16f 15: mov lr, r3, push #\push ldr r3, [r1, #-4]! subs ip, ip, #4 orr lr, lr, r3, pull #\pull str lr, [r0, #-4]! bgt 15b CALGN( cmp r2, #0 ) CALGN( bge 11b ) 16: add r1, r1, #(\pull / 8) b 8b .endm backward_copy_shift push=8 pull=24 17: backward_copy_shift push=16 pull=16 18: backward_copy_shift push=24 pull=8 ENDPROC(memmove) xen-4.4.0/xen/arch/arm/arm32/lib/lshrdi3.S0000664000175000017500000000321512307313555016120 0ustar smbsmb/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005 Free Software Foundation, Inc. This file is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. In addition to the permissions in the GNU General Public License, the Free Software Foundation gives you unlimited permission to link the compiled version of this file into combinations with other programs, and to distribute those combinations without any restriction coming from the use of this file. (The General Public License restrictions do apply in other respects; for example, they cover modification of the file, and distribution when not linked into a combine executable.) This file is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, write to the Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include "assembler.h" #ifdef __ARMEB__ #define al r1 #define ah r0 #else #define al r0 #define ah r1 #endif ENTRY(__lshrdi3) ENTRY(__aeabi_llsr) subs r3, r2, #32 rsb ip, r2, #32 movmi al, al, lsr r2 movpl al, ah, lsr r3 ARM( orrmi al, al, ah, lsl ip ) THUMB( lslmi r3, ah, ip ) THUMB( orrmi al, al, r3 ) mov ah, ah, lsr r2 mov pc, lr ENDPROC(__lshrdi3) ENDPROC(__aeabi_llsr) xen-4.4.0/xen/arch/arm/arm32/lib/copy_template.S0000664000175000017500000001370612307313555017423 0ustar smbsmb/* * linux/arch/arm/lib/copy_template.s * * Code template for optimized memory copy functions * * Author: Nicolas Pitre * Created: Sep 28, 2005 * Copyright: MontaVista Software, Inc. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ /* * Theory of operation * ------------------- * * This file provides the core code for a forward memory copy used in * the implementation of memcopy(), copy_to_user() and copy_from_user(). * * The including file must define the following accessor macros * according to the need of the given function: * * ldr1w ptr reg abort * * This loads one word from 'ptr', stores it in 'reg' and increments * 'ptr' to the next word. The 'abort' argument is used for fixup tables. * * ldr4w ptr reg1 reg2 reg3 reg4 abort * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort * * This loads four or eight words starting from 'ptr', stores them * in provided registers and increments 'ptr' past those words. * The'abort' argument is used for fixup tables. * * ldr1b ptr reg cond abort * * Similar to ldr1w, but it loads a byte and increments 'ptr' one byte. * It also must apply the condition code if provided, otherwise the * "al" condition is assumed by default. * * str1w ptr reg abort * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort * str1b ptr reg cond abort * * Same as their ldr* counterparts, but data is stored to 'ptr' location * rather than being loaded. * * enter reg1 reg2 * * Preserve the provided registers on the stack plus any additional * data as needed by the implementation including this code. Called * upon code entry. * * exit reg1 reg2 * * Restore registers with the values previously saved with the * 'preserv' macro. Called upon code termination. * * LDR1W_SHIFT * STR1W_SHIFT * * Correction to be applied to the "ip" register when branching into * the ldr1w or str1w instructions (some of these macros may expand to * than one 32bit instruction in Thumb-2) */ enter r4, lr subs r2, r2, #4 blt 8f ands ip, r0, #3 PLD( pld [r1, #0] ) bne 9f ands ip, r1, #3 bne 10f 1: subs r2, r2, #(28) stmfd sp!, {r5 - r8} blt 5f CALGN( ands ip, r0, #31 ) CALGN( rsb r3, ip, #32 ) CALGN( sbcnes r4, r3, r2 ) @ C is always set here CALGN( bcs 2f ) CALGN( adr r4, 6f ) CALGN( subs r2, r2, r3 ) @ C gets set CALGN( add pc, r4, ip ) PLD( pld [r1, #0] ) 2: PLD( subs r2, r2, #96 ) PLD( pld [r1, #28] ) PLD( blt 4f ) PLD( pld [r1, #60] ) PLD( pld [r1, #92] ) 3: PLD( pld [r1, #124] ) 4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f subs r2, r2, #32 str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f bge 3b PLD( cmn r2, #96 ) PLD( bge 4b ) 5: ands ip, r2, #28 rsb ip, ip, #32 #if LDR1W_SHIFT > 0 lsl ip, ip, #LDR1W_SHIFT #endif addne pc, pc, ip @ C is always clear here b 7f 6: .rept (1 << LDR1W_SHIFT) W(nop) .endr ldr1w r1, r3, abort=20f ldr1w r1, r4, abort=20f ldr1w r1, r5, abort=20f ldr1w r1, r6, abort=20f ldr1w r1, r7, abort=20f ldr1w r1, r8, abort=20f ldr1w r1, lr, abort=20f #if LDR1W_SHIFT < STR1W_SHIFT lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT #elif LDR1W_SHIFT > STR1W_SHIFT lsr ip, ip, #LDR1W_SHIFT - STR1W_SHIFT #endif add pc, pc, ip nop .rept (1 << STR1W_SHIFT) W(nop) .endr str1w r0, r3, abort=20f str1w r0, r4, abort=20f str1w r0, r5, abort=20f str1w r0, r6, abort=20f str1w r0, r7, abort=20f str1w r0, r8, abort=20f str1w r0, lr, abort=20f CALGN( bcs 2b ) 7: ldmfd sp!, {r5 - r8} 8: movs r2, r2, lsl #31 ldr1b r1, r3, ne, abort=21f ldr1b r1, r4, cs, abort=21f ldr1b r1, ip, cs, abort=21f str1b r0, r3, ne, abort=21f str1b r0, r4, cs, abort=21f str1b r0, ip, cs, abort=21f exit r4, pc 9: rsb ip, ip, #4 cmp ip, #2 ldr1b r1, r3, gt, 
abort=21f ldr1b r1, r4, ge, abort=21f ldr1b r1, lr, abort=21f str1b r0, r3, gt, abort=21f str1b r0, r4, ge, abort=21f subs r2, r2, ip str1b r0, lr, abort=21f blt 8b ands ip, r1, #3 beq 1b 10: bic r1, r1, #3 cmp ip, #2 ldr1w r1, lr, abort=21f beq 17f bgt 18f .macro forward_copy_shift pull push subs r2, r2, #28 blt 14f CALGN( ands ip, r0, #31 ) CALGN( rsb ip, ip, #32 ) CALGN( sbcnes r4, ip, r2 ) @ C is always set here CALGN( subcc r2, r2, ip ) CALGN( bcc 15f ) 11: stmfd sp!, {r5 - r9} PLD( pld [r1, #0] ) PLD( subs r2, r2, #96 ) PLD( pld [r1, #28] ) PLD( blt 13f ) PLD( pld [r1, #60] ) PLD( pld [r1, #92] ) 12: PLD( pld [r1, #124] ) 13: ldr4w r1, r4, r5, r6, r7, abort=19f mov r3, lr, pull #\pull subs r2, r2, #32 ldr4w r1, r8, r9, ip, lr, abort=19f orr r3, r3, r4, push #\push mov r4, r4, pull #\pull orr r4, r4, r5, push #\push mov r5, r5, pull #\pull orr r5, r5, r6, push #\push mov r6, r6, pull #\pull orr r6, r6, r7, push #\push mov r7, r7, pull #\pull orr r7, r7, r8, push #\push mov r8, r8, pull #\pull orr r8, r8, r9, push #\push mov r9, r9, pull #\pull orr r9, r9, ip, push #\push mov ip, ip, pull #\pull orr ip, ip, lr, push #\push str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f bge 12b PLD( cmn r2, #96 ) PLD( bge 13b ) ldmfd sp!, {r5 - r9} 14: ands ip, r2, #28 beq 16f 15: mov r3, lr, pull #\pull ldr1w r1, lr, abort=21f subs ip, ip, #4 orr r3, r3, lr, push #\push str1w r0, r3, abort=21f bgt 15b CALGN( cmp r2, #0 ) CALGN( bge 11b ) 16: sub r1, r1, #(\push / 8) b 8b .endm forward_copy_shift pull=8 push=24 17: forward_copy_shift pull=16 push=16 18: forward_copy_shift pull=24 push=8 /* * Abort preamble and completion macros. * If a fixup handler is required then those macros must surround it. * It is assumed that the fixup code will handle the private part of * the exit macro. */ .macro copy_abort_preamble 19: ldmfd sp!, {r5 - r9} b 21f 20: ldmfd sp!, {r5 - r8} 21: .endm .macro copy_abort_end ldmfd sp!, {r4, pc} .endm xen-4.4.0/xen/arch/arm/arm32/lib/Makefile0000664000175000017500000000032012307313555016056 0ustar smbsmbobj-y += memcpy.o memmove.o memset.o memzero.o obj-y += findbit.o setbit.o obj-y += setbit.o clearbit.o changebit.o obj-y += testsetbit.o testclearbit.o testchangebit.o obj-y += lib1funcs.o lshrdi3.o div64.o xen-4.4.0/xen/arch/arm/arm32/lib/div64.S0000664000175000017500000000772712307313555015520 0ustar smbsmb/* * linux/arch/arm/lib/div64.S * * Optimized computation of 64-bit dividend / 32-bit divisor * * Author: Nicolas Pitre * Created: Oct 5, 2003 * Copyright: Monta Vista Software, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include #include "assembler.h" #ifdef __ARMEB__ #define xh r0 #define xl r1 #define yh r2 #define yl r3 #else #define xl r0 #define xh r1 #define yl r2 #define yh r3 #endif /* * __do_div64: perform a division with 64-bit dividend and 32-bit divisor. * * Note: Calling convention is totally non standard for optimal code. * This is meant to be used by do_div() from include/asm/div64.h only. * * Input parameters: * xh-xl = dividend (clobbered) * r4 = divisor (preserved) * * Output values: * yh-yl = result * xh = remainder * * Clobbered regs: xl, ip */ ENTRY(__do_div64) UNWIND(.fnstart) @ Test for easy paths first. subs ip, r4, #1 bls 9f @ divisor is 0 or 1 tst ip, r4 beq 8f @ divisor is power of 2 @ See if we need to handle upper 32-bit result. 
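	@ Illustrative example (hypothetical operands): for 0x300000007 / 3
	@ the top word xh = 3 is not below r4 = 3, so the divisor is first
	@ aligned against xh and the upper quotient bits are generated here,
	@ yielding 0x100000002 remainder 1 once the lower loop finishes; a
	@ dividend that fits in 32 bits skips this path entirely.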
cmp xh, r4 mov yh, #0 blo 3f @ Align divisor with upper part of dividend. @ The aligned divisor is stored in yl preserving the original. @ The bit position is stored in ip. #if __LINUX_ARM_ARCH__ >= 5 clz yl, r4 clz ip, xh sub yl, yl, ip mov ip, #1 mov ip, ip, lsl yl mov yl, r4, lsl yl #else mov yl, r4 mov ip, #1 1: cmp yl, #0x80000000 cmpcc yl, xh movcc yl, yl, lsl #1 movcc ip, ip, lsl #1 bcc 1b #endif @ The division loop for needed upper bit positions. @ Break out early if dividend reaches 0. 2: cmp xh, yl orrcs yh, yh, ip subcss xh, xh, yl movnes ip, ip, lsr #1 mov yl, yl, lsr #1 bne 2b @ See if we need to handle lower 32-bit result. 3: cmp xh, #0 mov yl, #0 cmpeq xl, r4 movlo xh, xl movlo pc, lr @ The division loop for lower bit positions. @ Here we shift remainer bits leftwards rather than moving the @ divisor for comparisons, considering the carry-out bit as well. mov ip, #0x80000000 4: movs xl, xl, lsl #1 adcs xh, xh, xh beq 6f cmpcc xh, r4 5: orrcs yl, yl, ip subcs xh, xh, r4 movs ip, ip, lsr #1 bne 4b mov pc, lr @ The top part of remainder became zero. If carry is set @ (the 33th bit) this is a false positive so resume the loop. @ Otherwise, if lower part is also null then we are done. 6: bcs 5b cmp xl, #0 moveq pc, lr @ We still have remainer bits in the low part. Bring them up. #if __LINUX_ARM_ARCH__ >= 5 clz xh, xl @ we know xh is zero here so... add xh, xh, #1 mov xl, xl, lsl xh mov ip, ip, lsr xh #else 7: movs xl, xl, lsl #1 mov ip, ip, lsr #1 bcc 7b #endif @ Current remainder is now 1. It is worthless to compare with @ divisor at this point since divisor can not be smaller than 3 here. @ If possible, branch for another shift in the division loop. @ If no bit position left then we are done. movs ip, ip, lsr #1 mov xh, #1 bne 4b mov pc, lr 8: @ Division by a power of 2: determine what that divisor order is @ then simply shift values around #if __LINUX_ARM_ARCH__ >= 5 clz ip, r4 rsb ip, ip, #31 #else mov yl, r4 cmp r4, #(1 << 16) mov ip, #0 movhs yl, yl, lsr #16 movhs ip, #16 cmp yl, #(1 << 8) movhs yl, yl, lsr #8 addhs ip, ip, #8 cmp yl, #(1 << 4) movhs yl, yl, lsr #4 addhs ip, ip, #4 cmp yl, #(1 << 2) addhi ip, ip, #3 addls ip, ip, yl, lsr #1 #endif mov yh, xh, lsr ip mov yl, xl, lsr ip rsb ip, ip, #32 ARM( orr yl, yl, xh, lsl ip ) THUMB( lsl xh, xh, ip ) THUMB( orr yl, yl, xh ) mov xh, xl, lsl ip mov xh, xh, lsr ip mov pc, lr @ eq -> division by 1: obvious enough... 9: moveq yl, xl moveq yh, xh moveq xh, #0 moveq pc, lr UNWIND(.fnend) UNWIND(.fnstart) UNWIND(.pad #4) UNWIND(.save {lr}) Ldiv0_64: @ Division by 0: str lr, [sp, #-8]! bl __div0 @ as wrong as it could be... mov yl, #0 mov yh, #0 mov xh, #0 ldr pc, [sp], #8 UNWIND(.fnend) ENDPROC(__do_div64) xen-4.4.0/xen/arch/arm/arm32/lib/findbit.S0000664000175000017500000001154012307313555016167 0ustar smbsmb/* * linux/arch/arm/lib/findbit.S * * Copyright (C) 1995-2000 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * 16th March 2001 - John Ripley * Fixed so that "size" is an exclusive not an inclusive quantity. * All users of these functions expect exclusive sizes, and may * also call with zero size. * Reworked by rmk. 
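 *
 * Usage illustration (hypothetical arguments): calling
 * _find_first_zero_bit_le with maxbit = 32 scans bits 0..31 and
 * returns 32 when every bit is set; a maxbit of 0 returns 0
 * immediately, matching the exclusive-size convention above.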
*/ #include #include "assembler.h" .text /* * Purpose : Find a 'zero' bit * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit); */ ENTRY(_find_first_zero_bit_le) teq r1, #0 beq 3f mov r2, #0 1: ARM( ldrb r3, [r0, r2, lsr #3] ) THUMB( lsr r3, r2, #3 ) THUMB( ldrb r3, [r0, r3] ) eors r3, r3, #0xff @ invert bits bne .L_found @ any now set - found zero bit add r2, r2, #8 @ next bit pointer 2: cmp r2, r1 @ any more? blo 1b 3: mov r0, r1 @ no free bits mov pc, lr ENDPROC(_find_first_zero_bit_le) /* * Purpose : Find next 'zero' bit * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset) */ ENTRY(_find_next_zero_bit_le) teq r1, #0 beq 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine ARM( ldrb r3, [r0, r2, lsr #3] ) THUMB( lsr r3, r2, #3 ) THUMB( ldrb r3, [r0, r3] ) eor r3, r3, #0xff @ now looking for a 1 bit movs r3, r3, lsr ip @ shift off unused bits bne .L_found orr r2, r2, #7 @ if zero, then no bits here add r2, r2, #1 @ align bit pointer b 2b @ loop for next bit ENDPROC(_find_next_zero_bit_le) /* * Purpose : Find a 'one' bit * Prototype: int find_first_bit(const unsigned long *addr, unsigned int maxbit); */ ENTRY(_find_first_bit_le) teq r1, #0 beq 3f mov r2, #0 1: ARM( ldrb r3, [r0, r2, lsr #3] ) THUMB( lsr r3, r2, #3 ) THUMB( ldrb r3, [r0, r3] ) movs r3, r3 bne .L_found @ any now set - found zero bit add r2, r2, #8 @ next bit pointer 2: cmp r2, r1 @ any more? blo 1b 3: mov r0, r1 @ no free bits mov pc, lr ENDPROC(_find_first_bit_le) /* * Purpose : Find next 'one' bit * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset) */ ENTRY(_find_next_bit_le) teq r1, #0 beq 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine ARM( ldrb r3, [r0, r2, lsr #3] ) THUMB( lsr r3, r2, #3 ) THUMB( ldrb r3, [r0, r3] ) movs r3, r3, lsr ip @ shift off unused bits bne .L_found orr r2, r2, #7 @ if zero, then no bits here add r2, r2, #1 @ align bit pointer b 2b @ loop for next bit ENDPROC(_find_next_bit_le) #ifdef __ARMEB__ ENTRY(_find_first_zero_bit_be) teq r1, #0 beq 3f mov r2, #0 1: eor r3, r2, #0x18 @ big endian byte ordering ARM( ldrb r3, [r0, r3, lsr #3] ) THUMB( lsr r3, #3 ) THUMB( ldrb r3, [r0, r3] ) eors r3, r3, #0xff @ invert bits bne .L_found @ any now set - found zero bit add r2, r2, #8 @ next bit pointer 2: cmp r2, r1 @ any more? blo 1b 3: mov r0, r1 @ no free bits mov pc, lr ENDPROC(_find_first_zero_bit_be) ENTRY(_find_next_zero_bit_be) teq r1, #0 beq 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine eor r3, r2, #0x18 @ big endian byte ordering ARM( ldrb r3, [r0, r3, lsr #3] ) THUMB( lsr r3, #3 ) THUMB( ldrb r3, [r0, r3] ) eor r3, r3, #0xff @ now looking for a 1 bit movs r3, r3, lsr ip @ shift off unused bits bne .L_found orr r2, r2, #7 @ if zero, then no bits here add r2, r2, #1 @ align bit pointer b 2b @ loop for next bit ENDPROC(_find_next_zero_bit_be) ENTRY(_find_first_bit_be) teq r1, #0 beq 3f mov r2, #0 1: eor r3, r2, #0x18 @ big endian byte ordering ARM( ldrb r3, [r0, r3, lsr #3] ) THUMB( lsr r3, #3 ) THUMB( ldrb r3, [r0, r3] ) movs r3, r3 bne .L_found @ any now set - found zero bit add r2, r2, #8 @ next bit pointer 2: cmp r2, r1 @ any more? 
blo 1b 3: mov r0, r1 @ no free bits mov pc, lr ENDPROC(_find_first_bit_be) ENTRY(_find_next_bit_be) teq r1, #0 beq 3b ands ip, r2, #7 beq 1b @ If new byte, goto old routine eor r3, r2, #0x18 @ big endian byte ordering ARM( ldrb r3, [r0, r3, lsr #3] ) THUMB( lsr r3, #3 ) THUMB( ldrb r3, [r0, r3] ) movs r3, r3, lsr ip @ shift off unused bits bne .L_found orr r2, r2, #7 @ if zero, then no bits here add r2, r2, #1 @ align bit pointer b 2b @ loop for next bit ENDPROC(_find_next_bit_be) #endif /* * One or more bits in the LSB of r3 are assumed to be set. */ .L_found: #if __LINUX_ARM_ARCH__ >= 5 rsb r0, r3, #0 and r3, r3, r0 clz r3, r3 rsb r3, r3, #31 add r0, r2, r3 #else tst r3, #0x0f addeq r2, r2, #4 movne r3, r3, lsl #4 tst r3, #0x30 addeq r2, r2, #2 movne r3, r3, lsl #2 tst r3, #0x40 addeq r2, r2, #1 mov r0, r2 #endif cmp r1, r0 @ Clamp to maxbit movlo r0, r1 mov pc, lr xen-4.4.0/xen/arch/arm/arm32/lib/testsetbit.S0000664000175000017500000000067112307313555016745 0ustar smbsmb/* * linux/arch/arm/lib/testsetbit.S * * Copyright (C) 1995-1996 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include #include "assembler.h" #include "bitops.h" .text ENTRY(_test_and_set_bit) testop orreq, streq ENDPROC(_test_and_set_bit) xen-4.4.0/xen/arch/arm/arm32/lib/lib1funcs.S0000664000175000017500000002066712307313555016450 0ustar smbsmb/* * linux/arch/arm/lib/lib1funcs.S: Optimized ARM division routines * * Author: Nicolas Pitre * - contributed to gcc-3.4 on Sep 30, 2003 * - adapted for the Linux kernel on Oct 2, 2003 */ /* Copyright 1995, 1996, 1998, 1999, 2000, 2003 Free Software Foundation, Inc. This file is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. In addition to the permissions in the GNU General Public License, the Free Software Foundation gives you unlimited permission to link the compiled version of this file into combinations with other programs, and to distribute those combinations without any restriction coming from the use of this file. (The General Public License restrictions do apply in other respects; for example, they cover modification of the file, and distribution when not linked into a combine executable.) This file is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; see the file COPYING. If not, write to the Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include "assembler.h" .macro ARM_DIV_BODY dividend, divisor, result, curbit #if __LINUX_ARM_ARCH__ >= 5 clz \curbit, \divisor clz \result, \dividend sub \result, \curbit, \result mov \curbit, #1 mov \divisor, \divisor, lsl \result mov \curbit, \curbit, lsl \result mov \result, #0 #else @ Initially shift the divisor left 3 bits if possible, @ set curbit accordingly. This allows for curbit to be located @ at the left end of each 4 bit nibbles in the division loop @ to save one loop in most cases. 
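@ Worked example, for illustration: 100 / 7.  The divisor is first
@ shifted to 0x38 with curbit = 8, then the alignment loops below take
@ it to 0x380 with curbit = 0x80.  The unrolled loop's second pass then
@ subtracts 0x38, 0x1c and 0xe, leaving quotient 14 and remainder 2.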
tst \divisor, #0xe0000000 moveq \divisor, \divisor, lsl #3 moveq \curbit, #8 movne \curbit, #1 @ Unless the divisor is very big, shift it up in multiples of @ four bits, since this is the amount of unwinding in the main @ division loop. Continue shifting until the divisor is @ larger than the dividend. 1: cmp \divisor, #0x10000000 cmplo \divisor, \dividend movlo \divisor, \divisor, lsl #4 movlo \curbit, \curbit, lsl #4 blo 1b @ For very big divisors, we must shift it a bit at a time, or @ we will be in danger of overflowing. 1: cmp \divisor, #0x80000000 cmplo \divisor, \dividend movlo \divisor, \divisor, lsl #1 movlo \curbit, \curbit, lsl #1 blo 1b mov \result, #0 #endif @ Division loop 1: cmp \dividend, \divisor subhs \dividend, \dividend, \divisor orrhs \result, \result, \curbit cmp \dividend, \divisor, lsr #1 subhs \dividend, \dividend, \divisor, lsr #1 orrhs \result, \result, \curbit, lsr #1 cmp \dividend, \divisor, lsr #2 subhs \dividend, \dividend, \divisor, lsr #2 orrhs \result, \result, \curbit, lsr #2 cmp \dividend, \divisor, lsr #3 subhs \dividend, \dividend, \divisor, lsr #3 orrhs \result, \result, \curbit, lsr #3 cmp \dividend, #0 @ Early termination? movnes \curbit, \curbit, lsr #4 @ No, any more bits to do? movne \divisor, \divisor, lsr #4 bne 1b .endm .macro ARM_DIV2_ORDER divisor, order #if __LINUX_ARM_ARCH__ >= 5 clz \order, \divisor rsb \order, \order, #31 #else cmp \divisor, #(1 << 16) movhs \divisor, \divisor, lsr #16 movhs \order, #16 movlo \order, #0 cmp \divisor, #(1 << 8) movhs \divisor, \divisor, lsr #8 addhs \order, \order, #8 cmp \divisor, #(1 << 4) movhs \divisor, \divisor, lsr #4 addhs \order, \order, #4 cmp \divisor, #(1 << 2) addhi \order, \order, #3 addls \order, \order, \divisor, lsr #1 #endif .endm .macro ARM_MOD_BODY dividend, divisor, order, spare #if __LINUX_ARM_ARCH__ >= 5 clz \order, \divisor clz \spare, \dividend sub \order, \order, \spare mov \divisor, \divisor, lsl \order #else mov \order, #0 @ Unless the divisor is very big, shift it up in multiples of @ four bits, since this is the amount of unwinding in the main @ division loop. Continue shifting until the divisor is @ larger than the dividend. 1: cmp \divisor, #0x10000000 cmplo \divisor, \dividend movlo \divisor, \divisor, lsl #4 addlo \order, \order, #4 blo 1b @ For very big divisors, we must shift it a bit at a time, or @ we will be in danger of overflowing. 1: cmp \divisor, #0x80000000 cmplo \divisor, \dividend movlo \divisor, \divisor, lsl #1 addlo \order, \order, #1 blo 1b #endif @ Perform all needed substractions to keep only the reminder. @ Do comparisons in batch of 4 first. subs \order, \order, #3 @ yes, 3 is intended here blt 2f 1: cmp \dividend, \divisor subhs \dividend, \dividend, \divisor cmp \dividend, \divisor, lsr #1 subhs \dividend, \dividend, \divisor, lsr #1 cmp \dividend, \divisor, lsr #2 subhs \dividend, \dividend, \divisor, lsr #2 cmp \dividend, \divisor, lsr #3 subhs \dividend, \dividend, \divisor, lsr #3 cmp \dividend, #1 mov \divisor, \divisor, lsr #4 subges \order, \order, #4 bge 1b tst \order, #3 teqne \dividend, #0 beq 5f @ Either 1, 2 or 3 comparison/substractions are left. 
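@ \order is -1, -2 or -3 here: fall straight through for three
@ subtraction steps, enter at 3: for two, or at 4: for just one.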
2: cmn \order, #2 blt 4f beq 3f cmp \dividend, \divisor subhs \dividend, \dividend, \divisor mov \divisor, \divisor, lsr #1 3: cmp \dividend, \divisor subhs \dividend, \dividend, \divisor mov \divisor, \divisor, lsr #1 4: cmp \dividend, \divisor subhs \dividend, \dividend, \divisor 5: .endm ENTRY(__udivsi3) ENTRY(__aeabi_uidiv) UNWIND(.fnstart) subs r2, r1, #1 moveq pc, lr bcc Ldiv0 cmp r0, r1 bls 11f tst r1, r2 beq 12f ARM_DIV_BODY r0, r1, r2, r3 mov r0, r2 mov pc, lr 11: moveq r0, #1 movne r0, #0 mov pc, lr 12: ARM_DIV2_ORDER r1, r2 mov r0, r0, lsr r2 mov pc, lr UNWIND(.fnend) ENDPROC(__udivsi3) ENDPROC(__aeabi_uidiv) ENTRY(__umodsi3) UNWIND(.fnstart) subs r2, r1, #1 @ compare divisor with 1 bcc Ldiv0 cmpne r0, r1 @ compare dividend with divisor moveq r0, #0 tsthi r1, r2 @ see if divisor is power of 2 andeq r0, r0, r2 movls pc, lr ARM_MOD_BODY r0, r1, r2, r3 mov pc, lr UNWIND(.fnend) ENDPROC(__umodsi3) ENTRY(__divsi3) ENTRY(__aeabi_idiv) UNWIND(.fnstart) cmp r1, #0 eor ip, r0, r1 @ save the sign of the result. beq Ldiv0 rsbmi r1, r1, #0 @ loops below use unsigned. subs r2, r1, #1 @ division by 1 or -1 ? beq 10f movs r3, r0 rsbmi r3, r0, #0 @ positive dividend value cmp r3, r1 bls 11f tst r1, r2 @ divisor is power of 2 ? beq 12f ARM_DIV_BODY r3, r1, r0, r2 cmp ip, #0 rsbmi r0, r0, #0 mov pc, lr 10: teq ip, r0 @ same sign ? rsbmi r0, r0, #0 mov pc, lr 11: movlo r0, #0 moveq r0, ip, asr #31 orreq r0, r0, #1 mov pc, lr 12: ARM_DIV2_ORDER r1, r2 cmp ip, #0 mov r0, r3, lsr r2 rsbmi r0, r0, #0 mov pc, lr UNWIND(.fnend) ENDPROC(__divsi3) ENDPROC(__aeabi_idiv) ENTRY(__modsi3) UNWIND(.fnstart) cmp r1, #0 beq Ldiv0 rsbmi r1, r1, #0 @ loops below use unsigned. movs ip, r0 @ preserve sign of dividend rsbmi r0, r0, #0 @ if negative make positive subs r2, r1, #1 @ compare divisor with 1 cmpne r0, r1 @ compare dividend with divisor moveq r0, #0 tsthi r1, r2 @ see if divisor is power of 2 andeq r0, r0, r2 bls 10f ARM_MOD_BODY r0, r1, r2, r3 10: cmp ip, #0 rsbmi r0, r0, #0 mov pc, lr UNWIND(.fnend) ENDPROC(__modsi3) #ifdef CONFIG_AEABI ENTRY(__aeabi_uidivmod) UNWIND(.fnstart) UNWIND(.save {r0, r1, ip, lr} ) stmfd sp!, {r0, r1, ip, lr} bl __aeabi_uidiv ldmfd sp!, {r1, r2, ip, lr} mul r3, r0, r2 sub r1, r1, r3 mov pc, lr UNWIND(.fnend) ENDPROC(__aeabi_uidivmod) ENTRY(__aeabi_idivmod) UNWIND(.fnstart) UNWIND(.save {r0, r1, ip, lr} ) stmfd sp!, {r0, r1, ip, lr} bl __aeabi_idiv ldmfd sp!, {r1, r2, ip, lr} mul r3, r0, r2 sub r1, r1, r3 mov pc, lr UNWIND(.fnend) ENDPROC(__aeabi_idivmod) ENTRY(__aeabi_uldivmod) UNWIND(.fnstart) UNWIND(.save {lr} ) sub sp, sp, #8 stmfd sp!, {sp, lr} bl __qdivrem ldr lr, [sp, #4] add sp, sp, #8 ldmfd sp!, {r2, r3} mov pc, lr UNWIND(.fnend) ENDPROC(__aeabi_uldivmod) ENTRY(__aeabi_ldivmod) UNWIND(.fnstart) UNWIND(.save {lr} ) sub sp, sp, #16 stmfd sp!, {sp, lr} bl __ldivmod_helper ldr lr, [sp, #4] add sp, sp, #16 ldmfd sp!, {r2, r3} mov pc, lr UNWIND(.fnend) ENDPROC(__aeabi_ldivmod) #endif Ldiv0: UNWIND(.fnstart) UNWIND(.pad #4) UNWIND(.save {lr}) str lr, [sp, #-8]! bl __div0 mov r0, #0 @ About as wrong as it could be. ldr pc, [sp], #8 UNWIND(.fnend) ENDPROC(Ldiv0) xen-4.4.0/xen/arch/arm/arm32/lib/clearbit.S0000664000175000017500000000064012307313555016334 0ustar smbsmb/* * linux/arch/arm/lib/clearbit.S * * Copyright (C) 1995-1996 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ #include #include "assembler.h" #include "bitops.h" .text ENTRY(_clear_bit) bitop bic ENDPROC(_clear_bit) xen-4.4.0/xen/arch/arm/arm32/lib/bitops.h0000664000175000017500000000332612307313555016100 0ustar smbsmb#include #if __LINUX_ARM_ARCH__ >= 6 .macro bitop, instr ands ip, r1, #3 strneb r1, [ip] @ assert word-aligned mov r2, #1 and r3, r0, #31 @ Get bit offset mov r0, r0, lsr #5 add r1, r1, r0, lsl #2 @ Get word offset mov r3, r2, lsl r3 1: ldrex r2, [r1] \instr r2, r2, r3 strex r0, r2, [r1] cmp r0, #0 bne 1b bx lr .endm .macro testop, instr, store ands ip, r1, #3 strneb r1, [ip] @ assert word-aligned mov r2, #1 and r3, r0, #31 @ Get bit offset mov r0, r0, lsr #5 add r1, r1, r0, lsl #2 @ Get word offset mov r3, r2, lsl r3 @ create mask smp_dmb 1: ldrex r2, [r1] ands r0, r2, r3 @ save old value of bit \instr r2, r2, r3 @ toggle bit strex ip, r2, [r1] cmp ip, #0 bne 1b smp_dmb cmp r0, #0 movne r0, #1 2: bx lr .endm #else .macro bitop, name, instr ENTRY( \name ) UNWIND( .fnstart ) ands ip, r1, #3 strneb r1, [ip] @ assert word-aligned and r2, r0, #31 mov r0, r0, lsr #5 mov r3, #1 mov r3, r3, lsl r2 save_and_disable_irqs ip ldr r2, [r1, r0, lsl #2] \instr r2, r2, r3 str r2, [r1, r0, lsl #2] restore_irqs ip mov pc, lr UNWIND( .fnend ) ENDPROC(\name ) .endm /** * testop - implement a test_and_xxx_bit operation. * @instr: operational instruction * @store: store instruction * * Note: we can trivially conditionalise the store instruction * to avoid dirtying the data cache. */ .macro testop, name, instr, store ENTRY( \name ) UNWIND( .fnstart ) ands ip, r1, #3 strneb r1, [ip] @ assert word-aligned and r3, r0, #31 mov r0, r0, lsr #5 save_and_disable_irqs ip ldr r2, [r1, r0, lsl #2]! mov r0, #1 tst r2, r0, lsl r3 \instr r2, r2, r0, lsl r3 \store r2, [r1] moveq r0, #0 restore_irqs ip mov pc, lr UNWIND( .fnend ) ENDPROC(\name ) .endm #endif xen-4.4.0/xen/arch/arm/arm32/lib/memzero.S0000664000175000017500000000542412307313555016232 0ustar smbsmb/* * linux/arch/arm/lib/memzero.S * * Copyright (C) 1995-2000 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include #include "assembler.h" .text .align 5 .word 0 /* * Align the pointer in r0. r3 contains the number of bytes that we are * mis-aligned by, and r1 is the number of bytes. If r1 < 4, then we * don't bother; we use byte stores instead. */ 1: subs r1, r1, #4 @ 1 do we have enough blt 5f @ 1 bytes to align with? cmp r3, #2 @ 1 strltb r2, [r0], #1 @ 1 strleb r2, [r0], #1 @ 1 strb r2, [r0], #1 @ 1 add r1, r1, r3 @ 1 (r1 = r1 - (4 - r3)) /* * The pointer is now aligned and the length is adjusted. Try doing the * memzero again. */ ENTRY(__memzero) mov r2, #0 @ 1 ands r3, r0, #3 @ 1 unaligned? bne 1b @ 1 /* * r3 = 0, and we know that the pointer in r0 is aligned to a word boundary. */ cmp r1, #16 @ 1 we can skip this chunk if we blt 4f @ 1 have < 16 bytes #if ! CALGN(1)+0 /* * We need an extra register for this loop - save the return address and * use the LR */ str lr, [sp, #-4]! 
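@ The loop below clears 64 bytes per iteration: r2, r3, ip and lr all
@ hold zero and each stmgeia stores the four of them (16 bytes) at once.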
@ 1 mov ip, r2 @ 1 mov lr, r2 @ 1 3: subs r1, r1, #64 @ 1 write 32 bytes out per loop stmgeia r0!, {r2, r3, ip, lr} @ 4 stmgeia r0!, {r2, r3, ip, lr} @ 4 stmgeia r0!, {r2, r3, ip, lr} @ 4 stmgeia r0!, {r2, r3, ip, lr} @ 4 bgt 3b @ 1 ldmeqfd sp!, {pc} @ 1/2 quick exit /* * No need to correct the count; we're only testing bits from now on */ tst r1, #32 @ 1 stmneia r0!, {r2, r3, ip, lr} @ 4 stmneia r0!, {r2, r3, ip, lr} @ 4 tst r1, #16 @ 1 16 bytes or more? stmneia r0!, {r2, r3, ip, lr} @ 4 ldr lr, [sp], #4 @ 1 #else /* * This version aligns the destination pointer in order to write * whole cache lines at once. */ stmfd sp!, {r4-r7, lr} mov r4, r2 mov r5, r2 mov r6, r2 mov r7, r2 mov ip, r2 mov lr, r2 cmp r1, #96 andgts ip, r0, #31 ble 3f rsb ip, ip, #32 sub r1, r1, ip movs ip, ip, lsl #(32 - 4) stmcsia r0!, {r4, r5, r6, r7} stmmiia r0!, {r4, r5} movs ip, ip, lsl #2 strcs r2, [r0], #4 3: subs r1, r1, #64 stmgeia r0!, {r2-r7, ip, lr} stmgeia r0!, {r2-r7, ip, lr} bgt 3b ldmeqfd sp!, {r4-r7, pc} tst r1, #32 stmneia r0!, {r2-r7, ip, lr} tst r1, #16 stmneia r0!, {r4-r7} ldmfd sp!, {r4-r7, lr} #endif 4: tst r1, #8 @ 1 8 bytes or more? stmneia r0!, {r2, r3} @ 2 tst r1, #4 @ 1 4 bytes or more? strne r2, [r0], #4 @ 1 /* * When we get here, we've got less than 4 bytes to zero. We * may have an unaligned pointer as well. */ 5: tst r1, #2 @ 1 2 bytes or more? strneb r2, [r0], #1 @ 1 strneb r2, [r0], #1 @ 1 tst r1, #1 @ 1 a byte left over strneb r2, [r0], #1 @ 1 mov pc, lr @ 1 ENDPROC(__memzero) xen-4.4.0/xen/arch/arm/arm32/lib/testchangebit.S0000664000175000017500000000067612307313555017404 0ustar smbsmb/* * linux/arch/arm/lib/testchangebit.S * * Copyright (C) 1995-1996 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include #include "assembler.h" #include "bitops.h" .text ENTRY(_test_and_change_bit) testop eor, str ENDPROC(_test_and_change_bit) xen-4.4.0/xen/arch/arm/arm32/lib/setbit.S0000664000175000017500000000061212307313555016040 0ustar smbsmb/* * linux/arch/arm/lib/setbit.S * * Copyright (C) 1995-1996 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include #include "assembler.h" #include "bitops.h" .text ENTRY(_set_bit) bitop orr ENDPROC(_set_bit) xen-4.4.0/xen/arch/arm/arm32/lib/changebit.S0000664000175000017500000000064212307313555016475 0ustar smbsmb/* * linux/arch/arm/lib/changebit.S * * Copyright (C) 1995-1996 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include #include "assembler.h" #include "bitops.h" .text ENTRY(_change_bit) bitop eor ENDPROC(_change_bit) xen-4.4.0/xen/arch/arm/arm32/lib/memcpy.S0000664000175000017500000000247212307313555016046 0ustar smbsmb/* * linux/arch/arm/lib/memcpy.S * * Author: Nicolas Pitre * Created: Sep 28, 2005 * Copyright: MontaVista Software, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ #include #include "assembler.h" #define LDR1W_SHIFT 0 #define STR1W_SHIFT 0 .macro ldr1w ptr reg abort W(ldr) \reg, [\ptr], #4 .endm .macro ldr4w ptr reg1 reg2 reg3 reg4 abort ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4} .endm .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} .endm .macro ldr1b ptr reg cond=al abort ldr\cond\()b \reg, [\ptr], #1 .endm .macro str1w ptr reg abort W(str) \reg, [\ptr], #4 .endm .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8} .endm .macro str1b ptr reg cond=al abort str\cond\()b \reg, [\ptr], #1 .endm .macro enter reg1 reg2 stmdb sp!, {r0, \reg1, \reg2} .endm .macro exit reg1 reg2 ldmfd sp!, {r0, \reg1, \reg2} .endm .text /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ ENTRY(memcpy) #include "copy_template.S" ENDPROC(memcpy) xen-4.4.0/xen/arch/arm/arm32/lib/memset.S0000664000175000017500000000464312307313555016050 0ustar smbsmb/* * linux/arch/arm/lib/memset.S * * Copyright (C) 1995-2000 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * ASM optimised string functions */ #include #include "assembler.h" .text .align 5 .word 0 1: subs r2, r2, #4 @ 1 do we have enough blt 5f @ 1 bytes to align with? cmp r3, #2 @ 1 strltb r1, [r0], #1 @ 1 strleb r1, [r0], #1 @ 1 strb r1, [r0], #1 @ 1 add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) /* * The pointer is now aligned and the length is adjusted. Try doing the * memset again. */ ENTRY(memset) ands r3, r0, #3 @ 1 unaligned? bne 1b @ 1 /* * we know that the pointer in r0 is aligned to a word boundary. */ orr r1, r1, r1, lsl #8 orr r1, r1, r1, lsl #16 mov r3, r1 cmp r2, #16 blt 4f #if ! CALGN(1)+0 /* * We need an extra register for this loop - save the return address and * use the LR */ str lr, [sp, #-4]! mov ip, r1 mov lr, r1 2: subs r2, r2, #64 stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. stmgeia r0!, {r1, r3, ip, lr} stmgeia r0!, {r1, r3, ip, lr} stmgeia r0!, {r1, r3, ip, lr} bgt 2b ldmeqfd sp!, {pc} @ Now <64 bytes to go. /* * No need to correct the count; we're only testing bits from now on */ tst r2, #32 stmneia r0!, {r1, r3, ip, lr} stmneia r0!, {r1, r3, ip, lr} tst r2, #16 stmneia r0!, {r1, r3, ip, lr} ldr lr, [sp], #4 #else /* * This version aligns the destination pointer in order to write * whole cache lines at once. */ stmfd sp!, {r4-r7, lr} mov r4, r1 mov r5, r1 mov r6, r1 mov r7, r1 mov ip, r1 mov lr, r1 cmp r2, #96 tstgt r0, #31 ble 3f and ip, r0, #31 rsb ip, ip, #32 sub r2, r2, ip movs ip, ip, lsl #(32 - 4) stmcsia r0!, {r4, r5, r6, r7} stmmiia r0!, {r4, r5} tst ip, #(1 << 30) mov ip, r1 strne r1, [r0], #4 3: subs r2, r2, #64 stmgeia r0!, {r1, r3-r7, ip, lr} stmgeia r0!, {r1, r3-r7, ip, lr} bgt 3b ldmeqfd sp!, {r4-r7, pc} tst r2, #32 stmneia r0!, {r1, r3-r7, ip, lr} tst r2, #16 stmneia r0!, {r4-r7} ldmfd sp!, {r4-r7, lr} #endif 4: tst r2, #8 stmneia r0!, {r1, r3} tst r2, #4 strne r1, [r0], #4 /* * When we get here, we've got less than 4 bytes to zero. We * may have an unaligned pointer as well. 
*/ 5: tst r2, #2 strneb r1, [r0], #1 strneb r1, [r0], #1 tst r2, #1 strneb r1, [r0], #1 mov pc, lr ENDPROC(memset) xen-4.4.0/xen/arch/arm/arm32/lib/testclearbit.S0000664000175000017500000000067712307313555017246 0ustar smbsmb/* * linux/arch/arm/lib/testclearbit.S * * Copyright (C) 1995-1996 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include #include "assembler.h" #include "bitops.h" .text ENTRY(_test_and_clear_bit) testop bicne, strne ENDPROC(_test_and_clear_bit) xen-4.4.0/xen/arch/arm/arm32/lib/assembler.h0000664000175000017500000001552612307313555016562 0ustar smbsmb/* From Linux arch/arm/include/asm/assembler.h */ /* * arch/arm/include/asm/assembler.h * * Copyright (C) 1996-2000 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This file contains arm architecture specific defines * for the different processors. * * Do not include any C declarations in this file - it is included by * assembler source. */ #ifndef __ASM_ASSEMBLER_H__ #define __ASM_ASSEMBLER_H__ #ifndef __ASSEMBLY__ #error "Only include this from assembly code" #endif // No Thumb, hence: #define W(instr) instr #define ARM(instr...) instr #define THUMB(instr...) #ifdef CONFIG_ARM_UNWIND #define UNWIND(code...) code #else #define UNWIND(code...) #endif /* * Endian independent macros for shifting bytes within registers. */ #ifndef __ARMEB__ #define pull lsr #define push lsl #define get_byte_0 lsl #0 #define get_byte_1 lsr #8 #define get_byte_2 lsr #16 #define get_byte_3 lsr #24 #define put_byte_0 lsl #0 #define put_byte_1 lsl #8 #define put_byte_2 lsl #16 #define put_byte_3 lsl #24 #else #define pull lsl #define push lsr #define get_byte_0 lsr #24 #define get_byte_1 lsr #16 #define get_byte_2 lsr #8 #define get_byte_3 lsl #0 #define put_byte_0 lsl #24 #define put_byte_1 lsl #16 #define put_byte_2 lsl #8 #define put_byte_3 lsl #0 #endif /* * Data preload for architectures that support it */ #if __LINUX_ARM_ARCH__ >= 5 #define PLD(code...) code #else #define PLD(code...) #endif /* * This can be used to enable code to cacheline align the destination * pointer when bulk writing to memory. Experiments on StrongARM and * XScale didn't show this a worthwhile thing to do when the cache is not * set to write-allocate (this would need further testing on XScale when WA * is used). * * On Feroceon there is much to gain however, regardless of cache mode. */ #ifdef CONFIG_CPU_FEROCEON #define CALGN(code...) code #else #define CALGN(code...) 
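/*
 * For example, CALGN( ands ip, r0, #31 ) in copy_template.S assembles
 * to the real instruction only when CONFIG_CPU_FEROCEON is defined;
 * on all other CPUs the line vanishes from the output.
 */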
#endif /* * Enable and disable interrupts */ #if __LINUX_ARM_ARCH__ >= 6 .macro disable_irq_notrace cpsid i .endm .macro enable_irq_notrace cpsie i .endm #else .macro disable_irq_notrace msr cpsr_c, #PSR_I_BIT | SVC_MODE .endm .macro enable_irq_notrace msr cpsr_c, #SVC_MODE .endm #endif .macro asm_trace_hardirqs_off #if defined(CONFIG_TRACE_IRQFLAGS) stmdb sp!, {r0-r3, ip, lr} bl trace_hardirqs_off ldmia sp!, {r0-r3, ip, lr} #endif .endm .macro asm_trace_hardirqs_on_cond, cond #if defined(CONFIG_TRACE_IRQFLAGS) /* * actually the registers should be pushed and pop'd conditionally, but * after bl the flags are certainly clobbered */ stmdb sp!, {r0-r3, ip, lr} bl\cond trace_hardirqs_on ldmia sp!, {r0-r3, ip, lr} #endif .endm .macro asm_trace_hardirqs_on asm_trace_hardirqs_on_cond al .endm .macro disable_irq disable_irq_notrace asm_trace_hardirqs_off .endm .macro enable_irq asm_trace_hardirqs_on enable_irq_notrace .endm /* * Save the current IRQ state and disable IRQs. Note that this macro * assumes FIQs are enabled, and that the processor is in SVC mode. */ .macro save_and_disable_irqs, oldcpsr mrs \oldcpsr, cpsr disable_irq .endm /* * Restore interrupt state previously stored in a register. We don't * guarantee that this will preserve the flags. */ .macro restore_irqs_notrace, oldcpsr msr cpsr_c, \oldcpsr .endm .macro restore_irqs, oldcpsr tst \oldcpsr, #PSR_I_BIT asm_trace_hardirqs_on_cond eq restore_irqs_notrace \oldcpsr .endm #define USER(x...) \ 9999: x; \ .pushsection __ex_table,"a"; \ .align 3; \ .long 9999b,9001f; \ .popsection #ifdef CONFIG_SMP #define ALT_SMP(instr...) \ 9998: instr /* * Note: if you get assembler errors from ALT_UP() when building with * CONFIG_THUMB2_KERNEL, you almost certainly need to use * ALT_SMP( W(instr) ... ) */ #define ALT_UP(instr...) \ .pushsection ".alt.smp.init", "a" ;\ .long 9998b ;\ 9997: instr ;\ .if . - 9997b != 4 ;\ .error "ALT_UP() content must assemble to exactly 4 bytes";\ .endif ;\ .popsection #define ALT_UP_B(label) \ .equ up_b_offset, label - 9998b ;\ .pushsection ".alt.smp.init", "a" ;\ .long 9998b ;\ W(b) . + up_b_offset ;\ .popsection #else #define ALT_SMP(instr...) #define ALT_UP(instr...) 
instr #define ALT_UP_B(label) b label #endif /* * Instruction barrier */ .macro instr_sync #if __LINUX_ARM_ARCH__ >= 7 isb #elif __LINUX_ARM_ARCH__ == 6 mcr p15, 0, r0, c7, c5, 4 #endif .endm /* * SMP data memory barrier */ .macro smp_dmb mode #ifdef CONFIG_SMP #if __LINUX_ARM_ARCH__ >= 7 .ifeqs "\mode","arm" ALT_SMP(dmb) .else ALT_SMP(W(dmb)) .endif #elif __LINUX_ARM_ARCH__ == 6 ALT_SMP(mcr p15, 0, r0, c7, c10, 5) @ dmb #else #error Incompatible SMP platform #endif .ifeqs "\mode","arm" ALT_UP(nop) .else ALT_UP(W(nop)) .endif #endif .endm #ifdef CONFIG_THUMB2_KERNEL .macro setmode, mode, reg mov \reg, #\mode msr cpsr_c, \reg .endm #else .macro setmode, mode, reg msr cpsr_c, #\mode .endm #endif /* * STRT/LDRT access macros with ARM and Thumb-2 variants */ #ifdef CONFIG_THUMB2_KERNEL .macro usraccoff, instr, reg, ptr, inc, off, cond, abort, t=T() 9999: .if \inc == 1 \instr\cond\()b\()\t\().w \reg, [\ptr, #\off] .elseif \inc == 4 \instr\cond\()\t\().w \reg, [\ptr, #\off] .else .error "Unsupported inc macro argument" .endif .pushsection __ex_table,"a" .align 3 .long 9999b, \abort .popsection .endm .macro usracc, instr, reg, ptr, inc, cond, rept, abort @ explicit IT instruction needed because of the label @ introduced by the USER macro .ifnc \cond,al .if \rept == 1 itt \cond .elseif \rept == 2 ittt \cond .else .error "Unsupported rept macro argument" .endif .endif @ Slightly optimised to avoid incrementing the pointer twice usraccoff \instr, \reg, \ptr, \inc, 0, \cond, \abort .if \rept == 2 usraccoff \instr, \reg, \ptr, \inc, \inc, \cond, \abort .endif add\cond \ptr, #\rept * \inc .endm #else /* !CONFIG_THUMB2_KERNEL */ .macro usracc, instr, reg, ptr, inc, cond, rept, abort, t=T() .rept \rept 9999: .if \inc == 1 \instr\cond\()b\()\t \reg, [\ptr], #\inc .elseif \inc == 4 \instr\cond\()\t \reg, [\ptr], #\inc .else .error "Unsupported inc macro argument" .endif .pushsection __ex_table,"a" .align 3 .long 9999b, \abort .popsection .endr .endm #endif /* CONFIG_THUMB2_KERNEL */ .macro strusr, reg, ptr, inc, cond=al, rept=1, abort=9001f usracc str, \reg, \ptr, \inc, \cond, \rept, \abort .endm .macro ldrusr, reg, ptr, inc, cond=al, rept=1, abort=9001f usracc ldr, \reg, \ptr, \inc, \cond, \rept, \abort .endm /* Utility macro for declaring string literals */ .macro string name:req, string .type \name , #object \name: .asciz "\string" .size \name , . - \name .endm #endif /* __ASM_ASSEMBLER_H__ */ xen-4.4.0/xen/arch/arm/arm32/entry.S0000664000175000017500000001353212307313555015146 0ustar smbsmb#include #include #include #include #define SAVE_ONE_BANKED(reg) mrs r11, reg; str r11, [sp, #UREGS_##reg] #define RESTORE_ONE_BANKED(reg) ldr r11, [sp, #UREGS_##reg]; msr reg, r11 #define SAVE_BANKED(mode) \ SAVE_ONE_BANKED(SP_##mode) ; SAVE_ONE_BANKED(LR_##mode) ; SAVE_ONE_BANKED(SPSR_##mode) #define RESTORE_BANKED(mode) \ RESTORE_ONE_BANKED(SP_##mode) ; RESTORE_ONE_BANKED(LR_##mode) ; RESTORE_ONE_BANKED(SPSR_##mode) #define SAVE_ALL \ sub sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ \ push {r0-r12}; /* Save R0-R12 */ \ \ mrs r11, ELR_hyp; /* ELR_hyp is return address. */\ str r11, [sp, #UREGS_pc]; \ \ str lr, [sp, #UREGS_lr]; \ \ add r11, sp, #UREGS_kernel_sizeof+4; \ str r11, [sp, #UREGS_sp]; \ \ mrs r11, SPSR_hyp; \ str r11, [sp, #UREGS_cpsr]; \ and r11, #PSR_MODE_MASK; \ cmp r11, #PSR_MODE_HYP; \ blne save_guest_regs save_guest_regs: ldr r11, =0xffffffff /* Clobber SP which is only valid for hypervisor frames. 
*/ str r11, [sp, #UREGS_sp] SAVE_ONE_BANKED(SP_usr) /* LR_usr is the same physical register as lr and is saved in SAVE_ALL */ SAVE_BANKED(svc) SAVE_BANKED(abt) SAVE_BANKED(und) SAVE_BANKED(irq) SAVE_BANKED(fiq) SAVE_ONE_BANKED(R8_fiq); SAVE_ONE_BANKED(R9_fiq); SAVE_ONE_BANKED(R10_fiq) SAVE_ONE_BANKED(R11_fiq); SAVE_ONE_BANKED(R12_fiq); mov pc, lr #define DEFINE_TRAP_ENTRY(trap) \ ALIGN; \ trap_##trap: \ SAVE_ALL; \ cpsie i; /* local_irq_enable */ \ adr lr, return_from_trap; \ mov r0, sp; \ mov r11, sp; \ bic sp, #7; /* Align the stack pointer (noop on guest trap) */ \ b do_trap_##trap #define DEFINE_TRAP_ENTRY_NOIRQ(trap) \ ALIGN; \ trap_##trap: \ SAVE_ALL; \ adr lr, return_from_trap; \ mov r0, sp; \ mov r11, sp; \ bic sp, #7; /* Align the stack pointer (noop on guest trap) */ \ b do_trap_##trap .align 5 GLOBAL(hyp_traps_vector) .word 0 /* 0x00 - Reset */ b trap_undefined_instruction /* 0x04 - Undefined Instruction */ b trap_supervisor_call /* 0x08 - Supervisor Call */ b trap_prefetch_abort /* 0x0c - Prefetch Abort */ b trap_data_abort /* 0x10 - Data Abort */ b trap_hypervisor /* 0x14 - Hypervisor */ b trap_irq /* 0x18 - IRQ */ b trap_fiq /* 0x1c - FIQ */ DEFINE_TRAP_ENTRY(undefined_instruction) DEFINE_TRAP_ENTRY(supervisor_call) DEFINE_TRAP_ENTRY(prefetch_abort) DEFINE_TRAP_ENTRY(data_abort) DEFINE_TRAP_ENTRY(hypervisor) DEFINE_TRAP_ENTRY_NOIRQ(irq) DEFINE_TRAP_ENTRY_NOIRQ(fiq) return_from_trap: mov sp, r11 ENTRY(return_to_new_vcpu32) ldr r11, [sp, #UREGS_cpsr] and r11, #PSR_MODE_MASK cmp r11, #PSR_MODE_HYP beq return_to_hypervisor /* Fall thru */ return_to_guest: mov r11, sp bic sp, #7 /* Align the stack pointer */ bl leave_hypervisor_tail /* Disables interrupts on return */ mov sp, r11 RESTORE_ONE_BANKED(SP_usr) /* LR_usr is the same physical register as lr and is restored below */ RESTORE_BANKED(svc) RESTORE_BANKED(abt) RESTORE_BANKED(und) RESTORE_BANKED(irq) RESTORE_BANKED(fiq) RESTORE_ONE_BANKED(R8_fiq); RESTORE_ONE_BANKED(R9_fiq); RESTORE_ONE_BANKED(R10_fiq) RESTORE_ONE_BANKED(R11_fiq); RESTORE_ONE_BANKED(R12_fiq); /* Fall thru */ return_to_hypervisor: cpsid i ldr lr, [sp, #UREGS_lr] ldr r11, [sp, #UREGS_pc] msr ELR_hyp, r11 ldr r11, [sp, #UREGS_cpsr] msr SPSR_hyp, r11 pop {r0-r12} add sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ clrex eret /* * struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next) * * r0 - prev * r1 - next * * Returns prev in r0 */ ENTRY(__context_switch) add ip, r0, #VCPU_arch_saved_context stmia ip!, {r4 - sl, fp, sp, lr} /* Save register state */ add r4, r1, #VCPU_arch_saved_context ldmia r4, {r4 - sl, fp, sp, pc} /* Load registers and return */ /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/vfp.c0000664000175000017500000000525012307313555014616 0ustar smbsmb#include #include #include #include void vfp_save_state(struct vcpu *v) { v->arch.vfp.fpexc = READ_CP32(FPEXC); WRITE_CP32(v->arch.vfp.fpexc | FPEXC_EN, FPEXC); v->arch.vfp.fpscr = READ_CP32(FPSCR); if ( v->arch.vfp.fpexc & FPEXC_EX ) /* Check for sub-architecture */ { v->arch.vfp.fpinst = READ_CP32(FPINST); if ( v->arch.vfp.fpexc & FPEXC_FP2V ) v->arch.vfp.fpinst2 = READ_CP32(FPINST2); /* Disable FPEXC_EX */ WRITE_CP32((v->arch.vfp.fpexc | FPEXC_EN) & ~FPEXC_EX, FPEXC); } /* Save {d0-d15} */ asm volatile("stc p11, cr0, [%1], #32*4" : "=Q" (*v->arch.vfp.fpregs1) : "r" (v->arch.vfp.fpregs1)); /* 32 x 64 bits registers? 
*/ if ( (READ_CP32(MVFR0) & MVFR0_A_SIMD_MASK) == 2 ) { /* Save {d16-d31} */ asm volatile("stcl p11, cr0, [%1], #32*4" : "=Q" (*v->arch.vfp.fpregs2) : "r" (v->arch.vfp.fpregs2)); } WRITE_CP32(v->arch.vfp.fpexc & ~(FPEXC_EN), FPEXC); } void vfp_restore_state(struct vcpu *v) { //uint64_t test[16]; WRITE_CP32(READ_CP32(FPEXC) | FPEXC_EN, FPEXC); /* Restore {d0-d15} */ asm volatile("ldc p11, cr0, [%1], #32*4" : : "Q" (*v->arch.vfp.fpregs1), "r" (v->arch.vfp.fpregs1)); /* 32 x 64 bits registers? */ if ( (READ_CP32(MVFR0) & MVFR0_A_SIMD_MASK) == 2 ) /* 32 x 64 bits registers */ /* Restore {d16-d31} */ asm volatile("ldcl p11, cr0, [%1], #32*4" : : "Q" (*v->arch.vfp.fpregs2), "r" (v->arch.vfp.fpregs2)); if ( v->arch.vfp.fpexc & FPEXC_EX ) { WRITE_CP32(v->arch.vfp.fpinst, FPINST); if ( v->arch.vfp.fpexc & FPEXC_FP2V ) WRITE_CP32(v->arch.vfp.fpinst2, FPINST2); } WRITE_CP32(v->arch.vfp.fpscr, FPSCR); WRITE_CP32(v->arch.vfp.fpexc, FPEXC); } static __init int vfp_init(void) { unsigned int vfpsid; unsigned int vfparch; vfpsid = READ_CP32(FPSID); printk("VFP implementer 0x%02x architecture %d part 0x%02x variant 0x%x " "rev 0x%x\n", (vfpsid & FPSID_IMPLEMENTER_MASK) >> FPSID_IMPLEMENTER_BIT, (vfpsid & FPSID_ARCH_MASK) >> FPSID_ARCH_BIT, (vfpsid & FPSID_PART_MASK) >> FPSID_PART_BIT, (vfpsid & FPSID_VARIANT_MASK) >> FPSID_VARIANT_BIT, (vfpsid & FPSID_REV_MASK) >> FPSID_REV_BIT); vfparch = (vfpsid & FPSID_ARCH_MASK) >> FPSID_ARCH_BIT; if ( vfparch < 2 ) panic("Xen only support VFP 3"); return 0; } presmp_initcall(vfp_init); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/debug-pl011.inc0000664000175000017500000000402212307313555016267 0ustar smbsmb/* * xen/arch/arm/arm32/debug-pl011.inc * * PL011 specific debug code * * Copyright (c) 2013 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #include /* PL011 UART initialization * rb: register which contains the UART base address * rc: scratch register 1 * rd: scratch register 2 (unused here) */ .macro early_uart_init rb, rc, rd mov \rc, #(7372800 / EARLY_PRINTK_BAUD % 16) str \rc, [\rb, #FBRD] /* -> UARTFBRD (Baud divisor fraction) */ mov \rc, #(7372800 / EARLY_PRINTK_BAUD / 16) str \rc, [\rb, #IBRD] /* -> UARTIBRD (Baud divisor integer) */ mov \rc, #0x60 /* 8n1 */ str \rc, [\rb, #LCR_H] /* -> UARTLCR_H (Line control) */ ldr \rc, =(RXE | TXE | UARTEN) /* RXE | TXE | UARTEN */ str \rc, [\rb, #CR] /* -> UARTCR (Control Register) */ .endm /* PL011 UART wait UART to be ready to transmit * rb: register which contains the UART base address * rc: scratch register */ .macro early_uart_ready rb, rc 1: ldr \rc, [\rb, #FR] /* <- UARTFR (Flag register) */ tst \rc, #BUSY /* Check BUSY bit */ bne 1b /* Wait for the UART to be ready */ .endm /* PL011 UART transmit character * rb: register which contains the UART base address * rt: register which contains the character to transmit */ .macro early_uart_transmit rb, rt str \rt, [\rb, #DR] /* -> UARTDR (Data Register) */ .endm /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/head.S0000664000175000017500000004502212307313555014705 0ustar smbsmb/* * xen/arch/arm/head.S * * Start-of-day code for an ARMv7-A with virt extensions. * * Tim Deegan * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #define ZIMAGE_MAGIC_NUMBER 0x016f2818 #define PT_PT 0xe7f /* nG=1 AF=1 SH=10 AP=01 NS=1 ATTR=111 T=1 P=1 */ #define PT_MEM 0xe7d /* nG=1 AF=1 SH=10 AP=01 NS=1 ATTR=111 T=0 P=1 */ #define PT_DEV 0xe71 /* nG=1 AF=1 SH=10 AP=01 NS=1 ATTR=100 T=0 P=1 */ #define PT_DEV_L3 0xe73 /* nG=1 AF=1 SH=10 AP=01 NS=1 ATTR=100 T=1 P=1 */ #define PT_UPPER(x) (PT_##x & 0xf00) #define PT_LOWER(x) (PT_##x & 0x0ff) #if (defined (EARLY_PRINTK)) && (defined (EARLY_PRINTK_INC)) #include EARLY_PRINTK_INC #endif /* * Common register usage in this file: * r0 - * r1 - * r2 - * r3 - * r4 - * r5 - * r6 - * r7 - CPUID * r8 - DTB address (boot CPU only) * r9 - paddr(start) * r10 - phys offset * r11 - UART address * r12 - is_secondary_cpu * r13 - SP * r14 - LR * r15 - PC */ /* Macro to print a string to the UART, if there is one. * Clobbers r0-r3. */ #ifdef EARLY_PRINTK #define PRINT(_s) \ adr r0, 98f ; \ bl puts ; \ b 99f ; \ 98: .asciz _s ; \ .align 2 ; \ 99: #else /* EARLY_PRINTK */ #define PRINT(s) #endif /* !EARLY_PRINTK */ .arm /* This must be the very first address in the loaded image. * It should be linked at XEN_VIRT_START, and loaded at any * 2MB-aligned address. All of text+data+bss must fit in 2MB, * or the initial pagetable code below will need adjustment. 
*/ .global start start: /* zImage magic header, see: * http://www.simtec.co.uk/products/SWLINUX/files/booting_article.html#d0e309 */ .rept 8 mov r0, r0 .endr b past_zImage .word ZIMAGE_MAGIC_NUMBER /* Magic numbers to help the loader */ .word 0x00000000 /* absolute load/run zImage address or * 0 for PiC */ .word (_end - start) /* zImage end address */ past_zImage: cpsid aif /* Disable all interrupts */ /* Save the bootloader arguments in less-clobberable registers */ mov r8, r2 /* r8 := DTB base address */ /* Find out where we are */ ldr r0, =start adr r9, start /* r9 := paddr (start) */ sub r10, r9, r0 /* r10 := phys-offset */ /* Using the DTB in the .dtb section? */ #ifdef CONFIG_DTB_FILE ldr r8, =_sdtb add r8, r10 /* r8 := paddr(DTB) */ #endif mov r12, #0 /* r12 := is_secondary_cpu */ b common_start GLOBAL(init_secondary) cpsid aif /* Disable all interrupts */ /* Find out where we are */ ldr r0, =start adr r9, start /* r9 := paddr (start) */ sub r10, r9, r0 /* r10 := phys-offset */ mov r12, #1 /* r12 := is_secondary_cpu */ common_start: mov r7, #0 /* r7 := CPU ID. Initialy zero until we * find that multiprocessor extensions are * present and the system is SMP */ mrc CP32(r1, MPIDR) tst r1, #MPIDR_SMP /* Multiprocessor extension supported? */ beq 1f tst r1, #MPIDR_UP /* Uniprocessor system? */ bne 1f bic r7, r1, #(~MPIDR_HWID_MASK) /* Mask out flags to get CPU ID */ 1: /* Non-boot CPUs wait here until __cpu_up is ready for them */ teq r12, #0 beq 1f ldr r0, =smp_up_cpu add r0, r0, r10 /* Apply physical offset */ dsb 2: ldr r1, [r0] cmp r1, r7 beq 1f wfe b 2b 1: #ifdef EARLY_PRINTK ldr r11, =EARLY_UART_BASE_ADDRESS /* r11 := UART base address */ teq r12, #0 /* Boot CPU sets up the UART too */ bleq init_uart PRINT("- CPU ") mov r0, r7 bl putn PRINT(" booting -\r\n") #endif /* Check that this CPU has Hyp mode */ mrc CP32(r0, ID_PFR1) and r0, r0, #0xf000 /* Bits 12-15 define virt extensions */ teq r0, #0x1000 /* Must == 0x1 or may be incompatible */ beq 1f PRINT("- CPU doesn't support the virtualization extensions -\r\n") b fail 1: /* Check that we're already in Hyp mode */ mrs r0, cpsr and r0, r0, #0x1f /* Mode is in the low 5 bits of CPSR */ teq r0, #0x1a /* Hyp Mode? */ beq hyp /* OK, we're boned. 
*/ PRINT("- Xen must be entered in NS Hyp mode -\r\n" \ "- Please update the bootloader -\r\n") b fail hyp: PRINT("- Xen starting in Hyp mode -\r\n") /* Zero BSS On the boot CPU to avoid nasty surprises */ teq r12, #0 bne skip_bss PRINT("- Zero BSS -\r\n") ldr r0, =__bss_start /* Load start & end of bss */ ldr r1, =__bss_end add r0, r0, r10 /* Apply physical offset */ add r1, r1, r10 mov r2, #0 1: str r2, [r0], #4 cmp r0, r1 blo 1b skip_bss: PRINT("- Setting up control registers -\r\n") /* Get processor specific proc info into r1 */ mrc CP32(r0, MIDR) /* r0 := our cpu id */ ldr r1, = __proc_info_start add r1, r1, r10 /* r1 := paddr of table (start) */ ldr r2, = __proc_info_end add r2, r2, r10 /* r2 := paddr of table (end) */ 1: ldr r3, [r1, #PROCINFO_cpu_mask] and r4, r0, r3 /* r4 := our cpu id with mask */ ldr r3, [r1, #PROCINFO_cpu_val] /* r3 := cpu val in current proc info */ teq r4, r3 beq 2f /* Match => exit, or try next proc info */ add r1, r1, #PROCINFO_sizeof cmp r1, r2 blo 1b mov r4, r0 PRINT("- Missing processor info: ") mov r0, r4 bl putn PRINT(" -\r\n") b fail 2: /* Jump to cpu_init */ ldr r1, [r1, #PROCINFO_cpu_init] /* r1 := vaddr(init func) */ adr lr, cpu_init_done /* Save return address */ add pc, r1, r10 /* Call paddr(init func) */ cpu_init_done: /* Set up memory attribute type tables */ ldr r0, =MAIR0VAL ldr r1, =MAIR1VAL mcr CP32(r0, MAIR0) mcr CP32(r1, MAIR1) mcr CP32(r0, HMAIR0) mcr CP32(r1, HMAIR1) /* Set up the HTCR: * PT walks use Outer-Shareable accesses, * PT walks are write-back, write-allocate in both cache levels, * Full 32-bit address space goes through this table. */ ldr r0, =0x80002500 mcr CP32(r0, HTCR) /* Set up the HSCTLR: * Exceptions in LE ARM, * Low-latency IRQs disabled, * Write-implies-XN disabled (for now), * D-cache disabled (for now), * I-cache enabled, * Alignment checking enabled, * MMU translation disabled (for now). */ ldr r0, =(HSCTLR_BASE|SCTLR_A) mcr CP32(r0, HSCTLR) /* Rebuild the boot pagetable's first-level entries. The structure * is described in mm.c. * * After the CPU enables paging it will add the fixmap mapping * to these page tables, however this may clash with the 1:1 * mapping. So each CPU must rebuild the page tables here with * the 1:1 in place. */ /* Write Xen's PT's paddr into the HTTBR */ ldr r4, =boot_pgtable add r4, r4, r10 /* r4 := paddr (boot_pagetable) */ mov r5, #0 /* r4:r5 is paddr (boot_pagetable) */ mcrr CP64(r4, r5, HTTBR) /* Setup boot_pgtable: */ ldr r1, =boot_second add r1, r1, r10 /* r1 := paddr (boot_second) */ mov r3, #0x0 /* ... map boot_second in boot_pgtable[0] */ orr r2, r1, #PT_UPPER(PT) /* r2:r3 := table map of boot_second */ orr r2, r2, #PT_LOWER(PT) /* (+ rights for linear PT) */ strd r2, r3, [r4, #0] /* Map it in slot 0 */ /* ... map of paddr(start) in boot_pgtable */ lsrs r1, r9, #30 /* Offset of base paddr in boot_pgtable */ beq 1f /* If it is in slot 0 then map in boot_second * later on */ lsl r2, r1, #30 /* Base address for 1GB mapping */ orr r2, r2, #PT_UPPER(MEM) /* r2:r3 := section map */ orr r2, r2, #PT_LOWER(MEM) lsl r1, r1, #3 /* r1 := Slot offset */ strd r2, r3, [r4, r1] /* Mapping of paddr(start) */ 1: /* Setup boot_second: */ ldr r4, =boot_second add r4, r4, r10 /* r1 := paddr (boot_second) */ lsr r2, r9, #20 /* Base address for 2MB mapping */ lsl r2, r2, #20 orr r2, r2, #PT_UPPER(MEM) /* r2:r3 := section map */ orr r2, r2, #PT_LOWER(MEM) /* ... map of vaddr(start) in boot_second */ ldr r1, =start lsr r1, #18 /* Slot for vaddr(start) */ strd r2, r3, [r4, r1] /* Map vaddr(start) */ /* ... 
map of paddr(start) in boot_second */ lsrs r1, r9, #30 /* Base paddr */ bne 1f /* If paddr(start) is not in slot 0 * then the mapping was done in * boot_pgtable above */ mov r1, r9, lsr #18 /* Slot for paddr(start) */ strd r2, r3, [r4, r1] /* Map Xen there */ 1: /* Defer fixmap and dtb mapping until after paging enabled, to * avoid them clashing with the 1:1 mapping. */ /* boot pagetable setup complete */ PRINT("- Turning on paging -\r\n") ldr r1, =paging /* Explicit vaddr, not RIP-relative */ mrc CP32(r0, HSCTLR) orr r0, r0, #(SCTLR_M|SCTLR_C) /* Enable MMU and D-cache */ dsb /* Flush PTE writes and finish reads */ mcr CP32(r0, HSCTLR) /* now paging is enabled */ isb /* Now, flush the icache */ mov pc, r1 /* Get a proper vaddr into PC */ paging: /* Now we can install the fixmap and dtb mappings, since we * don't need the 1:1 map any more */ dsb #if defined(EARLY_PRINTK) /* Fixmap is only used by early printk */ /* Non-boot CPUs don't need to rebuild the fixmap itself, just * the mapping from boot_second to xen_fixmap */ teq r12, #0 bne 1f /* Add UART to the fixmap table */ ldr r1, =xen_fixmap /* r1 := vaddr (xen_fixmap) */ mov r3, #0 lsr r2, r11, #12 lsl r2, r2, #12 /* 4K aligned paddr of UART */ orr r2, r2, #PT_UPPER(DEV_L3) orr r2, r2, #PT_LOWER(DEV_L3) /* r2:r3 := 4K dev map including UART */ strd r2, r3, [r1, #(FIXMAP_CONSOLE*8)] /* Map it in the first fixmap's slot */ 1: /* Map fixmap into boot_second */ ldr r1, =boot_second /* r1 := vaddr (xen_fixmap) */ ldr r2, =xen_fixmap add r2, r2, r10 /* r2 := paddr (xen_fixmap) */ orr r2, r2, #PT_UPPER(PT) orr r2, r2, #PT_LOWER(PT) /* r2:r3 := table map of xen_fixmap */ ldr r4, =FIXMAP_ADDR(0) mov r4, r4, lsr #18 /* r4 := Slot for FIXMAP(0) */ strd r2, r3, [r1, r4] /* Map it in the fixmap's slot */ /* Use a virtual address to access the UART. */ ldr r11, =EARLY_UART_VIRTUAL_ADDRESS #endif /* Map the DTB in the boot misc slot */ teq r12, #0 /* Only on boot CPU */ bne 1f ldr r1, =boot_second mov r3, #0x0 lsr r2, r8, #21 lsl r2, r2, #21 /* r2: 2MB-aligned paddr of DTB */ orr r2, r2, #PT_UPPER(MEM) orr r2, r2, #PT_LOWER(MEM) /* r2:r3 := 2MB RAM incl. DTB */ ldr r4, =BOOT_FDT_VIRT_START mov r4, r4, lsr #18 /* Slot for BOOT_FDT_VIRT_START */ strd r2, r3, [r1, r4] /* Map it in the early fdt slot */ dsb 1: PRINT("- Ready -\r\n") /* The boot CPU should go straight into C now */ teq r12, #0 beq launch /* Non-boot CPUs need to move on to the proper pagetables, which were * setup in init_secondary_pagetables. */ ldr r4, =init_ttbr /* VA of HTTBR value stashed by CPU 0 */ ldrd r4, r5, [r4] /* Actual value */ dsb mcrr CP64(r4, r5, HTTBR) dsb isb mcr CP32(r0, TLBIALLH) /* Flush hypervisor TLB */ mcr CP32(r0, ICIALLU) /* Flush I-cache */ mcr CP32(r0, BPIALL) /* Flush branch predictor */ dsb /* Ensure completion of TLB+BP flush */ isb launch: ldr r0, =init_data add r0, #INITINFO_stack /* Find the boot-time stack */ ldr sp, [r0] add sp, #STACK_SIZE /* (which grows down from the top). 
*/ sub sp, #CPUINFO_sizeof /* Make room for CPU save record */ mov r0, r10 /* Marshal args: - phys_offset */ mov r1, r8 /* - DTB address */ mov r2, r7 /* - CPU ID */ teq r12, #0 beq start_xen /* and disappear into the land of C */ b start_secondary /* (to the appropriate entry point) */ /* Fail-stop * r0: string explaining why */ fail: PRINT("- Boot failed -\r\n") 1: wfe b 1b /* Copy Xen to new location and switch TTBR * r1:r0 ttbr * r2 source address * r3 destination address * [sp]=>r4 length * * Source and destination must be word aligned, length is rounded up * to a 16 byte boundary. * * MUST BE VERY CAREFUL when saving things to RAM over the copy */ ENTRY(relocate_xen) push {r4,r5,r6,r7,r8,r9,r10,r11} ldr r4, [sp, #8*4] /* Get 4th argument from stack */ /* Copy 16 bytes at a time using: * r5: counter * r6: data * r7: data * r8: data * r9: data * r10: source * r11: destination */ mov r5, r4 mov r10, r2 mov r11, r3 1: ldmia r10!, {r6, r7, r8, r9} stmia r11!, {r6, r7, r8, r9} subs r5, r5, #16 bgt 1b /* Flush destination from dcache using: * r5: counter * r6: step * r7: vaddr */ dsb /* So the CPU issues all writes to the range */ mov r5, r4 ldr r6, =cacheline_bytes /* r6 := step */ ldr r6, [r6] mov r7, r3 1: mcr CP32(r7, DCCMVAC) add r7, r7, r6 subs r5, r5, r6 bgt 1b dsb /* Ensure the flushes happen before * continuing */ isb /* Ensure synchronization with previous * changes to text */ mcr CP32(r0, TLBIALLH) /* Flush hypervisor TLB */ mcr CP32(r0, ICIALLU) /* Flush I-cache */ mcr CP32(r0, BPIALL) /* Flush branch predictor */ dsb /* Ensure completion of TLB+BP flush */ isb mcrr CP64(r0, r1, HTTBR) dsb /* ensure memory accesses do not cross * over the TTBR0 write */ isb /* Ensure synchronization with previous * changes to text */ mcr CP32(r0, TLBIALLH) /* Flush hypervisor TLB */ mcr CP32(r0, ICIALLU) /* Flush I-cache */ mcr CP32(r0, BPIALL) /* Flush branch predictor */ dsb /* Ensure completion of TLB+BP flush */ isb pop {r4, r5,r6,r7,r8,r9,r10,r11} mov pc, lr #ifdef EARLY_PRINTK /* Bring up the UART. * r11: Early UART base address * Clobbers r0-r2 */ init_uart: #ifdef EARLY_PRINTK_INIT_UART early_uart_init r11, r1, r2 #endif adr r0, 1f b puts /* Jump to puts */ 1: .asciz "- UART enabled -\r\n" .align 4 /* Print early debug messages. * r0: Nul-terminated string to print. * r11: Early UART base address * Clobbers r0-r1 */ puts: early_uart_ready r11, r1 ldrb r1, [r0], #1 /* Load next char */ teq r1, #0 /* Exit on nul */ moveq pc, lr early_uart_transmit r11, r1 b puts /* Print a 32-bit number in hex. Specific to the PL011 UART. * r0: Number to print. 
* r11: Early UART base address * Clobbers r0-r3 */ putn: adr r1, hex mov r3, #8 1: early_uart_ready r11, r2 and r2, r0, #0xf0000000 /* Mask off the top nybble */ ldrb r2, [r1, r2, lsr #28] /* Convert to a char */ early_uart_transmit r11, r2 lsl r0, #4 /* Roll it through one nybble at a time */ subs r3, r3, #1 bne 1b mov pc, lr hex: .ascii "0123456789abcdef" .align 2 #else /* EARLY_PRINTK */ init_uart: .global early_puts early_puts: puts: putn: mov pc, lr #endif /* !EARLY_PRINTK */ /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/debug-8250.inc0000664000175000017500000000241612307313555016035 0ustar smbsmb/* * xen/arch/arm/arm32/debug-8250.inc * * 8250 specific debug code * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include /* 8250 UART wait UART to be ready to transmit * rb: register which contains the UART base address * rc: scratch register */ .macro early_uart_ready rb rc 1: ldr \rc, [\rb, #(UART_LSR << EARLY_UART_REG_SHIFT)] /* Read LSR */ tst \rc, #UART_LSR_THRE /* Check Xmit holding register flag */ beq 1b /* Wait for the UART to be ready */ .endm /* 8250 UART transmit character * rb: register which contains the UART base address * rt: register which contains the character to transmit */ .macro early_uart_transmit rb rt str \rt, [\rb, #UART_THR] /* Write Transmit buffer */ .endm /* * Local variables: * mode: ASM * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/smpboot.c0000664000175000017500000000105312307313555015503 0ustar smbsmb#include #include #include #include int __init arch_smp_init(void) { return platform_smp_init(); } int __init arch_cpu_init(int cpu, struct dt_device_node *dn) { /* Not needed on ARM32, as there is no relevant information in * the CPU device tree node for ARMv7 CPUs. */ return 0; } int __init arch_cpu_up(int cpu) { return platform_cpu_up(cpu); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/arm32/traps.c0000664000175000017500000000245612307313555015161 0ustar smbsmb/* * xen/arch/arm/arm32/traps.c * * ARM AArch32 Specific Trap handlers * * Copyright (c) 2012 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #include #include #include #include asmlinkage void do_trap_undefined_instruction(struct cpu_user_regs *regs) { do_unexpected_trap("Undefined Instruction", regs); } asmlinkage void do_trap_supervisor_call(struct cpu_user_regs *regs) { do_unexpected_trap("Supervisor Call", regs); } asmlinkage void do_trap_prefetch_abort(struct cpu_user_regs *regs) { do_unexpected_trap("Prefetch Abort", regs); } asmlinkage void do_trap_data_abort(struct cpu_user_regs *regs) { do_unexpected_trap("Data Abort", regs); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/platform.c0000664000175000017500000001024012307313555014716 0ustar smbsmb/* * xen/arch/arm/platform.c * * Helpers to execute platform specific code. * * Julien Grall * Copyright (C) 2013 Linaro Limited. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include extern const struct platform_desc _splatform[], _eplatform[]; /* Pointer to the current platform description */ static const struct platform_desc *platform; static bool_t __init platform_is_compatible(const struct platform_desc *plat) { const char *const *compat; if ( !plat->compatible ) return 0; for ( compat = plat->compatible; *compat; compat++ ) { if ( dt_machine_is_compatible(*compat) ) return 1; } return 0; } /* List of possible platform */ static void dump_platform_table(void) { const struct platform_desc *p; printk("Available platform support:\n"); for ( p = _splatform; p != _eplatform; p++ ) printk(" - %s\n", p->name); } void __init platform_init(void) { int res = 0; ASSERT(platform == NULL); /* Looking for the platform description */ for ( platform = _splatform; platform != _eplatform; platform++ ) { if ( platform_is_compatible(platform) ) break; } /* We don't have specific operations for this platform */ if ( platform == _eplatform ) { /* TODO: dump DT machine compatible node */ printk(XENLOG_WARNING "WARNING: Unrecognized/unsupported device tree " "compatible list\n"); dump_platform_table(); platform = NULL; } else printk(XENLOG_INFO "Platform: %s\n", platform->name); if ( platform && platform->init ) res = platform->init(); if ( res ) panic("Unable to initialize the platform"); } int __init platform_init_time(void) { int res = 0; if ( platform && platform->init_time ) res = platform->init_time(); return res; } int __init platform_specific_mapping(struct domain *d) { int res = 0; if ( platform && platform->specific_mapping ) res = platform->specific_mapping(d); return res; } #ifdef CONFIG_ARM_32 int __init platform_cpu_up(int cpu) { if ( psci_available ) return call_psci_cpu_on(cpu); if ( platform && platform->cpu_up ) return platform->cpu_up(cpu); return -ENODEV; } int __init platform_smp_init(void) { if ( platform && platform->smp_init ) return platform->smp_init(); return 0; } #endif void platform_reset(void) { if ( platform && platform->reset ) platform->reset(); } void platform_poweroff(void) { if ( platform && platform->poweroff ) platform->poweroff(); } bool_t platform_has_quirk(uint32_t quirk) { uint32_t quirks = 0; if ( 
platform && platform->quirks ) quirks = platform->quirks(); return !!(quirks & quirk); } bool_t platform_device_is_blacklisted(const struct dt_device_node *node) { const struct dt_device_match *blacklist = NULL; if ( platform && platform->blacklist_dev ) blacklist = platform->blacklist_dev; return dt_match_node(blacklist, node); } unsigned int platform_dom0_evtchn_ppi(void) { if ( platform && platform->dom0_evtchn_ppi ) return platform->dom0_evtchn_ppi; return GUEST_EVTCHN_PPI; } void platform_dom0_gnttab(paddr_t *start, paddr_t *size) { if ( platform && platform->dom0_gnttab_size ) { *start = platform->dom0_gnttab_start; *size = platform->dom0_gnttab_size; } else { *start = 0xb0000000; *size = 0x20000; } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/vuart.h0000664000175000017500000000166512307313555014253 0ustar smbsmb/* * xen/arch/arm/vuart.h * * Virtual UART Emulation Support * * Ian Campbell * Copyright (c) 2012 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #ifndef __ARCH_ARM_VUART_H__ #define __ARCH_ARM_VUART_H__ int domain_vuart_init(struct domain *d); void domain_vuart_free(struct domain *d); #endif /* __ARCH_ARM_VUART_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/time.c0000664000175000017500000002063312307313555014037 0ustar smbsmb/* * xen/arch/arm/time.c * * Time and timer support, using the ARM Generic Timer interfaces * * Tim Deegan * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Unfortunately the hypervisor timer interrupt appears to be buggy in * some versions of the model. Disable this to use the physical timer * instead. */ #define USE_HYP_TIMER 1 uint64_t __read_mostly boot_count; /* For fine-grained timekeeping, we use the ARM "Generic Timer", a * register-mapped time source in the SoC. */ unsigned long __read_mostly cpu_khz; /* CPU clock frequency in kHz. 
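 * In practice this holds the Generic Timer counter frequency, taken in
 * init_xen_time() below from the DT "clock-frequency" property or from
 * CNTFRQ_EL0; e.g. a 50 MHz counter gives cpu_khz = 50000, so that
 * ticks_to_ns(50000000) == SECONDS(1).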
*/ static struct dt_irq timer_irq[MAX_TIMER_PPI]; const struct dt_irq *timer_dt_irq(enum timer_ppi ppi) { ASSERT(ppi >= TIMER_PHYS_SECURE_PPI && ppi < MAX_TIMER_PPI); return &timer_irq[ppi]; } /*static inline*/ s_time_t ticks_to_ns(uint64_t ticks) { return muldiv64(ticks, SECONDS(1), 1000 * cpu_khz); } /*static inline*/ uint64_t ns_to_ticks(s_time_t ns) { return muldiv64(ns, 1000 * cpu_khz, SECONDS(1)); } /* TODO: On a real system the firmware would have set the frequency in the CNTFRQ register. Also we'd need to use devicetree to find the RTC. When we've seen some real systems, we can delete this. static uint32_t calibrate_timer(void) { uint32_t sec; uint64_t start, end; paddr_t rtc_base = 0x1C170000ull; volatile uint32_t *rtc; ASSERT(!local_irq_is_enabled()); set_fixmap(FIXMAP_MISC, rtc_base >> PAGE_SHIFT, DEV_SHARED); rtc = (uint32_t *) FIXMAP_ADDR(FIXMAP_MISC); printk("Calibrating timer against RTC..."); // Turn on the RTC rtc[3] = 1; // Wait for an edge sec = rtc[0] + 1; do {} while ( rtc[0] != sec ); // Now time a few seconds start = READ_SYSREG64(CNTPCT_EL0); do {} while ( rtc[0] < sec + 32 ); end = READ_SYSREG64(CNTPCT_EL0); printk("done.\n"); clear_fixmap(FIXMAP_MISC); return (end - start) / 32; } */ /* Set up the timer on the boot CPU */ int __init init_xen_time(void) { static const struct dt_device_match timer_ids[] __initconst = { DT_MATCH_TIMER, { /* sentinel */ }, }; struct dt_device_node *dev; int res; unsigned int i; u32 rate; dev = dt_find_matching_node(NULL, timer_ids); if ( !dev ) panic("Unable to find a compatible timer in the device tree"); dt_device_set_used_by(dev, DOMID_XEN); /* Retrieve all IRQs for the timer */ for ( i = TIMER_PHYS_SECURE_PPI; i < MAX_TIMER_PPI; i++ ) { res = dt_device_get_irq(dev, i, &timer_irq[i]); if ( res ) panic("Timer: Unable to retrieve IRQ %u from the device tree", i); } printk("Generic Timer IRQ: phys=%u hyp=%u virt=%u\n", timer_irq[TIMER_PHYS_NONSECURE_PPI].irq, timer_irq[TIMER_HYP_PPI].irq, timer_irq[TIMER_VIRT_PPI].irq); res = platform_init_time(); if ( res ) panic("Timer: Cannot initialize platform timer"); /* Check that this CPU supports the Generic Timer interface */ if ( !cpu_has_gentimer ) panic("CPU does not support the Generic Timer v1 interface"); res = dt_property_read_u32(dev, "clock-frequency", &rate); if ( res ) cpu_khz = rate / 1000; else cpu_khz = READ_SYSREG32(CNTFRQ_EL0) / 1000; boot_count = READ_SYSREG64(CNTPCT_EL0); printk("Using generic timer at %lu KHz\n", cpu_khz); return 0; } /* Return number of nanoseconds since boot */ s_time_t get_s_time(void) { uint64_t ticks = READ_SYSREG64(CNTPCT_EL0) - boot_count; return ticks_to_ns(ticks); } /* Set the timer to wake us up at a particular time. * Timeout is a Xen system time (nanoseconds since boot); 0 disables the timer. * Returns 1 on success; 0 if the timeout is too soon or is in the past. */ int reprogram_timer(s_time_t timeout) { uint64_t deadline; if ( timeout == 0 ) { #if USE_HYP_TIMER WRITE_SYSREG32(0, CNTHP_CTL_EL2); #else WRITE_SYSREG32(0, CNTP_CTL_EL0); #endif return 1; } deadline = ns_to_ticks(timeout) + boot_count; #if USE_HYP_TIMER WRITE_SYSREG64(deadline, CNTHP_CVAL_EL2); WRITE_SYSREG32(CNTx_CTL_ENABLE, CNTHP_CTL_EL2); #else WRITE_SYSREG64(deadline, CNTP_CVAL_EL0); WRITE_SYSREG32(CNTx_CTL_ENABLE, CNTP_CTL_EL0); #endif isb(); /* No need to check for timers in the past; the Generic Timer fires * on a signed 63-bit comparison. 
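 * A deadline that has already passed therefore raises the interrupt as
 * soon as the enable bit is written, so returning 1 here is safe.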
*/ return 1; } /* Handle the firing timer */ static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) { if ( irq == (timer_irq[TIMER_HYP_PPI].irq) && READ_SYSREG32(CNTHP_CTL_EL2) & CNTx_CTL_PENDING ) { /* Signal the generic timer code to do its work */ raise_softirq(TIMER_SOFTIRQ); /* Disable the timer to avoid more interrupts */ WRITE_SYSREG32(0, CNTHP_CTL_EL2); } if ( irq == (timer_irq[TIMER_PHYS_NONSECURE_PPI].irq) && READ_SYSREG32(CNTP_CTL_EL0) & CNTx_CTL_PENDING ) { /* Signal the generic timer code to do its work */ raise_softirq(TIMER_SOFTIRQ); /* Disable the timer to avoid more interrupts */ WRITE_SYSREG32(0, CNTP_CTL_EL0); } } static void vtimer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) { current->arch.virt_timer.ctl = READ_SYSREG32(CNTV_CTL_EL0); WRITE_SYSREG32(current->arch.virt_timer.ctl | CNTx_CTL_MASK, CNTV_CTL_EL0); vgic_vcpu_inject_irq(current, current->arch.virt_timer.irq, 1); } /* Route timer's IRQ on this CPU */ void __cpuinit route_timer_interrupt(void) { gic_route_dt_irq(&timer_irq[TIMER_PHYS_NONSECURE_PPI], cpumask_of(smp_processor_id()), 0xa0); gic_route_dt_irq(&timer_irq[TIMER_HYP_PPI], cpumask_of(smp_processor_id()), 0xa0); gic_route_dt_irq(&timer_irq[TIMER_VIRT_PPI], cpumask_of(smp_processor_id()), 0xa0); } /* Set up the timer interrupt on this CPU */ void __cpuinit init_timer_interrupt(void) { /* Sensible defaults */ WRITE_SYSREG64(0, CNTVOFF_EL2); /* No VM-specific offset */ WRITE_SYSREG32(0, CNTKCTL_EL1); /* No user-mode access */ #if USE_HYP_TIMER /* Do not let the VMs program the physical timer, only read the physical counter */ WRITE_SYSREG32(CNTHCTL_PA, CNTHCTL_EL2); #else /* Cannot let VMs access physical counter if we are using it */ WRITE_SYSREG32(0, CNTHCTL_EL2); #endif WRITE_SYSREG32(0, CNTP_CTL_EL0); /* Physical timer disabled */ WRITE_SYSREG32(0, CNTHP_CTL_EL2); /* Hypervisor's timer disabled */ isb(); request_dt_irq(&timer_irq[TIMER_HYP_PPI], timer_interrupt, "hyptimer", NULL); request_dt_irq(&timer_irq[TIMER_VIRT_PPI], vtimer_interrupt, "virtimer", NULL); request_dt_irq(&timer_irq[TIMER_PHYS_NONSECURE_PPI], timer_interrupt, "phytimer", NULL); } /* Wait a set number of microseconds */ void udelay(unsigned long usecs) { s_time_t deadline = get_s_time() + 1000 * (s_time_t) usecs; while ( get_s_time() - deadline < 0 ) ; dsb(); isb(); } /* VCPU PV timers. */ void send_timer_event(struct vcpu *v) { send_guest_vcpu_virq(v, VIRQ_TIMER); } /* VCPU PV clock. */ void update_vcpu_system_time(struct vcpu *v) { /* XXX update shared_info->wc_* */ } void domain_set_time_offset(struct domain *d, int32_t time_offset_seconds) { d->time_offset_seconds = time_offset_seconds; /* XXX update guest visible wallclock time */ } struct tm wallclock_time(void) { return (struct tm) { 0 }; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/vtimer.h0000664000175000017500000000217412307313555014414 0ustar smbsmb/* * xen/arch/arm/vtimer.h * * ARM Virtual Timer emulation support * * Ian Campbell * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. */ #ifndef __ARCH_ARM_VTIMER_H__ #define __ARCH_ARM_VTIMER_H__ extern int vcpu_domain_init(struct domain *d); extern int vcpu_vtimer_init(struct vcpu *v); extern int vtimer_emulate(struct cpu_user_regs *regs, union hsr hsr); extern int virt_timer_save(struct vcpu *v); extern int virt_timer_restore(struct vcpu *v); extern void vcpu_timer_destroy(struct vcpu *v); #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/mm.c0000664000175000017500000010762312307313555013517 0ustar smbsmb/* * xen/arch/arm/mm.c * * MMU code for an ARMv7-A with virt extensions. * * Tim Deegan * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct domain *dom_xen, *dom_io, *dom_cow; /* Static start-of-day pagetables that we use before the allocators * are up. These are used by all CPUs during bringup before switching * to the CPUs own pagetables. * * These pagetables have a very simple structure. They include: * - a 2MB mapping of xen at XEN_VIRT_START, boot_first and * boot_second are used to populate the trie down to that mapping. * - a 1:1 mapping of xen at its current physical address. This uses a * section mapping at whichever of boot_{pgtable,first,second} * covers that physical address. * * For the boot CPU these mappings point to the address where Xen was * loaded by the bootloader. For secondary CPUs they point to the * relocated copy of Xen for the benefit of secondary CPUs. * * In addition to the above for the boot CPU the device-tree is * initially mapped in the boot misc slot. This mapping is not present * for secondary CPUs. * * Finally, if EARLY_PRINTK is enabled then xen_fixmap will be mapped * by the CPU once it has moved off the 1:1 mapping. */ lpae_t boot_pgtable[LPAE_ENTRIES] __attribute__((__aligned__(4096))); #ifdef CONFIG_ARM_64 lpae_t boot_first[LPAE_ENTRIES] __attribute__((__aligned__(4096))); #endif lpae_t boot_second[LPAE_ENTRIES] __attribute__((__aligned__(4096))); /* Main runtime page tables */ /* * For arm32 xen_pgtable and xen_dommap are per-PCPU and are allocated before * bringing up each CPU. For arm64 xen_pgtable is common to all PCPUs. * * xen_second, xen_fixmap and xen_xenmap are always shared between all * PCPUs. 
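 * They are statically allocated below and referenced from every CPU's
 * root pagetable, so there is only one copy of those mappings.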
*/ #ifdef CONFIG_ARM_64 lpae_t xen_pgtable[LPAE_ENTRIES] __attribute__((__aligned__(4096))); lpae_t xen_first[LPAE_ENTRIES] __attribute__((__aligned__(4096))); #define THIS_CPU_PGTABLE xen_pgtable #else /* Per-CPU pagetable pages */ /* xen_pgtable == root of the trie (zeroeth level on 64-bit, first on 32-bit) */ static DEFINE_PER_CPU(lpae_t *, xen_pgtable); #define THIS_CPU_PGTABLE this_cpu(xen_pgtable) /* xen_dommap == pages used by map_domain_page, these pages contain * the second level pagetables which map the domheap region * DOMHEAP_VIRT_START...DOMHEAP_VIRT_END in 2MB chunks. */ static DEFINE_PER_CPU(lpae_t *, xen_dommap); /* Root of the trie for cpu0, other CPU's PTs are dynamically allocated */ lpae_t cpu0_pgtable[LPAE_ENTRIES] __attribute__((__aligned__(4096))); /* cpu0's domheap page tables */ lpae_t cpu0_dommap[LPAE_ENTRIES*DOMHEAP_SECOND_PAGES] __attribute__((__aligned__(4096*DOMHEAP_SECOND_PAGES))); #endif #ifdef CONFIG_ARM_64 /* The first page of the first level mapping of the xenheap. The * subsequent xenheap first level pages are dynamically allocated, but * we need this one to bootstrap ourselves. */ lpae_t xenheap_first_first[LPAE_ENTRIES] __attribute__((__aligned__(4096))); /* The zeroeth level slot which uses xenheap_first_first. Used because * setup_xenheap_mappings otherwise relies on mfn_to_virt which isn't * valid for a non-xenheap mapping. */ static __initdata int xenheap_first_first_slot = -1; #endif /* Common pagetable leaves */ /* Second level page tables. * * The second-level table is 2 contiguous pages long, and covers all * addresses from 0 to 0x7fffffff. Offsets into it are calculated * with second_linear_offset(), not second_table_offset(). */ lpae_t xen_second[LPAE_ENTRIES*2] __attribute__((__aligned__(4096*2))); /* First level page table used for fixmap */ lpae_t xen_fixmap[LPAE_ENTRIES] __attribute__((__aligned__(4096))); /* First level page table used to map Xen itself with the XN bit set * as appropriate. */ static lpae_t xen_xenmap[LPAE_ENTRIES] __attribute__((__aligned__(4096))); /* Non-boot CPUs use this to find the correct pagetables. */ uint64_t init_ttbr; static paddr_t phys_offset; /* Limits of the Xen heap */ unsigned long xenheap_mfn_start __read_mostly = ~0UL; unsigned long xenheap_mfn_end __read_mostly; unsigned long xenheap_virt_end __read_mostly; unsigned long frametable_base_mfn __read_mostly; unsigned long frametable_virt_end __read_mostly; unsigned long max_page; unsigned long total_pages; extern char __init_begin[], __init_end[]; /* Checking VA memory layout alignment. 
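 * All of the checks below are BUILD_BUG_ON()s, so a layout violation is
 * caught at compile time rather than as a runtime mapping failure.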
*/ static inline void check_memory_layout_alignment_constraints(void) { /* 2MB aligned regions */ BUILD_BUG_ON(XEN_VIRT_START & ~SECOND_MASK); BUILD_BUG_ON(FIXMAP_ADDR(0) & ~SECOND_MASK); BUILD_BUG_ON(BOOT_RELOC_VIRT_START & ~SECOND_MASK); /* 1GB aligned regions */ BUILD_BUG_ON(XENHEAP_VIRT_START & ~FIRST_MASK); /* Page table structure constraints */ #ifdef CONFIG_ARM_64 BUILD_BUG_ON(zeroeth_table_offset(XEN_VIRT_START)); #endif BUILD_BUG_ON(first_table_offset(XEN_VIRT_START)); BUILD_BUG_ON(second_linear_offset(XEN_VIRT_START) >= LPAE_ENTRIES); #ifdef CONFIG_DOMAIN_PAGE BUILD_BUG_ON(DOMHEAP_VIRT_START & ~FIRST_MASK); #endif } void dump_pt_walk(lpae_t *first, paddr_t addr) { lpae_t *second = NULL, *third = NULL; if ( first_table_offset(addr) >= LPAE_ENTRIES ) return; printk("1ST[0x%x] = 0x%"PRIpaddr"\n", first_table_offset(addr), first[first_table_offset(addr)].bits); if ( !first[first_table_offset(addr)].walk.valid || !first[first_table_offset(addr)].walk.table ) goto done; second = map_domain_page(first[first_table_offset(addr)].walk.base); printk("2ND[0x%x] = 0x%"PRIpaddr"\n", second_table_offset(addr), second[second_table_offset(addr)].bits); if ( !second[second_table_offset(addr)].walk.valid || !second[second_table_offset(addr)].walk.table ) goto done; third = map_domain_page(second[second_table_offset(addr)].walk.base); printk("3RD[0x%x] = 0x%"PRIpaddr"\n", third_table_offset(addr), third[third_table_offset(addr)].bits); done: if (third) unmap_domain_page(third); if (second) unmap_domain_page(second); } void dump_hyp_walk(vaddr_t addr) { uint64_t ttbr = READ_SYSREG64(TTBR0_EL2); lpae_t *pgtable = THIS_CPU_PGTABLE; printk("Walking Hypervisor VA 0x%"PRIvaddr" " "on CPU%d via TTBR 0x%016"PRIx64"\n", addr, smp_processor_id(), ttbr); if ( smp_processor_id() == 0 ) BUG_ON( (lpae_t *)(unsigned long)(ttbr - phys_offset) != pgtable ); else BUG_ON( virt_to_maddr(pgtable) != ttbr ); dump_pt_walk(pgtable, addr); } /* Map a 4k page in a fixmap entry */ void set_fixmap(unsigned map, unsigned long mfn, unsigned attributes) { lpae_t pte = mfn_to_xen_entry(mfn); pte.pt.table = 1; /* 4k mappings always have this bit set */ pte.pt.ai = attributes; pte.pt.xn = 1; write_pte(xen_fixmap + third_table_offset(FIXMAP_ADDR(map)), pte); flush_xen_data_tlb_range_va(FIXMAP_ADDR(map), PAGE_SIZE); } /* Remove a mapping from a fixmap entry */ void clear_fixmap(unsigned map) { lpae_t pte = {0}; write_pte(xen_fixmap + third_table_offset(FIXMAP_ADDR(map)), pte); flush_xen_data_tlb_range_va(FIXMAP_ADDR(map), PAGE_SIZE); } #ifdef CONFIG_DOMAIN_PAGE void *map_domain_page_global(unsigned long mfn) { return vmap(&mfn, 1); } void unmap_domain_page_global(const void *va) { vunmap(va); } /* Map a page of domheap memory */ void *map_domain_page(unsigned long mfn) { unsigned long flags; lpae_t *map = this_cpu(xen_dommap); unsigned long slot_mfn = mfn & ~LPAE_ENTRY_MASK; vaddr_t va; lpae_t pte; int i, slot; local_irq_save(flags); /* The map is laid out as an open-addressed hash table where each * entry is a 2MB superpage pte. We use the available bits of each * PTE as a reference count; when the refcount is zero the slot can * be reused. 
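 * The 2MB-aligned mfn hashes to (mfn >> LPAE_SHIFT) % DOMHEAP_ENTRIES and
 * we probe linearly from there, either bumping the refcount of a slot
 * that already maps it or claiming the first slot whose refcount is zero.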
*/ for ( slot = (slot_mfn >> LPAE_SHIFT) % DOMHEAP_ENTRIES, i = 0; i < DOMHEAP_ENTRIES; slot = (slot + 1) % DOMHEAP_ENTRIES, i++ ) { if ( map[slot].pt.avail < 0xf && map[slot].pt.base == slot_mfn && map[slot].pt.valid ) { /* This slot already points to the right place; reuse it */ map[slot].pt.avail++; break; } else if ( map[slot].pt.avail == 0 ) { /* Commandeer this 2MB slot */ pte = mfn_to_xen_entry(slot_mfn); pte.pt.avail = 1; write_pte(map + slot, pte); break; } } /* If the map fills up, the callers have misbehaved. */ BUG_ON(i == DOMHEAP_ENTRIES); #ifndef NDEBUG /* Searching the hash could get slow if the map starts filling up. * Cross that bridge when we come to it */ { static int max_tries = 32; if ( i >= max_tries ) { dprintk(XENLOG_WARNING, "Domheap map is filling: %i tries\n", i); max_tries *= 2; } } #endif local_irq_restore(flags); va = (DOMHEAP_VIRT_START + (slot << SECOND_SHIFT) + ((mfn & LPAE_ENTRY_MASK) << THIRD_SHIFT)); /* * We may not have flushed this specific subpage at map time, * since we only flush the 4k page not the superpage */ flush_xen_data_tlb_range_va(va, PAGE_SIZE); return (void *)va; } /* Release a mapping taken with map_domain_page() */ void unmap_domain_page(const void *va) { unsigned long flags; lpae_t *map = this_cpu(xen_dommap); int slot = ((unsigned long) va - DOMHEAP_VIRT_START) >> SECOND_SHIFT; local_irq_save(flags); ASSERT(slot >= 0 && slot < DOMHEAP_ENTRIES); ASSERT(map[slot].pt.avail != 0); map[slot].pt.avail--; local_irq_restore(flags); } unsigned long domain_page_map_to_mfn(const void *ptr) { unsigned long va = (unsigned long)ptr; lpae_t *map = this_cpu(xen_dommap); int slot = (va - DOMHEAP_VIRT_START) >> SECOND_SHIFT; unsigned long offset = (va>>THIRD_SHIFT) & LPAE_ENTRY_MASK; if ( va >= VMAP_VIRT_START && va < VMAP_VIRT_END ) return virt_to_mfn(va); ASSERT(slot >= 0 && slot < DOMHEAP_ENTRIES); ASSERT(map[slot].pt.avail != 0); return map[slot].pt.base + offset; } #endif void flush_page_to_ram(unsigned long mfn) { void *p, *v = map_domain_page(mfn); dsb(); /* So the CPU issues all writes to the range */ for ( p = v; p < v + PAGE_SIZE ; p += cacheline_bytes ) asm volatile (__clean_and_invalidate_xen_dcache_one(0) : : "r" (p)); dsb(); /* So we know the flushes happen before continuing */ unmap_domain_page(v); } void __init arch_init_memory(void) { /* * Initialise our DOMID_XEN domain. * Any Xen-heap pages that we will allow to be mapped will have * their domain field set to dom_xen. */ dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0); BUG_ON(IS_ERR(dom_xen)); /* * Initialise our DOMID_IO domain. * This domain owns I/O pages that are within the range of the page_info * array. Mappings occur at the priv of the caller. */ dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0); BUG_ON(IS_ERR(dom_io)); /* * Initialise our COW domain. * This domain owns sharable pages. 
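 * (Page sharing is not wired up on ARM yet; see the XENMEM_get_sharing_*
 * cases in arch_memory_op() below.)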
*/ dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0); BUG_ON(IS_ERR(dom_cow)); } void __cpuinit setup_virt_paging(void) { /* Setup Stage 2 address translation */ /* SH0=00, ORGN0=IRGN0=01 * SL0=01 (Level-1) * ARVv7: T0SZ=(1)1000 = -8 (32-(-8) = 40 bit physical addresses) * ARMv8: T0SZ=01 1000 = 24 (64-24 = 40 bit physical addresses) * PS=010 == 40 bits */ #ifdef CONFIG_ARM_32 WRITE_SYSREG32(0x80002558, VTCR_EL2); #else WRITE_SYSREG32(0x80022558, VTCR_EL2); #endif isb(); } static inline lpae_t pte_of_xenaddr(vaddr_t va) { paddr_t ma = va + phys_offset; unsigned long mfn = ma >> PAGE_SHIFT; return mfn_to_xen_entry(mfn); } void __init remove_early_mappings(void) { lpae_t pte = {0}; write_pte(xen_second + second_table_offset(BOOT_FDT_VIRT_START), pte); flush_xen_data_tlb_range_va(BOOT_FDT_VIRT_START, SECOND_SIZE); } extern void relocate_xen(uint64_t ttbr, void *src, void *dst, size_t len); /* Boot-time pagetable setup. * Changes here may need matching changes in head.S */ void __init setup_pagetables(unsigned long boot_phys_offset, paddr_t xen_paddr) { uint64_t ttbr; unsigned long dest_va; lpae_t pte, *p; int i; /* Calculate virt-to-phys offset for the new location */ phys_offset = xen_paddr - (unsigned long) _start; #ifdef CONFIG_ARM_64 p = (void *) xen_pgtable; p[0] = pte_of_xenaddr((uintptr_t)xen_first); p[0].pt.table = 1; p[0].pt.xn = 0; p = (void *) xen_first; #else p = (void *) cpu0_pgtable; #endif /* Initialise first level entries, to point to second level entries */ for ( i = 0; i < 2; i++) { p[i] = pte_of_xenaddr((uintptr_t)(xen_second+i*LPAE_ENTRIES)); p[i].pt.table = 1; p[i].pt.xn = 0; } #ifdef CONFIG_ARM_32 for ( i = 0; i < DOMHEAP_SECOND_PAGES; i++ ) { p[first_table_offset(DOMHEAP_VIRT_START+i*FIRST_SIZE)] = pte_of_xenaddr((uintptr_t)(cpu0_dommap+i*LPAE_ENTRIES)); p[first_table_offset(DOMHEAP_VIRT_START+i*FIRST_SIZE)].pt.table = 1; } #endif /* Initialise xen second level entries ... */ /* ... Xen's text etc */ pte = mfn_to_xen_entry(xen_paddr>>PAGE_SHIFT); pte.pt.xn = 0;/* Contains our text mapping! */ xen_second[second_table_offset(XEN_VIRT_START)] = pte; /* ... Fixmap */ pte = pte_of_xenaddr((vaddr_t)xen_fixmap); pte.pt.table = 1; xen_second[second_table_offset(FIXMAP_ADDR(0))] = pte; /* ... DTB */ pte = boot_second[second_table_offset(BOOT_FDT_VIRT_START)]; xen_second[second_table_offset(BOOT_FDT_VIRT_START)] = pte; /* Map the destination in the boot misc area. */ dest_va = BOOT_RELOC_VIRT_START; pte = mfn_to_xen_entry(xen_paddr >> PAGE_SHIFT); write_pte(boot_second + second_table_offset(dest_va), pte); flush_xen_data_tlb_range_va(dest_va, SECOND_SIZE); #ifdef CONFIG_ARM_64 ttbr = (uintptr_t) xen_pgtable + phys_offset; #else ttbr = (uintptr_t) cpu0_pgtable + phys_offset; #endif relocate_xen(ttbr, _start, (void*)dest_va, _end - _start); /* Clear the copy of the boot pagetables. Each secondary CPU * rebuilds these itself (see head.S) */ memset(boot_pgtable, 0x0, PAGE_SIZE); clean_xen_dcache(boot_pgtable); #ifdef CONFIG_ARM_64 memset(boot_first, 0x0, PAGE_SIZE); clean_xen_dcache(boot_first); #endif memset(boot_second, 0x0, PAGE_SIZE); clean_xen_dcache(boot_second); /* Break up the Xen mapping into 4k pages and protect them separately. 
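 * Text and init-text pages end up read-only and executable, rodata
 * read-only and non-executable, and everything else writable with XN set.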
*/ for ( i = 0; i < LPAE_ENTRIES; i++ ) { unsigned long mfn = paddr_to_pfn(xen_paddr) + i; unsigned long va = XEN_VIRT_START + (i << PAGE_SHIFT); if ( !is_kernel(va) ) break; pte = mfn_to_xen_entry(mfn); pte.pt.table = 1; /* 4k mappings always have this bit set */ if ( is_kernel_text(va) || is_kernel_inittext(va) ) { pte.pt.xn = 0; pte.pt.ro = 1; } if ( is_kernel_rodata(va) ) pte.pt.ro = 1; write_pte(xen_xenmap + i, pte); /* No flush required here as page table is not hooked in yet. */ } pte = pte_of_xenaddr((vaddr_t)xen_xenmap); pte.pt.table = 1; write_pte(xen_second + second_linear_offset(XEN_VIRT_START), pte); /* TLBFLUSH and ISB would be needed here, but wait until we set WXN */ /* From now on, no mapping may be both writable and executable. */ WRITE_SYSREG32(READ_SYSREG32(SCTLR_EL2) | SCTLR_WXN, SCTLR_EL2); /* Flush everything after setting WXN bit. */ flush_xen_text_tlb(); #ifdef CONFIG_ARM_32 per_cpu(xen_pgtable, 0) = cpu0_pgtable; per_cpu(xen_dommap, 0) = cpu0_dommap; /* Make sure it is clear */ memset(this_cpu(xen_dommap), 0, DOMHEAP_SECOND_PAGES*PAGE_SIZE); clean_xen_dcache_va_range(this_cpu(xen_dommap), DOMHEAP_SECOND_PAGES*PAGE_SIZE); #endif } #ifdef CONFIG_ARM_64 int init_secondary_pagetables(int cpu) { /* Set init_ttbr for this CPU coming up. All CPus share a single setof * pagetables, but rewrite it each time for consistency with 32 bit. */ init_ttbr = (uintptr_t) xen_pgtable + phys_offset; clean_xen_dcache(init_ttbr); return 0; } #else int init_secondary_pagetables(int cpu) { lpae_t *first, *domheap, pte; int i; first = alloc_xenheap_page(); /* root == first level on 32-bit 3-level trie */ domheap = alloc_xenheap_pages(get_order_from_pages(DOMHEAP_SECOND_PAGES), 0); if ( domheap == NULL || first == NULL ) { printk("Not enough free memory for secondary CPU%d pagetables\n", cpu); free_xenheap_pages(domheap, get_order_from_pages(DOMHEAP_SECOND_PAGES)); free_xenheap_page(first); return -ENOMEM; } /* Initialise root pagetable from root of boot tables */ memcpy(first, cpu0_pgtable, PAGE_SIZE); /* Ensure the domheap has no stray mappings */ memset(domheap, 0, DOMHEAP_SECOND_PAGES*PAGE_SIZE); /* Update the first level mapping to reference the local CPUs * domheap mapping pages. */ for ( i = 0; i < DOMHEAP_SECOND_PAGES; i++ ) { pte = mfn_to_xen_entry(virt_to_mfn(domheap+i*LPAE_ENTRIES)); pte.pt.table = 1; write_pte(&first[first_table_offset(DOMHEAP_VIRT_START+i*FIRST_SIZE)], pte); } clean_xen_dcache_va_range(first, PAGE_SIZE); clean_xen_dcache_va_range(domheap, DOMHEAP_SECOND_PAGES*PAGE_SIZE); per_cpu(xen_pgtable, cpu) = first; per_cpu(xen_dommap, cpu) = domheap; /* Set init_ttbr for this CPU coming up */ init_ttbr = __pa(first); clean_xen_dcache(init_ttbr); return 0; } #endif /* MMU setup for secondary CPUS (which already have paging enabled) */ void __cpuinit mmu_init_secondary_cpu(void) { /* From now on, no mapping may be both writable and executable. */ WRITE_SYSREG32(READ_SYSREG32(SCTLR_EL2) | SCTLR_WXN, SCTLR_EL2); flush_xen_text_tlb(); } /* Create Xen's mappings of memory. * Base and virt must be 32MB aligned and size a multiple of 32MB. * second must be a contiguous set of second level page tables * covering the region starting at virt_offset. 
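 * The entries are written with the LPAE contiguous hint set, in 16-entry
 * (32MB) chunks, which is why the 32MB alignment above is required.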
*/ static void __init create_32mb_mappings(lpae_t *second, unsigned long virt_offset, unsigned long base_mfn, unsigned long nr_mfns) { unsigned long i, count; lpae_t pte, *p; ASSERT(!((virt_offset >> PAGE_SHIFT) % (16 * LPAE_ENTRIES))); ASSERT(!(base_mfn % (16 * LPAE_ENTRIES))); ASSERT(!(nr_mfns % (16 * LPAE_ENTRIES))); count = nr_mfns / LPAE_ENTRIES; p = second + second_linear_offset(virt_offset); pte = mfn_to_xen_entry(base_mfn); pte.pt.contig = 1; /* These maps are in 16-entry contiguous chunks. */ for ( i = 0; i < count; i++ ) { write_pte(p + i, pte); pte.pt.base += 1 << LPAE_SHIFT; } flush_xen_data_tlb(); } #ifdef CONFIG_ARM_32 /* Set up the xenheap: up to 1GB of contiguous, always-mapped memory. */ void __init setup_xenheap_mappings(unsigned long base_mfn, unsigned long nr_mfns) { create_32mb_mappings(xen_second, XENHEAP_VIRT_START, base_mfn, nr_mfns); /* Record where the xenheap is, for translation routines. */ xenheap_virt_end = XENHEAP_VIRT_START + nr_mfns * PAGE_SIZE; xenheap_mfn_start = base_mfn; xenheap_mfn_end = base_mfn + nr_mfns; } #else /* CONFIG_ARM_64 */ void __init setup_xenheap_mappings(unsigned long base_mfn, unsigned long nr_mfns) { lpae_t *first, pte; unsigned long offset, end_mfn; vaddr_t vaddr; /* First call sets the xenheap physical offset. */ if ( xenheap_mfn_start == ~0UL ) xenheap_mfn_start = base_mfn; if ( base_mfn < xenheap_mfn_start ) early_panic("cannot add xenheap mapping at %lx below heap start %lx", base_mfn, xenheap_mfn_start); end_mfn = base_mfn + nr_mfns; /* Align to previous 1GB boundary */ base_mfn &= ~((FIRST_SIZE>>PAGE_SHIFT)-1); offset = base_mfn - xenheap_mfn_start; vaddr = DIRECTMAP_VIRT_START + offset*PAGE_SIZE; while ( base_mfn < end_mfn ) { int slot = zeroeth_table_offset(vaddr); lpae_t *p = &xen_pgtable[slot]; if ( p->pt.valid ) { /* mfn_to_virt is not valid on the 1st 1st mfn, since it * is not within the xenheap. */ first = slot == xenheap_first_first_slot ? xenheap_first_first : mfn_to_virt(p->pt.base); } else if ( xenheap_first_first_slot == -1) { /* Use xenheap_first_first to bootstrap the mappings */ first = xenheap_first_first; pte = pte_of_xenaddr((vaddr_t)xenheap_first_first); pte.pt.table = 1; write_pte(p, pte); xenheap_first_first_slot = slot; } else { unsigned long first_mfn = alloc_boot_pages(1, 1); pte = mfn_to_xen_entry(first_mfn); pte.pt.table = 1; write_pte(p, pte); first = mfn_to_virt(first_mfn); } pte = mfn_to_xen_entry(base_mfn); /* TODO: Set pte.pt.contig when appropriate. 
*/ write_pte(&first[first_table_offset(vaddr)], pte); base_mfn += FIRST_SIZE>>PAGE_SHIFT; vaddr += FIRST_SIZE; } flush_xen_data_tlb(); } #endif /* Map a frame table to cover physical addresses ps through pe */ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe) { unsigned long nr_pages = (pe - ps) >> PAGE_SHIFT; unsigned long frametable_size = nr_pages * sizeof(struct page_info); unsigned long base_mfn; #ifdef CONFIG_ARM_64 lpae_t *second, pte; unsigned long nr_second, second_base; int i; #endif frametable_base_mfn = ps >> PAGE_SHIFT; /* Round up to 32M boundary */ frametable_size = (frametable_size + 0x1ffffff) & ~0x1ffffff; base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12)); #ifdef CONFIG_ARM_64 nr_second = frametable_size >> SECOND_SHIFT; second_base = alloc_boot_pages(nr_second, 1); second = mfn_to_virt(second_base); for ( i = 0; i < nr_second; i++ ) { pte = mfn_to_xen_entry(second_base + i); pte.pt.table = 1; write_pte(&xen_first[first_table_offset(FRAMETABLE_VIRT_START)+i], pte); } create_32mb_mappings(second, 0, base_mfn, frametable_size >> PAGE_SHIFT); #else create_32mb_mappings(xen_second, FRAMETABLE_VIRT_START, base_mfn, frametable_size >> PAGE_SHIFT); #endif memset(&frame_table[0], 0, nr_pages * sizeof(struct page_info)); memset(&frame_table[nr_pages], -1, frametable_size - (nr_pages * sizeof(struct page_info))); frametable_virt_end = FRAMETABLE_VIRT_START + (nr_pages * sizeof(struct page_info)); } void *__init arch_vmap_virt_end(void) { return (void *)VMAP_VIRT_END; } /* * This function should only be used to remap device address ranges * TODO: add a check to verify this assumption */ void *ioremap_attr(paddr_t pa, size_t len, unsigned int attributes) { unsigned long pfn = PFN_DOWN(pa); unsigned int offs = pa & (PAGE_SIZE - 1); unsigned int nr = PFN_UP(offs + len); void *ptr = __vmap(&pfn, nr, 1, 1, attributes); if ( ptr == NULL ) return NULL; return ptr + offs; } void *ioremap(paddr_t pa, size_t len) { return ioremap_attr(pa, len, PAGE_HYPERVISOR_NOCACHE); } static int create_xen_table(lpae_t *entry) { void *p; lpae_t pte; p = alloc_xenheap_page(); if ( p == NULL ) return -ENOMEM; clear_page(p); pte = mfn_to_xen_entry(virt_to_mfn(p)); pte.pt.table = 1; write_pte(entry, pte); return 0; } enum xenmap_operation { INSERT, REMOVE }; static int create_xen_entries(enum xenmap_operation op, unsigned long virt, unsigned long mfn, unsigned long nr_mfns, unsigned int ai) { int rc; unsigned long addr = virt, addr_end = addr + nr_mfns * PAGE_SIZE; lpae_t pte; lpae_t *third = NULL; for(; addr < addr_end; addr += PAGE_SIZE, mfn++) { if ( !xen_second[second_linear_offset(addr)].pt.valid || !xen_second[second_linear_offset(addr)].pt.table ) { rc = create_xen_table(&xen_second[second_linear_offset(addr)]); if ( rc < 0 ) { printk("create_xen_entries: L2 failed\n"); goto out; } } BUG_ON(!xen_second[second_linear_offset(addr)].pt.valid); third = __va(pfn_to_paddr(xen_second[second_linear_offset(addr)].pt.base)); switch ( op ) { case INSERT: if ( third[third_table_offset(addr)].pt.valid ) { printk("create_xen_entries: trying to replace an existing mapping addr=%lx mfn=%lx\n", addr, mfn); return -EINVAL; } pte = mfn_to_xen_entry(mfn); pte.pt.table = 1; pte.pt.ai = ai; write_pte(&third[third_table_offset(addr)], pte); break; case REMOVE: if ( !third[third_table_offset(addr)].pt.valid ) { printk("create_xen_entries: trying to remove a non-existing mapping addr=%lx\n", addr); return -EINVAL; } pte.bits = 0; write_pte(&third[third_table_offset(addr)], pte); break; default: 
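        /* op is always INSERT or REMOVE, so this case is unreachable. */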
BUG(); } } flush_xen_data_tlb_range_va(virt, PAGE_SIZE * nr_mfns); rc = 0; out: return rc; } int map_pages_to_xen(unsigned long virt, unsigned long mfn, unsigned long nr_mfns, unsigned int flags) { return create_xen_entries(INSERT, virt, mfn, nr_mfns, flags); } void destroy_xen_mappings(unsigned long v, unsigned long e) { create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0); } enum mg { mg_clear, mg_ro, mg_rw, mg_rx }; static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg) { lpae_t pte; int i; ASSERT(is_kernel(p) && is_kernel(p + l)); /* Can only guard in page granularity */ ASSERT(!((unsigned long) p & ~PAGE_MASK)); ASSERT(!(l & ~PAGE_MASK)); for ( i = (p - _start) / PAGE_SIZE; i < (p + l - _start) / PAGE_SIZE; i++ ) { pte = xen_xenmap[i]; switch ( mg ) { case mg_clear: pte.pt.valid = 0; break; case mg_ro: pte.pt.valid = 1; pte.pt.pxn = 1; pte.pt.xn = 1; pte.pt.ro = 1; break; case mg_rw: pte.pt.valid = 1; pte.pt.pxn = 1; pte.pt.xn = 1; pte.pt.ro = 0; break; case mg_rx: pte.pt.valid = 1; pte.pt.pxn = 0; pte.pt.xn = 0; pte.pt.ro = 1; break; } write_pte(xen_xenmap + i, pte); } flush_xen_text_tlb(); } /* Release all __init and __initdata ranges to be reused */ void free_init_memory(void) { paddr_t pa = virt_to_maddr(__init_begin); unsigned long len = __init_end - __init_begin; set_pte_flags_on_range(__init_begin, len, mg_rw); memset(__init_begin, 0xcc, len); set_pte_flags_on_range(__init_begin, len, mg_clear); init_domheap_pages(pa, pa + len); printk("Freed %ldkB init memory.\n", (long)(__init_end-__init_begin)>>10); } void arch_dump_shared_mem_info(void) { } int donate_page(struct domain *d, struct page_info *page, unsigned int memflags) { ASSERT(0); return -ENOSYS; } int steal_page( struct domain *d, struct page_info *page, unsigned int memflags) { return -1; } int page_is_ram_type(unsigned long mfn, unsigned long mem_type) { ASSERT(0); return 0; } unsigned long domain_get_maximum_gpfn(struct domain *d) { return -ENOSYS; } void share_xen_page_with_guest(struct page_info *page, struct domain *d, int readonly) { if ( page_get_owner(page) == d ) return; spin_lock(&d->page_alloc_lock); /* The incremented type count pins as writable or read-only. */ page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page); page->u.inuse.type_info |= PGT_validated | 1; page_set_owner(page, d); wmb(); /* install valid domain ptr before updating refcnt. */ ASSERT((page->count_info & ~PGC_xen_heap) == 0); /* Only add to the allocation list if the domain isn't dying. 
*/ if ( !d->is_dying ) { page->count_info |= PGC_allocated | 1; if ( unlikely(d->xenheap_pages++ == 0) ) get_knownalive_domain(d); page_list_add_tail(page, &d->xenpage_list); } spin_unlock(&d->page_alloc_lock); } void share_xen_page_with_privileged_guests( struct page_info *page, int readonly) { share_xen_page_with_guest(page, dom_xen, readonly); } int xenmem_add_to_physmap_one( struct domain *d, unsigned int space, domid_t foreign_domid, unsigned long idx, xen_pfn_t gpfn) { unsigned long mfn = 0; int rc; p2m_type_t t; switch ( space ) { case XENMAPSPACE_grant_table: spin_lock(&d->grant_table->lock); if ( d->grant_table->gt_version == 0 ) d->grant_table->gt_version = 1; if ( d->grant_table->gt_version == 2 && (idx & XENMAPIDX_grant_table_status) ) { idx &= ~XENMAPIDX_grant_table_status; if ( idx < nr_status_frames(d->grant_table) ) mfn = virt_to_mfn(d->grant_table->status[idx]); else return -EINVAL; } else { if ( (idx >= nr_grant_frames(d->grant_table)) && (idx < max_nr_grant_frames) ) gnttab_grow_table(d, idx + 1); if ( idx < nr_grant_frames(d->grant_table) ) mfn = virt_to_mfn(d->grant_table->shared_raw[idx]); else return -EINVAL; } d->arch.grant_table_gpfn[idx] = gpfn; t = p2m_ram_rw; spin_unlock(&d->grant_table->lock); break; case XENMAPSPACE_shared_info: if ( idx != 0 ) return -EINVAL; mfn = virt_to_mfn(d->shared_info); t = p2m_ram_rw; break; case XENMAPSPACE_gmfn_foreign: { struct domain *od; struct page_info *page; p2m_type_t p2mt; od = rcu_lock_domain_by_any_id(foreign_domid); if ( od == NULL ) return -ESRCH; if ( od == d ) { rcu_unlock_domain(od); return -EINVAL; } rc = xsm_map_gmfn_foreign(XSM_TARGET, d, od); if ( rc ) { rcu_unlock_domain(od); return rc; } /* Take reference to the foreign domain page. * Reference will be released in XENMEM_remove_from_physmap */ page = get_page_from_gfn(od, idx, &p2mt, P2M_ALLOC); if ( !page ) { dump_p2m_lookup(od, pfn_to_paddr(idx)); rcu_unlock_domain(od); return -EINVAL; } if ( !p2m_is_ram(p2mt) ) { put_page(page); rcu_unlock_domain(od); return -EINVAL; } mfn = page_to_mfn(page); t = p2m_map_foreign; rcu_unlock_domain(od); break; } default: return -ENOSYS; } /* Map at new location. */ rc = guest_physmap_add_entry(d, gpfn, mfn, 0, t); return rc; } long arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg) { switch ( op ) { /* XXX: memsharing not working yet */ case XENMEM_get_sharing_shared_pages: case XENMEM_get_sharing_freed_pages: return 0; default: return -ENOSYS; } return 0; } struct domain *page_get_owner_and_reference(struct page_info *page) { unsigned long x, y = page->count_info; do { x = y; /* * Count == 0: Page is not allocated, so we cannot take a reference. * Count == -1: Reference count would wrap, which is invalid. */ if ( unlikely(((x + 1) & PGC_count_mask) <= 1) ) return NULL; } while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x ); return page_get_owner(page); } void put_page(struct page_info *page) { unsigned long nx, x, y = page->count_info; do { ASSERT((y & PGC_count_mask) != 0); x = y; nx = x - 1; } while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) ); if ( unlikely((nx & PGC_count_mask) == 0) ) { free_domheap_page(page); } } int get_page(struct page_info *page, struct domain *domain) { struct domain *owner = page_get_owner_and_reference(page); if ( likely(owner == domain) ) return 1; if ( owner != NULL ) put_page(page); return 0; } /* Common code requires get_page_type and put_page_type. * We don't care about typecounts so we just do the minimum to make it * happy. 
*/ int get_page_type(struct page_info *page, unsigned long type) { return 1; } void put_page_type(struct page_info *page) { return; } void gnttab_clear_flag(unsigned long nr, uint16_t *addr) { /* * Note that this cannot be clear_bit(), as the access must be * confined to the specified 2 bytes. */ uint16_t mask = ~(1 << nr), old; do { old = *addr; } while (cmpxchg(addr, old, old & mask) != old); } void gnttab_mark_dirty(struct domain *d, unsigned long l) { /* XXX: mark dirty */ static int warning; if (!warning) { gdprintk(XENLOG_WARNING, "gnttab_mark_dirty not implemented yet\n"); warning = 1; } } int create_grant_host_mapping(unsigned long addr, unsigned long frame, unsigned int flags, unsigned int cache_flags) { int rc; p2m_type_t t = p2m_grant_map_rw; if ( cache_flags || (flags & ~GNTMAP_readonly) != GNTMAP_host_map ) return GNTST_general_error; if ( flags & GNTMAP_readonly ) t = p2m_grant_map_ro; rc = guest_physmap_add_entry(current->domain, addr >> PAGE_SHIFT, frame, 0, t); if ( rc ) return GNTST_general_error; else return GNTST_okay; } int replace_grant_host_mapping(unsigned long addr, unsigned long mfn, unsigned long new_addr, unsigned int flags) { unsigned long gfn = (unsigned long)(addr >> PAGE_SHIFT); struct domain *d = current->domain; if ( new_addr != 0 || (flags & GNTMAP_contains_pte) ) return GNTST_general_error; guest_physmap_remove_page(d, gfn, mfn, 0); return GNTST_okay; } int is_iomem_page(unsigned long mfn) { if ( !mfn_valid(mfn) ) return 1; return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/smpboot.c0000664000175000017500000002550312307313555014565 0ustar smbsmb/* * xen/arch/arm/smpboot.c * * Dummy smpboot support * * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include cpumask_t cpu_online_map; cpumask_t cpu_present_map; cpumask_t cpu_possible_map; struct cpuinfo_arm cpu_data[NR_CPUS]; /* CPU logical map: map xen cpuid to an MPIDR */ u32 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = MPIDR_INVALID }; /* Fake one node for now. See also include/asm-arm/numa.h */ nodemask_t __read_mostly node_online_map = { { [0] = 1UL } }; /* Xen stack for bringing up the first CPU. */ static unsigned char __initdata cpu0_boot_stack[STACK_SIZE] __attribute__((__aligned__(STACK_SIZE))); /* Initial boot cpu data */ struct init_info __initdata init_data = { .stack = cpu0_boot_stack, }; /* Shared state for coordinating CPU bringup */ unsigned long smp_up_cpu = MPIDR_INVALID; /* Shared state for coordinating CPU teardown */ static bool_t cpu_is_dead = 0; /* ID of the PCPU we're running on */ DEFINE_PER_CPU(unsigned int, cpu_id); /* XXX these seem awfully x86ish... 
*/ /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_mask); /* representing HT and core siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask); static void setup_cpu_sibling_map(int cpu) { if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) || !zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) ) panic("No memory for CPU sibling/core maps"); /* A CPU is a sibling with itself and is always on its own core. */ cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu)); cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, cpu)); } void __init smp_clear_cpu_maps (void) { cpumask_clear(&cpu_possible_map); cpumask_clear(&cpu_online_map); cpumask_set_cpu(0, &cpu_online_map); cpumask_set_cpu(0, &cpu_possible_map); cpu_logical_map(0) = READ_SYSREG(MPIDR_EL1) & MPIDR_HWID_MASK; } /* Parse the device tree and build the logical map array containing * MPIDR values related to logical cpus * Code base on Linux arch/arm/kernel/devtree.c */ void __init smp_init_cpus(void) { register_t mpidr; struct dt_device_node *cpus = dt_find_node_by_path("/cpus"); struct dt_device_node *cpu; unsigned int i, j; unsigned int cpuidx = 1; static u32 tmp_map[NR_CPUS] __initdata = { [0 ... NR_CPUS - 1] = MPIDR_INVALID }; bool_t bootcpu_valid = 0; int rc; /* scan the DTB for a PSCI node and set a global variable */ psci_init(); if ( (rc = arch_smp_init()) < 0 ) { printk(XENLOG_WARNING "SMP init failed (%d)\n" "Using only 1 CPU\n", rc); return; } mpidr = boot_cpu_data.mpidr.bits & MPIDR_HWID_MASK; if ( !cpus ) { printk(XENLOG_WARNING "WARNING: Can't find /cpus in the device tree.\n" "Using only 1 CPU\n"); return; } dt_for_each_child_node( cpus, cpu ) { const __be32 *prop; u64 addr; u32 reg_len, hwid; if ( !dt_device_type_is_equal(cpu, "cpu") ) continue; if ( dt_n_size_cells(cpu) != 0 ) printk(XENLOG_WARNING "cpu node `%s`: #size-cells %d\n", dt_node_full_name(cpu), dt_n_size_cells(cpu)); prop = dt_get_property(cpu, "reg", ®_len); if ( !prop ) { printk(XENLOG_WARNING "cpu node `%s`: has no reg property\n", dt_node_full_name(cpu)); continue; } if ( reg_len < dt_cells_to_size(dt_n_addr_cells(cpu)) ) { printk(XENLOG_WARNING "cpu node `%s`: reg property too short\n", dt_node_full_name(cpu)); continue; } addr = dt_read_number(prop, dt_n_addr_cells(cpu)); hwid = addr; if ( hwid != addr ) { printk(XENLOG_WARNING "cpu node `%s`: hwid overflow %"PRIx64"\n", dt_node_full_name(cpu), addr); continue; } /* * 8 MSBs must be set to 0 in the DT since the reg property * defines the MPIDR[23:0] */ if ( hwid & ~MPIDR_HWID_MASK ) { printk(XENLOG_WARNING "cpu node `%s`: invalid hwid value (0x%x)\n", dt_node_full_name(cpu), hwid); continue; } /* * Duplicate MPIDRs are a recipe for disaster. Scan all initialized * entries and check for duplicates. If any found just skip the node. * temp values values are initialized to MPIDR_INVALID to avoid * matching valid MPIDR[23:0] values. */ for ( j = 0; j < cpuidx; j++ ) { if ( tmp_map[j] == hwid ) { printk(XENLOG_WARNING "cpu node `%s`: duplicate /cpu reg properties %"PRIx32" in the DT\n", dt_node_full_name(cpu), hwid); break; } } if ( j != cpuidx ) continue; /* * Build a stashed array of MPIDR values. Numbering scheme requires * that if detected the boot CPU must be assigned logical id 0. Other * CPUs get sequential indexes starting from 1. If a CPU node * with a reg property matching the boot CPU MPIDR is detected, * this is recorded and so that the logical map build from DT is * validated and can be used to set the map. 
*/ if ( hwid == mpidr ) { i = 0; bootcpu_valid = 1; } else i = cpuidx++; if ( cpuidx > NR_CPUS ) { printk(XENLOG_WARNING "DT /cpu %u node greater than max cores %u, capping them\n", cpuidx, NR_CPUS); cpuidx = NR_CPUS; break; } if ( (rc = arch_cpu_init(i, cpu)) < 0 ) { printk("cpu%d init failed (hwid %x): %d\n", i, hwid, rc); tmp_map[i] = MPIDR_INVALID; } else tmp_map[i] = hwid; } if ( !bootcpu_valid ) { printk(XENLOG_WARNING "DT missing boot CPU MPIDR[23:0]\n" "Using only 1 CPU\n"); return; } for ( i = 0; i < cpuidx; i++ ) { if ( tmp_map[i] == MPIDR_INVALID ) continue; cpumask_set_cpu(i, &cpu_possible_map); cpu_logical_map(i) = tmp_map[i]; } } int __init smp_get_max_cpus (void) { int i, max_cpus = 0; for ( i = 0; i < nr_cpu_ids; i++ ) if ( cpu_possible(i) ) max_cpus++; return max_cpus; } void __init smp_prepare_cpus (unsigned int max_cpus) { cpumask_copy(&cpu_present_map, &cpu_possible_map); setup_cpu_sibling_map(0); } /* Boot the current CPU */ void __cpuinit start_secondary(unsigned long boot_phys_offset, unsigned long fdt_paddr, unsigned long hwid) { unsigned int cpuid = init_data.cpuid; memset(get_cpu_info(), 0, sizeof (struct cpu_info)); set_processor_id(cpuid); current_cpu_data = boot_cpu_data; identify_cpu(¤t_cpu_data); init_traps(); setup_virt_paging(); mmu_init_secondary_cpu(); gic_init_secondary_cpu(); init_secondary_IRQ(); gic_route_ppis(); init_maintenance_interrupt(); init_timer_interrupt(); set_current(idle_vcpu[cpuid]); setup_cpu_sibling_map(cpuid); /* Run local notifiers */ notify_cpu_starting(cpuid); wmb(); /* Now report this CPU is up */ smp_up_cpu = MPIDR_INVALID; cpumask_set_cpu(cpuid, &cpu_online_map); wmb(); local_irq_enable(); local_abort_enable(); printk(XENLOG_DEBUG "CPU %u booted.\n", smp_processor_id()); startup_cpu_idle_loop(); } /* Shut down the current CPU */ void __cpu_disable(void) { unsigned int cpu = get_processor_id(); local_irq_disable(); gic_disable_cpu(); /* Allow any queued timer interrupts to get serviced */ local_irq_enable(); mdelay(1); local_irq_disable(); /* It's now safe to remove this processor from the online map */ cpumask_clear_cpu(cpu, &cpu_online_map); if ( cpu_disable_scheduler(cpu) ) BUG(); mb(); /* Return to caller; eventually the IPI mechanism will unwind and the * scheduler will drop to the idle loop, which will call stop_cpu(). */ } void stop_cpu(void) { local_irq_disable(); cpu_is_dead = 1; /* Make sure the write happens before we sleep forever */ dsb(); isb(); while ( 1 ) wfi(); } int __init cpu_up_send_sgi(int cpu) { /* We don't know the GIC ID of the CPU until it has woken up, so just * signal everyone and rely on our own smp_up_cpu gate to ensure only * the one we want gets through. */ send_SGI_allbutself(GIC_SGI_EVENT_CHECK); return 0; } /* Bring up a remote CPU */ int __cpu_up(unsigned int cpu) { int rc; printk("Bringing up CPU%d\n", cpu); rc = init_secondary_pagetables(cpu); if ( rc < 0 ) return rc; console_start_sync(); /* Secondary may use early_printk */ /* Tell the remote CPU which stack to boot on. 
*/ init_data.stack = idle_vcpu[cpu]->arch.stack; /* Tell the remote CPU what is it's logical CPU ID */ init_data.cpuid = cpu; /* Open the gate for this CPU */ smp_up_cpu = cpu_logical_map(cpu); clean_xen_dcache(smp_up_cpu); rc = arch_cpu_up(cpu); console_end_sync(); if ( rc < 0 ) { printk("Failed to bring up CPU%d\n", cpu); return rc; } while ( !cpu_online(cpu) ) { cpu_relax(); process_pending_softirqs(); } return 0; } /* Wait for a remote CPU to die */ void __cpu_die(unsigned int cpu) { unsigned int i = 0; while ( !cpu_is_dead ) { mdelay(100); cpu_relax(); process_pending_softirqs(); if ( (++i % 10) == 0 ) printk(KERN_ERR "CPU %u still not dead...\n", cpu); mb(); } cpu_is_dead = 0; mb(); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/early_printk.c0000664000175000017500000000260712307313555015605 0ustar smbsmb/* * printk() for use before the final page tables are setup. * * Copyright (C) 2012 Citrix Systems, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include #include #include #include #include #include void early_putch(char c); void early_flush(void); /* Early printk buffer */ static char __initdata buf[512]; static void __init early_puts(const char *s) { while (*s != '\0') { if (*s == '\n') early_putch('\r'); early_putch(*s); s++; } } static void __init early_vprintk(const char *fmt, va_list args) { vsnprintf(buf, sizeof(buf), fmt, args); early_puts(buf); /* * Wait the UART has finished to transfer all characters before * to continue. This will avoid lost characters if Xen abort. */ early_flush(); } void __init early_printk(const char *fmt, ...) { va_list args; va_start(args, fmt); early_vprintk(fmt, args); va_end(args); } void __attribute__((noreturn)) __init early_panic(const char *fmt, ...) { va_list args; va_start(args, fmt); early_vprintk(fmt, args); va_end(args); early_printk("\n\nEarly Panic: Stopping\n"); while(1); } xen-4.4.0/xen/arch/arm/traps.c0000664000175000017500000013562112307313555014236 0ustar smbsmb/* * xen/arch/arm/traps.c * * ARM Trap handlers * * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "decode.h" #include "io.h" #include "vtimer.h" #include /* The base of the stack must always be double-word aligned, which means * that both the kernel half of struct cpu_user_regs (which is pushed in * entry.S) and struct cpu_info (which lives at the bottom of a Xen * stack) must be doubleword-aligned in size. 
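 * As with the VA layout checks in mm.c these are all BUILD_BUG_ON()s,
 * so they cost nothing at run time.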
*/ static inline void check_stack_alignment_constraints(void) { #ifdef CONFIG_ARM_64 BUILD_BUG_ON((sizeof (struct cpu_user_regs)) & 0xf); BUILD_BUG_ON((offsetof(struct cpu_user_regs, spsr_el1)) & 0xf); BUILD_BUG_ON((offsetof(struct cpu_user_regs, lr)) & 0xf); BUILD_BUG_ON((sizeof (struct cpu_info)) & 0xf); #else BUILD_BUG_ON((sizeof (struct cpu_user_regs)) & 0x7); BUILD_BUG_ON((offsetof(struct cpu_user_regs, sp_usr)) & 0x7); BUILD_BUG_ON((sizeof (struct cpu_info)) & 0x7); #endif } #ifdef CONFIG_ARM_32 static int debug_stack_lines = 20; #define stack_words_per_line 8 #else static int debug_stack_lines = 40; #define stack_words_per_line 4 #endif integer_param("debug_stack_lines", debug_stack_lines); void __cpuinit init_traps(void) { /* Setup Hyp vector base */ WRITE_SYSREG((vaddr_t)hyp_traps_vector, VBAR_EL2); /* Setup hypervisor traps */ WRITE_SYSREG(HCR_PTW|HCR_BSU_OUTER|HCR_AMO|HCR_IMO|HCR_VM|HCR_TWI|HCR_TSC| HCR_TAC, HCR_EL2); isb(); } asmlinkage void __div0(void) { printk("Division by zero in hypervisor.\n"); BUG(); } /* XXX could/should be common code */ static void print_xen_info(void) { char taint_str[TAINT_STRING_MAX_LEN]; printk("----[ Xen-%d.%d%s %s debug=%c %s ]----\n", xen_major_version(), xen_minor_version(), xen_extra_version(), #ifdef CONFIG_ARM_32 "arm32", #else "arm64", #endif debug_build() ? 'y' : 'n', print_tainted(taint_str)); } register_t *select_user_reg(struct cpu_user_regs *regs, int reg) { BUG_ON( !guest_mode(regs) ); #ifdef CONFIG_ARM_32 /* * We rely heavily on the layout of cpu_user_regs to avoid having * to handle all of the registers individually. Use BUILD_BUG_ON to * ensure that things which expect are contiguous actually are. */ #define REGOFFS(R) offsetof(struct cpu_user_regs, R) switch ( reg ) { case 0 ... 7: /* Unbanked registers */ BUILD_BUG_ON(REGOFFS(r0) + 7*sizeof(register_t) != REGOFFS(r7)); return ®s->r0 + reg; case 8 ... 12: /* Register banked in FIQ mode */ BUILD_BUG_ON(REGOFFS(r8_fiq) + 4*sizeof(register_t) != REGOFFS(r12_fiq)); if ( fiq_mode(regs) ) return ®s->r8_fiq + reg - 8; else return ®s->r8 + reg - 8; case 13 ... 14: /* Banked SP + LR registers */ BUILD_BUG_ON(REGOFFS(sp_fiq) + 1*sizeof(register_t) != REGOFFS(lr_fiq)); BUILD_BUG_ON(REGOFFS(sp_irq) + 1*sizeof(register_t) != REGOFFS(lr_irq)); BUILD_BUG_ON(REGOFFS(sp_svc) + 1*sizeof(register_t) != REGOFFS(lr_svc)); BUILD_BUG_ON(REGOFFS(sp_abt) + 1*sizeof(register_t) != REGOFFS(lr_abt)); BUILD_BUG_ON(REGOFFS(sp_und) + 1*sizeof(register_t) != REGOFFS(lr_und)); switch ( regs->cpsr & PSR_MODE_MASK ) { case PSR_MODE_USR: case PSR_MODE_SYS: /* Sys regs are the usr regs */ if ( reg == 13 ) return ®s->sp_usr; else /* lr_usr == lr in a user frame */ return ®s->lr; case PSR_MODE_FIQ: return ®s->sp_fiq + reg - 13; case PSR_MODE_IRQ: return ®s->sp_irq + reg - 13; case PSR_MODE_SVC: return ®s->sp_svc + reg - 13; case PSR_MODE_ABT: return ®s->sp_abt + reg - 13; case PSR_MODE_UND: return ®s->sp_und + reg - 13; case PSR_MODE_MON: case PSR_MODE_HYP: default: BUG(); } case 15: /* PC */ return ®s->pc; default: BUG(); } #undef REGOFFS #else /* In 64 bit the syndrome register contains the AArch64 register * number even if the trap was from AArch32 mode. Except that * AArch32 R15 (PC) is encoded as 0b11111. */ if ( reg == 0x1f /* && is aarch32 guest */) return ®s->pc; return ®s->x0 + reg; #endif } static const char *decode_fsc(uint32_t fsc, int *level) { const char *msg = NULL; switch ( fsc & 0x3f ) { case FSC_FLT_TRANS ... 
FSC_FLT_TRANS + 3: msg = "Translation fault"; *level = fsc & FSC_LL_MASK; break; case FSC_FLT_ACCESS ... FSC_FLT_ACCESS + 3: msg = "Access fault"; *level = fsc & FSC_LL_MASK; break; case FSC_FLT_PERM ... FSC_FLT_PERM + 3: msg = "Permission fault"; *level = fsc & FSC_LL_MASK; break; case FSC_SEA: msg = "Synchronous External Abort"; break; case FSC_SPE: msg = "Memory Access Synchronous Parity Error"; break; case FSC_APE: msg = "Memory Access Asynchronous Parity Error"; break; case FSC_SEATT ... FSC_SEATT + 3: msg = "Sync. Ext. Abort Translation Table"; *level = fsc & FSC_LL_MASK; break; case FSC_SPETT ... FSC_SPETT + 3: msg = "Sync. Parity. Error Translation Table"; *level = fsc & FSC_LL_MASK; break; case FSC_AF: msg = "Alignment Fault"; break; case FSC_DE: msg = "Debug Event"; break; case FSC_LKD: msg = "Implementation Fault: Lockdown Abort"; break; case FSC_CPR: msg = "Implementation Fault: Coprocossor Abort"; break; default: msg = "Unknown Failure"; break; } return msg; } static const char *fsc_level_str(int level) { switch ( level ) { case -1: return ""; case 1: return " at level 1"; case 2: return " at level 2"; case 3: return " at level 3"; default: return " (level invalid)"; } } void panic_PAR(uint64_t par) { const char *msg; int level = -1; int stage = par & PAR_STAGE2 ? 2 : 1; int second_in_first = !!(par & PAR_STAGE21); msg = decode_fsc( (par&PAR_FSC_MASK) >> PAR_FSC_SHIFT, &level); printk("PAR: %016"PRIx64": %s stage %d%s%s\n", par, msg, stage, second_in_first ? " during second stage lookup" : "", fsc_level_str(level)); panic("Error during Hypervisor-to-physical address translation"); } static void cpsr_switch_mode(struct cpu_user_regs *regs, int mode) { uint32_t sctlr = READ_SYSREG32(SCTLR_EL1); regs->cpsr &= ~(PSR_MODE_MASK|PSR_IT_MASK|PSR_JAZELLE|PSR_BIG_ENDIAN|PSR_THUMB); regs->cpsr |= mode; regs->cpsr |= PSR_IRQ_MASK; if ( mode == PSR_MODE_ABT ) regs->cpsr |= PSR_ABT_MASK; if ( sctlr & SCTLR_TE ) regs->cpsr |= PSR_THUMB; if ( sctlr & SCTLR_EE ) regs->cpsr |= PSR_BIG_ENDIAN; } static vaddr_t exception_handler(vaddr_t offset) { uint32_t sctlr = READ_SYSREG32(SCTLR_EL1); if (sctlr & SCTLR_V) return 0xffff0000 + offset; else /* always have security exceptions */ return READ_SYSREG(VBAR_EL1) + offset; } /* Injects an Undefined Instruction exception into the current vcpu, * PC is the exact address of the faulting instruction (without * pipeline adjustments). See TakeUndefInstrException pseudocode in * ARM ARM. */ static void inject_undef32_exception(struct cpu_user_regs *regs) { uint32_t spsr = regs->cpsr; int is_thumb = (regs->cpsr & PSR_THUMB); /* Saved PC points to the instruction past the faulting instruction. */ uint32_t return_offset = is_thumb ? 2 : 4; BUG_ON( !is_pv32_domain(current->domain) ); /* Update processor mode */ cpsr_switch_mode(regs, PSR_MODE_UND); /* Update banked registers */ regs->spsr_und = spsr; regs->lr_und = regs->pc32 + return_offset; /* Branch to exception vector */ regs->pc32 = exception_handler(VECTOR32_UND); } /* Injects an Abort exception into the current vcpu, PC is the exact * address of the faulting instruction (without pipeline * adjustments). See TakePrefetchAbortException and * TakeDataAbortException pseudocode in ARM ARM. */ static void inject_abt32_exception(struct cpu_user_regs *regs, int prefetch, register_t addr) { uint32_t spsr = regs->cpsr; int is_thumb = (regs->cpsr & PSR_THUMB); /* Saved PC points to the instruction past the faulting instruction. */ uint32_t return_offset = is_thumb ? 
4 : 0; register_t fsr; BUG_ON( !is_pv32_domain(current->domain) ); cpsr_switch_mode(regs, PSR_MODE_ABT); /* Update banked registers */ regs->spsr_abt = spsr; regs->lr_abt = regs->pc32 + return_offset; regs->pc32 = exception_handler(prefetch ? VECTOR32_PABT : VECTOR32_DABT); /* Inject a debug fault, best we can do right now */ if ( READ_SYSREG(TCR_EL1) & TTBCR_EAE ) fsr = FSR_LPAE | FSRL_STATUS_DEBUG; else fsr = FSRS_FS_DEBUG; if ( prefetch ) { /* Set IFAR and IFSR */ #ifdef CONFIG_ARM_32 WRITE_SYSREG(addr, IFAR); WRITE_SYSREG(fsr, IFSR); #else /* FAR_EL1[63:32] is AArch32 register IFAR */ register_t far = READ_SYSREG(FAR_EL1) & 0xffffffffUL; far |= addr << 32; WRITE_SYSREG(far, FAR_EL1); WRITE_SYSREG(fsr, IFSR32_EL2); #endif } else { #ifdef CONFIG_ARM_32 /* Set DFAR and DFSR */ WRITE_SYSREG(addr, DFAR); WRITE_SYSREG(fsr, DFSR); #else /* FAR_EL1[31:0] is AArch32 register DFAR */ register_t far = READ_SYSREG(FAR_EL1) & ~0xffffffffUL; far |= addr; WRITE_SYSREG(far, FAR_EL1); /* ESR_EL1 is AArch32 register DFSR */ WRITE_SYSREG(fsr, ESR_EL1); #endif } } static void inject_dabt32_exception(struct cpu_user_regs *regs, register_t addr) { inject_abt32_exception(regs, 0, addr); } static void inject_pabt32_exception(struct cpu_user_regs *regs, register_t addr) { inject_abt32_exception(regs, 1, addr); } #ifdef CONFIG_ARM_64 /* Inject an undefined exception into a 64 bit guest */ static void inject_undef64_exception(struct cpu_user_regs *regs, int instr_len) { union hsr esr = { .iss = 0, .len = instr_len, .ec = HSR_EC_UNKNOWN, }; BUG_ON( is_pv32_domain(current->domain) ); regs->spsr_el1 = regs->cpsr; regs->elr_el1 = regs->pc; regs->cpsr = PSR_MODE_EL1h | PSR_ABT_MASK | PSR_FIQ_MASK | \ PSR_IRQ_MASK | PSR_DBG_MASK; regs->pc = READ_SYSREG(VBAR_EL1) + VECTOR64_CURRENT_SPx_SYNC; WRITE_SYSREG32(esr.bits, ESR_EL1); } /* Inject an abort exception into a 64 bit guest */ static void inject_abt64_exception(struct cpu_user_regs *regs, int prefetch, register_t addr, int instr_len) { union hsr esr = { .iss = 0, .len = instr_len, }; /* * Trap may have been taken from EL0, which might be in AArch32 * mode (PSR_MODE_BIT set), or in AArch64 mode (PSR_MODE_EL0t). * * Since we know the kernel must be 64-bit any trap from a 32-bit * mode must have been from EL0. */ if ( psr_mode_is_32bit(regs->cpsr) || psr_mode(regs->cpsr,PSR_MODE_EL0t) ) esr.ec = prefetch ? HSR_EC_INSTR_ABORT_LOWER_EL : HSR_EC_DATA_ABORT_LOWER_EL; else esr.ec = prefetch ? 
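/*
 * Minimal sketch of assembling a syndrome word before handing it to a
 * 64-bit guest (cf. inject_undef64_exception() above).  The field
 * split used here -- ISS in bits [24:0], IL in bit 25, EC in bits
 * [31:26] -- follows the ARMv8 ESR layout, but treat the helper as
 * illustrative rather than as the hypervisor's own encoder.
 */
#include <stdint.h>

static uint32_t demo_make_esr(uint32_t ec, uint32_t il, uint32_t iss)
{
    return ((ec & 0x3f) << 26) | ((il & 0x1) << 25) | (iss & 0x1ffffff);
}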
HSR_EC_INSTR_ABORT_CURR_EL : HSR_EC_DATA_ABORT_CURR_EL; BUG_ON( is_pv32_domain(current->domain) ); regs->spsr_el1 = regs->cpsr; regs->elr_el1 = regs->pc; regs->cpsr = PSR_MODE_EL1h | PSR_ABT_MASK | PSR_FIQ_MASK | \ PSR_IRQ_MASK | PSR_DBG_MASK; regs->pc = READ_SYSREG(VBAR_EL1) + VECTOR64_CURRENT_SPx_SYNC; WRITE_SYSREG(addr, FAR_EL1); WRITE_SYSREG32(esr.bits, ESR_EL1); } static void inject_dabt64_exception(struct cpu_user_regs *regs, register_t addr, int instr_len) { inject_abt64_exception(regs, 0, addr, instr_len); } static void inject_iabt64_exception(struct cpu_user_regs *regs, register_t addr, int instr_len) { inject_abt64_exception(regs, 1, addr, instr_len); } #endif static void inject_iabt_exception(struct cpu_user_regs *regs, register_t addr, int instr_len) { if ( is_pv32_domain(current->domain) ) inject_pabt32_exception(regs, addr); #ifdef CONFIG_ARM_64 else inject_iabt64_exception(regs, addr, instr_len); #endif } static void inject_dabt_exception(struct cpu_user_regs *regs, register_t addr, int instr_len) { if ( is_pv32_domain(current->domain) ) inject_dabt32_exception(regs, addr); #ifdef CONFIG_ARM_64 else inject_dabt64_exception(regs, addr, instr_len); #endif } struct reg_ctxt { /* Guest-side state */ uint32_t sctlr_el1; register_t tcr_el1; uint64_t ttbr0_el1, ttbr1_el1; #ifdef CONFIG_ARM_32 uint32_t dfsr, ifsr; uint32_t dfar, ifar; #else uint32_t esr_el1; uint64_t far; uint32_t ifsr32_el2; #endif /* Hypervisor-side state */ uint64_t vttbr_el2; }; static const char *mode_string(uint32_t cpsr) { uint32_t mode; static const char *mode_strings[] = { [PSR_MODE_USR] = "32-bit Guest USR", [PSR_MODE_FIQ] = "32-bit Guest FIQ", [PSR_MODE_IRQ] = "32-bit Guest IRQ", [PSR_MODE_SVC] = "32-bit Guest SVC", [PSR_MODE_MON] = "32-bit Monitor", [PSR_MODE_ABT] = "32-bit Guest ABT", [PSR_MODE_HYP] = "Hypervisor", [PSR_MODE_UND] = "32-bit Guest UND", [PSR_MODE_SYS] = "32-bit Guest SYS", #ifdef CONFIG_ARM_64 [PSR_MODE_EL3h] = "64-bit EL3h (Monitor, handler)", [PSR_MODE_EL3t] = "64-bit EL3t (Monitor, thread)", [PSR_MODE_EL2h] = "64-bit EL2h (Hypervisor, handler)", [PSR_MODE_EL2t] = "64-bit EL2t (Hypervisor, thread)", [PSR_MODE_EL1h] = "64-bit EL1h (Guest Kernel, handler)", [PSR_MODE_EL1t] = "64-bit EL1t (Guest Kernel, thread)", [PSR_MODE_EL0t] = "64-bit EL0t (Guest User)", #endif }; mode = cpsr & PSR_MODE_MASK; if ( mode > ARRAY_SIZE(mode_strings) ) return "Unknown"; return mode_strings[mode] ? : "Unknown"; } static void show_registers_32(struct cpu_user_regs *regs, struct reg_ctxt *ctxt, int guest_mode, const struct vcpu *v) { #ifdef CONFIG_ARM_64 BUG_ON( ! 
(regs->cpsr & PSR_MODE_BIT) ); printk("PC: %08"PRIx32"\n", regs->pc32); #else printk("PC: %08"PRIx32, regs->pc); if ( !guest_mode ) printk(" %pS", _p(regs->pc)); printk("\n"); #endif printk("CPSR: %08"PRIx32" MODE:%s\n", regs->cpsr, mode_string(regs->cpsr)); printk(" R0: %08"PRIx32" R1: %08"PRIx32" R2: %08"PRIx32" R3: %08"PRIx32"\n", regs->r0, regs->r1, regs->r2, regs->r3); printk(" R4: %08"PRIx32" R5: %08"PRIx32" R6: %08"PRIx32" R7: %08"PRIx32"\n", regs->r4, regs->r5, regs->r6, regs->r7); printk(" R8: %08"PRIx32" R9: %08"PRIx32" R10:%08"PRIx32" R11:%08"PRIx32" R12:%08"PRIx32"\n", regs->r8, regs->r9, regs->r10, #ifdef CONFIG_ARM_64 regs->r11, #else regs->fp, #endif regs->r12); if ( guest_mode ) { printk("USR: SP: %08"PRIx32" LR: %08"PRIregister"\n", regs->sp_usr, regs->lr); printk("SVC: SP: %08"PRIx32" LR: %08"PRIx32" SPSR:%08"PRIx32"\n", regs->sp_svc, regs->lr_svc, regs->spsr_svc); printk("ABT: SP: %08"PRIx32" LR: %08"PRIx32" SPSR:%08"PRIx32"\n", regs->sp_abt, regs->lr_abt, regs->spsr_abt); printk("UND: SP: %08"PRIx32" LR: %08"PRIx32" SPSR:%08"PRIx32"\n", regs->sp_und, regs->lr_und, regs->spsr_und); printk("IRQ: SP: %08"PRIx32" LR: %08"PRIx32" SPSR:%08"PRIx32"\n", regs->sp_irq, regs->lr_irq, regs->spsr_irq); printk("FIQ: SP: %08"PRIx32" LR: %08"PRIx32" SPSR:%08"PRIx32"\n", regs->sp_fiq, regs->lr_fiq, regs->spsr_fiq); printk("FIQ: R8: %08"PRIx32" R9: %08"PRIx32" R10:%08"PRIx32" R11:%08"PRIx32" R12:%08"PRIx32"\n", regs->r8_fiq, regs->r9_fiq, regs->r10_fiq, regs->r11_fiq, regs->r11_fiq); } #ifndef CONFIG_ARM_64 else { printk("HYP: SP: %08"PRIx32" LR: %08"PRIregister"\n", regs->sp, regs->lr); } #endif printk("\n"); if ( guest_mode ) { printk(" SCTLR: %08"PRIx32"\n", ctxt->sctlr_el1); printk(" TCR: %08"PRIregister"\n", ctxt->tcr_el1); printk(" TTBR0: %016"PRIx64"\n", ctxt->ttbr0_el1); printk(" TTBR1: %016"PRIx64"\n", ctxt->ttbr1_el1); printk(" IFAR: %08"PRIx32", IFSR: %08"PRIx32"\n" " DFAR: %08"PRIx32", DFSR: %08"PRIx32"\n", #ifdef CONFIG_ARM_64 (uint32_t)(ctxt->far >> 32), ctxt->ifsr32_el2, (uint32_t)(ctxt->far & 0xffffffff), ctxt->esr_el1 #else ctxt->ifar, ctxt->ifsr, ctxt->dfar, ctxt->dfsr #endif ); printk("\n"); } } #ifdef CONFIG_ARM_64 static void show_registers_64(struct cpu_user_regs *regs, struct reg_ctxt *ctxt, int guest_mode, const struct vcpu *v) { BUG_ON( (regs->cpsr & PSR_MODE_BIT) ); printk("PC: %016"PRIx64, regs->pc); if ( !guest_mode ) printk(" %pS", _p(regs->pc)); printk("\n"); printk("LR: %016"PRIx64"\n", regs->lr); if ( guest_mode ) { printk("SP_EL0: %016"PRIx64"\n", regs->sp_el0); printk("SP_EL1: %016"PRIx64"\n", regs->sp_el1); } else { printk("SP: %016"PRIx64"\n", regs->sp); } printk("CPSR: %08"PRIx32" MODE:%s\n", regs->cpsr, mode_string(regs->cpsr)); printk(" X0: %016"PRIx64" X1: %016"PRIx64" X2: %016"PRIx64"\n", regs->x0, regs->x1, regs->x2); printk(" X3: %016"PRIx64" X4: %016"PRIx64" X5: %016"PRIx64"\n", regs->x3, regs->x4, regs->x5); printk(" X6: %016"PRIx64" X7: %016"PRIx64" X8: %016"PRIx64"\n", regs->x6, regs->x7, regs->x8); printk(" X9: %016"PRIx64" X10: %016"PRIx64" X11: %016"PRIx64"\n", regs->x9, regs->x10, regs->x11); printk(" X12: %016"PRIx64" X13: %016"PRIx64" X14: %016"PRIx64"\n", regs->x12, regs->x13, regs->x14); printk(" X15: %016"PRIx64" X16: %016"PRIx64" X17: %016"PRIx64"\n", regs->x15, regs->x16, regs->x17); printk(" X18: %016"PRIx64" X19: %016"PRIx64" X20: %016"PRIx64"\n", regs->x18, regs->x19, regs->x20); printk(" X21: %016"PRIx64" X22: %016"PRIx64" X23: %016"PRIx64"\n", regs->x21, regs->x22, regs->x23); printk(" X24: %016"PRIx64" X25: %016"PRIx64" X26: 
%016"PRIx64"\n", regs->x24, regs->x25, regs->x26); printk(" X27: %016"PRIx64" X28: %016"PRIx64" FP: %016"PRIx64"\n", regs->x27, regs->x28, regs->fp); printk("\n"); if ( guest_mode ) { printk(" ELR_EL1: %016"PRIx64"\n", regs->elr_el1); printk(" ESR_EL1: %08"PRIx32"\n", ctxt->esr_el1); printk(" FAR_EL1: %016"PRIx64"\n", ctxt->far); printk("\n"); printk(" SCTLR_EL1: %08"PRIx32"\n", ctxt->sctlr_el1); printk(" TCR_EL1: %08"PRIregister"\n", ctxt->tcr_el1); printk(" TTBR0_EL1: %016"PRIx64"\n", ctxt->ttbr0_el1); printk(" TTBR1_EL1: %016"PRIx64"\n", ctxt->ttbr1_el1); printk("\n"); } } #endif static void _show_registers(struct cpu_user_regs *regs, struct reg_ctxt *ctxt, int guest_mode, const struct vcpu *v) { print_xen_info(); printk("CPU: %d\n", smp_processor_id()); if ( guest_mode ) { if ( is_pv32_domain(v->domain) ) show_registers_32(regs, ctxt, guest_mode, v); #ifdef CONFIG_ARM_64 else if ( is_pv64_domain(v->domain) ) show_registers_64(regs, ctxt, guest_mode, v); #endif } else { #ifdef CONFIG_ARM_64 show_registers_64(regs, ctxt, guest_mode, v); #else show_registers_32(regs, ctxt, guest_mode, v); #endif } printk(" VTCR_EL2: %08"PRIx32"\n", READ_SYSREG32(VTCR_EL2)); printk(" VTTBR_EL2: %016"PRIx64"\n", ctxt->vttbr_el2); printk("\n"); printk(" SCTLR_EL2: %08"PRIx32"\n", READ_SYSREG32(SCTLR_EL2)); printk(" HCR_EL2: %016"PRIregister"\n", READ_SYSREG(HCR_EL2)); printk(" TTBR0_EL2: %016"PRIx64"\n", READ_SYSREG64(TTBR0_EL2)); printk("\n"); printk(" ESR_EL2: %08"PRIx32"\n", READ_SYSREG32(ESR_EL2)); printk(" HPFAR_EL2: %016"PRIregister"\n", READ_SYSREG(HPFAR_EL2)); #ifdef CONFIG_ARM_32 printk(" HDFAR: %08"PRIx32"\n", READ_CP32(HDFAR)); printk(" HIFAR: %08"PRIx32"\n", READ_CP32(HIFAR)); #else printk(" FAR_EL2: %016"PRIx64"\n", READ_SYSREG64(FAR_EL2)); #endif printk("\n"); } void show_registers(struct cpu_user_regs *regs) { struct reg_ctxt ctxt; ctxt.sctlr_el1 = READ_SYSREG(SCTLR_EL1); ctxt.tcr_el1 = READ_SYSREG(TCR_EL1); ctxt.ttbr0_el1 = READ_SYSREG64(TTBR0_EL1); ctxt.ttbr1_el1 = READ_SYSREG64(TTBR1_EL1); #ifdef CONFIG_ARM_32 ctxt.dfar = READ_CP32(DFAR); ctxt.ifar = READ_CP32(IFAR); ctxt.dfsr = READ_CP32(DFSR); ctxt.ifsr = READ_CP32(IFSR); #else ctxt.far = READ_SYSREG(FAR_EL1); ctxt.esr_el1 = READ_SYSREG(ESR_EL1); ctxt.ifsr32_el2 = READ_SYSREG(IFSR32_EL2); #endif ctxt.vttbr_el2 = READ_SYSREG64(VTTBR_EL2); _show_registers(regs, &ctxt, guest_mode(regs), current); } void vcpu_show_registers(const struct vcpu *v) { struct reg_ctxt ctxt; ctxt.sctlr_el1 = v->arch.sctlr; ctxt.tcr_el1 = v->arch.ttbcr; ctxt.ttbr0_el1 = v->arch.ttbr0; ctxt.ttbr1_el1 = v->arch.ttbr1; #ifdef CONFIG_ARM_32 ctxt.dfar = v->arch.dfar; ctxt.ifar = v->arch.ifar; ctxt.dfsr = v->arch.dfsr; ctxt.ifsr = v->arch.ifsr; #else ctxt.far = v->arch.far; ctxt.esr_el1 = v->arch.esr; ctxt.ifsr32_el2 = v->arch.ifsr; #endif ctxt.vttbr_el2 = v->domain->arch.vttbr; _show_registers(&v->arch.cpu_info->guest_cpu_user_regs, &ctxt, 1, v); } static void show_guest_stack(struct vcpu *v, struct cpu_user_regs *regs) { int i; vaddr_t sp; paddr_t stack_phys; void *mapped; unsigned long *stack, addr; if ( test_bit(_VPF_down, &v->pause_flags) ) { printk("No stack trace, VCPU offline\n"); return; } switch ( regs->cpsr & PSR_MODE_MASK ) { case PSR_MODE_USR: case PSR_MODE_SYS: #ifdef CONFIG_ARM_64 case PSR_MODE_EL0t: #endif printk("No stack trace for guest user-mode\n"); return; case PSR_MODE_FIQ: sp = regs->sp_fiq; break; case PSR_MODE_IRQ: sp = regs->sp_irq; break; case PSR_MODE_SVC: sp = regs->sp_svc; break; case PSR_MODE_ABT: sp = regs->sp_abt; break; case PSR_MODE_UND: 
sp = regs->sp_und; break; #ifdef CONFIG_ARM_64 case PSR_MODE_EL1t: sp = regs->sp_el0; break; case PSR_MODE_EL1h: sp = regs->sp_el1; break; #endif case PSR_MODE_HYP: case PSR_MODE_MON: #ifdef CONFIG_ARM_64 case PSR_MODE_EL3h: case PSR_MODE_EL3t: case PSR_MODE_EL2h: case PSR_MODE_EL2t: #endif default: BUG(); return; } printk("Guest stack trace from sp=%"PRIvaddr":\n ", sp); if ( gvirt_to_maddr(sp, &stack_phys) ) { printk("Failed to convert stack to physical address\n"); return; } mapped = map_domain_page(stack_phys >> PAGE_SHIFT); stack = mapped + (sp & ~PAGE_MASK); for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ ) { if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & PAGE_SIZE ) break; addr = *stack; if ( (i != 0) && ((i % stack_words_per_line) == 0) ) printk("\n "); printk(" %p", _p(addr)); stack++; } if ( i == 0 ) printk("Stack empty."); printk("\n"); unmap_domain_page(mapped); } #define STACK_BEFORE_EXCEPTION(regs) ((register_t*)(regs)->sp) #ifdef CONFIG_ARM_32 /* Frame pointer points to the return address: * (largest address) * | cpu_info * | [...] | * | return addr <-----------------, | * | fp --------------------------------+----' * | [...] | * | return addr <------------, | * | fp ---------------------------+----' * | [...] | * | return addr <- regs->fp | * | fp ---------------------------' * | * v (smallest address, sp) */ #define STACK_FRAME_BASE(fp) ((register_t*)(fp) - 1) #else /* Frame pointer points to the next frame: * (largest address) * | cpu_info * | [...] | * | return addr | * | fp <-------------------------------, >--' * | [...] | * | return addr | * | fp <--------------------------, >--' * | [...] | * | return addr <- regs->fp | * | fp ---------------------------' * | * v (smallest address, sp) */ #define STACK_FRAME_BASE(fp) ((register_t*)(fp)) #endif static void show_trace(struct cpu_user_regs *regs) { register_t *frame, next, addr, low, high; printk("Xen call trace:\n"); printk(" [<%p>] %pS (PC)\n", _p(regs->pc), _p(regs->pc)); printk(" [<%p>] %pS (LR)\n", _p(regs->lr), _p(regs->lr)); /* Bounds for range of valid frame pointer. */ low = (register_t)(STACK_BEFORE_EXCEPTION(regs)); high = (low & ~(STACK_SIZE - 1)) + (STACK_SIZE - sizeof(struct cpu_info)); /* The initial frame pointer. */ next = regs->fp; for ( ; ; ) { if ( (next < low) || (next >= high) ) break; /* Ordinary stack frame. 
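/*
 * Self-contained sketch of the frame-pointer walk done by show_trace()
 * above, using the 32-bit layout described in the comment (fp points
 * at the saved return address, the previous fp sits just below it).
 * The walk stops as soon as the next frame pointer leaves the
 * [low, high) window, and "low" is advanced so frames can only move
 * towards the top of the stack.
 */
#include <stdint.h>
#include <stdio.h>

static void demo_walk_frames(uintptr_t fp, uintptr_t low, uintptr_t high)
{
    while ( fp >= low && fp < high )
    {
        const uintptr_t *frame = (const uintptr_t *)fp - 1;
        uintptr_t next = frame[0];   /* caller's frame pointer */
        uintptr_t ret  = frame[1];   /* return address */

        printf("  [<%p>]\n", (void *)ret);

        low = (uintptr_t)&frame[1];
        fp = next;
    }
}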
*/ frame = STACK_FRAME_BASE(next); next = frame[0]; addr = frame[1]; printk(" [<%p>] %pS\n", _p(addr), _p(addr)); low = (register_t)&frame[1]; } printk("\n"); } void show_stack(struct cpu_user_regs *regs) { register_t *stack = STACK_BEFORE_EXCEPTION(regs), addr; int i; if ( guest_mode(regs) ) return show_guest_stack(current, regs); printk("Xen stack trace from sp=%p:\n ", stack); for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ ) { if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 ) break; if ( (i != 0) && ((i % stack_words_per_line) == 0) ) printk("\n "); addr = *stack++; printk(" %p", _p(addr)); } if ( i == 0 ) printk("Stack empty."); printk("\n"); show_trace(regs); } void show_execution_state(struct cpu_user_regs *regs) { show_registers(regs); show_stack(regs); } void vcpu_show_execution_state(struct vcpu *v) { printk("*** Dumping Dom%d vcpu#%d state: ***\n", v->domain->domain_id, v->vcpu_id); if ( v == current ) { show_execution_state(guest_cpu_user_regs()); return; } vcpu_pause(v); /* acceptably dangerous */ vcpu_show_registers(v); if ( !usr_mode(&v->arch.cpu_info->guest_cpu_user_regs) ) show_guest_stack(v, &v->arch.cpu_info->guest_cpu_user_regs); vcpu_unpause(v); } void do_unexpected_trap(const char *msg, struct cpu_user_regs *regs) { printk("CPU%d: Unexpected Trap: %s\n", smp_processor_id(), msg); show_execution_state(regs); while(1); } typedef register_t (*arm_hypercall_fn_t)( register_t, register_t, register_t, register_t, register_t); typedef struct { arm_hypercall_fn_t fn; int nr_args; } arm_hypercall_t; #define HYPERCALL(_name, _nr_args) \ [ __HYPERVISOR_ ## _name ] = { \ .fn = (arm_hypercall_fn_t) &do_ ## _name, \ .nr_args = _nr_args, \ } #define HYPERCALL_ARM(_name, _nr_args) \ [ __HYPERVISOR_ ## _name ] = { \ .fn = (arm_hypercall_fn_t) &do_arm_ ## _name, \ .nr_args = _nr_args, \ } static arm_hypercall_t arm_hypercall_table[] = { HYPERCALL(memory_op, 2), HYPERCALL(domctl, 1), HYPERCALL(sched_op, 2), HYPERCALL(console_io, 3), HYPERCALL(xen_version, 2), HYPERCALL(event_channel_op, 2), HYPERCALL(physdev_op, 2), HYPERCALL(sysctl, 2), HYPERCALL(hvm_op, 2), HYPERCALL(grant_table_op, 3), HYPERCALL_ARM(vcpu_op, 3), }; typedef int (*arm_psci_fn_t)(uint32_t, register_t); typedef struct { arm_psci_fn_t fn; int nr_args; } arm_psci_t; #define PSCI(_name, _nr_args) \ [ PSCI_ ## _name ] = { \ .fn = (arm_psci_fn_t) &do_psci_ ## _name, \ .nr_args = _nr_args, \ } static arm_psci_t arm_psci_table[] = { PSCI(cpu_off, 1), PSCI(cpu_on, 2), }; #ifndef NDEBUG static void do_debug_trap(struct cpu_user_regs *regs, unsigned int code) { register_t *r; uint32_t reg; uint32_t domid = current->domain->domain_id; switch ( code ) { case 0xe0 ... 
0xef: reg = code - 0xe0; r = select_user_reg(regs, reg); printk("DOM%d: R%d = 0x%"PRIregister" at 0x%"PRIvaddr"\n", domid, reg, *r, regs->pc); break; case 0xfd: printk("DOM%d: Reached %"PRIvaddr"\n", domid, regs->pc); break; case 0xfe: r = select_user_reg(regs, 0); printk("%c", (char)(*r & 0xff)); break; case 0xff: printk("DOM%d: DEBUG\n", domid); show_execution_state(regs); break; default: panic("DOM%d: Unhandled debug trap %#x", domid, code); break; } } #endif #ifdef CONFIG_ARM_64 #define PSCI_OP_REG(r) (r)->x0 #define PSCI_RESULT_REG(r) (r)->x0 #define PSCI_ARGS(r) (r)->x1, (r)->x2 #else #define PSCI_OP_REG(r) (r)->r0 #define PSCI_RESULT_REG(r) (r)->r0 #define PSCI_ARGS(r) (r)->r1, (r)->r2 #endif static void do_trap_psci(struct cpu_user_regs *regs) { arm_psci_fn_t psci_call = NULL; if ( PSCI_OP_REG(regs) >= ARRAY_SIZE(arm_psci_table) ) { domain_crash_synchronous(); return; } psci_call = arm_psci_table[PSCI_OP_REG(regs)].fn; if ( psci_call == NULL ) { domain_crash_synchronous(); return; } PSCI_RESULT_REG(regs) = psci_call(PSCI_ARGS(regs)); } #ifdef CONFIG_ARM_64 #define HYPERCALL_RESULT_REG(r) (r)->x0 #define HYPERCALL_ARG1(r) (r)->x0 #define HYPERCALL_ARG2(r) (r)->x1 #define HYPERCALL_ARG3(r) (r)->x2 #define HYPERCALL_ARG4(r) (r)->x3 #define HYPERCALL_ARG5(r) (r)->x4 #define HYPERCALL_ARGS(r) (r)->x0, (r)->x1, (r)->x2, (r)->x3, (r)->x4 #else #define HYPERCALL_RESULT_REG(r) (r)->r0 #define HYPERCALL_ARG1(r) (r)->r0 #define HYPERCALL_ARG2(r) (r)->r1 #define HYPERCALL_ARG3(r) (r)->r2 #define HYPERCALL_ARG4(r) (r)->r3 #define HYPERCALL_ARG5(r) (r)->r4 #define HYPERCALL_ARGS(r) (r)->r0, (r)->r1, (r)->r2, (r)->r3, (r)->r4 #endif static void do_trap_hypercall(struct cpu_user_regs *regs, register_t *nr, unsigned long iss) { arm_hypercall_fn_t call = NULL; #ifndef NDEBUG register_t orig_pc = regs->pc; #endif if ( iss != XEN_HYPERCALL_TAG ) domain_crash_synchronous(); if ( *nr >= ARRAY_SIZE(arm_hypercall_table) ) { HYPERCALL_RESULT_REG(regs) = -ENOSYS; return; } call = arm_hypercall_table[*nr].fn; if ( call == NULL ) { HYPERCALL_RESULT_REG(regs) = -ENOSYS; return; } HYPERCALL_RESULT_REG(regs) = call(HYPERCALL_ARGS(regs)); #ifndef NDEBUG /* * Clobber argument registers only if pc is unchanged, otherwise * this is a hypercall continuation. */ if ( orig_pc == regs->pc ) { switch ( arm_hypercall_table[*nr].nr_args ) { case 5: HYPERCALL_ARG5(regs) = 0xDEADBEEF; case 4: HYPERCALL_ARG4(regs) = 0xDEADBEEF; case 3: HYPERCALL_ARG3(regs) = 0xDEADBEEF; case 2: HYPERCALL_ARG2(regs) = 0xDEADBEEF; case 1: /* Don't clobber x0/r0 -- it's the return value */ break; default: BUG(); } *nr = 0xDEADBEEF; } #endif } void do_multicall_call(struct multicall_entry *multi) { arm_hypercall_fn_t call = NULL; if ( multi->op >= ARRAY_SIZE(arm_hypercall_table) ) { multi->result = -ENOSYS; return; } call = arm_hypercall_table[multi->op].fn; if ( call == NULL ) { multi->result = -ENOSYS; return; } multi->result = call(multi->args[0], multi->args[1], multi->args[2], multi->args[3], multi->args[4]); } /* * stolen from arch/arm/kernel/opcodes.c * * condition code lookup table * index into the table is test code: EQ, NE, ... 
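/*
 * Illustrative sketch of the table-driven dispatch in
 * do_trap_hypercall() above: the hypercall number indexes a table of
 * function pointers, out-of-range or empty slots yield -ENOSYS, and
 * debug builds deliberately clobber unused argument registers so
 * guests cannot depend on them surviving.  Both handlers below are
 * hypothetical.
 */
#include <errno.h>
#include <stdint.h>

typedef long (*demo_fn_t)(uint64_t, uint64_t);

static long demo_noop(uint64_t a, uint64_t b) { (void)a; (void)b; return 0; }

static const demo_fn_t demo_table[] = {
    [0] = demo_noop,
    [1] = 0,                   /* unimplemented: takes the -ENOSYS path */
};

static long demo_dispatch(unsigned int nr, uint64_t a0, uint64_t a1)
{
    if ( nr >= sizeof(demo_table) / sizeof(demo_table[0]) || !demo_table[nr] )
        return -ENOSYS;
    return demo_table[nr](a0, a1);
}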
LT, GT, AL, NV * * bit position in short is condition code: NZCV */ static const unsigned short cc_map[16] = { 0xF0F0, /* EQ == Z set */ 0x0F0F, /* NE */ 0xCCCC, /* CS == C set */ 0x3333, /* CC */ 0xFF00, /* MI == N set */ 0x00FF, /* PL */ 0xAAAA, /* VS == V set */ 0x5555, /* VC */ 0x0C0C, /* HI == C set && Z clear */ 0xF3F3, /* LS == C clear || Z set */ 0xAA55, /* GE == (N==V) */ 0x55AA, /* LT == (N!=V) */ 0x0A05, /* GT == (!Z && (N==V)) */ 0xF5FA, /* LE == (Z || (N!=V)) */ 0xFFFF, /* AL always */ 0 /* NV */ }; static int check_conditional_instr(struct cpu_user_regs *regs, union hsr hsr) { unsigned long cpsr, cpsr_cond; int cond; /* Unconditional Exception classes */ if ( hsr.ec >= 0x10 ) return 1; /* Check for valid condition in hsr */ cond = hsr.cond.ccvalid ? hsr.cond.cc : -1; /* Unconditional instruction */ if ( cond == 0xe ) return 1; cpsr = regs->cpsr; /* If cc is not valid then we need to examine the IT state */ if ( cond < 0 ) { unsigned long it; BUG_ON( !is_pv32_domain(current->domain) || !(cpsr&PSR_THUMB) ); it = ( (cpsr >> (10-2)) & 0xfc) | ((cpsr >> 25) & 0x3 ); /* it == 0 => unconditional. */ if ( it == 0 ) return 1; /* The cond for this instruction works out as the top 4 bits. */ cond = ( it >> 4 ); } cpsr_cond = cpsr >> 28; if ( !((cc_map[cond] >> cpsr_cond) & 1) ) return 0; return 1; } static void advance_pc(struct cpu_user_regs *regs, union hsr hsr) { unsigned long itbits, cond, cpsr = regs->cpsr; /* PSR_IT_MASK bits can only be set for 32-bit processors in Thumb mode. */ BUG_ON( (!is_pv32_domain(current->domain)||!(cpsr&PSR_THUMB)) && (cpsr&PSR_IT_MASK) ); if ( is_pv32_domain(current->domain) && (cpsr&PSR_IT_MASK) ) { /* The ITSTATE[7:0] block is contained in CPSR[15:10],CPSR[26:25] * * ITSTATE[7:5] are the condition code * ITSTATE[4:0] are the IT bits * * If the condition is non-zero then the IT state machine is * advanced by shifting the IT bits left. * * See A2-51 and B1-1148 of DDI 0406C.b. */ cond = (cpsr & 0xe000) >> 13; itbits = (cpsr & 0x1c00) >> (10 - 2); itbits |= (cpsr & (0x3 << 25)) >> 25; if ( (itbits & 0x7) == 0 ) itbits = cond = 0; else itbits = (itbits << 1) & 0x1f; cpsr &= ~PSR_IT_MASK; cpsr |= cond << 13; cpsr |= (itbits & 0x1c) << (10 - 2); cpsr |= (itbits & 0x3) << 25; regs->cpsr = cpsr; } regs->pc += hsr.len ? 
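/*
 * Standalone sketch of the cc_map lookup above: the current NZCV flags
 * (CPSR[31:28]) select one bit inside the 16-bit mask stored per
 * condition code, so "does this condition pass?" is a table lookup
 * plus a shift.  Only EQ and NE are filled in here; the full table is
 * the cc_map[] defined above.
 */
#include <stdbool.h>
#include <stdint.h>

static const uint16_t demo_cc_map[16] = {
    [0x0] = 0xF0F0,   /* EQ: passes whenever Z is set */
    [0x1] = 0x0F0F,   /* NE: passes whenever Z is clear */
};

static bool demo_condition_passes(uint32_t cpsr, unsigned int cond)
{
    unsigned int nzcv = cpsr >> 28;

    return (demo_cc_map[cond & 0xf] >> nzcv) & 1;
}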
4 : 2; } static void do_cp15_32(struct cpu_user_regs *regs, union hsr hsr) { struct hsr_cp32 cp32 = hsr.cp32; uint32_t *r = (uint32_t*)select_user_reg(regs, cp32.reg); struct vcpu *v = current; if ( !check_conditional_instr(regs, hsr) ) { advance_pc(regs, hsr); return; } switch ( hsr.bits & HSR_CP32_REGS_MASK ) { case HSR_CPREG32(CLIDR): if ( !cp32.read ) { dprintk(XENLOG_ERR, "attempt to write to read-only register CLIDR\n"); domain_crash_synchronous(); } *r = READ_SYSREG32(CLIDR_EL1); break; case HSR_CPREG32(CCSIDR): if ( !cp32.read ) { dprintk(XENLOG_ERR, "attempt to write to read-only register CCSIDR\n"); domain_crash_synchronous(); } *r = READ_SYSREG32(CCSIDR_EL1); break; case HSR_CPREG32(DCCISW): if ( cp32.read ) { dprintk(XENLOG_ERR, "attempt to read from write-only register DCCISW\n"); domain_crash_synchronous(); } #ifdef CONFIG_ARM_32 WRITE_CP32(*r, DCCISW); #else asm volatile("dc cisw, %0;" : : "r" (*r) : "memory"); #endif break; case HSR_CPREG32(CNTP_CTL): case HSR_CPREG32(CNTP_TVAL): if ( !vtimer_emulate(regs, hsr) ) { dprintk(XENLOG_ERR, "failed emulation of 32-bit vtimer CP register access\n"); domain_crash_synchronous(); } break; case HSR_CPREG32(ACTLR): if ( cp32.read ) *r = v->arch.actlr; break; default: printk("%s p15, %d, r%d, cr%d, cr%d, %d @ 0x%"PRIregister"\n", cp32.read ? "mrc" : "mcr", cp32.op1, cp32.reg, cp32.crn, cp32.crm, cp32.op2, regs->pc); panic("unhandled 32-bit CP15 access %#x", hsr.bits & HSR_CP32_REGS_MASK); } advance_pc(regs, hsr); } static void do_cp15_64(struct cpu_user_regs *regs, union hsr hsr) { struct hsr_cp64 cp64 = hsr.cp64; if ( !check_conditional_instr(regs, hsr) ) { advance_pc(regs, hsr); return; } switch ( hsr.bits & HSR_CP64_REGS_MASK ) { case HSR_CPREG64(CNTPCT): if ( !vtimer_emulate(regs, hsr) ) { dprintk(XENLOG_ERR, "failed emulation of 64-bit vtimer CP register access\n"); domain_crash_synchronous(); } break; default: printk("%s p15, %d, r%d, r%d, cr%d @ 0x%"PRIregister"\n", cp64.read ? "mrrc" : "mcrr", cp64.op1, cp64.reg1, cp64.reg2, cp64.crm, regs->pc); panic("unhandled 64-bit CP15 access %#x", hsr.bits & HSR_CP64_REGS_MASK); } advance_pc(regs, hsr); } #ifdef CONFIG_ARM_64 static void do_sysreg(struct cpu_user_regs *regs, union hsr hsr) { struct hsr_sysreg sysreg = hsr.sysreg; switch ( hsr.bits & HSR_SYSREG_REGS_MASK ) { case HSR_SYSREG_CNTP_CTL_EL0: case HSR_SYSREG_CNTP_TVAL_EL0: if ( !vtimer_emulate(regs, hsr) ) { dprintk(XENLOG_ERR, "failed emulation of 64-bit vtimer sysreg access\n"); domain_crash_synchronous(); } break; default: printk("%s %d, %d, c%d, c%d, %d %s x%d @ 0x%"PRIregister"\n", sysreg.read ? "mrs" : "msr", sysreg.op0, sysreg.op1, sysreg.crn, sysreg.crm, sysreg.op2, sysreg.read ? 
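/*
 * Sketch of the direction check used by do_cp15_32() above when
 * emulating a read-only register such as CLIDR: a trapped read is
 * satisfied from the corresponding host register, while a trapped
 * write is treated as a guest bug.  The helpers are hypothetical
 * stand-ins for READ_SYSREG32() and domain_crash_synchronous().
 */
#include <stdbool.h>
#include <stdint.h>

extern uint32_t demo_read_host_clidr(void);
extern void demo_crash_guest(void);

static void demo_emulate_clidr(bool is_read, uint32_t *reg)
{
    if ( !is_read )
    {
        demo_crash_guest();      /* write to a read-only register */
        return;
    }
    *reg = demo_read_host_clidr();
}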
"=>" : "<=", sysreg.reg, regs->pc); panic("unhandled 64-bit sysreg access %#x", hsr.bits & HSR_SYSREG_REGS_MASK); } regs->pc += 4; } #endif void dump_guest_s1_walk(struct domain *d, vaddr_t addr) { register_t ttbcr = READ_SYSREG(TCR_EL1); uint64_t ttbr0 = READ_SYSREG64(TTBR0_EL1); paddr_t paddr; uint32_t offset; uint32_t *first = NULL, *second = NULL; printk("dom%d VA 0x%08"PRIvaddr"\n", d->domain_id, addr); printk(" TTBCR: 0x%08"PRIregister"\n", ttbcr); printk(" TTBR0: 0x%016"PRIx64" = 0x%"PRIpaddr"\n", ttbr0, p2m_lookup(d, ttbr0 & PAGE_MASK, NULL)); if ( ttbcr & TTBCR_EAE ) { printk("Cannot handle LPAE guest PT walk\n"); return; } if ( (ttbcr & TTBCR_N_MASK) != 0 ) { printk("Cannot handle TTBR1 guest walks\n"); return; } paddr = p2m_lookup(d, ttbr0 & PAGE_MASK, NULL); if ( paddr == INVALID_PADDR ) { printk("Failed TTBR0 maddr lookup\n"); goto done; } first = map_domain_page(paddr>>PAGE_SHIFT); offset = addr >> (12+10); printk("1ST[0x%"PRIx32"] (0x%"PRIpaddr") = 0x%08"PRIx32"\n", offset, paddr, first[offset]); if ( !(first[offset] & 0x1) || !(first[offset] & 0x2) ) goto done; paddr = p2m_lookup(d, first[offset] & PAGE_MASK, NULL); if ( paddr == INVALID_PADDR ) { printk("Failed L1 entry maddr lookup\n"); goto done; } second = map_domain_page(paddr>>PAGE_SHIFT); offset = (addr >> 12) & 0x3FF; printk("2ND[0x%"PRIx32"] (0x%"PRIpaddr") = 0x%08"PRIx32"\n", offset, paddr, second[offset]); done: if (second) unmap_domain_page(second); if (first) unmap_domain_page(first); } static void do_trap_instr_abort_guest(struct cpu_user_regs *regs, union hsr hsr) { register_t addr = READ_SYSREG(FAR_EL2); inject_iabt_exception(regs, addr, hsr.len); } static void do_trap_data_abort_guest(struct cpu_user_regs *regs, union hsr hsr) { struct hsr_dabt dabt = hsr.dabt; int rc; mmio_info_t info; if ( !check_conditional_instr(regs, hsr) ) { advance_pc(regs, hsr); return; } info.dabt = dabt; #ifdef CONFIG_ARM_32 info.gva = READ_CP32(HDFAR); #else info.gva = READ_SYSREG64(FAR_EL2); #endif if (dabt.s1ptw) goto bad_data_abort; rc = gva_to_ipa(info.gva, &info.gpa); if ( rc == -EFAULT ) goto bad_data_abort; /* XXX: Decode the instruction if ISS is not valid */ if ( !dabt.valid ) goto bad_data_abort; /* * Erratum 766422: Thumb store translation fault to Hypervisor may * not have correct HSR Rt value. */ if ( cpu_has_erratum_766422() && (regs->cpsr & PSR_THUMB) && dabt.write ) { rc = decode_instruction(regs, &info.dabt); if ( rc ) { gdprintk(XENLOG_DEBUG, "Unable to decode instruction\n"); goto bad_data_abort; } } if (handle_mmio(&info)) { advance_pc(regs, hsr); return; } bad_data_abort: inject_dabt_exception(regs, info.gva, hsr.len); } asmlinkage void do_trap_hypervisor(struct cpu_user_regs *regs) { union hsr hsr = { .bits = READ_SYSREG32(ESR_EL2) }; switch (hsr.ec) { case HSR_EC_WFI_WFE: if ( !check_conditional_instr(regs, hsr) ) { advance_pc(regs, hsr); return; } /* at the moment we only trap WFI */ vcpu_block(); /* The ARM spec declares that even if local irqs are masked in * the CPSR register, an irq should wake up a cpu from WFI anyway. * For this reason we need to check for irqs that need delivery, * ignoring the CPSR register, *after* calling SCHEDOP_block to * avoid races with vgic_vcpu_inject_irq. */ if ( local_events_need_delivery_nomask() ) vcpu_unblock(current); advance_pc(regs, hsr); break; case HSR_EC_CP15_32: if ( ! is_pv32_domain(current->domain) ) goto bad_trap; do_cp15_32(regs, hsr); break; case HSR_EC_CP15_64: if ( ! 
is_pv32_domain(current->domain) ) goto bad_trap; do_cp15_64(regs, hsr); break; case HSR_EC_SMC32: inject_undef32_exception(regs); break; case HSR_EC_HVC32: #ifndef NDEBUG if ( (hsr.iss & 0xff00) == 0xff00 ) return do_debug_trap(regs, hsr.iss & 0x00ff); #endif if ( hsr.iss == 0 ) return do_trap_psci(regs); do_trap_hypercall(regs, (register_t *)®s->r12, hsr.iss); break; #ifdef CONFIG_ARM_64 case HSR_EC_HVC64: #ifndef NDEBUG if ( (hsr.iss & 0xff00) == 0xff00 ) return do_debug_trap(regs, hsr.iss & 0x00ff); #endif if ( hsr.iss == 0 ) return do_trap_psci(regs); do_trap_hypercall(regs, ®s->x16, hsr.iss); break; case HSR_EC_SMC64: inject_undef64_exception(regs, hsr.len); break; case HSR_EC_SYSREG: if ( is_pv32_domain(current->domain) ) goto bad_trap; do_sysreg(regs, hsr); break; #endif case HSR_EC_INSTR_ABORT_LOWER_EL: do_trap_instr_abort_guest(regs, hsr); break; case HSR_EC_DATA_ABORT_LOWER_EL: do_trap_data_abort_guest(regs, hsr); break; default: bad_trap: printk("Hypervisor Trap. HSR=0x%x EC=0x%x IL=%x Syndrome=%"PRIx32"\n", hsr.bits, hsr.ec, hsr.len, hsr.iss); do_unexpected_trap("Hypervisor", regs); } } asmlinkage void do_trap_irq(struct cpu_user_regs *regs) { gic_interrupt(regs, 0); } asmlinkage void do_trap_fiq(struct cpu_user_regs *regs) { gic_interrupt(regs, 1); } asmlinkage void leave_hypervisor_tail(void) { while (1) { local_irq_disable(); if (!softirq_pending(smp_processor_id())) { gic_inject(); return; } local_irq_enable(); do_softirq(); } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/psci.c0000664000175000017500000000477612307313555014051 0ustar smbsmb/* * xen/arch/arm/psci.c * * PSCI host support * * Andre Przywara * Copyright (c) 2013 Linaro Limited. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include bool_t psci_available; #ifdef CONFIG_ARM_32 #define REG_PREFIX "r" #else #define REG_PREFIX "x" #endif static noinline int __invoke_psci_fn_smc(register_t function_id, register_t arg0, register_t arg1, register_t arg2) { asm volatile( __asmeq("%0", REG_PREFIX"0") __asmeq("%1", REG_PREFIX"1") __asmeq("%2", REG_PREFIX"2") __asmeq("%3", REG_PREFIX"3") "smc #0" : "+r" (function_id) : "r" (arg0), "r" (arg1), "r" (arg2)); return function_id; } #undef REG_PREFIX static uint32_t psci_cpu_on_nr; int call_psci_cpu_on(int cpu) { return __invoke_psci_fn_smc(psci_cpu_on_nr, cpu, __pa(init_secondary), 0); } int __init psci_init(void) { const struct dt_device_node *psci; int ret; const char *prop_str; psci = dt_find_compatible_node(NULL, NULL, "arm,psci"); if ( !psci ) return -ENODEV; ret = dt_property_read_string(psci, "method", &prop_str); if ( ret ) { printk("/psci node does not provide a method (%d)\n", ret); return -EINVAL; } /* Since Xen runs in HYP all of the time, it does not make sense to * let it call into HYP for PSCI handling, since the handler just * won't be there. So bail out with an error if "smc" is not used. 
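/*
 * Sketch of the exit-path loop in leave_hypervisor_tail() above:
 * interrupts are disabled before the final pending-work check, so no
 * softirq can sneak in between the check and the return to the guest;
 * if work is found, interrupts are re-enabled and the work is run
 * before trying again.  The helpers are hypothetical stand-ins for the
 * real primitives.
 */
#include <stdbool.h>

extern void demo_irq_disable(void);
extern void demo_irq_enable(void);
extern bool demo_softirq_pending(void);
extern void demo_do_softirq(void);

static void demo_exit_to_guest(void)
{
    for ( ;; )
    {
        demo_irq_disable();
        if ( !demo_softirq_pending() )
            return;               /* leave with interrupts still disabled */
        demo_irq_enable();
        demo_do_softirq();
    }
}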
*/ if ( strcmp(prop_str, "smc") ) { printk("/psci method must be smc, but is: \"%s\"\n", prop_str); return -EINVAL; } if ( !dt_property_read_u32(psci, "cpu_on", &psci_cpu_on_nr) ) { printk("/psci node is missing the \"cpu_on\" property\n"); return -ENOENT; } psci_available = 1; printk(XENLOG_INFO "Using PSCI for SMP bringup\n"); return 0; } xen-4.4.0/xen/arch/arm/sysctl.c0000664000175000017500000000126112307313555014416 0ustar smbsmb/****************************************************************************** * Arch-specific sysctl.c * * System management operations. For use by node control stack. * * Copyright (c) 2012, Citrix Systems */ #include #include #include #include #include #include void arch_do_physinfo(xen_sysctl_physinfo_t *pi) { } long arch_do_sysctl(struct xen_sysctl *sysctl, XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl) { return -ENOSYS; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/guestcopy.c0000664000175000017500000000547512307313555015132 0ustar smbsmb#include #include #include #include #include static unsigned long raw_copy_to_guest_helper(void *to, const void *from, unsigned len, int flush_dcache) { /* XXX needs to handle faults */ unsigned offset = (vaddr_t)to & ~PAGE_MASK; while ( len ) { paddr_t g; void *p; unsigned size = min(len, (unsigned)PAGE_SIZE - offset); if ( gvirt_to_maddr((vaddr_t) to, &g) ) return len; p = map_domain_page(g>>PAGE_SHIFT); p += offset; memcpy(p, from, size); if ( flush_dcache ) clean_xen_dcache_va_range(p, size); unmap_domain_page(p - offset); len -= size; from += size; to += size; /* * After the first iteration, guest virtual address is correctly * aligned to PAGE_SIZE. */ offset = 0; } return 0; } unsigned long raw_copy_to_guest(void *to, const void *from, unsigned len) { return raw_copy_to_guest_helper(to, from, len, 0); } unsigned long raw_copy_to_guest_flush_dcache(void *to, const void *from, unsigned len) { return raw_copy_to_guest_helper(to, from, len, 1); } unsigned long raw_clear_guest(void *to, unsigned len) { /* XXX needs to handle faults */ unsigned offset = (vaddr_t)to & ~PAGE_MASK; while ( len ) { paddr_t g; void *p; unsigned size = min(len, (unsigned)PAGE_SIZE - offset); if ( gvirt_to_maddr((vaddr_t) to, &g) ) return len; p = map_domain_page(g>>PAGE_SHIFT); p += offset; memset(p, 0x00, size); unmap_domain_page(p - offset); len -= size; to += size; /* * After the first iteration, guest virtual address is correctly * aligned to PAGE_SIZE. */ offset = 0; } return 0; } unsigned long raw_copy_from_guest(void *to, const void __user *from, unsigned len) { unsigned offset = (vaddr_t)from & ~PAGE_MASK; while ( len ) { paddr_t g; void *p; unsigned size = min(len, (unsigned)(PAGE_SIZE - offset)); if ( gvirt_to_maddr((vaddr_t) from & PAGE_MASK, &g) ) return len; p = map_domain_page(g>>PAGE_SHIFT); p += ((vaddr_t)from & (~PAGE_MASK)); memcpy(to, p, size); unmap_domain_page(p); len -= size; from += size; to += size; /* * After the first iteration, guest virtual address is correctly * aligned to PAGE_SIZE. */ offset = 0; } return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/gic.c0000664000175000017500000006502512307313555013647 0ustar smbsmb/* * xen/arch/arm/gic.c * * ARM Generic Interrupt Controller support * * Tim Deegan * Copyright (c) 2011 Citrix Systems. 
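/*
 * Standalone sketch of the page-at-a-time loop used by
 * raw_copy_to_guest_helper() above: each iteration copies at most the
 * remainder of the current page, and after the first iteration the
 * destination is page aligned, so the offset drops to zero.  The real
 * per-page translate-and-map step is replaced by a plain memcpy here.
 */
#include <stdint.h>
#include <string.h>

#define DEMO_PAGE_SIZE 4096u

static void demo_copy(uint8_t *to, const uint8_t *from, size_t len)
{
    size_t offset = (uintptr_t)to & (DEMO_PAGE_SIZE - 1);

    while ( len )
    {
        size_t size = len < DEMO_PAGE_SIZE - offset ? len
                                                    : DEMO_PAGE_SIZE - offset;

        memcpy(to, from, size);   /* stands in for map + memcpy + unmap */

        len  -= size;
        from += size;
        to   += size;
        offset = 0;               /* later chunks start on a page boundary */
    }
}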
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Access to the GIC Distributor registers through the fixmap */ #define GICD ((volatile uint32_t *) FIXMAP_ADDR(FIXMAP_GICD)) #define GICC ((volatile uint32_t *) FIXMAP_ADDR(FIXMAP_GICC1)) #define GICH ((volatile uint32_t *) FIXMAP_ADDR(FIXMAP_GICH)) static void gic_restore_pending_irqs(struct vcpu *v); /* Global state */ static struct { paddr_t dbase; /* Address of distributor registers */ paddr_t cbase; /* Address of CPU interface registers */ paddr_t hbase; /* Address of virtual interface registers */ paddr_t vbase; /* Address of virtual cpu interface registers */ unsigned int lines; /* Number of interrupts (SPIs + PPIs + SGIs) */ struct dt_irq maintenance; /* IRQ maintenance */ unsigned int cpus; spinlock_t lock; } gic; static irq_desc_t irq_desc[NR_IRQS]; static DEFINE_PER_CPU(irq_desc_t[NR_LOCAL_IRQS], local_irq_desc); static DEFINE_PER_CPU(uint64_t, lr_mask); static unsigned nr_lrs; /* The GIC mapping of CPU interfaces does not necessarily match the * logical CPU numbering. Let's use mapping as returned by the GIC * itself */ static DEFINE_PER_CPU(u8, gic_cpu_id); /* Maximum cpu interface per GIC */ #define NR_GIC_CPU_IF 8 static unsigned int gic_cpu_mask(const cpumask_t *cpumask) { unsigned int cpu; unsigned int mask = 0; cpumask_t possible_mask; cpumask_and(&possible_mask, cpumask, &cpu_possible_map); for_each_cpu(cpu, &possible_mask) { ASSERT(cpu < NR_GIC_CPU_IF); mask |= per_cpu(gic_cpu_id, cpu); } return mask; } unsigned int gic_number_lines(void) { return gic.lines; } irq_desc_t *__irq_to_desc(int irq) { if (irq < NR_LOCAL_IRQS) return &this_cpu(local_irq_desc)[irq]; return &irq_desc[irq-NR_LOCAL_IRQS]; } void gic_save_state(struct vcpu *v) { int i; ASSERT(!local_irq_is_enabled()); /* No need for spinlocks here because interrupts are disabled around * this call and it only accesses struct vcpu fields that cannot be * accessed simultaneously by another pCPU. 
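/*
 * Sketch of the target-mask computation in gic_cpu_mask() above: the
 * distributor reports each CPU interface as a single bit, and a
 * delivery mask is built by OR-ing those bits for every CPU in the
 * requested set.  The id array below is a stand-in for
 * per_cpu(gic_cpu_id, cpu), with made-up values.
 */
#include <stdint.h>

#define DEMO_NR_CPUS 4

static const uint8_t demo_gic_cpu_id[DEMO_NR_CPUS] = { 0x01, 0x02, 0x04, 0x08 };

static unsigned int demo_gic_cpu_mask(uint32_t cpumask)
{
    unsigned int mask = 0;

    for ( unsigned int cpu = 0; cpu < DEMO_NR_CPUS; cpu++ )
        if ( cpumask & (1u << cpu) )
            mask |= demo_gic_cpu_id[cpu];

    return mask;
}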
*/ for ( i=0; iarch.gic_lr[i] = GICH[GICH_LR + i]; v->arch.lr_mask = this_cpu(lr_mask); v->arch.gic_apr = GICH[GICH_APR]; v->arch.gic_vmcr = GICH[GICH_VMCR]; /* Disable until next VCPU scheduled */ GICH[GICH_HCR] = 0; isb(); } void gic_restore_state(struct vcpu *v) { int i; if ( is_idle_vcpu(v) ) return; this_cpu(lr_mask) = v->arch.lr_mask; for ( i=0; iarch.gic_lr[i]; GICH[GICH_APR] = v->arch.gic_apr; GICH[GICH_VMCR] = v->arch.gic_vmcr; GICH[GICH_HCR] = GICH_HCR_EN; isb(); gic_restore_pending_irqs(v); } static void gic_irq_enable(struct irq_desc *desc) { int irq = desc->irq; unsigned long flags; spin_lock_irqsave(&desc->lock, flags); spin_lock(&gic.lock); desc->status &= ~IRQ_DISABLED; dsb(); /* Enable routing */ GICD[GICD_ISENABLER + irq / 32] = (1u << (irq % 32)); spin_unlock(&gic.lock); spin_unlock_irqrestore(&desc->lock, flags); } static void gic_irq_disable(struct irq_desc *desc) { int irq = desc->irq; unsigned long flags; spin_lock_irqsave(&desc->lock, flags); spin_lock(&gic.lock); /* Disable routing */ GICD[GICD_ICENABLER + irq / 32] = (1u << (irq % 32)); desc->status |= IRQ_DISABLED; spin_unlock(&gic.lock); spin_unlock_irqrestore(&desc->lock, flags); } static unsigned int gic_irq_startup(struct irq_desc *desc) { gic_irq_enable(desc); return 0; } static void gic_irq_shutdown(struct irq_desc *desc) { gic_irq_disable(desc); } static void gic_irq_ack(struct irq_desc *desc) { /* No ACK -- reading IAR has done this for us */ } static void gic_host_irq_end(struct irq_desc *desc) { int irq = desc->irq; /* Lower the priority */ GICC[GICC_EOIR] = irq; /* Deactivate */ GICC[GICC_DIR] = irq; } static void gic_guest_irq_end(struct irq_desc *desc) { int irq = desc->irq; /* Lower the priority of the IRQ */ GICC[GICC_EOIR] = irq; /* Deactivation happens in maintenance interrupt / via GICV */ } static void gic_irq_set_affinity(struct irq_desc *desc, const cpumask_t *mask) { BUG(); } /* XXX different for level vs edge */ static hw_irq_controller gic_host_irq_type = { .typename = "gic", .startup = gic_irq_startup, .shutdown = gic_irq_shutdown, .enable = gic_irq_enable, .disable = gic_irq_disable, .ack = gic_irq_ack, .end = gic_host_irq_end, .set_affinity = gic_irq_set_affinity, }; static hw_irq_controller gic_guest_irq_type = { .typename = "gic", .startup = gic_irq_startup, .shutdown = gic_irq_shutdown, .enable = gic_irq_enable, .disable = gic_irq_disable, .ack = gic_irq_ack, .end = gic_guest_irq_end, .set_affinity = gic_irq_set_affinity, }; /* * - needs to be called with gic.lock held * - needs to be called with a valid cpu_mask, ie each cpu in the mask has * already called gic_cpu_init */ static void gic_set_irq_properties(unsigned int irq, bool_t level, const cpumask_t *cpu_mask, unsigned int priority) { volatile unsigned char *bytereg; uint32_t cfg, edgebit; unsigned int mask = gic_cpu_mask(cpu_mask); /* Set edge / level */ cfg = GICD[GICD_ICFGR + irq / 16]; edgebit = 2u << (2 * (irq % 16)); if ( level ) cfg &= ~edgebit; else cfg |= edgebit; GICD[GICD_ICFGR + irq / 16] = cfg; /* Set target CPU mask (RAZ/WI on uniprocessor) */ bytereg = (unsigned char *) (GICD + GICD_ITARGETSR); bytereg[irq] = mask; /* Set priority */ bytereg = (unsigned char *) (GICD + GICD_IPRIORITYR); bytereg[irq] = priority; } /* Program the GIC to route an interrupt */ static int gic_route_irq(unsigned int irq, bool_t level, const cpumask_t *cpu_mask, unsigned int priority) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; ASSERT(priority <= 0xff); /* Only 8 bits of priority */ ASSERT(irq < gic.lines); /* Can't 
route interrupts that don't exist */ if ( desc->action != NULL ) return -EBUSY; /* Disable interrupt */ desc->handler->shutdown(desc); spin_lock_irqsave(&desc->lock, flags); desc->handler = &gic_host_irq_type; spin_lock(&gic.lock); gic_set_irq_properties(irq, level, cpu_mask, priority); spin_unlock(&gic.lock); spin_unlock_irqrestore(&desc->lock, flags); return 0; } /* Program the GIC to route an interrupt with a dt_irq */ void gic_route_dt_irq(const struct dt_irq *irq, const cpumask_t *cpu_mask, unsigned int priority) { bool_t level; level = dt_irq_is_level_triggered(irq); gic_route_irq(irq->irq, level, cpu_mask, priority); } static void __init gic_dist_init(void) { uint32_t type; uint32_t cpumask; int i; cpumask = GICD[GICD_ITARGETSR] & 0xff; cpumask |= cpumask << 8; cpumask |= cpumask << 16; /* Disable the distributor */ GICD[GICD_CTLR] = 0; type = GICD[GICD_TYPER]; gic.lines = 32 * ((type & GICD_TYPE_LINES) + 1); gic.cpus = 1 + ((type & GICD_TYPE_CPUS) >> 5); printk("GIC: %d lines, %d cpu%s%s (IID %8.8x).\n", gic.lines, gic.cpus, (gic.cpus == 1) ? "" : "s", (type & GICD_TYPE_SEC) ? ", secure" : "", GICD[GICD_IIDR]); /* Default all global IRQs to level, active low */ for ( i = 32; i < gic.lines; i += 16 ) GICD[GICD_ICFGR + i / 16] = 0x0; /* Route all global IRQs to this CPU */ for ( i = 32; i < gic.lines; i += 4 ) GICD[GICD_ITARGETSR + i / 4] = cpumask; /* Default priority for global interrupts */ for ( i = 32; i < gic.lines; i += 4 ) GICD[GICD_IPRIORITYR + i / 4] = 0xa0a0a0a0; /* Disable all global interrupts */ for ( i = 32; i < gic.lines; i += 32 ) GICD[GICD_ICENABLER + i / 32] = (uint32_t)~0ul; /* Turn on the distributor */ GICD[GICD_CTLR] = GICD_CTL_ENABLE; } static void __cpuinit gic_cpu_init(void) { int i; this_cpu(gic_cpu_id) = GICD[GICD_ITARGETSR] & 0xff; /* The first 32 interrupts (PPI and SGI) are banked per-cpu, so * even though they are controlled with GICD registers, they must * be set up here with the other per-cpu state. 
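/*
 * Sketch of the byte-wide register access in gic_set_irq_properties()
 * above: GICD_IPRIORITYR and GICD_ITARGETSR hold one byte per
 * interrupt, so the 32-bit register window is recast as a byte pointer
 * and indexed directly by IRQ number.  The array below is an ordinary
 * buffer standing in for the memory-mapped distributor.
 */
#include <stdint.h>

static uint32_t demo_ipriorityr[256];        /* stand-in for the MMIO bank */

static void demo_set_priority(unsigned int irq, uint8_t priority)
{
    volatile uint8_t *bytereg = (volatile uint8_t *)demo_ipriorityr;

    bytereg[irq] = priority;                 /* one byte per interrupt */
}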
*/ GICD[GICD_ICENABLER] = 0xffff0000; /* Disable all PPI */ GICD[GICD_ISENABLER] = 0x0000ffff; /* Enable all SGI */ /* Set PPI and SGI priorities */ for (i = 0; i < 32; i += 4) GICD[GICD_IPRIORITYR + i / 4] = 0xa0a0a0a0; /* Local settings: interface controller */ GICC[GICC_PMR] = 0xff; /* Don't mask by priority */ GICC[GICC_BPR] = 0; /* Finest granularity of priority */ GICC[GICC_CTLR] = GICC_CTL_ENABLE|GICC_CTL_EOI; /* Turn on delivery */ } static void gic_cpu_disable(void) { GICC[GICC_CTLR] = 0; } static void __cpuinit gic_hyp_init(void) { uint32_t vtr; vtr = GICH[GICH_VTR]; nr_lrs = (vtr & GICH_VTR_NRLRGS) + 1; GICH[GICH_MISR] = GICH_MISR_EOI; this_cpu(lr_mask) = 0ULL; } static void __cpuinit gic_hyp_disable(void) { GICH[GICH_HCR] = 0; } int gic_irq_xlate(const u32 *intspec, unsigned int intsize, unsigned int *out_hwirq, unsigned int *out_type) { if ( intsize < 3 ) return -EINVAL; /* Get the interrupt number and add 16 to skip over SGIs */ *out_hwirq = intspec[1] + 16; /* For SPIs, we need to add 16 more to get the GIC irq ID number */ if ( !intspec[0] ) *out_hwirq += 16; if ( out_type ) *out_type = intspec[2] & DT_IRQ_TYPE_SENSE_MASK; return 0; } /* Set up the GIC */ void __init gic_init(void) { static const struct dt_device_match gic_ids[] __initconst = { DT_MATCH_GIC, { /* sentinel */ }, }; struct dt_device_node *node; int res; node = dt_find_interrupt_controller(gic_ids); if ( !node ) panic("Unable to find compatible GIC in the device tree"); dt_device_set_used_by(node, DOMID_XEN); res = dt_device_get_address(node, 0, &gic.dbase, NULL); if ( res || !gic.dbase || (gic.dbase & ~PAGE_MASK) ) panic("GIC: Cannot find a valid address for the distributor"); res = dt_device_get_address(node, 1, &gic.cbase, NULL); if ( res || !gic.cbase || (gic.cbase & ~PAGE_MASK) ) panic("GIC: Cannot find a valid address for the CPU"); res = dt_device_get_address(node, 2, &gic.hbase, NULL); if ( res || !gic.hbase || (gic.hbase & ~PAGE_MASK) ) panic("GIC: Cannot find a valid address for the hypervisor"); res = dt_device_get_address(node, 3, &gic.vbase, NULL); if ( res || !gic.vbase || (gic.vbase & ~PAGE_MASK) ) panic("GIC: Cannot find a valid address for the virtual CPU"); res = dt_device_get_irq(node, 0, &gic.maintenance); if ( res ) panic("GIC: Cannot find the maintenance IRQ"); /* Set the GIC as the primary interrupt controller */ dt_interrupt_controller = node; /* TODO: Add check on distributor, cpu size */ printk("GIC initialization:\n" " gic_dist_addr=%"PRIpaddr"\n" " gic_cpu_addr=%"PRIpaddr"\n" " gic_hyp_addr=%"PRIpaddr"\n" " gic_vcpu_addr=%"PRIpaddr"\n" " gic_maintenance_irq=%u\n", gic.dbase, gic.cbase, gic.hbase, gic.vbase, gic.maintenance.irq); if ( (gic.dbase & ~PAGE_MASK) || (gic.cbase & ~PAGE_MASK) || (gic.hbase & ~PAGE_MASK) || (gic.vbase & ~PAGE_MASK) ) panic("GIC interfaces not page aligned"); set_fixmap(FIXMAP_GICD, gic.dbase >> PAGE_SHIFT, DEV_SHARED); BUILD_BUG_ON(FIXMAP_ADDR(FIXMAP_GICC1) != FIXMAP_ADDR(FIXMAP_GICC2)-PAGE_SIZE); set_fixmap(FIXMAP_GICC1, gic.cbase >> PAGE_SHIFT, DEV_SHARED); if ( platform_has_quirk(PLATFORM_QUIRK_GIC_64K_STRIDE) ) set_fixmap(FIXMAP_GICC2, (gic.cbase >> PAGE_SHIFT) + 0x10, DEV_SHARED); else set_fixmap(FIXMAP_GICC2, (gic.cbase >> PAGE_SHIFT) + 0x1, DEV_SHARED); set_fixmap(FIXMAP_GICH, gic.hbase >> PAGE_SHIFT, DEV_SHARED); /* Global settings: interrupt distributor */ spin_lock_init(&gic.lock); spin_lock(&gic.lock); gic_dist_init(); gic_cpu_init(); gic_hyp_init(); spin_unlock(&gic.lock); } void send_SGI_mask(const cpumask_t *cpumask, enum gic_sgi sgi) { 
unsigned int mask = 0; cpumask_t online_mask; ASSERT(sgi < 16); /* There are only 16 SGIs */ cpumask_and(&online_mask, cpumask, &cpu_online_map); mask = gic_cpu_mask(&online_mask); dsb(); GICD[GICD_SGIR] = GICD_SGI_TARGET_LIST | (mask<handler->shutdown(desc); spin_lock_irqsave(&desc->lock,flags); action = desc->action; desc->action = NULL; desc->status &= ~IRQ_GUEST; spin_unlock_irqrestore(&desc->lock,flags); /* Wait to make sure it's not being used on another CPU */ do { smp_mb(); } while ( desc->status & IRQ_INPROGRESS ); if (action && action->free_on_release) xfree(action); } static int __setup_irq(struct irq_desc *desc, unsigned int irq, struct irqaction *new) { if ( desc->action != NULL ) return -EBUSY; desc->action = new; dsb(); return 0; } int __init setup_dt_irq(const struct dt_irq *irq, struct irqaction *new) { int rc; unsigned long flags; struct irq_desc *desc; desc = irq_to_desc(irq->irq); spin_lock_irqsave(&desc->lock, flags); rc = __setup_irq(desc, irq->irq, new); spin_unlock_irqrestore(&desc->lock, flags); desc->handler->startup(desc); return rc; } static inline void gic_set_lr(int lr, unsigned int virtual_irq, unsigned int state, unsigned int priority) { int maintenance_int = GICH_LR_MAINTENANCE_IRQ; struct pending_irq *p = irq_to_pending(current, virtual_irq); BUG_ON(lr >= nr_lrs); BUG_ON(lr < 0); BUG_ON(state & ~(GICH_LR_STATE_MASK<> 3) << GICH_LR_PRIORITY_SHIFT) | ((virtual_irq & GICH_LR_VIRTUAL_MASK) << GICH_LR_VIRTUAL_SHIFT); set_bit(GIC_IRQ_GUEST_VISIBLE, &p->status); clear_bit(GIC_IRQ_GUEST_PENDING, &p->status); } static inline void gic_add_to_lr_pending(struct vcpu *v, unsigned int irq, unsigned int priority) { struct pending_irq *iter, *n = irq_to_pending(v, irq); if ( !list_empty(&n->lr_queue) ) return; list_for_each_entry ( iter, &v->arch.vgic.lr_pending, lr_queue ) { if ( iter->priority > priority ) { list_add_tail(&n->lr_queue, &iter->lr_queue); return; } } list_add_tail(&n->lr_queue, &v->arch.vgic.lr_pending); } void gic_remove_from_queues(struct vcpu *v, unsigned int virtual_irq) { struct pending_irq *p = irq_to_pending(v, virtual_irq); unsigned long flags; spin_lock_irqsave(&gic.lock, flags); if ( !list_empty(&p->lr_queue) ) list_del_init(&p->lr_queue); spin_unlock_irqrestore(&gic.lock, flags); } void gic_set_guest_irq(struct vcpu *v, unsigned int virtual_irq, unsigned int state, unsigned int priority) { int i; unsigned long flags; spin_lock_irqsave(&gic.lock, flags); if ( v == current && list_empty(&v->arch.vgic.lr_pending) ) { i = find_first_zero_bit(&this_cpu(lr_mask), nr_lrs); if (i < nr_lrs) { set_bit(i, &this_cpu(lr_mask)); gic_set_lr(i, virtual_irq, state, priority); goto out; } } gic_add_to_lr_pending(v, virtual_irq, priority); out: spin_unlock_irqrestore(&gic.lock, flags); return; } static void gic_restore_pending_irqs(struct vcpu *v) { int i; struct pending_irq *p, *t; unsigned long flags; list_for_each_entry_safe ( p, t, &v->arch.vgic.lr_pending, lr_queue ) { i = find_first_zero_bit(&this_cpu(lr_mask), nr_lrs); if ( i >= nr_lrs ) return; spin_lock_irqsave(&gic.lock, flags); gic_set_lr(i, p->irq, GICH_LR_PENDING, p->priority); list_del_init(&p->lr_queue); set_bit(i, &this_cpu(lr_mask)); spin_unlock_irqrestore(&gic.lock, flags); } } void gic_clear_pending_irqs(struct vcpu *v) { struct pending_irq *p, *t; unsigned long flags; spin_lock_irqsave(&gic.lock, flags); v->arch.lr_mask = 0; list_for_each_entry_safe ( p, t, &v->arch.vgic.lr_pending, lr_queue ) list_del_init(&p->lr_queue); spin_unlock_irqrestore(&gic.lock, flags); } static void 
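/*
 * Self-contained sketch of the priority-ordered insertion done by
 * gic_add_to_lr_pending() above: the lr_pending list is kept sorted so
 * that the most urgent entry is picked first when a list register
 * frees up.  A simple singly linked list stands in for Xen's
 * list_head here.
 */
#include <stddef.h>

struct demo_pending {
    unsigned int irq;
    unsigned int priority;            /* lower value == more urgent */
    struct demo_pending *next;
};

static void demo_insert_sorted(struct demo_pending **head,
                               struct demo_pending *n)
{
    struct demo_pending **pp = head;

    while ( *pp && (*pp)->priority <= n->priority )
        pp = &(*pp)->next;

    n->next = *pp;
    *pp = n;
}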
gic_inject_irq_start(void) { register_t hcr = READ_SYSREG(HCR_EL2); WRITE_SYSREG(hcr | HCR_VI, HCR_EL2); isb(); } static void gic_inject_irq_stop(void) { register_t hcr = READ_SYSREG(HCR_EL2); if (hcr & HCR_VI) { WRITE_SYSREG(hcr & ~HCR_VI, HCR_EL2); isb(); } } int gic_events_need_delivery(void) { return (!list_empty(¤t->arch.vgic.lr_pending) || this_cpu(lr_mask)); } void gic_inject(void) { if ( vcpu_info(current, evtchn_upcall_pending) ) vgic_vcpu_inject_irq(current, current->domain->arch.evtchn_irq, 1); gic_restore_pending_irqs(current); if (!gic_events_need_delivery()) gic_inject_irq_stop(); else gic_inject_irq_start(); } int gic_route_irq_to_guest(struct domain *d, const struct dt_irq *irq, const char * devname) { struct irqaction *action; struct irq_desc *desc = irq_to_desc(irq->irq); unsigned long flags; int retval; bool_t level; struct pending_irq *p; action = xmalloc(struct irqaction); if (!action) return -ENOMEM; action->dev_id = d; action->name = devname; action->free_on_release = 1; spin_lock_irqsave(&desc->lock, flags); spin_lock(&gic.lock); desc->handler = &gic_guest_irq_type; desc->status |= IRQ_GUEST; level = dt_irq_is_level_triggered(irq); gic_set_irq_properties(irq->irq, level, cpumask_of(smp_processor_id()), 0xa0); retval = __setup_irq(desc, irq->irq, action); if (retval) { xfree(action); goto out; } /* TODO: do not assume delivery to vcpu0 */ p = irq_to_pending(d->vcpu[0], irq->irq); p->desc = desc; out: spin_unlock(&gic.lock); spin_unlock_irqrestore(&desc->lock, flags); return retval; } static void do_sgi(struct cpu_user_regs *regs, int othercpu, enum gic_sgi sgi) { /* Lower the priority */ GICC[GICC_EOIR] = sgi; switch (sgi) { case GIC_SGI_EVENT_CHECK: /* Nothing to do, will check for events on return path */ break; case GIC_SGI_DUMP_STATE: dump_execstate(regs); break; case GIC_SGI_CALL_FUNCTION: smp_call_function_interrupt(); break; default: panic("Unhandled SGI %d on CPU%d", sgi, smp_processor_id()); break; } /* Deactivate */ GICC[GICC_DIR] = sgi; } /* Accept an interrupt from the GIC and dispatch its handler */ void gic_interrupt(struct cpu_user_regs *regs, int is_fiq) { uint32_t intack; unsigned int irq; do { intack = GICC[GICC_IAR]; irq = intack & GICC_IA_IRQ; if ( likely(irq >= 16 && irq < 1021) ) { local_irq_enable(); do_IRQ(regs, irq, is_fiq); local_irq_disable(); } else if (unlikely(irq < 16)) { unsigned int cpu = (intack & GICC_IA_CPU_MASK) >> GICC_IA_CPU_SHIFT; do_sgi(regs, cpu, irq); } else { local_irq_disable(); break; } } while (1); } int gicv_setup(struct domain *d) { int ret; /* * Domain 0 gets the hardware address. * Guests get the virtual platform layout. */ if ( d->domain_id == 0 ) { d->arch.vgic.dbase = gic.dbase; d->arch.vgic.cbase = gic.cbase; } else { d->arch.vgic.dbase = GUEST_GICD_BASE; d->arch.vgic.cbase = GUEST_GICC_BASE; } d->arch.vgic.nr_lines = 0; /* * Map the gic virtual cpu interface in the gic cpu interface * region of the guest. * * The second page is always mapped at +4K irrespective of the * GIC_64K_STRIDE quirk. The DTB passed to the guest reflects this. 
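/*
 * Sketch of the acknowledge/dispatch loop in gic_interrupt() above:
 * interrupt IDs are read from GICC_IAR until a spurious ID (>= 1021)
 * comes back; SGIs (IDs 0-15) carry the sending CPU in bits [12:10] of
 * the acknowledge value.  The three helpers are hypothetical stubs for
 * the real register read and handlers.
 */
#include <stdint.h>

extern uint32_t demo_read_iar(void);                 /* GICC[GICC_IAR] stand-in */
extern void demo_do_irq(unsigned int irq);
extern void demo_do_sgi(unsigned int sgi, unsigned int sender);

static void demo_gic_interrupt(void)
{
    for ( ;; )
    {
        uint32_t intack = demo_read_iar();
        unsigned int irq = intack & 0x3ff;           /* interrupt ID field */

        if ( irq >= 16 && irq < 1021 )
            demo_do_irq(irq);
        else if ( irq < 16 )
            demo_do_sgi(irq, (intack >> 10) & 0x7);  /* sender CPU ID */
        else
            break;                                   /* spurious: nothing left */
    }
}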
*/ ret = map_mmio_regions(d, d->arch.vgic.cbase, d->arch.vgic.cbase + PAGE_SIZE - 1, gic.vbase); if (ret) return ret; if ( !platform_has_quirk(PLATFORM_QUIRK_GIC_64K_STRIDE) ) ret = map_mmio_regions(d, d->arch.vgic.cbase + PAGE_SIZE, d->arch.vgic.cbase + (2 * PAGE_SIZE) - 1, gic.vbase + PAGE_SIZE); else ret = map_mmio_regions(d, d->arch.vgic.cbase + PAGE_SIZE, d->arch.vgic.cbase + (2 * PAGE_SIZE) - 1, gic.vbase + 16*PAGE_SIZE); return ret; } static void gic_irq_eoi(void *info) { int virq = (uintptr_t) info; GICC[GICC_DIR] = virq; } static void maintenance_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) { int i = 0, virq, pirq = -1; uint32_t lr; struct vcpu *v = current; uint64_t eisr = GICH[GICH_EISR0] | (((uint64_t) GICH[GICH_EISR1]) << 32); while ((i = find_next_bit((const long unsigned int *) &eisr, 64, i)) < 64) { struct pending_irq *p, *p2; int cpu; bool_t inflight; cpu = -1; inflight = 0; spin_lock_irq(&gic.lock); lr = GICH[GICH_LR + i]; virq = lr & GICH_LR_VIRTUAL_MASK; GICH[GICH_LR + i] = 0; clear_bit(i, &this_cpu(lr_mask)); p = irq_to_pending(v, virq); if ( p->desc != NULL ) { p->desc->status &= ~IRQ_INPROGRESS; /* Assume only one pcpu needs to EOI the irq */ cpu = p->desc->arch.eoi_cpu; pirq = p->desc->irq; } if ( test_bit(GIC_IRQ_GUEST_PENDING, &p->status) && test_bit(GIC_IRQ_GUEST_ENABLED, &p->status)) { inflight = 1; gic_add_to_lr_pending(v, virq, p->priority); } clear_bit(GIC_IRQ_GUEST_VISIBLE, &p->status); if ( !list_empty(&v->arch.vgic.lr_pending) ) { p2 = list_entry(v->arch.vgic.lr_pending.next, typeof(*p2), lr_queue); gic_set_lr(i, p2->irq, GICH_LR_PENDING, p2->priority); list_del_init(&p2->lr_queue); set_bit(i, &this_cpu(lr_mask)); } spin_unlock_irq(&gic.lock); if ( !inflight ) { spin_lock_irq(&v->arch.vgic.lock); list_del_init(&p->inflight); spin_unlock_irq(&v->arch.vgic.lock); } if ( p->desc != NULL ) { /* this is not racy because we can't receive another irq of the * same type until we EOI it. */ if ( cpu == smp_processor_id() ) gic_irq_eoi((void*)(uintptr_t)pirq); else on_selected_cpus(cpumask_of(cpu), gic_irq_eoi, (void*)(uintptr_t)pirq, 0); } i++; } } void gic_dump_info(struct vcpu *v) { int i; struct pending_irq *p; printk("GICH_LRs (vcpu %d) mask=%"PRIx64"\n", v->vcpu_id, v->arch.lr_mask); if ( v == current ) { for ( i = 0; i < nr_lrs; i++ ) printk(" HW_LR[%d]=%x\n", i, GICH[GICH_LR + i]); } else { for ( i = 0; i < nr_lrs; i++ ) printk(" VCPU_LR[%d]=%x\n", i, v->arch.gic_lr[i]); } list_for_each_entry ( p, &v->arch.vgic.inflight_irqs, inflight ) { printk("Inflight irq=%d\n", p->irq); } list_for_each_entry( p, &v->arch.vgic.lr_pending, lr_queue ) { printk("Pending irq=%d\n", p->irq); } } void __cpuinit init_maintenance_interrupt(void) { request_dt_irq(&gic.maintenance, maintenance_interrupt, "irq-maintenance", NULL); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/vtimer.c0000664000175000017500000001652412307313555014413 0ustar smbsmb/* * xen/arch/arm/vtimer.c * * ARM Virtual Timer emulation support * * Ian Campbell * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include extern s_time_t ticks_to_ns(uint64_t ticks); extern uint64_t ns_to_ticks(s_time_t ns); static void phys_timer_expired(void *data) { struct vtimer *t = data; t->ctl |= CNTx_CTL_PENDING; if ( !(t->ctl & CNTx_CTL_MASK) ) vgic_vcpu_inject_irq(t->v, t->irq, 1); } static void virt_timer_expired(void *data) { struct vtimer *t = data; t->ctl |= CNTx_CTL_MASK; vgic_vcpu_inject_irq(t->v, t->irq, 1); } int vcpu_domain_init(struct domain *d) { d->arch.phys_timer_base.offset = NOW(); d->arch.virt_timer_base.offset = READ_SYSREG64(CNTPCT_EL0); return 0; } int vcpu_vtimer_init(struct vcpu *v) { struct vtimer *t = &v->arch.phys_timer; bool_t d0 = (v->domain == dom0); /* * Domain 0 uses the hardware interrupts, guests get the virtual platform. */ init_timer(&t->timer, phys_timer_expired, t, v->processor); t->ctl = 0; t->cval = NOW(); t->irq = d0 ? timer_dt_irq(TIMER_PHYS_NONSECURE_PPI)->irq : GUEST_TIMER_PHYS_NS_PPI; t->v = v; t = &v->arch.virt_timer; init_timer(&t->timer, virt_timer_expired, t, v->processor); t->ctl = 0; t->irq = d0 ? timer_dt_irq(TIMER_VIRT_PPI)->irq : GUEST_TIMER_VIRT_PPI; t->v = v; return 0; } void vcpu_timer_destroy(struct vcpu *v) { kill_timer(&v->arch.virt_timer.timer); kill_timer(&v->arch.phys_timer.timer); } int virt_timer_save(struct vcpu *v) { if ( is_idle_domain(v->domain) ) return 0; v->arch.virt_timer.ctl = READ_SYSREG32(CNTV_CTL_EL0); WRITE_SYSREG32(v->arch.virt_timer.ctl & ~CNTx_CTL_ENABLE, CNTV_CTL_EL0); v->arch.virt_timer.cval = READ_SYSREG64(CNTV_CVAL_EL0); if ( (v->arch.virt_timer.ctl & CNTx_CTL_ENABLE) && !(v->arch.virt_timer.ctl & CNTx_CTL_MASK)) { set_timer(&v->arch.virt_timer.timer, ticks_to_ns(v->arch.virt_timer.cval + v->domain->arch.virt_timer_base.offset - boot_count)); } return 0; } int virt_timer_restore(struct vcpu *v) { if ( is_idle_domain(v->domain) ) return 0; stop_timer(&v->arch.virt_timer.timer); migrate_timer(&v->arch.virt_timer.timer, v->processor); migrate_timer(&v->arch.phys_timer.timer, v->processor); WRITE_SYSREG64(v->domain->arch.virt_timer_base.offset, CNTVOFF_EL2); WRITE_SYSREG64(v->arch.virt_timer.cval, CNTV_CVAL_EL0); WRITE_SYSREG32(v->arch.virt_timer.ctl, CNTV_CTL_EL0); return 0; } static void vtimer_cntp_ctl(struct cpu_user_regs *regs, uint32_t *r, int read) { struct vcpu *v = current; if ( read ) { *r = v->arch.phys_timer.ctl; } else { uint32_t ctl = *r & ~CNTx_CTL_PENDING; if ( ctl & CNTx_CTL_ENABLE ) ctl |= v->arch.phys_timer.ctl & CNTx_CTL_PENDING; v->arch.phys_timer.ctl = ctl; if ( v->arch.phys_timer.ctl & CNTx_CTL_ENABLE ) { set_timer(&v->arch.phys_timer.timer, v->arch.phys_timer.cval + v->domain->arch.phys_timer_base.offset); } else stop_timer(&v->arch.phys_timer.timer); } } static void vtimer_cntp_tval(struct cpu_user_regs *regs, uint32_t *r, int read) { struct vcpu *v = current; s_time_t now; now = NOW() - v->domain->arch.phys_timer_base.offset; if ( read ) { *r = (uint32_t)(ns_to_ticks(v->arch.phys_timer.cval - now) & 0xffffffffull); } else { v->arch.phys_timer.cval = now + ticks_to_ns(*r); if ( v->arch.phys_timer.ctl & CNTx_CTL_ENABLE ) { v->arch.phys_timer.ctl &= ~CNTx_CTL_PENDING; set_timer(&v->arch.phys_timer.timer, v->arch.phys_timer.cval + v->domain->arch.phys_timer_base.offset); } } } static int 
vtimer_cntpct(struct cpu_user_regs *regs, uint64_t *r, int read) { struct vcpu *v = current; uint64_t ticks; s_time_t now; if ( read ) { now = NOW() - v->domain->arch.phys_timer_base.offset; ticks = ns_to_ticks(now); *r = ticks; return 1; } else { gdprintk(XENLOG_DEBUG, "WRITE to R/O CNTPCT\n"); return 0; } } static int vtimer_emulate_cp32(struct cpu_user_regs *regs, union hsr hsr) { struct hsr_cp32 cp32 = hsr.cp32; uint32_t *r = (uint32_t *)select_user_reg(regs, cp32.reg); switch ( hsr.bits & HSR_CP32_REGS_MASK ) { case HSR_CPREG32(CNTP_CTL): vtimer_cntp_ctl(regs, r, cp32.read); return 1; case HSR_CPREG32(CNTP_TVAL): vtimer_cntp_tval(regs, r, cp32.read); return 1; default: return 0; } } static int vtimer_emulate_cp64(struct cpu_user_regs *regs, union hsr hsr) { struct hsr_cp64 cp64 = hsr.cp64; uint32_t *r1 = (uint32_t *)select_user_reg(regs, cp64.reg1); uint32_t *r2 = (uint32_t *)select_user_reg(regs, cp64.reg2); uint64_t x; switch ( hsr.bits & HSR_CP64_REGS_MASK ) { case HSR_CPREG64(CNTPCT): if (!vtimer_cntpct(regs, &x, cp64.read)) return 0; if ( cp64.read ) { *r1 = (uint32_t)(x & 0xffffffff); *r2 = (uint32_t)(x >> 32); } return 1; default: return 0; } } #ifdef CONFIG_ARM_64 static int vtimer_emulate_sysreg(struct cpu_user_regs *regs, union hsr hsr) { struct hsr_sysreg sysreg = hsr.sysreg; register_t *x = select_user_reg(regs, sysreg.reg); uint32_t r = (uint32_t)*x; switch ( hsr.bits & HSR_SYSREG_REGS_MASK ) { case HSR_SYSREG_CNTP_CTL_EL0: vtimer_cntp_ctl(regs, &r, sysreg.read); if ( sysreg.read ) *x = r; return 1; case HSR_SYSREG_CNTP_TVAL_EL0: vtimer_cntp_tval(regs, &r, sysreg.read); if ( sysreg.read ) *x = r; return 1; case HSR_SYSREG_CNTPCT_EL0: return vtimer_cntpct(regs, x, sysreg.read); default: return 0; } } #endif int vtimer_emulate(struct cpu_user_regs *regs, union hsr hsr) { switch (hsr.ec) { case HSR_EC_CP15_32: if ( !is_pv32_domain(current->domain) ) return 0; return vtimer_emulate_cp32(regs, hsr); case HSR_EC_CP15_64: if ( !is_pv32_domain(current->domain) ) return 0; return vtimer_emulate_cp64(regs, hsr); #ifdef CONFIG_ARM_64 case HSR_EC_SYSREG: if ( is_pv32_domain(current->domain) ) return 0; return vtimer_emulate_sysreg(regs, hsr); #endif default: return 0; } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/dtb.S0000664000175000017500000000007512307313555013630 0ustar smbsmb .section .dtb,#alloc .incbin CONFIG_DTB_FILE xen-4.4.0/xen/arch/arm/physdev.c0000664000175000017500000000104612307313555014560 0ustar smbsmb/****************************************************************************** * Arch-specific physdev.c * * Copyright (c) 2012, Citrix Systems */ #include #include #include #include #include int do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { printk("%s %d cmd=%d: not implemented yet\n", __func__, __LINE__, cmd); return -ENOSYS; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/vuart.c0000664000175000017500000001006412307313555014237 0ustar smbsmb/* * xen/arch/arm/vuart.c * * Virtual UART Emulator. * * This emulator uses the information from dtuart. This is not intended to be * a full emulation of an UART device. Rather it is intended to provide a * sufficient veneer of one that early code (such as Linux's boot time * decompressor) which hardcodes output directly to such a device are able to * make progress. 
* * The minimal register set to emulate an UART are: * - Single byte transmit register * - Single status register * * /!\ This device is not intended to be enumerable or exposed to the OS * (e.g. via Device Tree). * * Julien Grall * Ian Campbell * Copyright (c) 2012 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include "vuart.h" #include "io.h" #define domain_has_vuart(d) ((d)->arch.vuart.info != NULL) int domain_vuart_init(struct domain *d) { ASSERT( !d->domain_id ); d->arch.vuart.info = serial_vuart_info(SERHND_DTUART); if ( !d->arch.vuart.info ) return 0; spin_lock_init(&d->arch.vuart.lock); d->arch.vuart.idx = 0; d->arch.vuart.buf = xzalloc_array(char, VUART_BUF_SIZE); if ( !d->arch.vuart.buf ) return -ENOMEM; return 0; } void domain_vuart_free(struct domain *d) { if ( !domain_has_vuart(d) ) return; xfree(d->arch.vuart.buf); } static void vuart_print_char(struct vcpu *v, char c) { struct domain *d = v->domain; struct vuart *uart = &d->arch.vuart; /* Accept only printable characters, newline, and horizontal tab. */ if ( !isprint(c) && (c != '\n') && (c != '\t') ) return ; spin_lock(&uart->lock); uart->buf[uart->idx++] = c; if ( (uart->idx == (VUART_BUF_SIZE - 2)) || (c == '\n') ) { if ( c != '\n' ) uart->buf[uart->idx++] = '\n'; uart->buf[uart->idx] = '\0'; printk(XENLOG_G_DEBUG "DOM%u: %s", d->domain_id, uart->buf); uart->idx = 0; } spin_unlock(&uart->lock); } static int vuart_mmio_check(struct vcpu *v, paddr_t addr) { const struct vuart_info *info = v->domain->arch.vuart.info; return (domain_has_vuart(v->domain) && addr >= info->base_addr && addr <= (info->base_addr + info->size)); } static int vuart_mmio_read(struct vcpu *v, mmio_info_t *info) { struct domain *d = v->domain; struct hsr_dabt dabt = info->dabt; struct cpu_user_regs *regs = guest_cpu_user_regs(); register_t *r = select_user_reg(regs, dabt.reg); paddr_t offset = info->gpa - d->arch.vuart.info->base_addr; /* By default zeroed the register */ *r = 0; if ( offset == d->arch.vuart.info->status_off ) /* All holding registers empty, ready to send etc */ *r = d->arch.vuart.info->status; return 1; } static int vuart_mmio_write(struct vcpu *v, mmio_info_t *info) { struct domain *d = v->domain; struct hsr_dabt dabt = info->dabt; struct cpu_user_regs *regs = guest_cpu_user_regs(); register_t *r = select_user_reg(regs, dabt.reg); paddr_t offset = info->gpa - d->arch.vuart.info->base_addr; if ( offset == d->arch.vuart.info->data_off ) /* ignore any status bits */ vuart_print_char(v, *r & 0xFF); return 1; } const struct mmio_handler vuart_mmio_handler = { .check_handler = vuart_mmio_check, .read_handler = vuart_mmio_read, .write_handler = vuart_mmio_write, }; /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/device.c0000664000175000017500000000350312307313555014335 0ustar smbsmb/* * xen/arch/arm/device.c * * Helpers to use a device retrieved via the device tree. * * Julien Grall * Copyright (C) 2013 Linaro Limited. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include extern const struct device_desc _sdevice[], _edevice[]; static bool_t __init device_is_compatible(const struct device_desc *desc, const struct dt_device_node *dev) { const char *const *compat; if ( !desc->compatible ) return 0; for ( compat = desc->compatible; *compat; compat++ ) { if ( dt_device_is_compatible(dev, *compat) ) return 1; } return 0; } int __init device_init(struct dt_device_node *dev, enum device_type type, const void *data) { const struct device_desc *desc; ASSERT(dev != NULL); if ( !dt_device_is_available(dev) ) return -ENODEV; for ( desc = _sdevice; desc != _edevice; desc++ ) { if ( desc->type != type ) continue; if ( device_is_compatible(desc, dev) ) { ASSERT(desc->init != NULL); return desc->init(dev, data); } } return -EBADF; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/arm/Rules.mk0000664000175000017500000000502512307313555014356 0ustar smbsmb######################################## # arm-specific definitions # # If you change any of these configuration options then you must # 'make clean' before rebuilding. # HAS_DEVICE_TREE := y HAS_VIDEO := y HAS_ARM_HDLCD := y CFLAGS += -I$(BASEDIR)/include $(call cc-options-add,CFLAGS,CC,$(EMBEDDED_EXTRA_CFLAGS)) $(call cc-option-add,CFLAGS,CC,-Wnested-externs) arm := y ifeq ($(TARGET_SUBARCH),arm32) # Prevent floating-point variables from creeping into Xen. 
CFLAGS += -msoft-float CFLAGS += -mcpu=cortex-a15 arm32 := y arm64 := n endif ifeq ($(TARGET_SUBARCH),arm64) CFLAGS += -mcpu=generic arm32 := n arm64 := y endif ifneq ($(call cc-option,$(CC),-fvisibility=hidden,n),n) CFLAGS += -DGCC_HAS_VISIBILITY_ATTRIBUTE endif EARLY_PRINTK := n ifeq ($(debug),y) # Early printk for versatile express ifeq ($(CONFIG_EARLY_PRINTK), vexpress) EARLY_PRINTK_INC := pl011 EARLY_PRINTK_BAUD := 38400 EARLY_UART_BASE_ADDRESS := 0x1c090000 endif ifeq ($(CONFIG_EARLY_PRINTK), fastmodel) EARLY_PRINTK_INC := pl011 EARLY_PRINTK_INIT_UART := y EARLY_PRINTK_BAUD := 115200 EARLY_UART_BASE_ADDRESS := 0x1c090000 endif ifeq ($(CONFIG_EARLY_PRINTK), exynos5250) EARLY_PRINTK_INC := exynos4210 EARLY_PRINTK_INIT_UART := y EARLY_PRINTK_BAUD := 115200 EARLY_UART_BASE_ADDRESS := 0x12c20000 endif ifeq ($(CONFIG_EARLY_PRINTK), midway) EARLY_PRINTK_INC := pl011 EARLY_PRINTK_BAUD := 115200 EARLY_UART_BASE_ADDRESS := 0xfff36000 endif ifeq ($(CONFIG_EARLY_PRINTK), omap5432) EARLY_PRINTK_INC := 8250 EARLY_UART_BASE_ADDRESS := 0x48020000 EARLY_UART_REG_SHIFT := 2 endif ifeq ($(CONFIG_EARLY_PRINTK), sun6i) EARLY_PRINTK_INC := 8250 EARLY_UART_BASE_ADDRESS := 0x01c28000 EARLY_UART_REG_SHIFT := 2 endif ifeq ($(CONFIG_EARLY_PRINTK), sun7i) EARLY_PRINTK_INC := 8250 EARLY_UART_BASE_ADDRESS := 0x01c28000 EARLY_UART_REG_SHIFT := 2 endif ifeq ($(CONFIG_EARLY_PRINTK), brcm) EARLY_PRINTK_INC := 8250 EARLY_UART_BASE_ADDRESS := 0xF0406B00 EARLY_UART_REG_SHIFT := 2 endif ifeq ($(CONFIG_EARLY_PRINTK), xgene-storm) EARLY_PRINTK_INC := 8250 EARLY_PRINTK_BAUD := 115200 EARLY_UART_BASE_ADDRESS := 0x1c020000 EARLY_UART_REG_SHIFT := 2 endif ifneq ($(EARLY_PRINTK_INC),) EARLY_PRINTK := y endif CFLAGS-$(EARLY_PRINTK) += -DEARLY_PRINTK CFLAGS-$(EARLY_PRINTK_INIT_UART) += -DEARLY_PRINTK_INIT_UART CFLAGS-$(EARLY_PRINTK) += -DEARLY_PRINTK_INC=\"debug-$(EARLY_PRINTK_INC).inc\" CFLAGS-$(EARLY_PRINTK) += -DEARLY_PRINTK_BAUD=$(EARLY_PRINTK_BAUD) CFLAGS-$(EARLY_PRINTK) += -DEARLY_UART_BASE_ADDRESS=$(EARLY_UART_BASE_ADDRESS) CFLAGS-$(EARLY_PRINTK) += -DEARLY_UART_REG_SHIFT=$(EARLY_UART_REG_SHIFT) endif xen-4.4.0/xen/arch/arm/vpsci.c0000664000175000017500000000447712307313555014235 0ustar smbsmb/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #include #include #include #include #include #include int do_psci_cpu_on(uint32_t vcpuid, register_t entry_point) { struct vcpu *v; struct domain *d = current->domain; struct vcpu_guest_context *ctxt; int rc; int is_thumb = entry_point & 1; if ( (vcpuid < 0) || (vcpuid >= MAX_VIRT_CPUS) ) return PSCI_EINVAL; if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) return PSCI_EINVAL; /* THUMB set is not allowed with 64-bit domain */ if ( is_pv64_domain(d) && is_thumb ) return PSCI_EINVAL; if ( (ctxt = alloc_vcpu_guest_context()) == NULL ) return PSCI_DENIED; vgic_clear_pending_irqs(v); memset(ctxt, 0, sizeof(*ctxt)); ctxt->user_regs.pc64 = (u64) entry_point; ctxt->sctlr = SCTLR_GUEST_INIT; ctxt->ttbr0 = 0; ctxt->ttbr1 = 0; ctxt->ttbcr = 0; /* Defined Reset Value */ if ( is_pv32_domain(d) ) ctxt->user_regs.cpsr = PSR_GUEST32_INIT; #ifdef CONFIG_ARM_64 else ctxt->user_regs.cpsr = PSR_GUEST64_INIT; #endif /* Start the VCPU with THUMB set if it's requested by the kernel */ if ( is_thumb ) ctxt->user_regs.cpsr |= PSR_THUMB; ctxt->flags = VGCF_online; domain_lock(d); rc = arch_set_info_guest(v, ctxt); free_vcpu_guest_context(ctxt); if ( rc < 0 ) { domain_unlock(d); return PSCI_DENIED; } domain_unlock(d); vcpu_wake(v); return PSCI_SUCCESS; } int do_psci_cpu_off(uint32_t power_state) { struct vcpu *v = current; if ( !test_and_set_bit(_VPF_down, &v->pause_flags) ) vcpu_sleep_nosync(v); return PSCI_SUCCESS; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/0000775000175000017500000000000012307313555012577 5ustar smbsmbxen-4.4.0/xen/arch/x86/domctl.c0000664000175000017500000012407012307313555014231 0ustar smbsmb/****************************************************************************** * Arch-specific domctl.c * * Copyright (c) 2002-2006, K A Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for hvm_acpi_power_button */ #include /* for arch_do_domctl */ #include #include #include #include #include #include #include static int gdbsx_guest_mem_io( domid_t domid, struct xen_domctl_gdbsx_memio *iop) { ulong l_uva = (ulong)iop->uva; iop->remain = dbg_rw_mem( (dbgva_t)iop->gva, (dbgbyte_t *)l_uva, iop->len, domid, iop->gwr, iop->pgd3val); return (iop->remain ? 
-EFAULT : 0); } long arch_do_domctl( struct xen_domctl *domctl, struct domain *d, XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) { long ret = 0; bool_t copyback = 0; switch ( domctl->cmd ) { case XEN_DOMCTL_shadow_op: { ret = paging_domctl(d, &domctl->u.shadow_op, guest_handle_cast(u_domctl, void)); copyback = 1; } break; case XEN_DOMCTL_ioport_permission: { unsigned int fp = domctl->u.ioport_permission.first_port; unsigned int np = domctl->u.ioport_permission.nr_ports; int allow = domctl->u.ioport_permission.allow_access; ret = -EINVAL; if ( (fp + np) > 65536 ) break; if ( np == 0 ) ret = 0; else if ( xsm_ioport_permission(XSM_HOOK, d, fp, fp + np - 1, allow) ) ret = -EPERM; else if ( allow ) ret = ioports_permit_access(d, fp, fp + np - 1); else ret = ioports_deny_access(d, fp, fp + np - 1); } break; case XEN_DOMCTL_getpageframeinfo: { struct page_info *page; unsigned long mfn = domctl->u.getpageframeinfo.gmfn; ret = -EINVAL; if ( unlikely(!mfn_valid(mfn)) ) break; page = mfn_to_page(mfn); if ( likely(get_page(page, d)) ) { ret = 0; domctl->u.getpageframeinfo.type = XEN_DOMCTL_PFINFO_NOTAB; if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) { switch ( page->u.inuse.type_info & PGT_type_mask ) { case PGT_l1_page_table: domctl->u.getpageframeinfo.type = XEN_DOMCTL_PFINFO_L1TAB; break; case PGT_l2_page_table: domctl->u.getpageframeinfo.type = XEN_DOMCTL_PFINFO_L2TAB; break; case PGT_l3_page_table: domctl->u.getpageframeinfo.type = XEN_DOMCTL_PFINFO_L3TAB; break; case PGT_l4_page_table: domctl->u.getpageframeinfo.type = XEN_DOMCTL_PFINFO_L4TAB; break; } } put_page(page); } copyback = 1; } break; case XEN_DOMCTL_getpageframeinfo3: if (!has_32bit_shinfo(current->domain)) { unsigned int n, j; unsigned int num = domctl->u.getpageframeinfo3.num; struct page_info *page; xen_pfn_t *arr; if ( unlikely(num > 1024) || unlikely(num != domctl->u.getpageframeinfo3.num) ) { ret = -E2BIG; break; } page = alloc_domheap_page(NULL, 0); if ( !page ) { ret = -ENOMEM; break; } arr = __map_domain_page(page); for ( n = ret = 0; n < num; ) { unsigned int k = min_t(unsigned int, num - n, PAGE_SIZE / sizeof(*arr)); if ( copy_from_guest_offset(arr, domctl->u.getpageframeinfo3.array, n, k) ) { ret = -EFAULT; break; } for ( j = 0; j < k; j++ ) { unsigned long type = 0; p2m_type_t t; page = get_page_from_gfn(d, arr[j], &t, P2M_ALLOC); if ( unlikely(!page) || unlikely(is_xen_heap_page(page)) ) { if ( p2m_is_broken(t) ) type = XEN_DOMCTL_PFINFO_BROKEN; else type = XEN_DOMCTL_PFINFO_XTAB; } else { switch( page->u.inuse.type_info & PGT_type_mask ) { case PGT_l1_page_table: type = XEN_DOMCTL_PFINFO_L1TAB; break; case PGT_l2_page_table: type = XEN_DOMCTL_PFINFO_L2TAB; break; case PGT_l3_page_table: type = XEN_DOMCTL_PFINFO_L3TAB; break; case PGT_l4_page_table: type = XEN_DOMCTL_PFINFO_L4TAB; break; } if ( page->u.inuse.type_info & PGT_pinned ) type |= XEN_DOMCTL_PFINFO_LPINTAB; if ( page->count_info & PGC_broken ) type = XEN_DOMCTL_PFINFO_BROKEN; } if ( page ) put_page(page); arr[j] = type; } if ( copy_to_guest_offset(domctl->u.getpageframeinfo3.array, n, arr, k) ) { ret = -EFAULT; break; } n += k; } page = mfn_to_page(domain_page_map_to_mfn(arr)); unmap_domain_page(arr); free_domheap_page(page); break; } /* fall thru */ case XEN_DOMCTL_getpageframeinfo2: { int n,j; int num = domctl->u.getpageframeinfo2.num; uint32_t *arr32; if ( unlikely(num > 1024) ) { ret = -E2BIG; break; } arr32 = alloc_xenheap_page(); if ( !arr32 ) { ret = -ENOMEM; break; } ret = 0; for ( n = 0; n < num; ) { int k = PAGE_SIZE / 4; if ( (num - n) < k ) 
k = num - n; if ( copy_from_guest_offset(arr32, domctl->u.getpageframeinfo2.array, n, k) ) { ret = -EFAULT; break; } for ( j = 0; j < k; j++ ) { struct page_info *page; unsigned long gfn = arr32[j]; page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC); if ( domctl->cmd == XEN_DOMCTL_getpageframeinfo3) arr32[j] = 0; if ( unlikely(!page) || unlikely(is_xen_heap_page(page)) ) arr32[j] |= XEN_DOMCTL_PFINFO_XTAB; else { unsigned long type = 0; switch( page->u.inuse.type_info & PGT_type_mask ) { case PGT_l1_page_table: type = XEN_DOMCTL_PFINFO_L1TAB; break; case PGT_l2_page_table: type = XEN_DOMCTL_PFINFO_L2TAB; break; case PGT_l3_page_table: type = XEN_DOMCTL_PFINFO_L3TAB; break; case PGT_l4_page_table: type = XEN_DOMCTL_PFINFO_L4TAB; break; } if ( page->u.inuse.type_info & PGT_pinned ) type |= XEN_DOMCTL_PFINFO_LPINTAB; arr32[j] |= type; } if ( page ) put_page(page); } if ( copy_to_guest_offset(domctl->u.getpageframeinfo2.array, n, arr32, k) ) { ret = -EFAULT; break; } n += k; } free_xenheap_page(arr32); } break; case XEN_DOMCTL_getmemlist: { int i; unsigned long max_pfns = domctl->u.getmemlist.max_pfns; uint64_t mfn; struct page_info *page; if ( unlikely(d->is_dying) ) { ret = -EINVAL; break; } /* * XSA-74: This sub-hypercall is broken in several ways: * - lock order inversion (p2m locks inside page_alloc_lock) * - no preemption on huge max_pfns input * - not (re-)checking d->is_dying with page_alloc_lock held * - not honoring start_pfn input (which libxc also doesn't set) * Additionally it is rather useless, as the result is stale by the * time the caller gets to look at it. * As it only has a single, non-production consumer (xen-mceinj), * rather than trying to fix it we restrict it for the time being. */ if ( /* No nested locks inside copy_to_guest_offset(). */ paging_mode_external(current->domain) || /* Arbitrary limit capping processing time. 
*/ max_pfns > GB(4) / PAGE_SIZE ) { ret = -EOPNOTSUPP; break; } spin_lock(&d->page_alloc_lock); ret = i = 0; page_list_for_each(page, &d->page_list) { if ( i >= max_pfns ) break; mfn = page_to_mfn(page); if ( copy_to_guest_offset(domctl->u.getmemlist.buffer, i, &mfn, 1) ) { ret = -EFAULT; break; } ++i; } spin_unlock(&d->page_alloc_lock); domctl->u.getmemlist.num_pfns = i; copyback = 1; } break; case XEN_DOMCTL_hypercall_init: { unsigned long gmfn = domctl->u.hypercall_init.gmfn; struct page_info *page; void *hypercall_page; page = get_page_from_gfn(d, gmfn, NULL, P2M_ALLOC); ret = -EACCES; if ( !page || !get_page_type(page, PGT_writable_page) ) { if ( page ) put_page(page); break; } ret = 0; hypercall_page = __map_domain_page(page); hypercall_page_initialise(d, hypercall_page); unmap_domain_page(hypercall_page); put_page_and_type(page); } break; case XEN_DOMCTL_sethvmcontext: { struct hvm_domain_context c = { .size = domctl->u.hvmcontext.size }; ret = -EINVAL; if ( !is_hvm_domain(d) ) goto sethvmcontext_out; ret = -ENOMEM; if ( (c.data = xmalloc_bytes(c.size)) == NULL ) goto sethvmcontext_out; ret = -EFAULT; if ( copy_from_guest(c.data, domctl->u.hvmcontext.buffer, c.size) != 0) goto sethvmcontext_out; domain_pause(d); ret = hvm_load(d, &c); domain_unpause(d); sethvmcontext_out: if ( c.data != NULL ) xfree(c.data); } break; case XEN_DOMCTL_gethvmcontext: { struct hvm_domain_context c = { 0 }; ret = -EINVAL; if ( !is_hvm_domain(d) ) goto gethvmcontext_out; c.size = hvm_save_size(d); if ( guest_handle_is_null(domctl->u.hvmcontext.buffer) ) { /* Client is querying for the correct buffer size */ domctl->u.hvmcontext.size = c.size; ret = 0; goto gethvmcontext_out; } /* Check that the client has a big enough buffer */ ret = -ENOSPC; if ( domctl->u.hvmcontext.size < c.size ) goto gethvmcontext_out; /* Allocate our own marshalling buffer */ ret = -ENOMEM; if ( (c.data = xmalloc_bytes(c.size)) == NULL ) goto gethvmcontext_out; domain_pause(d); ret = hvm_save(d, &c); domain_unpause(d); domctl->u.hvmcontext.size = c.cur; if ( copy_to_guest(domctl->u.hvmcontext.buffer, c.data, c.size) != 0 ) ret = -EFAULT; gethvmcontext_out: copyback = 1; if ( c.data != NULL ) xfree(c.data); } break; case XEN_DOMCTL_gethvmcontext_partial: { ret = -EINVAL; if ( !is_hvm_domain(d) ) break; domain_pause(d); ret = hvm_save_one(d, domctl->u.hvmcontext_partial.type, domctl->u.hvmcontext_partial.instance, domctl->u.hvmcontext_partial.buffer); domain_unpause(d); } break; case XEN_DOMCTL_set_address_size: { switch ( domctl->u.address_size.size ) { case 32: ret = switch_compat(d); break; case 64: ret = switch_native(d); break; default: ret = (domctl->u.address_size.size == BITS_PER_LONG) ? 0 : -EINVAL; break; } } break; case XEN_DOMCTL_get_address_size: { domctl->u.address_size.size = is_pv_32on64_domain(d) ? 
32 : BITS_PER_LONG; ret = 0; copyback = 1; } break; case XEN_DOMCTL_set_machine_address_size: { ret = -EBUSY; if ( d->tot_pages > 0 ) break; d->arch.physaddr_bitsize = domctl->u.address_size.size; ret = 0; } break; case XEN_DOMCTL_get_machine_address_size: { domctl->u.address_size.size = d->arch.physaddr_bitsize; ret = 0; copyback = 1; } break; case XEN_DOMCTL_sendtrigger: { struct vcpu *v; ret = -EINVAL; if ( domctl->u.sendtrigger.vcpu >= MAX_VIRT_CPUS ) break; ret = -ESRCH; if ( domctl->u.sendtrigger.vcpu >= d->max_vcpus || (v = d->vcpu[domctl->u.sendtrigger.vcpu]) == NULL ) break; switch ( domctl->u.sendtrigger.trigger ) { case XEN_DOMCTL_SENDTRIGGER_NMI: { ret = 0; if ( !test_and_set_bool(v->nmi_pending) ) vcpu_kick(v); } break; case XEN_DOMCTL_SENDTRIGGER_POWER: { ret = -EINVAL; if ( is_hvm_domain(d) ) { ret = 0; hvm_acpi_power_button(d); } } break; case XEN_DOMCTL_SENDTRIGGER_SLEEP: { ret = -EINVAL; if ( is_hvm_domain(d) ) { ret = 0; hvm_acpi_sleep_button(d); } } break; default: ret = -ENOSYS; } } break; case XEN_DOMCTL_bind_pt_irq: { xen_domctl_bind_pt_irq_t *bind = &domctl->u.bind_pt_irq; int irq; ret = -EINVAL; if ( !is_hvm_domain(d) ) break; ret = xsm_bind_pt_irq(XSM_HOOK, d, bind); if ( ret ) break; irq = domain_pirq_to_irq(d, bind->machine_irq); ret = -EPERM; if ( irq <= 0 || !irq_access_permitted(current->domain, irq) ) break; ret = -ESRCH; if ( iommu_enabled ) { spin_lock(&pcidevs_lock); ret = pt_irq_create_bind(d, bind); spin_unlock(&pcidevs_lock); } if ( ret < 0 ) printk(XENLOG_G_ERR "pt_irq_create_bind failed (%ld) for dom%d\n", ret, d->domain_id); } break; case XEN_DOMCTL_unbind_pt_irq: { xen_domctl_bind_pt_irq_t *bind = &domctl->u.bind_pt_irq; int irq = domain_pirq_to_irq(d, bind->machine_irq); ret = -EPERM; if ( irq <= 0 || !irq_access_permitted(current->domain, irq) ) break; ret = xsm_unbind_pt_irq(XSM_HOOK, d, bind); if ( ret ) break; if ( iommu_enabled ) { spin_lock(&pcidevs_lock); ret = pt_irq_destroy_bind(d, bind); spin_unlock(&pcidevs_lock); } if ( ret < 0 ) printk(XENLOG_G_ERR "pt_irq_destroy_bind failed (%ld) for dom%d\n", ret, d->domain_id); } break; case XEN_DOMCTL_memory_mapping: { unsigned long gfn = domctl->u.memory_mapping.first_gfn; unsigned long mfn = domctl->u.memory_mapping.first_mfn; unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns; int add = domctl->u.memory_mapping.add_mapping; unsigned long i; ret = -EINVAL; if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */ ((mfn | (mfn + nr_mfns - 1)) >> (paddr_bits - PAGE_SHIFT)) || (gfn + nr_mfns - 1) < gfn ) /* wrap? 
*/ break; ret = -EPERM; if ( !iomem_access_permitted(current->domain, mfn, mfn + nr_mfns - 1) ) break; ret = xsm_iomem_mapping(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, add); if ( ret ) break; if ( add ) { printk(XENLOG_G_INFO "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n", d->domain_id, gfn, mfn, nr_mfns); ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); if ( !ret && paging_mode_translate(d) ) { for ( i = 0; !ret && i < nr_mfns; i++ ) if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) ) ret = -EIO; if ( ret ) { printk(XENLOG_G_WARNING "memory_map:fail: dom%d gfn=%lx mfn=%lx\n", d->domain_id, gfn + i, mfn + i); while ( i-- ) clear_mmio_p2m_entry(d, gfn + i); if ( iomem_deny_access(d, mfn, mfn + nr_mfns - 1) && is_hardware_domain(current->domain) ) printk(XENLOG_ERR "memory_map: failed to deny dom%d access to [%lx,%lx]\n", d->domain_id, mfn, mfn + nr_mfns - 1); } } } else { printk(XENLOG_G_INFO "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n", d->domain_id, gfn, mfn, nr_mfns); if ( paging_mode_translate(d) ) for ( i = 0; i < nr_mfns; i++ ) add |= !clear_mmio_p2m_entry(d, gfn + i); ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); if ( !ret && add ) ret = -EIO; if ( ret && is_hardware_domain(current->domain) ) printk(XENLOG_ERR "memory_map: error %ld %s dom%d access to [%lx,%lx]\n", ret, add ? "removing" : "denying", d->domain_id, mfn, mfn + nr_mfns - 1); } } break; case XEN_DOMCTL_ioport_mapping: { #define MAX_IOPORTS 0x10000 struct hvm_iommu *hd; unsigned int fgp = domctl->u.ioport_mapping.first_gport; unsigned int fmp = domctl->u.ioport_mapping.first_mport; unsigned int np = domctl->u.ioport_mapping.nr_ports; unsigned int add = domctl->u.ioport_mapping.add_mapping; struct g2m_ioport *g2m_ioport; int found = 0; ret = -EINVAL; if ( ((fgp | fmp | (np - 1)) >= MAX_IOPORTS) || ((fgp + np) > MAX_IOPORTS) || ((fmp + np) > MAX_IOPORTS) ) { printk(XENLOG_G_ERR "ioport_map:invalid:dom%d gport=%x mport=%x nr=%x\n", domctl->domain, fgp, fmp, np); break; } ret = -EPERM; if ( !ioports_access_permitted(current->domain, fmp, fmp + np - 1) ) break; ret = xsm_ioport_mapping(XSM_HOOK, d, fmp, fmp + np - 1, add); if ( ret ) break; hd = domain_hvm_iommu(d); if ( add ) { printk(XENLOG_G_INFO "ioport_map:add: dom%d gport=%x mport=%x nr=%x\n", d->domain_id, fgp, fmp, np); list_for_each_entry(g2m_ioport, &hd->g2m_ioport_list, list) if (g2m_ioport->mport == fmp ) { g2m_ioport->gport = fgp; g2m_ioport->np = np; found = 1; break; } if ( !found ) { g2m_ioport = xmalloc(struct g2m_ioport); if ( !g2m_ioport ) ret = -ENOMEM; } if ( !found && !ret ) { g2m_ioport->gport = fgp; g2m_ioport->mport = fmp; g2m_ioport->np = np; list_add_tail(&g2m_ioport->list, &hd->g2m_ioport_list); } if ( !ret ) ret = ioports_permit_access(d, fmp, fmp + np - 1); if ( ret && !found && g2m_ioport ) { list_del(&g2m_ioport->list); xfree(g2m_ioport); } } else { printk(XENLOG_G_INFO "ioport_map:remove: dom%d gport=%x mport=%x nr=%x\n", d->domain_id, fgp, fmp, np); list_for_each_entry(g2m_ioport, &hd->g2m_ioport_list, list) if ( g2m_ioport->mport == fmp ) { list_del(&g2m_ioport->list); xfree(g2m_ioport); break; } ret = ioports_deny_access(d, fmp, fmp + np - 1); if ( ret && is_hardware_domain(current->domain) ) printk(XENLOG_ERR "ioport_map: error %ld denying dom%d access to [%x,%x]\n", ret, d->domain_id, fmp, fmp + np - 1); } } break; case XEN_DOMCTL_pin_mem_cacheattr: { ret = hvm_set_mem_pinned_cacheattr( d, domctl->u.pin_mem_cacheattr.start, domctl->u.pin_mem_cacheattr.end, domctl->u.pin_mem_cacheattr.type); } break; case 
XEN_DOMCTL_set_ext_vcpucontext: case XEN_DOMCTL_get_ext_vcpucontext: { struct xen_domctl_ext_vcpucontext *evc; struct vcpu *v; evc = &domctl->u.ext_vcpucontext; ret = -ESRCH; if ( (evc->vcpu >= d->max_vcpus) || ((v = d->vcpu[evc->vcpu]) == NULL) ) break; if ( domctl->cmd == XEN_DOMCTL_get_ext_vcpucontext ) { if ( v == current ) /* no vcpu_pause() */ break; evc->size = sizeof(*evc); vcpu_pause(v); if ( is_pv_domain(d) ) { evc->sysenter_callback_cs = v->arch.pv_vcpu.sysenter_callback_cs; evc->sysenter_callback_eip = v->arch.pv_vcpu.sysenter_callback_eip; evc->sysenter_disables_events = v->arch.pv_vcpu.sysenter_disables_events; evc->syscall32_callback_cs = v->arch.pv_vcpu.syscall32_callback_cs; evc->syscall32_callback_eip = v->arch.pv_vcpu.syscall32_callback_eip; evc->syscall32_disables_events = v->arch.pv_vcpu.syscall32_disables_events; } else { evc->sysenter_callback_cs = 0; evc->sysenter_callback_eip = 0; evc->sysenter_disables_events = 0; evc->syscall32_callback_cs = 0; evc->syscall32_callback_eip = 0; evc->syscall32_disables_events = 0; } evc->vmce.caps = v->arch.vmce.mcg_cap; evc->vmce.mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2; evc->vmce.mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2; ret = 0; vcpu_unpause(v); copyback = 1; } else { if ( d == current->domain ) /* no domain_pause() */ break; ret = -EINVAL; if ( evc->size < offsetof(typeof(*evc), vmce) ) break; if ( is_pv_domain(d) ) { if ( !is_canonical_address(evc->sysenter_callback_eip) || !is_canonical_address(evc->syscall32_callback_eip) ) break; domain_pause(d); fixup_guest_code_selector(d, evc->sysenter_callback_cs); v->arch.pv_vcpu.sysenter_callback_cs = evc->sysenter_callback_cs; v->arch.pv_vcpu.sysenter_callback_eip = evc->sysenter_callback_eip; v->arch.pv_vcpu.sysenter_disables_events = evc->sysenter_disables_events; fixup_guest_code_selector(d, evc->syscall32_callback_cs); v->arch.pv_vcpu.syscall32_callback_cs = evc->syscall32_callback_cs; v->arch.pv_vcpu.syscall32_callback_eip = evc->syscall32_callback_eip; v->arch.pv_vcpu.syscall32_disables_events = evc->syscall32_disables_events; } else if ( (evc->sysenter_callback_cs & ~3) || evc->sysenter_callback_eip || (evc->syscall32_callback_cs & ~3) || evc->syscall32_callback_eip ) break; else domain_pause(d); BUILD_BUG_ON(offsetof(struct xen_domctl_ext_vcpucontext, mcg_cap) != offsetof(struct xen_domctl_ext_vcpucontext, vmce.caps)); BUILD_BUG_ON(sizeof(evc->mcg_cap) != sizeof(evc->vmce.caps)); if ( evc->size >= offsetof(typeof(*evc), vmce) + sizeof(evc->vmce) ) ret = vmce_restore_vcpu(v, &evc->vmce); else if ( evc->size >= offsetof(typeof(*evc), mcg_cap) + sizeof(evc->mcg_cap) ) { struct hvm_vmce_vcpu vmce = { .caps = evc->mcg_cap }; ret = vmce_restore_vcpu(v, &vmce); } else ret = 0; domain_unpause(d); } } break; case XEN_DOMCTL_set_cpuid: { xen_domctl_cpuid_t *ctl = &domctl->u.cpuid; cpuid_input_t *cpuid = NULL; int i; for ( i = 0; i < MAX_CPUID_INPUT; i++ ) { cpuid = &d->arch.cpuids[i]; if ( cpuid->input[0] == XEN_CPUID_INPUT_UNUSED ) break; if ( (cpuid->input[0] == ctl->input[0]) && ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) || (cpuid->input[1] == ctl->input[1])) ) break; } if ( i == MAX_CPUID_INPUT ) { ret = -ENOENT; } else { memcpy(cpuid, ctl, sizeof(cpuid_input_t)); ret = 0; } } break; case XEN_DOMCTL_gettscinfo: { xen_guest_tsc_info_t info; domain_pause(d); tsc_get_info(d, &info.tsc_mode, &info.elapsed_nsec, &info.gtsc_khz, &info.incarnation); if ( copy_to_guest(domctl->u.tsc_info.out_info, &info, 1) ) ret = -EFAULT; else ret = 0; domain_unpause(d); } break; case 
XEN_DOMCTL_settscinfo: { domain_pause(d); tsc_set_info(d, domctl->u.tsc_info.info.tsc_mode, domctl->u.tsc_info.info.elapsed_nsec, domctl->u.tsc_info.info.gtsc_khz, domctl->u.tsc_info.info.incarnation); domain_unpause(d); ret = 0; } break; case XEN_DOMCTL_suppress_spurious_page_faults: { d->arch.suppress_spurious_page_faults = 1; ret = 0; } break; case XEN_DOMCTL_debug_op: { struct vcpu *v; ret = -EINVAL; if ( (domctl->u.debug_op.vcpu >= d->max_vcpus) || ((v = d->vcpu[domctl->u.debug_op.vcpu]) == NULL) ) break; ret = -EINVAL; if ( !is_hvm_domain(d)) break; ret = hvm_debug_op(v, domctl->u.debug_op.op); } break; case XEN_DOMCTL_gdbsx_guestmemio: { domctl->u.gdbsx_guest_memio.remain = domctl->u.gdbsx_guest_memio.len; ret = gdbsx_guest_mem_io(domctl->domain, &domctl->u.gdbsx_guest_memio); if ( !ret ) copyback = 1; } break; case XEN_DOMCTL_gdbsx_pausevcpu: { struct vcpu *v; ret = -EBUSY; if ( !d->is_paused_by_controller ) break; ret = -EINVAL; if ( domctl->u.gdbsx_pauseunp_vcpu.vcpu >= MAX_VIRT_CPUS || (v = d->vcpu[domctl->u.gdbsx_pauseunp_vcpu.vcpu]) == NULL ) break; vcpu_pause(v); ret = 0; } break; case XEN_DOMCTL_gdbsx_unpausevcpu: { struct vcpu *v; ret = -EBUSY; if ( !d->is_paused_by_controller ) break; ret = -EINVAL; if ( domctl->u.gdbsx_pauseunp_vcpu.vcpu >= MAX_VIRT_CPUS || (v = d->vcpu[domctl->u.gdbsx_pauseunp_vcpu.vcpu]) == NULL ) break; if ( !atomic_read(&v->pause_count) ) printk("WARN: Unpausing vcpu:%d which is not paused\n", v->vcpu_id); vcpu_unpause(v); ret = 0; } break; case XEN_DOMCTL_gdbsx_domstatus: { struct vcpu *v; domctl->u.gdbsx_domstatus.vcpu_id = -1; domctl->u.gdbsx_domstatus.paused = d->is_paused_by_controller; if ( domctl->u.gdbsx_domstatus.paused ) { for_each_vcpu ( d, v ) { if ( v->arch.gdbsx_vcpu_event ) { domctl->u.gdbsx_domstatus.vcpu_id = v->vcpu_id; domctl->u.gdbsx_domstatus.vcpu_ev = v->arch.gdbsx_vcpu_event; v->arch.gdbsx_vcpu_event = 0; break; } } } ret = 0; copyback = 1; } break; case XEN_DOMCTL_setvcpuextstate: case XEN_DOMCTL_getvcpuextstate: { struct xen_domctl_vcpuextstate *evc; struct vcpu *v; uint32_t offset = 0; #define PV_XSAVE_SIZE(xcr0) (2 * sizeof(uint64_t) + xstate_ctxt_size(xcr0)) evc = &domctl->u.vcpuextstate; ret = -ESRCH; if ( (evc->vcpu >= d->max_vcpus) || ((v = d->vcpu[evc->vcpu]) == NULL) ) goto vcpuextstate_out; if ( domctl->cmd == XEN_DOMCTL_getvcpuextstate ) { unsigned int size = PV_XSAVE_SIZE(v->arch.xcr0_accum); if ( !evc->size && !evc->xfeature_mask ) { evc->xfeature_mask = xfeature_mask; evc->size = size; ret = 0; goto vcpuextstate_out; } if ( evc->size != size || evc->xfeature_mask != xfeature_mask ) { ret = -EINVAL; goto vcpuextstate_out; } if ( copy_to_guest_offset(domctl->u.vcpuextstate.buffer, offset, (void *)&v->arch.xcr0, sizeof(v->arch.xcr0)) ) { ret = -EFAULT; goto vcpuextstate_out; } offset += sizeof(v->arch.xcr0); if ( copy_to_guest_offset(domctl->u.vcpuextstate.buffer, offset, (void *)&v->arch.xcr0_accum, sizeof(v->arch.xcr0_accum)) ) { ret = -EFAULT; goto vcpuextstate_out; } offset += sizeof(v->arch.xcr0_accum); if ( copy_to_guest_offset(domctl->u.vcpuextstate.buffer, offset, (void *)v->arch.xsave_area, size - 2 * sizeof(uint64_t)) ) { ret = -EFAULT; goto vcpuextstate_out; } } else { void *receive_buf; uint64_t _xcr0, _xcr0_accum; const struct xsave_struct *_xsave_area; ret = -EINVAL; if ( evc->size < 2 * sizeof(uint64_t) || evc->size > 2 * sizeof(uint64_t) + xstate_ctxt_size(xfeature_mask) ) goto vcpuextstate_out; receive_buf = xmalloc_bytes(evc->size); if ( !receive_buf ) { ret = -ENOMEM; goto vcpuextstate_out; } if ( 
copy_from_guest_offset(receive_buf, domctl->u.vcpuextstate.buffer, offset, evc->size) ) { ret = -EFAULT; xfree(receive_buf); goto vcpuextstate_out; } _xcr0 = *(uint64_t *)receive_buf; _xcr0_accum = *(uint64_t *)(receive_buf + sizeof(uint64_t)); _xsave_area = receive_buf + 2 * sizeof(uint64_t); if ( _xcr0_accum ) { if ( evc->size >= 2 * sizeof(uint64_t) + XSTATE_AREA_MIN_SIZE ) ret = validate_xstate(_xcr0, _xcr0_accum, _xsave_area->xsave_hdr.xstate_bv, evc->xfeature_mask); } else if ( !_xcr0 ) ret = 0; if ( ret ) { xfree(receive_buf); goto vcpuextstate_out; } if ( evc->size <= PV_XSAVE_SIZE(_xcr0_accum) ) { v->arch.xcr0 = _xcr0; v->arch.xcr0_accum = _xcr0_accum; if ( _xcr0_accum & XSTATE_NONLAZY ) v->arch.nonlazy_xstate_used = 1; memcpy(v->arch.xsave_area, _xsave_area, evc->size - 2 * sizeof(uint64_t)); } else ret = -EINVAL; xfree(receive_buf); } ret = 0; vcpuextstate_out: if ( domctl->cmd == XEN_DOMCTL_getvcpuextstate ) copyback = 1; } break; case XEN_DOMCTL_mem_event_op: { ret = mem_event_domctl(d, &domctl->u.mem_event_op, guest_handle_cast(u_domctl, void)); copyback = 1; } break; case XEN_DOMCTL_mem_sharing_op: { ret = mem_sharing_domctl(d, &domctl->u.mem_sharing_op); } break; #if P2M_AUDIT case XEN_DOMCTL_audit_p2m: { if ( d == current->domain ) { ret = -EPERM; break; } audit_p2m(d, &domctl->u.audit_p2m.orphans, &domctl->u.audit_p2m.m2p_bad, &domctl->u.audit_p2m.p2m_bad); copyback = 1; } break; #endif /* P2M_AUDIT */ case XEN_DOMCTL_set_access_required: { struct p2m_domain* p2m; ret = -EPERM; if ( current->domain == d ) break; ret = 0; p2m = p2m_get_hostp2m(d); p2m->access_required = domctl->u.access_required.access_required; } break; case XEN_DOMCTL_set_broken_page_p2m: { p2m_type_t pt; unsigned long pfn = domctl->u.set_broken_page_p2m.pfn; mfn_t mfn = get_gfn_query(d, pfn, &pt); if ( unlikely(!mfn_valid(mfn_x(mfn)) || !p2m_is_ram(pt) || (p2m_change_type(d, pfn, pt, p2m_ram_broken) != pt)) ) ret = -EINVAL; put_gfn(d, pfn); } break; default: ret = iommu_do_domctl(domctl, d, u_domctl); break; } if ( copyback && __copy_to_guest(u_domctl, domctl, 1) ) ret = -EFAULT; return ret; } #define xen_vcpu_guest_context vcpu_guest_context #define fpu_ctxt fpu_ctxt.x CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt); #undef fpu_ctxt #undef xen_vcpu_guest_context void arch_get_info_guest(struct vcpu *v, vcpu_guest_context_u c) { unsigned int i; bool_t compat = is_pv_32on64_domain(v->domain); #define c(fld) (!compat ? 
(c.nat->fld) : (c.cmp->fld)) if ( !is_pv_vcpu(v) ) memset(c.nat, 0, sizeof(*c.nat)); memcpy(&c.nat->fpu_ctxt, v->arch.fpu_ctxt, sizeof(c.nat->fpu_ctxt)); c(flags = v->arch.vgc_flags & ~(VGCF_i387_valid|VGCF_in_kernel)); if ( v->fpu_initialised ) c(flags |= VGCF_i387_valid); if ( !test_bit(_VPF_down, &v->pause_flags) ) c(flags |= VGCF_online); if ( !compat ) { memcpy(&c.nat->user_regs, &v->arch.user_regs, sizeof(c.nat->user_regs)); if ( is_pv_vcpu(v) ) memcpy(c.nat->trap_ctxt, v->arch.pv_vcpu.trap_ctxt, sizeof(c.nat->trap_ctxt)); } else { XLAT_cpu_user_regs(&c.cmp->user_regs, &v->arch.user_regs); for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i ) XLAT_trap_info(c.cmp->trap_ctxt + i, v->arch.pv_vcpu.trap_ctxt + i); } for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) c(debugreg[i] = v->arch.debugreg[i]); if ( has_hvm_container_vcpu(v) ) { struct segment_register sreg; c.nat->ctrlreg[0] = v->arch.hvm_vcpu.guest_cr[0]; c.nat->ctrlreg[2] = v->arch.hvm_vcpu.guest_cr[2]; c.nat->ctrlreg[3] = v->arch.hvm_vcpu.guest_cr[3]; c.nat->ctrlreg[4] = v->arch.hvm_vcpu.guest_cr[4]; hvm_get_segment_register(v, x86_seg_cs, &sreg); c.nat->user_regs.cs = sreg.sel; hvm_get_segment_register(v, x86_seg_ss, &sreg); c.nat->user_regs.ss = sreg.sel; hvm_get_segment_register(v, x86_seg_ds, &sreg); c.nat->user_regs.ds = sreg.sel; hvm_get_segment_register(v, x86_seg_es, &sreg); c.nat->user_regs.es = sreg.sel; hvm_get_segment_register(v, x86_seg_fs, &sreg); c.nat->user_regs.fs = sreg.sel; c.nat->fs_base = sreg.base; hvm_get_segment_register(v, x86_seg_gs, &sreg); c.nat->user_regs.gs = sreg.sel; if ( ring_0(&c.nat->user_regs) ) { c.nat->gs_base_kernel = sreg.base; c.nat->gs_base_user = hvm_get_shadow_gs_base(v); } else { c.nat->gs_base_user = sreg.base; c.nat->gs_base_kernel = hvm_get_shadow_gs_base(v); } } else { c(ldt_base = v->arch.pv_vcpu.ldt_base); c(ldt_ents = v->arch.pv_vcpu.ldt_ents); for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i ) c(gdt_frames[i] = v->arch.pv_vcpu.gdt_frames[i]); BUILD_BUG_ON(ARRAY_SIZE(c.nat->gdt_frames) != ARRAY_SIZE(c.cmp->gdt_frames)); for ( ; i < ARRAY_SIZE(c.nat->gdt_frames); ++i ) c(gdt_frames[i] = 0); c(gdt_ents = v->arch.pv_vcpu.gdt_ents); c(kernel_ss = v->arch.pv_vcpu.kernel_ss); c(kernel_sp = v->arch.pv_vcpu.kernel_sp); for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.ctrlreg); ++i ) c(ctrlreg[i] = v->arch.pv_vcpu.ctrlreg[i]); c(event_callback_eip = v->arch.pv_vcpu.event_callback_eip); c(failsafe_callback_eip = v->arch.pv_vcpu.failsafe_callback_eip); if ( !compat ) { c.nat->syscall_callback_eip = v->arch.pv_vcpu.syscall_callback_eip; c.nat->fs_base = v->arch.pv_vcpu.fs_base; c.nat->gs_base_kernel = v->arch.pv_vcpu.gs_base_kernel; c.nat->gs_base_user = v->arch.pv_vcpu.gs_base_user; } else { c(event_callback_cs = v->arch.pv_vcpu.event_callback_cs); c(failsafe_callback_cs = v->arch.pv_vcpu.failsafe_callback_cs); } c(vm_assist = v->arch.pv_vcpu.vm_assist); /* IOPL privileges are virtualised: merge back into returned eflags. */ BUG_ON((c(user_regs.eflags) & X86_EFLAGS_IOPL) != 0); c(user_regs.eflags |= v->arch.pv_vcpu.iopl << 12); if ( !is_pv_32on64_domain(v->domain) ) { c.nat->ctrlreg[3] = xen_pfn_to_cr3( pagetable_get_pfn(v->arch.guest_table)); c.nat->ctrlreg[1] = pagetable_is_null(v->arch.guest_table_user) ? 0 : xen_pfn_to_cr3(pagetable_get_pfn(v->arch.guest_table_user)); /* Merge shadow DR7 bits into real DR7. 
*/ c.nat->debugreg[7] |= c.nat->debugreg[5]; c.nat->debugreg[5] = 0; } else { const l4_pgentry_t *l4e = map_domain_page(pagetable_get_pfn(v->arch.guest_table)); c.cmp->ctrlreg[3] = compat_pfn_to_cr3(l4e_get_pfn(*l4e)); unmap_domain_page(l4e); /* Merge shadow DR7 bits into real DR7. */ c.cmp->debugreg[7] |= c.cmp->debugreg[5]; c.cmp->debugreg[5] = 0; } if ( guest_kernel_mode(v, &v->arch.user_regs) ) c(flags |= VGCF_in_kernel); } c(vm_assist = v->domain->vm_assist); #undef c } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/ioport_emulate.c0000664000175000017500000000647212307313555016004 0ustar smbsmb/****************************************************************************** * ioport_emulate.c * * Handle I/O port access quirks of various platforms. */ #include #include #include #include static void ioemul_handle_proliant_quirk( u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs) { uint16_t port = regs->edx; uint8_t value = regs->eax; if ( (opcode != 0xee) || (port != 0xcd4) || !(value & 0x80) ) return; /* pushf */ io_emul_stub[0] = 0x9c; /* cli */ io_emul_stub[1] = 0xfa; /* out %al,%dx */ io_emul_stub[2] = 0xee; /* 1: in %dx,%al */ io_emul_stub[3] = 0xec; /* test $0x80,%al */ io_emul_stub[4] = 0xa8; io_emul_stub[5] = 0x80; /* jnz 1b */ io_emul_stub[6] = 0x75; io_emul_stub[7] = 0xfb; /* popf */ io_emul_stub[8] = 0x9d; /* ret */ io_emul_stub[9] = 0xc3; } static int __init proliant_quirk(struct dmi_system_id *d) { ioemul_handle_quirk = ioemul_handle_proliant_quirk; return 0; } /* This table is the set of system-specific I/O emulation hooks. */ static struct dmi_system_id __initdata ioport_quirks_tbl[] = { /* * I/O emulation hook for certain HP ProLiant servers with * 'special' SMM goodness. 
*/ { .callback = proliant_quirk, .ident = "HP ProLiant DL3xx", .matches = { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL3"), }, }, { .callback = proliant_quirk, .ident = "HP ProLiant DL5xx", .matches = { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL5"), }, }, { .callback = proliant_quirk, .ident = "HP ProLiant DL7xx", .matches = { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL7"), }, }, { .callback = proliant_quirk, .ident = "HP ProLiant ML3xx", .matches = { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant ML3"), }, }, { .callback = proliant_quirk, .ident = "HP ProLiant ML5xx", .matches = { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant ML5"), }, }, { .callback = proliant_quirk, .ident = "HP ProLiant BL2xx", .matches = { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL2"), }, }, { .callback = proliant_quirk, .ident = "HP ProLiant BL4xx", .matches = { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL4"), }, }, { .callback = proliant_quirk, .ident = "HP ProLiant BL6xx", .matches = { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL6"), }, }, { } }; static int __init ioport_quirks_init(void) { dmi_check_system(ioport_quirks_tbl); return 0; } __initcall(ioport_quirks_init); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/Makefile0000664000175000017500000001403612307313555014243 0ustar smbsmbsubdir-y += acpi subdir-y += cpu subdir-y += genapic subdir-y += hvm subdir-y += mm subdir-y += oprofile subdir-$(x86_64) += x86_64 obj-y += apic.o obj-y += bitops.o obj-bin-y += bzimage.init.o obj-bin-y += clear_page.o obj-bin-y += copy_page.o obj-y += compat.o obj-y += debug.o obj-y += delay.o obj-bin-y += dmi_scan.init.o obj-y += domctl.o obj-y += domain.o obj-bin-y += domain_build.init.o obj-y += domain_page.o obj-y += e820.o obj-y += extable.o obj-y += flushtlb.o obj-y += platform_hypercall.o obj-y += i387.o obj-y += i8259.o obj-y += io_apic.o obj-y += msi.o obj-y += ioport_emulate.o obj-y += irq.o obj-y += microcode_amd.o obj-y += microcode_intel.o # This must come after the vendor specific files. obj-y += microcode.o obj-y += mm.o obj-y += mpparse.o obj-y += nmi.o obj-y += numa.o obj-y += pci.o obj-y += percpu.o obj-y += physdev.o obj-y += setup.o obj-y += shutdown.o obj-y += smp.o obj-y += smpboot.o obj-y += srat.o obj-y += string.o obj-y += sysctl.o obj-y += time.o obj-y += trace.o obj-y += traps.o obj-y += usercopy.o obj-y += x86_emulate.o obj-y += machine_kexec.o obj-y += crash.o obj-y += tboot.o obj-y += hpet.o obj-y += xstate.o obj-$(crash_debug) += gdbstub.o x86_emulate.o: x86_emulate/x86_emulate.c x86_emulate/x86_emulate.h efi-$(x86_64) := $(shell if [ ! 
-r $(BASEDIR)/include/xen/compile.h -o \ -O $(BASEDIR)/include/xen/compile.h ]; then \ echo '$(TARGET).efi'; fi) $(TARGET): $(TARGET)-syms $(efi-y) boot/mkelf32 ./boot/mkelf32 $(TARGET)-syms $(TARGET) 0x100000 \ `$(NM) -nr $(TARGET)-syms | head -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'` ALL_OBJS := $(BASEDIR)/arch/x86/boot/built_in.o $(BASEDIR)/arch/x86/efi/built_in.o $(ALL_OBJS) ifeq ($(lto),y) # Gather all LTO objects together prelink_lto.o: $(ALL_OBJS) $(LD_LTO) -r -o $@ $^ prelink-efi_lto.o: $(ALL_OBJS) efi/runtime.o efi/compat.o $(guard) $(LD_LTO) -r -o $@ $(filter-out %/efi/built_in.o,$^) # Link it with all the binary objects prelink.o: $(patsubst %/built_in.o,%/built_in_bin.o,$(ALL_OBJS)) prelink_lto.o $(LD) $(LDFLAGS) -r -o $@ $^ prelink-efi.o: $(patsubst %/built_in.o,%/built_in_bin.o,$(ALL_OBJS)) prelink-efi_lto.o efi/boot.init.o $(guard) $(LD) $(LDFLAGS) -r -o $@ $^ else prelink.o: $(ALL_OBJS) $(LD) $(LDFLAGS) -r -o $@ $^ prelink-efi.o: $(ALL_OBJS) efi/boot.init.o efi/runtime.o efi/compat.o $(guard) $(LD) $(LDFLAGS) -r -o $@ $(filter-out %/efi/built_in.o,$^) endif $(BASEDIR)/common/symbols-dummy.o: $(MAKE) -f $(BASEDIR)/Rules.mk -C $(BASEDIR)/common symbols-dummy.o $(TARGET)-syms: prelink.o xen.lds $(BASEDIR)/common/symbols-dummy.o $(LD) $(LDFLAGS) -T xen.lds -N prelink.o \ $(BASEDIR)/common/symbols-dummy.o -o $(@D)/.$(@F).0 $(NM) -n $(@D)/.$(@F).0 | $(BASEDIR)/tools/symbols >$(@D)/.$(@F).0.S $(MAKE) -f $(BASEDIR)/Rules.mk $(@D)/.$(@F).0.o $(LD) $(LDFLAGS) -T xen.lds -N prelink.o \ $(@D)/.$(@F).0.o -o $(@D)/.$(@F).1 $(NM) -n $(@D)/.$(@F).1 | $(BASEDIR)/tools/symbols >$(@D)/.$(@F).1.S $(MAKE) -f $(BASEDIR)/Rules.mk $(@D)/.$(@F).1.o $(LD) $(LDFLAGS) -T xen.lds -N prelink.o \ $(@D)/.$(@F).1.o -o $@ rm -f $(@D)/.$(@F).[0-9]* EFI_LDFLAGS = $(patsubst -m%,-mi386pep,$(LDFLAGS)) --subsystem=10 EFI_LDFLAGS += --image-base=$(1) --stack=0,0 --heap=0,0 --strip-debug EFI_LDFLAGS += --section-alignment=0x200000 --file-alignment=0x20 EFI_LDFLAGS += --major-image-version=$(XEN_VERSION) EFI_LDFLAGS += --minor-image-version=$(XEN_SUBVERSION) EFI_LDFLAGS += --major-os-version=2 --minor-os-version=0 EFI_LDFLAGS += --major-subsystem-version=2 --minor-subsystem-version=0 $(TARGET).efi: VIRT_BASE = 0x$(shell $(NM) efi/relocs-dummy.o | sed -n 's, A VIRT_START$$,,p') $(TARGET).efi: ALT_BASE = 0x$(shell $(NM) efi/relocs-dummy.o | sed -n 's, A ALT_START$$,,p') # Don't use $(wildcard ...) here - at least make 3.80 expands this too early! 
$(TARGET).efi: guard = $(if $(shell echo efi/dis* | grep disabled),:) $(TARGET).efi: prelink-efi.o efi.lds efi/relocs-dummy.o $(BASEDIR)/common/symbols-dummy.o efi/mkreloc $(foreach base, $(VIRT_BASE) $(ALT_BASE), \ $(guard) $(LD) $(call EFI_LDFLAGS,$(base)) -T efi.lds -N $< efi/relocs-dummy.o \ $(BASEDIR)/common/symbols-dummy.o -o $(@D)/.$(@F).$(base).0 &&) : $(guard) efi/mkreloc $(foreach base,$(VIRT_BASE) $(ALT_BASE),$(@D)/.$(@F).$(base).0) >$(@D)/.$(@F).0r.S $(guard) $(NM) -n $(@D)/.$(@F).$(VIRT_BASE).0 | $(guard) $(BASEDIR)/tools/symbols >$(@D)/.$(@F).0s.S $(guard) $(MAKE) -f $(BASEDIR)/Rules.mk $(@D)/.$(@F).0r.o $(@D)/.$(@F).0s.o $(foreach base, $(VIRT_BASE) $(ALT_BASE), \ $(guard) $(LD) $(call EFI_LDFLAGS,$(base)) -T efi.lds -N $< \ $(@D)/.$(@F).0r.o $(@D)/.$(@F).0s.o -o $(@D)/.$(@F).$(base).1 &&) : $(guard) efi/mkreloc $(foreach base,$(VIRT_BASE) $(ALT_BASE),$(@D)/.$(@F).$(base).1) >$(@D)/.$(@F).1r.S $(guard) $(NM) -n $(@D)/.$(@F).$(VIRT_BASE).1 | $(guard) $(BASEDIR)/tools/symbols >$(@D)/.$(@F).1s.S $(guard) $(MAKE) -f $(BASEDIR)/Rules.mk $(@D)/.$(@F).1r.o $(@D)/.$(@F).1s.o $(guard) $(LD) $(call EFI_LDFLAGS,$(VIRT_BASE)) -T efi.lds -N $< \ $(@D)/.$(@F).1r.o $(@D)/.$(@F).1s.o -o $@ if $(guard) false; then rm -f $@; echo 'EFI support disabled'; fi rm -f $(@D)/.$(@F).[0-9]* efi/boot.init.o efi/runtime.o efi/compat.o: $(BASEDIR)/arch/x86/efi/built_in.o efi/boot.init.o efi/runtime.o efi/compat.o: ; asm-offsets.s: $(TARGET_SUBARCH)/asm-offsets.c $(CC) $(filter-out -flto,$(CFLAGS)) -S -o $@ $< xen.lds: xen.lds.S $(CC) -P -E -Ui386 $(AFLAGS) -o $@ $< sed -e 's/xen\.lds\.o:/xen\.lds:/g' <.xen.lds.d >.xen.lds.d.new mv -f .xen.lds.d.new .xen.lds.d efi.lds: xen.lds.S $(CC) -P -E -Ui386 -DEFI $(AFLAGS) -o $@ $< sed -e 's/efi\.lds\.o:/efi\.lds:/g' <.$(@F).d >.$(@F).d.new mv -f .$(@F).d.new .$(@F).d boot/mkelf32: boot/mkelf32.c $(HOSTCC) $(HOSTCFLAGS) -o $@ $< efi/mkreloc: efi/mkreloc.c $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< .PHONY: clean clean:: rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32 rm -f $(BASEDIR)/.xen-syms.[0-9]* boot/.*.d rm -f $(BASEDIR)/.xen.efi.[0-9]* efi/*.o efi/.*.d efi/*.efi efi/disabled efi/mkreloc rm -f boot/reloc.S boot/reloc.lnk boot/reloc.bin xen-4.4.0/xen/arch/x86/trace.c0000664000175000017500000001066612307313555014052 0ustar smbsmb#include #include #include #include #include #include #include void __trace_hypercall_entry(void) { struct cpu_user_regs *regs = guest_cpu_user_regs(); unsigned long args[6]; if ( is_pv_32on64_vcpu(current) ) { args[0] = regs->ebx; args[1] = regs->ecx; args[2] = regs->edx; args[3] = regs->esi; args[4] = regs->edi; args[5] = regs->ebp; } else { args[0] = regs->rdi; args[1] = regs->rsi; args[2] = regs->rdx; args[3] = regs->r10; args[4] = regs->r8; args[5] = regs->r9; } __trace_hypercall(TRC_PV_HYPERCALL_V2, regs->eax, args); } void __trace_pv_trap(int trapnr, unsigned long eip, int use_error_code, unsigned error_code) { if ( is_pv_32on64_vcpu(current) ) { struct { unsigned eip:32, trapnr:15, use_error_code:1, error_code:16; } __attribute__((packed)) d; d.eip = eip; d.trapnr = trapnr; d.error_code = error_code; d.use_error_code=!!use_error_code; __trace_var(TRC_PV_TRAP, 1, sizeof(d), &d); } else { struct { unsigned long eip; unsigned trapnr:15, use_error_code:1, error_code:16; } __attribute__((packed)) d; unsigned event; d.eip = eip; d.trapnr = trapnr; d.error_code = error_code; d.use_error_code=!!use_error_code; event = TRC_PV_TRAP; event |= TRC_64_FLAG; __trace_var(event, 1, sizeof(d), &d); } } void 
__trace_pv_page_fault(unsigned long addr, unsigned error_code) { unsigned long eip = guest_cpu_user_regs()->eip; if ( is_pv_32on64_vcpu(current) ) { struct { u32 eip, addr, error_code; } __attribute__((packed)) d; d.eip = eip; d.addr = addr; d.error_code = error_code; __trace_var(TRC_PV_PAGE_FAULT, 1, sizeof(d), &d); } else { struct { unsigned long eip, addr; u32 error_code; } __attribute__((packed)) d; unsigned event; d.eip = eip; d.addr = addr; d.error_code = error_code; event = TRC_PV_PAGE_FAULT; event |= TRC_64_FLAG; __trace_var(event, 1, sizeof(d), &d); } } void __trace_trap_one_addr(unsigned event, unsigned long va) { if ( is_pv_32on64_vcpu(current) ) { u32 d = va; __trace_var(event, 1, sizeof(d), &d); } else { event |= TRC_64_FLAG; __trace_var(event, 1, sizeof(va), &va); } } void __trace_trap_two_addr(unsigned event, unsigned long va1, unsigned long va2) { if ( is_pv_32on64_vcpu(current) ) { struct { u32 va1, va2; } __attribute__((packed)) d; d.va1=va1; d.va2=va2; __trace_var(event, 1, sizeof(d), &d); } else { struct { unsigned long va1, va2; } __attribute__((packed)) d; d.va1=va1; d.va2=va2; event |= TRC_64_FLAG; __trace_var(event, 1, sizeof(d), &d); } } void __trace_ptwr_emulation(unsigned long addr, l1_pgentry_t npte) { unsigned long eip = guest_cpu_user_regs()->eip; /* We have a couple of different modes to worry about: * - 32-on-32: 32-bit pte, 32-bit virtual addresses * - pae-on-pae, pae-on-64: 64-bit pte, 32-bit virtual addresses * - 64-on-64: 64-bit pte, 64-bit virtual addresses * pae-on-64 is the only one that requires extra code; in all other * cases, "unsigned long" is the size of a guest virtual address. */ if ( is_pv_32on64_vcpu(current) ) { struct { l1_pgentry_t pte; u32 addr, eip; } __attribute__((packed)) d; d.addr = addr; d.eip = eip; d.pte = npte; __trace_var(TRC_PV_PTWR_EMULATION_PAE, 1, sizeof(d), &d); } else { struct { l1_pgentry_t pte; unsigned long addr, eip; } d; unsigned event; d.addr = addr; d.eip = eip; d.pte = npte; event = TRC_PV_PTWR_EMULATION; event |= TRC_64_FLAG; __trace_var(event, 1/*tsc*/, sizeof(d), &d); } } xen-4.4.0/xen/arch/x86/x86_emulate/0000775000175000017500000000000012307313555014740 5ustar smbsmbxen-4.4.0/xen/arch/x86/x86_emulate/x86_emulate.c0000664000175000017500000046420512307313555017260 0ustar smbsmb/****************************************************************************** * x86_emulate.c * * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. * * Copyright (c) 2005-2007 Keir Fraser * Copyright (c) 2005-2007 XenSource Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* Operand sizes: 8-bit operands or specified/overridden size. */ #define ByteOp (1<<0) /* 8-bit operands. */ /* Destination operand type. */ #define DstNone (0<<1) /* No destination operand. */ #define DstImplicit (0<<1) /* Destination operand is implicit in the opcode. 
*/ #define DstBitBase (1<<1) /* Memory operand, bit string. */ #define DstReg (2<<1) /* Register operand. */ #define DstEax DstReg /* Register EAX (aka DstReg with no ModRM) */ #define DstMem (3<<1) /* Memory operand. */ #define DstMask (3<<1) /* Source operand type. */ #define SrcInvalid (0<<3) /* Unimplemented opcode. */ #define SrcNone (1<<3) /* No source operand. */ #define SrcImplicit (1<<3) /* Source operand is implicit in the opcode. */ #define SrcReg (2<<3) /* Register operand. */ #define SrcMem (3<<3) /* Memory operand. */ #define SrcMem16 (4<<3) /* Memory operand (16-bit). */ #define SrcImm (5<<3) /* Immediate operand. */ #define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ #define SrcMask (7<<3) /* Generic ModRM decode. */ #define ModRM (1<<6) /* Destination is only written; never read. */ #define Mov (1<<7) /* All operands are implicit in the opcode. */ #define ImplicitOps (DstImplicit|SrcImplicit) static uint8_t opcode_table[256] = { /* 0x00 - 0x07 */ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, ByteOp|DstEax|SrcImm, DstEax|SrcImm, ImplicitOps, ImplicitOps, /* 0x08 - 0x0F */ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, ByteOp|DstEax|SrcImm, DstEax|SrcImm, ImplicitOps, 0, /* 0x10 - 0x17 */ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, ByteOp|DstEax|SrcImm, DstEax|SrcImm, ImplicitOps, ImplicitOps, /* 0x18 - 0x1F */ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, ByteOp|DstEax|SrcImm, DstEax|SrcImm, ImplicitOps, ImplicitOps, /* 0x20 - 0x27 */ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps, /* 0x28 - 0x2F */ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps, /* 0x30 - 0x37 */ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps, /* 0x38 - 0x3F */ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, ByteOp|DstEax|SrcImm, DstEax|SrcImm, 0, ImplicitOps, /* 0x40 - 0x4F */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x50 - 0x5F */ ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, /* 0x60 - 0x67 */ ImplicitOps, ImplicitOps, DstReg|SrcMem|ModRM, DstReg|SrcNone|ModRM|Mov, 0, 0, 0, 0, /* 0x68 - 0x6F */ ImplicitOps|Mov, DstReg|SrcImm|ModRM|Mov, ImplicitOps|Mov, DstReg|SrcImmByte|ModRM|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps|Mov, /* 0x70 - 0x77 */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x78 - 0x7F */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x80 - 0x87 */ ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImm|ModRM, ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM, ByteOp|DstMem|SrcReg|ModRM, 
DstMem|SrcReg|ModRM, ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, /* 0x88 - 0x8F */ ByteOp|DstMem|SrcReg|ModRM|Mov, DstMem|SrcReg|ModRM|Mov, ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstMem|SrcReg|ModRM|Mov, DstReg|SrcNone|ModRM, DstReg|SrcMem16|ModRM|Mov, DstMem|SrcNone|ModRM|Mov, /* 0x90 - 0x97 */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x98 - 0x9F */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0xA0 - 0xA7 */ ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, ByteOp|ImplicitOps, ImplicitOps, /* 0xA8 - 0xAF */ ByteOp|DstEax|SrcImm, DstEax|SrcImm, ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, ByteOp|ImplicitOps|Mov, ImplicitOps|Mov, ByteOp|ImplicitOps, ImplicitOps, /* 0xB0 - 0xB7 */ ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, ByteOp|DstReg|SrcImm|Mov, /* 0xB8 - 0xBF */ DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, DstReg|SrcImm|Mov, /* 0xC0 - 0xC7 */ ByteOp|DstMem|SrcImm|ModRM, DstMem|SrcImmByte|ModRM, ImplicitOps, ImplicitOps, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, ByteOp|DstMem|SrcImm|ModRM|Mov, DstMem|SrcImm|ModRM|Mov, /* 0xC8 - 0xCF */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0xD0 - 0xD7 */ ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0xD8 - 0xDF */ ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, /* 0xE0 - 0xE7 */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0xE8 - 0xEF */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0xF0 - 0xF7 */ 0, ImplicitOps, 0, 0, ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM, /* 0xF8 - 0xFF */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM }; static uint8_t twobyte_table[256] = { /* 0x00 - 0x07 */ SrcMem16|ModRM, ImplicitOps|ModRM, 0, 0, 0, ImplicitOps, ImplicitOps, 0, /* 0x08 - 0x0F */ ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps|ModRM, 0, 0, /* 0x10 - 0x17 */ ImplicitOps|ModRM, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, /* 0x18 - 0x1F */ ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, /* 0x20 - 0x27 */ ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, 0, 0, 0, 0, /* 0x28 - 0x2F */ ImplicitOps|ModRM, ImplicitOps|ModRM, 0, ImplicitOps|ModRM, 0, 0, 0, 0, /* 0x30 - 0x37 */ ImplicitOps, ImplicitOps, ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, /* 0x38 - 0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, /* 0x48 - 0x4F */ DstReg|SrcMem|ModRM|Mov, 
DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, /* 0x80 - 0x87 */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x88 - 0x8F */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x90 - 0x97 */ ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, /* 0x98 - 0x9F */ ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, /* 0xA0 - 0xA7 */ ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM, DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0, /* 0xA8 - 0xAF */ ImplicitOps, ImplicitOps, 0, DstBitBase|SrcReg|ModRM, DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, ImplicitOps|ModRM, DstReg|SrcMem|ModRM, /* 0xB0 - 0xB7 */ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, DstReg|SrcMem|ModRM|Mov, DstBitBase|SrcReg|ModRM, DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov, ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov, /* 0xB8 - 0xBF */ 0, 0, DstBitBase|SrcImmByte|ModRM, DstBitBase|SrcReg|ModRM, DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM, ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov, /* 0xC0 - 0xC7 */ ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, DstMem|SrcReg|ModRM|Mov, 0, 0, 0, ImplicitOps|ModRM, /* 0xC8 - 0xCF */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xEF */ 0, 0, 0, 0, 0, 0, 0, ImplicitOps|ModRM, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0 - 0xFF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; #define REX_PREFIX 0x40 #define REX_B 0x01 #define REX_X 0x02 #define REX_R 0x04 #define REX_W 0x08 #define vex_none 0 enum vex_opcx { vex_0f = vex_none + 1, vex_0f38, vex_0f3a, }; enum vex_pfx { vex_66 = vex_none + 1, vex_f3, vex_f2 }; #define VEX_PREFIX_DOUBLE_MASK 0x1 #define VEX_PREFIX_SCALAR_MASK 0x2 static const uint8_t sse_prefix[] = { 0x66, 0xf3, 0xf2 }; #define SET_SSE_PREFIX(dst, vex_pfx) do { \ if ( vex_pfx ) \ (dst) = sse_prefix[(vex_pfx) - 1]; \ } while (0) union vex { uint8_t raw[2]; struct { uint8_t opcx:5; uint8_t b:1; uint8_t x:1; uint8_t r:1; uint8_t pfx:2; uint8_t l:1; uint8_t reg:4; uint8_t w:1; }; }; #define copy_REX_VEX(ptr, rex, vex) do { \ if ( (vex).opcx != vex_none ) \ ptr[0] = 0xc4, ptr[1] = (vex).raw[0], ptr[2] = (vex).raw[1]; \ else if ( mode_64bit() ) \ ptr[1] = rex | REX_PREFIX; \ } while (0) #define rep_prefix() (vex.pfx >= vex_f3) #define repe_prefix() (vex.pfx == vex_f3) #define repne_prefix() (vex.pfx == vex_f2) /* Type, address-of, and value of an instruction's operand. */ struct operand { enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; unsigned int bytes; /* Up to 128-byte operand value, addressable as ulong or uint32_t[]. 
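 * (Overlaid as "unsigned long val" for ordinary integer operands and as
 * "uint32_t bigval[4]" for wider ones; the orig_val/orig_bigval union that
 * follows records the value as originally read, so it remains available,
 * e.g. to the write-back logic, after emulation has modified val.)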
*/ union { unsigned long val; uint32_t bigval[4]; }; /* Up to 128-byte operand value, addressable as ulong or uint32_t[]. */ union { unsigned long orig_val; uint32_t orig_bigval[4]; }; union { /* OP_REG: Pointer to register field. */ unsigned long *reg; /* OP_MEM: Segment and offset. */ struct { enum x86_segment seg; unsigned long off; } mem; }; }; typedef union { uint64_t mmx; uint64_t __attribute__ ((aligned(16))) xmm[2]; uint64_t __attribute__ ((aligned(32))) ymm[4]; } mmval_t; /* * While proper alignment gets specified above, this doesn't get honored by * the compiler for automatic variables. Use this helper to instantiate a * suitably aligned variable, producing a pointer to access it. */ #define DECLARE_ALIGNED(type, var) \ long __##var[sizeof(type) + __alignof(type) - __alignof(long)]; \ type *const var##p = \ (void *)((long)(__##var + __alignof(type) - __alignof(long)) \ & -__alignof(type)) /* MSRs. */ #define MSR_TSC 0x00000010 #define MSR_SYSENTER_CS 0x00000174 #define MSR_SYSENTER_ESP 0x00000175 #define MSR_SYSENTER_EIP 0x00000176 #define MSR_EFER 0xc0000080 #define EFER_SCE (1u<<0) #define EFER_LMA (1u<<10) #define MSR_STAR 0xc0000081 #define MSR_LSTAR 0xc0000082 #define MSR_CSTAR 0xc0000083 #define MSR_FMASK 0xc0000084 #define MSR_TSC_AUX 0xc0000103 /* Control register flags. */ #define CR0_PE (1<<0) #define CR4_TSD (1<<2) /* EFLAGS bit definitions. */ #define EFLG_VIP (1<<20) #define EFLG_VIF (1<<19) #define EFLG_AC (1<<18) #define EFLG_VM (1<<17) #define EFLG_RF (1<<16) #define EFLG_NT (1<<14) #define EFLG_IOPL (3<<12) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) #define EFLG_IF (1<<9) #define EFLG_TF (1<<8) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) #define EFLG_PF (1<<2) #define EFLG_CF (1<<0) /* Exception definitions. */ #define EXC_DE 0 #define EXC_DB 1 #define EXC_BP 3 #define EXC_OF 4 #define EXC_BR 5 #define EXC_UD 6 #define EXC_TS 10 #define EXC_NP 11 #define EXC_SS 12 #define EXC_GP 13 #define EXC_PF 14 #define EXC_MF 16 /* * Instruction emulation: * Most instructions are emulated directly via a fragment of inline assembly * code. This allows us to save/restore EFLAGS and thus very easily pick up * any modified flags. */ #if defined(__x86_64__) #define _LO32 "k" /* force 32-bit operand */ #define _STK "%%rsp" /* stack pointer */ #define _BYTES_PER_LONG "8" #elif defined(__i386__) #define _LO32 "" /* force 32-bit operand */ #define _STK "%%esp" /* stack pointer */ #define _BYTES_PER_LONG "4" #endif /* * These EFLAGS bits are restored from saved value during emulation, and * any changes are written back to the saved value after emulation. */ #define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) /* Before executing instruction: restore necessary bits in EFLAGS. */ #define _PRE_EFLAGS(_sav, _msk, _tmp) \ /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ "movl %"_sav",%"_LO32 _tmp"; " \ "push %"_tmp"; " \ "push %"_tmp"; " \ "movl %"_msk",%"_LO32 _tmp"; " \ "andl %"_LO32 _tmp",("_STK"); " \ "pushf; " \ "notl %"_LO32 _tmp"; " \ "andl %"_LO32 _tmp",("_STK"); " \ "andl %"_LO32 _tmp",2*"_BYTES_PER_LONG"("_STK"); " \ "pop %"_tmp"; " \ "orl %"_LO32 _tmp",("_STK"); " \ "popf; " \ "pop %"_sav"; " /* After executing instruction: write-back necessary bits in EFLAGS. */ #define _POST_EFLAGS(_sav, _msk, _tmp) \ /* _sav |= EFLAGS & _msk; */ \ "pushf; " \ "pop %"_tmp"; " \ "andl %"_msk",%"_LO32 _tmp"; " \ "orl %"_LO32 _tmp",%"_sav"; " /* Raw emulation: instruction has two explicit operands. 
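 * (The macros below select the "b"/"w"/"l"/"q" form of _op from
 * (_dst).bytes and execute the real instruction on the shadow operand
 * values, bracketed by _PRE_EFLAGS/_POST_EFLAGS so that only the bits in
 * EFLAGS_MASK are imported from, and written back to, the saved guest
 * flags.  For example, emulate_2op_SrcV("add", src, dst, _regs.eflags)
 * with a 4-byte destination ends up running an "addl" on dst.val while
 * updating the guest's arithmetic flags.)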
*/ #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy)\ do{ unsigned long _tmp; \ switch ( (_dst).bytes ) \ { \ case 2: \ asm volatile ( \ _PRE_EFLAGS("0","4","2") \ _op"w %"_wx"3,%1; " \ _POST_EFLAGS("0","4","2") \ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ : _wy ((_src).val), "i" (EFLAGS_MASK), \ "m" (_eflags), "m" ((_dst).val) ); \ break; \ case 4: \ asm volatile ( \ _PRE_EFLAGS("0","4","2") \ _op"l %"_lx"3,%1; " \ _POST_EFLAGS("0","4","2") \ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ : _ly ((_src).val), "i" (EFLAGS_MASK), \ "m" (_eflags), "m" ((_dst).val) ); \ break; \ case 8: \ __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy); \ break; \ } \ } while (0) #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy)\ do{ unsigned long _tmp; \ switch ( (_dst).bytes ) \ { \ case 1: \ asm volatile ( \ _PRE_EFLAGS("0","4","2") \ _op"b %"_bx"3,%1; " \ _POST_EFLAGS("0","4","2") \ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ : _by ((_src).val), "i" (EFLAGS_MASK), \ "m" (_eflags), "m" ((_dst).val) ); \ break; \ default: \ __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy);\ break; \ } \ } while (0) /* Source operand is byte-sized and may be restricted to just %cl. */ #define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ __emulate_2op(_op, _src, _dst, _eflags, \ "b", "c", "b", "c", "b", "c", "b", "c") /* Source operand is byte, word, long or quad sized. */ #define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ __emulate_2op(_op, _src, _dst, _eflags, \ "b", "q", "w", "r", _LO32, "r", "", "r") /* Source operand is word, long or quad sized. */ #define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ "w", "r", _LO32, "r", "", "r") /* Instruction has only one explicit operand (no source operand). */ #define emulate_1op(_op,_dst,_eflags) \ do{ unsigned long _tmp; \ switch ( (_dst).bytes ) \ { \ case 1: \ asm volatile ( \ _PRE_EFLAGS("0","3","2") \ _op"b %1; " \ _POST_EFLAGS("0","3","2") \ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \ break; \ case 2: \ asm volatile ( \ _PRE_EFLAGS("0","3","2") \ _op"w %1; " \ _POST_EFLAGS("0","3","2") \ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \ break; \ case 4: \ asm volatile ( \ _PRE_EFLAGS("0","3","2") \ _op"l %1; " \ _POST_EFLAGS("0","3","2") \ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \ break; \ case 8: \ __emulate_1op_8byte(_op, _dst, _eflags); \ break; \ } \ } while (0) /* Emulate an instruction with quadword operands (x86/64 only). */ #if defined(__x86_64__) #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ do{ asm volatile ( \ _PRE_EFLAGS("0","4","2") \ _op"q %"_qx"3,%1; " \ _POST_EFLAGS("0","4","2") \ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ : _qy ((_src).val), "i" (EFLAGS_MASK), \ "m" (_eflags), "m" ((_dst).val) ); \ } while (0) #define __emulate_1op_8byte(_op, _dst, _eflags) \ do{ asm volatile ( \ _PRE_EFLAGS("0","3","2") \ _op"q %1; " \ _POST_EFLAGS("0","3","2") \ : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ : "i" (EFLAGS_MASK), "m" (_eflags), "m" ((_dst).val) ); \ } while (0) #elif defined(__i386__) #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) #define __emulate_1op_8byte(_op, _dst, _eflags) #endif /* __i386__ */ /* Fetch next part of the instruction being emulated. 
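 * (insn_fetch_bytes() advances the shadow _regs.eip, raises #GP(0) once
 * more than 15 instruction bytes have been consumed, and pulls the bytes
 * in through ops->insn_fetch from CS.  insn_fetch_type(T) is the typed
 * wrapper; e.g. insn_fetch_type(int8_t) fetches a sign-extended byte
 * immediate.)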
*/ #define insn_fetch_bytes(_size) \ ({ unsigned long _x = 0, _eip = _regs.eip; \ if ( !mode_64bit() ) _eip = (uint32_t)_eip; /* ignore upper dword */ \ _regs.eip += (_size); /* real hardware doesn't truncate */ \ generate_exception_if((uint8_t)(_regs.eip - ctxt->regs->eip) > 15, \ EXC_GP, 0); \ rc = ops->insn_fetch(x86_seg_cs, _eip, &_x, (_size), ctxt); \ if ( rc ) goto done; \ _x; \ }) #define insn_fetch_type(_type) ((_type)insn_fetch_bytes(sizeof(_type))) #define truncate_word(ea, byte_width) \ ({ unsigned long __ea = (ea); \ unsigned int _width = (byte_width); \ ((_width == sizeof(unsigned long)) ? __ea : \ (__ea & ((1UL << (_width << 3)) - 1))); \ }) #define truncate_ea(ea) truncate_word((ea), ad_bytes) #define mode_64bit() (def_ad_bytes == 8) #define fail_if(p) \ do { \ rc = (p) ? X86EMUL_UNHANDLEABLE : X86EMUL_OKAY; \ if ( rc ) goto done; \ } while (0) #define generate_exception_if(p, e, ec) \ ({ if ( (p) ) { \ fail_if(ops->inject_hw_exception == NULL); \ rc = ops->inject_hw_exception(e, ec, ctxt) ? : X86EMUL_EXCEPTION; \ goto done; \ } \ }) /* * Given byte has even parity (even number of 1s)? SDM Vol. 1 Sec. 3.4.3.1, * "Status Flags": EFLAGS.PF reflects parity of least-sig. byte of result only. */ static bool_t even_parity(uint8_t v) { asm ( "test %b0,%b0; setp %b0" : "=a" (v) : "0" (v) ); return v; } /* Update address held in a register, based on addressing mode. */ #define _register_address_increment(reg, inc, byte_width) \ do { \ int _inc = (inc); /* signed type ensures sign extension to long */ \ unsigned int _width = (byte_width); \ if ( _width == sizeof(unsigned long) ) \ (reg) += _inc; \ else if ( mode_64bit() ) \ (reg) = ((reg) + _inc) & ((1UL << (_width << 3)) - 1); \ else \ (reg) = ((reg) & ~((1UL << (_width << 3)) - 1)) | \ (((reg) + _inc) & ((1UL << (_width << 3)) - 1)); \ } while (0) #define register_address_increment(reg, inc) \ _register_address_increment((reg), (inc), ad_bytes) #define sp_pre_dec(dec) ({ \ _register_address_increment(_regs.esp, -(dec), ctxt->sp_size/8); \ truncate_word(_regs.esp, ctxt->sp_size/8); \ }) #define sp_post_inc(inc) ({ \ unsigned long __esp = truncate_word(_regs.esp, ctxt->sp_size/8); \ _register_address_increment(_regs.esp, (inc), ctxt->sp_size/8); \ __esp; \ }) #define jmp_rel(rel) \ do { \ int _rel = (int)(rel); \ _regs.eip += _rel; \ if ( op_bytes == 2 ) \ _regs.eip = (uint16_t)_regs.eip; \ else if ( !mode_64bit() ) \ _regs.eip = (uint32_t)_regs.eip; \ } while (0) struct fpu_insn_ctxt { uint8_t insn_bytes; uint8_t exn_raised; }; static void fpu_handle_exception(void *_fic, struct cpu_user_regs *regs) { struct fpu_insn_ctxt *fic = _fic; fic->exn_raised = 1; regs->eip += fic->insn_bytes; } #define get_fpu(_type, _fic) \ do{ (_fic)->exn_raised = 0; \ fail_if(ops->get_fpu == NULL); \ rc = ops->get_fpu(fpu_handle_exception, _fic, _type, ctxt); \ if ( rc ) goto done; \ } while (0) #define put_fpu(_fic) \ do{ \ if ( ops->put_fpu != NULL ) \ ops->put_fpu(ctxt); \ generate_exception_if((_fic)->exn_raised, EXC_MF, -1); \ } while (0) #define emulate_fpu_insn(_op) \ do{ struct fpu_insn_ctxt fic; \ get_fpu(X86EMUL_FPU_fpu, &fic); \ asm volatile ( \ "movb $2f-1f,%0 \n" \ "1: " _op " \n" \ "2: \n" \ : "=m" (fic.insn_bytes) : : "memory" ); \ put_fpu(&fic); \ } while (0) #define emulate_fpu_insn_memdst(_op, _arg) \ do{ struct fpu_insn_ctxt fic; \ get_fpu(X86EMUL_FPU_fpu, &fic); \ asm volatile ( \ "movb $2f-1f,%0 \n" \ "1: " _op " %1 \n" \ "2: \n" \ : "=m" (fic.insn_bytes), "=m" (_arg) \ : : "memory" ); \ put_fpu(&fic); \ } while (0) #define 
emulate_fpu_insn_memsrc(_op, _arg) \ do{ struct fpu_insn_ctxt fic; \ get_fpu(X86EMUL_FPU_fpu, &fic); \ asm volatile ( \ "movb $2f-1f,%0 \n" \ "1: " _op " %1 \n" \ "2: \n" \ : "=m" (fic.insn_bytes) \ : "m" (_arg) : "memory" ); \ put_fpu(&fic); \ } while (0) #define emulate_fpu_insn_stub(_bytes...) \ do{ uint8_t stub[] = { _bytes, 0xc3 }; \ struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 }; \ get_fpu(X86EMUL_FPU_fpu, &fic); \ (*(void(*)(void))stub)(); \ put_fpu(&fic); \ } while (0) static unsigned long __get_rep_prefix( struct cpu_user_regs *int_regs, struct cpu_user_regs *ext_regs, int ad_bytes) { unsigned long ecx = ((ad_bytes == 2) ? (uint16_t)int_regs->ecx : (ad_bytes == 4) ? (uint32_t)int_regs->ecx : int_regs->ecx); /* Skip the instruction if no repetitions are required. */ if ( ecx == 0 ) ext_regs->eip = int_regs->eip; return ecx; } #define get_rep_prefix() ({ \ unsigned long max_reps = 1; \ if ( rep_prefix() ) \ max_reps = __get_rep_prefix(&_regs, ctxt->regs, ad_bytes); \ if ( max_reps == 0 ) \ goto done; \ max_reps; \ }) static void __put_rep_prefix( struct cpu_user_regs *int_regs, struct cpu_user_regs *ext_regs, int ad_bytes, unsigned long reps_completed) { unsigned long ecx = ((ad_bytes == 2) ? (uint16_t)int_regs->ecx : (ad_bytes == 4) ? (uint32_t)int_regs->ecx : int_regs->ecx); /* Reduce counter appropriately, and repeat instruction if non-zero. */ ecx -= reps_completed; if ( ecx != 0 ) int_regs->eip = ext_regs->eip; if ( ad_bytes == 2 ) *(uint16_t *)&int_regs->ecx = ecx; else if ( ad_bytes == 4 ) int_regs->ecx = (uint32_t)ecx; else int_regs->ecx = ecx; } #define put_rep_prefix(reps_completed) ({ \ if ( rep_prefix() ) \ __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \ }) /* Clip maximum repetitions so that the index register at most just wraps. */ #define truncate_ea_and_reps(ea, reps, bytes_per_rep) ({ \ unsigned long todo__, ea__ = truncate_word(ea, ad_bytes); \ if ( !(ctxt->regs->eflags & EFLG_DF) ) \ todo__ = truncate_word(-(ea), ad_bytes) / (bytes_per_rep); \ else if ( truncate_word((ea) + (bytes_per_rep) - 1, ad_bytes) < ea__ )\ todo__ = 1; \ else \ todo__ = ea__ / (bytes_per_rep) + 1; \ if ( !todo__ ) \ (reps) = 1; \ else if ( todo__ < (reps) ) \ (reps) = todo__; \ ea__; \ }) /* Compatibility function: read guest memory, zero-extend result to a ulong. */ static int read_ulong( enum x86_segment seg, unsigned long offset, unsigned long *val, unsigned int bytes, struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { *val = 0; return ops->read(seg, offset, val, bytes, ctxt); } /* * Unsigned multiplication with double-word result. * IN: Multiplicand=m[0], Multiplier=m[1] * OUT: Return CF/OF (overflow status); Result=m[1]:m[0] */ static bool_t mul_dbl(unsigned long m[2]) { bool_t rc; asm ( "mul %4; seto %b2" : "=a" (m[0]), "=d" (m[1]), "=q" (rc) : "0" (m[0]), "1" (m[1]), "2" (0) ); return rc; } /* * Signed multiplication with double-word result. * IN: Multiplicand=m[0], Multiplier=m[1] * OUT: Return CF/OF (overflow status); Result=m[1]:m[0] */ static bool_t imul_dbl(unsigned long m[2]) { bool_t rc; asm ( "imul %4; seto %b2" : "=a" (m[0]), "=d" (m[1]), "=q" (rc) : "0" (m[0]), "1" (m[1]), "2" (0) ); return rc; } /* * Unsigned division of double-word dividend. 
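 * (A #DE is reported up front for v == 0 or u[1] >= v, i.e. whenever the
 * quotient would not fit in a single word, so the real DIV executed below
 * can never fault inside the emulator.)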
* IN: Dividend=u[1]:u[0], Divisor=v * OUT: Return 1: #DE * Return 0: Quotient=u[0], Remainder=u[1] */ static bool_t div_dbl(unsigned long u[2], unsigned long v) { if ( (v == 0) || (u[1] >= v) ) return 1; asm ( "div %4" : "=a" (u[0]), "=d" (u[1]) : "0" (u[0]), "1" (u[1]), "r" (v) ); return 0; } /* * Signed division of double-word dividend. * IN: Dividend=u[1]:u[0], Divisor=v * OUT: Return 1: #DE * Return 0: Quotient=u[0], Remainder=u[1] * NB. We don't use idiv directly as it's moderately hard to work out * ahead of time whether it will #DE, which we cannot allow to happen. */ static bool_t idiv_dbl(unsigned long u[2], unsigned long v) { bool_t negu = (long)u[1] < 0, negv = (long)v < 0; /* u = abs(u) */ if ( negu ) { u[1] = ~u[1]; if ( (u[0] = -u[0]) == 0 ) u[1]++; } /* abs(u) / abs(v) */ if ( div_dbl(u, negv ? -v : v) ) return 1; /* Remainder has same sign as dividend. It cannot overflow. */ if ( negu ) u[1] = -u[1]; /* Quotient is overflowed if sign bit is set. */ if ( negu ^ negv ) { if ( (long)u[0] >= 0 ) u[0] = -u[0]; else if ( (u[0] << 1) != 0 ) /* == 0x80...0 is okay */ return 1; } else if ( (long)u[0] < 0 ) return 1; return 0; } static bool_t test_cc( unsigned int condition, unsigned int flags) { int rc = 0; switch ( (condition & 15) >> 1 ) { case 0: /* o */ rc |= (flags & EFLG_OF); break; case 1: /* b/c/nae */ rc |= (flags & EFLG_CF); break; case 2: /* z/e */ rc |= (flags & EFLG_ZF); break; case 3: /* be/na */ rc |= (flags & (EFLG_CF|EFLG_ZF)); break; case 4: /* s */ rc |= (flags & EFLG_SF); break; case 5: /* p/pe */ rc |= (flags & EFLG_PF); break; case 7: /* le/ng */ rc |= (flags & EFLG_ZF); /* fall through */ case 6: /* l/nge */ rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); break; } /* Odd condition identifiers (lsb == 1) have inverted sense. */ return (!!rc ^ (condition & 1)); } static int get_cpl( struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { struct segment_register reg; if ( ctxt->regs->eflags & EFLG_VM ) return 3; if ( (ops->read_segment == NULL) || ops->read_segment(x86_seg_ss, ®, ctxt) ) return -1; return reg.attr.fields.dpl; } static int _mode_iopl( struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { int cpl = get_cpl(ctxt, ops); if ( cpl == -1 ) return -1; return (cpl <= ((ctxt->regs->eflags >> 12) & 3)); } #define mode_ring0() ({ \ int _cpl = get_cpl(ctxt, ops); \ fail_if(_cpl < 0); \ (_cpl == 0); \ }) #define mode_iopl() ({ \ int _iopl = _mode_iopl(ctxt, ops); \ fail_if(_iopl < 0); \ _iopl; \ }) static int ioport_access_check( unsigned int first_port, unsigned int bytes, struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { unsigned long iobmp; struct segment_register tr; int rc = X86EMUL_OKAY; if ( !(ctxt->regs->eflags & EFLG_VM) && mode_iopl() ) return X86EMUL_OKAY; fail_if(ops->read_segment == NULL); if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 ) return rc; /* Ensure that the TSS is valid and has an io-bitmap-offset field. */ if ( !tr.attr.fields.p || ((tr.attr.fields.type & 0xd) != 0x9) || (tr.limit < 0x67) ) goto raise_exception; if ( (rc = read_ulong(x86_seg_none, tr.base + 0x66, &iobmp, 2, ctxt, ops)) ) return rc; /* Ensure TSS includes two bytes including byte containing first port. */ iobmp += first_port / 8; if ( tr.limit <= iobmp ) goto raise_exception; if ( (rc = read_ulong(x86_seg_none, tr.base + iobmp, &iobmp, 2, ctxt, ops)) ) return rc; if ( (iobmp & (((1<inject_hw_exception == NULL); return ops->inject_hw_exception(EXC_GP, 0, ctxt) ? 
: X86EMUL_EXCEPTION; } static bool_t in_realmode( struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { unsigned long cr0; int rc; if ( ops->read_cr == NULL ) return 0; rc = ops->read_cr(0, &cr0, ctxt); return (!rc && !(cr0 & CR0_PE)); } static bool_t in_protmode( struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { return !(in_realmode(ctxt, ops) || (ctxt->regs->eflags & EFLG_VM)); } #define EAX 0 #define ECX 1 #define EDX 2 #define EBX 3 static bool_t vcpu_has( unsigned int eax, unsigned int reg, unsigned int bit, struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { unsigned int ebx = 0, ecx = 0, edx = 0; int rc = X86EMUL_OKAY; fail_if(!ops->cpuid); rc = ops->cpuid(&eax, &ebx, &ecx, &edx, ctxt); if ( rc == X86EMUL_OKAY ) { switch ( reg ) { case EAX: reg = eax; break; case EBX: reg = ebx; break; case ECX: reg = ecx; break; case EDX: reg = edx; break; default: BUG(); } if ( !(reg & (1U << bit)) ) rc = ~X86EMUL_OKAY; } done: return rc == X86EMUL_OKAY; } #define vcpu_has_lzcnt() vcpu_has(0x80000001, ECX, 5, ctxt, ops) #define vcpu_has_bmi1() vcpu_has(0x00000007, EBX, 3, ctxt, ops) #define vcpu_must_have(leaf, reg, bit) \ generate_exception_if(!vcpu_has(leaf, reg, bit, ctxt, ops), EXC_UD, -1) #define vcpu_must_have_mmx() vcpu_must_have(0x00000001, EDX, 23) #define vcpu_must_have_sse() vcpu_must_have(0x00000001, EDX, 25) #define vcpu_must_have_sse2() vcpu_must_have(0x00000001, EDX, 26) #define vcpu_must_have_sse3() vcpu_must_have(0x00000001, ECX, 0) #define vcpu_must_have_cx16() vcpu_must_have(0x00000001, ECX, 13) #define vcpu_must_have_avx() vcpu_must_have(0x00000001, ECX, 28) static int in_longmode( struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { uint64_t efer; if (ops->read_msr == NULL) return -1; ops->read_msr(MSR_EFER, &efer, ctxt); return !!(efer & EFER_LMA); } static int realmode_load_seg( enum x86_segment seg, uint16_t sel, struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { struct segment_register reg; int rc; if ( (rc = ops->read_segment(seg, ®, ctxt)) != 0 ) return rc; reg.sel = sel; reg.base = (uint32_t)sel << 4; return ops->write_segment(seg, ®, ctxt); } static int protmode_load_seg( enum x86_segment seg, uint16_t sel, struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { struct segment_register desctab, ss, segr; struct { uint32_t a, b; } desc; uint8_t dpl, rpl, cpl; uint32_t new_desc_b, a_flag = 0x100; int rc, fault_type = EXC_GP; /* NULL selector? */ if ( (sel & 0xfffc) == 0 ) { if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) ) goto raise_exn; memset(&segr, 0, sizeof(segr)); return ops->write_segment(seg, &segr, ctxt); } /* System segment descriptors must reside in the GDT. */ if ( !is_x86_user_segment(seg) && (sel & 4) ) goto raise_exn; if ( (rc = ops->read_segment(x86_seg_ss, &ss, ctxt)) || (rc = ops->read_segment((sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab, ctxt)) ) return rc; /* Check against descriptor table limit. */ if ( ((sel & 0xfff8) + 7) > desctab.limit ) goto raise_exn; if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8), &desc, sizeof(desc), ctxt)) ) return rc; /* Segment present in memory? */ if ( !(desc.b & (1u<<15)) ) { fault_type = EXC_NP; goto raise_exn; } if ( !is_x86_user_segment(seg) ) { /* System segments must have S flag == 0. */ if ( desc.b & (1u << 12) ) goto raise_exn; /* We do not support 64-bit descriptor types. */ if ( in_longmode(ctxt, ops) ) return X86EMUL_UNHANDLEABLE; } /* User segments must have S flag == 1. 
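 * (S is bit 12 of the descriptor's high word.  The type-specific checks
 * that follow compare DPL (desc.b bits 13-14), RPL (low two bits of the
 * selector) and CPL (taken from SS.DPL) according to which segment
 * register is being loaded.)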
*/ else if ( !(desc.b & (1u << 12)) ) goto raise_exn; dpl = (desc.b >> 13) & 3; rpl = sel & 3; cpl = ss.attr.fields.dpl; switch ( seg ) { case x86_seg_cs: /* Code segment? */ if ( !(desc.b & (1u<<11)) ) goto raise_exn; /* Non-conforming segment: check DPL against RPL. */ if ( ((desc.b & (6u<<9)) != (6u<<9)) && (dpl != rpl) ) goto raise_exn; break; case x86_seg_ss: /* Writable data segment? */ if ( (desc.b & (5u<<9)) != (1u<<9) ) goto raise_exn; if ( (dpl != cpl) || (dpl != rpl) ) goto raise_exn; break; case x86_seg_ldtr: /* LDT system segment? */ if ( (desc.b & (15u<<8)) != (2u<<8) ) goto raise_exn; goto skip_accessed_flag; case x86_seg_tr: /* Available TSS system segment? */ if ( (desc.b & (15u<<8)) != (9u<<8) ) goto raise_exn; a_flag = 0x200; /* busy flag */ break; default: /* Readable code or data segment? */ if ( (desc.b & (5u<<9)) == (4u<<9) ) goto raise_exn; /* Non-conforming segment: check DPL against RPL and CPL. */ if ( ((desc.b & (6u<<9)) != (6u<<9)) && ((dpl < cpl) || (dpl < rpl)) ) goto raise_exn; break; } /* Ensure Accessed flag is set. */ new_desc_b = desc.b | a_flag; if ( !(desc.b & a_flag) && ((rc = ops->cmpxchg( x86_seg_none, desctab.base + (sel & 0xfff8) + 4, &desc.b, &new_desc_b, 4, ctxt)) != 0) ) return rc; /* Force the Accessed flag in our local copy. */ desc.b |= a_flag; skip_accessed_flag: segr.base = (((desc.b << 0) & 0xff000000u) | ((desc.b << 16) & 0x00ff0000u) | ((desc.a >> 16) & 0x0000ffffu)); segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) | ((desc.b >> 12) & 0x0f00u)); segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu); if ( segr.attr.fields.g ) segr.limit = (segr.limit << 12) | 0xfffu; segr.sel = sel; return ops->write_segment(seg, &segr, ctxt); raise_exn: if ( ops->inject_hw_exception == NULL ) return X86EMUL_UNHANDLEABLE; if ( (rc = ops->inject_hw_exception(fault_type, sel & 0xfffc, ctxt)) ) return rc; return X86EMUL_EXCEPTION; } static int load_seg( enum x86_segment seg, uint16_t sel, struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { if ( (ops->read_segment == NULL) || (ops->write_segment == NULL) ) return X86EMUL_UNHANDLEABLE; if ( in_protmode(ctxt, ops) ) return protmode_load_seg(seg, sel, ctxt, ops); return realmode_load_seg(seg, sel, ctxt, ops); } void * decode_register( uint8_t modrm_reg, struct cpu_user_regs *regs, int highbyte_regs) { void *p; switch ( modrm_reg ) { case 0: p = ®s->eax; break; case 1: p = ®s->ecx; break; case 2: p = ®s->edx; break; case 3: p = ®s->ebx; break; case 4: p = (highbyte_regs ? ((unsigned char *)®s->eax + 1) : (unsigned char *)®s->esp); break; case 5: p = (highbyte_regs ? ((unsigned char *)®s->ecx + 1) : (unsigned char *)®s->ebp); break; case 6: p = (highbyte_regs ? ((unsigned char *)®s->edx + 1) : (unsigned char *)®s->esi); break; case 7: p = (highbyte_regs ? 
((unsigned char *)®s->ebx + 1) : (unsigned char *)®s->edi); break; #if defined(__x86_64__) case 8: p = ®s->r8; break; case 9: p = ®s->r9; break; case 10: p = ®s->r10; break; case 11: p = ®s->r11; break; case 12: mark_regs_dirty(regs); p = ®s->r12; break; case 13: mark_regs_dirty(regs); p = ®s->r13; break; case 14: mark_regs_dirty(regs); p = ®s->r14; break; case 15: mark_regs_dirty(regs); p = ®s->r15; break; #endif default: BUG(); p = NULL; break; } return p; } #define decode_segment_failed x86_seg_tr static enum x86_segment decode_segment(uint8_t modrm_reg) { switch ( modrm_reg ) { case 0: return x86_seg_es; case 1: return x86_seg_cs; case 2: return x86_seg_ss; case 3: return x86_seg_ds; case 4: return x86_seg_fs; case 5: return x86_seg_gs; default: break; } return decode_segment_failed; } int x86_emulate( struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops) { /* Shadow copy of register state. Committed on successful emulation. */ struct cpu_user_regs _regs = *ctxt->regs; uint8_t b, d, sib, sib_index, sib_base, twobyte = 0, rex_prefix = 0; uint8_t modrm = 0, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; union vex vex = {}; unsigned int op_bytes, def_op_bytes, ad_bytes, def_ad_bytes; bool_t lock_prefix = 0; int override_seg = -1, rc = X86EMUL_OKAY; struct operand src, dst; DECLARE_ALIGNED(mmval_t, mmval); /* * Data operand effective address (usually computed from ModRM). * Default is a memory operand relative to segment DS. */ struct operand ea = { .type = OP_MEM }; ea.mem.seg = x86_seg_ds; /* gcc may reject anon union initializer */ ctxt->retire.byte = 0; op_bytes = def_op_bytes = ad_bytes = def_ad_bytes = ctxt->addr_size/8; if ( op_bytes == 8 ) { op_bytes = def_op_bytes = 4; #ifndef __x86_64__ return X86EMUL_UNHANDLEABLE; #endif } /* Prefix bytes. */ for ( ; ; ) { switch ( b = insn_fetch_type(uint8_t) ) { case 0x66: /* operand-size override */ op_bytes = def_op_bytes ^ 6; if ( !vex.pfx ) vex.pfx = vex_66; break; case 0x67: /* address-size override */ ad_bytes = def_ad_bytes ^ (mode_64bit() ? 12 : 6); break; case 0x2e: /* CS override */ override_seg = x86_seg_cs; break; case 0x3e: /* DS override */ override_seg = x86_seg_ds; break; case 0x26: /* ES override */ override_seg = x86_seg_es; break; case 0x64: /* FS override */ override_seg = x86_seg_fs; break; case 0x65: /* GS override */ override_seg = x86_seg_gs; break; case 0x36: /* SS override */ override_seg = x86_seg_ss; break; case 0xf0: /* LOCK */ lock_prefix = 1; break; case 0xf2: /* REPNE/REPNZ */ vex.pfx = vex_f2; break; case 0xf3: /* REP/REPE/REPZ */ vex.pfx = vex_f3; break; case 0x40 ... 0x4f: /* REX */ if ( !mode_64bit() ) goto done_prefixes; rex_prefix = b; continue; default: goto done_prefixes; } /* Any legacy prefix after a REX prefix nullifies its effect. */ rex_prefix = 0; } done_prefixes: if ( rex_prefix & REX_W ) op_bytes = 8; /* Opcode byte(s). */ d = opcode_table[b]; if ( d == 0 ) { /* Two-byte opcode? */ if ( b == 0x0f ) { twobyte = 1; b = insn_fetch_type(uint8_t); d = twobyte_table[b]; } /* Unrecognised? */ if ( d == 0 ) goto cannot_emulate; } /* Lock prefix is allowed only on RMW instructions. */ generate_exception_if((d & Mov) && lock_prefix, EXC_UD, -1); /* ModRM and SIB bytes. 
*/ if ( d & ModRM ) { modrm = insn_fetch_type(uint8_t); modrm_mod = (modrm & 0xc0) >> 6; if ( !twobyte && ((b & ~1) == 0xc4) ) switch ( def_ad_bytes ) { default: BUG(); case 2: if ( in_realmode(ctxt, ops) || (_regs.eflags & EFLG_VM) ) break; /* fall through */ case 4: if ( modrm_mod != 3 ) break; /* fall through */ case 8: /* VEX */ generate_exception_if(rex_prefix || vex.pfx, EXC_UD, -1); vex.raw[0] = modrm; if ( b & 1 ) { vex.raw[1] = modrm; vex.opcx = vex_0f; vex.x = 1; vex.b = 1; vex.w = 0; } else { vex.raw[1] = insn_fetch_type(uint8_t); if ( mode_64bit() ) { if ( !vex.b ) rex_prefix |= REX_B; if ( !vex.x ) rex_prefix |= REX_X; if ( vex.w ) { rex_prefix |= REX_W; op_bytes = 8; } } } if ( mode_64bit() && !vex.r ) rex_prefix |= REX_R; fail_if(vex.opcx != vex_0f); twobyte = 1; b = insn_fetch_type(uint8_t); d = twobyte_table[b]; /* Unrecognised? */ if ( d == 0 ) goto cannot_emulate; modrm = insn_fetch_type(uint8_t); modrm_mod = (modrm & 0xc0) >> 6; break; } modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3); modrm_rm = modrm & 0x07; if ( modrm_mod == 3 ) { modrm_rm |= (rex_prefix & 1) << 3; ea.type = OP_REG; ea.reg = decode_register( modrm_rm, &_regs, (d & ByteOp) && (rex_prefix == 0)); } else if ( ad_bytes == 2 ) { /* 16-bit ModR/M decode. */ switch ( modrm_rm ) { case 0: ea.mem.off = _regs.ebx + _regs.esi; break; case 1: ea.mem.off = _regs.ebx + _regs.edi; break; case 2: ea.mem.seg = x86_seg_ss; ea.mem.off = _regs.ebp + _regs.esi; break; case 3: ea.mem.seg = x86_seg_ss; ea.mem.off = _regs.ebp + _regs.edi; break; case 4: ea.mem.off = _regs.esi; break; case 5: ea.mem.off = _regs.edi; break; case 6: if ( modrm_mod == 0 ) break; ea.mem.seg = x86_seg_ss; ea.mem.off = _regs.ebp; break; case 7: ea.mem.off = _regs.ebx; break; } switch ( modrm_mod ) { case 0: if ( modrm_rm == 6 ) ea.mem.off = insn_fetch_type(int16_t); break; case 1: ea.mem.off += insn_fetch_type(int8_t); break; case 2: ea.mem.off += insn_fetch_type(int16_t); break; } ea.mem.off = truncate_ea(ea.mem.off); } else { /* 32/64-bit ModR/M decode. */ if ( modrm_rm == 4 ) { sib = insn_fetch_type(uint8_t); sib_index = ((sib >> 3) & 7) | ((rex_prefix << 2) & 8); sib_base = (sib & 7) | ((rex_prefix << 3) & 8); if ( sib_index != 4 ) ea.mem.off = *(long*)decode_register(sib_index, &_regs, 0); ea.mem.off <<= (sib >> 6) & 3; if ( (modrm_mod == 0) && ((sib_base & 7) == 5) ) ea.mem.off += insn_fetch_type(int32_t); else if ( sib_base == 4 ) { ea.mem.seg = x86_seg_ss; ea.mem.off += _regs.esp; if ( !twobyte && (b == 0x8f) ) /* POP computes its EA post increment. */ ea.mem.off += ((mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes); } else if ( sib_base == 5 ) { ea.mem.seg = x86_seg_ss; ea.mem.off += _regs.ebp; } else ea.mem.off += *(long*)decode_register(sib_base, &_regs, 0); } else { modrm_rm |= (rex_prefix & 1) << 3; ea.mem.off = *(long *)decode_register(modrm_rm, &_regs, 0); if ( (modrm_rm == 5) && (modrm_mod != 0) ) ea.mem.seg = x86_seg_ss; } switch ( modrm_mod ) { case 0: if ( (modrm_rm & 7) != 5 ) break; ea.mem.off = insn_fetch_type(int32_t); if ( !mode_64bit() ) break; /* Relative to RIP of next instruction. Argh! */ ea.mem.off += _regs.eip; if ( (d & SrcMask) == SrcImm ) ea.mem.off += (d & ByteOp) ? 1 : ((op_bytes == 8) ? 4 : op_bytes); else if ( (d & SrcMask) == SrcImmByte ) ea.mem.off += 1; else if ( !twobyte && ((b & 0xfe) == 0xf6) && ((modrm_reg & 7) <= 1) ) /* Special case in Grp3: test has immediate operand. */ ea.mem.off += (d & ByteOp) ? 1 : ((op_bytes == 8) ? 
4 : op_bytes); else if ( twobyte && ((b & 0xf7) == 0xa4) ) /* SHLD/SHRD with immediate byte third operand. */ ea.mem.off++; break; case 1: ea.mem.off += insn_fetch_type(int8_t); break; case 2: ea.mem.off += insn_fetch_type(int32_t); break; } ea.mem.off = truncate_ea(ea.mem.off); } } if ( override_seg != -1 ) ea.mem.seg = override_seg; /* Early operand adjustments. */ if ( !twobyte ) switch ( b ) { case 0xf6 ... 0xf7: /* Grp3 */ switch ( modrm_reg & 7 ) { case 0 ... 1: /* test */ d = (d & ~SrcMask) | SrcImm; break; case 4: /* mul */ case 5: /* imul */ case 6: /* div */ case 7: /* idiv */ d = (d & (ByteOp | ModRM)) | DstImplicit | SrcMem; break; } break; case 0xff: /* Grp5 */ switch ( modrm_reg & 7 ) { case 2: /* call (near) */ case 4: /* jmp (near) */ case 6: /* push */ if ( mode_64bit() && op_bytes == 4 ) op_bytes = 8; /* fall through */ case 3: /* call (far, absolute indirect) */ case 5: /* jmp (far, absolute indirect) */ d = DstNone|SrcMem|ModRM; break; } break; } /* Decode and fetch the source operand: register, memory or immediate. */ switch ( d & SrcMask ) { case SrcNone: /* case SrcImplicit: */ src.type = OP_NONE; break; case SrcReg: src.type = OP_REG; if ( d & ByteOp ) { src.reg = decode_register(modrm_reg, &_regs, (rex_prefix == 0)); src.val = *(uint8_t *)src.reg; src.bytes = 1; } else { src.reg = decode_register(modrm_reg, &_regs, 0); switch ( (src.bytes = op_bytes) ) { case 2: src.val = *(uint16_t *)src.reg; break; case 4: src.val = *(uint32_t *)src.reg; break; case 8: src.val = *(uint64_t *)src.reg; break; } } break; case SrcMem16: ea.bytes = 2; goto srcmem_common; case SrcMem: ea.bytes = (d & ByteOp) ? 1 : op_bytes; srcmem_common: src = ea; if ( src.type == OP_REG ) { switch ( src.bytes ) { case 1: src.val = *(uint8_t *)src.reg; break; case 2: src.val = *(uint16_t *)src.reg; break; case 4: src.val = *(uint32_t *)src.reg; break; case 8: src.val = *(uint64_t *)src.reg; break; } } else if ( (rc = read_ulong(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt, ops)) ) goto done; break; case SrcImm: src.type = OP_IMM; src.bytes = (d & ByteOp) ? 1 : op_bytes; if ( src.bytes == 8 ) src.bytes = 4; /* NB. Immediates are sign-extended as necessary. */ switch ( src.bytes ) { case 1: src.val = insn_fetch_type(int8_t); break; case 2: src.val = insn_fetch_type(int16_t); break; case 4: src.val = insn_fetch_type(int32_t); break; } break; case SrcImmByte: src.type = OP_IMM; src.bytes = 1; src.val = insn_fetch_type(int8_t); break; } /* Decode and fetch the destination operand: register or memory. */ switch ( d & DstMask ) { case DstNone: /* case DstImplicit: */ /* * The only implicit-operands instructions allowed a LOCK prefix are * CMPXCHG{8,16}B, MOV CRn, MOV DRn. 
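 * (Hence the check just below: with a LOCK prefix present, only opcode
 * bytes 0x20-0x23 (MOV to/from CRn/DRn) and 0xc7 (CMPXCHG{8,16}B) escape
 * the #UD; every other implicit-destination decode with LOCK faults.)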
*/ generate_exception_if( lock_prefix && ((b < 0x20) || (b > 0x23)) && /* MOV CRn/DRn */ (b != 0xc7), /* CMPXCHG{8,16}B */ EXC_UD, -1); dst.type = OP_NONE; break; case DstReg: generate_exception_if(lock_prefix, EXC_UD, -1); dst.type = OP_REG; if ( d & ByteOp ) { dst.reg = decode_register(modrm_reg, &_regs, (rex_prefix == 0)); dst.val = *(uint8_t *)dst.reg; dst.bytes = 1; } else { dst.reg = decode_register(modrm_reg, &_regs, 0); switch ( (dst.bytes = op_bytes) ) { case 2: dst.val = *(uint16_t *)dst.reg; break; case 4: dst.val = *(uint32_t *)dst.reg; break; case 8: dst.val = *(uint64_t *)dst.reg; break; } } break; case DstBitBase: if ( ((d & SrcMask) == SrcImmByte) || (ea.type == OP_REG) ) { src.val &= (op_bytes << 3) - 1; } else { /* * EA += BitOffset DIV op_bytes*8 * BitOffset = BitOffset MOD op_bytes*8 * DIV truncates towards negative infinity. * MOD always produces a positive result. */ if ( op_bytes == 2 ) src.val = (int16_t)src.val; else if ( op_bytes == 4 ) src.val = (int32_t)src.val; if ( (long)src.val < 0 ) { unsigned long byte_offset; byte_offset = op_bytes + (((-src.val-1) >> 3) & ~(op_bytes-1)); ea.mem.off -= byte_offset; src.val = (byte_offset << 3) + src.val; } else { ea.mem.off += (src.val >> 3) & ~(op_bytes - 1); src.val &= (op_bytes << 3) - 1; } } /* Becomes a normal DstMem operation from here on. */ d = (d & ~DstMask) | DstMem; case DstMem: ea.bytes = (d & ByteOp) ? 1 : op_bytes; dst = ea; if ( dst.type == OP_REG ) { generate_exception_if(lock_prefix, EXC_UD, -1); switch ( dst.bytes ) { case 1: dst.val = *(uint8_t *)dst.reg; break; case 2: dst.val = *(uint16_t *)dst.reg; break; case 4: dst.val = *(uint32_t *)dst.reg; break; case 8: dst.val = *(uint64_t *)dst.reg; break; } } else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */ { if ( (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt, ops)) ) goto done; dst.orig_val = dst.val; } break; } if ( twobyte ) goto twobyte_insn; switch ( b ) { case 0x00 ... 0x05: add: /* add */ emulate_2op_SrcV("add", src, dst, _regs.eflags); break; case 0x08 ... 0x0d: or: /* or */ emulate_2op_SrcV("or", src, dst, _regs.eflags); break; case 0x10 ... 0x15: adc: /* adc */ emulate_2op_SrcV("adc", src, dst, _regs.eflags); break; case 0x18 ... 0x1d: sbb: /* sbb */ emulate_2op_SrcV("sbb", src, dst, _regs.eflags); break; case 0x20 ... 0x25: and: /* and */ emulate_2op_SrcV("and", src, dst, _regs.eflags); break; case 0x28 ... 0x2d: sub: /* sub */ emulate_2op_SrcV("sub", src, dst, _regs.eflags); break; case 0x30 ... 0x35: xor: /* xor */ emulate_2op_SrcV("xor", src, dst, _regs.eflags); break; case 0x38 ... 0x3d: cmp: /* cmp */ emulate_2op_SrcV("cmp", src, dst, _regs.eflags); dst.type = OP_NONE; break; case 0x06: /* push %%es */ { struct segment_register reg; src.val = x86_seg_es; push_seg: generate_exception_if(mode_64bit() && !twobyte, EXC_UD, -1); fail_if(ops->read_segment == NULL); if ( (rc = ops->read_segment(src.val, ®, ctxt)) != 0 ) return rc; /* 64-bit mode: PUSH defaults to a 64-bit operand. */ if ( mode_64bit() && (op_bytes == 4) ) op_bytes = 8; if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), ®.sel, op_bytes, ctxt)) != 0 ) goto done; break; } case 0x07: /* pop %%es */ src.val = x86_seg_es; pop_seg: generate_exception_if(mode_64bit() && !twobyte, EXC_UD, -1); fail_if(ops->write_segment == NULL); /* 64-bit mode: POP defaults to a 64-bit operand. 
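 * (In long mode this path is only reached via the two-byte pop %fs/%gs
 * forms, hence the !twobyte test in the #UD check above.  A 32-bit operand
 * size is promoted to 64 so a full stack slot is popped, and only the low
 * 16 bits of the popped value are handed to load_seg() as the selector.)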
*/ if ( mode_64bit() && (op_bytes == 4) ) op_bytes = 8; if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), &dst.val, op_bytes, ctxt, ops)) != 0 ) goto done; if ( (rc = load_seg(src.val, (uint16_t)dst.val, ctxt, ops)) != 0 ) return rc; break; case 0x0e: /* push %%cs */ src.val = x86_seg_cs; goto push_seg; case 0x16: /* push %%ss */ src.val = x86_seg_ss; goto push_seg; case 0x17: /* pop %%ss */ src.val = x86_seg_ss; ctxt->retire.flags.mov_ss = 1; goto pop_seg; case 0x1e: /* push %%ds */ src.val = x86_seg_ds; goto push_seg; case 0x1f: /* pop %%ds */ src.val = x86_seg_ds; goto pop_seg; case 0x27: /* daa */ { uint8_t al = _regs.eax; unsigned long eflags = _regs.eflags; generate_exception_if(mode_64bit(), EXC_UD, -1); _regs.eflags &= ~(EFLG_CF|EFLG_AF); if ( ((al & 0x0f) > 9) || (eflags & EFLG_AF) ) { *(uint8_t *)&_regs.eax += 6; _regs.eflags |= EFLG_AF; } if ( (al > 0x99) || (eflags & EFLG_CF) ) { *(uint8_t *)&_regs.eax += 0x60; _regs.eflags |= EFLG_CF; } _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF); _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0; _regs.eflags |= (( int8_t)_regs.eax < 0) ? EFLG_SF : 0; _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0; break; } case 0x2f: /* das */ { uint8_t al = _regs.eax; unsigned long eflags = _regs.eflags; generate_exception_if(mode_64bit(), EXC_UD, -1); _regs.eflags &= ~(EFLG_CF|EFLG_AF); if ( ((al & 0x0f) > 9) || (eflags & EFLG_AF) ) { _regs.eflags |= EFLG_AF; if ( (al < 6) || (eflags & EFLG_CF) ) _regs.eflags |= EFLG_CF; *(uint8_t *)&_regs.eax -= 6; } if ( (al > 0x99) || (eflags & EFLG_CF) ) { *(uint8_t *)&_regs.eax -= 0x60; _regs.eflags |= EFLG_CF; } _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF); _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0; _regs.eflags |= (( int8_t)_regs.eax < 0) ? EFLG_SF : 0; _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0; break; } case 0x37: /* aaa */ case 0x3f: /* aas */ generate_exception_if(mode_64bit(), EXC_UD, -1); _regs.eflags &= ~EFLG_CF; if ( ((uint8_t)_regs.eax > 9) || (_regs.eflags & EFLG_AF) ) { ((uint8_t *)&_regs.eax)[0] += (b == 0x37) ? 6 : -6; ((uint8_t *)&_regs.eax)[1] += (b == 0x37) ? 1 : -1; _regs.eflags |= EFLG_CF | EFLG_AF; } ((uint8_t *)&_regs.eax)[0] &= 0x0f; break; case 0x40 ... 0x4f: /* inc/dec reg */ dst.type = OP_REG; dst.reg = decode_register(b & 7, &_regs, 0); dst.bytes = op_bytes; dst.val = *dst.reg; if ( b & 8 ) emulate_1op("dec", dst, _regs.eflags); else emulate_1op("inc", dst, _regs.eflags); break; case 0x50 ... 0x57: /* push reg */ src.val = *(unsigned long *)decode_register( (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0); goto push; case 0x58 ... 
0x5f: /* pop reg */ dst.type = OP_REG; dst.reg = decode_register( (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0); dst.bytes = op_bytes; if ( mode_64bit() && (dst.bytes == 4) ) dst.bytes = 8; if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes), &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; break; case 0x60: /* pusha */ { int i; unsigned long regs[] = { _regs.eax, _regs.ecx, _regs.edx, _regs.ebx, _regs.esp, _regs.ebp, _regs.esi, _regs.edi }; generate_exception_if(mode_64bit(), EXC_UD, -1); for ( i = 0; i < 8; i++ ) if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), ®s[i], op_bytes, ctxt)) != 0 ) goto done; break; } case 0x61: /* popa */ { int i; unsigned long dummy_esp, *regs[] = { (unsigned long *)&_regs.edi, (unsigned long *)&_regs.esi, (unsigned long *)&_regs.ebp, (unsigned long *)&dummy_esp, (unsigned long *)&_regs.ebx, (unsigned long *)&_regs.edx, (unsigned long *)&_regs.ecx, (unsigned long *)&_regs.eax }; generate_exception_if(mode_64bit(), EXC_UD, -1); for ( i = 0; i < 8; i++ ) { if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), &dst.val, op_bytes, ctxt, ops)) != 0 ) goto done; if ( op_bytes == 2 ) *(uint16_t *)regs[i] = (uint16_t)dst.val; else *regs[i] = dst.val; /* 64b: zero-ext done by read_ulong() */ } break; } case 0x62: /* bound */ { unsigned long src_val2; int lb, ub, idx; generate_exception_if(mode_64bit() || (src.type != OP_MEM), EXC_UD, -1); if ( (rc = read_ulong(src.mem.seg, src.mem.off + op_bytes, &src_val2, op_bytes, ctxt, ops)) ) goto done; ub = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2; lb = (op_bytes == 2) ? (int16_t)src.val : (int32_t)src.val; idx = (op_bytes == 2) ? (int16_t)dst.val : (int32_t)dst.val; generate_exception_if((idx < lb) || (idx > ub), EXC_BR, -1); dst.type = OP_NONE; break; } case 0x63: /* movsxd (x86/64) / arpl (x86/32) */ if ( mode_64bit() ) { /* movsxd */ if ( ea.type == OP_REG ) src.val = *ea.reg; else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, &src.val, 4, ctxt, ops)) ) goto done; dst.val = (int32_t)src.val; } else { /* arpl */ unsigned int src_rpl = dst.val & 3; dst = ea; dst.bytes = 2; if ( dst.type == OP_REG ) dst.val = *dst.reg; else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off, &dst.val, 2, ctxt, ops)) ) goto done; if ( src_rpl > (dst.val & 3) ) { _regs.eflags |= EFLG_ZF; dst.val = (dst.val & ~3) | src_rpl; } else { _regs.eflags &= ~EFLG_ZF; dst.type = OP_NONE; } generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1); } break; case 0x68: /* push imm{16,32,64} */ src.val = ((op_bytes == 2) ? (int32_t)insn_fetch_type(int16_t) : insn_fetch_type(int32_t)); goto push; case 0x69: /* imul imm16/32 */ case 0x6b: /* imul imm8 */ if ( ea.type == OP_REG ) dst.val = *ea.reg; else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val, op_bytes, ctxt, ops)) ) goto done; goto imul; case 0x6a: /* push imm8 */ src.val = insn_fetch_type(int8_t); push: d |= Mov; /* force writeback */ dst.type = OP_MEM; dst.bytes = op_bytes; if ( mode_64bit() && (dst.bytes == 4) ) dst.bytes = 8; dst.val = src.val; dst.mem.seg = x86_seg_ss; dst.mem.off = sp_pre_dec(dst.bytes); break; case 0x6c ... 0x6d: /* ins %dx,%es:%edi */ { unsigned long nr_reps = get_rep_prefix(); unsigned int port = (uint16_t)_regs.edx; dst.bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 
4 : op_bytes; dst.mem.seg = x86_seg_es; dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes); if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 ) goto done; if ( (nr_reps > 1) && (ops->rep_ins != NULL) && ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes, &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) ) { if ( rc != 0 ) goto done; } else { fail_if(ops->read_io == NULL); if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 ) goto done; dst.type = OP_MEM; nr_reps = 1; } register_address_increment( _regs.edi, nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes)); put_rep_prefix(nr_reps); break; } case 0x6e ... 0x6f: /* outs %esi,%dx */ { unsigned long nr_reps = get_rep_prefix(); unsigned int port = (uint16_t)_regs.edx; dst.bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes; ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes); if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 ) goto done; if ( (nr_reps > 1) && (ops->rep_outs != NULL) && ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes, &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) ) { if ( rc != 0 ) goto done; } else { if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; fail_if(ops->write_io == NULL); if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 ) goto done; nr_reps = 1; } register_address_increment( _regs.esi, nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes)); put_rep_prefix(nr_reps); break; } case 0x70 ... 0x7f: /* jcc (short) */ { int rel = insn_fetch_type(int8_t); if ( test_cc(b, _regs.eflags) ) jmp_rel(rel); break; } case 0x82: /* Grp1 (x86/32 only) */ generate_exception_if(mode_64bit(), EXC_UD, -1); case 0x80: case 0x81: case 0x83: /* Grp1 */ switch ( modrm_reg & 7 ) { case 0: goto add; case 1: goto or; case 2: goto adc; case 3: goto sbb; case 4: goto and; case 5: goto sub; case 6: goto xor; case 7: goto cmp; } break; case 0xa8 ... 0xa9: /* test imm,%%eax */ case 0x84 ... 0x85: test: /* test */ emulate_2op_SrcV("test", src, dst, _regs.eflags); dst.type = OP_NONE; break; case 0x86 ... 0x87: xchg: /* xchg */ /* Write back the register source. */ switch ( dst.bytes ) { case 1: *(uint8_t *)src.reg = (uint8_t)dst.val; break; case 2: *(uint16_t *)src.reg = (uint16_t)dst.val; break; case 4: *src.reg = (uint32_t)dst.val; break; /* 64b reg: zero-extend */ case 8: *src.reg = dst.val; break; } /* Write back the memory destination with implicit LOCK prefix. */ dst.val = src.val; lock_prefix = 1; break; case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ generate_exception_if((modrm_reg & 7) != 0, EXC_UD, -1); case 0x88 ... 
0x8b: /* mov */ dst.val = src.val; break; case 0x8c: /* mov Sreg,r/m */ { struct segment_register reg; enum x86_segment seg = decode_segment(modrm_reg); generate_exception_if(seg == decode_segment_failed, EXC_UD, -1); fail_if(ops->read_segment == NULL); if ( (rc = ops->read_segment(seg, ®, ctxt)) != 0 ) goto done; dst.val = reg.sel; if ( dst.type == OP_MEM ) dst.bytes = 2; break; } case 0x8e: /* mov r/m,Sreg */ { enum x86_segment seg = decode_segment(modrm_reg); generate_exception_if(seg == decode_segment_failed, EXC_UD, -1); generate_exception_if(seg == x86_seg_cs, EXC_UD, -1); if ( (rc = load_seg(seg, (uint16_t)src.val, ctxt, ops)) != 0 ) goto done; if ( seg == x86_seg_ss ) ctxt->retire.flags.mov_ss = 1; dst.type = OP_NONE; break; } case 0x8d: /* lea */ generate_exception_if(ea.type != OP_MEM, EXC_UD, -1); dst.val = ea.mem.off; break; case 0x8f: /* pop (sole member of Grp1a) */ generate_exception_if((modrm_reg & 7) != 0, EXC_UD, -1); /* 64-bit mode: POP defaults to a 64-bit operand. */ if ( mode_64bit() && (dst.bytes == 4) ) dst.bytes = 8; if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes), &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; break; case 0x90: /* nop / xchg %%r8,%%rax */ if ( !(rex_prefix & 1) ) break; /* nop */ case 0x91 ... 0x97: /* xchg reg,%%rax */ src.type = dst.type = OP_REG; src.bytes = dst.bytes = op_bytes; src.reg = (unsigned long *)&_regs.eax; src.val = *src.reg; dst.reg = decode_register( (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0); dst.val = *dst.reg; goto xchg; case 0x98: /* cbw/cwde/cdqe */ switch ( op_bytes ) { case 2: *(int16_t *)&_regs.eax = (int8_t)_regs.eax; break; /* cbw */ case 4: _regs.eax = (uint32_t)(int16_t)_regs.eax; break; /* cwde */ case 8: _regs.eax = (int32_t)_regs.eax; break; /* cdqe */ } break; case 0x99: /* cwd/cdq/cqo */ switch ( op_bytes ) { case 2: *(int16_t *)&_regs.edx = ((int16_t)_regs.eax < 0) ? -1 : 0; break; case 4: _regs.edx = (uint32_t)(((int32_t)_regs.eax < 0) ? -1 : 0); break; #ifdef __x86_64__ /* compile warning with some versions of 32-bit gcc */ case 8: _regs.edx = ((int64_t)_regs.eax < 0) ? -1 : 0; break; #endif } break; case 0x9a: /* call (far, absolute) */ { struct segment_register reg; uint16_t sel; uint32_t eip; generate_exception_if(mode_64bit(), EXC_UD, -1); fail_if(ops->read_segment == NULL); eip = insn_fetch_bytes(op_bytes); sel = insn_fetch_type(uint16_t); if ( (rc = ops->read_segment(x86_seg_cs, ®, ctxt)) || (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), ®.sel, op_bytes, ctxt)) || (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), &_regs.eip, op_bytes, ctxt)) ) goto done; if ( (rc = load_seg(x86_seg_cs, sel, ctxt, ops)) != 0 ) goto done; _regs.eip = eip; break; } case 0x9b: /* wait/fwait */ emulate_fpu_insn("fwait"); break; case 0x9c: /* pushf */ src.val = _regs.eflags; goto push; case 0x9d: /* popf */ { uint32_t mask = EFLG_VIP | EFLG_VIF | EFLG_VM; if ( !mode_ring0() ) mask |= EFLG_IOPL; if ( !mode_iopl() ) mask |= EFLG_IF; /* 64-bit mode: POP defaults to a 64-bit operand. 
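 * (mask collects the flag bits this CPL may not change: VIP/VIF/VM always,
 * IOPL unless running at ring 0, IF unless IOPL permits.  Those bits are
 * kept from the old EFLAGS; the remaining, architecturally writable bits
 * (dst.val &= 0x257fd5) are taken from the popped value, with bit 1 forced
 * to 1 as the architecture requires.)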
*/ if ( mode_64bit() && (op_bytes == 4) ) op_bytes = 8; if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), &dst.val, op_bytes, ctxt, ops)) != 0 ) goto done; if ( op_bytes == 2 ) dst.val = (uint16_t)dst.val | (_regs.eflags & 0xffff0000u); dst.val &= 0x257fd5; _regs.eflags &= mask; _regs.eflags |= (uint32_t)(dst.val & ~mask) | 0x02; break; } case 0x9e: /* sahf */ *(uint8_t *)&_regs.eflags = (((uint8_t *)&_regs.eax)[1] & 0xd7) | 0x02; break; case 0x9f: /* lahf */ ((uint8_t *)&_regs.eax)[1] = (_regs.eflags & 0xd7) | 0x02; break; case 0xa0 ... 0xa1: /* mov mem.offs,{%al,%ax,%eax,%rax} */ /* Source EA is not encoded via ModRM. */ dst.type = OP_REG; dst.reg = (unsigned long *)&_regs.eax; dst.bytes = (d & ByteOp) ? 1 : op_bytes; if ( (rc = read_ulong(ea.mem.seg, insn_fetch_bytes(ad_bytes), &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; break; case 0xa2 ... 0xa3: /* mov {%al,%ax,%eax,%rax},mem.offs */ /* Destination EA is not encoded via ModRM. */ dst.type = OP_MEM; dst.mem.seg = ea.mem.seg; dst.mem.off = insn_fetch_bytes(ad_bytes); dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.val = (unsigned long)_regs.eax; break; case 0xa4 ... 0xa5: /* movs */ { unsigned long nr_reps = get_rep_prefix(); dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.mem.seg = x86_seg_es; dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes); src.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes); if ( (nr_reps > 1) && (ops->rep_movs != NULL) && ((rc = ops->rep_movs(ea.mem.seg, src.mem.off, dst.mem.seg, dst.mem.off, dst.bytes, &nr_reps, ctxt)) != X86EMUL_UNHANDLEABLE) ) { if ( rc != 0 ) goto done; } else { if ( (rc = read_ulong(ea.mem.seg, src.mem.off, &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; dst.type = OP_MEM; nr_reps = 1; } register_address_increment( _regs.esi, nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes)); register_address_increment( _regs.edi, nr_reps * ((_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes)); put_rep_prefix(nr_reps); break; } case 0xa6 ... 0xa7: /* cmps */ { unsigned long next_eip = _regs.eip; get_rep_prefix(); src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes; if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), &dst.val, dst.bytes, ctxt, ops)) || (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi), &src.val, src.bytes, ctxt, ops)) ) goto done; register_address_increment( _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); register_address_increment( _regs.edi, (_regs.eflags & EFLG_DF) ? -src.bytes : src.bytes); put_rep_prefix(1); /* cmp: dst - src ==> src=*%%edi,dst=*%%esi ==> *%%esi - *%%edi */ emulate_2op_SrcV("cmp", src, dst, _regs.eflags); if ( (repe_prefix() && !(_regs.eflags & EFLG_ZF)) || (repne_prefix() && (_regs.eflags & EFLG_ZF)) ) _regs.eip = next_eip; break; } case 0xaa ... 0xab: /* stos */ { /* unsigned long max_reps = */get_rep_prefix(); dst.type = OP_MEM; dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.mem.seg = x86_seg_es; dst.mem.off = truncate_ea(_regs.edi); dst.val = _regs.eax; register_address_increment( _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); put_rep_prefix(1); break; } case 0xac ... 0xad: /* lods */ { /* unsigned long max_reps = */get_rep_prefix(); dst.type = OP_REG; dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.reg = (unsigned long *)&_regs.eax; if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; register_address_increment( _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); put_rep_prefix(1); break; } case 0xae ... 
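/* SCAS (0xae/0xaf): compares the accumulator with the operand at %es:%edi. One iteration is emulated per pass; put_rep_prefix() rewinds EIP to repeat while rCX is non-zero, and the ZF check afterwards sets EIP to next_eip once the REPE/REPNE termination condition is met. */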
0xaf: /* scas */ { unsigned long next_eip = _regs.eip; get_rep_prefix(); src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.val = _regs.eax; if ( (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi), &src.val, src.bytes, ctxt, ops)) != 0 ) goto done; register_address_increment( _regs.edi, (_regs.eflags & EFLG_DF) ? -src.bytes : src.bytes); put_rep_prefix(1); /* cmp: dst - src ==> src=*%%edi,dst=%%eax ==> %%eax - *%%edi */ emulate_2op_SrcV("cmp", src, dst, _regs.eflags); if ( (repe_prefix() && !(_regs.eflags & EFLG_ZF)) || (repne_prefix() && (_regs.eflags & EFLG_ZF)) ) _regs.eip = next_eip; break; } case 0xb0 ... 0xb7: /* mov imm8,r8 */ dst.reg = decode_register( (b & 7) | ((rex_prefix & 1) << 3), &_regs, (rex_prefix == 0)); dst.val = src.val; break; case 0xb8 ... 0xbf: /* mov imm{16,32,64},r{16,32,64} */ if ( dst.bytes == 8 ) /* Fetch more bytes to obtain imm64 */ src.val = ((uint32_t)src.val | ((uint64_t)insn_fetch_type(uint32_t) << 32)); dst.reg = decode_register( (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0); dst.val = src.val; break; case 0xc0 ... 0xc1: grp2: /* Grp2 */ switch ( modrm_reg & 7 ) { case 0: /* rol */ emulate_2op_SrcB("rol", src, dst, _regs.eflags); break; case 1: /* ror */ emulate_2op_SrcB("ror", src, dst, _regs.eflags); break; case 2: /* rcl */ emulate_2op_SrcB("rcl", src, dst, _regs.eflags); break; case 3: /* rcr */ emulate_2op_SrcB("rcr", src, dst, _regs.eflags); break; case 4: /* sal/shl */ case 6: /* sal/shl */ emulate_2op_SrcB("sal", src, dst, _regs.eflags); break; case 5: /* shr */ emulate_2op_SrcB("shr", src, dst, _regs.eflags); break; case 7: /* sar */ emulate_2op_SrcB("sar", src, dst, _regs.eflags); break; } break; case 0xc2: /* ret imm16 (near) */ case 0xc3: /* ret (near) */ { int offset = (b == 0xc2) ? insn_fetch_type(uint16_t) : 0; op_bytes = ((op_bytes == 4) && mode_64bit()) ? 8 : op_bytes; if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset), &dst.val, op_bytes, ctxt, ops)) != 0 ) goto done; _regs.eip = dst.val; break; } case 0xc4: /* les */ { unsigned long sel; dst.val = x86_seg_es; les: /* dst.val identifies the segment */ generate_exception_if(mode_64bit() && !twobyte, EXC_UD, -1); generate_exception_if(src.type != OP_MEM, EXC_UD, -1); if ( (rc = read_ulong(src.mem.seg, src.mem.off + src.bytes, &sel, 2, ctxt, ops)) != 0 ) goto done; if ( (rc = load_seg(dst.val, (uint16_t)sel, ctxt, ops)) != 0 ) goto done; dst.val = src.val; break; } case 0xc5: /* lds */ dst.val = x86_seg_ds; goto les; case 0xc8: /* enter imm16,imm8 */ { uint16_t size = insn_fetch_type(uint16_t); uint8_t depth = insn_fetch_type(uint8_t) & 31; int i; dst.type = OP_REG; dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes; dst.reg = (unsigned long *)&_regs.ebp; if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), &_regs.ebp, dst.bytes, ctxt)) ) goto done; dst.val = _regs.esp; if ( depth > 0 ) { for ( i = 1; i < depth; i++ ) { unsigned long ebp, temp_data; ebp = truncate_word(_regs.ebp - i*dst.bytes, ctxt->sp_size/8); if ( (rc = read_ulong(x86_seg_ss, ebp, &temp_data, dst.bytes, ctxt, ops)) || (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), &temp_data, dst.bytes, ctxt)) ) goto done; } if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), &dst.val, dst.bytes, ctxt)) ) goto done; } sp_pre_dec(size); break; } case 0xc9: /* leave */ /* First writeback, to %%esp. */ dst.type = OP_REG; dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes; dst.reg = (unsigned long *)&_regs.esp; dst.val = _regs.ebp; /* Flush first writeback, since there is a second. 
*/ switch ( dst.bytes ) { case 1: *(uint8_t *)dst.reg = (uint8_t)dst.val; break; case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break; case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */ case 8: *dst.reg = dst.val; break; } /* Second writeback, to %%ebp. */ dst.reg = (unsigned long *)&_regs.ebp; if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes), &dst.val, dst.bytes, ctxt, ops)) ) goto done; break; case 0xca: /* ret imm16 (far) */ case 0xcb: /* ret (far) */ { int offset = (b == 0xca) ? insn_fetch_type(uint16_t) : 0; if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), &dst.val, op_bytes, ctxt, ops)) || (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset), &src.val, op_bytes, ctxt, ops)) || (rc = load_seg(x86_seg_cs, (uint16_t)src.val, ctxt, ops)) ) goto done; _regs.eip = dst.val; break; } case 0xcc: /* int3 */ src.val = EXC_BP; goto swint; case 0xcd: /* int imm8 */ src.val = insn_fetch_type(uint8_t); swint: fail_if(ops->inject_sw_interrupt == NULL); rc = ops->inject_sw_interrupt(src.val, _regs.eip - ctxt->regs->eip, ctxt) ? : X86EMUL_EXCEPTION; goto done; case 0xce: /* into */ generate_exception_if(mode_64bit(), EXC_UD, -1); if ( !(_regs.eflags & EFLG_OF) ) break; src.val = EXC_OF; goto swint; case 0xcf: /* iret */ { unsigned long cs, eip, eflags; uint32_t mask = EFLG_VIP | EFLG_VIF | EFLG_VM; if ( !mode_ring0() ) mask |= EFLG_IOPL; if ( !mode_iopl() ) mask |= EFLG_IF; fail_if(!in_realmode(ctxt, ops)); if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), &eip, op_bytes, ctxt, ops)) || (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), &cs, op_bytes, ctxt, ops)) || (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), &eflags, op_bytes, ctxt, ops)) ) goto done; if ( op_bytes == 2 ) eflags = (uint16_t)eflags | (_regs.eflags & 0xffff0000u); eflags &= 0x257fd5; _regs.eflags &= mask; _regs.eflags |= (uint32_t)(eflags & ~mask) | 0x02; _regs.eip = eip; if ( (rc = load_seg(x86_seg_cs, (uint16_t)cs, ctxt, ops)) != 0 ) goto done; break; } case 0xd0 ... 0xd1: /* Grp2 */ src.val = 1; goto grp2; case 0xd2 ... 0xd3: /* Grp2 */ src.val = _regs.ecx; goto grp2; case 0xd4: /* aam */ { unsigned int base = insn_fetch_type(uint8_t); uint8_t al = _regs.eax; generate_exception_if(mode_64bit(), EXC_UD, -1); generate_exception_if(base == 0, EXC_DE, -1); *(uint16_t *)&_regs.eax = ((al / base) << 8) | (al % base); _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF); _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0; _regs.eflags |= (( int8_t)_regs.eax < 0) ? EFLG_SF : 0; _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0; break; } case 0xd5: /* aad */ { unsigned int base = insn_fetch_type(uint8_t); uint16_t ax = _regs.eax; generate_exception_if(mode_64bit(), EXC_UD, -1); *(uint16_t *)&_regs.eax = (uint8_t)(ax + ((ax >> 8) * base)); _regs.eflags &= ~(EFLG_SF|EFLG_ZF|EFLG_PF); _regs.eflags |= ((uint8_t)_regs.eax == 0) ? EFLG_ZF : 0; _regs.eflags |= (( int8_t)_regs.eax < 0) ? EFLG_SF : 0; _regs.eflags |= even_parity(_regs.eax) ? EFLG_PF : 0; break; } case 0xd6: /* salc */ generate_exception_if(mode_64bit(), EXC_UD, -1); *(uint8_t *)&_regs.eax = (_regs.eflags & EFLG_CF) ? 0xff : 0x00; break; case 0xd7: /* xlat */ { unsigned long al = (uint8_t)_regs.eax; if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.ebx + al), &al, 1, ctxt, ops)) != 0 ) goto done; *(uint8_t *)&_regs.eax = al; break; } case 0xd8: /* FPU 0xd8 */ switch ( modrm ) { case 0xc0 ... 0xc7: /* fadd %stN,%stN */ case 0xc8 ... 0xcf: /* fmul %stN,%stN */ case 0xd0 ... 0xd7: /* fcom %stN,%stN */ case 0xd8 ... 
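/* FPU opcode 0xd8: register-register forms (ModRM >= 0xc0) are executed via emulate_fpu_insn_stub(); memory forms read a 4-byte m32fp operand through ops->read() and dispatch on the ModRM reg field. */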
0xdf: /* fcomp %stN,%stN */ case 0xe0 ... 0xe7: /* fsub %stN,%stN */ case 0xe8 ... 0xef: /* fsubr %stN,%stN */ case 0xf0 ... 0xf7: /* fdiv %stN,%stN */ case 0xf8 ... 0xff: /* fdivr %stN,%stN */ emulate_fpu_insn_stub(0xd8, modrm); break; default: fail_if(modrm >= 0xc0); ea.bytes = 4; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; switch ( modrm_reg & 7 ) { case 0: /* fadd */ emulate_fpu_insn_memsrc("fadds", src.val); break; case 1: /* fmul */ emulate_fpu_insn_memsrc("fmuls", src.val); break; case 2: /* fcom */ emulate_fpu_insn_memsrc("fcoms", src.val); break; case 3: /* fcomp */ emulate_fpu_insn_memsrc("fcomps", src.val); break; case 4: /* fsub */ emulate_fpu_insn_memsrc("fsubs", src.val); break; case 5: /* fsubr */ emulate_fpu_insn_memsrc("fsubrs", src.val); break; case 6: /* fdiv */ emulate_fpu_insn_memsrc("fdivs", src.val); break; case 7: /* fdivr */ emulate_fpu_insn_memsrc("fdivrs", src.val); break; default: goto cannot_emulate; } } break; case 0xd9: /* FPU 0xd9 */ switch ( modrm ) { case 0xfb: /* fsincos */ fail_if(cpu_has_amd_erratum(573)); /* fall through */ case 0xc0 ... 0xc7: /* fld %stN */ case 0xc8 ... 0xcf: /* fxch %stN */ case 0xd0: /* fnop */ case 0xe0: /* fchs */ case 0xe1: /* fabs */ case 0xe4: /* ftst */ case 0xe5: /* fxam */ case 0xe8: /* fld1 */ case 0xe9: /* fldl2t */ case 0xea: /* fldl2e */ case 0xeb: /* fldpi */ case 0xec: /* fldlg2 */ case 0xed: /* fldln2 */ case 0xee: /* fldz */ case 0xf0: /* f2xm1 */ case 0xf1: /* fyl2x */ case 0xf2: /* fptan */ case 0xf3: /* fpatan */ case 0xf4: /* fxtract */ case 0xf5: /* fprem1 */ case 0xf6: /* fdecstp */ case 0xf7: /* fincstp */ case 0xf8: /* fprem */ case 0xf9: /* fyl2xp1 */ case 0xfa: /* fsqrt */ case 0xfc: /* frndint */ case 0xfd: /* fscale */ case 0xfe: /* fsin */ case 0xff: /* fcos */ emulate_fpu_insn_stub(0xd9, modrm); break; default: fail_if(modrm >= 0xc0); switch ( modrm_reg & 7 ) { case 0: /* fld m32fp */ ea.bytes = 4; src = ea; if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; emulate_fpu_insn_memsrc("flds", src.val); break; case 2: /* fstp m32fp */ ea.bytes = 4; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fsts", dst.val); break; case 3: /* fstp m32fp */ ea.bytes = 4; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fstps", dst.val); break; /* case 4: fldenv - TODO */ case 5: /* fldcw m2byte */ ea.bytes = 2; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; emulate_fpu_insn_memsrc("fldcw", src.val); break; /* case 6: fstenv - TODO */ case 7: /* fnstcw m2byte */ ea.bytes = 2; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fnstcw", dst.val); break; default: goto cannot_emulate; } } break; case 0xda: /* FPU 0xda */ switch ( modrm ) { case 0xc0 ... 0xc7: /* fcmovb %stN */ case 0xc8 ... 0xcf: /* fcmove %stN */ case 0xd0 ... 0xd7: /* fcmovbe %stN */ case 0xd8 ... 
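/* FPU opcode 0xda: the fcmov and fucompp register forms go through the stub; memory forms are 32-bit integer arithmetic (fiadd, fimul, ficom, ficomp, fisub, fisubr, fidiv, fidivr) on a 4-byte operand. */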
0xdf: /* fcmovu %stN */ case 0xe9: /* fucompp */ emulate_fpu_insn_stub(0xda, modrm); break; default: fail_if(modrm >= 0xc0); ea.bytes = 4; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; switch ( modrm_reg & 7 ) { case 0: /* fiadd m32i */ emulate_fpu_insn_memsrc("fiaddl", src.val); break; case 1: /* fimul m32i */ emulate_fpu_insn_memsrc("fimull", src.val); break; case 2: /* ficom m32i */ emulate_fpu_insn_memsrc("ficoml", src.val); break; case 3: /* ficomp m32i */ emulate_fpu_insn_memsrc("ficompl", src.val); break; case 4: /* fisub m32i */ emulate_fpu_insn_memsrc("fisubl", src.val); break; case 5: /* fisubr m32i */ emulate_fpu_insn_memsrc("fisubrl", src.val); break; case 6: /* fidiv m32i */ emulate_fpu_insn_memsrc("fidivl", src.val); break; case 7: /* fidivr m32i */ emulate_fpu_insn_memsrc("fidivrl", src.val); break; default: goto cannot_emulate; } } break; case 0xdb: /* FPU 0xdb */ switch ( modrm ) { case 0xc0 ... 0xc7: /* fcmovnb %stN */ case 0xc8 ... 0xcf: /* fcmovne %stN */ case 0xd0 ... 0xd7: /* fcmovnbe %stN */ case 0xd8 ... 0xdf: /* fcmovnu %stN */ emulate_fpu_insn_stub(0xdb, modrm); break; case 0xe2: /* fnclex */ emulate_fpu_insn("fnclex"); break; case 0xe3: /* fninit */ emulate_fpu_insn("fninit"); break; case 0xe4: /* fsetpm - 287 only, ignored by 387 */ break; case 0xe8 ... 0xef: /* fucomi %stN */ case 0xf0 ... 0xf7: /* fcomi %stN */ emulate_fpu_insn_stub(0xdb, modrm); break; default: fail_if(modrm >= 0xc0); switch ( modrm_reg & 7 ) { case 0: /* fild m32i */ ea.bytes = 4; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; emulate_fpu_insn_memsrc("fildl", src.val); break; case 1: /* fisttp m32i */ vcpu_must_have_sse3(); ea.bytes = 4; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fisttpl", dst.val); break; case 2: /* fist m32i */ ea.bytes = 4; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fistl", dst.val); break; case 3: /* fistp m32i */ ea.bytes = 4; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fistpl", dst.val); break; case 5: /* fld m80fp */ ea.bytes = 10; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; emulate_fpu_insn_memdst("fldt", src.val); break; case 7: /* fstp m80fp */ ea.bytes = 10; dst.type = OP_MEM; dst = ea; emulate_fpu_insn_memdst("fstpt", dst.val); break; default: goto cannot_emulate; } } break; case 0xdc: /* FPU 0xdc */ switch ( modrm ) { case 0xc0 ... 0xc7: /* fadd %stN */ case 0xc8 ... 0xcf: /* fmul %stN */ case 0xe0 ... 0xe7: /* fsubr %stN */ case 0xe8 ... 0xef: /* fsub %stN */ case 0xf0 ... 0xf7: /* fdivr %stN */ case 0xf8 ... 
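/* FPU opcode 0xdc: register forms go through the stub; memory forms perform the same arithmetic on an 8-byte m64fp operand read via ops->read(). */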
0xff: /* fdiv %stN */ emulate_fpu_insn_stub(0xdc, modrm); break; default: fail_if(modrm >= 0xc0); ea.bytes = 8; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; switch ( modrm_reg & 7 ) { case 0: /* fadd m64fp */ emulate_fpu_insn_memsrc("faddl", src.val); break; case 1: /* fmul m64fp */ emulate_fpu_insn_memsrc("fmull", src.val); break; case 2: /* fcom m64fp */ emulate_fpu_insn_memsrc("fcoml", src.val); break; case 3: /* fcomp m64fp */ emulate_fpu_insn_memsrc("fcompl", src.val); break; case 4: /* fsub m64fp */ emulate_fpu_insn_memsrc("fsubl", src.val); break; case 5: /* fsubr m64fp */ emulate_fpu_insn_memsrc("fsubrl", src.val); break; case 6: /* fdiv m64fp */ emulate_fpu_insn_memsrc("fdivl", src.val); break; case 7: /* fdivr m64fp */ emulate_fpu_insn_memsrc("fdivrl", src.val); break; } } break; case 0xdd: /* FPU 0xdd */ switch ( modrm ) { case 0xc0 ... 0xc7: /* ffree %stN */ case 0xd0 ... 0xd7: /* fst %stN */ case 0xd8 ... 0xdf: /* fstp %stN */ case 0xe0 ... 0xe7: /* fucom %stN */ case 0xe8 ... 0xef: /* fucomp %stN */ emulate_fpu_insn_stub(0xdd, modrm); break; default: fail_if(modrm >= 0xc0); switch ( modrm_reg & 7 ) { case 0: /* fld m64fp */; ea.bytes = 8; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; emulate_fpu_insn_memsrc("fldl", src.val); break; case 1: /* fisttp m64i */ vcpu_must_have_sse3(); ea.bytes = 8; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fisttpll", dst.val); break; case 2: /* fst m64fp */ ea.bytes = 8; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memsrc("fstl", dst.val); break; case 3: /* fstp m64fp */ ea.bytes = 8; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fstpl", dst.val); break; case 7: /* fnstsw m2byte */ ea.bytes = 2; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fnstsw", dst.val); break; default: goto cannot_emulate; } } break; case 0xde: /* FPU 0xde */ switch ( modrm ) { case 0xc0 ... 0xc7: /* faddp %stN */ case 0xc8 ... 0xcf: /* fmulp %stN */ case 0xd9: /* fcompp */ case 0xe0 ... 0xe7: /* fsubrp %stN */ case 0xe8 ... 0xef: /* fsubp %stN */ case 0xf0 ... 0xf7: /* fdivrp %stN */ case 0xf8 ... 0xff: /* fdivp %stN */ emulate_fpu_insn_stub(0xde, modrm); break; default: fail_if(modrm >= 0xc0); ea.bytes = 2; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; switch ( modrm_reg & 7 ) { case 0: /* fiadd m16i */ emulate_fpu_insn_memsrc("fiadds", src.val); break; case 1: /* fimul m16i */ emulate_fpu_insn_memsrc("fimuls", src.val); break; case 2: /* ficom m16i */ emulate_fpu_insn_memsrc("ficoms", src.val); break; case 3: /* ficomp m16i */ emulate_fpu_insn_memsrc("ficomps", src.val); break; case 4: /* fisub m16i */ emulate_fpu_insn_memsrc("fisubs", src.val); break; case 5: /* fisubr m16i */ emulate_fpu_insn_memsrc("fisubrs", src.val); break; case 6: /* fidiv m16i */ emulate_fpu_insn_memsrc("fidivs", src.val); break; case 7: /* fidivr m16i */ emulate_fpu_insn_memsrc("fidivrs", src.val); break; default: goto cannot_emulate; } } break; case 0xdf: /* FPU 0xdf */ switch ( modrm ) { case 0xe0: /* fnstsw %ax */ dst.bytes = 2; dst.type = OP_REG; dst.reg = (unsigned long *)&_regs.eax; emulate_fpu_insn_memdst("fnstsw", dst.val); break; case 0xe8 ... 0xef: /* fucomip %stN */ case 0xf0 ... 
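/* FPU opcode 0xdf: fnstsw %ax is special-cased above as a 2-byte register destination; fucomip/fcomip use the stub; the memory forms below cover 16-bit and 64-bit integers plus 80-bit packed BCD (fbld/fbstp). */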
0xf7: /* fcomip %stN */ emulate_fpu_insn_stub(0xdf, modrm); break; default: fail_if(modrm >= 0xc0); switch ( modrm_reg & 7 ) { case 0: /* fild m16i */ ea.bytes = 2; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; emulate_fpu_insn_memsrc("filds", src.val); break; case 1: /* fisttp m16i */ vcpu_must_have_sse3(); ea.bytes = 2; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fisttps", dst.val); break; case 2: /* fist m16i */ ea.bytes = 2; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fists", dst.val); break; case 3: /* fistp m16i */ ea.bytes = 2; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fistps", dst.val); break; case 4: /* fbld m80dec */ ea.bytes = 10; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; emulate_fpu_insn_memsrc("fbld", src.val); break; case 5: /* fild m64i */ ea.bytes = 8; src = ea; if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, src.bytes, ctxt)) != 0 ) goto done; emulate_fpu_insn_memsrc("fildll", src.val); break; case 6: /* fbstp packed bcd */ ea.bytes = 10; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fbstp", dst.val); break; case 7: /* fistp m64i */ ea.bytes = 8; dst = ea; dst.type = OP_MEM; emulate_fpu_insn_memdst("fistpll", dst.val); break; default: goto cannot_emulate; } } break; case 0xe0 ... 0xe2: /* loop{,z,nz} */ { int rel = insn_fetch_type(int8_t); int do_jmp = !(_regs.eflags & EFLG_ZF); /* loopnz */ if ( b == 0xe1 ) do_jmp = !do_jmp; /* loopz */ else if ( b == 0xe2 ) do_jmp = 1; /* loop */ switch ( ad_bytes ) { case 2: do_jmp &= --(*(uint16_t *)&_regs.ecx) != 0; break; case 4: do_jmp &= --(*(uint32_t *)&_regs.ecx) != 0; _regs.ecx = (uint32_t)_regs.ecx; /* zero extend in x86/64 mode */ break; default: /* case 8: */ do_jmp &= --_regs.ecx != 0; break; } if ( do_jmp ) jmp_rel(rel); break; } case 0xe3: /* jcxz/jecxz (short) */ { int rel = insn_fetch_type(int8_t); if ( (ad_bytes == 2) ? !(uint16_t)_regs.ecx : (ad_bytes == 4) ? !(uint32_t)_regs.ecx : !_regs.ecx ) jmp_rel(rel); break; } case 0xe4: /* in imm8,%al */ case 0xe5: /* in imm8,%eax */ case 0xe6: /* out %al,imm8 */ case 0xe7: /* out %eax,imm8 */ case 0xec: /* in %dx,%al */ case 0xed: /* in %dx,%eax */ case 0xee: /* out %al,%dx */ case 0xef: /* out %eax,%dx */ { unsigned int port = ((b < 0xe8) ? insn_fetch_type(uint8_t) : (uint16_t)_regs.edx); op_bytes = !(b & 1) ? 1 : (op_bytes == 8) ? 4 : op_bytes; if ( (rc = ioport_access_check(port, op_bytes, ctxt, ops)) != 0 ) goto done; if ( b & 2 ) { /* out */ fail_if(ops->write_io == NULL); rc = ops->write_io(port, op_bytes, _regs.eax, ctxt); } else { /* in */ dst.type = OP_REG; dst.bytes = op_bytes; dst.reg = (unsigned long *)&_regs.eax; fail_if(ops->read_io == NULL); rc = ops->read_io(port, dst.bytes, &dst.val, ctxt); } if ( rc != 0 ) goto done; break; } case 0xe8: /* call (near) */ { int rel = ((op_bytes == 2) ? (int32_t)insn_fetch_type(int16_t) : insn_fetch_type(int32_t)); op_bytes = ((op_bytes == 4) && mode_64bit()) ? 8 : op_bytes; src.val = _regs.eip; jmp_rel(rel); goto push; } case 0xe9: /* jmp (near) */ { int rel = ((op_bytes == 2) ? 
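/* rel16, sign-extended, when the operand size is 16 bits; rel32 otherwise */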
(int32_t)insn_fetch_type(int16_t) : insn_fetch_type(int32_t)); jmp_rel(rel); break; } case 0xea: /* jmp (far, absolute) */ { uint16_t sel; uint32_t eip; generate_exception_if(mode_64bit(), EXC_UD, -1); eip = insn_fetch_bytes(op_bytes); sel = insn_fetch_type(uint16_t); if ( (rc = load_seg(x86_seg_cs, sel, ctxt, ops)) != 0 ) goto done; _regs.eip = eip; break; } case 0xeb: /* jmp (short) */ { int rel = insn_fetch_type(int8_t); jmp_rel(rel); break; } case 0xf1: /* int1 (icebp) */ src.val = EXC_DB; goto swint; case 0xf4: /* hlt */ ctxt->retire.flags.hlt = 1; break; case 0xf5: /* cmc */ _regs.eflags ^= EFLG_CF; break; case 0xf6 ... 0xf7: /* Grp3 */ switch ( modrm_reg & 7 ) { case 0 ... 1: /* test */ goto test; case 2: /* not */ dst.val = ~dst.val; break; case 3: /* neg */ emulate_1op("neg", dst, _regs.eflags); break; case 4: /* mul */ dst.type = OP_REG; dst.reg = (unsigned long *)&_regs.eax; dst.val = *dst.reg; _regs.eflags &= ~(EFLG_OF|EFLG_CF); switch ( dst.bytes = src.bytes ) { case 1: dst.val = (uint8_t)dst.val; dst.val *= src.val; if ( (uint8_t)dst.val != (uint16_t)dst.val ) _regs.eflags |= EFLG_OF|EFLG_CF; dst.bytes = 2; break; case 2: dst.val = (uint16_t)dst.val; dst.val *= src.val; if ( (uint16_t)dst.val != (uint32_t)dst.val ) _regs.eflags |= EFLG_OF|EFLG_CF; *(uint16_t *)&_regs.edx = dst.val >> 16; break; #ifdef __x86_64__ case 4: dst.val = (uint32_t)dst.val; dst.val *= src.val; if ( (uint32_t)dst.val != dst.val ) _regs.eflags |= EFLG_OF|EFLG_CF; _regs.edx = (uint32_t)(dst.val >> 32); break; #endif default: { unsigned long m[2] = { src.val, dst.val }; if ( mul_dbl(m) ) _regs.eflags |= EFLG_OF|EFLG_CF; _regs.edx = m[1]; dst.val = m[0]; break; } } break; case 5: /* imul */ dst.type = OP_REG; dst.reg = (unsigned long *)&_regs.eax; dst.val = *dst.reg; dst.bytes = src.bytes; imul: _regs.eflags &= ~(EFLG_OF|EFLG_CF); switch ( dst.bytes ) { case 1: dst.val = (int8_t)src.val * (int8_t)dst.val; if ( (int8_t)dst.val != (int16_t)dst.val ) _regs.eflags |= EFLG_OF|EFLG_CF; ASSERT(b > 0x6b); dst.bytes = 2; break; case 2: dst.val = ((uint32_t)(int16_t)src.val * (uint32_t)(int16_t)dst.val); if ( (int16_t)dst.val != (int32_t)dst.val ) _regs.eflags |= EFLG_OF|EFLG_CF; if ( b > 0x6b ) *(uint16_t *)&_regs.edx = dst.val >> 16; break; #ifdef __x86_64__ case 4: dst.val = ((uint64_t)(int32_t)src.val * (uint64_t)(int32_t)dst.val); if ( (int32_t)dst.val != dst.val ) _regs.eflags |= EFLG_OF|EFLG_CF; if ( b > 0x6b ) _regs.edx = (uint32_t)(dst.val >> 32); break; #endif default: { unsigned long m[2] = { src.val, dst.val }; if ( imul_dbl(m) ) _regs.eflags |= EFLG_OF|EFLG_CF; if ( b > 0x6b ) _regs.edx = m[1]; dst.val = m[0]; break; } } break; case 6: /* div */ { unsigned long u[2], v; dst.type = OP_REG; dst.reg = (unsigned long *)&_regs.eax; switch ( dst.bytes = src.bytes ) { case 1: u[0] = (uint16_t)_regs.eax; u[1] = 0; v = (uint8_t)src.val; generate_exception_if( div_dbl(u, v) || ((uint8_t)u[0] != (uint16_t)u[0]), EXC_DE, -1); dst.val = (uint8_t)u[0]; ((uint8_t *)&_regs.eax)[1] = u[1]; break; case 2: u[0] = ((uint32_t)_regs.edx << 16) | (uint16_t)_regs.eax; u[1] = 0; v = (uint16_t)src.val; generate_exception_if( div_dbl(u, v) || ((uint16_t)u[0] != (uint32_t)u[0]), EXC_DE, -1); dst.val = (uint16_t)u[0]; *(uint16_t *)&_regs.edx = u[1]; break; #ifdef __x86_64__ case 4: u[0] = (_regs.edx << 32) | (uint32_t)_regs.eax; u[1] = 0; v = (uint32_t)src.val; generate_exception_if( div_dbl(u, v) || ((uint32_t)u[0] != u[0]), EXC_DE, -1); dst.val = (uint32_t)u[0]; _regs.edx = (uint32_t)u[1]; break; #endif default: u[0] = _regs.eax; 
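/* Full-width DIV: u[] holds the rDX:rAX dividend, low half first; div_dbl() leaves the quotient in u[0] and the remainder in u[1], and a non-zero return raises #DE via generate_exception_if(). */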
u[1] = _regs.edx; v = src.val; generate_exception_if(div_dbl(u, v), EXC_DE, -1); dst.val = u[0]; _regs.edx = u[1]; break; } break; } case 7: /* idiv */ { unsigned long u[2], v; dst.type = OP_REG; dst.reg = (unsigned long *)&_regs.eax; switch ( dst.bytes = src.bytes ) { case 1: u[0] = (int16_t)_regs.eax; u[1] = ((long)u[0] < 0) ? ~0UL : 0UL; v = (int8_t)src.val; generate_exception_if( idiv_dbl(u, v) || ((int8_t)u[0] != (int16_t)u[0]), EXC_DE, -1); dst.val = (int8_t)u[0]; ((int8_t *)&_regs.eax)[1] = u[1]; break; case 2: u[0] = (int32_t)((_regs.edx << 16) | (uint16_t)_regs.eax); u[1] = ((long)u[0] < 0) ? ~0UL : 0UL; v = (int16_t)src.val; generate_exception_if( idiv_dbl(u, v) || ((int16_t)u[0] != (int32_t)u[0]), EXC_DE, -1); dst.val = (int16_t)u[0]; *(int16_t *)&_regs.edx = u[1]; break; #ifdef __x86_64__ case 4: u[0] = (_regs.edx << 32) | (uint32_t)_regs.eax; u[1] = ((long)u[0] < 0) ? ~0UL : 0UL; v = (int32_t)src.val; generate_exception_if( idiv_dbl(u, v) || ((int32_t)u[0] != u[0]), EXC_DE, -1); dst.val = (int32_t)u[0]; _regs.edx = (uint32_t)u[1]; break; #endif default: u[0] = _regs.eax; u[1] = _regs.edx; v = src.val; generate_exception_if(idiv_dbl(u, v), EXC_DE, -1); dst.val = u[0]; _regs.edx = u[1]; break; } break; } default: goto cannot_emulate; } break; case 0xf8: /* clc */ _regs.eflags &= ~EFLG_CF; break; case 0xf9: /* stc */ _regs.eflags |= EFLG_CF; break; case 0xfa: /* cli */ generate_exception_if(!mode_iopl(), EXC_GP, 0); _regs.eflags &= ~EFLG_IF; break; case 0xfb: /* sti */ generate_exception_if(!mode_iopl(), EXC_GP, 0); if ( !(_regs.eflags & EFLG_IF) ) { _regs.eflags |= EFLG_IF; ctxt->retire.flags.sti = 1; } break; case 0xfc: /* cld */ _regs.eflags &= ~EFLG_DF; break; case 0xfd: /* std */ _regs.eflags |= EFLG_DF; break; case 0xfe: /* Grp4 */ generate_exception_if((modrm_reg & 7) >= 2, EXC_UD, -1); case 0xff: /* Grp5 */ switch ( modrm_reg & 7 ) { case 0: /* inc */ emulate_1op("inc", dst, _regs.eflags); break; case 1: /* dec */ emulate_1op("dec", dst, _regs.eflags); break; case 2: /* call (near) */ dst.val = _regs.eip; _regs.eip = src.val; src.val = dst.val; goto push; case 4: /* jmp (near) */ _regs.eip = src.val; dst.type = OP_NONE; break; case 3: /* call (far, absolute indirect) */ case 5: /* jmp (far, absolute indirect) */ { unsigned long sel; generate_exception_if(src.type != OP_MEM, EXC_UD, -1); if ( (rc = read_ulong(src.mem.seg, src.mem.off + op_bytes, &sel, 2, ctxt, ops)) ) goto done; if ( (modrm_reg & 7) == 3 ) /* call */ { struct segment_register reg; fail_if(ops->read_segment == NULL); if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) || (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), &reg.sel, op_bytes, ctxt)) || (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), &_regs.eip, op_bytes, ctxt)) ) goto done; } if ( (rc = load_seg(x86_seg_cs, sel, ctxt, ops)) != 0 ) goto done; _regs.eip = src.val; dst.type = OP_NONE; break; } case 6: /* push */ goto push; case 7: generate_exception_if(1, EXC_UD, -1); default: goto cannot_emulate; } break; } writeback: switch ( dst.type ) { case OP_REG: /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. 
*/ switch ( dst.bytes ) { case 1: *(uint8_t *)dst.reg = (uint8_t)dst.val; break; case 2: *(uint16_t *)dst.reg = (uint16_t)dst.val; break; case 4: *dst.reg = (uint32_t)dst.val; break; /* 64b: zero-ext */ case 8: *dst.reg = dst.val; break; } break; case OP_MEM: if ( !(d & Mov) && (dst.orig_val == dst.val) && !ctxt->force_writeback ) /* nothing to do */; else if ( lock_prefix ) rc = ops->cmpxchg( dst.mem.seg, dst.mem.off, &dst.orig_val, &dst.val, dst.bytes, ctxt); else rc = ops->write( dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt); if ( rc != 0 ) goto done; default: break; } /* Inject #DB if single-step tracing was enabled at instruction start. */ if ( (ctxt->regs->eflags & EFLG_TF) && (rc == X86EMUL_OKAY) && (ops->inject_hw_exception != NULL) ) rc = ops->inject_hw_exception(EXC_DB, -1, ctxt) ? : X86EMUL_EXCEPTION; /* Commit shadow register state. */ _regs.eflags &= ~EFLG_RF; *ctxt->regs = _regs; done: return rc; twobyte_insn: switch ( b ) { case 0x00: /* Grp6 */ fail_if((modrm_reg & 6) != 2); generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1); generate_exception_if(!mode_ring0(), EXC_GP, 0); if ( (rc = load_seg((modrm_reg & 1) ? x86_seg_tr : x86_seg_ldtr, src.val, ctxt, ops)) != 0 ) goto done; break; case 0x01: /* Grp7 */ { struct segment_register reg; unsigned long base, limit, cr0, cr0w; if ( modrm == 0xdf ) /* invlpga */ { generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1); generate_exception_if(!mode_ring0(), EXC_GP, 0); fail_if(ops->invlpg == NULL); if ( (rc = ops->invlpg(x86_seg_none, truncate_ea(_regs.eax), ctxt)) ) goto done; break; } if ( modrm == 0xf9 ) /* rdtscp */ { uint64_t tsc_aux; fail_if(ops->read_msr == NULL); if ( (rc = ops->read_msr(MSR_TSC_AUX, &tsc_aux, ctxt)) != 0 ) goto done; _regs.ecx = (uint32_t)tsc_aux; goto rdtsc; } switch ( modrm_reg & 7 ) { case 0: /* sgdt */ case 1: /* sidt */ generate_exception_if(ea.type != OP_MEM, EXC_UD, -1); fail_if(ops->read_segment == NULL); if ( (rc = ops->read_segment((modrm_reg & 1) ? x86_seg_idtr : x86_seg_gdtr, &reg, ctxt)) ) goto done; if ( op_bytes == 2 ) reg.base &= 0xffffff; if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, &reg.limit, 2, ctxt)) || (rc = ops->write(ea.mem.seg, ea.mem.off+2, &reg.base, mode_64bit() ? 8 : 4, ctxt)) ) goto done; break; case 2: /* lgdt */ case 3: /* lidt */ generate_exception_if(ea.type != OP_MEM, EXC_UD, -1); fail_if(ops->write_segment == NULL); memset(&reg, 0, sizeof(reg)); if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0, &limit, 2, ctxt, ops)) || (rc = read_ulong(ea.mem.seg, ea.mem.off+2, &base, mode_64bit() ? 8 : 4, ctxt, ops)) ) goto done; reg.base = base; reg.limit = limit; if ( op_bytes == 2 ) reg.base &= 0xffffff; if ( (rc = ops->write_segment((modrm_reg & 1) ? x86_seg_idtr : x86_seg_gdtr, &reg, ctxt)) ) goto done; break; case 4: /* smsw */ ea.bytes = (ea.type == OP_MEM) ? 2 : op_bytes; dst = ea; fail_if(ops->read_cr == NULL); if ( (rc = ops->read_cr(0, &dst.val, ctxt)) ) goto done; d |= Mov; /* force writeback */ break; case 6: /* lmsw */ fail_if(ops->read_cr == NULL); fail_if(ops->write_cr == NULL); if ( (rc = ops->read_cr(0, &cr0, ctxt)) ) goto done; if ( ea.type == OP_REG ) cr0w = *ea.reg; else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, &cr0w, 2, ctxt, ops)) ) goto done; /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. 
*/ cr0 = (cr0 & ~0xe) | (cr0w & 0xf); if ( (rc = ops->write_cr(0, cr0, ctxt)) ) goto done; break; case 7: /* invlpg */ generate_exception_if(!mode_ring0(), EXC_GP, 0); generate_exception_if(ea.type != OP_MEM, EXC_UD, -1); fail_if(ops->invlpg == NULL); if ( (rc = ops->invlpg(ea.mem.seg, ea.mem.off, ctxt)) ) goto done; break; default: goto cannot_emulate; } break; } case 0x05: /* syscall */ { uint64_t msr_content; struct segment_register cs = { 0 }, ss = { 0 }; int rc; generate_exception_if(in_realmode(ctxt, ops), EXC_UD, -1); generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, -1); /* Inject #UD if syscall/sysret are disabled. */ fail_if(ops->read_msr == NULL); if ( (rc = ops->read_msr(MSR_EFER, &msr_content, ctxt)) != 0 ) goto done; generate_exception_if((msr_content & EFER_SCE) == 0, EXC_UD, -1); if ( (rc = ops->read_msr(MSR_STAR, &msr_content, ctxt)) != 0 ) goto done; msr_content >>= 32; cs.sel = (uint16_t)(msr_content & 0xfffc); ss.sel = (uint16_t)(msr_content + 8); cs.base = ss.base = 0; /* flat segment */ cs.limit = ss.limit = ~0u; /* 4GB limit */ cs.attr.bytes = 0xc9b; /* G+DB+P+S+Code */ ss.attr.bytes = 0xc93; /* G+DB+P+S+Data */ #ifdef __x86_64__ rc = in_longmode(ctxt, ops); if ( rc < 0 ) goto cannot_emulate; if ( rc ) { cs.attr.fields.db = 0; cs.attr.fields.l = 1; _regs.rcx = _regs.rip; _regs.r11 = _regs.eflags & ~EFLG_RF; if ( (rc = ops->read_msr(mode_64bit() ? MSR_LSTAR : MSR_CSTAR, &msr_content, ctxt)) != 0 ) goto done; _regs.rip = msr_content; if ( (rc = ops->read_msr(MSR_FMASK, &msr_content, ctxt)) != 0 ) goto done; _regs.eflags &= ~(msr_content | EFLG_RF); } else #endif { if ( (rc = ops->read_msr(MSR_STAR, &msr_content, ctxt)) != 0 ) goto done; _regs.ecx = _regs.eip; _regs.eip = (uint32_t)msr_content; _regs.eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } fail_if(ops->write_segment == NULL); if ( (rc = ops->write_segment(x86_seg_cs, &cs, ctxt)) || (rc = ops->write_segment(x86_seg_ss, &ss, ctxt)) ) goto done; break; } case 0x06: /* clts */ generate_exception_if(!mode_ring0(), EXC_GP, 0); fail_if((ops->read_cr == NULL) || (ops->write_cr == NULL)); if ( (rc = ops->read_cr(0, &dst.val, ctxt)) || (rc = ops->write_cr(0, dst.val&~8, ctxt)) ) goto done; break; case 0x08: /* invd */ case 0x09: /* wbinvd */ generate_exception_if(!mode_ring0(), EXC_GP, 0); fail_if(ops->wbinvd == NULL); if ( (rc = ops->wbinvd(ctxt)) != 0 ) goto done; break; case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ case 0x19 ... 
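/* Opcodes 0x0d and 0x18-0x1f are prefetch/hint encodings with no architectural side effects, so they are emulated as NOPs. */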
0x1f: /* nop (amd-defined) */ break; case 0x2b: /* {,v}movntp{s,d} xmm,m128 */ /* vmovntp{s,d} ymm,m256 */ fail_if(ea.type != OP_MEM); /* fall through */ case 0x28: /* {,v}movap{s,d} xmm/m128,xmm */ /* vmovap{s,d} ymm/m256,ymm */ case 0x29: /* {,v}movap{s,d} xmm,xmm/m128 */ /* vmovap{s,d} ymm,ymm/m256 */ fail_if(vex.pfx & VEX_PREFIX_SCALAR_MASK); /* fall through */ case 0x10: /* {,v}movup{s,d} xmm/m128,xmm */ /* vmovup{s,d} ymm/m256,ymm */ /* {,v}movss xmm/m32,xmm */ /* {,v}movsd xmm/m64,xmm */ case 0x11: /* {,v}movup{s,d} xmm,xmm/m128 */ /* vmovup{s,d} ymm,ymm/m256 */ /* {,v}movss xmm,xmm/m32 */ /* {,v}movsd xmm,xmm/m64 */ { uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 }; struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 }; if ( vex.opcx == vex_none ) { if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK ) vcpu_must_have_sse2(); else vcpu_must_have_sse(); ea.bytes = 16; SET_SSE_PREFIX(stub[0], vex.pfx); get_fpu(X86EMUL_FPU_xmm, &fic); } else { fail_if((vex.opcx != vex_0f) || ((vex.reg != 0xf) && ((ea.type == OP_MEM) || !(vex.pfx & VEX_PREFIX_SCALAR_MASK)))); vcpu_must_have_avx(); get_fpu(X86EMUL_FPU_ymm, &fic); ea.bytes = 16 << vex.l; } if ( vex.pfx & VEX_PREFIX_SCALAR_MASK ) ea.bytes = vex.pfx & VEX_PREFIX_DOUBLE_MASK ? 8 : 4; if ( ea.type == OP_MEM ) { /* XXX enable once there is ops->ea() or equivalent generate_exception_if((b >= 0x28) && (ops->ea(ea.mem.seg, ea.mem.off) & (ea.bytes - 1)), EXC_GP, 0); */ if ( !(b & 1) ) rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp, ea.bytes, ctxt); /* convert memory operand to (%rAX) */ rex_prefix &= ~REX_B; vex.b = 1; stub[4] &= 0x38; } if ( !rc ) { copy_REX_VEX(stub, rex_prefix, vex); asm volatile ( "call *%0" : : "r" (stub), "a" (mmvalp) : "memory" ); } put_fpu(&fic); if ( !rc && (b & 1) && (ea.type == OP_MEM) ) rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, ea.bytes, ctxt); goto done; } case 0x20: /* mov cr,reg */ case 0x21: /* mov dr,reg */ case 0x22: /* mov reg,cr */ case 0x23: /* mov reg,dr */ generate_exception_if(ea.type != OP_REG, EXC_UD, -1); generate_exception_if(!mode_ring0(), EXC_GP, 0); modrm_reg |= lock_prefix << 3; if ( b & 2 ) { /* Write to CR/DR. */ src.val = *(unsigned long *)decode_register(modrm_rm, &_regs, 0); if ( !mode_64bit() ) src.val = (uint32_t)src.val; rc = ((b & 1) ? (ops->write_dr ? ops->write_dr(modrm_reg, src.val, ctxt) : X86EMUL_UNHANDLEABLE) : (ops->write_cr ? ops->write_cr(modrm_reg, src.val, ctxt) : X86EMUL_UNHANDLEABLE)); } else { /* Read from CR/DR. */ dst.type = OP_REG; dst.bytes = mode_64bit() ? 8 : 4; dst.reg = decode_register(modrm_rm, &_regs, 0); rc = ((b & 1) ? (ops->read_dr ? ops->read_dr(modrm_reg, &dst.val, ctxt) : X86EMUL_UNHANDLEABLE) : (ops->read_cr ? 
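/* reads dispatch to the read_cr/read_dr hooks; a missing hook makes the access unhandleable */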
ops->read_cr(modrm_reg, &dst.val, ctxt) : X86EMUL_UNHANDLEABLE)); } if ( rc != 0 ) goto done; break; case 0x30: /* wrmsr */ { uint64_t val = ((uint64_t)_regs.edx << 32) | (uint32_t)_regs.eax; generate_exception_if(!mode_ring0(), EXC_GP, 0); fail_if(ops->write_msr == NULL); if ( (rc = ops->write_msr((uint32_t)_regs.ecx, val, ctxt)) != 0 ) goto done; break; } case 0x31: rdtsc: /* rdtsc */ { unsigned long cr4; uint64_t val; if ( !mode_ring0() ) { fail_if(ops->read_cr == NULL); if ( (rc = ops->read_cr(4, &cr4, ctxt)) ) goto done; generate_exception_if(cr4 & CR4_TSD, EXC_GP, 0); } fail_if(ops->read_msr == NULL); if ( (rc = ops->read_msr(MSR_TSC, &val, ctxt)) != 0 ) goto done; _regs.edx = (uint32_t)(val >> 32); _regs.eax = (uint32_t)(val >> 0); break; } case 0x32: /* rdmsr */ { uint64_t val; generate_exception_if(!mode_ring0(), EXC_GP, 0); fail_if(ops->read_msr == NULL); if ( (rc = ops->read_msr((uint32_t)_regs.ecx, &val, ctxt)) != 0 ) goto done; _regs.edx = (uint32_t)(val >> 32); _regs.eax = (uint32_t)(val >> 0); break; } case 0x40 ... 0x4f: /* cmovcc */ dst.val = src.val; if ( !test_cc(b, _regs.eflags) ) dst.type = OP_NONE; break; case 0x34: /* sysenter */ { uint64_t msr_content; struct segment_register cs, ss; int rc; generate_exception_if(mode_ring0(), EXC_GP, 0); generate_exception_if(in_realmode(ctxt, ops), EXC_GP, 0); generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0); fail_if(ops->read_msr == NULL); if ( (rc = ops->read_msr(MSR_SYSENTER_CS, &msr_content, ctxt)) != 0 ) goto done; if ( mode_64bit() ) generate_exception_if(msr_content == 0, EXC_GP, 0); else generate_exception_if((msr_content & 0xfffc) == 0, EXC_GP, 0); _regs.eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); fail_if(ops->read_segment == NULL); ops->read_segment(x86_seg_cs, &cs, ctxt); cs.sel = (uint16_t)msr_content & ~3; /* SELECTOR_RPL_MASK */ cs.base = 0; /* flat segment */ cs.limit = ~0u; /* 4GB limit */ cs.attr.bytes = 0xc9b; /* G+DB+P+S+Code */ ss.sel = cs.sel + 8; ss.base = 0; /* flat segment */ ss.limit = ~0u; /* 4GB limit */ ss.attr.bytes = 0xc93; /* G+DB+P+S+Data */ rc = in_longmode(ctxt, ops); if ( rc < 0 ) goto cannot_emulate; if ( rc ) { cs.attr.fields.db = 0; cs.attr.fields.l = 1; } fail_if(ops->write_segment == NULL); if ( (rc = ops->write_segment(x86_seg_cs, &cs, ctxt)) != 0 || (rc = ops->write_segment(x86_seg_ss, &ss, ctxt)) != 0 ) goto done; if ( (rc = ops->read_msr(MSR_SYSENTER_EIP, &msr_content, ctxt)) != 0 ) goto done; _regs.eip = msr_content; if ( (rc = ops->read_msr(MSR_SYSENTER_ESP, &msr_content, ctxt)) != 0 ) goto done; _regs.esp = msr_content; break; } case 0x35: /* sysexit */ { uint64_t msr_content; struct segment_register cs, ss; bool_t user64 = !!(rex_prefix & REX_W); int rc; generate_exception_if(!mode_ring0(), EXC_GP, 0); generate_exception_if(in_realmode(ctxt, ops), EXC_GP, 0); generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0); fail_if(ops->read_msr == NULL); if ( (rc = ops->read_msr(MSR_SYSENTER_CS, &msr_content, ctxt)) != 0 ) goto done; if ( user64 ) { cs.sel = (uint16_t)(msr_content + 32); ss.sel = (cs.sel + 8); generate_exception_if(msr_content == 0, EXC_GP, 0); } else { cs.sel = (uint16_t)(msr_content + 16); ss.sel = (uint16_t)(msr_content + 24); generate_exception_if((msr_content & 0xfffc) == 0, EXC_GP, 0); } cs.sel |= 0x3; /* SELECTOR_RPL_MASK */ cs.base = 0; /* flat segment */ cs.limit = ~0u; /* 4GB limit */ cs.attr.bytes = 0xcfb; /* G+DB+P+DPL3+S+Code */ ss.sel |= 0x3; /* SELECTOR_RPL_MASK */ ss.base = 0; /* flat segment */ ss.limit = ~0u; /* 4GB limit */ ss.attr.bytes = 0xcf3; 
/* G+DB+P+DPL3+S+Data */ if ( user64 ) { cs.attr.fields.db = 0; cs.attr.fields.l = 1; } fail_if(ops->write_segment == NULL); if ( (rc = ops->write_segment(x86_seg_cs, &cs, ctxt)) != 0 || (rc = ops->write_segment(x86_seg_ss, &ss, ctxt)) != 0 ) goto done; _regs.eip = _regs.edx; _regs.esp = _regs.ecx; break; } case 0xe7: /* movntq mm,m64 */ /* {,v}movntdq xmm,m128 */ /* vmovntdq ymm,m256 */ fail_if(ea.type != OP_MEM); fail_if(vex.pfx == vex_f3); /* fall through */ case 0x6f: /* movq mm/m64,mm */ /* {,v}movdq{a,u} xmm/m128,xmm */ /* vmovdq{a,u} ymm/m256,ymm */ case 0x7f: /* movq mm,mm/m64 */ /* {,v}movdq{a,u} xmm,xmm/m128 */ /* vmovdq{a,u} ymm,ymm/m256 */ { uint8_t stub[] = { 0x3e, 0x3e, 0x0f, b, modrm, 0xc3 }; struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 }; if ( vex.opcx == vex_none ) { switch ( vex.pfx ) { case vex_66: case vex_f3: vcpu_must_have_sse2(); stub[0] = 0x66; /* movdqa */ get_fpu(X86EMUL_FPU_xmm, &fic); ea.bytes = 16; break; case vex_none: if ( b != 0xe7 ) vcpu_must_have_mmx(); else vcpu_must_have_sse(); get_fpu(X86EMUL_FPU_mmx, &fic); ea.bytes = 8; break; default: goto cannot_emulate; } } else { fail_if((vex.opcx != vex_0f) || (vex.reg != 0xf) || ((vex.pfx != vex_66) && (vex.pfx != vex_f3))); vcpu_must_have_avx(); get_fpu(X86EMUL_FPU_ymm, &fic); ea.bytes = 16 << vex.l; } if ( ea.type == OP_MEM ) { /* XXX enable once there is ops->ea() or equivalent generate_exception_if((vex.pfx == vex_66) && (ops->ea(ea.mem.seg, ea.mem.off) & (ea.bytes - 1)), EXC_GP, 0); */ if ( b == 0x6f ) rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp, ea.bytes, ctxt); /* convert memory operand to (%rAX) */ rex_prefix &= ~REX_B; vex.b = 1; stub[4] &= 0x38; } if ( !rc ) { copy_REX_VEX(stub, rex_prefix, vex); asm volatile ( "call *%0" : : "r" (stub), "a" (mmvalp) : "memory" ); } put_fpu(&fic); if ( !rc && (b != 0x6f) && (ea.type == OP_MEM) ) rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, ea.bytes, ctxt); goto done; } case 0x80 ... 0x8f: /* jcc (near) */ { int rel = ((op_bytes == 2) ? (int32_t)insn_fetch_type(int16_t) : insn_fetch_type(int32_t)); if ( test_cc(b, _regs.eflags) ) jmp_rel(rel); break; } case 0x90 ... 0x9f: /* setcc */ dst.val = test_cc(b, _regs.eflags); break; case 0xa0: /* push %%fs */ src.val = x86_seg_fs; goto push_seg; case 0xa1: /* pop %%fs */ src.val = x86_seg_fs; goto pop_seg; case 0xa2: /* cpuid */ { unsigned int eax = _regs.eax, ebx = _regs.ebx; unsigned int ecx = _regs.ecx, edx = _regs.edx; fail_if(ops->cpuid == NULL); if ( (rc = ops->cpuid(&eax, &ebx, &ecx, &edx, ctxt)) != 0 ) goto done; _regs.eax = eax; _regs.ebx = ebx; _regs.ecx = ecx; _regs.edx = edx; break; } case 0xa8: /* push %%gs */ src.val = x86_seg_gs; goto push_seg; case 0xa9: /* pop %%gs */ src.val = x86_seg_gs; goto pop_seg; case 0xb0 ... 0xb1: /* cmpxchg */ /* Save real source value, then compare EAX against destination. */ src.orig_val = src.val; src.val = _regs.eax; emulate_2op_SrcV("cmp", src, dst, _regs.eflags); if ( _regs.eflags & EFLG_ZF ) { /* Success: write back to memory. */ dst.val = src.orig_val; } else { /* Failure: write the value we saw to EAX. */ dst.type = OP_REG; dst.reg = (unsigned long *)&_regs.eax; } break; case 0xa3: bt: /* bt */ emulate_2op_SrcV_nobyte("bt", src, dst, _regs.eflags); dst.type = OP_NONE; break; case 0xa4: /* shld imm8,r,r/m */ case 0xa5: /* shld %%cl,r,r/m */ case 0xac: /* shrd imm8,r,r/m */ case 0xad: /* shrd %%cl,r,r/m */ { uint8_t shift, width = dst.bytes << 3; shift = (b & 1) ? 
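/* odd opcodes (0xa5/0xad) take the shift count from %cl, even ones (0xa4/0xac) fetch an imm8 */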
(uint8_t)_regs.ecx : insn_fetch_type(uint8_t); if ( (shift &= width - 1) == 0 ) break; dst.orig_val = truncate_word(dst.val, dst.bytes); dst.val = ((shift == width) ? src.val : (b & 8) ? /* shrd */ ((dst.orig_val >> shift) | truncate_word(src.val << (width - shift), dst.bytes)) : /* shld */ ((dst.orig_val << shift) | ((src.val >> (width - shift)) & ((1ull << shift) - 1)))); dst.val = truncate_word(dst.val, dst.bytes); _regs.eflags &= ~(EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_PF|EFLG_CF); if ( (dst.val >> ((b & 8) ? (shift - 1) : (width - shift))) & 1 ) _regs.eflags |= EFLG_CF; if ( ((dst.val ^ dst.orig_val) >> (width - 1)) & 1 ) _regs.eflags |= EFLG_OF; _regs.eflags |= ((dst.val >> (width - 1)) & 1) ? EFLG_SF : 0; _regs.eflags |= (dst.val == 0) ? EFLG_ZF : 0; _regs.eflags |= even_parity(dst.val) ? EFLG_PF : 0; break; } case 0xb3: btr: /* btr */ emulate_2op_SrcV_nobyte("btr", src, dst, _regs.eflags); break; case 0xab: bts: /* bts */ emulate_2op_SrcV_nobyte("bts", src, dst, _regs.eflags); break; case 0xae: /* Grp15 */ switch ( modrm_reg & 7 ) { case 7: /* clflush */ fail_if(ops->wbinvd == NULL); if ( (rc = ops->wbinvd(ctxt)) != 0 ) goto done; break; default: goto cannot_emulate; } break; case 0xaf: /* imul */ _regs.eflags &= ~(EFLG_OF|EFLG_CF); switch ( dst.bytes ) { case 2: dst.val = ((uint32_t)(int16_t)src.val * (uint32_t)(int16_t)dst.val); if ( (int16_t)dst.val != (uint32_t)dst.val ) _regs.eflags |= EFLG_OF|EFLG_CF; break; #ifdef __x86_64__ case 4: dst.val = ((uint64_t)(int32_t)src.val * (uint64_t)(int32_t)dst.val); if ( (int32_t)dst.val != dst.val ) _regs.eflags |= EFLG_OF|EFLG_CF; break; #endif default: { unsigned long m[2] = { src.val, dst.val }; if ( imul_dbl(m) ) _regs.eflags |= EFLG_OF|EFLG_CF; dst.val = m[0]; break; } } break; case 0xb2: /* lss */ dst.val = x86_seg_ss; goto les; case 0xb4: /* lfs */ dst.val = x86_seg_fs; goto les; case 0xb5: /* lgs */ dst.val = x86_seg_gs; goto les; case 0xb6: /* movzx rm8,r{16,32,64} */ /* Recompute DstReg as we may have decoded AH/BH/CH/DH. */ dst.reg = decode_register(modrm_reg, &_regs, 0); dst.bytes = op_bytes; dst.val = (uint8_t)src.val; break; case 0xbc: /* bsf or tzcnt */ { bool_t zf; asm ( "bsf %2,%0; setz %b1" : "=r" (dst.val), "=q" (zf) : "r" (src.val) ); _regs.eflags &= ~EFLG_ZF; if ( (vex.pfx == vex_f3) && vcpu_has_bmi1() ) { _regs.eflags &= ~EFLG_CF; if ( zf ) { _regs.eflags |= EFLG_CF; dst.val = op_bytes * 8; } else if ( !dst.val ) _regs.eflags |= EFLG_ZF; } else if ( zf ) { _regs.eflags |= EFLG_ZF; dst.type = OP_NONE; } break; } case 0xbd: /* bsr or lzcnt */ { bool_t zf; asm ( "bsr %2,%0; setz %b1" : "=r" (dst.val), "=q" (zf) : "r" (src.val) ); _regs.eflags &= ~EFLG_ZF; if ( (vex.pfx == vex_f3) && vcpu_has_lzcnt() ) { _regs.eflags &= ~EFLG_CF; if ( zf ) { _regs.eflags |= EFLG_CF; dst.val = op_bytes * 8; } else { dst.val = op_bytes * 8 - 1 - dst.val; if ( !dst.val ) _regs.eflags |= EFLG_ZF; } } else if ( zf ) { _regs.eflags |= EFLG_ZF; dst.type = OP_NONE; } break; } case 0xb7: /* movzx rm16,r{16,32,64} */ dst.val = (uint16_t)src.val; break; case 0xbb: btc: /* btc */ emulate_2op_SrcV_nobyte("btc", src, dst, _regs.eflags); break; case 0xba: /* Grp8 */ switch ( modrm_reg & 7 ) { case 4: goto bt; case 5: goto bts; case 6: goto btr; case 7: goto btc; default: generate_exception_if(1, EXC_UD, -1); } break; case 0xbe: /* movsx rm8,r{16,32,64} */ /* Recompute DstReg as we may have decoded AH/BH/CH/DH. 
*/ dst.reg = decode_register(modrm_reg, &_regs, 0); dst.bytes = op_bytes; dst.val = (int8_t)src.val; break; case 0xbf: /* movsx rm16,r{16,32,64} */ dst.val = (int16_t)src.val; break; case 0xc0 ... 0xc1: /* xadd */ /* Write back the register source. */ switch ( dst.bytes ) { case 1: *(uint8_t *)src.reg = (uint8_t)dst.val; break; case 2: *(uint16_t *)src.reg = (uint16_t)dst.val; break; case 4: *src.reg = (uint32_t)dst.val; break; /* 64b reg: zero-extend */ case 8: *src.reg = dst.val; break; } goto add; case 0xc3: /* movnti */ /* Ignore the non-temporal hint for now. */ vcpu_must_have_sse2(); generate_exception_if(dst.bytes <= 2, EXC_UD, -1); dst.val = src.val; break; case 0xc7: /* Grp9 (cmpxchg8b/cmpxchg16b) */ { unsigned long old[2], exp[2], new[2]; generate_exception_if((modrm_reg & 7) != 1, EXC_UD, -1); generate_exception_if(ea.type != OP_MEM, EXC_UD, -1); if ( op_bytes == 8 ) vcpu_must_have_cx16(); op_bytes *= 2; /* Get actual old value. */ if ( (rc = ops->read(ea.mem.seg, ea.mem.off, old, op_bytes, ctxt)) != 0 ) goto done; /* Get expected and proposed values. */ if ( op_bytes == 8 ) { ((uint32_t *)exp)[0] = _regs.eax; ((uint32_t *)exp)[1] = _regs.edx; ((uint32_t *)new)[0] = _regs.ebx; ((uint32_t *)new)[1] = _regs.ecx; } else { exp[0] = _regs.eax; exp[1] = _regs.edx; new[0] = _regs.ebx; new[1] = _regs.ecx; } if ( memcmp(old, exp, op_bytes) ) { /* Expected != actual: store actual to rDX:rAX and clear ZF. */ _regs.eax = (op_bytes == 8) ? ((uint32_t *)old)[0] : old[0]; _regs.edx = (op_bytes == 8) ? ((uint32_t *)old)[1] : old[1]; _regs.eflags &= ~EFLG_ZF; } else { /* Expected == actual: attempt atomic cmpxchg and set ZF. */ if ( (rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old, new, op_bytes, ctxt)) != 0 ) goto done; _regs.eflags |= EFLG_ZF; } break; } case 0xc8 ... 0xcf: /* bswap */ dst.type = OP_REG; dst.reg = decode_register( (b & 7) | ((rex_prefix & 1) << 3), &_regs, 0); switch ( dst.bytes = op_bytes ) { default: /* case 2: */ /* Undefined behaviour. Writes zero on all tested CPUs. */ dst.val = 0; break; case 4: #ifdef __x86_64__ asm ( "bswap %k0" : "=r" (dst.val) : "0" (*dst.reg) ); break; case 8: #endif asm ( "bswap %0" : "=r" (dst.val) : "0" (*dst.reg) ); break; } break; } goto writeback; cannot_emulate: return X86EMUL_UNHANDLEABLE; } xen-4.4.0/xen/arch/x86/x86_emulate/x86_emulate.h0000664000175000017500000003114612307313555017257 0ustar smbsmb/****************************************************************************** * x86_emulate.h * * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. * * Copyright (c) 2005-2007 Keir Fraser * Copyright (c) 2005-2007 XenSource Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef __X86_EMULATE_H__ #define __X86_EMULATE_H__ struct x86_emulate_ctxt; /* Comprehensive enumeration of x86 segment registers. */ enum x86_segment { /* General purpose. 
*/ x86_seg_cs, x86_seg_ss, x86_seg_ds, x86_seg_es, x86_seg_fs, x86_seg_gs, /* System. */ x86_seg_tr, x86_seg_ldtr, x86_seg_gdtr, x86_seg_idtr, /* * Dummy: used to emulate direct processor accesses to management * structures (TSS, GDT, LDT, IDT, etc.) which use linear addressing * (no segment component) and bypass usual segment- and page-level * protection checks. */ x86_seg_none }; #define is_x86_user_segment(seg) ((unsigned)(seg) <= x86_seg_gs) /* * Attribute for segment selector. This is a copy of bit 40:47 & 52:55 of the * segment descriptor. It happens to match the format of an AMD SVM VMCB. */ typedef union segment_attributes { uint16_t bytes; struct { uint16_t type:4; /* 0; Bit 40-43 */ uint16_t s: 1; /* 4; Bit 44 */ uint16_t dpl: 2; /* 5; Bit 45-46 */ uint16_t p: 1; /* 7; Bit 47 */ uint16_t avl: 1; /* 8; Bit 52 */ uint16_t l: 1; /* 9; Bit 53 */ uint16_t db: 1; /* 10; Bit 54 */ uint16_t g: 1; /* 11; Bit 55 */ uint16_t pad: 4; } fields; } __attribute__ ((packed)) segment_attributes_t; /* * Full state of a segment register (visible and hidden portions). * Again, this happens to match the format of an AMD SVM VMCB. */ struct segment_register { uint16_t sel; segment_attributes_t attr; uint32_t limit; uint64_t base; } __attribute__ ((packed)); /* * Return codes from state-accessor functions and from x86_emulate(). */ /* Completed successfully. State modified appropriately. */ #define X86EMUL_OKAY 0 /* Unhandleable access or emulation. No state modified. */ #define X86EMUL_UNHANDLEABLE 1 /* Exception raised and requires delivery. */ #define X86EMUL_EXCEPTION 2 /* Retry the emulation for some reason. No state modified. */ #define X86EMUL_RETRY 3 /* (cmpxchg accessor): CMPXCHG failed. Maps to X86EMUL_RETRY in caller. */ #define X86EMUL_CMPXCHG_FAILED 3 /* FPU sub-types which may be requested via ->get_fpu(). */ enum x86_emulate_fpu_type { X86EMUL_FPU_fpu, /* Standard FPU coprocessor instruction set */ X86EMUL_FPU_mmx, /* MMX instruction set (%mm0-%mm7) */ X86EMUL_FPU_xmm, /* SSE instruction set (%xmm0-%xmm7/15) */ X86EMUL_FPU_ymm /* AVX/XOP instruction set (%ymm0-%ymm7/15) */ }; /* * These operations represent the instruction emulator's interface to memory, * I/O ports, privileged state... pretty much everything other than GPRs. * * NOTES: * 1. If the access fails (cannot emulate, or a standard access faults) then * it is up to the memop to propagate the fault to the guest VM via * some out-of-band mechanism, unknown to the emulator. The memop signals * failure by returning X86EMUL_EXCEPTION to the emulator, which will * then immediately bail. * 2. The emulator cannot handle 64-bit mode emulation on an x86/32 system. */ struct x86_emulate_ops { /* * All functions: * @ctxt: [IN ] Emulation context info as passed to the emulator. * All memory-access functions: * @seg: [IN ] Segment being dereferenced (specified as x86_seg_??). * @offset:[IN ] Offset within segment. * @p_data:[IN ] Pointer to i/o data buffer (length is @bytes) * Read functions: * @val: [OUT] Value read, zero-extended to 'ulong'. * Write functions: * @val: [IN ] Value to write (low-order bytes used as req'd). * Variable-length access functions: * @bytes: [IN ] Number of bytes to read or write. Valid access sizes are * 1, 2, 4 and 8 (x86/64 only) bytes, unless otherwise * stated. */ /* * read: Emulate a memory read. * @bytes: Access length (0 < @bytes < 4096). 
*/ int (*read)( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt); /* * insn_fetch: Emulate fetch from instruction byte stream. * Parameters are same as for 'read'. @seg is always x86_seg_cs. */ int (*insn_fetch)( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt); /* * write: Emulate a memory write. * @bytes: Access length (0 < @bytes < 4096). */ int (*write)( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt); /* * cmpxchg: Emulate an atomic (LOCKed) CMPXCHG operation. * @p_old: [IN ] Pointer to value expected to be current at @addr. * @p_new: [IN ] Pointer to value to write to @addr. * @bytes: [IN ] Operation size (up to 8 (x86/32) or 16 (x86/64) bytes). */ int (*cmpxchg)( enum x86_segment seg, unsigned long offset, void *p_old, void *p_new, unsigned int bytes, struct x86_emulate_ctxt *ctxt); /* * rep_ins: Emulate INS: -> . * @bytes_per_rep: [IN ] Bytes transferred per repetition. * @reps: [IN ] Maximum repetitions to be emulated. * [OUT] Number of repetitions actually emulated. */ int (*rep_ins)( uint16_t src_port, enum x86_segment dst_seg, unsigned long dst_offset, unsigned int bytes_per_rep, unsigned long *reps, struct x86_emulate_ctxt *ctxt); /* * rep_outs: Emulate OUTS: -> . * @bytes_per_rep: [IN ] Bytes transferred per repetition. * @reps: [IN ] Maximum repetitions to be emulated. * [OUT] Number of repetitions actually emulated. */ int (*rep_outs)( enum x86_segment src_seg, unsigned long src_offset, uint16_t dst_port, unsigned int bytes_per_rep, unsigned long *reps, struct x86_emulate_ctxt *ctxt); /* * rep_movs: Emulate MOVS: -> . * @bytes_per_rep: [IN ] Bytes transferred per repetition. * @reps: [IN ] Maximum repetitions to be emulated. * [OUT] Number of repetitions actually emulated. */ int (*rep_movs)( enum x86_segment src_seg, unsigned long src_offset, enum x86_segment dst_seg, unsigned long dst_offset, unsigned int bytes_per_rep, unsigned long *reps, struct x86_emulate_ctxt *ctxt); /* * read_segment: Emulate a read of full context of a segment register. * @reg: [OUT] Contents of segment register (visible and hidden state). */ int (*read_segment)( enum x86_segment seg, struct segment_register *reg, struct x86_emulate_ctxt *ctxt); /* * write_segment: Emulate a read of full context of a segment register. * @reg: [OUT] Contents of segment register (visible and hidden state). */ int (*write_segment)( enum x86_segment seg, struct segment_register *reg, struct x86_emulate_ctxt *ctxt); /* * read_io: Read from I/O port(s). * @port: [IN ] Base port for access. */ int (*read_io)( unsigned int port, unsigned int bytes, unsigned long *val, struct x86_emulate_ctxt *ctxt); /* * write_io: Write to I/O port(s). * @port: [IN ] Base port for access. */ int (*write_io)( unsigned int port, unsigned int bytes, unsigned long val, struct x86_emulate_ctxt *ctxt); /* * read_cr: Read from control register. * @reg: [IN ] Register to read (0-15). */ int (*read_cr)( unsigned int reg, unsigned long *val, struct x86_emulate_ctxt *ctxt); /* * write_cr: Write to control register. * @reg: [IN ] Register to write (0-15). */ int (*write_cr)( unsigned int reg, unsigned long val, struct x86_emulate_ctxt *ctxt); /* * read_dr: Read from debug register. * @reg: [IN ] Register to read (0-15). */ int (*read_dr)( unsigned int reg, unsigned long *val, struct x86_emulate_ctxt *ctxt); /* * write_dr: Write to debug register. 
* @reg: [IN ] Register to write (0-15). */ int (*write_dr)( unsigned int reg, unsigned long val, struct x86_emulate_ctxt *ctxt); /* * read_msr: Read from model-specific register. * @reg: [IN ] Register to read. */ int (*read_msr)( unsigned long reg, uint64_t *val, struct x86_emulate_ctxt *ctxt); /* * write_dr: Write to model-specific register. * @reg: [IN ] Register to write. */ int (*write_msr)( unsigned long reg, uint64_t val, struct x86_emulate_ctxt *ctxt); /* wbinvd: Write-back and invalidate cache contents. */ int (*wbinvd)( struct x86_emulate_ctxt *ctxt); /* cpuid: Emulate CPUID via given set of EAX-EDX inputs/outputs. */ int (*cpuid)( unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx, struct x86_emulate_ctxt *ctxt); /* inject_hw_exception */ int (*inject_hw_exception)( uint8_t vector, int32_t error_code, struct x86_emulate_ctxt *ctxt); /* inject_sw_interrupt */ int (*inject_sw_interrupt)( uint8_t vector, uint8_t insn_len, struct x86_emulate_ctxt *ctxt); /* * get_fpu: Load emulated environment's FPU state onto processor. * @exn_callback: On any FPU or SIMD exception, pass control to * (*exception_callback)(exception_callback_arg, regs). */ int (*get_fpu)( void (*exception_callback)(void *, struct cpu_user_regs *), void *exception_callback_arg, enum x86_emulate_fpu_type type, struct x86_emulate_ctxt *ctxt); /* put_fpu: Relinquish the FPU. Unhook from FPU/SIMD exception handlers. */ void (*put_fpu)( struct x86_emulate_ctxt *ctxt); /* invlpg: Invalidate paging structures which map addressed byte. */ int (*invlpg)( enum x86_segment seg, unsigned long offset, struct x86_emulate_ctxt *ctxt); }; struct cpu_user_regs; struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct cpu_user_regs *regs; /* Default address size in current execution mode (16, 32, or 64). */ unsigned int addr_size; /* Stack pointer width in bits (16, 32 or 64). */ unsigned int sp_size; /* Set this if writes may have side effects. */ uint8_t force_writeback; /* Retirement state, set by the emulator (valid only on X86EMUL_OKAY). */ union { struct { uint8_t hlt:1; /* Instruction HLTed. */ uint8_t mov_ss:1; /* Instruction sets MOV-SS irq shadow. */ uint8_t sti:1; /* Instruction sets STI irq shadow. */ } flags; uint8_t byte; } retire; }; /* * x86_emulate: Emulate an instruction. * Returns -1 on failure, 0 on success. */ int x86_emulate( struct x86_emulate_ctxt *ctxt, const struct x86_emulate_ops *ops); /* * Given the 'reg' portion of a ModRM byte, and a register block, return a * pointer into the block that addresses the relevant register. * @highbyte_regs specifies whether to decode AH,CH,DH,BH. */ void * decode_register( uint8_t modrm_reg, struct cpu_user_regs *regs, int highbyte_regs); #endif /* __X86_EMULATE_H__ */ xen-4.4.0/xen/arch/x86/srat.c0000664000175000017500000002570112307313555013721 0ustar smbsmb/* * ACPI 3.0 based NUMA setup * Copyright 2004 Andi Kleen, SuSE Labs. * * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. * * Called from acpi_numa_init while reading the SRAT and SLIT tables. * Assumes all memory regions belonging to a single proximity domain * are in one chunk. Holes between them will be included in the node. 
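 *
 * As a purely illustrative example: an SRAT describing two proximity
 * domains, each with one enabled memory affinity entry, ends up as
 * nodes[0] and nodes[1] spanning the respective ranges, with pxm2node[]
 * recording the PXM -> node mapping consumed by pxm_to_node() and
 * setup_node() below.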
* * Adapted for Xen: Ryan Harper */ #include #include #include #include #include #include #include #include #include static struct acpi_table_slit *__read_mostly acpi_slit; static nodemask_t memory_nodes_parsed __initdata; static nodemask_t processor_nodes_parsed __initdata; static nodemask_t nodes_found __initdata; static struct node nodes[MAX_NUMNODES] __initdata; static u8 __read_mostly pxm2node[256] = { [0 ... 255] = NUMA_NO_NODE }; static int num_node_memblks; static struct node node_memblk_range[NR_NODE_MEMBLKS]; static int memblk_nodeid[NR_NODE_MEMBLKS]; static int node_to_pxm(int n); int pxm_to_node(int pxm) { if ((unsigned)pxm >= 256) return -1; /* Extend 0xff to (int)-1 */ return (signed char)pxm2node[pxm]; } __devinit int setup_node(int pxm) { unsigned node = pxm2node[pxm]; if (node == 0xff) { if (nodes_weight(nodes_found) >= MAX_NUMNODES) return -1; node = first_unset_node(nodes_found); node_set(node, nodes_found); pxm2node[pxm] = node; } return pxm2node[pxm]; } int valid_numa_range(u64 start, u64 end, int node) { int i; for (i = 0; i < num_node_memblks; i++) { struct node *nd = &node_memblk_range[i]; if (nd->start <= start && nd->end > end && memblk_nodeid[i] == node ) return 1; } return 0; } static __init int conflicting_memblks(u64 start, u64 end) { int i; for (i = 0; i < num_node_memblks; i++) { struct node *nd = &node_memblk_range[i]; if (nd->start == nd->end) continue; if (nd->end > start && nd->start < end) return memblk_nodeid[i]; if (nd->end == end && nd->start == start) return memblk_nodeid[i]; } return -1; } static __init void cutoff_node(int i, u64 start, u64 end) { struct node *nd = &nodes[i]; if (nd->start < start) { nd->start = start; if (nd->end < nd->start) nd->start = nd->end; } if (nd->end > end) { nd->end = end; if (nd->start > nd->end) nd->start = nd->end; } } static __init void bad_srat(void) { int i; printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; for (i = 0; i < MAX_LOCAL_APIC; i++) apicid_to_node[i] = NUMA_NO_NODE; for (i = 0; i < ARRAY_SIZE(pxm2node); i++) pxm2node[i] = NUMA_NO_NODE; mem_hotplug = 0; } /* * A lot of BIOS fill in 10 (= no distance) everywhere. This messes * up the NUMA heuristics which wants the local node to have a smaller * distance than the others. * Do some quick checks here and only use the SLIT if it passes. */ static __init int slit_valid(struct acpi_table_slit *slit) { int i, j; int d = slit->locality_count; for (i = 0; i < d; i++) { for (j = 0; j < d; j++) { u8 val = slit->entry[d*i + j]; if (i == j) { if (val != 10) return 0; } else if (val <= 10) return 0; } } return 1; } /* Callback for SLIT parsing */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { unsigned long mfn; if (!slit_valid(slit)) { printk(KERN_INFO "ACPI: SLIT table looks invalid. 
" "Not used.\n"); return; } mfn = alloc_boot_pages(PFN_UP(slit->header.length), 1); if (!mfn) { printk(KERN_ERR "ACPI: Unable to allocate memory for " "saving ACPI SLIT numa information.\n"); return; } acpi_slit = mfn_to_virt(mfn); memcpy(acpi_slit, slit, slit->header.length); } /* Callback for Proximity Domain -> x2APIC mapping */ void __init acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) { int pxm, node; int apic_id; if (srat_disabled()) return; if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { bad_srat(); return; } if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; pxm = pa->proximity_domain; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); bad_srat(); return; } apic_id = pa->apic_id; apicid_to_node[apic_id] = node; acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", pxm, apic_id, node); } /* Callback for Proximity Domain -> LAPIC mapping */ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { int pxm, node; if (srat_disabled()) return; if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { bad_srat(); return; } if (!(pa->flags & ACPI_SRAT_CPU_ENABLED)) return; pxm = pa->proximity_domain_lo; if (srat_rev >= 2) { pxm |= pa->proximity_domain_hi[0] << 8; pxm |= pa->proximity_domain_hi[1] << 16; pxm |= pa->proximity_domain_hi[2] << 24; } node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); bad_srat(); return; } apicid_to_node[pa->apic_id] = node; node_set(node, processor_nodes_parsed); acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", pxm, pa->apic_id, node); } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ void __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { struct node *nd; u64 start, end; int node, pxm; int i; if (srat_disabled()) return; if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { bad_srat(); return; } if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) return; if (num_node_memblks >= NR_NODE_MEMBLKS) { dprintk(XENLOG_WARNING, "Too many numa entry, try bigger NR_NODE_MEMBLKS \n"); bad_srat(); return; } start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; if (srat_rev < 2) pxm &= 0xff; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); bad_srat(); return; } /* It is fine to add this area to the nodes data it will be used later*/ i = conflicting_memblks(start, end); if (i == node) { printk(KERN_WARNING "SRAT: Warning: PXM %d (%"PRIx64"-%"PRIx64") overlaps with itself (%" PRIx64"-%"PRIx64")\n", pxm, start, end, nodes[i].start, nodes[i].end); } else if (i >= 0) { printk(KERN_ERR "SRAT: PXM %d (%"PRIx64"-%"PRIx64") overlaps with PXM %d (%" PRIx64"-%"PRIx64")\n", pxm, start, end, node_to_pxm(i), nodes[i].start, nodes[i].end); bad_srat(); return; } nd = &nodes[node]; if (!node_test_and_set(node, memory_nodes_parsed)) { nd->start = start; nd->end = end; } else { if (start < nd->start) nd->start = start; if (nd->end < end) nd->end = end; } if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && end > mem_hotplug) mem_hotplug = end; printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIx64"-%"PRIx64"%s\n", node, pxm, start, end, ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? 
" (hotplug)" : ""); node_memblk_range[num_node_memblks].start = start; node_memblk_range[num_node_memblks].end = end; memblk_nodeid[num_node_memblks] = node; num_node_memblks++; } /* Sanity check to catch more bad SRATs (they are amazingly common). Make sure the PXMs cover all memory. */ static int nodes_cover_memory(void) { int i; for (i = 0; i < e820.nr_map; i++) { int j, found; unsigned long long start, end; if (e820.map[i].type != E820_RAM) { continue; } start = e820.map[i].addr; end = e820.map[i].addr + e820.map[i].size - 1; do { found = 0; for_each_node_mask(j, memory_nodes_parsed) if (start < nodes[j].end && end > nodes[j].start) { if (start >= nodes[j].start) { start = nodes[j].end; found = 1; } if (end <= nodes[j].end) { end = nodes[j].start; found = 1; } } } while (found && start < end); if (start < end) { printk(KERN_ERR "SRAT: No PXM for e820 range: " "%016Lx - %016Lx\n", start, end); return 0; } } return 1; } void __init acpi_numa_arch_fixup(void) {} static u64 __initdata srat_region_mask; static u64 __init fill_mask(u64 mask) { while (mask & (mask + 1)) mask |= mask + 1; return mask; } static int __init srat_parse_region(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_srat_mem_affinity *ma; if (!header) return -EINVAL; ma = container_of(header, struct acpi_srat_mem_affinity, header); if (!ma->length || !(ma->flags & ACPI_SRAT_MEM_ENABLED) || (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE)) return 0; if (numa_off) printk(KERN_INFO "SRAT: %013"PRIx64"-%013"PRIx64"\n", ma->base_address, ma->base_address + ma->length - 1); srat_region_mask |= ma->base_address | fill_mask(ma->base_address ^ (ma->base_address + ma->length - 1)); return 0; } void __init srat_parse_regions(u64 addr) { u64 mask; unsigned int i; if (acpi_disabled || acpi_numa < 0 || acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) return; srat_region_mask = fill_mask(addr - 1); acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, srat_parse_region, 0); for (mask = srat_region_mask, i = 0; mask && i < e820.nr_map; i++) { if (e820.map[i].type != E820_RAM) continue; if (~mask & fill_mask(e820.map[i].addr ^ (e820.map[i].addr + e820.map[i].size - 1))) mask = 0; } pfn_pdx_hole_setup(mask >> PAGE_SHIFT); } /* Use the information discovered above to actually set up the nodes. */ int __init acpi_scan_nodes(u64 start, u64 end) { int i; nodemask_t all_nodes_parsed; /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) cutoff_node(i, start, end); if (acpi_numa <= 0) return -1; if (!nodes_cover_memory()) { bad_srat(); return -1; } memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, memblk_nodeid); if (memnode_shift < 0) { printk(KERN_ERR "SRAT: No NUMA node hash function found. Contact maintainer\n"); bad_srat(); return -1; } nodes_or(all_nodes_parsed, memory_nodes_parsed, processor_nodes_parsed); /* Finally register nodes */ for_each_node_mask(i, all_nodes_parsed) { u64 size = nodes[i].end - nodes[i].start; if ( size == 0 ) printk(KERN_WARNING "SRAT: Node %u has no memory. 
" "BIOS Bug or mis-configured hardware?\n", i); setup_node_bootmem(i, nodes[i].start, nodes[i].end); } for (i = 0; i < nr_cpu_ids; i++) { if (cpu_to_node[i] == NUMA_NO_NODE) continue; if (!node_isset(cpu_to_node[i], processor_nodes_parsed)) numa_set_node(i, NUMA_NO_NODE); } numa_init_array(); return 0; } static int node_to_pxm(int n) { int i; if (pxm2node[n] == n) return n; for (i = 0; i < 256; i++) if (pxm2node[i] == n) return i; return 0; } int __node_distance(int a, int b) { int index; if (!acpi_slit) return a == b ? 10 : 20; index = acpi_slit->locality_count * node_to_pxm(a); return acpi_slit->entry[index + node_to_pxm(b)]; } EXPORT_SYMBOL(__node_distance); xen-4.4.0/xen/arch/x86/clear_page.S0000664000175000017500000000061012307313555015002 0ustar smbsmb#include #include #define ptr_reg %rdi ENTRY(clear_page_sse2) mov $PAGE_SIZE/16, %ecx xor %eax,%eax 0: dec %ecx movnti %eax, (ptr_reg) movnti %eax, 4(ptr_reg) movnti %eax, 8(ptr_reg) movnti %eax, 12(ptr_reg) lea 16(ptr_reg), ptr_reg jnz 0b sfence ret xen-4.4.0/xen/arch/x86/bzimage.c0000664000175000017500000001603412307313555014365 0ustar smbsmb#include #include #include #include #include #include #include #include #include #define HEAPORDER 3 static unsigned char *__initdata window; #define memptr long static memptr __initdata free_mem_ptr; static memptr __initdata free_mem_end_ptr; #define WSIZE 0x80000000 static unsigned char *__initdata inbuf; static unsigned __initdata insize; /* Index of next byte to be processed in inbuf: */ static unsigned __initdata inptr; /* Bytes in output buffer: */ static unsigned __initdata outcnt; #define OF(args) args #define STATIC static #define memzero(s, n) memset((s), 0, (n)) typedef unsigned char uch; typedef unsigned short ush; typedef unsigned long ulg; #define INIT __init #define INITDATA __initdata #define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf()) /* Diagnostic functions */ #ifdef DEBUG # define Assert(cond, msg) do { if (!(cond)) error(msg); } while (0) # define Trace(x) do { fprintf x; } while (0) # define Tracev(x) do { if (verbose) fprintf x ; } while (0) # define Tracevv(x) do { if (verbose > 1) fprintf x ; } while (0) # define Tracec(c, x) do { if (verbose && (c)) fprintf x ; } while (0) # define Tracecv(c, x) do { if (verbose > 1 && (c)) fprintf x ; } while (0) #else # define Assert(cond, msg) # define Trace(x) # define Tracev(x) # define Tracevv(x) # define Tracec(c, x) # define Tracecv(c, x) #endif static long __initdata bytes_out; static void flush_window(void); static __init void error(char *x) { panic("%s", x); } static __init int fill_inbuf(void) { error("ran out of input data"); return 0; } #include "../../common/inflate.c" static __init void flush_window(void) { /* * The window is equal to the output buffer therefore only need to * compute the crc. 
*/ unsigned long c = crc; unsigned n; unsigned char *in, ch; in = window; for ( n = 0; n < outcnt; n++ ) { ch = *in++; c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); } crc = c; bytes_out += (unsigned long)outcnt; outcnt = 0; } static __init unsigned long output_length(char *image, unsigned long image_len) { return *(uint32_t *)&image[image_len - 4]; } static __init int gzip_check(char *image, unsigned long image_len) { unsigned char magic0, magic1; if ( image_len < 2 ) return 0; magic0 = (unsigned char)image[0]; magic1 = (unsigned char)image[1]; return (magic0 == 0x1f) && ((magic1 == 0x8b) || (magic1 == 0x9e)); } static __init int perform_gunzip(char *output, char *image, unsigned long image_len) { int rc; if ( !gzip_check(image, image_len) ) return 1; window = (unsigned char *)output; free_mem_ptr = (unsigned long)alloc_xenheap_pages(HEAPORDER, 0); free_mem_end_ptr = free_mem_ptr + (PAGE_SIZE << HEAPORDER); inbuf = (unsigned char *)image; insize = image_len; inptr = 0; makecrc(); if ( gunzip() < 0 ) { rc = -EINVAL; } else { rc = 0; } free_xenheap_pages((void *)free_mem_ptr, HEAPORDER); return rc; } struct setup_header { uint8_t _pad0[0x1f1]; /* skip uninteresting stuff */ uint8_t setup_sects; uint16_t root_flags; uint32_t syssize; uint16_t ram_size; uint16_t vid_mode; uint16_t root_dev; uint16_t boot_flag; uint16_t jump; uint32_t header; #define HDR_MAGIC "HdrS" #define HDR_MAGIC_SZ 4 uint16_t version; #define VERSION(h,l) (((h)<<8) | (l)) uint32_t realmode_swtch; uint16_t start_sys; uint16_t kernel_version; uint8_t type_of_loader; uint8_t loadflags; uint16_t setup_move_size; uint32_t code32_start; uint32_t ramdisk_image; uint32_t ramdisk_size; uint32_t bootsect_kludge; uint16_t heap_end_ptr; uint16_t _pad1; uint32_t cmd_line_ptr; uint32_t initrd_addr_max; uint32_t kernel_alignment; uint8_t relocatable_kernel; uint8_t _pad2[3]; uint32_t cmdline_size; uint32_t hardware_subarch; uint64_t hardware_subarch_data; uint32_t payload_offset; uint32_t payload_length; } __attribute__((packed)); static __init int bzimage_check(struct setup_header *hdr, unsigned long len) { if ( len < sizeof(struct setup_header) ) return 0; if ( memcmp(&hdr->header, HDR_MAGIC, HDR_MAGIC_SZ) != 0 ) return 0; if ( hdr->version < VERSION(2,8) ) { printk("Cannot load bzImage v%d.%02d at least v2.08 is required\n", hdr->version >> 8, hdr->version & 0xff); return -EINVAL; } return 1; } static unsigned long __initdata orig_image_len; unsigned long __init bzimage_headroom(char *image_start, unsigned long image_length) { struct setup_header *hdr = (struct setup_header *)image_start; int err; unsigned long headroom; err = bzimage_check(hdr, image_length); if ( err < 0 ) return 0; if ( err > 0 ) { image_start += (hdr->setup_sects + 1) * 512 + hdr->payload_offset; image_length = hdr->payload_length; } if ( elf_is_elfbinary(image_start, image_length) ) return 0; orig_image_len = image_length; headroom = output_length(image_start, image_length); if (gzip_check(image_start, image_length)) { headroom += headroom >> 12; /* Add 8 bytes for every 32K input block */ headroom += (32768 + 18); /* Add 32K + 18 bytes of extra headroom */ } else headroom += image_length; headroom = (headroom + 4095) & ~4095; return headroom; } int __init bzimage_parse(char *image_base, char **image_start, unsigned long *image_len) { struct setup_header *hdr = (struct setup_header *)(*image_start); int err = bzimage_check(hdr, *image_len); unsigned long output_len; if ( err < 0 ) return err; if ( err > 0 ) { *image_start += (hdr->setup_sects + 1) * 512 + 
hdr->payload_offset; *image_len = hdr->payload_length; } if ( elf_is_elfbinary(*image_start, *image_len) ) return 0; BUG_ON(!(image_base < *image_start)); output_len = output_length(*image_start, orig_image_len); if ( (err = perform_gunzip(image_base, *image_start, orig_image_len)) > 0 ) err = decompress(*image_start, orig_image_len, image_base); if ( !err ) { *image_start = image_base; *image_len = output_len; } return err > 0 ? 0 : err; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/0000775000175000017500000000000012307313555013371 5ustar smbsmbxen-4.4.0/xen/arch/x86/hvm/Makefile0000664000175000017500000000063012307313555015030 0ustar smbsmbsubdir-y += svm subdir-y += vmx obj-y += asid.o obj-y += emulate.o obj-y += hpet.o obj-y += hvm.o obj-y += i8254.o obj-y += intercept.o obj-y += io.o obj-y += irq.o obj-y += mtrr.o obj-y += nestedhvm.o obj-y += pmtimer.o obj-y += quirks.o obj-y += rtc.o obj-y += save.o obj-y += stdvga.o obj-y += vioapic.o obj-y += viridian.o obj-y += vlapic.o obj-y += vmsi.o obj-y += vpic.o obj-y += vpt.o obj-y += vpmu.oxen-4.4.0/xen/arch/x86/hvm/viridian.c0000664000175000017500000003416512307313555015353 0ustar smbsmb/****************************************************************************** * viridian.c * * An implementation of the Viridian hypercall interface. */ #include #include #include #include #include #include #include #include #include #include #include /* Viridian MSR numbers. */ #define VIRIDIAN_MSR_GUEST_OS_ID 0x40000000 #define VIRIDIAN_MSR_HYPERCALL 0x40000001 #define VIRIDIAN_MSR_VP_INDEX 0x40000002 #define VIRIDIAN_MSR_TIME_REF_COUNT 0x40000020 #define VIRIDIAN_MSR_TSC_FREQUENCY 0x40000022 #define VIRIDIAN_MSR_APIC_FREQUENCY 0x40000023 #define VIRIDIAN_MSR_EOI 0x40000070 #define VIRIDIAN_MSR_ICR 0x40000071 #define VIRIDIAN_MSR_TPR 0x40000072 #define VIRIDIAN_MSR_APIC_ASSIST 0x40000073 /* Viridian Hypercall Status Codes. */ #define HV_STATUS_SUCCESS 0x0000 #define HV_STATUS_INVALID_HYPERCALL_CODE 0x0002 /* Viridian Hypercall Codes and Parameters. */ #define HvNotifyLongSpinWait 8 /* Viridian CPUID 4000003, Viridian MSR availability. */ #define CPUID3A_MSR_REF_COUNT (1 << 1) #define CPUID3A_MSR_APIC_ACCESS (1 << 4) #define CPUID3A_MSR_HYPERCALL (1 << 5) #define CPUID3A_MSR_VP_INDEX (1 << 6) #define CPUID3A_MSR_FREQ (1 << 11) /* Viridian CPUID 4000004, Implementation Recommendations. */ #define CPUID4A_MSR_BASED_APIC (1 << 3) #define CPUID4A_RELAX_TIMER_INT (1 << 5) /* Viridian CPUID 4000006, Implementation HW features detected and in use. */ #define CPUID6A_APIC_OVERLAY (1 << 0) #define CPUID6A_MSR_BITMAPS (1 << 1) #define CPUID6A_NESTED_PAGING (1 << 3) int cpuid_viridian_leaves(unsigned int leaf, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { struct domain *d = current->domain; if ( !is_viridian_domain(d) ) return 0; leaf -= 0x40000000; if ( leaf > 6 ) return 0; *eax = *ebx = *ecx = *edx = 0; switch ( leaf ) { case 0: *eax = 0x40000006; /* Maximum leaf */ *ebx = 0x7263694d; /* Magic numbers */ *ecx = 0x666F736F; *edx = 0x76482074; break; case 1: *eax = 0x31237648; /* Version number */ break; case 2: /* Hypervisor information, but only if the guest has set its own version number. 
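 * The layout produced below: EAX carries a build number (hard-wired
 * to 1 here), EBX packs the Xen major version in bits 31:16 and the
 * minor version in bits 15:0, and ECX/EDX (service pack, service
 * branch/number) are left as zero.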
*/ if ( d->arch.hvm_domain.viridian.guest_os_id.raw == 0 ) break; *eax = 1; /* Build number */ *ebx = (xen_major_version() << 16) | xen_minor_version(); *ecx = 0; /* SP */ *edx = 0; /* Service branch and number */ break; case 3: /* Which hypervisor MSRs are available to the guest */ *eax = (CPUID3A_MSR_APIC_ACCESS | CPUID3A_MSR_HYPERCALL | CPUID3A_MSR_VP_INDEX | CPUID3A_MSR_FREQ); break; case 4: /* Recommended hypercall usage. */ if ( (d->arch.hvm_domain.viridian.guest_os_id.raw == 0) || (d->arch.hvm_domain.viridian.guest_os_id.fields.os < 4) ) break; *eax = CPUID4A_RELAX_TIMER_INT; if ( !cpu_has_vmx_apic_reg_virt ) *eax |= CPUID4A_MSR_BASED_APIC; *ebx = 2047; /* long spin count */ break; case 6: /* Detected and in use hardware features. */ if ( cpu_has_vmx_virtualize_apic_accesses ) *eax |= CPUID6A_APIC_OVERLAY; if ( cpu_has_vmx_msr_bitmap || (read_efer() & EFER_SVME) ) *eax |= CPUID6A_MSR_BITMAPS; if ( hap_enabled(d) ) *eax |= CPUID6A_NESTED_PAGING; break; } return 1; } static void dump_guest_os_id(const struct domain *d) { gdprintk(XENLOG_INFO, "GUEST_OS_ID:\n"); gdprintk(XENLOG_INFO, "\tvendor: %x\n", d->arch.hvm_domain.viridian.guest_os_id.fields.vendor); gdprintk(XENLOG_INFO, "\tos: %x\n", d->arch.hvm_domain.viridian.guest_os_id.fields.os); gdprintk(XENLOG_INFO, "\tmajor: %x\n", d->arch.hvm_domain.viridian.guest_os_id.fields.major); gdprintk(XENLOG_INFO, "\tminor: %x\n", d->arch.hvm_domain.viridian.guest_os_id.fields.minor); gdprintk(XENLOG_INFO, "\tsp: %x\n", d->arch.hvm_domain.viridian.guest_os_id.fields.service_pack); gdprintk(XENLOG_INFO, "\tbuild: %x\n", d->arch.hvm_domain.viridian.guest_os_id.fields.build_number); } static void dump_hypercall(const struct domain *d) { gdprintk(XENLOG_INFO, "HYPERCALL:\n"); gdprintk(XENLOG_INFO, "\tenabled: %x\n", d->arch.hvm_domain.viridian.hypercall_gpa.fields.enabled); gdprintk(XENLOG_INFO, "\tpfn: %lx\n", (unsigned long)d->arch.hvm_domain.viridian.hypercall_gpa.fields.pfn); } static void dump_apic_assist(const struct vcpu *v) { gdprintk(XENLOG_INFO, "APIC_ASSIST[%d]:\n", v->vcpu_id); gdprintk(XENLOG_INFO, "\tenabled: %x\n", v->arch.hvm_vcpu.viridian.apic_assist.fields.enabled); gdprintk(XENLOG_INFO, "\tpfn: %lx\n", (unsigned long)v->arch.hvm_vcpu.viridian.apic_assist.fields.pfn); } static void enable_hypercall_page(struct domain *d) { unsigned long gmfn = d->arch.hvm_domain.viridian.hypercall_gpa.fields.pfn; struct page_info *page = get_page_from_gfn(d, gmfn, NULL, P2M_ALLOC); uint8_t *p; if ( !page || !get_page_type(page, PGT_writable_page) ) { if ( page ) put_page(page); gdprintk(XENLOG_WARNING, "Bad GMFN %lx (MFN %lx)\n", gmfn, page ? page_to_mfn(page) : INVALID_MFN); return; } p = __map_domain_page(page); /* * We set the bit 31 in %eax (reserved field in the Viridian hypercall * calling convention) to differentiate Xen and Viridian hypercalls. */ *(u8 *)(p + 0) = 0x0d; /* orl $0x80000000, %eax */ *(u32 *)(p + 1) = 0x80000000; *(u8 *)(p + 5) = 0x0f; /* vmcall/vmmcall */ *(u8 *)(p + 6) = 0x01; *(u8 *)(p + 7) = (cpu_has_vmx ? 0xc1 : 0xd9); *(u8 *)(p + 8) = 0xc3; /* ret */ memset(p + 9, 0xcc, PAGE_SIZE - 9); /* int3, int3, ... 
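 *
 * Taken together, the bytes poked in above assemble to:
 *
 *     or    $0x80000000, %eax
 *     vmcall                   ; 0f 01 c1 (vmmcall, 0f 01 d9, on SVM)
 *     ret
 *
 * followed by the int3 padding up to the end of the page.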
*/ unmap_domain_page(p); put_page_and_type(page); } static void initialize_apic_assist(struct vcpu *v) { struct domain *d = v->domain; unsigned long gmfn = v->arch.hvm_vcpu.viridian.apic_assist.fields.pfn; struct page_info *page = get_page_from_gfn(d, gmfn, NULL, P2M_ALLOC); uint8_t *p; /* * We don't yet make use of the APIC assist page but by setting * the CPUID3A_MSR_APIC_ACCESS bit in CPUID leaf 40000003 we are duty * bound to support the MSR. We therefore do just enough to keep windows * happy. * * See http://msdn.microsoft.com/en-us/library/ff538657%28VS.85%29.aspx for * details of how Windows uses the page. */ if ( !page || !get_page_type(page, PGT_writable_page) ) { if ( page ) put_page(page); gdprintk(XENLOG_WARNING, "Bad GMFN %lx (MFN %lx)\n", gmfn, page ? page_to_mfn(page) : INVALID_MFN); return; } p = __map_domain_page(page); *(u32 *)p = 0; unmap_domain_page(p); put_page_and_type(page); } int wrmsr_viridian_regs(uint32_t idx, uint64_t val) { struct vcpu *v = current; struct domain *d = v->domain; if ( !is_viridian_domain(d) ) return 0; switch ( idx ) { case VIRIDIAN_MSR_GUEST_OS_ID: perfc_incr(mshv_wrmsr_osid); d->arch.hvm_domain.viridian.guest_os_id.raw = val; dump_guest_os_id(d); break; case VIRIDIAN_MSR_HYPERCALL: perfc_incr(mshv_wrmsr_hc_page); d->arch.hvm_domain.viridian.hypercall_gpa.raw = val; dump_hypercall(d); if ( d->arch.hvm_domain.viridian.hypercall_gpa.fields.enabled ) enable_hypercall_page(d); break; case VIRIDIAN_MSR_VP_INDEX: perfc_incr(mshv_wrmsr_vp_index); break; case VIRIDIAN_MSR_EOI: perfc_incr(mshv_wrmsr_eoi); vlapic_EOI_set(vcpu_vlapic(v)); break; case VIRIDIAN_MSR_ICR: { u32 eax = (u32)val, edx = (u32)(val >> 32); struct vlapic *vlapic = vcpu_vlapic(v); perfc_incr(mshv_wrmsr_icr); eax &= ~(1 << 12); edx &= 0xff000000; vlapic_set_reg(vlapic, APIC_ICR2, edx); vlapic_ipi(vlapic, eax, edx); vlapic_set_reg(vlapic, APIC_ICR, eax); break; } case VIRIDIAN_MSR_TPR: perfc_incr(mshv_wrmsr_tpr); vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI, (uint8_t)val); break; case VIRIDIAN_MSR_APIC_ASSIST: perfc_incr(mshv_wrmsr_apic_msr); v->arch.hvm_vcpu.viridian.apic_assist.raw = val; dump_apic_assist(v); if (v->arch.hvm_vcpu.viridian.apic_assist.fields.enabled) initialize_apic_assist(v); break; default: return 0; } return 1; } int rdmsr_viridian_regs(uint32_t idx, uint64_t *val) { struct vcpu *v = current; struct domain *d = v->domain; if ( !is_viridian_domain(d) ) return 0; switch ( idx ) { case VIRIDIAN_MSR_GUEST_OS_ID: perfc_incr(mshv_rdmsr_osid); *val = d->arch.hvm_domain.viridian.guest_os_id.raw; break; case VIRIDIAN_MSR_HYPERCALL: perfc_incr(mshv_rdmsr_hc_page); *val = d->arch.hvm_domain.viridian.hypercall_gpa.raw; break; case VIRIDIAN_MSR_VP_INDEX: perfc_incr(mshv_rdmsr_vp_index); *val = v->vcpu_id; break; case VIRIDIAN_MSR_TSC_FREQUENCY: perfc_incr(mshv_rdmsr_tsc_frequency); *val = (uint64_t)d->arch.tsc_khz * 1000ull; break; case VIRIDIAN_MSR_APIC_FREQUENCY: perfc_incr(mshv_rdmsr_apic_frequency); *val = 1000000000ull / APIC_BUS_CYCLE_NS; break; case VIRIDIAN_MSR_ICR: perfc_incr(mshv_rdmsr_icr); *val = (((uint64_t)vlapic_get_reg(vcpu_vlapic(v), APIC_ICR2) << 32) | vlapic_get_reg(vcpu_vlapic(v), APIC_ICR)); break; case VIRIDIAN_MSR_TPR: perfc_incr(mshv_rdmsr_tpr); *val = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI); break; case VIRIDIAN_MSR_APIC_ASSIST: perfc_incr(mshv_rdmsr_apic_msr); *val = v->arch.hvm_vcpu.viridian.apic_assist.raw; break; default: return 0; } return 1; } int viridian_hypercall(struct cpu_user_regs *regs) { int mode = hvm_guest_x86_mode(current); unsigned 
long input_params_gpa, output_params_gpa; uint16_t status = HV_STATUS_SUCCESS; union hypercall_input { uint64_t raw; struct { uint16_t call_code; uint16_t rsvd1; unsigned rep_count:12; unsigned rsvd2:4; unsigned rep_start:12; unsigned rsvd3:4; }; } input; union hypercall_output { uint64_t raw; struct { uint16_t result; uint16_t rsvd1; unsigned rep_complete:12; unsigned rsvd2:20; }; } output = { 0 }; ASSERT(is_viridian_domain(current->domain)); switch ( mode ) { case 8: input.raw = regs->rcx; input_params_gpa = regs->rdx; output_params_gpa = regs->r8; break; case 4: input.raw = ((uint64_t)regs->edx << 32) | regs->eax; input_params_gpa = ((uint64_t)regs->ebx << 32) | regs->ecx; output_params_gpa = ((uint64_t)regs->edi << 32) | regs->esi; break; default: goto out; } switch ( input.call_code ) { case HvNotifyLongSpinWait: perfc_incr(mshv_call_long_wait); do_sched_op_compat(SCHEDOP_yield, 0); status = HV_STATUS_SUCCESS; break; default: status = HV_STATUS_INVALID_HYPERCALL_CODE; break; } out: output.result = status; switch (mode) { case 8: regs->rax = output.raw; break; default: regs->edx = output.raw >> 32; regs->eax = output.raw; break; } return HVM_HCALL_completed; } static int viridian_save_domain_ctxt(struct domain *d, hvm_domain_context_t *h) { struct hvm_viridian_domain_context ctxt; if ( !is_viridian_domain(d) ) return 0; ctxt.hypercall_gpa = d->arch.hvm_domain.viridian.hypercall_gpa.raw; ctxt.guest_os_id = d->arch.hvm_domain.viridian.guest_os_id.raw; return (hvm_save_entry(VIRIDIAN_DOMAIN, 0, h, &ctxt) != 0); } static int viridian_load_domain_ctxt(struct domain *d, hvm_domain_context_t *h) { struct hvm_viridian_domain_context ctxt; if ( hvm_load_entry(VIRIDIAN_DOMAIN, h, &ctxt) != 0 ) return -EINVAL; d->arch.hvm_domain.viridian.hypercall_gpa.raw = ctxt.hypercall_gpa; d->arch.hvm_domain.viridian.guest_os_id.raw = ctxt.guest_os_id; return 0; } HVM_REGISTER_SAVE_RESTORE(VIRIDIAN_DOMAIN, viridian_save_domain_ctxt, viridian_load_domain_ctxt, 1, HVMSR_PER_DOM); static int viridian_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; if ( !is_viridian_domain(d) ) return 0; for_each_vcpu( d, v ) { struct hvm_viridian_vcpu_context ctxt; ctxt.apic_assist = v->arch.hvm_vcpu.viridian.apic_assist.raw; if ( hvm_save_entry(VIRIDIAN_VCPU, v->vcpu_id, h, &ctxt) != 0 ) return 1; } return 0; } static int viridian_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h) { int vcpuid; struct vcpu *v; struct hvm_viridian_vcpu_context ctxt; vcpuid = hvm_load_instance(h); if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n", d->domain_id, vcpuid); return -EINVAL; } if ( hvm_load_entry(VIRIDIAN_VCPU, h, &ctxt) != 0 ) return -EINVAL; v->arch.hvm_vcpu.viridian.apic_assist.raw = ctxt.apic_assist; return 0; } HVM_REGISTER_SAVE_RESTORE(VIRIDIAN_VCPU, viridian_save_vcpu_ctxt, viridian_load_vcpu_ctxt, 1, HVMSR_PER_VCPU); xen-4.4.0/xen/arch/x86/hvm/vmx/0000775000175000017500000000000012307313555014203 5ustar smbsmbxen-4.4.0/xen/arch/x86/hvm/vmx/vpmu_core2.c0000664000175000017500000007213412307313555016437 0ustar smbsmb/* * vpmu_core2.c: CORE 2 specific PMU virtualization for HVM domain. * * Copyright (c) 2007, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. 
* * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Author: Haitao Shan */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * See Intel SDM Vol 2a Instruction Set Reference chapter 3 for CPUID * instruction. * cpuid 0xa - Architectural Performance Monitoring Leaf * Register eax */ #define PMU_VERSION_SHIFT 0 /* Version ID */ #define PMU_VERSION_BITS 8 /* 8 bits 0..7 */ #define PMU_VERSION_MASK (((1 << PMU_VERSION_BITS) - 1) << PMU_VERSION_SHIFT) #define PMU_GENERAL_NR_SHIFT 8 /* Number of general pmu registers */ #define PMU_GENERAL_NR_BITS 8 /* 8 bits 8..15 */ #define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) << PMU_GENERAL_NR_SHIFT) #define PMU_GENERAL_WIDTH_SHIFT 16 /* Width of general pmu registers */ #define PMU_GENERAL_WIDTH_BITS 8 /* 8 bits 16..23 */ #define PMU_GENERAL_WIDTH_MASK (((1 << PMU_GENERAL_WIDTH_BITS) - 1) << PMU_GENERAL_WIDTH_SHIFT) /* Register edx */ #define PMU_FIXED_NR_SHIFT 0 /* Number of fixed pmu registers */ #define PMU_FIXED_NR_BITS 5 /* 5 bits 0..4 */ #define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) -1) << PMU_FIXED_NR_SHIFT) #define PMU_FIXED_WIDTH_SHIFT 5 /* Width of fixed pmu registers */ #define PMU_FIXED_WIDTH_BITS 8 /* 8 bits 5..12 */ #define PMU_FIXED_WIDTH_MASK (((1 << PMU_FIXED_WIDTH_BITS) -1) << PMU_FIXED_WIDTH_SHIFT) /* Alias registers (0x4c1) for full-width writes to PMCs */ #define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_A_PERFCTR0)) static bool_t __read_mostly full_width_write; /* * QUIRK to workaround an issue on various family 6 cpus. * The issue leads to endless PMC interrupt loops on the processor. * If the interrupt handler is running and a pmc reaches the value 0, this * value remains forever and it triggers immediately a new interrupt after * finishing the handler. * A workaround is to read all flagged counters and if the value is 0 write * 1 (or another value != 0) into it. * There exist no errata and the real cause of this behaviour is unknown. */ bool_t __read_mostly is_pmc_quirk; static void check_pmc_quirk(void) { if ( current_cpu_data.x86 == 6 ) is_pmc_quirk = 1; else is_pmc_quirk = 0; } static int core2_get_pmc_count(void); static void handle_pmc_quirk(u64 msr_content) { int num_gen_pmc = core2_get_pmc_count(); int num_fix_pmc = 3; int i; u64 val; if ( !is_pmc_quirk ) return; val = msr_content; for ( i = 0; i < num_gen_pmc; i++ ) { if ( val & 0x1 ) { u64 cnt; rdmsrl(MSR_P6_PERFCTR0 + i, cnt); if ( cnt == 0 ) wrmsrl(MSR_P6_PERFCTR0 + i, 1); } val >>= 1; } val = msr_content >> 32; for ( i = 0; i < num_fix_pmc; i++ ) { if ( val & 0x1 ) { u64 cnt; rdmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, cnt); if ( cnt == 0 ) wrmsrl(MSR_CORE_PERF_FIXED_CTR0 + i, 1); } val >>= 1; } } static const u32 core2_fix_counters_msr[] = { MSR_CORE_PERF_FIXED_CTR0, MSR_CORE_PERF_FIXED_CTR1, MSR_CORE_PERF_FIXED_CTR2 }; /* * MSR_CORE_PERF_FIXED_CTR_CTRL contains the configuration of all fixed * counters. 4 bits for every counter. 
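 *
 * Counter i's field can thus be extracted as
 *
 *     (ctrl >> (i * FIXED_CTR_CTRL_BITS)) & FIXED_CTR_CTRL_MASK
 *
 * which is how core2_vpmu_dump() walks the register further down.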
*/ #define FIXED_CTR_CTRL_BITS 4 #define FIXED_CTR_CTRL_MASK ((1 << FIXED_CTR_CTRL_BITS) - 1) /* The index into the core2_ctrls_msr[] of this MSR used in core2_vpmu_dump() */ #define MSR_CORE_PERF_FIXED_CTR_CTRL_IDX 0 /* Core 2 Non-architectual Performance Control MSRs. */ static const u32 core2_ctrls_msr[] = { MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA }; struct pmumsr { unsigned int num; const u32 *msr; }; static const struct pmumsr core2_fix_counters = { VPMU_CORE2_NUM_FIXED, core2_fix_counters_msr }; static const struct pmumsr core2_ctrls = { VPMU_CORE2_NUM_CTRLS, core2_ctrls_msr }; static int arch_pmc_cnt; /* * Read the number of general counters via CPUID.EAX[0xa].EAX[8..15] */ static int core2_get_pmc_count(void) { u32 eax, ebx, ecx, edx; if ( arch_pmc_cnt == 0 ) { cpuid(0xa, &eax, &ebx, &ecx, &edx); arch_pmc_cnt = (eax & PMU_GENERAL_NR_MASK) >> PMU_GENERAL_NR_SHIFT; } return arch_pmc_cnt; } static u64 core2_calc_intial_glb_ctrl_msr(void) { int arch_pmc_bits = (1 << core2_get_pmc_count()) - 1; u64 fix_pmc_bits = (1 << 3) - 1; return ((fix_pmc_bits << 32) | arch_pmc_bits); } /* edx bits 5-12: Bit width of fixed-function performance counters */ static int core2_get_bitwidth_fix_count(void) { u32 eax, ebx, ecx, edx; cpuid(0xa, &eax, &ebx, &ecx, &edx); return ((edx & PMU_FIXED_WIDTH_MASK) >> PMU_FIXED_WIDTH_SHIFT); } static int is_core2_vpmu_msr(u32 msr_index, int *type, int *index) { int i; u32 msr_index_pmc; for ( i = 0; i < core2_fix_counters.num; i++ ) { if ( core2_fix_counters.msr[i] == msr_index ) { *type = MSR_TYPE_COUNTER; *index = i; return 1; } } for ( i = 0; i < core2_ctrls.num; i++ ) { if ( core2_ctrls.msr[i] == msr_index ) { *type = MSR_TYPE_CTRL; *index = i; return 1; } } if ( (msr_index == MSR_CORE_PERF_GLOBAL_CTRL) || (msr_index == MSR_CORE_PERF_GLOBAL_STATUS) || (msr_index == MSR_CORE_PERF_GLOBAL_OVF_CTRL) ) { *type = MSR_TYPE_GLOBAL; return 1; } msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK; if ( (msr_index_pmc >= MSR_IA32_PERFCTR0) && (msr_index_pmc < (MSR_IA32_PERFCTR0 + core2_get_pmc_count())) ) { *type = MSR_TYPE_ARCH_COUNTER; *index = msr_index_pmc - MSR_IA32_PERFCTR0; return 1; } if ( (msr_index >= MSR_P6_EVNTSEL0) && (msr_index < (MSR_P6_EVNTSEL0 + core2_get_pmc_count())) ) { *type = MSR_TYPE_ARCH_CTRL; *index = msr_index - MSR_P6_EVNTSEL0; return 1; } return 0; } static void core2_vpmu_set_msr_bitmap(unsigned long *msr_bitmap) { int i; /* Allow Read/Write PMU Counters MSR Directly. */ for ( i = 0; i < core2_fix_counters.num; i++ ) { clear_bit(msraddr_to_bitpos(core2_fix_counters.msr[i]), msr_bitmap); clear_bit(msraddr_to_bitpos(core2_fix_counters.msr[i]), msr_bitmap + 0x800/BYTES_PER_LONG); } for ( i = 0; i < core2_get_pmc_count(); i++ ) { clear_bit(msraddr_to_bitpos(MSR_IA32_PERFCTR0+i), msr_bitmap); clear_bit(msraddr_to_bitpos(MSR_IA32_PERFCTR0+i), msr_bitmap + 0x800/BYTES_PER_LONG); if ( full_width_write ) { clear_bit(msraddr_to_bitpos(MSR_IA32_A_PERFCTR0 + i), msr_bitmap); clear_bit(msraddr_to_bitpos(MSR_IA32_A_PERFCTR0 + i), msr_bitmap + 0x800/BYTES_PER_LONG); } } /* Allow Read PMU Non-global Controls Directly. 
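 *
 * Unlike the counters above, only the read half of the MSR bitmap is
 * cleared for the control MSRs below: guest writes still trap so that
 * core2_vpmu_do_wrmsr() can validate them.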
*/ for ( i = 0; i < core2_ctrls.num; i++ ) clear_bit(msraddr_to_bitpos(core2_ctrls.msr[i]), msr_bitmap); for ( i = 0; i < core2_get_pmc_count(); i++ ) clear_bit(msraddr_to_bitpos(MSR_P6_EVNTSEL0+i), msr_bitmap); } static void core2_vpmu_unset_msr_bitmap(unsigned long *msr_bitmap) { int i; for ( i = 0; i < core2_fix_counters.num; i++ ) { set_bit(msraddr_to_bitpos(core2_fix_counters.msr[i]), msr_bitmap); set_bit(msraddr_to_bitpos(core2_fix_counters.msr[i]), msr_bitmap + 0x800/BYTES_PER_LONG); } for ( i = 0; i < core2_get_pmc_count(); i++ ) { set_bit(msraddr_to_bitpos(MSR_IA32_PERFCTR0+i), msr_bitmap); set_bit(msraddr_to_bitpos(MSR_IA32_PERFCTR0+i), msr_bitmap + 0x800/BYTES_PER_LONG); if ( full_width_write ) { set_bit(msraddr_to_bitpos(MSR_IA32_A_PERFCTR0 + i), msr_bitmap); set_bit(msraddr_to_bitpos(MSR_IA32_A_PERFCTR0 + i), msr_bitmap + 0x800/BYTES_PER_LONG); } } for ( i = 0; i < core2_ctrls.num; i++ ) set_bit(msraddr_to_bitpos(core2_ctrls.msr[i]), msr_bitmap); for ( i = 0; i < core2_get_pmc_count(); i++ ) set_bit(msraddr_to_bitpos(MSR_P6_EVNTSEL0+i), msr_bitmap); } static inline void __core2_vpmu_save(struct vcpu *v) { int i; struct core2_vpmu_context *core2_vpmu_cxt = vcpu_vpmu(v)->context; for ( i = 0; i < core2_fix_counters.num; i++ ) rdmsrl(core2_fix_counters.msr[i], core2_vpmu_cxt->fix_counters[i]); for ( i = 0; i < core2_get_pmc_count(); i++ ) rdmsrl(MSR_IA32_PERFCTR0+i, core2_vpmu_cxt->arch_msr_pair[i].counter); } static int core2_vpmu_save(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_SAVE) ) return 0; if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) return 0; __core2_vpmu_save(v); /* Unset PMU MSR bitmap to trap lazy load. */ if ( !vpmu_is_set(vpmu, VPMU_RUNNING) && cpu_has_vmx_msr_bitmap ) core2_vpmu_unset_msr_bitmap(v->arch.hvm_vmx.msr_bitmap); return 1; } static inline void __core2_vpmu_load(struct vcpu *v) { unsigned int i, pmc_start; struct core2_vpmu_context *core2_vpmu_cxt = vcpu_vpmu(v)->context; for ( i = 0; i < core2_fix_counters.num; i++ ) wrmsrl(core2_fix_counters.msr[i], core2_vpmu_cxt->fix_counters[i]); if ( full_width_write ) pmc_start = MSR_IA32_A_PERFCTR0; else pmc_start = MSR_IA32_PERFCTR0; for ( i = 0; i < core2_get_pmc_count(); i++ ) wrmsrl(pmc_start + i, core2_vpmu_cxt->arch_msr_pair[i].counter); for ( i = 0; i < core2_ctrls.num; i++ ) wrmsrl(core2_ctrls.msr[i], core2_vpmu_cxt->ctrls[i]); for ( i = 0; i < core2_get_pmc_count(); i++ ) wrmsrl(MSR_P6_EVNTSEL0+i, core2_vpmu_cxt->arch_msr_pair[i].control); } static void core2_vpmu_load(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) return; __core2_vpmu_load(v); } static int core2_vpmu_alloc_resource(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); struct core2_vpmu_context *core2_vpmu_cxt; struct core2_pmu_enable *pmu_enable; if ( !acquire_pmu_ownership(PMU_OWNER_HVM) ) return 0; wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) ) return 0; if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) ) return 0; vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, core2_calc_intial_glb_ctrl_msr()); pmu_enable = xzalloc_bytes(sizeof(struct core2_pmu_enable) + core2_get_pmc_count() - 1); if ( !pmu_enable ) goto out1; core2_vpmu_cxt = xzalloc_bytes(sizeof(struct core2_vpmu_context) + (core2_get_pmc_count()-1)*sizeof(struct arch_msr_pair)); if ( !core2_vpmu_cxt ) goto out2; core2_vpmu_cxt->pmu_enable = pmu_enable; vpmu->context = (void *)core2_vpmu_cxt; return 1; out2: 
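    /*
     * Allocation-failure unwind: out2 frees the pmu_enable bitmap
     * allocated above, out1 only logs before returning 0.
     */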
xfree(pmu_enable); out1: gdprintk(XENLOG_WARNING, "Insufficient memory for PMU, PMU feature is " "unavailable on domain %d vcpu %d.\n", v->vcpu_id, v->domain->domain_id); return 0; } static void core2_vpmu_save_msr_context(struct vcpu *v, int type, int index, u64 msr_data) { struct core2_vpmu_context *core2_vpmu_cxt = vcpu_vpmu(v)->context; switch ( type ) { case MSR_TYPE_CTRL: core2_vpmu_cxt->ctrls[index] = msr_data; break; case MSR_TYPE_ARCH_CTRL: core2_vpmu_cxt->arch_msr_pair[index].control = msr_data; break; } } static int core2_vpmu_msr_common_check(u32 msr_index, int *type, int *index) { struct vpmu_struct *vpmu = vcpu_vpmu(current); if ( !is_core2_vpmu_msr(msr_index, type, index) ) return 0; if ( unlikely(!vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED)) && (vpmu->context != NULL || !core2_vpmu_alloc_resource(current)) ) return 0; vpmu_set(vpmu, VPMU_CONTEXT_ALLOCATED); /* Do the lazy load staff. */ if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) { __core2_vpmu_load(current); vpmu_set(vpmu, VPMU_CONTEXT_LOADED); if ( cpu_has_vmx_msr_bitmap ) core2_vpmu_set_msr_bitmap(current->arch.hvm_vmx.msr_bitmap); } return 1; } static int core2_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content) { u64 global_ctrl, non_global_ctrl; char pmu_enable = 0; int i, tmp; int type = -1, index = -1; struct vcpu *v = current; struct vpmu_struct *vpmu = vcpu_vpmu(v); struct core2_vpmu_context *core2_vpmu_cxt = NULL; if ( !core2_vpmu_msr_common_check(msr, &type, &index) ) { /* Special handling for BTS */ if ( msr == MSR_IA32_DEBUGCTLMSR ) { uint64_t supported = IA32_DEBUGCTLMSR_TR | IA32_DEBUGCTLMSR_BTS | IA32_DEBUGCTLMSR_BTINT; if ( cpu_has(¤t_cpu_data, X86_FEATURE_DSCPL) ) supported |= IA32_DEBUGCTLMSR_BTS_OFF_OS | IA32_DEBUGCTLMSR_BTS_OFF_USR; if ( msr_content & supported ) { if ( vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) ) return 1; gdprintk(XENLOG_WARNING, "Debug Store is not supported on this cpu\n"); hvm_inject_hw_exception(TRAP_gp_fault, 0); return 0; } } return 0; } core2_vpmu_cxt = vpmu->context; switch ( msr ) { case MSR_CORE_PERF_GLOBAL_OVF_CTRL: core2_vpmu_cxt->global_ovf_status &= ~msr_content; return 1; case MSR_CORE_PERF_GLOBAL_STATUS: gdprintk(XENLOG_INFO, "Can not write readonly MSR: " "MSR_PERF_GLOBAL_STATUS(0x38E)!\n"); hvm_inject_hw_exception(TRAP_gp_fault, 0); return 1; case MSR_IA32_PEBS_ENABLE: if ( msr_content & 1 ) gdprintk(XENLOG_WARNING, "Guest is trying to enable PEBS, " "which is not supported.\n"); return 1; case MSR_IA32_DS_AREA: if ( vpmu_is_set(vpmu, VPMU_CPU_HAS_DS) ) { if ( !is_canonical_address(msr_content) ) { gdprintk(XENLOG_WARNING, "Illegal address for IA32_DS_AREA: %#" PRIx64 "x\n", msr_content); hvm_inject_hw_exception(TRAP_gp_fault, 0); return 1; } core2_vpmu_cxt->pmu_enable->ds_area_enable = msr_content ? 1 : 0; break; } gdprintk(XENLOG_WARNING, "Guest setting of DTS is ignored.\n"); return 1; case MSR_CORE_PERF_GLOBAL_CTRL: global_ctrl = msr_content; for ( i = 0; i < core2_get_pmc_count(); i++ ) { rdmsrl(MSR_P6_EVNTSEL0+i, non_global_ctrl); core2_vpmu_cxt->pmu_enable->arch_pmc_enable[i] = global_ctrl & (non_global_ctrl >> 22) & 1; global_ctrl >>= 1; } rdmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, non_global_ctrl); global_ctrl = msr_content >> 32; for ( i = 0; i < core2_fix_counters.num; i++ ) { core2_vpmu_cxt->pmu_enable->fixed_ctr_enable[i] = (global_ctrl & 1) & ((non_global_ctrl & 0x3)? 
1: 0); non_global_ctrl >>= FIXED_CTR_CTRL_BITS; global_ctrl >>= 1; } break; case MSR_CORE_PERF_FIXED_CTR_CTRL: non_global_ctrl = msr_content; vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl); global_ctrl >>= 32; for ( i = 0; i < core2_fix_counters.num; i++ ) { core2_vpmu_cxt->pmu_enable->fixed_ctr_enable[i] = (global_ctrl & 1) & ((non_global_ctrl & 0x3)? 1: 0); non_global_ctrl >>= 4; global_ctrl >>= 1; } break; default: tmp = msr - MSR_P6_EVNTSEL0; vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl); if ( tmp >= 0 && tmp < core2_get_pmc_count() ) core2_vpmu_cxt->pmu_enable->arch_pmc_enable[tmp] = (global_ctrl >> tmp) & (msr_content >> 22) & 1; } for ( i = 0; i < core2_fix_counters.num; i++ ) pmu_enable |= core2_vpmu_cxt->pmu_enable->fixed_ctr_enable[i]; for ( i = 0; i < core2_get_pmc_count(); i++ ) pmu_enable |= core2_vpmu_cxt->pmu_enable->arch_pmc_enable[i]; pmu_enable |= core2_vpmu_cxt->pmu_enable->ds_area_enable; if ( pmu_enable ) vpmu_set(vpmu, VPMU_RUNNING); else vpmu_reset(vpmu, VPMU_RUNNING); /* Setup LVTPC in local apic */ if ( vpmu_is_set(vpmu, VPMU_RUNNING) && is_vlapic_lvtpc_enabled(vcpu_vlapic(v)) ) { apic_write_around(APIC_LVTPC, PMU_APIC_VECTOR); vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR; } else { apic_write_around(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED); vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR | APIC_LVT_MASKED; } core2_vpmu_save_msr_context(v, type, index, msr_content); if ( type != MSR_TYPE_GLOBAL ) { u64 mask; int inject_gp = 0; switch ( type ) { case MSR_TYPE_ARCH_CTRL: /* MSR_P6_EVNTSEL[0,...] */ mask = ~((1ull << 32) - 1); if (msr_content & mask) inject_gp = 1; break; case MSR_TYPE_CTRL: /* IA32_FIXED_CTR_CTRL */ if ( msr == MSR_IA32_DS_AREA ) break; /* 4 bits per counter, currently 3 fixed counters implemented. 
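 * With the three fixed counters implemented here the writable field is
 * bits 11:0, so the mask below flags any set bit at position 12 or
 * above and a #GP is injected instead of performing the write.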
*/ mask = ~((1ull << (VPMU_CORE2_NUM_FIXED * FIXED_CTR_CTRL_BITS)) - 1); if (msr_content & mask) inject_gp = 1; break; case MSR_TYPE_COUNTER: /* IA32_FIXED_CTR[0-2] */ mask = ~((1ull << core2_get_bitwidth_fix_count()) - 1); if (msr_content & mask) inject_gp = 1; break; } if (inject_gp) hvm_inject_hw_exception(TRAP_gp_fault, 0); else wrmsrl(msr, msr_content); } else vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content); return 1; } static int core2_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content) { int type = -1, index = -1; struct vcpu *v = current; struct vpmu_struct *vpmu = vcpu_vpmu(v); struct core2_vpmu_context *core2_vpmu_cxt = NULL; if ( core2_vpmu_msr_common_check(msr, &type, &index) ) { core2_vpmu_cxt = vpmu->context; switch ( msr ) { case MSR_CORE_PERF_GLOBAL_OVF_CTRL: *msr_content = 0; break; case MSR_CORE_PERF_GLOBAL_STATUS: *msr_content = core2_vpmu_cxt->global_ovf_status; break; case MSR_CORE_PERF_GLOBAL_CTRL: vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content); break; default: rdmsrl(msr, *msr_content); } } else { /* Extension for BTS */ if ( msr == MSR_IA32_MISC_ENABLE ) { if ( vpmu_is_set(vpmu, VPMU_CPU_HAS_BTS) ) *msr_content &= ~MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; } else return 0; } return 1; } static void core2_vpmu_do_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { if (input == 0x1) { struct vpmu_struct *vpmu = vcpu_vpmu(current); if ( vpmu_is_set(vpmu, VPMU_CPU_HAS_DS) ) { /* Switch on the 'Debug Store' feature in CPUID.EAX[1]:EDX[21] */ *edx |= cpufeat_mask(X86_FEATURE_DS); if ( cpu_has(¤t_cpu_data, X86_FEATURE_DTES64) ) *ecx |= cpufeat_mask(X86_FEATURE_DTES64); if ( cpu_has(¤t_cpu_data, X86_FEATURE_DSCPL) ) *ecx |= cpufeat_mask(X86_FEATURE_DSCPL); } } } /* Dump vpmu info on console, called in the context of keyhandler 'q'. */ static void core2_vpmu_dump(const struct vcpu *v) { const struct vpmu_struct *vpmu = vcpu_vpmu(v); int i, num; const struct core2_vpmu_context *core2_vpmu_cxt = NULL; u64 val; if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) ) return; if ( !vpmu_is_set(vpmu, VPMU_RUNNING) ) { if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) printk(" vPMU loaded\n"); else printk(" vPMU allocated\n"); return; } printk(" vPMU running\n"); core2_vpmu_cxt = vpmu->context; num = core2_get_pmc_count(); /* Print the contents of the counter and its configuration msr. */ for ( i = 0; i < num; i++ ) { const struct arch_msr_pair *msr_pair = core2_vpmu_cxt->arch_msr_pair; if ( core2_vpmu_cxt->pmu_enable->arch_pmc_enable[i] ) printk(" general_%d: 0x%016lx ctrl: 0x%016lx\n", i, msr_pair[i].counter, msr_pair[i].control); } /* * The configuration of the fixed counter is 4 bits each in the * MSR_CORE_PERF_FIXED_CTR_CTRL. 
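 * The loop below walks that register four bits at a time, printing each
 * enabled fixed counter alongside its control field and shifting val
 * right by FIXED_CTR_CTRL_BITS per iteration.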
*/ val = core2_vpmu_cxt->ctrls[MSR_CORE_PERF_FIXED_CTR_CTRL_IDX]; for ( i = 0; i < core2_fix_counters.num; i++ ) { if ( core2_vpmu_cxt->pmu_enable->fixed_ctr_enable[i] ) printk(" fixed_%d: 0x%016lx ctrl: %#lx\n", i, core2_vpmu_cxt->fix_counters[i], val & FIXED_CTR_CTRL_MASK); val >>= FIXED_CTR_CTRL_BITS; } } static int core2_vpmu_do_interrupt(struct cpu_user_regs *regs) { struct vcpu *v = current; u64 msr_content; struct vpmu_struct *vpmu = vcpu_vpmu(v); struct core2_vpmu_context *core2_vpmu_cxt = vpmu->context; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, msr_content); if ( msr_content ) { if ( is_pmc_quirk ) handle_pmc_quirk(msr_content); core2_vpmu_cxt->global_ovf_status |= msr_content; msr_content = 0xC000000700000000 | ((1 << core2_get_pmc_count()) - 1); wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, msr_content); } else { /* No PMC overflow but perhaps a Trace Message interrupt. */ __vmread(GUEST_IA32_DEBUGCTL, &msr_content); if ( !(msr_content & IA32_DEBUGCTLMSR_TR) ) return 0; } /* HW sets the MASK bit when performance counter interrupt occurs*/ vpmu->hw_lapic_lvtpc = apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED; apic_write_around(APIC_LVTPC, vpmu->hw_lapic_lvtpc); return 1; } static int core2_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags) { struct vpmu_struct *vpmu = vcpu_vpmu(v); u64 msr_content; struct cpuinfo_x86 *c = ¤t_cpu_data; if ( !(vpmu_flags & VPMU_BOOT_BTS) ) goto func_out; /* Check the 'Debug Store' feature in the CPUID.EAX[1]:EDX[21] */ if ( cpu_has(c, X86_FEATURE_DS) ) { if ( !cpu_has(c, X86_FEATURE_DTES64) ) { printk(XENLOG_G_WARNING "CPU doesn't support 64-bit DS Area" " - Debug Store disabled for d%d:v%d\n", v->domain->domain_id, v->vcpu_id); goto func_out; } vpmu_set(vpmu, VPMU_CPU_HAS_DS); rdmsrl(MSR_IA32_MISC_ENABLE, msr_content); if ( msr_content & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL ) { /* If BTS_UNAVAIL is set reset the DS feature. */ vpmu_reset(vpmu, VPMU_CPU_HAS_DS); printk(XENLOG_G_WARNING "CPU has set BTS_UNAVAIL" " - Debug Store disabled for d%d:v%d\n", v->domain->domain_id, v->vcpu_id); } else { vpmu_set(vpmu, VPMU_CPU_HAS_BTS); if ( !cpu_has(c, X86_FEATURE_DSCPL) ) printk(XENLOG_G_INFO "vpmu: CPU doesn't support CPL-Qualified BTS\n"); printk("******************************************************\n"); printk("** WARNING: Emulation of BTS Feature is switched on **\n"); printk("** Using this processor feature in a virtualized **\n"); printk("** environment is not 100%% safe. **\n"); printk("** Setting the DS buffer address with wrong values **\n"); printk("** may lead to hypervisor hangs or crashes. **\n"); printk("** It is NOT recommended for production use! 
**\n"); printk("******************************************************\n"); } } func_out: check_pmc_quirk(); return 0; } static void core2_vpmu_destroy(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); struct core2_vpmu_context *core2_vpmu_cxt = vpmu->context; if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) ) return; xfree(core2_vpmu_cxt->pmu_enable); xfree(vpmu->context); if ( cpu_has_vmx_msr_bitmap ) core2_vpmu_unset_msr_bitmap(v->arch.hvm_vmx.msr_bitmap); release_pmu_ownship(PMU_OWNER_HVM); vpmu_reset(vpmu, VPMU_CONTEXT_ALLOCATED); } struct arch_vpmu_ops core2_vpmu_ops = { .do_wrmsr = core2_vpmu_do_wrmsr, .do_rdmsr = core2_vpmu_do_rdmsr, .do_interrupt = core2_vpmu_do_interrupt, .do_cpuid = core2_vpmu_do_cpuid, .arch_vpmu_destroy = core2_vpmu_destroy, .arch_vpmu_save = core2_vpmu_save, .arch_vpmu_load = core2_vpmu_load, .arch_vpmu_dump = core2_vpmu_dump }; static void core2_no_vpmu_do_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* * As in this case the vpmu is not enabled reset some bits in the * architectural performance monitoring related part. */ if ( input == 0xa ) { *eax &= ~PMU_VERSION_MASK; *eax &= ~PMU_GENERAL_NR_MASK; *eax &= ~PMU_GENERAL_WIDTH_MASK; *edx &= ~PMU_FIXED_NR_MASK; *edx &= ~PMU_FIXED_WIDTH_MASK; } } /* * If its a vpmu msr set it to 0. */ static int core2_no_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content) { int type = -1, index = -1; if ( !is_core2_vpmu_msr(msr, &type, &index) ) return 0; *msr_content = 0; return 1; } /* * These functions are used in case vpmu is not enabled. */ struct arch_vpmu_ops core2_no_vpmu_ops = { .do_rdmsr = core2_no_vpmu_do_rdmsr, .do_cpuid = core2_no_vpmu_do_cpuid, }; int vmx_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags) { struct vpmu_struct *vpmu = vcpu_vpmu(v); uint8_t family = current_cpu_data.x86; uint8_t cpu_model = current_cpu_data.x86_model; int ret = 0; vpmu->arch_vpmu_ops = &core2_no_vpmu_ops; if ( !vpmu_flags ) return 0; if ( family == 6 ) { u64 caps; rdmsrl(MSR_IA32_PERF_CAPABILITIES, caps); full_width_write = (caps >> 13) & 1; switch ( cpu_model ) { /* Core2: */ case 0x0f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ case 0x16: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ case 0x17: /* 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ case 0x1d: /* six-core 45 nm xeon "Dunnington" */ case 0x2a: /* SandyBridge */ case 0x2d: /* SandyBridge, "Romley-EP" */ /* Nehalem: */ case 0x1a: /* 45 nm nehalem, "Bloomfield" */ case 0x1e: /* 45 nm nehalem, "Lynnfield", "Clarksfield", "Jasper Forest" */ case 0x2e: /* 45 nm nehalem-ex, "Beckton" */ /* Westmere: */ case 0x25: /* 32 nm nehalem, "Clarkdale", "Arrandale" */ case 0x2c: /* 32 nm nehalem, "Gulftown", "Westmere-EP" */ case 0x27: /* 32 nm Westmere-EX */ case 0x3a: /* IvyBridge */ case 0x3e: /* IvyBridge EP */ /* Haswell: */ case 0x3c: case 0x3f: case 0x45: case 0x46: ret = core2_vpmu_initialise(v, vpmu_flags); if ( !ret ) vpmu->arch_vpmu_ops = &core2_vpmu_ops; return ret; } } printk("VPMU: Initialization failed. 
" "Intel processor family %d model %d has not " "been supported\n", family, cpu_model); return -EINVAL; } xen-4.4.0/xen/arch/x86/hvm/vmx/Makefile0000664000175000017500000000017612307313555015647 0ustar smbsmbobj-bin-y += entry.o obj-y += intr.o obj-y += realmode.o obj-y += vmcs.o obj-y += vmx.o obj-y += vpmu_core2.o obj-y += vvmx.o xen-4.4.0/xen/arch/x86/hvm/vmx/intr.c0000664000175000017500000003054612307313555015333 0ustar smbsmb/* * intr.c: handling I/O, interrupts related VMX entry/exit * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2004-2007, XenSource Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * A few notes on virtual NMI and INTR delivery, and interactions with * interruptibility states: * * We can only inject an ExtInt if EFLAGS.IF = 1 and no blocking by * STI nor MOV SS. Otherwise the VM entry fails. The 'virtual interrupt * pending' control causes a VM exit when all these checks succeed. It will * exit immediately after VM entry if the checks succeed at that point. * * We can only inject an NMI if no blocking by MOV SS (also, depending on * implementation, if no blocking by STI). If pin-based 'virtual NMIs' * control is specified then the NMI-blocking interruptibility flag is * also checked. The 'virtual NMI pending' control (available only in * conjunction with 'virtual NMIs') causes a VM exit when all these checks * succeed. It will exit immediately after VM entry if the checks succeed * at that point. * * Because a processor may or may not check blocking-by-STI when injecting * a virtual NMI, it will be necessary to convert that to block-by-MOV-SS * before specifying the 'virtual NMI pending' control. Otherwise we could * enter an infinite loop where we check blocking-by-STI in software and * thus delay delivery of a virtual NMI, but the processor causes immediate * VM exit because it does not check blocking-by-STI. * * Injecting a virtual NMI sets the NMI-blocking interruptibility flag only * if the 'virtual NMIs' control is set. Injecting *any* kind of event clears * the STI- and MOV-SS-blocking interruptibility-state flags. */ static void enable_intr_window(struct vcpu *v, struct hvm_intack intack) { u32 ctl = CPU_BASED_VIRTUAL_INTR_PENDING; ASSERT(intack.source != hvm_intsrc_none); if ( unlikely(tb_init_done) ) { unsigned long intr; __vmread(VM_ENTRY_INTR_INFO, &intr); HVMTRACE_3D(INTR_WINDOW, intack.vector, intack.source, (intr & INTR_INFO_VALID_MASK) ? intr & 0xff : -1); } if ( (intack.source == hvm_intsrc_nmi) && cpu_has_vmx_vnmi ) { /* * We set MOV-SS blocking in lieu of STI blocking when delivering an * NMI. This is because it is processor-specific whether STI-blocking * blocks NMIs. 
Hence we *must* check for STI-blocking on NMI delivery * (otherwise vmentry will fail on processors that check for STI- * blocking) but if the processor does not check for STI-blocking then * we may immediately vmexit and hance make no progress! * (see SDM 3B 21.3, "Other Causes of VM Exits"). */ unsigned long intr_shadow; __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow); if ( intr_shadow & VMX_INTR_SHADOW_STI ) { /* Having both STI-blocking and MOV-SS-blocking fails vmentry. */ intr_shadow &= ~VMX_INTR_SHADOW_STI; intr_shadow |= VMX_INTR_SHADOW_MOV_SS; __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow); } ctl = CPU_BASED_VIRTUAL_NMI_PENDING; } if ( !(v->arch.hvm_vmx.exec_control & ctl) ) { v->arch.hvm_vmx.exec_control |= ctl; vmx_update_cpu_exec_control(v); } } /* * Injecting interrupts for nested virtualization * * When injecting virtual interrupts (originated from L0), there are * two major possibilities, within L1 context and within L2 context * 1. L1 context (in_nesting == 0) * Everything is the same as without nested, check RFLAGS.IF to * see if the injection can be done, using VMCS to inject the * interrupt * * 2. L2 context (in_nesting == 1) * Causes a virtual VMExit, RFLAGS.IF is ignored, whether to ack * irq according to intr_ack_on_exit, shouldn't block normally, * except for: * a. context transition * interrupt needs to be blocked at virtual VMEntry time * b. L2 idtv reinjection * if L2 idtv is handled within L0 (e.g. L0 shadow page fault), * it needs to be reinjected without exiting to L1, interrupt * injection should be blocked as well at this point. * * Unfortunately, interrupt blocking in L2 won't work with simple * intr_window_open (which depends on L2's IF). To solve this, * the following algorithm can be used: * v->arch.hvm_vmx.exec_control.VIRTUAL_INTR_PENDING now denotes * only L0 control, physical control may be different from it. * - if in L1, it behaves normally, intr window is written * to physical control as it is * - if in L2, replace it to MTF (or NMI window) if possible * - if MTF/NMI window is not used, intr window can still be * used but may have negative impact on interrupt performance. */ enum hvm_intblk nvmx_intr_blocked(struct vcpu *v) { int r = hvm_intblk_none; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); if ( nestedhvm_vcpu_in_guestmode(v) ) { if ( nvcpu->nv_vmexit_pending || nvcpu->nv_vmswitch_in_progress ) r = hvm_intblk_rflags_ie; else { unsigned long intr_info; __vmread(VM_ENTRY_INTR_INFO, &intr_info); if ( intr_info & INTR_INFO_VALID_MASK ) r = hvm_intblk_rflags_ie; } } else if ( nvcpu->nv_vmentry_pending ) r = hvm_intblk_rflags_ie; return r; } static int nvmx_intr_intercept(struct vcpu *v, struct hvm_intack intack) { u32 ctrl; /* If blocked by L1's tpr, then nothing to do. 
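 * Returning 1 here makes vmx_intr_assist() skip the injection attempt;
 * the interrupt is left pending and re-evaluated on a later
 * vmx_intr_assist() invocation.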
*/ if ( nestedhvm_vcpu_in_guestmode(v) && hvm_interrupt_blocked(v, intack) == hvm_intblk_tpr ) return 1; if ( nvmx_intr_blocked(v) != hvm_intblk_none ) { enable_intr_window(v, intack); return 1; } if ( nestedhvm_vcpu_in_guestmode(v) ) { if ( intack.source == hvm_intsrc_pic || intack.source == hvm_intsrc_lapic ) { ctrl = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, PIN_BASED_VM_EXEC_CONTROL); if ( !(ctrl & PIN_BASED_EXT_INTR_MASK) ) return 0; vmx_inject_extint(intack.vector, intack.source); ctrl = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, VM_EXIT_CONTROLS); if ( ctrl & VM_EXIT_ACK_INTR_ON_EXIT ) { /* for now, duplicate the ack path in vmx_intr_assist */ hvm_vcpu_ack_pending_irq(v, intack); pt_intr_post(v, intack); intack = hvm_vcpu_has_pending_irq(v); if ( unlikely(intack.source != hvm_intsrc_none) ) enable_intr_window(v, intack); } else enable_intr_window(v, intack); return 1; } } return 0; } void vmx_intr_assist(void) { struct hvm_intack intack; struct vcpu *v = current; unsigned int tpr_threshold = 0; enum hvm_intblk intblk; int pt_vector = -1; /* Block event injection when single step with MTF. */ if ( unlikely(v->arch.hvm_vcpu.single_step) ) { v->arch.hvm_vmx.exec_control |= CPU_BASED_MONITOR_TRAP_FLAG; vmx_update_cpu_exec_control(v); return; } /* Crank the handle on interrupt state. */ if ( is_hvm_vcpu(v) ) pt_vector = pt_update_irq(v); do { unsigned long intr_info; intack = hvm_vcpu_has_pending_irq(v); if ( likely(intack.source == hvm_intsrc_none) ) goto out; if ( unlikely(nvmx_intr_intercept(v, intack)) ) goto out; intblk = hvm_interrupt_blocked(v, intack); if ( cpu_has_vmx_virtual_intr_delivery ) { /* Set "Interrupt-window exiting" for ExtINT and NMI. */ if ( (intblk != hvm_intblk_none) && (intack.source == hvm_intsrc_pic || intack.source == hvm_intsrc_vector || intack.source == hvm_intsrc_nmi) ) { enable_intr_window(v, intack); goto out; } __vmread(VM_ENTRY_INTR_INFO, &intr_info); if ( intr_info & INTR_INFO_VALID_MASK ) { if ( (intack.source == hvm_intsrc_pic) || (intack.source == hvm_intsrc_nmi) || (intack.source == hvm_intsrc_mce) ) enable_intr_window(v, intack); goto out; } } else if ( intblk == hvm_intblk_tpr ) { ASSERT(vlapic_enabled(vcpu_vlapic(v))); ASSERT(intack.source == hvm_intsrc_lapic); tpr_threshold = intack.vector >> 4; goto out; } else if ( intblk != hvm_intblk_none ) { enable_intr_window(v, intack); goto out; } else { __vmread(VM_ENTRY_INTR_INFO, &intr_info); if ( intr_info & INTR_INFO_VALID_MASK ) { enable_intr_window(v, intack); goto out; } } intack = hvm_vcpu_ack_pending_irq(v, intack); } while ( intack.source == hvm_intsrc_none ); if ( intack.source == hvm_intsrc_nmi ) { vmx_inject_nmi(); } else if ( intack.source == hvm_intsrc_mce ) { hvm_inject_hw_exception(TRAP_machine_check, HVM_DELIVER_NO_ERROR_CODE); } else if ( cpu_has_vmx_virtual_intr_delivery && intack.source != hvm_intsrc_pic && intack.source != hvm_intsrc_vector ) { unsigned long status; unsigned int i, n; /* * Set eoi_exit_bitmap for periodic timer interrup to cause EOI-induced VM * exit, then pending periodic time interrups have the chance to be injected * for compensation */ if (pt_vector != -1) vmx_set_eoi_exit_bitmap(v, pt_vector); /* we need update the RVI field */ __vmread(GUEST_INTR_STATUS, &status); status &= ~VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK; status |= VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK & intack.vector; __vmwrite(GUEST_INTR_STATUS, status); n = ARRAY_SIZE(v->arch.hvm_vmx.eoi_exit_bitmap); while ( (i = find_first_bit(&v->arch.hvm_vmx.eoi_exitmap_changed, n)) < n ) { clear_bit(i, 
&v->arch.hvm_vmx.eoi_exitmap_changed); __vmwrite(EOI_EXIT_BITMAP(i), v->arch.hvm_vmx.eoi_exit_bitmap[i]); } pt_intr_post(v, intack); } else { HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0); vmx_inject_extint(intack.vector, intack.source); pt_intr_post(v, intack); } /* Is there another IRQ to queue up behind this one? */ intack = hvm_vcpu_has_pending_irq(v); if ( !cpu_has_vmx_virtual_intr_delivery || intack.source == hvm_intsrc_pic || intack.source == hvm_intsrc_vector ) { if ( unlikely(intack.source != hvm_intsrc_none) ) enable_intr_window(v, intack); } out: if ( !nestedhvm_vcpu_in_guestmode(v) && !cpu_has_vmx_virtual_intr_delivery && cpu_has_vmx_tpr_shadow ) __vmwrite(TPR_THRESHOLD, tpr_threshold); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/vmx/realmode.c0000664000175000017500000001760412307313555016147 0ustar smbsmb/****************************************************************************** * arch/x86/hvm/vmx/realmode.c * * Real-mode emulation for VMX. * * Copyright (c) 2007-2008 Citrix Systems, Inc. * * Authors: * Keir Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include static void realmode_deliver_exception( unsigned int vector, unsigned int insn_len, struct hvm_emulate_ctxt *hvmemul_ctxt) { struct segment_register *idtr, *csr; struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs; uint32_t cs_eip, pstk; uint16_t frame[3]; unsigned int last_byte; idtr = hvmemul_get_seg_reg(x86_seg_idtr, hvmemul_ctxt); csr = hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt); __set_bit(x86_seg_cs, &hvmemul_ctxt->seg_reg_dirty); again: last_byte = (vector * 4) + 3; if ( idtr->limit < last_byte || hvm_copy_from_guest_phys(&cs_eip, idtr->base + vector * 4, 4) != HVMCOPY_okay ) { /* Software interrupt? */ if ( insn_len != 0 ) { insn_len = 0; vector = TRAP_gp_fault; goto again; } /* Exception or hardware interrupt. */ switch ( vector ) { case TRAP_double_fault: hvm_triple_fault(); return; case TRAP_gp_fault: vector = TRAP_double_fault; goto again; default: vector = TRAP_gp_fault; goto again; } } frame[0] = regs->eip + insn_len; frame[1] = csr->sel; frame[2] = regs->eflags & ~X86_EFLAGS_RF; /* We can't test hvmemul_ctxt->ctxt.sp_size: it may not be initialised. */ if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.db ) { regs->esp -= 6; pstk = regs->esp; } else { pstk = (uint16_t)(regs->esp - 6); regs->esp &= ~0xffff; regs->esp |= pstk; } pstk += hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt)->base; (void)hvm_copy_to_guest_phys(pstk, frame, sizeof(frame)); csr->sel = cs_eip >> 16; csr->base = (uint32_t)csr->sel << 4; regs->eip = (uint16_t)cs_eip; regs->eflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF | X86_EFLAGS_RF); /* Exception delivery clears STI and MOV-SS blocking. 
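 * This mirrors what hardware event delivery does; update both the cached
 * intr_shadow and the VMCS field below so the two stay in sync.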
*/ if ( hvmemul_ctxt->intr_shadow & (VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS) ) { hvmemul_ctxt->intr_shadow &= ~(VMX_INTR_SHADOW_STI|VMX_INTR_SHADOW_MOV_SS); __vmwrite(GUEST_INTERRUPTIBILITY_INFO, hvmemul_ctxt->intr_shadow); } } static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt) { struct vcpu *curr = current; int rc; perfc_incr(realmode_emulations); rc = hvm_emulate_one(hvmemul_ctxt); if ( rc == X86EMUL_UNHANDLEABLE ) { gdprintk(XENLOG_ERR, "Failed to emulate insn.\n"); goto fail; } if ( rc == X86EMUL_EXCEPTION ) { if ( !hvmemul_ctxt->exn_pending ) { unsigned long intr_info; __vmread(VM_ENTRY_INTR_INFO, &intr_info); __vmwrite(VM_ENTRY_INTR_INFO, 0); if ( !(intr_info & INTR_INFO_VALID_MASK) ) { gdprintk(XENLOG_ERR, "Exception pending but no info.\n"); goto fail; } hvmemul_ctxt->exn_vector = (uint8_t)intr_info; hvmemul_ctxt->exn_insn_len = 0; } if ( unlikely(curr->domain->debugger_attached) && ((hvmemul_ctxt->exn_vector == TRAP_debug) || (hvmemul_ctxt->exn_vector == TRAP_int3)) ) { domain_pause_for_debugger(); } else if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE ) { gdprintk(XENLOG_ERR, "Exception %02x in protected mode.\n", hvmemul_ctxt->exn_vector); goto fail; } else { realmode_deliver_exception( hvmemul_ctxt->exn_vector, hvmemul_ctxt->exn_insn_len, hvmemul_ctxt); } } return; fail: gdprintk(XENLOG_ERR, "Real-mode emulation failed @ %04x:%08lx: " "%02x %02x %02x %02x %02x %02x\n", hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt)->sel, hvmemul_ctxt->insn_buf_eip, hvmemul_ctxt->insn_buf[0], hvmemul_ctxt->insn_buf[1], hvmemul_ctxt->insn_buf[2], hvmemul_ctxt->insn_buf[3], hvmemul_ctxt->insn_buf[4], hvmemul_ctxt->insn_buf[5]); domain_crash(curr->domain); } void vmx_realmode(struct cpu_user_regs *regs) { struct vcpu *curr = current; struct hvm_emulate_ctxt hvmemul_ctxt; struct segment_register *sreg; struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; unsigned long intr_info; unsigned int emulations = 0; /* Get-and-clear VM_ENTRY_INTR_INFO. */ __vmread(VM_ENTRY_INTR_INFO, &intr_info); if ( intr_info & INTR_INFO_VALID_MASK ) __vmwrite(VM_ENTRY_INTR_INFO, 0); hvm_emulate_prepare(&hvmemul_ctxt, regs); if ( vio->io_state == HVMIO_completed ) realmode_emulate_one(&hvmemul_ctxt); /* Only deliver interrupts into emulated real mode. */ if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) && (intr_info & INTR_INFO_VALID_MASK) ) { realmode_deliver_exception((uint8_t)intr_info, 0, &hvmemul_ctxt); intr_info = 0; } curr->arch.hvm_vmx.vmx_emulate = 1; while ( curr->arch.hvm_vmx.vmx_emulate && !softirq_pending(smp_processor_id()) && (vio->io_state == HVMIO_none) ) { /* * Check for pending interrupts only every 16 instructions, because * hvm_local_events_need_delivery() is moderately expensive, and only * in real mode, because we don't emulate protected-mode IDT vectoring. 
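 * (!(++emulations & 15) is true once every 16 iterations, i.e. the event
 * check only fires on every 16th emulated instruction.)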
*/ if ( unlikely(!(++emulations & 15)) && curr->arch.hvm_vmx.vmx_realmode && hvm_local_events_need_delivery(curr) ) break; realmode_emulate_one(&hvmemul_ctxt); /* Stop emulating unless our segment state is not safe */ if ( curr->arch.hvm_vmx.vmx_realmode ) curr->arch.hvm_vmx.vmx_emulate = (curr->arch.hvm_vmx.vm86_segment_mask != 0); else curr->arch.hvm_vmx.vmx_emulate = ((hvmemul_ctxt.seg_reg[x86_seg_cs].sel & 3) || (hvmemul_ctxt.seg_reg[x86_seg_ss].sel & 3)); } /* Need to emulate next time if we've started an IO operation */ if ( vio->io_state != HVMIO_none ) curr->arch.hvm_vmx.vmx_emulate = 1; if ( !curr->arch.hvm_vmx.vmx_emulate && !curr->arch.hvm_vmx.vmx_realmode ) { /* * Cannot enter protected mode with bogus selector RPLs and DPLs. * At this point CS.RPL == SS.RPL == CS.DPL == SS.DPL == 0. For * DS, ES, FS and GS the most uninvasive trick is to set DPL == RPL. */ sreg = hvmemul_get_seg_reg(x86_seg_ds, &hvmemul_ctxt); sreg->attr.fields.dpl = sreg->sel & 3; sreg = hvmemul_get_seg_reg(x86_seg_es, &hvmemul_ctxt); sreg->attr.fields.dpl = sreg->sel & 3; sreg = hvmemul_get_seg_reg(x86_seg_fs, &hvmemul_ctxt); sreg->attr.fields.dpl = sreg->sel & 3; sreg = hvmemul_get_seg_reg(x86_seg_gs, &hvmemul_ctxt); sreg->attr.fields.dpl = sreg->sel & 3; hvmemul_ctxt.seg_reg_dirty |= (1ul << x86_seg_ds) | (1ul << x86_seg_es) | (1ul << x86_seg_fs) | (1ul << x86_seg_gs); } hvm_emulate_writeback(&hvmemul_ctxt); /* Re-instate VM_ENTRY_INTR_INFO if we did not discharge it. */ if ( intr_info & INTR_INFO_VALID_MASK ) __vmwrite(VM_ENTRY_INTR_INFO, intr_info); } xen-4.4.0/xen/arch/x86/hvm/vmx/vvmx.c0000664000175000017500000022330012307313555015347 0ustar smbsmb/* * vvmx.c: Support virtual VMX for nested virtualization. * * Copyright (c) 2010, Intel Corporation. * Author: Qing He * Eddie Dong * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #include #include #include #include #include #include #include static DEFINE_PER_CPU(u64 *, vvmcs_buf); static void nvmx_purge_vvmcs(struct vcpu *v); #define VMCS_BUF_SIZE 100 int nvmx_cpu_up_prepare(unsigned int cpu) { if ( per_cpu(vvmcs_buf, cpu) != NULL ) return 0; per_cpu(vvmcs_buf, cpu) = xzalloc_array(u64, VMCS_BUF_SIZE); if ( per_cpu(vvmcs_buf, cpu) != NULL ) return 0; return -ENOMEM; } void nvmx_cpu_dead(unsigned int cpu) { xfree(per_cpu(vvmcs_buf, cpu)); per_cpu(vvmcs_buf, cpu) = NULL; } int nvmx_vcpu_initialise(struct vcpu *v) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); nvcpu->nv_n2vmcx = alloc_xenheap_page(); if ( !nvcpu->nv_n2vmcx ) { gdprintk(XENLOG_ERR, "nest: allocation for shadow vmcs failed\n"); return -ENOMEM; } /* non-root VMREAD/VMWRITE bitmap. 
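 * With VMCS shadowing, a set bit in these bitmaps forces the corresponding
 * VMREAD/VMWRITE executed in non-root mode to VM exit instead of accessing
 * the shadow VMCS; a clear bit lets the access hit the shadow VMCS directly.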
*/ if ( cpu_has_vmx_vmcs_shadowing ) { struct page_info *vmread_bitmap, *vmwrite_bitmap; unsigned long *vr, *vw; vmread_bitmap = alloc_domheap_page(NULL, 0); if ( !vmread_bitmap ) { gdprintk(XENLOG_ERR, "nest: allocation for vmread bitmap failed\n"); return -ENOMEM; } v->arch.hvm_vmx.vmread_bitmap = vmread_bitmap; vmwrite_bitmap = alloc_domheap_page(NULL, 0); if ( !vmwrite_bitmap ) { gdprintk(XENLOG_ERR, "nest: allocation for vmwrite bitmap failed\n"); return -ENOMEM; } v->arch.hvm_vmx.vmwrite_bitmap = vmwrite_bitmap; vr = __map_domain_page(vmread_bitmap); vw = __map_domain_page(vmwrite_bitmap); clear_page(vr); clear_page(vw); /* * For the following 4 encodings, we need to handle them in VMM. * Let them vmexit as usual. */ set_bit(IO_BITMAP_A, vw); set_bit(IO_BITMAP_A_HIGH, vw); set_bit(IO_BITMAP_B, vw); set_bit(IO_BITMAP_B_HIGH, vw); unmap_domain_page(vr); unmap_domain_page(vw); } nvmx->ept.enabled = 0; nvmx->guest_vpid = 0; nvmx->vmxon_region_pa = 0; nvcpu->nv_vvmcx = NULL; nvcpu->nv_vvmcxaddr = VMCX_EADDR; nvmx->intr.intr_info = 0; nvmx->intr.error_code = 0; nvmx->iobitmap[0] = NULL; nvmx->iobitmap[1] = NULL; nvmx->msrbitmap = NULL; INIT_LIST_HEAD(&nvmx->launched_list); return 0; } void nvmx_vcpu_destroy(struct vcpu *v) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct vvmcs_list *item, *n; /* * When destroying the vcpu, it may be running on behalf of L2 guest. * Therefore we need to switch the VMCS pointer back to the L1 VMCS, * in order to avoid double free of L2 VMCS and the possible memory * leak of L1 VMCS page. */ if ( nvcpu->nv_n1vmcx ) v->arch.hvm_vmx.vmcs = nvcpu->nv_n1vmcx; if ( nvcpu->nv_n2vmcx ) { __vmpclear(virt_to_maddr(nvcpu->nv_n2vmcx)); free_xenheap_page(nvcpu->nv_n2vmcx); nvcpu->nv_n2vmcx = NULL; } /* Must also cope with nvmx_vcpu_initialise() not having got called. 
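 * A still-NULL launched_list.next means INIT_LIST_HEAD() in
 * nvmx_vcpu_initialise() never ran, so there is nothing to walk.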
*/ if ( nvmx->launched_list.next ) list_for_each_entry_safe(item, n, &nvmx->launched_list, node) { list_del(&item->node); xfree(item); } if ( v->arch.hvm_vmx.vmread_bitmap ) { free_domheap_page(v->arch.hvm_vmx.vmread_bitmap); v->arch.hvm_vmx.vmread_bitmap = NULL; } if ( v->arch.hvm_vmx.vmwrite_bitmap ) { free_domheap_page(v->arch.hvm_vmx.vmwrite_bitmap); v->arch.hvm_vmx.vmwrite_bitmap = NULL; } } void nvmx_domain_relinquish_resources(struct domain *d) { struct vcpu *v; for_each_vcpu ( d, v ) nvmx_purge_vvmcs(v); } int nvmx_vcpu_reset(struct vcpu *v) { return 0; } uint64_t nvmx_vcpu_guestcr3(struct vcpu *v) { /* TODO */ ASSERT(0); return 0; } uint64_t nvmx_vcpu_eptp_base(struct vcpu *v) { uint64_t eptp_base; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); eptp_base = __get_vvmcs(nvcpu->nv_vvmcx, EPT_POINTER); return eptp_base & PAGE_MASK; } uint32_t nvmx_vcpu_asid(struct vcpu *v) { /* TODO */ ASSERT(0); return 0; } bool_t nvmx_ept_enabled(struct vcpu *v) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); return !!(nvmx->ept.enabled); } static const enum x86_segment sreg_to_index[] = { [VMX_SREG_ES] = x86_seg_es, [VMX_SREG_CS] = x86_seg_cs, [VMX_SREG_SS] = x86_seg_ss, [VMX_SREG_DS] = x86_seg_ds, [VMX_SREG_FS] = x86_seg_fs, [VMX_SREG_GS] = x86_seg_gs, }; struct vmx_inst_decoded { #define VMX_INST_MEMREG_TYPE_MEMORY 0 #define VMX_INST_MEMREG_TYPE_REG 1 int type; union { struct { unsigned long mem; unsigned int len; }; enum vmx_regs_enc reg1; }; enum vmx_regs_enc reg2; }; enum vmx_ops_result { VMSUCCEED, VMFAIL_VALID, VMFAIL_INVALID, }; #define CASE_SET_REG(REG, reg) \ case VMX_REG_ ## REG: regs->reg = value; break #define CASE_GET_REG(REG, reg) \ case VMX_REG_ ## REG: value = regs->reg; break static int vvmcs_offset(u32 width, u32 type, u32 index) { int offset; offset = (index & 0x1f) | type << 5 | width << 7; if ( offset == 0 ) /* vpid */ offset = 0x3f; return offset; } u64 __get_vvmcs_virtual(void *vvmcs, u32 vmcs_encoding) { union vmcs_encoding enc; u64 *content = (u64 *) vvmcs; int offset; u64 res; enc.word = vmcs_encoding; offset = vvmcs_offset(enc.width, enc.type, enc.index); res = content[offset]; switch ( enc.width ) { case VVMCS_WIDTH_16: res &= 0xffff; break; case VVMCS_WIDTH_64: if ( enc.access_type ) res >>= 32; break; case VVMCS_WIDTH_32: res &= 0xffffffff; break; case VVMCS_WIDTH_NATURAL: default: break; } return res; } u64 __get_vvmcs_real(void *vvmcs, u32 vmcs_encoding) { return virtual_vmcs_vmread(vvmcs, vmcs_encoding); } void __set_vvmcs_virtual(void *vvmcs, u32 vmcs_encoding, u64 val) { union vmcs_encoding enc; u64 *content = (u64 *) vvmcs; int offset; u64 res; enc.word = vmcs_encoding; offset = vvmcs_offset(enc.width, enc.type, enc.index); res = content[offset]; switch ( enc.width ) { case VVMCS_WIDTH_16: res = val & 0xffff; break; case VVMCS_WIDTH_64: if ( enc.access_type ) { res &= 0xffffffff; res |= val << 32; } else res = val; break; case VVMCS_WIDTH_32: res = val & 0xffffffff; break; case VVMCS_WIDTH_NATURAL: default: res = val; break; } content[offset] = res; } void __set_vvmcs_real(void *vvmcs, u32 vmcs_encoding, u64 val) { virtual_vmcs_vmwrite(vvmcs, vmcs_encoding, val); } static unsigned long reg_read(struct cpu_user_regs *regs, enum vmx_regs_enc index) { unsigned long *pval = decode_register(index, regs, 0); return *pval; } static void reg_write(struct cpu_user_regs *regs, enum vmx_regs_enc index, unsigned long value) { unsigned long *pval = decode_register(index, regs, 0); *pval = value; } static inline u32 __n2_pin_exec_control(struct vcpu *v) { struct nestedvcpu *nvcpu 
= &vcpu_nestedhvm(v); return __get_vvmcs(nvcpu->nv_vvmcx, PIN_BASED_VM_EXEC_CONTROL); } static inline u32 __n2_exec_control(struct vcpu *v) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); return __get_vvmcs(nvcpu->nv_vvmcx, CPU_BASED_VM_EXEC_CONTROL); } static inline u32 __n2_secondary_exec_control(struct vcpu *v) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); u64 second_ctrl = 0; if ( __n2_exec_control(v) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS ) second_ctrl = __get_vvmcs(nvcpu->nv_vvmcx, SECONDARY_VM_EXEC_CONTROL); return second_ctrl; } static int vmx_inst_check_privilege(struct cpu_user_regs *regs, int vmxop_check) { struct vcpu *v = current; struct segment_register cs; if ( vmxop_check ) { if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) || !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VMXE) ) goto invalid_op; } else if ( !vcpu_2_nvmx(v).vmxon_region_pa ) goto invalid_op; vmx_get_segment_register(v, x86_seg_cs, &cs); if ( (regs->eflags & X86_EFLAGS_VM) || (hvm_long_mode_enabled(v) && cs.attr.fields.l == 0) ) goto invalid_op; else if ( nestedhvm_vcpu_in_guestmode(v) ) goto vmexit; if ( (cs.sel & 3) > 0 ) goto gp_fault; return X86EMUL_OKAY; vmexit: gdprintk(XENLOG_ERR, "vmx_inst_check_privilege: vmexit\n"); vcpu_nestedhvm(v).nv_vmexit_pending = 1; return X86EMUL_EXCEPTION; invalid_op: gdprintk(XENLOG_ERR, "vmx_inst_check_privilege: invalid_op\n"); hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); return X86EMUL_EXCEPTION; gp_fault: gdprintk(XENLOG_ERR, "vmx_inst_check_privilege: gp_fault\n"); hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } static int decode_vmx_inst(struct cpu_user_regs *regs, struct vmx_inst_decoded *decode, unsigned long *poperandS, int vmxon_check) { struct vcpu *v = current; union vmx_inst_info info; struct segment_register seg; unsigned long base, index, seg_base, disp, offset; int scale, size; if ( vmx_inst_check_privilege(regs, vmxon_check) != X86EMUL_OKAY ) return X86EMUL_EXCEPTION; __vmread(VMX_INSTRUCTION_INFO, &offset); info.word = offset; if ( info.fields.memreg ) { decode->type = VMX_INST_MEMREG_TYPE_REG; decode->reg1 = info.fields.reg1; if ( poperandS != NULL ) *poperandS = reg_read(regs, decode->reg1); } else { bool_t mode_64bit = 0; decode->type = VMX_INST_MEMREG_TYPE_MEMORY; if ( hvm_long_mode_enabled(v) ) { vmx_get_segment_register(v, x86_seg_cs, &seg); mode_64bit = seg.attr.fields.l; } if ( info.fields.segment > VMX_SREG_GS ) goto gp_fault; vmx_get_segment_register(v, sreg_to_index[info.fields.segment], &seg); seg_base = seg.base; base = info.fields.base_reg_invalid ? 0 : reg_read(regs, info.fields.base_reg); index = info.fields.index_reg_invalid ? 0 : reg_read(regs, info.fields.index_reg); scale = 1 << info.fields.scaling; __vmread(EXIT_QUALIFICATION, &disp); size = 1 << (info.fields.addr_size + 1); offset = base + index * scale + disp; base = !mode_64bit || info.fields.segment >= VMX_SREG_FS ? seg_base + offset : offset; if ( offset + size - 1 < offset || (mode_64bit ? !is_canonical_address((long)base < 0 ? 
base : base + size - 1) : offset + size - 1 > seg.limit) ) goto gp_fault; if ( poperandS != NULL && hvm_copy_from_guest_virt(poperandS, base, size, 0) != HVMCOPY_okay ) return X86EMUL_EXCEPTION; decode->mem = base; decode->len = size; } decode->reg2 = info.fields.reg2; return X86EMUL_OKAY; gp_fault: hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } static void vmreturn(struct cpu_user_regs *regs, enum vmx_ops_result ops_res) { unsigned long eflags = regs->eflags; unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF; eflags &= ~mask; switch ( ops_res ) { case VMSUCCEED: break; case VMFAIL_VALID: /* TODO: error number, useful for guest VMM debugging */ eflags |= X86_EFLAGS_ZF; break; case VMFAIL_INVALID: default: eflags |= X86_EFLAGS_CF; break; } regs->eflags = eflags; } int nvmx_intercepts_exception(struct vcpu *v, unsigned int trap, int error_code) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); u32 exception_bitmap, pfec_match=0, pfec_mask=0; int r; ASSERT ( trap < 32 ); exception_bitmap = __get_vvmcs(nvcpu->nv_vvmcx, EXCEPTION_BITMAP); r = exception_bitmap & (1 << trap) ? 1: 0; if ( trap == TRAP_page_fault ) { pfec_match = __get_vvmcs(nvcpu->nv_vvmcx, PAGE_FAULT_ERROR_CODE_MATCH); pfec_mask = __get_vvmcs(nvcpu->nv_vvmcx, PAGE_FAULT_ERROR_CODE_MASK); if ( (error_code & pfec_mask) != pfec_match ) r = !r; } return r; } /* * Nested VMX uses "strict" condition to exit from * L2 guest if either L1 VMM or L0 VMM expect to exit. */ static inline u32 __shadow_control(struct vcpu *v, unsigned int field, u32 host_value) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); return (u32) __get_vvmcs(nvcpu->nv_vvmcx, field) | host_value; } static void set_shadow_control(struct vcpu *v, unsigned int field, u32 host_value) { __vmwrite(field, __shadow_control(v, field, host_value)); } unsigned long *_shadow_io_bitmap(struct vcpu *v) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); int port80, portED; u8 *bitmap; bitmap = nvmx->iobitmap[0]; port80 = bitmap[0x80 >> 3] & (1 << (0x80 & 0x7)) ? 1 : 0; portED = bitmap[0xed >> 3] & (1 << (0xed & 0x7)) ? 1 : 0; return nestedhvm_vcpu_iomap_get(port80, portED); } void nvmx_update_exec_control(struct vcpu *v, u32 host_cntrl) { u32 pio_cntrl = (CPU_BASED_ACTIVATE_IO_BITMAP | CPU_BASED_UNCOND_IO_EXITING); unsigned long *bitmap; u32 shadow_cntrl; shadow_cntrl = __n2_exec_control(v); pio_cntrl &= shadow_cntrl; /* Enforce the removed features */ shadow_cntrl &= ~(CPU_BASED_ACTIVATE_MSR_BITMAP | CPU_BASED_ACTIVATE_IO_BITMAP | CPU_BASED_UNCOND_IO_EXITING); shadow_cntrl |= host_cntrl; if ( pio_cntrl == CPU_BASED_UNCOND_IO_EXITING ) { /* L1 VMM intercepts all I/O instructions */ shadow_cntrl |= CPU_BASED_UNCOND_IO_EXITING; shadow_cntrl &= ~CPU_BASED_ACTIVATE_IO_BITMAP; } else { /* Use IO_BITMAP in shadow */ if ( pio_cntrl == 0 ) { /* * L1 VMM doesn't intercept IO instruction. 
* Use host configuration and reset IO_BITMAP */ bitmap = hvm_io_bitmap; } else { /* use IO bitmap */ bitmap = _shadow_io_bitmap(v); } __vmwrite(IO_BITMAP_A, virt_to_maddr(bitmap)); __vmwrite(IO_BITMAP_B, virt_to_maddr(bitmap) + PAGE_SIZE); } /* TODO: change L0 intr window to MTF or NMI window */ __vmwrite(CPU_BASED_VM_EXEC_CONTROL, shadow_cntrl); } void nvmx_update_secondary_exec_control(struct vcpu *v, unsigned long host_cntrl) { u32 shadow_cntrl; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct nestedvmx *nvmx = &vcpu_2_nvmx(v); u32 apicv_bit = SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; host_cntrl &= ~apicv_bit; shadow_cntrl = __get_vvmcs(nvcpu->nv_vvmcx, SECONDARY_VM_EXEC_CONTROL); /* No vAPIC-v support, so it shouldn't be set in vmcs12. */ ASSERT(!(shadow_cntrl & apicv_bit)); nvmx->ept.enabled = !!(shadow_cntrl & SECONDARY_EXEC_ENABLE_EPT); shadow_cntrl |= host_cntrl; __vmwrite(SECONDARY_VM_EXEC_CONTROL, shadow_cntrl); } static void nvmx_update_pin_control(struct vcpu *v, unsigned long host_cntrl) { u32 shadow_cntrl; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); host_cntrl &= ~PIN_BASED_POSTED_INTERRUPT; shadow_cntrl = __get_vvmcs(nvcpu->nv_vvmcx, PIN_BASED_VM_EXEC_CONTROL); /* No vAPIC-v support, so it shouldn't be set in vmcs12. */ ASSERT(!(shadow_cntrl & PIN_BASED_POSTED_INTERRUPT)); shadow_cntrl |= host_cntrl; __vmwrite(PIN_BASED_VM_EXEC_CONTROL, shadow_cntrl); } static void nvmx_update_exit_control(struct vcpu *v, unsigned long host_cntrl) { u32 shadow_cntrl; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); shadow_cntrl = __get_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_CONTROLS); shadow_cntrl &= ~(VM_EXIT_SAVE_DEBUG_CNTRLS | VM_EXIT_LOAD_HOST_PAT | VM_EXIT_LOAD_HOST_EFER | VM_EXIT_LOAD_PERF_GLOBAL_CTRL); shadow_cntrl |= host_cntrl; __vmwrite(VM_EXIT_CONTROLS, shadow_cntrl); } static void nvmx_update_entry_control(struct vcpu *v) { u32 shadow_cntrl; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); shadow_cntrl = __get_vvmcs(nvcpu->nv_vvmcx, VM_ENTRY_CONTROLS); shadow_cntrl &= ~(VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_GUEST_EFER | VM_ENTRY_LOAD_PERF_GLOBAL_CTRL); __vmwrite(VM_ENTRY_CONTROLS, shadow_cntrl); } void nvmx_update_exception_bitmap(struct vcpu *v, unsigned long value) { set_shadow_control(v, EXCEPTION_BITMAP, value); } static void nvmx_update_apic_access_address(struct vcpu *v) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); u32 ctrl; ctrl = __n2_secondary_exec_control(v); if ( ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES ) { p2m_type_t p2mt; unsigned long apic_gpfn; struct page_info *apic_pg; apic_gpfn = __get_vvmcs(nvcpu->nv_vvmcx, APIC_ACCESS_ADDR) >> PAGE_SHIFT; apic_pg = get_page_from_gfn(v->domain, apic_gpfn, &p2mt, P2M_ALLOC); ASSERT(apic_pg && !p2m_is_paging(p2mt)); __vmwrite(APIC_ACCESS_ADDR, page_to_maddr(apic_pg)); put_page(apic_pg); } else __vmwrite(APIC_ACCESS_ADDR, 0); } static void nvmx_update_virtual_apic_address(struct vcpu *v) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); u32 ctrl; ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_TPR_SHADOW ) { p2m_type_t p2mt; unsigned long vapic_gpfn; struct page_info *vapic_pg; vapic_gpfn = __get_vvmcs(nvcpu->nv_vvmcx, VIRTUAL_APIC_PAGE_ADDR) >> PAGE_SHIFT; vapic_pg = get_page_from_gfn(v->domain, vapic_gpfn, &p2mt, P2M_ALLOC); ASSERT(vapic_pg && !p2m_is_paging(p2mt)); __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vapic_pg)); put_page(vapic_pg); } else __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0); } static void nvmx_update_tpr_threshold(struct vcpu *v) { struct nestedvcpu *nvcpu = 
&vcpu_nestedhvm(v); u32 ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_TPR_SHADOW ) __vmwrite(TPR_THRESHOLD, __get_vvmcs(nvcpu->nv_vvmcx, TPR_THRESHOLD)); else __vmwrite(TPR_THRESHOLD, 0); } static void nvmx_update_pfec(struct vcpu *v) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); void *vvmcs = nvcpu->nv_vvmcx; __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, __get_vvmcs(vvmcs, PAGE_FAULT_ERROR_CODE_MASK)); __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, __get_vvmcs(vvmcs, PAGE_FAULT_ERROR_CODE_MATCH)); } static void __clear_current_vvmcs(struct vcpu *v) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); if ( nvcpu->nv_n2vmcx ) __vmpclear(virt_to_maddr(nvcpu->nv_n2vmcx)); } static bool_t __must_check _map_msr_bitmap(struct vcpu *v) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); unsigned long gpa; if ( nvmx->msrbitmap ) hvm_unmap_guest_frame(nvmx->msrbitmap, 1); gpa = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, MSR_BITMAP); nvmx->msrbitmap = hvm_map_guest_frame_ro(gpa >> PAGE_SHIFT, 1); return nvmx->msrbitmap != NULL; } static bool_t __must_check _map_io_bitmap(struct vcpu *v, u64 vmcs_reg) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); unsigned long gpa; int index; index = vmcs_reg == IO_BITMAP_A ? 0 : 1; if (nvmx->iobitmap[index]) hvm_unmap_guest_frame(nvmx->iobitmap[index], 1); gpa = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, vmcs_reg); nvmx->iobitmap[index] = hvm_map_guest_frame_ro(gpa >> PAGE_SHIFT, 1); return nvmx->iobitmap[index] != NULL; } static inline bool_t __must_check map_io_bitmap_all(struct vcpu *v) { return _map_io_bitmap(v, IO_BITMAP_A) && _map_io_bitmap(v, IO_BITMAP_B); } static void nvmx_purge_vvmcs(struct vcpu *v) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); int i; __clear_current_vvmcs(v); if ( nvcpu->nv_vvmcxaddr != VMCX_EADDR ) hvm_unmap_guest_frame(nvcpu->nv_vvmcx, 1); nvcpu->nv_vvmcx = NULL; nvcpu->nv_vvmcxaddr = VMCX_EADDR; for (i=0; i<2; i++) { if ( nvmx->iobitmap[i] ) { hvm_unmap_guest_frame(nvmx->iobitmap[i], 1); nvmx->iobitmap[i] = NULL; } } if ( nvmx->msrbitmap ) { hvm_unmap_guest_frame(nvmx->msrbitmap, 1); nvmx->msrbitmap = NULL; } } u64 nvmx_get_tsc_offset(struct vcpu *v) { u64 offset = 0; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); if ( __get_vvmcs(nvcpu->nv_vvmcx, CPU_BASED_VM_EXEC_CONTROL) & CPU_BASED_USE_TSC_OFFSETING ) offset = __get_vvmcs(nvcpu->nv_vvmcx, TSC_OFFSET); return offset; } /* * Context synchronized between shadow and virtual VMCS. 
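 * These are the guest-state fields copied by vvmcs_to_shadow_bulk() on
 * virtual vmentry and back by shadow_to_vvmcs_bulk() on virtual vmexit;
 * GUEST_RIP/GUEST_RSP are handled separately via cpu_user_regs.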
*/ static const u16 vmcs_gstate_field[] = { /* 16 BITS */ GUEST_ES_SELECTOR, GUEST_CS_SELECTOR, GUEST_SS_SELECTOR, GUEST_DS_SELECTOR, GUEST_FS_SELECTOR, GUEST_GS_SELECTOR, GUEST_LDTR_SELECTOR, GUEST_TR_SELECTOR, /* 64 BITS */ VMCS_LINK_POINTER, GUEST_IA32_DEBUGCTL, GUEST_PAT, GUEST_EFER, GUEST_PERF_GLOBAL_CTRL, /* 32 BITS */ GUEST_ES_LIMIT, GUEST_CS_LIMIT, GUEST_SS_LIMIT, GUEST_DS_LIMIT, GUEST_FS_LIMIT, GUEST_GS_LIMIT, GUEST_LDTR_LIMIT, GUEST_TR_LIMIT, GUEST_GDTR_LIMIT, GUEST_IDTR_LIMIT, GUEST_ES_AR_BYTES, GUEST_CS_AR_BYTES, GUEST_SS_AR_BYTES, GUEST_DS_AR_BYTES, GUEST_FS_AR_BYTES, GUEST_GS_AR_BYTES, GUEST_LDTR_AR_BYTES, GUEST_TR_AR_BYTES, GUEST_INTERRUPTIBILITY_INFO, GUEST_ACTIVITY_STATE, GUEST_SYSENTER_CS, GUEST_PREEMPTION_TIMER, /* natural */ GUEST_ES_BASE, GUEST_CS_BASE, GUEST_SS_BASE, GUEST_DS_BASE, GUEST_FS_BASE, GUEST_GS_BASE, GUEST_LDTR_BASE, GUEST_TR_BASE, GUEST_GDTR_BASE, GUEST_IDTR_BASE, GUEST_DR7, /* * Following guest states are in local cache (cpu_user_regs) GUEST_RSP, GUEST_RIP, */ GUEST_RFLAGS, GUEST_PENDING_DBG_EXCEPTIONS, GUEST_SYSENTER_ESP, GUEST_SYSENTER_EIP, }; static const u16 gpdptr_fields[] = { GUEST_PDPTR0, GUEST_PDPTR1, GUEST_PDPTR2, GUEST_PDPTR3, }; /* * Context: shadow -> virtual VMCS */ static const u16 vmcs_ro_field[] = { GUEST_PHYSICAL_ADDRESS, VM_INSTRUCTION_ERROR, VM_EXIT_REASON, VM_EXIT_INTR_INFO, VM_EXIT_INTR_ERROR_CODE, IDT_VECTORING_INFO, IDT_VECTORING_ERROR_CODE, VM_EXIT_INSTRUCTION_LEN, VMX_INSTRUCTION_INFO, EXIT_QUALIFICATION, GUEST_LINEAR_ADDRESS }; static struct vmcs_host_to_guest { u16 host_field; u16 guest_field; } const vmcs_h2g_field[] = { {HOST_ES_SELECTOR, GUEST_ES_SELECTOR}, {HOST_CS_SELECTOR, GUEST_CS_SELECTOR}, {HOST_SS_SELECTOR, GUEST_SS_SELECTOR}, {HOST_DS_SELECTOR, GUEST_DS_SELECTOR}, {HOST_FS_SELECTOR, GUEST_FS_SELECTOR}, {HOST_GS_SELECTOR, GUEST_GS_SELECTOR}, {HOST_TR_SELECTOR, GUEST_TR_SELECTOR}, {HOST_SYSENTER_CS, GUEST_SYSENTER_CS}, {HOST_FS_BASE, GUEST_FS_BASE}, {HOST_GS_BASE, GUEST_GS_BASE}, {HOST_TR_BASE, GUEST_TR_BASE}, {HOST_GDTR_BASE, GUEST_GDTR_BASE}, {HOST_IDTR_BASE, GUEST_IDTR_BASE}, {HOST_SYSENTER_ESP, GUEST_SYSENTER_ESP}, {HOST_SYSENTER_EIP, GUEST_SYSENTER_EIP}, }; static void vvmcs_to_shadow(void *vvmcs, unsigned int field) { u64 value; value = __get_vvmcs(vvmcs, field); __vmwrite(field, value); } static void vvmcs_to_shadow_bulk(struct vcpu *v, unsigned int n, const u16 *field) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); void *vvmcs = nvcpu->nv_vvmcx; u64 *value = this_cpu(vvmcs_buf); unsigned int i; if ( !cpu_has_vmx_vmcs_shadowing ) goto fallback; if ( !value || n > VMCS_BUF_SIZE ) { gdprintk(XENLOG_DEBUG, "vmcs sync fall back to non-bulk mode, \ buffer: %p, buffer size: %d, fields number: %d.\n", value, VMCS_BUF_SIZE, n); goto fallback; } virtual_vmcs_enter(vvmcs); for ( i = 0; i < n; i++ ) __vmread(field[i], &value[i]); virtual_vmcs_exit(vvmcs); for ( i = 0; i < n; i++ ) __vmwrite(field[i], value[i]); return; fallback: for ( i = 0; i < n; i++ ) vvmcs_to_shadow(vvmcs, field[i]); } static inline void shadow_to_vvmcs(void *vvmcs, unsigned int field) { unsigned long value; if ( __vmread_safe(field, &value) ) __set_vvmcs(vvmcs, field, value); } static void shadow_to_vvmcs_bulk(struct vcpu *v, unsigned int n, const u16 *field) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); void *vvmcs = nvcpu->nv_vvmcx; u64 *value = this_cpu(vvmcs_buf); unsigned int i; if ( !cpu_has_vmx_vmcs_shadowing ) goto fallback; if ( !value || n > VMCS_BUF_SIZE ) { gdprintk(XENLOG_DEBUG, "vmcs sync fall back to non-bulk mode, \ buffer: %p, 
buffer size: %d, fields number: %d.\n", value, VMCS_BUF_SIZE, n); goto fallback; } for ( i = 0; i < n; i++ ) __vmread(field[i], &value[i]); virtual_vmcs_enter(vvmcs); for ( i = 0; i < n; i++ ) __vmwrite(field[i], value[i]); virtual_vmcs_exit(vvmcs); return; fallback: for ( i = 0; i < n; i++ ) shadow_to_vvmcs(vvmcs, field[i]); } static void load_shadow_control(struct vcpu *v) { /* * Set shadow controls: PIN_BASED, CPU_BASED, EXIT, ENTRY * and EXCEPTION * Enforce the removed features */ nvmx_update_pin_control(v, vmx_pin_based_exec_control); vmx_update_cpu_exec_control(v); vmx_update_secondary_exec_control(v); nvmx_update_exit_control(v, vmx_vmexit_control); nvmx_update_entry_control(v); vmx_update_exception_bitmap(v); nvmx_update_apic_access_address(v); nvmx_update_virtual_apic_address(v); nvmx_update_tpr_threshold(v); nvmx_update_pfec(v); } static void load_shadow_guest_state(struct vcpu *v) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); void *vvmcs = nvcpu->nv_vvmcx; u32 control; u64 cr_gh_mask, cr_read_shadow; static const u16 vmentry_fields[] = { VM_ENTRY_INTR_INFO, VM_ENTRY_EXCEPTION_ERROR_CODE, VM_ENTRY_INSTRUCTION_LEN, }; /* vvmcs.gstate to shadow vmcs.gstate */ vvmcs_to_shadow_bulk(v, ARRAY_SIZE(vmcs_gstate_field), vmcs_gstate_field); nvcpu->guest_cr[0] = __get_vvmcs(vvmcs, CR0_READ_SHADOW); nvcpu->guest_cr[4] = __get_vvmcs(vvmcs, CR4_READ_SHADOW); hvm_set_cr0(__get_vvmcs(vvmcs, GUEST_CR0)); hvm_set_cr4(__get_vvmcs(vvmcs, GUEST_CR4)); hvm_set_cr3(__get_vvmcs(vvmcs, GUEST_CR3)); control = __get_vvmcs(vvmcs, VM_ENTRY_CONTROLS); if ( control & VM_ENTRY_LOAD_GUEST_PAT ) hvm_set_guest_pat(v, __get_vvmcs(vvmcs, GUEST_PAT)); if ( control & VM_ENTRY_LOAD_PERF_GLOBAL_CTRL ) hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL, __get_vvmcs(vvmcs, GUEST_PERF_GLOBAL_CTRL)); hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset); vvmcs_to_shadow_bulk(v, ARRAY_SIZE(vmentry_fields), vmentry_fields); /* * While emulate CR0 and CR4 for nested virtualization, set the CR0/CR4 * guest host mask to 0xffffffff in shadow VMCS (follow the host L1 VMCS), * then calculate the corresponding read shadow separately for CR0 and CR4. 
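 * That is: read_shadow = (GUEST_CRn & ~mask) | (CRn_READ_SHADOW & mask),
 * so bits L1 claims via its guest/host mask read back L1's shadow value,
 * while bits left to L2 read back L2's actual CRn.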
*/ cr_gh_mask = __get_vvmcs(vvmcs, CR0_GUEST_HOST_MASK); cr_read_shadow = (__get_vvmcs(vvmcs, GUEST_CR0) & ~cr_gh_mask) | (__get_vvmcs(vvmcs, CR0_READ_SHADOW) & cr_gh_mask); __vmwrite(CR0_READ_SHADOW, cr_read_shadow); cr_gh_mask = __get_vvmcs(vvmcs, CR4_GUEST_HOST_MASK); cr_read_shadow = (__get_vvmcs(vvmcs, GUEST_CR4) & ~cr_gh_mask) | (__get_vvmcs(vvmcs, CR4_READ_SHADOW) & cr_gh_mask); __vmwrite(CR4_READ_SHADOW, cr_read_shadow); /* TODO: CR3 target control */ } uint64_t get_shadow_eptp(struct vcpu *v) { uint64_t np2m_base = nvmx_vcpu_eptp_base(v); struct p2m_domain *p2m = p2m_get_nestedp2m(v, np2m_base); struct ept_data *ept = &p2m->ept; ept->asr = pagetable_get_pfn(p2m_get_pagetable(p2m)); return ept_get_eptp(ept); } static uint64_t get_host_eptp(struct vcpu *v) { struct domain *d = v->domain; struct ept_data *ept_data = &p2m_get_hostp2m(d)->ept; return ept_get_eptp(ept_data); } static bool_t nvmx_vpid_enabled(struct nestedvcpu *nvcpu) { uint32_t second_cntl; second_cntl = __get_vvmcs(nvcpu->nv_vvmcx, SECONDARY_VM_EXEC_CONTROL); if ( second_cntl & SECONDARY_EXEC_ENABLE_VPID ) return 1; return 0; } static void nvmx_set_vmcs_pointer(struct vcpu *v, struct vmcs_struct *vvmcs) { unsigned long vvmcs_mfn = domain_page_map_to_mfn(vvmcs); paddr_t vvmcs_maddr = vvmcs_mfn << PAGE_SHIFT; __vmpclear(vvmcs_maddr); vvmcs->vmcs_revision_id |= VMCS_RID_TYPE_MASK; v->arch.hvm_vmx.vmcs_shadow_maddr = vvmcs_maddr; __vmwrite(VMCS_LINK_POINTER, vvmcs_maddr); __vmwrite(VMREAD_BITMAP, page_to_maddr(v->arch.hvm_vmx.vmread_bitmap)); __vmwrite(VMWRITE_BITMAP, page_to_maddr(v->arch.hvm_vmx.vmwrite_bitmap)); } static void nvmx_clear_vmcs_pointer(struct vcpu *v, struct vmcs_struct *vvmcs) { unsigned long vvmcs_mfn = domain_page_map_to_mfn(vvmcs); paddr_t vvmcs_maddr = vvmcs_mfn << PAGE_SHIFT; __vmpclear(vvmcs_maddr); vvmcs->vmcs_revision_id &= ~VMCS_RID_TYPE_MASK; v->arch.hvm_vmx.vmcs_shadow_maddr = 0; __vmwrite(VMCS_LINK_POINTER, ~0ul); __vmwrite(VMREAD_BITMAP, 0); __vmwrite(VMWRITE_BITMAP, 0); } static void virtual_vmentry(struct cpu_user_regs *regs) { struct vcpu *v = current; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); void *vvmcs = nvcpu->nv_vvmcx; unsigned long lm_l1, lm_l2; vmx_vmcs_switch(v->arch.hvm_vmx.vmcs, nvcpu->nv_n2vmcx); nestedhvm_vcpu_enter_guestmode(v); nvcpu->nv_vmentry_pending = 0; nvcpu->nv_vmswitch_in_progress = 1; /* * EFER handling: * hvm_set_efer won't work if CR0.PG = 1, so we change the value * directly to make hvm_long_mode_enabled(v) work in L2. * An additional update_paging_modes is also needed if * there is 32/64 switch. 
v->arch.hvm_vcpu.guest_efer doesn't * need to be saved, since its value on vmexit is determined by * L1 exit_controls */ lm_l1 = !!hvm_long_mode_enabled(v); lm_l2 = !!(__get_vvmcs(vvmcs, VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE); if ( lm_l2 ) v->arch.hvm_vcpu.guest_efer |= EFER_LMA | EFER_LME; else v->arch.hvm_vcpu.guest_efer &= ~(EFER_LMA | EFER_LME); load_shadow_control(v); load_shadow_guest_state(v); if ( lm_l1 != lm_l2 ) paging_update_paging_modes(v); if ( nvmx_ept_enabled(v) && hvm_pae_enabled(v) && !(v->arch.hvm_vcpu.guest_efer & EFER_LMA) ) vvmcs_to_shadow_bulk(v, ARRAY_SIZE(gpdptr_fields), gpdptr_fields); regs->eip = __get_vvmcs(vvmcs, GUEST_RIP); regs->esp = __get_vvmcs(vvmcs, GUEST_RSP); regs->eflags = __get_vvmcs(vvmcs, GUEST_RFLAGS); /* updating host cr0 to sync TS bit */ __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); /* Setup virtual ETP for L2 guest*/ if ( nestedhvm_paging_mode_hap(v) ) __vmwrite(EPT_POINTER, get_shadow_eptp(v)); else __vmwrite(EPT_POINTER, get_host_eptp(v)); /* nested VPID support! */ if ( cpu_has_vmx_vpid && nvmx_vpid_enabled(nvcpu) ) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); uint32_t new_vpid = __get_vvmcs(vvmcs, VIRTUAL_PROCESSOR_ID); if ( nvmx->guest_vpid != new_vpid ) { hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(v).nv_n2asid); nvmx->guest_vpid = new_vpid; } } } static void sync_vvmcs_guest_state(struct vcpu *v, struct cpu_user_regs *regs) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); void *vvmcs = nvcpu->nv_vvmcx; /* copy shadow vmcs.gstate back to vvmcs.gstate */ shadow_to_vvmcs_bulk(v, ARRAY_SIZE(vmcs_gstate_field), vmcs_gstate_field); /* RIP, RSP are in user regs */ __set_vvmcs(vvmcs, GUEST_RIP, regs->eip); __set_vvmcs(vvmcs, GUEST_RSP, regs->esp); /* CR3 sync if exec doesn't want cr3 load exiting: i.e. nested EPT */ if ( !(__n2_exec_control(v) & CPU_BASED_CR3_LOAD_EXITING) ) shadow_to_vvmcs(vvmcs, GUEST_CR3); } static void sync_vvmcs_ro(struct vcpu *v) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct nestedvmx *nvmx = &vcpu_2_nvmx(v); void *vvmcs = nvcpu->nv_vvmcx; shadow_to_vvmcs_bulk(v, ARRAY_SIZE(vmcs_ro_field), vmcs_ro_field); /* Adjust exit_reason/exit_qualifciation for violation case */ if ( __get_vvmcs(vvmcs, VM_EXIT_REASON) == EXIT_REASON_EPT_VIOLATION ) { __set_vvmcs(vvmcs, EXIT_QUALIFICATION, nvmx->ept.exit_qual); __set_vvmcs(vvmcs, VM_EXIT_REASON, nvmx->ept.exit_reason); } } static void load_vvmcs_host_state(struct vcpu *v) { int i; u64 r; void *vvmcs = vcpu_nestedhvm(v).nv_vvmcx; u32 control; for ( i = 0; i < ARRAY_SIZE(vmcs_h2g_field); i++ ) { r = __get_vvmcs(vvmcs, vmcs_h2g_field[i].host_field); __vmwrite(vmcs_h2g_field[i].guest_field, r); } hvm_set_cr0(__get_vvmcs(vvmcs, HOST_CR0)); hvm_set_cr4(__get_vvmcs(vvmcs, HOST_CR4)); hvm_set_cr3(__get_vvmcs(vvmcs, HOST_CR3)); control = __get_vvmcs(vvmcs, VM_EXIT_CONTROLS); if ( control & VM_EXIT_LOAD_HOST_PAT ) hvm_set_guest_pat(v, __get_vvmcs(vvmcs, HOST_PAT)); if ( control & VM_EXIT_LOAD_PERF_GLOBAL_CTRL ) hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL, __get_vvmcs(vvmcs, HOST_PERF_GLOBAL_CTRL)); hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset); __set_vvmcs(vvmcs, VM_ENTRY_INTR_INFO, 0); } static void sync_exception_state(struct vcpu *v) { struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct nestedvmx *nvmx = &vcpu_2_nvmx(v); if ( !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) ) return; switch ( (nvmx->intr.intr_info & INTR_INFO_INTR_TYPE_MASK) >> 8 ) { case X86_EVENTTYPE_EXT_INTR: /* rename exit_reason to EXTERNAL_INTERRUPT */ __set_vvmcs(nvcpu->nv_vvmcx, 
VM_EXIT_REASON, EXIT_REASON_EXTERNAL_INTERRUPT); __set_vvmcs(nvcpu->nv_vvmcx, EXIT_QUALIFICATION, 0); __set_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_INTR_INFO, nvmx->intr.intr_info); break; case X86_EVENTTYPE_HW_EXCEPTION: case X86_EVENTTYPE_SW_INTERRUPT: case X86_EVENTTYPE_SW_EXCEPTION: /* throw to L1 */ __set_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_INTR_INFO, nvmx->intr.intr_info); __set_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_INTR_ERROR_CODE, nvmx->intr.error_code); break; case X86_EVENTTYPE_NMI: __set_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_REASON, EXIT_REASON_EXCEPTION_NMI); __set_vvmcs(nvcpu->nv_vvmcx, EXIT_QUALIFICATION, 0); __set_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_INTR_INFO, nvmx->intr.intr_info); break; default: gdprintk(XENLOG_ERR, "Exception state %lx not handled\n", nvmx->intr.intr_info); break; } } static void nvmx_update_apicv(struct vcpu *v) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); unsigned long reason = __get_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_REASON); uint32_t intr_info = __get_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_INTR_INFO); if ( reason == EXIT_REASON_EXTERNAL_INTERRUPT && nvmx->intr.source == hvm_intsrc_lapic && (intr_info & INTR_INFO_VALID_MASK) ) { uint16_t status; uint32_t rvi, ppr; uint32_t vector = intr_info & 0xff; struct vlapic *vlapic = vcpu_vlapic(v); vlapic_ack_pending_irq(v, vector, 1); ppr = vlapic_set_ppr(vlapic); WARN_ON((ppr & 0xf0) != (vector & 0xf0)); status = vector << 8; rvi = vlapic_has_pending_irq(v); if ( rvi != -1 ) status |= rvi & 0xff; __vmwrite(GUEST_INTR_STATUS, status); } } static void virtual_vmexit(struct cpu_user_regs *regs) { struct vcpu *v = current; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); unsigned long lm_l1, lm_l2; sync_vvmcs_ro(v); sync_vvmcs_guest_state(v, regs); sync_exception_state(v); if ( nvmx_ept_enabled(v) && hvm_pae_enabled(v) && !(v->arch.hvm_vcpu.guest_efer & EFER_LMA) ) shadow_to_vvmcs_bulk(v, ARRAY_SIZE(gpdptr_fields), gpdptr_fields); vmx_vmcs_switch(v->arch.hvm_vmx.vmcs, nvcpu->nv_n1vmcx); nestedhvm_vcpu_exit_guestmode(v); nvcpu->nv_vmexit_pending = 0; nvcpu->nv_vmswitch_in_progress = 1; lm_l2 = !!hvm_long_mode_enabled(v); lm_l1 = !!(__get_vvmcs(nvcpu->nv_vvmcx, VM_EXIT_CONTROLS) & VM_EXIT_IA32E_MODE); if ( lm_l1 ) v->arch.hvm_vcpu.guest_efer |= EFER_LMA | EFER_LME; else v->arch.hvm_vcpu.guest_efer &= ~(EFER_LMA | EFER_LME); vmx_update_cpu_exec_control(v); vmx_update_secondary_exec_control(v); vmx_update_exception_bitmap(v); load_vvmcs_host_state(v); if ( lm_l1 != lm_l2 ) paging_update_paging_modes(v); regs->eip = __get_vvmcs(nvcpu->nv_vvmcx, HOST_RIP); regs->esp = __get_vvmcs(nvcpu->nv_vvmcx, HOST_RSP); /* VM exit clears all bits except bit 1 */ regs->eflags = 0x2; /* updating host cr0 to sync TS bit */ __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); if ( cpu_has_vmx_virtual_intr_delivery ) nvmx_update_apicv(v); nvcpu->nv_vmswitch_in_progress = 0; vmreturn(regs, VMSUCCEED); } void nvmx_switch_guest(void) { struct vcpu *v = current; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct cpu_user_regs *regs = guest_cpu_user_regs(); const ioreq_t *ioreq = get_ioreq(v); /* * A pending IO emulation may still be not finished. In this case, no * virtual vmswitch is allowed. Or else, the following IO emulation will * be handled in a wrong VCPU context. If there are no IO backends - PVH * guest by itself or a PVH guest with an HVM guest running inside - we * don't want to continue as this setup is not implemented nor supported * as of right now. 
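 * Hence the check below: only switch when an ioreq slot exists and it is
 * idle (STATE_IOREQ_NONE).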
*/ if ( !ioreq || ioreq->state != STATE_IOREQ_NONE ) return; /* * a softirq may interrupt us between a virtual vmentry is * just handled and the true vmentry. If during this window, * a L1 virtual interrupt causes another virtual vmexit, we * cannot let that happen or VM_ENTRY_INTR_INFO will be lost. */ if ( unlikely(nvcpu->nv_vmswitch_in_progress) ) return; if ( nestedhvm_vcpu_in_guestmode(v) && nvcpu->nv_vmexit_pending ) virtual_vmexit(regs); else if ( !nestedhvm_vcpu_in_guestmode(v) && nvcpu->nv_vmentry_pending ) virtual_vmentry(regs); } /* * VMX instructions handling */ int nvmx_handle_vmxon(struct cpu_user_regs *regs) { struct vcpu *v=current; struct nestedvmx *nvmx = &vcpu_2_nvmx(v); struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct vmx_inst_decoded decode; unsigned long gpa = 0; int rc; rc = decode_vmx_inst(regs, &decode, &gpa, 1); if ( rc != X86EMUL_OKAY ) return rc; if ( nvmx->vmxon_region_pa ) gdprintk(XENLOG_WARNING, "vmxon again: orig %"PRIpaddr" new %lx\n", nvmx->vmxon_region_pa, gpa); nvmx->vmxon_region_pa = gpa; /* * `fork' the host vmcs to shadow_vmcs * vmcs_lock is not needed since we are on current */ nvcpu->nv_n1vmcx = v->arch.hvm_vmx.vmcs; __vmpclear(virt_to_maddr(v->arch.hvm_vmx.vmcs)); memcpy(nvcpu->nv_n2vmcx, v->arch.hvm_vmx.vmcs, PAGE_SIZE); __vmptrld(virt_to_maddr(v->arch.hvm_vmx.vmcs)); v->arch.hvm_vmx.launched = 0; vmreturn(regs, VMSUCCEED); return X86EMUL_OKAY; } int nvmx_handle_vmxoff(struct cpu_user_regs *regs) { struct vcpu *v=current; struct nestedvmx *nvmx = &vcpu_2_nvmx(v); int rc; rc = vmx_inst_check_privilege(regs, 0); if ( rc != X86EMUL_OKAY ) return rc; nvmx_purge_vvmcs(v); nvmx->vmxon_region_pa = 0; vmreturn(regs, VMSUCCEED); return X86EMUL_OKAY; } static bool_t vvmcs_launched(struct list_head *launched_list, unsigned long vvmcs_mfn) { struct vvmcs_list *vvmcs; struct list_head *pos; bool_t launched = 0; list_for_each(pos, launched_list) { vvmcs = list_entry(pos, struct vvmcs_list, node); if ( vvmcs_mfn == vvmcs->vvmcs_mfn ) { launched = 1; break; } } return launched; } static int set_vvmcs_launched(struct list_head *launched_list, unsigned long vvmcs_mfn) { struct vvmcs_list *vvmcs; if ( vvmcs_launched(launched_list, vvmcs_mfn) ) return 0; vvmcs = xzalloc(struct vvmcs_list); if ( !vvmcs ) return -ENOMEM; vvmcs->vvmcs_mfn = vvmcs_mfn; list_add(&vvmcs->node, launched_list); return 0; } static void clear_vvmcs_launched(struct list_head *launched_list, paddr_t vvmcs_mfn) { struct vvmcs_list *vvmcs; struct list_head *pos; list_for_each(pos, launched_list) { vvmcs = list_entry(pos, struct vvmcs_list, node); if ( vvmcs_mfn == vvmcs->vvmcs_mfn ) { list_del(&vvmcs->node); xfree(vvmcs); break; } } } static int nvmx_vmresume(struct vcpu *v, struct cpu_user_regs *regs) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); /* check VMCS is valid and IO BITMAP is set */ if ( (nvcpu->nv_vvmcxaddr != VMCX_EADDR) && ((nvmx->iobitmap[0] && nvmx->iobitmap[1]) || !(__n2_exec_control(v) & CPU_BASED_ACTIVATE_IO_BITMAP) ) ) nvcpu->nv_vmentry_pending = 1; else vmreturn(regs, VMFAIL_INVALID); return X86EMUL_OKAY; } int nvmx_handle_vmresume(struct cpu_user_regs *regs) { bool_t launched; struct vcpu *v = current; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct nestedvmx *nvmx = &vcpu_2_nvmx(v); int rc = vmx_inst_check_privilege(regs, 0); if ( rc != X86EMUL_OKAY ) return rc; if ( vcpu_nestedhvm(v).nv_vvmcxaddr == VMCX_EADDR ) { vmreturn (regs, VMFAIL_INVALID); return X86EMUL_OKAY; } launched = vvmcs_launched(&nvmx->launched_list, 
domain_page_map_to_mfn(nvcpu->nv_vvmcx)); if ( !launched ) { vmreturn (regs, VMFAIL_VALID); return X86EMUL_OKAY; } return nvmx_vmresume(v,regs); } int nvmx_handle_vmlaunch(struct cpu_user_regs *regs) { bool_t launched; struct vcpu *v = current; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct nestedvmx *nvmx = &vcpu_2_nvmx(v); int rc = vmx_inst_check_privilege(regs, 0); if ( rc != X86EMUL_OKAY ) return rc; if ( vcpu_nestedhvm(v).nv_vvmcxaddr == VMCX_EADDR ) { vmreturn (regs, VMFAIL_INVALID); return X86EMUL_OKAY; } launched = vvmcs_launched(&nvmx->launched_list, domain_page_map_to_mfn(nvcpu->nv_vvmcx)); if ( launched ) { vmreturn (regs, VMFAIL_VALID); return X86EMUL_OKAY; } else { rc = nvmx_vmresume(v,regs); if ( rc == X86EMUL_OKAY ) { if ( set_vvmcs_launched(&nvmx->launched_list, domain_page_map_to_mfn(nvcpu->nv_vvmcx)) < 0 ) return X86EMUL_UNHANDLEABLE; } } return rc; } int nvmx_handle_vmptrld(struct cpu_user_regs *regs) { struct vcpu *v = current; struct vmx_inst_decoded decode; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); unsigned long gpa = 0; int rc; rc = decode_vmx_inst(regs, &decode, &gpa, 0); if ( rc != X86EMUL_OKAY ) return rc; if ( gpa == vcpu_2_nvmx(v).vmxon_region_pa || gpa & 0xfff ) { vmreturn(regs, VMFAIL_INVALID); goto out; } if ( nvcpu->nv_vvmcxaddr != gpa ) nvmx_purge_vvmcs(v); if ( nvcpu->nv_vvmcxaddr == VMCX_EADDR ) { nvcpu->nv_vvmcx = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 1); if ( nvcpu->nv_vvmcx ) nvcpu->nv_vvmcxaddr = gpa; if ( !nvcpu->nv_vvmcx || !map_io_bitmap_all(v) || !_map_msr_bitmap(v) ) { vmreturn(regs, VMFAIL_VALID); goto out; } } if ( cpu_has_vmx_vmcs_shadowing ) nvmx_set_vmcs_pointer(v, nvcpu->nv_vvmcx); vmreturn(regs, VMSUCCEED); out: return X86EMUL_OKAY; } int nvmx_handle_vmptrst(struct cpu_user_regs *regs) { struct vcpu *v = current; struct vmx_inst_decoded decode; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); unsigned long gpa = 0; int rc; rc = decode_vmx_inst(regs, &decode, &gpa, 0); if ( rc != X86EMUL_OKAY ) return rc; gpa = nvcpu->nv_vvmcxaddr; rc = hvm_copy_to_guest_virt(decode.mem, &gpa, decode.len, 0); if ( rc != HVMCOPY_okay ) return X86EMUL_EXCEPTION; vmreturn(regs, VMSUCCEED); return X86EMUL_OKAY; } int nvmx_handle_vmclear(struct cpu_user_regs *regs) { struct vcpu *v = current; struct vmx_inst_decoded decode; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct nestedvmx *nvmx = &vcpu_2_nvmx(v); unsigned long gpa = 0; void *vvmcs; int rc; rc = decode_vmx_inst(regs, &decode, &gpa, 0); if ( rc != X86EMUL_OKAY ) return rc; if ( gpa & 0xfff ) { vmreturn(regs, VMFAIL_INVALID); return X86EMUL_OKAY; } if ( gpa == nvcpu->nv_vvmcxaddr ) { if ( cpu_has_vmx_vmcs_shadowing ) nvmx_clear_vmcs_pointer(v, nvcpu->nv_vvmcx); clear_vvmcs_launched(&nvmx->launched_list, domain_page_map_to_mfn(nvcpu->nv_vvmcx)); nvmx_purge_vvmcs(v); } else { /* Even if this VMCS isn't the current one, we must clear it. 
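 * Dropping it from the launched list restores the architectural "clear"
 * launch state, so a later VMLAUNCH of this VMCS can succeed again.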
*/ vvmcs = hvm_map_guest_frame_rw(gpa >> PAGE_SHIFT, 0); if ( vvmcs ) clear_vvmcs_launched(&nvmx->launched_list, domain_page_map_to_mfn(vvmcs)); hvm_unmap_guest_frame(vvmcs, 0); } vmreturn(regs, VMSUCCEED); return X86EMUL_OKAY; } int nvmx_handle_vmread(struct cpu_user_regs *regs) { struct vcpu *v = current; struct vmx_inst_decoded decode; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); u64 value = 0; int rc; rc = decode_vmx_inst(regs, &decode, NULL, 0); if ( rc != X86EMUL_OKAY ) return rc; value = __get_vvmcs(nvcpu->nv_vvmcx, reg_read(regs, decode.reg2)); switch ( decode.type ) { case VMX_INST_MEMREG_TYPE_MEMORY: rc = hvm_copy_to_guest_virt(decode.mem, &value, decode.len, 0); if ( rc != HVMCOPY_okay ) return X86EMUL_EXCEPTION; break; case VMX_INST_MEMREG_TYPE_REG: reg_write(regs, decode.reg1, value); break; } vmreturn(regs, VMSUCCEED); return X86EMUL_OKAY; } int nvmx_handle_vmwrite(struct cpu_user_regs *regs) { struct vcpu *v = current; struct vmx_inst_decoded decode; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); unsigned long operand; u64 vmcs_encoding; bool_t okay = 1; if ( decode_vmx_inst(regs, &decode, &operand, 0) != X86EMUL_OKAY ) return X86EMUL_EXCEPTION; vmcs_encoding = reg_read(regs, decode.reg2); __set_vvmcs(nvcpu->nv_vvmcx, vmcs_encoding, operand); switch ( vmcs_encoding ) { case IO_BITMAP_A: case IO_BITMAP_A_HIGH: okay = _map_io_bitmap(v, IO_BITMAP_A); break; case IO_BITMAP_B: case IO_BITMAP_B_HIGH: okay = _map_io_bitmap(v, IO_BITMAP_B); break; case MSR_BITMAP: case MSR_BITMAP_HIGH: okay = _map_msr_bitmap(v); break; } vmreturn(regs, okay ? VMSUCCEED : VMFAIL_VALID); return X86EMUL_OKAY; } int nvmx_handle_invept(struct cpu_user_regs *regs) { struct vmx_inst_decoded decode; unsigned long eptp; int ret; if ( (ret = decode_vmx_inst(regs, &decode, &eptp, 0)) != X86EMUL_OKAY ) return ret; switch ( reg_read(regs, decode.reg2) ) { case INVEPT_SINGLE_CONTEXT: { struct p2m_domain *p2m = p2m_get_nestedp2m(current, eptp); if ( p2m ) { p2m_flush(current, p2m); ept_sync_domain(p2m); } break; } case INVEPT_ALL_CONTEXT: p2m_flush_nestedp2m(current->domain); __invept(INVEPT_ALL_CONTEXT, 0, 0); break; default: vmreturn(regs, VMFAIL_INVALID); return X86EMUL_OKAY; } vmreturn(regs, VMSUCCEED); return X86EMUL_OKAY; } int nvmx_handle_invvpid(struct cpu_user_regs *regs) { struct vmx_inst_decoded decode; unsigned long vpid; int ret; if ( (ret = decode_vmx_inst(regs, &decode, &vpid, 0)) != X86EMUL_OKAY ) return ret; switch ( reg_read(regs, decode.reg2) ) { /* Just invalidate all tlb entries for all types! */ case INVVPID_INDIVIDUAL_ADDR: case INVVPID_SINGLE_CONTEXT: case INVVPID_ALL_CONTEXT: hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(current).nv_n2asid); break; default: vmreturn(regs, VMFAIL_INVALID); return X86EMUL_OKAY; } vmreturn(regs, VMSUCCEED); return X86EMUL_OKAY; } #define __emul_value(enable1, default1) \ ((enable1 | default1) << 32 | (default1)) #define gen_vmx_msr(enable1, default1, host_value) \ (((__emul_value(enable1, default1) & host_value) & (~0ul << 32)) | \ ((uint32_t)(__emul_value(enable1, default1) | host_value))) /* * Capability reporting */ int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content) { struct vcpu *v = current; unsigned int eax, ebx, ecx, edx; u64 data = 0, host_data = 0; int r = 1; if ( !nestedhvm_enabled(v->domain) ) return 0; /* VMX capablity MSRs are available only when guest supports VMX. 
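 * i.e. bail out unless CPUID.1:ECX.VMX (X86_FEATURE_VMXE) is exposed
 * to the guest.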
*/ hvm_cpuid(0x1, NULL, NULL, &ecx, &edx); if ( !(ecx & cpufeat_mask(X86_FEATURE_VMXE)) ) return 0; /* * Those MSRs are available only when bit 55 of * MSR_IA32_VMX_BASIC is set. */ switch ( msr ) { case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: if ( !(vmx_basic_msr & VMX_BASIC_DEFAULT1_ZERO) ) return 0; break; } rdmsrl(msr, host_data); /* * Remove unsupported features from the n1 guest capability MSR */ switch (msr) { case MSR_IA32_VMX_BASIC: data = (host_data & (~0ul << 32)) | (v->arch.hvm_vmx.vmcs->vmcs_revision_id & 0x7fffffff); break; case MSR_IA32_VMX_PINBASED_CTLS: case MSR_IA32_VMX_TRUE_PINBASED_CTLS: /* 1-settings */ data = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | PIN_BASED_PREEMPT_TIMER; data = gen_vmx_msr(data, VMX_PINBASED_CTLS_DEFAULT1, host_data); break; case MSR_IA32_VMX_PROCBASED_CTLS: case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: { u32 default1_bits = VMX_PROCBASED_CTLS_DEFAULT1; /* 1-settings */ data = CPU_BASED_HLT_EXITING | CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | CPU_BASED_MONITOR_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_ACTIVATE_IO_BITMAP | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_RDTSC_EXITING | CPU_BASED_MONITOR_TRAP_FLAG | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_ACTIVATE_MSR_BITMAP | CPU_BASED_PAUSE_EXITING | CPU_BASED_RDPMC_EXITING | CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; if ( msr == MSR_IA32_VMX_TRUE_PROCBASED_CTLS ) default1_bits &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); data = gen_vmx_msr(data, default1_bits, host_data); break; } case MSR_IA32_VMX_PROCBASED_CTLS2: /* 1-settings */ data = SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_ENABLE_EPT; data = gen_vmx_msr(data, 0, host_data); break; case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: /* 1-settings */ data = VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_IA32E_MODE | VM_EXIT_SAVE_PREEMPT_TIMER | VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT | VM_EXIT_SAVE_GUEST_EFER | VM_EXIT_LOAD_HOST_EFER | VM_EXIT_LOAD_PERF_GLOBAL_CTRL; data = gen_vmx_msr(data, VMX_EXIT_CTLS_DEFAULT1, host_data); break; case MSR_IA32_VMX_ENTRY_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: /* 1-settings */ data = VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_GUEST_EFER | VM_ENTRY_LOAD_PERF_GLOBAL_CTRL | VM_ENTRY_IA32E_MODE; data = gen_vmx_msr(data, VMX_ENTRY_CTLS_DEFAULT1, host_data); break; case IA32_FEATURE_CONTROL_MSR: data = IA32_FEATURE_CONTROL_MSR_LOCK | IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX; break; case MSR_IA32_VMX_VMCS_ENUM: /* The max index of VVMCS encoding is 0x1f. 
*/ data = 0x1f << 1; break; case MSR_IA32_VMX_CR0_FIXED0: /* PG, PE bits must be 1 in VMX operation */ data = X86_CR0_PE | X86_CR0_PG; break; case MSR_IA32_VMX_CR0_FIXED1: /* allow 0-settings for all bits */ data = 0xffffffff; break; case MSR_IA32_VMX_CR4_FIXED0: /* VMXE bit must be 1 in VMX operation */ data = X86_CR4_VMXE; break; case MSR_IA32_VMX_CR4_FIXED1: if ( edx & cpufeat_mask(X86_FEATURE_VME) ) data |= X86_CR4_VME | X86_CR4_PVI; if ( edx & cpufeat_mask(X86_FEATURE_TSC) ) data |= X86_CR4_TSD; if ( edx & cpufeat_mask(X86_FEATURE_DE) ) data |= X86_CR4_DE; if ( edx & cpufeat_mask(X86_FEATURE_PSE) ) data |= X86_CR4_PSE; if ( edx & cpufeat_mask(X86_FEATURE_PAE) ) data |= X86_CR4_PAE; if ( edx & cpufeat_mask(X86_FEATURE_MCE) ) data |= X86_CR4_MCE; if ( edx & cpufeat_mask(X86_FEATURE_PGE) ) data |= X86_CR4_PGE; if ( edx & cpufeat_mask(X86_FEATURE_FXSR) ) data |= X86_CR4_OSFXSR; if ( edx & cpufeat_mask(X86_FEATURE_XMM) ) data |= X86_CR4_OSXMMEXCPT; if ( ecx & cpufeat_mask(X86_FEATURE_VMXE) ) data |= X86_CR4_VMXE; if ( ecx & cpufeat_mask(X86_FEATURE_SMXE) ) data |= X86_CR4_SMXE; if ( ecx & cpufeat_mask(X86_FEATURE_PCID) ) data |= X86_CR4_PCIDE; if ( ecx & cpufeat_mask(X86_FEATURE_XSAVE) ) data |= X86_CR4_OSXSAVE; hvm_cpuid(0x0, &eax, NULL, NULL, NULL); switch ( eax ) { default: hvm_cpuid(0xa, &eax, NULL, NULL, NULL); /* Check whether guest has the perf monitor feature. */ if ( (eax & 0xff) && (eax & 0xff00) ) data |= X86_CR4_PCE; /* fall through */ case 0x7 ... 0x9: ecx = 0; hvm_cpuid(0x7, NULL, &ebx, &ecx, NULL); if ( ebx & cpufeat_mask(X86_FEATURE_FSGSBASE) ) data |= X86_CR4_FSGSBASE; if ( ebx & cpufeat_mask(X86_FEATURE_SMEP) ) data |= X86_CR4_SMEP; if ( ebx & cpufeat_mask(X86_FEATURE_SMAP) ) data |= X86_CR4_SMAP; /* fall through */ case 0x0 ... 0x6: break; } break; case MSR_IA32_VMX_MISC: /* Do not support CR3-target feature now */ data = host_data & ~VMX_MISC_CR3_TARGET; break; case MSR_IA32_VMX_EPT_VPID_CAP: data = nept_get_ept_vpid_cap(); break; default: r = 0; break; } *msr_content = data; return r; } int nvmx_msr_write_intercept(unsigned int msr, u64 msr_content) { /* silently ignore for now */ return 1; } /* This function uses L2_gpa to walk the P2M page table in L1. If the * walk is successful, the translated value is returned in * L1_gpa. The result value tells what to do next. 
*/ int nvmx_hap_walk_L1_p2m(struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa, unsigned int *page_order, uint8_t *p2m_acc, bool_t access_r, bool_t access_w, bool_t access_x) { int rc; unsigned long gfn; uint64_t exit_qual; uint32_t exit_reason = EXIT_REASON_EPT_VIOLATION; uint32_t rwx_rights = (access_x << 2) | (access_w << 1) | access_r; struct nestedvmx *nvmx = &vcpu_2_nvmx(v); __vmread(EXIT_QUALIFICATION, &exit_qual); rc = nept_translate_l2ga(v, L2_gpa, page_order, rwx_rights, &gfn, p2m_acc, &exit_qual, &exit_reason); switch ( rc ) { case EPT_TRANSLATE_SUCCEED: *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK); rc = NESTEDHVM_PAGEFAULT_DONE; break; case EPT_TRANSLATE_VIOLATION: case EPT_TRANSLATE_MISCONFIG: rc = NESTEDHVM_PAGEFAULT_INJECT; nvmx->ept.exit_reason = exit_reason; nvmx->ept.exit_qual = exit_qual; break; case EPT_TRANSLATE_RETRY: rc = NESTEDHVM_PAGEFAULT_RETRY; break; default: gdprintk(XENLOG_ERR, "GUEST EPT translation error!:%d\n", rc); BUG(); break; } return rc; } void nvmx_idtv_handling(void) { struct vcpu *v = current; struct nestedvmx *nvmx = &vcpu_2_nvmx(v); struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); unsigned long idtv_info, reason; __vmread(IDT_VECTORING_INFO, &idtv_info); if ( likely(!(idtv_info & INTR_INFO_VALID_MASK)) ) return; /* * If L0 can solve the fault that causes idt vectoring, it should * be reinjected, otherwise, pass to L1. */ __vmread(VM_EXIT_REASON, &reason); if ( reason != EXIT_REASON_EPT_VIOLATION ? !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) : !nvcpu->nv_vmexit_pending ) { __vmwrite(VM_ENTRY_INTR_INFO, idtv_info & ~INTR_INFO_RESVD_BITS_MASK); if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK ) { __vmread(IDT_VECTORING_ERROR_CODE, &reason); __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, reason); } /* * SDM 23.2.4, if L1 tries to inject a software interrupt * and the delivery fails, VM_EXIT_INSTRUCTION_LEN receives * the value of previous VM_ENTRY_INSTRUCTION_LEN. * * This means EXIT_INSTRUCTION_LEN is always valid here, for * software interrupts both injected by L1, and generated in L2. */ __vmread(VM_EXIT_INSTRUCTION_LEN, &reason); __vmwrite(VM_ENTRY_INSTRUCTION_LEN, reason); } } /* * L2 VMExit handling * return 1: Done or skip the normal layer 0 hypervisor process. * Typically it requires layer 1 hypervisor processing * or it may be already processed here. * 0: Require the normal layer 0 process. 
*/ int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs, unsigned int exit_reason) { struct vcpu *v = current; struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v); struct nestedvmx *nvmx = &vcpu_2_nvmx(v); u32 ctrl; nvcpu->nv_vmexit_pending = 0; nvmx->intr.intr_info = 0; nvmx->intr.error_code = 0; switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: { unsigned long intr_info; u32 valid_mask = (X86_EVENTTYPE_HW_EXCEPTION << 8) | INTR_INFO_VALID_MASK; u64 exec_bitmap; int vector; __vmread(VM_EXIT_INTR_INFO, &intr_info); vector = intr_info & INTR_INFO_VECTOR_MASK; /* * Decided by the L0 and L1 exception bitmaps: if the vector is set in * both, L0 has priority on #PF and #NM, and L1 has priority on the others. */ if ( vector == TRAP_page_fault ) { if ( paging_mode_hap(v->domain) ) nvcpu->nv_vmexit_pending = 1; } else if ( vector == TRAP_no_device ) { if ( v->fpu_dirtied ) nvcpu->nv_vmexit_pending = 1; } else if ( (intr_info & valid_mask) == valid_mask ) { exec_bitmap = __get_vvmcs(nvcpu->nv_vvmcx, EXCEPTION_BITMAP); if ( exec_bitmap & (1 << vector) ) nvcpu->nv_vmexit_pending = 1; } break; } case EXIT_REASON_WBINVD: case EXIT_REASON_EPT_VIOLATION: case EXIT_REASON_EPT_MISCONFIG: case EXIT_REASON_EXTERNAL_INTERRUPT: /* pass to L0 handler */ break; case VMX_EXIT_REASONS_FAILED_VMENTRY: case EXIT_REASON_TRIPLE_FAULT: case EXIT_REASON_TASK_SWITCH: case EXIT_REASON_CPUID: case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMXOFF: case EXIT_REASON_VMXON: case EXIT_REASON_INVEPT: case EXIT_REASON_XSETBV: /* inject to L1 */ nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: { int status; ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_ACTIVATE_MSR_BITMAP ) { status = vmx_check_msr_bitmap(nvmx->msrbitmap, regs->ecx, !!(exit_reason == EXIT_REASON_MSR_WRITE)); if ( status ) nvcpu->nv_vmexit_pending = 1; } else nvcpu->nv_vmexit_pending = 1; break; } case EXIT_REASON_IO_INSTRUCTION: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_ACTIVATE_IO_BITMAP ) { unsigned long qual; u16 port, size; __vmread(EXIT_QUALIFICATION, &qual); port = qual >> 16; size = (qual & 7) + 1; do { const u8 *bitmap = nvmx->iobitmap[port >> 15]; if ( bitmap[(port & 0x7fff) >> 3] & (1 << (port & 7)) ) nvcpu->nv_vmexit_pending = 1; if ( !--size ) break; if ( !++port ) nvcpu->nv_vmexit_pending = 1; } while ( !nvcpu->nv_vmexit_pending ); if ( !nvcpu->nv_vmexit_pending ) printk(XENLOG_G_WARNING "L0 PIO %04x\n", port); } else if ( ctrl & CPU_BASED_UNCOND_IO_EXITING ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_PENDING_VIRT_INTR: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_VIRTUAL_INTR_PENDING ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_PENDING_VIRT_NMI: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_VIRTUAL_NMI_PENDING ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_MONITOR_TRAP_FLAG: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_MONITOR_TRAP_FLAG) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_ACCESS_GDTR_OR_IDTR: case EXIT_REASON_ACCESS_LDTR_OR_TR: ctrl = __n2_secondary_exec_control(v); if ( ctrl & SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED: ctrl = __n2_pin_exec_control(v); if ( ctrl & PIN_BASED_PREEMPT_TIMER ) nvcpu->nv_vmexit_pending = 1; break; /* L1 has priority in handling several other types of exits */ 
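/* Each of the exit reasons below is checked against the corresponding control bit in L1's virtual VMCS (via __n2_exec_control() and friends): the exit is reflected to L1 (nv_vmexit_pending = 1) only when L1 asked to intercept it; otherwise L0 handles it on L1's behalf. */ 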
case EXIT_REASON_HLT: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_HLT_EXITING ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_RDTSC: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_RDTSC_EXITING ) nvcpu->nv_vmexit_pending = 1; else { uint64_t tsc; /* * special handler is needed if L1 doesn't intercept rdtsc, * avoiding changing guest_tsc and messing up timekeeping in L1 */ tsc = hvm_get_guest_tsc(v); tsc += __get_vvmcs(nvcpu->nv_vvmcx, TSC_OFFSET); regs->eax = (uint32_t)tsc; regs->edx = (uint32_t)(tsc >> 32); update_guest_eip(); return 1; } break; case EXIT_REASON_RDPMC: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_RDPMC_EXITING ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_MWAIT_INSTRUCTION: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_MWAIT_EXITING ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_PAUSE_INSTRUCTION: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_PAUSE_EXITING ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_MONITOR_INSTRUCTION: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_MONITOR_EXITING ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_DR_ACCESS: ctrl = __n2_exec_control(v); if ( (ctrl & CPU_BASED_MOV_DR_EXITING) && v->arch.hvm_vcpu.flag_dr_dirty ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_INVLPG: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_INVLPG_EXITING ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_CR_ACCESS: { unsigned long exit_qualification; int cr, write; u32 mask = 0; __vmread(EXIT_QUALIFICATION, &exit_qualification); cr = exit_qualification & 0xf; write = (exit_qualification >> 4) & 3; /* also according to guest exec_control */ ctrl = __n2_exec_control(v); if ( cr == 3 ) { mask = write? CPU_BASED_CR3_STORE_EXITING: CPU_BASED_CR3_LOAD_EXITING; if ( ctrl & mask ) nvcpu->nv_vmexit_pending = 1; } else if ( cr == 8 ) { mask = write? CPU_BASED_CR8_STORE_EXITING: CPU_BASED_CR8_LOAD_EXITING; if ( ctrl & mask ) nvcpu->nv_vmexit_pending = 1; } else /* CR0, CR4, CLTS, LMSW */ { /* * While getting the VM exit for CR0/CR4 access, check if L1 VMM owns * the bit. * If so, inject the VM exit to L1 VMM. * Otherwise, L0 will handle it and sync the value to L1 virtual VMCS. 
*/ unsigned long old_val, val, changed_bits; switch ( VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification) ) { case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR: { unsigned long gp = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification); unsigned long *reg; if ( (reg = decode_register(gp, guest_cpu_user_regs(), 0)) == NULL ) { gdprintk(XENLOG_ERR, "invalid gpr: %lx\n", gp); break; } val = *reg; if ( cr == 0 ) { u64 cr0_gh_mask = __get_vvmcs(nvcpu->nv_vvmcx, CR0_GUEST_HOST_MASK); __vmread(CR0_READ_SHADOW, &old_val); changed_bits = old_val ^ val; if ( changed_bits & cr0_gh_mask ) nvcpu->nv_vmexit_pending = 1; else { u64 guest_cr0 = __get_vvmcs(nvcpu->nv_vvmcx, GUEST_CR0); __set_vvmcs(nvcpu->nv_vvmcx, GUEST_CR0, (guest_cr0 & cr0_gh_mask) | (val & ~cr0_gh_mask)); } } else if ( cr == 4 ) { u64 cr4_gh_mask = __get_vvmcs(nvcpu->nv_vvmcx, CR4_GUEST_HOST_MASK); __vmread(CR4_READ_SHADOW, &old_val); changed_bits = old_val ^ val; if ( changed_bits & cr4_gh_mask ) nvcpu->nv_vmexit_pending = 1; else { u64 guest_cr4 = __get_vvmcs(nvcpu->nv_vvmcx, GUEST_CR4); __set_vvmcs(nvcpu->nv_vvmcx, GUEST_CR4, (guest_cr4 & cr4_gh_mask) | (val & ~cr4_gh_mask)); } } else nvcpu->nv_vmexit_pending = 1; break; } case VMX_CONTROL_REG_ACCESS_TYPE_CLTS: { u64 cr0_gh_mask = __get_vvmcs(nvcpu->nv_vvmcx, CR0_GUEST_HOST_MASK); if ( cr0_gh_mask & X86_CR0_TS ) nvcpu->nv_vmexit_pending = 1; else { u64 guest_cr0 = __get_vvmcs(nvcpu->nv_vvmcx, GUEST_CR0); __set_vvmcs(nvcpu->nv_vvmcx, GUEST_CR0, (guest_cr0 & ~X86_CR0_TS)); } break; } case VMX_CONTROL_REG_ACCESS_TYPE_LMSW: { u64 cr0_gh_mask = __get_vvmcs(nvcpu->nv_vvmcx, CR0_GUEST_HOST_MASK); __vmread(CR0_READ_SHADOW, &old_val); old_val &= 0xf; val = (exit_qualification >> 16) & 0xf; changed_bits = old_val ^ val; if ( changed_bits & cr0_gh_mask ) nvcpu->nv_vmexit_pending = 1; else { u64 guest_cr0 = __get_vvmcs(nvcpu->nv_vvmcx, GUEST_CR0); __set_vvmcs(nvcpu->nv_vvmcx, GUEST_CR0, (guest_cr0 & cr0_gh_mask) | (val & ~cr0_gh_mask)); } break; } default: break; } } break; } case EXIT_REASON_APIC_ACCESS: ctrl = __n2_secondary_exec_control(v); if ( ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES ) nvcpu->nv_vmexit_pending = 1; break; case EXIT_REASON_TPR_BELOW_THRESHOLD: ctrl = __n2_exec_control(v); if ( ctrl & CPU_BASED_TPR_SHADOW ) nvcpu->nv_vmexit_pending = 1; break; default: gdprintk(XENLOG_WARNING, "Unknown nested vmexit reason %x.\n", exit_reason); } return ( nvcpu->nv_vmexit_pending == 1 ); } void nvmx_set_cr_read_shadow(struct vcpu *v, unsigned int cr) { unsigned long cr_field, read_shadow_field, mask_field; switch ( cr ) { case 0: cr_field = GUEST_CR0; read_shadow_field = CR0_READ_SHADOW; mask_field = CR0_GUEST_HOST_MASK; break; case 4: cr_field = GUEST_CR4; read_shadow_field = CR4_READ_SHADOW; mask_field = CR4_GUEST_HOST_MASK; break; default: gdprintk(XENLOG_WARNING, "Set read shadow for CR%d.\n", cr); return; } if ( !nestedhvm_vmswitch_in_progress(v) ) { unsigned long virtual_cr_mask = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, mask_field); /* * We get here when L2 changed cr in a way that did not change * any of L1's shadowed bits (see nvmx_n2_vmexit_handler), * but did change L0 shadowed bits. So we first calculate the * effective cr value that L1 would like to write into the * hardware. It consists of the L2-owned bits from the new * value combined with the L1-owned bits from L1's guest cr. */ v->arch.hvm_vcpu.guest_cr[cr] &= ~virtual_cr_mask; v->arch.hvm_vcpu.guest_cr[cr] |= virtual_cr_mask & __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, cr_field); } /* nvcpu.guest_cr is what L2 write to cr actually. 
*/ __vmwrite(read_shadow_field, v->arch.hvm_vcpu.nvcpu.guest_cr[cr]); } xen-4.4.0/xen/arch/x86/hvm/vmx/vmx.c0000664000175000017500000027172712307313555015201 0ustar smbsmb/* * vmx.c: handling VMX architecture-related VM exits * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised }; static void vmx_ctxt_switch_from(struct vcpu *v); static void vmx_ctxt_switch_to(struct vcpu *v); static int vmx_alloc_vlapic_mapping(struct domain *d); static void vmx_free_vlapic_mapping(struct domain *d); static void vmx_install_vlapic_mapping(struct vcpu *v); static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr); static void vmx_update_guest_efer(struct vcpu *v); static void vmx_cpuid_intercept( unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); static void vmx_wbinvd_intercept(void); static void vmx_fpu_dirty_intercept(void); static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content); static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content); static void vmx_invlpg_intercept(unsigned long vaddr); uint8_t __read_mostly posted_intr_vector; static int vmx_domain_initialise(struct domain *d) { int rc; if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) return rc; return 0; } static void vmx_domain_destroy(struct domain *d) { vmx_free_vlapic_mapping(d); } static int vmx_vcpu_initialise(struct vcpu *v) { int rc; spin_lock_init(&v->arch.hvm_vmx.vmcs_lock); v->arch.schedule_tail = vmx_do_resume; v->arch.ctxt_switch_from = vmx_ctxt_switch_from; v->arch.ctxt_switch_to = vmx_ctxt_switch_to; if ( (rc = vmx_create_vmcs(v)) != 0 ) { dprintk(XENLOG_WARNING, "Failed to create VMCS for vcpu %d: err=%d.\n", v->vcpu_id, rc); return rc; } vpmu_initialise(v); vmx_install_vlapic_mapping(v); /* %eax == 1 signals full real-mode support to the guest loader. 
*/ if ( v->vcpu_id == 0 ) v->arch.user_regs.eax = 1; return 0; } static void vmx_vcpu_destroy(struct vcpu *v) { vmx_destroy_vmcs(v); vpmu_destroy(v); passive_domain_destroy(v); } static DEFINE_PER_CPU(struct vmx_msr_state, host_msr_state); static const u32 msr_index[] = { MSR_LSTAR, MSR_STAR, MSR_SYSCALL_MASK }; #define MSR_INDEX_SIZE (ARRAY_SIZE(msr_index)) void vmx_save_host_msrs(void) { struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state); int i; for ( i = 0; i < MSR_INDEX_SIZE; i++ ) rdmsrl(msr_index[i], host_msr_state->msrs[i]); } #define WRITE_MSR(address) \ guest_msr_state->msrs[VMX_INDEX_MSR_ ## address] = msr_content; \ set_bit(VMX_INDEX_MSR_ ## address, &guest_msr_state->flags); \ wrmsrl(MSR_ ## address, msr_content); \ set_bit(VMX_INDEX_MSR_ ## address, &host_msr_state->flags); \ break static enum handler_return long_mode_do_msr_read(unsigned int msr, uint64_t *msr_content) { struct vcpu *v = current; struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state; switch ( msr ) { case MSR_FS_BASE: __vmread(GUEST_FS_BASE, msr_content); break; case MSR_GS_BASE: __vmread(GUEST_GS_BASE, msr_content); break; case MSR_SHADOW_GS_BASE: rdmsrl(MSR_SHADOW_GS_BASE, *msr_content); break; case MSR_STAR: *msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_STAR]; break; case MSR_LSTAR: *msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_LSTAR]; break; case MSR_CSTAR: *msr_content = v->arch.hvm_vmx.cstar; break; case MSR_SYSCALL_MASK: *msr_content = guest_msr_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK]; break; default: return HNDL_unhandled; } HVM_DBG_LOG(DBG_LEVEL_0, "msr %#x content %#"PRIx64, msr, *msr_content); return HNDL_done; } static enum handler_return long_mode_do_msr_write(unsigned int msr, uint64_t msr_content) { struct vcpu *v = current; struct vmx_msr_state *guest_msr_state = &v->arch.hvm_vmx.msr_state; struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state); HVM_DBG_LOG(DBG_LEVEL_0, "msr %#x content %#"PRIx64, msr, msr_content); switch ( msr ) { case MSR_FS_BASE: case MSR_GS_BASE: case MSR_SHADOW_GS_BASE: if ( !is_canonical_address(msr_content) ) goto uncanonical_address; if ( msr == MSR_FS_BASE ) __vmwrite(GUEST_FS_BASE, msr_content); else if ( msr == MSR_GS_BASE ) __vmwrite(GUEST_GS_BASE, msr_content); else wrmsrl(MSR_SHADOW_GS_BASE, msr_content); break; case MSR_STAR: WRITE_MSR(STAR); case MSR_LSTAR: if ( !is_canonical_address(msr_content) ) goto uncanonical_address; WRITE_MSR(LSTAR); case MSR_CSTAR: if ( !is_canonical_address(msr_content) ) goto uncanonical_address; v->arch.hvm_vmx.cstar = msr_content; break; case MSR_SYSCALL_MASK: WRITE_MSR(SYSCALL_MASK); default: return HNDL_unhandled; } return HNDL_done; uncanonical_address: HVM_DBG_LOG(DBG_LEVEL_0, "Not cano address of msr write %x", msr); hvm_inject_hw_exception(TRAP_gp_fault, 0); return HNDL_exception_raised; } /* * To avoid MSR save/restore at every VM exit/entry time, we restore * the x86_64 specific MSRs at domain switch time. Since these MSRs * are not modified once set for para domains, we don't save them, * but simply reset them to values set in percpu_traps_init(). 
*/ static void vmx_restore_host_msrs(void) { struct vmx_msr_state *host_msr_state = &this_cpu(host_msr_state); int i; while ( host_msr_state->flags ) { i = find_first_set_bit(host_msr_state->flags); wrmsrl(msr_index[i], host_msr_state->msrs[i]); clear_bit(i, &host_msr_state->flags); } } static void vmx_save_guest_msrs(struct vcpu *v) { /* * We cannot cache SHADOW_GS_BASE while the VCPU runs, as it can * be updated at any time via SWAPGS, which we cannot trap. */ rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs); } static void vmx_restore_guest_msrs(struct vcpu *v) { struct vmx_msr_state *guest_msr_state, *host_msr_state; unsigned long guest_flags; int i; guest_msr_state = &v->arch.hvm_vmx.msr_state; host_msr_state = &this_cpu(host_msr_state); wrmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_vmx.shadow_gs); guest_flags = guest_msr_state->flags; while ( guest_flags ) { i = find_first_set_bit(guest_flags); HVM_DBG_LOG(DBG_LEVEL_2, "restore guest's index %d msr %x with value %lx", i, msr_index[i], guest_msr_state->msrs[i]); set_bit(i, &host_msr_state->flags); wrmsrl(msr_index[i], guest_msr_state->msrs[i]); clear_bit(i, &guest_flags); } if ( (v->arch.hvm_vcpu.guest_efer ^ read_efer()) & EFER_SCE ) { HVM_DBG_LOG(DBG_LEVEL_2, "restore guest's EFER with value %lx", v->arch.hvm_vcpu.guest_efer); write_efer((read_efer() & ~EFER_SCE) | (v->arch.hvm_vcpu.guest_efer & EFER_SCE)); } if ( cpu_has_rdtscp ) wrmsrl(MSR_TSC_AUX, hvm_msr_tsc_aux(v)); } void vmx_update_cpu_exec_control(struct vcpu *v) { if ( nestedhvm_vcpu_in_guestmode(v) ) nvmx_update_exec_control(v, v->arch.hvm_vmx.exec_control); else __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control); } void vmx_update_secondary_exec_control(struct vcpu *v) { if ( nestedhvm_vcpu_in_guestmode(v) ) nvmx_update_secondary_exec_control(v, v->arch.hvm_vmx.secondary_exec_control); else __vmwrite(SECONDARY_VM_EXEC_CONTROL, v->arch.hvm_vmx.secondary_exec_control); } void vmx_update_exception_bitmap(struct vcpu *v) { if ( nestedhvm_vcpu_in_guestmode(v) ) nvmx_update_exception_bitmap(v, v->arch.hvm_vmx.exception_bitmap); else __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap); } static int vmx_guest_x86_mode(struct vcpu *v) { unsigned long cs_ar_bytes; if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) ) return 0; if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) ) return 1; __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes); if ( hvm_long_mode_enabled(v) && likely(cs_ar_bytes & X86_SEG_AR_CS_LM_ACTIVE) ) return 8; return (likely(cs_ar_bytes & X86_SEG_AR_DEF_OP_SIZE) ? 4 : 2); } static void vmx_save_dr(struct vcpu *v) { if ( !v->arch.hvm_vcpu.flag_dr_dirty ) return; /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */ v->arch.hvm_vcpu.flag_dr_dirty = 0; v->arch.hvm_vmx.exec_control |= CPU_BASED_MOV_DR_EXITING; vmx_update_cpu_exec_control(v); v->arch.debugreg[0] = read_debugreg(0); v->arch.debugreg[1] = read_debugreg(1); v->arch.debugreg[2] = read_debugreg(2); v->arch.debugreg[3] = read_debugreg(3); v->arch.debugreg[6] = read_debugreg(6); /* DR7 must be saved as it is used by vmx_restore_dr(). */ __vmread(GUEST_DR7, &v->arch.debugreg[7]); } static void __restore_debug_registers(struct vcpu *v) { if ( v->arch.hvm_vcpu.flag_dr_dirty ) return; v->arch.hvm_vcpu.flag_dr_dirty = 1; write_debugreg(0, v->arch.debugreg[0]); write_debugreg(1, v->arch.debugreg[1]); write_debugreg(2, v->arch.debugreg[2]); write_debugreg(3, v->arch.debugreg[3]); write_debugreg(6, v->arch.debugreg[6]); /* DR7 is loaded from the VMCS. 
*/ } /* * DR7 is saved and restored on every vmexit. Other debug registers only * need to be restored if their value is going to affect execution -- i.e., * if one of the breakpoints is enabled. So mask out all bits that don't * enable some breakpoint functionality. */ static void vmx_restore_dr(struct vcpu *v) { /* NB. __vmread() is not usable here, so we cannot read from the VMCS. */ if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) ) __restore_debug_registers(v); } static void vmx_vmcs_save(struct vcpu *v, struct hvm_hw_cpu *c) { unsigned long ev; vmx_vmcs_enter(v); c->cr0 = v->arch.hvm_vcpu.guest_cr[0]; c->cr2 = v->arch.hvm_vcpu.guest_cr[2]; c->cr3 = v->arch.hvm_vcpu.guest_cr[3]; c->cr4 = v->arch.hvm_vcpu.guest_cr[4]; c->msr_efer = v->arch.hvm_vcpu.guest_efer; __vmread(GUEST_SYSENTER_CS, &c->sysenter_cs); __vmread(GUEST_SYSENTER_ESP, &c->sysenter_esp); __vmread(GUEST_SYSENTER_EIP, &c->sysenter_eip); c->pending_event = 0; c->error_code = 0; __vmread(VM_ENTRY_INTR_INFO, &ev); if ( (ev & INTR_INFO_VALID_MASK) && hvm_event_needs_reinjection((ev >> 8) & 7, ev & 0xff) ) { c->pending_event = ev; __vmread(VM_ENTRY_EXCEPTION_ERROR_CODE, &ev); c->error_code = ev; } vmx_vmcs_exit(v); } static int vmx_restore_cr0_cr3( struct vcpu *v, unsigned long cr0, unsigned long cr3) { struct page_info *page = NULL; if ( paging_mode_shadow(v->domain) ) { if ( cr0 & X86_CR0_PG ) { page = get_page_from_gfn(v->domain, cr3 >> PAGE_SHIFT, NULL, P2M_ALLOC); if ( !page ) { gdprintk(XENLOG_ERR, "Invalid CR3 value=%#lx\n", cr3); return -EINVAL; } } if ( hvm_paging_enabled(v) ) put_page(pagetable_get_page(v->arch.guest_table)); v->arch.guest_table = page ? pagetable_from_page(page) : pagetable_null(); } v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET; v->arch.hvm_vcpu.guest_cr[3] = cr3; return 0; } static int vmx_vmcs_restore(struct vcpu *v, struct hvm_hw_cpu *c) { int rc; if ( c->pending_valid && ((c->pending_type == 1) || (c->pending_type > 6) || (c->pending_reserved != 0)) ) { gdprintk(XENLOG_ERR, "Invalid pending event %#"PRIx32".\n", c->pending_event); return -EINVAL; } rc = vmx_restore_cr0_cr3(v, c->cr0, c->cr3); if ( rc ) return rc; vmx_vmcs_enter(v); v->arch.hvm_vcpu.guest_cr[2] = c->cr2; v->arch.hvm_vcpu.guest_cr[4] = c->cr4; vmx_update_guest_cr(v, 0); vmx_update_guest_cr(v, 2); vmx_update_guest_cr(v, 4); v->arch.hvm_vcpu.guest_efer = c->msr_efer; vmx_update_guest_efer(v); __vmwrite(GUEST_SYSENTER_CS, c->sysenter_cs); __vmwrite(GUEST_SYSENTER_ESP, c->sysenter_esp); __vmwrite(GUEST_SYSENTER_EIP, c->sysenter_eip); __vmwrite(GUEST_DR7, c->dr7); vmx_vmcs_exit(v); paging_update_paging_modes(v); if ( c->pending_valid ) { gdprintk(XENLOG_INFO, "Re-injecting %#"PRIx32", %#"PRIx32"\n", c->pending_event, c->error_code); if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) ) { vmx_vmcs_enter(v); __vmwrite(VM_ENTRY_INTR_INFO, c->pending_event); __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, c->error_code); vmx_vmcs_exit(v); } } return 0; } static void vmx_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) { struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state; unsigned long guest_flags = guest_state->flags; data->shadow_gs = v->arch.hvm_vmx.shadow_gs; data->msr_cstar = v->arch.hvm_vmx.cstar; /* save msrs */ data->msr_flags = guest_flags; data->msr_lstar = guest_state->msrs[VMX_INDEX_MSR_LSTAR]; data->msr_star = guest_state->msrs[VMX_INDEX_MSR_STAR]; data->msr_syscall_mask = guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK]; data->tsc = hvm_get_guest_tsc(v); } static void vmx_load_cpu_state(struct 
vcpu *v, struct hvm_hw_cpu *data) { struct vmx_msr_state *guest_state = &v->arch.hvm_vmx.msr_state; /* restore msrs */ guest_state->flags = data->msr_flags & 7; guest_state->msrs[VMX_INDEX_MSR_LSTAR] = data->msr_lstar; guest_state->msrs[VMX_INDEX_MSR_STAR] = data->msr_star; guest_state->msrs[VMX_INDEX_MSR_SYSCALL_MASK] = data->msr_syscall_mask; v->arch.hvm_vmx.cstar = data->msr_cstar; v->arch.hvm_vmx.shadow_gs = data->shadow_gs; hvm_set_guest_tsc(v, data->tsc); } static void vmx_save_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt) { vmx_save_cpu_state(v, ctxt); vmx_vmcs_save(v, ctxt); } static int vmx_load_vmcs_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt) { vmx_load_cpu_state(v, ctxt); if ( vmx_vmcs_restore(v, ctxt) ) { gdprintk(XENLOG_ERR, "vmx_vmcs restore failed!\n"); domain_crash(v->domain); return -EINVAL; } return 0; } static void vmx_fpu_enter(struct vcpu *v) { vcpu_restore_fpu_lazy(v); v->arch.hvm_vmx.exception_bitmap &= ~(1u << TRAP_no_device); vmx_update_exception_bitmap(v); v->arch.hvm_vmx.host_cr0 &= ~X86_CR0_TS; __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); } static void vmx_fpu_leave(struct vcpu *v) { ASSERT(!v->fpu_dirtied); ASSERT(read_cr0() & X86_CR0_TS); if ( !(v->arch.hvm_vmx.host_cr0 & X86_CR0_TS) ) { v->arch.hvm_vmx.host_cr0 |= X86_CR0_TS; __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); } /* * If the guest does not have TS enabled then we must cause and handle an * exception on first use of the FPU. If the guest *does* have TS enabled * then this is not necessary: no FPU activity can occur until the guest * clears CR0.TS, and we will initialise the FPU when that happens. */ if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) { v->arch.hvm_vcpu.hw_cr[0] |= X86_CR0_TS; __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]); v->arch.hvm_vmx.exception_bitmap |= (1u << TRAP_no_device); vmx_update_exception_bitmap(v); } } static void vmx_ctxt_switch_from(struct vcpu *v) { vmx_fpu_leave(v); vmx_save_guest_msrs(v); vmx_restore_host_msrs(); vmx_save_dr(v); } static void vmx_ctxt_switch_to(struct vcpu *v) { struct domain *d = v->domain; unsigned long old_cr4 = read_cr4(), new_cr4 = mmu_cr4_features; struct ept_data *ept_data = &p2m_get_hostp2m(d)->ept; /* HOST_CR4 in VMCS is always mmu_cr4_features. Sync CR4 now. */ if ( old_cr4 != new_cr4 ) write_cr4(new_cr4); if ( paging_mode_hap(d) ) { unsigned int cpu = smp_processor_id(); /* Test-and-test-and-set this CPU in the EPT-is-synced mask. */ if ( !cpumask_test_cpu(cpu, ept_get_synced_mask(ept_data)) && !cpumask_test_and_set_cpu(cpu, ept_get_synced_mask(ept_data)) ) __invept(INVEPT_SINGLE_CONTEXT, ept_get_eptp(ept_data), 0); } vmx_restore_guest_msrs(v); vmx_restore_dr(v); } /* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments. * The guest thinks it's got ring-0 segments, so we need to fudge * things. We store the ring-3 version in the VMCS to avoid lots of * shuffling on vmenter and vmexit, and translate in these accessors. 
*/ #define rm_cs_attr (((union segment_attributes) { \ .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0, \ .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes) #define rm_ds_attr (((union segment_attributes) { \ .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0, \ .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes) #define vm86_ds_attr (((union segment_attributes) { \ .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0, \ .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes) #define vm86_tr_attr (((union segment_attributes) { \ .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0, \ .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes) void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg, struct segment_register *reg) { unsigned long attr = 0, sel = 0, limit; /* * We may get here in the context of dump_execstate(), which may have * interrupted context switching between setting "current" and * vmx_do_resume() reaching the end of vmx_load_vmcs(). That would make * all the VMREADs below fail if we don't bail right away. */ if ( unlikely(!vmx_vmcs_try_enter(v)) ) { static bool_t warned; if ( !warned ) { warned = 1; printk(XENLOG_WARNING "Segment register inaccessible for d%dv%d\n" "(If you see this outside of debugging activity," " please report to xen-devel@lists.xenproject.org)\n", v->domain->domain_id, v->vcpu_id); } memset(reg, 0, sizeof(*reg)); return; } switch ( seg ) { case x86_seg_cs: __vmread(GUEST_CS_SELECTOR, &sel); __vmread(GUEST_CS_LIMIT, &limit); __vmread(GUEST_CS_BASE, ®->base); __vmread(GUEST_CS_AR_BYTES, &attr); break; case x86_seg_ds: __vmread(GUEST_DS_SELECTOR, &sel); __vmread(GUEST_DS_LIMIT, &limit); __vmread(GUEST_DS_BASE, ®->base); __vmread(GUEST_DS_AR_BYTES, &attr); break; case x86_seg_es: __vmread(GUEST_ES_SELECTOR, &sel); __vmread(GUEST_ES_LIMIT, &limit); __vmread(GUEST_ES_BASE, ®->base); __vmread(GUEST_ES_AR_BYTES, &attr); break; case x86_seg_fs: __vmread(GUEST_FS_SELECTOR, &sel); __vmread(GUEST_FS_LIMIT, &limit); __vmread(GUEST_FS_BASE, ®->base); __vmread(GUEST_FS_AR_BYTES, &attr); break; case x86_seg_gs: __vmread(GUEST_GS_SELECTOR, &sel); __vmread(GUEST_GS_LIMIT, &limit); __vmread(GUEST_GS_BASE, ®->base); __vmread(GUEST_GS_AR_BYTES, &attr); break; case x86_seg_ss: __vmread(GUEST_SS_SELECTOR, &sel); __vmread(GUEST_SS_LIMIT, &limit); __vmread(GUEST_SS_BASE, ®->base); __vmread(GUEST_SS_AR_BYTES, &attr); break; case x86_seg_tr: __vmread(GUEST_TR_SELECTOR, &sel); __vmread(GUEST_TR_LIMIT, &limit); __vmread(GUEST_TR_BASE, ®->base); __vmread(GUEST_TR_AR_BYTES, &attr); break; case x86_seg_gdtr: __vmread(GUEST_GDTR_LIMIT, &limit); __vmread(GUEST_GDTR_BASE, ®->base); break; case x86_seg_idtr: __vmread(GUEST_IDTR_LIMIT, &limit); __vmread(GUEST_IDTR_BASE, ®->base); break; case x86_seg_ldtr: __vmread(GUEST_LDTR_SELECTOR, &sel); __vmread(GUEST_LDTR_LIMIT, &limit); __vmread(GUEST_LDTR_BASE, ®->base); __vmread(GUEST_LDTR_AR_BYTES, &attr); break; default: BUG(); return; } vmx_vmcs_exit(v); reg->sel = sel; reg->limit = limit; reg->attr.bytes = (attr & 0xff) | ((attr >> 4) & 0xf00); /* Unusable flag is folded into Present flag. */ if ( attr & (1u<<16) ) reg->attr.fields.p = 0; /* Adjust for virtual 8086 mode */ if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) ) { struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg]; if ( seg == x86_seg_tr ) *reg = *sreg; else if ( reg->base != sreg->base || seg == x86_seg_ss ) { /* If the guest's reloaded the segment, remember the new version. 
* We can't tell if the guest reloaded the segment with another * one that has the same base. By default we assume it hasn't, * since we don't want to lose big-real-mode segment attributes, * but for SS we assume it has: the Ubuntu graphical bootloader * does this and gets badly confused if we leave the old SS in * place. */ reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr); *sreg = *reg; } else { /* Always give realmode guests a selector that matches the base * but keep the attr and limit from before */ *reg = *sreg; reg->sel = reg->base >> 4; } } } static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg, struct segment_register *reg) { uint32_t attr, sel, limit; uint64_t base; sel = reg->sel; attr = reg->attr.bytes; limit = reg->limit; base = reg->base; /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */ if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr ) { /* Remember the proper contents */ v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg; if ( seg == x86_seg_tr ) { if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] ) { sel = 0; attr = vm86_tr_attr; limit = 0xff; base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS]; v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg); } else v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg); } else { /* Try to fake it out as a 16bit data segment. This could * cause confusion for the guest if it reads the selector, * but otherwise we have to emulate if *any* segment hasn't * been reloaded. */ if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff && reg->attr.fields.p ) { sel = base >> 4; attr = vm86_ds_attr; limit = 0xffff; v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg); } else v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg); } } attr = ((attr & 0xf00) << 4) | (attr & 0xff); /* Not-present must mean unusable. */ if ( !reg->attr.fields.p ) attr |= (1u << 16); /* VMX has strict consistency requirement for flag G. */ attr |= !!(limit >> 20) << 15; vmx_vmcs_enter(v); switch ( seg ) { case x86_seg_cs: __vmwrite(GUEST_CS_SELECTOR, sel); __vmwrite(GUEST_CS_LIMIT, limit); __vmwrite(GUEST_CS_BASE, base); __vmwrite(GUEST_CS_AR_BYTES, attr); break; case x86_seg_ds: __vmwrite(GUEST_DS_SELECTOR, sel); __vmwrite(GUEST_DS_LIMIT, limit); __vmwrite(GUEST_DS_BASE, base); __vmwrite(GUEST_DS_AR_BYTES, attr); break; case x86_seg_es: __vmwrite(GUEST_ES_SELECTOR, sel); __vmwrite(GUEST_ES_LIMIT, limit); __vmwrite(GUEST_ES_BASE, base); __vmwrite(GUEST_ES_AR_BYTES, attr); break; case x86_seg_fs: __vmwrite(GUEST_FS_SELECTOR, sel); __vmwrite(GUEST_FS_LIMIT, limit); __vmwrite(GUEST_FS_BASE, base); __vmwrite(GUEST_FS_AR_BYTES, attr); break; case x86_seg_gs: __vmwrite(GUEST_GS_SELECTOR, sel); __vmwrite(GUEST_GS_LIMIT, limit); __vmwrite(GUEST_GS_BASE, base); __vmwrite(GUEST_GS_AR_BYTES, attr); break; case x86_seg_ss: __vmwrite(GUEST_SS_SELECTOR, sel); __vmwrite(GUEST_SS_LIMIT, limit); __vmwrite(GUEST_SS_BASE, base); __vmwrite(GUEST_SS_AR_BYTES, attr); break; case x86_seg_tr: __vmwrite(GUEST_TR_SELECTOR, sel); __vmwrite(GUEST_TR_LIMIT, limit); __vmwrite(GUEST_TR_BASE, base); /* VMX checks that the the busy flag (bit 1) is set. 
*/ __vmwrite(GUEST_TR_AR_BYTES, attr | 2); break; case x86_seg_gdtr: __vmwrite(GUEST_GDTR_LIMIT, limit); __vmwrite(GUEST_GDTR_BASE, base); break; case x86_seg_idtr: __vmwrite(GUEST_IDTR_LIMIT, limit); __vmwrite(GUEST_IDTR_BASE, base); break; case x86_seg_ldtr: __vmwrite(GUEST_LDTR_SELECTOR, sel); __vmwrite(GUEST_LDTR_LIMIT, limit); __vmwrite(GUEST_LDTR_BASE, base); __vmwrite(GUEST_LDTR_AR_BYTES, attr); break; default: BUG(); } vmx_vmcs_exit(v); } static unsigned long vmx_get_shadow_gs_base(struct vcpu *v) { return v->arch.hvm_vmx.shadow_gs; } static int vmx_set_guest_pat(struct vcpu *v, u64 gpat) { if ( !paging_mode_hap(v->domain) || unlikely(v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ) return 0; vmx_vmcs_enter(v); __vmwrite(GUEST_PAT, gpat); vmx_vmcs_exit(v); return 1; } static int vmx_get_guest_pat(struct vcpu *v, u64 *gpat) { if ( !paging_mode_hap(v->domain) || unlikely(v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ) return 0; vmx_vmcs_enter(v); __vmread(GUEST_PAT, gpat); vmx_vmcs_exit(v); return 1; } static void vmx_handle_cd(struct vcpu *v, unsigned long value) { if ( !paging_mode_hap(v->domain) ) { /* * For shadow, 'load IA32_PAT' VM-entry control is 0, so it cannot * set guest memory type as UC via IA32_PAT. Xen drop all shadows * so that any new ones would be created on demand. */ hvm_shadow_handle_cd(v, value); } else { u64 *pat = &v->arch.hvm_vcpu.pat_cr; if ( value & X86_CR0_CD ) { /* * For EPT, set guest IA32_PAT fields as UC so that guest * memory type are all UC. */ u64 uc_pat = ((uint64_t)PAT_TYPE_UNCACHABLE) | /* PAT0 */ ((uint64_t)PAT_TYPE_UNCACHABLE << 8) | /* PAT1 */ ((uint64_t)PAT_TYPE_UNCACHABLE << 16) | /* PAT2 */ ((uint64_t)PAT_TYPE_UNCACHABLE << 24) | /* PAT3 */ ((uint64_t)PAT_TYPE_UNCACHABLE << 32) | /* PAT4 */ ((uint64_t)PAT_TYPE_UNCACHABLE << 40) | /* PAT5 */ ((uint64_t)PAT_TYPE_UNCACHABLE << 48) | /* PAT6 */ ((uint64_t)PAT_TYPE_UNCACHABLE << 56); /* PAT7 */ vmx_get_guest_pat(v, pat); vmx_set_guest_pat(v, uc_pat); wbinvd(); /* flush possibly polluted cache */ hvm_asid_flush_vcpu(v); /* invalidate memory type cached in TLB */ v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE; } else { v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE; vmx_set_guest_pat(v, *pat); hvm_asid_flush_vcpu(v); /* no need to flush cache */ } } } static void vmx_set_tsc_offset(struct vcpu *v, u64 offset) { vmx_vmcs_enter(v); if ( nestedhvm_vcpu_in_guestmode(v) ) offset += nvmx_get_tsc_offset(v); __vmwrite(TSC_OFFSET, offset); vmx_vmcs_exit(v); } static void vmx_set_rdtsc_exiting(struct vcpu *v, bool_t enable) { vmx_vmcs_enter(v); v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING; if ( enable ) v->arch.hvm_vmx.exec_control |= CPU_BASED_RDTSC_EXITING; vmx_update_cpu_exec_control(v); vmx_vmcs_exit(v); } static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page) { char *p; int i; for ( i = 0; i < (PAGE_SIZE / 32); i++ ) { if ( i == __HYPERVISOR_iret ) continue; p = (char *)(hypercall_page + (i * 32)); *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */ *(u32 *)(p + 1) = i; *(u8 *)(p + 5) = 0x0f; /* vmcall */ *(u8 *)(p + 6) = 0x01; *(u8 *)(p + 7) = 0xc1; *(u8 *)(p + 8) = 0xc3; /* ret */ } /* Don't support HYPERVISOR_iret at the moment */ *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */ } static unsigned int vmx_get_interrupt_shadow(struct vcpu *v) { unsigned long intr_shadow; __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow); return intr_shadow; } static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow) { 
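/* Propagate the emulator's view of the interrupt shadow (STI/MOV-SS blocking) straight into the VMCS interruptibility-state field. */ 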
__vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow); } static void vmx_load_pdptrs(struct vcpu *v) { unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3]; uint64_t *guest_pdptrs; struct page_info *page; p2m_type_t p2mt; char *p; /* EPT needs to load PDPTRS into VMCS for PAE. */ if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) ) return; if ( (cr3 & 0x1fUL) && !hvm_pcid_enabled(v) ) goto crash; page = get_page_from_gfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt, P2M_UNSHARE); if ( !page ) { /* Ideally you don't want to crash but rather go into a wait * queue, but this is the wrong place. We're holding at least * the paging lock */ gdprintk(XENLOG_ERR, "Bad cr3 on load pdptrs gfn %lx type %d\n", cr3 >> PAGE_SHIFT, (int) p2mt); goto crash; } p = __map_domain_page(page); guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK)); /* * We do not check the PDPTRs for validity. The CPU will do this during * vm entry, and we can handle the failure there and crash the guest. * The only thing we could do better here is #GP instead. */ vmx_vmcs_enter(v); __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]); __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]); __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]); __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]); vmx_vmcs_exit(v); unmap_domain_page(p); put_page(page); return; crash: domain_crash(v->domain); } static void vmx_update_host_cr3(struct vcpu *v) { vmx_vmcs_enter(v); __vmwrite(HOST_CR3, v->arch.cr3); vmx_vmcs_exit(v); } void vmx_update_debug_state(struct vcpu *v) { unsigned long mask; mask = 1u << TRAP_int3; if ( !cpu_has_monitor_trap_flag ) mask |= 1u << TRAP_debug; if ( v->arch.hvm_vcpu.debug_state_latch ) v->arch.hvm_vmx.exception_bitmap |= mask; else v->arch.hvm_vmx.exception_bitmap &= ~mask; vmx_vmcs_enter(v); vmx_update_exception_bitmap(v); vmx_vmcs_exit(v); } static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr) { vmx_vmcs_enter(v); switch ( cr ) { case 0: { int realmode; unsigned long hw_cr0_mask = X86_CR0_NE; if ( !vmx_unrestricted_guest(v) ) hw_cr0_mask |= X86_CR0_PG | X86_CR0_PE; if ( paging_mode_shadow(v->domain) ) hw_cr0_mask |= X86_CR0_WP; if ( paging_mode_hap(v->domain) ) { /* Manage GUEST_CR3 when CR0.PE=0. */ uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); v->arch.hvm_vmx.exec_control &= ~cr3_ctls; if ( !hvm_paging_enabled(v) && !vmx_unrestricted_guest(v) ) v->arch.hvm_vmx.exec_control |= cr3_ctls; /* Trap CR3 updates if CR3 memory events are enabled. */ if ( v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_CR3] ) v->arch.hvm_vmx.exec_control |= CPU_BASED_CR3_LOAD_EXITING; vmx_update_cpu_exec_control(v); } if ( !nestedhvm_vcpu_in_guestmode(v) ) __vmwrite(CR0_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[0]); else nvmx_set_cr_read_shadow(v, 0); if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) { if ( v != current ) hw_cr0_mask |= X86_CR0_TS; else if ( v->arch.hvm_vcpu.hw_cr[0] & X86_CR0_TS ) vmx_fpu_enter(v); } realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE); if ( (!vmx_unrestricted_guest(v)) && (realmode != v->arch.hvm_vmx.vmx_realmode) ) { enum x86_segment s; struct segment_register reg[x86_seg_tr + 1]; /* Entering or leaving real mode: adjust the segment registers. * Need to read them all either way, as realmode reads can update * the saved values we'll use when returning to prot mode. 
*/ for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ ) vmx_get_segment_register(v, s, ®[s]); v->arch.hvm_vmx.vmx_realmode = realmode; if ( realmode ) { for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ ) vmx_set_segment_register(v, s, ®[s]); v->arch.hvm_vmx.exception_bitmap = 0xffffffff; vmx_update_exception_bitmap(v); } else { for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ ) if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<arch.hvm_vmx.vm86_saved_seg[s]); v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK | (paging_mode_hap(v->domain) ? 0 : (1U << TRAP_page_fault)) | (1U << TRAP_no_device); vmx_update_exception_bitmap(v); vmx_update_debug_state(v); } } v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask; __vmwrite(GUEST_CR0, v->arch.hvm_vcpu.hw_cr[0]); /* Changing CR0 can change some bits in real CR4. */ vmx_update_guest_cr(v, 4); break; } case 2: /* CR2 is updated in exit stub. */ break; case 3: if ( paging_mode_hap(v->domain) ) { if ( !hvm_paging_enabled(v) && !vmx_unrestricted_guest(v) ) v->arch.hvm_vcpu.hw_cr[3] = v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT]; vmx_load_pdptrs(v); } __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]); hvm_asid_flush_vcpu(v); break; case 4: v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK; if ( paging_mode_hap(v->domain) ) v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE; if ( !nestedhvm_vcpu_in_guestmode(v) ) __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]); else nvmx_set_cr_read_shadow(v, 4); v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4]; if ( v->arch.hvm_vmx.vmx_realmode ) v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME; if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) ) { v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE; v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE; } if ( !hvm_paging_enabled(v) ) { /* * SMEP is disabled if CPU is in non-paging mode in hardware. * However Xen always uses paging mode to emulate guest non-paging * mode. To emulate this behavior, SMEP needs to be manually * disabled when guest VCPU is in non-paging mode. */ v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_SMEP; } __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]); break; default: BUG(); } vmx_vmcs_exit(v); } static void vmx_update_guest_efer(struct vcpu *v) { unsigned long vm_entry_value; vmx_vmcs_enter(v); __vmread(VM_ENTRY_CONTROLS, &vm_entry_value); if ( v->arch.hvm_vcpu.guest_efer & EFER_LMA ) vm_entry_value |= VM_ENTRY_IA32E_MODE; else vm_entry_value &= ~VM_ENTRY_IA32E_MODE; __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value); vmx_vmcs_exit(v); if ( v == current ) write_efer((read_efer() & ~EFER_SCE) | (v->arch.hvm_vcpu.guest_efer & EFER_SCE)); } void nvmx_enqueue_n2_exceptions(struct vcpu *v, unsigned long intr_fields, int error_code, uint8_t source) { struct nestedvmx *nvmx = &vcpu_2_nvmx(v); if ( !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) ) { /* enqueue the exception till the VMCS switch back to L1 */ nvmx->intr.intr_info = intr_fields; nvmx->intr.error_code = error_code; nvmx->intr.source = source; vcpu_nestedhvm(v).nv_vmexit_pending = 1; return; } else gdprintk(XENLOG_ERR, "Double Fault on Nested Guest: exception %lx %x" "on %lx %x\n", intr_fields, error_code, nvmx->intr.intr_info, nvmx->intr.error_code); } static int nvmx_vmexit_trap(struct vcpu *v, struct hvm_trap *trap) { nvmx_enqueue_n2_exceptions(v, trap->vector, trap->error_code, hvm_intsrc_none); return NESTEDHVM_VMEXIT_DONE; } static void __vmx_inject_exception(int trap, int type, int error_code) { unsigned long intr_fields; struct vcpu *curr = current; /* * NB. 
Callers do not need to worry about clearing STI/MOV-SS blocking: * "If the VM entry is injecting, there is no blocking by STI or by * MOV SS following the VM entry, regardless of the contents of the * interruptibility-state field [in the guest-state area before the * VM entry]", PRM Vol. 3, 22.6.1 (Interruptibility State). */ intr_fields = (INTR_INFO_VALID_MASK | (type<<8) | trap); if ( error_code != HVM_DELIVER_NO_ERROR_CODE ) { __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); intr_fields |= INTR_INFO_DELIVER_CODE_MASK; } __vmwrite(VM_ENTRY_INTR_INFO, intr_fields); /* Can't inject exceptions in virtual 8086 mode because they would * use the protected-mode IDT. Emulate at the next vmenter instead. */ if ( curr->arch.hvm_vmx.vmx_realmode ) curr->arch.hvm_vmx.vmx_emulate = 1; } void vmx_inject_extint(int trap, uint8_t source) { struct vcpu *v = current; u32 pin_based_cntrl; if ( nestedhvm_vcpu_in_guestmode(v) ) { pin_based_cntrl = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, PIN_BASED_VM_EXEC_CONTROL); if ( pin_based_cntrl & PIN_BASED_EXT_INTR_MASK ) { nvmx_enqueue_n2_exceptions (v, INTR_INFO_VALID_MASK | (X86_EVENTTYPE_EXT_INTR<<8) | trap, HVM_DELIVER_NO_ERROR_CODE, source); return; } } __vmx_inject_exception(trap, X86_EVENTTYPE_EXT_INTR, HVM_DELIVER_NO_ERROR_CODE); } void vmx_inject_nmi(void) { struct vcpu *v = current; u32 pin_based_cntrl; if ( nestedhvm_vcpu_in_guestmode(v) ) { pin_based_cntrl = __get_vvmcs(vcpu_nestedhvm(v).nv_vvmcx, PIN_BASED_VM_EXEC_CONTROL); if ( pin_based_cntrl & PIN_BASED_NMI_EXITING ) { nvmx_enqueue_n2_exceptions (v, INTR_INFO_VALID_MASK | (X86_EVENTTYPE_NMI<<8) | TRAP_nmi, HVM_DELIVER_NO_ERROR_CODE, hvm_intsrc_nmi); return; } } __vmx_inject_exception(2, X86_EVENTTYPE_NMI, HVM_DELIVER_NO_ERROR_CODE); } /* * Generate a virtual event in the guest. * NOTES: * - INT 3 (CC) and INTO (CE) are X86_EVENTTYPE_SW_EXCEPTION; * - INT nn (CD nn) is X86_EVENTTYPE_SW_INTERRUPT; * - #DB is X86_EVENTTYPE_HW_EXCEPTION, except when generated by * opcode 0xf1 (which is X86_EVENTTYPE_PRI_SW_EXCEPTION) */ static void vmx_inject_trap(struct hvm_trap *trap) { unsigned long intr_info; struct vcpu *curr = current; struct hvm_trap _trap = *trap; if ( (_trap.vector == TRAP_page_fault) && (_trap.type == X86_EVENTTYPE_HW_EXCEPTION) ) current->arch.hvm_vcpu.guest_cr[2] = _trap.cr2; if ( nestedhvm_vcpu_in_guestmode(curr) ) intr_info = vcpu_2_nvmx(curr).intr.intr_info; else __vmread(VM_ENTRY_INTR_INFO, &intr_info); switch ( _trap.vector ) { case TRAP_debug: if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF ) { __restore_debug_registers(curr); write_debugreg(6, read_debugreg(6) | 0x4000); } if ( cpu_has_monitor_trap_flag ) break; /* fall through */ case TRAP_int3: if ( curr->domain->debugger_attached ) { /* Debug/Int3: Trap to debugger. 
*/ domain_pause_for_debugger(); return; } } if ( unlikely(intr_info & INTR_INFO_VALID_MASK) && (((intr_info >> 8) & 7) == X86_EVENTTYPE_HW_EXCEPTION) ) { _trap.vector = hvm_combine_hw_exceptions( (uint8_t)intr_info, _trap.vector); if ( _trap.vector == TRAP_double_fault ) _trap.error_code = 0; } if ( _trap.type >= X86_EVENTTYPE_SW_INTERRUPT ) __vmwrite(VM_ENTRY_INSTRUCTION_LEN, _trap.insn_len); if ( nestedhvm_vcpu_in_guestmode(curr) && nvmx_intercepts_exception(curr, _trap.vector, _trap.error_code) ) { nvmx_enqueue_n2_exceptions (curr, INTR_INFO_VALID_MASK | (_trap.type<<8) | _trap.vector, _trap.error_code, hvm_intsrc_none); return; } else __vmx_inject_exception(_trap.vector, _trap.type, _trap.error_code); if ( (_trap.vector == TRAP_page_fault) && (_trap.type == X86_EVENTTYPE_HW_EXCEPTION) ) HVMTRACE_LONG_2D(PF_INJECT, _trap.error_code, TRC_PAR_LONG(current->arch.hvm_vcpu.guest_cr[2])); else HVMTRACE_2D(INJ_EXC, _trap.vector, _trap.error_code); } static int vmx_event_pending(struct vcpu *v) { unsigned long intr_info; ASSERT(v == current); __vmread(VM_ENTRY_INTR_INFO, &intr_info); return intr_info & INTR_INFO_VALID_MASK; } static void vmx_set_info_guest(struct vcpu *v) { unsigned long intr_shadow; vmx_vmcs_enter(v); __vmwrite(GUEST_DR7, v->arch.debugreg[7]); /* * If the interruptibility-state field indicates blocking by STI, * setting the TF flag in the EFLAGS may cause VM entry to fail * and crash the guest. See SDM 3B 22.3.1.5. * Resetting the VMX_INTR_SHADOW_STI flag looks hackish but * to set the GUEST_PENDING_DBG_EXCEPTIONS.BS here incurs * immediately vmexit and hence make no progress. */ __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow); if ( v->domain->debugger_attached && (v->arch.user_regs.eflags & X86_EFLAGS_TF) && (intr_shadow & VMX_INTR_SHADOW_STI) ) { intr_shadow &= ~VMX_INTR_SHADOW_STI; __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow); } vmx_vmcs_exit(v); } static void vmx_update_eoi_exit_bitmap(struct vcpu *v, u8 vector, u8 trig) { if ( trig ) vmx_set_eoi_exit_bitmap(v, vector); else vmx_clear_eoi_exit_bitmap(v, vector); } static int vmx_virtual_intr_delivery_enabled(void) { return cpu_has_vmx_virtual_intr_delivery; } static void vmx_process_isr(int isr, struct vcpu *v) { unsigned long status; u8 old; if ( isr < 0 ) isr = 0; vmx_vmcs_enter(v); __vmread(GUEST_INTR_STATUS, &status); old = status >> VMX_GUEST_INTR_STATUS_SVI_OFFSET; if ( isr != old ) { status &= VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK; status |= isr << VMX_GUEST_INTR_STATUS_SVI_OFFSET; __vmwrite(GUEST_INTR_STATUS, status); } vmx_vmcs_exit(v); } static void __vmx_deliver_posted_interrupt(struct vcpu *v) { bool_t running = v->is_running; vcpu_unblock(v); if ( running && (in_irq() || (v != current)) ) { unsigned int cpu = v->processor; if ( !test_and_set_bit(VCPU_KICK_SOFTIRQ, &softirq_pending(cpu)) && (cpu != smp_processor_id()) ) send_IPI_mask(cpumask_of(cpu), posted_intr_vector); } } static void vmx_deliver_posted_intr(struct vcpu *v, u8 vector) { if ( pi_test_and_set_pir(vector, &v->arch.hvm_vmx.pi_desc) ) return; if ( unlikely(v->arch.hvm_vmx.eoi_exitmap_changed) ) { /* * If EOI exitbitmap needs to changed or notification vector * can't be allocated, interrupt will not be injected till * VMEntry as it used to be. 
*/ pi_set_on(&v->arch.hvm_vmx.pi_desc); } else if ( !pi_test_and_set_on(&v->arch.hvm_vmx.pi_desc) ) { __vmx_deliver_posted_interrupt(v); return; } vcpu_kick(v); } static void vmx_sync_pir_to_irr(struct vcpu *v) { struct vlapic *vlapic = vcpu_vlapic(v); unsigned int group, i; DECLARE_BITMAP(pending_intr, NR_VECTORS); if ( !pi_test_and_clear_on(&v->arch.hvm_vmx.pi_desc) ) return; for ( group = 0; group < ARRAY_SIZE(pending_intr); group++ ) pending_intr[group] = pi_get_pir(&v->arch.hvm_vmx.pi_desc, group); for_each_set_bit(i, pending_intr, NR_VECTORS) vlapic_set_vector(i, &vlapic->regs->data[APIC_IRR]); } static void vmx_handle_eoi(u8 vector) { unsigned long status; /* We need to clear the SVI field. */ __vmread(GUEST_INTR_STATUS, &status); status &= VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK; __vmwrite(GUEST_INTR_STATUS, status); } static struct hvm_function_table __initdata vmx_function_table = { .name = "VMX", .cpu_up_prepare = vmx_cpu_up_prepare, .cpu_dead = vmx_cpu_dead, .domain_initialise = vmx_domain_initialise, .domain_destroy = vmx_domain_destroy, .vcpu_initialise = vmx_vcpu_initialise, .vcpu_destroy = vmx_vcpu_destroy, .save_cpu_ctxt = vmx_save_vmcs_ctxt, .load_cpu_ctxt = vmx_load_vmcs_ctxt, .get_interrupt_shadow = vmx_get_interrupt_shadow, .set_interrupt_shadow = vmx_set_interrupt_shadow, .guest_x86_mode = vmx_guest_x86_mode, .get_segment_register = vmx_get_segment_register, .set_segment_register = vmx_set_segment_register, .get_shadow_gs_base = vmx_get_shadow_gs_base, .update_host_cr3 = vmx_update_host_cr3, .update_guest_cr = vmx_update_guest_cr, .update_guest_efer = vmx_update_guest_efer, .set_guest_pat = vmx_set_guest_pat, .get_guest_pat = vmx_get_guest_pat, .set_tsc_offset = vmx_set_tsc_offset, .inject_trap = vmx_inject_trap, .init_hypercall_page = vmx_init_hypercall_page, .event_pending = vmx_event_pending, .cpu_up = vmx_cpu_up, .cpu_down = vmx_cpu_down, .cpuid_intercept = vmx_cpuid_intercept, .wbinvd_intercept = vmx_wbinvd_intercept, .fpu_dirty_intercept = vmx_fpu_dirty_intercept, .msr_read_intercept = vmx_msr_read_intercept, .msr_write_intercept = vmx_msr_write_intercept, .invlpg_intercept = vmx_invlpg_intercept, .handle_cd = vmx_handle_cd, .set_info_guest = vmx_set_info_guest, .set_rdtsc_exiting = vmx_set_rdtsc_exiting, .nhvm_vcpu_initialise = nvmx_vcpu_initialise, .nhvm_vcpu_destroy = nvmx_vcpu_destroy, .nhvm_vcpu_reset = nvmx_vcpu_reset, .nhvm_vcpu_guestcr3 = nvmx_vcpu_guestcr3, .nhvm_vcpu_p2m_base = nvmx_vcpu_eptp_base, .nhvm_vcpu_asid = nvmx_vcpu_asid, .nhvm_vmcx_hap_enabled = nvmx_ept_enabled, .nhvm_vmcx_guest_intercepts_trap = nvmx_intercepts_exception, .nhvm_vcpu_vmexit_trap = nvmx_vmexit_trap, .nhvm_intr_blocked = nvmx_intr_blocked, .nhvm_domain_relinquish_resources = nvmx_domain_relinquish_resources, .update_eoi_exit_bitmap = vmx_update_eoi_exit_bitmap, .virtual_intr_delivery_enabled = vmx_virtual_intr_delivery_enabled, .process_isr = vmx_process_isr, .deliver_posted_intr = vmx_deliver_posted_intr, .sync_pir_to_irr = vmx_sync_pir_to_irr, .handle_eoi = vmx_handle_eoi, .nhvm_hap_walk_L1_p2m = nvmx_hap_walk_L1_p2m, }; const struct hvm_function_table * __init start_vmx(void) { set_in_cr4(X86_CR4_VMXE); if ( vmx_cpu_up() ) { printk("VMX: failed to initialise.\n"); return NULL; } /* * Do not enable EPT when (!cpu_has_vmx_pat), to prevent security hole * (refer to http://xenbits.xen.org/xsa/advisory-60.html). 
*/ if ( cpu_has_vmx_ept && cpu_has_vmx_pat ) { vmx_function_table.hap_supported = 1; vmx_function_table.hap_capabilities = 0; if ( cpu_has_vmx_ept_2mb ) vmx_function_table.hap_capabilities |= HVM_HAP_SUPERPAGE_2MB; if ( cpu_has_vmx_ept_1gb ) vmx_function_table.hap_capabilities |= HVM_HAP_SUPERPAGE_1GB; setup_ept_dump(); } if ( !cpu_has_vmx_virtual_intr_delivery ) { vmx_function_table.update_eoi_exit_bitmap = NULL; vmx_function_table.process_isr = NULL; vmx_function_table.handle_eoi = NULL; } if ( cpu_has_vmx_posted_intr_processing ) alloc_direct_apic_vector(&posted_intr_vector, event_check_interrupt); else { vmx_function_table.deliver_posted_intr = NULL; vmx_function_table.sync_pir_to_irr = NULL; } if ( cpu_has_vmx_ept && cpu_has_vmx_pat && cpu_has_vmx_msr_bitmap && cpu_has_vmx_secondary_exec_control ) vmx_function_table.pvh_supported = 1; setup_vmcs_dump(); return &vmx_function_table; } /* * Not all cases receive valid value in the VM-exit instruction length field. * Callers must know what they're doing! */ static int get_instruction_length(void) { unsigned long len; __vmread(VM_EXIT_INSTRUCTION_LEN, &len); /* Safe: callers audited */ BUG_ON((len < 1) || (len > 15)); return len; } void update_guest_eip(void) { struct cpu_user_regs *regs = guest_cpu_user_regs(); unsigned long x; regs->eip += get_instruction_length(); /* Safe: callers audited */ regs->eflags &= ~X86_EFLAGS_RF; __vmread(GUEST_INTERRUPTIBILITY_INFO, &x); if ( x & (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) ) { x &= ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS); __vmwrite(GUEST_INTERRUPTIBILITY_INFO, x); } if ( regs->eflags & X86_EFLAGS_TF ) hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE); } static void vmx_fpu_dirty_intercept(void) { struct vcpu *curr = current; vmx_fpu_enter(curr); /* Disable TS in guest CR0 unless the guest wants the exception too. */ if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) { curr->arch.hvm_vcpu.hw_cr[0] &= ~X86_CR0_TS; __vmwrite(GUEST_CR0, curr->arch.hvm_vcpu.hw_cr[0]); } } static void vmx_cpuid_intercept( unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { unsigned int input = *eax; struct segment_register cs; struct vcpu *v = current; hvm_cpuid(input, eax, ebx, ecx, edx); switch ( input ) { case 0x80000001: /* SYSCALL is visible iff running in long mode. 
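/*
 * Illustrative sketch (not Xen code): the leaf 0x80000001 fixup just below
 * hides or exposes the SYSCALL/SYSRET feature bit in EDX depending on
 * whether the vCPU's CS descriptor has the L (64-bit) attribute set.
 * SYSCALL_EDX_BIT and adjust_extended_leaf() are hypothetical names; the
 * bit position (11) follows the architectural CPUID.80000001H:EDX layout.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SYSCALL_EDX_BIT (1u << 11)   /* CPUID.80000001H:EDX.SYSCALL */

static uint32_t adjust_extended_leaf(uint32_t edx, bool cs_long_mode)
{
    if ( cs_long_mode )
        edx |= SYSCALL_EDX_BIT;      /* guest can execute SYSCALL */
    else
        edx &= ~SYSCALL_EDX_BIT;     /* hide it outside 64-bit mode */
    return edx;
}

int main(void)
{
    printf("compat mode: %#x\n", adjust_extended_leaf(0x28100800, false));
    printf("long mode:   %#x\n", adjust_extended_leaf(0x28100800, true));
    return 0;
}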
*/ vmx_get_segment_register(v, x86_seg_cs, &cs); if ( cs.attr.fields.l ) *edx |= cpufeat_mask(X86_FEATURE_SYSCALL); else *edx &= ~(cpufeat_mask(X86_FEATURE_SYSCALL)); break; } vpmu_do_cpuid(input, eax, ebx, ecx, edx); HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx); } static void vmx_do_cpuid(struct cpu_user_regs *regs) { unsigned int eax, ebx, ecx, edx; eax = regs->eax; ebx = regs->ebx; ecx = regs->ecx; edx = regs->edx; vmx_cpuid_intercept(&eax, &ebx, &ecx, &edx); regs->eax = eax; regs->ebx = ebx; regs->ecx = ecx; regs->edx = edx; } static void vmx_dr_access(unsigned long exit_qualification, struct cpu_user_regs *regs) { struct vcpu *v = current; HVMTRACE_0D(DR_WRITE); if ( !v->arch.hvm_vcpu.flag_dr_dirty ) __restore_debug_registers(v); /* Allow guest direct access to DR registers */ v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MOV_DR_EXITING; vmx_update_cpu_exec_control(v); } static void vmx_invlpg_intercept(unsigned long vaddr) { struct vcpu *curr = current; HVMTRACE_LONG_2D(INVLPG, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr)); if ( paging_invlpg(curr, vaddr) && cpu_has_vmx_vpid ) vpid_sync_vcpu_gva(curr, vaddr); } static int vmx_cr_access(unsigned long exit_qualification) { struct vcpu *curr = current; switch ( VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification) ) { case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR: { unsigned long gp = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification); unsigned long cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification); return hvm_mov_to_cr(cr, gp); } case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR: { unsigned long gp = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification); unsigned long cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification); return hvm_mov_from_cr(cr, gp); } case VMX_CONTROL_REG_ACCESS_TYPE_CLTS: { unsigned long old = curr->arch.hvm_vcpu.guest_cr[0]; curr->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS; vmx_update_guest_cr(curr, 0); hvm_memory_event_cr0(curr->arch.hvm_vcpu.guest_cr[0], old); HVMTRACE_0D(CLTS); break; } case VMX_CONTROL_REG_ACCESS_TYPE_LMSW: { unsigned long value = curr->arch.hvm_vcpu.guest_cr[0]; /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. 
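/*
 * Illustrative sketch (not Xen code): the LMSW handling right below merges
 * the low 4 bits of the LMSW source operand (bits 19:16 of the exit
 * qualification) into CR0 while keeping bit 0 (PE) sticky -- LMSW may set
 * PE but can never clear it, and it can set or clear MP/EM/TS (bits 1-3).
 * lmsw_apply() is a hypothetical helper used only for this demonstration.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t lmsw_apply(uint64_t cr0, uint16_t operand)
{
    /* Clear bits 1-3 only; bit 0 survives, then OR in the new low nibble. */
    return (cr0 & ~0xeULL) | (operand & 0xf);
}

int main(void)
{
    uint64_t cr0 = 0x8000003f;   /* PG plus NE/ET/TS/EM/MP/PE all set */

    printf("%#llx\n", (unsigned long long)lmsw_apply(cr0, 0x0));
    /* -> 0x80000031: MP/EM/TS cleared, PE preserved despite operand bit 0 = 0 */
    printf("%#llx\n", (unsigned long long)lmsw_apply(0x80000031, 0x6));
    /* -> 0x80000037: MP and EM set again, PE untouched */
    return 0;
}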
*/ value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf); HVMTRACE_LONG_1D(LMSW, value); return hvm_set_cr0(value); } default: BUG(); } return X86EMUL_OKAY; } static const struct lbr_info { u32 base, count; } p4_lbr[] = { { MSR_P4_LER_FROM_LIP, 1 }, { MSR_P4_LER_TO_LIP, 1 }, { MSR_P4_LASTBRANCH_TOS, 1 }, { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, { 0, 0 } }, c2_lbr[] = { { MSR_IA32_LASTINTFROMIP, 1 }, { MSR_IA32_LASTINTTOIP, 1 }, { MSR_C2_LASTBRANCH_TOS, 1 }, { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO }, { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_C2_LASTBRANCH_FROM_TO }, { 0, 0 } }, nh_lbr[] = { { MSR_IA32_LASTINTFROMIP, 1 }, { MSR_IA32_LASTINTTOIP, 1 }, { MSR_C2_LASTBRANCH_TOS, 1 }, { MSR_P4_LASTBRANCH_0_FROM_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, { MSR_P4_LASTBRANCH_0_TO_LIP, NUM_MSR_P4_LASTBRANCH_FROM_TO }, { 0, 0 } }, at_lbr[] = { { MSR_IA32_LASTINTFROMIP, 1 }, { MSR_IA32_LASTINTTOIP, 1 }, { MSR_C2_LASTBRANCH_TOS, 1 }, { MSR_C2_LASTBRANCH_0_FROM_IP, NUM_MSR_ATOM_LASTBRANCH_FROM_TO }, { MSR_C2_LASTBRANCH_0_TO_IP, NUM_MSR_ATOM_LASTBRANCH_FROM_TO }, { 0, 0 } }; static const struct lbr_info *last_branch_msr_get(void) { switch ( boot_cpu_data.x86 ) { case 6: switch ( boot_cpu_data.x86_model ) { /* Core2 Duo */ case 15: /* Enhanced Core */ case 23: return c2_lbr; break; /* Nehalem */ case 26: case 30: case 31: case 46: /* Westmere */ case 37: case 44: case 47: /* Sandy Bridge */ case 42: case 45: /* Ivy Bridge */ case 58: case 62: /* Haswell */ case 60: case 63: case 69: case 70: return nh_lbr; break; /* Atom */ case 28: return at_lbr; break; } break; case 15: switch ( boot_cpu_data.x86_model ) { /* Pentium4/Xeon with em64t */ case 3: case 4: case 6: return p4_lbr; break; } break; } return NULL; } static int is_last_branch_msr(u32 ecx) { const struct lbr_info *lbr = last_branch_msr_get(); if ( lbr == NULL ) return 0; for ( ; lbr->count; lbr++ ) if ( (ecx >= lbr->base) && (ecx < (lbr->base + lbr->count)) ) return 1; return 0; } static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content) { HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%#x", msr); switch ( msr ) { case MSR_IA32_SYSENTER_CS: __vmread(GUEST_SYSENTER_CS, msr_content); break; case MSR_IA32_SYSENTER_ESP: __vmread(GUEST_SYSENTER_ESP, msr_content); break; case MSR_IA32_SYSENTER_EIP: __vmread(GUEST_SYSENTER_EIP, msr_content); break; case MSR_IA32_DEBUGCTLMSR: __vmread(GUEST_IA32_DEBUGCTL, msr_content); break; case IA32_FEATURE_CONTROL_MSR: case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_TRUE_ENTRY_CTLS: if ( !nvmx_msr_read_intercept(msr, msr_content) ) goto gp_fault; break; case MSR_IA32_MISC_ENABLE: rdmsrl(MSR_IA32_MISC_ENABLE, *msr_content); /* Debug Trace Store is not supported. */ *msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL | MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL; /* Perhaps vpmu will change some bits. 
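/*
 * Illustrative sketch (not Xen code): is_last_branch_msr() above walks a
 * small table of { base, count } ranges describing each CPU family's
 * last-branch-record MSRs.  The standalone model below shows the same
 * range-table lookup with made-up MSR numbers; msr_range, demo_lbr and
 * msr_in_ranges() are hypothetical names.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct msr_range { uint32_t base, count; };

/* A zero count terminates the table, as in the lbr_info arrays. */
static const struct msr_range demo_lbr[] = {
    { 0x1db, 1 },   /* e.g. a TOS-style single MSR                 */
    { 0x680, 16 },  /* e.g. a FROM_IP block of 16 consecutive MSRs */
    { 0x6c0, 16 },  /* e.g. the matching TO_IP block               */
    { 0, 0 }
};

static bool msr_in_ranges(const struct msr_range *r, uint32_t msr)
{
    for ( ; r->count; r++ )
        if ( msr >= r->base && msr < r->base + r->count )
            return true;
    return false;
}

int main(void)
{
    printf("%d %d\n", msr_in_ranges(demo_lbr, 0x68f),
                      msr_in_ranges(demo_lbr, 0x6b0));
    return 0;
}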
*/ if ( vpmu_do_rdmsr(msr, msr_content) ) goto done; break; default: if ( vpmu_do_rdmsr(msr, msr_content) ) break; if ( passive_domain_do_rdmsr(msr, msr_content) ) goto done; switch ( long_mode_do_msr_read(msr, msr_content) ) { case HNDL_unhandled: break; case HNDL_exception_raised: return X86EMUL_EXCEPTION; case HNDL_done: goto done; } if ( vmx_read_guest_msr(msr, msr_content) == 0 ) break; if ( is_last_branch_msr(msr) ) { *msr_content = 0; break; } if ( rdmsr_viridian_regs(msr, msr_content) || rdmsr_hypervisor_regs(msr, msr_content) ) break; if ( rdmsr_safe(msr, *msr_content) == 0 ) break; goto gp_fault; } done: HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%#x, msr_value=%#"PRIx64, msr, *msr_content); return X86EMUL_OKAY; gp_fault: hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } static int vmx_alloc_vlapic_mapping(struct domain *d) { void *apic_va; if ( !cpu_has_vmx_virtualize_apic_accesses ) return 0; apic_va = alloc_xenheap_page(); if ( apic_va == NULL ) return -ENOMEM; share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable); set_mmio_p2m_entry(d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va))); d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va); return 0; } static void vmx_free_vlapic_mapping(struct domain *d) { unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn; if ( mfn != 0 ) free_xenheap_page(mfn_to_virt(mfn)); } static void vmx_install_vlapic_mapping(struct vcpu *v) { paddr_t virt_page_ma, apic_page_ma; if ( !cpu_has_vmx_virtualize_apic_accesses ) return; virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page); apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn; apic_page_ma <<= PAGE_SHIFT; vmx_vmcs_enter(v); __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma); __vmwrite(APIC_ACCESS_ADDR, apic_page_ma); vmx_vmcs_exit(v); } void vmx_vlapic_msr_changed(struct vcpu *v) { int virtualize_x2apic_mode; struct vlapic *vlapic = vcpu_vlapic(v); virtualize_x2apic_mode = ( (cpu_has_vmx_apic_reg_virt || cpu_has_vmx_virtual_intr_delivery) && cpu_has_vmx_virtualize_x2apic_mode ); if ( !cpu_has_vmx_virtualize_apic_accesses && !virtualize_x2apic_mode ) return; vmx_vmcs_enter(v); v->arch.hvm_vmx.secondary_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); if ( !vlapic_hw_disabled(vlapic) && (vlapic_base_address(vlapic) == APIC_DEFAULT_PHYS_BASE) ) { unsigned int msr; if ( virtualize_x2apic_mode && vlapic_x2apic_mode(vlapic) ) { v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; if ( cpu_has_vmx_apic_reg_virt ) { for ( msr = MSR_IA32_APICBASE_MSR; msr <= MSR_IA32_APICBASE_MSR + 0xff; msr++ ) vmx_disable_intercept_for_msr(v, msr, MSR_TYPE_R); vmx_enable_intercept_for_msr(v, MSR_IA32_APICPPR_MSR, MSR_TYPE_R); vmx_enable_intercept_for_msr(v, MSR_IA32_APICTMICT_MSR, MSR_TYPE_R); vmx_enable_intercept_for_msr(v, MSR_IA32_APICTMCCT_MSR, MSR_TYPE_R); } if ( cpu_has_vmx_virtual_intr_delivery ) { vmx_disable_intercept_for_msr(v, MSR_IA32_APICTPR_MSR, MSR_TYPE_W); vmx_disable_intercept_for_msr(v, MSR_IA32_APICEOI_MSR, MSR_TYPE_W); vmx_disable_intercept_for_msr(v, MSR_IA32_APICSELF_MSR, MSR_TYPE_W); } } else { v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; for ( msr = MSR_IA32_APICBASE_MSR; msr <= MSR_IA32_APICBASE_MSR + 0xff; msr++ ) vmx_enable_intercept_for_msr(v, msr, MSR_TYPE_R | MSR_TYPE_W); } } vmx_update_secondary_exec_control(v); vmx_vmcs_exit(v); } static int vmx_msr_write_intercept(unsigned int msr, uint64_t 
msr_content) { struct vcpu *v = current; HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%#x, msr_value=%#"PRIx64, msr, msr_content); switch ( msr ) { case MSR_IA32_SYSENTER_CS: __vmwrite(GUEST_SYSENTER_CS, msr_content); break; case MSR_IA32_SYSENTER_ESP: __vmwrite(GUEST_SYSENTER_ESP, msr_content); break; case MSR_IA32_SYSENTER_EIP: __vmwrite(GUEST_SYSENTER_EIP, msr_content); break; case MSR_IA32_DEBUGCTLMSR: { int i, rc = 0; uint64_t supported = IA32_DEBUGCTLMSR_LBR | IA32_DEBUGCTLMSR_BTF; if ( !msr_content ) break; if ( msr_content & ~supported ) { /* Perhaps some other bits are supported in vpmu. */ if ( !vpmu_do_wrmsr(msr, msr_content) ) break; } if ( msr_content & IA32_DEBUGCTLMSR_LBR ) { const struct lbr_info *lbr = last_branch_msr_get(); if ( lbr == NULL ) break; for ( ; (rc == 0) && lbr->count; lbr++ ) for ( i = 0; (rc == 0) && (i < lbr->count); i++ ) if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 ) vmx_disable_intercept_for_msr(v, lbr->base + i, MSR_TYPE_R | MSR_TYPE_W); } if ( (rc < 0) || (vmx_add_host_load_msr(msr) < 0) ) hvm_inject_hw_exception(TRAP_machine_check, 0); else { __vmwrite(GUEST_IA32_DEBUGCTL, msr_content); } break; } case IA32_FEATURE_CONTROL_MSR: case MSR_IA32_VMX_BASIC...MSR_IA32_VMX_TRUE_ENTRY_CTLS: if ( !nvmx_msr_write_intercept(msr, msr_content) ) goto gp_fault; break; default: if ( vpmu_do_wrmsr(msr, msr_content) ) return X86EMUL_OKAY; if ( passive_domain_do_wrmsr(msr, msr_content) ) return X86EMUL_OKAY; if ( wrmsr_viridian_regs(msr, msr_content) ) break; switch ( long_mode_do_msr_write(msr, msr_content) ) { case HNDL_unhandled: if ( (vmx_write_guest_msr(msr, msr_content) != 0) && !is_last_branch_msr(msr) ) switch ( wrmsr_hypervisor_regs(msr, msr_content) ) { case -EAGAIN: return X86EMUL_RETRY; case 0: case 1: break; default: goto gp_fault; } break; case HNDL_exception_raised: return X86EMUL_EXCEPTION; case HNDL_done: break; } break; } return X86EMUL_OKAY; gp_fault: hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } static void vmx_do_extint(struct cpu_user_regs *regs) { unsigned long vector; __vmread(VM_EXIT_INTR_INFO, &vector); BUG_ON(!(vector & INTR_INFO_VALID_MASK)); vector &= INTR_INFO_VECTOR_MASK; HVMTRACE_1D(INTR, vector); regs->entry_vector = vector; do_IRQ(regs); } static void wbinvd_ipi(void *info) { wbinvd(); } static void vmx_wbinvd_intercept(void) { if ( !cache_flush_permitted(current->domain) || iommu_snoop ) return; if ( cpu_has_wbinvd_exiting ) on_each_cpu(wbinvd_ipi, NULL, 1); else wbinvd(); } static void ept_handle_violation(unsigned long qualification, paddr_t gpa) { unsigned long gla, gfn = gpa >> PAGE_SHIFT; mfn_t mfn; p2m_type_t p2mt; int ret; struct domain *d = current->domain; if ( tb_init_done ) { struct { uint64_t gpa; uint64_t mfn; u32 qualification; u32 p2mt; } _d; _d.gpa = gpa; _d.qualification = qualification; _d.mfn = mfn_x(get_gfn_query_unlocked(d, gfn, &_d.p2mt)); __trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d); } if ( qualification & EPT_GLA_VALID ) __vmread(GUEST_LINEAR_ADDRESS, &gla); else gla = ~0ull; ret = hvm_hap_nested_page_fault(gpa, !!(qualification & EPT_GLA_VALID), gla, !!(qualification & EPT_READ_VIOLATION), !!(qualification & EPT_WRITE_VIOLATION), !!(qualification & EPT_EXEC_VIOLATION)); switch ( ret ) { case 0: // Unhandled L1 EPT violation break; case 1: // This violation is handled completly /*Current nested EPT maybe flushed by other vcpus, so need * to re-set its shadow EPTP pointer. 
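/*
 * Illustrative sketch (not Xen code): the EPT-violation handler in this area
 * receives an exit qualification whose low bits describe the access
 * (bit 0 read, bit 1 write, bit 2 instruction fetch) and whose bit 7 says
 * whether a guest-linear address was reported.  decode_ept_qual() is a
 * hypothetical helper printing the same "rwx" style summary as the error
 * path's gdprintk() further below.
 */
#include <stdint.h>
#include <stdio.h>

static void decode_ept_qual(uint64_t qual)
{
    printf("access: %c%c%c, linear address %s\n",
           (qual & (1u << 0)) ? 'r' : '-',
           (qual & (1u << 1)) ? 'w' : '-',
           (qual & (1u << 2)) ? 'x' : '-',
           (qual & (1u << 7)) ? "valid" : "not reported");
}

int main(void)
{
    decode_ept_qual(0x182);   /* write fault, GLA valid (example value) */
    return 0;
}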
*/ if ( nestedhvm_vcpu_in_guestmode(current) && nestedhvm_paging_mode_hap(current ) ) __vmwrite(EPT_POINTER, get_shadow_eptp(current)); return; case -1: // This vioaltion should be injected to L1 VMM vcpu_nestedhvm(current).nv_vmexit_pending = 1; return; } /* Everything else is an error. */ mfn = get_gfn_query_unlocked(d, gfn, &p2mt); gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), " "gpa %#"PRIpaddr", mfn %#lx, type %i.\n", qualification, (qualification & EPT_READ_VIOLATION) ? 'r' : '-', (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-', (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-', (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-', (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-', (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-', gpa, mfn_x(mfn), p2mt); ept_walk_table(d, gfn); if ( qualification & EPT_GLA_VALID ) gdprintk(XENLOG_ERR, " --- GLA %#lx\n", gla); domain_crash(d); } static void vmx_failed_vmentry(unsigned int exit_reason, struct cpu_user_regs *regs) { unsigned int failed_vmentry_reason = (uint16_t)exit_reason; unsigned long exit_qualification; struct vcpu *curr = current; printk("Failed vm entry (exit reason %#x) ", exit_reason); __vmread(EXIT_QUALIFICATION, &exit_qualification); switch ( failed_vmentry_reason ) { case EXIT_REASON_INVALID_GUEST_STATE: printk("caused by invalid guest state (%ld).\n", exit_qualification); break; case EXIT_REASON_MSR_LOADING: printk("caused by MSR entry %ld loading.\n", exit_qualification); break; case EXIT_REASON_MCE_DURING_VMENTRY: printk("caused by machine check.\n"); HVMTRACE_0D(MCE); /* Already handled. */ break; default: printk("reason not known yet!"); break; } printk("************* VMCS Area **************\n"); vmcs_dump_vcpu(curr); printk("**************************************\n"); domain_crash(curr->domain); } void vmx_enter_realmode(struct cpu_user_regs *regs) { struct vcpu *v = current; /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3. Since * we have CR4.VME == 1 and our own TSS with an empty interrupt * redirection bitmap, all software INTs will be handled by vm86 */ v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags; regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL); } static void vmx_vmexit_ud_intercept(struct cpu_user_regs *regs) { struct hvm_emulate_ctxt ctxt; int rc; hvm_emulate_prepare(&ctxt, regs); rc = hvm_emulate_one(&ctxt); switch ( rc ) { case X86EMUL_UNHANDLEABLE: hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); break; case X86EMUL_EXCEPTION: if ( ctxt.exn_pending ) hvm_inject_hw_exception(ctxt.exn_vector, ctxt.exn_error_code); /* fall through */ default: hvm_emulate_writeback(&ctxt); break; } } static int vmx_handle_eoi_write(void) { unsigned long exit_qualification; /* * 1. Must be a linear access data write. * 2. Data write must be to the EOI register. */ __vmread(EXIT_QUALIFICATION, &exit_qualification); if ( (((exit_qualification >> 12) & 0xf) == 1) && ((exit_qualification & 0xfff) == APIC_EOI) ) { update_guest_eip(); /* Safe: APIC data write */ vlapic_EOI_set(vcpu_vlapic(current)); HVMTRACE_0D(VLAPIC); return 1; } return 0; } static void vmx_idtv_reinject(unsigned long idtv_info) { /* Event delivery caused this intercept? Queue for redelivery. */ if ( unlikely(idtv_info & INTR_INFO_VALID_MASK) ) { if ( hvm_event_needs_reinjection((idtv_info>>8)&7, idtv_info&0xff) ) { /* See SDM 3B 25.7.1.1 and .2 for info about masking resvd bits. 
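/*
 * Illustrative sketch (not Xen code): both the injection and re-injection
 * paths in this file treat the interruption-information field as
 * vector (bits 7:0), event type (bits 10:8), "deliver error code" (bit 11)
 * and "valid" (bit 31).  build_intr_info() and dump_intr_info() are
 * hypothetical helpers showing that packing.
 */
#include <stdint.h>
#include <stdio.h>

#define INTR_VALID        (1u << 31)
#define INTR_DELIVER_EC   (1u << 11)

static uint32_t build_intr_info(uint8_t vector, uint8_t type, int has_ec)
{
    return INTR_VALID | (has_ec ? INTR_DELIVER_EC : 0) |
           ((uint32_t)(type & 7) << 8) | vector;
}

static void dump_intr_info(uint32_t info)
{
    printf("vector %u, type %u, error code %s, valid %u\n",
           info & 0xff, (info >> 8) & 7,
           (info & INTR_DELIVER_EC) ? "yes" : "no", info >> 31);
}

int main(void)
{
    dump_intr_info(build_intr_info(14, 3, 1));   /* #PF as a HW exception */
    return 0;
}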
*/ __vmwrite(VM_ENTRY_INTR_INFO, idtv_info & ~INTR_INFO_RESVD_BITS_MASK); if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK ) { unsigned long ec; __vmread(IDT_VECTORING_ERROR_CODE, &ec); __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, ec); } } /* * Clear NMI-blocking interruptibility info if an NMI delivery faulted. * Re-delivery will re-set it (see SDM 3B 25.7.1.2). */ if ( (idtv_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI<<8) ) { unsigned long intr_info; __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_info); __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_info & ~VMX_INTR_SHADOW_NMI); } } } static int vmx_handle_apic_write(void) { unsigned long exit_qualification; ASSERT(cpu_has_vmx_apic_reg_virt); __vmread(EXIT_QUALIFICATION, &exit_qualification); return vlapic_apicv_write(current, exit_qualification & 0xfff); } /* * When "Virtual Interrupt Delivery" is enabled, this function is used * to handle EOI-induced VM exit */ void vmx_handle_EOI_induced_exit(struct vlapic *vlapic, int vector) { ASSERT(cpu_has_vmx_virtual_intr_delivery); vlapic_handle_EOI_induced_exit(vlapic, vector); } void vmx_vmexit_handler(struct cpu_user_regs *regs) { unsigned long exit_qualification, exit_reason, idtv_info, intr_info = 0; unsigned int vector = 0; struct vcpu *v = current; __vmread(GUEST_RIP, ®s->rip); __vmread(GUEST_RSP, ®s->rsp); __vmread(GUEST_RFLAGS, ®s->rflags); hvm_invalidate_regs_fields(regs); if ( paging_mode_hap(v->domain) ) { __vmread(GUEST_CR3, &v->arch.hvm_vcpu.hw_cr[3]); if ( vmx_unrestricted_guest(v) || hvm_paging_enabled(v) ) v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3]; } __vmread(VM_EXIT_REASON, &exit_reason); if ( hvm_long_mode_enabled(v) ) HVMTRACE_ND(VMEXIT64, 0, 1/*cycles*/, 3, exit_reason, (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32), 0, 0, 0); else HVMTRACE_ND(VMEXIT, 0, 1/*cycles*/, 2, exit_reason, (uint32_t)regs->eip, 0, 0, 0, 0); perfc_incra(vmexits, exit_reason); /* Handle the interrupt we missed before allowing any more in. */ switch ( (uint16_t)exit_reason ) { case EXIT_REASON_EXTERNAL_INTERRUPT: vmx_do_extint(regs); break; case EXIT_REASON_EXCEPTION_NMI: __vmread(VM_EXIT_INTR_INFO, &intr_info); BUG_ON(!(intr_info & INTR_INFO_VALID_MASK)); vector = intr_info & INTR_INFO_VECTOR_MASK; if ( vector == TRAP_machine_check ) do_machine_check(regs); if ( vector == TRAP_nmi && ((intr_info & INTR_INFO_INTR_TYPE_MASK) == (X86_EVENTTYPE_NMI << 8)) ) { do_nmi(regs); enable_nmis(); } break; case EXIT_REASON_MCE_DURING_VMENTRY: do_machine_check(regs); break; } /* Now enable interrupts so it's safe to take locks. */ local_irq_enable(); /* XXX: This looks ugly, but we need a mechanism to ensure * any pending vmresume has really happened */ vcpu_nestedhvm(v).nv_vmswitch_in_progress = 0; if ( nestedhvm_vcpu_in_guestmode(v) ) { if ( nvmx_n2_vmexit_handler(regs, exit_reason) ) goto out; } if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) ) return vmx_failed_vmentry(exit_reason, regs); if ( v->arch.hvm_vmx.vmx_realmode ) { /* Put RFLAGS back the way the guest wants it */ regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL); regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL); /* Unless this exit was for an interrupt, we've hit something * vm86 can't handle. Try again, using the emulator. 
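/*
 * Illustrative sketch (not Xen code): the exit handler above keys most of
 * its decisions off the low 16 bits of the exit reason (the "basic" reason),
 * while bit 31 flags a failed VM entry and diverts to vmx_failed_vmentry().
 * split_exit_reason() is a hypothetical helper showing that split.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void split_exit_reason(uint32_t exit_reason,
                              uint16_t *basic, bool *entry_failed)
{
    *basic = (uint16_t)exit_reason;        /* bits 15:0 */
    *entry_failed = exit_reason >> 31;     /* bit 31    */
}

int main(void)
{
    uint16_t basic;
    bool failed;

    split_exit_reason(0x80000021, &basic, &failed);
    printf("basic reason %u, entry failed %d\n", basic, failed);
    return 0;
}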
*/ switch ( exit_reason ) { case EXIT_REASON_EXCEPTION_NMI: if ( vector != TRAP_page_fault && vector != TRAP_nmi && vector != TRAP_machine_check ) { perfc_incr(realmode_exits); v->arch.hvm_vmx.vmx_emulate = 1; HVMTRACE_0D(REALMODE_EMULATE); return; } case EXIT_REASON_EXTERNAL_INTERRUPT: case EXIT_REASON_INIT: case EXIT_REASON_SIPI: case EXIT_REASON_PENDING_VIRT_INTR: case EXIT_REASON_PENDING_VIRT_NMI: case EXIT_REASON_MCE_DURING_VMENTRY: case EXIT_REASON_GETSEC: case EXIT_REASON_ACCESS_GDTR_OR_IDTR: case EXIT_REASON_ACCESS_LDTR_OR_TR: case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED: case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: break; default: v->arch.hvm_vmx.vmx_emulate = 1; perfc_incr(realmode_exits); HVMTRACE_0D(REALMODE_EMULATE); return; } } hvm_maybe_deassert_evtchn_irq(); __vmread(IDT_VECTORING_INFO, &idtv_info); if ( !nestedhvm_vcpu_in_guestmode(v) && exit_reason != EXIT_REASON_TASK_SWITCH ) vmx_idtv_reinject(idtv_info); switch ( exit_reason ) { unsigned long ecode; case EXIT_REASON_EXCEPTION_NMI: { /* * We don't set the software-interrupt exiting (INT n). * (1) We can get an exception (e.g. #PG) in the guest, or * (2) NMI */ /* * Re-set the NMI shadow if vmexit caused by a guest IRET fault (see 3B * 25.7.1.2, "Resuming Guest Software after Handling an Exception"). * (NB. If we emulate this IRET for any reason, we should re-clear!) */ if ( unlikely(intr_info & INTR_INFO_NMI_UNBLOCKED_BY_IRET) && !(idtv_info & INTR_INFO_VALID_MASK) && (vector != TRAP_double_fault) ) { unsigned long guest_info; __vmread(GUEST_INTERRUPTIBILITY_INFO, &guest_info); __vmwrite(GUEST_INTERRUPTIBILITY_INFO, guest_info | VMX_INTR_SHADOW_NMI); } perfc_incra(cause_vector, vector); switch ( vector ) { case TRAP_debug: /* * Updates DR6 where debugger can peek (See 3B 23.2.1, * Table 23-1, "Exit Qualification for Debug Exceptions"). 
*/ __vmread(EXIT_QUALIFICATION, &exit_qualification); HVMTRACE_1D(TRAP_DEBUG, exit_qualification); write_debugreg(6, exit_qualification | 0xffff0ff0); if ( !v->domain->debugger_attached || cpu_has_monitor_trap_flag ) goto exit_and_crash; domain_pause_for_debugger(); break; case TRAP_int3: { HVMTRACE_1D(TRAP, vector); if ( v->domain->debugger_attached ) { update_guest_eip(); /* Safe: INT3 */ current->arch.gdbsx_vcpu_event = TRAP_int3; domain_pause_for_debugger(); break; } else { int handled = hvm_memory_event_int3(regs->eip); if ( handled < 0 ) { struct hvm_trap trap = { .vector = TRAP_int3, .type = X86_EVENTTYPE_SW_EXCEPTION, .error_code = HVM_DELIVER_NO_ERROR_CODE, }; unsigned long insn_len; __vmread(VM_EXIT_INSTRUCTION_LEN, &insn_len); trap.insn_len = insn_len; hvm_inject_trap(&trap); break; } else if ( handled ) break; } goto exit_and_crash; } case TRAP_no_device: HVMTRACE_1D(TRAP, vector); vmx_fpu_dirty_intercept(); break; case TRAP_page_fault: __vmread(EXIT_QUALIFICATION, &exit_qualification); __vmread(VM_EXIT_INTR_ERROR_CODE, &ecode); regs->error_code = ecode; HVM_DBG_LOG(DBG_LEVEL_VMMU, "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx", (unsigned long)regs->eax, (unsigned long)regs->ebx, (unsigned long)regs->ecx, (unsigned long)regs->edx, (unsigned long)regs->esi, (unsigned long)regs->edi); if ( paging_fault(exit_qualification, regs) ) { if ( trace_will_trace_event(TRC_SHADOW) ) break; if ( hvm_long_mode_enabled(v) ) HVMTRACE_LONG_2D(PF_XEN, regs->error_code, TRC_PAR_LONG(exit_qualification) ); else HVMTRACE_2D(PF_XEN, regs->error_code, exit_qualification ); break; } hvm_inject_page_fault(regs->error_code, exit_qualification); break; case TRAP_nmi: if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) != (X86_EVENTTYPE_NMI << 8) ) goto exit_and_crash; HVMTRACE_0D(NMI); /* Already handled above. */ break; case TRAP_machine_check: HVMTRACE_0D(MCE); /* Already handled above. */ break; case TRAP_invalid_op: HVMTRACE_1D(TRAP, vector); vmx_vmexit_ud_intercept(regs); break; default: HVMTRACE_1D(TRAP, vector); goto exit_and_crash; } break; } case EXIT_REASON_EXTERNAL_INTERRUPT: /* Already handled above. */ break; case EXIT_REASON_TRIPLE_FAULT: hvm_triple_fault(); break; case EXIT_REASON_PENDING_VIRT_INTR: /* Disable the interrupt window. */ v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmx_update_cpu_exec_control(v); break; case EXIT_REASON_PENDING_VIRT_NMI: /* Disable the NMI window. */ v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; vmx_update_cpu_exec_control(v); break; case EXIT_REASON_TASK_SWITCH: { static const enum hvm_task_switch_reason reasons[] = { TSW_call_or_int, TSW_iret, TSW_jmp, TSW_call_or_int }; unsigned int inst_len, source; __vmread(EXIT_QUALIFICATION, &exit_qualification); source = (exit_qualification >> 30) & 3; /* Vectored event should fill in interrupt information. */ WARN_ON((source == 3) && !(idtv_info & INTR_INFO_VALID_MASK)); /* * In the following cases there is an instruction to skip over: * - TSW is due to a CALL, IRET or JMP instruction. * - TSW is a vectored event due to a SW exception or SW interrupt. */ inst_len = ((source != 3) || /* CALL, IRET, or JMP? */ (idtv_info & (1u<<10))) /* IntrType > 3? */ ? get_instruction_length() /* Safe: SDM 3B 23.2.4 */ : 0; if ( (source == 3) && (idtv_info & INTR_INFO_DELIVER_CODE_MASK) ) __vmread(IDT_VECTORING_ERROR_CODE, &ecode); else ecode = -1; regs->eip += inst_len; hvm_task_switch((uint16_t)exit_qualification, reasons[source], ecode); break; } case EXIT_REASON_CPUID: is_pvh_vcpu(v) ? 
pv_cpuid(regs) : vmx_do_cpuid(regs); update_guest_eip(); /* Safe: CPUID */ break; case EXIT_REASON_HLT: update_guest_eip(); /* Safe: HLT */ hvm_hlt(regs->eflags); break; case EXIT_REASON_INVLPG: update_guest_eip(); /* Safe: INVLPG */ __vmread(EXIT_QUALIFICATION, &exit_qualification); vmx_invlpg_intercept(exit_qualification); break; case EXIT_REASON_RDTSCP: regs->ecx = hvm_msr_tsc_aux(v); /* fall through */ case EXIT_REASON_RDTSC: update_guest_eip(); /* Safe: RDTSC, RDTSCP */ hvm_rdtsc_intercept(regs); break; case EXIT_REASON_VMCALL: { int rc; HVMTRACE_1D(VMMCALL, regs->eax); rc = hvm_do_hypercall(regs); if ( rc != HVM_HCALL_preempted ) { update_guest_eip(); /* Safe: VMCALL */ if ( rc == HVM_HCALL_invalidate ) send_invalidate_req(); } break; } case EXIT_REASON_CR_ACCESS: { __vmread(EXIT_QUALIFICATION, &exit_qualification); if ( vmx_cr_access(exit_qualification) == X86EMUL_OKAY ) update_guest_eip(); /* Safe: MOV Cn, LMSW, CLTS */ break; } case EXIT_REASON_DR_ACCESS: __vmread(EXIT_QUALIFICATION, &exit_qualification); vmx_dr_access(exit_qualification, regs); break; case EXIT_REASON_MSR_READ: { uint64_t msr_content; if ( hvm_msr_read_intercept(regs->ecx, &msr_content) == X86EMUL_OKAY ) { regs->eax = (uint32_t)msr_content; regs->edx = (uint32_t)(msr_content >> 32); update_guest_eip(); /* Safe: RDMSR */ } break; } case EXIT_REASON_MSR_WRITE: { uint64_t msr_content; msr_content = ((uint64_t)regs->edx << 32) | (uint32_t)regs->eax; if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY ) update_guest_eip(); /* Safe: WRMSR */ break; } case EXIT_REASON_VMXOFF: if ( nvmx_handle_vmxoff(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_VMXON: if ( nvmx_handle_vmxon(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_VMCLEAR: if ( nvmx_handle_vmclear(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_VMPTRLD: if ( nvmx_handle_vmptrld(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_VMPTRST: if ( nvmx_handle_vmptrst(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_VMREAD: if ( nvmx_handle_vmread(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_VMWRITE: if ( nvmx_handle_vmwrite(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_VMLAUNCH: if ( nvmx_handle_vmlaunch(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_VMRESUME: if ( nvmx_handle_vmresume(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_INVEPT: if ( nvmx_handle_invept(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_INVVPID: if ( nvmx_handle_invvpid(regs) == X86EMUL_OKAY ) update_guest_eip(); break; case EXIT_REASON_MWAIT_INSTRUCTION: case EXIT_REASON_MONITOR_INSTRUCTION: case EXIT_REASON_GETSEC: /* * We should never exit on GETSEC because CR4.SMXE is always 0 when * running in guest context, and the CPU checks that before getting * as far as vmexit. 
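/*
 * Illustrative sketch (not Xen code): the RDMSR/WRMSR exits handled above
 * move the 64-bit MSR value through the architectural EDX:EAX pair -- the
 * low half in EAX, the high half in EDX.  The two hypothetical helpers
 * below show the packing those handlers perform.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t msr_from_edx_eax(uint32_t edx, uint32_t eax)
{
    return ((uint64_t)edx << 32) | eax;
}

static void msr_to_edx_eax(uint64_t msr, uint32_t *edx, uint32_t *eax)
{
    *eax = (uint32_t)msr;
    *edx = (uint32_t)(msr >> 32);
}

int main(void)
{
    uint32_t lo, hi;

    msr_to_edx_eax(0x123456789abcdef0ULL, &hi, &lo);
    printf("%#x:%#x -> %#llx\n", hi, lo,
           (unsigned long long)msr_from_edx_eax(hi, lo));
    return 0;
}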
*/ WARN_ON(exit_reason == EXIT_REASON_GETSEC); hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); break; case EXIT_REASON_TPR_BELOW_THRESHOLD: break; case EXIT_REASON_APIC_ACCESS: if ( !vmx_handle_eoi_write() && !handle_mmio() ) hvm_inject_hw_exception(TRAP_gp_fault, 0); break; case EXIT_REASON_EOI_INDUCED: { int vector; __vmread(EXIT_QUALIFICATION, &exit_qualification); vector = exit_qualification & 0xff; vmx_handle_EOI_induced_exit(vcpu_vlapic(current), vector); break; } case EXIT_REASON_IO_INSTRUCTION: __vmread(EXIT_QUALIFICATION, &exit_qualification); if ( exit_qualification & 0x10 ) { /* INS, OUTS */ if ( !handle_mmio() ) hvm_inject_hw_exception(TRAP_gp_fault, 0); } else { /* IN, OUT */ uint16_t port = (exit_qualification >> 16) & 0xFFFF; int bytes = (exit_qualification & 0x07) + 1; int dir = (exit_qualification & 0x08) ? IOREQ_READ : IOREQ_WRITE; if ( handle_pio(port, bytes, dir) ) update_guest_eip(); /* Safe: IN, OUT */ } break; case EXIT_REASON_INVD: case EXIT_REASON_WBINVD: { update_guest_eip(); /* Safe: INVD, WBINVD */ vmx_wbinvd_intercept(); break; } case EXIT_REASON_EPT_VIOLATION: { paddr_t gpa; __vmread(GUEST_PHYSICAL_ADDRESS, &gpa); __vmread(EXIT_QUALIFICATION, &exit_qualification); ept_handle_violation(exit_qualification, gpa); break; } case EXIT_REASON_MONITOR_TRAP_FLAG: v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG; vmx_update_cpu_exec_control(v); if ( v->arch.hvm_vcpu.single_step ) { hvm_memory_event_single_step(regs->eip); if ( v->domain->debugger_attached ) domain_pause_for_debugger(); } break; case EXIT_REASON_PAUSE_INSTRUCTION: perfc_incr(pauseloop_exits); do_sched_op_compat(SCHEDOP_yield, 0); break; case EXIT_REASON_XSETBV: if ( hvm_handle_xsetbv(regs->ecx, (regs->rdx << 32) | regs->_eax) == 0 ) update_guest_eip(); /* Safe: XSETBV */ break; case EXIT_REASON_APIC_WRITE: if ( vmx_handle_apic_write() ) hvm_inject_hw_exception(TRAP_gp_fault, 0); break; case EXIT_REASON_ACCESS_GDTR_OR_IDTR: case EXIT_REASON_ACCESS_LDTR_OR_TR: case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED: case EXIT_REASON_INVPCID: /* fall through */ default: exit_and_crash: gdprintk(XENLOG_ERR, "Bad vmexit (reason %#lx)\n", exit_reason); domain_crash(v->domain); break; } out: if ( nestedhvm_vcpu_in_guestmode(v) ) nvmx_idtv_handling(); } void vmx_vmenter_helper(const struct cpu_user_regs *regs) { struct vcpu *curr = current; u32 new_asid, old_asid; struct hvm_vcpu_asid *p_asid; bool_t need_flush; if ( !cpu_has_vmx_vpid ) goto out; if ( nestedhvm_vcpu_in_guestmode(curr) ) p_asid = &vcpu_nestedhvm(curr).nv_n2asid; else p_asid = &curr->arch.hvm_vcpu.n1asid; old_asid = p_asid->asid; need_flush = hvm_asid_handle_vmenter(p_asid); new_asid = p_asid->asid; if ( unlikely(new_asid != old_asid) ) { __vmwrite(VIRTUAL_PROCESSOR_ID, new_asid); if ( !old_asid && new_asid ) { /* VPID was disabled: now enabled. */ curr->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID; vmx_update_secondary_exec_control(curr); } else if ( old_asid && !new_asid ) { /* VPID was enabled: now disabled. 
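/*
 * Illustrative sketch (not Xen code): the EXIT_REASON_IO_INSTRUCTION case
 * above decodes the exit qualification as access size minus one in bits 2:0,
 * direction (1 = IN) in bit 3, string instruction (INS/OUTS) in bit 4 and
 * the port number in bits 31:16.  decode_io_qual() is a hypothetical helper
 * reproducing that decoding.
 */
#include <stdint.h>
#include <stdio.h>

static void decode_io_qual(uint64_t qual)
{
    unsigned int bytes  = (qual & 0x7) + 1;
    int          is_in  = !!(qual & 0x8);
    int          string = !!(qual & 0x10);
    uint16_t     port   = (qual >> 16) & 0xffff;

    printf("%s port %#x, %u byte(s)%s\n",
           is_in ? "IN from" : "OUT to", port, bytes,
           string ? " (string op)" : "");
}

int main(void)
{
    decode_io_qual(0x00cf0008);   /* 1-byte IN from port 0xcf (example) */
    return 0;
}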
*/ curr->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; vmx_update_secondary_exec_control(curr); } } if ( unlikely(need_flush) ) vpid_sync_all(); out: HVMTRACE_ND(VMENTRY, 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0); __vmwrite(GUEST_RIP, regs->rip); __vmwrite(GUEST_RSP, regs->rsp); __vmwrite(GUEST_RFLAGS, regs->rflags | X86_EFLAGS_MBS); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/vmx/entry.S0000664000175000017500000000607612307313555015501 0ustar smbsmb/* * entry.S: VMX architecture-specific entry/exit handling. * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2008, Citrix Systems, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #define VMRESUME .byte 0x0f,0x01,0xc3 #define VMLAUNCH .byte 0x0f,0x01,0xc2 ENTRY(vmx_asm_vmexit_handler) push %rdi push %rsi push %rdx push %rcx push %rax mov %cr2,%rax push %r8 push %r9 push %r10 push %r11 push %rbx GET_CURRENT(%rbx) push %rbp push %r12 push %r13 push %r14 push %r15 movb $1,VCPU_vmx_launched(%rbx) mov %rax,VCPU_hvm_guest_cr2(%rbx) mov %rsp,%rdi call vmx_vmexit_handler .Lvmx_do_vmentry: call vmx_intr_assist call nvmx_switch_guest ASSERT_NOT_IN_ATOMIC mov VCPU_processor(%rbx),%eax lea irq_stat+IRQSTAT_softirq_pending(%rip),%rdx xor %ecx,%ecx shl $IRQSTAT_shift,%eax cli cmp %ecx,(%rdx,%rax,1) jnz .Lvmx_process_softirqs cmp %cl,VCPU_vmx_emulate(%rbx) jne .Lvmx_goto_emulator cmp %cl,VCPU_vmx_realmode(%rbx) UNLIKELY_START(ne, realmode) cmp %cx,VCPU_vm86_seg_mask(%rbx) jnz .Lvmx_goto_emulator mov %rsp,%rdi call vmx_enter_realmode UNLIKELY_END(realmode) mov %rsp,%rdi call vmx_vmenter_helper mov VCPU_hvm_guest_cr2(%rbx),%rax pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp mov %rax,%cr2 cmpb $0,VCPU_vmx_launched(%rbx) pop %rbx pop %r11 pop %r10 pop %r9 pop %r8 pop %rax pop %rcx pop %rdx pop %rsi pop %rdi je .Lvmx_launch /*.Lvmx_resume:*/ VMRESUME sti call vm_resume_fail ud2 .Lvmx_launch: VMLAUNCH sti call vm_launch_fail ud2 ENTRY(vmx_asm_do_vmentry) GET_CURRENT(%rbx) jmp .Lvmx_do_vmentry .Lvmx_goto_emulator: sti mov %rsp,%rdi call vmx_realmode jmp .Lvmx_do_vmentry .Lvmx_process_softirqs: sti call do_softirq jmp .Lvmx_do_vmentry xen-4.4.0/xen/arch/x86/hvm/vmx/vmcs.c0000664000175000017500000014317412307313555015331 0ustar smbsmb/* * vmcs.c: VMCS management * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
* * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static bool_t __read_mostly opt_vpid_enabled = 1; boolean_param("vpid", opt_vpid_enabled); static bool_t __read_mostly opt_unrestricted_guest_enabled = 1; boolean_param("unrestricted_guest", opt_unrestricted_guest_enabled); static bool_t __read_mostly opt_apicv_enabled = 1; boolean_param("apicv", opt_apicv_enabled); /* * These two parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. * ple_window: upper bound on the amount of time a guest is allowed to execute * in a PAUSE loop. * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ static unsigned int __read_mostly ple_gap = 128; integer_param("ple_gap", ple_gap); static unsigned int __read_mostly ple_window = 4096; integer_param("ple_window", ple_window); /* Dynamic (run-time adjusted) execution control flags. */ u32 vmx_pin_based_exec_control __read_mostly; u32 vmx_cpu_based_exec_control __read_mostly; u32 vmx_secondary_exec_control __read_mostly; u32 vmx_vmexit_control __read_mostly; u32 vmx_vmentry_control __read_mostly; u64 vmx_ept_vpid_cap __read_mostly; static DEFINE_PER_CPU_READ_MOSTLY(struct vmcs_struct *, vmxon_region); static DEFINE_PER_CPU(struct vmcs_struct *, current_vmcs); static DEFINE_PER_CPU(struct list_head, active_vmcs_list); static DEFINE_PER_CPU(bool_t, vmxon); static u32 vmcs_revision_id __read_mostly; u64 __read_mostly vmx_basic_msr; static void __init vmx_display_features(void) { int printed = 0; printk("VMX: Supported advanced features:\n"); #define P(p,s) if ( p ) { printk(" - %s\n", s); printed = 1; } P(cpu_has_vmx_virtualize_apic_accesses, "APIC MMIO access virtualisation"); P(cpu_has_vmx_tpr_shadow, "APIC TPR shadow"); P(cpu_has_vmx_ept, "Extended Page Tables (EPT)"); P(cpu_has_vmx_vpid, "Virtual-Processor Identifiers (VPID)"); P(cpu_has_vmx_vnmi, "Virtual NMI"); P(cpu_has_vmx_msr_bitmap, "MSR direct-access bitmap"); P(cpu_has_vmx_unrestricted_guest, "Unrestricted Guest"); P(cpu_has_vmx_apic_reg_virt, "APIC Register Virtualization"); P(cpu_has_vmx_virtual_intr_delivery, "Virtual Interrupt Delivery"); P(cpu_has_vmx_posted_intr_processing, "Posted Interrupt Processing"); P(cpu_has_vmx_vmcs_shadowing, "VMCS shadowing"); #undef P if ( !printed ) printk(" - none\n"); } static u32 adjust_vmx_controls( const char *name, u32 ctl_min, u32 ctl_opt, u32 msr, bool_t *mismatch) { u32 vmx_msr_low, vmx_msr_high, ctl = ctl_min | ctl_opt; rdmsr(msr, vmx_msr_low, vmx_msr_high); ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ /* Ensure minimum (required) set of control bits are supported. 
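/*
 * Illustrative sketch (not Xen code): adjust_vmx_controls() above combines
 * the wanted control bits with the capability MSR, whose low 32 bits give
 * the settings that must be 1 and whose high 32 bits give the settings that
 * are allowed to be 1.  apply_ctl_caps() is a hypothetical model of that
 * clamping, followed by the same "are the required bits still set?" check.
 */
#include <stdint.h>
#include <stdio.h>

static int apply_ctl_caps(uint32_t required, uint32_t optional,
                          uint64_t cap_msr, uint32_t *out)
{
    uint32_t allowed0 = (uint32_t)cap_msr;          /* must-be-one bits */
    uint32_t allowed1 = (uint32_t)(cap_msr >> 32);  /* may-be-one bits  */
    uint32_t ctl = required | optional;

    ctl &= allowed1;     /* drop anything the CPU cannot set        */
    ctl |= allowed0;     /* force everything the CPU insists on     */
    *out = ctl;

    return (required & ~ctl) ? -1 : 0;   /* fail if a required bit vanished */
}

int main(void)
{
    uint32_t ctl;
    int rc = apply_ctl_caps(0x00000016, 0x00400000,
                            0x007f00160000001full, &ctl);

    printf("rc=%d ctl=%#x\n", rc, ctl);
    return 0;
}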
*/ if ( ctl_min & ~ctl ) { *mismatch = 1; printk("VMX: CPU%d has insufficient %s (%08x; requires %08x)\n", smp_processor_id(), name, ctl, ctl_min); } return ctl; } static bool_t cap_check(const char *name, u32 expected, u32 saw) { if ( saw != expected ) printk("VMX %s: saw %#x expected %#x\n", name, saw, expected); return saw != expected; } static int vmx_init_vmcs_config(void) { u32 vmx_basic_msr_low, vmx_basic_msr_high, min, opt; u32 _vmx_pin_based_exec_control; u32 _vmx_cpu_based_exec_control; u32 _vmx_secondary_exec_control = 0; u64 _vmx_ept_vpid_cap = 0; u64 _vmx_misc_cap = 0; u32 _vmx_vmexit_control; u32 _vmx_vmentry_control; bool_t mismatch = 0; rdmsr(MSR_IA32_VMX_BASIC, vmx_basic_msr_low, vmx_basic_msr_high); min = (PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING); opt = (PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTERRUPT); _vmx_pin_based_exec_control = adjust_vmx_controls( "Pin-Based Exec Control", min, opt, MSR_IA32_VMX_PINBASED_CTLS, &mismatch); min = (CPU_BASED_HLT_EXITING | CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | CPU_BASED_MONITOR_EXITING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_ACTIVATE_IO_BITMAP | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_RDTSC_EXITING); opt = (CPU_BASED_ACTIVATE_MSR_BITMAP | CPU_BASED_TPR_SHADOW | CPU_BASED_MONITOR_TRAP_FLAG | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); _vmx_cpu_based_exec_control = adjust_vmx_controls( "CPU-Based Exec Control", min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &mismatch); _vmx_cpu_based_exec_control &= ~CPU_BASED_RDTSC_EXITING; if ( _vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW ) _vmx_cpu_based_exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING); if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS ) { min = 0; opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_ENABLE_INVPCID); rdmsrl(MSR_IA32_VMX_MISC, _vmx_misc_cap); if ( _vmx_misc_cap & VMX_MISC_VMWRITE_ALL ) opt |= SECONDARY_EXEC_ENABLE_VMCS_SHADOWING; if ( opt_vpid_enabled ) opt |= SECONDARY_EXEC_ENABLE_VPID; if ( opt_unrestricted_guest_enabled ) opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST; /* * "APIC Register Virtualization" and "Virtual Interrupt Delivery" * can be set only when "use TPR shadow" is set */ if ( (_vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) && opt_apicv_enabled ) opt |= SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; _vmx_secondary_exec_control = adjust_vmx_controls( "Secondary Exec Control", min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, &mismatch); } /* The IA32_VMX_EPT_VPID_CAP MSR exists only when EPT or VPID available */ if ( _vmx_secondary_exec_control & (SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_ENABLE_VPID) ) { rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, _vmx_ept_vpid_cap); /* * Additional sanity checking before using EPT: * 1) the CPU we are running on must support EPT WB, as we will set * ept paging structures memory type to WB; * 2) the CPU must support the EPT page-walk length of 4 according to * Intel SDM 25.2.2. * 3) the CPU must support INVEPT all context invalidation, because we * will use it as final resort if other types are not supported. * * Or we just don't use EPT. 
*/ if ( !(_vmx_ept_vpid_cap & VMX_EPT_MEMORY_TYPE_WB) || !(_vmx_ept_vpid_cap & VMX_EPT_WALK_LENGTH_4_SUPPORTED) || !(_vmx_ept_vpid_cap & VMX_EPT_INVEPT_ALL_CONTEXT) ) _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; /* * the CPU must support INVVPID all context invalidation, because we * will use it as final resort if other types are not supported. * * Or we just don't use VPID. */ if ( !(_vmx_ept_vpid_cap & VMX_VPID_INVVPID_ALL_CONTEXT) ) _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; } if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT ) { /* * To use EPT we expect to be able to clear certain intercepts. * We check VMX_BASIC_MSR[55] to correctly handle default controls. */ uint32_t must_be_one, must_be_zero, msr = MSR_IA32_VMX_PROCBASED_CTLS; if ( vmx_basic_msr_high & (VMX_BASIC_DEFAULT1_ZERO >> 32) ) msr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS; rdmsr(msr, must_be_one, must_be_zero); if ( must_be_one & (CPU_BASED_INVLPG_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING) ) _vmx_secondary_exec_control &= ~(SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST); } if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) && ple_gap == 0 ) { if ( !vmx_pin_based_exec_control ) printk(XENLOG_INFO "Disable Pause-Loop Exiting.\n"); _vmx_secondary_exec_control &= ~ SECONDARY_EXEC_PAUSE_LOOP_EXITING; } min = VM_EXIT_ACK_INTR_ON_EXIT; opt = VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT; min |= VM_EXIT_IA32E_MODE; _vmx_vmexit_control = adjust_vmx_controls( "VMExit Control", min, opt, MSR_IA32_VMX_EXIT_CTLS, &mismatch); /* * "Process posted interrupt" can be set only when "virtual-interrupt * delivery" and "acknowledge interrupt on exit" is set */ if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || !(_vmx_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT) ) _vmx_pin_based_exec_control &= ~ PIN_BASED_POSTED_INTERRUPT; min = 0; opt = VM_ENTRY_LOAD_GUEST_PAT; _vmx_vmentry_control = adjust_vmx_controls( "VMEntry Control", min, opt, MSR_IA32_VMX_ENTRY_CTLS, &mismatch); if ( mismatch ) return -EINVAL; if ( !vmx_pin_based_exec_control ) { /* First time through. */ vmcs_revision_id = vmx_basic_msr_low & VMX_BASIC_REVISION_MASK; vmx_pin_based_exec_control = _vmx_pin_based_exec_control; vmx_cpu_based_exec_control = _vmx_cpu_based_exec_control; vmx_secondary_exec_control = _vmx_secondary_exec_control; vmx_ept_vpid_cap = _vmx_ept_vpid_cap; vmx_vmexit_control = _vmx_vmexit_control; vmx_vmentry_control = _vmx_vmentry_control; vmx_basic_msr = ((u64)vmx_basic_msr_high << 32) | vmx_basic_msr_low; vmx_display_features(); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ if ( (vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32)) > PAGE_SIZE ) { printk("VMX: CPU%d VMCS size is too big (%Lu bytes)\n", smp_processor_id(), vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32)); return -EINVAL; } } else { /* Globals are already initialised: re-check them. 
*/ mismatch |= cap_check( "VMCS revision ID", vmcs_revision_id, vmx_basic_msr_low & VMX_BASIC_REVISION_MASK); mismatch |= cap_check( "Pin-Based Exec Control", vmx_pin_based_exec_control, _vmx_pin_based_exec_control); mismatch |= cap_check( "CPU-Based Exec Control", vmx_cpu_based_exec_control, _vmx_cpu_based_exec_control); mismatch |= cap_check( "Secondary Exec Control", vmx_secondary_exec_control, _vmx_secondary_exec_control); mismatch |= cap_check( "VMExit Control", vmx_vmexit_control, _vmx_vmexit_control); mismatch |= cap_check( "VMEntry Control", vmx_vmentry_control, _vmx_vmentry_control); mismatch |= cap_check( "EPT and VPID Capability", vmx_ept_vpid_cap, _vmx_ept_vpid_cap); if ( cpu_has_vmx_ins_outs_instr_info != !!(vmx_basic_msr_high & (VMX_BASIC_INS_OUT_INFO >> 32)) ) { printk("VMX INS/OUTS Instruction Info: saw %d expected %d\n", !!(vmx_basic_msr_high & (VMX_BASIC_INS_OUT_INFO >> 32)), cpu_has_vmx_ins_outs_instr_info); mismatch = 1; } if ( (vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32)) != ((vmx_basic_msr & VMX_BASIC_VMCS_SIZE_MASK) >> 32) ) { printk("VMX: CPU%d unexpected VMCS size %Lu\n", smp_processor_id(), vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32)); mismatch = 1; } if ( mismatch ) { printk("VMX: Capabilities fatally differ between CPU%d and CPU0\n", smp_processor_id()); return -EINVAL; } } /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ if ( vmx_basic_msr_high & (VMX_BASIC_32BIT_ADDRESSES >> 32) ) { printk("VMX: CPU%d limits VMX structure pointers to 32 bits\n", smp_processor_id()); return -EINVAL; } /* Require Write-Back (WB) memory type for VMCS accesses. */ opt = (vmx_basic_msr_high & (VMX_BASIC_MEMORY_TYPE_MASK >> 32)) / ((VMX_BASIC_MEMORY_TYPE_MASK & -VMX_BASIC_MEMORY_TYPE_MASK) >> 32); if ( opt != MTRR_TYPE_WRBACK ) { printk("VMX: CPU%d has unexpected VMCS access type %u\n", smp_processor_id(), opt); return -EINVAL; } return 0; } static struct vmcs_struct *vmx_alloc_vmcs(void) { struct vmcs_struct *vmcs; if ( (vmcs = alloc_xenheap_page()) == NULL ) { gdprintk(XENLOG_WARNING, "Failed to allocate VMCS.\n"); return NULL; } clear_page(vmcs); vmcs->vmcs_revision_id = vmcs_revision_id; return vmcs; } static void vmx_free_vmcs(struct vmcs_struct *vmcs) { free_xenheap_page(vmcs); } static void __vmx_clear_vmcs(void *info) { struct vcpu *v = info; struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx; /* Otherwise we can nest (vmx_cpu_down() vs. vmx_clear_vmcs()). 
*/ ASSERT(!local_irq_is_enabled()); if ( arch_vmx->active_cpu == smp_processor_id() ) { __vmpclear(virt_to_maddr(arch_vmx->vmcs)); if ( arch_vmx->vmcs_shadow_maddr ) __vmpclear(arch_vmx->vmcs_shadow_maddr); arch_vmx->active_cpu = -1; arch_vmx->launched = 0; list_del(&arch_vmx->active_list); if ( arch_vmx->vmcs == this_cpu(current_vmcs) ) this_cpu(current_vmcs) = NULL; } } static void vmx_clear_vmcs(struct vcpu *v) { int cpu = v->arch.hvm_vmx.active_cpu; if ( cpu != -1 ) on_selected_cpus(cpumask_of(cpu), __vmx_clear_vmcs, v, 1); } static void vmx_load_vmcs(struct vcpu *v) { unsigned long flags; local_irq_save(flags); if ( v->arch.hvm_vmx.active_cpu == -1 ) { list_add(&v->arch.hvm_vmx.active_list, &this_cpu(active_vmcs_list)); v->arch.hvm_vmx.active_cpu = smp_processor_id(); } ASSERT(v->arch.hvm_vmx.active_cpu == smp_processor_id()); __vmptrld(virt_to_maddr(v->arch.hvm_vmx.vmcs)); this_cpu(current_vmcs) = v->arch.hvm_vmx.vmcs; local_irq_restore(flags); } int vmx_cpu_up_prepare(unsigned int cpu) { /* * If nvmx_cpu_up_prepare() failed, do not return failure and just fallback * to legacy mode for vvmcs synchronization. */ if ( nvmx_cpu_up_prepare(cpu) != 0 ) printk("CPU%d: Could not allocate virtual VMCS buffer.\n", cpu); if ( per_cpu(vmxon_region, cpu) != NULL ) return 0; per_cpu(vmxon_region, cpu) = vmx_alloc_vmcs(); if ( per_cpu(vmxon_region, cpu) != NULL ) return 0; printk("CPU%d: Could not allocate host VMCS\n", cpu); nvmx_cpu_dead(cpu); return -ENOMEM; } void vmx_cpu_dead(unsigned int cpu) { vmx_free_vmcs(per_cpu(vmxon_region, cpu)); per_cpu(vmxon_region, cpu) = NULL; nvmx_cpu_dead(cpu); } int vmx_cpu_up(void) { u32 eax, edx; int rc, bios_locked, cpu = smp_processor_id(); u64 cr0, vmx_cr0_fixed0, vmx_cr0_fixed1; BUG_ON(!(read_cr4() & X86_CR4_VMXE)); vmx_save_host_msrs(); /* * Ensure the current processor operating mode meets * the requred CRO fixed bits in VMX operation. */ cr0 = read_cr0(); rdmsrl(MSR_IA32_VMX_CR0_FIXED0, vmx_cr0_fixed0); rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx_cr0_fixed1); if ( (~cr0 & vmx_cr0_fixed0) || (cr0 & ~vmx_cr0_fixed1) ) { printk("CPU%d: some settings of host CR0 are " "not allowed in VMX operation.\n", cpu); return -EINVAL; } rdmsr(IA32_FEATURE_CONTROL_MSR, eax, edx); bios_locked = !!(eax & IA32_FEATURE_CONTROL_MSR_LOCK); if ( bios_locked ) { if ( !(eax & (tboot_in_measured_env() ? 
IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX : IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX)) ) { printk("CPU%d: VMX disabled by BIOS.\n", cpu); return -EINVAL; } } else { eax = IA32_FEATURE_CONTROL_MSR_LOCK; eax |= IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX; if ( test_bit(X86_FEATURE_SMXE, &boot_cpu_data.x86_capability) ) eax |= IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX; wrmsr(IA32_FEATURE_CONTROL_MSR, eax, 0); } if ( (rc = vmx_init_vmcs_config()) != 0 ) return rc; INIT_LIST_HEAD(&this_cpu(active_vmcs_list)); if ( (rc = vmx_cpu_up_prepare(cpu)) != 0 ) return rc; switch ( __vmxon(virt_to_maddr(this_cpu(vmxon_region))) ) { case -2: /* #UD or #GP */ if ( bios_locked && test_bit(X86_FEATURE_SMXE, &boot_cpu_data.x86_capability) && (!(eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX) || !(eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX)) ) { printk("CPU%d: VMXON failed: perhaps because of TXT settings " "in your BIOS configuration?\n", cpu); printk(" --> Disable TXT in your BIOS unless using a secure " "bootloader.\n"); return -EINVAL; } /* fall through */ case -1: /* CF==1 or ZF==1 */ printk("CPU%d: unexpected VMXON failure\n", cpu); return -EINVAL; case 0: /* success */ this_cpu(vmxon) = 1; break; default: BUG(); } hvm_asid_init(cpu_has_vmx_vpid ? (1u << VMCS_VPID_WIDTH) : 0); if ( cpu_has_vmx_ept ) ept_sync_all(); if ( cpu_has_vmx_vpid ) vpid_sync_all(); return 0; } void vmx_cpu_down(void) { struct list_head *active_vmcs_list = &this_cpu(active_vmcs_list); unsigned long flags; if ( !this_cpu(vmxon) ) return; local_irq_save(flags); while ( !list_empty(active_vmcs_list) ) __vmx_clear_vmcs(list_entry(active_vmcs_list->next, struct vcpu, arch.hvm_vmx.active_list)); BUG_ON(!(read_cr4() & X86_CR4_VMXE)); this_cpu(vmxon) = 0; __vmxoff(); local_irq_restore(flags); } struct foreign_vmcs { struct vcpu *v; unsigned int count; }; static DEFINE_PER_CPU(struct foreign_vmcs, foreign_vmcs); bool_t vmx_vmcs_try_enter(struct vcpu *v) { struct foreign_vmcs *fv; /* * NB. We must *always* run an HVM VCPU on its own VMCS, except for * vmx_vmcs_enter/exit and scheduling tail critical regions. */ if ( likely(v == current) ) return v->arch.hvm_vmx.vmcs == this_cpu(current_vmcs); fv = &this_cpu(foreign_vmcs); if ( fv->v == v ) { BUG_ON(fv->count == 0); } else { BUG_ON(fv->v != NULL); BUG_ON(fv->count != 0); vcpu_pause(v); spin_lock(&v->arch.hvm_vmx.vmcs_lock); vmx_clear_vmcs(v); vmx_load_vmcs(v); fv->v = v; } fv->count++; return 1; } void vmx_vmcs_enter(struct vcpu *v) { bool_t okay = vmx_vmcs_try_enter(v); ASSERT(okay); } void vmx_vmcs_exit(struct vcpu *v) { struct foreign_vmcs *fv; if ( likely(v == current) ) return; fv = &this_cpu(foreign_vmcs); BUG_ON(fv->v != v); BUG_ON(fv->count == 0); if ( --fv->count == 0 ) { /* Don't confuse vmx_do_resume (for @v or @current!) 
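/*
 * Illustrative sketch (not Xen code): vmx_vmcs_try_enter() and
 * vmx_vmcs_exit() around here let the current CPU temporarily operate on
 * another vCPU's VMCS by pausing that vCPU, loading its VMCS, and counting
 * nested enters so that only the outermost exit restores the caller's own
 * VMCS.  The standalone model below shows just that nesting discipline;
 * foreign_ctx, ctx_enter() and ctx_exit() are hypothetical names.
 */
#include <assert.h>
#include <stdio.h>

struct foreign_ctx {
    int owner;        /* which remote context is mapped, -1 if none */
    unsigned count;   /* nesting depth */
};

static void ctx_enter(struct foreign_ctx *fc, int remote)
{
    if ( fc->count == 0 )
    {
        fc->owner = remote;
        printf("pause %d and load its context\n", remote);
    }
    assert(fc->owner == remote);   /* no interleaving of different targets */
    fc->count++;
}

static void ctx_exit(struct foreign_ctx *fc, int remote)
{
    assert(fc->owner == remote && fc->count > 0);
    if ( --fc->count == 0 )
    {
        printf("restore own context and unpause %d\n", remote);
        fc->owner = -1;
    }
}

int main(void)
{
    struct foreign_ctx fc = { .owner = -1 };

    ctx_enter(&fc, 3);
    ctx_enter(&fc, 3);   /* nested enter: just bumps the count */
    ctx_exit(&fc, 3);
    ctx_exit(&fc, 3);    /* outermost exit restores the original context */
    return 0;
}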
*/ vmx_clear_vmcs(v); if ( has_hvm_container_vcpu(current) ) vmx_load_vmcs(current); spin_unlock(&v->arch.hvm_vmx.vmcs_lock); vcpu_unpause(v); fv->v = NULL; } } struct xgt_desc { unsigned short size; unsigned long address __attribute__((packed)); }; static void vmx_set_host_env(struct vcpu *v) { unsigned int cpu = smp_processor_id(); __vmwrite(HOST_GDTR_BASE, (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY)); __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]); __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3); __vmwrite(HOST_TR_BASE, (unsigned long)&per_cpu(init_tss, cpu)); __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom()); /* * Skip end of cpu_user_regs when entering the hypervisor because the * CPU does not save context onto the stack. SS,RSP,CS,RIP,RFLAGS,etc * all get saved into the VMCS instead. */ __vmwrite(HOST_RSP, (unsigned long)&get_cpu_info()->guest_cpu_user_regs.error_code); } void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type) { unsigned long *msr_bitmap = v->arch.hvm_vmx.msr_bitmap; /* VMX MSR bitmap supported? */ if ( msr_bitmap == NULL ) return; /* * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals * have the write-low and read-high bitmap offsets the wrong way round. * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. */ if ( msr <= 0x1fff ) { if ( type & MSR_TYPE_R ) clear_bit(msr, msr_bitmap + 0x000/BYTES_PER_LONG); /* read-low */ if ( type & MSR_TYPE_W ) clear_bit(msr, msr_bitmap + 0x800/BYTES_PER_LONG); /* write-low */ } else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) ) { msr &= 0x1fff; if ( type & MSR_TYPE_R ) clear_bit(msr, msr_bitmap + 0x400/BYTES_PER_LONG); /* read-high */ if ( type & MSR_TYPE_W ) clear_bit(msr, msr_bitmap + 0xc00/BYTES_PER_LONG); /* write-high */ } else HVM_DBG_LOG(DBG_LEVEL_0, "msr %x is out of the control range" "0x00000000-0x00001fff and 0xc0000000-0xc0001fff" "RDMSR or WRMSR will cause a VM exit", msr); } void vmx_enable_intercept_for_msr(struct vcpu *v, u32 msr, int type) { unsigned long *msr_bitmap = v->arch.hvm_vmx.msr_bitmap; /* VMX MSR bitmap supported? */ if ( msr_bitmap == NULL ) return; /* * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals * have the write-low and read-high bitmap offsets the wrong way round. * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
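/*
 * Illustrative sketch (not Xen code): the MSR bitmap manipulated above is a
 * 4K page split into four 1024-byte regions -- read-low (offset 0x000),
 * read-high (0x400), write-low (0x800) and write-high (0xc00) -- covering
 * MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff respectively.
 * msr_bitmap_byte() is a hypothetical helper computing the byte and bit a
 * given MSR and access type map to.
 */
#include <stdint.h>
#include <stdio.h>

/* Returns the byte offset into the 4K bitmap, or -1 if uncontrollable. */
static int msr_bitmap_byte(uint32_t msr, int write, unsigned int *bit)
{
    unsigned int region;

    if ( msr <= 0x1fff )
        region = write ? 0x800 : 0x000;              /* low MSR range  */
    else if ( msr >= 0xc0000000 && msr <= 0xc0001fff )
    {
        msr &= 0x1fff;
        region = write ? 0xc00 : 0x400;              /* high MSR range */
    }
    else
        return -1;                /* outside both ranges: always intercepted */

    *bit = msr & 7;
    return region + msr / 8;
}

int main(void)
{
    unsigned int bit;
    int off = msr_bitmap_byte(0xc0000080 /* EFER */, 1, &bit);

    printf("byte %#x, bit %u\n", (unsigned)off, bit);
    return 0;
}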
*/ if ( msr <= 0x1fff ) { if ( type & MSR_TYPE_R ) set_bit(msr, msr_bitmap + 0x000/BYTES_PER_LONG); /* read-low */ if ( type & MSR_TYPE_W ) set_bit(msr, msr_bitmap + 0x800/BYTES_PER_LONG); /* write-low */ } else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) ) { msr &= 0x1fff; if ( type & MSR_TYPE_R ) set_bit(msr, msr_bitmap + 0x400/BYTES_PER_LONG); /* read-high */ if ( type & MSR_TYPE_W ) set_bit(msr, msr_bitmap + 0xc00/BYTES_PER_LONG); /* write-high */ } else HVM_DBG_LOG(DBG_LEVEL_0, "msr %x is out of the control range" "0x00000000-0x00001fff and 0xc0000000-0xc0001fff" "RDMSR or WRMSR will cause a VM exit", msr); } /* * access_type: read == 0, write == 1 */ int vmx_check_msr_bitmap(unsigned long *msr_bitmap, u32 msr, int access_type) { int ret = 1; if ( !msr_bitmap ) return 1; if ( msr <= 0x1fff ) { if ( access_type == 0 ) ret = test_bit(msr, msr_bitmap + 0x000/BYTES_PER_LONG); /* read-low */ else if ( access_type == 1 ) ret = test_bit(msr, msr_bitmap + 0x800/BYTES_PER_LONG); /* write-low */ } else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) ) { msr &= 0x1fff; if ( access_type == 0 ) ret = test_bit(msr, msr_bitmap + 0x400/BYTES_PER_LONG); /* read-high */ else if ( access_type == 1 ) ret = test_bit(msr, msr_bitmap + 0xc00/BYTES_PER_LONG); /* write-high */ } return ret; } /* * Switch VMCS between layer 1 & 2 guest */ void vmx_vmcs_switch(struct vmcs_struct *from, struct vmcs_struct *to) { struct arch_vmx_struct *vmx = ¤t->arch.hvm_vmx; spin_lock(&vmx->vmcs_lock); __vmpclear(virt_to_maddr(from)); if ( vmx->vmcs_shadow_maddr ) __vmpclear(vmx->vmcs_shadow_maddr); __vmptrld(virt_to_maddr(to)); vmx->vmcs = to; vmx->launched = 0; this_cpu(current_vmcs) = to; if ( vmx->hostenv_migrated ) { vmx->hostenv_migrated = 0; vmx_set_host_env(current); } spin_unlock(&vmx->vmcs_lock); } void virtual_vmcs_enter(void *vvmcs) { __vmptrld(pfn_to_paddr(domain_page_map_to_mfn(vvmcs))); } void virtual_vmcs_exit(void *vvmcs) { __vmpclear(pfn_to_paddr(domain_page_map_to_mfn(vvmcs))); __vmptrld(virt_to_maddr(this_cpu(current_vmcs))); } u64 virtual_vmcs_vmread(void *vvmcs, u32 vmcs_encoding) { u64 res; virtual_vmcs_enter(vvmcs); __vmread(vmcs_encoding, &res); virtual_vmcs_exit(vvmcs); return res; } void virtual_vmcs_vmwrite(void *vvmcs, u32 vmcs_encoding, u64 val) { virtual_vmcs_enter(vvmcs); __vmwrite(vmcs_encoding, val); virtual_vmcs_exit(vvmcs); } static int construct_vmcs(struct vcpu *v) { struct domain *d = v->domain; uint16_t sysenter_cs; unsigned long sysenter_eip; u32 vmexit_ctl = vmx_vmexit_control; u32 vmentry_ctl = vmx_vmentry_control; vmx_vmcs_enter(v); /* VMCS controls. */ __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control); v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control; if ( d->arch.vtsc ) v->arch.hvm_vmx.exec_control |= CPU_BASED_RDTSC_EXITING; v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control; /* Disable VPID for now: we decide when to enable it on VMENTER. */ v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if ( paging_mode_hap(d) ) { v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING | CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); } else { v->arch.hvm_vmx.secondary_exec_control &= ~(SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_ENABLE_INVPCID); vmexit_ctl &= ~(VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT); vmentry_ctl &= ~VM_ENTRY_LOAD_GUEST_PAT; } /* Disable Virtualize x2APIC mode by default. 
*/ v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; /* Do not enable Monitor Trap Flag unless start single step debug */ v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG; if ( is_pvh_domain(d) ) { /* Disable virtual apics, TPR */ v->arch.hvm_vmx.secondary_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW; /* Unrestricted guest (real mode for EPT) */ v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; /* Start in 64-bit mode. PVH 32bitfixme. */ vmentry_ctl |= VM_ENTRY_IA32E_MODE; /* GUEST_EFER.LME/LMA ignored */ ASSERT(v->arch.hvm_vmx.exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); ASSERT(v->arch.hvm_vmx.exec_control & CPU_BASED_ACTIVATE_MSR_BITMAP); ASSERT(!(v->arch.hvm_vmx.exec_control & CPU_BASED_RDTSC_EXITING)); } vmx_update_cpu_exec_control(v); __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl); __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl); if ( cpu_has_vmx_ple ) { __vmwrite(PLE_GAP, ple_gap); __vmwrite(PLE_WINDOW, ple_window); } if ( cpu_has_vmx_secondary_exec_control ) __vmwrite(SECONDARY_VM_EXEC_CONTROL, v->arch.hvm_vmx.secondary_exec_control); /* MSR access bitmap. */ if ( cpu_has_vmx_msr_bitmap ) { unsigned long *msr_bitmap = alloc_xenheap_page(); if ( msr_bitmap == NULL ) { vmx_vmcs_exit(v); return -ENOMEM; } memset(msr_bitmap, ~0, PAGE_SIZE); v->arch.hvm_vmx.msr_bitmap = msr_bitmap; __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap)); vmx_disable_intercept_for_msr(v, MSR_FS_BASE, MSR_TYPE_R | MSR_TYPE_W); vmx_disable_intercept_for_msr(v, MSR_GS_BASE, MSR_TYPE_R | MSR_TYPE_W); vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, MSR_TYPE_R | MSR_TYPE_W); vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, MSR_TYPE_R | MSR_TYPE_W); vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, MSR_TYPE_R | MSR_TYPE_W); vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, MSR_TYPE_R | MSR_TYPE_W); if ( paging_mode_hap(d) && (!iommu_enabled || iommu_snoop) ) vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT, MSR_TYPE_R | MSR_TYPE_W); } /* I/O access bitmap. */ __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0)); __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE)); if ( cpu_has_vmx_virtual_intr_delivery ) { unsigned int i; /* EOI-exit bitmap */ bitmap_zero(v->arch.hvm_vmx.eoi_exit_bitmap, NR_VECTORS); for ( i = 0; i < ARRAY_SIZE(v->arch.hvm_vmx.eoi_exit_bitmap); ++i ) __vmwrite(EOI_EXIT_BITMAP(i), 0); /* Initialise Guest Interrupt Status (RVI and SVI) to 0 */ __vmwrite(GUEST_INTR_STATUS, 0); } if ( cpu_has_vmx_posted_intr_processing ) { __vmwrite(PI_DESC_ADDR, virt_to_maddr(&v->arch.hvm_vmx.pi_desc)); __vmwrite(POSTED_INTR_NOTIFICATION_VECTOR, posted_intr_vector); } /* Host data selectors. */ __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS); __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS); __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS); __vmwrite(HOST_FS_SELECTOR, 0); __vmwrite(HOST_GS_SELECTOR, 0); __vmwrite(HOST_FS_BASE, 0); __vmwrite(HOST_GS_BASE, 0); /* Host control registers. */ v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS; __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0); __vmwrite(HOST_CR4, mmu_cr4_features); /* Host CS:RIP. */ __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS); __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler); /* Host SYSENTER CS:RIP. 
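 * These host-state fields are loaded back into the SYSENTER MSRs on every
 * VM exit, so mirror the host's current values here.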
*/ rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs); __vmwrite(HOST_SYSENTER_CS, sysenter_cs); rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip); __vmwrite(HOST_SYSENTER_EIP, sysenter_eip); /* MSR intercepts. */ __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0); __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0); __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0); __vmwrite(VM_ENTRY_INTR_INFO, 0); __vmwrite(CR0_GUEST_HOST_MASK, ~0UL); __vmwrite(CR4_GUEST_HOST_MASK, ~0UL); __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0); __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0); __vmwrite(CR3_TARGET_COUNT, 0); __vmwrite(GUEST_ACTIVITY_STATE, 0); /* Guest segment bases. */ __vmwrite(GUEST_ES_BASE, 0); __vmwrite(GUEST_SS_BASE, 0); __vmwrite(GUEST_DS_BASE, 0); __vmwrite(GUEST_FS_BASE, 0); __vmwrite(GUEST_GS_BASE, 0); __vmwrite(GUEST_CS_BASE, 0); /* Guest segment limits. */ __vmwrite(GUEST_ES_LIMIT, ~0u); __vmwrite(GUEST_SS_LIMIT, ~0u); __vmwrite(GUEST_DS_LIMIT, ~0u); __vmwrite(GUEST_FS_LIMIT, ~0u); __vmwrite(GUEST_GS_LIMIT, ~0u); __vmwrite(GUEST_CS_LIMIT, ~0u); /* Guest segment AR bytes. */ __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */ __vmwrite(GUEST_SS_AR_BYTES, 0xc093); __vmwrite(GUEST_DS_AR_BYTES, 0xc093); __vmwrite(GUEST_FS_AR_BYTES, 0xc093); __vmwrite(GUEST_GS_AR_BYTES, 0xc093); if ( is_pvh_domain(d) ) /* CS.L == 1, exec, read/write, accessed. PVH 32bitfixme. */ __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); else __vmwrite(GUEST_CS_AR_BYTES, 0xc09b); /* exec/read, accessed */ /* Guest IDT. */ __vmwrite(GUEST_IDTR_BASE, 0); __vmwrite(GUEST_IDTR_LIMIT, 0); /* Guest GDT. */ __vmwrite(GUEST_GDTR_BASE, 0); __vmwrite(GUEST_GDTR_LIMIT, 0); /* Guest LDT. */ __vmwrite(GUEST_LDTR_AR_BYTES, 0x0082); /* LDT */ __vmwrite(GUEST_LDTR_SELECTOR, 0); __vmwrite(GUEST_LDTR_BASE, 0); __vmwrite(GUEST_LDTR_LIMIT, 0); /* Guest TSS. */ __vmwrite(GUEST_TR_AR_BYTES, 0x008b); /* 32-bit TSS (busy) */ __vmwrite(GUEST_TR_BASE, 0); __vmwrite(GUEST_TR_LIMIT, 0xff); __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); __vmwrite(GUEST_DR7, 0); __vmwrite(VMCS_LINK_POINTER, ~0UL); v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault)) | (1U << TRAP_no_device); vmx_update_exception_bitmap(v); /* * In HVM domains, this happens on the realmode->paging * transition. Since PVH never goes through this transition, we * need to do it at start-of-day. */ if ( is_pvh_domain(d) ) vmx_update_debug_state(v); v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET; /* PVH domains always start in paging mode */ if ( is_pvh_domain(d) ) v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_PG; hvm_update_guest_cr(v, 0); v->arch.hvm_vcpu.guest_cr[4] = is_pvh_domain(d) ? X86_CR4_PAE : 0; hvm_update_guest_cr(v, 4); if ( cpu_has_vmx_tpr_shadow ) { __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vcpu_vlapic(v)->regs_page)); __vmwrite(TPR_THRESHOLD, 0); } if ( paging_mode_hap(d) ) { struct p2m_domain *p2m = p2m_get_hostp2m(d); struct ept_data *ept = &p2m->ept; ept->asr = pagetable_get_pfn(p2m_get_pagetable(p2m)); __vmwrite(EPT_POINTER, ept_get_eptp(ept)); } if ( paging_mode_hap(d) ) { u64 host_pat, guest_pat; rdmsrl(MSR_IA32_CR_PAT, host_pat); guest_pat = MSR_IA32_CR_PAT_RESET; __vmwrite(HOST_PAT, host_pat); __vmwrite(GUEST_PAT, guest_pat); } vmx_vmcs_exit(v); /* PVH: paging mode is updated by arch_set_info_guest(). 
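 * For plain HVM vcpus it is settled right here instead, via
 * paging_update_paging_modes() below.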
*/ if ( is_hvm_vcpu(v) ) { /* will update HOST & GUEST_CR3 as reqd */ paging_update_paging_modes(v); vmx_vlapic_msr_changed(v); } return 0; } int vmx_read_guest_msr(u32 msr, u64 *val) { struct vcpu *curr = current; unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count; const struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; for ( i = 0; i < msr_count; i++ ) { if ( msr_area[i].index == msr ) { *val = msr_area[i].data; return 0; } } return -ESRCH; } int vmx_write_guest_msr(u32 msr, u64 val) { struct vcpu *curr = current; unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count; struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; for ( i = 0; i < msr_count; i++ ) { if ( msr_area[i].index == msr ) { msr_area[i].data = val; return 0; } } return -ESRCH; } int vmx_add_guest_msr(u32 msr) { struct vcpu *curr = current; unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count; struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; if ( msr_area == NULL ) { if ( (msr_area = alloc_xenheap_page()) == NULL ) return -ENOMEM; curr->arch.hvm_vmx.msr_area = msr_area; __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area)); __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area)); } for ( i = 0; i < msr_count; i++ ) if ( msr_area[i].index == msr ) return 0; if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) ) return -ENOSPC; msr_area[msr_count].index = msr; msr_area[msr_count].mbz = 0; msr_area[msr_count].data = 0; curr->arch.hvm_vmx.msr_count = ++msr_count; __vmwrite(VM_EXIT_MSR_STORE_COUNT, msr_count); __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, msr_count); return 0; } int vmx_add_host_load_msr(u32 msr) { struct vcpu *curr = current; unsigned int i, msr_count = curr->arch.hvm_vmx.host_msr_count; struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.host_msr_area; if ( msr_area == NULL ) { if ( (msr_area = alloc_xenheap_page()) == NULL ) return -ENOMEM; curr->arch.hvm_vmx.host_msr_area = msr_area; __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area)); } for ( i = 0; i < msr_count; i++ ) if ( msr_area[i].index == msr ) return 0; if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) ) return -ENOSPC; msr_area[msr_count].index = msr; msr_area[msr_count].mbz = 0; rdmsrl(msr, msr_area[msr_count].data); curr->arch.hvm_vmx.host_msr_count = ++msr_count; __vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count); return 0; } void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector) { if ( !test_and_set_bit(vector, v->arch.hvm_vmx.eoi_exit_bitmap) ) set_bit(vector / BITS_PER_LONG, &v->arch.hvm_vmx.eoi_exitmap_changed); } void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector) { if ( test_and_clear_bit(vector, v->arch.hvm_vmx.eoi_exit_bitmap) ) set_bit(vector / BITS_PER_LONG, &v->arch.hvm_vmx.eoi_exitmap_changed); } int vmx_create_vmcs(struct vcpu *v) { struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx; int rc; if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL ) return -ENOMEM; INIT_LIST_HEAD(&arch_vmx->active_list); __vmpclear(virt_to_maddr(arch_vmx->vmcs)); arch_vmx->active_cpu = -1; arch_vmx->launched = 0; if ( (rc = construct_vmcs(v)) != 0 ) { vmx_free_vmcs(arch_vmx->vmcs); return rc; } return 0; } void vmx_destroy_vmcs(struct vcpu *v) { struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx; vmx_clear_vmcs(v); vmx_free_vmcs(arch_vmx->vmcs); free_xenheap_page(v->arch.hvm_vmx.host_msr_area); free_xenheap_page(v->arch.hvm_vmx.msr_area); free_xenheap_page(v->arch.hvm_vmx.msr_bitmap); } void vm_launch_fail(void) { unsigned long error; __vmread(VM_INSTRUCTION_ERROR, &error); printk(" 
error code %lx\n", error); domain_crash_synchronous(); } void vm_resume_fail(void) { unsigned long error; __vmread(VM_INSTRUCTION_ERROR, &error); printk(" error code %lx\n", error); domain_crash_synchronous(); } static void wbinvd_ipi(void *info) { wbinvd(); } void vmx_do_resume(struct vcpu *v) { bool_t debug_state; if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() ) { if ( v->arch.hvm_vmx.vmcs != this_cpu(current_vmcs) ) vmx_load_vmcs(v); } else { /* * For pass-through domain, guest PCI-E device driver may leverage the * "Non-Snoop" I/O, and explicitly WBINVD or CLFLUSH to a RAM space. * Since migration may occur before WBINVD or CLFLUSH, we need to * maintain data consistency either by: * 1: flushing cache (wbinvd) when the guest is scheduled out if * there is no wbinvd exit, or * 2: execute wbinvd on all dirty pCPUs when guest wbinvd exits. * If VT-d engine can force snooping, we don't need to do these. */ if ( has_arch_pdevs(v->domain) && !iommu_snoop && !cpu_has_wbinvd_exiting ) { int cpu = v->arch.hvm_vmx.active_cpu; if ( cpu != -1 ) on_selected_cpus(cpumask_of(cpu), wbinvd_ipi, NULL, 1); } vmx_clear_vmcs(v); vmx_load_vmcs(v); hvm_migrate_timers(v); hvm_migrate_pirqs(v); vmx_set_host_env(v); /* * Both n1 VMCS and n2 VMCS need to update the host environment after * VCPU migration. The environment of current VMCS is updated in place, * but the action of another VMCS is deferred till it is switched in. */ v->arch.hvm_vmx.hostenv_migrated = 1; hvm_asid_flush_vcpu(v); } debug_state = v->domain->debugger_attached || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_INT3] || v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP]; if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) ) { v->arch.hvm_vcpu.debug_state_latch = debug_state; vmx_update_debug_state(v); } hvm_do_resume(v); reset_stack_and_jump(vmx_asm_do_vmentry); } static inline unsigned long vmr(unsigned long field) { unsigned long val; return __vmread_safe(field, &val) ? 
val : 0; } static void vmx_dump_sel(char *name, uint32_t selector) { uint32_t sel, attr, limit; uint64_t base; sel = vmr(selector); attr = vmr(selector + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR)); limit = vmr(selector + (GUEST_ES_LIMIT - GUEST_ES_SELECTOR)); base = vmr(selector + (GUEST_ES_BASE - GUEST_ES_SELECTOR)); printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016"PRIx64"\n", name, sel, attr, limit, base); } static void vmx_dump_sel2(char *name, uint32_t lim) { uint32_t limit; uint64_t base; limit = vmr(lim); base = vmr(lim + (GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); printk("%s: limit=0x%08x, base=0x%016"PRIx64"\n", name, limit, base); } void vmcs_dump_vcpu(struct vcpu *v) { struct cpu_user_regs *regs = &v->arch.user_regs; unsigned long long x; if ( v == current ) regs = guest_cpu_user_regs(); vmx_vmcs_enter(v); printk("*** Guest State ***\n"); printk("CR0: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n", (unsigned long long)vmr(GUEST_CR0), (unsigned long long)vmr(CR0_READ_SHADOW), (unsigned long long)vmr(CR0_GUEST_HOST_MASK)); printk("CR4: actual=0x%016llx, shadow=0x%016llx, gh_mask=%016llx\n", (unsigned long long)vmr(GUEST_CR4), (unsigned long long)vmr(CR4_READ_SHADOW), (unsigned long long)vmr(CR4_GUEST_HOST_MASK)); printk("CR3: actual=0x%016llx, target_count=%d\n", (unsigned long long)vmr(GUEST_CR3), (int)vmr(CR3_TARGET_COUNT)); printk(" target0=%016llx, target1=%016llx\n", (unsigned long long)vmr(CR3_TARGET_VALUE0), (unsigned long long)vmr(CR3_TARGET_VALUE1)); printk(" target2=%016llx, target3=%016llx\n", (unsigned long long)vmr(CR3_TARGET_VALUE2), (unsigned long long)vmr(CR3_TARGET_VALUE3)); printk("RSP = 0x%016llx (0x%016llx) RIP = 0x%016llx (0x%016llx)\n", (unsigned long long)vmr(GUEST_RSP), (unsigned long long)regs->esp, (unsigned long long)vmr(GUEST_RIP), (unsigned long long)regs->eip); printk("RFLAGS=0x%016llx (0x%016llx) DR7 = 0x%016llx\n", (unsigned long long)vmr(GUEST_RFLAGS), (unsigned long long)regs->eflags, (unsigned long long)vmr(GUEST_DR7)); printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n", (unsigned long long)vmr(GUEST_SYSENTER_ESP), (int)vmr(GUEST_SYSENTER_CS), (unsigned long long)vmr(GUEST_SYSENTER_EIP)); vmx_dump_sel("CS", GUEST_CS_SELECTOR); vmx_dump_sel("DS", GUEST_DS_SELECTOR); vmx_dump_sel("SS", GUEST_SS_SELECTOR); vmx_dump_sel("ES", GUEST_ES_SELECTOR); vmx_dump_sel("FS", GUEST_FS_SELECTOR); vmx_dump_sel("GS", GUEST_GS_SELECTOR); vmx_dump_sel2("GDTR", GUEST_GDTR_LIMIT); vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR); vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT); vmx_dump_sel("TR", GUEST_TR_SELECTOR); printk("Guest PAT = 0x%08x%08x\n", (uint32_t)vmr(GUEST_PAT_HIGH), (uint32_t)vmr(GUEST_PAT)); x = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32; x |= (uint32_t)vmr(TSC_OFFSET); printk("TSC Offset = %016llx\n", x); x = (unsigned long long)vmr(GUEST_IA32_DEBUGCTL_HIGH) << 32; x |= (uint32_t)vmr(GUEST_IA32_DEBUGCTL); printk("DebugCtl=%016llx DebugExceptions=%016llx\n", x, (unsigned long long)vmr(GUEST_PENDING_DBG_EXCEPTIONS)); printk("Interruptibility=%04x ActivityState=%04x\n", (int)vmr(GUEST_INTERRUPTIBILITY_INFO), (int)vmr(GUEST_ACTIVITY_STATE)); printk("*** Host State ***\n"); printk("RSP = 0x%016llx RIP = 0x%016llx\n", (unsigned long long)vmr(HOST_RSP), (unsigned long long)vmr(HOST_RIP)); printk("CS=%04x DS=%04x ES=%04x FS=%04x GS=%04x SS=%04x TR=%04x\n", (uint16_t)vmr(HOST_CS_SELECTOR), (uint16_t)vmr(HOST_DS_SELECTOR), (uint16_t)vmr(HOST_ES_SELECTOR), (uint16_t)vmr(HOST_FS_SELECTOR), (uint16_t)vmr(HOST_GS_SELECTOR), (uint16_t)vmr(HOST_SS_SELECTOR), 
(uint16_t)vmr(HOST_TR_SELECTOR)); printk("FSBase=%016llx GSBase=%016llx TRBase=%016llx\n", (unsigned long long)vmr(HOST_FS_BASE), (unsigned long long)vmr(HOST_GS_BASE), (unsigned long long)vmr(HOST_TR_BASE)); printk("GDTBase=%016llx IDTBase=%016llx\n", (unsigned long long)vmr(HOST_GDTR_BASE), (unsigned long long)vmr(HOST_IDTR_BASE)); printk("CR0=%016llx CR3=%016llx CR4=%016llx\n", (unsigned long long)vmr(HOST_CR0), (unsigned long long)vmr(HOST_CR3), (unsigned long long)vmr(HOST_CR4)); printk("Sysenter RSP=%016llx CS:RIP=%04x:%016llx\n", (unsigned long long)vmr(HOST_SYSENTER_ESP), (int)vmr(HOST_SYSENTER_CS), (unsigned long long)vmr(HOST_SYSENTER_EIP)); printk("Host PAT = 0x%08x%08x\n", (uint32_t)vmr(HOST_PAT_HIGH), (uint32_t)vmr(HOST_PAT)); printk("*** Control State ***\n"); printk("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", (uint32_t)vmr(PIN_BASED_VM_EXEC_CONTROL), (uint32_t)vmr(CPU_BASED_VM_EXEC_CONTROL), (uint32_t)vmr(SECONDARY_VM_EXEC_CONTROL)); printk("EntryControls=%08x ExitControls=%08x\n", (uint32_t)vmr(VM_ENTRY_CONTROLS), (uint32_t)vmr(VM_EXIT_CONTROLS)); printk("ExceptionBitmap=%08x\n", (uint32_t)vmr(EXCEPTION_BITMAP)); printk("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", (uint32_t)vmr(VM_ENTRY_INTR_INFO), (uint32_t)vmr(VM_ENTRY_EXCEPTION_ERROR_CODE), (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN)); printk("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", (uint32_t)vmr(VM_EXIT_INTR_INFO), (uint32_t)vmr(VM_EXIT_INTR_ERROR_CODE), (uint32_t)vmr(VM_ENTRY_INSTRUCTION_LEN)); printk(" reason=%08x qualification=%08x\n", (uint32_t)vmr(VM_EXIT_REASON), (uint32_t)vmr(EXIT_QUALIFICATION)); printk("IDTVectoring: info=%08x errcode=%08x\n", (uint32_t)vmr(IDT_VECTORING_INFO), (uint32_t)vmr(IDT_VECTORING_ERROR_CODE)); printk("TPR Threshold = 0x%02x\n", (uint32_t)vmr(TPR_THRESHOLD)); printk("EPT pointer = 0x%08x%08x\n", (uint32_t)vmr(EPT_POINTER_HIGH), (uint32_t)vmr(EPT_POINTER)); printk("Virtual processor ID = 0x%04x\n", (uint32_t)vmr(VIRTUAL_PROCESSOR_ID)); vmx_vmcs_exit(v); } static void vmcs_dump(unsigned char ch) { struct domain *d; struct vcpu *v; printk("*********** VMCS Areas **************\n"); rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) { if ( !has_hvm_container_domain(d) ) continue; printk("\n>>> Domain %d <<<\n", d->domain_id); for_each_vcpu ( d, v ) { printk("\tVCPU %d\n", v->vcpu_id); vmcs_dump_vcpu(v); } } rcu_read_unlock(&domlist_read_lock); printk("**************************************\n"); } static struct keyhandler vmcs_dump_keyhandler = { .diagnostic = 1, .u.fn = vmcs_dump, .desc = "dump Intel's VMCS" }; void __init setup_vmcs_dump(void) { register_keyhandler('v', &vmcs_dump_keyhandler); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/rtc.c0000664000175000017500000006030112307313555014325 0ustar smbsmb/* * QEMU MC146818 RTC emulation * * Copyright (c) 2003-2004 Fabrice Bellard * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include #include #include #include #include #define USEC_PER_SEC 1000000UL #define NS_PER_USEC 1000UL #define NS_PER_SEC 1000000000ULL #define SEC_PER_MIN 60 #define SEC_PER_HOUR 3600 #define MIN_PER_HOUR 60 #define HOUR_PER_DAY 24 #define domain_vrtc(x) (&(x)->arch.hvm_domain.pl_time.vrtc) #define vcpu_vrtc(x) (domain_vrtc((x)->domain)) #define vrtc_domain(x) (container_of((x), struct domain, \ arch.hvm_domain.pl_time.vrtc)) #define vrtc_vcpu(x) (pt_global_vcpu_target(vrtc_domain(x))) #define epoch_year 1900 #define get_year(x) (x + epoch_year) enum rtc_mode { rtc_mode_no_ack, rtc_mode_strict }; /* This must be in sync with how hvmloader sets the ACPI WAET flags. */ #define mode_is(d, m) ((void)(d), rtc_mode_##m == rtc_mode_no_ack) #define rtc_mode_is(s, m) mode_is(vrtc_domain(s), m) static void rtc_copy_date(RTCState *s); static void rtc_set_time(RTCState *s); static inline int from_bcd(RTCState *s, int a); static inline int convert_hour(RTCState *s, int hour); static void rtc_update_irq(RTCState *s) { ASSERT(spin_is_locked(&s->lock)); if ( rtc_mode_is(s, strict) && (s->hw.cmos_data[RTC_REG_C] & RTC_IRQF) ) return; /* IRQ is raised if any source is both raised & enabled */ if ( !(s->hw.cmos_data[RTC_REG_B] & s->hw.cmos_data[RTC_REG_C] & (RTC_PF | RTC_AF | RTC_UF)) ) return; s->hw.cmos_data[RTC_REG_C] |= RTC_IRQF; if ( rtc_mode_is(s, no_ack) ) hvm_isa_irq_deassert(vrtc_domain(s), RTC_IRQ); hvm_isa_irq_assert(vrtc_domain(s), RTC_IRQ); } bool_t rtc_periodic_interrupt(void *opaque) { RTCState *s = opaque; bool_t ret; spin_lock(&s->lock); ret = rtc_mode_is(s, no_ack) || !(s->hw.cmos_data[RTC_REG_C] & RTC_IRQF); if ( rtc_mode_is(s, no_ack) || !(s->hw.cmos_data[RTC_REG_C] & RTC_PF) ) { s->hw.cmos_data[RTC_REG_C] |= RTC_PF; rtc_update_irq(s); } else if ( ++(s->pt_dead_ticks) >= 10 ) { /* VM is ignoring its RTC; no point in running the timer */ destroy_periodic_time(&s->pt); s->pt_code = 0; } if ( !(s->hw.cmos_data[RTC_REG_C] & RTC_IRQF) ) ret = 0; spin_unlock(&s->lock); return ret; } /* Enable/configure/disable the periodic timer based on the RTC_PIE and * RTC_RATE_SELECT settings */ static void rtc_timer_update(RTCState *s) { int period_code, period, delta; struct vcpu *v = vrtc_vcpu(s); ASSERT(spin_is_locked(&s->lock)); s->pt_dead_ticks = 0; period_code = s->hw.cmos_data[RTC_REG_A] & RTC_RATE_SELECT; switch ( s->hw.cmos_data[RTC_REG_A] & RTC_DIV_CTL ) { case RTC_REF_CLCK_32KHZ: if ( (period_code != 0) && (period_code <= 2) ) period_code += 7; /* fall through */ case RTC_REF_CLCK_1MHZ: case RTC_REF_CLCK_4MHZ: if ( period_code != 0 ) { if ( period_code != s->pt_code ) { s->pt_code = period_code; period = 1 << (period_code - 1); /* period in 32 Khz cycles */ period = DIV_ROUND(period * 1000000000ULL, 32768); /* in ns */ if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VPT_ALIGN] ) delta = 0; else delta = period - ((NOW() - s->start_time) % period); create_periodic_time(v, &s->pt, delta, period, RTC_IRQ, NULL, s); } break; } /* fall through */ default: destroy_periodic_time(&s->pt); s->pt_code = 0; break; } } /* handle 
update-ended timer */ static void check_update_timer(RTCState *s) { uint64_t next_update_time, expire_time; uint64_t guest_usec; struct domain *d = vrtc_domain(s); stop_timer(&s->update_timer); stop_timer(&s->update_timer2); ASSERT(spin_is_locked(&s->lock)); if (!(s->hw.cmos_data[RTC_REG_C] & RTC_UF) && !(s->hw.cmos_data[RTC_REG_B] & RTC_SET)) { s->use_timer = 1; guest_usec = get_localtime_us(d) % USEC_PER_SEC; if (guest_usec >= (USEC_PER_SEC - 244)) { /* RTC is in update cycle */ s->hw.cmos_data[RTC_REG_A] |= RTC_UIP; next_update_time = (USEC_PER_SEC - guest_usec) * NS_PER_USEC; expire_time = NOW() + next_update_time; /* release lock before set timer */ spin_unlock(&s->lock); set_timer(&s->update_timer2, expire_time); /* fetch lock again */ spin_lock(&s->lock); } else { next_update_time = (USEC_PER_SEC - guest_usec - 244) * NS_PER_USEC; expire_time = NOW() + next_update_time; s->next_update_time = expire_time; /* release lock before set timer */ spin_unlock(&s->lock); set_timer(&s->update_timer, expire_time); /* fetch lock again */ spin_lock(&s->lock); } } else s->use_timer = 0; } static void rtc_update_timer(void *opaque) { RTCState *s = opaque; spin_lock(&s->lock); if (!(s->hw.cmos_data[RTC_REG_B] & RTC_SET)) { s->hw.cmos_data[RTC_REG_A] |= RTC_UIP; set_timer(&s->update_timer2, s->next_update_time + 244000UL); } spin_unlock(&s->lock); } static void rtc_update_timer2(void *opaque) { RTCState *s = opaque; spin_lock(&s->lock); if (!(s->hw.cmos_data[RTC_REG_B] & RTC_SET)) { s->hw.cmos_data[RTC_REG_C] |= RTC_UF; s->hw.cmos_data[RTC_REG_A] &= ~RTC_UIP; rtc_update_irq(s); check_update_timer(s); } spin_unlock(&s->lock); } /* handle alarm timer */ static void alarm_timer_update(RTCState *s) { uint64_t next_update_time, next_alarm_sec; uint64_t expire_time; int32_t alarm_sec, alarm_min, alarm_hour, cur_hour, cur_min, cur_sec; int32_t hour, min; struct domain *d = vrtc_domain(s); ASSERT(spin_is_locked(&s->lock)); stop_timer(&s->alarm_timer); if (!(s->hw.cmos_data[RTC_REG_C] & RTC_AF) && !(s->hw.cmos_data[RTC_REG_B] & RTC_SET)) { s->current_tm = gmtime(get_localtime(d)); rtc_copy_date(s); alarm_sec = from_bcd(s, s->hw.cmos_data[RTC_SECONDS_ALARM]); alarm_min = from_bcd(s, s->hw.cmos_data[RTC_MINUTES_ALARM]); alarm_hour = convert_hour(s, s->hw.cmos_data[RTC_HOURS_ALARM]); cur_sec = from_bcd(s, s->hw.cmos_data[RTC_SECONDS]); cur_min = from_bcd(s, s->hw.cmos_data[RTC_MINUTES]); cur_hour = convert_hour(s, s->hw.cmos_data[RTC_HOURS]); next_update_time = USEC_PER_SEC - (get_localtime_us(d) % USEC_PER_SEC); next_update_time = next_update_time * NS_PER_USEC + NOW(); if ((s->hw.cmos_data[RTC_HOURS_ALARM] & 0xc0) == 0xc0) { if ((s->hw.cmos_data[RTC_MINUTES_ALARM] & 0xc0) == 0xc0) { if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec = 1; else if (cur_sec < alarm_sec) next_alarm_sec = alarm_sec - cur_sec; else next_alarm_sec = alarm_sec + SEC_PER_MIN - cur_sec; } else { if (cur_min < alarm_min) { min = alarm_min - cur_min; next_alarm_sec = min * SEC_PER_MIN - cur_sec; if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec += 0; else next_alarm_sec += alarm_sec; } else if (cur_min == alarm_min) { if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec = 1; else if (cur_sec < alarm_sec) next_alarm_sec = alarm_sec - cur_sec; else { min = alarm_min + MIN_PER_HOUR - cur_min; next_alarm_sec = alarm_sec + min * SEC_PER_MIN - cur_sec; } } else { min = alarm_min + MIN_PER_HOUR - cur_min; next_alarm_sec = min * SEC_PER_MIN - cur_sec; if 
((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec += 0; else next_alarm_sec += alarm_sec; } } } else { if (cur_hour < alarm_hour) { hour = alarm_hour - cur_hour; next_alarm_sec = hour * SEC_PER_HOUR - cur_min * SEC_PER_MIN - cur_sec; if ((s->hw.cmos_data[RTC_MINUTES_ALARM] & 0xc0) == 0xc0) { if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec += 0; else next_alarm_sec += alarm_sec; } else { next_alarm_sec += alarm_min * SEC_PER_MIN; if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec += 0; else next_alarm_sec += alarm_sec; } } else if (cur_hour == alarm_hour) { if ((s->hw.cmos_data[RTC_MINUTES_ALARM] & 0xc0) == 0xc0) { if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec = 1; else if (cur_sec < alarm_sec) next_alarm_sec = alarm_sec - cur_sec; else next_alarm_sec = alarm_sec + SEC_PER_MIN - cur_sec; } else if (cur_min < alarm_min) { min = alarm_min - cur_min; next_alarm_sec = min * SEC_PER_MIN - cur_sec; if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec += 0; else next_alarm_sec += alarm_sec; } else if (cur_min == alarm_min) { if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec = 1; else if (cur_sec < alarm_sec) next_alarm_sec = alarm_sec - cur_sec; else { hour = alarm_hour + HOUR_PER_DAY - cur_hour; next_alarm_sec = hour * SEC_PER_HOUR - cur_min * SEC_PER_MIN - cur_sec; next_alarm_sec += alarm_min * SEC_PER_MIN + alarm_sec; } } else { hour = alarm_hour + HOUR_PER_DAY - cur_hour; next_alarm_sec = hour * SEC_PER_HOUR - cur_min * SEC_PER_MIN - cur_sec; next_alarm_sec += alarm_min * SEC_PER_MIN; if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec += 0; else next_alarm_sec += alarm_sec; } } else { hour = alarm_hour + HOUR_PER_DAY - cur_hour; next_alarm_sec = hour * SEC_PER_HOUR - cur_min * SEC_PER_MIN - cur_sec; if ((s->hw.cmos_data[RTC_MINUTES_ALARM] & 0xc0) == 0xc0) { if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec += 0; else next_alarm_sec += alarm_sec; } else { next_alarm_sec += alarm_min * SEC_PER_MIN; if ((s->hw.cmos_data[RTC_SECONDS_ALARM] & 0xc0) == 0xc0) next_alarm_sec += 0; else next_alarm_sec += alarm_sec; } } } expire_time = (next_alarm_sec - 1) * NS_PER_SEC + next_update_time; /* release lock before set timer */ spin_unlock(&s->lock); set_timer(&s->alarm_timer, expire_time); /* fetch lock again */ spin_lock(&s->lock); } } static void rtc_alarm_cb(void *opaque) { RTCState *s = opaque; spin_lock(&s->lock); if (!(s->hw.cmos_data[RTC_REG_B] & RTC_SET)) { s->hw.cmos_data[RTC_REG_C] |= RTC_AF; rtc_update_irq(s); alarm_timer_update(s); } spin_unlock(&s->lock); } static int rtc_ioport_write(void *opaque, uint32_t addr, uint32_t data) { RTCState *s = opaque; struct domain *d = vrtc_domain(s); uint32_t orig; spin_lock(&s->lock); if ( (addr & 1) == 0 ) { data &= 0x7f; s->hw.cmos_index = data; spin_unlock(&s->lock); return (data < RTC_CMOS_SIZE); } if ( s->hw.cmos_index >= RTC_CMOS_SIZE ) { spin_unlock(&s->lock); return 0; } orig = s->hw.cmos_data[s->hw.cmos_index]; switch ( s->hw.cmos_index ) { case RTC_SECONDS_ALARM: case RTC_MINUTES_ALARM: case RTC_HOURS_ALARM: s->hw.cmos_data[s->hw.cmos_index] = data; alarm_timer_update(s); break; case RTC_SECONDS: case RTC_MINUTES: case RTC_HOURS: case RTC_DAY_OF_WEEK: case RTC_DAY_OF_MONTH: case RTC_MONTH: case RTC_YEAR: /* if in set mode, just write the register */ if ( (s->hw.cmos_data[RTC_REG_B] & RTC_SET) ) s->hw.cmos_data[s->hw.cmos_index] = data; else { /* Fetch the current time and 
update just this field. */ s->current_tm = gmtime(get_localtime(d)); rtc_copy_date(s); s->hw.cmos_data[s->hw.cmos_index] = data; rtc_set_time(s); } alarm_timer_update(s); break; case RTC_REG_A: /* UIP bit is read only */ s->hw.cmos_data[RTC_REG_A] = (data & ~RTC_UIP) | (orig & RTC_UIP); if ( (data ^ orig) & ~RTC_UIP ) rtc_timer_update(s); break; case RTC_REG_B: if ( data & RTC_SET ) { /* set mode: reset UIP mode */ s->hw.cmos_data[RTC_REG_A] &= ~RTC_UIP; /* adjust cmos before stopping */ if (!(orig & RTC_SET)) { s->current_tm = gmtime(get_localtime(d)); rtc_copy_date(s); } } else { /* if disabling set mode, update the time */ if ( orig & RTC_SET ) rtc_set_time(s); } s->hw.cmos_data[RTC_REG_B] = data; /* * If the interrupt is already set when the interrupt becomes * enabled, raise an interrupt immediately. */ rtc_update_irq(s); if ( (data & RTC_PIE) && !(orig & RTC_PIE) ) rtc_timer_update(s); if ( (data ^ orig) & RTC_SET ) check_update_timer(s); if ( (data ^ orig) & (RTC_24H | RTC_DM_BINARY | RTC_SET) ) alarm_timer_update(s); break; case RTC_REG_C: case RTC_REG_D: /* cannot write to them */ break; } spin_unlock(&s->lock); return 1; } static inline int to_bcd(RTCState *s, int a) { if ( s->hw.cmos_data[RTC_REG_B] & RTC_DM_BINARY ) return a; else return ((a / 10) << 4) | (a % 10); } static inline int from_bcd(RTCState *s, int a) { if ( s->hw.cmos_data[RTC_REG_B] & RTC_DM_BINARY ) return a; else return ((a >> 4) * 10) + (a & 0x0f); } /* Hours in 12 hour mode are in 1-12 range, not 0-11. * So we need convert it before using it*/ static inline int convert_hour(RTCState *s, int raw) { int hour = from_bcd(s, raw & 0x7f); if (!(s->hw.cmos_data[RTC_REG_B] & RTC_24H)) { hour %= 12; if (raw & 0x80) hour += 12; } return hour; } static void rtc_set_time(RTCState *s) { struct tm *tm = &s->current_tm; struct domain *d = vrtc_domain(s); unsigned long before, after; /* XXX s_time_t */ ASSERT(spin_is_locked(&s->lock)); before = mktime(get_year(tm->tm_year), tm->tm_mon + 1, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec); tm->tm_sec = from_bcd(s, s->hw.cmos_data[RTC_SECONDS]); tm->tm_min = from_bcd(s, s->hw.cmos_data[RTC_MINUTES]); tm->tm_hour = convert_hour(s, s->hw.cmos_data[RTC_HOURS]); tm->tm_wday = from_bcd(s, s->hw.cmos_data[RTC_DAY_OF_WEEK]); tm->tm_mday = from_bcd(s, s->hw.cmos_data[RTC_DAY_OF_MONTH]); tm->tm_mon = from_bcd(s, s->hw.cmos_data[RTC_MONTH]) - 1; tm->tm_year = from_bcd(s, s->hw.cmos_data[RTC_YEAR]) + 100; after = mktime(get_year(tm->tm_year), tm->tm_mon + 1, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec); /* We use the guest's setting of the RTC to define the local-time * offset for this domain. */ d->time_offset_seconds += (after - before); update_domain_wallclock_time(d); /* Also tell qemu-dm about it so it will be remembered for next boot. */ send_timeoffset_req(after - before); } static void rtc_copy_date(RTCState *s) { const struct tm *tm = &s->current_tm; ASSERT(spin_is_locked(&s->lock)); s->hw.cmos_data[RTC_SECONDS] = to_bcd(s, tm->tm_sec); s->hw.cmos_data[RTC_MINUTES] = to_bcd(s, tm->tm_min); if ( s->hw.cmos_data[RTC_REG_B] & RTC_24H ) { /* 24 hour format */ s->hw.cmos_data[RTC_HOURS] = to_bcd(s, tm->tm_hour); } else { /* 12 hour format */ int h = (tm->tm_hour % 12) ? 
tm->tm_hour % 12 : 12; s->hw.cmos_data[RTC_HOURS] = to_bcd(s, h); if ( tm->tm_hour >= 12 ) s->hw.cmos_data[RTC_HOURS] |= 0x80; } s->hw.cmos_data[RTC_DAY_OF_WEEK] = to_bcd(s, tm->tm_wday); s->hw.cmos_data[RTC_DAY_OF_MONTH] = to_bcd(s, tm->tm_mday); s->hw.cmos_data[RTC_MONTH] = to_bcd(s, tm->tm_mon + 1); s->hw.cmos_data[RTC_YEAR] = to_bcd(s, tm->tm_year % 100); } static int update_in_progress(RTCState *s) { uint64_t guest_usec; struct domain *d = vrtc_domain(s); if (s->hw.cmos_data[RTC_REG_B] & RTC_SET) return 0; guest_usec = get_localtime_us(d); /* UIP bit will be set at last 244us of every second. */ if ((guest_usec % USEC_PER_SEC) >= (USEC_PER_SEC - 244)) return 1; return 0; } static uint32_t rtc_ioport_read(RTCState *s, uint32_t addr) { int ret; struct domain *d = vrtc_domain(s); if ( (addr & 1) == 0 ) return 0xff; spin_lock(&s->lock); switch ( s->hw.cmos_index ) { case RTC_SECONDS: case RTC_MINUTES: case RTC_HOURS: case RTC_DAY_OF_WEEK: case RTC_DAY_OF_MONTH: case RTC_MONTH: case RTC_YEAR: /* if not in set mode, adjust cmos before reading*/ if (!(s->hw.cmos_data[RTC_REG_B] & RTC_SET)) { s->current_tm = gmtime(get_localtime(d)); rtc_copy_date(s); } ret = s->hw.cmos_data[s->hw.cmos_index]; break; case RTC_REG_A: ret = s->hw.cmos_data[s->hw.cmos_index]; if ((s->use_timer == 0) && update_in_progress(s)) ret |= RTC_UIP; break; case RTC_REG_C: ret = s->hw.cmos_data[s->hw.cmos_index]; s->hw.cmos_data[RTC_REG_C] = 0x00; if ( (ret & RTC_IRQF) && !rtc_mode_is(s, no_ack) ) hvm_isa_irq_deassert(d, RTC_IRQ); rtc_update_irq(s); check_update_timer(s); alarm_timer_update(s); rtc_timer_update(s); break; default: ret = s->hw.cmos_data[s->hw.cmos_index]; break; } spin_unlock(&s->lock); return ret; } static int handle_rtc_io( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct RTCState *vrtc = vcpu_vrtc(current); if ( bytes != 1 ) { gdprintk(XENLOG_WARNING, "HVM_RTC bas access\n"); return X86EMUL_OKAY; } if ( dir == IOREQ_WRITE ) { if ( rtc_ioport_write(vrtc, port, (uint8_t)*val) ) return X86EMUL_OKAY; } else if ( vrtc->hw.cmos_index < RTC_CMOS_SIZE ) { *val = rtc_ioport_read(vrtc, port); return X86EMUL_OKAY; } return X86EMUL_UNHANDLEABLE; } void rtc_migrate_timers(struct vcpu *v) { RTCState *s = vcpu_vrtc(v); if ( v->vcpu_id == 0 ) { migrate_timer(&s->update_timer, v->processor);; migrate_timer(&s->update_timer2, v->processor);; migrate_timer(&s->alarm_timer, v->processor);; } } /* Save RTC hardware state */ static int rtc_save(struct domain *d, hvm_domain_context_t *h) { RTCState *s = domain_vrtc(d); int rc; spin_lock(&s->lock); rc = hvm_save_entry(RTC, 0, h, &s->hw); spin_unlock(&s->lock); return rc; } /* Reload the hardware state from a saved domain */ static int rtc_load(struct domain *d, hvm_domain_context_t *h) { RTCState *s = domain_vrtc(d); spin_lock(&s->lock); /* Restore the registers */ if ( hvm_load_entry(RTC, h, &s->hw) != 0 ) { spin_unlock(&s->lock); return -EINVAL; } /* Reset the wall-clock time. In normal running, this runs with host * time, so let's keep doing that. 
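 * get_localtime() already folds in the domain's time_offset_seconds, so the
 * CMOS registers rebuilt below reflect guest-local time.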
*/ s->current_tm = gmtime(get_localtime(d)); rtc_copy_date(s); /* Reset the periodic interrupt timer based on the registers */ rtc_timer_update(s); check_update_timer(s); alarm_timer_update(s); spin_unlock(&s->lock); return 0; } HVM_REGISTER_SAVE_RESTORE(RTC, rtc_save, rtc_load, 1, HVMSR_PER_DOM); void rtc_reset(struct domain *d) { RTCState *s = domain_vrtc(d); destroy_periodic_time(&s->pt); s->pt_code = 0; s->pt.source = PTSRC_isa; } void rtc_init(struct domain *d) { RTCState *s = domain_vrtc(d); spin_lock_init(&s->lock); init_timer(&s->update_timer, rtc_update_timer, s, smp_processor_id()); init_timer(&s->update_timer2, rtc_update_timer2, s, smp_processor_id()); init_timer(&s->alarm_timer, rtc_alarm_cb, s, smp_processor_id()); register_portio_handler(d, RTC_PORT(0), 2, handle_rtc_io); rtc_reset(d); spin_lock(&s->lock); s->hw.cmos_data[RTC_REG_A] = RTC_REF_CLCK_32KHZ | 6; /* ~1kHz */ s->hw.cmos_data[RTC_REG_B] = RTC_24H; s->hw.cmos_data[RTC_REG_C] = 0; s->hw.cmos_data[RTC_REG_D] = RTC_VRT; s->current_tm = gmtime(get_localtime(d)); s->start_time = NOW(); rtc_copy_date(s); check_update_timer(s); spin_unlock(&s->lock); } void rtc_deinit(struct domain *d) { RTCState *s = domain_vrtc(d); spin_barrier(&s->lock); destroy_periodic_time(&s->pt); kill_timer(&s->update_timer); kill_timer(&s->update_timer2); kill_timer(&s->alarm_timer); } void rtc_update_clock(struct domain *d) { RTCState *s = domain_vrtc(d); spin_lock(&s->lock); s->current_tm = gmtime(get_localtime(d)); spin_unlock(&s->lock); } xen-4.4.0/xen/arch/x86/hvm/irq.c0000664000175000017500000004762312307313555014344 0ustar smbsmb/****************************************************************************** * irq.c * * Interrupt distribution and delivery logic. * * Copyright (c) 2006, K A Fraser, XenSource Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
*/ #include #include #include #include #include #include #include #include #include /* Must be called with hvm_domain->irq_lock hold */ static void assert_gsi(struct domain *d, unsigned ioapic_gsi) { struct pirq *pirq = pirq_info(d, domain_emuirq_to_pirq(d, ioapic_gsi)); if ( hvm_domain_use_pirq(d, pirq) ) { send_guest_pirq(d, pirq); return; } vioapic_irq_positive_edge(d, ioapic_gsi); } static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq) { assert_gsi(d, ioapic_gsi); vpic_irq_positive_edge(d, pic_irq); } /* Must be called with hvm_domain->irq_lock hold */ static void deassert_irq(struct domain *d, unsigned isa_irq) { struct pirq *pirq = pirq_info(d, domain_emuirq_to_pirq(d, isa_irq)); if ( !hvm_domain_use_pirq(d, pirq) ) vpic_irq_negative_edge(d, isa_irq); } static void __hvm_pci_intx_assert( struct domain *d, unsigned int device, unsigned int intx) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; unsigned int gsi, link, isa_irq; ASSERT((device <= 31) && (intx <= 3)); if ( __test_and_set_bit(device*4 + intx, &hvm_irq->pci_intx.i) ) return; gsi = hvm_pci_intx_gsi(device, intx); if ( hvm_irq->gsi_assert_count[gsi]++ == 0 ) assert_gsi(d, gsi); link = hvm_pci_intx_link(device, intx); isa_irq = hvm_irq->pci_link.route[link]; if ( (hvm_irq->pci_link_assert_count[link]++ == 0) && isa_irq && (hvm_irq->gsi_assert_count[isa_irq]++ == 0) ) assert_irq(d, isa_irq, isa_irq); } void hvm_pci_intx_assert( struct domain *d, unsigned int device, unsigned int intx) { spin_lock(&d->arch.hvm_domain.irq_lock); __hvm_pci_intx_assert(d, device, intx); spin_unlock(&d->arch.hvm_domain.irq_lock); } static void __hvm_pci_intx_deassert( struct domain *d, unsigned int device, unsigned int intx) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; unsigned int gsi, link, isa_irq; ASSERT((device <= 31) && (intx <= 3)); if ( !__test_and_clear_bit(device*4 + intx, &hvm_irq->pci_intx.i) ) return; gsi = hvm_pci_intx_gsi(device, intx); --hvm_irq->gsi_assert_count[gsi]; link = hvm_pci_intx_link(device, intx); isa_irq = hvm_irq->pci_link.route[link]; if ( (--hvm_irq->pci_link_assert_count[link] == 0) && isa_irq && (--hvm_irq->gsi_assert_count[isa_irq] == 0) ) deassert_irq(d, isa_irq); } void hvm_pci_intx_deassert( struct domain *d, unsigned int device, unsigned int intx) { spin_lock(&d->arch.hvm_domain.irq_lock); __hvm_pci_intx_deassert(d, device, intx); spin_unlock(&d->arch.hvm_domain.irq_lock); } void hvm_isa_irq_assert( struct domain *d, unsigned int isa_irq) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq); ASSERT(isa_irq <= 15); spin_lock(&d->arch.hvm_domain.irq_lock); if ( !__test_and_set_bit(isa_irq, &hvm_irq->isa_irq.i) && (hvm_irq->gsi_assert_count[gsi]++ == 0) ) assert_irq(d, gsi, isa_irq); spin_unlock(&d->arch.hvm_domain.irq_lock); } void hvm_isa_irq_deassert( struct domain *d, unsigned int isa_irq) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq); ASSERT(isa_irq <= 15); spin_lock(&d->arch.hvm_domain.irq_lock); if ( __test_and_clear_bit(isa_irq, &hvm_irq->isa_irq.i) && (--hvm_irq->gsi_assert_count[gsi] == 0) ) deassert_irq(d, isa_irq); spin_unlock(&d->arch.hvm_domain.irq_lock); } static void hvm_set_callback_irq_level(struct vcpu *v) { struct domain *d = v->domain; struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; unsigned int gsi, pdev, pintx, asserted; ASSERT(v->vcpu_id == 0); spin_lock(&d->arch.hvm_domain.irq_lock); /* NB. Do not check the evtchn_upcall_mask. 
It is not used in HVM mode. */ asserted = !!vcpu_info(v, evtchn_upcall_pending); if ( hvm_irq->callback_via_asserted == asserted ) goto out; hvm_irq->callback_via_asserted = asserted; /* Callback status has changed. Update the callback via. */ switch ( hvm_irq->callback_via_type ) { case HVMIRQ_callback_gsi: gsi = hvm_irq->callback_via.gsi; if ( asserted && (hvm_irq->gsi_assert_count[gsi]++ == 0) ) { vioapic_irq_positive_edge(d, gsi); if ( gsi <= 15 ) vpic_irq_positive_edge(d, gsi); } else if ( !asserted && (--hvm_irq->gsi_assert_count[gsi] == 0) ) { if ( gsi <= 15 ) vpic_irq_negative_edge(d, gsi); } break; case HVMIRQ_callback_pci_intx: pdev = hvm_irq->callback_via.pci.dev; pintx = hvm_irq->callback_via.pci.intx; if ( asserted ) __hvm_pci_intx_assert(d, pdev, pintx); else __hvm_pci_intx_deassert(d, pdev, pintx); default: break; } out: spin_unlock(&d->arch.hvm_domain.irq_lock); } void hvm_maybe_deassert_evtchn_irq(void) { struct domain *d = current->domain; struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; if ( hvm_irq->callback_via_asserted && !vcpu_info(d->vcpu[0], evtchn_upcall_pending) ) hvm_set_callback_irq_level(d->vcpu[0]); } void hvm_assert_evtchn_irq(struct vcpu *v) { if ( unlikely(in_irq() || !local_irq_is_enabled()) ) { tasklet_schedule(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet); return; } if ( is_hvm_pv_evtchn_vcpu(v) ) vcpu_kick(v); else if ( v->vcpu_id == 0 ) hvm_set_callback_irq_level(v); } void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; u8 old_isa_irq; int i; ASSERT((link <= 3) && (isa_irq <= 15)); spin_lock(&d->arch.hvm_domain.irq_lock); old_isa_irq = hvm_irq->pci_link.route[link]; if ( old_isa_irq == isa_irq ) goto out; hvm_irq->pci_link.route[link] = isa_irq; /* PCI pass-through fixup. 
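 * Keep dpci->isairq_map in sync with the new link routing for any
 * passed-through devices using this link.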
*/ if ( hvm_irq->dpci ) { if ( old_isa_irq ) clear_bit(old_isa_irq, &hvm_irq->dpci->isairq_map); for ( i = 0; i < NR_LINK; i++ ) if ( hvm_irq->dpci->link_cnt[i] && hvm_irq->pci_link.route[i] ) set_bit(hvm_irq->pci_link.route[i], &hvm_irq->dpci->isairq_map); } if ( hvm_irq->pci_link_assert_count[link] == 0 ) goto out; if ( old_isa_irq && (--hvm_irq->gsi_assert_count[old_isa_irq] == 0) ) vpic_irq_negative_edge(d, old_isa_irq); if ( isa_irq && (hvm_irq->gsi_assert_count[isa_irq]++ == 0) ) { vioapic_irq_positive_edge(d, isa_irq); vpic_irq_positive_edge(d, isa_irq); } out: spin_unlock(&d->arch.hvm_domain.irq_lock); dprintk(XENLOG_G_INFO, "Dom%u PCI link %u changed %u -> %u\n", d->domain_id, link, old_isa_irq, isa_irq); } void hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data) { uint32_t tmp = (uint32_t) addr; uint8_t dest = (tmp & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; uint8_t dest_mode = !!(tmp & MSI_ADDR_DESTMODE_MASK); uint8_t delivery_mode = (data & MSI_DATA_DELIVERY_MODE_MASK) >> MSI_DATA_DELIVERY_MODE_SHIFT; uint8_t trig_mode = (data & MSI_DATA_TRIGGER_MASK) >> MSI_DATA_TRIGGER_SHIFT; uint8_t vector = data & MSI_DATA_VECTOR_MASK; if ( !vector ) { int pirq = ((addr >> 32) & 0xffffff00) | ((addr >> 12) & 0xff); if ( pirq > 0 ) { struct pirq *info = pirq_info(d, pirq); /* if it is the first time, allocate the pirq */ if (info->arch.hvm.emuirq == IRQ_UNBOUND) { spin_lock(&d->event_lock); map_domain_emuirq_pirq(d, pirq, IRQ_MSI_EMU); spin_unlock(&d->event_lock); } else if (info->arch.hvm.emuirq != IRQ_MSI_EMU) { printk("%s: pirq %d does not correspond to an emulated MSI\n", __func__, pirq); return; } send_guest_pirq(d, info); return; } else { printk("%s: error getting pirq from MSI: pirq = %d\n", __func__, pirq); } } vmsi_deliver(d, vector, dest, dest_mode, delivery_mode, trig_mode); } void hvm_set_callback_via(struct domain *d, uint64_t via) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; unsigned int gsi=0, pdev=0, pintx=0; uint8_t via_type; via_type = (uint8_t)(via >> 56) + 1; if ( ((via_type == HVMIRQ_callback_gsi) && (via == 0)) || (via_type > HVMIRQ_callback_vector) ) via_type = HVMIRQ_callback_none; spin_lock(&d->arch.hvm_domain.irq_lock); /* Tear down old callback via. */ if ( hvm_irq->callback_via_asserted ) { switch ( hvm_irq->callback_via_type ) { case HVMIRQ_callback_gsi: gsi = hvm_irq->callback_via.gsi; if ( (--hvm_irq->gsi_assert_count[gsi] == 0) && (gsi <= 15) ) vpic_irq_negative_edge(d, gsi); break; case HVMIRQ_callback_pci_intx: pdev = hvm_irq->callback_via.pci.dev; pintx = hvm_irq->callback_via.pci.intx; __hvm_pci_intx_deassert(d, pdev, pintx); break; default: break; } } /* Set up new callback via. 
*/ switch ( hvm_irq->callback_via_type = via_type ) { case HVMIRQ_callback_gsi: gsi = hvm_irq->callback_via.gsi = (uint8_t)via; if ( (gsi == 0) || (gsi >= ARRAY_SIZE(hvm_irq->gsi_assert_count)) ) hvm_irq->callback_via_type = HVMIRQ_callback_none; else if ( hvm_irq->callback_via_asserted && (hvm_irq->gsi_assert_count[gsi]++ == 0) ) { vioapic_irq_positive_edge(d, gsi); if ( gsi <= 15 ) vpic_irq_positive_edge(d, gsi); } break; case HVMIRQ_callback_pci_intx: pdev = hvm_irq->callback_via.pci.dev = (uint8_t)(via >> 11) & 31; pintx = hvm_irq->callback_via.pci.intx = (uint8_t)via & 3; if ( hvm_irq->callback_via_asserted ) __hvm_pci_intx_assert(d, pdev, pintx); break; case HVMIRQ_callback_vector: hvm_irq->callback_via.vector = (uint8_t)via; break; default: break; } spin_unlock(&d->arch.hvm_domain.irq_lock); dprintk(XENLOG_G_INFO, "Dom%u callback via changed to ", d->domain_id); switch ( via_type ) { case HVMIRQ_callback_gsi: printk("GSI %u\n", gsi); break; case HVMIRQ_callback_pci_intx: printk("PCI INTx Dev 0x%02x Int%c\n", pdev, 'A' + pintx); break; case HVMIRQ_callback_vector: printk("Direct Vector 0x%02x\n", (uint8_t)via); break; default: printk("None\n"); break; } } struct hvm_intack hvm_vcpu_has_pending_irq(struct vcpu *v) { struct hvm_domain *plat = &v->domain->arch.hvm_domain; int vector; if ( unlikely(v->nmi_pending) ) return hvm_intack_nmi; if ( unlikely(v->mce_pending) ) return hvm_intack_mce; if ( (plat->irq.callback_via_type == HVMIRQ_callback_vector) && vcpu_info(v, evtchn_upcall_pending) ) return hvm_intack_vector(plat->irq.callback_via.vector); if ( is_pvh_vcpu(v) ) return hvm_intack_none; if ( vlapic_accept_pic_intr(v) && plat->vpic[0].int_output ) return hvm_intack_pic(0); vector = vlapic_has_pending_irq(v); if ( vector != -1 ) return hvm_intack_lapic(vector); return hvm_intack_none; } struct hvm_intack hvm_vcpu_ack_pending_irq( struct vcpu *v, struct hvm_intack intack) { int vector; switch ( intack.source ) { case hvm_intsrc_nmi: if ( !test_and_clear_bool(v->nmi_pending) ) intack = hvm_intack_none; break; case hvm_intsrc_mce: if ( !test_and_clear_bool(v->mce_pending) ) intack = hvm_intack_none; break; case hvm_intsrc_pic: if ( (vector = vpic_ack_pending_irq(v)) == -1 ) intack = hvm_intack_none; else intack.vector = (uint8_t)vector; break; case hvm_intsrc_lapic: if ( !vlapic_ack_pending_irq(v, intack.vector, 0) ) intack = hvm_intack_none; break; case hvm_intsrc_vector: break; default: intack = hvm_intack_none; break; } return intack; } int hvm_local_events_need_delivery(struct vcpu *v) { struct hvm_intack intack = hvm_vcpu_has_pending_irq(v); if ( likely(intack.source == hvm_intsrc_none) ) return 0; return !hvm_interrupt_blocked(v, intack); } static void irq_dump(struct domain *d) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; int i; printk("Domain %d:\n", d->domain_id); printk("PCI 0x%16.16"PRIx64"%16.16"PRIx64 " ISA 0x%8.8"PRIx32" ROUTE %u %u %u %u\n", hvm_irq->pci_intx.pad[0], hvm_irq->pci_intx.pad[1], (uint32_t) hvm_irq->isa_irq.pad[0], hvm_irq->pci_link.route[0], hvm_irq->pci_link.route[1], hvm_irq->pci_link.route[2], hvm_irq->pci_link.route[3]); for ( i = 0 ; i < VIOAPIC_NUM_PINS; i += 8 ) printk("GSI [%x - %x] %2.2"PRIu8" %2.2"PRIu8" %2.2"PRIu8" %2.2"PRIu8 " %2.2"PRIu8" %2.2"PRIu8" %2.2"PRIu8" %2.2"PRIu8"\n", i, i+7, hvm_irq->gsi_assert_count[i+0], hvm_irq->gsi_assert_count[i+1], hvm_irq->gsi_assert_count[i+2], hvm_irq->gsi_assert_count[i+3], hvm_irq->gsi_assert_count[i+4], hvm_irq->gsi_assert_count[i+5], hvm_irq->gsi_assert_count[i+6], hvm_irq->gsi_assert_count[i+7]); 
printk("Link %2.2"PRIu8" %2.2"PRIu8" %2.2"PRIu8" %2.2"PRIu8"\n", hvm_irq->pci_link_assert_count[0], hvm_irq->pci_link_assert_count[1], hvm_irq->pci_link_assert_count[2], hvm_irq->pci_link_assert_count[3]); printk("Callback via %i:%#"PRIx32",%s asserted\n", hvm_irq->callback_via_type, hvm_irq->callback_via.gsi, hvm_irq->callback_via_asserted ? "" : " not"); } static void dump_irq_info(unsigned char key) { struct domain *d; printk("'%c' pressed -> dumping HVM irq info\n", key); rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) if ( is_hvm_domain(d) ) irq_dump(d); rcu_read_unlock(&domlist_read_lock); } static struct keyhandler dump_irq_info_keyhandler = { .diagnostic = 1, .u.fn = dump_irq_info, .desc = "dump HVM irq info" }; static int __init dump_irq_info_key_init(void) { register_keyhandler('I', &dump_irq_info_keyhandler); return 0; } __initcall(dump_irq_info_key_init); static int irq_save_pci(struct domain *d, hvm_domain_context_t *h) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; unsigned int asserted, pdev, pintx; int rc; spin_lock(&d->arch.hvm_domain.irq_lock); pdev = hvm_irq->callback_via.pci.dev; pintx = hvm_irq->callback_via.pci.intx; asserted = (hvm_irq->callback_via_asserted && (hvm_irq->callback_via_type == HVMIRQ_callback_pci_intx)); /* * Deassert virtual interrupt via PCI INTx line. The virtual interrupt * status is not save/restored, so the INTx line must be deasserted in * the restore context. */ if ( asserted ) __hvm_pci_intx_deassert(d, pdev, pintx); /* Save PCI IRQ lines */ rc = hvm_save_entry(PCI_IRQ, 0, h, &hvm_irq->pci_intx); if ( asserted ) __hvm_pci_intx_assert(d, pdev, pintx); spin_unlock(&d->arch.hvm_domain.irq_lock); return rc; } static int irq_save_isa(struct domain *d, hvm_domain_context_t *h) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; /* Save ISA IRQ lines */ return ( hvm_save_entry(ISA_IRQ, 0, h, &hvm_irq->isa_irq) ); } static int irq_save_link(struct domain *d, hvm_domain_context_t *h) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; /* Save PCI-ISA link state */ return ( hvm_save_entry(PCI_LINK, 0, h, &hvm_irq->pci_link) ); } static int irq_load_pci(struct domain *d, hvm_domain_context_t *h) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; int link, dev, intx, gsi; /* Load the PCI IRQ lines */ if ( hvm_load_entry(PCI_IRQ, h, &hvm_irq->pci_intx) != 0 ) return -EINVAL; /* Clear the PCI link assert counts */ for ( link = 0; link < 4; link++ ) hvm_irq->pci_link_assert_count[link] = 0; /* Clear the GSI link assert counts */ for ( gsi = 0; gsi < VIOAPIC_NUM_PINS; gsi++ ) hvm_irq->gsi_assert_count[gsi] = 0; /* Recalculate the counts from the IRQ line state */ for ( dev = 0; dev < 32; dev++ ) for ( intx = 0; intx < 4; intx++ ) if ( test_bit(dev*4 + intx, &hvm_irq->pci_intx.i) ) { /* Direct GSI assert */ gsi = hvm_pci_intx_gsi(dev, intx); hvm_irq->gsi_assert_count[gsi]++; /* PCI-ISA bridge assert */ link = hvm_pci_intx_link(dev, intx); hvm_irq->pci_link_assert_count[link]++; } return 0; } static int irq_load_isa(struct domain *d, hvm_domain_context_t *h) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; int irq; /* Load the ISA IRQ lines */ if ( hvm_load_entry(ISA_IRQ, h, &hvm_irq->isa_irq) != 0 ) return -EINVAL; /* Adjust the GSI assert counts for the ISA IRQ line state. * This relies on the PCI IRQ state being loaded first. 
*/ for ( irq = 0; platform_legacy_irq(irq); irq++ ) if ( test_bit(irq, &hvm_irq->isa_irq.i) ) hvm_irq->gsi_assert_count[hvm_isa_irq_to_gsi(irq)]++; return 0; } static int irq_load_link(struct domain *d, hvm_domain_context_t *h) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; int link, gsi; /* Load the PCI-ISA IRQ link routing table */ if ( hvm_load_entry(PCI_LINK, h, &hvm_irq->pci_link) != 0 ) return -EINVAL; /* Sanity check */ for ( link = 0; link < 4; link++ ) if ( hvm_irq->pci_link.route[link] > 15 ) { gdprintk(XENLOG_ERR, "HVM restore: PCI-ISA link %u out of range (%u)\n", link, hvm_irq->pci_link.route[link]); return -EINVAL; } /* Adjust the GSI assert counts for the link outputs. * This relies on the PCI and ISA IRQ state being loaded first */ for ( link = 0; link < 4; link++ ) { if ( hvm_irq->pci_link_assert_count[link] != 0 ) { gsi = hvm_irq->pci_link.route[link]; if ( gsi != 0 ) hvm_irq->gsi_assert_count[gsi]++; } } return 0; } HVM_REGISTER_SAVE_RESTORE(PCI_IRQ, irq_save_pci, irq_load_pci, 1, HVMSR_PER_DOM); HVM_REGISTER_SAVE_RESTORE(ISA_IRQ, irq_save_isa, irq_load_isa, 1, HVMSR_PER_DOM); HVM_REGISTER_SAVE_RESTORE(PCI_LINK, irq_save_link, irq_load_link, 1, HVMSR_PER_DOM); xen-4.4.0/xen/arch/x86/hvm/vmsi.c0000664000175000017500000003467212307313555014527 0ustar smbsmb/* * Copyright (C) 2001 MandrakeSoft S.A. * * MandrakeSoft S.A. * 43, rue d'Aboukir * 75002 Paris - France * http://www.linux-mandrake.com/ * http://www.mandrakesoft.com/ * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Support for virtual MSI logic * Will be merged it with virtual IOAPIC logic, since most is the same */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void vmsi_inj_irq( struct domain *d, struct vlapic *target, uint8_t vector, uint8_t trig_mode, uint8_t delivery_mode) { HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "vmsi_inj_irq " "irq %d trig %d delive mode %d\n", vector, trig_mode, delivery_mode); switch ( delivery_mode ) { case dest_Fixed: case dest_LowestPrio: vlapic_set_irq(target, vector, trig_mode); break; default: gdprintk(XENLOG_WARNING, "error delivery mode %d\n", delivery_mode); break; } } int vmsi_deliver( struct domain *d, int vector, uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode, uint8_t trig_mode) { struct vlapic *target; struct vcpu *v; switch ( delivery_mode ) { case dest_LowestPrio: { target = vlapic_lowest_prio(d, NULL, 0, dest, dest_mode); if ( target != NULL ) vmsi_inj_irq(d, target, vector, trig_mode, delivery_mode); else HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "null round robin: " "vector=%x delivery_mode=%x\n", vector, dest_LowestPrio); break; } case dest_Fixed: case dest_ExtINT: { for_each_vcpu ( d, v ) if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) ) vmsi_inj_irq(d, vcpu_vlapic(v), vector, trig_mode, delivery_mode); break; } case dest_SMI: case dest_NMI: case dest_INIT: case dest__reserved_2: default: gdprintk(XENLOG_WARNING, "Unsupported delivery mode %d\n", delivery_mode); break; } return 1; } void vmsi_deliver_pirq(struct domain *d, const struct hvm_pirq_dpci *pirq_dpci) { uint32_t flags = pirq_dpci->gmsi.gflags; int vector = pirq_dpci->gmsi.gvec; uint8_t dest = (uint8_t)flags; uint8_t dest_mode = !!(flags & VMSI_DM_MASK); uint8_t delivery_mode = (flags & VMSI_DELIV_MASK) >> GFLAGS_SHIFT_DELIV_MODE; uint8_t trig_mode = (flags&VMSI_TRIG_MODE) >> GFLAGS_SHIFT_TRG_MODE; HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "msi: dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x\n", dest, dest_mode, delivery_mode, vector, trig_mode); ASSERT(pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI); vmsi_deliver(d, vector, dest, dest_mode, delivery_mode, trig_mode); } /* Return value, -1 : multi-dests, non-negative value: dest_vcpu_id */ int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode) { int dest_vcpu_id = -1, w = 0; struct vcpu *v; if ( d->max_vcpus == 1 ) return 0; for_each_vcpu ( d, v ) { if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) ) { w++; dest_vcpu_id = v->vcpu_id; } } if ( w > 1 ) return -1; return dest_vcpu_id; } /* MSI-X mask bit hypervisor interception */ struct msixtbl_entry { struct list_head list; atomic_t refcnt; /* how many bind_pt_irq called for the device */ /* TODO: resolve the potential race by destruction of pdev */ struct pci_dev *pdev; unsigned long gtable; /* gpa of msix table */ unsigned long table_len; unsigned long table_flags[BITS_TO_LONGS(MAX_MSIX_TABLE_ENTRIES)]; #define MAX_MSIX_ACC_ENTRIES 3 struct { uint32_t msi_ad[3]; /* Shadow of address low, high and data */ } gentries[MAX_MSIX_ACC_ENTRIES]; struct rcu_head rcu; }; static DEFINE_RCU_READ_LOCK(msixtbl_rcu_lock); static struct msixtbl_entry *msixtbl_find_entry( struct vcpu *v, unsigned long addr) { struct msixtbl_entry *entry; struct domain *d 
= v->domain; list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list ) if ( addr >= entry->gtable && addr < entry->gtable + entry->table_len ) return entry; return NULL; } static struct msi_desc *virt_to_msi_desc(struct pci_dev *dev, void *virt) { struct msi_desc *desc; list_for_each_entry( desc, &dev->msi_list, list ) if ( desc->msi_attrib.type == PCI_CAP_ID_MSIX && virt >= desc->mask_base && virt < desc->mask_base + PCI_MSIX_ENTRY_SIZE ) return desc; return NULL; } static void __iomem *msixtbl_addr_to_virt( struct msixtbl_entry *entry, unsigned long addr) { unsigned int idx, nr_page; if ( !entry || !entry->pdev ) return NULL; nr_page = (addr >> PAGE_SHIFT) - (entry->gtable >> PAGE_SHIFT); idx = entry->pdev->msix->table_idx[nr_page]; if ( !idx ) return NULL; return (void *)(fix_to_virt(idx) + (addr & ((1UL << PAGE_SHIFT) - 1))); } static int msixtbl_read( struct vcpu *v, unsigned long address, unsigned long len, unsigned long *pval) { unsigned long offset; struct msixtbl_entry *entry; void *virt; unsigned int nr_entry, index; int r = X86EMUL_UNHANDLEABLE; if ( len != 4 || (address & 3) ) return r; rcu_read_lock(&msixtbl_rcu_lock); entry = msixtbl_find_entry(v, address); offset = address & (PCI_MSIX_ENTRY_SIZE - 1); if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET ) { nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; if ( nr_entry >= MAX_MSIX_ACC_ENTRIES ) goto out; index = offset / sizeof(uint32_t); *pval = entry->gentries[nr_entry].msi_ad[index]; } else { virt = msixtbl_addr_to_virt(entry, address); if ( !virt ) goto out; *pval = readl(virt); } r = X86EMUL_OKAY; out: rcu_read_unlock(&msixtbl_rcu_lock); return r; } static int msixtbl_write(struct vcpu *v, unsigned long address, unsigned long len, unsigned long val) { unsigned long offset; struct msixtbl_entry *entry; const struct msi_desc *msi_desc; void *virt; unsigned int nr_entry, index; int r = X86EMUL_UNHANDLEABLE; unsigned long flags, orig; struct irq_desc *desc; if ( len != 4 || (address & 3) ) return r; rcu_read_lock(&msixtbl_rcu_lock); entry = msixtbl_find_entry(v, address); nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE; offset = address & (PCI_MSIX_ENTRY_SIZE - 1); if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) { if ( nr_entry < MAX_MSIX_ACC_ENTRIES ) { index = offset / sizeof(uint32_t); entry->gentries[nr_entry].msi_ad[index] = val; } set_bit(nr_entry, &entry->table_flags); goto out; } /* exit to device model if address/data has been modified */ if ( test_and_clear_bit(nr_entry, &entry->table_flags) ) { if ( !(val & PCI_MSIX_VECTOR_BITMASK) ) v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address; goto out; } virt = msixtbl_addr_to_virt(entry, address); if ( !virt ) goto out; msi_desc = virt_to_msi_desc(entry->pdev, virt); if ( !msi_desc || msi_desc->irq < 0 ) goto out; desc = irq_to_desc(msi_desc->irq); if ( !desc ) goto out; spin_lock_irqsave(&desc->lock, flags); if ( !desc->msi_desc ) goto unlock; ASSERT(msi_desc == desc->msi_desc); orig = readl(virt); /* * Do not allow guest to modify MSI-X control bit if it is masked * by Xen. We'll only handle the case where Xen thinks that * bit is unmasked, but hardware has silently masked the bit * (in case of SR-IOV VF reset, etc). On the other hand, if Xen * thinks that the bit is masked, but it's really not, * we log a warning. 
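 * When Xen believes the bit is unmasked, the guest's write is forwarded
 * to hardware below, with the reserved bits of the vector control word
 * taken from the value currently present there.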
*/ if ( msi_desc->msi_attrib.masked ) { if ( !(orig & PCI_MSIX_VECTOR_BITMASK) ) printk(XENLOG_WARNING "MSI-X control bit is unmasked when" " it is expected to be masked [%04x:%02x:%02x.%u]\n", entry->pdev->seg, entry->pdev->bus, PCI_SLOT(entry->pdev->devfn), PCI_FUNC(entry->pdev->devfn)); goto unlock; } /* * The mask bit is the only defined bit in the word. But we * ought to preserve the reserved bits. Clearing the reserved * bits can result in undefined behaviour (see PCI Local Bus * Specification revision 2.3). */ val &= PCI_MSIX_VECTOR_BITMASK; val |= (orig & ~PCI_MSIX_VECTOR_BITMASK); writel(val, virt); unlock: spin_unlock_irqrestore(&desc->lock, flags); r = X86EMUL_OKAY; out: rcu_read_unlock(&msixtbl_rcu_lock); return r; } static int msixtbl_range(struct vcpu *v, unsigned long addr) { struct msixtbl_entry *entry; void *virt; rcu_read_lock(&msixtbl_rcu_lock); entry = msixtbl_find_entry(v, addr); virt = msixtbl_addr_to_virt(entry, addr); rcu_read_unlock(&msixtbl_rcu_lock); return !!virt; } const struct hvm_mmio_handler msixtbl_mmio_handler = { .check_handler = msixtbl_range, .read_handler = msixtbl_read, .write_handler = msixtbl_write }; static void add_msixtbl_entry(struct domain *d, struct pci_dev *pdev, uint64_t gtable, struct msixtbl_entry *entry) { u32 len; memset(entry, 0, sizeof(struct msixtbl_entry)); INIT_LIST_HEAD(&entry->list); INIT_RCU_HEAD(&entry->rcu); atomic_set(&entry->refcnt, 0); len = pci_msix_get_table_len(pdev); entry->table_len = len; entry->pdev = pdev; entry->gtable = (unsigned long) gtable; list_add_rcu(&entry->list, &d->arch.hvm_domain.msixtbl_list); } static void free_msixtbl_entry(struct rcu_head *rcu) { struct msixtbl_entry *entry; entry = container_of (rcu, struct msixtbl_entry, rcu); xfree(entry); } static void del_msixtbl_entry(struct msixtbl_entry *entry) { list_del_rcu(&entry->list); call_rcu(&entry->rcu, free_msixtbl_entry); } int msixtbl_pt_register(struct domain *d, struct pirq *pirq, uint64_t gtable) { struct irq_desc *irq_desc; struct msi_desc *msi_desc; struct pci_dev *pdev; struct msixtbl_entry *entry, *new_entry; int r = -EINVAL; ASSERT(spin_is_locked(&pcidevs_lock)); ASSERT(spin_is_locked(&d->event_lock)); /* * xmalloc() with irq_disabled causes the failure of check_lock() * for xenpool->lock. So we allocate an entry beforehand. 
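 * If the pdev turns out to already have an entry on the domain's list,
 * the pre-allocated one is simply freed again on the way out.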
*/ new_entry = xmalloc(struct msixtbl_entry); if ( !new_entry ) return -ENOMEM; irq_desc = pirq_spin_lock_irq_desc(pirq, NULL); if ( !irq_desc ) { xfree(new_entry); return r; } if ( !irq_desc->msi_desc ) goto out; msi_desc = irq_desc->msi_desc; if ( !msi_desc ) goto out; pdev = msi_desc->dev; spin_lock(&d->arch.hvm_domain.msixtbl_list_lock); list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list ) if ( pdev == entry->pdev ) goto found; entry = new_entry; new_entry = NULL; add_msixtbl_entry(d, pdev, gtable, entry); found: atomic_inc(&entry->refcnt); spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock); r = 0; out: spin_unlock_irq(&irq_desc->lock); xfree(new_entry); return r; } void msixtbl_pt_unregister(struct domain *d, struct pirq *pirq) { struct irq_desc *irq_desc; struct msi_desc *msi_desc; struct pci_dev *pdev; struct msixtbl_entry *entry; ASSERT(spin_is_locked(&pcidevs_lock)); ASSERT(spin_is_locked(&d->event_lock)); irq_desc = pirq_spin_lock_irq_desc(pirq, NULL); if ( !irq_desc ) return; if ( !irq_desc->msi_desc ) goto out; msi_desc = irq_desc->msi_desc; if ( !msi_desc ) goto out; pdev = msi_desc->dev; spin_lock(&d->arch.hvm_domain.msixtbl_list_lock); list_for_each_entry( entry, &d->arch.hvm_domain.msixtbl_list, list ) if ( pdev == entry->pdev ) goto found; spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock); out: spin_unlock_irq(&irq_desc->lock); return; found: if ( !atomic_dec_and_test(&entry->refcnt) ) del_msixtbl_entry(entry); spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock); spin_unlock_irq(&irq_desc->lock); } void msixtbl_pt_cleanup(struct domain *d) { struct msixtbl_entry *entry, *temp; unsigned long flags; /* msixtbl_list_lock must be acquired with irq_disabled for check_lock() */ local_irq_save(flags); spin_lock(&d->arch.hvm_domain.msixtbl_list_lock); list_for_each_entry_safe( entry, temp, &d->arch.hvm_domain.msixtbl_list, list ) del_msixtbl_entry(entry); spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock); local_irq_restore(flags); } void msix_write_completion(struct vcpu *v) { unsigned long ctrl_address = v->arch.hvm_vcpu.hvm_io.msix_unmask_address; if ( !ctrl_address ) return; v->arch.hvm_vcpu.hvm_io.msix_unmask_address = 0; if ( msixtbl_write(v, ctrl_address, 4, 0) != X86EMUL_OKAY ) gdprintk(XENLOG_WARNING, "MSI-X write completion failure\n"); } xen-4.4.0/xen/arch/x86/hvm/vpmu.c0000664000175000017500000001625112307313555014531 0ustar smbsmb/* * vpmu.c: PMU virtualization for HVM domain. * * Copyright (c) 2007, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Author: Haitao Shan */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * "vpmu" : vpmu generally enabled * "vpmu=off" : vpmu generally disabled * "vpmu=bts" : vpmu enabled and Intel BTS feature switched on. 
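 * Unrecognised values are reported by parse_vpmu_param() below and leave
 * the vPMU disabled.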
*/ static unsigned int __read_mostly opt_vpmu_enabled; static void parse_vpmu_param(char *s); custom_param("vpmu", parse_vpmu_param); static DEFINE_PER_CPU(struct vcpu *, last_vcpu); static void __init parse_vpmu_param(char *s) { switch ( parse_bool(s) ) { case 0: break; default: if ( !strcmp(s, "bts") ) opt_vpmu_enabled |= VPMU_BOOT_BTS; else if ( *s ) { printk("VPMU: unknown flag: %s - vpmu disabled!\n", s); break; } /* fall through */ case 1: opt_vpmu_enabled |= VPMU_BOOT_ENABLED; break; } } int vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content) { struct vpmu_struct *vpmu = vcpu_vpmu(current); if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_wrmsr ) return vpmu->arch_vpmu_ops->do_wrmsr(msr, msr_content); return 0; } int vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content) { struct vpmu_struct *vpmu = vcpu_vpmu(current); if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_rdmsr ) return vpmu->arch_vpmu_ops->do_rdmsr(msr, msr_content); return 0; } int vpmu_do_interrupt(struct cpu_user_regs *regs) { struct vcpu *v = current; struct vpmu_struct *vpmu = vcpu_vpmu(v); if ( vpmu->arch_vpmu_ops ) { struct vlapic *vlapic = vcpu_vlapic(v); u32 vlapic_lvtpc; unsigned char int_vec; if ( !vpmu->arch_vpmu_ops->do_interrupt(regs) ) return 0; if ( !is_vlapic_lvtpc_enabled(vlapic) ) return 1; vlapic_lvtpc = vlapic_get_reg(vlapic, APIC_LVTPC); int_vec = vlapic_lvtpc & APIC_VECTOR_MASK; if ( GET_APIC_DELIVERY_MODE(vlapic_lvtpc) == APIC_MODE_FIXED ) vlapic_set_irq(vcpu_vlapic(v), int_vec, 0); else v->nmi_pending = 1; return 1; } return 0; } void vpmu_do_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { struct vpmu_struct *vpmu = vcpu_vpmu(current); if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_cpuid ) vpmu->arch_vpmu_ops->do_cpuid(input, eax, ebx, ecx, edx); } static void vpmu_save_force(void *arg) { struct vcpu *v = (struct vcpu *)arg; struct vpmu_struct *vpmu = vcpu_vpmu(v); if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) return; if ( vpmu->arch_vpmu_ops ) (void)vpmu->arch_vpmu_ops->arch_vpmu_save(v); vpmu_reset(vpmu, VPMU_CONTEXT_SAVE); per_cpu(last_vcpu, smp_processor_id()) = NULL; } void vpmu_save(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); int pcpu = smp_processor_id(); if ( !(vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) && vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED)) ) return; vpmu->last_pcpu = pcpu; per_cpu(last_vcpu, pcpu) = v; if ( vpmu->arch_vpmu_ops ) if ( vpmu->arch_vpmu_ops->arch_vpmu_save(v) ) vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED); } void vpmu_load(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); int pcpu = smp_processor_id(); struct vcpu *prev = NULL; if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) ) return; /* First time this VCPU is running here */ if ( vpmu->last_pcpu != pcpu ) { /* * Get the context from last pcpu that we ran on. Note that if another * VCPU is running there it must have saved this VPCU's context before * startig to run (see below). * There should be no race since remote pcpu will disable interrupts * before saving the context. 
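 * vpmu_save_force() runs on that remote pcpu via an IPI; it saves the
 * context and clears that pcpu's last_vcpu pointer.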
*/ if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) { vpmu_set(vpmu, VPMU_CONTEXT_SAVE); on_selected_cpus(cpumask_of(vpmu->last_pcpu), vpmu_save_force, (void *)v, 1); vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); } } /* Prevent forced context save from remote CPU */ local_irq_disable(); prev = per_cpu(last_vcpu, pcpu); if ( prev != v && prev ) { vpmu = vcpu_vpmu(prev); /* Someone ran here before us */ vpmu_set(vpmu, VPMU_CONTEXT_SAVE); vpmu_save_force(prev); vpmu_reset(vpmu, VPMU_CONTEXT_LOADED); vpmu = vcpu_vpmu(v); } local_irq_enable(); /* Only when PMU is counting, we load PMU context immediately. */ if ( !vpmu_is_set(vpmu, VPMU_RUNNING) ) return; if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_load ) { apic_write_around(APIC_LVTPC, vpmu->hw_lapic_lvtpc); vpmu->arch_vpmu_ops->arch_vpmu_load(v); } vpmu_set(vpmu, VPMU_CONTEXT_LOADED); } void vpmu_initialise(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); uint8_t vendor = current_cpu_data.x86_vendor; if ( vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) ) vpmu_destroy(v); vpmu_clear(vpmu); vpmu->context = NULL; switch ( vendor ) { case X86_VENDOR_AMD: if ( svm_vpmu_initialise(v, opt_vpmu_enabled) != 0 ) opt_vpmu_enabled = 0; break; case X86_VENDOR_INTEL: if ( vmx_vpmu_initialise(v, opt_vpmu_enabled) != 0 ) opt_vpmu_enabled = 0; break; default: printk("VPMU: Initialization failed. " "Unknown CPU vendor %d\n", vendor); opt_vpmu_enabled = 0; break; } } void vpmu_destroy(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_destroy ) vpmu->arch_vpmu_ops->arch_vpmu_destroy(v); } /* Dump some vpmu informations on console. Used in keyhandler dump_domains(). */ void vpmu_dump(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_dump ) vpmu->arch_vpmu_ops->arch_vpmu_dump(v); } xen-4.4.0/xen/arch/x86/hvm/vioapic.c0000664000175000017500000003027612307313555015177 0ustar smbsmb/* * Copyright (C) 2001 MandrakeSoft S.A. * * MandrakeSoft S.A. * 43, rue d'Aboukir * 75002 Paris - France * http://www.linux-mandrake.com/ * http://www.mandrakesoft.com/ * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Yunhong Jiang * Ported to xen by using virtual IRQ line. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* HACK: Route IRQ0 only to VCPU0 to prevent time jumps. 
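 * With IRQ0_SPECIAL_ROUTING defined, vioapic_deliver() below forces both
 * lowest-priority and fixed delivery of GSI 0 to VCPU 0 while the PIT's
 * channel 0 timer is active.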
*/ #define IRQ0_SPECIAL_ROUTING 1 static void vioapic_deliver(struct hvm_hw_vioapic *vioapic, int irq); static unsigned long vioapic_read_indirect(struct hvm_hw_vioapic *vioapic, unsigned long addr, unsigned long length) { unsigned long result = 0; switch ( vioapic->ioregsel ) { case VIOAPIC_REG_VERSION: result = ((((VIOAPIC_NUM_PINS-1) & 0xff) << 16) | (VIOAPIC_VERSION_ID & 0xff)); break; case VIOAPIC_REG_APIC_ID: case VIOAPIC_REG_ARB_ID: result = ((vioapic->id & 0xf) << 24); break; default: { uint32_t redir_index = (vioapic->ioregsel - 0x10) >> 1; uint64_t redir_content; if ( redir_index >= VIOAPIC_NUM_PINS ) { gdprintk(XENLOG_WARNING, "apic_mem_readl:undefined ioregsel %x\n", vioapic->ioregsel); break; } redir_content = vioapic->redirtbl[redir_index].bits; result = (vioapic->ioregsel & 0x1)? (redir_content >> 32) & 0xffffffff : redir_content & 0xffffffff; break; } } return result; } static int vioapic_read( struct vcpu *v, unsigned long addr, unsigned long length, unsigned long *pval) { struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain); uint32_t result; HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "addr %lx", addr); addr &= 0xff; switch ( addr ) { case VIOAPIC_REG_SELECT: result = vioapic->ioregsel; break; case VIOAPIC_REG_WINDOW: result = vioapic_read_indirect(vioapic, addr, length); break; default: result = 0; break; } *pval = result; return X86EMUL_OKAY; } static void vioapic_write_redirent( struct hvm_hw_vioapic *vioapic, unsigned int idx, int top_word, uint32_t val) { struct domain *d = vioapic_domain(vioapic); struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; union vioapic_redir_entry *pent, ent; int unmasked = 0; spin_lock(&d->arch.hvm_domain.irq_lock); pent = &vioapic->redirtbl[idx]; ent = *pent; if ( top_word ) { /* Contains only the dest_id. */ ent.bits = (uint32_t)ent.bits | ((uint64_t)val << 32); } else { unmasked = ent.fields.mask; /* Remote IRR and Delivery Status are read-only. */ ent.bits = ((ent.bits >> 32) << 32) | val; ent.fields.delivery_status = 0; ent.fields.remote_irr = pent->fields.remote_irr; unmasked = unmasked && !ent.fields.mask; } *pent = ent; if ( idx == 0 ) { vlapic_adjust_i8259_target(d); } else if ( ent.fields.trig_mode == VIOAPIC_EDGE_TRIG ) pent->fields.remote_irr = 0; else if ( !ent.fields.mask && !ent.fields.remote_irr && hvm_irq->gsi_assert_count[idx] ) { pent->fields.remote_irr = 1; vioapic_deliver(vioapic, idx); } spin_unlock(&d->arch.hvm_domain.irq_lock); if ( idx == 0 || unmasked ) pt_may_unmask_irq(d, NULL); } static void vioapic_write_indirect( struct hvm_hw_vioapic *vioapic, unsigned long length, unsigned long val) { switch ( vioapic->ioregsel ) { case VIOAPIC_REG_VERSION: /* Writes are ignored. 
*/ break; case VIOAPIC_REG_APIC_ID: vioapic->id = (val >> 24) & 0xf; break; case VIOAPIC_REG_ARB_ID: break; default: { uint32_t redir_index = (vioapic->ioregsel - 0x10) >> 1; HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "change redir index %x val %lx", redir_index, val); if ( redir_index >= VIOAPIC_NUM_PINS ) { gdprintk(XENLOG_WARNING, "vioapic_write_indirect " "error register %x\n", vioapic->ioregsel); break; } vioapic_write_redirent( vioapic, redir_index, vioapic->ioregsel&1, val); break; } } } static int vioapic_write( struct vcpu *v, unsigned long addr, unsigned long length, unsigned long val) { struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain); addr &= 0xff; switch ( addr ) { case VIOAPIC_REG_SELECT: vioapic->ioregsel = val; break; case VIOAPIC_REG_WINDOW: vioapic_write_indirect(vioapic, length, val); break; #if VIOAPIC_VERSION_ID >= 0x20 case VIOAPIC_REG_EOI: vioapic_update_EOI(v->domain, val); break; #endif default: break; } return X86EMUL_OKAY; } static int vioapic_range(struct vcpu *v, unsigned long addr) { struct hvm_hw_vioapic *vioapic = domain_vioapic(v->domain); return ((addr >= vioapic->base_address && (addr < vioapic->base_address + VIOAPIC_MEM_LENGTH))); } const struct hvm_mmio_handler vioapic_mmio_handler = { .check_handler = vioapic_range, .read_handler = vioapic_read, .write_handler = vioapic_write }; static void ioapic_inj_irq( struct hvm_hw_vioapic *vioapic, struct vlapic *target, uint8_t vector, uint8_t trig_mode, uint8_t delivery_mode) { HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "irq %d trig %d deliv %d", vector, trig_mode, delivery_mode); ASSERT((delivery_mode == dest_Fixed) || (delivery_mode == dest_LowestPrio)); vlapic_set_irq(target, vector, trig_mode); } static inline int pit_channel0_enabled(void) { return pt_active(¤t->domain->arch.vpit.pt0); } static void vioapic_deliver(struct hvm_hw_vioapic *vioapic, int irq) { uint16_t dest = vioapic->redirtbl[irq].fields.dest_id; uint8_t dest_mode = vioapic->redirtbl[irq].fields.dest_mode; uint8_t delivery_mode = vioapic->redirtbl[irq].fields.delivery_mode; uint8_t vector = vioapic->redirtbl[irq].fields.vector; uint8_t trig_mode = vioapic->redirtbl[irq].fields.trig_mode; struct domain *d = vioapic_domain(vioapic); struct vlapic *target; struct vcpu *v; ASSERT(spin_is_locked(&d->arch.hvm_domain.irq_lock)); HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "dest=%x dest_mode=%x delivery_mode=%x " "vector=%x trig_mode=%x", dest, dest_mode, delivery_mode, vector, trig_mode); switch ( delivery_mode ) { case dest_LowestPrio: { #ifdef IRQ0_SPECIAL_ROUTING /* Force round-robin to pick VCPU 0 */ if ( (irq == hvm_isa_irq_to_gsi(0)) && pit_channel0_enabled() ) { v = d->vcpu ? d->vcpu[0] : NULL; target = v ? vcpu_vlapic(v) : NULL; } else #endif target = vlapic_lowest_prio(d, NULL, 0, dest, dest_mode); if ( target != NULL ) { ioapic_inj_irq(vioapic, target, vector, trig_mode, delivery_mode); } else { HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "null round robin: " "vector=%x delivery_mode=%x", vector, dest_LowestPrio); } break; } case dest_Fixed: { #ifdef IRQ0_SPECIAL_ROUTING /* Do not deliver timer interrupts to VCPU != 0 */ if ( (irq == hvm_isa_irq_to_gsi(0)) && pit_channel0_enabled() ) { if ( (v = d->vcpu ? 
d->vcpu[0] : NULL) != NULL ) ioapic_inj_irq(vioapic, vcpu_vlapic(v), vector, trig_mode, delivery_mode); } else #endif { for_each_vcpu ( d, v ) if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) ) ioapic_inj_irq(vioapic, vcpu_vlapic(v), vector, trig_mode, delivery_mode); } break; } case dest_NMI: { for_each_vcpu ( d, v ) if ( vlapic_match_dest(vcpu_vlapic(v), NULL, 0, dest, dest_mode) && !test_and_set_bool(v->nmi_pending) ) vcpu_kick(v); break; } default: gdprintk(XENLOG_WARNING, "Unsupported delivery mode %d\n", delivery_mode); break; } } void vioapic_irq_positive_edge(struct domain *d, unsigned int irq) { struct hvm_hw_vioapic *vioapic = domain_vioapic(d); union vioapic_redir_entry *ent; HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "irq %x", irq); ASSERT(irq < VIOAPIC_NUM_PINS); ASSERT(spin_is_locked(&d->arch.hvm_domain.irq_lock)); ent = &vioapic->redirtbl[irq]; if ( ent->fields.mask ) return; if ( ent->fields.trig_mode == VIOAPIC_EDGE_TRIG ) { vioapic_deliver(vioapic, irq); } else if ( !ent->fields.remote_irr ) { ent->fields.remote_irr = 1; vioapic_deliver(vioapic, irq); } } void vioapic_update_EOI(struct domain *d, int vector) { struct hvm_hw_vioapic *vioapic = domain_vioapic(d); struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; union vioapic_redir_entry *ent; int gsi; spin_lock(&d->arch.hvm_domain.irq_lock); for ( gsi = 0; gsi < VIOAPIC_NUM_PINS; gsi++ ) { ent = &vioapic->redirtbl[gsi]; if ( ent->fields.vector != vector ) continue; ent->fields.remote_irr = 0; if ( iommu_enabled ) { spin_unlock(&d->arch.hvm_domain.irq_lock); hvm_dpci_eoi(d, gsi, ent); spin_lock(&d->arch.hvm_domain.irq_lock); } if ( (ent->fields.trig_mode == VIOAPIC_LEVEL_TRIG) && !ent->fields.mask && hvm_irq->gsi_assert_count[gsi] ) { ent->fields.remote_irr = 1; vioapic_deliver(vioapic, gsi); } } spin_unlock(&d->arch.hvm_domain.irq_lock); } static int ioapic_save(struct domain *d, hvm_domain_context_t *h) { struct hvm_hw_vioapic *s = domain_vioapic(d); return hvm_save_entry(IOAPIC, 0, h, s); } static int ioapic_load(struct domain *d, hvm_domain_context_t *h) { struct hvm_hw_vioapic *s = domain_vioapic(d); return hvm_load_entry(IOAPIC, h, s); } HVM_REGISTER_SAVE_RESTORE(IOAPIC, ioapic_save, ioapic_load, 1, HVMSR_PER_DOM); void vioapic_reset(struct domain *d) { struct hvm_vioapic *vioapic = d->arch.hvm_domain.vioapic; int i; memset(&vioapic->hvm_hw_vioapic, 0, sizeof(vioapic->hvm_hw_vioapic)); for ( i = 0; i < VIOAPIC_NUM_PINS; i++ ) vioapic->hvm_hw_vioapic.redirtbl[i].fields.mask = 1; vioapic->hvm_hw_vioapic.base_address = VIOAPIC_DEFAULT_BASE_ADDRESS; } int vioapic_init(struct domain *d) { if ( (d->arch.hvm_domain.vioapic == NULL) && ((d->arch.hvm_domain.vioapic = xmalloc(struct hvm_vioapic)) == NULL) ) return -ENOMEM; d->arch.hvm_domain.vioapic->domain = d; vioapic_reset(d); return 0; } void vioapic_deinit(struct domain *d) { xfree(d->arch.hvm_domain.vioapic); d->arch.hvm_domain.vioapic = NULL; } xen-4.4.0/xen/arch/x86/hvm/nestedhvm.c0000664000175000017500000001240712307313555015536 0ustar smbsmb/* * Nested HVM * Copyright (c) 2011, Advanced Micro Devices, Inc. * Author: Christoph Egger * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
* * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include /* for HVM_DELIVER_NO_ERROR_CODE */ #include #include /* for struct p2m_domain */ #include #include /* for local_event_delivery_(en|dis)able */ #include /* for paging_mode_hap() */ static unsigned long *shadow_io_bitmap[3]; /* Nested HVM on/off per domain */ bool_t nestedhvm_enabled(struct domain *d) { return is_hvm_domain(d) && d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM]; } /* Nested VCPU */ bool_t nestedhvm_vcpu_in_guestmode(struct vcpu *v) { return vcpu_nestedhvm(v).nv_guestmode; } void nestedhvm_vcpu_reset(struct vcpu *v) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); nv->nv_vmentry_pending = 0; nv->nv_vmexit_pending = 0; nv->nv_vmswitch_in_progress = 0; nv->nv_ioport80 = 0; nv->nv_ioportED = 0; hvm_unmap_guest_frame(nv->nv_vvmcx, 1); nv->nv_vvmcx = NULL; nv->nv_vvmcxaddr = VMCX_EADDR; nv->nv_flushp2m = 0; nv->nv_p2m = NULL; hvm_asid_flush_vcpu_asid(&nv->nv_n2asid); if ( hvm_funcs.nhvm_vcpu_reset ) hvm_funcs.nhvm_vcpu_reset(v); /* vcpu is in host mode */ nestedhvm_vcpu_exit_guestmode(v); } int nestedhvm_vcpu_initialise(struct vcpu *v) { int rc = -EOPNOTSUPP; if ( !shadow_io_bitmap[0] ) return -ENOMEM; if ( !hvm_funcs.nhvm_vcpu_initialise || ((rc = hvm_funcs.nhvm_vcpu_initialise(v)) != 0) ) return rc; nestedhvm_vcpu_reset(v); return 0; } void nestedhvm_vcpu_destroy(struct vcpu *v) { if ( hvm_funcs.nhvm_vcpu_destroy ) hvm_funcs.nhvm_vcpu_destroy(v); } static void nestedhvm_flushtlb_ipi(void *info) { struct vcpu *v = current; struct domain *d = info; ASSERT(d != NULL); if (v->domain != d) { /* This cpu doesn't belong to the domain */ return; } /* Just flush the ASID (or request a new one). * This is cheaper than flush_tlb_local() and has * the same desired effect. */ hvm_asid_flush_core(); vcpu_nestedhvm(v).nv_p2m = NULL; } void nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m) { on_selected_cpus(p2m->dirty_cpumask, nestedhvm_flushtlb_ipi, p2m->domain, 1); cpumask_clear(p2m->dirty_cpumask); } bool_t nestedhvm_is_n2(struct vcpu *v) { if (!nestedhvm_enabled(v->domain) || nestedhvm_vmswitch_in_progress(v) || !nestedhvm_paging_mode_hap(v)) return 0; if (nestedhvm_vcpu_in_guestmode(v)) return 1; return 0; } /* Common shadow IO Permission bitmap */ /* There four global patterns of io bitmap each guest can * choose depending on interception of io port 0x80 and/or * 0xED (shown in table below). * The users of the bitmap patterns are in SVM/VMX specific code. * * bitmap port 0x80 port 0xed * hvm_io_bitmap cleared cleared * iomap[0] cleared set * iomap[1] set cleared * iomap[2] set set */ static int __init nestedhvm_setup(void) { /* Same format and size as hvm_io_bitmap (Intel needs only 2 pages). */ unsigned nr = cpu_has_vmx ? 2 : 3; unsigned int i, order = get_order_from_pages(nr); if ( !hvm_funcs.name ) return 0; /* shadow_io_bitmaps can't be declared static because * they must fulfill hw requirements (page aligned section) * and doing so triggers the ASSERT(va >= XEN_VIRT_START) * in __virt_to_maddr() * * So as a compromise pre-allocate them when xen boots. 
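 * (One bitmap is allocated per interception pattern listed in the table
 * above.)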
* This function must be called from within start_xen() when * it is valid to use _xmalloc() */ for ( i = 0; i < ARRAY_SIZE(shadow_io_bitmap); i++ ) { shadow_io_bitmap[i] = alloc_xenheap_pages(order, 0); if ( !shadow_io_bitmap[i] ) { while ( i-- ) { free_xenheap_pages(shadow_io_bitmap[i], order); shadow_io_bitmap[i] = NULL; } return -ENOMEM; } memset(shadow_io_bitmap[i], ~0U, nr << PAGE_SHIFT); } __clear_bit(0x80, shadow_io_bitmap[0]); __clear_bit(0xed, shadow_io_bitmap[1]); return 0; } __initcall(nestedhvm_setup); unsigned long * nestedhvm_vcpu_iomap_get(bool_t port_80, bool_t port_ed) { int i; if (!hvm_port80_allowed) port_80 = 1; if (port_80 == 0) { if (port_ed == 0) return hvm_io_bitmap; i = 0; } else { if (port_ed == 0) i = 1; else i = 2; } return shadow_io_bitmap[i]; } xen-4.4.0/xen/arch/x86/hvm/io.c0000664000175000017500000003130512307313555014146 0ustar smbsmb/* * io.c: Handling I/O and interrupts. * * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005, International Business Machines Corporation. * Copyright (c) 2008, Citrix Systems, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int hvm_buffered_io_send(ioreq_t *p) { struct vcpu *v = current; struct hvm_ioreq_page *iorp = &v->domain->arch.hvm_domain.buf_ioreq; buffered_iopage_t *pg = iorp->va; buf_ioreq_t bp; /* Timeoffset sends 64b data, but no address. Use two consecutive slots. */ int qw = 0; /* Ensure buffered_iopage fits in a page */ BUILD_BUG_ON(sizeof(buffered_iopage_t) > PAGE_SIZE); /* * Return 0 for the cases we can't deal with: * - 'addr' is only a 20-bit field, so we cannot address beyond 1MB * - we cannot buffer accesses to guest memory buffers, as the guest * may expect the memory buffer to be synchronously accessed * - the count field is usually used with data_is_ptr and since we don't * support data_is_ptr we do not waste space for the count field either */ if ( (p->addr > 0xffffful) || p->data_is_ptr || (p->count != 1) ) return 0; bp.type = p->type; bp.dir = p->dir; switch ( p->size ) { case 1: bp.size = 0; break; case 2: bp.size = 1; break; case 4: bp.size = 2; break; case 8: bp.size = 3; qw = 1; break; default: gdprintk(XENLOG_WARNING, "unexpected ioreq size: %u\n", p->size); return 0; } bp.data = p->data; bp.addr = p->addr; spin_lock(&iorp->lock); if ( (pg->write_pointer - pg->read_pointer) >= (IOREQ_BUFFER_SLOT_NUM - qw) ) { /* The queue is full: send the iopacket through the normal path. 
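 * (A zero return tells the caller that the request was not buffered.)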
*/ spin_unlock(&iorp->lock); return 0; } memcpy(&pg->buf_ioreq[pg->write_pointer % IOREQ_BUFFER_SLOT_NUM], &bp, sizeof(bp)); if ( qw ) { bp.data = p->data >> 32; memcpy(&pg->buf_ioreq[(pg->write_pointer+1) % IOREQ_BUFFER_SLOT_NUM], &bp, sizeof(bp)); } /* Make the ioreq_t visible /before/ write_pointer. */ wmb(); pg->write_pointer += qw ? 2 : 1; notify_via_xen_event_channel(v->domain, v->domain->arch.hvm_domain.params[HVM_PARAM_BUFIOREQ_EVTCHN]); spin_unlock(&iorp->lock); return 1; } void send_timeoffset_req(unsigned long timeoff) { ioreq_t p[1]; if ( timeoff == 0 ) return; memset(p, 0, sizeof(*p)); p->type = IOREQ_TYPE_TIMEOFFSET; p->size = 8; p->count = 1; p->dir = IOREQ_WRITE; p->data = timeoff; p->state = STATE_IOREQ_READY; if ( !hvm_buffered_io_send(p) ) printk("Unsuccessful timeoffset update\n"); } /* Ask ioemu mapcache to invalidate mappings. */ void send_invalidate_req(void) { struct vcpu *v = current; ioreq_t *p = get_ioreq(v); if ( !p ) return; if ( p->state != STATE_IOREQ_NONE ) { gdprintk(XENLOG_ERR, "WARNING: send invalidate req with something " "already pending (%d)?\n", p->state); domain_crash(v->domain); return; } p->type = IOREQ_TYPE_INVALIDATE; p->size = 4; p->dir = IOREQ_WRITE; p->data = ~0UL; /* flush all */ (void)hvm_send_assist_req(v); } int handle_mmio(void) { struct hvm_emulate_ctxt ctxt; struct vcpu *curr = current; struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; int rc; hvm_emulate_prepare(&ctxt, guest_cpu_user_regs()); rc = hvm_emulate_one(&ctxt); if ( rc != X86EMUL_RETRY ) vio->io_state = HVMIO_none; if ( vio->io_state == HVMIO_awaiting_completion ) vio->io_state = HVMIO_handle_mmio_awaiting_completion; else vio->mmio_gva = 0; switch ( rc ) { case X86EMUL_UNHANDLEABLE: gdprintk(XENLOG_WARNING, "MMIO emulation failed @ %04x:%lx: " "%02x %02x %02x %02x %02x %02x %02x %02x %02x %02x\n", hvmemul_get_seg_reg(x86_seg_cs, &ctxt)->sel, ctxt.insn_buf_eip, ctxt.insn_buf[0], ctxt.insn_buf[1], ctxt.insn_buf[2], ctxt.insn_buf[3], ctxt.insn_buf[4], ctxt.insn_buf[5], ctxt.insn_buf[6], ctxt.insn_buf[7], ctxt.insn_buf[8], ctxt.insn_buf[9]); return 0; case X86EMUL_EXCEPTION: if ( ctxt.exn_pending ) hvm_inject_hw_exception(ctxt.exn_vector, ctxt.exn_error_code); break; default: break; } hvm_emulate_writeback(&ctxt); return 1; } int handle_mmio_with_translation(unsigned long gva, unsigned long gpfn) { struct hvm_vcpu_io *vio = ¤t->arch.hvm_vcpu.hvm_io; vio->mmio_gva = gva & PAGE_MASK; vio->mmio_gpfn = gpfn; return handle_mmio(); } int handle_pio(uint16_t port, unsigned int size, int dir) { struct vcpu *curr = current; struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; unsigned long data, reps = 1; int rc; ASSERT((size - 1) < 4 && size != 3); if ( dir == IOREQ_WRITE ) data = guest_cpu_user_regs()->eax; rc = hvmemul_do_pio(port, &reps, size, 0, dir, 0, &data); switch ( rc ) { case X86EMUL_OKAY: if ( dir == IOREQ_READ ) { if ( size == 4 ) /* Needs zero extension. */ guest_cpu_user_regs()->rax = (uint32_t)data; else memcpy(&guest_cpu_user_regs()->rax, &data, size); } break; case X86EMUL_RETRY: if ( vio->io_state != HVMIO_awaiting_completion ) return 0; /* Completion in hvm_io_assist() with no re-emulation required. 
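 * hvm_io_assist() copies the returned data straight into the guest's
 * %rax for this state (HVMIO_handle_pio_awaiting_completion).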
*/ ASSERT(dir == IOREQ_READ); vio->io_state = HVMIO_handle_pio_awaiting_completion; break; default: gdprintk(XENLOG_ERR, "Weird HVM ioemulation status %d.\n", rc); domain_crash(curr->domain); break; } return 1; } void hvm_io_assist(ioreq_t *p) { struct vcpu *curr = current; struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; enum hvm_io_state io_state; rmb(); /* see IORESP_READY /then/ read contents of ioreq */ p->state = STATE_IOREQ_NONE; io_state = vio->io_state; vio->io_state = HVMIO_none; switch ( io_state ) { case HVMIO_awaiting_completion: vio->io_state = HVMIO_completed; vio->io_data = p->data; break; case HVMIO_handle_mmio_awaiting_completion: vio->io_state = HVMIO_completed; vio->io_data = p->data; (void)handle_mmio(); break; case HVMIO_handle_pio_awaiting_completion: if ( vio->io_size == 4 ) /* Needs zero extension. */ guest_cpu_user_regs()->rax = (uint32_t)p->data; else memcpy(&guest_cpu_user_regs()->rax, &p->data, vio->io_size); break; default: break; } if ( p->state == STATE_IOREQ_NONE ) { msix_write_completion(curr); vcpu_end_shutdown_deferral(curr); } } static int dpci_ioport_read(uint32_t mport, ioreq_t *p) { struct hvm_vcpu_io *vio = ¤t->arch.hvm_vcpu.hvm_io; int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size; uint32_t data = 0; for ( i = 0; i < p->count; i++ ) { if ( vio->mmio_retrying ) { if ( vio->mmio_large_read_bytes != p->size ) return X86EMUL_UNHANDLEABLE; memcpy(&data, vio->mmio_large_read, p->size); vio->mmio_large_read_bytes = 0; vio->mmio_retrying = 0; } else switch ( p->size ) { case 1: data = inb(mport); break; case 2: data = inw(mport); break; case 4: data = inl(mport); break; default: BUG(); } if ( p->data_is_ptr ) { switch ( hvm_copy_to_guest_phys(p->data + step * i, &data, p->size) ) { case HVMCOPY_okay: break; case HVMCOPY_gfn_paged_out: case HVMCOPY_gfn_shared: rc = X86EMUL_RETRY; break; case HVMCOPY_bad_gfn_to_mfn: /* Drop the write as real hardware would. */ continue; case HVMCOPY_bad_gva_to_gfn: ASSERT(0); /* fall through */ default: rc = X86EMUL_UNHANDLEABLE; break; } if ( rc != X86EMUL_OKAY) break; } else p->data = data; } if ( rc == X86EMUL_RETRY ) { vio->mmio_retry = 1; vio->mmio_large_read_bytes = p->size; memcpy(vio->mmio_large_read, &data, p->size); } if ( i != 0 ) { p->count = i; rc = X86EMUL_OKAY; } return rc; } static int dpci_ioport_write(uint32_t mport, ioreq_t *p) { int rc = X86EMUL_OKAY, i, step = p->df ? 
-p->size : p->size; uint32_t data; for ( i = 0; i < p->count; i++ ) { data = p->data; if ( p->data_is_ptr ) { switch ( hvm_copy_from_guest_phys(&data, p->data + step * i, p->size) ) { case HVMCOPY_okay: break; case HVMCOPY_gfn_paged_out: case HVMCOPY_gfn_shared: rc = X86EMUL_RETRY; break; case HVMCOPY_bad_gfn_to_mfn: data = ~0; break; case HVMCOPY_bad_gva_to_gfn: ASSERT(0); /* fall through */ default: rc = X86EMUL_UNHANDLEABLE; break; } if ( rc != X86EMUL_OKAY) break; } switch ( p->size ) { case 1: outb(data, mport); break; case 2: outw(data, mport); break; case 4: outl(data, mport); break; default: BUG(); } } if ( rc == X86EMUL_RETRY ) current->arch.hvm_vcpu.hvm_io.mmio_retry = 1; if ( i != 0 ) { p->count = i; rc = X86EMUL_OKAY; } return rc; } int dpci_ioport_intercept(ioreq_t *p) { struct domain *d = current->domain; struct hvm_iommu *hd = domain_hvm_iommu(d); struct g2m_ioport *g2m_ioport; unsigned int mport, gport = p->addr; unsigned int s = 0, e = 0; int rc; list_for_each_entry( g2m_ioport, &hd->g2m_ioport_list, list ) { s = g2m_ioport->gport; e = s + g2m_ioport->np; if ( (gport >= s) && (gport < e) ) goto found; } return X86EMUL_UNHANDLEABLE; found: mport = (gport - s) + g2m_ioport->mport; if ( !ioports_access_permitted(d, mport, mport + p->size - 1) ) { gdprintk(XENLOG_ERR, "Error: access to gport=%#x denied!\n", (uint32_t)p->addr); return X86EMUL_UNHANDLEABLE; } switch ( p->dir ) { case IOREQ_READ: rc = dpci_ioport_read(mport, p); break; case IOREQ_WRITE: rc = dpci_ioport_write(mport, p); break; default: gdprintk(XENLOG_ERR, "Error: couldn't handle p->dir = %d", p->dir); rc = X86EMUL_UNHANDLEABLE; } return rc; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/save.c0000664000175000017500000000502712307313555014477 0ustar smbsmb/* * hvm/save.c: Save and restore HVM guest's emulated hardware state. * * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2007, Isaku Yamahata * VA Linux Systems Japan K.K. * split x86 specific part * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include void arch_hvm_save(struct domain *d, struct hvm_save_header *hdr) { uint32_t eax, ebx, ecx, edx; /* Save some CPUID bits */ cpuid(1, &eax, &ebx, &ecx, &edx); hdr->cpuid = eax; /* Save guest's preferred TSC. 
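 * frequency (in kHz); arch_hvm_load() uses it to restore d->arch.tsc_khz.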
*/ hdr->gtsc_khz = d->arch.tsc_khz; } int arch_hvm_load(struct domain *d, struct hvm_save_header *hdr) { uint32_t eax, ebx, ecx, edx; if ( hdr->magic != HVM_FILE_MAGIC ) { printk(XENLOG_G_ERR "HVM%d restore: bad magic number %#"PRIx32"\n", d->domain_id, hdr->magic); return -1; } if ( hdr->version != HVM_FILE_VERSION ) { printk(XENLOG_G_ERR "HVM%d restore: unsupported version %u\n", d->domain_id, hdr->version); return -1; } cpuid(1, &eax, &ebx, &ecx, &edx); /* CPUs ought to match but with feature-masking they might not */ if ( (hdr->cpuid & ~0x0fUL) != (eax & ~0x0fUL) ) printk(XENLOG_G_INFO "HVM%d restore: VM saved on one CPU " "(%#"PRIx32") and restored on another (%#"PRIx32").\n", d->domain_id, hdr->cpuid, eax); /* Restore guest's preferred TSC frequency. */ if ( hdr->gtsc_khz ) d->arch.tsc_khz = hdr->gtsc_khz; if ( d->arch.vtsc ) hvm_set_rdtsc_exiting(d, 1); /* VGA state is not saved/restored, so we nobble the cache. */ d->arch.hvm_domain.stdvga.cache = 0; return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/emulate.c0000664000175000017500000011314712307313555015200 0ustar smbsmb/****************************************************************************** * hvm/emulate.c * * HVM instruction emulation. Used for MMIO and VMX real mode. * * Copyright (c) 2008, Citrix Systems, Inc. * * Authors: * Keir Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include static void hvmtrace_io_assist(int is_mmio, ioreq_t *p) { unsigned int size, event; unsigned char buffer[12]; if ( likely(!tb_init_done) ) return; if ( is_mmio ) event = p->dir ? TRC_HVM_IOMEM_READ : TRC_HVM_IOMEM_WRITE; else event = p->dir ? TRC_HVM_IOPORT_READ : TRC_HVM_IOPORT_WRITE; *(uint64_t *)buffer = p->addr; size = (p->addr != (u32)p->addr) ? 8 : 4; if ( size == 8 ) event |= TRC_64_FLAG; if ( !p->data_is_ptr ) { *(uint32_t *)&buffer[size] = p->data; size += 4; } trace_var(event, 0/*!cycles*/, size, buffer); } static int hvmemul_do_io( int is_mmio, paddr_t addr, unsigned long *reps, int size, paddr_t ram_gpa, int dir, int df, void *p_data) { paddr_t value = ram_gpa; int value_is_ptr = (p_data == NULL); struct vcpu *curr = current; struct hvm_vcpu_io *vio; ioreq_t *p = get_ioreq(curr); ioreq_t _ioreq; unsigned long ram_gfn = paddr_to_pfn(ram_gpa); p2m_type_t p2mt; struct page_info *ram_page; int rc; bool_t has_dm = 1; /* * Domains without a backing DM, don't have an ioreq page. Just * point to a struct on the stack, initialising the state as needed. */ if ( !p ) { has_dm = 0; p = &_ioreq; p->state = STATE_IOREQ_NONE; } /* Check for paged out page */ ram_page = get_page_from_gfn(curr->domain, ram_gfn, &p2mt, P2M_UNSHARE); if ( p2m_is_paging(p2mt) ) { if ( ram_page ) put_page(ram_page); p2m_mem_paging_populate(curr->domain, ram_gfn); return X86EMUL_RETRY; } if ( p2m_is_shared(p2mt) ) { if ( ram_page ) put_page(ram_page); return X86EMUL_RETRY; } /* * Weird-sized accesses have undefined behaviour: we discard writes * and read all-ones. 
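 * (This covers sizes larger than a long as well as non-power-of-two
 * sizes, as checked immediately below.)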
*/ if ( unlikely((size > sizeof(long)) || (size & (size - 1))) ) { gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size); ASSERT(p_data != NULL); /* cannot happen with a REP prefix */ if ( dir == IOREQ_READ ) memset(p_data, ~0, size); if ( ram_page ) put_page(ram_page); return X86EMUL_UNHANDLEABLE; } if ( (p_data != NULL) && (dir == IOREQ_WRITE) ) { memcpy(&value, p_data, size); p_data = NULL; } vio = &curr->arch.hvm_vcpu.hvm_io; if ( is_mmio && !value_is_ptr ) { /* Part of a multi-cycle read or write? */ if ( dir == IOREQ_WRITE ) { paddr_t pa = vio->mmio_large_write_pa; unsigned int bytes = vio->mmio_large_write_bytes; if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) ) { if ( ram_page ) put_page(ram_page); return X86EMUL_OKAY; } } else { paddr_t pa = vio->mmio_large_read_pa; unsigned int bytes = vio->mmio_large_read_bytes; if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) ) { memcpy(p_data, &vio->mmio_large_read[addr - pa], size); if ( ram_page ) put_page(ram_page); return X86EMUL_OKAY; } } } switch ( vio->io_state ) { case HVMIO_none: break; case HVMIO_completed: vio->io_state = HVMIO_none; if ( p_data == NULL ) { if ( ram_page ) put_page(ram_page); return X86EMUL_UNHANDLEABLE; } goto finish_access; case HVMIO_dispatched: /* May have to wait for previous cycle of a multi-write to complete. */ if ( is_mmio && !value_is_ptr && (dir == IOREQ_WRITE) && (addr == (vio->mmio_large_write_pa + vio->mmio_large_write_bytes)) ) { if ( ram_page ) put_page(ram_page); return X86EMUL_RETRY; } default: if ( ram_page ) put_page(ram_page); return X86EMUL_UNHANDLEABLE; } if ( p->state != STATE_IOREQ_NONE ) { gdprintk(XENLOG_WARNING, "WARNING: io already pending (%d)?\n", p->state); if ( ram_page ) put_page(ram_page); return X86EMUL_UNHANDLEABLE; } vio->io_state = (p_data == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion; vio->io_size = size; /* * When retrying a repeated string instruction, force exit to guest after * completion of the retried iteration to allow handling of interrupts. */ if ( vio->mmio_retrying ) *reps = 1; p->dir = dir; p->data_is_ptr = value_is_ptr; p->type = is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO; p->size = size; p->addr = addr; p->count = *reps; p->df = df; p->data = value; if ( dir == IOREQ_WRITE ) hvmtrace_io_assist(is_mmio, p); if ( is_mmio ) { rc = hvm_mmio_intercept(p); if ( rc == X86EMUL_UNHANDLEABLE ) rc = hvm_buffered_io_intercept(p); } else { rc = hvm_portio_intercept(p); } switch ( rc ) { case X86EMUL_OKAY: case X86EMUL_RETRY: *reps = p->count; p->state = STATE_IORESP_READY; if ( !vio->mmio_retry ) { hvm_io_assist(p); vio->io_state = HVMIO_none; } else /* Defer hvm_io_assist() invocation to hvm_do_resume(). */ vio->io_state = HVMIO_handle_mmio_awaiting_completion; break; case X86EMUL_UNHANDLEABLE: /* If there is no backing DM, just ignore accesses */ if ( !has_dm ) { rc = X86EMUL_OKAY; vio->io_state = HVMIO_none; } else { rc = X86EMUL_RETRY; if ( !hvm_send_assist_req(curr) ) vio->io_state = HVMIO_none; else if ( p_data == NULL ) rc = X86EMUL_OKAY; } break; default: BUG(); } if ( rc != X86EMUL_OKAY ) { if ( ram_page ) put_page(ram_page); return rc; } finish_access: if ( dir == IOREQ_READ ) hvmtrace_io_assist(is_mmio, p); if ( p_data != NULL ) memcpy(p_data, &vio->io_data, size); if ( is_mmio && !value_is_ptr ) { /* Part of a multi-cycle read or write? 
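 * If so, extend the mmio_large_{read,write} state so that subsequent
 * chunks of the same emulation can be recognised and coalesced.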
*/ if ( dir == IOREQ_WRITE ) { paddr_t pa = vio->mmio_large_write_pa; unsigned int bytes = vio->mmio_large_write_bytes; if ( bytes == 0 ) pa = vio->mmio_large_write_pa = addr; if ( addr == (pa + bytes) ) vio->mmio_large_write_bytes += size; } else { paddr_t pa = vio->mmio_large_read_pa; unsigned int bytes = vio->mmio_large_read_bytes; if ( bytes == 0 ) pa = vio->mmio_large_read_pa = addr; if ( (addr == (pa + bytes)) && ((bytes + size) <= sizeof(vio->mmio_large_read)) ) { memcpy(&vio->mmio_large_read[bytes], p_data, size); vio->mmio_large_read_bytes += size; } } } if ( ram_page ) put_page(ram_page); return X86EMUL_OKAY; } int hvmemul_do_pio( unsigned long port, unsigned long *reps, int size, paddr_t ram_gpa, int dir, int df, void *p_data) { return hvmemul_do_io(0, port, reps, size, ram_gpa, dir, df, p_data); } static int hvmemul_do_mmio( paddr_t gpa, unsigned long *reps, int size, paddr_t ram_gpa, int dir, int df, void *p_data) { return hvmemul_do_io(1, gpa, reps, size, ram_gpa, dir, df, p_data); } /* * Convert addr from linear to physical form, valid over the range * [addr, addr + *reps * bytes_per_rep]. *reps is adjusted according to * the valid computed range. It is always >0 when X86EMUL_OKAY is returned. * @pfec indicates the access checks to be performed during page-table walks. */ static int hvmemul_linear_to_phys( unsigned long addr, paddr_t *paddr, unsigned int bytes_per_rep, unsigned long *reps, uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt) { struct vcpu *curr = current; unsigned long pfn, npfn, done, todo, i, offset = addr & ~PAGE_MASK; int reverse; /* * Clip repetitions to a sensible maximum. This avoids extensive looping in * this function while still amortising the cost of I/O trap-and-emulate. */ *reps = min_t(unsigned long, *reps, 4096); /* With no paging it's easy: linear == physical. */ if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG) ) { *paddr = addr; return X86EMUL_OKAY; } /* Reverse mode if this is a backwards multi-iteration string operation. */ reverse = (hvmemul_ctxt->ctxt.regs->eflags & X86_EFLAGS_DF) && (*reps > 1); if ( reverse && ((PAGE_SIZE - offset) < bytes_per_rep) ) { /* Do page-straddling first iteration forwards via recursion. */ paddr_t _paddr; unsigned long one_rep = 1; int rc = hvmemul_linear_to_phys( addr, &_paddr, bytes_per_rep, &one_rep, pfec, hvmemul_ctxt); if ( rc != X86EMUL_OKAY ) return rc; pfn = _paddr >> PAGE_SHIFT; } else if ( (pfn = paging_gva_to_gfn(curr, addr, &pfec)) == INVALID_GFN ) { if ( pfec == PFEC_page_paged || pfec == PFEC_page_shared ) return X86EMUL_RETRY; hvm_inject_page_fault(pfec, addr); return X86EMUL_EXCEPTION; } done = reverse ? bytes_per_rep + offset : PAGE_SIZE - offset; todo = *reps * bytes_per_rep; for ( i = 1; done < todo; i++ ) { /* Get the next PFN in the range. */ addr += reverse ? -PAGE_SIZE : PAGE_SIZE; npfn = paging_gva_to_gfn(curr, addr, &pfec); /* Is it contiguous with the preceding PFNs? If not then we're done. */ if ( (npfn == INVALID_GFN) || (npfn != (pfn + (reverse ? 
-i : i))) ) { if ( pfec == PFEC_page_paged || pfec == PFEC_page_shared ) return X86EMUL_RETRY; done /= bytes_per_rep; if ( done == 0 ) { ASSERT(!reverse); if ( npfn != INVALID_GFN ) return X86EMUL_UNHANDLEABLE; hvm_inject_page_fault(pfec, addr & PAGE_MASK); return X86EMUL_EXCEPTION; } *reps = done; break; } done += PAGE_SIZE; } *paddr = ((paddr_t)pfn << PAGE_SHIFT) | offset; return X86EMUL_OKAY; } static int hvmemul_virtual_to_linear( enum x86_segment seg, unsigned long offset, unsigned int bytes_per_rep, unsigned long *reps, enum hvm_access_type access_type, struct hvm_emulate_ctxt *hvmemul_ctxt, unsigned long *paddr) { struct segment_register *reg; int okay; if ( seg == x86_seg_none ) { *paddr = offset; return X86EMUL_OKAY; } /* * Clip repetitions to avoid overflow when multiplying by @bytes_per_rep. * The chosen maximum is very conservative but it's what we use in * hvmemul_linear_to_phys() so there is no point in using a larger value. */ *reps = min_t(unsigned long, *reps, 4096); reg = hvmemul_get_seg_reg(seg, hvmemul_ctxt); if ( (hvmemul_ctxt->ctxt.regs->eflags & X86_EFLAGS_DF) && (*reps > 1) ) { /* * x86_emulate() clips the repetition count to ensure we don't wrap * the effective-address index register. Hence this assertion holds. */ ASSERT(offset >= ((*reps - 1) * bytes_per_rep)); okay = hvm_virtual_to_linear_addr( seg, reg, offset - (*reps - 1) * bytes_per_rep, *reps * bytes_per_rep, access_type, hvmemul_ctxt->ctxt.addr_size, paddr); *paddr += (*reps - 1) * bytes_per_rep; if ( hvmemul_ctxt->ctxt.addr_size != 64 ) *paddr = (uint32_t)*paddr; } else { okay = hvm_virtual_to_linear_addr( seg, reg, offset, *reps * bytes_per_rep, access_type, hvmemul_ctxt->ctxt.addr_size, paddr); } if ( okay ) return X86EMUL_OKAY; /* If this is a string operation, emulate each iteration separately. */ if ( *reps != 1 ) return X86EMUL_UNHANDLEABLE; /* This is a singleton operation: fail it with an exception. */ hvmemul_ctxt->exn_pending = 1; hvmemul_ctxt->exn_vector = TRAP_gp_fault; hvmemul_ctxt->exn_error_code = 0; hvmemul_ctxt->exn_insn_len = 0; return X86EMUL_EXCEPTION; } static int __hvmemul_read( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, enum hvm_access_type access_type, struct hvm_emulate_ctxt *hvmemul_ctxt) { struct vcpu *curr = current; unsigned long addr, reps = 1; unsigned int off, chunk = min(bytes, 1U << LONG_BYTEORDER); uint32_t pfec = PFEC_page_present; struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; paddr_t gpa; int rc; rc = hvmemul_virtual_to_linear( seg, offset, bytes, &reps, access_type, hvmemul_ctxt, &addr); if ( rc != X86EMUL_OKAY ) return rc; off = addr & (PAGE_SIZE - 1); /* * We only need to handle sizes actual instruction operands can have. All * such sizes are either powers of 2 or the sum of two powers of 2. Thus * picking as initial chunk size the largest power of 2 not greater than * the total size will always result in only power-of-2 size requests * issued to hvmemul_do_mmio() (hvmemul_do_io() rejects non-powers-of-2). 
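 * The loop just below computes that power of two by repeatedly clearing
 * the lowest set bit of 'chunk'.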
*/ while ( chunk & (chunk - 1) ) chunk &= chunk - 1; if ( off + bytes > PAGE_SIZE ) while ( off & (chunk - 1) ) chunk >>= 1; if ( unlikely(vio->mmio_gva == (addr & PAGE_MASK)) && vio->mmio_gva ) { if ( access_type == hvm_access_insn_fetch ) return X86EMUL_UNHANDLEABLE; gpa = (((paddr_t)vio->mmio_gpfn << PAGE_SHIFT) | off); while ( (off + chunk) <= PAGE_SIZE ) { rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_READ, 0, p_data); if ( rc != X86EMUL_OKAY || bytes == chunk ) return rc; addr += chunk; off += chunk; gpa += chunk; p_data += chunk; bytes -= chunk; if ( bytes < chunk ) chunk = bytes; } } if ( (seg != x86_seg_none) && (hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3) ) pfec |= PFEC_user_mode; rc = ((access_type == hvm_access_insn_fetch) ? hvm_fetch_from_guest_virt(p_data, addr, bytes, pfec) : hvm_copy_from_guest_virt(p_data, addr, bytes, pfec)); switch ( rc ) { case HVMCOPY_okay: break; case HVMCOPY_bad_gva_to_gfn: return X86EMUL_EXCEPTION; case HVMCOPY_bad_gfn_to_mfn: if ( access_type == hvm_access_insn_fetch ) return X86EMUL_UNHANDLEABLE; rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec, hvmemul_ctxt); while ( rc == X86EMUL_OKAY ) { rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_READ, 0, p_data); if ( rc != X86EMUL_OKAY || bytes == chunk ) break; addr += chunk; off += chunk; p_data += chunk; bytes -= chunk; if ( bytes < chunk ) chunk = bytes; if ( off < PAGE_SIZE ) gpa += chunk; else { rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec, hvmemul_ctxt); off = 0; } } return rc; case HVMCOPY_gfn_paged_out: case HVMCOPY_gfn_shared: return X86EMUL_RETRY; default: return X86EMUL_UNHANDLEABLE; } return X86EMUL_OKAY; } static int hvmemul_read( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { return __hvmemul_read( seg, offset, p_data, bytes, hvm_access_read, container_of(ctxt, struct hvm_emulate_ctxt, ctxt)); } static int hvmemul_insn_fetch( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); unsigned int insn_off = offset - hvmemul_ctxt->insn_buf_eip; /* Fall back if requested bytes are not in the prefetch cache. */ if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) ) { int rc = __hvmemul_read(seg, offset, p_data, bytes, hvm_access_insn_fetch, hvmemul_ctxt); if ( rc == X86EMUL_OKAY ) { ASSERT(insn_off + bytes <= sizeof(hvmemul_ctxt->insn_buf)); memcpy(&hvmemul_ctxt->insn_buf[insn_off], p_data, bytes); hvmemul_ctxt->insn_buf_bytes = insn_off + bytes; } return rc; } /* Hit the cache. Simple memcpy. */ memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes); return X86EMUL_OKAY; } static int hvmemul_write( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); struct vcpu *curr = current; unsigned long addr, reps = 1; unsigned int off, chunk = min(bytes, 1U << LONG_BYTEORDER); uint32_t pfec = PFEC_page_present | PFEC_write_access; struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; paddr_t gpa; int rc; rc = hvmemul_virtual_to_linear( seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr); if ( rc != X86EMUL_OKAY ) return rc; off = addr & (PAGE_SIZE - 1); /* See the respective comment in __hvmemul_read(). 
*/ while ( chunk & (chunk - 1) ) chunk &= chunk - 1; if ( off + bytes > PAGE_SIZE ) while ( off & (chunk - 1) ) chunk >>= 1; if ( unlikely(vio->mmio_gva == (addr & PAGE_MASK)) && vio->mmio_gva ) { gpa = (((paddr_t)vio->mmio_gpfn << PAGE_SHIFT) | off); while ( (off + chunk) <= PAGE_SIZE ) { rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_WRITE, 0, p_data); if ( rc != X86EMUL_OKAY || bytes == chunk ) return rc; addr += chunk; off += chunk; gpa += chunk; p_data += chunk; bytes -= chunk; if ( bytes < chunk ) chunk = bytes; } } if ( (seg != x86_seg_none) && (hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3) ) pfec |= PFEC_user_mode; rc = hvm_copy_to_guest_virt(addr, p_data, bytes, pfec); switch ( rc ) { case HVMCOPY_okay: break; case HVMCOPY_bad_gva_to_gfn: return X86EMUL_EXCEPTION; case HVMCOPY_bad_gfn_to_mfn: rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec, hvmemul_ctxt); while ( rc == X86EMUL_OKAY ) { rc = hvmemul_do_mmio(gpa, &reps, chunk, 0, IOREQ_WRITE, 0, p_data); if ( rc != X86EMUL_OKAY || bytes == chunk ) break; addr += chunk; off += chunk; p_data += chunk; bytes -= chunk; if ( bytes < chunk ) chunk = bytes; if ( off < PAGE_SIZE ) gpa += chunk; else { rc = hvmemul_linear_to_phys(addr, &gpa, chunk, &reps, pfec, hvmemul_ctxt); off = 0; } } return rc; case HVMCOPY_gfn_paged_out: case HVMCOPY_gfn_shared: return X86EMUL_RETRY; default: return X86EMUL_UNHANDLEABLE; } return X86EMUL_OKAY; } static int hvmemul_cmpxchg( enum x86_segment seg, unsigned long offset, void *p_old, void *p_new, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { /* Fix this in case the guest is really relying on r-m-w atomicity. */ return hvmemul_write(seg, offset, p_new, bytes, ctxt); } static int hvmemul_rep_ins( uint16_t src_port, enum x86_segment dst_seg, unsigned long dst_offset, unsigned int bytes_per_rep, unsigned long *reps, struct x86_emulate_ctxt *ctxt) { struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); unsigned long addr; uint32_t pfec = PFEC_page_present | PFEC_write_access; paddr_t gpa; p2m_type_t p2mt; int rc; rc = hvmemul_virtual_to_linear( dst_seg, dst_offset, bytes_per_rep, reps, hvm_access_write, hvmemul_ctxt, &addr); if ( rc != X86EMUL_OKAY ) return rc; if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 ) pfec |= PFEC_user_mode; rc = hvmemul_linear_to_phys( addr, &gpa, bytes_per_rep, reps, pfec, hvmemul_ctxt); if ( rc != X86EMUL_OKAY ) return rc; (void) get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, &p2mt); if ( p2mt == p2m_mmio_direct || p2mt == p2m_mmio_dm ) return X86EMUL_UNHANDLEABLE; return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ, !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); } static int hvmemul_rep_outs( enum x86_segment src_seg, unsigned long src_offset, uint16_t dst_port, unsigned int bytes_per_rep, unsigned long *reps, struct x86_emulate_ctxt *ctxt) { struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); unsigned long addr; uint32_t pfec = PFEC_page_present; paddr_t gpa; p2m_type_t p2mt; int rc; rc = hvmemul_virtual_to_linear( src_seg, src_offset, bytes_per_rep, reps, hvm_access_read, hvmemul_ctxt, &addr); if ( rc != X86EMUL_OKAY ) return rc; if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 ) pfec |= PFEC_user_mode; rc = hvmemul_linear_to_phys( addr, &gpa, bytes_per_rep, reps, pfec, hvmemul_ctxt); if ( rc != X86EMUL_OKAY ) return rc; (void) get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, &p2mt); if ( p2mt == p2m_mmio_direct || 
p2mt == p2m_mmio_dm ) return X86EMUL_UNHANDLEABLE; return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE, !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); } static int hvmemul_rep_movs( enum x86_segment src_seg, unsigned long src_offset, enum x86_segment dst_seg, unsigned long dst_offset, unsigned int bytes_per_rep, unsigned long *reps, struct x86_emulate_ctxt *ctxt) { struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); unsigned long saddr, daddr, bytes; paddr_t sgpa, dgpa; uint32_t pfec = PFEC_page_present; p2m_type_t sp2mt, dp2mt; int rc, df = !!(ctxt->regs->eflags & X86_EFLAGS_DF); char *buf; rc = hvmemul_virtual_to_linear( src_seg, src_offset, bytes_per_rep, reps, hvm_access_read, hvmemul_ctxt, &saddr); if ( rc != X86EMUL_OKAY ) return rc; rc = hvmemul_virtual_to_linear( dst_seg, dst_offset, bytes_per_rep, reps, hvm_access_write, hvmemul_ctxt, &daddr); if ( rc != X86EMUL_OKAY ) return rc; if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 ) pfec |= PFEC_user_mode; rc = hvmemul_linear_to_phys( saddr, &sgpa, bytes_per_rep, reps, pfec, hvmemul_ctxt); if ( rc != X86EMUL_OKAY ) return rc; rc = hvmemul_linear_to_phys( daddr, &dgpa, bytes_per_rep, reps, pfec | PFEC_write_access, hvmemul_ctxt); if ( rc != X86EMUL_OKAY ) return rc; /* Check for MMIO ops */ (void) get_gfn_query_unlocked(current->domain, sgpa >> PAGE_SHIFT, &sp2mt); (void) get_gfn_query_unlocked(current->domain, dgpa >> PAGE_SHIFT, &dp2mt); if ( sp2mt == p2m_mmio_direct || dp2mt == p2m_mmio_direct || (sp2mt == p2m_mmio_dm && dp2mt == p2m_mmio_dm) ) return X86EMUL_UNHANDLEABLE; if ( sp2mt == p2m_mmio_dm ) return hvmemul_do_mmio( sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ, df, NULL); if ( dp2mt == p2m_mmio_dm ) return hvmemul_do_mmio( dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE, df, NULL); /* RAM-to-RAM copy: emulate as equivalent of memmove(dgpa, sgpa, bytes). */ bytes = *reps * bytes_per_rep; /* Adjust source address for reverse copy. */ if ( df ) sgpa -= bytes - bytes_per_rep; /* * Will first iteration copy fall within source range? If not then entire * copy does not corrupt itself. If so, then this is more complex than * can be emulated by a source-to-buffer-to-destination block copy. */ if ( ((dgpa + bytes_per_rep) > sgpa) && (dgpa < (sgpa + bytes)) ) return X86EMUL_UNHANDLEABLE; /* Adjust destination address for reverse copy. */ if ( df ) dgpa -= bytes - bytes_per_rep; /* Allocate temporary buffer. Fall back to slow emulation if this fails. */ buf = xmalloc_bytes(bytes); if ( buf == NULL ) return X86EMUL_UNHANDLEABLE; /* * We do a modicum of checking here, just for paranoia's sake and to * definitely avoid copying an unitialised buffer into guest address space. 
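 * If the read of the source range fails we must not copy the (still uninitialised) buffer out, so the
 * hvm_copy_to_guest_phys() below only runs once hvm_copy_from_guest_phys() has returned HVMCOPY_okay;
 * paged-out or shared gfns are turned into X86EMUL_RETRY instead.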
*/ rc = hvm_copy_from_guest_phys(buf, sgpa, bytes); if ( rc == HVMCOPY_okay ) rc = hvm_copy_to_guest_phys(dgpa, buf, bytes); xfree(buf); if ( rc == HVMCOPY_gfn_paged_out ) return X86EMUL_RETRY; if ( rc == HVMCOPY_gfn_shared ) return X86EMUL_RETRY; if ( rc != HVMCOPY_okay ) { gdprintk(XENLOG_WARNING, "Failed memory-to-memory REP MOVS: sgpa=%" PRIpaddr" dgpa=%"PRIpaddr" reps=%lu bytes_per_rep=%u\n", sgpa, dgpa, *reps, bytes_per_rep); return X86EMUL_UNHANDLEABLE; } return X86EMUL_OKAY; } static int hvmemul_read_segment( enum x86_segment seg, struct segment_register *reg, struct x86_emulate_ctxt *ctxt) { struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); struct segment_register *sreg = hvmemul_get_seg_reg(seg, hvmemul_ctxt); memcpy(reg, sreg, sizeof(struct segment_register)); return X86EMUL_OKAY; } static int hvmemul_write_segment( enum x86_segment seg, struct segment_register *reg, struct x86_emulate_ctxt *ctxt) { struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); struct segment_register *sreg = hvmemul_get_seg_reg(seg, hvmemul_ctxt); memcpy(sreg, reg, sizeof(struct segment_register)); __set_bit(seg, &hvmemul_ctxt->seg_reg_dirty); return X86EMUL_OKAY; } static int hvmemul_read_io( unsigned int port, unsigned int bytes, unsigned long *val, struct x86_emulate_ctxt *ctxt) { unsigned long reps = 1; *val = 0; return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, val); } static int hvmemul_write_io( unsigned int port, unsigned int bytes, unsigned long val, struct x86_emulate_ctxt *ctxt) { unsigned long reps = 1; return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_WRITE, 0, &val); } static int hvmemul_read_cr( unsigned int reg, unsigned long *val, struct x86_emulate_ctxt *ctxt) { switch ( reg ) { case 0: case 2: case 3: case 4: *val = current->arch.hvm_vcpu.guest_cr[reg]; HVMTRACE_LONG_2D(CR_READ, reg, TRC_PAR_LONG(*val)); return X86EMUL_OKAY; default: break; } return X86EMUL_UNHANDLEABLE; } static int hvmemul_write_cr( unsigned int reg, unsigned long val, struct x86_emulate_ctxt *ctxt) { HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val)); switch ( reg ) { case 0: return hvm_set_cr0(val); case 2: current->arch.hvm_vcpu.guest_cr[2] = val; return X86EMUL_OKAY; case 3: return hvm_set_cr3(val); case 4: return hvm_set_cr4(val); default: break; } return X86EMUL_UNHANDLEABLE; } static int hvmemul_read_msr( unsigned long reg, uint64_t *val, struct x86_emulate_ctxt *ctxt) { return hvm_msr_read_intercept(reg, val); } static int hvmemul_write_msr( unsigned long reg, uint64_t val, struct x86_emulate_ctxt *ctxt) { return hvm_msr_write_intercept(reg, val); } static int hvmemul_wbinvd( struct x86_emulate_ctxt *ctxt) { hvm_funcs.wbinvd_intercept(); return X86EMUL_OKAY; } static int hvmemul_cpuid( unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx, struct x86_emulate_ctxt *ctxt) { hvm_funcs.cpuid_intercept(eax, ebx, ecx, edx); return X86EMUL_OKAY; } static int hvmemul_inject_hw_exception( uint8_t vector, int32_t error_code, struct x86_emulate_ctxt *ctxt) { struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); hvmemul_ctxt->exn_pending = 1; hvmemul_ctxt->exn_vector = vector; hvmemul_ctxt->exn_error_code = error_code; hvmemul_ctxt->exn_insn_len = 0; return X86EMUL_OKAY; } static int hvmemul_inject_sw_interrupt( uint8_t vector, uint8_t insn_len, struct x86_emulate_ctxt *ctxt) { struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); 
hvmemul_ctxt->exn_pending = 1; hvmemul_ctxt->exn_vector = vector; hvmemul_ctxt->exn_error_code = -1; hvmemul_ctxt->exn_insn_len = insn_len; return X86EMUL_OKAY; } static int hvmemul_get_fpu( void (*exception_callback)(void *, struct cpu_user_regs *), void *exception_callback_arg, enum x86_emulate_fpu_type type, struct x86_emulate_ctxt *ctxt) { struct vcpu *curr = current; switch ( type ) { case X86EMUL_FPU_fpu: break; case X86EMUL_FPU_mmx: if ( !cpu_has_mmx ) return X86EMUL_UNHANDLEABLE; break; case X86EMUL_FPU_xmm: if ( !cpu_has_xmm || (curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_EM) || !(curr->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSFXSR) ) return X86EMUL_UNHANDLEABLE; break; case X86EMUL_FPU_ymm: if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) || vm86_mode(ctxt->regs) || !(curr->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) || !(curr->arch.xcr0 & XSTATE_SSE) || !(curr->arch.xcr0 & XSTATE_YMM) ) return X86EMUL_UNHANDLEABLE; break; default: return X86EMUL_UNHANDLEABLE; } if ( !curr->fpu_dirtied ) hvm_funcs.fpu_dirty_intercept(); curr->arch.hvm_vcpu.fpu_exception_callback = exception_callback; curr->arch.hvm_vcpu.fpu_exception_callback_arg = exception_callback_arg; return X86EMUL_OKAY; } static void hvmemul_put_fpu( struct x86_emulate_ctxt *ctxt) { struct vcpu *curr = current; curr->arch.hvm_vcpu.fpu_exception_callback = NULL; } static int hvmemul_invlpg( enum x86_segment seg, unsigned long offset, struct x86_emulate_ctxt *ctxt) { struct hvm_emulate_ctxt *hvmemul_ctxt = container_of(ctxt, struct hvm_emulate_ctxt, ctxt); unsigned long addr, reps = 1; int rc; rc = hvmemul_virtual_to_linear( seg, offset, 1, &reps, hvm_access_none, hvmemul_ctxt, &addr); if ( rc == X86EMUL_OKAY ) hvm_funcs.invlpg_intercept(addr); return rc; } static const struct x86_emulate_ops hvm_emulate_ops = { .read = hvmemul_read, .insn_fetch = hvmemul_insn_fetch, .write = hvmemul_write, .cmpxchg = hvmemul_cmpxchg, .rep_ins = hvmemul_rep_ins, .rep_outs = hvmemul_rep_outs, .rep_movs = hvmemul_rep_movs, .read_segment = hvmemul_read_segment, .write_segment = hvmemul_write_segment, .read_io = hvmemul_read_io, .write_io = hvmemul_write_io, .read_cr = hvmemul_read_cr, .write_cr = hvmemul_write_cr, .read_msr = hvmemul_read_msr, .write_msr = hvmemul_write_msr, .wbinvd = hvmemul_wbinvd, .cpuid = hvmemul_cpuid, .inject_hw_exception = hvmemul_inject_hw_exception, .inject_sw_interrupt = hvmemul_inject_sw_interrupt, .get_fpu = hvmemul_get_fpu, .put_fpu = hvmemul_put_fpu, .invlpg = hvmemul_invlpg }; int hvm_emulate_one( struct hvm_emulate_ctxt *hvmemul_ctxt) { struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs; struct vcpu *curr = current; uint32_t new_intr_shadow, pfec = PFEC_page_present; struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io; unsigned long addr; int rc; if ( hvm_long_mode_enabled(curr) && hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.l ) { hvmemul_ctxt->ctxt.addr_size = hvmemul_ctxt->ctxt.sp_size = 64; } else { hvmemul_ctxt->ctxt.addr_size = hvmemul_ctxt->seg_reg[x86_seg_cs].attr.fields.db ? 32 : 16; hvmemul_ctxt->ctxt.sp_size = hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.db ? 
32 : 16; } if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 ) pfec |= PFEC_user_mode; hvmemul_ctxt->insn_buf_eip = regs->eip; if ( !vio->mmio_insn_bytes ) { hvmemul_ctxt->insn_buf_bytes = hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?: (hvm_virtual_to_linear_addr(x86_seg_cs, &hvmemul_ctxt->seg_reg[x86_seg_cs], regs->eip, sizeof(hvmemul_ctxt->insn_buf), hvm_access_insn_fetch, hvmemul_ctxt->ctxt.addr_size, &addr) && hvm_fetch_from_guest_virt_nofault(hvmemul_ctxt->insn_buf, addr, sizeof(hvmemul_ctxt->insn_buf), pfec) == HVMCOPY_okay) ? sizeof(hvmemul_ctxt->insn_buf) : 0; } else { hvmemul_ctxt->insn_buf_bytes = vio->mmio_insn_bytes; memcpy(hvmemul_ctxt->insn_buf, vio->mmio_insn, vio->mmio_insn_bytes); } hvmemul_ctxt->exn_pending = 0; vio->mmio_retrying = vio->mmio_retry; vio->mmio_retry = 0; rc = x86_emulate(&hvmemul_ctxt->ctxt, &hvm_emulate_ops); if ( rc == X86EMUL_OKAY && vio->mmio_retry ) rc = X86EMUL_RETRY; if ( rc != X86EMUL_RETRY ) { vio->mmio_large_read_bytes = vio->mmio_large_write_bytes = 0; vio->mmio_insn_bytes = 0; } else { BUILD_BUG_ON(sizeof(vio->mmio_insn) < sizeof(hvmemul_ctxt->insn_buf)); vio->mmio_insn_bytes = hvmemul_ctxt->insn_buf_bytes; memcpy(vio->mmio_insn, hvmemul_ctxt->insn_buf, vio->mmio_insn_bytes); } if ( rc != X86EMUL_OKAY ) return rc; new_intr_shadow = hvmemul_ctxt->intr_shadow; /* MOV-SS instruction toggles MOV-SS shadow, else we just clear it. */ if ( hvmemul_ctxt->ctxt.retire.flags.mov_ss ) new_intr_shadow ^= HVM_INTR_SHADOW_MOV_SS; else new_intr_shadow &= ~HVM_INTR_SHADOW_MOV_SS; /* STI instruction toggles STI shadow, else we just clear it. */ if ( hvmemul_ctxt->ctxt.retire.flags.sti ) new_intr_shadow ^= HVM_INTR_SHADOW_STI; else new_intr_shadow &= ~HVM_INTR_SHADOW_STI; if ( hvmemul_ctxt->intr_shadow != new_intr_shadow ) { hvmemul_ctxt->intr_shadow = new_intr_shadow; hvm_funcs.set_interrupt_shadow(curr, new_intr_shadow); } if ( hvmemul_ctxt->ctxt.retire.flags.hlt && !hvm_local_events_need_delivery(curr) ) { hvm_hlt(regs->eflags); } return X86EMUL_OKAY; } void hvm_emulate_prepare( struct hvm_emulate_ctxt *hvmemul_ctxt, struct cpu_user_regs *regs) { hvmemul_ctxt->intr_shadow = hvm_funcs.get_interrupt_shadow(current); hvmemul_ctxt->ctxt.regs = regs; hvmemul_ctxt->ctxt.force_writeback = 1; hvmemul_ctxt->seg_reg_accessed = 0; hvmemul_ctxt->seg_reg_dirty = 0; hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt); hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt); } void hvm_emulate_writeback( struct hvm_emulate_ctxt *hvmemul_ctxt) { enum x86_segment seg; seg = find_first_bit(&hvmemul_ctxt->seg_reg_dirty, ARRAY_SIZE(hvmemul_ctxt->seg_reg)); while ( seg < ARRAY_SIZE(hvmemul_ctxt->seg_reg) ) { hvm_set_segment_register(current, seg, &hvmemul_ctxt->seg_reg[seg]); seg = find_next_bit(&hvmemul_ctxt->seg_reg_dirty, ARRAY_SIZE(hvmemul_ctxt->seg_reg), seg+1); } } struct segment_register *hvmemul_get_seg_reg( enum x86_segment seg, struct hvm_emulate_ctxt *hvmemul_ctxt) { if ( !__test_and_set_bit(seg, &hvmemul_ctxt->seg_reg_accessed) ) hvm_get_segment_register(current, seg, &hvmemul_ctxt->seg_reg[seg]); return &hvmemul_ctxt->seg_reg[seg]; } xen-4.4.0/xen/arch/x86/hvm/vpic.c0000664000175000017500000003164412307313555014506 0ustar smbsmb/* * i8259 interrupt controller emulation * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2005 Intel Corperation * Copyright (c) 2006 Keir Fraser, XenSource Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #define vpic_domain(v) (container_of((v), struct domain, \ arch.hvm_domain.vpic[!vpic->is_master])) #define __vpic_lock(v) &container_of((v), struct hvm_domain, \ vpic[!(v)->is_master])->irq_lock #define vpic_lock(v) spin_lock(__vpic_lock(v)) #define vpic_unlock(v) spin_unlock(__vpic_lock(v)) #define vpic_is_locked(v) spin_is_locked(__vpic_lock(v)) #define vpic_elcr_mask(v) (vpic->is_master ? (uint8_t)0xf8 : (uint8_t)0xde); /* Return the highest priority found in mask. Return 8 if none. */ #define VPIC_PRIO_NONE 8 static int vpic_get_priority(struct hvm_hw_vpic *vpic, uint8_t mask) { int prio; ASSERT(vpic_is_locked(vpic)); if ( mask == 0 ) return VPIC_PRIO_NONE; /* prio = ffs(mask ROR vpic->priority_add); */ asm ( "ror %%cl,%b1 ; bsf %1,%0" : "=r" (prio) : "q" ((uint32_t)mask), "c" (vpic->priority_add) ); return prio; } /* Return the PIC's highest priority pending interrupt. Return -1 if none. */ static int vpic_get_highest_priority_irq(struct hvm_hw_vpic *vpic) { int cur_priority, priority, irq; uint8_t mask; ASSERT(vpic_is_locked(vpic)); mask = vpic->irr & ~vpic->imr; priority = vpic_get_priority(vpic, mask); if ( priority == VPIC_PRIO_NONE ) return -1; irq = (priority + vpic->priority_add) & 7; /* * Compute current priority. If special fully nested mode on the master, * the IRQ coming from the slave is not taken into account for the * priority computation. In special mask mode, masked interrupts do not * block lower-priority interrupts even if their IS bit is set. */ mask = vpic->isr; if ( vpic->special_fully_nested_mode && vpic->is_master && (irq == 2) ) mask &= ~(1 << 2); if ( vpic->special_mask_mode ) mask &= ~vpic->imr; cur_priority = vpic_get_priority(vpic, mask); /* If a higher priority is found then an irq should be generated. */ return (priority < cur_priority) ? irq : -1; } static void vpic_update_int_output(struct hvm_hw_vpic *vpic) { int irq; ASSERT(vpic_is_locked(vpic)); irq = vpic_get_highest_priority_irq(vpic); if ( vpic->int_output == (irq >= 0) ) return; /* INT line transition L->H or H->L. */ vpic->int_output = !vpic->int_output; if ( vpic->int_output ) { if ( vpic->is_master ) { /* Master INT line is connected in Virtual Wire Mode. */ struct vcpu *v = vpic_domain(vpic)->arch.hvm_domain.i8259_target; if ( v != NULL ) vcpu_kick(v); } else { /* Assert slave line in master PIC. 
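 * The slave's INT output is wired to IR2 of the master, and the two PICs are adjacent in the
 * hvm_domain vpic[] array, so stepping back one element from the slave reaches the master:
 * set bit 2 in its IRR and re-evaluate its output.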
*/ (--vpic)->irr |= 1 << 2; vpic_update_int_output(vpic); } } else if ( !vpic->is_master ) { /* Clear slave line in master PIC. */ (--vpic)->irr &= ~(1 << 2); vpic_update_int_output(vpic); } } static void __vpic_intack(struct hvm_hw_vpic *vpic, int irq) { uint8_t mask = 1 << irq; ASSERT(vpic_is_locked(vpic)); /* Edge-triggered: clear the IRR (forget the edge). */ if ( !(vpic->elcr & mask) ) vpic->irr &= ~mask; if ( !vpic->auto_eoi ) vpic->isr |= mask; else if ( vpic->rotate_on_auto_eoi ) vpic->priority_add = (irq + 1) & 7; vpic_update_int_output(vpic); } static int vpic_intack(struct hvm_hw_vpic *vpic) { int irq = -1; vpic_lock(vpic); if ( !vpic->int_output ) goto out; irq = vpic_get_highest_priority_irq(vpic); BUG_ON(irq < 0); __vpic_intack(vpic, irq); if ( (irq == 2) && vpic->is_master ) { vpic++; /* Slave PIC */ irq = vpic_get_highest_priority_irq(vpic); BUG_ON(irq < 0); __vpic_intack(vpic, irq); irq += 8; } out: vpic_unlock(vpic); return irq; } static void vpic_ioport_write( struct hvm_hw_vpic *vpic, uint32_t addr, uint32_t val) { int priority, cmd, irq; uint8_t mask, unmasked = 0; vpic_lock(vpic); if ( (addr & 1) == 0 ) { if ( val & 0x10 ) { /* ICW1 */ /* Clear edge-sensing logic. */ vpic->irr &= vpic->elcr; unmasked = vpic->imr; /* No interrupts masked or in service. */ vpic->imr = vpic->isr = 0; /* IR7 is lowest priority. */ vpic->priority_add = 0; vpic->rotate_on_auto_eoi = 0; vpic->special_mask_mode = 0; vpic->readsel_isr = 0; vpic->poll = 0; if ( !(val & 1) ) { /* NO ICW4: ICW4 features are cleared. */ vpic->auto_eoi = 0; vpic->special_fully_nested_mode = 0; } vpic->init_state = ((val & 3) << 2) | 1; } else if ( val & 0x08 ) { /* OCW3 */ if ( val & 0x04 ) vpic->poll = 1; if ( val & 0x02 ) vpic->readsel_isr = val & 1; if ( val & 0x40 ) vpic->special_mask_mode = (val >> 5) & 1; } else { /* OCW2 */ cmd = val >> 5; switch ( cmd ) { case 0: /* Rotate in AEOI Mode (Clear) */ case 4: /* Rotate in AEOI Mode (Set) */ vpic->rotate_on_auto_eoi = cmd >> 2; break; case 1: /* Non-Specific EOI */ case 5: /* Non-Specific EOI & Rotate */ mask = vpic->isr; if ( vpic->special_mask_mode ) mask &= ~vpic->imr; /* SMM: ignore masked IRs. */ priority = vpic_get_priority(vpic, mask); if ( priority == VPIC_PRIO_NONE ) break; irq = (priority + vpic->priority_add) & 7; vpic->isr &= ~(1 << irq); if ( cmd == 5 ) vpic->priority_add = (irq + 1) & 7; break; case 3: /* Specific EOI */ case 7: /* Specific EOI & Rotate */ irq = val & 7; vpic->isr &= ~(1 << irq); if ( cmd == 7 ) vpic->priority_add = (irq + 1) & 7; /* Release lock and EOI the physical interrupt (if any). */ vpic_update_int_output(vpic); vpic_unlock(vpic); hvm_dpci_eoi(current->domain, hvm_isa_irq_to_gsi((addr >> 7) ? (irq|8) : irq), NULL); return; /* bail immediately */ case 6: /* Set Priority */ vpic->priority_add = (val + 1) & 7; break; } } } else { switch ( vpic->init_state & 3 ) { case 0: /* OCW1 */ unmasked = vpic->imr & (~val); vpic->imr = val; break; case 1: /* ICW2 */ vpic->irq_base = val & 0xf8; vpic->init_state++; if ( !(vpic->init_state & 8) ) break; /* CASCADE mode: wait for write to ICW3. */ /* SNGL mode: fall through (no ICW3). 
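 * (init_state was set to '((val & 3) << 2) | 1' by ICW1, so bit 3 caches the SNGL bit and bit 2 the
 * IC4 bit, while the low two bits count which ICW is expected next.)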
*/ case 2: /* ICW3 */ vpic->init_state++; if ( !(vpic->init_state & 4) ) vpic->init_state = 0; /* No ICW4: init done */ break; case 3: /* ICW4 */ vpic->special_fully_nested_mode = (val >> 4) & 1; vpic->auto_eoi = (val >> 1) & 1; vpic->init_state = 0; break; } } vpic_update_int_output(vpic); vpic_unlock(vpic); if ( unmasked ) pt_may_unmask_irq(vpic_domain(vpic), NULL); } static uint32_t vpic_ioport_read(struct hvm_hw_vpic *vpic, uint32_t addr) { if ( vpic->poll ) { vpic->poll = 0; return vpic_intack(vpic); } if ( (addr & 1) == 0 ) return (vpic->readsel_isr ? vpic->isr : vpic->irr); return vpic->imr; } static int vpic_intercept_pic_io( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct hvm_hw_vpic *vpic; if ( bytes != 1 ) { gdprintk(XENLOG_WARNING, "PIC_IO bad access size %d\n", bytes); return X86EMUL_OKAY; } vpic = ¤t->domain->arch.hvm_domain.vpic[port >> 7]; if ( dir == IOREQ_WRITE ) vpic_ioport_write(vpic, port, (uint8_t)*val); else *val = (uint8_t)vpic_ioport_read(vpic, port); return X86EMUL_OKAY; } static int vpic_intercept_elcr_io( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct hvm_hw_vpic *vpic; uint32_t data; BUG_ON(bytes != 1); vpic = ¤t->domain->arch.hvm_domain.vpic[port & 1]; if ( dir == IOREQ_WRITE ) { /* Some IRs are always edge trig. Slave IR is always level trig. */ data = *val & vpic_elcr_mask(vpic); if ( vpic->is_master ) data |= 1 << 2; vpic->elcr = data; } else { /* Reader should not see hardcoded level-triggered slave IR. */ *val = vpic->elcr & vpic_elcr_mask(vpic); } return X86EMUL_OKAY; } static int vpic_save(struct domain *d, hvm_domain_context_t *h) { struct hvm_hw_vpic *s; int i; /* Save the state of both PICs */ for ( i = 0; i < 2 ; i++ ) { s = &d->arch.hvm_domain.vpic[i]; if ( hvm_save_entry(PIC, i, h, s) ) return 1; } return 0; } static int vpic_load(struct domain *d, hvm_domain_context_t *h) { struct hvm_hw_vpic *s; uint16_t inst; /* Which PIC is this? */ inst = hvm_load_instance(h); if ( inst > 1 ) return -EINVAL; s = &d->arch.hvm_domain.vpic[inst]; /* Load the state */ if ( hvm_load_entry(PIC, h, s) != 0 ) return -EINVAL; return 0; } HVM_REGISTER_SAVE_RESTORE(PIC, vpic_save, vpic_load, 2, HVMSR_PER_DOM); void vpic_reset(struct domain *d) { struct hvm_hw_vpic *vpic; /* Master PIC. */ vpic = &d->arch.hvm_domain.vpic[0]; memset(vpic, 0, sizeof(*vpic)); vpic->is_master = 1; vpic->elcr = 1 << 2; /* Slave PIC. 
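 * The slave immediately follows the master in the vpic[] pair; zeroing it leaves is_master clear and
 * the ELCR at 0 (all edge-triggered) until the guest reprograms it.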
*/ vpic++; memset(vpic, 0, sizeof(*vpic)); } void vpic_init(struct domain *d) { vpic_reset(d); register_portio_handler(d, 0x20, 2, vpic_intercept_pic_io); register_portio_handler(d, 0xa0, 2, vpic_intercept_pic_io); register_portio_handler(d, 0x4d0, 1, vpic_intercept_elcr_io); register_portio_handler(d, 0x4d1, 1, vpic_intercept_elcr_io); } void vpic_irq_positive_edge(struct domain *d, int irq) { struct hvm_hw_vpic *vpic = &d->arch.hvm_domain.vpic[irq >> 3]; uint8_t mask = 1 << (irq & 7); ASSERT(irq <= 15); ASSERT(vpic_is_locked(vpic)); if ( irq == 2 ) return; vpic->irr |= mask; if ( !(vpic->imr & mask) ) vpic_update_int_output(vpic); } void vpic_irq_negative_edge(struct domain *d, int irq) { struct hvm_hw_vpic *vpic = &d->arch.hvm_domain.vpic[irq >> 3]; uint8_t mask = 1 << (irq & 7); ASSERT(irq <= 15); ASSERT(vpic_is_locked(vpic)); if ( irq == 2 ) return; vpic->irr &= ~mask; if ( !(vpic->imr & mask) ) vpic_update_int_output(vpic); } int vpic_ack_pending_irq(struct vcpu *v) { int irq, vector; struct hvm_hw_vpic *vpic = &v->domain->arch.hvm_domain.vpic[0]; if ( !vlapic_accept_pic_intr(v) || !vpic->int_output ) return -1; irq = vpic_intack(vpic); if ( irq == -1 ) return -1; vector = vpic[irq >> 3].irq_base + (irq & 7); return vector; } xen-4.4.0/xen/arch/x86/hvm/stdvga.c0000664000175000017500000004106512307313555015033 0ustar smbsmb/* * Copyright (c) 2003-2007, Virtual Iron Software, Inc. * * Portions have been modified by Virtual Iron Software, Inc. * (c) 2007. This file and the modifications can be redistributed and/or * modified under the terms and conditions of the GNU General Public * License, version 2.1 and not any later version of the GPL, as published * by the Free Software Foundation. * * This improves the performance of Standard VGA, * the mode used during Windows boot and by the Linux * splash screen. * * It does so by buffering all the stdvga programmed output ops * and memory mapped ops (both reads and writes) that are sent to QEMU. * * We maintain locally essential VGA state so we can respond * immediately to input and read ops without waiting for * QEMU. We snoop output and write ops to keep our state * up-to-date. * * PIO input ops are satisfied from cached state without * bothering QEMU. * * PIO output and mmio ops are passed through to QEMU, including * mmio read ops. This is necessary because mmio reads * can have side effects. 
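 * Only the sequencer (ports 0x3c4/0x3c5) and graphics controller (ports 0x3ce/0x3cf) registers are
 * shadowed here; see stdvga_outb() and the portio registrations in stdvga_init().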
*/ #include #include #include #include #include #include #include #define VGA_MEM_BASE 0xa0000 #define VGA_MEM_SIZE 0x20000 #define PAT(x) (x) static const uint32_t mask16[16] = { PAT(0x00000000), PAT(0x000000ff), PAT(0x0000ff00), PAT(0x0000ffff), PAT(0x00ff0000), PAT(0x00ff00ff), PAT(0x00ffff00), PAT(0x00ffffff), PAT(0xff000000), PAT(0xff0000ff), PAT(0xff00ff00), PAT(0xff00ffff), PAT(0xffff0000), PAT(0xffff00ff), PAT(0xffffff00), PAT(0xffffffff), }; /* force some bits to zero */ static const uint8_t sr_mask[8] = { (uint8_t)~0xfc, (uint8_t)~0xc2, (uint8_t)~0xf0, (uint8_t)~0xc0, (uint8_t)~0xf1, (uint8_t)~0xff, (uint8_t)~0xff, (uint8_t)~0x00, }; static const uint8_t gr_mask[9] = { (uint8_t)~0xf0, /* 0x00 */ (uint8_t)~0xf0, /* 0x01 */ (uint8_t)~0xf0, /* 0x02 */ (uint8_t)~0xe0, /* 0x03 */ (uint8_t)~0xfc, /* 0x04 */ (uint8_t)~0x84, /* 0x05 */ (uint8_t)~0xf0, /* 0x06 */ (uint8_t)~0xf0, /* 0x07 */ (uint8_t)~0x00, /* 0x08 */ }; static uint8_t *vram_getb(struct hvm_hw_stdvga *s, unsigned int a) { struct page_info *pg = s->vram_page[(a >> 12) & 0x3f]; uint8_t *p = __map_domain_page(pg); return &p[a & 0xfff]; } static uint32_t *vram_getl(struct hvm_hw_stdvga *s, unsigned int a) { struct page_info *pg = s->vram_page[(a >> 10) & 0x3f]; uint32_t *p = __map_domain_page(pg); return &p[a & 0x3ff]; } static void vram_put(struct hvm_hw_stdvga *s, void *p) { unmap_domain_page(p); } static int stdvga_outb(uint64_t addr, uint8_t val) { struct hvm_hw_stdvga *s = ¤t->domain->arch.hvm_domain.stdvga; int rc = 1, prev_stdvga = s->stdvga; switch ( addr ) { case 0x3c4: /* sequencer address register */ s->sr_index = val; break; case 0x3c5: /* sequencer data register */ rc = (s->sr_index < sizeof(s->sr)); if ( rc ) s->sr[s->sr_index] = val & sr_mask[s->sr_index] ; break; case 0x3ce: /* graphics address register */ s->gr_index = val; break; case 0x3cf: /* graphics data register */ rc = (s->gr_index < sizeof(s->gr)); if ( rc ) s->gr[s->gr_index] = val & gr_mask[s->gr_index]; break; default: rc = 0; break; } /* When in standard vga mode, emulate here all writes to the vram buffer * so we can immediately satisfy reads without waiting for qemu. */ s->stdvga = (s->sr[7] == 0x00); if ( !prev_stdvga && s->stdvga ) { /* * (Re)start caching of video buffer. * XXX TODO: In case of a restart the cache could be unsynced. 
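 * (Unsynced because writes are only snooped into the local vram pages while caching is active;
 * anything the guest wrote while s->cache was clear went to qemu only.)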
*/ s->cache = 1; gdprintk(XENLOG_INFO, "entering stdvga and caching modes\n"); } else if ( prev_stdvga && !s->stdvga ) { gdprintk(XENLOG_INFO, "leaving stdvga\n"); } return rc; } static void stdvga_out(uint32_t port, uint32_t bytes, uint32_t val) { switch ( bytes ) { case 1: stdvga_outb(port, val); break; case 2: stdvga_outb(port + 0, val >> 0); stdvga_outb(port + 1, val >> 8); break; default: break; } } static int stdvga_intercept_pio( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct hvm_hw_stdvga *s = ¤t->domain->arch.hvm_domain.stdvga; if ( dir == IOREQ_WRITE ) { spin_lock(&s->lock); stdvga_out(port, bytes, *val); spin_unlock(&s->lock); } return X86EMUL_UNHANDLEABLE; /* propagate to external ioemu */ } static unsigned int stdvga_mem_offset( struct hvm_hw_stdvga *s, unsigned int mmio_addr) { unsigned int memory_map_mode = (s->gr[6] >> 2) & 3; unsigned int offset = mmio_addr & 0x1ffff; switch ( memory_map_mode ) { case 0: break; case 1: if ( offset >= 0x10000 ) goto fail; offset += 0; /* assume bank_offset == 0; */ break; case 2: offset -= 0x10000; if ( offset >= 0x8000 ) goto fail; break; default: case 3: offset -= 0x18000; if ( offset >= 0x8000 ) goto fail; break; } return offset; fail: return ~0u; } #define GET_PLANE(data, p) (((data) >> ((p) * 8)) & 0xff) static uint8_t stdvga_mem_readb(uint64_t addr) { struct hvm_hw_stdvga *s = ¤t->domain->arch.hvm_domain.stdvga; int plane; uint32_t ret, *vram_l; uint8_t *vram_b; addr = stdvga_mem_offset(s, addr); if ( addr == ~0u ) return 0xff; if ( s->sr[4] & 0x08 ) { /* chain 4 mode : simplest access */ vram_b = vram_getb(s, addr); ret = *vram_b; vram_put(s, vram_b); } else if ( s->gr[5] & 0x10 ) { /* odd/even mode (aka text mode mapping) */ plane = (s->gr[4] & 2) | (addr & 1); vram_b = vram_getb(s, ((addr & ~1) << 1) | plane); ret = *vram_b; vram_put(s, vram_b); } else { /* standard VGA latched access */ vram_l = vram_getl(s, addr); s->latch = *vram_l; vram_put(s, vram_l); if ( !(s->gr[5] & 0x08) ) { /* read mode 0 */ plane = s->gr[4]; ret = GET_PLANE(s->latch, plane); } else { /* read mode 1 */ ret = (s->latch ^ mask16[s->gr[2]]) & mask16[s->gr[7]]; ret |= ret >> 16; ret |= ret >> 8; ret = (~ret) & 0xff; } } return ret; } static uint64_t stdvga_mem_read(uint64_t addr, uint64_t size) { uint64_t data = 0; switch ( size ) { case 1: data = stdvga_mem_readb(addr); break; case 2: data = stdvga_mem_readb(addr); data |= stdvga_mem_readb(addr + 1) << 8; break; case 4: data = stdvga_mem_readb(addr); data |= stdvga_mem_readb(addr + 1) << 8; data |= stdvga_mem_readb(addr + 2) << 16; data |= stdvga_mem_readb(addr + 3) << 24; break; case 8: data = (uint64_t)(stdvga_mem_readb(addr)); data |= (uint64_t)(stdvga_mem_readb(addr + 1)) << 8; data |= (uint64_t)(stdvga_mem_readb(addr + 2)) << 16; data |= (uint64_t)(stdvga_mem_readb(addr + 3)) << 24; data |= (uint64_t)(stdvga_mem_readb(addr + 4)) << 32; data |= (uint64_t)(stdvga_mem_readb(addr + 5)) << 40; data |= (uint64_t)(stdvga_mem_readb(addr + 6)) << 48; data |= (uint64_t)(stdvga_mem_readb(addr + 7)) << 56; break; default: gdprintk(XENLOG_WARNING, "invalid io size: %"PRId64"\n", size); break; } return data; } static void stdvga_mem_writeb(uint64_t addr, uint32_t val) { struct hvm_hw_stdvga *s = ¤t->domain->arch.hvm_domain.stdvga; int plane, write_mode, b, func_select, mask; uint32_t write_mask, bit_mask, set_mask, *vram_l; uint8_t *vram_b; addr = stdvga_mem_offset(s, addr); if ( addr == ~0u ) return; if ( s->sr[4] & 0x08 ) { /* chain 4 mode : simplest access */ plane = addr & 3; mask = (1 << plane); 
if ( s->sr[2] & mask ) { vram_b = vram_getb(s, addr); *vram_b = val; vram_put(s, vram_b); } } else if ( s->gr[5] & 0x10 ) { /* odd/even mode (aka text mode mapping) */ plane = (s->gr[4] & 2) | (addr & 1); mask = (1 << plane); if ( s->sr[2] & mask ) { addr = ((addr & ~1) << 1) | plane; vram_b = vram_getb(s, addr); *vram_b = val; vram_put(s, vram_b); } } else { write_mode = s->gr[5] & 3; switch ( write_mode ) { default: case 0: /* rotate */ b = s->gr[3] & 7; val = ((val >> b) | (val << (8 - b))) & 0xff; val |= val << 8; val |= val << 16; /* apply set/reset mask */ set_mask = mask16[s->gr[1]]; val = (val & ~set_mask) | (mask16[s->gr[0]] & set_mask); bit_mask = s->gr[8]; break; case 1: val = s->latch; goto do_write; case 2: val = mask16[val & 0x0f]; bit_mask = s->gr[8]; break; case 3: /* rotate */ b = s->gr[3] & 7; val = (val >> b) | (val << (8 - b)); bit_mask = s->gr[8] & val; val = mask16[s->gr[0]]; break; } /* apply logical operation */ func_select = s->gr[3] >> 3; switch ( func_select ) { case 0: default: /* nothing to do */ break; case 1: /* and */ val &= s->latch; break; case 2: /* or */ val |= s->latch; break; case 3: /* xor */ val ^= s->latch; break; } /* apply bit mask */ bit_mask |= bit_mask << 8; bit_mask |= bit_mask << 16; val = (val & bit_mask) | (s->latch & ~bit_mask); do_write: /* mask data according to sr[2] */ mask = s->sr[2]; write_mask = mask16[mask]; vram_l = vram_getl(s, addr); *vram_l = (*vram_l & ~write_mask) | (val & write_mask); vram_put(s, vram_l); } } static void stdvga_mem_write(uint64_t addr, uint64_t data, uint64_t size) { /* Intercept mmio write */ switch ( size ) { case 1: stdvga_mem_writeb(addr, (data >> 0) & 0xff); break; case 2: stdvga_mem_writeb(addr+0, (data >> 0) & 0xff); stdvga_mem_writeb(addr+1, (data >> 8) & 0xff); break; case 4: stdvga_mem_writeb(addr+0, (data >> 0) & 0xff); stdvga_mem_writeb(addr+1, (data >> 8) & 0xff); stdvga_mem_writeb(addr+2, (data >> 16) & 0xff); stdvga_mem_writeb(addr+3, (data >> 24) & 0xff); break; case 8: stdvga_mem_writeb(addr+0, (data >> 0) & 0xff); stdvga_mem_writeb(addr+1, (data >> 8) & 0xff); stdvga_mem_writeb(addr+2, (data >> 16) & 0xff); stdvga_mem_writeb(addr+3, (data >> 24) & 0xff); stdvga_mem_writeb(addr+4, (data >> 32) & 0xff); stdvga_mem_writeb(addr+5, (data >> 40) & 0xff); stdvga_mem_writeb(addr+6, (data >> 48) & 0xff); stdvga_mem_writeb(addr+7, (data >> 56) & 0xff); break; default: gdprintk(XENLOG_WARNING, "invalid io size: %"PRId64"\n", size); break; } } static uint32_t read_data; static int mmio_move(struct hvm_hw_stdvga *s, ioreq_t *p) { int i; uint64_t addr = p->addr; p2m_type_t p2mt; struct domain *d = current->domain; if ( p->data_is_ptr ) { uint64_t data = p->data, tmp; int step = p->df ? -p->size : p->size; if ( p->dir == IOREQ_READ ) { for ( i = 0; i < p->count; i++ ) { tmp = stdvga_mem_read(addr, p->size); if ( hvm_copy_to_guest_phys(data, &tmp, p->size) != HVMCOPY_okay ) { struct page_info *dp = get_page_from_gfn( d, data >> PAGE_SHIFT, &p2mt, P2M_ALLOC); /* * The only case we handle is vga_mem <-> vga_mem. * Anything else disables caching and leaves it to qemu-dm. 
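 * i.e. p->data must itself lie inside [VGA_MEM_BASE, VGA_MEM_BASE + VGA_MEM_SIZE) and still be of type
 * p2m_mmio_dm; if get_page_from_gfn() hands back a real page the target is ordinary RAM and we give up
 * on caching.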
*/ if ( (p2mt != p2m_mmio_dm) || (data < VGA_MEM_BASE) || ((data + p->size) > (VGA_MEM_BASE + VGA_MEM_SIZE)) ) { if ( dp ) put_page(dp); return 0; } ASSERT(!dp); stdvga_mem_write(data, tmp, p->size); } data += step; addr += step; } } else { for ( i = 0; i < p->count; i++ ) { if ( hvm_copy_from_guest_phys(&tmp, data, p->size) != HVMCOPY_okay ) { struct page_info *dp = get_page_from_gfn( d, data >> PAGE_SHIFT, &p2mt, P2M_ALLOC); if ( (p2mt != p2m_mmio_dm) || (data < VGA_MEM_BASE) || ((data + p->size) > (VGA_MEM_BASE + VGA_MEM_SIZE)) ) { if ( dp ) put_page(dp); return 0; } ASSERT(!dp); tmp = stdvga_mem_read(data, p->size); } stdvga_mem_write(addr, tmp, p->size); data += step; addr += step; } } } else { ASSERT(p->count == 1); if ( p->dir == IOREQ_READ ) p->data = stdvga_mem_read(addr, p->size); else stdvga_mem_write(addr, p->data, p->size); } read_data = p->data; return 1; } static int stdvga_intercept_mmio(ioreq_t *p) { struct domain *d = current->domain; struct hvm_hw_stdvga *s = &d->arch.hvm_domain.stdvga; int buf = 0, rc; if ( p->size > 8 ) { gdprintk(XENLOG_WARNING, "invalid mmio size %d\n", (int)p->size); return X86EMUL_UNHANDLEABLE; } spin_lock(&s->lock); if ( s->stdvga && s->cache ) { switch ( p->type ) { case IOREQ_TYPE_COPY: buf = mmio_move(s, p); if ( !buf ) s->cache = 0; break; default: gdprintk(XENLOG_WARNING, "unsupported mmio request type:%d " "addr:0x%04x data:0x%04x size:%d count:%d state:%d " "isptr:%d dir:%d df:%d\n", p->type, (int)p->addr, (int)p->data, (int)p->size, (int)p->count, p->state, p->data_is_ptr, p->dir, p->df); s->cache = 0; } } else { buf = (p->dir == IOREQ_WRITE); } rc = (buf && hvm_buffered_io_send(p)); spin_unlock(&s->lock); return rc ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; } void stdvga_init(struct domain *d) { struct hvm_hw_stdvga *s = &d->arch.hvm_domain.stdvga; struct page_info *pg; void *p; int i; memset(s, 0, sizeof(*s)); spin_lock_init(&s->lock); for ( i = 0; i != ARRAY_SIZE(s->vram_page); i++ ) { pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); if ( pg == NULL ) break; s->vram_page[i] = pg; p = __map_domain_page(pg); clear_page(p); unmap_domain_page(p); } if ( i == ARRAY_SIZE(s->vram_page) ) { /* Sequencer registers. */ register_portio_handler(d, 0x3c4, 2, stdvga_intercept_pio); /* Graphics registers. */ register_portio_handler(d, 0x3ce, 2, stdvga_intercept_pio); /* MMIO. */ register_buffered_io_handler( d, VGA_MEM_BASE, VGA_MEM_SIZE, stdvga_intercept_mmio); } } void stdvga_deinit(struct domain *d) { struct hvm_hw_stdvga *s = &d->arch.hvm_domain.stdvga; int i; for ( i = 0; i != ARRAY_SIZE(s->vram_page); i++ ) { if ( s->vram_page[i] == NULL ) continue; free_domheap_page(s->vram_page[i]); s->vram_page[i] = NULL; } } xen-4.4.0/xen/arch/x86/hvm/hvm.c0000664000175000017500000041534412307313555014342 0ustar smbsmb/* * hvm.c: Common hardware virtual machine abstractions. * * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005, International Business Machines Corporation. * Copyright (c) 2008, Citrix Systems, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
* * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include bool_t __read_mostly hvm_enabled; unsigned int opt_hvm_debug_level __read_mostly; integer_param("hvm_debug", opt_hvm_debug_level); struct hvm_function_table hvm_funcs __read_mostly; /* I/O permission bitmap is globally shared by all HVM guests. */ unsigned long __attribute__ ((__section__ (".bss.page_aligned"))) hvm_io_bitmap[3*PAGE_SIZE/BYTES_PER_LONG]; /* Xen command-line option to enable HAP */ static bool_t __initdata opt_hap_enabled = 1; boolean_param("hap", opt_hap_enabled); static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; switch ( action ) { case CPU_UP_PREPARE: rc = hvm_funcs.cpu_up_prepare(cpu); break; case CPU_DYING: hvm_cpu_down(); break; case CPU_UP_CANCELED: case CPU_DEAD: hvm_funcs.cpu_dead(cpu); break; default: break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; static int __init hvm_enable(void) { const struct hvm_function_table *fns = NULL; if ( cpu_has_vmx ) fns = start_vmx(); else if ( cpu_has_svm ) fns = start_svm(); if ( fns == NULL ) return 0; hvm_funcs = *fns; hvm_enabled = 1; printk("HVM: %s enabled\n", fns->name); if ( !fns->hap_supported ) printk("HVM: Hardware Assisted Paging (HAP) not detected\n"); else if ( !opt_hap_enabled ) { hvm_funcs.hap_supported = 0; printk("HVM: Hardware Assisted Paging (HAP) detected but disabled\n"); } else { printk("HVM: Hardware Assisted Paging (HAP) detected\n"); printk("HVM: HAP page sizes: 4kB"); if ( fns->hap_capabilities & HVM_HAP_SUPERPAGE_2MB ) printk(", 2MB%s", opt_hap_2mb ? "" : " [disabled]"); if ( fns->hap_capabilities & HVM_HAP_SUPERPAGE_1GB ) printk(", 1GB%s", opt_hap_1gb ? "" : " [disabled]"); printk("\n"); } if ( !fns->pvh_supported ) printk(XENLOG_INFO "HVM: PVH mode not supported on this platform\n"); /* * Allow direct access to the PC debug ports 0x80 and 0xed (they are * often used for I/O delays, but the vmexits simply slow things down). */ memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap)); if ( hvm_port80_allowed ) __clear_bit(0x80, hvm_io_bitmap); __clear_bit(0xed, hvm_io_bitmap); register_cpu_notifier(&cpu_nfb); return 0; } presmp_initcall(hvm_enable); /* * Need to re-inject a given event? We avoid re-injecting software exceptions * and interrupts because the faulting/trapping instruction can simply be * re-executed (neither VMX nor SVM update RIP when they VMEXIT during * INT3/INTO/INTn). */ int hvm_event_needs_reinjection(uint8_t type, uint8_t vector) { switch ( type ) { case X86_EVENTTYPE_EXT_INTR: case X86_EVENTTYPE_NMI: return 1; case X86_EVENTTYPE_HW_EXCEPTION: /* * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly * check for these vectors, as they are really SW Exceptions. SVM has * not updated RIP to point after the trapping instruction (INT3/INTO). 
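 * (Vector 3 is #BP raised by INT3 and vector 4 is #OF raised by INTO, hence the two explicit
 * exclusions below.)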
*/ return (vector != 3) && (vector != 4); default: /* Software exceptions/interrupts can be re-executed (e.g., INT n). */ break; } return 0; } /* * Combine two hardware exceptions: @vec2 was raised during delivery of @vec1. * This means we can assume that @vec2 is contributory or a page fault. */ uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2) { /* Exception during double-fault delivery always causes a triple fault. */ if ( vec1 == TRAP_double_fault ) { hvm_triple_fault(); return TRAP_double_fault; /* dummy return */ } /* Exception during page-fault delivery always causes a double fault. */ if ( vec1 == TRAP_page_fault ) return TRAP_double_fault; /* Discard the first exception if it's benign or if we now have a #PF. */ if ( !((1u << vec1) & 0x7c01u) || (vec2 == TRAP_page_fault) ) return vec2; /* Cannot combine the exceptions: double fault. */ return TRAP_double_fault; } void hvm_set_rdtsc_exiting(struct domain *d, bool_t enable) { struct vcpu *v; for_each_vcpu ( d, v ) hvm_funcs.set_rdtsc_exiting(v, enable); } void hvm_get_guest_pat(struct vcpu *v, u64 *guest_pat) { if ( !hvm_funcs.get_guest_pat(v, guest_pat) ) *guest_pat = v->arch.hvm_vcpu.pat_cr; } int hvm_set_guest_pat(struct vcpu *v, u64 guest_pat) { int i; uint8_t *value = (uint8_t *)&guest_pat; for ( i = 0; i < 8; i++ ) if ( unlikely(!(value[i] == 0 || value[i] == 1 || value[i] == 4 || value[i] == 5 || value[i] == 6 || value[i] == 7)) ) { HVM_DBG_LOG(DBG_LEVEL_MSR, "invalid guest PAT: %"PRIx64"\n", guest_pat); return 0; } if ( !hvm_funcs.set_guest_pat(v, guest_pat) ) v->arch.hvm_vcpu.pat_cr = guest_pat; return 1; } void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc) { uint64_t tsc; uint64_t delta_tsc; if ( v->domain->arch.vtsc ) { tsc = hvm_get_guest_time(v); tsc = gtime_to_gtsc(v->domain, tsc); } else { rdtscll(tsc); } delta_tsc = guest_tsc - tsc; v->arch.hvm_vcpu.msr_tsc_adjust += delta_tsc - v->arch.hvm_vcpu.cache_tsc_offset; v->arch.hvm_vcpu.cache_tsc_offset = delta_tsc; hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset); } void hvm_set_guest_tsc_adjust(struct vcpu *v, u64 tsc_adjust) { v->arch.hvm_vcpu.cache_tsc_offset += tsc_adjust - v->arch.hvm_vcpu.msr_tsc_adjust; hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset); v->arch.hvm_vcpu.msr_tsc_adjust = tsc_adjust; } u64 hvm_get_guest_tsc(struct vcpu *v) { uint64_t tsc; if ( v->domain->arch.vtsc ) { tsc = hvm_get_guest_time(v); tsc = gtime_to_gtsc(v->domain, tsc); v->domain->arch.vtsc_kerncount++; } else { rdtscll(tsc); } return tsc + v->arch.hvm_vcpu.cache_tsc_offset; } u64 hvm_get_guest_tsc_adjust(struct vcpu *v) { return v->arch.hvm_vcpu.msr_tsc_adjust; } void hvm_migrate_timers(struct vcpu *v) { /* PVH doesn't use rtc and emulated timers, it uses pvclock mechanism. 
*/ if ( is_pvh_vcpu(v) ) return; rtc_migrate_timers(v); pt_migrate(v); } static int hvm_migrate_pirq(struct domain *d, struct hvm_pirq_dpci *pirq_dpci, void *arg) { struct vcpu *v = arg; if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) && (pirq_dpci->gmsi.dest_vcpu_id == v->vcpu_id) ) { struct irq_desc *desc = pirq_spin_lock_irq_desc(dpci_pirq(pirq_dpci), NULL); if ( !desc ) return 0; ASSERT(MSI_IRQ(desc - irq_desc)); irq_set_affinity(desc, cpumask_of(v->processor)); spin_unlock_irq(&desc->lock); } return 0; } void hvm_migrate_pirqs(struct vcpu *v) { struct domain *d = v->domain; if ( !iommu_enabled || !d->arch.hvm_domain.irq.dpci ) return; spin_lock(&d->event_lock); pt_pirq_iterate(d, hvm_migrate_pirq, v); spin_unlock(&d->event_lock); } void hvm_do_resume(struct vcpu *v) { ioreq_t *p; check_wakeup_from_wait(); if ( is_hvm_vcpu(v) ) pt_restore_timer(v); /* NB. Optimised for common case (p->state == STATE_IOREQ_NONE). */ if ( !(p = get_ioreq(v)) ) goto check_inject_trap; while ( p->state != STATE_IOREQ_NONE ) { switch ( p->state ) { case STATE_IORESP_READY: /* IORESP_READY -> NONE */ hvm_io_assist(p); break; case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */ case STATE_IOREQ_INPROCESS: wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port, (p->state != STATE_IOREQ_READY) && (p->state != STATE_IOREQ_INPROCESS)); break; default: gdprintk(XENLOG_ERR, "Weird HVM iorequest state %d.\n", p->state); domain_crash(v->domain); return; /* bail */ } } check_inject_trap: /* Inject pending hw/sw trap */ if ( v->arch.hvm_vcpu.inject_trap.vector != -1 ) { hvm_inject_trap(&v->arch.hvm_vcpu.inject_trap); v->arch.hvm_vcpu.inject_trap.vector = -1; } } static void hvm_init_ioreq_page( struct domain *d, struct hvm_ioreq_page *iorp) { memset(iorp, 0, sizeof(*iorp)); spin_lock_init(&iorp->lock); domain_pause(d); } void destroy_ring_for_helper( void **_va, struct page_info *page) { void *va = *_va; if ( va != NULL ) { unmap_domain_page_global(va); put_page_and_type(page); *_va = NULL; } } static void hvm_destroy_ioreq_page( struct domain *d, struct hvm_ioreq_page *iorp) { spin_lock(&iorp->lock); ASSERT(d->is_dying); destroy_ring_for_helper(&iorp->va, iorp->page); spin_unlock(&iorp->lock); } int prepare_ring_for_helper( struct domain *d, unsigned long gmfn, struct page_info **_page, void **_va) { struct page_info *page; p2m_type_t p2mt; void *va; page = get_page_from_gfn(d, gmfn, &p2mt, P2M_UNSHARE); if ( p2m_is_paging(p2mt) ) { if ( page ) put_page(page); p2m_mem_paging_populate(d, gmfn); return -ENOENT; } if ( p2m_is_shared(p2mt) ) { if ( page ) put_page(page); return -ENOENT; } if ( !page ) return -EINVAL; if ( !get_page_type(page, PGT_writable_page) ) { put_page(page); return -EINVAL; } va = __map_domain_page_global(page); if ( va == NULL ) { put_page_and_type(page); return -ENOMEM; } *_va = va; *_page = page; return 0; } static int hvm_set_ioreq_page( struct domain *d, struct hvm_ioreq_page *iorp, unsigned long gmfn) { struct page_info *page; void *va; int rc; if ( (rc = prepare_ring_for_helper(d, gmfn, &page, &va)) ) return rc; spin_lock(&iorp->lock); if ( (iorp->va != NULL) || d->is_dying ) { destroy_ring_for_helper(&iorp->va, iorp->page); spin_unlock(&iorp->lock); return -EINVAL; } iorp->va = va; iorp->page = page; spin_unlock(&iorp->lock); domain_unpause(d); return 0; } static int hvm_print_line( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct domain *cd = current->domain; char c = *val; BUG_ON(bytes != 1); /* Accept only printable characters, newline, and horizontal tab. 
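 * Characters are accumulated in the per-domain pbuf[] and flushed as a single guest_printk() line when
 * a newline arrives or the buffer is about to overflow (DOMAIN_PBUF_SIZE - 1 characters).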
*/ if ( !isprint(c) && (c != '\n') && (c != '\t') ) return X86EMUL_OKAY; spin_lock(&cd->pbuf_lock); if ( c != '\n' ) cd->pbuf[cd->pbuf_idx++] = c; if ( (cd->pbuf_idx == (DOMAIN_PBUF_SIZE - 1)) || (c == '\n') ) { cd->pbuf[cd->pbuf_idx] = '\0'; guest_printk(cd, XENLOG_G_DEBUG "%s\n", cd->pbuf); cd->pbuf_idx = 0; } spin_unlock(&cd->pbuf_lock); return X86EMUL_OKAY; } static int handle_pvh_io( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct vcpu *curr = current; struct cpu_user_regs *regs = guest_cpu_user_regs(); if ( dir == IOREQ_WRITE ) guest_io_write(port, bytes, *val, curr, regs); else *val = guest_io_read(port, bytes, curr, regs); return X86EMUL_OKAY; } int hvm_domain_initialise(struct domain *d) { int rc; if ( !hvm_enabled ) { gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest " "on a non-VT/AMDV platform.\n"); return -EINVAL; } if ( is_pvh_domain(d) ) { if ( !hvm_funcs.pvh_supported ) { printk(XENLOG_G_WARNING "Attempt to create a PVH guest " "on a system without necessary hardware support\n"); return -EINVAL; } if ( !hap_enabled(d) ) { printk(XENLOG_G_INFO "PVH guest must have HAP on\n"); return -EINVAL; } } spin_lock_init(&d->arch.hvm_domain.irq_lock); spin_lock_init(&d->arch.hvm_domain.uc_lock); INIT_LIST_HEAD(&d->arch.hvm_domain.msixtbl_list); spin_lock_init(&d->arch.hvm_domain.msixtbl_list_lock); hvm_init_cacheattr_region_list(d); rc = paging_enable(d, PG_refcounts|PG_translate|PG_external); if ( rc != 0 ) goto fail0; d->arch.hvm_domain.params = xzalloc_array(uint64_t, HVM_NR_PARAMS); d->arch.hvm_domain.io_handler = xmalloc(struct hvm_io_handler); rc = -ENOMEM; if ( !d->arch.hvm_domain.params || !d->arch.hvm_domain.io_handler ) goto fail1; d->arch.hvm_domain.io_handler->num_slot = 0; if ( is_pvh_domain(d) ) { register_portio_handler(d, 0, 0x10003, handle_pvh_io); return 0; } hvm_init_guest_time(d); d->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] = 1; d->arch.hvm_domain.params[HVM_PARAM_TRIPLE_FAULT_REASON] = SHUTDOWN_reboot; vpic_init(d); rc = vioapic_init(d); if ( rc != 0 ) goto fail1; stdvga_init(d); rtc_init(d); hvm_init_ioreq_page(d, &d->arch.hvm_domain.ioreq); hvm_init_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq); register_portio_handler(d, 0xe9, 1, hvm_print_line); rc = hvm_funcs.domain_initialise(d); if ( rc != 0 ) goto fail2; return 0; fail2: rtc_deinit(d); stdvga_deinit(d); vioapic_deinit(d); fail1: xfree(d->arch.hvm_domain.io_handler); xfree(d->arch.hvm_domain.params); fail0: hvm_destroy_cacheattr_region_list(d); return rc; } void hvm_domain_relinquish_resources(struct domain *d) { xfree(d->arch.hvm_domain.io_handler); xfree(d->arch.hvm_domain.params); if ( is_pvh_domain(d) ) return; if ( hvm_funcs.nhvm_domain_relinquish_resources ) hvm_funcs.nhvm_domain_relinquish_resources(d); hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.ioreq); hvm_destroy_ioreq_page(d, &d->arch.hvm_domain.buf_ioreq); msixtbl_pt_cleanup(d); /* Stop all asynchronous timer actions. 
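 * i.e. tear down the Xen timers backing the RTC, PIT, PM timer and HPET emulation so none of them
 * fires again for this dying domain.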
*/ rtc_deinit(d); if ( d->vcpu != NULL && d->vcpu[0] != NULL ) { pit_deinit(d); pmtimer_deinit(d); hpet_deinit(d); } } void hvm_domain_destroy(struct domain *d) { hvm_destroy_cacheattr_region_list(d); if ( is_pvh_domain(d) ) return; hvm_funcs.domain_destroy(d); rtc_deinit(d); stdvga_deinit(d); vioapic_deinit(d); } static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct hvm_tsc_adjust ctxt; int err = 0; for_each_vcpu ( d, v ) { ctxt.tsc_adjust = v->arch.hvm_vcpu.msr_tsc_adjust; err = hvm_save_entry(TSC_ADJUST, v->vcpu_id, h, &ctxt); if ( err ) break; } return err; } static int hvm_load_tsc_adjust(struct domain *d, hvm_domain_context_t *h) { unsigned int vcpuid = hvm_load_instance(h); struct vcpu *v; struct hvm_tsc_adjust ctxt; if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n", d->domain_id, vcpuid); return -EINVAL; } if ( hvm_load_entry(TSC_ADJUST, h, &ctxt) != 0 ) return -EINVAL; v->arch.hvm_vcpu.msr_tsc_adjust = ctxt.tsc_adjust; return 0; } HVM_REGISTER_SAVE_RESTORE(TSC_ADJUST, hvm_save_tsc_adjust, hvm_load_tsc_adjust, 1, HVMSR_PER_VCPU); static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct hvm_hw_cpu ctxt; struct segment_register seg; for_each_vcpu ( d, v ) { /* We don't need to save state for a vcpu that is down; the restore * code will leave it down if there is nothing saved. */ if ( test_bit(_VPF_down, &v->pause_flags) ) continue; /* Architecture-specific vmcs/vmcb bits */ hvm_funcs.save_cpu_ctxt(v, &ctxt); ctxt.msr_tsc_aux = hvm_msr_tsc_aux(v); hvm_get_segment_register(v, x86_seg_idtr, &seg); ctxt.idtr_limit = seg.limit; ctxt.idtr_base = seg.base; hvm_get_segment_register(v, x86_seg_gdtr, &seg); ctxt.gdtr_limit = seg.limit; ctxt.gdtr_base = seg.base; hvm_get_segment_register(v, x86_seg_cs, &seg); ctxt.cs_sel = seg.sel; ctxt.cs_limit = seg.limit; ctxt.cs_base = seg.base; ctxt.cs_arbytes = seg.attr.bytes; hvm_get_segment_register(v, x86_seg_ds, &seg); ctxt.ds_sel = seg.sel; ctxt.ds_limit = seg.limit; ctxt.ds_base = seg.base; ctxt.ds_arbytes = seg.attr.bytes; hvm_get_segment_register(v, x86_seg_es, &seg); ctxt.es_sel = seg.sel; ctxt.es_limit = seg.limit; ctxt.es_base = seg.base; ctxt.es_arbytes = seg.attr.bytes; hvm_get_segment_register(v, x86_seg_ss, &seg); ctxt.ss_sel = seg.sel; ctxt.ss_limit = seg.limit; ctxt.ss_base = seg.base; ctxt.ss_arbytes = seg.attr.bytes; hvm_get_segment_register(v, x86_seg_fs, &seg); ctxt.fs_sel = seg.sel; ctxt.fs_limit = seg.limit; ctxt.fs_base = seg.base; ctxt.fs_arbytes = seg.attr.bytes; hvm_get_segment_register(v, x86_seg_gs, &seg); ctxt.gs_sel = seg.sel; ctxt.gs_limit = seg.limit; ctxt.gs_base = seg.base; ctxt.gs_arbytes = seg.attr.bytes; hvm_get_segment_register(v, x86_seg_tr, &seg); ctxt.tr_sel = seg.sel; ctxt.tr_limit = seg.limit; ctxt.tr_base = seg.base; ctxt.tr_arbytes = seg.attr.bytes; hvm_get_segment_register(v, x86_seg_ldtr, &seg); ctxt.ldtr_sel = seg.sel; ctxt.ldtr_limit = seg.limit; ctxt.ldtr_base = seg.base; ctxt.ldtr_arbytes = seg.attr.bytes; if ( v->fpu_initialised ) memcpy(ctxt.fpu_regs, v->arch.fpu_ctxt, sizeof(ctxt.fpu_regs)); else memset(ctxt.fpu_regs, 0, sizeof(ctxt.fpu_regs)); ctxt.rax = v->arch.user_regs.eax; ctxt.rbx = v->arch.user_regs.ebx; ctxt.rcx = v->arch.user_regs.ecx; ctxt.rdx = v->arch.user_regs.edx; ctxt.rbp = v->arch.user_regs.ebp; ctxt.rsi = v->arch.user_regs.esi; ctxt.rdi = v->arch.user_regs.edi; ctxt.rsp = v->arch.user_regs.esp; ctxt.rip = v->arch.user_regs.eip; 
ctxt.rflags = v->arch.user_regs.eflags; ctxt.r8 = v->arch.user_regs.r8; ctxt.r9 = v->arch.user_regs.r9; ctxt.r10 = v->arch.user_regs.r10; ctxt.r11 = v->arch.user_regs.r11; ctxt.r12 = v->arch.user_regs.r12; ctxt.r13 = v->arch.user_regs.r13; ctxt.r14 = v->arch.user_regs.r14; ctxt.r15 = v->arch.user_regs.r15; ctxt.dr0 = v->arch.debugreg[0]; ctxt.dr1 = v->arch.debugreg[1]; ctxt.dr2 = v->arch.debugreg[2]; ctxt.dr3 = v->arch.debugreg[3]; ctxt.dr6 = v->arch.debugreg[6]; ctxt.dr7 = v->arch.debugreg[7]; if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 ) return 1; } return 0; } static bool_t hvm_efer_valid(struct domain *d, uint64_t value, uint64_t efer_validbits) { if ( nestedhvm_enabled(d) && cpu_has_svm ) efer_validbits |= EFER_SVME; return !((value & ~efer_validbits) || ((sizeof(long) != 8) && (value & EFER_LME)) || (!cpu_has_svm && (value & EFER_SVME)) || (!cpu_has_nx && (value & EFER_NX)) || (!cpu_has_syscall && (value & EFER_SCE)) || (!cpu_has_lmsl && (value & EFER_LMSLE)) || (!cpu_has_ffxsr && (value & EFER_FFXSE)) || ((value & (EFER_LME|EFER_LMA)) == EFER_LMA)); } static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h) { int vcpuid; struct vcpu *v; struct hvm_hw_cpu ctxt; struct segment_register seg; uint64_t efer_validbits; /* Which vcpu is this? */ vcpuid = hvm_load_instance(h); if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%u has no vcpu%u\n", d->domain_id, vcpuid); return -EINVAL; } if ( hvm_load_entry(CPU, h, &ctxt) != 0 ) return -EINVAL; /* Sanity check some control registers. */ if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) || !(ctxt.cr0 & X86_CR0_ET) || ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) ) { printk(XENLOG_G_ERR "HVM%d restore: bad CR0 %#" PRIx64 "\n", d->domain_id, ctxt.cr0); return -EINVAL; } if ( ctxt.cr4 & HVM_CR4_GUEST_RESERVED_BITS(v) ) { printk(XENLOG_G_ERR "HVM%d restore: bad CR4 %#" PRIx64 "\n", d->domain_id, ctxt.cr4); return -EINVAL; } efer_validbits = EFER_FFXSE | EFER_LMSLE | EFER_LME | EFER_LMA | EFER_NX | EFER_SCE; if ( !hvm_efer_valid(d, ctxt.msr_efer, efer_validbits) ) { printk(XENLOG_G_ERR "HVM%d restore: bad EFER %#" PRIx64 "\n", d->domain_id, ctxt.msr_efer); return -EINVAL; } /* Older Xen versions used to save the segment arbytes directly * from the VMCS on Intel hosts. Detect this and rearrange them * into the struct segment_register format. 
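 * e.g. a VMX-style access-rights value of 0xc09b (G and D/B stored at bits 15:14, bits 11:8 zero) is
 * refolded to 0x0c9b, the packed layout that segment_register.attr.bytes expects.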
*/ #define UNFOLD_ARBYTES(_r) \ if ( (_r & 0xf000) && !(_r & 0x0f00) ) \ _r = ((_r & 0xff) | ((_r >> 4) & 0xf00)) UNFOLD_ARBYTES(ctxt.cs_arbytes); UNFOLD_ARBYTES(ctxt.ds_arbytes); UNFOLD_ARBYTES(ctxt.es_arbytes); UNFOLD_ARBYTES(ctxt.fs_arbytes); UNFOLD_ARBYTES(ctxt.gs_arbytes); UNFOLD_ARBYTES(ctxt.ss_arbytes); UNFOLD_ARBYTES(ctxt.tr_arbytes); UNFOLD_ARBYTES(ctxt.ldtr_arbytes); #undef UNFOLD_ARBYTES /* Architecture-specific vmcs/vmcb bits */ if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 ) return -EINVAL; v->arch.hvm_vcpu.msr_tsc_aux = ctxt.msr_tsc_aux; seg.limit = ctxt.idtr_limit; seg.base = ctxt.idtr_base; hvm_set_segment_register(v, x86_seg_idtr, &seg); seg.limit = ctxt.gdtr_limit; seg.base = ctxt.gdtr_base; hvm_set_segment_register(v, x86_seg_gdtr, &seg); seg.sel = ctxt.cs_sel; seg.limit = ctxt.cs_limit; seg.base = ctxt.cs_base; seg.attr.bytes = ctxt.cs_arbytes; hvm_set_segment_register(v, x86_seg_cs, &seg); seg.sel = ctxt.ds_sel; seg.limit = ctxt.ds_limit; seg.base = ctxt.ds_base; seg.attr.bytes = ctxt.ds_arbytes; hvm_set_segment_register(v, x86_seg_ds, &seg); seg.sel = ctxt.es_sel; seg.limit = ctxt.es_limit; seg.base = ctxt.es_base; seg.attr.bytes = ctxt.es_arbytes; hvm_set_segment_register(v, x86_seg_es, &seg); seg.sel = ctxt.ss_sel; seg.limit = ctxt.ss_limit; seg.base = ctxt.ss_base; seg.attr.bytes = ctxt.ss_arbytes; hvm_set_segment_register(v, x86_seg_ss, &seg); seg.sel = ctxt.fs_sel; seg.limit = ctxt.fs_limit; seg.base = ctxt.fs_base; seg.attr.bytes = ctxt.fs_arbytes; hvm_set_segment_register(v, x86_seg_fs, &seg); seg.sel = ctxt.gs_sel; seg.limit = ctxt.gs_limit; seg.base = ctxt.gs_base; seg.attr.bytes = ctxt.gs_arbytes; hvm_set_segment_register(v, x86_seg_gs, &seg); seg.sel = ctxt.tr_sel; seg.limit = ctxt.tr_limit; seg.base = ctxt.tr_base; seg.attr.bytes = ctxt.tr_arbytes; hvm_set_segment_register(v, x86_seg_tr, &seg); seg.sel = ctxt.ldtr_sel; seg.limit = ctxt.ldtr_limit; seg.base = ctxt.ldtr_base; seg.attr.bytes = ctxt.ldtr_arbytes; hvm_set_segment_register(v, x86_seg_ldtr, &seg); /* In case xsave-absent save file is restored on a xsave-capable host */ if ( cpu_has_xsave && !xsave_enabled(v) ) { struct xsave_struct *xsave_area = v->arch.xsave_area; memcpy(v->arch.xsave_area, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE; } else memcpy(v->arch.fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); v->arch.user_regs.eax = ctxt.rax; v->arch.user_regs.ebx = ctxt.rbx; v->arch.user_regs.ecx = ctxt.rcx; v->arch.user_regs.edx = ctxt.rdx; v->arch.user_regs.ebp = ctxt.rbp; v->arch.user_regs.esi = ctxt.rsi; v->arch.user_regs.edi = ctxt.rdi; v->arch.user_regs.esp = ctxt.rsp; v->arch.user_regs.eip = ctxt.rip; v->arch.user_regs.eflags = ctxt.rflags | X86_EFLAGS_MBS; v->arch.user_regs.r8 = ctxt.r8; v->arch.user_regs.r9 = ctxt.r9; v->arch.user_regs.r10 = ctxt.r10; v->arch.user_regs.r11 = ctxt.r11; v->arch.user_regs.r12 = ctxt.r12; v->arch.user_regs.r13 = ctxt.r13; v->arch.user_regs.r14 = ctxt.r14; v->arch.user_regs.r15 = ctxt.r15; v->arch.debugreg[0] = ctxt.dr0; v->arch.debugreg[1] = ctxt.dr1; v->arch.debugreg[2] = ctxt.dr2; v->arch.debugreg[3] = ctxt.dr3; v->arch.debugreg[6] = ctxt.dr6; v->arch.debugreg[7] = ctxt.dr7; v->arch.vgc_flags = VGCF_online; v->fpu_initialised = 1; /* Auxiliary processors should be woken immediately. 
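 * so mark it initialised, clear _VPF_down and wake it straight away rather than leaving it to wait for
 * a SIPI.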
*/ v->is_initialised = 1; clear_bit(_VPF_down, &v->pause_flags); vcpu_wake(v); return 0; } HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt, 1, HVMSR_PER_VCPU); #define HVM_CPU_XSAVE_SIZE(xcr0) (offsetof(struct hvm_hw_cpu_xsave, \ save_area) + \ xstate_ctxt_size(xcr0)) static int hvm_save_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct hvm_hw_cpu_xsave *ctxt; if ( !cpu_has_xsave ) return 0; /* do nothing */ for_each_vcpu ( d, v ) { unsigned int size = HVM_CPU_XSAVE_SIZE(v->arch.xcr0_accum); if ( !xsave_enabled(v) ) continue; if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, size) ) return 1; ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; h->cur += size; ctxt->xfeature_mask = xfeature_mask; ctxt->xcr0 = v->arch.xcr0; ctxt->xcr0_accum = v->arch.xcr0_accum; memcpy(&ctxt->save_area, v->arch.xsave_area, size - offsetof(struct hvm_hw_cpu_xsave, save_area)); } return 0; } static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) { unsigned int vcpuid, size; int err; struct vcpu *v; struct hvm_hw_cpu_xsave *ctxt; struct hvm_save_descriptor *desc; /* Which vcpu is this? */ vcpuid = hvm_load_instance(h); if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n", d->domain_id, vcpuid); return -EINVAL; } /* Fails since we can't restore an img saved on xsave-capable host. */ if ( !cpu_has_xsave ) return -EOPNOTSUPP; /* Customized checking for entry since our entry is of variable length */ desc = (struct hvm_save_descriptor *)&h->data[h->cur]; if ( sizeof (*desc) > h->size - h->cur) { printk(XENLOG_G_WARNING "HVM%d.%d restore: not enough data left to read xsave descriptor\n", d->domain_id, vcpuid); return -ENODATA; } if ( desc->length + sizeof (*desc) > h->size - h->cur) { printk(XENLOG_G_WARNING "HVM%d.%d restore: not enough data left to read %u xsave bytes\n", d->domain_id, vcpuid, desc->length); return -ENODATA; } if ( desc->length < offsetof(struct hvm_hw_cpu_xsave, save_area) + XSTATE_AREA_MIN_SIZE ) { printk(XENLOG_G_WARNING "HVM%d.%d restore mismatch: xsave length %u < %zu\n", d->domain_id, vcpuid, desc->length, offsetof(struct hvm_hw_cpu_xsave, save_area) + XSTATE_AREA_MIN_SIZE); return -EINVAL; } size = HVM_CPU_XSAVE_SIZE(xfeature_mask); if ( desc->length > size ) { printk(XENLOG_G_WARNING "HVM%d.%d restore mismatch: xsave length %u > %u\n", d->domain_id, vcpuid, desc->length, size); return -EOPNOTSUPP; } h->cur += sizeof (*desc); ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; h->cur += desc->length; err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum, ctxt->save_area.xsave_hdr.xstate_bv, ctxt->xfeature_mask); if ( err ) { printk(XENLOG_G_WARNING "HVM%d.%d restore: inconsistent xsave state (feat=%#"PRIx64 " accum=%#"PRIx64" xcr0=%#"PRIx64" bv=%#"PRIx64" err=%d)\n", d->domain_id, vcpuid, ctxt->xfeature_mask, ctxt->xcr0_accum, ctxt->xcr0, ctxt->save_area.xsave_hdr.xstate_bv, err); return err; } size = HVM_CPU_XSAVE_SIZE(ctxt->xcr0_accum); if ( desc->length > size ) { printk(XENLOG_G_WARNING "HVM%d.%d restore mismatch: xsave length %u > %u\n", d->domain_id, vcpuid, desc->length, size); return -EOPNOTSUPP; } /* Checking finished */ v->arch.xcr0 = ctxt->xcr0; v->arch.xcr0_accum = ctxt->xcr0_accum; if ( ctxt->xcr0_accum & XSTATE_NONLAZY ) v->arch.nonlazy_xstate_used = 1; memcpy(v->arch.xsave_area, &ctxt->save_area, desc->length - offsetof(struct hvm_hw_cpu_xsave, save_area)); return 0; } /* We need variable length data chunk for xsave area, hence 
customized * declaration other than HVM_REGISTER_SAVE_RESTORE. */ static int __init __hvm_register_CPU_XSAVE_save_and_restore(void) { hvm_register_savevm(CPU_XSAVE_CODE, "CPU_XSAVE", hvm_save_cpu_xsave_states, hvm_load_cpu_xsave_states, HVM_CPU_XSAVE_SIZE(xfeature_mask) + sizeof(struct hvm_save_descriptor), HVMSR_PER_VCPU); return 0; } __initcall(__hvm_register_CPU_XSAVE_save_and_restore); int hvm_vcpu_initialise(struct vcpu *v) { int rc; struct domain *d = v->domain; domid_t dm_domid; hvm_asid_flush_vcpu(v); spin_lock_init(&v->arch.hvm_vcpu.tm_lock); INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list); rc = hvm_vcpu_cacheattr_init(v); /* teardown: vcpu_cacheattr_destroy */ if ( rc != 0 ) goto fail1; /* NB: vlapic_init must be called before hvm_funcs.vcpu_initialise */ if ( is_hvm_vcpu(v) ) rc = vlapic_init(v); if ( rc != 0 ) /* teardown: vlapic_destroy */ goto fail2; if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 ) /* teardown: hvm_funcs.vcpu_destroy */ goto fail3; softirq_tasklet_init( &v->arch.hvm_vcpu.assert_evtchn_irq_tasklet, (void(*)(unsigned long))hvm_assert_evtchn_irq, (unsigned long)v); v->arch.hvm_vcpu.inject_trap.vector = -1; if ( is_pvh_vcpu(v) ) { v->arch.hvm_vcpu.hcall_64bit = 1; /* PVH 32bitfixme. */ /* This is for hvm_long_mode_enabled(v). */ v->arch.hvm_vcpu.guest_efer = EFER_SCE | EFER_LMA | EFER_LME; return 0; } rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */ if ( rc != 0 ) goto fail4; if ( nestedhvm_enabled(d) && (rc = nestedhvm_vcpu_initialise(v)) < 0 ) /* teardown: nestedhvm_vcpu_destroy */ goto fail5; dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; /* Create ioreq event channel. */ rc = alloc_unbound_xen_event_channel(v, dm_domid, NULL); /* teardown: none */ if ( rc < 0 ) goto fail6; /* Register ioreq event channel. */ v->arch.hvm_vcpu.xen_port = rc; if ( v->vcpu_id == 0 ) { /* Create bufioreq event channel. */ rc = alloc_unbound_xen_event_channel(v, dm_domid, NULL); /* teardown: none */ if ( rc < 0 ) goto fail6; d->arch.hvm_domain.params[HVM_PARAM_BUFIOREQ_EVTCHN] = rc; } spin_lock(&d->arch.hvm_domain.ioreq.lock); if ( d->arch.hvm_domain.ioreq.va != NULL ) get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port; spin_unlock(&d->arch.hvm_domain.ioreq.lock); if ( v->vcpu_id == 0 ) { /* NB. All these really belong in hvm_domain_initialise(). */ pit_init(v, cpu_khz); pmtimer_init(v); hpet_init(v); /* Init guest TSC to start from zero. */ hvm_set_guest_tsc(v, 0); /* Can start up without SIPI-SIPI or setvcpucontext domctl. */ v->is_initialised = 1; clear_bit(_VPF_down, &v->pause_flags); } return 0; fail6: nestedhvm_vcpu_destroy(v); fail5: free_compat_arg_xlat(v); fail4: hvm_funcs.vcpu_destroy(v); fail3: vlapic_destroy(v); fail2: hvm_vcpu_cacheattr_destroy(v); fail1: return rc; } void hvm_vcpu_destroy(struct vcpu *v) { nestedhvm_vcpu_destroy(v); free_compat_arg_xlat(v); tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet); hvm_vcpu_cacheattr_destroy(v); if ( is_hvm_vcpu(v) ) vlapic_destroy(v); hvm_funcs.vcpu_destroy(v); /* Event channel is already freed by evtchn_destroy(). */ /*free_xen_event_channel(v, v->arch.hvm_vcpu.xen_port);*/ } void hvm_vcpu_down(struct vcpu *v) { struct domain *d = v->domain; int online_count = 0; /* Doesn't halt us immediately, but we'll never return to guest context. */ set_bit(_VPF_down, &v->pause_flags); vcpu_sleep_nosync(v); /* Any other VCPUs online? ... */ domain_lock(d); for_each_vcpu ( d, v ) if ( !test_bit(_VPF_down, &v->pause_flags) ) online_count++; domain_unlock(d); /* ... Shut down the domain if not. 
*/ if ( online_count == 0 ) { gdprintk(XENLOG_INFO, "All CPUs offline -- powering off.\n"); domain_shutdown(d, SHUTDOWN_poweroff); } } bool_t hvm_send_assist_req(struct vcpu *v) { ioreq_t *p; if ( unlikely(!vcpu_start_shutdown_deferral(v)) ) return 0; /* implicitly bins the i/o operation */ if ( !(p = get_ioreq(v)) ) return 0; if ( unlikely(p->state != STATE_IOREQ_NONE) ) { /* This indicates a bug in the device model. Crash the domain. */ gdprintk(XENLOG_ERR, "Device model set bad IO state %d.\n", p->state); domain_crash(v->domain); return 0; } prepare_wait_on_xen_event_channel(v->arch.hvm_vcpu.xen_port); /* * Following happens /after/ blocking and setting up ioreq contents. * prepare_wait_on_xen_event_channel() is an implicit barrier. */ p->state = STATE_IOREQ_READY; notify_via_xen_event_channel(v->domain, v->arch.hvm_vcpu.xen_port); return 1; } void hvm_hlt(unsigned long rflags) { struct vcpu *curr = current; if ( hvm_event_pending(curr) ) return; /* * If we halt with interrupts disabled, that's a pretty sure sign that we * want to shut down. In a real processor, NMIs are the only way to break * out of this. */ if ( unlikely(!(rflags & X86_EFLAGS_IF)) ) return hvm_vcpu_down(curr); do_sched_op_compat(SCHEDOP_block, 0); HVMTRACE_1D(HLT, /* pending = */ vcpu_runnable(curr)); } void hvm_triple_fault(void) { struct vcpu *v = current; struct domain *d = v->domain; u8 reason = d->arch.hvm_domain.params[HVM_PARAM_TRIPLE_FAULT_REASON]; gdprintk(XENLOG_INFO, "Triple fault on VCPU%d - " "invoking HVM shutdown action %"PRIu8".\n", v->vcpu_id, reason); domain_shutdown(d, reason); } void hvm_inject_trap(struct hvm_trap *trap) { struct vcpu *curr = current; if ( nestedhvm_enabled(curr->domain) && !nestedhvm_vmswitch_in_progress(curr) && nestedhvm_vcpu_in_guestmode(curr) && nhvm_vmcx_guest_intercepts_trap( curr, trap->vector, trap->error_code) ) { enum nestedhvm_vmexits nsret; nsret = nhvm_vcpu_vmexit_trap(curr, trap); switch ( nsret ) { case NESTEDHVM_VMEXIT_DONE: case NESTEDHVM_VMEXIT_ERROR: /* L1 guest will crash L2 guest */ return; case NESTEDHVM_VMEXIT_HOST: case NESTEDHVM_VMEXIT_CONTINUE: case NESTEDHVM_VMEXIT_FATALERROR: default: gdprintk(XENLOG_ERR, "unexpected nestedhvm error %i\n", nsret); return; } } hvm_funcs.inject_trap(trap); } void hvm_inject_hw_exception(unsigned int trapnr, int errcode) { struct hvm_trap trap = { .vector = trapnr, .type = X86_EVENTTYPE_HW_EXCEPTION, .error_code = errcode }; hvm_inject_trap(&trap); } void hvm_inject_page_fault(int errcode, unsigned long cr2) { struct hvm_trap trap = { .vector = TRAP_page_fault, .type = X86_EVENTTYPE_HW_EXCEPTION, .error_code = errcode, .cr2 = cr2 }; hvm_inject_trap(&trap); } int hvm_hap_nested_page_fault(paddr_t gpa, bool_t gla_valid, unsigned long gla, bool_t access_r, bool_t access_w, bool_t access_x) { unsigned long gfn = gpa >> PAGE_SHIFT; p2m_type_t p2mt; p2m_access_t p2ma; mfn_t mfn; struct vcpu *v = current; struct p2m_domain *p2m; int rc, fall_through = 0, paged = 0; int sharing_enomem = 0; mem_event_request_t *req_ptr = NULL; /* On Nested Virtualization, walk the guest page table. * If this succeeds, all is fine. * If this fails, inject a nested page fault into the guest. */ if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) && nestedhvm_paging_mode_hap(v) ) { int rv; /* The vcpu is in guest mode and the l1 guest * uses hap. That means 'gpa' is in l2 guest * physical address space. * Fix the nested p2m or inject nested page fault * into l1 guest if not fixable. The algorithm is * the same as for shadow paging. 
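     * (Note that nestedhvm_hap_nested_page_fault() may rewrite 'gpa': in the
     * NESTEDHVM_PAGEFAULT_L0_ERROR case it has been translated from an L2 to
     * an L1 guest physical address, and the gfn is recomputed below before
     * the normal host p2m handling continues.)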
*/ rv = nestedhvm_hap_nested_page_fault(v, &gpa, access_r, access_w, access_x); switch (rv) { case NESTEDHVM_PAGEFAULT_DONE: case NESTEDHVM_PAGEFAULT_RETRY: return 1; case NESTEDHVM_PAGEFAULT_L1_ERROR: /* An error occured while translating gpa from * l2 guest address to l1 guest address. */ return 0; case NESTEDHVM_PAGEFAULT_INJECT: return -1; case NESTEDHVM_PAGEFAULT_MMIO: if ( !handle_mmio() ) hvm_inject_hw_exception(TRAP_gp_fault, 0); return 1; case NESTEDHVM_PAGEFAULT_L0_ERROR: /* gpa is now translated to l1 guest address, update gfn. */ gfn = gpa >> PAGE_SHIFT; break; } } /* For the benefit of 32-bit WinXP (& older Windows) on AMD CPUs, * a fast path for LAPIC accesses, skipping the p2m lookup. */ if ( !nestedhvm_vcpu_in_guestmode(v) && is_hvm_vcpu(v) && gfn == PFN_DOWN(vlapic_base_address(vcpu_vlapic(v))) ) { if ( !handle_mmio() ) hvm_inject_hw_exception(TRAP_gp_fault, 0); rc = 1; goto out; } p2m = p2m_get_hostp2m(v->domain); mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, P2M_ALLOC | (access_w ? P2M_UNSHARE : 0), NULL); /* Check access permissions first, then handle faults */ if ( mfn_x(mfn) != INVALID_MFN ) { int violation = 0; /* If the access is against the permissions, then send to mem_event */ switch (p2ma) { case p2m_access_n: case p2m_access_n2rwx: default: violation = access_r || access_w || access_x; break; case p2m_access_r: violation = access_w || access_x; break; case p2m_access_w: violation = access_r || access_x; break; case p2m_access_x: violation = access_r || access_w; break; case p2m_access_rx: case p2m_access_rx2rw: violation = access_w; break; case p2m_access_wx: violation = access_r; break; case p2m_access_rw: violation = access_x; break; case p2m_access_rwx: break; } if ( violation ) { if ( p2m_mem_access_check(gpa, gla_valid, gla, access_r, access_w, access_x, &req_ptr) ) { fall_through = 1; } else { /* Rights not promoted, vcpu paused, work here is done */ rc = 1; goto out_put_gfn; } } } /* * If this GFN is emulated MMIO or marked as read-only, pass the fault * to the mmio handler. */ if ( (p2mt == p2m_mmio_dm) || (access_w && (p2mt == p2m_ram_ro)) ) { put_gfn(p2m->domain, gfn); if ( !handle_mmio() ) hvm_inject_hw_exception(TRAP_gp_fault, 0); rc = 1; goto out; } /* Check if the page has been paged out */ if ( p2m_is_paged(p2mt) || (p2mt == p2m_ram_paging_out) ) paged = 1; /* Mem sharing: unshare the page and try again */ if ( access_w && (p2mt == p2m_ram_shared) ) { ASSERT(!p2m_is_nestedp2m(p2m)); sharing_enomem = (mem_sharing_unshare_page(p2m->domain, gfn, 0) < 0); rc = 1; goto out_put_gfn; } /* Spurious fault? PoD and log-dirty also take this path. */ if ( p2m_is_ram(p2mt) ) { /* * Page log dirty is always done with order 0. If this mfn resides in * a large page, we do not change other pages type within that large * page. */ if ( access_w ) { paging_mark_dirty(v->domain, mfn_x(mfn)); p2m_change_type(v->domain, gfn, p2m_ram_logdirty, p2m_ram_rw); } rc = 1; goto out_put_gfn; } /* Shouldn't happen: Maybe the guest was writing to a r/o grant mapping? */ if ( access_w && (p2mt == p2m_grant_map_ro) ) { gdprintk(XENLOG_WARNING, "trying to write to read-only grant mapping\n"); hvm_inject_hw_exception(TRAP_gp_fault, 0); rc = 1; goto out_put_gfn; } /* If we fell through, the vcpu will retry now that access restrictions have * been removed. It may fault again if the p2m entry type still requires so. * Otherwise, this is an error condition. 
*/ rc = fall_through; out_put_gfn: put_gfn(p2m->domain, gfn); out: /* All of these are delayed until we exit, since we might * sleep on event ring wait queues, and we must not hold * locks in such circumstance */ if ( paged ) p2m_mem_paging_populate(v->domain, gfn); if ( sharing_enomem ) { int rv; if ( (rv = mem_sharing_notify_enomem(v->domain, gfn, 1)) < 0 ) { gdprintk(XENLOG_ERR, "Domain %hu attempt to unshare " "gfn %lx, ENOMEM and no helper (rc %d)\n", v->domain->domain_id, gfn, rv); /* Crash the domain */ rc = 0; } } if ( req_ptr ) { mem_access_send_req(v->domain, req_ptr); xfree(req_ptr); } return rc; } int hvm_handle_xsetbv(u32 index, u64 new_bv) { struct segment_register sreg; hvm_get_segment_register(current, x86_seg_ss, &sreg); if ( sreg.attr.fields.dpl != 0 ) goto err; if ( handle_xsetbv(index, new_bv) ) goto err; return 0; err: hvm_inject_hw_exception(TRAP_gp_fault, 0); return -1; } int hvm_set_efer(uint64_t value) { struct vcpu *v = current; uint64_t efer_validbits; value &= ~EFER_LMA; efer_validbits = EFER_FFXSE | EFER_LMSLE | EFER_LME | EFER_NX | EFER_SCE; if ( !hvm_efer_valid(v->domain, value, efer_validbits) ) { gdprintk(XENLOG_WARNING, "Trying to set reserved bit in " "EFER: %#"PRIx64"\n", value); hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) && hvm_paging_enabled(v) ) { gdprintk(XENLOG_WARNING, "Trying to change EFER.LME with paging enabled\n"); hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } if ( nestedhvm_enabled(v->domain) && cpu_has_svm && ((value & EFER_SVME) == 0 ) && ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) ) { /* Cleared EFER.SVME: Flush all nestedp2m tables */ p2m_flush_nestedp2m(v->domain); nestedhvm_vcpu_reset(v); } value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA; v->arch.hvm_vcpu.guest_efer = value; hvm_update_guest_efer(v); return X86EMUL_OKAY; } /* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. 
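 * (This UC/no-fill machinery backs the CR0.CD emulation described in
 * hvm_set_cr0() below, which is only needed for guests with assigned
 * devices whose IOMMU cannot guarantee cache snooping.)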
*/ static bool_t domain_exit_uc_mode(struct vcpu *v) { struct domain *d = v->domain; struct vcpu *vs; for_each_vcpu ( d, vs ) { if ( (vs == v) || !vs->is_initialised ) continue; if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) || mtrr_pat_not_equal(vs, v) ) return 0; } return 1; } static void local_flush_cache(void *info) { wbinvd(); } static void hvm_set_uc_mode(struct vcpu *v, bool_t is_in_uc_mode) { v->domain->arch.hvm_domain.is_in_uc_mode = is_in_uc_mode; shadow_blow_tables_per_domain(v->domain); } int hvm_mov_to_cr(unsigned int cr, unsigned int gpr) { struct vcpu *curr = current; unsigned long val, *reg; if ( (reg = decode_register(gpr, guest_cpu_user_regs(), 0)) == NULL ) { gdprintk(XENLOG_ERR, "invalid gpr: %u\n", gpr); goto exit_and_crash; } val = *reg; HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(val)); HVM_DBG_LOG(DBG_LEVEL_1, "CR%u, value = %lx", cr, val); switch ( cr ) { case 0: return hvm_set_cr0(val); case 3: return hvm_set_cr3(val); case 4: return hvm_set_cr4(val); case 8: vlapic_set_reg(vcpu_vlapic(curr), APIC_TASKPRI, ((val & 0x0f) << 4)); break; default: gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr); goto exit_and_crash; } return X86EMUL_OKAY; exit_and_crash: domain_crash(curr->domain); return X86EMUL_UNHANDLEABLE; } int hvm_mov_from_cr(unsigned int cr, unsigned int gpr) { struct vcpu *curr = current; unsigned long val = 0, *reg; if ( (reg = decode_register(gpr, guest_cpu_user_regs(), 0)) == NULL ) { gdprintk(XENLOG_ERR, "invalid gpr: %u\n", gpr); goto exit_and_crash; } switch ( cr ) { case 0: case 2: case 3: case 4: val = curr->arch.hvm_vcpu.guest_cr[cr]; break; case 8: val = (vlapic_get_reg(vcpu_vlapic(curr), APIC_TASKPRI) & 0xf0) >> 4; break; default: gdprintk(XENLOG_ERR, "invalid cr: %u\n", cr); goto exit_and_crash; } *reg = val; HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(val)); HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%u, value = %lx", cr, val); return X86EMUL_OKAY; exit_and_crash: domain_crash(curr->domain); return X86EMUL_UNHANDLEABLE; } void hvm_shadow_handle_cd(struct vcpu *v, unsigned long value) { if ( value & X86_CR0_CD ) { /* Entering no fill cache mode. */ spin_lock(&v->domain->arch.hvm_domain.uc_lock); v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE; if ( !v->domain->arch.hvm_domain.is_in_uc_mode ) { domain_pause_nosync(v->domain); /* Flush physical caches. */ on_each_cpu(local_flush_cache, NULL, 1); hvm_set_uc_mode(v, 1); domain_unpause(v->domain); } spin_unlock(&v->domain->arch.hvm_domain.uc_lock); } else if ( !(value & X86_CR0_CD) && (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ) { /* Exit from no fill cache mode. */ spin_lock(&v->domain->arch.hvm_domain.uc_lock); v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE; if ( domain_exit_uc_mode(v) ) hvm_set_uc_mode(v, 0); spin_unlock(&v->domain->arch.hvm_domain.uc_lock); } } static void hvm_update_cr(struct vcpu *v, unsigned int cr, unsigned long value) { v->arch.hvm_vcpu.guest_cr[cr] = value; nestedhvm_set_cr(v, cr, value); hvm_update_guest_cr(v, cr); } int hvm_set_cr0(unsigned long value) { struct vcpu *v = current; unsigned long gfn, old_value = v->arch.hvm_vcpu.guest_cr[0]; struct page_info *page; HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value); if ( (u32)value != value ) { HVM_DBG_LOG(DBG_LEVEL_1, "Guest attempts to set upper 32 bits in CR0: %lx", value); goto gpf; } value &= ~HVM_CR0_GUEST_RESERVED_BITS; /* ET is reserved and should be always be 1. 
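     * (CR0.ET is hard-wired to 1 on every processor Xen supports, so it is
     * simply forced on here rather than being treated as a faulting bit.)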
*/ value |= X86_CR0_ET; if ( !nestedhvm_vmswitch_in_progress(v) && (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG ) goto gpf; /* A pvh is not expected to change to real mode. */ if ( is_pvh_vcpu(v) && (value & (X86_CR0_PE | X86_CR0_PG)) != (X86_CR0_PG | X86_CR0_PE) ) { printk(XENLOG_G_WARNING "PVH attempting to turn off PE/PG. CR0:%lx\n", value); goto gpf; } if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) ) { if ( v->arch.hvm_vcpu.guest_efer & EFER_LME ) { if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) && !nestedhvm_vmswitch_in_progress(v) ) { HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable"); goto gpf; } HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode"); v->arch.hvm_vcpu.guest_efer |= EFER_LMA; hvm_update_guest_efer(v); } if ( !paging_mode_hap(v->domain) ) { /* The guest CR3 must be pointing to the guest physical. */ gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT; page = get_page_from_gfn(v->domain, gfn, NULL, P2M_ALLOC); if ( !page ) { gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx\n", v->arch.hvm_vcpu.guest_cr[3]); domain_crash(v->domain); return X86EMUL_UNHANDLEABLE; } /* Now arch.guest_table points to machine physical. */ v->arch.guest_table = pagetable_from_page(page); HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx", v->arch.hvm_vcpu.guest_cr[3], page_to_mfn(page)); } } else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) ) { if ( hvm_pcid_enabled(v) ) { HVM_DBG_LOG(DBG_LEVEL_1, "Guest attempts to clear CR0.PG " "while CR4.PCIDE=1"); goto gpf; } /* When CR0.PG is cleared, LMA is cleared immediately. */ if ( hvm_long_mode_enabled(v) ) { v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA; hvm_update_guest_efer(v); } if ( !paging_mode_hap(v->domain) ) { put_page(pagetable_get_page(v->arch.guest_table)); v->arch.guest_table = pagetable_null(); } } /* * When cr0.cd setting * 1. For guest w/o VT-d, and for guest with VT-d but snooped, Xen need not * do anything, since hardware snoop mechanism has ensured cache coherency; * 2. For guest with VT-d but non-snooped, cache coherency cannot be * guaranteed by h/w so need emulate UC memory type to guest. */ if ( ((value ^ old_value) & X86_CR0_CD) && has_arch_pdevs(v->domain) && iommu_enabled && !iommu_snoop && hvm_funcs.handle_cd ) hvm_funcs.handle_cd(v, value); hvm_update_cr(v, 0, value); if ( (value ^ old_value) & X86_CR0_PG ) { if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) ) paging_update_nestedmode(v); else paging_update_paging_modes(v); } return X86EMUL_OKAY; gpf: hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } int hvm_set_cr3(unsigned long value) { struct vcpu *v = current; struct page_info *page; unsigned long old; if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) && (value != v->arch.hvm_vcpu.guest_cr[3]) ) { /* Shadow-mode CR3 change. Check PDBR and update refcounts. 
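         * (get_page_from_gfn() below takes a reference on the new top-level
         * pagetable page, and the reference held on the previous one is
         * dropped, so v->arch.guest_table stays consistently refcounted.)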
*/ HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value); page = get_page_from_gfn(v->domain, value >> PAGE_SHIFT, NULL, P2M_ALLOC); if ( !page ) goto bad_cr3; put_page(pagetable_get_page(v->arch.guest_table)); v->arch.guest_table = pagetable_from_page(page); HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value); } old=v->arch.hvm_vcpu.guest_cr[3]; v->arch.hvm_vcpu.guest_cr[3] = value; paging_update_cr3(v); hvm_memory_event_cr3(value, old); return X86EMUL_OKAY; bad_cr3: gdprintk(XENLOG_ERR, "Invalid CR3\n"); domain_crash(v->domain); return X86EMUL_UNHANDLEABLE; } int hvm_set_cr4(unsigned long value) { struct vcpu *v = current; unsigned long old_cr; if ( value & HVM_CR4_GUEST_RESERVED_BITS(v) ) { HVM_DBG_LOG(DBG_LEVEL_1, "Guest attempts to set reserved bit in CR4: %lx", value); goto gpf; } if ( !(value & X86_CR4_PAE) && hvm_long_mode_enabled(v) ) { HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while " "EFER.LMA is set"); goto gpf; } old_cr = v->arch.hvm_vcpu.guest_cr[4]; if ( (value & X86_CR4_PCIDE) && !(old_cr & X86_CR4_PCIDE) && (!hvm_long_mode_enabled(v) || (v->arch.hvm_vcpu.guest_cr[3] & 0xfff)) ) { HVM_DBG_LOG(DBG_LEVEL_1, "Guest attempts to change CR4.PCIDE from " "0 to 1 while either EFER.LMA=0 or CR3[11:0]!=000H"); goto gpf; } hvm_update_cr(v, 4, value); hvm_memory_event_cr4(value, old_cr); /* * Modifying CR4.{PSE,PAE,PGE,SMEP}, or clearing CR4.PCIDE * invalidate all TLB entries. */ if ( ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE | X86_CR4_SMEP)) || (!(value & X86_CR4_PCIDE) && (old_cr & X86_CR4_PCIDE)) ) { if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) ) paging_update_nestedmode(v); else paging_update_paging_modes(v); } return X86EMUL_OKAY; gpf: hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } int hvm_virtual_to_linear_addr( enum x86_segment seg, struct segment_register *reg, unsigned long offset, unsigned int bytes, enum hvm_access_type access_type, unsigned int addr_size, unsigned long *linear_addr) { unsigned long addr = offset, last_byte; if ( !(current->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) ) { /* * REAL MODE: Don't bother with segment access checks. * Certain of them are not done in native real mode anyway. */ addr = (uint32_t)(addr + reg->base); last_byte = (uint32_t)addr + bytes - 1; if ( last_byte < addr ) return 0; } else if ( addr_size != 64 ) { /* * COMPATIBILITY MODE: Apply segment checks and add base. */ switch ( access_type ) { case hvm_access_read: if ( (reg->attr.fields.type & 0xa) == 0x8 ) return 0; /* execute-only code segment */ break; case hvm_access_write: if ( (reg->attr.fields.type & 0xa) != 0x2 ) return 0; /* not a writable data segment */ break; default: break; } last_byte = (uint32_t)offset + bytes - 1; /* Is this a grows-down data segment? Special limit check if so. */ if ( (reg->attr.fields.type & 0xc) == 0x4 ) { /* Is upper limit 0xFFFF or 0xFFFFFFFF? */ if ( !reg->attr.fields.db ) last_byte = (uint16_t)last_byte; /* Check first byte and last byte against respective bounds. */ if ( (offset <= reg->limit) || (last_byte < offset) ) return 0; } else if ( (last_byte > reg->limit) || (last_byte < offset) ) return 0; /* last byte is beyond limit or wraps 0xFFFFFFFF */ /* * Hardware truncates to 32 bits in compatibility mode. * It does not truncate to 16 bits in 16-bit address-size mode. */ addr = (uint32_t)(addr + reg->base); } else { /* * LONG MODE: FS and GS add segment base. Addresses must be canonical. 
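         * (Canonical here means bits 63:47 are a sign-extension of bit 47 on
         * current 48-bit implementations; is_canonical_address() is applied
         * to both the first and the last byte of the access.)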
*/ if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) ) addr += reg->base; last_byte = addr + bytes - 1; if ( !is_canonical_address(addr) || last_byte < addr || !is_canonical_address(last_byte) ) return 0; } *linear_addr = addr; return 1; } /* On non-NULL return, we leave this function holding an additional * ref on the underlying mfn, if any */ static void *__hvm_map_guest_frame(unsigned long gfn, bool_t writable, bool_t permanent) { void *map; p2m_type_t p2mt; struct page_info *page; struct domain *d = current->domain; page = get_page_from_gfn(d, gfn, &p2mt, writable ? P2M_UNSHARE : P2M_ALLOC); if ( (p2m_is_shared(p2mt) && writable) || !page ) { if ( page ) put_page(page); return NULL; } if ( p2m_is_paging(p2mt) ) { put_page(page); p2m_mem_paging_populate(d, gfn); return NULL; } if ( writable ) paging_mark_dirty(d, page_to_mfn(page)); if ( !permanent ) return __map_domain_page(page); map = __map_domain_page_global(page); if ( !map ) put_page(page); return map; } void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent) { return __hvm_map_guest_frame(gfn, 1, permanent); } void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent) { return __hvm_map_guest_frame(gfn, 0, permanent); } void hvm_unmap_guest_frame(void *p, bool_t permanent) { unsigned long mfn; if ( !p ) return; mfn = domain_page_map_to_mfn(p); if ( !permanent ) unmap_domain_page(p); else unmap_domain_page_global(p); put_page(mfn_to_page(mfn)); } static void *hvm_map_entry(unsigned long va) { unsigned long gfn; uint32_t pfec; char *v; if ( ((va & ~PAGE_MASK) + 8) > PAGE_SIZE ) { gdprintk(XENLOG_ERR, "Descriptor table entry " "straddles page boundary\n"); goto fail; } /* * We're mapping on behalf of the segment-load logic, which might write * the accessed flags in the descriptors (in 32-bit mode), but we still * treat it as a kernel-mode read (i.e. no access checks). */ pfec = PFEC_page_present; gfn = paging_gva_to_gfn(current, va, &pfec); if ( (pfec == PFEC_page_paged) || (pfec == PFEC_page_shared) ) goto fail; v = hvm_map_guest_frame_rw(gfn, 0); if ( v == NULL ) goto fail; return v + (va & ~PAGE_MASK); fail: domain_crash(current->domain); return NULL; } static void hvm_unmap_entry(void *p) { hvm_unmap_guest_frame(p, 0); } static int hvm_load_segment_selector( enum x86_segment seg, uint16_t sel) { struct segment_register desctab, cs, segr; struct desc_struct *pdesc, desc; u8 dpl, rpl, cpl; int fault_type = TRAP_invalid_tss; struct cpu_user_regs *regs = guest_cpu_user_regs(); struct vcpu *v = current; if ( regs->eflags & X86_EFLAGS_VM ) { segr.sel = sel; segr.base = (uint32_t)sel << 4; segr.limit = 0xffffu; segr.attr.bytes = 0xf3; hvm_set_segment_register(v, seg, &segr); return 0; } /* NULL selector? */ if ( (sel & 0xfffc) == 0 ) { if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) ) goto fail; memset(&segr, 0, sizeof(segr)); hvm_set_segment_register(v, seg, &segr); return 0; } /* LDT descriptor must be in the GDT. */ if ( (seg == x86_seg_ldtr) && (sel & 4) ) goto fail; hvm_get_segment_register(v, x86_seg_cs, &cs); hvm_get_segment_register( v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab); /* Check against descriptor table limit. */ if ( ((sel & 0xfff8) + 7) > desctab.limit ) goto fail; pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8)); if ( pdesc == NULL ) goto hvm_map_fail; do { desc = *pdesc; /* Segment present in memory? */ if ( !(desc.b & _SEGMENT_P) ) { fault_type = TRAP_no_segment; goto unmap_and_fail; } /* LDT descriptor is a system segment. All others are code/data. 
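         * (Bit 12 of the descriptor's high word is the S flag: clear for
         * system segments such as an LDT, set for code/data segments. The
         * check below rejects the load whenever the S flag does not match
         * what the target register requires.)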
*/ if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) ) goto unmap_and_fail; dpl = (desc.b >> 13) & 3; rpl = sel & 3; cpl = cs.sel & 3; switch ( seg ) { case x86_seg_cs: /* Code segment? */ if ( !(desc.b & _SEGMENT_CODE) ) goto unmap_and_fail; /* Non-conforming segment: check DPL against RPL. */ if ( !(desc.b & _SEGMENT_EC) && (dpl != rpl) ) goto unmap_and_fail; break; case x86_seg_ss: /* Writable data segment? */ if ( (desc.b & (_SEGMENT_CODE|_SEGMENT_WR)) != _SEGMENT_WR ) goto unmap_and_fail; if ( (dpl != cpl) || (dpl != rpl) ) goto unmap_and_fail; break; case x86_seg_ldtr: /* LDT system segment? */ if ( (desc.b & _SEGMENT_TYPE) != (2u<<8) ) goto unmap_and_fail; goto skip_accessed_flag; default: /* Readable code or data segment? */ if ( (desc.b & (_SEGMENT_CODE|_SEGMENT_WR)) == _SEGMENT_CODE ) goto unmap_and_fail; /* * Data or non-conforming code segment: * check DPL against RPL and CPL. */ if ( ((desc.b & (_SEGMENT_EC|_SEGMENT_CODE)) != (_SEGMENT_EC|_SEGMENT_CODE)) && ((dpl < cpl) || (dpl < rpl)) ) goto unmap_and_fail; break; } } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */ (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) ); /* Force the Accessed flag in our local copy. */ desc.b |= 0x100; skip_accessed_flag: hvm_unmap_entry(pdesc); segr.base = (((desc.b << 0) & 0xff000000u) | ((desc.b << 16) & 0x00ff0000u) | ((desc.a >> 16) & 0x0000ffffu)); segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) | ((desc.b >> 12) & 0x0f00u)); segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu); if ( segr.attr.fields.g ) segr.limit = (segr.limit << 12) | 0xfffu; segr.sel = sel; hvm_set_segment_register(v, seg, &segr); return 0; unmap_and_fail: hvm_unmap_entry(pdesc); fail: hvm_inject_hw_exception(fault_type, sel & 0xfffc); hvm_map_fail: return 1; } void hvm_task_switch( uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason, int32_t errcode) { struct vcpu *v = current; struct cpu_user_regs *regs = guest_cpu_user_regs(); struct segment_register gdt, tr, prev_tr, segr; struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc; unsigned long eflags; int exn_raised, rc; struct { u16 back_link,__blh; u32 esp0; u16 ss0, _0; u32 esp1; u16 ss1, _1; u32 esp2; u16 ss2, _2; u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi; u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9; u16 trace, iomap; } tss = { 0 }; hvm_get_segment_register(v, x86_seg_gdtr, &gdt); hvm_get_segment_register(v, x86_seg_tr, &prev_tr); if ( ((tss_sel & 0xfff8) + 7) > gdt.limit ) { hvm_inject_hw_exception((taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault, tss_sel & 0xfff8); goto out; } optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8)); if ( optss_desc == NULL ) goto out; nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8)); if ( nptss_desc == NULL ) goto out; tss_desc = *nptss_desc; tr.sel = tss_sel; tr.base = (((tss_desc.b << 0) & 0xff000000u) | ((tss_desc.b << 16) & 0x00ff0000u) | ((tss_desc.a >> 16) & 0x0000ffffu)); tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) | ((tss_desc.b >> 12) & 0x0f00u)); tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu); if ( tr.attr.fields.g ) tr.limit = (tr.limit << 12) | 0xfffu; if ( !tr.attr.fields.p ) { hvm_inject_hw_exception(TRAP_no_segment, tss_sel & 0xfff8); goto out; } if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) ) { hvm_inject_hw_exception( (taskswitch_reason == TSW_iret) ? 
TRAP_invalid_tss : TRAP_gp_fault, tss_sel & 0xfff8); goto out; } if ( tr.limit < (sizeof(tss)-1) ) { hvm_inject_hw_exception(TRAP_invalid_tss, tss_sel & 0xfff8); goto out; } rc = hvm_copy_from_guest_virt( &tss, prev_tr.base, sizeof(tss), PFEC_page_present); if ( rc != HVMCOPY_okay ) goto out; eflags = regs->eflags; if ( taskswitch_reason == TSW_iret ) eflags &= ~X86_EFLAGS_NT; tss.cr3 = v->arch.hvm_vcpu.guest_cr[3]; tss.eip = regs->eip; tss.eflags = eflags; tss.eax = regs->eax; tss.ecx = regs->ecx; tss.edx = regs->edx; tss.ebx = regs->ebx; tss.esp = regs->esp; tss.ebp = regs->ebp; tss.esi = regs->esi; tss.edi = regs->edi; hvm_get_segment_register(v, x86_seg_es, &segr); tss.es = segr.sel; hvm_get_segment_register(v, x86_seg_cs, &segr); tss.cs = segr.sel; hvm_get_segment_register(v, x86_seg_ss, &segr); tss.ss = segr.sel; hvm_get_segment_register(v, x86_seg_ds, &segr); tss.ds = segr.sel; hvm_get_segment_register(v, x86_seg_fs, &segr); tss.fs = segr.sel; hvm_get_segment_register(v, x86_seg_gs, &segr); tss.gs = segr.sel; hvm_get_segment_register(v, x86_seg_ldtr, &segr); tss.ldt = segr.sel; rc = hvm_copy_to_guest_virt( prev_tr.base, &tss, sizeof(tss), PFEC_page_present); if ( rc != HVMCOPY_okay ) goto out; rc = hvm_copy_from_guest_virt( &tss, tr.base, sizeof(tss), PFEC_page_present); /* * Note: The HVMCOPY_gfn_shared case could be optimised, if the callee * functions knew we want RO access. */ if ( rc != HVMCOPY_okay ) goto out; if ( hvm_set_cr3(tss.cr3) ) goto out; regs->eip = tss.eip; regs->eflags = tss.eflags | 2; regs->eax = tss.eax; regs->ecx = tss.ecx; regs->edx = tss.edx; regs->ebx = tss.ebx; regs->esp = tss.esp; regs->ebp = tss.ebp; regs->esi = tss.esi; regs->edi = tss.edi; if ( (taskswitch_reason == TSW_call_or_int) ) { regs->eflags |= X86_EFLAGS_NT; tss.back_link = prev_tr.sel; } exn_raised = 0; if ( hvm_load_segment_selector(x86_seg_ldtr, tss.ldt) || hvm_load_segment_selector(x86_seg_es, tss.es) || hvm_load_segment_selector(x86_seg_cs, tss.cs) || hvm_load_segment_selector(x86_seg_ss, tss.ss) || hvm_load_segment_selector(x86_seg_ds, tss.ds) || hvm_load_segment_selector(x86_seg_fs, tss.fs) || hvm_load_segment_selector(x86_seg_gs, tss.gs) ) exn_raised = 1; rc = hvm_copy_to_guest_virt( tr.base, &tss, sizeof(tss), PFEC_page_present); if ( rc == HVMCOPY_bad_gva_to_gfn ) exn_raised = 1; else if ( rc != HVMCOPY_okay ) goto out; if ( (tss.trace & 1) && !exn_raised ) hvm_inject_hw_exception(TRAP_debug, tss_sel & 0xfff8); tr.attr.fields.type = 0xb; /* busy 32-bit tss */ hvm_set_segment_register(v, x86_seg_tr, &tr); v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS; hvm_update_guest_cr(v, 0); if ( (taskswitch_reason == TSW_iret) || (taskswitch_reason == TSW_jmp) ) clear_bit(41, optss_desc); /* clear B flag of old task */ if ( taskswitch_reason != TSW_iret ) set_bit(41, nptss_desc); /* set B flag of new task */ if ( errcode >= 0 ) { struct segment_register reg; unsigned long linear_addr; regs->esp -= 4; hvm_get_segment_register(current, x86_seg_ss, ®); /* Todo: do not ignore access faults here. 
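         * (The error code is pushed onto the new task's stack with the
         * nofault copy helper below, so any fault on that write is currently
         * dropped silently, hence the Todo above.)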
*/ if ( hvm_virtual_to_linear_addr(x86_seg_ss, ®, regs->esp, 4, hvm_access_write, 32, &linear_addr) ) hvm_copy_to_guest_virt_nofault(linear_addr, &errcode, 4, 0); } out: hvm_unmap_entry(optss_desc); hvm_unmap_entry(nptss_desc); } #define HVMCOPY_from_guest (0u<<0) #define HVMCOPY_to_guest (1u<<0) #define HVMCOPY_no_fault (0u<<1) #define HVMCOPY_fault (1u<<1) #define HVMCOPY_phys (0u<<2) #define HVMCOPY_virt (1u<<2) static enum hvm_copy_result __hvm_copy( void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec) { struct vcpu *curr = current; unsigned long gfn; struct page_info *page; p2m_type_t p2mt; char *p; int count, todo = size; /* * XXX Disable for 4.1.0: PV-on-HVM drivers will do grant-table ops * such as query_size. Grant-table code currently does copy_to/from_guest * accesses under the big per-domain lock, which this test would disallow. * The test is not needed until we implement sleeping-on-waitqueue when * we access a paged-out frame, and that's post 4.1.0 now. */ #if 0 /* * If the required guest memory is paged out, this function may sleep. * Hence we bail immediately if called from atomic context. */ if ( in_atomic() ) return HVMCOPY_unhandleable; #endif while ( todo > 0 ) { count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo); if ( flags & HVMCOPY_virt ) { gfn = paging_gva_to_gfn(curr, addr, &pfec); if ( gfn == INVALID_GFN ) { if ( pfec == PFEC_page_paged ) return HVMCOPY_gfn_paged_out; if ( pfec == PFEC_page_shared ) return HVMCOPY_gfn_shared; if ( flags & HVMCOPY_fault ) hvm_inject_page_fault(pfec, addr); return HVMCOPY_bad_gva_to_gfn; } } else { gfn = addr >> PAGE_SHIFT; } /* For the benefit of 32-bit WinXP (& older Windows) on AMD CPUs, * a fast path for LAPIC accesses, skipping the p2m lookup. */ if ( !nestedhvm_vcpu_in_guestmode(curr) && gfn == PFN_DOWN(vlapic_base_address(vcpu_vlapic(curr))) ) return HVMCOPY_bad_gfn_to_mfn; page = get_page_from_gfn(curr->domain, gfn, &p2mt, P2M_UNSHARE); if ( !page ) return HVMCOPY_bad_gfn_to_mfn; if ( p2m_is_paging(p2mt) ) { put_page(page); p2m_mem_paging_populate(curr->domain, gfn); return HVMCOPY_gfn_paged_out; } if ( p2m_is_shared(p2mt) ) { put_page(page); return HVMCOPY_gfn_shared; } if ( p2m_is_grant(p2mt) ) { put_page(page); return HVMCOPY_unhandleable; } p = (char *)__map_domain_page(page) + (addr & ~PAGE_MASK); if ( flags & HVMCOPY_to_guest ) { if ( p2mt == p2m_ram_ro ) { static unsigned long lastpage; if ( xchg(&lastpage, gfn) != gfn ) gdprintk(XENLOG_DEBUG, "guest attempted write to read-only" " memory page. gfn=%#lx, mfn=%#lx\n", gfn, page_to_mfn(page)); } else { memcpy(p, buf, count); paging_mark_dirty(curr->domain, page_to_mfn(page)); } } else { memcpy(buf, p, count); } unmap_domain_page(p); addr += count; buf += count; todo -= count; put_page(page); } return HVMCOPY_okay; } static enum hvm_copy_result __hvm_clear(paddr_t addr, int size) { struct vcpu *curr = current; unsigned long gfn; struct page_info *page; p2m_type_t p2mt; char *p; int count, todo = size; uint32_t pfec = PFEC_page_present | PFEC_write_access; /* * XXX Disable for 4.1.0: PV-on-HVM drivers will do grant-table ops * such as query_size. Grant-table code currently does copy_to/from_guest * accesses under the big per-domain lock, which this test would disallow. * The test is not needed until we implement sleeping-on-waitqueue when * we access a paged-out frame, and that's post 4.1.0 now. */ #if 0 /* * If the required guest memory is paged out, this function may sleep. * Hence we bail immediately if called from atomic context. 
*/ if ( in_atomic() ) return HVMCOPY_unhandleable; #endif while ( todo > 0 ) { count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo); gfn = paging_gva_to_gfn(curr, addr, &pfec); if ( gfn == INVALID_GFN ) { if ( pfec == PFEC_page_paged ) return HVMCOPY_gfn_paged_out; if ( pfec == PFEC_page_shared ) return HVMCOPY_gfn_shared; return HVMCOPY_bad_gva_to_gfn; } page = get_page_from_gfn(curr->domain, gfn, &p2mt, P2M_UNSHARE); if ( !page ) return HVMCOPY_bad_gfn_to_mfn; if ( p2m_is_paging(p2mt) ) { put_page(page); p2m_mem_paging_populate(curr->domain, gfn); return HVMCOPY_gfn_paged_out; } if ( p2m_is_shared(p2mt) ) { put_page(page); return HVMCOPY_gfn_shared; } if ( p2m_is_grant(p2mt) ) { put_page(page); return HVMCOPY_unhandleable; } p = (char *)__map_domain_page(page) + (addr & ~PAGE_MASK); if ( p2mt == p2m_ram_ro ) { static unsigned long lastpage; if ( xchg(&lastpage, gfn) != gfn ) gdprintk(XENLOG_DEBUG, "guest attempted write to read-only" " memory page. gfn=%#lx, mfn=%#lx\n", gfn, page_to_mfn(page)); } else { memset(p, 0x00, count); paging_mark_dirty(curr->domain, page_to_mfn(page)); } unmap_domain_page(p); addr += count; todo -= count; put_page(page); } return HVMCOPY_okay; } enum hvm_copy_result hvm_copy_to_guest_phys( paddr_t paddr, void *buf, int size) { return __hvm_copy(buf, paddr, size, HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_phys, 0); } enum hvm_copy_result hvm_copy_from_guest_phys( void *buf, paddr_t paddr, int size) { return __hvm_copy(buf, paddr, size, HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_phys, 0); } enum hvm_copy_result hvm_copy_to_guest_virt( unsigned long vaddr, void *buf, int size, uint32_t pfec) { return __hvm_copy(buf, vaddr, size, HVMCOPY_to_guest | HVMCOPY_fault | HVMCOPY_virt, PFEC_page_present | PFEC_write_access | pfec); } enum hvm_copy_result hvm_copy_from_guest_virt( void *buf, unsigned long vaddr, int size, uint32_t pfec) { return __hvm_copy(buf, vaddr, size, HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt, PFEC_page_present | pfec); } enum hvm_copy_result hvm_fetch_from_guest_virt( void *buf, unsigned long vaddr, int size, uint32_t pfec) { if ( hvm_nx_enabled(current) || hvm_smep_enabled(current) ) pfec |= PFEC_insn_fetch; return __hvm_copy(buf, vaddr, size, HVMCOPY_from_guest | HVMCOPY_fault | HVMCOPY_virt, PFEC_page_present | pfec); } enum hvm_copy_result hvm_copy_to_guest_virt_nofault( unsigned long vaddr, void *buf, int size, uint32_t pfec) { return __hvm_copy(buf, vaddr, size, HVMCOPY_to_guest | HVMCOPY_no_fault | HVMCOPY_virt, PFEC_page_present | PFEC_write_access | pfec); } enum hvm_copy_result hvm_copy_from_guest_virt_nofault( void *buf, unsigned long vaddr, int size, uint32_t pfec) { return __hvm_copy(buf, vaddr, size, HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt, PFEC_page_present | pfec); } enum hvm_copy_result hvm_fetch_from_guest_virt_nofault( void *buf, unsigned long vaddr, int size, uint32_t pfec) { if ( hvm_nx_enabled(current) || hvm_smep_enabled(current) ) pfec |= PFEC_insn_fetch; return __hvm_copy(buf, vaddr, size, HVMCOPY_from_guest | HVMCOPY_no_fault | HVMCOPY_virt, PFEC_page_present | pfec); } unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len) { int rc; if ( !current->arch.hvm_vcpu.hcall_64bit && is_compat_arg_xlat_range(to, len) ) { memcpy(to, from, len); return 0; } rc = hvm_copy_to_guest_virt_nofault((unsigned long)to, (void *)from, len, 0); return rc ? 
len : 0; /* fake a copy_to_user() return code */ } unsigned long clear_user_hvm(void *to, unsigned int len) { int rc; if ( !current->arch.hvm_vcpu.hcall_64bit && is_compat_arg_xlat_range(to, len) ) { memset(to, 0x00, len); return 0; } rc = __hvm_clear((unsigned long)to, len); return rc ? len : 0; /* fake a copy_to_user() return code */ } unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len) { int rc; if ( !current->arch.hvm_vcpu.hcall_64bit && is_compat_arg_xlat_range(from, len) ) { memcpy(to, from, len); return 0; } rc = hvm_copy_from_guest_virt_nofault(to, (unsigned long)from, len, 0); return rc ? len : 0; /* fake a copy_from_user() return code */ } void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { struct vcpu *v = current; struct domain *d = v->domain; unsigned int count, dummy = 0; if ( !eax ) eax = &dummy; if ( !ebx ) ebx = &dummy; if ( !ecx ) ecx = &dummy; count = *ecx; if ( !edx ) edx = &dummy; if ( cpuid_viridian_leaves(input, eax, ebx, ecx, edx) ) return; if ( cpuid_hypervisor_leaves(input, count, eax, ebx, ecx, edx) ) return; domain_cpuid(d, input, count, eax, ebx, ecx, edx); switch ( input ) { case 0x1: /* Fix up VLAPIC details. */ *ebx &= 0x00FFFFFFu; *ebx |= (v->vcpu_id * 2) << 24; if ( vlapic_hw_disabled(vcpu_vlapic(v)) ) __clear_bit(X86_FEATURE_APIC & 31, edx); /* Fix up OSXSAVE. */ if ( cpu_has_xsave ) *ecx |= (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) ? cpufeat_mask(X86_FEATURE_OSXSAVE) : 0; /* Don't expose PCID to non-hap hvm. */ if ( !hap_enabled(d) ) *ecx &= ~cpufeat_mask(X86_FEATURE_PCID); /* Only provide PSE36 when guest runs in 32bit PAE or in long mode */ if ( !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) ) *edx &= ~cpufeat_mask(X86_FEATURE_PSE36); break; case 0x7: if ( (count == 0) && !cpu_has_smep ) *ebx &= ~cpufeat_mask(X86_FEATURE_SMEP); /* Don't expose INVPCID to non-hap hvm. */ if ( (count == 0) && !hap_enabled(d) ) *ebx &= ~cpufeat_mask(X86_FEATURE_INVPCID); break; case 0xb: /* Fix the x2APIC identifier. */ *edx = v->vcpu_id * 2; break; case 0xd: { unsigned int sub_leaf, _eax, _ebx, _ecx, _edx; /* EBX value of main leaf 0 depends on enabled xsave features */ if ( count == 0 && v->arch.xcr0 ) { /* reset EBX to default value first */ *ebx = XSTATE_AREA_MIN_SIZE; for ( sub_leaf = 2; sub_leaf < 63; sub_leaf++ ) { if ( !(v->arch.xcr0 & (1ULL << sub_leaf)) ) continue; domain_cpuid(d, input, sub_leaf, &_eax, &_ebx, &_ecx, &_edx); if ( (_eax + _ebx) > *ebx ) *ebx = _eax + _ebx; } } break; } case 0x80000001: /* We expose RDTSCP feature to guest only when tsc_mode == TSC_MODE_DEFAULT and host_tsc_is_safe() returns 1 */ if ( d->arch.tsc_mode != TSC_MODE_DEFAULT || !host_tsc_is_safe() ) *edx &= ~cpufeat_mask(X86_FEATURE_RDTSCP); /* Hide 1GB-superpage feature if we can't emulate it. 
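 * (The exact condition is encapsulated in hvm_pse1gb_supported(); broadly,
 * 1GB guest superpages can only be offered when the paging mode in use can
 * actually back them, e.g. HAP on hardware that itself has 1GB pages.)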
*/ if (!hvm_pse1gb_supported(d)) *edx &= ~cpufeat_mask(X86_FEATURE_PAGE1GB); /* Only provide PSE36 when guest runs in 32bit PAE or in long mode */ if ( !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) ) *edx &= ~cpufeat_mask(X86_FEATURE_PSE36); break; } } void hvm_rdtsc_intercept(struct cpu_user_regs *regs) { uint64_t tsc; struct vcpu *v = current; tsc = hvm_get_guest_tsc(v); regs->eax = (uint32_t)tsc; regs->edx = (uint32_t)(tsc >> 32); HVMTRACE_2D(RDTSC, regs->eax, regs->edx); } int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) { struct vcpu *v = current; uint64_t *var_range_base, *fixed_range_base; bool_t mtrr; unsigned int edx, index; int ret = X86EMUL_OKAY; var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges; fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges; hvm_cpuid(1, NULL, NULL, NULL, &edx); mtrr = !!(edx & cpufeat_mask(X86_FEATURE_MTRR)); switch ( msr ) { case MSR_EFER: *msr_content = v->arch.hvm_vcpu.guest_efer; break; case MSR_IA32_TSC: *msr_content = hvm_get_guest_tsc(v); break; case MSR_IA32_TSC_ADJUST: *msr_content = hvm_get_guest_tsc_adjust(v); break; case MSR_TSC_AUX: *msr_content = hvm_msr_tsc_aux(v); break; case MSR_IA32_APICBASE: *msr_content = vcpu_vlapic(v)->hw.apic_base_msr; break; case MSR_IA32_APICBASE_MSR ... MSR_IA32_APICBASE_MSR + 0x3ff: if ( hvm_x2apic_msr_read(v, msr, msr_content) ) goto gp_fault; break; case MSR_IA32_TSC_DEADLINE: *msr_content = vlapic_tdt_msr_get(vcpu_vlapic(v)); break; case MSR_IA32_CR_PAT: hvm_get_guest_pat(v, msr_content); break; case MSR_MTRRcap: if ( !mtrr ) goto gp_fault; *msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap; break; case MSR_MTRRdefType: if ( !mtrr ) goto gp_fault; *msr_content = v->arch.hvm_vcpu.mtrr.def_type | (v->arch.hvm_vcpu.mtrr.enabled << 10); break; case MSR_MTRRfix64K_00000: if ( !mtrr ) goto gp_fault; *msr_content = fixed_range_base[0]; break; case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: if ( !mtrr ) goto gp_fault; index = msr - MSR_MTRRfix16K_80000; *msr_content = fixed_range_base[index + 1]; break; case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000: if ( !mtrr ) goto gp_fault; index = msr - MSR_MTRRfix4K_C0000; *msr_content = fixed_range_base[index + 3]; break; case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7: if ( !mtrr ) goto gp_fault; index = msr - MSR_IA32_MTRR_PHYSBASE0; *msr_content = var_range_base[index]; break; case MSR_K8_ENABLE_C1E: case MSR_AMD64_NB_CFG: /* * These AMD-only registers may be accessed if this HVM guest * has been migrated to an Intel host. This fixes a guest crash * in this case. */ *msr_content = 0; break; default: if ( (ret = vmce_rdmsr(msr, msr_content)) < 0 ) goto gp_fault; /* If ret == 0 then this is not an MCE MSR, see other MSRs. */ ret = ((ret == 0) ? 
hvm_funcs.msr_read_intercept(msr, msr_content) : X86EMUL_OKAY); break; } out: HVMTRACE_3D(MSR_READ, msr, (uint32_t)*msr_content, (uint32_t)(*msr_content >> 32)); return ret; gp_fault: hvm_inject_hw_exception(TRAP_gp_fault, 0); ret = X86EMUL_EXCEPTION; *msr_content = -1ull; goto out; } int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content) { struct vcpu *v = current; bool_t mtrr; unsigned int edx, index; int ret = X86EMUL_OKAY; HVMTRACE_3D(MSR_WRITE, msr, (uint32_t)msr_content, (uint32_t)(msr_content >> 32)); hvm_cpuid(1, NULL, NULL, NULL, &edx); mtrr = !!(edx & cpufeat_mask(X86_FEATURE_MTRR)); hvm_memory_event_msr(msr, msr_content); switch ( msr ) { case MSR_EFER: if ( hvm_set_efer(msr_content) ) return X86EMUL_EXCEPTION; break; case MSR_IA32_TSC: hvm_set_guest_tsc(v, msr_content); break; case MSR_IA32_TSC_ADJUST: hvm_set_guest_tsc_adjust(v, msr_content); break; case MSR_TSC_AUX: v->arch.hvm_vcpu.msr_tsc_aux = (uint32_t)msr_content; if ( cpu_has_rdtscp && (v->domain->arch.tsc_mode != TSC_MODE_PVRDTSCP) ) wrmsrl(MSR_TSC_AUX, (uint32_t)msr_content); break; case MSR_IA32_APICBASE: vlapic_msr_set(vcpu_vlapic(v), msr_content); break; case MSR_IA32_TSC_DEADLINE: vlapic_tdt_msr_set(vcpu_vlapic(v), msr_content); break; case MSR_IA32_APICBASE_MSR ... MSR_IA32_APICBASE_MSR + 0x3ff: if ( hvm_x2apic_msr_write(v, msr, msr_content) ) goto gp_fault; break; case MSR_IA32_CR_PAT: if ( !hvm_set_guest_pat(v, msr_content) ) goto gp_fault; break; case MSR_MTRRcap: if ( !mtrr ) goto gp_fault; goto gp_fault; case MSR_MTRRdefType: if ( !mtrr ) goto gp_fault; if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) ) goto gp_fault; break; case MSR_MTRRfix64K_00000: if ( !mtrr ) goto gp_fault; if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) ) goto gp_fault; break; case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: if ( !mtrr ) goto gp_fault; index = msr - MSR_MTRRfix16K_80000 + 1; if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, index, msr_content) ) goto gp_fault; break; case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000: if ( !mtrr ) goto gp_fault; index = msr - MSR_MTRRfix4K_C0000 + 3; if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, index, msr_content) ) goto gp_fault; break; case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7: if ( !mtrr ) goto gp_fault; if ( !mtrr_var_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr, msr, msr_content) ) goto gp_fault; break; case MSR_AMD64_NB_CFG: /* ignore the write */ break; default: if ( (ret = vmce_wrmsr(msr, msr_content)) < 0 ) goto gp_fault; /* If ret == 0 then this is not an MCE MSR, see other MSRs. */ ret = ((ret == 0) ? hvm_funcs.msr_write_intercept(msr, msr_content) : X86EMUL_OKAY); break; } return ret; gp_fault: hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } enum hvm_intblk hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack) { unsigned long intr_shadow; ASSERT(v == current); if ( nestedhvm_enabled(v->domain) ) { enum hvm_intblk intr; intr = nhvm_interrupt_blocked(v); if ( intr != hvm_intblk_none ) return intr; } if ( (intack.source != hvm_intsrc_nmi) && !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) ) return hvm_intblk_rflags_ie; intr_shadow = hvm_funcs.get_interrupt_shadow(v); if ( intr_shadow & (HVM_INTR_SHADOW_STI|HVM_INTR_SHADOW_MOV_SS) ) return hvm_intblk_shadow; if ( intack.source == hvm_intsrc_nmi ) return ((intr_shadow & HVM_INTR_SHADOW_NMI) ? 
hvm_intblk_nmi_iret : hvm_intblk_none); if ( intack.source == hvm_intsrc_lapic ) { uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0; if ( (tpr >> 4) >= (intack.vector >> 4) ) return hvm_intblk_tpr; } return hvm_intblk_none; } static int grant_table_op_is_allowed(unsigned int cmd) { switch (cmd) { case GNTTABOP_query_size: case GNTTABOP_setup_table: case GNTTABOP_set_version: case GNTTABOP_get_version: case GNTTABOP_copy: case GNTTABOP_map_grant_ref: case GNTTABOP_unmap_grant_ref: case GNTTABOP_swap_grant_ref: return 1; default: /* all other commands need auditing */ return 0; } } static long hvm_grant_table_op( unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) uop, unsigned int count) { if ( !grant_table_op_is_allowed(cmd) ) return -ENOSYS; /* all other commands need auditing */ return do_grant_table_op(cmd, uop, count); } static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { long rc; switch ( cmd & MEMOP_CMD_MASK ) { case XENMEM_memory_map: case XENMEM_machine_memory_map: case XENMEM_machphys_mapping: return -ENOSYS; case XENMEM_decrease_reservation: rc = do_memory_op(cmd, arg); current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1; return rc; } return do_memory_op(cmd, arg); } static long hvm_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { switch ( cmd ) { default: if ( !is_pvh_vcpu(current) || !is_hardware_domain(current->domain) ) return -ENOSYS; /* fall through */ case PHYSDEVOP_map_pirq: case PHYSDEVOP_unmap_pirq: case PHYSDEVOP_eoi: case PHYSDEVOP_irq_status_query: case PHYSDEVOP_get_free_pirq: return do_physdev_op(cmd, arg); } } static long hvm_vcpu_op( int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) { long rc; switch ( cmd ) { case VCPUOP_register_runstate_memory_area: case VCPUOP_get_runstate_info: case VCPUOP_set_periodic_timer: case VCPUOP_stop_periodic_timer: case VCPUOP_set_singleshot_timer: case VCPUOP_stop_singleshot_timer: case VCPUOP_register_vcpu_info: case VCPUOP_register_vcpu_time_memory_area: rc = do_vcpu_op(cmd, vcpuid, arg); break; default: rc = -ENOSYS; break; } return rc; } typedef unsigned long hvm_hypercall_t( unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #define HYPERCALL(x) \ [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) do_ ## x static long hvm_grant_table_op_compat32(unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) uop, unsigned int count) { if ( !grant_table_op_is_allowed(cmd) ) return -ENOSYS; return compat_grant_table_op(cmd, uop, count); } static long hvm_memory_op_compat32(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { int rc; switch ( cmd & MEMOP_CMD_MASK ) { case XENMEM_memory_map: case XENMEM_machine_memory_map: case XENMEM_machphys_mapping: return -ENOSYS; case XENMEM_decrease_reservation: rc = compat_memory_op(cmd, arg); current->domain->arch.hvm_domain.qemu_mapcache_invalidate = 1; return rc; } return compat_memory_op(cmd, arg); } static long hvm_vcpu_op_compat32( int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) { long rc; switch ( cmd ) { case VCPUOP_register_runstate_memory_area: case VCPUOP_get_runstate_info: case VCPUOP_set_periodic_timer: case VCPUOP_stop_periodic_timer: case VCPUOP_set_singleshot_timer: case VCPUOP_stop_singleshot_timer: case VCPUOP_register_vcpu_info: case VCPUOP_register_vcpu_time_memory_area: rc = compat_vcpu_op(cmd, vcpuid, arg); break; default: rc = -ENOSYS; break; } return rc; } static long hvm_physdev_op_compat32( int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { switch ( cmd ) { case PHYSDEVOP_map_pirq: case 
PHYSDEVOP_unmap_pirq: case PHYSDEVOP_eoi: case PHYSDEVOP_irq_status_query: case PHYSDEVOP_get_free_pirq: return compat_physdev_op(cmd, arg); break; default: return -ENOSYS; break; } } static hvm_hypercall_t *const hvm_hypercall64_table[NR_hypercalls] = { [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op, [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op, [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op, [ __HYPERVISOR_physdev_op ] = (hvm_hypercall_t *)hvm_physdev_op, HYPERCALL(xen_version), HYPERCALL(console_io), HYPERCALL(event_channel_op), HYPERCALL(sched_op), HYPERCALL(set_timer_op), HYPERCALL(xsm_op), HYPERCALL(hvm_op), HYPERCALL(sysctl), HYPERCALL(domctl), HYPERCALL(tmem_op) }; #define COMPAT_CALL(x) \ [ __HYPERVISOR_ ## x ] = (hvm_hypercall_t *) compat_ ## x static hvm_hypercall_t *const hvm_hypercall32_table[NR_hypercalls] = { [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op_compat32, [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op_compat32, [ __HYPERVISOR_vcpu_op ] = (hvm_hypercall_t *)hvm_vcpu_op_compat32, [ __HYPERVISOR_physdev_op ] = (hvm_hypercall_t *)hvm_physdev_op_compat32, COMPAT_CALL(xen_version), HYPERCALL(console_io), HYPERCALL(event_channel_op), COMPAT_CALL(sched_op), COMPAT_CALL(set_timer_op), HYPERCALL(xsm_op), HYPERCALL(hvm_op), HYPERCALL(sysctl), HYPERCALL(domctl), HYPERCALL(tmem_op) }; /* PVH 32bitfixme. */ static hvm_hypercall_t *const pvh_hypercall64_table[NR_hypercalls] = { HYPERCALL(platform_op), HYPERCALL(memory_op), HYPERCALL(xen_version), HYPERCALL(console_io), [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op, HYPERCALL(vcpu_op), HYPERCALL(mmuext_op), HYPERCALL(xsm_op), HYPERCALL(sched_op), HYPERCALL(event_channel_op), [ __HYPERVISOR_physdev_op ] = (hvm_hypercall_t *)hvm_physdev_op, HYPERCALL(hvm_op), HYPERCALL(sysctl), HYPERCALL(domctl) }; int hvm_do_hypercall(struct cpu_user_regs *regs) { struct vcpu *curr = current; struct segment_register sreg; int mode = hvm_guest_x86_mode(curr); uint32_t eax = regs->eax; switch ( mode ) { case 8: case 4: case 2: hvm_get_segment_register(curr, x86_seg_ss, &sreg); if ( unlikely(sreg.attr.fields.dpl) ) { default: regs->eax = -EPERM; return HVM_HCALL_completed; } case 0: break; } if ( (eax & 0x80000000) && is_viridian_domain(curr->domain) ) return viridian_hypercall(regs); if ( (eax >= NR_hypercalls) || (is_pvh_vcpu(curr) ? !pvh_hypercall64_table[eax] : !hvm_hypercall32_table[eax]) ) { regs->eax = -ENOSYS; return HVM_HCALL_completed; } curr->arch.hvm_vcpu.hcall_preempted = 0; if ( mode == 8 ) { HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%lx, %lx, %lx, %lx, %lx, %lx)", eax, regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8, regs->r9); curr->arch.hvm_vcpu.hcall_64bit = 1; if ( is_pvh_vcpu(curr) ) regs->rax = pvh_hypercall64_table[eax](regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8, regs->r9); else regs->rax = hvm_hypercall64_table[eax](regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8, regs->r9); curr->arch.hvm_vcpu.hcall_64bit = 0; } else { ASSERT(!is_pvh_vcpu(curr)); /* PVH 32bitfixme. 
*/ HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u(%x, %x, %x, %x, %x, %x)", eax, (uint32_t)regs->ebx, (uint32_t)regs->ecx, (uint32_t)regs->edx, (uint32_t)regs->esi, (uint32_t)regs->edi, (uint32_t)regs->ebp); regs->eax = hvm_hypercall32_table[eax]((uint32_t)regs->ebx, (uint32_t)regs->ecx, (uint32_t)regs->edx, (uint32_t)regs->esi, (uint32_t)regs->edi, (uint32_t)regs->ebp); } HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%u -> %lx", eax, (unsigned long)regs->eax); if ( curr->arch.hvm_vcpu.hcall_preempted ) return HVM_HCALL_preempted; if ( unlikely(curr->domain->arch.hvm_domain.qemu_mapcache_invalidate) && test_and_clear_bool(curr->domain->arch.hvm_domain. qemu_mapcache_invalidate) ) return HVM_HCALL_invalidate; return HVM_HCALL_completed; } static void hvm_latch_shinfo_size(struct domain *d) { /* * Called from operations which are among the very first executed by * PV drivers on initialisation or after save/restore. These are sensible * points at which to sample the execution mode of the guest and latch * 32- or 64-bit format for shared state. */ if ( current->domain == d ) { d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8); /* * Make sure that the timebase in the shared info structure is correct. * * If the bit-ness changed we should arguably try to convert the other * fields as well, but that's much more problematic (e.g. what do you * do if you're going from 64 bit to 32 bit and there's an event * channel pending which doesn't exist in the 32 bit version?). Just * setting the wallclock time seems to be sufficient for everything * we do, even if it is a bit of a hack. */ update_domain_wallclock_time(d); } } /* Initialise a hypercall transfer page for a VMX domain using paravirtualised drivers. */ void hvm_hypercall_page_initialise(struct domain *d, void *hypercall_page) { hvm_latch_shinfo_size(d); hvm_funcs.init_hypercall_page(d, hypercall_page); } static int hvmop_set_pci_intx_level( XEN_GUEST_HANDLE_PARAM(xen_hvm_set_pci_intx_level_t) uop) { struct xen_hvm_set_pci_intx_level op; struct domain *d; int rc; if ( copy_from_guest(&op, uop, 1) ) return -EFAULT; if ( (op.domain > 0) || (op.bus > 0) || (op.device > 31) || (op.intx > 3) ) return -EINVAL; rc = rcu_lock_remote_domain_by_id(op.domid, &d); if ( rc != 0 ) return rc; rc = -EINVAL; if ( !is_hvm_domain(d) ) goto out; rc = xsm_hvm_set_pci_intx_level(XSM_DM_PRIV, d); if ( rc ) goto out; rc = 0; switch ( op.level ) { case 0: hvm_pci_intx_deassert(d, op.device, op.intx); break; case 1: hvm_pci_intx_assert(d, op.device, op.intx); break; default: rc = -EINVAL; break; } out: rcu_unlock_domain(d); return rc; } void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip) { struct domain *d = v->domain; struct segment_register reg; typeof(v->arch.xsave_area->fpu_sse) *fpu_ctxt = v->arch.fpu_ctxt; domain_lock(d); if ( v->is_initialised ) goto out; if ( !paging_mode_hap(d) ) { if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG ) put_page(pagetable_get_page(v->arch.guest_table)); v->arch.guest_table = pagetable_null(); } memset(fpu_ctxt, 0, sizeof(*fpu_ctxt)); fpu_ctxt->fcw = FCW_RESET; fpu_ctxt->mxcsr = MXCSR_DEFAULT; if ( v->arch.xsave_area ) v->arch.xsave_area->xsave_hdr.xstate_bv = XSTATE_FP; v->arch.vgc_flags = VGCF_online; memset(&v->arch.user_regs, 0, sizeof(v->arch.user_regs)); v->arch.user_regs.eflags = X86_EFLAGS_MBS; v->arch.user_regs.edx = 0x00000f00; v->arch.user_regs.eip = ip; memset(&v->arch.debugreg, 0, sizeof(v->arch.debugreg)); v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET; hvm_update_guest_cr(v, 0); v->arch.hvm_vcpu.guest_cr[2] = 0; 
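/*
 * Illustrative note (not part of the original source): the reg.attr.bytes
 * values used further down in this reset path are the architectural x86
 * access-rights byte of a segment descriptor: bit 7 = present, bits 6:5 =
 * DPL, bit 4 = code/data vs. system, bits 3:0 = type.  A minimal,
 * self-contained decoding sketch follows; it assumes nothing beyond
 * standard C.
 */
#if 0 /* Illustrative sketch only. */
#include <stdint.h>
#include <stdio.h>

static void decode_attr(unsigned int attr)
{
    printf("present=%u dpl=%u code_or_data=%u type=%#x\n",
           (attr >> 7) & 1, (attr >> 5) & 3, (attr >> 4) & 1, attr & 0xfu);
}

int main(void)
{
    decode_attr(0x9b); /* present, DPL0, code segment: execute/read, accessed */
    decode_attr(0x93); /* present, DPL0, data segment: read/write, accessed   */
    decode_attr(0x82); /* present, DPL0, system segment: LDT                  */
    decode_attr(0x8b); /* present, DPL0, system segment: busy 32-bit TSS      */
    return 0;
}
#endif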
    hvm_update_guest_cr(v, 2);
    v->arch.hvm_vcpu.guest_cr[3] = 0;
    hvm_update_guest_cr(v, 3);
    v->arch.hvm_vcpu.guest_cr[4] = 0;
    hvm_update_guest_cr(v, 4);

    v->arch.hvm_vcpu.guest_efer = 0;
    hvm_update_guest_efer(v);

    reg.sel = cs;
    reg.base = (uint32_t)reg.sel << 4;
    reg.limit = 0xffff;
    reg.attr.bytes = 0x09b;
    hvm_set_segment_register(v, x86_seg_cs, &reg);

    reg.sel = reg.base = 0;
    reg.limit = 0xffff;
    reg.attr.bytes = 0x093;
    hvm_set_segment_register(v, x86_seg_ds, &reg);
    hvm_set_segment_register(v, x86_seg_es, &reg);
    hvm_set_segment_register(v, x86_seg_fs, &reg);
    hvm_set_segment_register(v, x86_seg_gs, &reg);
    hvm_set_segment_register(v, x86_seg_ss, &reg);

    reg.attr.bytes = 0x82; /* LDT */
    hvm_set_segment_register(v, x86_seg_ldtr, &reg);

    reg.attr.bytes = 0x8b; /* 32-bit TSS (busy) */
    hvm_set_segment_register(v, x86_seg_tr, &reg);

    reg.attr.bytes = 0;
    hvm_set_segment_register(v, x86_seg_gdtr, &reg);
    hvm_set_segment_register(v, x86_seg_idtr, &reg);

    /* Sync AP's TSC with BSP's. */
    v->arch.hvm_vcpu.cache_tsc_offset =
        v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset;
    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);

    v->arch.hvm_vcpu.msr_tsc_adjust = 0;

    paging_update_paging_modes(v);

    v->arch.flags |= TF_kernel_mode;
    v->is_initialised = 1;
    clear_bit(_VPF_down, &v->pause_flags);

 out:
    domain_unlock(d);
}

static void hvm_s3_suspend(struct domain *d)
{
    struct vcpu *v;

    domain_pause(d);
    domain_lock(d);

    if ( d->is_dying || (d->vcpu == NULL) || (d->vcpu[0] == NULL) ||
         test_and_set_bool(d->arch.hvm_domain.is_s3_suspended) )
    {
        domain_unlock(d);
        domain_unpause(d);
        return;
    }

    for_each_vcpu ( d, v )
    {
        int rc;

        vlapic_reset(vcpu_vlapic(v));
        rc = vcpu_reset(v);
        ASSERT(!rc);
    }

    vpic_reset(d);
    vioapic_reset(d);
    pit_reset(d);
    rtc_reset(d);
    pmtimer_reset(d);
    hpet_reset(d);

    hvm_vcpu_reset_state(d->vcpu[0], 0xf000, 0xfff0);

    domain_unlock(d);
}

static void hvm_s3_resume(struct domain *d)
{
    if ( test_and_clear_bool(d->arch.hvm_domain.is_s3_suspended) )
    {
        struct vcpu *v;

        for_each_vcpu( d, v )
            hvm_set_guest_tsc(v, 0);
        domain_unpause(d);
    }
}

static int hvmop_set_isa_irq_level(
    XEN_GUEST_HANDLE_PARAM(xen_hvm_set_isa_irq_level_t) uop)
{
    struct xen_hvm_set_isa_irq_level op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    if ( op.isa_irq > 15 )
        return -EINVAL;

    rc = rcu_lock_remote_domain_by_id(op.domid, &d);
    if ( rc != 0 )
        return rc;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc = xsm_hvm_set_isa_irq_level(XSM_DM_PRIV, d);
    if ( rc )
        goto out;

    rc = 0;
    switch ( op.level )
    {
    case 0:
        hvm_isa_irq_deassert(d, op.isa_irq);
        break;
    case 1:
        hvm_isa_irq_assert(d, op.isa_irq);
        break;
    default:
        rc = -EINVAL;
        break;
    }

 out:
    rcu_unlock_domain(d);
    return rc;
}

static int hvmop_set_pci_link_route(
    XEN_GUEST_HANDLE_PARAM(xen_hvm_set_pci_link_route_t) uop)
{
    struct xen_hvm_set_pci_link_route op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    if ( (op.link > 3) || (op.isa_irq > 15) )
        return -EINVAL;

    rc = rcu_lock_remote_domain_by_id(op.domid, &d);
    if ( rc != 0 )
        return rc;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc = xsm_hvm_set_pci_link_route(XSM_DM_PRIV, d);
    if ( rc )
        goto out;

    rc = 0;
    hvm_set_pci_link_route(d, op.link, op.isa_irq);

 out:
    rcu_unlock_domain(d);
    return rc;
}

static int hvmop_inject_msi(
    XEN_GUEST_HANDLE_PARAM(xen_hvm_inject_msi_t) uop)
{
    struct xen_hvm_inject_msi op;
    struct domain *d;
    int rc;

    if ( copy_from_guest(&op, uop, 1) )
        return -EFAULT;

    rc = rcu_lock_remote_domain_by_id(op.domid, &d);
    if ( rc != 0 )
        return rc;

    rc = -EINVAL;
    if ( !is_hvm_domain(d) )
        goto out;

    rc =
xsm_hvm_inject_msi(XSM_DM_PRIV, d); if ( rc ) goto out; hvm_inject_msi(d, op.addr, op.data); out: rcu_unlock_domain(d); return rc; } static int hvmop_flush_tlb_all(void) { struct domain *d = current->domain; struct vcpu *v; if ( !is_hvm_domain(d) ) return -EINVAL; /* Avoid deadlock if more than one vcpu tries this at the same time. */ if ( !spin_trylock(&d->hypercall_deadlock_mutex) ) return -EAGAIN; /* Pause all other vcpus. */ for_each_vcpu ( d, v ) if ( v != current ) vcpu_pause_nosync(v); /* Now that all VCPUs are signalled to deschedule, we wait... */ for_each_vcpu ( d, v ) if ( v != current ) while ( !vcpu_runnable(v) && v->is_running ) cpu_relax(); /* All other vcpus are paused, safe to unlock now. */ spin_unlock(&d->hypercall_deadlock_mutex); /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */ for_each_vcpu ( d, v ) paging_update_cr3(v); /* Flush all dirty TLBs. */ flush_tlb_mask(d->domain_dirty_cpumask); /* Done. */ for_each_vcpu ( d, v ) if ( v != current ) vcpu_unpause(v); return 0; } static int hvm_replace_event_channel(struct vcpu *v, domid_t remote_domid, int *p_port) { int old_port, new_port; new_port = alloc_unbound_xen_event_channel(v, remote_domid, NULL); if ( new_port < 0 ) return new_port; /* xchg() ensures that only we call free_xen_event_channel(). */ old_port = xchg(p_port, new_port); free_xen_event_channel(v, old_port); return 0; } long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg) { struct domain *curr_d = current->domain; long rc = 0; switch ( op ) { case HVMOP_set_param: case HVMOP_get_param: { struct xen_hvm_param a; struct hvm_ioreq_page *iorp; struct domain *d; struct vcpu *v; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; if ( a.index >= HVM_NR_PARAMS ) return -EINVAL; d = rcu_lock_domain_by_any_id(a.domid); if ( d == NULL ) return -ESRCH; rc = -EINVAL; if ( !has_hvm_container_domain(d) ) goto param_fail; if ( is_pvh_domain(d) && (a.index != HVM_PARAM_CALLBACK_IRQ) ) goto param_fail; rc = xsm_hvm_param(XSM_TARGET, d, op); if ( rc ) goto param_fail; if ( op == HVMOP_set_param ) { rc = 0; switch ( a.index ) { case HVM_PARAM_IOREQ_PFN: iorp = &d->arch.hvm_domain.ioreq; if ( (rc = hvm_set_ioreq_page(d, iorp, a.value)) != 0 ) break; spin_lock(&iorp->lock); if ( iorp->va != NULL ) /* Initialise evtchn port info if VCPUs already created. */ for_each_vcpu ( d, v ) get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port; spin_unlock(&iorp->lock); break; case HVM_PARAM_BUFIOREQ_PFN: iorp = &d->arch.hvm_domain.buf_ioreq; rc = hvm_set_ioreq_page(d, iorp, a.value); break; case HVM_PARAM_CALLBACK_IRQ: hvm_set_callback_via(d, a.value); hvm_latch_shinfo_size(d); break; case HVM_PARAM_TIMER_MODE: if ( a.value > HVMPTM_one_missed_tick_pending ) rc = -EINVAL; break; case HVM_PARAM_VIRIDIAN: if ( a.value > 1 ) rc = -EINVAL; break; case HVM_PARAM_IDENT_PT: /* Not reflexive, as we must domain_pause(). */ rc = -EPERM; if ( curr_d == d ) break; rc = -EINVAL; if ( d->arch.hvm_domain.params[a.index] != 0 ) break; rc = 0; if ( !paging_mode_hap(d) ) break; /* * Update GUEST_CR3 in each VMCS to point at identity map. * All foreign updates to guest state must synchronise on * the domctl_lock. */ rc = -EAGAIN; if ( !domctl_lock_acquire() ) break; rc = 0; domain_pause(d); d->arch.hvm_domain.params[a.index] = a.value; for_each_vcpu ( d, v ) paging_update_cr3(v); domain_unpause(d); domctl_lock_release(); break; case HVM_PARAM_DM_DOMAIN: /* Not reflexive, as we must domain_pause(). 
*/ rc = -EPERM; if ( curr_d == d ) break; if ( a.value == DOMID_SELF ) a.value = curr_d->domain_id; rc = 0; domain_pause(d); /* safe to change per-vcpu xen_port */ if ( d->vcpu[0] ) rc = hvm_replace_event_channel(d->vcpu[0], a.value, (int *)&d->vcpu[0]->domain->arch.hvm_domain.params [HVM_PARAM_BUFIOREQ_EVTCHN]); if ( rc ) { domain_unpause(d); break; } iorp = &d->arch.hvm_domain.ioreq; for_each_vcpu ( d, v ) { rc = hvm_replace_event_channel(v, a.value, &v->arch.hvm_vcpu.xen_port); if ( rc ) break; spin_lock(&iorp->lock); if ( iorp->va != NULL ) get_ioreq(v)->vp_eport = v->arch.hvm_vcpu.xen_port; spin_unlock(&iorp->lock); } domain_unpause(d); break; case HVM_PARAM_ACPI_S_STATE: /* Not reflexive, as we must domain_pause(). */ rc = -EPERM; if ( curr_d == d ) break; rc = 0; if ( a.value == 3 ) hvm_s3_suspend(d); else if ( a.value == 0 ) hvm_s3_resume(d); else rc = -EINVAL; break; case HVM_PARAM_ACPI_IOPORTS_LOCATION: rc = pmtimer_change_ioport(d, a.value); break; case HVM_PARAM_MEMORY_EVENT_CR0: case HVM_PARAM_MEMORY_EVENT_CR3: case HVM_PARAM_MEMORY_EVENT_CR4: if ( d == current->domain ) rc = -EPERM; break; case HVM_PARAM_MEMORY_EVENT_INT3: case HVM_PARAM_MEMORY_EVENT_SINGLE_STEP: case HVM_PARAM_MEMORY_EVENT_MSR: if ( d == current->domain ) { rc = -EPERM; break; } if ( a.value & HVMPME_onchangeonly ) rc = -EINVAL; break; case HVM_PARAM_NESTEDHVM: rc = xsm_hvm_param_nested(XSM_PRIV, d); if ( rc ) break; if ( a.value > 1 ) rc = -EINVAL; /* Remove the check below once we have * shadow-on-shadow. */ if ( cpu_has_svm && !paging_mode_hap(d) && a.value ) rc = -EINVAL; /* Set up NHVM state for any vcpus that are already up */ if ( a.value && !d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM] ) for_each_vcpu(d, v) if ( rc == 0 ) rc = nestedhvm_vcpu_initialise(v); if ( !a.value || rc ) for_each_vcpu(d, v) nestedhvm_vcpu_destroy(v); break; case HVM_PARAM_BUFIOREQ_EVTCHN: rc = -EINVAL; break; case HVM_PARAM_TRIPLE_FAULT_REASON: if ( a.value > SHUTDOWN_MAX ) rc = -EINVAL; break; } if ( rc == 0 ) { d->arch.hvm_domain.params[a.index] = a.value; switch( a.index ) { case HVM_PARAM_MEMORY_EVENT_INT3: case HVM_PARAM_MEMORY_EVENT_SINGLE_STEP: { domain_pause(d); domain_unpause(d); /* Causes guest to latch new status */ break; } case HVM_PARAM_MEMORY_EVENT_CR3: { for_each_vcpu ( d, v ) hvm_funcs.update_guest_cr(v, 0); /* Latches new CR3 mask through CR0 code */ break; } } } } else { switch ( a.index ) { case HVM_PARAM_ACPI_S_STATE: a.value = d->arch.hvm_domain.is_s3_suspended ? 3 : 0; break; default: a.value = d->arch.hvm_domain.params[a.index]; break; } rc = copy_to_guest(arg, &a, 1) ? -EFAULT : 0; } HVM_DBG_LOG(DBG_LEVEL_HCALL, "%s param %u = %"PRIx64, op == HVMOP_set_param ? "set" : "get", a.index, a.value); param_fail: rcu_unlock_domain(d); break; } case HVMOP_set_pci_intx_level: rc = hvmop_set_pci_intx_level( guest_handle_cast(arg, xen_hvm_set_pci_intx_level_t)); break; case HVMOP_set_isa_irq_level: rc = hvmop_set_isa_irq_level( guest_handle_cast(arg, xen_hvm_set_isa_irq_level_t)); break; case HVMOP_inject_msi: rc = hvmop_inject_msi( guest_handle_cast(arg, xen_hvm_inject_msi_t)); break; case HVMOP_set_pci_link_route: rc = hvmop_set_pci_link_route( guest_handle_cast(arg, xen_hvm_set_pci_link_route_t)); break; case HVMOP_flush_tlbs: rc = guest_handle_is_null(arg) ? 
hvmop_flush_tlb_all() : -ENOSYS; break; case HVMOP_track_dirty_vram: { struct xen_hvm_track_dirty_vram a; struct domain *d; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; rc = rcu_lock_remote_domain_by_id(a.domid, &d); if ( rc != 0 ) return rc; rc = -EINVAL; if ( !is_hvm_domain(d) ) goto param_fail2; if ( a.nr > GB(1) >> PAGE_SHIFT ) goto param_fail2; rc = xsm_hvm_param(XSM_TARGET, d, op); if ( rc ) goto param_fail2; rc = -ESRCH; if ( d->is_dying ) goto param_fail2; rc = -EINVAL; if ( d->vcpu == NULL || d->vcpu[0] == NULL ) goto param_fail2; if ( shadow_mode_enabled(d) ) rc = shadow_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap); else rc = hap_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap); param_fail2: rcu_unlock_domain(d); break; } case HVMOP_modified_memory: { struct xen_hvm_modified_memory a; struct domain *d; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; rc = rcu_lock_remote_domain_by_id(a.domid, &d); if ( rc != 0 ) return rc; rc = -EINVAL; if ( !is_hvm_domain(d) ) goto param_fail3; rc = xsm_hvm_param(XSM_TARGET, d, op); if ( rc ) goto param_fail3; rc = -EINVAL; if ( (a.first_pfn > domain_get_maximum_gpfn(d)) || ((a.first_pfn + a.nr - 1) < a.first_pfn) || ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) ) goto param_fail3; rc = 0; if ( !paging_mode_log_dirty(d) ) goto param_fail3; while ( a.nr > 0 ) { unsigned long pfn = a.first_pfn; struct page_info *page; page = get_page_from_gfn(d, pfn, NULL, P2M_UNSHARE); if ( page ) { paging_mark_dirty(d, page_to_mfn(page)); /* These are most probably not page tables any more */ /* don't take a long time and don't die either */ sh_remove_shadows(d->vcpu[0], _mfn(page_to_mfn(page)), 1, 0); put_page(page); } a.first_pfn++; a.nr--; /* Check for continuation if it's not the last interation */ if ( a.nr > 0 && hypercall_preempt_check() ) { if ( copy_to_guest(arg, &a, 1) ) rc = -EFAULT; else rc = -EAGAIN; break; } } param_fail3: rcu_unlock_domain(d); break; } case HVMOP_get_mem_type: { struct xen_hvm_get_mem_type a; struct domain *d; p2m_type_t t; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; d = rcu_lock_domain_by_any_id(a.domid); if ( d == NULL ) return -ESRCH; rc = xsm_hvm_param(XSM_TARGET, d, op); if ( rc ) goto param_fail_getmemtype; rc = -EINVAL; if ( is_hvm_domain(d) ) { /* Use get_gfn query as we are interested in the current * type, not in allocating or unsharing. That'll happen * on access. */ get_gfn_query_unlocked(d, a.pfn, &t); if ( p2m_is_mmio(t) ) a.mem_type = HVMMEM_mmio_dm; else if ( p2m_is_readonly(t) ) a.mem_type = HVMMEM_ram_ro; else if ( p2m_is_ram(t) ) a.mem_type = HVMMEM_ram_rw; else if ( p2m_is_pod(t) ) a.mem_type = HVMMEM_ram_rw; else if ( p2m_is_grant(t) ) a.mem_type = HVMMEM_ram_rw; else a.mem_type = HVMMEM_mmio_dm; rc = copy_to_guest(arg, &a, 1) ? 
-EFAULT : 0; } param_fail_getmemtype: rcu_unlock_domain(d); break; } case HVMOP_set_mem_type: { struct xen_hvm_set_mem_type a; struct domain *d; /* Interface types to internal p2m types */ static const p2m_type_t memtype[] = { [HVMMEM_ram_rw] = p2m_ram_rw, [HVMMEM_ram_ro] = p2m_ram_ro, [HVMMEM_mmio_dm] = p2m_mmio_dm }; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; rc = rcu_lock_remote_domain_by_id(a.domid, &d); if ( rc != 0 ) return rc; rc = -EINVAL; if ( !is_hvm_domain(d) ) goto param_fail4; rc = xsm_hvm_param(XSM_TARGET, d, op); if ( rc ) goto param_fail4; rc = -EINVAL; if ( (a.first_pfn > domain_get_maximum_gpfn(d)) || ((a.first_pfn + a.nr - 1) < a.first_pfn) || ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) ) goto param_fail4; if ( a.hvmmem_type >= ARRAY_SIZE(memtype) ) goto param_fail4; while ( a.nr ) { unsigned long pfn = a.first_pfn; p2m_type_t t; p2m_type_t nt; mfn_t mfn; mfn = get_gfn_unshare(d, pfn, &t); if ( p2m_is_paging(t) ) { put_gfn(d, pfn); p2m_mem_paging_populate(d, pfn); rc = -EINVAL; goto param_fail4; } if ( p2m_is_shared(t) ) { put_gfn(d, pfn); rc = -EINVAL; goto param_fail4; } if ( p2m_is_grant(t) ) { put_gfn(d, pfn); gdprintk(XENLOG_WARNING, "type for pfn %#lx changed to grant while " "we were working?\n", pfn); goto param_fail4; } else { nt = p2m_change_type(d, pfn, t, memtype[a.hvmmem_type]); if ( nt != t ) { put_gfn(d, pfn); gdprintk(XENLOG_WARNING, "type of pfn %#lx changed from %d to %d while " "we were trying to change it to %d\n", pfn, t, nt, memtype[a.hvmmem_type]); goto param_fail4; } } put_gfn(d, pfn); a.first_pfn++; a.nr--; /* Check for continuation if it's not the last interation */ if ( a.nr > 0 && hypercall_preempt_check() ) { if ( copy_to_guest(arg, &a, 1) ) rc = -EFAULT; else rc = -EAGAIN; goto param_fail4; } } rc = 0; param_fail4: rcu_unlock_domain(d); break; } case HVMOP_set_mem_access: { struct xen_hvm_set_mem_access a; struct domain *d; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; rc = rcu_lock_remote_domain_by_id(a.domid, &d); if ( rc != 0 ) return rc; rc = -EINVAL; if ( !is_hvm_domain(d) ) goto param_fail5; rc = xsm_hvm_param(XSM_TARGET, d, op); if ( rc ) goto param_fail5; rc = -EINVAL; if ( (a.first_pfn != ~0ull) && ((a.first_pfn > domain_get_maximum_gpfn(d)) || ((a.first_pfn + a.nr - 1) < a.first_pfn) || ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d))) ) goto param_fail5; rc = p2m_set_mem_access(d, a.first_pfn, a.nr, a.hvmmem_access); param_fail5: rcu_unlock_domain(d); break; } case HVMOP_get_mem_access: { struct xen_hvm_get_mem_access a; struct domain *d; hvmmem_access_t access; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; rc = rcu_lock_remote_domain_by_id(a.domid, &d); if ( rc != 0 ) return rc; rc = -EINVAL; if ( !is_hvm_domain(d) ) goto param_fail6; rc = xsm_hvm_param(XSM_TARGET, d, op); if ( rc ) goto param_fail6; rc = -EINVAL; if ( (a.pfn > domain_get_maximum_gpfn(d)) && a.pfn != ~0ull ) goto param_fail6; rc = p2m_get_mem_access(d, a.pfn, &access); if ( rc != 0 ) goto param_fail6; a.hvmmem_access = access; rc = copy_to_guest(arg, &a, 1) ? 
-EFAULT : 0; param_fail6: rcu_unlock_domain(d); break; } case HVMOP_pagetable_dying: { struct xen_hvm_pagetable_dying a; struct domain *d; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; d = rcu_lock_domain_by_any_id(a.domid); if ( d == NULL ) return -ESRCH; rc = -EINVAL; if ( !is_hvm_domain(d) || !paging_mode_shadow(d) ) goto param_fail7; rc = xsm_hvm_param(XSM_TARGET, d, op); if ( rc ) goto param_fail7; rc = 0; pagetable_dying(d, a.gpa); param_fail7: rcu_unlock_domain(d); break; } case HVMOP_get_time: { xen_hvm_get_time_t gxt; gxt.now = NOW(); if ( copy_to_guest(arg, &gxt, 1) ) rc = -EFAULT; break; } case HVMOP_xentrace: { xen_hvm_xentrace_t tr; if ( copy_from_guest(&tr, arg, 1 ) ) return -EFAULT; if ( tr.extra_bytes > sizeof(tr.extra) || (tr.event & ~((1u<= d->max_vcpus || (v = d->vcpu[tr.vcpuid]) == NULL ) goto param_fail8; if ( v->arch.hvm_vcpu.inject_trap.vector != -1 ) rc = -EBUSY; else { v->arch.hvm_vcpu.inject_trap.vector = tr.vector; v->arch.hvm_vcpu.inject_trap.type = tr.type; v->arch.hvm_vcpu.inject_trap.error_code = tr.error_code; v->arch.hvm_vcpu.inject_trap.insn_len = tr.insn_len; v->arch.hvm_vcpu.inject_trap.cr2 = tr.cr2; rc = 0; } param_fail8: rcu_unlock_domain(d); break; } default: { gdprintk(XENLOG_DEBUG, "Bad HVM op %ld.\n", op); rc = -ENOSYS; break; } } if ( rc == -EAGAIN ) rc = hypercall_create_continuation( __HYPERVISOR_hvm_op, "lh", op, arg); return rc; } int hvm_debug_op(struct vcpu *v, int32_t op) { int rc; switch ( op ) { case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON: case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF: rc = -ENOSYS; if ( !cpu_has_monitor_trap_flag ) break; rc = 0; vcpu_pause(v); v->arch.hvm_vcpu.single_step = (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON); vcpu_unpause(v); /* guest will latch new state */ break; default: rc = -ENOSYS; break; } return rc; } static int hvm_memory_event_traps(long p, uint32_t reason, unsigned long value, unsigned long old, bool_t gla_valid, unsigned long gla) { struct vcpu* v = current; struct domain *d = v->domain; mem_event_request_t req = { .reason = reason }; int rc; if ( !(p & HVMPME_MODE_MASK) ) return 0; if ( (p & HVMPME_onchangeonly) && (value == old) ) return 1; rc = mem_event_claim_slot(d, &d->mem_event->access); if ( rc == -ENOSYS ) { /* If there was no ring to handle the event, then * simple continue executing normally. 
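 *
 * (The surrounding function follows a claim-then-put ring-producer pattern:
 * space on the access ring is reserved up front with mem_event_claim_slot()
 * so that the later mem_event_put_request() cannot fail, and for synchronous
 * delivery the vCPU is paused before the request is posted.  A schematic,
 * self-contained sketch of that pattern is shown below for illustration; the
 * names and ring size in it are hypothetical and not part of the original
 * source.)
 */
#if 0 /* Illustrative sketch only. */
#include <stdint.h>

#define DEMO_RING_SLOTS 8

struct demo_ring {
    uint32_t prod, cons;                        /* free-running counters */
    uint32_t slots[DEMO_RING_SLOTS];
};

/* Check for space up front so that the later put cannot fail. */
static int demo_claim_slot(const struct demo_ring *r)
{
    return (r->prod - r->cons) < DEMO_RING_SLOTS ? 0 : -1;
}

static void demo_put_request(struct demo_ring *r, uint32_t req)
{
    r->slots[r->prod % DEMO_RING_SLOTS] = req;  /* guaranteed to fit */
    r->prod++;
}
#endif
/*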
*/ return 1; } else if ( rc < 0 ) return rc; if ( (p & HVMPME_MODE_MASK) == HVMPME_mode_sync ) { req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED; vcpu_pause_nosync(v); } req.gfn = value; req.vcpu_id = v->vcpu_id; if ( gla_valid ) { req.offset = gla & ((1 << PAGE_SHIFT) - 1); req.gla = gla; req.gla_valid = 1; } mem_event_put_request(d, &d->mem_event->access, &req); return 1; } void hvm_memory_event_cr0(unsigned long value, unsigned long old) { hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR0], MEM_EVENT_REASON_CR0, value, old, 0, 0); } void hvm_memory_event_cr3(unsigned long value, unsigned long old) { hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR3], MEM_EVENT_REASON_CR3, value, old, 0, 0); } void hvm_memory_event_cr4(unsigned long value, unsigned long old) { hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_CR4], MEM_EVENT_REASON_CR4, value, old, 0, 0); } void hvm_memory_event_msr(unsigned long msr, unsigned long value) { hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_MSR], MEM_EVENT_REASON_MSR, value, ~value, 1, msr); } int hvm_memory_event_int3(unsigned long gla) { uint32_t pfec = PFEC_page_present; unsigned long gfn; gfn = paging_gva_to_gfn(current, gla, &pfec); return hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_INT3], MEM_EVENT_REASON_INT3, gfn, 0, 1, gla); } int hvm_memory_event_single_step(unsigned long gla) { uint32_t pfec = PFEC_page_present; unsigned long gfn; gfn = paging_gva_to_gfn(current, gla, &pfec); return hvm_memory_event_traps(current->domain->arch.hvm_domain .params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP], MEM_EVENT_REASON_SINGLESTEP, gfn, 0, 1, gla); } int nhvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs) { if (hvm_funcs.nhvm_vcpu_hostrestore) return hvm_funcs.nhvm_vcpu_hostrestore(v, regs); return -EOPNOTSUPP; } int nhvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode) { if (hvm_funcs.nhvm_vcpu_vmexit) return hvm_funcs.nhvm_vcpu_vmexit(v, regs, exitcode); return -EOPNOTSUPP; } int nhvm_vcpu_vmexit_trap(struct vcpu *v, struct hvm_trap *trap) { return hvm_funcs.nhvm_vcpu_vmexit_trap(v, trap); } uint64_t nhvm_vcpu_guestcr3(struct vcpu *v) { if (hvm_funcs.nhvm_vcpu_guestcr3) return hvm_funcs.nhvm_vcpu_guestcr3(v); return -EOPNOTSUPP; } uint64_t nhvm_vcpu_p2m_base(struct vcpu *v) { if ( hvm_funcs.nhvm_vcpu_p2m_base ) return hvm_funcs.nhvm_vcpu_p2m_base(v); return -EOPNOTSUPP; } uint32_t nhvm_vcpu_asid(struct vcpu *v) { if (hvm_funcs.nhvm_vcpu_asid) return hvm_funcs.nhvm_vcpu_asid(v); return -EOPNOTSUPP; } int nhvm_vmcx_guest_intercepts_trap(struct vcpu *v, unsigned int trap, int errcode) { if (hvm_funcs.nhvm_vmcx_guest_intercepts_trap) return hvm_funcs.nhvm_vmcx_guest_intercepts_trap(v, trap, errcode); return -EOPNOTSUPP; } bool_t nhvm_vmcx_hap_enabled(struct vcpu *v) { if (hvm_funcs.nhvm_vmcx_hap_enabled) return hvm_funcs.nhvm_vmcx_hap_enabled(v); return -EOPNOTSUPP; } enum hvm_intblk nhvm_interrupt_blocked(struct vcpu *v) { return hvm_funcs.nhvm_intr_blocked(v); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/vlapic.c0000664000175000017500000010637712307313555015031 0ustar smbsmb/* * vlapic.c: virtualize LAPIC for HVM vcpus. * * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2006 Keir Fraser, XenSource Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VLAPIC_VERSION 0x00050014 #define VLAPIC_LVT_NUM 6 #define LVT_MASK \ APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK #define LINT_MASK \ LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY |\ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER static const unsigned int vlapic_lvt_mask[VLAPIC_LVT_NUM] = { /* LVTT */ LVT_MASK | APIC_TIMER_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ LVT_MASK | APIC_MODE_MASK, /* LVT0-1 */ LINT_MASK, LINT_MASK, /* LVTERR */ LVT_MASK }; /* Following could belong in apicdef.h */ #define APIC_SHORT_MASK 0xc0000 #define APIC_DEST_NOSHORT 0x0 #define APIC_DEST_MASK 0x800 #define vlapic_lvt_vector(vlapic, lvt_type) \ (vlapic_get_reg(vlapic, lvt_type) & APIC_VECTOR_MASK) #define vlapic_lvt_dm(vlapic, lvt_type) \ (vlapic_get_reg(vlapic, lvt_type) & APIC_MODE_MASK) #define vlapic_lvtt_period(vlapic) \ ((vlapic_get_reg(vlapic, APIC_LVTT) & APIC_TIMER_MODE_MASK) \ == APIC_TIMER_MODE_PERIODIC) #define vlapic_lvtt_oneshot(vlapic) \ ((vlapic_get_reg(vlapic, APIC_LVTT) & APIC_TIMER_MODE_MASK) \ == APIC_TIMER_MODE_ONESHOT) #define vlapic_lvtt_tdt(vlapic) \ ((vlapic_get_reg(vlapic, APIC_LVTT) & APIC_TIMER_MODE_MASK) \ == APIC_TIMER_MODE_TSC_DEADLINE) static int vlapic_find_highest_vector(const void *bitmap) { const uint32_t *word = bitmap; unsigned int word_offset = NR_VECTORS / 32; /* Work backwards through the bitmap (first 32-bit word in every four). */ while ( (word_offset != 0) && (word[(--word_offset)*4] == 0) ) continue; return (fls(word[word_offset*4]) - 1) + (word_offset * 32); } /* * IRR-specific bitmap update & search routines. 
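 *
 * (Layout note, not part of the original source: IRR, ISR and TMR each hold
 * 256 bits as eight 32-bit registers, and every register occupies its own
 * 16-byte slot in the APIC register page, so only the first word of every
 * fourth 32-bit word is meaningful.  That is why vlapic_find_highest_vector()
 * above indexes word[word_offset * 4].  A self-contained sketch of the same
 * scan over such a sparse layout follows.)
 */
#if 0 /* Illustrative sketch only. */
#include <stdint.h>

/*
 * Find the highest set bit in a 256-bit map laid out as eight words, each
 * stored in the first four bytes of a 16-byte slot.  Returns -1 if no bit
 * is set.
 */
static int demo_find_highest(const uint32_t regs[8 * 4])
{
    int word;

    for ( word = 7; word >= 0; word-- )
        if ( regs[word * 4] )
            return word * 32 + 31 - __builtin_clz(regs[word * 4]);
    return -1;
}
#endif
/*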
*/ static int vlapic_test_and_set_irr(int vector, struct vlapic *vlapic) { return vlapic_test_and_set_vector(vector, &vlapic->regs->data[APIC_IRR]); } static void vlapic_clear_irr(int vector, struct vlapic *vlapic) { vlapic_clear_vector(vector, &vlapic->regs->data[APIC_IRR]); } static int vlapic_find_highest_irr(struct vlapic *vlapic) { if ( hvm_funcs.sync_pir_to_irr ) hvm_funcs.sync_pir_to_irr(vlapic_vcpu(vlapic)); return vlapic_find_highest_vector(&vlapic->regs->data[APIC_IRR]); } void vlapic_set_irq(struct vlapic *vlapic, uint8_t vec, uint8_t trig) { struct vcpu *target = vlapic_vcpu(vlapic); if ( trig ) vlapic_set_vector(vec, &vlapic->regs->data[APIC_TMR]); if ( hvm_funcs.update_eoi_exit_bitmap ) hvm_funcs.update_eoi_exit_bitmap(target, vec, trig); if ( hvm_funcs.deliver_posted_intr ) hvm_funcs.deliver_posted_intr(target, vec); else if ( !vlapic_test_and_set_irr(vec, vlapic) ) vcpu_kick(target); } static int vlapic_find_highest_isr(struct vlapic *vlapic) { return vlapic_find_highest_vector(&vlapic->regs->data[APIC_ISR]); } static uint32_t vlapic_get_ppr(struct vlapic *vlapic) { uint32_t tpr, isrv, ppr; int isr; tpr = vlapic_get_reg(vlapic, APIC_TASKPRI); isr = vlapic_find_highest_isr(vlapic); isrv = (isr != -1) ? isr : 0; if ( (tpr & 0xf0) >= (isrv & 0xf0) ) ppr = tpr & 0xff; else ppr = isrv & 0xf0; HVM_DBG_LOG(DBG_LEVEL_VLAPIC_INTERRUPT, "vlapic %p, ppr %#x, isr %#x, isrv %#x", vlapic, ppr, isr, isrv); return ppr; } uint32_t vlapic_set_ppr(struct vlapic *vlapic) { uint32_t ppr = vlapic_get_ppr(vlapic); vlapic_set_reg(vlapic, APIC_PROCPRI, ppr); return ppr; } static int vlapic_match_logical_addr(struct vlapic *vlapic, uint8_t mda) { int result = 0; uint32_t logical_id; if ( vlapic_x2apic_mode(vlapic) ) { logical_id = vlapic_get_reg(vlapic, APIC_LDR); return !!(logical_id & mda); } logical_id = GET_xAPIC_LOGICAL_ID(vlapic_get_reg(vlapic, APIC_LDR)); switch ( vlapic_get_reg(vlapic, APIC_DFR) ) { case APIC_DFR_FLAT: if ( logical_id & mda ) result = 1; break; case APIC_DFR_CLUSTER: if ( ((logical_id >> 4) == (mda >> 0x4)) && (logical_id & mda & 0xf) ) result = 1; break; default: gdprintk(XENLOG_WARNING, "Bad DFR value for lapic of vcpu %d: %08x\n", vlapic_vcpu(vlapic)->vcpu_id, vlapic_get_reg(vlapic, APIC_DFR)); break; } return result; } bool_t vlapic_match_dest( struct vlapic *target, struct vlapic *source, int short_hand, uint8_t dest, uint8_t dest_mode) { HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "target %p, source %p, dest %#x, " "dest_mode %#x, short_hand %#x", target, source, dest, dest_mode, short_hand); switch ( short_hand ) { case APIC_DEST_NOSHORT: if ( dest_mode ) return vlapic_match_logical_addr(target, dest); return ((dest == 0xFF) || (dest == VLAPIC_ID(target))); case APIC_DEST_SELF: return (target == source); case APIC_DEST_ALLINC: return 1; case APIC_DEST_ALLBUT: return (target != source); default: gdprintk(XENLOG_WARNING, "Bad dest shorthand value %x\n", short_hand); break; } return 0; } static void vlapic_init_sipi_one(struct vcpu *target, uint32_t icr) { vcpu_pause(target); switch ( icr & APIC_MODE_MASK ) { case APIC_DM_INIT: { bool_t fpu_initialised; int rc; /* No work on INIT de-assert for P4-type APIC. */ if ( (icr & (APIC_INT_LEVELTRIG | APIC_INT_ASSERT)) == APIC_INT_LEVELTRIG ) break; /* Nothing to do if the VCPU is already reset. */ if ( !target->is_initialised ) break; hvm_vcpu_down(target); domain_lock(target->domain); /* Reset necessary VCPU state. This does not include FPU state. 
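 *
 * (For reference: the INIT handled here only returns the target vCPU to its
 * wait-for-SIPI state.  The subsequent STARTUP IPI, handled just below,
 * derives the real-mode entry point from the SIPI vector as CS = vector << 8
 * and IP = 0, i.e. a physical start address of vector * 4 KiB.  A small
 * worked example, not part of the original source, follows; the SIPI value
 * in it is hypothetical.)
 */
#if 0 /* Illustrative sketch only. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t icr   = 0x0000069f;          /* delivery mode = start-up, vector 0x9f */
    uint16_t cs    = (icr & 0xffu) << 8;  /* 0x9f00 */
    uint32_t start = (uint32_t)cs << 4;   /* 0x9f000 == vector * 0x1000 */

    printf("CS=%#x start=%#x\n", (unsigned int)cs, (unsigned int)start);
    return 0;
}
#endif
/*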
*/ fpu_initialised = target->fpu_initialised; rc = vcpu_reset(target); ASSERT(!rc); target->fpu_initialised = fpu_initialised; vlapic_reset(vcpu_vlapic(target)); domain_unlock(target->domain); break; } case APIC_DM_STARTUP: { uint16_t reset_cs = (icr & 0xffu) << 8; hvm_vcpu_reset_state(target, reset_cs, 0); break; } default: BUG(); } vcpu_unpause(target); } static void vlapic_init_sipi_action(unsigned long _vcpu) { struct vcpu *origin = (struct vcpu *)_vcpu; uint32_t icr = vcpu_vlapic(origin)->init_sipi.icr; uint32_t dest = vcpu_vlapic(origin)->init_sipi.dest; uint32_t short_hand = icr & APIC_SHORT_MASK; uint32_t dest_mode = !!(icr & APIC_DEST_MASK); struct vcpu *v; if ( icr == 0 ) return; for_each_vcpu ( origin->domain, v ) { if ( vlapic_match_dest(vcpu_vlapic(v), vcpu_vlapic(origin), short_hand, dest, dest_mode) ) vlapic_init_sipi_one(v, icr); } vcpu_vlapic(origin)->init_sipi.icr = 0; vcpu_unpause(origin); } /* Add a pending IRQ into lapic. */ static void vlapic_accept_irq(struct vcpu *v, uint32_t icr_low) { struct vlapic *vlapic = vcpu_vlapic(v); uint8_t vector = (uint8_t)icr_low; switch ( icr_low & APIC_MODE_MASK ) { case APIC_DM_FIXED: case APIC_DM_LOWEST: if ( vlapic_enabled(vlapic) ) vlapic_set_irq(vlapic, vector, 0); break; case APIC_DM_REMRD: gdprintk(XENLOG_WARNING, "Ignoring delivery mode 3\n"); break; case APIC_DM_SMI: gdprintk(XENLOG_WARNING, "Ignoring guest SMI\n"); break; case APIC_DM_NMI: if ( !test_and_set_bool(v->nmi_pending) ) { bool_t wake = 0; domain_lock(v->domain); if ( v->is_initialised ) wake = test_and_clear_bit(_VPF_down, &v->pause_flags); domain_unlock(v->domain); if ( wake ) vcpu_wake(v); vcpu_kick(v); } break; case APIC_DM_INIT: case APIC_DM_STARTUP: /* Handled in vlapic_ipi(). */ BUG(); default: gdprintk(XENLOG_ERR, "TODO: unsupported delivery mode in ICR %x\n", icr_low); domain_crash(v->domain); } } struct vlapic *vlapic_lowest_prio( struct domain *d, struct vlapic *source, int short_hand, uint8_t dest, uint8_t dest_mode) { int old = d->arch.hvm_domain.irq.round_robin_prev_vcpu; uint32_t ppr, target_ppr = UINT_MAX; struct vlapic *vlapic, *target = NULL; struct vcpu *v; if ( unlikely(!d->vcpu) || unlikely((v = d->vcpu[old]) == NULL) ) return NULL; do { v = v->next_in_list ? : d->vcpu[0]; vlapic = vcpu_vlapic(v); if ( vlapic_match_dest(vlapic, source, short_hand, dest, dest_mode) && vlapic_enabled(vlapic) && ((ppr = vlapic_get_ppr(vlapic)) < target_ppr) ) { target = vlapic; target_ppr = ppr; } } while ( v->vcpu_id != old ); if ( target != NULL ) d->arch.hvm_domain.irq.round_robin_prev_vcpu = vlapic_vcpu(target)->vcpu_id; return target; } void vlapic_EOI_set(struct vlapic *vlapic) { int vector = vlapic_find_highest_isr(vlapic); /* Some EOI writes may not have a matching to an in-service interrupt. 
*/ if ( vector == -1 ) return; vlapic_clear_vector(vector, &vlapic->regs->data[APIC_ISR]); if ( hvm_funcs.handle_eoi ) hvm_funcs.handle_eoi(vector); if ( vlapic_test_and_clear_vector(vector, &vlapic->regs->data[APIC_TMR]) ) vioapic_update_EOI(vlapic_domain(vlapic), vector); hvm_dpci_msi_eoi(current->domain, vector); } void vlapic_handle_EOI_induced_exit(struct vlapic *vlapic, int vector) { if ( vlapic_test_and_clear_vector(vector, &vlapic->regs->data[APIC_TMR]) ) vioapic_update_EOI(vlapic_domain(vlapic), vector); hvm_dpci_msi_eoi(current->domain, vector); } void vlapic_ipi( struct vlapic *vlapic, uint32_t icr_low, uint32_t icr_high) { unsigned int dest; unsigned int short_hand = icr_low & APIC_SHORT_MASK; unsigned int dest_mode = !!(icr_low & APIC_DEST_MASK); HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "icr = 0x%08x:%08x", icr_high, icr_low); dest = (vlapic_x2apic_mode(vlapic) ? icr_high : GET_xAPIC_DEST_FIELD(icr_high)); switch ( icr_low & APIC_MODE_MASK ) { case APIC_DM_INIT: case APIC_DM_STARTUP: if ( vlapic->init_sipi.icr != 0 ) { WARN(); /* should be impossible but don't BUG, just in case */ break; } vcpu_pause_nosync(vlapic_vcpu(vlapic)); vlapic->init_sipi.icr = icr_low; vlapic->init_sipi.dest = dest; tasklet_schedule(&vlapic->init_sipi.tasklet); break; case APIC_DM_LOWEST: { struct vlapic *target = vlapic_lowest_prio( vlapic_domain(vlapic), vlapic, short_hand, dest, dest_mode); if ( target != NULL ) vlapic_accept_irq(vlapic_vcpu(target), icr_low); break; } default: { struct vcpu *v; for_each_vcpu ( vlapic_domain(vlapic), v ) { if ( vlapic_match_dest(vcpu_vlapic(v), vlapic, short_hand, dest, dest_mode) ) vlapic_accept_irq(v, icr_low); } break; } } } static uint32_t vlapic_get_tmcct(struct vlapic *vlapic) { struct vcpu *v = current; uint32_t tmcct = 0, tmict = vlapic_get_reg(vlapic, APIC_TMICT); uint64_t counter_passed; counter_passed = ((hvm_get_guest_time(v) - vlapic->timer_last_update) / (APIC_BUS_CYCLE_NS * vlapic->hw.timer_divisor)); if ( tmict != 0 ) { if ( vlapic_lvtt_period(vlapic) ) counter_passed %= tmict; if ( counter_passed < tmict ) tmcct = tmict - counter_passed; } HVM_DBG_LOG(DBG_LEVEL_VLAPIC_TIMER, "timer initial count %d, timer current count %d, " "offset %"PRId64, tmict, tmcct, counter_passed); return tmcct; } static void vlapic_set_tdcr(struct vlapic *vlapic, unsigned int val) { /* Only bits 0, 1 and 3 are settable; others are MBZ. */ val &= 0xb; vlapic_set_reg(vlapic, APIC_TDCR, val); /* Update the demangled hw.timer_divisor. 
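 *
 * (The divide configuration register packs a 3-bit exponent into bits 0, 1
 * and 3.  The computation just below rebuilds that exponent, adds one and
 * wraps modulo 8, so an encoding of 0x0 selects divide-by-2, 0x3 selects
 * divide-by-16 and the all-ones encoding 0xb selects divide-by-1.  A short
 * worked example, not part of the original source, follows.)
 */
#if 0 /* Illustrative sketch only. */
#include <stdint.h>
#include <stdio.h>

static unsigned int tdcr_to_divisor(uint32_t tdcr)
{
    uint32_t e = ((tdcr & 3) | ((tdcr & 8) >> 1)) + 1;

    return 1u << (e & 7);
}

int main(void)
{
    printf("%u %u %u\n", tdcr_to_divisor(0x0),   /* 2  */
                         tdcr_to_divisor(0x3),   /* 16 */
                         tdcr_to_divisor(0xb));  /* 1  */
    return 0;
}
#endif
/*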
*/ val = ((val & 3) | ((val & 8) >> 1)) + 1; vlapic->hw.timer_divisor = 1 << (val & 7); HVM_DBG_LOG(DBG_LEVEL_VLAPIC_TIMER, "timer_divisor: %d", vlapic->hw.timer_divisor); } static void vlapic_read_aligned( struct vlapic *vlapic, unsigned int offset, unsigned int *result) { switch ( offset ) { case APIC_PROCPRI: *result = vlapic_get_ppr(vlapic); break; case APIC_TMCCT: /* Timer CCR */ if ( !vlapic_lvtt_oneshot(vlapic) && !vlapic_lvtt_period(vlapic) ) { *result = 0; break; } *result = vlapic_get_tmcct(vlapic); break; case APIC_TMICT: /* Timer ICR */ if ( !vlapic_lvtt_oneshot(vlapic) && !vlapic_lvtt_period(vlapic) ) { *result = 0; break; } default: *result = vlapic_get_reg(vlapic, offset); break; } } static int vlapic_read( struct vcpu *v, unsigned long address, unsigned long len, unsigned long *pval) { unsigned int alignment; unsigned int tmp; unsigned long result = 0; struct vlapic *vlapic = vcpu_vlapic(v); unsigned int offset = address - vlapic_base_address(vlapic); if ( offset > (APIC_TDCR + 0x3) ) goto out; alignment = offset & 0x3; vlapic_read_aligned(vlapic, offset & ~0x3, &tmp); switch ( len ) { case 1: result = *((unsigned char *)&tmp + alignment); break; case 2: if ( alignment == 3 ) goto unaligned_exit_and_crash; result = *(unsigned short *)((unsigned char *)&tmp + alignment); break; case 4: if ( alignment != 0 ) goto unaligned_exit_and_crash; result = *(unsigned int *)((unsigned char *)&tmp + alignment); break; default: gdprintk(XENLOG_ERR, "Local APIC read with len=%#lx, " "should be 4 instead.\n", len); goto exit_and_crash; } HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "offset %#x with length %#lx, " "and the result is %#lx", offset, len, result); out: *pval = result; return X86EMUL_OKAY; unaligned_exit_and_crash: gdprintk(XENLOG_ERR, "Unaligned LAPIC read len=%#lx at offset=%#x.\n", len, offset); exit_and_crash: domain_crash(v->domain); return X86EMUL_OKAY; } int hvm_x2apic_msr_read(struct vcpu *v, unsigned int msr, uint64_t *msr_content) { struct vlapic *vlapic = vcpu_vlapic(v); uint32_t low, high = 0, offset = (msr - MSR_IA32_APICBASE_MSR) << 4; if ( !vlapic_x2apic_mode(vlapic) ) return 1; vlapic_read_aligned(vlapic, offset, &low); switch ( offset ) { case APIC_ID: low = GET_xAPIC_ID(low); break; case APIC_ICR: vlapic_read_aligned(vlapic, APIC_ICR2, &high); break; case APIC_ICR2: return 1; } *msr_content = (((uint64_t)high) << 32) | low; return 0; } static void vlapic_pt_cb(struct vcpu *v, void *data) { *(s_time_t *)data = hvm_get_guest_time(v); } static void vlapic_tdt_pt_cb(struct vcpu *v, void *data) { *(s_time_t *)data = hvm_get_guest_time(v); vcpu_vlapic(v)->hw.tdt_msr = 0; } static int vlapic_reg_write(struct vcpu *v, unsigned int offset, unsigned long val) { struct vlapic *vlapic = vcpu_vlapic(v); int rc = X86EMUL_OKAY; switch ( offset ) { case APIC_ID: if ( !vlapic_x2apic_mode(vlapic) ) vlapic_set_reg(vlapic, APIC_ID, val); else rc = X86EMUL_UNHANDLEABLE; break; case APIC_TASKPRI: vlapic_set_reg(vlapic, APIC_TASKPRI, val & 0xff); break; case APIC_EOI: vlapic_EOI_set(vlapic); break; case APIC_LDR: if ( !vlapic_x2apic_mode(vlapic) ) vlapic_set_reg(vlapic, APIC_LDR, val & APIC_LDR_MASK); else rc = X86EMUL_UNHANDLEABLE; break; case APIC_DFR: if ( !vlapic_x2apic_mode(vlapic) ) vlapic_set_reg(vlapic, APIC_DFR, val | 0x0FFFFFFF); else rc = X86EMUL_UNHANDLEABLE; break; case APIC_SPIV: vlapic_set_reg(vlapic, APIC_SPIV, val & 0x3ff); if ( !(val & APIC_SPIV_APIC_ENABLED) ) { int i; uint32_t lvt_val; vlapic->hw.disabled |= VLAPIC_SW_DISABLED; for ( i = 0; i < VLAPIC_LVT_NUM; i++ ) { lvt_val = 
vlapic_get_reg(vlapic, APIC_LVTT + 0x10 * i); vlapic_set_reg(vlapic, APIC_LVTT + 0x10 * i, lvt_val | APIC_LVT_MASKED); } } else { vlapic->hw.disabled &= ~VLAPIC_SW_DISABLED; pt_may_unmask_irq(vlapic_domain(vlapic), &vlapic->pt); } break; case APIC_ESR: if ( vlapic_x2apic_mode(vlapic) && (val != 0) ) { gdprintk(XENLOG_ERR, "Local APIC write ESR with non-zero %lx\n", val); rc = X86EMUL_UNHANDLEABLE; } break; case APIC_SELF_IPI: rc = vlapic_x2apic_mode(vlapic) ? vlapic_reg_write(v, APIC_ICR, 0x40000 | (val & 0xff)) : X86EMUL_UNHANDLEABLE; break; case APIC_ICR: val &= ~(1 << 12); /* always clear the pending bit */ vlapic_ipi(vlapic, val, vlapic_get_reg(vlapic, APIC_ICR2)); vlapic_set_reg(vlapic, APIC_ICR, val); break; case APIC_ICR2: if ( !vlapic_x2apic_mode(vlapic) ) val &= 0xff000000; vlapic_set_reg(vlapic, APIC_ICR2, val); break; case APIC_LVTT: /* LVT Timer Reg */ if ( (vlapic_get_reg(vlapic, offset) & APIC_TIMER_MODE_MASK) != (val & APIC_TIMER_MODE_MASK) ) { destroy_periodic_time(&vlapic->pt); vlapic_set_reg(vlapic, APIC_TMICT, 0); vlapic_set_reg(vlapic, APIC_TMCCT, 0); vlapic->hw.tdt_msr = 0; } vlapic->pt.irq = val & APIC_VECTOR_MASK; case APIC_LVTTHMR: /* LVT Thermal Monitor */ case APIC_LVTPC: /* LVT Performance Counter */ case APIC_LVT0: /* LVT LINT0 Reg */ case APIC_LVT1: /* LVT Lint1 Reg */ case APIC_LVTERR: /* LVT Error Reg */ if ( vlapic_sw_disabled(vlapic) ) val |= APIC_LVT_MASKED; val &= vlapic_lvt_mask[(offset - APIC_LVTT) >> 4]; vlapic_set_reg(vlapic, offset, val); if ( offset == APIC_LVT0 ) { vlapic_adjust_i8259_target(v->domain); pt_may_unmask_irq(v->domain, NULL); } if ( (offset == APIC_LVTT) && !(val & APIC_LVT_MASKED) ) pt_may_unmask_irq(NULL, &vlapic->pt); break; case APIC_TMICT: { uint64_t period; if ( !vlapic_lvtt_oneshot(vlapic) && !vlapic_lvtt_period(vlapic) ) break; vlapic_set_reg(vlapic, APIC_TMICT, val); if ( val == 0 ) { destroy_periodic_time(&vlapic->pt); break; } period = ((uint64_t)APIC_BUS_CYCLE_NS * (uint32_t)val * vlapic->hw.timer_divisor); create_periodic_time(current, &vlapic->pt, period, vlapic_lvtt_period(vlapic) ? period : 0, vlapic->pt.irq, vlapic_lvtt_period(vlapic) ? vlapic_pt_cb : NULL, &vlapic->timer_last_update); vlapic->timer_last_update = vlapic->pt.last_plt_gtime; HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "bus cycle is %uns, " "initial count %lu, period %"PRIu64"ns", APIC_BUS_CYCLE_NS, val, period); } break; case APIC_TDCR: vlapic_set_tdcr(vlapic, val & 0xb); HVM_DBG_LOG(DBG_LEVEL_VLAPIC_TIMER, "timer divisor is %#x", vlapic->hw.timer_divisor); break; default: break; } if (rc == X86EMUL_UNHANDLEABLE) gdprintk(XENLOG_DEBUG, "Local APIC Write wrong to register %#x\n", offset); return rc; } static int vlapic_write(struct vcpu *v, unsigned long address, unsigned long len, unsigned long val) { struct vlapic *vlapic = vcpu_vlapic(v); unsigned int offset = address - vlapic_base_address(vlapic); int rc = X86EMUL_OKAY; if ( offset != 0xb0 ) HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "offset %#x with length %#lx, and value is %#lx", offset, len, val); /* * According to the IA32 Manual, all accesses should be 32 bits. * Some OSes do 8- or 16-byte accesses, however. 
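 *
 * (For narrow writes the code below performs a read-modify-write: it reads
 * the aligned 32-bit register, replaces only the addressed byte or word at
 * bit offset 8 * alignment, and hands the merged 32-bit value on to
 * vlapic_reg_write().  A self-contained sketch of that merge, not part of
 * the original source, follows.)
 */
#if 0 /* Illustrative sketch only. */
#include <stdint.h>

/* Merge a 1- or 2-byte write at byte offset 'align' into the 32-bit 'reg'. */
static uint32_t merge_narrow_write(uint32_t reg, uint32_t val,
                                   unsigned int len, unsigned int align)
{
    uint32_t mask = (len == 1 ? 0xffu : 0xffffu) << (8 * align);

    return (reg & ~mask) | ((val << (8 * align)) & mask);
}
#endif
/*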
*/ val = (uint32_t)val; if ( len != 4 ) { unsigned int tmp; unsigned char alignment; gdprintk(XENLOG_INFO, "Notice: Local APIC write with len = %lx\n",len); alignment = offset & 0x3; (void)vlapic_read_aligned(vlapic, offset & ~0x3, &tmp); switch ( len ) { case 1: val = ((tmp & ~(0xff << (8*alignment))) | ((val & 0xff) << (8*alignment))); break; case 2: if ( alignment & 1 ) goto unaligned_exit_and_crash; val = ((tmp & ~(0xffff << (8*alignment))) | ((val & 0xffff) << (8*alignment))); break; default: gdprintk(XENLOG_ERR, "Local APIC write with len = %lx, " "should be 4 instead\n", len); goto exit_and_crash; } } else if ( (offset & 0x3) != 0 ) goto unaligned_exit_and_crash; offset &= ~0x3; return vlapic_reg_write(v, offset, val); unaligned_exit_and_crash: gdprintk(XENLOG_ERR, "Unaligned LAPIC write len=%#lx at offset=%#x.\n", len, offset); exit_and_crash: domain_crash(v->domain); return rc; } int vlapic_apicv_write(struct vcpu *v, unsigned int offset) { uint32_t val = vlapic_get_reg(vcpu_vlapic(v), offset); return vlapic_reg_write(v, offset, val); } int hvm_x2apic_msr_write(struct vcpu *v, unsigned int msr, uint64_t msr_content) { struct vlapic *vlapic = vcpu_vlapic(v); uint32_t offset = (msr - MSR_IA32_APICBASE_MSR) << 4; if ( !vlapic_x2apic_mode(vlapic) ) return X86EMUL_UNHANDLEABLE; switch ( offset ) { int rc; case APIC_ICR: rc = vlapic_reg_write(v, APIC_ICR2, (uint32_t)(msr_content >> 32)); if ( rc ) return rc; break; case APIC_ICR2: return X86EMUL_UNHANDLEABLE; } return vlapic_reg_write(v, offset, (uint32_t)msr_content); } static int vlapic_range(struct vcpu *v, unsigned long addr) { struct vlapic *vlapic = vcpu_vlapic(v); unsigned long offset = addr - vlapic_base_address(vlapic); return (!vlapic_hw_disabled(vlapic) && (offset < PAGE_SIZE)); } const struct hvm_mmio_handler vlapic_mmio_handler = { .check_handler = vlapic_range, .read_handler = vlapic_read, .write_handler = vlapic_write }; void vlapic_msr_set(struct vlapic *vlapic, uint64_t value) { if ( (vlapic->hw.apic_base_msr ^ value) & MSR_IA32_APICBASE_ENABLE ) { if ( value & MSR_IA32_APICBASE_ENABLE ) { vlapic_reset(vlapic); vlapic->hw.disabled &= ~VLAPIC_HW_DISABLED; pt_may_unmask_irq(vlapic_domain(vlapic), &vlapic->pt); } else { vlapic->hw.disabled |= VLAPIC_HW_DISABLED; pt_may_unmask_irq(vlapic_domain(vlapic), NULL); } } vlapic->hw.apic_base_msr = value; if ( vlapic_x2apic_mode(vlapic) ) { u32 id = vlapic_get_reg(vlapic, APIC_ID); u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); vlapic_set_reg(vlapic, APIC_LDR, ldr); } vmx_vlapic_msr_changed(vlapic_vcpu(vlapic)); HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "apic base msr is 0x%016"PRIx64, vlapic->hw.apic_base_msr); } uint64_t vlapic_tdt_msr_get(struct vlapic *vlapic) { if ( !vlapic_lvtt_tdt(vlapic) ) return 0; return vlapic->hw.tdt_msr; } void vlapic_tdt_msr_set(struct vlapic *vlapic, uint64_t value) { uint64_t guest_tsc; struct vcpu *v = vlapic_vcpu(vlapic); /* may need to exclude some other conditions like vlapic->hw.disabled */ if ( !vlapic_lvtt_tdt(vlapic) ) { HVM_DBG_LOG(DBG_LEVEL_VLAPIC_TIMER, "ignore tsc deadline msr write"); return; } /* new_value = 0, >0 && <= now, > now */ guest_tsc = hvm_get_guest_tsc(v); if ( value > guest_tsc ) { uint64_t delta = gtsc_to_gtime(v->domain, value - guest_tsc); delta = max_t(s64, delta, 0); HVM_DBG_LOG(DBG_LEVEL_VLAPIC_TIMER, "delta[0x%016"PRIx64"]", delta); vlapic->hw.tdt_msr = value; /* .... 
reprogram tdt timer */ create_periodic_time(v, &vlapic->pt, delta, 0, vlapic->pt.irq, vlapic_tdt_pt_cb, &vlapic->timer_last_update); vlapic->timer_last_update = vlapic->pt.last_plt_gtime; } else { vlapic->hw.tdt_msr = 0; /* trigger a timer event if needed */ if ( value > 0 ) { create_periodic_time(v, &vlapic->pt, 0, 0, vlapic->pt.irq, vlapic_tdt_pt_cb, &vlapic->timer_last_update); vlapic->timer_last_update = vlapic->pt.last_plt_gtime; } else { /* .... stop tdt timer */ destroy_periodic_time(&vlapic->pt); } HVM_DBG_LOG(DBG_LEVEL_VLAPIC_TIMER, "value[0x%016"PRIx64"]", value); } HVM_DBG_LOG(DBG_LEVEL_VLAPIC_TIMER, "tdt_msr[0x%016"PRIx64"]," " gtsc[0x%016"PRIx64"]", vlapic->hw.tdt_msr, guest_tsc); } static int __vlapic_accept_pic_intr(struct vcpu *v) { struct domain *d = v->domain; struct vlapic *vlapic = vcpu_vlapic(v); uint32_t lvt0 = vlapic_get_reg(vlapic, APIC_LVT0); union vioapic_redir_entry redir0 = domain_vioapic(d)->redirtbl[0]; /* We deliver 8259 interrupts to the appropriate CPU as follows. */ return ((/* IOAPIC pin0 is unmasked and routing to this LAPIC? */ ((redir0.fields.delivery_mode == dest_ExtINT) && !redir0.fields.mask && redir0.fields.dest_id == VLAPIC_ID(vlapic) && !vlapic_disabled(vlapic)) || /* LAPIC has LVT0 unmasked for ExtInts? */ ((lvt0 & (APIC_MODE_MASK|APIC_LVT_MASKED)) == APIC_DM_EXTINT) || /* LAPIC is fully disabled? */ vlapic_hw_disabled(vlapic))); } int vlapic_accept_pic_intr(struct vcpu *v) { return ((v == v->domain->arch.hvm_domain.i8259_target) && __vlapic_accept_pic_intr(v)); } void vlapic_adjust_i8259_target(struct domain *d) { struct vcpu *v; for_each_vcpu ( d, v ) if ( __vlapic_accept_pic_intr(v) ) goto found; v = d->vcpu ? d->vcpu[0] : NULL; found: if ( d->arch.hvm_domain.i8259_target == v ) return; d->arch.hvm_domain.i8259_target = v; pt_adjust_global_vcpu_target(v); } int vlapic_virtual_intr_delivery_enabled(void) { if ( hvm_funcs.virtual_intr_delivery_enabled ) return hvm_funcs.virtual_intr_delivery_enabled(); else return 0; } int vlapic_has_pending_irq(struct vcpu *v) { struct vlapic *vlapic = vcpu_vlapic(v); int irr, isr; if ( !vlapic_enabled(vlapic) ) return -1; irr = vlapic_find_highest_irr(vlapic); if ( irr == -1 ) return -1; if ( vlapic_virtual_intr_delivery_enabled() && !nestedhvm_vcpu_in_guestmode(v) ) return irr; isr = vlapic_find_highest_isr(vlapic); isr = (isr != -1) ? isr : 0; if ( (isr & 0xf0) >= (irr & 0xf0) ) return -1; return irr; } int vlapic_ack_pending_irq(struct vcpu *v, int vector, bool_t force_ack) { struct vlapic *vlapic = vcpu_vlapic(v); if ( force_ack || !vlapic_virtual_intr_delivery_enabled() ) { vlapic_set_vector(vector, &vlapic->regs->data[APIC_ISR]); vlapic_clear_irr(vector, vlapic); } return 1; } bool_t is_vlapic_lvtpc_enabled(struct vlapic *vlapic) { return (vlapic_enabled(vlapic) && !(vlapic_get_reg(vlapic, APIC_LVTPC) & APIC_LVT_MASKED)); } /* Reset the VLPAIC back to its power-on/reset state. 
*/ void vlapic_reset(struct vlapic *vlapic) { struct vcpu *v = vlapic_vcpu(vlapic); int i; vlapic_set_reg(vlapic, APIC_ID, (v->vcpu_id * 2) << 24); vlapic_set_reg(vlapic, APIC_LVR, VLAPIC_VERSION); for ( i = 0; i < 8; i++ ) { vlapic_set_reg(vlapic, APIC_IRR + 0x10 * i, 0); vlapic_set_reg(vlapic, APIC_ISR + 0x10 * i, 0); vlapic_set_reg(vlapic, APIC_TMR + 0x10 * i, 0); } vlapic_set_reg(vlapic, APIC_ICR, 0); vlapic_set_reg(vlapic, APIC_ICR2, 0); vlapic_set_reg(vlapic, APIC_LDR, 0); vlapic_set_reg(vlapic, APIC_TASKPRI, 0); vlapic_set_reg(vlapic, APIC_TMICT, 0); vlapic_set_reg(vlapic, APIC_TMCCT, 0); vlapic_set_tdcr(vlapic, 0); vlapic_set_reg(vlapic, APIC_DFR, 0xffffffffU); for ( i = 0; i < VLAPIC_LVT_NUM; i++ ) vlapic_set_reg(vlapic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); vlapic_set_reg(vlapic, APIC_SPIV, 0xff); vlapic->hw.disabled |= VLAPIC_SW_DISABLED; destroy_periodic_time(&vlapic->pt); } /* rearm the actimer if needed, after a HVM restore */ static void lapic_rearm(struct vlapic *s) { unsigned long tmict; uint64_t period, tdt_msr; s->pt.irq = vlapic_get_reg(s, APIC_LVTT) & APIC_VECTOR_MASK; if ( vlapic_lvtt_tdt(s) ) { if ( (tdt_msr = vlapic_tdt_msr_get(s)) != 0 ) vlapic_tdt_msr_set(s, tdt_msr); return; } if ( (tmict = vlapic_get_reg(s, APIC_TMICT)) == 0 ) return; period = ((uint64_t)APIC_BUS_CYCLE_NS * (uint32_t)tmict * s->hw.timer_divisor); create_periodic_time(vlapic_vcpu(s), &s->pt, period, vlapic_lvtt_period(s) ? period : 0, s->pt.irq, vlapic_lvtt_period(s) ? vlapic_pt_cb : NULL, &s->timer_last_update); s->timer_last_update = s->pt.last_plt_gtime; } static int lapic_save_hidden(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct vlapic *s; int rc = 0; for_each_vcpu ( d, v ) { s = vcpu_vlapic(v); if ( (rc = hvm_save_entry(LAPIC, v->vcpu_id, h, &s->hw)) != 0 ) break; } return rc; } static int lapic_save_regs(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct vlapic *s; int rc = 0; for_each_vcpu ( d, v ) { s = vcpu_vlapic(v); if ( (rc = hvm_save_entry(LAPIC_REGS, v->vcpu_id, h, s->regs)) != 0 ) break; } return rc; } static int lapic_load_hidden(struct domain *d, hvm_domain_context_t *h) { uint16_t vcpuid; struct vcpu *v; struct vlapic *s; /* Which vlapic to load? */ vcpuid = hvm_load_instance(h); if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no apic%u\n", d->domain_id, vcpuid); return -EINVAL; } s = vcpu_vlapic(v); if ( hvm_load_entry_zeroextend(LAPIC, h, &s->hw) != 0 ) return -EINVAL; vmx_vlapic_msr_changed(v); return 0; } static int lapic_load_regs(struct domain *d, hvm_domain_context_t *h) { uint16_t vcpuid; struct vcpu *v; struct vlapic *s; /* Which vlapic to load? 
*/ vcpuid = hvm_load_instance(h); if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no apic%u\n", d->domain_id, vcpuid); return -EINVAL; } s = vcpu_vlapic(v); if ( hvm_load_entry(LAPIC_REGS, h, s->regs) != 0 ) return -EINVAL; if ( hvm_funcs.process_isr ) hvm_funcs.process_isr(vlapic_find_highest_isr(s), v); vlapic_adjust_i8259_target(d); lapic_rearm(s); return 0; } HVM_REGISTER_SAVE_RESTORE(LAPIC, lapic_save_hidden, lapic_load_hidden, 1, HVMSR_PER_VCPU); HVM_REGISTER_SAVE_RESTORE(LAPIC_REGS, lapic_save_regs, lapic_load_regs, 1, HVMSR_PER_VCPU); int vlapic_init(struct vcpu *v) { struct vlapic *vlapic = vcpu_vlapic(v); unsigned int memflags = MEMF_node(vcpu_to_node(v)); HVM_DBG_LOG(DBG_LEVEL_VLAPIC, "%d", v->vcpu_id); vlapic->pt.source = PTSRC_lapic; if (vlapic->regs_page == NULL) { vlapic->regs_page = alloc_domheap_page(NULL, memflags); if ( vlapic->regs_page == NULL ) { dprintk(XENLOG_ERR, "alloc vlapic regs error: %d/%d\n", v->domain->domain_id, v->vcpu_id); return -ENOMEM; } } if (vlapic->regs == NULL) { vlapic->regs = __map_domain_page_global(vlapic->regs_page); if ( vlapic->regs == NULL ) { dprintk(XENLOG_ERR, "map vlapic regs error: %d/%d\n", v->domain->domain_id, v->vcpu_id); return -ENOMEM; } } clear_page(vlapic->regs); vlapic_reset(vlapic); vlapic->hw.apic_base_msr = (MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE); if ( v->vcpu_id == 0 ) vlapic->hw.apic_base_msr |= MSR_IA32_APICBASE_BSP; tasklet_init(&vlapic->init_sipi.tasklet, vlapic_init_sipi_action, (unsigned long)v); return 0; } void vlapic_destroy(struct vcpu *v) { struct vlapic *vlapic = vcpu_vlapic(v); tasklet_kill(&vlapic->init_sipi.tasklet); destroy_periodic_time(&vlapic->pt); unmap_domain_page_global(vlapic->regs); free_domheap_page(vlapic->regs_page); } xen-4.4.0/xen/arch/x86/hvm/intercept.c0000664000175000017500000002534612307313555015544 0ustar smbsmb/* * intercept.c: Handle performance critical I/O packets in hypervisor space * * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2008, Citrix Systems, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include static const struct hvm_mmio_handler *const hvm_mmio_handlers[HVM_MMIO_HANDLER_NR] = { &hpet_mmio_handler, &vlapic_mmio_handler, &vioapic_mmio_handler, &msixtbl_mmio_handler, &iommu_mmio_handler }; static int hvm_mmio_access(struct vcpu *v, ioreq_t *p, hvm_mmio_read_t read_handler, hvm_mmio_write_t write_handler) { struct hvm_vcpu_io *vio = &v->arch.hvm_vcpu.hvm_io; unsigned long data; int rc = X86EMUL_OKAY, i, step = p->df ? 
-p->size : p->size; if ( !p->data_is_ptr ) { if ( p->dir == IOREQ_READ ) { if ( vio->mmio_retrying ) { if ( vio->mmio_large_read_bytes != p->size ) return X86EMUL_UNHANDLEABLE; memcpy(&data, vio->mmio_large_read, p->size); vio->mmio_large_read_bytes = 0; vio->mmio_retrying = 0; } else rc = read_handler(v, p->addr, p->size, &data); p->data = data; } else /* p->dir == IOREQ_WRITE */ rc = write_handler(v, p->addr, p->size, p->data); return rc; } if ( p->dir == IOREQ_READ ) { for ( i = 0; i < p->count; i++ ) { if ( vio->mmio_retrying ) { if ( vio->mmio_large_read_bytes != p->size ) return X86EMUL_UNHANDLEABLE; memcpy(&data, vio->mmio_large_read, p->size); vio->mmio_large_read_bytes = 0; vio->mmio_retrying = 0; } else { rc = read_handler(v, p->addr + step * i, p->size, &data); if ( rc != X86EMUL_OKAY ) break; } switch ( hvm_copy_to_guest_phys(p->data + step * i, &data, p->size) ) { case HVMCOPY_okay: break; case HVMCOPY_gfn_paged_out: case HVMCOPY_gfn_shared: rc = X86EMUL_RETRY; break; case HVMCOPY_bad_gfn_to_mfn: /* Drop the write as real hardware would. */ continue; case HVMCOPY_bad_gva_to_gfn: ASSERT(0); /* fall through */ default: rc = X86EMUL_UNHANDLEABLE; break; } if ( rc != X86EMUL_OKAY) break; } if ( rc == X86EMUL_RETRY ) { vio->mmio_retry = 1; vio->mmio_large_read_bytes = p->size; memcpy(vio->mmio_large_read, &data, p->size); } } else { for ( i = 0; i < p->count; i++ ) { switch ( hvm_copy_from_guest_phys(&data, p->data + step * i, p->size) ) { case HVMCOPY_okay: break; case HVMCOPY_gfn_paged_out: case HVMCOPY_gfn_shared: rc = X86EMUL_RETRY; break; case HVMCOPY_bad_gfn_to_mfn: data = ~0; break; case HVMCOPY_bad_gva_to_gfn: ASSERT(0); /* fall through */ default: rc = X86EMUL_UNHANDLEABLE; break; } if ( rc != X86EMUL_OKAY ) break; rc = write_handler(v, p->addr + step * i, p->size, data); if ( rc != X86EMUL_OKAY ) break; } if ( rc == X86EMUL_RETRY ) vio->mmio_retry = 1; } if ( i != 0 ) { p->count = i; rc = X86EMUL_OKAY; } return rc; } int hvm_mmio_intercept(ioreq_t *p) { struct vcpu *v = current; int i; for ( i = 0; i < HVM_MMIO_HANDLER_NR; i++ ) if ( hvm_mmio_handlers[i]->check_handler(v, p->addr) ) return hvm_mmio_access( v, p, hvm_mmio_handlers[i]->read_handler, hvm_mmio_handlers[i]->write_handler); return X86EMUL_UNHANDLEABLE; } static int process_portio_intercept(portio_action_t action, ioreq_t *p) { struct hvm_vcpu_io *vio = ¤t->arch.hvm_vcpu.hvm_io; int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size; uint32_t data; if ( !p->data_is_ptr ) { if ( p->dir == IOREQ_READ ) { if ( vio->mmio_retrying ) { if ( vio->mmio_large_read_bytes != p->size ) return X86EMUL_UNHANDLEABLE; memcpy(&data, vio->mmio_large_read, p->size); vio->mmio_large_read_bytes = 0; vio->mmio_retrying = 0; } else rc = action(IOREQ_READ, p->addr, p->size, &data); p->data = data; } else { data = p->data; rc = action(IOREQ_WRITE, p->addr, p->size, &data); } return rc; } if ( p->dir == IOREQ_READ ) { for ( i = 0; i < p->count; i++ ) { if ( vio->mmio_retrying ) { if ( vio->mmio_large_read_bytes != p->size ) return X86EMUL_UNHANDLEABLE; memcpy(&data, vio->mmio_large_read, p->size); vio->mmio_large_read_bytes = 0; vio->mmio_retrying = 0; } else { rc = action(IOREQ_READ, p->addr, p->size, &data); if ( rc != X86EMUL_OKAY ) break; } switch ( hvm_copy_to_guest_phys(p->data + step * i, &data, p->size) ) { case HVMCOPY_okay: break; case HVMCOPY_gfn_paged_out: case HVMCOPY_gfn_shared: rc = X86EMUL_RETRY; break; case HVMCOPY_bad_gfn_to_mfn: /* Drop the write as real hardware would. 
*/ continue; case HVMCOPY_bad_gva_to_gfn: ASSERT(0); /* fall through */ default: rc = X86EMUL_UNHANDLEABLE; break; } if ( rc != X86EMUL_OKAY) break; } if ( rc == X86EMUL_RETRY ) { vio->mmio_retry = 1; vio->mmio_large_read_bytes = p->size; memcpy(vio->mmio_large_read, &data, p->size); } } else /* p->dir == IOREQ_WRITE */ { for ( i = 0; i < p->count; i++ ) { data = 0; switch ( hvm_copy_from_guest_phys(&data, p->data + step * i, p->size) ) { case HVMCOPY_okay: break; case HVMCOPY_gfn_paged_out: case HVMCOPY_gfn_shared: rc = X86EMUL_RETRY; break; case HVMCOPY_bad_gfn_to_mfn: data = ~0; break; case HVMCOPY_bad_gva_to_gfn: ASSERT(0); /* fall through */ default: rc = X86EMUL_UNHANDLEABLE; break; } if ( rc != X86EMUL_OKAY ) break; rc = action(IOREQ_WRITE, p->addr, p->size, &data); if ( rc != X86EMUL_OKAY ) break; } if ( rc == X86EMUL_RETRY ) vio->mmio_retry = 1; } if ( i != 0 ) { p->count = i; rc = X86EMUL_OKAY; } return rc; } /* * Check if the request is handled inside xen * return value: 0 --not handled; 1 --handled */ int hvm_io_intercept(ioreq_t *p, int type) { struct vcpu *v = current; struct hvm_io_handler *handler = v->domain->arch.hvm_domain.io_handler; int i; unsigned long addr, size; if ( type == HVM_PORTIO ) { int rc = dpci_ioport_intercept(p); if ( (rc == X86EMUL_OKAY) || (rc == X86EMUL_RETRY) ) return rc; } for ( i = 0; i < handler->num_slot; i++ ) { if ( type != handler->hdl_list[i].type ) continue; addr = handler->hdl_list[i].addr; size = handler->hdl_list[i].size; if ( (p->addr >= addr) && ((p->addr + p->size) <= (addr + size)) ) { if ( type == HVM_PORTIO ) return process_portio_intercept( handler->hdl_list[i].action.portio, p); return handler->hdl_list[i].action.mmio(p); } } return X86EMUL_UNHANDLEABLE; } void register_io_handler( struct domain *d, unsigned long addr, unsigned long size, void *action, int type) { struct hvm_io_handler *handler = d->arch.hvm_domain.io_handler; int num = handler->num_slot; BUG_ON(num >= MAX_IO_HANDLER); handler->hdl_list[num].addr = addr; handler->hdl_list[num].size = size; handler->hdl_list[num].action.ptr = action; handler->hdl_list[num].type = type; handler->num_slot++; } void relocate_io_handler( struct domain *d, unsigned long old_addr, unsigned long new_addr, unsigned long size, int type) { struct hvm_io_handler *handler = d->arch.hvm_domain.io_handler; int i; for ( i = 0; i < handler->num_slot; i++ ) if ( (handler->hdl_list[i].addr == old_addr) && (handler->hdl_list[i].size == size) && (handler->hdl_list[i].type == type) ) handler->hdl_list[i].addr = new_addr; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/asid.c0000664000175000017500000001142412307313555014457 0ustar smbsmb/* * asid.c: ASID management * Copyright (c) 2007, Advanced Micro Devices, Inc. * Copyright (c) 2009, Citrix Systems, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
*/ #include #include #include #include #include #include #include /* Xen command-line option to enable ASIDs */ static int opt_asid_enabled = 1; boolean_param("asid", opt_asid_enabled); /* * ASIDs partition the physical TLB. In the current implementation ASIDs are * introduced to reduce the number of TLB flushes. Each time the guest's * virtual address space changes (e.g. due to an INVLPG, MOV-TO-{CR3, CR4} * operation), instead of flushing the TLB, a new ASID is assigned. This * reduces the number of TLB flushes to at most 1/#ASIDs. The biggest * advantage is that hot parts of the hypervisor's code and data retain in * the TLB. * * Sketch of the Implementation: * * ASIDs are a CPU-local resource. As preemption of ASIDs is not possible, * ASIDs are assigned in a round-robin scheme. To minimize the overhead of * ASID invalidation, at the time of a TLB flush, ASIDs are tagged with a * 64-bit generation. Only on a generation overflow the code needs to * invalidate all ASID information stored at the VCPUs with are run on the * specific physical processor. This overflow appears after about 2^80 * host processor cycles, so we do not optimize this case, but simply disable * ASID useage to retain correctness. */ /* Per-CPU ASID management. */ struct hvm_asid_data { uint64_t core_asid_generation; uint32_t next_asid; uint32_t max_asid; bool_t disabled; }; static DEFINE_PER_CPU(struct hvm_asid_data, hvm_asid_data); void hvm_asid_init(int nasids) { static int8_t g_disabled = -1; struct hvm_asid_data *data = &this_cpu(hvm_asid_data); data->max_asid = nasids - 1; data->disabled = !opt_asid_enabled || (nasids <= 1); if ( g_disabled != data->disabled ) { printk("HVM: ASIDs %sabled.\n", data->disabled ? "dis" : "en"); if ( g_disabled < 0 ) g_disabled = data->disabled; } /* Zero indicates 'invalid generation', so we start the count at one. */ data->core_asid_generation = 1; /* Zero indicates 'ASIDs disabled', so we start the count at one. */ data->next_asid = 1; } void hvm_asid_flush_vcpu_asid(struct hvm_vcpu_asid *asid) { asid->generation = 0; } void hvm_asid_flush_vcpu(struct vcpu *v) { hvm_asid_flush_vcpu_asid(&v->arch.hvm_vcpu.n1asid); hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(v).nv_n2asid); } void hvm_asid_flush_core(void) { struct hvm_asid_data *data = &this_cpu(hvm_asid_data); if ( data->disabled ) return; if ( likely(++data->core_asid_generation != 0) ) return; /* * ASID generations are 64 bit. Overflow of generations never happens. * For safety, we simply disable ASIDs, so correctness is established; it * only runs a bit slower. */ printk("HVM: ASID generation overrun. Disabling ASIDs.\n"); data->disabled = 1; } bool_t hvm_asid_handle_vmenter(struct hvm_vcpu_asid *asid) { struct hvm_asid_data *data = &this_cpu(hvm_asid_data); /* On erratum #170 systems we must flush the TLB. * Generation overruns are taken here, too. */ if ( data->disabled ) goto disabled; /* Test if VCPU has valid ASID. */ if ( asid->generation == data->core_asid_generation ) return 0; /* If there are no free ASIDs, need to go to a new generation */ if ( unlikely(data->next_asid > data->max_asid) ) { hvm_asid_flush_core(); data->next_asid = 1; if ( data->disabled ) goto disabled; } /* Now guaranteed to be a free ASID. */ asid->asid = data->next_asid++; asid->generation = data->core_asid_generation; /* * When we assign ASID 1, flush all TLB entries as we are starting a new * generation, and all old ASID allocations are now stale. 
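 *
 * A sketch of how a VM-entry path is expected to consume this result (the
 * vmcb_set_guest_asid() helper and tlb_control field named below are
 * assumptions for illustration, not quoted from that code):
 *
 *     bool_t need_flush = hvm_asid_handle_vmenter(&v->arch.hvm_vcpu.n1asid);
 *     vmcb_set_guest_asid(vmcb, v->arch.hvm_vcpu.n1asid.asid);
 *     vmcb->tlb_control = need_flush;
 *
 * i.e. a non-zero return must be turned into a TLB flush for the freshly
 * assigned ASID, so translations tagged with ASIDs from older generations
 * can never be hit.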
*/ return (asid->asid == 1); disabled: asid->asid = 0; return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/vpt.c0000664000175000017500000003530512307313555014354 0ustar smbsmb/* * vpt.c: Virtual Platform Timer * * Copyright (c) 2006, Xiaowei Yang, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #define mode_is(d, name) \ ((d)->arch.hvm_domain.params[HVM_PARAM_TIMER_MODE] == HVMPTM_##name) void hvm_init_guest_time(struct domain *d) { struct pl_time *pl = &d->arch.hvm_domain.pl_time; spin_lock_init(&pl->pl_time_lock); pl->stime_offset = -(u64)get_s_time(); pl->last_guest_time = 0; } u64 hvm_get_guest_time(struct vcpu *v) { struct pl_time *pl = &v->domain->arch.hvm_domain.pl_time; u64 now; /* Called from device models shared with PV guests. Be careful. */ ASSERT(is_hvm_vcpu(v)); spin_lock(&pl->pl_time_lock); now = get_s_time() + pl->stime_offset; if ( (int64_t)(now - pl->last_guest_time) > 0 ) pl->last_guest_time = now; else now = ++pl->last_guest_time; spin_unlock(&pl->pl_time_lock); return now + v->arch.hvm_vcpu.stime_offset; } void hvm_set_guest_time(struct vcpu *v, u64 guest_time) { u64 offset = guest_time - hvm_get_guest_time(v); if ( offset ) { v->arch.hvm_vcpu.stime_offset += offset; /* * If hvm_vcpu.stime_offset is updated make sure to * also update vcpu time, since this value is used to * calculate the TSC. 
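 * Net effect: the per-vcpu stime_offset grows by exactly the requested
 * delta, so immediately after this call hvm_get_guest_time(v) reads back
 * (give or take the time spent between the two calls) the guest_time value
 * that was passed in, since that function returns the domain-wide pl_time
 * clock plus this per-vcpu offset.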
*/ if ( v == current ) update_vcpu_system_time(v); } } static int pt_irq_vector(struct periodic_time *pt, enum hvm_intsrc src) { struct vcpu *v = pt->vcpu; unsigned int gsi, isa_irq; if ( pt->source == PTSRC_lapic ) return pt->irq; isa_irq = pt->irq; gsi = hvm_isa_irq_to_gsi(isa_irq); if ( src == hvm_intsrc_pic ) return (v->domain->arch.hvm_domain.vpic[isa_irq >> 3].irq_base + (isa_irq & 7)); ASSERT(src == hvm_intsrc_lapic); return domain_vioapic(v->domain)->redirtbl[gsi].fields.vector; } static int pt_irq_masked(struct periodic_time *pt) { struct vcpu *v = pt->vcpu; unsigned int gsi, isa_irq; uint8_t pic_imr; if ( pt->source == PTSRC_lapic ) { struct vlapic *vlapic = vcpu_vlapic(v); return (!vlapic_enabled(vlapic) || (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED)); } isa_irq = pt->irq; gsi = hvm_isa_irq_to_gsi(isa_irq); pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr; return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) && domain_vioapic(v->domain)->redirtbl[gsi].fields.mask); } static void pt_lock(struct periodic_time *pt) { struct vcpu *v; for ( ; ; ) { v = pt->vcpu; spin_lock(&v->arch.hvm_vcpu.tm_lock); if ( likely(pt->vcpu == v) ) break; spin_unlock(&v->arch.hvm_vcpu.tm_lock); } } static void pt_unlock(struct periodic_time *pt) { spin_unlock(&pt->vcpu->arch.hvm_vcpu.tm_lock); } static void pt_process_missed_ticks(struct periodic_time *pt) { s_time_t missed_ticks, now = NOW(); if ( pt->one_shot ) return; missed_ticks = now - pt->scheduled; if ( missed_ticks <= 0 ) return; missed_ticks = missed_ticks / (s_time_t) pt->period + 1; if ( mode_is(pt->vcpu->domain, no_missed_ticks_pending) ) pt->do_not_freeze = !pt->pending_intr_nr; else pt->pending_intr_nr += missed_ticks; pt->scheduled += missed_ticks * pt->period; } static void pt_freeze_time(struct vcpu *v) { if ( !mode_is(v->domain, delay_for_missed_ticks) ) return; v->arch.hvm_vcpu.guest_time = hvm_get_guest_time(v); } static void pt_thaw_time(struct vcpu *v) { if ( !mode_is(v->domain, delay_for_missed_ticks) ) return; if ( v->arch.hvm_vcpu.guest_time == 0 ) return; hvm_set_guest_time(v, v->arch.hvm_vcpu.guest_time); v->arch.hvm_vcpu.guest_time = 0; } void pt_save_timer(struct vcpu *v) { struct list_head *head = &v->arch.hvm_vcpu.tm_list; struct periodic_time *pt; if ( test_bit(_VPF_blocked, &v->pause_flags) ) return; spin_lock(&v->arch.hvm_vcpu.tm_lock); list_for_each_entry ( pt, head, list ) if ( !pt->do_not_freeze ) stop_timer(&pt->timer); pt_freeze_time(v); spin_unlock(&v->arch.hvm_vcpu.tm_lock); } void pt_restore_timer(struct vcpu *v) { struct list_head *head = &v->arch.hvm_vcpu.tm_list; struct periodic_time *pt; spin_lock(&v->arch.hvm_vcpu.tm_lock); list_for_each_entry ( pt, head, list ) { if ( pt->pending_intr_nr == 0 ) { pt_process_missed_ticks(pt); set_timer(&pt->timer, pt->scheduled); } } pt_thaw_time(v); spin_unlock(&v->arch.hvm_vcpu.tm_lock); } static void pt_timer_fn(void *data) { struct periodic_time *pt = data; pt_lock(pt); pt->pending_intr_nr++; pt->scheduled += pt->period; pt->do_not_freeze = 0; vcpu_kick(pt->vcpu); pt_unlock(pt); } int pt_update_irq(struct vcpu *v) { struct list_head *head = &v->arch.hvm_vcpu.tm_list; struct periodic_time *pt, *temp, *earliest_pt; uint64_t max_lag; int irq, is_lapic; void *pt_priv; rescan: spin_lock(&v->arch.hvm_vcpu.tm_lock); rescan_locked: earliest_pt = NULL; max_lag = -1ULL; list_for_each_entry_safe ( pt, temp, head, list ) { if ( pt->pending_intr_nr ) { /* RTC code takes care of disabling the timer itself. 
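 * Consequently a masked RTC periodic source (RTC_IRQ with pt->priv set) is
 * deliberately left on the list at this point; the RTC-specific handling
 * further down, after rtc_periodic_interrupt() has been consulted, decides
 * whether its emulation gets suspended.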
*/ if ( (pt->irq != RTC_IRQ || !pt->priv) && pt_irq_masked(pt) ) { /* suspend timer emulation */ list_del(&pt->list); pt->on_list = 0; } else { if ( (pt->last_plt_gtime + pt->period) < max_lag ) { max_lag = pt->last_plt_gtime + pt->period; earliest_pt = pt; } } } } if ( earliest_pt == NULL ) { spin_unlock(&v->arch.hvm_vcpu.tm_lock); return -1; } earliest_pt->irq_issued = 1; irq = earliest_pt->irq; is_lapic = (earliest_pt->source == PTSRC_lapic); pt_priv = earliest_pt->priv; spin_unlock(&v->arch.hvm_vcpu.tm_lock); if ( is_lapic ) vlapic_set_irq(vcpu_vlapic(v), irq, 0); else if ( irq == RTC_IRQ && pt_priv ) { if ( !rtc_periodic_interrupt(pt_priv) ) irq = -1; pt_lock(earliest_pt); if ( irq < 0 && earliest_pt->pending_intr_nr ) { /* * RTC periodic timer runs without the corresponding interrupt * being enabled - need to mimic enough of pt_intr_post() to keep * things going. */ earliest_pt->pending_intr_nr = 0; earliest_pt->irq_issued = 0; set_timer(&earliest_pt->timer, earliest_pt->scheduled); } else if ( irq >= 0 && pt_irq_masked(earliest_pt) ) { if ( earliest_pt->on_list ) { /* suspend timer emulation */ list_del(&earliest_pt->list); earliest_pt->on_list = 0; } irq = -1; } /* Avoid dropping the lock if we can. */ if ( irq < 0 && v == earliest_pt->vcpu ) goto rescan_locked; pt_unlock(earliest_pt); if ( irq < 0 ) goto rescan; } else { hvm_isa_irq_deassert(v->domain, irq); hvm_isa_irq_assert(v->domain, irq); } /* * If periodic timer interrut is handled by lapic, its vector in * IRR is returned and used to set eoi_exit_bitmap for virtual * interrupt delivery case. Otherwise return -1 to do nothing. */ if ( !is_lapic && platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) && (&v->domain->arch.hvm_domain)->vpic[irq >> 3].int_output ) return -1; else return pt_irq_vector(earliest_pt, hvm_intsrc_lapic); } static struct periodic_time *is_pt_irq( struct vcpu *v, struct hvm_intack intack) { struct list_head *head = &v->arch.hvm_vcpu.tm_list; struct periodic_time *pt; list_for_each_entry ( pt, head, list ) { if ( pt->pending_intr_nr && pt->irq_issued && (intack.vector == pt_irq_vector(pt, intack.source)) ) return pt; } return NULL; } void pt_intr_post(struct vcpu *v, struct hvm_intack intack) { struct periodic_time *pt; time_cb *cb; void *cb_priv; if ( intack.source == hvm_intsrc_vector ) return; spin_lock(&v->arch.hvm_vcpu.tm_lock); pt = is_pt_irq(v, intack); if ( pt == NULL ) { spin_unlock(&v->arch.hvm_vcpu.tm_lock); return; } pt->irq_issued = 0; if ( pt->one_shot ) { if ( pt->on_list ) list_del(&pt->list); pt->on_list = 0; pt->pending_intr_nr = 0; } else if ( mode_is(v->domain, one_missed_tick_pending) || mode_is(v->domain, no_missed_ticks_pending) ) { pt->last_plt_gtime = hvm_get_guest_time(v); pt_process_missed_ticks(pt); pt->pending_intr_nr = 0; /* 'collapse' all missed ticks */ set_timer(&pt->timer, pt->scheduled); } else { pt->last_plt_gtime += pt->period; if ( --pt->pending_intr_nr == 0 ) { pt_process_missed_ticks(pt); if ( pt->pending_intr_nr == 0 ) set_timer(&pt->timer, pt->scheduled); } } if ( mode_is(v->domain, delay_for_missed_ticks) && (hvm_get_guest_time(v) < pt->last_plt_gtime) ) hvm_set_guest_time(v, pt->last_plt_gtime); cb = pt->cb; cb_priv = pt->priv; spin_unlock(&v->arch.hvm_vcpu.tm_lock); if ( cb != NULL ) cb(v, cb_priv); } void pt_migrate(struct vcpu *v) { struct list_head *head = &v->arch.hvm_vcpu.tm_list; struct periodic_time *pt; spin_lock(&v->arch.hvm_vcpu.tm_lock); list_for_each_entry ( pt, head, list ) migrate_timer(&pt->timer, v->processor); 
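/* Every platform timer bound to this vcpu has now been moved so that its underlying Xen timer fires on the vcpu's (possibly new) processor. */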
spin_unlock(&v->arch.hvm_vcpu.tm_lock); } void create_periodic_time( struct vcpu *v, struct periodic_time *pt, uint64_t delta, uint64_t period, uint8_t irq, time_cb *cb, void *data) { ASSERT(pt->source != 0); destroy_periodic_time(pt); spin_lock(&v->arch.hvm_vcpu.tm_lock); pt->pending_intr_nr = 0; pt->do_not_freeze = 0; pt->irq_issued = 0; /* Periodic timer must be at least 0.1ms. */ if ( (period < 100000) && period ) { if ( !test_and_set_bool(pt->warned_timeout_too_short) ) gdprintk(XENLOG_WARNING, "HVM_PlatformTime: program too " "small period %"PRIu64"\n", period); period = 100000; } pt->period = period; pt->vcpu = v; pt->last_plt_gtime = hvm_get_guest_time(pt->vcpu); pt->irq = irq; pt->one_shot = !period; pt->scheduled = NOW() + delta; if ( !pt->one_shot ) { if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VPT_ALIGN] ) { pt->scheduled = align_timer(pt->scheduled, pt->period); } else if ( pt->source == PTSRC_lapic ) { /* * Offset LAPIC ticks from other timer ticks. Otherwise guests * which use LAPIC ticks for process accounting can see long * sequences of process ticks incorrectly accounted to interrupt * processing (seen with RHEL3 guest). */ pt->scheduled += delta >> 1; } } pt->cb = cb; pt->priv = data; pt->on_list = 1; list_add(&pt->list, &v->arch.hvm_vcpu.tm_list); init_timer(&pt->timer, pt_timer_fn, pt, v->processor); set_timer(&pt->timer, pt->scheduled); spin_unlock(&v->arch.hvm_vcpu.tm_lock); } void destroy_periodic_time(struct periodic_time *pt) { /* Was this structure previously initialised by create_periodic_time()? */ if ( pt->vcpu == NULL ) return; pt_lock(pt); if ( pt->on_list ) list_del(&pt->list); pt->on_list = 0; pt->pending_intr_nr = 0; pt_unlock(pt); /* * pt_timer_fn() can run until this kill_timer() returns. We must do this * outside pt_lock() otherwise we can deadlock with pt_timer_fn(). 
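 * (pt_timer_fn() itself starts by taking pt_lock(), and kill_timer() has to
 * wait for any handler instance that is already running, so issuing the
 * kill while still holding the lock could leave the two sides waiting on
 * each other forever.)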
*/ kill_timer(&pt->timer); } static void pt_adjust_vcpu(struct periodic_time *pt, struct vcpu *v) { int on_list; ASSERT(pt->source == PTSRC_isa); if ( pt->vcpu == NULL ) return; pt_lock(pt); on_list = pt->on_list; if ( pt->on_list ) list_del(&pt->list); pt->on_list = 0; pt_unlock(pt); spin_lock(&v->arch.hvm_vcpu.tm_lock); pt->vcpu = v; if ( on_list ) { pt->on_list = 1; list_add(&pt->list, &v->arch.hvm_vcpu.tm_list); migrate_timer(&pt->timer, v->processor); } spin_unlock(&v->arch.hvm_vcpu.tm_lock); } void pt_adjust_global_vcpu_target(struct vcpu *v) { struct PITState *vpit; struct pl_time *pl_time; int i; if ( v == NULL ) return; vpit = &v->domain->arch.vpit; spin_lock(&vpit->lock); pt_adjust_vcpu(&vpit->pt0, v); spin_unlock(&vpit->lock); pl_time = &v->domain->arch.hvm_domain.pl_time; spin_lock(&pl_time->vrtc.lock); pt_adjust_vcpu(&pl_time->vrtc.pt, v); spin_unlock(&pl_time->vrtc.lock); spin_lock(&pl_time->vhpet.lock); for ( i = 0; i < HPET_TIMER_NUM; i++ ) pt_adjust_vcpu(&pl_time->vhpet.pt[i], v); spin_unlock(&pl_time->vhpet.lock); } static void pt_resume(struct periodic_time *pt) { if ( pt->vcpu == NULL ) return; pt_lock(pt); if ( pt->pending_intr_nr && !pt->on_list ) { pt->on_list = 1; list_add(&pt->list, &pt->vcpu->arch.hvm_vcpu.tm_list); vcpu_kick(pt->vcpu); } pt_unlock(pt); } void pt_may_unmask_irq(struct domain *d, struct periodic_time *vlapic_pt) { int i; if ( d ) { pt_resume(&d->arch.vpit.pt0); pt_resume(&d->arch.hvm_domain.pl_time.vrtc.pt); for ( i = 0; i < HPET_TIMER_NUM; i++ ) pt_resume(&d->arch.hvm_domain.pl_time.vhpet.pt[i]); } if ( vlapic_pt ) pt_resume(vlapic_pt); } xen-4.4.0/xen/arch/x86/hvm/quirks.c0000664000175000017500000000572712307313555015066 0ustar smbsmb/****************************************************************************** * x86/hvm/quirks.c * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include s8 __read_mostly hvm_port80_allowed = -1; boolean_param("hvm_port80", hvm_port80_allowed); static int __init dmi_hvm_deny_port80(/*const*/ struct dmi_system_id *id) { printk(XENLOG_WARNING "%s: port 0x80 access %s allowed for HVM guests\n", id->ident, hvm_port80_allowed > 0 ? "forcibly" : "not"); if ( hvm_port80_allowed < 0 ) hvm_port80_allowed = 0; return 0; } static int __init check_port80(void) { /* * Quirk table for systems that misbehave (lock up, etc.) 
if port * 0x80 is used: */ static struct dmi_system_id __initdata hvm_no_port80_dmi_table[] = { { .callback = dmi_hvm_deny_port80, .ident = "Compaq Presario V6000", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), DMI_MATCH(DMI_BOARD_NAME, "30B7") } }, { .callback = dmi_hvm_deny_port80, .ident = "HP Pavilion dv9000z", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), DMI_MATCH(DMI_BOARD_NAME, "30B9") } }, { .callback = dmi_hvm_deny_port80, .ident = "HP Pavilion dv6000", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), DMI_MATCH(DMI_BOARD_NAME, "30B8") } }, { .callback = dmi_hvm_deny_port80, .ident = "HP Pavilion tx1000", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), DMI_MATCH(DMI_BOARD_NAME, "30BF") } }, { .callback = dmi_hvm_deny_port80, .ident = "Presario F700", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), DMI_MATCH(DMI_BOARD_NAME, "30D3") } }, { } }; dmi_check_system(hvm_no_port80_dmi_table); if ( !hvm_port80_allowed ) __set_bit(0x80, hvm_io_bitmap); return 0; } __initcall(check_port80); xen-4.4.0/xen/arch/x86/hvm/pmtimer.c0000664000175000017500000002522612307313555015221 0ustar smbsmb/* * hvm/pmtimer.c: emulation of the ACPI PM timer * * Copyright (c) 2007, XenSource inc. * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include /* for hvm_acpi_power_button prototype */ #include /* Slightly more readable port I/O addresses for the registers we intercept */ #define PM1a_STS_ADDR_V0 (ACPI_PM1A_EVT_BLK_ADDRESS_V0) #define PM1a_EN_ADDR_V0 (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 2) #define TMR_VAL_ADDR_V0 (ACPI_PM_TMR_BLK_ADDRESS_V0) #define PM1a_STS_ADDR_V1 (ACPI_PM1A_EVT_BLK_ADDRESS_V1) #define PM1a_EN_ADDR_V1 (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 2) #define TMR_VAL_ADDR_V1 (ACPI_PM_TMR_BLK_ADDRESS_V1) /* The interesting bits of the PM1a_STS register */ #define TMR_STS (1 << 0) #define GBL_STS (1 << 5) #define PWRBTN_STS (1 << 8) #define SLPBTN_STS (1 << 9) /* The same in PM1a_EN */ #define TMR_EN (1 << 0) #define GBL_EN (1 << 5) #define PWRBTN_EN (1 << 8) #define SLPBTN_EN (1 << 9) /* Mask of bits in PM1a_STS that can generate an SCI. 
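 * A status bit only results in an SCI while its matching enable bit is also
 * set: pmt_update_sci() below asserts the SCI (IRQ 9) exactly when
 * (pm1a_en & pm1a_sts & SCI_MASK) is non-zero. A virtual power-button
 * press, for instance, latches PWRBTN_STS immediately but is only delivered
 * to the guest once it has set PWRBTN_EN.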
*/ #define SCI_MASK (TMR_STS|PWRBTN_STS|SLPBTN_STS|GBL_STS) /* SCI IRQ number (must match SCI_INT number in ACPI FADT in hvmloader) */ #define SCI_IRQ 9 /* We provide a 32-bit counter (must match the TMR_VAL_EXT bit in the FADT) */ #define TMR_VAL_MASK (0xffffffff) #define TMR_VAL_MSB (0x80000000) /* Dispatch SCIs based on the PM1a_STS and PM1a_EN registers */ static void pmt_update_sci(PMTState *s) { ASSERT(spin_is_locked(&s->lock)); if ( s->pm.pm1a_en & s->pm.pm1a_sts & SCI_MASK ) hvm_isa_irq_assert(s->vcpu->domain, SCI_IRQ); else hvm_isa_irq_deassert(s->vcpu->domain, SCI_IRQ); } void hvm_acpi_power_button(struct domain *d) { PMTState *s = &d->arch.hvm_domain.pl_time.vpmt; spin_lock(&s->lock); s->pm.pm1a_sts |= PWRBTN_STS; pmt_update_sci(s); spin_unlock(&s->lock); } void hvm_acpi_sleep_button(struct domain *d) { PMTState *s = &d->arch.hvm_domain.pl_time.vpmt; spin_lock(&s->lock); s->pm.pm1a_sts |= SLPBTN_STS; pmt_update_sci(s); spin_unlock(&s->lock); } /* Set the correct value in the timer, accounting for time elapsed * since the last time we did that. */ static void pmt_update_time(PMTState *s) { uint64_t curr_gtime, tmp; uint32_t tmr_val = s->pm.tmr_val, msb = tmr_val & TMR_VAL_MSB; ASSERT(spin_is_locked(&s->lock)); /* Update the timer */ curr_gtime = hvm_get_guest_time(s->vcpu); tmp = ((curr_gtime - s->last_gtime) * s->scale) + s->not_accounted; s->not_accounted = (uint32_t)tmp; tmr_val += tmp >> 32; tmr_val &= TMR_VAL_MASK; s->last_gtime = curr_gtime; /* Update timer value atomically wrt lock-free reads in handle_pmt_io(). */ *(volatile uint32_t *)&s->pm.tmr_val = tmr_val; /* If the counter's MSB has changed, set the status bit */ if ( (tmr_val & TMR_VAL_MSB) != msb ) { s->pm.pm1a_sts |= TMR_STS; pmt_update_sci(s); } } /* This function should be called soon after each time the MSB of the * pmtimer register rolls over, to make sure we update the status * registers and SCI at least once per rollover */ static void pmt_timer_callback(void *opaque) { PMTState *s = opaque; uint32_t pmt_cycles_until_flip; uint64_t time_until_flip; spin_lock(&s->lock); /* Recalculate the timer and make sure we get an SCI if we need one */ pmt_update_time(s); /* How close are we to the next MSB flip? */ pmt_cycles_until_flip = TMR_VAL_MSB - (s->pm.tmr_val & (TMR_VAL_MSB - 1)); /* Overall time between MSB flips */ time_until_flip = (1000000000ULL << 23) / FREQUENCE_PMTIMER; /* Reduced appropriately */ time_until_flip = (time_until_flip * pmt_cycles_until_flip) >> 23; /* Wake up again near the next bit-flip */ set_timer(&s->timer, NOW() + time_until_flip + MILLISECS(1)); spin_unlock(&s->lock); } /* Handle port I/O to the PM1a_STS and PM1a_EN registers */ static int handle_evt_io( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct vcpu *v = current; PMTState *s = &v->domain->arch.hvm_domain.pl_time.vpmt; uint32_t addr, data, byte; int i; addr = port - ((v->domain->arch.hvm_domain.params[ HVM_PARAM_ACPI_IOPORTS_LOCATION] == 0) ? 
PM1a_STS_ADDR_V0 : PM1a_STS_ADDR_V1); spin_lock(&s->lock); if ( dir == IOREQ_WRITE ) { /* Handle this I/O one byte at a time */ for ( i = bytes, data = *val; i > 0; i--, addr++, data >>= 8 ) { byte = data & 0xff; switch ( addr ) { /* PM1a_STS register bits are write-to-clear */ case 0 /* PM1a_STS_ADDR */: s->pm.pm1a_sts &= ~byte; break; case 1 /* PM1a_STS_ADDR + 1 */: s->pm.pm1a_sts &= ~(byte << 8); break; case 2 /* PM1a_EN_ADDR */: s->pm.pm1a_en = (s->pm.pm1a_en & 0xff00) | byte; break; case 3 /* PM1a_EN_ADDR + 1 */: s->pm.pm1a_en = (s->pm.pm1a_en & 0xff) | (byte << 8); break; default: gdprintk(XENLOG_WARNING, "Bad ACPI PM register write: %x bytes (%x) at %x\n", bytes, *val, port); } } /* Fix up the SCI state to match the new register state */ pmt_update_sci(s); } else /* p->dir == IOREQ_READ */ { data = s->pm.pm1a_sts | (((uint32_t) s->pm.pm1a_en) << 16); data >>= 8 * addr; if ( bytes == 1 ) data &= 0xff; else if ( bytes == 2 ) data &= 0xffff; *val = data; } spin_unlock(&s->lock); return X86EMUL_OKAY; } /* Handle port I/O to the TMR_VAL register */ static int handle_pmt_io( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct vcpu *v = current; PMTState *s = &v->domain->arch.hvm_domain.pl_time.vpmt; if ( bytes != 4 ) { gdprintk(XENLOG_WARNING, "HVM_PMT bad access\n"); return X86EMUL_OKAY; } if ( dir == IOREQ_READ ) { if ( spin_trylock(&s->lock) ) { /* We hold the lock: update timer value and return it. */ pmt_update_time(s); *val = s->pm.tmr_val; spin_unlock(&s->lock); } else { /* * Someone else is updating the timer: rather than do the work * again ourselves, wait for them to finish and then steal their * updated value with a lock-free atomic read. */ spin_barrier(&s->lock); *val = *(volatile uint32_t *)&s->pm.tmr_val; } return X86EMUL_OKAY; } return X86EMUL_UNHANDLEABLE; } static int pmtimer_save(struct domain *d, hvm_domain_context_t *h) { PMTState *s = &d->arch.hvm_domain.pl_time.vpmt; uint32_t x, msb = s->pm.tmr_val & TMR_VAL_MSB; int rc; spin_lock(&s->lock); /* Update the counter to the guest's current time. We always save * with the domain paused, so the saved time should be after the * last_gtime, but just in case, make sure we only go forwards */ x = ((s->vcpu->arch.hvm_vcpu.guest_time - s->last_gtime) * s->scale) >> 32; if ( x < 1UL<<31 ) s->pm.tmr_val += x; if ( (s->pm.tmr_val & TMR_VAL_MSB) != msb ) s->pm.pm1a_sts |= TMR_STS; /* No point in setting the SCI here because we'll already have saved the * IRQ and *PIC state; we'll fix it up when we restore the domain */ rc = hvm_save_entry(PMTIMER, 0, h, &s->pm); spin_unlock(&s->lock); return rc; } static int pmtimer_load(struct domain *d, hvm_domain_context_t *h) { PMTState *s = &d->arch.hvm_domain.pl_time.vpmt; spin_lock(&s->lock); /* Reload the registers */ if ( hvm_load_entry(PMTIMER, h, &s->pm) ) { spin_unlock(&s->lock); return -EINVAL; } /* Calculate future counter values from now. */ s->last_gtime = hvm_get_guest_time(s->vcpu); s->not_accounted = 0; /* Set the SCI state from the registers */ pmt_update_sci(s); spin_unlock(&s->lock); return 0; } HVM_REGISTER_SAVE_RESTORE(PMTIMER, pmtimer_save, pmtimer_load, 1, HVMSR_PER_DOM); int pmtimer_change_ioport(struct domain *d, unsigned int version) { unsigned int old_version; /* Check that version is changing. */ old_version = d->arch.hvm_domain.params[HVM_PARAM_ACPI_IOPORTS_LOCATION]; if ( version == old_version ) return 0; /* Only allow changes between versions 0 and 1. 
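 * (The XOR test below accepts exactly the transitions 0 -> 1 and 1 -> 0,
 * since those are the only version pairs whose XOR equals 1.)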
*/ if ( (version ^ old_version) != 1 ) return -EINVAL; if ( version == 1 ) { /* Moving from version 0 to version 1. */ relocate_portio_handler(d, TMR_VAL_ADDR_V0, TMR_VAL_ADDR_V1, 4); relocate_portio_handler(d, PM1a_STS_ADDR_V0, PM1a_STS_ADDR_V1, 4); } else { /* Moving from version 1 to version 0. */ relocate_portio_handler(d, TMR_VAL_ADDR_V1, TMR_VAL_ADDR_V0, 4); relocate_portio_handler(d, PM1a_STS_ADDR_V1, PM1a_STS_ADDR_V0, 4); } return 0; } void pmtimer_init(struct vcpu *v) { PMTState *s = &v->domain->arch.hvm_domain.pl_time.vpmt; spin_lock_init(&s->lock); s->scale = ((uint64_t)FREQUENCE_PMTIMER << 32) / SYSTEM_TIME_HZ; s->not_accounted = 0; s->vcpu = v; /* Intercept port I/O (need two handlers because PM1a_CNT is between * PM1a_EN and TMR_VAL and is handled by qemu) */ register_portio_handler(v->domain, TMR_VAL_ADDR_V0, 4, handle_pmt_io); register_portio_handler(v->domain, PM1a_STS_ADDR_V0, 4, handle_evt_io); /* Set up callback to fire SCIs when the MSB of TMR_VAL changes */ init_timer(&s->timer, pmt_timer_callback, s, v->processor); pmt_timer_callback(s); } void pmtimer_deinit(struct domain *d) { PMTState *s = &d->arch.hvm_domain.pl_time.vpmt; kill_timer(&s->timer); } void pmtimer_reset(struct domain *d) { /* Reset the counter. */ d->arch.hvm_domain.pl_time.vpmt.pm.tmr_val = 0; } xen-4.4.0/xen/arch/x86/hvm/hpet.c0000664000175000017500000004617212307313555014507 0ustar smbsmb/* * hpet.c: HPET emulation for HVM guests. * Copyright (c) 2006, Intel Corporation. * Copyright (c) 2006, Keir Fraser * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #define domain_vhpet(x) (&(x)->arch.hvm_domain.pl_time.vhpet) #define vcpu_vhpet(x) (domain_vhpet((x)->domain)) #define vhpet_domain(x) (container_of((x), struct domain, \ arch.hvm_domain.pl_time.vhpet)) #define vhpet_vcpu(x) (pt_global_vcpu_target(vhpet_domain(x))) #define HPET_BASE_ADDRESS 0xfed00000ULL #define HPET_MMAP_SIZE 1024 #define S_TO_NS 1000000000ULL /* 1s = 10^9 ns */ #define S_TO_FS 1000000000000000ULL /* 1s = 10^15 fs */ /* Frequency_of_Xen_systeme_time / frequency_of_HPET = 16 */ #define STIME_PER_HPET_TICK 16 #define guest_time_hpet(hpet) \ (hvm_get_guest_time(vhpet_vcpu(hpet)) / STIME_PER_HPET_TICK) #define HPET_TN_INT_ROUTE_CAP_SHIFT 32 #define HPET_TN_CFG_BITS_READONLY_OR_RESERVED (HPET_TN_RESERVED | \ HPET_TN_PERIODIC_CAP | HPET_TN_64BIT_CAP | HPET_TN_FSB_CAP) /* can be routed to IOAPIC.redirect_table[23..20] */ #define HPET_TN_INT_ROUTE_CAP (0x00f00000ULL \ << HPET_TN_INT_ROUTE_CAP_SHIFT) #define HPET_TN_INT_ROUTE_CAP_MASK (0xffffffffULL \ << HPET_TN_INT_ROUTE_CAP_SHIFT) #define HPET_TN(reg, addr) (((addr) - HPET_Tn_##reg(0)) / \ (HPET_Tn_##reg(1) - HPET_Tn_##reg(0))) #define hpet_tick_to_ns(h, tick) \ ((s_time_t)((((tick) > (h)->hpet_to_ns_limit) ? 
\ ~0ULL : (tick) * (h)->hpet_to_ns_scale) >> 10)) #define timer_config(h, n) (h->hpet.timers[n].config) #define timer_enabled(h, n) (timer_config(h, n) & HPET_TN_ENABLE) #define timer_is_periodic(h, n) (timer_config(h, n) & HPET_TN_PERIODIC) #define timer_is_32bit(h, n) (timer_config(h, n) & HPET_TN_32BIT) #define hpet_enabled(h) (h->hpet.config & HPET_CFG_ENABLE) #define timer_level(h, n) (timer_config(h, n) & HPET_TN_LEVEL) #define timer_int_route(h, n) \ ((timer_config(h, n) & HPET_TN_ROUTE) >> HPET_TN_ROUTE_SHIFT) #define timer_int_route_cap(h, n) \ ((timer_config(h, n) & HPET_TN_INT_ROUTE_CAP_MASK) \ >> HPET_TN_INT_ROUTE_CAP_SHIFT) static inline uint64_t hpet_read_maincounter(HPETState *h) { ASSERT(spin_is_locked(&h->lock)); if ( hpet_enabled(h) ) return guest_time_hpet(h) + h->mc_offset; else return h->hpet.mc64; } static uint64_t hpet_get_comparator(HPETState *h, unsigned int tn) { uint64_t comparator; uint64_t elapsed; comparator = h->hpet.comparator64[tn]; if ( timer_is_periodic(h, tn) ) { /* update comparator by number of periods elapsed since last update */ uint64_t period = h->hpet.period[tn]; if (period) { elapsed = hpet_read_maincounter(h) + period - 1 - comparator; comparator += (elapsed / period) * period; h->hpet.comparator64[tn] = comparator; } } /* truncate if timer is in 32 bit mode */ if ( timer_is_32bit(h, tn) ) comparator = (uint32_t)comparator; h->hpet.timers[tn].cmp = comparator; return comparator; } static inline uint64_t hpet_read64(HPETState *h, unsigned long addr) { addr &= ~7; switch ( addr ) { case HPET_ID: return h->hpet.capability; case HPET_CFG: return h->hpet.config; case HPET_STATUS: return h->hpet.isr; case HPET_COUNTER: return hpet_read_maincounter(h); case HPET_Tn_CFG(0): case HPET_Tn_CFG(1): case HPET_Tn_CFG(2): return h->hpet.timers[HPET_TN(CFG, addr)].config; case HPET_Tn_CMP(0): case HPET_Tn_CMP(1): case HPET_Tn_CMP(2): return hpet_get_comparator(h, HPET_TN(CMP, addr)); case HPET_Tn_ROUTE(0): case HPET_Tn_ROUTE(1): case HPET_Tn_ROUTE(2): return h->hpet.timers[HPET_TN(ROUTE, addr)].fsb; } return 0; } static inline int hpet_check_access_length( unsigned long addr, unsigned long len) { if ( (addr & (len - 1)) || (len > 8) ) { /* * According to ICH9 specification, unaligned accesses may result * in unexpected behaviour or master abort, but should not crash/hang. * Hence we read all-ones, drop writes, and log a warning. */ gdprintk(XENLOG_WARNING, "HPET: access across register boundary: " "%lx %lx\n", addr, len); return -EINVAL; } return 0; } static int hpet_read( struct vcpu *v, unsigned long addr, unsigned long length, unsigned long *pval) { HPETState *h = vcpu_vhpet(v); unsigned long result; uint64_t val; addr &= HPET_MMAP_SIZE-1; if ( hpet_check_access_length(addr, length) != 0 ) { result = ~0ul; goto out; } spin_lock(&h->lock); val = hpet_read64(h, addr); result = val; if ( length != 8 ) result = (val >> ((addr & 7) * 8)) & ((1ULL << (length * 8)) - 1); spin_unlock(&h->lock); out: *pval = result; return X86EMUL_OKAY; } static void hpet_stop_timer(HPETState *h, unsigned int tn) { ASSERT(tn < HPET_TIMER_NUM); ASSERT(spin_is_locked(&h->lock)); destroy_periodic_time(&h->pt[tn]); /* read the comparator to get it updated so a read while stopped will * return the expected value. 
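 * For a periodic timer this rolls the comparator forward to the first
 * expiry at or after the current main-counter value: e.g. with a period of
 * 100 ticks, a comparator last left at 1000 and a main counter now reading
 * 1234, hpet_get_comparator() leaves the comparator at 1300.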
*/ hpet_get_comparator(h, tn); } /* the number of HPET tick that stands for * 1/(2^10) second, namely, 0.9765625 milliseconds */ #define HPET_TINY_TIME_SPAN ((h->stime_freq >> 10) / STIME_PER_HPET_TICK) static void hpet_set_timer(HPETState *h, unsigned int tn) { uint64_t tn_cmp, cur_tick, diff; unsigned int irq; unsigned int oneshot; ASSERT(tn < HPET_TIMER_NUM); ASSERT(spin_is_locked(&h->lock)); if ( (tn == 0) && (h->hpet.config & HPET_CFG_LEGACY) ) { /* HPET specification requires PIT shouldn't generate * interrupts if LegacyReplacementRoute is set for timer0 */ pit_stop_channel0_irq(&vhpet_domain(h)->arch.vpit); } if ( !timer_enabled(h, tn) ) return; tn_cmp = hpet_get_comparator(h, tn); cur_tick = hpet_read_maincounter(h); if ( timer_is_32bit(h, tn) ) { tn_cmp = (uint32_t)tn_cmp; cur_tick = (uint32_t)cur_tick; } diff = tn_cmp - cur_tick; /* * Detect time values set in the past. This is hard to do for 32-bit * comparators as the timer does not have to be set that far in the future * for the counter difference to wrap a 32-bit signed integer. We fudge * by looking for a 'small' time value in the past. */ if ( (int64_t)diff < 0 ) diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN)) ? (uint32_t)diff : 0; if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) ) /* if LegacyReplacementRoute bit is set, HPET specification requires timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC, timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */ irq = (tn == 0) ? 0 : 8; else irq = timer_int_route(h, tn); /* * diff is the time from now when the timer should fire, for a periodic * timer we also need the period which may be different because time may * have elapsed between the time the comparator was written and the timer * being enabled (now). */ oneshot = !timer_is_periodic(h, tn); create_periodic_time(vhpet_vcpu(h), &h->pt[tn], hpet_tick_to_ns(h, diff), oneshot ? 0 : hpet_tick_to_ns(h, h->hpet.period[tn]), irq, NULL, NULL); } static inline uint64_t hpet_fixup_reg( uint64_t new, uint64_t old, uint64_t mask) { new &= mask; new |= old & ~mask; return new; } static int hpet_write( struct vcpu *v, unsigned long addr, unsigned long length, unsigned long val) { HPETState *h = vcpu_vhpet(v); uint64_t old_val, new_val; int tn, i; /* Acculumate a bit mask of timers whos state is changed by this write. */ unsigned long start_timers = 0; unsigned long stop_timers = 0; #define set_stop_timer(n) (__set_bit((n), &stop_timers)) #define set_start_timer(n) (__set_bit((n), &start_timers)) #define set_restart_timer(n) (set_stop_timer(n),set_start_timer(n)) addr &= HPET_MMAP_SIZE-1; if ( hpet_check_access_length(addr, length) != 0 ) goto out; spin_lock(&h->lock); old_val = hpet_read64(h, addr); new_val = val; if ( length != 8 ) new_val = hpet_fixup_reg( new_val << (addr & 7) * 8, old_val, ((1ULL << (length*8)) - 1) << ((addr & 7) * 8)); switch ( addr & ~7 ) { case HPET_CFG: h->hpet.config = hpet_fixup_reg(new_val, old_val, 0x3); if ( !(old_val & HPET_CFG_ENABLE) && (new_val & HPET_CFG_ENABLE) ) { /* Enable main counter and interrupt generation. */ h->mc_offset = h->hpet.mc64 - guest_time_hpet(h); for ( i = 0; i < HPET_TIMER_NUM; i++ ) { h->hpet.comparator64[i] = h->hpet.timers[i].config & HPET_TN_32BIT ? (uint32_t)h->hpet.timers[i].cmp : h->hpet.timers[i].cmp; if ( timer_enabled(h, i) ) set_start_timer(i); } } else if ( (old_val & HPET_CFG_ENABLE) && !(new_val & HPET_CFG_ENABLE) ) { /* Halt main counter and disable interrupt generation. 
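 * The running count is latched into hpet.mc64 here so that a later
 * re-enable can resume from the same value by recomputing mc_offset.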
*/ h->hpet.mc64 = h->mc_offset + guest_time_hpet(h); for ( i = 0; i < HPET_TIMER_NUM; i++ ) if ( timer_enabled(h, i) ) set_stop_timer(i); } break; case HPET_COUNTER: h->hpet.mc64 = new_val; if ( hpet_enabled(h) ) { gdprintk(XENLOG_WARNING, "HPET: writing main counter but it's not halted!\n"); for ( i = 0; i < HPET_TIMER_NUM; i++ ) if ( timer_enabled(h, i) ) set_restart_timer(i); } break; case HPET_Tn_CFG(0): case HPET_Tn_CFG(1): case HPET_Tn_CFG(2): tn = HPET_TN(CFG, addr); h->hpet.timers[tn].config = hpet_fixup_reg(new_val, old_val, 0x3f4e); if ( timer_level(h, tn) ) { gdprintk(XENLOG_ERR, "HPET: level triggered interrupt not supported now\n"); domain_crash(current->domain); break; } if ( new_val & HPET_TN_32BIT ) { h->hpet.timers[tn].cmp = (uint32_t)h->hpet.timers[tn].cmp; h->hpet.period[tn] = (uint32_t)h->hpet.period[tn]; } if ( hpet_enabled(h) ) { if ( new_val & HPET_TN_ENABLE ) { if ( (new_val ^ old_val) & HPET_TN_PERIODIC ) /* timer is enabled but switching mode to/from periodic/ * one-shot, stop and restart the vpt timer to get it in * the right mode. */ set_restart_timer(tn); else if ( (new_val & HPET_TN_32BIT) && !(old_val & HPET_TN_32BIT) ) /* switching from 64 bit to 32 bit mode could cause timer * next fire time, or period, to change. */ set_restart_timer(tn); else if ( !(old_val & HPET_TN_ENABLE) ) /* transition from timer disabled to timer enabled. */ set_start_timer(tn); } else if ( old_val & HPET_TN_ENABLE ) /* transition from timer enabled to timer disabled. */ set_stop_timer(tn); } break; case HPET_Tn_CMP(0): case HPET_Tn_CMP(1): case HPET_Tn_CMP(2): tn = HPET_TN(CMP, addr); if ( timer_is_32bit(h, tn) ) new_val = (uint32_t)new_val; h->hpet.timers[tn].cmp = new_val; if ( h->hpet.timers[tn].config & HPET_TN_SETVAL ) /* * When SETVAL is one, software is able to "directly set a periodic * timer's accumulator." That is, set the comparator without * adjusting the period. Much the same as just setting the * comparator on an enabled one-shot timer. * * This configuration bit clears when the comparator is written. */ h->hpet.timers[tn].config &= ~HPET_TN_SETVAL; else if ( timer_is_periodic(h, tn) ) { /* * Clamp period to reasonable min/max values: * - minimum is 100us, same as timers controlled by vpt.c * - maximum is to prevent overflow in time_after() calculations */ if ( hpet_tick_to_ns(h, new_val) < MICROSECS(100) ) new_val = (MICROSECS(100) << 10) / h->hpet_to_ns_scale; new_val &= (timer_is_32bit(h, tn) ? ~0u : ~0ull) >> 1; h->hpet.period[tn] = new_val; } h->hpet.comparator64[tn] = new_val; if ( hpet_enabled(h) && timer_enabled(h, tn) ) set_restart_timer(tn); break; case HPET_Tn_ROUTE(0): case HPET_Tn_ROUTE(1): case HPET_Tn_ROUTE(2): tn = HPET_TN(ROUTE, addr); h->hpet.timers[tn].fsb = new_val; break; default: /* Ignore writes to unsupported and reserved registers. */ break; } /* stop/start timers whos state was changed by this write. 
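 * The two bit masks were accumulated while the register update was applied
 * so that each affected timer is stopped and/or restarted exactly once, and
 * only after the new register contents are fully in place.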
*/ while (stop_timers) { i = find_first_set_bit(stop_timers); __clear_bit(i, &stop_timers); hpet_stop_timer(h, i); } while (start_timers) { i = find_first_set_bit(start_timers); __clear_bit(i, &start_timers); hpet_set_timer(h, i); } #undef set_stop_timer #undef set_start_timer #undef set_restart_timer spin_unlock(&h->lock); out: return X86EMUL_OKAY; } static int hpet_range(struct vcpu *v, unsigned long addr) { return (v->domain->arch.hvm_domain.params[HVM_PARAM_HPET_ENABLED] && (addr >= HPET_BASE_ADDRESS) && (addr < (HPET_BASE_ADDRESS + HPET_MMAP_SIZE))); } const struct hvm_mmio_handler hpet_mmio_handler = { .check_handler = hpet_range, .read_handler = hpet_read, .write_handler = hpet_write }; static int hpet_save(struct domain *d, hvm_domain_context_t *h) { HPETState *hp = domain_vhpet(d); int rc; spin_lock(&hp->lock); /* Write the proper value into the main counter */ hp->hpet.mc64 = hp->mc_offset + guest_time_hpet(hp); /* Save the HPET registers */ rc = _hvm_init_entry(h, HVM_SAVE_CODE(HPET), 0, HVM_SAVE_LENGTH(HPET)); if ( rc == 0 ) { struct hvm_hw_hpet *rec = (struct hvm_hw_hpet *)&h->data[h->cur]; h->cur += HVM_SAVE_LENGTH(HPET); memset(rec, 0, HVM_SAVE_LENGTH(HPET)); #define C(x) rec->x = hp->hpet.x C(capability); C(config); C(isr); C(mc64); C(timers[0].config); C(timers[0].fsb); C(timers[1].config); C(timers[1].fsb); C(timers[2].config); C(timers[2].fsb); C(period[0]); C(period[1]); C(period[2]); #undef C /* save the 64 bit comparator in the 64 bit timer[n].cmp field * regardless of whether or not the timer is in 32 bit mode. */ rec->timers[0].cmp = hp->hpet.comparator64[0]; rec->timers[1].cmp = hp->hpet.comparator64[1]; rec->timers[2].cmp = hp->hpet.comparator64[2]; } spin_unlock(&hp->lock); return rc; } static int hpet_load(struct domain *d, hvm_domain_context_t *h) { HPETState *hp = domain_vhpet(d); struct hvm_hw_hpet *rec; uint64_t cmp; int i; spin_lock(&hp->lock); /* Reload the HPET registers */ if ( _hvm_check_entry(h, HVM_SAVE_CODE(HPET), HVM_SAVE_LENGTH(HPET), 1) ) { spin_unlock(&hp->lock); return -EINVAL; } rec = (struct hvm_hw_hpet *)&h->data[h->cur]; h->cur += HVM_SAVE_LENGTH(HPET); #define C(x) hp->hpet.x = rec->x C(capability); C(config); C(isr); C(mc64); /* The following define will generate a compiler error if HPET_TIMER_NUM * changes. This indicates an incompatability with previous saved state. */ #define HPET_TIMER_NUM 3 for ( i = 0; i < HPET_TIMER_NUM; i++ ) { C(timers[i].config); C(timers[i].fsb); C(period[i]); /* restore the hidden 64 bit comparator and truncate the timer's * visible comparator field if in 32 bit mode. 
*/ cmp = rec->timers[i].cmp; hp->hpet.comparator64[i] = cmp; if ( timer_is_32bit(hp, i) ) cmp = (uint32_t)cmp; hp->hpet.timers[i].cmp = cmp; } #undef C /* Recalculate the offset between the main counter and guest time */ hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp); /* restart all timers */ if ( hpet_enabled(hp) ) for ( i = 0; i < HPET_TIMER_NUM; i++ ) if ( timer_enabled(hp, i) ) hpet_set_timer(hp, i); spin_unlock(&hp->lock); return 0; } HVM_REGISTER_SAVE_RESTORE(HPET, hpet_save, hpet_load, 1, HVMSR_PER_DOM); void hpet_init(struct vcpu *v) { HPETState *h = vcpu_vhpet(v); int i; memset(h, 0, sizeof(HPETState)); spin_lock_init(&h->lock); h->stime_freq = S_TO_NS; h->hpet_to_ns_scale = ((S_TO_NS * STIME_PER_HPET_TICK) << 10) / h->stime_freq; h->hpet_to_ns_limit = ~0ULL / h->hpet_to_ns_scale; h->hpet.capability = 0x80860001ULL | ((HPET_TIMER_NUM - 1) << HPET_ID_NUMBER_SHIFT) | HPET_ID_64BIT | HPET_ID_LEGSUP; /* This is the number of femptoseconds per HPET tick. */ /* Here we define HPET's frequency to be 1/16 of Xen system time */ h->hpet.capability |= ((S_TO_FS*STIME_PER_HPET_TICK/h->stime_freq) << 32); for ( i = 0; i < HPET_TIMER_NUM; i++ ) { h->hpet.timers[i].config = HPET_TN_INT_ROUTE_CAP | HPET_TN_64BIT_CAP | HPET_TN_PERIODIC_CAP; h->hpet.timers[i].cmp = ~0ULL; h->pt[i].source = PTSRC_isa; } } void hpet_deinit(struct domain *d) { int i; HPETState *h = domain_vhpet(d); spin_lock(&h->lock); if ( hpet_enabled(h) ) for ( i = 0; i < HPET_TIMER_NUM; i++ ) if ( timer_enabled(h, i) ) hpet_stop_timer(h, i); spin_unlock(&h->lock); } void hpet_reset(struct domain *d) { hpet_deinit(d); hpet_init(d->vcpu[0]); } xen-4.4.0/xen/arch/x86/hvm/svm/0000775000175000017500000000000012307313555014176 5ustar smbsmbxen-4.4.0/xen/arch/x86/hvm/svm/Makefile0000664000175000017500000000024012307313555015632 0ustar smbsmbobj-y += asid.o obj-y += emulate.o obj-bin-y += entry.o obj-y += intr.o obj-y += nestedsvm.o obj-y += svm.o obj-y += svmdebug.o obj-y += vmcb.o obj-y += vpmu.o xen-4.4.0/xen/arch/x86/hvm/svm/vmcb.c0000664000175000017500000002207012307313555015272 0ustar smbsmb/* * vmcb.c: VMCB management * Copyright (c) 2005-2007, Advanced Micro Devices, Inc. * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
* */ #include #include #include #include #include #include #include #include #include #include #include #include #include struct vmcb_struct *alloc_vmcb(void) { struct vmcb_struct *vmcb; vmcb = alloc_xenheap_page(); if ( vmcb == NULL ) { printk(XENLOG_WARNING "Warning: failed to allocate vmcb.\n"); return NULL; } clear_page(vmcb); return vmcb; } void free_vmcb(struct vmcb_struct *vmcb) { free_xenheap_page(vmcb); } struct host_save_area *alloc_host_save_area(void) { struct host_save_area *hsa; hsa = alloc_xenheap_page(); if ( hsa == NULL ) { printk(XENLOG_WARNING "Warning: failed to allocate hsa.\n"); return NULL; } clear_page(hsa); return hsa; } /* This function can directly access fields which are covered by clean bits. */ static int construct_vmcb(struct vcpu *v) { struct arch_svm_struct *arch_svm = &v->arch.hvm_svm; struct vmcb_struct *vmcb = arch_svm->vmcb; vmcb->_general1_intercepts = GENERAL1_INTERCEPT_INTR | GENERAL1_INTERCEPT_NMI | GENERAL1_INTERCEPT_SMI | GENERAL1_INTERCEPT_INIT | GENERAL1_INTERCEPT_CPUID | GENERAL1_INTERCEPT_INVD | GENERAL1_INTERCEPT_HLT | GENERAL1_INTERCEPT_INVLPG | GENERAL1_INTERCEPT_INVLPGA | GENERAL1_INTERCEPT_IOIO_PROT | GENERAL1_INTERCEPT_MSR_PROT | GENERAL1_INTERCEPT_SHUTDOWN_EVT| GENERAL1_INTERCEPT_TASK_SWITCH; vmcb->_general2_intercepts = GENERAL2_INTERCEPT_VMRUN | GENERAL2_INTERCEPT_VMMCALL | GENERAL2_INTERCEPT_VMLOAD | GENERAL2_INTERCEPT_VMSAVE | GENERAL2_INTERCEPT_STGI | GENERAL2_INTERCEPT_CLGI | GENERAL2_INTERCEPT_SKINIT | GENERAL2_INTERCEPT_MWAIT | GENERAL2_INTERCEPT_WBINVD | GENERAL2_INTERCEPT_MONITOR | GENERAL2_INTERCEPT_XSETBV; /* Intercept all debug-register writes. */ vmcb->_dr_intercepts = ~0u; /* Intercept all control-register accesses except for CR2 and CR8. */ vmcb->_cr_intercepts = ~(CR_INTERCEPT_CR2_READ | CR_INTERCEPT_CR2_WRITE | CR_INTERCEPT_CR8_READ | CR_INTERCEPT_CR8_WRITE); /* I/O and MSR permission bitmaps. */ arch_svm->msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0); if ( arch_svm->msrpm == NULL ) return -ENOMEM; memset(arch_svm->msrpm, 0xff, MSRPM_SIZE); svm_disable_intercept_for_msr(v, MSR_FS_BASE); svm_disable_intercept_for_msr(v, MSR_GS_BASE); svm_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE); svm_disable_intercept_for_msr(v, MSR_CSTAR); svm_disable_intercept_for_msr(v, MSR_LSTAR); svm_disable_intercept_for_msr(v, MSR_STAR); svm_disable_intercept_for_msr(v, MSR_SYSCALL_MASK); /* LWP_CBADDR MSR is saved and restored by FPU code. So SVM doesn't need to * intercept it. */ if ( cpu_has_lwp ) svm_disable_intercept_for_msr(v, MSR_AMD64_LWP_CBADDR); vmcb->_msrpm_base_pa = (u64)virt_to_maddr(arch_svm->msrpm); vmcb->_iopm_base_pa = (u64)virt_to_maddr(hvm_io_bitmap); /* Virtualise EFLAGS.IF and LAPIC TPR (CR8). */ vmcb->_vintr.fields.intr_masking = 1; /* Initialise event injection to no-op. */ vmcb->eventinj.bytes = 0; /* TSC. */ vmcb->_tsc_offset = 0; /* Don't need to intercept RDTSC if CPU supports TSC rate scaling */ if ( v->domain->arch.vtsc && !cpu_has_tsc_ratio ) { vmcb->_general1_intercepts |= GENERAL1_INTERCEPT_RDTSC; vmcb->_general2_intercepts |= GENERAL2_INTERCEPT_RDTSCP; } /* Guest EFER. */ v->arch.hvm_vcpu.guest_efer = 0; hvm_update_guest_efer(v); /* Guest segment limits. */ vmcb->cs.limit = ~0u; vmcb->es.limit = ~0u; vmcb->ss.limit = ~0u; vmcb->ds.limit = ~0u; vmcb->fs.limit = ~0u; vmcb->gs.limit = ~0u; /* Guest segment bases. */ vmcb->cs.base = 0; vmcb->es.base = 0; vmcb->ss.base = 0; vmcb->ds.base = 0; vmcb->fs.base = 0; vmcb->gs.base = 0; /* Guest segment AR bytes. 
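 * The VMCB stores these in AMD's compressed 12-bit form: bits 0-7 are the
 * descriptor access byte (type, S, DPL, P) and bits 8-11 carry AVL, L, D/B
 * and G. Hence 0xc93 is a present, DPL-0, 4K-granular 32-bit read/write
 * data segment and 0xc9b the matching execute/read code segment, both
 * marked accessed.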
*/ vmcb->es.attr.bytes = 0xc93; /* read/write, accessed */ vmcb->ss.attr.bytes = 0xc93; vmcb->ds.attr.bytes = 0xc93; vmcb->fs.attr.bytes = 0xc93; vmcb->gs.attr.bytes = 0xc93; vmcb->cs.attr.bytes = 0xc9b; /* exec/read, accessed */ /* Guest IDT. */ vmcb->idtr.base = 0; vmcb->idtr.limit = 0; /* Guest GDT. */ vmcb->gdtr.base = 0; vmcb->gdtr.limit = 0; /* Guest LDT. */ vmcb->ldtr.sel = 0; vmcb->ldtr.base = 0; vmcb->ldtr.limit = 0; vmcb->ldtr.attr.bytes = 0; /* Guest TSS. */ vmcb->tr.attr.bytes = 0x08b; /* 32-bit TSS (busy) */ vmcb->tr.base = 0; vmcb->tr.limit = 0xff; v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET; hvm_update_guest_cr(v, 0); v->arch.hvm_vcpu.guest_cr[4] = 0; hvm_update_guest_cr(v, 4); paging_update_paging_modes(v); vmcb->_exception_intercepts = HVM_TRAP_MASK | (1U << TRAP_no_device); if ( paging_mode_hap(v->domain) ) { vmcb->_np_enable = 1; /* enable nested paging */ vmcb->_g_pat = MSR_IA32_CR_PAT_RESET; /* guest PAT */ vmcb->_h_cr3 = pagetable_get_paddr( p2m_get_pagetable(p2m_get_hostp2m(v->domain))); /* No point in intercepting CR3 reads/writes. */ vmcb->_cr_intercepts &= ~(CR_INTERCEPT_CR3_READ|CR_INTERCEPT_CR3_WRITE); /* * No point in intercepting INVLPG if we don't have shadow pagetables * that need to be fixed up. */ vmcb->_general1_intercepts &= ~GENERAL1_INTERCEPT_INVLPG; /* PAT is under complete control of SVM when using nested paging. */ svm_disable_intercept_for_msr(v, MSR_IA32_CR_PAT); } else { vmcb->_exception_intercepts |= (1U << TRAP_page_fault); } if ( cpu_has_pause_filter ) { vmcb->_pause_filter_count = SVM_PAUSEFILTER_INIT; vmcb->_general1_intercepts |= GENERAL1_INTERCEPT_PAUSE; } vmcb->cleanbits.bytes = 0; return 0; } int svm_create_vmcb(struct vcpu *v) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct arch_svm_struct *arch_svm = &v->arch.hvm_svm; int rc; if ( (nv->nv_n1vmcx == NULL) && (nv->nv_n1vmcx = alloc_vmcb()) == NULL ) { printk("Failed to create a new VMCB\n"); return -ENOMEM; } arch_svm->vmcb = nv->nv_n1vmcx; rc = construct_vmcb(v); if ( rc != 0 ) { free_vmcb(nv->nv_n1vmcx); nv->nv_n1vmcx = NULL; arch_svm->vmcb = NULL; return rc; } arch_svm->vmcb_pa = nv->nv_n1vmcx_pa = virt_to_maddr(arch_svm->vmcb); return 0; } void svm_destroy_vmcb(struct vcpu *v) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct arch_svm_struct *arch_svm = &v->arch.hvm_svm; if ( nv->nv_n1vmcx != NULL ) free_vmcb(nv->nv_n1vmcx); if ( arch_svm->msrpm != NULL ) { free_xenheap_pages( arch_svm->msrpm, get_order_from_bytes(MSRPM_SIZE)); arch_svm->msrpm = NULL; } nv->nv_n1vmcx = NULL; nv->nv_n1vmcx_pa = VMCX_EADDR; arch_svm->vmcb = NULL; } static void vmcb_dump(unsigned char ch) { struct domain *d; struct vcpu *v; printk("*********** VMCB Areas **************\n"); rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) { if ( !is_hvm_domain(d) ) continue; printk("\n>>> Domain %d <<<\n", d->domain_id); for_each_vcpu ( d, v ) { printk("\tVCPU %d\n", v->vcpu_id); svm_vmcb_dump("key_handler", v->arch.hvm_svm.vmcb); } } rcu_read_unlock(&domlist_read_lock); printk("**************************************\n"); } static struct keyhandler vmcb_dump_keyhandler = { .diagnostic = 1, .u.fn = vmcb_dump, .desc = "dump AMD-V VMCBs" }; void __init setup_vmcb_dump(void) { register_keyhandler('v', &vmcb_dump_keyhandler); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/svm/intr.c0000664000175000017500000001652412307313555015326 0ustar smbsmb/* * intr.c: Interrupt handling for SVM. 
* Copyright (c) 2005, AMD Inc. * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for nestedhvm_vcpu_in_guestmode */ #include #include #include #include #include static void svm_inject_nmi(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb); eventinj_t event; event.bytes = 0; event.fields.v = 1; event.fields.type = X86_EVENTTYPE_NMI; event.fields.vector = 2; ASSERT(vmcb->eventinj.fields.v == 0); vmcb->eventinj = event; /* * SVM does not virtualise the NMI mask, so we emulate it by intercepting * the next IRET and blocking NMI injection until the intercept triggers. */ vmcb_set_general1_intercepts( vmcb, general1_intercepts | GENERAL1_INTERCEPT_IRET); } static void svm_inject_extint(struct vcpu *v, int vector) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; eventinj_t event; event.bytes = 0; event.fields.v = 1; event.fields.type = X86_EVENTTYPE_EXT_INTR; event.fields.vector = vector; ASSERT(vmcb->eventinj.fields.v == 0); vmcb->eventinj = event; } static void enable_intr_window(struct vcpu *v, struct hvm_intack intack) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; uint32_t general1_intercepts = vmcb_get_general1_intercepts(vmcb); vintr_t intr; ASSERT(intack.source != hvm_intsrc_none); if ( nestedhvm_enabled(v->domain) ) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); if ( nv->nv_vmentry_pending ) { struct vmcb_struct *gvmcb = nv->nv_vvmcx; /* check if l1 guest injects interrupt into l2 guest via vintr. * return here or l2 guest looses interrupts, otherwise. */ ASSERT(gvmcb != NULL); intr = vmcb_get_vintr(gvmcb); if ( intr.fields.irq ) return; } } HVMTRACE_3D(INTR_WINDOW, intack.vector, intack.source, vmcb->eventinj.fields.v?vmcb->eventinj.fields.vector:-1); /* * Create a dummy virtual interrupt to intercept as soon as the * guest can accept the real interrupt. * * TODO: Better NMI handling. We need a way to skip a MOV SS interrupt * shadow. This is hard to do without hardware support. Also we should * not be waiting for EFLAGS.IF to become 1. */ /* * NMI-blocking window is handled by IRET interception. We should not * inject a VINTR in this case as VINTR is unaware of NMI-blocking and * hence we can enter an endless loop (VINTR intercept fires, yet * hvm_interrupt_blocked() still indicates NMI-blocking is active, so * we inject a VINTR, ...). 
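 * Hence the check below: while the IRET intercept (i.e. the emulated
 * NMI-blocking window) is still armed, a pending NMI simply waits instead
 * of opening a VINTR window.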
*/ if ( (intack.source == hvm_intsrc_nmi) && (general1_intercepts & GENERAL1_INTERCEPT_IRET) ) return; intr = vmcb_get_vintr(vmcb); intr.fields.irq = 1; intr.fields.vector = 0; intr.fields.prio = intack.vector >> 4; intr.fields.ign_tpr = (intack.source != hvm_intsrc_lapic); vmcb_set_vintr(vmcb, intr); vmcb_set_general1_intercepts( vmcb, general1_intercepts | GENERAL1_INTERCEPT_VINTR); } void svm_intr_assist(void) { struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; struct hvm_intack intack; enum hvm_intblk intblk; /* Crank the handle on interrupt state. */ pt_update_irq(v); do { intack = hvm_vcpu_has_pending_irq(v); if ( likely(intack.source == hvm_intsrc_none) ) return; intblk = hvm_interrupt_blocked(v, intack); if ( intblk == hvm_intblk_svm_gif ) { ASSERT(nestedhvm_enabled(v->domain)); return; } /* Interrupts for the nested guest are already * in the vmcb. */ if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) ) { int rc; /* l2 guest was running when an interrupt for * the l1 guest occured. */ rc = nestedsvm_vcpu_interrupt(v, intack); switch (rc) { case NSVM_INTR_NOTINTERCEPTED: /* Inject interrupt into 2nd level guest directly. */ break; case NSVM_INTR_NOTHANDLED: case NSVM_INTR_FORCEVMEXIT: return; case NSVM_INTR_MASKED: /* Guest already enabled an interrupt window. */ return; default: panic("%s: nestedsvm_vcpu_interrupt can't handle value %#x", __func__, rc); } } /* * Pending IRQs must be delayed if: * 1. An event is already pending. This is despite the fact that SVM * provides a VINTR delivery method quite separate from the EVENTINJ * mechanism. The event delivery can arbitrarily delay the injection * of the vintr (for example, if the exception is handled via an * interrupt gate, hence zeroing RFLAGS.IF). In the meantime: * - the vTPR could be modified upwards, so we need to wait until * the exception is delivered before we can safely decide that an * interrupt is deliverable; and * - the guest might look at the APIC/PIC state, so we ought not to * have cleared the interrupt out of the IRR. * 2. The IRQ is masked. */ if ( unlikely(vmcb->eventinj.fields.v) || intblk ) { enable_intr_window(v, intack); return; } intack = hvm_vcpu_ack_pending_irq(v, intack); } while ( intack.source == hvm_intsrc_none ); if ( intack.source == hvm_intsrc_nmi ) { svm_inject_nmi(v); } else { HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0); svm_inject_extint(v, intack.vector); pt_intr_post(v, intack); } /* Is there another IRQ to queue up behind this one? */ intack = hvm_vcpu_has_pending_irq(v); if ( unlikely(intack.source != hvm_intsrc_none) ) enable_intr_window(v, intack); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/svm/vpmu.c0000664000175000017500000003126712307313555015342 0ustar smbsmb/* * vpmu.c: PMU virtualization for HVM domain. * * Copyright (c) 2010, Advanced Micro Devices, Inc. * Parts of this code are Copyright (c) 2007, Intel Corporation * * Author: Wei Wang * Tested by: Suravee Suthikulpanit * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
* * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #include #include #include #include #include #include #include #include #define F10H_NUM_COUNTERS 4 #define F15H_NUM_COUNTERS 6 #define MAX_NUM_COUNTERS F15H_NUM_COUNTERS #define MSR_F10H_EVNTSEL_GO_SHIFT 40 #define MSR_F10H_EVNTSEL_EN_SHIFT 22 #define MSR_F10H_COUNTER_LENGTH 48 #define is_guest_mode(msr) ((msr) & (1ULL << MSR_F10H_EVNTSEL_GO_SHIFT)) #define is_pmu_enabled(msr) ((msr) & (1ULL << MSR_F10H_EVNTSEL_EN_SHIFT)) #define set_guest_mode(msr) (msr |= (1ULL << MSR_F10H_EVNTSEL_GO_SHIFT)) #define is_overflowed(msr) (!((msr) & (1ULL << (MSR_F10H_COUNTER_LENGTH-1)))) static unsigned int __read_mostly num_counters; static const u32 __read_mostly *counters; static const u32 __read_mostly *ctrls; static bool_t __read_mostly k7_counters_mirrored; /* PMU Counter MSRs. */ static const u32 AMD_F10H_COUNTERS[] = { MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3 }; /* PMU Control MSRs. */ static const u32 AMD_F10H_CTRLS[] = { MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3 }; static const u32 AMD_F15H_COUNTERS[] = { MSR_AMD_FAM15H_PERFCTR0, MSR_AMD_FAM15H_PERFCTR1, MSR_AMD_FAM15H_PERFCTR2, MSR_AMD_FAM15H_PERFCTR3, MSR_AMD_FAM15H_PERFCTR4, MSR_AMD_FAM15H_PERFCTR5 }; static const u32 AMD_F15H_CTRLS[] = { MSR_AMD_FAM15H_EVNTSEL0, MSR_AMD_FAM15H_EVNTSEL1, MSR_AMD_FAM15H_EVNTSEL2, MSR_AMD_FAM15H_EVNTSEL3, MSR_AMD_FAM15H_EVNTSEL4, MSR_AMD_FAM15H_EVNTSEL5 }; /* storage for context switching */ struct amd_vpmu_context { u64 counters[MAX_NUM_COUNTERS]; u64 ctrls[MAX_NUM_COUNTERS]; bool_t msr_bitmap_set; }; static inline int get_pmu_reg_type(u32 addr) { if ( (addr >= MSR_K7_EVNTSEL0) && (addr <= MSR_K7_EVNTSEL3) ) return MSR_TYPE_CTRL; if ( (addr >= MSR_K7_PERFCTR0) && (addr <= MSR_K7_PERFCTR3) ) return MSR_TYPE_COUNTER; if ( (addr >= MSR_AMD_FAM15H_EVNTSEL0) && (addr <= MSR_AMD_FAM15H_PERFCTR5 ) ) { if (addr & 1) return MSR_TYPE_COUNTER; else return MSR_TYPE_CTRL; } /* unsupported registers */ return -1; } static inline u32 get_fam15h_addr(u32 addr) { switch ( addr ) { case MSR_K7_PERFCTR0: return MSR_AMD_FAM15H_PERFCTR0; case MSR_K7_PERFCTR1: return MSR_AMD_FAM15H_PERFCTR1; case MSR_K7_PERFCTR2: return MSR_AMD_FAM15H_PERFCTR2; case MSR_K7_PERFCTR3: return MSR_AMD_FAM15H_PERFCTR3; case MSR_K7_EVNTSEL0: return MSR_AMD_FAM15H_EVNTSEL0; case MSR_K7_EVNTSEL1: return MSR_AMD_FAM15H_EVNTSEL1; case MSR_K7_EVNTSEL2: return MSR_AMD_FAM15H_EVNTSEL2; case MSR_K7_EVNTSEL3: return MSR_AMD_FAM15H_EVNTSEL3; default: break; } return addr; } static void amd_vpmu_set_msr_bitmap(struct vcpu *v) { unsigned int i; struct vpmu_struct *vpmu = vcpu_vpmu(v); struct amd_vpmu_context *ctxt = vpmu->context; for ( i = 0; i < num_counters; i++ ) { svm_intercept_msr(v, counters[i], MSR_INTERCEPT_NONE); svm_intercept_msr(v, ctrls[i], MSR_INTERCEPT_WRITE); } ctxt->msr_bitmap_set = 1; } static void amd_vpmu_unset_msr_bitmap(struct vcpu *v) { unsigned int i; struct vpmu_struct *vpmu = vcpu_vpmu(v); struct amd_vpmu_context *ctxt = vpmu->context; for ( i = 0; i < num_counters; i++ ) { svm_intercept_msr(v, counters[i], MSR_INTERCEPT_RW); svm_intercept_msr(v, ctrls[i], MSR_INTERCEPT_RW); } ctxt->msr_bitmap_set = 0; } static int amd_vpmu_do_interrupt(struct cpu_user_regs *regs) { return 1; } static inline void context_load(struct vcpu *v) { unsigned int i; struct vpmu_struct *vpmu = vcpu_vpmu(v); 
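    /*
     * The loop below replays the saved PMU state: each performance
     * counter MSR and its matching event-select (control) MSR are
     * written back with wrmsrl() from the per-vcpu amd_vpmu_context.
     */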
struct amd_vpmu_context *ctxt = vpmu->context; for ( i = 0; i < num_counters; i++ ) { wrmsrl(counters[i], ctxt->counters[i]); wrmsrl(ctrls[i], ctxt->ctrls[i]); } } static void amd_vpmu_load(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); struct amd_vpmu_context *ctxt = vpmu->context; vpmu_reset(vpmu, VPMU_FROZEN); if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) { unsigned int i; for ( i = 0; i < num_counters; i++ ) wrmsrl(ctrls[i], ctxt->ctrls[i]); return; } context_load(v); } static inline void context_save(struct vcpu *v) { unsigned int i; struct vpmu_struct *vpmu = vcpu_vpmu(v); struct amd_vpmu_context *ctxt = vpmu->context; /* No need to save controls -- they are saved in amd_vpmu_do_wrmsr */ for ( i = 0; i < num_counters; i++ ) rdmsrl(counters[i], ctxt->counters[i]); } static int amd_vpmu_save(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); struct amd_vpmu_context *ctx = vpmu->context; unsigned int i; /* * Stop the counters. If we came here via vpmu_save_force (i.e. * when VPMU_CONTEXT_SAVE is set) counters are already stopped. */ if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_SAVE) ) { vpmu_set(vpmu, VPMU_FROZEN); for ( i = 0; i < num_counters; i++ ) wrmsrl(ctrls[i], 0); return 0; } if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) return 0; context_save(v); if ( !vpmu_is_set(vpmu, VPMU_RUNNING) && ctx->msr_bitmap_set ) amd_vpmu_unset_msr_bitmap(v); return 1; } static void context_update(unsigned int msr, u64 msr_content) { unsigned int i; struct vcpu *v = current; struct vpmu_struct *vpmu = vcpu_vpmu(v); struct amd_vpmu_context *ctxt = vpmu->context; if ( k7_counters_mirrored && ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)) ) { msr = get_fam15h_addr(msr); } for ( i = 0; i < num_counters; i++ ) { if ( msr == ctrls[i] ) { ctxt->ctrls[i] = msr_content; return; } else if (msr == counters[i] ) { ctxt->counters[i] = msr_content; return; } } } static int amd_vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content) { struct vcpu *v = current; struct vpmu_struct *vpmu = vcpu_vpmu(v); /* For all counters, enable guest only mode for HVM guest */ if ( (get_pmu_reg_type(msr) == MSR_TYPE_CTRL) && !(is_guest_mode(msr_content)) ) { set_guest_mode(msr_content); } /* check if the first counter is enabled */ if ( (get_pmu_reg_type(msr) == MSR_TYPE_CTRL) && is_pmu_enabled(msr_content) && !vpmu_is_set(vpmu, VPMU_RUNNING) ) { if ( !acquire_pmu_ownership(PMU_OWNER_HVM) ) return 1; vpmu_set(vpmu, VPMU_RUNNING); apic_write(APIC_LVTPC, PMU_APIC_VECTOR); vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR; if ( !((struct amd_vpmu_context *)vpmu->context)->msr_bitmap_set ) amd_vpmu_set_msr_bitmap(v); } /* stop saving & restore if guest stops first counter */ if ( (get_pmu_reg_type(msr) == MSR_TYPE_CTRL) && (is_pmu_enabled(msr_content) == 0) && vpmu_is_set(vpmu, VPMU_RUNNING) ) { apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED); vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR | APIC_LVT_MASKED; vpmu_reset(vpmu, VPMU_RUNNING); if ( ((struct amd_vpmu_context *)vpmu->context)->msr_bitmap_set ) amd_vpmu_unset_msr_bitmap(v); release_pmu_ownship(PMU_OWNER_HVM); } if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) || vpmu_is_set(vpmu, VPMU_FROZEN) ) { context_load(v); vpmu_set(vpmu, VPMU_CONTEXT_LOADED); vpmu_reset(vpmu, VPMU_FROZEN); } /* Update vpmu context immediately */ context_update(msr, msr_content); /* Write to hw counters */ wrmsrl(msr, msr_content); return 1; } static int amd_vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content) { struct vcpu *v = current; struct vpmu_struct *vpmu = vcpu_vpmu(v); if ( 
!vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) || vpmu_is_set(vpmu, VPMU_FROZEN) ) { context_load(v); vpmu_set(vpmu, VPMU_CONTEXT_LOADED); vpmu_reset(vpmu, VPMU_FROZEN); } rdmsrl(msr, *msr_content); return 1; } static int amd_vpmu_initialise(struct vcpu *v) { struct amd_vpmu_context *ctxt; struct vpmu_struct *vpmu = vcpu_vpmu(v); uint8_t family = current_cpu_data.x86; if ( vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) ) return 0; if ( counters == NULL ) { switch ( family ) { case 0x15: num_counters = F15H_NUM_COUNTERS; counters = AMD_F15H_COUNTERS; ctrls = AMD_F15H_CTRLS; k7_counters_mirrored = 1; break; case 0x10: case 0x12: case 0x14: case 0x16: default: num_counters = F10H_NUM_COUNTERS; counters = AMD_F10H_COUNTERS; ctrls = AMD_F10H_CTRLS; k7_counters_mirrored = 0; break; } } ctxt = xzalloc(struct amd_vpmu_context); if ( !ctxt ) { gdprintk(XENLOG_WARNING, "Insufficient memory for PMU, " " PMU feature is unavailable on domain %d vcpu %d.\n", v->vcpu_id, v->domain->domain_id); return -ENOMEM; } vpmu->context = ctxt; vpmu_set(vpmu, VPMU_CONTEXT_ALLOCATED); return 0; } static void amd_vpmu_destroy(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) ) return; if ( ((struct amd_vpmu_context *)vpmu->context)->msr_bitmap_set ) amd_vpmu_unset_msr_bitmap(v); xfree(vpmu->context); vpmu_reset(vpmu, VPMU_CONTEXT_ALLOCATED); if ( vpmu_is_set(vpmu, VPMU_RUNNING) ) { vpmu_reset(vpmu, VPMU_RUNNING); release_pmu_ownship(PMU_OWNER_HVM); } } /* VPMU part of the 'q' keyhandler */ static void amd_vpmu_dump(const struct vcpu *v) { const struct vpmu_struct *vpmu = vcpu_vpmu(v); const struct amd_vpmu_context *ctxt = vpmu->context; unsigned int i; printk(" VPMU state: 0x%x ", vpmu->flags); if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) ) { printk("\n"); return; } printk("("); if ( vpmu_is_set(vpmu, VPMU_PASSIVE_DOMAIN_ALLOCATED) ) printk("PASSIVE_DOMAIN_ALLOCATED, "); if ( vpmu_is_set(vpmu, VPMU_FROZEN) ) printk("FROZEN, "); if ( vpmu_is_set(vpmu, VPMU_CONTEXT_SAVE) ) printk("SAVE, "); if ( vpmu_is_set(vpmu, VPMU_RUNNING) ) printk("RUNNING, "); if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) ) printk("LOADED, "); printk("ALLOCATED)\n"); for ( i = 0; i < num_counters; i++ ) { uint64_t ctrl, cntr; rdmsrl(ctrls[i], ctrl); rdmsrl(counters[i], cntr); printk(" %#x: %#lx (%#lx in HW) %#x: %#lx (%#lx in HW)\n", ctrls[i], ctxt->ctrls[i], ctrl, counters[i], ctxt->counters[i], cntr); } } struct arch_vpmu_ops amd_vpmu_ops = { .do_wrmsr = amd_vpmu_do_wrmsr, .do_rdmsr = amd_vpmu_do_rdmsr, .do_interrupt = amd_vpmu_do_interrupt, .arch_vpmu_destroy = amd_vpmu_destroy, .arch_vpmu_save = amd_vpmu_save, .arch_vpmu_load = amd_vpmu_load, .arch_vpmu_dump = amd_vpmu_dump }; int svm_vpmu_initialise(struct vcpu *v, unsigned int vpmu_flags) { struct vpmu_struct *vpmu = vcpu_vpmu(v); uint8_t family = current_cpu_data.x86; int ret = 0; /* vpmu enabled? */ if ( !vpmu_flags ) return 0; switch ( family ) { case 0x10: case 0x12: case 0x14: case 0x15: case 0x16: ret = amd_vpmu_initialise(v); if ( !ret ) vpmu->arch_vpmu_ops = &amd_vpmu_ops; return ret; } printk("VPMU: Initialization failed. " "AMD processor family %d has not " "been supported\n", family); return -EINVAL; } xen-4.4.0/xen/arch/x86/hvm/svm/entry.S0000664000175000017500000000726212307313555015472 0ustar smbsmb/* * entry.S: SVM architecture-specific entry/exit handling. * Copyright (c) 2005-2007, Advanced Micro Devices, Inc. * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2008, Citrix Systems, Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #define VMRUN .byte 0x0F,0x01,0xD8 #define STGI .byte 0x0F,0x01,0xDC #define CLGI .byte 0x0F,0x01,0xDD ENTRY(svm_asm_do_resume) GET_CURRENT(%rbx) .Lsvm_do_resume: call svm_intr_assist mov %rsp,%rdi call nsvm_vcpu_switch ASSERT_NOT_IN_ATOMIC mov VCPU_processor(%rbx),%eax lea irq_stat+IRQSTAT_softirq_pending(%rip),%rdx xor %ecx,%ecx shl $IRQSTAT_shift,%eax CLGI cmp %ecx,(%rdx,%rax,1) jne .Lsvm_process_softirqs cmp %cl,VCPU_nsvm_hap_enabled(%rbx) UNLIKELY_START(ne, nsvm_hap) cmp %rcx,VCPU_nhvm_p2m(%rbx) sete %al test VCPU_nhvm_guestmode(%rbx),%al UNLIKELY_DONE(z, nsvm_hap) /* * Someone shot down our nested p2m table; go round again * and nsvm_vcpu_switch() will fix it for us. */ STGI jmp .Lsvm_do_resume __UNLIKELY_END(nsvm_hap) call svm_asid_handle_vmrun cmpb $0,tb_init_done(%rip) UNLIKELY_START(nz, svm_trace) call svm_trace_vmentry UNLIKELY_END(svm_trace) mov VCPU_svm_vmcb(%rbx),%rcx mov UREGS_rax(%rsp),%rax mov %rax,VMCB_rax(%rcx) mov UREGS_rip(%rsp),%rax mov %rax,VMCB_rip(%rcx) mov UREGS_rsp(%rsp),%rax mov %rax,VMCB_rsp(%rcx) mov UREGS_eflags(%rsp),%rax or $X86_EFLAGS_MBS,%rax mov %rax,VMCB_rflags(%rcx) pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp mov VCPU_svm_vmcb_pa(%rbx),%rax pop %rbx pop %r11 pop %r10 pop %r9 pop %r8 add $8,%rsp /* Skip %rax: restored by VMRUN. */ pop %rcx pop %rdx pop %rsi pop %rdi VMRUN GET_CURRENT(%rax) push %rdi push %rsi push %rdx push %rcx mov VCPU_svm_vmcb(%rax),%rcx push %rax push %r8 push %r9 push %r10 push %r11 push %rbx mov %rax,%rbx push %rbp push %r12 push %r13 push %r14 push %r15 movb $0,VCPU_svm_vmcb_in_sync(%rbx) mov VMCB_rax(%rcx),%rax mov %rax,UREGS_rax(%rsp) mov VMCB_rip(%rcx),%rax mov %rax,UREGS_rip(%rsp) mov VMCB_rsp(%rcx),%rax mov %rax,UREGS_rsp(%rsp) mov VMCB_rflags(%rcx),%rax mov %rax,UREGS_eflags(%rsp) STGI GLOBAL(svm_stgi_label) mov %rsp,%rdi call svm_vmexit_handler jmp .Lsvm_do_resume .Lsvm_process_softirqs: STGI call do_softirq jmp .Lsvm_do_resume xen-4.4.0/xen/arch/x86/hvm/svm/emulate.c0000664000175000017500000001520212307313555015776 0ustar smbsmb/* * emulate.c: handling SVM emulate instructions help. * Copyright (c) 2005 AMD Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
*/ #include #include #include #include #include #include #include #include #include #include #define MAX_INST_LEN 15 static unsigned int is_prefix(u8 opc) { switch ( opc ) { case 0x66: case 0x67: case 0x2E: case 0x3E: case 0x26: case 0x64: case 0x65: case 0x36: case 0xF0: case 0xF3: case 0xF2: case 0x40 ... 0x4f: return 1; } return 0; } static unsigned long svm_rip2pointer(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned long p = vmcb->cs.base + guest_cpu_user_regs()->eip; if ( !(vmcb->cs.attr.fields.l && hvm_long_mode_enabled(v)) ) return (u32)p; /* mask to 32 bits */ return p; } static unsigned long svm_nextrip_insn_length(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; if ( !cpu_has_svm_nrips || (vmcb->nextrip <= vmcb->rip) ) return 0; #ifndef NDEBUG switch ( vmcb->exitcode ) { case VMEXIT_CR0_READ... VMEXIT_DR15_WRITE: /* faults due to instruction intercepts */ /* (exitcodes 84-95) are reserved */ case VMEXIT_IDTR_READ ... VMEXIT_TR_WRITE: case VMEXIT_RDTSC ... VMEXIT_MSR: case VMEXIT_VMRUN ... VMEXIT_XSETBV: /* ...and the rest of the #VMEXITs */ case VMEXIT_CR0_SEL_WRITE: case VMEXIT_EXCEPTION_BP: break; default: BUG(); } #endif return vmcb->nextrip - vmcb->rip; } /* First byte: Length. Following bytes: Opcode bytes. */ #define MAKE_INSTR(nm, ...) static const u8 OPCODE_##nm[] = { __VA_ARGS__ } MAKE_INSTR(INVD, 2, 0x0f, 0x08); MAKE_INSTR(WBINVD, 2, 0x0f, 0x09); MAKE_INSTR(CPUID, 2, 0x0f, 0xa2); MAKE_INSTR(RDMSR, 2, 0x0f, 0x32); MAKE_INSTR(WRMSR, 2, 0x0f, 0x30); MAKE_INSTR(VMCALL, 3, 0x0f, 0x01, 0xd9); MAKE_INSTR(HLT, 1, 0xf4); MAKE_INSTR(INT3, 1, 0xcc); MAKE_INSTR(RDTSC, 2, 0x0f, 0x31); MAKE_INSTR(PAUSE, 1, 0x90); MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1); MAKE_INSTR(VMRUN, 3, 0x0f, 0x01, 0xd8); MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda); MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb); MAKE_INSTR(STGI, 3, 0x0f, 0x01, 0xdc); MAKE_INSTR(CLGI, 3, 0x0f, 0x01, 0xdd); MAKE_INSTR(INVLPGA,3, 0x0f, 0x01, 0xdf); static const u8 *const opc_bytes[INSTR_MAX_COUNT] = { [INSTR_INVD] = OPCODE_INVD, [INSTR_WBINVD] = OPCODE_WBINVD, [INSTR_CPUID] = OPCODE_CPUID, [INSTR_RDMSR] = OPCODE_RDMSR, [INSTR_WRMSR] = OPCODE_WRMSR, [INSTR_VMCALL] = OPCODE_VMCALL, [INSTR_HLT] = OPCODE_HLT, [INSTR_INT3] = OPCODE_INT3, [INSTR_RDTSC] = OPCODE_RDTSC, [INSTR_PAUSE] = OPCODE_PAUSE, [INSTR_XSETBV] = OPCODE_XSETBV, [INSTR_VMRUN] = OPCODE_VMRUN, [INSTR_VMLOAD] = OPCODE_VMLOAD, [INSTR_VMSAVE] = OPCODE_VMSAVE, [INSTR_STGI] = OPCODE_STGI, [INSTR_CLGI] = OPCODE_CLGI, [INSTR_INVLPGA] = OPCODE_INVLPGA, }; static int fetch(struct vcpu *v, u8 *buf, unsigned long addr, int len) { uint32_t pfec; pfec = (vmcb_get_cpl(v->arch.hvm_svm.vmcb) == 3) ? PFEC_user_mode : 0; switch ( hvm_fetch_from_guest_virt(buf, addr, len, pfec) ) { case HVMCOPY_okay: break; case HVMCOPY_bad_gva_to_gfn: /* OK just to give up; we'll have injected #PF already */ return 0; default: /* Not OK: fetches from non-RAM pages are not supportable. 
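 * Instead of attempting an MMIO-backed fetch, the code below logs the
 * guest RIP and linear address and injects #GP(0), making the caller
 * abandon the instruction-length computation.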
*/ gdprintk(XENLOG_WARNING, "Bad instruction fetch at %#lx (%#lx)\n", (unsigned long) guest_cpu_user_regs()->eip, addr); hvm_inject_hw_exception(TRAP_gp_fault, 0); return 0; } return 1; } int __get_instruction_length_from_list(struct vcpu *v, const enum instruction_index *list, unsigned int list_count) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned int i, j, inst_len = 0; enum instruction_index instr = 0; u8 buf[MAX_INST_LEN]; const u8 *opcode = NULL; unsigned long fetch_addr; unsigned int fetch_len; if ( (inst_len = svm_nextrip_insn_length(v)) != 0 ) return inst_len; if ( vmcb->exitcode == VMEXIT_IOIO ) return vmcb->exitinfo2 - vmcb->rip; /* Fetch up to the next page break; we'll fetch from the next page * later if we have to. */ fetch_addr = svm_rip2pointer(v); fetch_len = min_t(unsigned int, MAX_INST_LEN, PAGE_SIZE - (fetch_addr & ~PAGE_MASK)); if ( !fetch(v, buf, fetch_addr, fetch_len) ) return 0; while ( (inst_len < MAX_INST_LEN) && is_prefix(buf[inst_len]) ) { inst_len++; if ( inst_len >= fetch_len ) { if ( !fetch(v, buf + fetch_len, fetch_addr + fetch_len, MAX_INST_LEN - fetch_len) ) return 0; fetch_len = MAX_INST_LEN; } } for ( j = 0; j < list_count; j++ ) { instr = list[j]; opcode = opc_bytes[instr]; for ( i = 0; (i < opcode[0]) && ((inst_len + i) < MAX_INST_LEN); i++ ) { if ( (inst_len + i) >= fetch_len ) { if ( !fetch(v, buf + fetch_len, fetch_addr + fetch_len, MAX_INST_LEN - fetch_len) ) return 0; fetch_len = MAX_INST_LEN; } if ( buf[inst_len+i] != opcode[i+1] ) goto mismatch; } goto done; mismatch: ; } gdprintk(XENLOG_WARNING, "%s: Mismatch between expected and actual instruction bytes: " "eip = %lx\n", __func__, (unsigned long)vmcb->rip); hvm_inject_hw_exception(TRAP_gp_fault, 0); return 0; done: inst_len += opcode[0]; ASSERT(inst_len <= MAX_INST_LEN); return inst_len; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/svm/nestedsvm.c0000664000175000017500000013526012307313555016361 0ustar smbsmb/* * nestedsvm.c: Nested Virtualization * Copyright (c) 2011, Advanced Micro Devices, Inc * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
* */ #include #include #include #include #include #include #include #include /* paging_mode_hap */ #include /* for local_event_delivery_(en|dis)able */ #include /* p2m_get_pagetable, p2m_get_nestedp2m */ #define NSVM_ERROR_VVMCB 1 #define NSVM_ERROR_VMENTRY 2 static void nestedsvm_vcpu_clgi(struct vcpu *v) { /* clear gif flag */ vcpu_nestedsvm(v).ns_gif = 0; local_event_delivery_disable(); /* mask events for PV drivers */ } static void nestedsvm_vcpu_stgi(struct vcpu *v) { /* enable gif flag */ vcpu_nestedsvm(v).ns_gif = 1; local_event_delivery_enable(); /* unmask events for PV drivers */ } static int nestedsvm_vmcb_isvalid(struct vcpu *v, uint64_t vmcxaddr) { /* Address must be 4k aligned */ if ( (vmcxaddr & ~PAGE_MASK) != 0 ) return 0; /* Maximum valid physical address. * See AMD BKDG for HSAVE_PA MSR. */ if ( vmcxaddr > 0xfd00000000ULL ) return 0; return 1; } int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); if (nv->nv_vvmcx != NULL && nv->nv_vvmcxaddr != vmcbaddr) { ASSERT(nv->nv_vvmcxaddr != VMCX_EADDR); hvm_unmap_guest_frame(nv->nv_vvmcx, 1); nv->nv_vvmcx = NULL; nv->nv_vvmcxaddr = VMCX_EADDR; } if (nv->nv_vvmcx == NULL) { nv->nv_vvmcx = hvm_map_guest_frame_rw(vmcbaddr >> PAGE_SHIFT, 1); if (nv->nv_vvmcx == NULL) return 0; nv->nv_vvmcxaddr = vmcbaddr; } return 1; } /* Interface methods */ int nsvm_vcpu_initialise(struct vcpu *v) { void *msrpm; struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct nestedsvm *svm = &vcpu_nestedsvm(v); msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0); svm->ns_cached_msrpm = msrpm; if (msrpm == NULL) goto err; memset(msrpm, 0x0, MSRPM_SIZE); msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0); svm->ns_merged_msrpm = msrpm; if (msrpm == NULL) goto err; memset(msrpm, 0x0, MSRPM_SIZE); nv->nv_n2vmcx = alloc_vmcb(); if (nv->nv_n2vmcx == NULL) goto err; nv->nv_n2vmcx_pa = virt_to_maddr(nv->nv_n2vmcx); return 0; err: nsvm_vcpu_destroy(v); return -ENOMEM; } void nsvm_vcpu_destroy(struct vcpu *v) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct nestedsvm *svm = &vcpu_nestedsvm(v); /* * When destroying the vcpu, it may be running on behalf of l2 guest. * Therefore we need to switch the VMCB pointer back to the l1 vmcb, * in order to avoid double free of l2 vmcb and the possible memory leak * of l1 vmcb page. 
*/ if (nv->nv_n1vmcx) v->arch.hvm_svm.vmcb = nv->nv_n1vmcx; if (svm->ns_cached_msrpm) { free_xenheap_pages(svm->ns_cached_msrpm, get_order_from_bytes(MSRPM_SIZE)); svm->ns_cached_msrpm = NULL; } if (svm->ns_merged_msrpm) { free_xenheap_pages(svm->ns_merged_msrpm, get_order_from_bytes(MSRPM_SIZE)); svm->ns_merged_msrpm = NULL; } hvm_unmap_guest_frame(nv->nv_vvmcx, 1); nv->nv_vvmcx = NULL; if (nv->nv_n2vmcx) { free_vmcb(nv->nv_n2vmcx); nv->nv_n2vmcx = NULL; nv->nv_n2vmcx_pa = VMCX_EADDR; } if (svm->ns_iomap) svm->ns_iomap = NULL; } int nsvm_vcpu_reset(struct vcpu *v) { struct nestedsvm *svm = &vcpu_nestedsvm(v); svm->ns_msr_hsavepa = VMCX_EADDR; svm->ns_ovvmcb_pa = VMCX_EADDR; svm->ns_tscratio = DEFAULT_TSC_RATIO; svm->ns_cr_intercepts = 0; svm->ns_dr_intercepts = 0; svm->ns_exception_intercepts = 0; svm->ns_general1_intercepts = 0; svm->ns_general2_intercepts = 0; svm->ns_lbr_control.bytes = 0; svm->ns_hap_enabled = 0; svm->ns_vmcb_guestcr3 = 0; svm->ns_vmcb_hostcr3 = 0; svm->ns_guest_asid = 0; svm->ns_hostflags.bytes = 0; svm->ns_vmexit.exitinfo1 = 0; svm->ns_vmexit.exitinfo2 = 0; if (svm->ns_iomap) svm->ns_iomap = NULL; nestedsvm_vcpu_stgi(v); return 0; } static uint64_t nestedsvm_fpu_vmentry(uint64_t n1cr0, struct vmcb_struct *vvmcb, struct vmcb_struct *n1vmcb, struct vmcb_struct *n2vmcb) { uint64_t vcr0; vcr0 = vvmcb->_cr0; if ( !(n1cr0 & X86_CR0_TS) && (n1vmcb->_cr0 & X86_CR0_TS) ) { /* svm_fpu_leave() run while l1 guest was running. * Sync FPU state with l2 guest. */ vcr0 |= X86_CR0_TS; n2vmcb->_exception_intercepts |= (1U << TRAP_no_device); } else if ( !(vcr0 & X86_CR0_TS) && (n2vmcb->_cr0 & X86_CR0_TS) ) { /* svm_fpu_enter() run while l1 guest was running. * Sync FPU state with l2 guest. */ vcr0 &= ~X86_CR0_TS; n2vmcb->_exception_intercepts &= ~(1U << TRAP_no_device); } return vcr0; } static void nestedsvm_fpu_vmexit(struct vmcb_struct *n1vmcb, struct vmcb_struct *n2vmcb, uint64_t n1cr0, uint64_t guest_cr0) { if ( !(guest_cr0 & X86_CR0_TS) && (n2vmcb->_cr0 & X86_CR0_TS) ) { /* svm_fpu_leave() run while l2 guest was running. * Sync FPU state with l1 guest. */ n1vmcb->_cr0 |= X86_CR0_TS; n1vmcb->_exception_intercepts |= (1U << TRAP_no_device); } else if ( !(n1cr0 & X86_CR0_TS) && (n1vmcb->_cr0 & X86_CR0_TS) ) { /* svm_fpu_enter() run while l2 guest was running. * Sync FPU state with l1 guest. */ n1vmcb->_cr0 &= ~X86_CR0_TS; n1vmcb->_exception_intercepts &= ~(1U << TRAP_no_device); } } static int nsvm_vcpu_hostsave(struct vcpu *v, unsigned int inst_len) { struct nestedsvm *svm = &vcpu_nestedsvm(v); struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct vmcb_struct *n1vmcb; n1vmcb = nv->nv_n1vmcx; ASSERT(n1vmcb != NULL); n1vmcb->rip += inst_len; /* Save shadowed values. This ensures that the l1 guest * cannot override them to break out. */ n1vmcb->_efer = v->arch.hvm_vcpu.guest_efer; n1vmcb->_cr0 = v->arch.hvm_vcpu.guest_cr[0]; n1vmcb->_cr2 = v->arch.hvm_vcpu.guest_cr[2]; n1vmcb->_cr4 = v->arch.hvm_vcpu.guest_cr[4]; /* Remember the host interrupt flag */ svm->ns_hostflags.fields.rflagsif = (n1vmcb->rflags & X86_EFLAGS_IF) ? 1 : 0; return 0; } int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct nestedsvm *svm = &vcpu_nestedsvm(v); struct vmcb_struct *n1vmcb, *n2vmcb; int rc; n1vmcb = nv->nv_n1vmcx; n2vmcb = nv->nv_n2vmcx; ASSERT(n1vmcb != NULL); ASSERT(n2vmcb != NULL); /* nsvm_vmcb_prepare4vmexit() already saved register values * handled by VMSAVE/VMLOAD into n1vmcb directly. 
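 * (That VMSAVE-handled state covers the fs, gs, tr and ldtr selectors
 * plus kerngsbase, star, lstar, cstar, sfmask and the sysenter MSRs,
 * so the code below only restores EFER, the control registers, the
 * RAX/RSP/RIP/RFLAGS state from n1vmcb and the dr7/cpl/exitintinfo
 * fields.)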
*/ /* switch vmcb to l1 guest's vmcb */ v->arch.hvm_svm.vmcb = n1vmcb; v->arch.hvm_svm.vmcb_pa = nv->nv_n1vmcx_pa; /* EFER */ v->arch.hvm_vcpu.guest_efer = n1vmcb->_efer; rc = hvm_set_efer(n1vmcb->_efer); if (rc != X86EMUL_OKAY) gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc); /* CR4 */ v->arch.hvm_vcpu.guest_cr[4] = n1vmcb->_cr4; rc = hvm_set_cr4(n1vmcb->_cr4); if (rc != X86EMUL_OKAY) gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc); /* CR0 */ nestedsvm_fpu_vmexit(n1vmcb, n2vmcb, svm->ns_cr0, v->arch.hvm_vcpu.guest_cr[0]); v->arch.hvm_vcpu.guest_cr[0] = n1vmcb->_cr0 | X86_CR0_PE; n1vmcb->rflags &= ~X86_EFLAGS_VM; rc = hvm_set_cr0(n1vmcb->_cr0 | X86_CR0_PE); if (rc != X86EMUL_OKAY) gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc); svm->ns_cr0 = v->arch.hvm_vcpu.guest_cr[0]; /* CR2 */ v->arch.hvm_vcpu.guest_cr[2] = n1vmcb->_cr2; hvm_update_guest_cr(v, 2); /* CR3 */ /* Nested paging mode */ if (nestedhvm_paging_mode_hap(v)) { /* host nested paging + guest nested paging. */ /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */ } else if (paging_mode_hap(v->domain)) { /* host nested paging + guest shadow paging. */ /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */ } else { /* host shadow paging + guest shadow paging. */ /* Reset MMU context -- XXX (hostrestore) not yet working*/ if (!pagetable_is_null(v->arch.guest_table)) put_page(pagetable_get_page(v->arch.guest_table)); v->arch.guest_table = pagetable_null(); /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */ } rc = hvm_set_cr3(n1vmcb->_cr3); if (rc != X86EMUL_OKAY) gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc); regs->eax = n1vmcb->rax; regs->esp = n1vmcb->rsp; regs->eip = n1vmcb->rip; regs->eflags = n1vmcb->rflags; n1vmcb->_dr7 = 0; /* disable all breakpoints */ n1vmcb->_cpl = 0; /* Clear exitintinfo to prevent a fault loop of re-injecting * exceptions forever. */ n1vmcb->exitintinfo.bytes = 0; /* Cleanbits */ n1vmcb->cleanbits.bytes = 0; return 0; } static int nsvm_vmrun_permissionmap(struct vcpu *v, bool_t viopm) { struct arch_svm_struct *arch_svm = &v->arch.hvm_svm; struct nestedsvm *svm = &vcpu_nestedsvm(v); struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct vmcb_struct *ns_vmcb = nv->nv_vvmcx; struct vmcb_struct *host_vmcb = arch_svm->vmcb; unsigned long *ns_msrpm_ptr; unsigned int i; enum hvm_copy_result ret; unsigned long *ns_viomap; bool_t ioport_80 = 1, ioport_ed = 1; ns_msrpm_ptr = (unsigned long *)svm->ns_cached_msrpm; ret = hvm_copy_from_guest_phys(svm->ns_cached_msrpm, ns_vmcb->_msrpm_base_pa, MSRPM_SIZE); if (ret != HVMCOPY_okay) { gdprintk(XENLOG_ERR, "hvm_copy_from_guest_phys msrpm %u\n", ret); return 1; } /* Check l1 guest io permission map and get a shadow one based on * if l1 guest intercepts io ports 0x80 and/or 0xED. */ svm->ns_oiomap_pa = svm->ns_iomap_pa; svm->ns_iomap_pa = ns_vmcb->_iopm_base_pa; ns_viomap = hvm_map_guest_frame_ro(svm->ns_iomap_pa >> PAGE_SHIFT, 0); if ( ns_viomap ) { ioport_80 = test_bit(0x80, ns_viomap); ioport_ed = test_bit(0xed, ns_viomap); hvm_unmap_guest_frame(ns_viomap, 0); } svm->ns_iomap = nestedhvm_vcpu_iomap_get(ioport_80, ioport_ed); nv->nv_ioport80 = ioport_80; nv->nv_ioportED = ioport_ed; /* v->arch.hvm_svm.msrpm has type unsigned long, thus * BYTES_PER_LONG. 
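 * The merge below ORs the host (l0) permission map and the l1 guest's
 * cached copy into ns_merged_msrpm one long at a time, so a given MSR
 * access is intercepted whenever either Xen or the l1 guest asks for
 * the intercept; the merged map is what the l2 guest actually runs
 * with.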
*/ for (i = 0; i < MSRPM_SIZE / BYTES_PER_LONG; i++) svm->ns_merged_msrpm[i] = arch_svm->msrpm[i] | ns_msrpm_ptr[i]; host_vmcb->_iopm_base_pa = (uint64_t)virt_to_maddr(svm->ns_iomap); host_vmcb->_msrpm_base_pa = (uint64_t)virt_to_maddr(svm->ns_merged_msrpm); return 0; } static void nestedsvm_vmcb_set_nestedp2m(struct vcpu *v, struct vmcb_struct *vvmcb, struct vmcb_struct *n2vmcb) { struct p2m_domain *p2m; ASSERT(v != NULL); ASSERT(vvmcb != NULL); ASSERT(n2vmcb != NULL); p2m = p2m_get_nestedp2m(v, vvmcb->_h_cr3); n2vmcb->_h_cr3 = pagetable_get_paddr(p2m_get_pagetable(p2m)); } static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct nestedsvm *svm = &vcpu_nestedsvm(v); struct vmcb_struct *ns_vmcb, *n1vmcb, *n2vmcb; bool_t vcleanbits_valid; int rc; uint64_t cr0; ns_vmcb = nv->nv_vvmcx; n1vmcb = nv->nv_n1vmcx; n2vmcb = nv->nv_n2vmcx; ASSERT(ns_vmcb != NULL); ASSERT(n1vmcb != NULL); ASSERT(n2vmcb != NULL); /* Check if virtual VMCB cleanbits are valid */ vcleanbits_valid = 1; if (svm->ns_ovvmcb_pa == VMCX_EADDR) vcleanbits_valid = 0; if (svm->ns_ovvmcb_pa != nv->nv_vvmcxaddr) vcleanbits_valid = 0; #define vcleanbit_set(_name) \ (vcleanbits_valid && ns_vmcb->cleanbits.fields._name) /* Enable l2 guest intercepts */ if (!vcleanbit_set(intercepts)) { svm->ns_cr_intercepts = ns_vmcb->_cr_intercepts; svm->ns_dr_intercepts = ns_vmcb->_dr_intercepts; svm->ns_exception_intercepts = ns_vmcb->_exception_intercepts; svm->ns_general1_intercepts = ns_vmcb->_general1_intercepts; svm->ns_general2_intercepts = ns_vmcb->_general2_intercepts; } /* We could track the cleanbits of the n1vmcb from * last emulated #VMEXIT to this emulated VMRUN to safe the merges * below. Those cleanbits would be tracked in an integer field * in struct nestedsvm. * But this effort is not worth doing because: * - Only the intercepts bit of the n1vmcb can effectively be used here * - The CPU runs more instructions for the tracking than can be * safed here. * The overhead comes from (ordered from highest to lowest): * - svm_ctxt_switch_to (CPU context switching) * - svm_fpu_enter, svm_fpu_leave (lazy FPU switching) * - emulated CLGI (clears VINTR intercept) * - host clears VINTR intercept * Test results show that the overhead is high enough that the * tracked intercepts bit of the n1vmcb is practically *always* cleared. 
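 * The net effect of the ORs below is that a bit set by either Xen or
 * the l1 guest causes a #VMEXIT; whether that exit is handled by Xen
 * itself or reflected to the l1 guest is decided separately, e.g. via
 * nsvm_vmcb_guest_intercepts_exitcode().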
*/ n2vmcb->_cr_intercepts = n1vmcb->_cr_intercepts | ns_vmcb->_cr_intercepts; n2vmcb->_dr_intercepts = n1vmcb->_dr_intercepts | ns_vmcb->_dr_intercepts; n2vmcb->_exception_intercepts = n1vmcb->_exception_intercepts | ns_vmcb->_exception_intercepts; n2vmcb->_general1_intercepts = n1vmcb->_general1_intercepts | ns_vmcb->_general1_intercepts; n2vmcb->_general2_intercepts = n1vmcb->_general2_intercepts | ns_vmcb->_general2_intercepts; /* Nested Pause Filter */ if (ns_vmcb->_general1_intercepts & GENERAL1_INTERCEPT_PAUSE) n2vmcb->_pause_filter_count = min(n1vmcb->_pause_filter_count, ns_vmcb->_pause_filter_count); else n2vmcb->_pause_filter_count = n1vmcb->_pause_filter_count; /* TSC offset */ n2vmcb->_tsc_offset = n1vmcb->_tsc_offset + ns_vmcb->_tsc_offset; /* Nested IO permission bitmaps */ rc = nsvm_vmrun_permissionmap(v, vcleanbit_set(iopm)); if (rc) return rc; /* ASID - Emulation handled in hvm_asid_handle_vmenter() */ /* TLB control */ n2vmcb->tlb_control = ns_vmcb->tlb_control; /* Virtual Interrupts */ if (!vcleanbit_set(tpr)) { n2vmcb->_vintr = ns_vmcb->_vintr; n2vmcb->_vintr.fields.intr_masking = 1; } /* Shadow Mode */ n2vmcb->interrupt_shadow = ns_vmcb->interrupt_shadow; /* Exit codes */ n2vmcb->exitcode = ns_vmcb->exitcode; n2vmcb->exitinfo1 = ns_vmcb->exitinfo1; n2vmcb->exitinfo2 = ns_vmcb->exitinfo2; n2vmcb->exitintinfo = ns_vmcb->exitintinfo; /* Pending Interrupts */ n2vmcb->eventinj = ns_vmcb->eventinj; /* LBR virtualization */ if (!vcleanbit_set(lbr)) { svm->ns_lbr_control = ns_vmcb->lbr_control; } n2vmcb->lbr_control.bytes = n1vmcb->lbr_control.bytes | ns_vmcb->lbr_control.bytes; /* NextRIP - only evaluated on #VMEXIT. */ /* * VMCB Save State Area */ /* Segments */ if (!vcleanbit_set(seg)) { n2vmcb->es = ns_vmcb->es; n2vmcb->cs = ns_vmcb->cs; n2vmcb->ss = ns_vmcb->ss; n2vmcb->ds = ns_vmcb->ds; /* CPL */ n2vmcb->_cpl = ns_vmcb->_cpl; } if (!vcleanbit_set(dt)) { n2vmcb->gdtr = ns_vmcb->gdtr; n2vmcb->idtr = ns_vmcb->idtr; } /* EFER */ v->arch.hvm_vcpu.guest_efer = ns_vmcb->_efer; rc = hvm_set_efer(ns_vmcb->_efer); if (rc != X86EMUL_OKAY) gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc); /* CR4 */ v->arch.hvm_vcpu.guest_cr[4] = ns_vmcb->_cr4; rc = hvm_set_cr4(ns_vmcb->_cr4); if (rc != X86EMUL_OKAY) gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc); /* CR0 */ svm->ns_cr0 = v->arch.hvm_vcpu.guest_cr[0]; cr0 = nestedsvm_fpu_vmentry(svm->ns_cr0, ns_vmcb, n1vmcb, n2vmcb); v->arch.hvm_vcpu.guest_cr[0] = ns_vmcb->_cr0; rc = hvm_set_cr0(cr0); if (rc != X86EMUL_OKAY) gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc); /* CR2 */ v->arch.hvm_vcpu.guest_cr[2] = ns_vmcb->_cr2; hvm_update_guest_cr(v, 2); /* Nested paging mode */ if (nestedhvm_paging_mode_hap(v)) { /* host nested paging + guest nested paging. */ n2vmcb->_np_enable = 1; nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb); /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */ rc = hvm_set_cr3(ns_vmcb->_cr3); if (rc != X86EMUL_OKAY) gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc); } else if (paging_mode_hap(v->domain)) { /* host nested paging + guest shadow paging. */ n2vmcb->_np_enable = 1; /* Keep h_cr3 as it is. */ n2vmcb->_h_cr3 = n1vmcb->_h_cr3; /* When l1 guest does shadow paging * we assume it intercepts page faults. */ /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */ rc = hvm_set_cr3(ns_vmcb->_cr3); if (rc != X86EMUL_OKAY) gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc); } else { /* host shadow paging + guest shadow paging. 
*/ n2vmcb->_np_enable = 0; n2vmcb->_h_cr3 = 0x0; /* TODO: Once shadow-shadow paging is in place come back to here * and set host_vmcb->_cr3 to the shadowed shadow table. */ } /* DRn */ if (!vcleanbit_set(dr)) { n2vmcb->_dr7 = ns_vmcb->_dr7; n2vmcb->_dr6 = ns_vmcb->_dr6; } /* RFLAGS */ n2vmcb->rflags = ns_vmcb->rflags; /* RIP */ n2vmcb->rip = ns_vmcb->rip; /* RSP */ n2vmcb->rsp = ns_vmcb->rsp; /* RAX */ n2vmcb->rax = ns_vmcb->rax; /* Keep the host values of the fs, gs, ldtr, tr, kerngsbase, * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp, * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation. */ /* Page tables */ n2vmcb->pdpe0 = ns_vmcb->pdpe0; n2vmcb->pdpe1 = ns_vmcb->pdpe1; n2vmcb->pdpe2 = ns_vmcb->pdpe2; n2vmcb->pdpe3 = ns_vmcb->pdpe3; /* PAT */ if (!vcleanbit_set(np)) { n2vmcb->_g_pat = ns_vmcb->_g_pat; } if (!vcleanbit_set(lbr)) { /* Debug Control MSR */ n2vmcb->_debugctlmsr = ns_vmcb->_debugctlmsr; /* LBR MSRs */ n2vmcb->_lastbranchfromip = ns_vmcb->_lastbranchfromip; n2vmcb->_lastbranchtoip = ns_vmcb->_lastbranchtoip; n2vmcb->_lastintfromip = ns_vmcb->_lastintfromip; n2vmcb->_lastinttoip = ns_vmcb->_lastinttoip; } /* Cleanbits */ n2vmcb->cleanbits.bytes = 0; rc = svm_vmcb_isvalid(__func__, ns_vmcb, 1); if (rc) { gdprintk(XENLOG_ERR, "virtual vmcb invalid\n"); return NSVM_ERROR_VVMCB; } rc = svm_vmcb_isvalid(__func__, n2vmcb, 1); if (rc) { gdprintk(XENLOG_ERR, "n2vmcb invalid\n"); return NSVM_ERROR_VMENTRY; } /* Switch guest registers to l2 guest */ regs->eax = ns_vmcb->rax; regs->eip = ns_vmcb->rip; regs->esp = ns_vmcb->rsp; regs->eflags = ns_vmcb->rflags; #undef vcleanbit_set return 0; } static int nsvm_vcpu_vmentry(struct vcpu *v, struct cpu_user_regs *regs, unsigned int inst_len) { int ret; struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct nestedsvm *svm = &vcpu_nestedsvm(v); struct vmcb_struct *ns_vmcb; ns_vmcb = nv->nv_vvmcx; ASSERT(ns_vmcb != NULL); ASSERT(nv->nv_n2vmcx != NULL); ASSERT(nv->nv_n2vmcx_pa != VMCX_EADDR); /* Save values for later use. Needed for Nested-on-Nested and * Shadow-on-Shadow paging. */ svm->ns_vmcb_guestcr3 = ns_vmcb->_cr3; svm->ns_vmcb_hostcr3 = ns_vmcb->_h_cr3; /* Convert explicitely to boolean. Deals with l1 guests * that use flush-by-asid w/o checking the cpuid bits */ nv->nv_flushp2m = !!ns_vmcb->tlb_control; if ( svm->ns_guest_asid != ns_vmcb->_guest_asid ) { nv->nv_flushp2m = 1; hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(v).nv_n2asid); svm->ns_guest_asid = ns_vmcb->_guest_asid; } /* nested paging for the guest */ svm->ns_hap_enabled = (ns_vmcb->_np_enable) ? 1 : 0; /* Remember the V_INTR_MASK in hostflags */ svm->ns_hostflags.fields.vintrmask = (ns_vmcb->_vintr.fields.intr_masking) ? 
1 : 0; /* Save l1 guest state (= host state) */ ret = nsvm_vcpu_hostsave(v, inst_len); if (ret) { gdprintk(XENLOG_ERR, "hostsave failed, ret = %i\n", ret); return ret; } /* switch vmcb to shadow vmcb */ v->arch.hvm_svm.vmcb = nv->nv_n2vmcx; v->arch.hvm_svm.vmcb_pa = nv->nv_n2vmcx_pa; ret = nsvm_vmcb_prepare4vmrun(v, regs); if (ret) { gdprintk(XENLOG_ERR, "prepare4vmrun failed, ret = %i\n", ret); return ret; } nestedsvm_vcpu_stgi(v); return 0; } int nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs) { int ret; unsigned int inst_len; struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct nestedsvm *svm = &vcpu_nestedsvm(v); inst_len = __get_instruction_length(v, INSTR_VMRUN); if (inst_len == 0) { svm->ns_vmexit.exitcode = VMEXIT_SHUTDOWN; return -1; } nv->nv_vmswitch_in_progress = 1; ASSERT(nv->nv_vvmcx != NULL); /* save host state */ ret = nsvm_vcpu_vmentry(v, regs, inst_len); /* Switch vcpu to guest mode. In the error case * this ensures the host mode is restored correctly * and l1 guest keeps alive. */ nestedhvm_vcpu_enter_guestmode(v); switch (ret) { case 0: break; case NSVM_ERROR_VVMCB: gdprintk(XENLOG_ERR, "inject VMEXIT(INVALID)\n"); svm->ns_vmexit.exitcode = VMEXIT_INVALID; return -1; case NSVM_ERROR_VMENTRY: default: gdprintk(XENLOG_ERR, "nsvm_vcpu_vmentry failed, injecting #UD\n"); hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); /* Must happen after hvm_inject_hw_exception or it doesn't work right. */ nv->nv_vmswitch_in_progress = 0; return 1; } /* If l1 guest uses shadow paging, update the paging mode. */ if (!nestedhvm_paging_mode_hap(v)) paging_update_paging_modes(v); nv->nv_vmswitch_in_progress = 0; return 0; } int nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct nestedsvm *svm = &vcpu_nestedsvm(v); struct vmcb_struct *ns_vmcb; ASSERT(svm->ns_gif == 0); ns_vmcb = nv->nv_vvmcx; if (nv->nv_vmexit_pending) { switch (exitcode) { case VMEXIT_INTR: if ( unlikely(ns_vmcb->eventinj.fields.v) && nv->nv_vmentry_pending && hvm_event_needs_reinjection(ns_vmcb->eventinj.fields.type, ns_vmcb->eventinj.fields.vector) ) { ns_vmcb->exitintinfo.bytes = ns_vmcb->eventinj.bytes; } break; case VMEXIT_EXCEPTION_PF: ns_vmcb->_cr2 = ns_vmcb->exitinfo2; /* fall through */ case VMEXIT_NPF: /* PF error code */ ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1; /* fault address */ ns_vmcb->exitinfo2 = svm->ns_vmexit.exitinfo2; break; case VMEXIT_EXCEPTION_NP: case VMEXIT_EXCEPTION_SS: case VMEXIT_EXCEPTION_GP: case VMEXIT_EXCEPTION_15: case VMEXIT_EXCEPTION_MF: case VMEXIT_EXCEPTION_AC: ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1; break; default: break; } } ns_vmcb->exitcode = exitcode; ns_vmcb->eventinj.bytes = 0; return 0; } int nsvm_vcpu_vmexit_trap(struct vcpu *v, struct hvm_trap *trap) { ASSERT(vcpu_nestedhvm(v).nv_vvmcx != NULL); nestedsvm_vmexit_defer(v, VMEXIT_EXCEPTION_DE + trap->vector, trap->error_code, trap->cr2); return NESTEDHVM_VMEXIT_DONE; } uint64_t nsvm_vcpu_guestcr3(struct vcpu *v) { return vcpu_nestedsvm(v).ns_vmcb_guestcr3; } uint64_t nsvm_vcpu_hostcr3(struct vcpu *v) { return vcpu_nestedsvm(v).ns_vmcb_hostcr3; } uint32_t nsvm_vcpu_asid(struct vcpu *v) { return vcpu_nestedsvm(v).ns_guest_asid; } static int nsvm_vmcb_guest_intercepts_msr(unsigned long *msr_bitmap, uint32_t msr, bool_t write) { bool_t enabled; unsigned long *msr_bit; msr_bit = svm_msrbit(msr_bitmap, msr); if (msr_bit == NULL) /* MSR not in the permission map: Let the guest handle it. 
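 * Each MSR takes two consecutive bits in the permission map: the even
 * bit intercepts reads and the odd bit intercepts writes.  That is why
 * the code below masks the MSR index to 0x1fff (the per-range span
 * resolved by svm_msrbit) and then tests bit msr*2 or msr*2 + 1.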
*/ return NESTEDHVM_VMEXIT_INJECT; msr &= 0x1fff; if (write) /* write access */ enabled = test_bit(msr * 2 + 1, msr_bit); else /* read access */ enabled = test_bit(msr * 2, msr_bit); if (!enabled) return NESTEDHVM_VMEXIT_HOST; return NESTEDHVM_VMEXIT_INJECT; } static int nsvm_vmcb_guest_intercepts_ioio(paddr_t iopm_pa, uint64_t exitinfo1) { unsigned long gfn = iopm_pa >> PAGE_SHIFT; unsigned long *io_bitmap; ioio_info_t ioinfo; uint16_t port; unsigned int size; bool_t enabled; ioinfo.bytes = exitinfo1; port = ioinfo.fields.port; size = ioinfo.fields.sz32 ? 4 : ioinfo.fields.sz16 ? 2 : 1; switch ( port ) { case 0 ... 8 * PAGE_SIZE - 1: /* first 4KB page */ break; case 8 * PAGE_SIZE ... 2 * 8 * PAGE_SIZE - 1: /* second 4KB page */ port -= 8 * PAGE_SIZE; ++gfn; break; default: BUG(); break; } for ( io_bitmap = hvm_map_guest_frame_ro(gfn, 0); ; ) { enabled = io_bitmap && test_bit(port, io_bitmap); if ( !enabled || !--size ) break; if ( unlikely(++port == 8 * PAGE_SIZE) ) { hvm_unmap_guest_frame(io_bitmap, 0); io_bitmap = hvm_map_guest_frame_ro(++gfn, 0); port -= 8 * PAGE_SIZE; } } hvm_unmap_guest_frame(io_bitmap, 0); if ( !enabled ) return NESTEDHVM_VMEXIT_HOST; return NESTEDHVM_VMEXIT_INJECT; } int nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode) { uint64_t exit_bits; struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct nestedsvm *svm = &vcpu_nestedsvm(v); struct vmcb_struct *ns_vmcb = nv->nv_vvmcx; enum nestedhvm_vmexits vmexits; switch (exitcode) { case VMEXIT_CR0_READ ... VMEXIT_CR15_READ: case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE: exit_bits = 1ULL << (exitcode - VMEXIT_CR0_READ); if (svm->ns_cr_intercepts & exit_bits) break; return 0; case VMEXIT_DR0_READ ... VMEXIT_DR7_READ: case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE: exit_bits = 1ULL << (exitcode - VMEXIT_DR0_READ); if (svm->ns_dr_intercepts & exit_bits) break; return 0; case VMEXIT_EXCEPTION_DE ... VMEXIT_EXCEPTION_XF: exit_bits = 1ULL << (exitcode - VMEXIT_EXCEPTION_DE); if (svm->ns_exception_intercepts & exit_bits) break; return 0; case VMEXIT_INTR ... VMEXIT_SHUTDOWN: exit_bits = 1ULL << (exitcode - VMEXIT_INTR); if (svm->ns_general1_intercepts & exit_bits) break; return 0; case VMEXIT_VMRUN ... 
VMEXIT_XSETBV: exit_bits = 1ULL << (exitcode - VMEXIT_VMRUN); if (svm->ns_general2_intercepts & exit_bits) break; return 0; case VMEXIT_NPF: if (nestedhvm_paging_mode_hap(v)) break; return 0; case VMEXIT_INVALID: /* Always intercepted */ break; default: gdprintk(XENLOG_ERR, "Illegal exitcode %#"PRIx64"\n", exitcode); BUG(); break; } /* Special cases: Do more detailed checks */ switch (exitcode) { case VMEXIT_MSR: ASSERT(regs != NULL); if ( !nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr) ) break; ns_vmcb = nv->nv_vvmcx; vmexits = nsvm_vmcb_guest_intercepts_msr(svm->ns_cached_msrpm, regs->ecx, ns_vmcb->exitinfo1 != 0); if (vmexits == NESTEDHVM_VMEXIT_HOST) return 0; break; case VMEXIT_IOIO: if ( !nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr) ) break; ns_vmcb = nv->nv_vvmcx; vmexits = nsvm_vmcb_guest_intercepts_ioio(ns_vmcb->_iopm_base_pa, ns_vmcb->exitinfo1); if (vmexits == NESTEDHVM_VMEXIT_HOST) return 0; break; } return 1; } int nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr, int errcode) { return nsvm_vmcb_guest_intercepts_exitcode(v, guest_cpu_user_regs(), VMEXIT_EXCEPTION_DE + trapnr); } static int nsvm_vmcb_prepare4vmexit(struct vcpu *v, struct cpu_user_regs *regs) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct nestedsvm *svm = &vcpu_nestedsvm(v); struct vmcb_struct *ns_vmcb = nv->nv_vvmcx; struct vmcb_struct *n2vmcb = nv->nv_n2vmcx; svm_vmsave(nv->nv_n1vmcx); /* Cache guest physical address of virtual vmcb * for VMCB Cleanbit emulation. */ svm->ns_ovvmcb_pa = nv->nv_vvmcxaddr; /* Intercepts - keep them as they are */ /* Pausefilter - keep it as is */ /* Nested IO permission bitmap */ /* Just keep the iopm_base_pa and msrpm_base_pa values. * The guest must not see the virtualized values. */ /* TSC offset */ /* Keep it. It's maintainted by the l1 guest. */ /* ASID */ /* ns_vmcb->_guest_asid = n2vmcb->_guest_asid; */ /* TLB control */ ns_vmcb->tlb_control = 0; /* Virtual Interrupts */ ns_vmcb->_vintr = n2vmcb->_vintr; if (!(svm->ns_hostflags.fields.vintrmask)) ns_vmcb->_vintr.fields.intr_masking = 0; /* Shadow mode */ ns_vmcb->interrupt_shadow = n2vmcb->interrupt_shadow; /* Exit codes */ ns_vmcb->exitcode = n2vmcb->exitcode; ns_vmcb->exitinfo1 = n2vmcb->exitinfo1; ns_vmcb->exitinfo2 = n2vmcb->exitinfo2; ns_vmcb->exitintinfo = n2vmcb->exitintinfo; /* Interrupts */ /* If we emulate a VMRUN/#VMEXIT in the same host #VMEXIT cycle we have * to make sure that we do not lose injected events. So check eventinj * here and copy it to exitintinfo if it is valid. * exitintinfo and eventinj can't be both valid because the case below * only happens on a VMRUN instruction intercept which has no valid * exitintinfo set. */ if ( unlikely(n2vmcb->eventinj.fields.v) && hvm_event_needs_reinjection(n2vmcb->eventinj.fields.type, n2vmcb->eventinj.fields.vector) ) { ns_vmcb->exitintinfo = n2vmcb->eventinj; } ns_vmcb->eventinj.bytes = 0; /* Nested paging mode */ if (nestedhvm_paging_mode_hap(v)) { /* host nested paging + guest nested paging. */ ns_vmcb->_np_enable = n2vmcb->_np_enable; ns_vmcb->_cr3 = n2vmcb->_cr3; /* The vmcb->h_cr3 is the shadowed h_cr3. The original * unshadowed guest h_cr3 is kept in ns_vmcb->h_cr3, * hence we keep the ns_vmcb->h_cr3 value. */ } else if (paging_mode_hap(v->domain)) { /* host nested paging + guest shadow paging. */ ns_vmcb->_np_enable = 0; /* Throw h_cr3 away. Guest is not allowed to set it or * it can break out, otherwise (security hole!) */ ns_vmcb->_h_cr3 = 0x0; /* Stop intercepting #PF (already done above * by restoring cached intercepts). 
*/ ns_vmcb->_cr3 = n2vmcb->_cr3; } else { /* host shadow paging + guest shadow paging. */ ns_vmcb->_np_enable = 0; ns_vmcb->_h_cr3 = 0x0; /* The vmcb->_cr3 is the shadowed cr3. The original * unshadowed guest cr3 is kept in ns_vmcb->_cr3, * hence we keep the ns_vmcb->_cr3 value. */ } /* LBR virtualization - keep lbr control as is */ /* NextRIP */ ns_vmcb->nextrip = n2vmcb->nextrip; /* Decode Assist */ ns_vmcb->guest_ins_len = n2vmcb->guest_ins_len; memcpy(ns_vmcb->guest_ins, n2vmcb->guest_ins, sizeof(ns_vmcb->guest_ins)); /* * VMCB Save State Area */ /* Segments */ ns_vmcb->es = n2vmcb->es; ns_vmcb->cs = n2vmcb->cs; ns_vmcb->ss = n2vmcb->ss; ns_vmcb->ds = n2vmcb->ds; ns_vmcb->gdtr = n2vmcb->gdtr; ns_vmcb->idtr = n2vmcb->idtr; /* CPL */ ns_vmcb->_cpl = n2vmcb->_cpl; /* EFER */ ns_vmcb->_efer = n2vmcb->_efer; /* CRn */ ns_vmcb->_cr4 = n2vmcb->_cr4; ns_vmcb->_cr0 = n2vmcb->_cr0; /* DRn */ ns_vmcb->_dr7 = n2vmcb->_dr7; ns_vmcb->_dr6 = n2vmcb->_dr6; /* Restore registers from regs as those values * can be newer than in n2vmcb (e.g. due to an * instruction emulation right before). */ /* RFLAGS */ ns_vmcb->rflags = n2vmcb->rflags = regs->rflags; /* RIP */ ns_vmcb->rip = n2vmcb->rip = regs->rip; /* RSP */ ns_vmcb->rsp = n2vmcb->rsp = regs->rsp; /* RAX */ ns_vmcb->rax = n2vmcb->rax = regs->rax; /* Keep the l2 guest values of the fs, gs, ldtr, tr, kerngsbase, * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp, * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation. */ /* CR2 */ ns_vmcb->_cr2 = n2vmcb->_cr2; /* Page tables */ ns_vmcb->pdpe0 = n2vmcb->pdpe0; ns_vmcb->pdpe1 = n2vmcb->pdpe1; ns_vmcb->pdpe2 = n2vmcb->pdpe2; ns_vmcb->pdpe3 = n2vmcb->pdpe3; /* PAT */ ns_vmcb->_g_pat = n2vmcb->_g_pat; /* Debug Control MSR */ ns_vmcb->_debugctlmsr = n2vmcb->_debugctlmsr; /* LBR MSRs */ ns_vmcb->_lastbranchfromip = n2vmcb->_lastbranchfromip; ns_vmcb->_lastbranchtoip = n2vmcb->_lastbranchtoip; ns_vmcb->_lastintfromip = n2vmcb->_lastintfromip; ns_vmcb->_lastinttoip = n2vmcb->_lastinttoip; return 0; } bool_t nsvm_vmcb_hap_enabled(struct vcpu *v) { return vcpu_nestedsvm(v).ns_hap_enabled; } /* This function uses L2_gpa to walk the P2M page table in L1. If the * walk is successful, the translated value is returned in * L1_gpa. The result value tells what to do next. 
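 * A successful walk composes the result from the translated frame plus
 * the original page offset, i.e. (gfn << PAGE_SHIFT) + the low bits of
 * L2_gpa, and returns NESTEDHVM_PAGEFAULT_DONE; a failed walk returns
 * NESTEDHVM_PAGEFAULT_INJECT so the fault is reflected to the l1
 * guest.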
*/ int nsvm_hap_walk_L1_p2m(struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa, unsigned int *page_order, uint8_t *p2m_acc, bool_t access_r, bool_t access_w, bool_t access_x) { uint32_t pfec; unsigned long nested_cr3, gfn; nested_cr3 = nhvm_vcpu_p2m_base(v); pfec = PFEC_user_mode | PFEC_page_present; if ( access_w ) pfec |= PFEC_write_access; if ( access_x ) pfec |= PFEC_insn_fetch; /* Walk the guest-supplied NPT table, just as if it were a pagetable */ gfn = paging_ga_to_gfn_cr3(v, nested_cr3, L2_gpa, &pfec, page_order); if ( gfn == INVALID_GFN ) return NESTEDHVM_PAGEFAULT_INJECT; *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK); return NESTEDHVM_PAGEFAULT_DONE; } enum hvm_intblk nsvm_intr_blocked(struct vcpu *v) { struct nestedsvm *svm = &vcpu_nestedsvm(v); struct nestedvcpu *nv = &vcpu_nestedhvm(v); ASSERT(nestedhvm_enabled(v->domain)); if ( !nestedsvm_gif_isset(v) ) return hvm_intblk_svm_gif; if ( nestedhvm_vcpu_in_guestmode(v) ) { struct vmcb_struct *n2vmcb = nv->nv_n2vmcx; if ( svm->ns_hostflags.fields.vintrmask ) if ( !svm->ns_hostflags.fields.rflagsif ) return hvm_intblk_rflags_ie; /* when l1 guest passes its devices through to the l2 guest * and l2 guest does an MMIO access then we may want to * inject an VMEXIT(#INTR) exitcode into the l1 guest. * Delay the injection because this would result in delivering * an interrupt *within* the execution of an instruction. */ if ( v->arch.hvm_vcpu.hvm_io.io_state != HVMIO_none ) return hvm_intblk_shadow; if ( !nv->nv_vmexit_pending && n2vmcb->exitintinfo.bytes != 0 ) { /* Give the l2 guest a chance to finish the delivery of * the last injected interrupt or exception before we * emulate a VMEXIT (e.g. VMEXIT(INTR) ). */ return hvm_intblk_shadow; } } if ( nv->nv_vmexit_pending ) { /* hvm_inject_hw_exception() must have run before. * exceptions have higher priority than interrupts. */ return hvm_intblk_rflags_ie; } return hvm_intblk_none; } /* MSR handling */ int nsvm_rdmsr(struct vcpu *v, unsigned int msr, uint64_t *msr_content) { struct nestedsvm *svm = &vcpu_nestedsvm(v); int ret = 1; *msr_content = 0; switch (msr) { case MSR_K8_VM_CR: break; case MSR_K8_VM_HSAVE_PA: *msr_content = svm->ns_msr_hsavepa; break; case MSR_AMD64_TSC_RATIO: *msr_content = svm->ns_tscratio; break; default: ret = 0; break; } return ret; } int nsvm_wrmsr(struct vcpu *v, unsigned int msr, uint64_t msr_content) { int ret = 1; struct nestedsvm *svm = &vcpu_nestedsvm(v); switch (msr) { case MSR_K8_VM_CR: /* ignore write. handle all bits as read-only. 
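 * The two writable MSRs below are validated before being latched:
 * MSR_K8_VM_HSAVE_PA must satisfy nestedsvm_vmcb_isvalid() (4k aligned
 * and within the supported physical address range), and
 * MSR_AMD64_TSC_RATIO must not have reserved bits set; a bad value
 * makes the caller inject #GP instead.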
*/ break; case MSR_K8_VM_HSAVE_PA: if (!nestedsvm_vmcb_isvalid(v, msr_content)) { gdprintk(XENLOG_ERR, "MSR_K8_VM_HSAVE_PA value invalid %#"PRIx64"\n", msr_content); ret = -1; /* inject #GP */ break; } svm->ns_msr_hsavepa = msr_content; break; case MSR_AMD64_TSC_RATIO: if ((msr_content & ~TSC_RATIO_RSVD_BITS) != msr_content) { gdprintk(XENLOG_ERR, "reserved bits set in MSR_AMD64_TSC_RATIO %#"PRIx64"\n", msr_content); ret = -1; /* inject #GP */ break; } svm->ns_tscratio = msr_content; break; default: ret = 0; break; } return ret; } /* VMEXIT emulation */ void nestedsvm_vmexit_defer(struct vcpu *v, uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2) { struct nestedsvm *svm = &vcpu_nestedsvm(v); nestedsvm_vcpu_clgi(v); svm->ns_vmexit.exitcode = exitcode; svm->ns_vmexit.exitinfo1 = exitinfo1; svm->ns_vmexit.exitinfo2 = exitinfo2; vcpu_nestedhvm(v).nv_vmexit_pending = 1; } enum nestedhvm_vmexits nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode) { bool_t is_intercepted; ASSERT(vcpu_nestedhvm(v).nv_vmexit_pending == 0); is_intercepted = nsvm_vmcb_guest_intercepts_exitcode(v, regs, exitcode); switch (exitcode) { case VMEXIT_INVALID: if (is_intercepted) return NESTEDHVM_VMEXIT_INJECT; return NESTEDHVM_VMEXIT_HOST; case VMEXIT_INTR: case VMEXIT_NMI: return NESTEDHVM_VMEXIT_HOST; case VMEXIT_EXCEPTION_NM: /* Host must handle lazy fpu context switching first. * Then inject the VMEXIT if L1 guest intercepts this. */ return NESTEDHVM_VMEXIT_HOST; case VMEXIT_NPF: if (nestedhvm_paging_mode_hap(v)) { if (!is_intercepted) return NESTEDHVM_VMEXIT_FATALERROR; /* host nested paging + guest nested paging */ return NESTEDHVM_VMEXIT_HOST; } if (paging_mode_hap(v->domain)) { if (is_intercepted) return NESTEDHVM_VMEXIT_FATALERROR; /* host nested paging + guest shadow paging */ return NESTEDHVM_VMEXIT_HOST; } /* host shadow paging + guest shadow paging */ /* Can this happen? */ BUG(); return NESTEDHVM_VMEXIT_FATALERROR; case VMEXIT_EXCEPTION_PF: if (nestedhvm_paging_mode_hap(v)) { /* host nested paging + guest nested paging */ if (!is_intercepted) /* l1 guest intercepts #PF unnecessarily */ return NESTEDHVM_VMEXIT_HOST; /* l2 guest intercepts #PF unnecessarily */ return NESTEDHVM_VMEXIT_INJECT; } if (!paging_mode_hap(v->domain)) { /* host shadow paging + guest shadow paging */ return NESTEDHVM_VMEXIT_HOST; } /* host nested paging + guest shadow paging */ return NESTEDHVM_VMEXIT_INJECT; case VMEXIT_VMMCALL: /* Always let the guest handle VMMCALL/VMCALL */ return NESTEDHVM_VMEXIT_INJECT; default: break; } if (is_intercepted) return NESTEDHVM_VMEXIT_INJECT; return NESTEDHVM_VMEXIT_HOST; } enum nestedhvm_vmexits nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs) { int rc; enum nestedhvm_vmexits ret = NESTEDHVM_VMEXIT_DONE; ASSERT(vcpu_nestedhvm(v).nv_vmswitch_in_progress); ASSERT(nestedhvm_vcpu_in_guestmode(v)); rc = nsvm_vmcb_prepare4vmexit(v, regs); if (rc) ret = NESTEDHVM_VMEXIT_ERROR; rc = nhvm_vcpu_hostrestore(v, regs); if (rc) ret = NESTEDHVM_VMEXIT_FATALERROR; nestedhvm_vcpu_exit_guestmode(v); return ret; } /* The exitcode is in native SVM/VMX format. The forced exitcode * is in generic format. */ static enum nestedhvm_vmexits nestedsvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode) { int rc; struct nestedvcpu *nv = &vcpu_nestedhvm(v); nv->nv_vmswitch_in_progress = 1; ASSERT(nv->nv_vvmcx != NULL); /* On special intercepts the host has to handle * the vcpu is still in guest mode here. 
*/ if (nestedhvm_vcpu_in_guestmode(v)) { enum nestedhvm_vmexits ret; ret = nestedsvm_vmexit_n2n1(v, regs); switch (ret) { case NESTEDHVM_VMEXIT_FATALERROR: gdprintk(XENLOG_ERR, "VMEXIT: fatal error\n"); return ret; case NESTEDHVM_VMEXIT_HOST: BUG(); return ret; case NESTEDHVM_VMEXIT_ERROR: exitcode = VMEXIT_INVALID; break; default: ASSERT(!nestedhvm_vcpu_in_guestmode(v)); break; } /* host state has been restored */ } ASSERT(!nestedhvm_vcpu_in_guestmode(v)); /* Prepare for running the l1 guest. Make the actual * modifications to the virtual VMCB/VMCS. */ rc = nhvm_vcpu_vmexit(v, regs, exitcode); /* If l1 guest uses shadow paging, update the paging mode. */ if (!nestedhvm_paging_mode_hap(v)) paging_update_paging_modes(v); nv->nv_vmswitch_in_progress = 0; if (rc) return NESTEDHVM_VMEXIT_FATALERROR; return NESTEDHVM_VMEXIT_DONE; } /* VCPU switch */ void nsvm_vcpu_switch(struct cpu_user_regs *regs) { struct vcpu *v = current; struct nestedvcpu *nv; struct nestedsvm *svm; if (!nestedhvm_enabled(v->domain)) return; nv = &vcpu_nestedhvm(v); svm = &vcpu_nestedsvm(v); ASSERT(v->arch.hvm_svm.vmcb != NULL); ASSERT(nv->nv_n1vmcx != NULL); ASSERT(nv->nv_n2vmcx != NULL); ASSERT(nv->nv_n1vmcx_pa != VMCX_EADDR); ASSERT(nv->nv_n2vmcx_pa != VMCX_EADDR); if (nv->nv_vmexit_pending) { vmexit: nestedsvm_vcpu_vmexit(v, regs, svm->ns_vmexit.exitcode); nv->nv_vmexit_pending = 0; nv->nv_vmentry_pending = 0; return; } if (nv->nv_vmentry_pending) { int ret; ASSERT(!nv->nv_vmexit_pending); ret = nsvm_vcpu_vmrun(v, regs); if (ret) goto vmexit; ASSERT(nestedhvm_vcpu_in_guestmode(v)); nv->nv_vmentry_pending = 0; } if (nestedhvm_vcpu_in_guestmode(v) && nestedhvm_paging_mode_hap(v)) { /* In case left the l2 guest due to a physical interrupt (e.g. IPI) * that is not for the l1 guest then we continue running the l2 guest * but check if the nestedp2m is still valid. 
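 * The nested p2m can have been flushed (nv_p2m == NULL) while we were out
 * of the l2 guest; if so, re-establish it from the virtual VMCB before the
 * next VMRUN.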
*/ if (nv->nv_p2m == NULL) nestedsvm_vmcb_set_nestedp2m(v, nv->nv_vvmcx, nv->nv_n2vmcx); } } /* Interrupts, Virtual GIF */ int nestedsvm_vcpu_interrupt(struct vcpu *v, const struct hvm_intack intack) { int ret; enum hvm_intblk intr; uint64_t exitcode = VMEXIT_INTR; uint64_t exitinfo2 = 0; ASSERT(nestedhvm_vcpu_in_guestmode(v)); intr = nhvm_interrupt_blocked(v); if ( intr != hvm_intblk_none ) return NSVM_INTR_MASKED; switch (intack.source) { case hvm_intsrc_pic: case hvm_intsrc_lapic: case hvm_intsrc_vector: exitcode = VMEXIT_INTR; exitinfo2 = intack.vector; break; case hvm_intsrc_nmi: exitcode = VMEXIT_NMI; exitinfo2 = intack.vector; break; case hvm_intsrc_mce: exitcode = VMEXIT_EXCEPTION_MC; exitinfo2 = intack.vector; break; case hvm_intsrc_none: return NSVM_INTR_NOTHANDLED; default: BUG(); } ret = nsvm_vmcb_guest_intercepts_exitcode(v, guest_cpu_user_regs(), exitcode); if (ret) { nestedsvm_vmexit_defer(v, exitcode, intack.source, exitinfo2); return NSVM_INTR_FORCEVMEXIT; } return NSVM_INTR_NOTINTERCEPTED; } bool_t nestedsvm_gif_isset(struct vcpu *v) { struct nestedsvm *svm = &vcpu_nestedsvm(v); return (!!svm->ns_gif); } void svm_vmexit_do_stgi(struct cpu_user_regs *regs, struct vcpu *v) { unsigned int inst_len; if ( !nestedhvm_enabled(v->domain) ) { hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); return; } if ( (inst_len = __get_instruction_length(v, INSTR_STGI)) == 0 ) return; nestedsvm_vcpu_stgi(v); __update_guest_eip(regs, inst_len); } void svm_vmexit_do_clgi(struct cpu_user_regs *regs, struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned int inst_len; uint32_t general1_intercepts = vmcb_get_general1_intercepts(vmcb); vintr_t intr; if ( !nestedhvm_enabled(v->domain) ) { hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); return; } if ( (inst_len = __get_instruction_length(v, INSTR_CLGI)) == 0 ) return; nestedsvm_vcpu_clgi(v); /* After a CLGI no interrupts should come */ intr = vmcb_get_vintr(vmcb); intr.fields.irq = 0; general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR; vmcb_set_vintr(vmcb, intr); vmcb_set_general1_intercepts(vmcb, general1_intercepts); __update_guest_eip(regs, inst_len); } xen-4.4.0/xen/arch/x86/hvm/svm/asid.c0000664000175000017500000000410212307313555015257 0ustar smbsmb/* * asid.c: handling ASIDs in SVM. * Copyright (c) 2007, Advanced Micro Devices, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include void svm_asid_init(struct cpuinfo_x86 *c) { int nasids = 0; /* Check for erratum #170, and leave ASIDs disabled if it's present. */ if ( !cpu_has_amd_erratum(c, AMD_ERRATUM_170) ) nasids = cpuid_ebx(0x8000000A); hvm_asid_init(nasids); } /* * Called directly before VMRUN. Checks if the VCPU needs a new ASID, * assigns it, and if required, issues required TLB flushes. 
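 * If ASIDs are disabled (hvm_asid_handle_vmenter() hands back ASID 0), the
 * code below falls back to ASID 1 and requests a full TLB flush on every
 * VMRUN; otherwise tlb_control is set only when the ASID logic asks for a
 * flush.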
*/ void svm_asid_handle_vmrun(void) { struct vcpu *curr = current; struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb; struct hvm_vcpu_asid *p_asid = nestedhvm_vcpu_in_guestmode(curr) ? &vcpu_nestedhvm(curr).nv_n2asid : &curr->arch.hvm_vcpu.n1asid; bool_t need_flush = hvm_asid_handle_vmenter(p_asid); /* ASID 0 indicates that ASIDs are disabled. */ if ( p_asid->asid == 0 ) { vmcb_set_guest_asid(vmcb, 1); vmcb->tlb_control = 1; return; } if (vmcb_get_guest_asid(vmcb) != p_asid->asid) vmcb_set_guest_asid(vmcb, p_asid->asid); vmcb->tlb_control = need_flush; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/svm/svm.c0000664000175000017500000021366412307313555015163 0ustar smbsmb/* * svm.c: handling SVM architecture-related VM exits * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005-2007, Advanced Micro Devices, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void svm_asm_do_resume(void); u32 svm_feature_flags; /* Indicates whether guests may use EFER.LMSLE. */ bool_t cpu_has_lmsl; #define set_segment_register(name, value) \ asm volatile ( "movw %%ax ,%%" STR(name) "" : : "a" (value) ) static void svm_update_guest_efer(struct vcpu *); static struct hvm_function_table svm_function_table; /* va of hardware host save area */ static DEFINE_PER_CPU_READ_MOSTLY(void *, hsa); /* vmcb used for extended host state */ static DEFINE_PER_CPU_READ_MOSTLY(void *, root_vmcb); static bool_t amd_erratum383_found __read_mostly; /* OSVW bits */ static uint64_t osvw_length, osvw_status; static DEFINE_SPINLOCK(osvw_lock); void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len) { struct vcpu *curr = current; if ( unlikely(inst_len == 0) ) return; if ( unlikely(inst_len > 15) ) { gdprintk(XENLOG_ERR, "Bad instruction length %u\n", inst_len); domain_crash(curr->domain); return; } ASSERT(regs == guest_cpu_user_regs()); regs->eip += inst_len; regs->eflags &= ~X86_EFLAGS_RF; curr->arch.hvm_svm.vmcb->interrupt_shadow = 0; if ( regs->eflags & X86_EFLAGS_TF ) hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE); } static void svm_cpu_down(void) { write_efer(read_efer() & ~EFER_SVME); } unsigned long * svm_msrbit(unsigned long *msr_bitmap, uint32_t msr) { unsigned long *msr_bit = NULL; /* * See AMD64 Programmers Manual, Vol 2, Section 15.10 (MSR-Bitmap Address). 
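 * The permissions map is laid out in 2KB chunks, one per MSR range handled
 * below, with two bits (read intercept, write intercept) per MSR.  As an
 * illustration: for EFER (MSR 0xc0000080) this returns a pointer at byte
 * offset 0x800 into the map, and svm_intercept_msr() then operates on bit
 * pair (0xc0000080 & 0x1fff) * 2 = 0x100/0x101 relative to that pointer.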
*/ if ( msr <= 0x1fff ) msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG; else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) ) msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG; else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) ) msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG; return msr_bit; } void svm_intercept_msr(struct vcpu *v, uint32_t msr, int flags) { unsigned long *msr_bit; msr_bit = svm_msrbit(v->arch.hvm_svm.msrpm, msr); BUG_ON(msr_bit == NULL); msr &= 0x1fff; if ( flags & MSR_INTERCEPT_READ ) __set_bit(msr * 2, msr_bit); else __clear_bit(msr * 2, msr_bit); if ( flags & MSR_INTERCEPT_WRITE ) __set_bit(msr * 2 + 1, msr_bit); else __clear_bit(msr * 2 + 1, msr_bit); } static void svm_save_dr(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; if ( !v->arch.hvm_vcpu.flag_dr_dirty ) return; /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */ v->arch.hvm_vcpu.flag_dr_dirty = 0; vmcb_set_dr_intercepts(vmcb, ~0u); v->arch.debugreg[0] = read_debugreg(0); v->arch.debugreg[1] = read_debugreg(1); v->arch.debugreg[2] = read_debugreg(2); v->arch.debugreg[3] = read_debugreg(3); v->arch.debugreg[6] = vmcb_get_dr6(vmcb); v->arch.debugreg[7] = vmcb_get_dr7(vmcb); } static void __restore_debug_registers(struct vmcb_struct *vmcb, struct vcpu *v) { if ( v->arch.hvm_vcpu.flag_dr_dirty ) return; v->arch.hvm_vcpu.flag_dr_dirty = 1; vmcb_set_dr_intercepts(vmcb, 0); write_debugreg(0, v->arch.debugreg[0]); write_debugreg(1, v->arch.debugreg[1]); write_debugreg(2, v->arch.debugreg[2]); write_debugreg(3, v->arch.debugreg[3]); vmcb_set_dr6(vmcb, v->arch.debugreg[6]); vmcb_set_dr7(vmcb, v->arch.debugreg[7]); } /* * DR7 is saved and restored on every vmexit. Other debug registers only * need to be restored if their value is going to affect execution -- i.e., * if one of the breakpoints is enabled. So mask out all bits that don't * enable some breakpoint functionality. 
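 * svm_restore_dr() below therefore only performs the lazy restore when DR7
 * has at least one breakpoint enabled.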
*/ static void svm_restore_dr(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) ) __restore_debug_registers(vmcb, v); } static int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; c->cr0 = v->arch.hvm_vcpu.guest_cr[0]; c->cr2 = v->arch.hvm_vcpu.guest_cr[2]; c->cr3 = v->arch.hvm_vcpu.guest_cr[3]; c->cr4 = v->arch.hvm_vcpu.guest_cr[4]; c->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs; c->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp; c->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip; c->pending_event = 0; c->error_code = 0; if ( vmcb->eventinj.fields.v && hvm_event_needs_reinjection(vmcb->eventinj.fields.type, vmcb->eventinj.fields.vector) ) { c->pending_event = (uint32_t)vmcb->eventinj.bytes; c->error_code = vmcb->eventinj.fields.errorcode; } return 1; } static int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c) { struct page_info *page = NULL; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; struct p2m_domain *p2m = p2m_get_hostp2m(v->domain); if ( c->pending_valid && ((c->pending_type == 1) || (c->pending_type > 6) || (c->pending_reserved != 0)) ) { gdprintk(XENLOG_ERR, "Invalid pending event %#"PRIx32".\n", c->pending_event); return -EINVAL; } if ( !paging_mode_hap(v->domain) ) { if ( c->cr0 & X86_CR0_PG ) { page = get_page_from_gfn(v->domain, c->cr3 >> PAGE_SHIFT, NULL, P2M_ALLOC); if ( !page ) { gdprintk(XENLOG_ERR, "Invalid CR3 value=%#"PRIx64"\n", c->cr3); return -EINVAL; } } if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG ) put_page(pagetable_get_page(v->arch.guest_table)); v->arch.guest_table = page ? pagetable_from_page(page) : pagetable_null(); } v->arch.hvm_vcpu.guest_cr[0] = c->cr0 | X86_CR0_ET; v->arch.hvm_vcpu.guest_cr[2] = c->cr2; v->arch.hvm_vcpu.guest_cr[3] = c->cr3; v->arch.hvm_vcpu.guest_cr[4] = c->cr4; svm_update_guest_cr(v, 0); svm_update_guest_cr(v, 2); svm_update_guest_cr(v, 4); /* Load sysenter MSRs into both VMCB save area and VCPU fields. 
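 * The cached copies in v->arch.hvm_svm let svm_msr_read_intercept() serve
 * SYSENTER MSR reads without having to sync the VMCB first.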
*/ vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = c->sysenter_cs; vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = c->sysenter_esp; vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = c->sysenter_eip; if ( paging_mode_hap(v->domain) ) { vmcb_set_np_enable(vmcb, 1); vmcb_set_g_pat(vmcb, MSR_IA32_CR_PAT_RESET /* guest PAT */); vmcb_set_h_cr3(vmcb, pagetable_get_paddr(p2m_get_pagetable(p2m))); } if ( c->pending_valid ) { gdprintk(XENLOG_INFO, "Re-injecting %#"PRIx32", %#"PRIx32"\n", c->pending_event, c->error_code); if ( hvm_event_needs_reinjection(c->pending_type, c->pending_vector) ) { vmcb->eventinj.bytes = c->pending_event; vmcb->eventinj.fields.errorcode = c->error_code; } } vmcb->cleanbits.bytes = 0; paging_update_paging_modes(v); return 0; } static void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; data->shadow_gs = vmcb->kerngsbase; data->msr_lstar = vmcb->lstar; data->msr_star = vmcb->star; data->msr_cstar = vmcb->cstar; data->msr_syscall_mask = vmcb->sfmask; data->msr_efer = v->arch.hvm_vcpu.guest_efer; data->msr_flags = -1ULL; data->tsc = hvm_get_guest_tsc(v); } static void svm_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; vmcb->kerngsbase = data->shadow_gs; vmcb->lstar = data->msr_lstar; vmcb->star = data->msr_star; vmcb->cstar = data->msr_cstar; vmcb->sfmask = data->msr_syscall_mask; v->arch.hvm_vcpu.guest_efer = data->msr_efer; svm_update_guest_efer(v); hvm_set_guest_tsc(v, data->tsc); } static void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt) { svm_save_cpu_state(v, ctxt); svm_vmcb_save(v, ctxt); } static int svm_load_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt) { svm_load_cpu_state(v, ctxt); if (svm_vmcb_restore(v, ctxt)) { gdprintk(XENLOG_ERR, "svm_vmcb restore failed!\n"); domain_crash(v->domain); return -EINVAL; } return 0; } static void svm_fpu_enter(struct vcpu *v) { struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx; vcpu_restore_fpu_lazy(v); vmcb_set_exception_intercepts( n1vmcb, vmcb_get_exception_intercepts(n1vmcb) & ~(1U << TRAP_no_device)); } static void svm_fpu_leave(struct vcpu *v) { struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx; ASSERT(!v->fpu_dirtied); ASSERT(read_cr0() & X86_CR0_TS); /* * If the guest does not have TS enabled then we must cause and handle an * exception on first use of the FPU. If the guest *does* have TS enabled * then this is not necessary: no FPU activity can occur until the guest * clears CR0.TS, and we will initialise the FPU when that happens. 
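 * Concretely: only while the guest's own CR0.TS is clear do we re-arm the
 * #NM intercept and force TS on in the n1 VMCB, so that the next FPU use
 * traps to Xen.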
*/ if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) { vmcb_set_exception_intercepts( n1vmcb, vmcb_get_exception_intercepts(n1vmcb) | (1U << TRAP_no_device)); vmcb_set_cr0(n1vmcb, vmcb_get_cr0(n1vmcb) | X86_CR0_TS); } } static unsigned int svm_get_interrupt_shadow(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned int intr_shadow = 0; if ( vmcb->interrupt_shadow ) intr_shadow |= HVM_INTR_SHADOW_MOV_SS | HVM_INTR_SHADOW_STI; if ( vmcb_get_general1_intercepts(vmcb) & GENERAL1_INTERCEPT_IRET ) intr_shadow |= HVM_INTR_SHADOW_NMI; return intr_shadow; } static void svm_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb); vmcb->interrupt_shadow = !!(intr_shadow & (HVM_INTR_SHADOW_MOV_SS|HVM_INTR_SHADOW_STI)); general1_intercepts &= ~GENERAL1_INTERCEPT_IRET; if ( intr_shadow & HVM_INTR_SHADOW_NMI ) general1_intercepts |= GENERAL1_INTERCEPT_IRET; vmcb_set_general1_intercepts(vmcb, general1_intercepts); } static int svm_guest_x86_mode(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) ) return 0; if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) ) return 1; if ( hvm_long_mode_enabled(v) && likely(vmcb->cs.attr.fields.l) ) return 8; return (likely(vmcb->cs.attr.fields.db) ? 4 : 2); } void svm_update_guest_cr(struct vcpu *v, unsigned int cr) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; uint64_t value; switch ( cr ) { case 0: { unsigned long hw_cr0_mask = 0; if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) { if ( v != current ) hw_cr0_mask |= X86_CR0_TS; else if ( vmcb_get_cr0(vmcb) & X86_CR0_TS ) svm_fpu_enter(v); } value = v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask; if ( !paging_mode_hap(v->domain) ) value |= X86_CR0_PG | X86_CR0_WP; vmcb_set_cr0(vmcb, value); break; } case 2: vmcb_set_cr2(vmcb, v->arch.hvm_vcpu.guest_cr[2]); break; case 3: vmcb_set_cr3(vmcb, v->arch.hvm_vcpu.hw_cr[3]); if ( !nestedhvm_enabled(v->domain) ) hvm_asid_flush_vcpu(v); else if ( nestedhvm_vmswitch_in_progress(v) ) ; /* CR3 switches during VMRUN/VMEXIT do not flush the TLB. */ else hvm_asid_flush_vcpu_asid( nestedhvm_vcpu_in_guestmode(v) ? 
&vcpu_nestedhvm(v).nv_n2asid : &v->arch.hvm_vcpu.n1asid); break; case 4: value = HVM_CR4_HOST_MASK; if ( paging_mode_hap(v->domain) ) value &= ~X86_CR4_PAE; value |= v->arch.hvm_vcpu.guest_cr[4]; vmcb_set_cr4(vmcb, value); break; default: BUG(); } } static void svm_update_guest_efer(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; bool_t lma = !!(v->arch.hvm_vcpu.guest_efer & EFER_LMA); uint64_t new_efer; new_efer = (v->arch.hvm_vcpu.guest_efer | EFER_SVME) & ~EFER_LME; if ( lma ) new_efer |= EFER_LME; vmcb_set_efer(vmcb, new_efer); } static void svm_sync_vmcb(struct vcpu *v) { struct arch_svm_struct *arch_svm = &v->arch.hvm_svm; if ( arch_svm->vmcb_in_sync ) return; arch_svm->vmcb_in_sync = 1; svm_vmsave(arch_svm->vmcb); } static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg, struct segment_register *reg) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; ASSERT((v == current) || !vcpu_runnable(v)); switch ( seg ) { case x86_seg_cs: memcpy(reg, &vmcb->cs, sizeof(*reg)); reg->attr.fields.g = reg->limit > 0xFFFFF; break; case x86_seg_ds: memcpy(reg, &vmcb->ds, sizeof(*reg)); if ( reg->attr.fields.type != 0 ) reg->attr.fields.type |= 0x1; break; case x86_seg_es: memcpy(reg, &vmcb->es, sizeof(*reg)); if ( reg->attr.fields.type != 0 ) reg->attr.fields.type |= 0x1; break; case x86_seg_fs: svm_sync_vmcb(v); memcpy(reg, &vmcb->fs, sizeof(*reg)); if ( reg->attr.fields.type != 0 ) reg->attr.fields.type |= 0x1; break; case x86_seg_gs: svm_sync_vmcb(v); memcpy(reg, &vmcb->gs, sizeof(*reg)); if ( reg->attr.fields.type != 0 ) reg->attr.fields.type |= 0x1; break; case x86_seg_ss: memcpy(reg, &vmcb->ss, sizeof(*reg)); reg->attr.fields.dpl = vmcb->_cpl; if ( reg->attr.fields.type == 0 ) reg->attr.fields.db = 0; break; case x86_seg_tr: svm_sync_vmcb(v); memcpy(reg, &vmcb->tr, sizeof(*reg)); reg->attr.fields.type |= 0x2; break; case x86_seg_gdtr: memcpy(reg, &vmcb->gdtr, sizeof(*reg)); break; case x86_seg_idtr: memcpy(reg, &vmcb->idtr, sizeof(*reg)); break; case x86_seg_ldtr: svm_sync_vmcb(v); memcpy(reg, &vmcb->ldtr, sizeof(*reg)); break; default: BUG(); } } static void svm_set_segment_register(struct vcpu *v, enum x86_segment seg, struct segment_register *reg) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; int sync = 0; ASSERT((v == current) || !vcpu_runnable(v)); switch ( seg ) { case x86_seg_cs: case x86_seg_ds: case x86_seg_es: case x86_seg_ss: /* cpl */ vmcb->cleanbits.fields.seg = 0; break; case x86_seg_gdtr: case x86_seg_idtr: vmcb->cleanbits.fields.dt = 0; break; case x86_seg_fs: case x86_seg_gs: case x86_seg_tr: case x86_seg_ldtr: sync = (v == current); break; default: break; } if ( sync ) svm_sync_vmcb(v); switch ( seg ) { case x86_seg_cs: memcpy(&vmcb->cs, reg, sizeof(*reg)); break; case x86_seg_ds: memcpy(&vmcb->ds, reg, sizeof(*reg)); break; case x86_seg_es: memcpy(&vmcb->es, reg, sizeof(*reg)); break; case x86_seg_fs: memcpy(&vmcb->fs, reg, sizeof(*reg)); break; case x86_seg_gs: memcpy(&vmcb->gs, reg, sizeof(*reg)); break; case x86_seg_ss: memcpy(&vmcb->ss, reg, sizeof(*reg)); vmcb->_cpl = vmcb->ss.attr.fields.dpl; break; case x86_seg_tr: memcpy(&vmcb->tr, reg, sizeof(*reg)); break; case x86_seg_gdtr: vmcb->gdtr.base = reg->base; vmcb->gdtr.limit = (uint16_t)reg->limit; break; case x86_seg_idtr: vmcb->idtr.base = reg->base; vmcb->idtr.limit = (uint16_t)reg->limit; break; case x86_seg_ldtr: memcpy(&vmcb->ldtr, reg, sizeof(*reg)); break; default: BUG(); } if ( sync ) svm_vmload(vmcb); } static unsigned long svm_get_shadow_gs_base(struct vcpu *v) { 
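    /* The shadow GS base (KernelGSbase) is held in the VMCB save area. */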
return v->arch.hvm_svm.vmcb->kerngsbase; } static int svm_set_guest_pat(struct vcpu *v, u64 gpat) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; if ( !paging_mode_hap(v->domain) ) return 0; vmcb_set_g_pat(vmcb, gpat); return 1; } static int svm_get_guest_pat(struct vcpu *v, u64 *gpat) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; if ( !paging_mode_hap(v->domain) ) return 0; *gpat = vmcb_get_g_pat(vmcb); return 1; } static uint64_t svm_get_tsc_offset(uint64_t host_tsc, uint64_t guest_tsc, uint64_t ratio) { uint64_t offset; if (ratio == DEFAULT_TSC_RATIO) return guest_tsc - host_tsc; /* calculate hi,lo parts in 64bits to prevent overflow */ offset = (((host_tsc >> 32U) * (ratio >> 32U)) << 32U) + (host_tsc & 0xffffffffULL) * (ratio & 0xffffffffULL); return guest_tsc - offset; } static void svm_set_tsc_offset(struct vcpu *v, u64 offset) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; struct vmcb_struct *n1vmcb, *n2vmcb; uint64_t n2_tsc_offset = 0; struct domain *d = v->domain; uint64_t host_tsc, guest_tsc; guest_tsc = hvm_get_guest_tsc(v); /* Re-adjust the offset value when TSC_RATIO is available */ if ( cpu_has_tsc_ratio && d->arch.vtsc ) { rdtscll(host_tsc); offset = svm_get_tsc_offset(host_tsc, guest_tsc, vcpu_tsc_ratio(v)); } if ( !nestedhvm_enabled(d) ) { vmcb_set_tsc_offset(vmcb, offset); return; } n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx; n2vmcb = vcpu_nestedhvm(v).nv_n2vmcx; if ( nestedhvm_vcpu_in_guestmode(v) ) { struct nestedsvm *svm = &vcpu_nestedsvm(v); n2_tsc_offset = vmcb_get_tsc_offset(n2vmcb) - vmcb_get_tsc_offset(n1vmcb); if ( svm->ns_tscratio != DEFAULT_TSC_RATIO ) { n2_tsc_offset = svm_get_tsc_offset(guest_tsc, guest_tsc + n2_tsc_offset, svm->ns_tscratio); } vmcb_set_tsc_offset(n1vmcb, offset); } vmcb_set_tsc_offset(vmcb, offset + n2_tsc_offset); } static void svm_set_rdtsc_exiting(struct vcpu *v, bool_t enable) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb); u32 general2_intercepts = vmcb_get_general2_intercepts(vmcb); general1_intercepts &= ~GENERAL1_INTERCEPT_RDTSC; general2_intercepts &= ~GENERAL2_INTERCEPT_RDTSCP; if ( enable && !cpu_has_tsc_ratio ) { general1_intercepts |= GENERAL1_INTERCEPT_RDTSC; general2_intercepts |= GENERAL2_INTERCEPT_RDTSCP; } vmcb_set_general1_intercepts(vmcb, general1_intercepts); vmcb_set_general2_intercepts(vmcb, general2_intercepts); } static unsigned int svm_get_insn_bytes(struct vcpu *v, uint8_t *buf) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned int len = v->arch.hvm_svm.cached_insn_len; if ( len != 0 ) { /* Latch and clear the cached instruction. */ memcpy(buf, vmcb->guest_ins, 15); v->arch.hvm_svm.cached_insn_len = 0; } return len; } static void svm_init_hypercall_page(struct domain *d, void *hypercall_page) { char *p; int i; for ( i = 0; i < (PAGE_SIZE / 32); i++ ) { if ( i == __HYPERVISOR_iret ) continue; p = (char *)(hypercall_page + (i * 32)); *(u8 *)(p + 0) = 0xb8; /* mov imm32, %eax */ *(u32 *)(p + 1) = i; *(u8 *)(p + 5) = 0x0f; /* vmmcall */ *(u8 *)(p + 6) = 0x01; *(u8 *)(p + 7) = 0xd9; *(u8 *)(p + 8) = 0xc3; /* ret */ } /* Don't support HYPERVISOR_iret at the moment */ *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */ } static void svm_lwp_interrupt(struct cpu_user_regs *regs) { struct vcpu *curr = current; ack_APIC_irq(); vlapic_set_irq( vcpu_vlapic(curr), (curr->arch.hvm_svm.guest_lwp_cfg >> 40) & 0xff, 0); } static inline void svm_lwp_save(struct vcpu *v) { /* Don't mess up with other guests. 
Disable LWP for next VCPU. */ if ( v->arch.hvm_svm.guest_lwp_cfg ) { wrmsrl(MSR_AMD64_LWP_CFG, 0x0); wrmsrl(MSR_AMD64_LWP_CBADDR, 0x0); } } static inline void svm_lwp_load(struct vcpu *v) { /* Only LWP_CFG is reloaded. LWP_CBADDR will be reloaded via xrstor. */ if ( v->arch.hvm_svm.guest_lwp_cfg ) wrmsrl(MSR_AMD64_LWP_CFG, v->arch.hvm_svm.cpu_lwp_cfg); } /* Update LWP_CFG MSR (0xc0000105). Return -1 if error; otherwise returns 0. */ static int svm_update_lwp_cfg(struct vcpu *v, uint64_t msr_content) { unsigned int edx; uint32_t msr_low; static uint8_t lwp_intr_vector; if ( xsave_enabled(v) && cpu_has_lwp ) { hvm_cpuid(0x8000001c, NULL, NULL, NULL, &edx); msr_low = (uint32_t)msr_content; /* generate #GP if guest tries to turn on unsupported features. */ if ( msr_low & ~edx) return -1; v->arch.hvm_svm.guest_lwp_cfg = msr_content; /* setup interrupt handler if needed */ if ( (msr_content & 0x80000000) && ((msr_content >> 40) & 0xff) ) { alloc_direct_apic_vector(&lwp_intr_vector, svm_lwp_interrupt); v->arch.hvm_svm.cpu_lwp_cfg = (msr_content & 0xffff00ffffffffffULL) | ((uint64_t)lwp_intr_vector << 40); } else { /* otherwise disable it */ v->arch.hvm_svm.cpu_lwp_cfg = msr_content & 0xffff00ff7fffffffULL; } wrmsrl(MSR_AMD64_LWP_CFG, v->arch.hvm_svm.cpu_lwp_cfg); /* track nonalzy state if LWP_CFG is non-zero. */ v->arch.nonlazy_xstate_used = !!(msr_content); } return 0; } static inline void svm_tsc_ratio_save(struct vcpu *v) { /* Other vcpus might not have vtsc enabled. So disable TSC_RATIO here. */ if ( cpu_has_tsc_ratio && v->domain->arch.vtsc ) wrmsrl(MSR_AMD64_TSC_RATIO, DEFAULT_TSC_RATIO); } static inline void svm_tsc_ratio_load(struct vcpu *v) { if ( cpu_has_tsc_ratio && v->domain->arch.vtsc ) wrmsrl(MSR_AMD64_TSC_RATIO, vcpu_tsc_ratio(v)); } static void svm_ctxt_switch_from(struct vcpu *v) { int cpu = smp_processor_id(); svm_fpu_leave(v); svm_save_dr(v); svm_lwp_save(v); svm_tsc_ratio_save(v); svm_sync_vmcb(v); svm_vmload(per_cpu(root_vmcb, cpu)); /* Resume use of ISTs now that the host TR is reinstated. */ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); } static void svm_ctxt_switch_to(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; int cpu = smp_processor_id(); /* * This is required, because VMRUN does consistency check * and some of the DOM0 selectors are pointing to * invalid GDT locations, and cause AMD processors * to shutdown. */ set_segment_register(ds, 0); set_segment_register(es, 0); set_segment_register(ss, 0); /* * Cannot use ISTs for NMI/#MC/#DF while we are running with the guest TR. * But this doesn't matter: the IST is only req'd to handle SYSCALL/SYSRET. 
*/ set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); svm_restore_dr(v); svm_vmsave(per_cpu(root_vmcb, cpu)); svm_vmload(vmcb); vmcb->cleanbits.bytes = 0; svm_lwp_load(v); svm_tsc_ratio_load(v); if ( cpu_has_rdtscp ) wrmsrl(MSR_TSC_AUX, hvm_msr_tsc_aux(v)); } static void svm_do_resume(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; bool_t debug_state = v->domain->debugger_attached; bool_t vcpu_guestmode = 0; if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) ) vcpu_guestmode = 1; if ( !vcpu_guestmode && unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) ) { uint32_t intercepts = vmcb_get_exception_intercepts(vmcb); uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3); v->arch.hvm_vcpu.debug_state_latch = debug_state; vmcb_set_exception_intercepts( vmcb, debug_state ? (intercepts | mask) : (intercepts & ~mask)); } if ( v->arch.hvm_svm.launch_core != smp_processor_id() ) { v->arch.hvm_svm.launch_core = smp_processor_id(); hvm_migrate_timers(v); hvm_migrate_pirqs(v); /* Migrating to another ASID domain. Request a new ASID. */ hvm_asid_flush_vcpu(v); } if ( !vcpu_guestmode ) { vintr_t intr; /* Reflect the vlapic's TPR in the hardware vtpr */ intr = vmcb_get_vintr(vmcb); intr.fields.tpr = (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4; vmcb_set_vintr(vmcb, intr); } hvm_do_resume(v); reset_stack_and_jump(svm_asm_do_resume); } static void svm_guest_osvw_init(struct vcpu *vcpu) { if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) return; /* * Guests should see errata 400 and 415 as fixed (assuming that * HLT and IO instructions are intercepted). */ vcpu->arch.hvm_svm.osvw.length = (osvw_length >= 3) ? osvw_length : 3; vcpu->arch.hvm_svm.osvw.status = osvw_status & ~(6ULL); /* * By increasing VCPU's osvw.length to 3 we are telling the guest that * all osvw.status bits inside that length, including bit 0 (which is * reserved for erratum 298), are valid. However, if host processor's * osvw_len is 0 then osvw_status[0] carries no information. We need to * be conservative here and therefore we tell the guest that erratum 298 * is present (because we really don't know). */ if ( osvw_length == 0 && boot_cpu_data.x86 == 0x10 ) vcpu->arch.hvm_svm.osvw.status |= 1; } void svm_host_osvw_reset() { spin_lock(&osvw_lock); osvw_length = 64; /* One register (MSRC001_0141) worth of errata */ osvw_status = 0; spin_unlock(&osvw_lock); } void svm_host_osvw_init() { spin_lock(&osvw_lock); /* * Get OSVW bits. If bits are not the same on different processors then * choose the worst case (i.e. if erratum is present on one processor and * not on another assume that the erratum is present everywhere). 
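 * Hence the code below takes the minimum of the per-CPU register lengths
 * and ORs the per-CPU status bits together.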
*/ if ( test_bit(X86_FEATURE_OSVW, &boot_cpu_data.x86_capability) ) { uint64_t len, status; if ( rdmsr_safe(MSR_AMD_OSVW_ID_LENGTH, len) || rdmsr_safe(MSR_AMD_OSVW_STATUS, status) ) len = status = 0; if (len < osvw_length) osvw_length = len; osvw_status |= status; osvw_status &= (1ULL << osvw_length) - 1; } else osvw_length = osvw_status = 0; spin_unlock(&osvw_lock); } static int svm_domain_initialise(struct domain *d) { return 0; } static void svm_domain_destroy(struct domain *d) { } static int svm_vcpu_initialise(struct vcpu *v) { int rc; v->arch.schedule_tail = svm_do_resume; v->arch.ctxt_switch_from = svm_ctxt_switch_from; v->arch.ctxt_switch_to = svm_ctxt_switch_to; v->arch.hvm_svm.launch_core = -1; if ( (rc = svm_create_vmcb(v)) != 0 ) { dprintk(XENLOG_WARNING, "Failed to create VMCB for vcpu %d: err=%d.\n", v->vcpu_id, rc); return rc; } vpmu_initialise(v); svm_guest_osvw_init(v); return 0; } static void svm_vcpu_destroy(struct vcpu *v) { vpmu_destroy(v); svm_destroy_vmcb(v); passive_domain_destroy(v); } static void svm_inject_trap(struct hvm_trap *trap) { struct vcpu *curr = current; struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb; eventinj_t event = vmcb->eventinj; struct hvm_trap _trap = *trap; switch ( _trap.vector ) { case TRAP_debug: if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF ) { __restore_debug_registers(vmcb, curr); vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000); } if ( cpu_has_monitor_trap_flag ) break; /* fall through */ case TRAP_int3: if ( curr->domain->debugger_attached ) { /* Debug/Int3: Trap to debugger. */ domain_pause_for_debugger(); return; } } if ( unlikely(event.fields.v) && (event.fields.type == X86_EVENTTYPE_HW_EXCEPTION) ) { _trap.vector = hvm_combine_hw_exceptions( event.fields.vector, _trap.vector); if ( _trap.vector == TRAP_double_fault ) _trap.error_code = 0; } event.bytes = 0; event.fields.v = 1; event.fields.type = X86_EVENTTYPE_HW_EXCEPTION; event.fields.vector = _trap.vector; event.fields.ev = (_trap.error_code != HVM_DELIVER_NO_ERROR_CODE); event.fields.errorcode = _trap.error_code; vmcb->eventinj = event; if ( _trap.vector == TRAP_page_fault ) { curr->arch.hvm_vcpu.guest_cr[2] = _trap.cr2; vmcb_set_cr2(vmcb, _trap.cr2); HVMTRACE_LONG_2D(PF_INJECT, _trap.error_code, TRC_PAR_LONG(_trap.cr2)); } else { HVMTRACE_2D(INJ_EXC, _trap.vector, _trap.error_code); } } static int svm_event_pending(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; return vmcb->eventinj.fields.v; } static void svm_cpu_dead(unsigned int cpu) { free_xenheap_page(per_cpu(hsa, cpu)); per_cpu(hsa, cpu) = NULL; free_vmcb(per_cpu(root_vmcb, cpu)); per_cpu(root_vmcb, cpu) = NULL; } static int svm_cpu_up_prepare(unsigned int cpu) { if ( ((per_cpu(hsa, cpu) == NULL) && ((per_cpu(hsa, cpu) = alloc_host_save_area()) == NULL)) || ((per_cpu(root_vmcb, cpu) == NULL) && ((per_cpu(root_vmcb, cpu) = alloc_vmcb()) == NULL)) ) { svm_cpu_dead(cpu); return -ENOMEM; } return 0; } static void svm_init_erratum_383(struct cpuinfo_x86 *c) { uint64_t msr_content; /* check whether CPU is affected */ if ( !cpu_has_amd_erratum(c, AMD_ERRATUM_383) ) return; /* use safe methods to be compatible with nested virtualization */ if (rdmsr_safe(MSR_AMD64_DC_CFG, msr_content) == 0 && wrmsr_safe(MSR_AMD64_DC_CFG, msr_content | (1ULL << 47)) == 0) { amd_erratum383_found = 1; } else { printk("Failed to enable erratum 383\n"); } } static int svm_handle_osvw(struct vcpu *v, uint32_t msr, uint64_t *val, bool_t read) { unsigned int ecx; /* Guest OSVW support */ hvm_cpuid(0x80000001, NULL, NULL, &ecx, 
NULL); if ( !test_bit((X86_FEATURE_OSVW & 31), &ecx) ) return -1; if ( read ) { if (msr == MSR_AMD_OSVW_ID_LENGTH) *val = v->arch.hvm_svm.osvw.length; else *val = v->arch.hvm_svm.osvw.status; } /* Writes are ignored */ return 0; } static int svm_cpu_up(void) { uint64_t msr_content; int rc, cpu = smp_processor_id(); struct cpuinfo_x86 *c = &cpu_data[cpu]; /* Check whether SVM feature is disabled in BIOS */ rdmsrl(MSR_K8_VM_CR, msr_content); if ( msr_content & K8_VMCR_SVME_DISABLE ) { printk("CPU%d: AMD SVM Extension is disabled in BIOS.\n", cpu); return -EINVAL; } if ( (rc = svm_cpu_up_prepare(cpu)) != 0 ) return rc; write_efer(read_efer() | EFER_SVME); /* Initialize the HSA for this core. */ wrmsrl(MSR_K8_VM_HSAVE_PA, (uint64_t)virt_to_maddr(per_cpu(hsa, cpu))); /* check for erratum 383 */ svm_init_erratum_383(c); /* Initialize core's ASID handling. */ svm_asid_init(c); /* * Check whether EFER.LMSLE can be written. * Unfortunately there's no feature bit defined for this. */ msr_content = read_efer(); if ( wrmsr_safe(MSR_EFER, msr_content | EFER_LMSLE) == 0 ) rdmsrl(MSR_EFER, msr_content); if ( msr_content & EFER_LMSLE ) { if ( c == &boot_cpu_data ) cpu_has_lmsl = 1; wrmsrl(MSR_EFER, msr_content ^ EFER_LMSLE); } else { if ( cpu_has_lmsl ) printk(XENLOG_WARNING "Inconsistent LMSLE support across CPUs!\n"); cpu_has_lmsl = 0; } /* Initialize OSVW bits to be used by guests */ svm_host_osvw_init(); return 0; } const struct hvm_function_table * __init start_svm(void) { bool_t printed = 0; svm_host_osvw_reset(); if ( svm_cpu_up() ) { printk("SVM: failed to initialise.\n"); return NULL; } setup_vmcb_dump(); svm_feature_flags = ((cpuid_eax(0x80000000) >= 0x8000000A) ? cpuid_edx(0x8000000A) : 0); printk("SVM: Supported advanced features:\n"); /* DecodeAssists fast paths assume nextrip is valid for fast rIP update. */ if ( !cpu_has_svm_nrips ) clear_bit(SVM_FEATURE_DECODEASSISTS, &svm_feature_flags); #define P(p,s) if ( p ) { printk(" - %s\n", s); printed = 1; } P(cpu_has_svm_npt, "Nested Page Tables (NPT)"); P(cpu_has_svm_lbrv, "Last Branch Record (LBR) Virtualisation"); P(cpu_has_svm_nrips, "Next-RIP Saved on #VMEXIT"); P(cpu_has_svm_cleanbits, "VMCB Clean Bits"); P(cpu_has_svm_decode, "DecodeAssists"); P(cpu_has_pause_filter, "Pause-Intercept Filter"); P(cpu_has_tsc_ratio, "TSC Rate MSR"); #undef P if ( !printed ) printk(" - none\n"); svm_function_table.hap_supported = cpu_has_svm_npt; svm_function_table.hap_capabilities = HVM_HAP_SUPERPAGE_2MB | ((cpuid_edx(0x80000001) & 0x04000000) ? HVM_HAP_SUPERPAGE_1GB : 0); return &svm_function_table; } static void svm_do_nested_pgfault(struct vcpu *v, struct cpu_user_regs *regs, uint32_t npfec, paddr_t gpa) { int ret; unsigned long gfn = gpa >> PAGE_SHIFT; mfn_t mfn; p2m_type_t p2mt; p2m_access_t p2ma; struct p2m_domain *p2m = NULL; ret = hvm_hap_nested_page_fault(gpa, 0, ~0ul, 1, /* All NPFs count as reads */ npfec & PFEC_write_access, npfec & PFEC_insn_fetch); if ( tb_init_done ) { struct { uint64_t gpa; uint64_t mfn; uint32_t qualification; uint32_t p2mt; } _d; p2m = p2m_get_p2m(v); _d.gpa = gpa; _d.qualification = 0; mfn = __get_gfn_type_access(p2m, gfn, &_d.p2mt, &p2ma, 0, NULL, 0); _d.mfn = mfn_x(mfn); __trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d); } switch (ret) { case 0: break; case 1: return; case -1: ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v)); /* inject #VMEXIT(NPF) into guest. */ nestedsvm_vmexit_defer(v, VMEXIT_NPF, npfec, gpa); return; } if ( p2m == NULL ) p2m = p2m_get_p2m(v); /* Everything else is an error. 
*/ mfn = __get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL, 0); gdprintk(XENLOG_ERR, "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n", gpa, mfn_x(mfn), p2mt); domain_crash(v->domain); } static void svm_fpu_dirty_intercept(void) { struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx; svm_fpu_enter(v); if ( vmcb != n1vmcb ) { /* Check if l1 guest must make FPU ready for the l2 guest */ if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS ) hvm_inject_hw_exception(TRAP_no_device, HVM_DELIVER_NO_ERROR_CODE); else vmcb_set_cr0(n1vmcb, vmcb_get_cr0(n1vmcb) & ~X86_CR0_TS); return; } if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) vmcb_set_cr0(vmcb, vmcb_get_cr0(vmcb) & ~X86_CR0_TS); } static void svm_cpuid_intercept( unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { unsigned int input = *eax; struct vcpu *v = current; hvm_cpuid(input, eax, ebx, ecx, edx); switch (input) { case 0x80000001: /* Fix up VLAPIC details. */ if ( vlapic_hw_disabled(vcpu_vlapic(v)) ) __clear_bit(X86_FEATURE_APIC & 31, edx); break; case 0x8000001c: { /* LWP capability CPUID */ uint64_t lwp_cfg = v->arch.hvm_svm.guest_lwp_cfg; if ( cpu_has_lwp ) { if ( !(v->arch.xcr0 & XSTATE_LWP) ) { *eax = 0x0; break; } /* turn on available bit and other features specified in lwp_cfg */ *eax = (*edx & lwp_cfg) | 0x00000001; } break; } default: break; } HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx); } static void svm_vmexit_do_cpuid(struct cpu_user_regs *regs) { unsigned int eax, ebx, ecx, edx, inst_len; if ( (inst_len = __get_instruction_length(current, INSTR_CPUID)) == 0 ) return; eax = regs->eax; ebx = regs->ebx; ecx = regs->ecx; edx = regs->edx; svm_cpuid_intercept(&eax, &ebx, &ecx, &edx); regs->eax = eax; regs->ebx = ebx; regs->ecx = ecx; regs->edx = edx; __update_guest_eip(regs, inst_len); } static void svm_vmexit_do_cr_access( struct vmcb_struct *vmcb, struct cpu_user_regs *regs) { int gp, cr, dir, rc; cr = vmcb->exitcode - VMEXIT_CR0_READ; dir = (cr > 15); cr &= 0xf; gp = vmcb->exitinfo1 & 0xf; rc = dir ? hvm_mov_to_cr(cr, gp) : hvm_mov_from_cr(cr, gp); if ( rc == X86EMUL_OKAY ) __update_guest_eip(regs, vmcb->nextrip - vmcb->rip); } static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs) { struct vmcb_struct *vmcb = vcpu_nestedhvm(v).nv_n1vmcx; HVMTRACE_0D(DR_WRITE); __restore_debug_registers(vmcb, v); } static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) { int ret; struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; switch ( msr ) { case MSR_IA32_SYSENTER_CS: *msr_content = v->arch.hvm_svm.guest_sysenter_cs; break; case MSR_IA32_SYSENTER_ESP: *msr_content = v->arch.hvm_svm.guest_sysenter_esp; break; case MSR_IA32_SYSENTER_EIP: *msr_content = v->arch.hvm_svm.guest_sysenter_eip; break; case MSR_IA32_MCx_MISC(4): /* Threshold register */ case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3: /* * MCA/MCE: We report that the threshold register is unavailable * for OS use (locked by the BIOS). */ *msr_content = 1ULL << 61; /* MC4_MISC.Locked */ break; case MSR_IA32_EBC_FREQUENCY_ID: /* * This Intel-only register may be accessed if this HVM guest * has been migrated from an Intel host. The value zero is not * particularly meaningful, but at least avoids the guest crashing! 
*/ *msr_content = 0; break; case MSR_IA32_DEBUGCTLMSR: *msr_content = vmcb_get_debugctlmsr(vmcb); break; case MSR_IA32_LASTBRANCHFROMIP: *msr_content = vmcb_get_lastbranchfromip(vmcb); break; case MSR_IA32_LASTBRANCHTOIP: *msr_content = vmcb_get_lastbranchtoip(vmcb); break; case MSR_IA32_LASTINTFROMIP: *msr_content = vmcb_get_lastintfromip(vmcb); break; case MSR_IA32_LASTINTTOIP: *msr_content = vmcb_get_lastinttoip(vmcb); break; case MSR_AMD64_LWP_CFG: *msr_content = v->arch.hvm_svm.guest_lwp_cfg; break; case MSR_K7_PERFCTR0: case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: case MSR_K7_PERFCTR3: case MSR_K7_EVNTSEL0: case MSR_K7_EVNTSEL1: case MSR_K7_EVNTSEL2: case MSR_K7_EVNTSEL3: case MSR_AMD_FAM15H_PERFCTR0: case MSR_AMD_FAM15H_PERFCTR1: case MSR_AMD_FAM15H_PERFCTR2: case MSR_AMD_FAM15H_PERFCTR3: case MSR_AMD_FAM15H_PERFCTR4: case MSR_AMD_FAM15H_PERFCTR5: case MSR_AMD_FAM15H_EVNTSEL0: case MSR_AMD_FAM15H_EVNTSEL1: case MSR_AMD_FAM15H_EVNTSEL2: case MSR_AMD_FAM15H_EVNTSEL3: case MSR_AMD_FAM15H_EVNTSEL4: case MSR_AMD_FAM15H_EVNTSEL5: vpmu_do_rdmsr(msr, msr_content); break; case MSR_AMD_OSVW_ID_LENGTH: case MSR_AMD_OSVW_STATUS: ret = svm_handle_osvw(v, msr, msr_content, 1); if ( ret < 0 ) goto gpf; break; default: ret = nsvm_rdmsr(v, msr, msr_content); if ( ret < 0 ) goto gpf; else if ( ret ) break; if ( rdmsr_viridian_regs(msr, msr_content) || rdmsr_hypervisor_regs(msr, msr_content) ) break; if ( rdmsr_safe(msr, *msr_content) == 0 ) break; if ( boot_cpu_data.x86 == 0xf && msr == MSR_F10_BU_CFG ) { /* Win2k8 x64 reads this MSR on revF chips, where it * wasn't publically available; it uses a magic constant * in %rdi as a password, which we don't have in * rdmsr_safe(). Since we'll ignore the later writes, * just use a plausible value here (the reset value from * rev10h chips) if the real CPU didn't provide one. 
*/ *msr_content = 0x0000000010200020ull; break; } goto gpf; } HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, msr_value=%"PRIx64, msr, *msr_content); return X86EMUL_OKAY; gpf: hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content) { int ret, result = X86EMUL_OKAY; struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; int sync = 0; switch ( msr ) { case MSR_IA32_SYSENTER_CS: case MSR_IA32_SYSENTER_ESP: case MSR_IA32_SYSENTER_EIP: sync = 1; break; default: break; } if ( sync ) svm_sync_vmcb(v); switch ( msr ) { case MSR_IA32_SYSENTER_CS: vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content; break; case MSR_IA32_SYSENTER_ESP: vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content; break; case MSR_IA32_SYSENTER_EIP: vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content; break; case MSR_IA32_DEBUGCTLMSR: vmcb_set_debugctlmsr(vmcb, msr_content); if ( !msr_content || !cpu_has_svm_lbrv ) break; vmcb->lbr_control.fields.enable = 1; svm_disable_intercept_for_msr(v, MSR_IA32_DEBUGCTLMSR); svm_disable_intercept_for_msr(v, MSR_IA32_LASTBRANCHFROMIP); svm_disable_intercept_for_msr(v, MSR_IA32_LASTBRANCHTOIP); svm_disable_intercept_for_msr(v, MSR_IA32_LASTINTFROMIP); svm_disable_intercept_for_msr(v, MSR_IA32_LASTINTTOIP); break; case MSR_IA32_LASTBRANCHFROMIP: vmcb_set_lastbranchfromip(vmcb, msr_content); break; case MSR_IA32_LASTBRANCHTOIP: vmcb_set_lastbranchtoip(vmcb, msr_content); break; case MSR_IA32_LASTINTFROMIP: vmcb_set_lastintfromip(vmcb, msr_content); break; case MSR_IA32_LASTINTTOIP: vmcb_set_lastinttoip(vmcb, msr_content); break; case MSR_AMD64_LWP_CFG: if ( svm_update_lwp_cfg(v, msr_content) < 0 ) goto gpf; break; case MSR_K7_PERFCTR0: case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: case MSR_K7_PERFCTR3: case MSR_K7_EVNTSEL0: case MSR_K7_EVNTSEL1: case MSR_K7_EVNTSEL2: case MSR_K7_EVNTSEL3: case MSR_AMD_FAM15H_PERFCTR0: case MSR_AMD_FAM15H_PERFCTR1: case MSR_AMD_FAM15H_PERFCTR2: case MSR_AMD_FAM15H_PERFCTR3: case MSR_AMD_FAM15H_PERFCTR4: case MSR_AMD_FAM15H_PERFCTR5: case MSR_AMD_FAM15H_EVNTSEL0: case MSR_AMD_FAM15H_EVNTSEL1: case MSR_AMD_FAM15H_EVNTSEL2: case MSR_AMD_FAM15H_EVNTSEL3: case MSR_AMD_FAM15H_EVNTSEL4: case MSR_AMD_FAM15H_EVNTSEL5: vpmu_do_wrmsr(msr, msr_content); break; case MSR_IA32_MCx_MISC(4): /* Threshold register */ case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3: /* * MCA/MCE: Threshold register is reported to be locked, so we ignore * all write accesses. This behaviour matches real HW, so guests should * have no problem with this. 
*/ break; case MSR_AMD_OSVW_ID_LENGTH: case MSR_AMD_OSVW_STATUS: ret = svm_handle_osvw(v, msr, &msr_content, 0); if ( ret < 0 ) goto gpf; break; default: ret = nsvm_wrmsr(v, msr, msr_content); if ( ret < 0 ) goto gpf; else if ( ret ) break; if ( wrmsr_viridian_regs(msr, msr_content) ) break; switch ( wrmsr_hypervisor_regs(msr, msr_content) ) { case -EAGAIN: result = X86EMUL_RETRY; break; case 0: case 1: break; default: goto gpf; } break; } if ( sync ) svm_vmload(vmcb); return result; gpf: hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } static void svm_do_msr_access(struct cpu_user_regs *regs) { int rc, inst_len; struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; uint64_t msr_content; if ( vmcb->exitinfo1 == 0 ) { if ( (inst_len = __get_instruction_length(v, INSTR_RDMSR)) == 0 ) return; rc = hvm_msr_read_intercept(regs->ecx, &msr_content); regs->eax = (uint32_t)msr_content; regs->edx = (uint32_t)(msr_content >> 32); } else { if ( (inst_len = __get_instruction_length(v, INSTR_WRMSR)) == 0 ) return; msr_content = ((uint64_t)regs->edx << 32) | (uint32_t)regs->eax; rc = hvm_msr_write_intercept(regs->ecx, msr_content); } if ( rc == X86EMUL_OKAY ) __update_guest_eip(regs, inst_len); } static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb, struct cpu_user_regs *regs) { unsigned int inst_len; if ( (inst_len = __get_instruction_length(current, INSTR_HLT)) == 0 ) return; __update_guest_eip(regs, inst_len); hvm_hlt(regs->eflags); } static void svm_vmexit_do_rdtsc(struct cpu_user_regs *regs) { unsigned int inst_len; if ( (inst_len = __get_instruction_length(current, INSTR_RDTSC)) == 0 ) return; __update_guest_eip(regs, inst_len); hvm_rdtsc_intercept(regs); } static void svm_vmexit_do_pause(struct cpu_user_regs *regs) { unsigned int inst_len; if ( (inst_len = __get_instruction_length(current, INSTR_PAUSE)) == 0 ) return; __update_guest_eip(regs, inst_len); /* * The guest is running a contended spinlock and we've detected it. 
* Do something useful, like reschedule the guest */ perfc_incr(pauseloop_exits); do_sched_op_compat(SCHEDOP_yield, 0); } static void svm_vmexit_do_vmrun(struct cpu_user_regs *regs, struct vcpu *v, uint64_t vmcbaddr) { if ( !nsvm_efer_svm_enabled(v) ) { gdprintk(XENLOG_ERR, "VMRUN: nestedhvm disabled, injecting #UD\n"); hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); return; } if ( !nestedsvm_vmcb_map(v, vmcbaddr) ) { gdprintk(XENLOG_ERR, "VMRUN: mapping vmcb failed, injecting #GP\n"); hvm_inject_hw_exception(TRAP_gp_fault, HVM_DELIVER_NO_ERROR_CODE); return; } vcpu_nestedhvm(v).nv_vmentry_pending = 1; return; } static struct page_info * nsvm_get_nvmcb_page(struct vcpu *v, uint64_t vmcbaddr) { p2m_type_t p2mt; struct page_info *page; struct nestedvcpu *nv = &vcpu_nestedhvm(v); if ( !nestedsvm_vmcb_map(v, vmcbaddr) ) return NULL; /* Need to translate L1-GPA to MPA */ page = get_page_from_gfn(v->domain, nv->nv_vvmcxaddr >> PAGE_SHIFT, &p2mt, P2M_ALLOC | P2M_UNSHARE); if ( !page ) return NULL; if ( !p2m_is_ram(p2mt) || p2m_is_readonly(p2mt) ) { put_page(page); return NULL; } return page; } static void svm_vmexit_do_vmload(struct vmcb_struct *vmcb, struct cpu_user_regs *regs, struct vcpu *v, uint64_t vmcbaddr) { int ret; unsigned int inst_len; struct page_info *page; if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 ) return; if ( !nsvm_efer_svm_enabled(v) ) { gdprintk(XENLOG_ERR, "VMLOAD: nestedhvm disabled, injecting #UD\n"); ret = TRAP_invalid_op; goto inject; } page = nsvm_get_nvmcb_page(v, vmcbaddr); if ( !page ) { gdprintk(XENLOG_ERR, "VMLOAD: mapping failed, injecting #GP\n"); ret = TRAP_gp_fault; goto inject; } svm_vmload_pa(page_to_maddr(page)); put_page(page); /* State in L1 VMCB is stale now */ v->arch.hvm_svm.vmcb_in_sync = 0; __update_guest_eip(regs, inst_len); return; inject: hvm_inject_hw_exception(ret, HVM_DELIVER_NO_ERROR_CODE); return; } static void svm_vmexit_do_vmsave(struct vmcb_struct *vmcb, struct cpu_user_regs *regs, struct vcpu *v, uint64_t vmcbaddr) { int ret; unsigned int inst_len; struct page_info *page; if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 ) return; if ( !nsvm_efer_svm_enabled(v) ) { gdprintk(XENLOG_ERR, "VMSAVE: nestedhvm disabled, injecting #UD\n"); ret = TRAP_invalid_op; goto inject; } page = nsvm_get_nvmcb_page(v, vmcbaddr); if ( !page ) { gdprintk(XENLOG_ERR, "VMSAVE: mapping vmcb failed, injecting #GP\n"); ret = TRAP_gp_fault; goto inject; } svm_vmsave_pa(page_to_maddr(page)); put_page(page); __update_guest_eip(regs, inst_len); return; inject: hvm_inject_hw_exception(ret, HVM_DELIVER_NO_ERROR_CODE); return; } static void svm_vmexit_ud_intercept(struct cpu_user_regs *regs) { struct hvm_emulate_ctxt ctxt; int rc; hvm_emulate_prepare(&ctxt, regs); rc = hvm_emulate_one(&ctxt); switch ( rc ) { case X86EMUL_UNHANDLEABLE: hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); break; case X86EMUL_EXCEPTION: if ( ctxt.exn_pending ) hvm_inject_hw_exception(ctxt.exn_vector, ctxt.exn_error_code); /* fall through */ default: hvm_emulate_writeback(&ctxt); break; } } static int svm_is_erratum_383(struct cpu_user_regs *regs) { uint64_t msr_content; uint32_t i; struct vcpu *v = current; if ( !amd_erratum383_found ) return 0; rdmsrl(MSR_IA32_MC0_STATUS, msr_content); /* Bit 62 may or may not be set for this mce */ msr_content &= ~(1ULL << 62); if ( msr_content != 0xb600000000010015ULL ) return 0; /* Clear MCi_STATUS registers */ for (i = 0; i < nr_mce_banks; i++) wrmsrl(MSR_IA32_MCx_STATUS(i), 0ULL); 
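    /* Also clear MCIP (bit 2) in MCG_STATUS so the machine check is not
     * left marked as in-progress. */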
rdmsrl(MSR_IA32_MCG_STATUS, msr_content); wrmsrl(MSR_IA32_MCG_STATUS, msr_content & ~(1ULL << 2)); /* flush TLB */ flush_tlb_mask(v->domain->domain_dirty_cpumask); return 1; } static void svm_vmexit_mce_intercept( struct vcpu *v, struct cpu_user_regs *regs) { if ( svm_is_erratum_383(regs) ) { gdprintk(XENLOG_ERR, "SVM hits AMD erratum 383\n"); domain_crash(v->domain); } } static void wbinvd_ipi(void *info) { wbinvd(); } static void svm_wbinvd_intercept(void) { if ( cache_flush_permitted(current->domain) ) on_each_cpu(wbinvd_ipi, NULL, 1); } static void svm_vmexit_do_invalidate_cache(struct cpu_user_regs *regs) { static const enum instruction_index list[] = { INSTR_INVD, INSTR_WBINVD }; int inst_len; inst_len = __get_instruction_length_from_list( current, list, ARRAY_SIZE(list)); if ( inst_len == 0 ) return; svm_wbinvd_intercept(); __update_guest_eip(regs, inst_len); } static void svm_invlpga_intercept( struct vcpu *v, unsigned long vaddr, uint32_t asid) { svm_invlpga(vaddr, (asid == 0) ? v->arch.hvm_vcpu.n1asid.asid : vcpu_nestedhvm(v).nv_n2asid.asid); } static void svm_invlpg_intercept(unsigned long vaddr) { struct vcpu *curr = current; HVMTRACE_LONG_2D(INVLPG, 0, TRC_PAR_LONG(vaddr)); paging_invlpg(curr, vaddr); svm_asid_g_invlpg(curr, vaddr); } static struct hvm_function_table __initdata svm_function_table = { .name = "SVM", .cpu_up_prepare = svm_cpu_up_prepare, .cpu_dead = svm_cpu_dead, .cpu_up = svm_cpu_up, .cpu_down = svm_cpu_down, .domain_initialise = svm_domain_initialise, .domain_destroy = svm_domain_destroy, .vcpu_initialise = svm_vcpu_initialise, .vcpu_destroy = svm_vcpu_destroy, .save_cpu_ctxt = svm_save_vmcb_ctxt, .load_cpu_ctxt = svm_load_vmcb_ctxt, .get_interrupt_shadow = svm_get_interrupt_shadow, .set_interrupt_shadow = svm_set_interrupt_shadow, .guest_x86_mode = svm_guest_x86_mode, .get_segment_register = svm_get_segment_register, .set_segment_register = svm_set_segment_register, .get_shadow_gs_base = svm_get_shadow_gs_base, .update_guest_cr = svm_update_guest_cr, .update_guest_efer = svm_update_guest_efer, .set_guest_pat = svm_set_guest_pat, .get_guest_pat = svm_get_guest_pat, .set_tsc_offset = svm_set_tsc_offset, .inject_trap = svm_inject_trap, .init_hypercall_page = svm_init_hypercall_page, .event_pending = svm_event_pending, .cpuid_intercept = svm_cpuid_intercept, .wbinvd_intercept = svm_wbinvd_intercept, .fpu_dirty_intercept = svm_fpu_dirty_intercept, .msr_read_intercept = svm_msr_read_intercept, .msr_write_intercept = svm_msr_write_intercept, .invlpg_intercept = svm_invlpg_intercept, .set_rdtsc_exiting = svm_set_rdtsc_exiting, .get_insn_bytes = svm_get_insn_bytes, .nhvm_vcpu_initialise = nsvm_vcpu_initialise, .nhvm_vcpu_destroy = nsvm_vcpu_destroy, .nhvm_vcpu_reset = nsvm_vcpu_reset, .nhvm_vcpu_hostrestore = nsvm_vcpu_hostrestore, .nhvm_vcpu_vmexit = nsvm_vcpu_vmexit_inject, .nhvm_vcpu_vmexit_trap = nsvm_vcpu_vmexit_trap, .nhvm_vcpu_guestcr3 = nsvm_vcpu_guestcr3, .nhvm_vcpu_p2m_base = nsvm_vcpu_hostcr3, .nhvm_vcpu_asid = nsvm_vcpu_asid, .nhvm_vmcx_guest_intercepts_trap = nsvm_vmcb_guest_intercepts_trap, .nhvm_vmcx_hap_enabled = nsvm_vmcb_hap_enabled, .nhvm_intr_blocked = nsvm_intr_blocked, .nhvm_hap_walk_L1_p2m = nsvm_hap_walk_L1_p2m, }; void svm_vmexit_handler(struct cpu_user_regs *regs) { uint64_t exit_reason; struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; eventinj_t eventinj; int inst_len, rc; vintr_t intr; bool_t vcpu_guestmode = 0; hvm_invalidate_regs_fields(regs); if ( paging_mode_hap(v->domain) ) v->arch.hvm_vcpu.guest_cr[3] = 
v->arch.hvm_vcpu.hw_cr[3] = vmcb_get_cr3(vmcb); if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) ) vcpu_guestmode = 1; /* * Before doing anything else, we need to sync up the VLAPIC's TPR with * SVM's vTPR. It's OK if the guest doesn't touch CR8 (e.g. 32-bit Windows) * because we update the vTPR on MMIO writes to the TPR. * NB. We need to preserve the low bits of the TPR to make checked builds * of Windows work, even though they don't actually do anything. */ if ( !vcpu_guestmode ) { intr = vmcb_get_vintr(vmcb); vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI, ((intr.fields.tpr & 0x0F) << 4) | (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0x0F)); } exit_reason = vmcb->exitcode; if ( hvm_long_mode_enabled(v) ) HVMTRACE_ND(VMEXIT64, vcpu_guestmode ? TRC_HVM_NESTEDFLAG : 0, 1/*cycles*/, 3, exit_reason, (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32), 0, 0, 0); else HVMTRACE_ND(VMEXIT, vcpu_guestmode ? TRC_HVM_NESTEDFLAG : 0, 1/*cycles*/, 2, exit_reason, (uint32_t)regs->eip, 0, 0, 0, 0); if ( vcpu_guestmode ) { enum nestedhvm_vmexits nsret; struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct vmcb_struct *ns_vmcb = nv->nv_vvmcx; uint64_t exitinfo1, exitinfo2; paging_update_nestedmode(v); /* Write real exitinfo1 back into virtual vmcb. * nestedsvm_check_intercepts() expects to have the correct * exitinfo1 value there. */ exitinfo1 = ns_vmcb->exitinfo1; ns_vmcb->exitinfo1 = vmcb->exitinfo1; nsret = nestedsvm_check_intercepts(v, regs, exit_reason); switch (nsret) { case NESTEDHVM_VMEXIT_CONTINUE: BUG(); break; case NESTEDHVM_VMEXIT_HOST: break; case NESTEDHVM_VMEXIT_INJECT: /* Switch vcpu from l2 to l1 guest. We must perform * the switch here to have svm_do_resume() working * as intended. */ exitinfo1 = vmcb->exitinfo1; exitinfo2 = vmcb->exitinfo2; nv->nv_vmswitch_in_progress = 1; nsret = nestedsvm_vmexit_n2n1(v, regs); nv->nv_vmswitch_in_progress = 0; switch (nsret) { case NESTEDHVM_VMEXIT_DONE: /* defer VMEXIT injection */ nestedsvm_vmexit_defer(v, exit_reason, exitinfo1, exitinfo2); goto out; case NESTEDHVM_VMEXIT_FATALERROR: gdprintk(XENLOG_ERR, "unexpected nestedsvm_vmexit() error\n"); goto exit_and_crash; default: BUG(); case NESTEDHVM_VMEXIT_ERROR: break; } case NESTEDHVM_VMEXIT_ERROR: gdprintk(XENLOG_ERR, "nestedsvm_check_intercepts() returned NESTEDHVM_VMEXIT_ERROR\n"); goto out; case NESTEDHVM_VMEXIT_FATALERROR: gdprintk(XENLOG_ERR, "unexpected nestedsvm_check_intercepts() error\n"); goto exit_and_crash; default: gdprintk(XENLOG_INFO, "nestedsvm_check_intercepts() returned %i\n", nsret); goto exit_and_crash; } } if ( unlikely(exit_reason == VMEXIT_INVALID) ) { svm_vmcb_dump(__func__, vmcb); goto exit_and_crash; } perfc_incra(svmexits, exit_reason); hvm_maybe_deassert_evtchn_irq(); vmcb->cleanbits.bytes = cpu_has_svm_cleanbits ? ~0u : 0u; /* Event delivery caused this intercept? Queue for redelivery. */ eventinj = vmcb->exitintinfo; if ( unlikely(eventinj.fields.v) && hvm_event_needs_reinjection(eventinj.fields.type, eventinj.fields.vector) ) vmcb->eventinj = eventinj; switch ( exit_reason ) { case VMEXIT_INTR: /* Asynchronous event, handled when we STGI'd after the VMEXIT. */ HVMTRACE_0D(INTR); break; case VMEXIT_NMI: /* Asynchronous event, handled when we STGI'd after the VMEXIT. */ HVMTRACE_0D(NMI); break; case VMEXIT_SMI: /* Asynchronous event, handled when we STGI'd after the VMEXIT. 
*/ HVMTRACE_0D(SMI); break; case VMEXIT_EXCEPTION_DB: if ( !v->domain->debugger_attached ) goto exit_and_crash; domain_pause_for_debugger(); break; case VMEXIT_EXCEPTION_BP: if ( !v->domain->debugger_attached ) goto exit_and_crash; /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. */ if ( (inst_len = __get_instruction_length(v, INSTR_INT3)) == 0 ) break; __update_guest_eip(regs, inst_len); current->arch.gdbsx_vcpu_event = TRAP_int3; domain_pause_for_debugger(); break; case VMEXIT_EXCEPTION_NM: svm_fpu_dirty_intercept(); break; case VMEXIT_EXCEPTION_PF: { unsigned long va; va = vmcb->exitinfo2; regs->error_code = vmcb->exitinfo1; HVM_DBG_LOG(DBG_LEVEL_VMMU, "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx", (unsigned long)regs->eax, (unsigned long)regs->ebx, (unsigned long)regs->ecx, (unsigned long)regs->edx, (unsigned long)regs->esi, (unsigned long)regs->edi); if ( cpu_has_svm_decode ) v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf; rc = paging_fault(va, regs); v->arch.hvm_svm.cached_insn_len = 0; if ( rc ) { if ( trace_will_trace_event(TRC_SHADOW) ) break; if ( hvm_long_mode_enabled(v) ) HVMTRACE_LONG_2D(PF_XEN, regs->error_code, TRC_PAR_LONG(va)); else HVMTRACE_2D(PF_XEN, regs->error_code, va); break; } hvm_inject_page_fault(regs->error_code, va); break; } case VMEXIT_EXCEPTION_UD: svm_vmexit_ud_intercept(regs); break; /* Asynchronous event, handled when we STGI'd after the VMEXIT. */ case VMEXIT_EXCEPTION_MC: HVMTRACE_0D(MCE); svm_vmexit_mce_intercept(v, regs); break; case VMEXIT_VINTR: { u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb); intr = vmcb_get_vintr(vmcb); intr.fields.irq = 0; general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR; vmcb_set_vintr(vmcb, intr); vmcb_set_general1_intercepts(vmcb, general1_intercepts); break; } case VMEXIT_INVD: case VMEXIT_WBINVD: svm_vmexit_do_invalidate_cache(regs); break; case VMEXIT_TASK_SWITCH: { enum hvm_task_switch_reason reason; int32_t errcode = -1; if ( (vmcb->exitinfo2 >> 36) & 1 ) reason = TSW_iret; else if ( (vmcb->exitinfo2 >> 38) & 1 ) reason = TSW_jmp; else reason = TSW_call_or_int; if ( (vmcb->exitinfo2 >> 44) & 1 ) errcode = (uint32_t)vmcb->exitinfo2; /* * Some processors set the EXITINTINFO field when the task switch * is caused by a task gate in the IDT. In this case we will be * emulating the event injection, so we do not want the processor * to re-inject the original event! */ vmcb->eventinj.bytes = 0; hvm_task_switch((uint16_t)vmcb->exitinfo1, reason, errcode); break; } case VMEXIT_CPUID: svm_vmexit_do_cpuid(regs); break; case VMEXIT_HLT: svm_vmexit_do_hlt(vmcb, regs); break; case VMEXIT_IOIO: if ( (vmcb->exitinfo1 & (1u<<2)) == 0 ) { uint16_t port = (vmcb->exitinfo1 >> 16) & 0xFFFF; int bytes = ((vmcb->exitinfo1 >> 4) & 0x07); int dir = (vmcb->exitinfo1 & 1) ? IOREQ_READ : IOREQ_WRITE; if ( handle_pio(port, bytes, dir) ) __update_guest_eip(regs, vmcb->exitinfo2 - vmcb->rip); } else if ( !handle_mmio() ) hvm_inject_hw_exception(TRAP_gp_fault, 0); break; case VMEXIT_CR0_READ ... VMEXIT_CR15_READ: case VMEXIT_CR0_WRITE ... 
VMEXIT_CR15_WRITE: if ( cpu_has_svm_decode && (vmcb->exitinfo1 & (1ULL << 63)) ) svm_vmexit_do_cr_access(vmcb, regs); else if ( !handle_mmio() ) hvm_inject_hw_exception(TRAP_gp_fault, 0); break; case VMEXIT_INVLPG: if ( cpu_has_svm_decode ) { svm_invlpg_intercept(vmcb->exitinfo1); __update_guest_eip(regs, vmcb->nextrip - vmcb->rip); } else if ( !handle_mmio() ) hvm_inject_hw_exception(TRAP_gp_fault, 0); break; case VMEXIT_INVLPGA: if ( (inst_len = __get_instruction_length(v, INSTR_INVLPGA)) == 0 ) break; svm_invlpga_intercept(v, regs->eax, regs->ecx); __update_guest_eip(regs, inst_len); break; case VMEXIT_VMMCALL: if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 ) break; BUG_ON(vcpu_guestmode); HVMTRACE_1D(VMMCALL, regs->eax); rc = hvm_do_hypercall(regs); if ( rc != HVM_HCALL_preempted ) { __update_guest_eip(regs, inst_len); if ( rc == HVM_HCALL_invalidate ) send_invalidate_req(); } break; case VMEXIT_DR0_READ ... VMEXIT_DR7_READ: case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE: svm_dr_access(v, regs); break; case VMEXIT_MSR: svm_do_msr_access(regs); break; case VMEXIT_SHUTDOWN: hvm_triple_fault(); break; case VMEXIT_RDTSCP: regs->ecx = hvm_msr_tsc_aux(v); /* fall through */ case VMEXIT_RDTSC: svm_vmexit_do_rdtsc(regs); break; case VMEXIT_MONITOR: case VMEXIT_MWAIT: hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); break; case VMEXIT_VMRUN: svm_vmexit_do_vmrun(regs, v, regs->eax); break; case VMEXIT_VMLOAD: svm_vmexit_do_vmload(vmcb, regs, v, regs->eax); break; case VMEXIT_VMSAVE: svm_vmexit_do_vmsave(vmcb, regs, v, regs->eax); break; case VMEXIT_STGI: svm_vmexit_do_stgi(regs, v); break; case VMEXIT_CLGI: svm_vmexit_do_clgi(regs, v); break; case VMEXIT_SKINIT: hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE); break; case VMEXIT_XSETBV: if ( (inst_len = __get_instruction_length(current, INSTR_XSETBV))==0 ) break; if ( hvm_handle_xsetbv(regs->ecx, (regs->rdx << 32) | regs->_eax) == 0 ) __update_guest_eip(regs, inst_len); break; case VMEXIT_NPF: perfc_incra(svmexits, VMEXIT_NPF_PERFC); if ( cpu_has_svm_decode ) v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf; svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2); v->arch.hvm_svm.cached_insn_len = 0; break; case VMEXIT_IRET: { u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb); /* * IRET clears the NMI mask. However because we clear the mask * /before/ executing IRET, we set the interrupt shadow to prevent * a pending NMI from being injected immediately. This will work * perfectly unless the IRET instruction faults: in that case we * may inject an NMI before the NMI handler's IRET instruction is * retired. */ general1_intercepts &= ~GENERAL1_INTERCEPT_IRET; vmcb->interrupt_shadow = 1; vmcb_set_general1_intercepts(vmcb, general1_intercepts); break; } case VMEXIT_PAUSE: svm_vmexit_do_pause(regs); break; default: exit_and_crash: gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = %#"PRIx64", " "exitinfo1 = %#"PRIx64", exitinfo2 = %#"PRIx64"\n", exit_reason, (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2); domain_crash(v->domain); break; } out: if ( vcpu_guestmode ) /* Don't clobber TPR of the nested guest. */ return; /* The exit may have updated the TPR: reflect this in the hardware vtpr */ intr = vmcb_get_vintr(vmcb); intr.fields.tpr = (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4; vmcb_set_vintr(vmcb, intr); } void svm_trace_vmentry(void) { struct vcpu *curr = current; HVMTRACE_ND(VMENTRY, nestedhvm_vcpu_in_guestmode(curr) ? 
TRC_HVM_NESTEDFLAG : 0, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/svm/svmdebug.c0000664000175000017500000001600412307313555016157 0ustar smbsmb/* * svmdebug.c: debug functions * Copyright (c) 2011, Advanced Micro Devices, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #include #include #include static void svm_dump_sel(const char *name, svm_segment_register_t *s) { printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", name, s->sel, s->attr.bytes, s->limit, (unsigned long long)s->base); } /* This function can directly access fields which are covered by clean bits. */ void svm_vmcb_dump(const char *from, struct vmcb_struct *vmcb) { printk("Dumping guest's current state at %s...\n", from); printk("Size of VMCB = %d, paddr = %#lx, vaddr = %p\n", (int) sizeof(struct vmcb_struct), virt_to_maddr(vmcb), vmcb); printk("cr_intercepts = %#x dr_intercepts = %#x " "exception_intercepts = %#x\n", vmcb->_cr_intercepts, vmcb->_dr_intercepts, vmcb->_exception_intercepts); printk("general1_intercepts = %#x general2_intercepts = %#x\n", vmcb->_general1_intercepts, vmcb->_general2_intercepts); printk("iopm_base_pa = %#Lx msrpm_base_pa = %#Lx tsc_offset = %#Lx\n", (unsigned long long)vmcb->_iopm_base_pa, (unsigned long long)vmcb->_msrpm_base_pa, (unsigned long long)vmcb->_tsc_offset); printk("tlb_control = %#x vintr = %#Lx interrupt_shadow = %#Lx\n", vmcb->tlb_control, (unsigned long long)vmcb->_vintr.bytes, (unsigned long long)vmcb->interrupt_shadow); printk("exitcode = %#Lx exitintinfo = %#Lx\n", (unsigned long long)vmcb->exitcode, (unsigned long long)vmcb->exitintinfo.bytes); printk("exitinfo1 = %#Lx exitinfo2 = %#Lx \n", (unsigned long long)vmcb->exitinfo1, (unsigned long long)vmcb->exitinfo2); printk("np_enable = %Lx guest_asid = %#x\n", (unsigned long long)vmcb->_np_enable, vmcb->_guest_asid); printk("cpl = %d efer = %#Lx star = %#Lx lstar = %#Lx\n", vmcb->_cpl, (unsigned long long)vmcb->_efer, (unsigned long long)vmcb->star, (unsigned long long)vmcb->lstar); printk("CR0 = 0x%016llx CR2 = 0x%016llx\n", (unsigned long long)vmcb->_cr0, (unsigned long long)vmcb->_cr2); printk("CR3 = 0x%016llx CR4 = 0x%016llx\n", (unsigned long long)vmcb->_cr3, (unsigned long long)vmcb->_cr4); printk("RSP = 0x%016llx RIP = 0x%016llx\n", (unsigned long long)vmcb->rsp, (unsigned long long)vmcb->rip); printk("RAX = 0x%016llx RFLAGS=0x%016llx\n", (unsigned long long)vmcb->rax, (unsigned long long)vmcb->rflags); printk("DR6 = 0x%016llx, DR7 = 0x%016llx\n", (unsigned long long)vmcb->_dr6, (unsigned long long)vmcb->_dr7); printk("CSTAR = 0x%016llx SFMask = 0x%016llx\n", (unsigned long long)vmcb->cstar, (unsigned long long)vmcb->sfmask); printk("KernGSBase = 0x%016llx PAT = 0x%016llx \n", (unsigned long long)vmcb->kerngsbase, (unsigned long long)vmcb->_g_pat); printk("H_CR3 
= 0x%016llx CleanBits = %#x\n", (unsigned long long)vmcb->_h_cr3, vmcb->cleanbits.bytes); /* print out all the selectors */ svm_dump_sel("CS", &vmcb->cs); svm_dump_sel("DS", &vmcb->ds); svm_dump_sel("SS", &vmcb->ss); svm_dump_sel("ES", &vmcb->es); svm_dump_sel("FS", &vmcb->fs); svm_dump_sel("GS", &vmcb->gs); svm_dump_sel("GDTR", &vmcb->gdtr); svm_dump_sel("LDTR", &vmcb->ldtr); svm_dump_sel("IDTR", &vmcb->idtr); svm_dump_sel("TR", &vmcb->tr); } bool_t svm_vmcb_isvalid(const char *from, struct vmcb_struct *vmcb, bool_t verbose) { bool_t ret = 0; /* ok */ #define PRINTF(...) \ if (verbose) { ret = 1; printk("%s: ", from); printk(__VA_ARGS__); \ } else return 1; if ((vmcb->_efer & EFER_SVME) == 0) { PRINTF("EFER: SVME bit not set (%#"PRIx64")\n", vmcb->_efer); } if ((vmcb->_cr0 & X86_CR0_CD) == 0 && (vmcb->_cr0 & X86_CR0_NW) != 0) { PRINTF("CR0: CD bit is zero and NW bit set (%#"PRIx64")\n", vmcb->_cr0); } if ((vmcb->_cr0 >> 32U) != 0) { PRINTF("CR0: bits [63:32] are not zero (%#"PRIx64")\n", vmcb->_cr0); } if ((vmcb->_cr3 & 0x7) != 0) { PRINTF("CR3: MBZ bits are set (%#"PRIx64")\n", vmcb->_cr3); } if ((vmcb->_efer & EFER_LMA) && (vmcb->_cr3 & 0xfe) != 0) { PRINTF("CR3: MBZ bits are set (%#"PRIx64")\n", vmcb->_cr3); } if ((vmcb->_cr4 >> 19U) != 0) { PRINTF("CR4: bits [63:19] are not zero (%#"PRIx64")\n", vmcb->_cr4); } if (((vmcb->_cr4 >> 11U) & 0x7fU) != 0) { PRINTF("CR4: bits [17:11] are not zero (%#"PRIx64")\n", vmcb->_cr4); } if ((vmcb->_dr6 >> 32U) != 0) { PRINTF("DR6: bits [63:32] are not zero (%#"PRIx64")\n", vmcb->_dr6); } if ((vmcb->_dr7 >> 32U) != 0) { PRINTF("DR7: bits [63:32] are not zero (%#"PRIx64")\n", vmcb->_dr7); } if ((vmcb->_efer >> 15U) != 0) { PRINTF("EFER: bits [63:15] are not zero (%#"PRIx64")\n", vmcb->_efer); } if ((vmcb->_efer & EFER_LME) != 0 && ((vmcb->_cr0 & X86_CR0_PG) != 0)) { if ((vmcb->_cr4 & X86_CR4_PAE) == 0) { PRINTF("EFER_LME and CR0.PG are both set and CR4.PAE is zero.\n"); } if ((vmcb->_cr0 & X86_CR0_PE) == 0) { PRINTF("EFER_LME and CR0.PG are both set and CR0.PE is zero.\n"); } } if ((vmcb->_efer & EFER_LME) != 0 && (vmcb->_cr0 & X86_CR0_PG) != 0 && (vmcb->_cr4 & X86_CR4_PAE) != 0 && (vmcb->cs.attr.fields.l != 0) && (vmcb->cs.attr.fields.db != 0)) { PRINTF("EFER_LME, CR0.PG, CR4.PAE, CS.L and CS.D are all non-zero.\n"); } if ((vmcb->_general2_intercepts & GENERAL2_INTERCEPT_VMRUN) == 0) { PRINTF("GENERAL2_INTERCEPT: VMRUN intercept bit is clear (%#"PRIx32")\n", vmcb->_general2_intercepts); } if (vmcb->eventinj.fields.resvd1 != 0) { PRINTF("eventinj: MBZ bits are set (%#"PRIx64")\n", vmcb->eventinj.bytes); } if (vmcb->_np_enable && vmcb->_h_cr3 == 0) { PRINTF("nested paging enabled but host cr3 is 0\n"); } #undef PRINTF return ret; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/hvm/mtrr.c0000664000175000017500000005212012307313555014521 0ustar smbsmb/* * mtrr.c: MTRR/PAT virtualization * * Copyright (c) 2007, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
* * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include static uint32_t size_or_mask; /* Get page attribute fields (PAn) from PAT MSR. */ #define pat_cr_2_paf(pat_cr,n) ((((uint64_t)pat_cr) >> ((n)<<3)) & 0xff) /* PAT entry to PTE flags (PAT, PCD, PWT bits). */ static const uint8_t pat_entry_2_pte_flags[8] = { 0, _PAGE_PWT, _PAGE_PCD, _PAGE_PCD | _PAGE_PWT, _PAGE_PAT, _PAGE_PAT | _PAGE_PWT, _PAGE_PAT | _PAGE_PCD, _PAGE_PAT | _PAGE_PCD | _PAGE_PWT }; /* Effective mm type lookup table, according to MTRR and PAT. */ static const uint8_t mm_type_tbl[MTRR_NUM_TYPES][PAT_TYPE_NUMS] = { /********PAT(UC,WC,RS,RS,WT,WP,WB,UC-)*/ /* RS means reserved type(2,3), and type is hardcoded here */ /*MTRR(UC):(UC,WC,RS,RS,UC,UC,UC,UC)*/ {0, 1, 2, 2, 0, 0, 0, 0}, /*MTRR(WC):(UC,WC,RS,RS,UC,UC,WC,WC)*/ {0, 1, 2, 2, 0, 0, 1, 1}, /*MTRR(RS):(RS,RS,RS,RS,RS,RS,RS,RS)*/ {2, 2, 2, 2, 2, 2, 2, 2}, /*MTRR(RS):(RS,RS,RS,RS,RS,RS,RS,RS)*/ {2, 2, 2, 2, 2, 2, 2, 2}, /*MTRR(WT):(UC,WC,RS,RS,WT,WP,WT,UC)*/ {0, 1, 2, 2, 4, 5, 4, 0}, /*MTRR(WP):(UC,WC,RS,RS,WT,WP,WP,WC)*/ {0, 1, 2, 2, 4, 5, 5, 1}, /*MTRR(WB):(UC,WC,RS,RS,WT,WP,WB,UC)*/ {0, 1, 2, 2, 4, 5, 6, 0} }; /* * Reverse lookup table, to find a pat type according to MTRR and effective * memory type. This table is dynamically generated. */ static uint8_t mtrr_epat_tbl[MTRR_NUM_TYPES][MEMORY_NUM_TYPES]; /* Lookup table for PAT entry of a given PAT value in host PAT. */ static uint8_t pat_entry_tbl[PAT_TYPE_NUMS]; static void get_mtrr_range(uint64_t base_msr, uint64_t mask_msr, uint64_t *base, uint64_t *end) { uint32_t mask_lo = (uint32_t)mask_msr; uint32_t mask_hi = (uint32_t)(mask_msr >> 32); uint32_t base_lo = (uint32_t)base_msr; uint32_t base_hi = (uint32_t)(base_msr >> 32); uint32_t size; if ( (mask_lo & 0x800) == 0 ) { /* Invalid (i.e. free) range */ *base = 0; *end = 0; return; } /* Work out the shifted address mask. */ mask_lo = (size_or_mask | (mask_hi << (32 - PAGE_SHIFT)) | (mask_lo >> PAGE_SHIFT)); /* This works correctly if size is a power of two (a contiguous range). */ size = -mask_lo; *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; *end = *base + size - 1; } bool_t is_var_mtrr_overlapped(struct mtrr_state *m) { int32_t seg, i; uint64_t phys_base, phys_mask, phys_base_pre, phys_mask_pre; uint64_t base_pre, end_pre, base, end; uint8_t num_var_ranges = (uint8_t)m->mtrr_cap; for ( i = 0; i < num_var_ranges; i++ ) { phys_base_pre = ((uint64_t*)m->var_ranges)[i*2]; phys_mask_pre = ((uint64_t*)m->var_ranges)[i*2 + 1]; get_mtrr_range(phys_base_pre, phys_mask_pre, &base_pre, &end_pre); for ( seg = i + 1; seg < num_var_ranges; seg ++ ) { phys_base = ((uint64_t*)m->var_ranges)[seg*2]; phys_mask = ((uint64_t*)m->var_ranges)[seg*2 + 1]; get_mtrr_range(phys_base, phys_mask, &base, &end); if ( ((base_pre != end_pre) && (base != end)) || ((base >= base_pre) && (base <= end_pre)) || ((end >= base_pre) && (end <= end_pre)) || ((base_pre >= base) && (base_pre <= end)) || ((end_pre >= base) && (end_pre <= end)) ) { /* MTRR is overlapped. 
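(Two enabled variable ranges intersect here; the m->overlapped flag derived from this result makes get_mtrr_type() later in this file consider every matching range rather than just the first hit.)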
*/ return 1; } } } return 0; } #define MTRR_PHYSMASK_VALID_BIT 11 #define MTRR_PHYSMASK_SHIFT 12 #define MTRR_PHYSBASE_TYPE_MASK 0xff /* lowest 8 bits */ #define MTRR_PHYSBASE_SHIFT 12 #define MTRR_VCNT 8 #define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) static int hvm_mtrr_pat_init(void) { unsigned int i, j, phys_addr; memset(&mtrr_epat_tbl, INVALID_MEM_TYPE, sizeof(mtrr_epat_tbl)); for ( i = 0; i < MTRR_NUM_TYPES; i++ ) { for ( j = 0; j < PAT_TYPE_NUMS; j++ ) { int32_t tmp = mm_type_tbl[i][j]; if ( (tmp >= 0) && (tmp < MEMORY_NUM_TYPES) ) mtrr_epat_tbl[i][tmp] = j; } } memset(&pat_entry_tbl, INVALID_MEM_TYPE, PAT_TYPE_NUMS * sizeof(pat_entry_tbl[0])); for ( i = 0; i < PAT_TYPE_NUMS; i++ ) { for ( j = 0; j < PAT_TYPE_NUMS; j++ ) { if ( pat_cr_2_paf(host_pat, j) == i ) { pat_entry_tbl[i] = j; break; } } } phys_addr = 36; if ( cpuid_eax(0x80000000) >= 0x80000008 ) phys_addr = (uint8_t)cpuid_eax(0x80000008); size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); return 0; } __initcall(hvm_mtrr_pat_init); uint8_t pat_type_2_pte_flags(uint8_t pat_type) { int32_t pat_entry = pat_entry_tbl[pat_type]; /* INVALID_MEM_TYPE, means doesn't find the pat_entry in host pat for * a given pat_type. If host pat covers all the pat types, * it can't happen. */ if ( likely(pat_entry != INVALID_MEM_TYPE) ) return pat_entry_2_pte_flags[pat_entry]; return pat_entry_2_pte_flags[pat_entry_tbl[PAT_TYPE_UNCACHABLE]]; } int hvm_vcpu_cacheattr_init(struct vcpu *v) { struct mtrr_state *m = &v->arch.hvm_vcpu.mtrr; memset(m, 0, sizeof(*m)); m->var_ranges = xzalloc_array(struct mtrr_var_range, MTRR_VCNT); if ( m->var_ranges == NULL ) return -ENOMEM; m->mtrr_cap = (1u << 10) | (1u << 8) | MTRR_VCNT; v->arch.hvm_vcpu.pat_cr = ((uint64_t)PAT_TYPE_WRBACK) | /* PAT0: WB */ ((uint64_t)PAT_TYPE_WRTHROUGH << 8) | /* PAT1: WT */ ((uint64_t)PAT_TYPE_UC_MINUS << 16) | /* PAT2: UC- */ ((uint64_t)PAT_TYPE_UNCACHABLE << 24) | /* PAT3: UC */ ((uint64_t)PAT_TYPE_WRBACK << 32) | /* PAT4: WB */ ((uint64_t)PAT_TYPE_WRTHROUGH << 40) | /* PAT5: WT */ ((uint64_t)PAT_TYPE_UC_MINUS << 48) | /* PAT6: UC- */ ((uint64_t)PAT_TYPE_UNCACHABLE << 56); /* PAT7: UC */ return 0; } void hvm_vcpu_cacheattr_destroy(struct vcpu *v) { xfree(v->arch.hvm_vcpu.mtrr.var_ranges); } /* * Get MTRR memory type for physical address pa. */ static uint8_t get_mtrr_type(struct mtrr_state *m, paddr_t pa) { int32_t addr, seg, index; uint8_t overlap_mtrr = 0; uint8_t overlap_mtrr_pos = 0; uint64_t phys_base; uint64_t phys_mask; uint8_t num_var_ranges = m->mtrr_cap & 0xff; if ( unlikely(!(m->enabled & 0x2)) ) return MTRR_TYPE_UNCACHABLE; if ( (pa < 0x100000) && (m->enabled & 1) ) { /* Fixed range MTRR takes effective */ addr = (uint32_t) pa; if ( addr < 0x80000 ) { seg = (addr >> 16); return m->fixed_ranges[seg]; } else if ( addr < 0xc0000 ) { seg = (addr - 0x80000) >> 14; index = (seg >> 3) + 1; seg &= 7; /* select 0-7 segments */ return m->fixed_ranges[index*8 + seg]; } else { /* 0xC0000 --- 0x100000 */ seg = (addr - 0xc0000) >> 12; index = (seg >> 3) + 3; seg &= 7; /* select 0-7 segments */ return m->fixed_ranges[index*8 + seg]; } } /* Match with variable MTRRs. 
*/ for ( seg = 0; seg < num_var_ranges; seg++ ) { phys_base = ((uint64_t*)m->var_ranges)[seg*2]; phys_mask = ((uint64_t*)m->var_ranges)[seg*2 + 1]; if ( phys_mask & (1 << MTRR_PHYSMASK_VALID_BIT) ) { if ( ((uint64_t) pa & phys_mask) >> MTRR_PHYSMASK_SHIFT == (phys_base & phys_mask) >> MTRR_PHYSMASK_SHIFT ) { if ( unlikely(m->overlapped) ) { overlap_mtrr |= 1 << (phys_base & MTRR_PHYSBASE_TYPE_MASK); overlap_mtrr_pos = phys_base & MTRR_PHYSBASE_TYPE_MASK; } else { /* If no overlap, return the found one */ return (phys_base & MTRR_PHYSBASE_TYPE_MASK); } } } } /* Overlapped or not found. */ if ( unlikely(overlap_mtrr == 0) ) return m->def_type; if ( likely(!(overlap_mtrr & ~( ((uint8_t)1) << overlap_mtrr_pos ))) ) /* Covers both one variable memory range matches and * two or more identical match. */ return overlap_mtrr_pos; if ( overlap_mtrr & 0x1 ) /* Two or more match, one is UC. */ return MTRR_TYPE_UNCACHABLE; if ( !(overlap_mtrr & 0xaf) ) /* Two or more match, WT and WB. */ return MTRR_TYPE_WRTHROUGH; /* Behaviour is undefined, but return the last overlapped type. */ return overlap_mtrr_pos; } /* * return the memory type from PAT. * NOTE: valid only when paging is enabled. * Only 4K page PTE is supported now. */ static uint8_t page_pat_type(uint64_t pat_cr, uint32_t pte_flags) { int32_t pat_entry; /* PCD/PWT -> bit 1/0 of PAT entry */ pat_entry = ( pte_flags >> 3 ) & 0x3; /* PAT bits as bit 2 of PAT entry */ if ( pte_flags & _PAGE_PAT ) pat_entry |= 4; return (uint8_t)pat_cr_2_paf(pat_cr, pat_entry); } /* * Effective memory type for leaf page. */ static uint8_t effective_mm_type(struct mtrr_state *m, uint64_t pat, paddr_t gpa, uint32_t pte_flags, uint8_t gmtrr_mtype) { uint8_t mtrr_mtype, pat_value, effective; /* if get_pat_flags() gives a dedicated MTRR type, * just use it */ if ( gmtrr_mtype == NO_HARDCODE_MEM_TYPE ) mtrr_mtype = get_mtrr_type(m, gpa); else mtrr_mtype = gmtrr_mtype; pat_value = page_pat_type(pat, pte_flags); effective = mm_type_tbl[mtrr_mtype][pat_value]; return effective; } uint32_t get_pat_flags(struct vcpu *v, uint32_t gl1e_flags, paddr_t gpaddr, paddr_t spaddr, uint8_t gmtrr_mtype) { uint8_t guest_eff_mm_type; uint8_t shadow_mtrr_type; uint8_t pat_entry_value; uint64_t pat = v->arch.hvm_vcpu.pat_cr; struct mtrr_state *g = &v->arch.hvm_vcpu.mtrr; /* 1. Get the effective memory type of guest physical address, * with the pair of guest MTRR and PAT */ guest_eff_mm_type = effective_mm_type(g, pat, gpaddr, gl1e_flags, gmtrr_mtype); /* 2. Get the memory type of host physical address, with MTRR */ shadow_mtrr_type = get_mtrr_type(&mtrr_state, spaddr); /* 3. Find the memory type in PAT, with host MTRR memory type * and guest effective memory type. */ pat_entry_value = mtrr_epat_tbl[shadow_mtrr_type][guest_eff_mm_type]; /* If conflit occurs(e.g host MTRR is UC, guest memory type is * WB),set UC as effective memory. Here, returning PAT_TYPE_UNCACHABLE will * always set effective memory as UC. */ if ( pat_entry_value == INVALID_MEM_TYPE ) { struct domain *d = v->domain; p2m_type_t p2mt; get_gfn_query_unlocked(d, paddr_to_pfn(gpaddr), &p2mt); if (p2m_is_ram(p2mt)) gdprintk(XENLOG_WARNING, "Conflict occurs for a given guest l1e flags:%x " "at %"PRIx64" (the effective mm type:%d), " "because the host mtrr type is:%d\n", gl1e_flags, (uint64_t)gpaddr, guest_eff_mm_type, shadow_mtrr_type); pat_entry_value = PAT_TYPE_UNCACHABLE; } /* 4. 
Get the pte flags */ return pat_type_2_pte_flags(pat_entry_value); } bool_t mtrr_def_type_msr_set(struct mtrr_state *m, uint64_t msr_content) { uint8_t def_type = msr_content & 0xff; uint8_t enabled = (msr_content >> 10) & 0x3; if ( unlikely(!(def_type == 0 || def_type == 1 || def_type == 4 || def_type == 5 || def_type == 6)) ) { HVM_DBG_LOG(DBG_LEVEL_MSR, "invalid MTRR def type:%x\n", def_type); return 0; } if ( unlikely(msr_content && (msr_content & ~0xcffUL)) ) { HVM_DBG_LOG(DBG_LEVEL_MSR, "invalid msr content:%"PRIx64"\n", msr_content); return 0; } m->enabled = enabled; m->def_type = def_type; return 1; } bool_t mtrr_fix_range_msr_set(struct mtrr_state *m, uint32_t row, uint64_t msr_content) { uint64_t *fixed_range_base = (uint64_t *)m->fixed_ranges; if ( fixed_range_base[row] != msr_content ) { uint8_t *range = (uint8_t*)&msr_content; int32_t i, type; for ( i = 0; i < 8; i++ ) { type = range[i]; if ( unlikely(!(type == 0 || type == 1 || type == 4 || type == 5 || type == 6)) ) return 0; } fixed_range_base[row] = msr_content; } return 1; } bool_t mtrr_var_range_msr_set( struct domain *d, struct mtrr_state *m, uint32_t msr, uint64_t msr_content) { uint32_t index, type, phys_addr, eax, ebx, ecx, edx; uint64_t msr_mask; uint64_t *var_range_base = (uint64_t*)m->var_ranges; index = msr - MSR_IA32_MTRR_PHYSBASE0; if ( var_range_base[index] == msr_content ) return 1; type = (uint8_t)msr_content; if ( unlikely(!(type == 0 || type == 1 || type == 4 || type == 5 || type == 6)) ) return 0; phys_addr = 36; domain_cpuid(d, 0x80000000, 0, &eax, &ebx, &ecx, &edx); if ( eax >= 0x80000008 ) { domain_cpuid(d, 0x80000008, 0, &eax, &ebx, &ecx, &edx); phys_addr = (uint8_t)eax; } msr_mask = ~((((uint64_t)1) << phys_addr) - 1); msr_mask |= (index & 1) ? 0x7ffUL : 0xf00UL; if ( unlikely(msr_content && (msr_content & msr_mask)) ) { HVM_DBG_LOG(DBG_LEVEL_MSR, "invalid msr content:%"PRIx64"\n", msr_content); return 0; } var_range_base[index] = msr_content; m->overlapped = is_var_mtrr_overlapped(m); return 1; } bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs) { struct mtrr_state *md = &vd->arch.hvm_vcpu.mtrr; struct mtrr_state *ms = &vs->arch.hvm_vcpu.mtrr; int32_t res; uint8_t num_var_ranges = (uint8_t)md->mtrr_cap; /* Test fixed ranges. */ res = memcmp(md->fixed_ranges, ms->fixed_ranges, NUM_FIXED_RANGES*sizeof(mtrr_type)); if ( res ) return 1; /* Test var ranges. */ res = memcmp(md->var_ranges, ms->var_ranges, num_var_ranges*sizeof(struct mtrr_var_range)); if ( res ) return 1; /* Test default type MSR. */ if ( (md->def_type != ms->def_type) && (md->enabled != ms->enabled) ) return 1; /* Test PAT. 
*/ if ( vd->arch.hvm_vcpu.pat_cr != vs->arch.hvm_vcpu.pat_cr ) return 1; return 0; } void hvm_init_cacheattr_region_list( struct domain *d) { INIT_LIST_HEAD(&d->arch.hvm_domain.pinned_cacheattr_ranges); } void hvm_destroy_cacheattr_region_list( struct domain *d) { struct list_head *head = &d->arch.hvm_domain.pinned_cacheattr_ranges; struct hvm_mem_pinned_cacheattr_range *range; while ( !list_empty(head) ) { range = list_entry(head->next, struct hvm_mem_pinned_cacheattr_range, list); list_del(&range->list); xfree(range); } } int32_t hvm_get_mem_pinned_cacheattr( struct domain *d, uint64_t guest_fn, uint32_t *type) { struct hvm_mem_pinned_cacheattr_range *range; *type = 0; if ( !is_hvm_domain(d) ) return 0; list_for_each_entry_rcu ( range, &d->arch.hvm_domain.pinned_cacheattr_ranges, list ) { if ( (guest_fn >= range->start) && (guest_fn <= range->end) ) { *type = range->type; return 1; } } return 0; } int32_t hvm_set_mem_pinned_cacheattr( struct domain *d, uint64_t gfn_start, uint64_t gfn_end, uint32_t type) { struct hvm_mem_pinned_cacheattr_range *range; if ( !((type == PAT_TYPE_UNCACHABLE) || (type == PAT_TYPE_WRCOMB) || (type == PAT_TYPE_WRTHROUGH) || (type == PAT_TYPE_WRPROT) || (type == PAT_TYPE_WRBACK) || (type == PAT_TYPE_UC_MINUS)) || !is_hvm_domain(d) ) return -EINVAL; range = xzalloc(struct hvm_mem_pinned_cacheattr_range); if ( range == NULL ) return -ENOMEM; range->start = gfn_start; range->end = gfn_end; range->type = type; list_add_rcu(&range->list, &d->arch.hvm_domain.pinned_cacheattr_ranges); return 0; } static int hvm_save_mtrr_msr(struct domain *d, hvm_domain_context_t *h) { int i; struct vcpu *v; struct hvm_hw_mtrr hw_mtrr; struct mtrr_state *mtrr_state; /* save mtrr&pat */ for_each_vcpu(d, v) { mtrr_state = &v->arch.hvm_vcpu.mtrr; hvm_get_guest_pat(v, &hw_mtrr.msr_pat_cr); hw_mtrr.msr_mtrr_def_type = mtrr_state->def_type | (mtrr_state->enabled << 10); hw_mtrr.msr_mtrr_cap = mtrr_state->mtrr_cap; for ( i = 0; i < MTRR_VCNT; i++ ) { /* save physbase */ hw_mtrr.msr_mtrr_var[i*2] = ((uint64_t*)mtrr_state->var_ranges)[i*2]; /* save physmask */ hw_mtrr.msr_mtrr_var[i*2+1] = ((uint64_t*)mtrr_state->var_ranges)[i*2+1]; } for ( i = 0; i < NUM_FIXED_MSR; i++ ) hw_mtrr.msr_mtrr_fixed[i] = ((uint64_t*)mtrr_state->fixed_ranges)[i]; if ( hvm_save_entry(MTRR, v->vcpu_id, h, &hw_mtrr) != 0 ) return 1; } return 0; } static int hvm_load_mtrr_msr(struct domain *d, hvm_domain_context_t *h) { int vcpuid, i; struct vcpu *v; struct mtrr_state *mtrr_state; struct hvm_hw_mtrr hw_mtrr; vcpuid = hvm_load_instance(h); if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n", d->domain_id, vcpuid); return -EINVAL; } if ( hvm_load_entry(MTRR, h, &hw_mtrr) != 0 ) return -EINVAL; mtrr_state = &v->arch.hvm_vcpu.mtrr; hvm_set_guest_pat(v, hw_mtrr.msr_pat_cr); mtrr_state->mtrr_cap = hw_mtrr.msr_mtrr_cap; for ( i = 0; i < NUM_FIXED_MSR; i++ ) mtrr_fix_range_msr_set(mtrr_state, i, hw_mtrr.msr_mtrr_fixed[i]); for ( i = 0; i < MTRR_VCNT; i++ ) { mtrr_var_range_msr_set(d, mtrr_state, MTRRphysBase_MSR(i), hw_mtrr.msr_mtrr_var[i*2]); mtrr_var_range_msr_set(d, mtrr_state, MTRRphysMask_MSR(i), hw_mtrr.msr_mtrr_var[i*2+1]); } mtrr_def_type_msr_set(mtrr_state, hw_mtrr.msr_mtrr_def_type); return 0; } HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr, 1, HVMSR_PER_VCPU); uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn, uint8_t *ipat, bool_t direct_mmio) { uint8_t gmtrr_mtype, hmtrr_mtype; uint32_t type; struct 
vcpu *v = current; *ipat = 0; if ( (current->domain != d) && ((d->vcpu == NULL) || ((v = d->vcpu[0]) == NULL)) ) return MTRR_TYPE_WRBACK; if ( !is_pvh_vcpu(v) && !v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] ) return MTRR_TYPE_WRBACK; if ( !mfn_valid(mfn_x(mfn)) ) return MTRR_TYPE_UNCACHABLE; if ( hvm_get_mem_pinned_cacheattr(d, gfn, &type) ) return type; if ( !iommu_enabled ) { *ipat = 1; return MTRR_TYPE_WRBACK; } if ( direct_mmio ) return MTRR_TYPE_UNCACHABLE; if ( iommu_snoop ) { *ipat = 1; return MTRR_TYPE_WRBACK; } gmtrr_mtype = is_hvm_vcpu(v) ? get_mtrr_type(&v->arch.hvm_vcpu.mtrr, (gfn << PAGE_SHIFT)) : MTRR_TYPE_WRBACK; hmtrr_mtype = get_mtrr_type(&mtrr_state, (mfn_x(mfn) << PAGE_SHIFT)); return ((gmtrr_mtype <= hmtrr_mtype) ? gmtrr_mtype : hmtrr_mtype); } xen-4.4.0/xen/arch/x86/hvm/i8254.c0000664000175000017500000003331112307313555014311 0ustar smbsmb/* * QEMU 8253/8254 interval timer emulation * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2006 Intel Corperation * Copyright (c) 2007 Keir Fraser, XenSource Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define domain_vpit(x) (&(x)->arch.vpit) #define vcpu_vpit(x) (domain_vpit((x)->domain)) #define vpit_domain(x) (container_of((x), struct domain, arch.vpit)) #define vpit_vcpu(x) (pt_global_vcpu_target(vpit_domain(x))) #define RW_STATE_LSB 1 #define RW_STATE_MSB 2 #define RW_STATE_WORD0 3 #define RW_STATE_WORD1 4 static int handle_pit_io( int dir, uint32_t port, uint32_t bytes, uint32_t *val); static int handle_speaker_io( int dir, uint32_t port, uint32_t bytes, uint32_t *val); #define get_guest_time(v) \ (is_hvm_vcpu(v) ? 
hvm_get_guest_time(v) : (u64)get_s_time()) static int pit_get_count(PITState *pit, int channel) { uint64_t d; int counter; struct hvm_hw_pit_channel *c = &pit->hw.channels[channel]; struct vcpu *v = vpit_vcpu(pit); ASSERT(spin_is_locked(&pit->lock)); d = muldiv64(get_guest_time(v) - pit->count_load_time[channel], PIT_FREQ, SYSTEM_TIME_HZ); switch ( c->mode ) { case 0: case 1: case 4: case 5: counter = (c->count - d) & 0xffff; break; case 3: /* XXX: may be incorrect for odd counts */ counter = c->count - ((2 * d) % c->count); break; default: counter = c->count - (d % c->count); break; } return counter; } static int pit_get_out(PITState *pit, int channel) { struct hvm_hw_pit_channel *s = &pit->hw.channels[channel]; uint64_t d; int out; struct vcpu *v = vpit_vcpu(pit); ASSERT(spin_is_locked(&pit->lock)); d = muldiv64(get_guest_time(v) - pit->count_load_time[channel], PIT_FREQ, SYSTEM_TIME_HZ); switch ( s->mode ) { default: case 0: out = (d >= s->count); break; case 1: out = (d < s->count); break; case 2: out = (((d % s->count) == 0) && (d != 0)); break; case 3: out = ((d % s->count) < ((s->count + 1) >> 1)); break; case 4: case 5: out = (d == s->count); break; } return out; } static void pit_set_gate(PITState *pit, int channel, int val) { struct hvm_hw_pit_channel *s = &pit->hw.channels[channel]; struct vcpu *v = vpit_vcpu(pit); ASSERT(spin_is_locked(&pit->lock)); switch ( s->mode ) { default: case 0: case 4: /* XXX: just disable/enable counting */ break; case 1: case 5: case 2: case 3: /* Restart counting on rising edge. */ if ( s->gate < val ) pit->count_load_time[channel] = get_guest_time(v); break; } s->gate = val; } static int pit_get_gate(PITState *pit, int channel) { ASSERT(spin_is_locked(&pit->lock)); return pit->hw.channels[channel].gate; } static void pit_time_fired(struct vcpu *v, void *priv) { uint64_t *count_load_time = priv; *count_load_time = get_guest_time(v); } static void pit_load_count(PITState *pit, int channel, int val) { u32 period; struct hvm_hw_pit_channel *s = &pit->hw.channels[channel]; struct vcpu *v = vpit_vcpu(pit); ASSERT(spin_is_locked(&pit->lock)); if ( val == 0 ) val = 0x10000; if ( v == NULL ) pit->count_load_time[channel] = 0; else pit->count_load_time[channel] = get_guest_time(v); s->count = val; period = DIV_ROUND(val * SYSTEM_TIME_HZ, PIT_FREQ); if ( (v == NULL) || !is_hvm_vcpu(v) || (channel != 0) ) return; switch ( s->mode ) { case 2: case 3: /* Periodic timer. */ create_periodic_time(v, &pit->pt0, period, period, 0, pit_time_fired, &pit->count_load_time[channel]); break; case 1: case 4: /* One-shot timer. */ create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired, &pit->count_load_time[channel]); break; default: destroy_periodic_time(&pit->pt0); break; } } static void pit_latch_count(PITState *pit, int channel) { struct hvm_hw_pit_channel *c = &pit->hw.channels[channel]; ASSERT(spin_is_locked(&pit->lock)); if ( !c->count_latched ) { c->latched_count = pit_get_count(pit, channel); c->count_latched = c->rw_mode; } } static void pit_latch_status(PITState *pit, int channel) { struct hvm_hw_pit_channel *c = &pit->hw.channels[channel]; ASSERT(spin_is_locked(&pit->lock)); if ( !c->status_latched ) { /* TODO: Return NULL COUNT (bit 6). 
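The status byte latched below follows the 8254 read-back format: bit 7 = OUT pin,
bit 6 = NULL count, bits 5-4 = RW/access mode, bits 3-1 = counter mode, bit 0 = BCD.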
*/ c->status = ((pit_get_out(pit, channel) << 7) | (c->rw_mode << 4) | (c->mode << 1) | c->bcd); c->status_latched = 1; } } static void pit_ioport_write(struct PITState *pit, uint32_t addr, uint32_t val) { int channel, access; struct hvm_hw_pit_channel *s; val &= 0xff; addr &= 3; spin_lock(&pit->lock); if ( addr == 3 ) { channel = val >> 6; if ( channel == 3 ) { /* Read-Back Command. */ for ( channel = 0; channel < 3; channel++ ) { s = &pit->hw.channels[channel]; if ( val & (2 << channel) ) { if ( !(val & 0x20) ) pit_latch_count(pit, channel); if ( !(val & 0x10) ) pit_latch_status(pit, channel); } } } else { /* Select Counter . */ s = &pit->hw.channels[channel]; access = (val >> 4) & 3; if ( access == 0 ) { pit_latch_count(pit, channel); } else { s->rw_mode = access; s->read_state = access; s->write_state = access; s->mode = (val >> 1) & 7; if ( s->mode > 5 ) s->mode -= 4; s->bcd = val & 1; /* XXX: update irq timer ? */ } } } else { /* Write Count. */ s = &pit->hw.channels[addr]; switch ( s->write_state ) { default: case RW_STATE_LSB: pit_load_count(pit, addr, val); break; case RW_STATE_MSB: pit_load_count(pit, addr, val << 8); break; case RW_STATE_WORD0: s->write_latch = val; s->write_state = RW_STATE_WORD1; break; case RW_STATE_WORD1: pit_load_count(pit, addr, s->write_latch | (val << 8)); s->write_state = RW_STATE_WORD0; break; } } spin_unlock(&pit->lock); } static uint32_t pit_ioport_read(struct PITState *pit, uint32_t addr) { int ret, count; struct hvm_hw_pit_channel *s; addr &= 3; s = &pit->hw.channels[addr]; spin_lock(&pit->lock); if ( s->status_latched ) { s->status_latched = 0; ret = s->status; } else if ( s->count_latched ) { switch ( s->count_latched ) { default: case RW_STATE_LSB: ret = s->latched_count & 0xff; s->count_latched = 0; break; case RW_STATE_MSB: ret = s->latched_count >> 8; s->count_latched = 0; break; case RW_STATE_WORD0: ret = s->latched_count & 0xff; s->count_latched = RW_STATE_MSB; break; } } else { switch ( s->read_state ) { default: case RW_STATE_LSB: count = pit_get_count(pit, addr); ret = count & 0xff; break; case RW_STATE_MSB: count = pit_get_count(pit, addr); ret = (count >> 8) & 0xff; break; case RW_STATE_WORD0: count = pit_get_count(pit, addr); ret = count & 0xff; s->read_state = RW_STATE_WORD1; break; case RW_STATE_WORD1: count = pit_get_count(pit, addr); ret = (count >> 8) & 0xff; s->read_state = RW_STATE_WORD0; break; } } spin_unlock(&pit->lock); return ret; } void pit_stop_channel0_irq(PITState *pit) { spin_lock(&pit->lock); destroy_periodic_time(&pit->pt0); spin_unlock(&pit->lock); } static int pit_save(struct domain *d, hvm_domain_context_t *h) { PITState *pit = domain_vpit(d); int rc; spin_lock(&pit->lock); rc = hvm_save_entry(PIT, 0, h, &pit->hw); spin_unlock(&pit->lock); return rc; } static int pit_load(struct domain *d, hvm_domain_context_t *h) { PITState *pit = domain_vpit(d); int i; spin_lock(&pit->lock); if ( hvm_load_entry(PIT, h, &pit->hw) ) { spin_unlock(&pit->lock); return 1; } /* * Recreate platform timers from hardware state. There will be some * time jitter here, but the wall-clock will have jumped massively, so * we hope the guest can handle it. 
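* pit_load_count() below reloads each channel's count and, for channel 0, recreates the
* platform periodic/one-shot timer according to the saved mode.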
*/ pit->pt0.last_plt_gtime = get_guest_time(d->vcpu[0]); for ( i = 0; i < 3; i++ ) pit_load_count(pit, i, pit->hw.channels[i].count); spin_unlock(&pit->lock); return 0; } HVM_REGISTER_SAVE_RESTORE(PIT, pit_save, pit_load, 1, HVMSR_PER_DOM); void pit_reset(struct domain *d) { PITState *pit = domain_vpit(d); struct hvm_hw_pit_channel *s; int i; destroy_periodic_time(&pit->pt0); pit->pt0.source = PTSRC_isa; spin_lock(&pit->lock); for ( i = 0; i < 3; i++ ) { s = &pit->hw.channels[i]; s->mode = 0xff; /* the init mode */ s->gate = (i != 2); pit_load_count(pit, i, 0); } spin_unlock(&pit->lock); } void pit_init(struct vcpu *v, unsigned long cpu_khz) { struct domain *d = v->domain; PITState *pit = domain_vpit(d); spin_lock_init(&pit->lock); if ( is_hvm_domain(d) ) { register_portio_handler(d, PIT_BASE, 4, handle_pit_io); register_portio_handler(d, 0x61, 1, handle_speaker_io); } pit_reset(d); } void pit_deinit(struct domain *d) { PITState *pit = domain_vpit(d); destroy_periodic_time(&pit->pt0); } /* the intercept action for PIT DM retval:0--not handled; 1--handled */ static int handle_pit_io( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct PITState *vpit = vcpu_vpit(current); if ( bytes != 1 ) { gdprintk(XENLOG_WARNING, "PIT bad access\n"); return X86EMUL_OKAY; } if ( dir == IOREQ_WRITE ) { pit_ioport_write(vpit, port, *val); } else { if ( (port & 3) != 3 ) *val = pit_ioport_read(vpit, port); else gdprintk(XENLOG_WARNING, "PIT: read A1:A0=3!\n"); } return X86EMUL_OKAY; } static void speaker_ioport_write( struct PITState *pit, uint32_t addr, uint32_t val) { pit->hw.speaker_data_on = (val >> 1) & 1; pit_set_gate(pit, 2, val & 1); } static uint32_t speaker_ioport_read( struct PITState *pit, uint32_t addr) { /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ unsigned int refresh_clock = ((unsigned int)NOW() >> 14) & 1; return ((pit->hw.speaker_data_on << 1) | pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); } static int handle_speaker_io( int dir, uint32_t port, uint32_t bytes, uint32_t *val) { struct PITState *vpit = vcpu_vpit(current); BUG_ON(bytes != 1); spin_lock(&vpit->lock); if ( dir == IOREQ_WRITE ) speaker_ioport_write(vpit, port, *val); else *val = speaker_ioport_read(vpit, port); spin_unlock(&vpit->lock); return X86EMUL_OKAY; } int pv_pit_handler(int port, int data, int write) { ioreq_t ioreq = { .size = 1, .type = IOREQ_TYPE_PIO, .addr = port, .dir = write ? IOREQ_WRITE : IOREQ_READ, .data = data }; if ( (current->domain->domain_id == 0) && dom0_pit_access(&ioreq) ) { /* nothing to do */; } else { uint32_t val = data; if ( port == 0x61 ) handle_speaker_io(ioreq.dir, port, 1, &val); else handle_pit_io(ioreq.dir, port, 1, &val); ioreq.data = val; } return !write ? ioreq.data : 0; } xen-4.4.0/xen/arch/x86/i8259.c0000664000175000017500000003061712307313555013532 0ustar smbsmb/****************************************************************************** * i8259.c * * Well, this is required for SMP systems as well, as it build interrupt * tables for IO APICS as well as uniprocessor 8259-alikes. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Common place to define all x86 IRQ vectors * * This builds up the IRQ handler stubs using some ugly macros in irq.h * * These macros create the low-level assembly IRQ routines that save * register context and call do_IRQ(). 
do_IRQ() then does all the * operations that are needed to keep the AT (or SMP IOAPIC) * interrupt-controller happy. */ __asm__(".section .text"); BUILD_COMMON_IRQ() #define IRQ_NAME(nr) VEC##nr##_interrupt #define BI(nr) \ void IRQ_NAME(nr)(void); \ __asm__( \ ".if " STR(0x##nr) " >= " STR(FIRST_DYNAMIC_VECTOR) "\n" \ __ALIGN_STR "\n" \ STR(IRQ_NAME(nr)) ":\n\t" \ BUILD_IRQ(0x##nr) "\n" \ ".else\n" \ ".equ " STR(IRQ_NAME(nr)) ", 0\n" \ ".endif\n") #define BUILD_16_IRQS(x) \ BI(x##0); BI(x##1); BI(x##2); BI(x##3); \ BI(x##4); BI(x##5); BI(x##6); BI(x##7); \ BI(x##8); BI(x##9); BI(x##a); BI(x##b); \ BI(x##c); BI(x##d); BI(x##e); BI(x##f) BUILD_16_IRQS(0); BUILD_16_IRQS(1); BUILD_16_IRQS(2); BUILD_16_IRQS(3); BUILD_16_IRQS(4); BUILD_16_IRQS(5); BUILD_16_IRQS(6); BUILD_16_IRQS(7); BUILD_16_IRQS(8); BUILD_16_IRQS(9); BUILD_16_IRQS(a); BUILD_16_IRQS(b); BUILD_16_IRQS(c); BUILD_16_IRQS(d); BUILD_16_IRQS(e); BUILD_16_IRQS(f); #undef BUILD_16_IRQS #undef BI #define IRQ(x,y) IRQ_NAME(x##y) #define IRQLIST_16(x) \ IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) static void (*__initdata interrupt[NR_VECTORS])(void) = { IRQLIST_16(0), IRQLIST_16(1), IRQLIST_16(2), IRQLIST_16(3), IRQLIST_16(4), IRQLIST_16(5), IRQLIST_16(6), IRQLIST_16(7), IRQLIST_16(8), IRQLIST_16(9), IRQLIST_16(a), IRQLIST_16(b), IRQLIST_16(c), IRQLIST_16(d), IRQLIST_16(e), IRQLIST_16(f) }; #undef IRQ #undef IRQLIST_16 /* * This is the 'legacy' 8259A Programmable Interrupt Controller, * present in the majority of PC/AT boxes. * plus some generic x86 specific things if generic specifics makes * any sense at all. * this file should become arch/i386/kernel/irq.c when the old irq.c * moves to arch independent land */ static DEFINE_SPINLOCK(i8259A_lock); static bool_t _mask_and_ack_8259A_irq(unsigned int irq); bool_t bogus_8259A_irq(unsigned int irq) { return _mask_and_ack_8259A_irq(irq); } static void mask_and_ack_8259A_irq(struct irq_desc *desc) { _mask_and_ack_8259A_irq(desc->irq); } static unsigned int startup_8259A_irq(struct irq_desc *desc) { enable_8259A_irq(desc); return 0; /* never anything pending */ } static void end_8259A_irq(struct irq_desc *desc, u8 vector) { if (!(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS))) enable_8259A_irq(desc); } static struct hw_interrupt_type __read_mostly i8259A_irq_type = { .typename = "XT-PIC", .startup = startup_8259A_irq, .shutdown = disable_8259A_irq, .enable = enable_8259A_irq, .disable = disable_8259A_irq, .ack = mask_and_ack_8259A_irq, .end = end_8259A_irq }; /* * 8259A PIC functions to handle ISA devices: */ #define aeoi_mode (i8259A_irq_type.ack == disable_8259A_irq) /* * This contains the irq mask for both 8259A irq controllers, */ static unsigned int cached_irq_mask = 0xffff; #define __byte(x,y) (((unsigned char *)&(y))[x]) #define cached_21 (__byte(0,cached_irq_mask)) #define cached_A1 (__byte(1,cached_irq_mask)) /* * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) * boards the timer interrupt is not really connected to any IO-APIC pin, * it's fed to the master 8259A's IR0 line only. * * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. * this 'mixed mode' IRQ handling costs nothing because it's only used * at IRQ setup time. 
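* For example, clearing bit 0 (as make_8259A_irq() does) keeps the timer interrupt on the
* master 8259A's IR0 line instead of routing it through the IO-APIC.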
*/ unsigned int __read_mostly io_apic_irqs; static void _disable_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) outb(cached_A1,0xA1); else outb(cached_21,0x21); per_cpu(vector_irq, 0)[LEGACY_VECTOR(irq)] = ~irq; spin_unlock_irqrestore(&i8259A_lock, flags); } void disable_8259A_irq(struct irq_desc *desc) { _disable_8259A_irq(desc->irq); } void enable_8259A_irq(struct irq_desc *desc) { unsigned int mask = ~(1 << desc->irq); unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; per_cpu(vector_irq, 0)[LEGACY_VECTOR(desc->irq)] = desc->irq; if (desc->irq & 8) outb(cached_A1,0xA1); else outb(cached_21,0x21); spin_unlock_irqrestore(&i8259A_lock, flags); } int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<> 8); spin_unlock_irqrestore(&i8259A_lock, flags); return ret; } void mask_8259A(void) { unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, 0xA1); outb(0xff, 0x21); spin_unlock_irqrestore(&i8259A_lock, flags); } void unmask_8259A(void) { unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); outb(cached_A1, 0xA1); outb(cached_21, 0x21); spin_unlock_irqrestore(&i8259A_lock, flags); } /* * This function assumes to be called rarely. Switching between * 8259A registers is slow. * This has to be protected by the irq controller spinlock * before being called. */ static inline int i8259A_irq_real(unsigned int irq) { int value; int irqmask = 1<> 8); outb(0x0A,0xA0); /* back to the IRR register */ return value; } /* * Careful! The 8259A is a fragile beast, it pretty * much _has_ to be done exactly like this (mask it * first, _then_ send the EOI, and the order of EOI * to the two 8259s is important! Return a boolean * indicating whether the irq was genuine or spurious. */ static bool_t _mask_and_ack_8259A_irq(unsigned int irq) { unsigned int irqmask = 1 << irq; unsigned long flags; bool_t is_real_irq = 1; /* Assume real unless spurious */ spin_lock_irqsave(&i8259A_lock, flags); /* * Lightweight spurious IRQ detection. We do not want * to overdo spurious IRQ handling - it's usually a sign * of hardware problems, so we only do the checks we can * do without slowing down good hardware unnecesserily. * * Note that IRQ7 and IRQ15 (the two spurious IRQs * usually resulting from the 8259A-1|2 PICs) occur * even if the IRQ is masked in the 8259A. Thus we * can check spurious 8259A IRQs without doing the * quite slow i8259A_irq_real() call for every IRQ. * This does not cover 100% of spurious interrupts, * but should be enough to warn the user that there * is something bad going on ... */ if ((cached_irq_mask & irqmask) && !i8259A_irq_real(irq)) { static int spurious_irq_mask; is_real_irq = 0; /* Report spurious IRQ, once per IRQ line. */ if (!(spurious_irq_mask & irqmask)) { printk("spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } /* * Theoretically we do not have to handle this IRQ, * but in Linux this does not cause problems and is * simpler for us. */ } cached_irq_mask |= irqmask; if (irq & 8) { inb(0xA1); /* DUMMY - (do we need this?) */ outb(cached_A1,0xA1); if (!aeoi_mode) { outb(0x60 + (irq & 7), 0xA0);/* 'Specific EOI' to slave */ outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ } } else { inb(0x21); /* DUMMY - (do we need this?) 
*/ outb(cached_21,0x21); if (!aeoi_mode) outb(0x60 + irq, 0x20);/* 'Specific EOI' to master */ } spin_unlock_irqrestore(&i8259A_lock, flags); return is_real_irq; } static char irq_trigger[2]; /** * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */ static void restore_ELCR(char *trigger) { outb(trigger[0], 0x4d0); outb(trigger[1], 0x4d1); } static void save_ELCR(char *trigger) { /* IRQ 0,1,2,8,13 are marked as reserved */ trigger[0] = inb(0x4d0) & 0xF8; trigger[1] = inb(0x4d1) & 0xDE; } int i8259A_resume(void) { init_8259A(aeoi_mode); restore_ELCR(irq_trigger); return 0; } int i8259A_suspend(void) { save_ELCR(irq_trigger); return 0; } void __devinit init_8259A(int auto_eoi) { unsigned long flags; spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, 0x21); /* mask all of 8259A-1 */ outb(0xff, 0xA1); /* mask all of 8259A-2 */ /* * outb_p - this has to work on a wide range of PC hardware. */ outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ outb_p(FIRST_LEGACY_VECTOR + 0, 0x21); /* ICW2: 8259A-1 IR0-7 */ outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ if (auto_eoi) outb_p(0x03, 0x21); /* master does Auto EOI */ else outb_p(0x01, 0x21); /* master expects normal EOI */ outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ outb_p(FIRST_LEGACY_VECTOR + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 */ outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode is to be investigated) */ if (auto_eoi) /* * in AEOI mode we just have to mask the interrupt * when acking. */ i8259A_irq_type.ack = disable_8259A_irq; else i8259A_irq_type.ack = mask_and_ack_8259A_irq; udelay(100); /* wait for 8259A to initialize */ outb(cached_21, 0x21); /* restore master IRQ mask */ outb(cached_A1, 0xA1); /* restore slave IRQ mask */ spin_unlock_irqrestore(&i8259A_lock, flags); } void __init make_8259A_irq(unsigned int irq) { io_apic_irqs &= ~(1 << irq); irq_to_desc(irq)->handler = &i8259A_irq_type; } static struct irqaction __read_mostly cascade = { no_action, "cascade", NULL}; void __init init_IRQ(void) { int vector, irq, cpu = smp_processor_id(); init_bsp_APIC(); init_8259A(0); BUG_ON(init_irq_data() < 0); for ( vector = FIRST_DYNAMIC_VECTOR; vector < NR_VECTORS; vector++ ) { if (vector == HYPERCALL_VECTOR || vector == LEGACY_SYSCALL_VECTOR) continue; BUG_ON(!interrupt[vector]); set_intr_gate(vector, interrupt[vector]); } for (irq = 0; platform_legacy_irq(irq); irq++) { struct irq_desc *desc = irq_to_desc(irq); if ( irq == 2 ) /* IRQ2 doesn't exist */ continue; desc->handler = &i8259A_irq_type; per_cpu(vector_irq, cpu)[FIRST_LEGACY_VECTOR + irq] = irq; cpumask_copy(desc->arch.cpu_mask, cpumask_of(cpu)); desc->arch.vector = FIRST_LEGACY_VECTOR + irq; } per_cpu(vector_irq, cpu)[IRQ0_VECTOR] = 0; apic_intr_init(); /* Set the clock to HZ Hz */ #define CLOCK_TICK_RATE 1193182 /* crystal freq (Hz) */ #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ) outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ outb(LATCH >> 8, PIT_CH0); /* MSB */ setup_irq(2, &cascade); } xen-4.4.0/xen/arch/x86/smp.c0000664000175000017500000001773112307313555013553 0ustar smbsmb/* * Intel SMP support routines. * * (c) 1995 Alan Cox, Building #3 * (c) 1998-99, 2000 Ingo Molnar * * This code is released under the GNU General Public License version 2 or * later. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include int hard_smp_processor_id(void) { return get_apic_id(); } /* * send_IPI_mask(cpumask, vector): sends @vector IPI to CPUs in @cpumask, * excluding the local CPU. @cpumask may be empty. */ void send_IPI_mask(const cpumask_t *mask, int vector) { genapic->send_IPI_mask(mask, vector); } void send_IPI_self(int vector) { genapic->send_IPI_self(vector); } /* * Some notes on x86 processor bugs affecting SMP operation: * * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. * The Linux implications for SMP are handled as follows: * * Pentium III / [Xeon] * None of the E1AP-E3AP errata are visible to the user. * * E1AP. see PII A1AP * E2AP. see PII A2AP * E3AP. see PII A3AP * * Pentium II / [Xeon] * None of the A1AP-A3AP errata are visible to the user. * * A1AP. see PPro 1AP * A2AP. see PPro 2AP * A3AP. see PPro 7AP * * Pentium Pro * None of 1AP-9AP errata are visible to the normal user, * except occasional delivery of 'spurious interrupt' as trap #15. * This is very rare and a non-problem. * * 1AP. Linux maps APIC as non-cacheable * 2AP. worked around in hardware * 3AP. fixed in C0 and above steppings microcode update. * Linux does not use excessive STARTUP_IPIs. * 4AP. worked around in hardware * 5AP. symmetric IO mode (normal Linux operation) not affected. * 'noapic' mode has vector 0xf filled out properly. * 6AP. 'noapic' mode might be affected - fixed in later steppings * 7AP. We do not assume writes to the LVT deassering IRQs * 8AP. We do not enable low power mode (deep sleep) during MP bootup * 9AP. We do not use mixed mode */ /* * The following functions deal with sending IPIs between CPUs. */ static inline int __prepare_ICR (unsigned int shortcut, int vector) { return APIC_DM_FIXED | shortcut | vector; } static inline int __prepare_ICR2 (unsigned int mask) { return SET_xAPIC_DEST_FIELD(mask); } void apic_wait_icr_idle(void) { if ( x2apic_enabled ) return; while ( apic_read( APIC_ICR ) & APIC_ICR_BUSY ) cpu_relax(); } static void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) { unsigned int cfg; /* * Wait for idle. */ apic_wait_icr_idle(); /* * prepare target chip field */ cfg = __prepare_ICR(shortcut, vector) | dest; /* * Send the IPI. The write to APIC_ICR fires this off. */ apic_write_around(APIC_ICR, cfg); } void send_IPI_self_flat(int vector) { __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); } void send_IPI_self_phys(int vector) { __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); } void send_IPI_self_x2apic(int vector) { apic_write(APIC_SELF_IPI, vector); } void send_IPI_mask_flat(const cpumask_t *cpumask, int vector) { unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long cfg; unsigned long flags; mask &= cpumask_bits(&cpu_online_map)[0]; mask &= ~(1UL << smp_processor_id()); if ( mask == 0 ) return; local_irq_save(flags); /* * Wait for idle. */ apic_wait_icr_idle(); /* * prepare target chip field */ cfg = __prepare_ICR2(mask); apic_write_around(APIC_ICR2, cfg); /* * program the ICR */ cfg = __prepare_ICR(0, vector) | APIC_DEST_LOGICAL; /* * Send the IPI. The write to APIC_ICR fires this off. 
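* (The destination was latched into APIC_ICR2 just above; in xAPIC mode the write to the
* low ICR word is what actually triggers delivery.)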
*/ apic_write_around(APIC_ICR, cfg); local_irq_restore(flags); } void send_IPI_mask_phys(const cpumask_t *mask, int vector) { unsigned long cfg, flags; unsigned int query_cpu; local_irq_save(flags); for_each_cpu ( query_cpu, mask ) { if ( !cpu_online(query_cpu) || (query_cpu == smp_processor_id()) ) continue; /* * Wait for idle. */ apic_wait_icr_idle(); /* * prepare target chip field */ cfg = __prepare_ICR2(cpu_physical_id(query_cpu)); apic_write_around(APIC_ICR2, cfg); /* * program the ICR */ cfg = __prepare_ICR(0, vector) | APIC_DEST_PHYSICAL; /* * Send the IPI. The write to APIC_ICR fires this off. */ apic_write_around(APIC_ICR, cfg); } local_irq_restore(flags); } static DEFINE_SPINLOCK(flush_lock); static cpumask_t flush_cpumask; static const void *flush_va; static unsigned int flush_flags; void invalidate_interrupt(struct cpu_user_regs *regs) { ack_APIC_irq(); perfc_incr(ipis); if ( !__sync_local_execstate() || (flush_flags & (FLUSH_TLB_GLOBAL | FLUSH_CACHE)) ) flush_area_local(flush_va, flush_flags); cpumask_clear_cpu(smp_processor_id(), &flush_cpumask); } void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags) { ASSERT(local_irq_is_enabled()); if ( cpumask_test_cpu(smp_processor_id(), mask) ) flush_area_local(va, flags); if ( !cpumask_subset(mask, cpumask_of(smp_processor_id())) ) { spin_lock(&flush_lock); cpumask_and(&flush_cpumask, mask, &cpu_online_map); cpumask_clear_cpu(smp_processor_id(), &flush_cpumask); flush_va = va; flush_flags = flags; send_IPI_mask(&flush_cpumask, INVALIDATE_TLB_VECTOR); while ( !cpumask_empty(&flush_cpumask) ) cpu_relax(); spin_unlock(&flush_lock); } } /* Call with no locks held and interrupts enabled (e.g., softirq context). */ void new_tlbflush_clock_period(void) { cpumask_t allbutself; /* Flush everyone else. We definitely flushed just before entry. */ cpumask_andnot(&allbutself, &cpu_online_map, cpumask_of(smp_processor_id())); flush_mask(&allbutself, FLUSH_TLB); /* No need for atomicity: we are the only possible updater. */ ASSERT(tlbflush_clock == 0); tlbflush_clock++; } void smp_send_event_check_mask(const cpumask_t *mask) { send_IPI_mask(mask, EVENT_CHECK_VECTOR); } void smp_send_call_function_mask(const cpumask_t *mask) { send_IPI_mask(mask, CALL_FUNCTION_VECTOR); if ( cpumask_test_cpu(smp_processor_id(), mask) ) { local_irq_disable(); smp_call_function_interrupt(); local_irq_enable(); } } void __stop_this_cpu(void) { ASSERT(!local_irq_is_enabled()); disable_local_APIC(); hvm_cpu_down(); /* * Clear FPU, zapping any pending exceptions. Needed for warm reset with * some BIOSes. */ clts(); asm volatile ( "fninit" ); cpumask_clear_cpu(smp_processor_id(), &cpu_online_map); } static void stop_this_cpu(void *dummy) { __stop_this_cpu(); for ( ; ; ) halt(); } /* * Stop all CPUs and turn off local APICs and the IO-APIC, so other OSs see a * clean IRQ state. */ void smp_send_stop(void) { int timeout = 10; smp_call_function(stop_this_cpu, NULL, 0); /* Wait 10ms for all other CPUs to go offline. 
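The loop below polls in 1ms steps, for at most `timeout' (10) iterations.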
*/ while ( (num_online_cpus() > 1) && (timeout-- > 0) ) mdelay(1); local_irq_disable(); __stop_this_cpu(); disable_IO_APIC(); hpet_disable(); local_irq_enable(); } void smp_send_nmi_allbutself(void) { send_IPI_mask(&cpu_online_map, APIC_DM_NMI); } void event_check_interrupt(struct cpu_user_regs *regs) { ack_APIC_irq(); perfc_incr(ipis); this_cpu(irq_count)++; } void call_function_interrupt(struct cpu_user_regs *regs) { ack_APIC_irq(); perfc_incr(ipis); smp_call_function_interrupt(); } xen-4.4.0/xen/arch/x86/tboot.c0000664000175000017500000004147112307313555014101 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* tboot= */ static char __initdata opt_tboot[20] = ""; string_param("tboot", opt_tboot); /* Global pointer to shared data; NULL means no measured launch. */ tboot_shared_t *g_tboot_shared; static vmac_t domain_mac; /* MAC for all domains during S3 */ static vmac_t xenheap_mac; /* MAC for xen heap during S3 */ static vmac_t frametable_mac; /* MAC for frame table during S3 */ static const uuid_t tboot_shared_uuid = TBOOT_SHARED_UUID; /* used by tboot_protect_mem_regions() and/or tboot_parse_dmar_table() */ static uint64_t __initdata txt_heap_base, __initdata txt_heap_size; static uint64_t __initdata sinit_base, __initdata sinit_size; /* * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) */ #define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 #define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 /* # pages for each config regs space - used by fixmap */ #define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) /* offsets from pub/priv config space */ #define TXTCR_SINIT_BASE 0x0270 #define TXTCR_SINIT_SIZE 0x0278 #define TXTCR_HEAP_BASE 0x0300 #define TXTCR_HEAP_SIZE 0x0308 extern char __init_begin[], __bss_start[]; #define SHA1_SIZE 20 typedef uint8_t sha1_hash_t[SHA1_SIZE]; typedef struct __packed { uint32_t version; /* currently 6 */ sha1_hash_t bios_acm_id; uint32_t edx_senter_flags; uint64_t mseg_valid; sha1_hash_t sinit_hash; sha1_hash_t mle_hash; sha1_hash_t stm_hash; sha1_hash_t lcp_policy_hash; uint32_t lcp_policy_control; uint32_t rlp_wakeup_addr; uint32_t reserved; uint32_t num_mdrs; uint32_t mdrs_off; uint32_t num_vtd_dmars; uint32_t vtd_dmars_off; } sinit_mle_data_t; static void __init tboot_copy_memory(unsigned char *va, uint32_t size, unsigned long pa) { unsigned long map_base = 0; unsigned char *map_addr = NULL; unsigned int i; for ( i = 0; i < size; i++ ) { if ( map_base != PFN_DOWN(pa + i) ) { map_base = PFN_DOWN(pa + i); set_fixmap(FIX_TBOOT_MAP_ADDRESS, map_base << PAGE_SHIFT); map_addr = (unsigned char *)fix_to_virt(FIX_TBOOT_MAP_ADDRESS); } va[i] = map_addr[pa + i - (map_base << PAGE_SHIFT)]; } } void __init tboot_probe(void) { tboot_shared_t *tboot_shared; unsigned long p_tboot_shared; /* Look for valid page-aligned address for shared page. */ p_tboot_shared = simple_strtoul(opt_tboot, NULL, 0); if ( (p_tboot_shared == 0) || ((p_tboot_shared & ~PAGE_MASK) != 0) ) return; /* Map and check for tboot UUID. */ set_fixmap(FIX_TBOOT_SHARED_BASE, p_tboot_shared); tboot_shared = (tboot_shared_t *)fix_to_virt(FIX_TBOOT_SHARED_BASE); if ( tboot_shared == NULL ) return; if ( memcmp(&tboot_shared_uuid, (uuid_t *)tboot_shared, sizeof(uuid_t)) ) return; /* new tboot_shared (w/ GAS support, integrity, etc.) 
is not backwards compatible */ if ( tboot_shared->version < 4 ) { printk("unsupported version of tboot (%u)\n", tboot_shared->version); return; } g_tboot_shared = tboot_shared; printk("TBOOT: found shared page at phys addr %lx:\n", p_tboot_shared); printk(" version: %d\n", tboot_shared->version); printk(" log_addr: %#x\n", tboot_shared->log_addr); printk(" shutdown_entry: %#x\n", tboot_shared->shutdown_entry); printk(" tboot_base: %#x\n", tboot_shared->tboot_base); printk(" tboot_size: %#x\n", tboot_shared->tboot_size); if ( tboot_shared->version >= 6 ) printk(" flags: %#x\n", tboot_shared->flags); /* these will be needed by tboot_protect_mem_regions() and/or tboot_parse_dmar_table(), so get them now */ txt_heap_base = txt_heap_size = sinit_base = sinit_size = 0; /* TXT Heap */ tboot_copy_memory((unsigned char *)&txt_heap_base, sizeof(txt_heap_base), TXT_PUB_CONFIG_REGS_BASE + TXTCR_HEAP_BASE); tboot_copy_memory((unsigned char *)&txt_heap_size, sizeof(txt_heap_size), TXT_PUB_CONFIG_REGS_BASE + TXTCR_HEAP_SIZE); /* SINIT */ tboot_copy_memory((unsigned char *)&sinit_base, sizeof(sinit_base), TXT_PUB_CONFIG_REGS_BASE + TXTCR_SINIT_BASE); tboot_copy_memory((unsigned char *)&sinit_size, sizeof(sinit_size), TXT_PUB_CONFIG_REGS_BASE + TXTCR_SINIT_SIZE); } /* definitions from xen/drivers/passthrough/vtd/iommu.h * used to walk through vtd page tables */ #define LEVEL_STRIDE (9) #define PTE_NUM (1<> PAGE_SHIFT_4K); vmac_update((void *)pt_vaddr, PAGE_SIZE, ctx); for ( i = 0; i < PTE_NUM; i++ ) { pte = &pt_vaddr[i]; if ( !dma_pte_present(*pte) ) continue; if ( next_level >= 1 ) update_iommu_mac(ctx, dma_pte_addr(*pte), next_level); } unmap_domain_page(pt_vaddr); } #define is_page_in_use(page) \ (page_state_is(page, inuse) || page_state_is(page, offlining)) static void update_pagetable_mac(vmac_ctx_t *ctx) { unsigned long mfn; for ( mfn = 0; mfn < max_page; mfn++ ) { struct page_info *page = mfn_to_page(mfn); if ( !mfn_valid(mfn) ) continue; if ( is_page_in_use(page) && !is_xen_heap_page(page) ) { if ( page->count_info & PGC_page_table ) { void *pg = map_domain_page(mfn); vmac_update(pg, PAGE_SIZE, ctx); unmap_domain_page(pg); } } } } static void tboot_gen_domain_integrity(const uint8_t key[TB_KEY_SIZE], vmac_t *mac) { struct domain *d; struct page_info *page; uint8_t nonce[16] = {}; vmac_ctx_t ctx; vmac_set_key((uint8_t *)key, &ctx); for_each_domain( d ) { if ( !d->arch.s3_integrity ) continue; printk("MACing Domain %u\n", d->domain_id); spin_lock(&d->page_alloc_lock); page_list_for_each(page, &d->page_list) { void *pg = __map_domain_page(page); vmac_update(pg, PAGE_SIZE, &ctx); unmap_domain_page(pg); } spin_unlock(&d->page_alloc_lock); if ( !is_idle_domain(d) ) { struct hvm_iommu *hd = domain_hvm_iommu(d); update_iommu_mac(&ctx, hd->pgd_maddr, agaw_to_level(hd->agaw)); } } /* MAC all shadow page tables */ update_pagetable_mac(&ctx); *mac = vmac(NULL, 0, nonce, NULL, &ctx); /* wipe ctx to ensure key is not left in memory */ memset(&ctx, 0, sizeof(ctx)); } /* * For stack overflow detection in debug build, a guard page is set up. * This fn is used to detect whether a page is in the guarded pages for * the above reason. 
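 *
 * Layout assumed here (see memguard_guard_stack() in mm.c): each CPU
 * has a STACK_SIZE stack whose top PRIMARY_STACK_SIZE bytes form the
 * primary stack, with the guard page sitting immediately below it.
 * Illustrative sketch of the per-CPU address being compared, not part
 * of the build:
 *
 *     void *guard = (void *)((unsigned long)stack_base[cpu] + STACK_SIZE
 *                            - PRIMARY_STACK_SIZE - PAGE_SIZE);
 *     int hit = (mfn == virt_to_mfn(guard));
 *
 * tboot_gen_xenheap_integrity() below uses this check to skip such MFNs
 * instead of reading through the guard mapping while MACing the heap.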
*/ static int mfn_in_guarded_stack(unsigned long mfn) { void *p; int i; for ( i = 0; i < nr_cpu_ids; i++ ) { if ( !stack_base[i] ) continue; p = (void *)((unsigned long)stack_base[i] + STACK_SIZE - PRIMARY_STACK_SIZE - PAGE_SIZE); if ( mfn == virt_to_mfn(p) ) return -1; } return 0; } static void tboot_gen_xenheap_integrity(const uint8_t key[TB_KEY_SIZE], vmac_t *mac) { unsigned long mfn; uint8_t nonce[16] = {}; vmac_ctx_t ctx; vmac_set_key((uint8_t *)key, &ctx); for ( mfn = 0; mfn < max_page; mfn++ ) { struct page_info *page = __mfn_to_page(mfn); if ( !mfn_valid(mfn) ) continue; if ( (mfn << PAGE_SHIFT) < __pa(&_end) ) continue; /* skip Xen */ if ( (mfn >= PFN_DOWN(g_tboot_shared->tboot_base - 3 * PAGE_SIZE)) && (mfn < PFN_UP(g_tboot_shared->tboot_base + g_tboot_shared->tboot_size + 3 * PAGE_SIZE)) ) continue; /* skip tboot and its page tables */ if ( is_page_in_use(page) && is_xen_heap_page(page) ) { void *pg; if ( mfn_in_guarded_stack(mfn) ) continue; /* skip guard stack, see memguard_guard_stack() in mm.c */ pg = mfn_to_virt(mfn); vmac_update((uint8_t *)pg, PAGE_SIZE, &ctx); } } *mac = vmac(NULL, 0, nonce, NULL, &ctx); /* wipe ctx to ensure key is not left in memory */ memset(&ctx, 0, sizeof(ctx)); } static void tboot_gen_frametable_integrity(const uint8_t key[TB_KEY_SIZE], vmac_t *mac) { unsigned int sidx, eidx, nidx; unsigned int max_idx = (max_pdx + PDX_GROUP_COUNT - 1)/PDX_GROUP_COUNT; uint8_t nonce[16] = {}; vmac_ctx_t ctx; vmac_set_key((uint8_t *)key, &ctx); for ( sidx = 0; ; sidx = nidx ) { eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx); nidx = find_next_bit(pdx_group_valid, max_idx, eidx); if ( nidx >= max_idx ) break; vmac_update((uint8_t *)pdx_to_page(sidx * PDX_GROUP_COUNT), pdx_to_page(eidx * PDX_GROUP_COUNT) - pdx_to_page(sidx * PDX_GROUP_COUNT), &ctx); } vmac_update((uint8_t *)pdx_to_page(sidx * PDX_GROUP_COUNT), pdx_to_page(max_pdx - 1) + 1 - pdx_to_page(sidx * PDX_GROUP_COUNT), &ctx); *mac = vmac(NULL, 0, nonce, NULL, &ctx); /* wipe ctx to ensure key is not left in memory */ memset(&ctx, 0, sizeof(ctx)); } void tboot_shutdown(uint32_t shutdown_type) { uint32_t map_base, map_size; int err; g_tboot_shared->shutdown_type = shutdown_type; local_irq_disable(); /* Create identity map for tboot shutdown code. 
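 *
 * "Identity map" here means the tboot range is mapped with VA == PA, so
 * the branch to the physical shutdown_entry address at the end of this
 * function remains valid while paging is still enabled.  Illustrative
 * sketch of the mapping call, not part of the build, assuming the
 * PFN_DOWN/PFN_UP rounding used just below:
 *
 *     unsigned long first = PFN_DOWN(g_tboot_shared->tboot_base);
 *     unsigned long count = PFN_UP(g_tboot_shared->tboot_size);
 *     map_pages_to_xen(first << PAGE_SHIFT, first, count,
 *                      __PAGE_HYPERVISOR);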
*/ /* do before S3 integrity because mapping tboot may change xenheap */ map_base = PFN_DOWN(g_tboot_shared->tboot_base); map_size = PFN_UP(g_tboot_shared->tboot_size); err = map_pages_to_xen(map_base << PAGE_SHIFT, map_base, map_size, __PAGE_HYPERVISOR); if ( err != 0 ) { printk("error (%#x) mapping tboot pages (mfns) @ %#x, %#x\n", err, map_base, map_size); return; } /* if this is S3 then set regions to MAC */ if ( shutdown_type == TB_SHUTDOWN_S3 ) { /* * Xen regions for tboot to MAC */ g_tboot_shared->num_mac_regions = 3; /* S3 resume code (and other real mode trampoline code) */ g_tboot_shared->mac_regions[0].start = bootsym_phys(trampoline_start); g_tboot_shared->mac_regions[0].size = bootsym_phys(trampoline_end) - bootsym_phys(trampoline_start); /* hypervisor code + data */ g_tboot_shared->mac_regions[1].start = (uint64_t)__pa(&_stext); g_tboot_shared->mac_regions[1].size = __pa(&__init_begin) - __pa(&_stext); /* bss */ g_tboot_shared->mac_regions[2].start = (uint64_t)__pa(&__bss_start); g_tboot_shared->mac_regions[2].size = __pa(&_end) - __pa(&__bss_start); /* * MAC domains and other Xen memory */ /* Xen has no better entropy source for MAC key than tboot's */ /* MAC domains first in case it perturbs xenheap */ tboot_gen_domain_integrity(g_tboot_shared->s3_key, &domain_mac); tboot_gen_frametable_integrity(g_tboot_shared->s3_key, &frametable_mac); tboot_gen_xenheap_integrity(g_tboot_shared->s3_key, &xenheap_mac); } write_ptbase(idle_vcpu[0]); ((void(*)(void))(unsigned long)g_tboot_shared->shutdown_entry)(); BUG(); /* should not reach here */ } int tboot_in_measured_env(void) { return (g_tboot_shared != NULL); } int __init tboot_protect_mem_regions(void) { int rc; if ( !tboot_in_measured_env() ) return 1; /* TXT Heap */ if ( txt_heap_base == 0 ) return 0; rc = e820_change_range_type(&e820, txt_heap_base, txt_heap_base + txt_heap_size, E820_RESERVED, E820_UNUSABLE); if ( !rc ) return 0; /* SINIT */ if ( sinit_base == 0 ) return 0; rc = e820_change_range_type(&e820, sinit_base, sinit_base + sinit_size, E820_RESERVED, E820_UNUSABLE); if ( !rc ) return 0; /* TXT Private Space */ rc = e820_change_range_type(&e820, TXT_PRIV_CONFIG_REGS_BASE, TXT_PRIV_CONFIG_REGS_BASE + NR_TXT_CONFIG_PAGES * PAGE_SIZE, E820_RESERVED, E820_UNUSABLE); if ( !rc ) return 0; return 1; } int __init tboot_parse_dmar_table(acpi_table_handler dmar_handler) { struct acpi_table_header *dmar_table; int rc; uint64_t size; uint32_t dmar_table_length; unsigned long pa; sinit_mle_data_t sinit_mle_data; unsigned char *dmar_table_raw; if ( !tboot_in_measured_env() ) return acpi_table_parse(ACPI_SIG_DMAR, dmar_handler); /* ACPI tables may not be DMA protected by tboot, so use DMAR copy */ /* SINIT saved in SinitMleData in TXT heap (which is DMA protected) */ if ( txt_heap_base == 0 ) return 1; /* map TXT heap into Xen addr space */ /* walk heap to SinitMleData */ pa = txt_heap_base; /* skip BiosData */ tboot_copy_memory((unsigned char *)&size, sizeof(size), pa); pa += size; /* skip OsMleData */ tboot_copy_memory((unsigned char *)&size, sizeof(size), pa); pa += size; /* skip OsSinitData */ tboot_copy_memory((unsigned char *)&size, sizeof(size), pa); pa += size; /* now points to SinitMleDataSize; set to SinitMleData */ pa += sizeof(uint64_t); tboot_copy_memory((unsigned char *)&sinit_mle_data, sizeof(sinit_mle_data), pa); /* get addr of DMAR table */ pa += sinit_mle_data.vtd_dmars_off - sizeof(uint64_t); tboot_copy_memory((unsigned char *)&dmar_table_length, sizeof(dmar_table_length), pa + sizeof(char) * ACPI_NAME_SIZE); 
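/*
 * The DMAR table begins with a standard ACPI header whose first two
 * fields are the 4-byte signature and the 32-bit total length, which is
 * why the length was just read from offset ACPI_NAME_SIZE.  Sketch of
 * the layout relied upon, not part of the build:
 *
 *     struct acpi_table_header {
 *         char signature[ACPI_NAME_SIZE];    (4 bytes, "DMAR" here)
 *         u32  length;                       (length of the whole table)
 *         ... further header fields ...
 *     };
 *
 * With the length known, the whole table is copied out of the
 * DMA-protected TXT heap below before being handed to dmar_handler().
 */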
dmar_table_raw = xmalloc_array(unsigned char, dmar_table_length); tboot_copy_memory(dmar_table_raw, dmar_table_length, pa); dmar_table = (struct acpi_table_header *)dmar_table_raw; rc = dmar_handler(dmar_table); xfree(dmar_table_raw); /* acpi_parse_dmar() zaps APCI DMAR signature in TXT heap table */ /* but dom0 will read real table, so must zap it there too */ acpi_dmar_zap(); return rc; } static vmac_t orig_mac, resume_mac; int tboot_s3_resume(void) { if ( !tboot_in_measured_env() ) return 0; /* need to do these in reverse order of shutdown */ tboot_gen_xenheap_integrity(g_tboot_shared->s3_key, &resume_mac); orig_mac = xenheap_mac; if ( resume_mac != xenheap_mac ) return -1; tboot_gen_frametable_integrity(g_tboot_shared->s3_key, &resume_mac); orig_mac = frametable_mac; if ( resume_mac != frametable_mac ) return -2; tboot_gen_domain_integrity(g_tboot_shared->s3_key, &resume_mac); orig_mac = domain_mac; if ( resume_mac != domain_mac ) return -3; return 0; } void tboot_s3_error(int error) { const char *what = "???"; BUG_ON(!error || !tboot_in_measured_env()); switch ( error ) { case -1: what = "Xen heap"; break; case -2: what = "frame table"; break; case -3: what = "domains"; break; } printk("MAC for %s before S3 is: 0x%08"PRIx64"\n", what, orig_mac); printk("MAC for %s after S3 is: 0x%08"PRIx64"\n", what, resume_mac); panic("Memory integrity was lost on resume (%d)", error); } int tboot_wake_ap(int apicid, unsigned long sipi_vec) { if ( g_tboot_shared->version >= 6 && (g_tboot_shared->flags & TB_FLAG_AP_WAKE_SUPPORT) ) { g_tboot_shared->ap_wake_addr = sipi_vec; g_tboot_shared->ap_wake_trigger = apicid; return 0; } return 1; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/oprofile/0000775000175000017500000000000012307313555014416 5ustar smbsmbxen-4.4.0/xen/arch/x86/oprofile/Makefile0000664000175000017500000000020712307313555016055 0ustar smbsmbobj-y += xenoprof.o obj-y += nmi_int.o obj-y += op_model_p4.o obj-y += op_model_ppro.o obj-y += op_model_athlon.o obj-y += backtrace.o xen-4.4.0/xen/arch/x86/oprofile/op_x86_model.h0000664000175000017500000000322612307313555017075 0ustar smbsmb/** * @file op_x86_model.h * interface to x86 model-specific MSR operations * * @remark Copyright 2002 OProfile authors * @remark Read the file COPYING * * @author Graydon Hoare */ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H struct op_msr { unsigned long addr; uint64_t value; }; struct op_msrs { struct op_msr * counters; struct op_msr * controls; }; struct pt_regs; /* The model vtable abstracts the differences between * various x86 CPU model's perfctr support. 
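 *
 * Each backend in this directory (P4, PPro/arch-perfmon, Athlon and
 * family 15h) fills in one of these structures, and nmi_int.c only ever
 * drives the hardware through the function pointers.  Illustrative
 * sketch of the dispatch pattern, not part of the build and using the
 * hypothetical name demo_bring_up():
 *
 *     static void demo_bring_up(struct op_msrs *msrs)
 *     {
 *         model->fill_in_addresses(msrs);
 *         model->setup_ctrs(msrs);
 *         model->start(msrs);
 *     }
 *
 * (In the real driver these steps run per CPU via on_each_cpu(), but
 * the indirection is the same.)  Adding a new CPU family therefore
 * means adding a new op_x86_model_spec instance rather than touching
 * the generic NMI code.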
*/ struct op_x86_model_spec { unsigned int num_counters; unsigned int num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); void (*setup_ctrs)(struct op_msrs const * const msrs); int (*check_ctrs)(unsigned int const cpu, struct op_msrs const * const msrs, struct cpu_user_regs * const regs); void (*start)(struct op_msrs const * const msrs); void (*stop)(struct op_msrs const * const msrs); int (*is_arch_pmu_msr)(u64 msr_index, int *type, int *index); int (*allocated_msr)(struct vcpu *v); void (*free_msr)(struct vcpu *v); void (*load_msr)(struct vcpu * const v, int type, int index, u64 *msr_content); void (*save_msr)(struct vcpu * const v, int type, int index, u64 msr_content); }; extern struct op_x86_model_spec op_ppro_spec; extern struct op_x86_model_spec op_arch_perfmon_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_athlon_spec; extern struct op_x86_model_spec const op_amd_fam15h_spec; void arch_perfmon_setup_counters(void); extern int ppro_has_global_ctrl; extern struct op_x86_model_spec const *model; #endif /* OP_X86_MODEL_H */ xen-4.4.0/xen/arch/x86/oprofile/op_model_ppro.c0000664000175000017500000002150512307313555017423 0ustar smbsmb/** * @file op_model_ppro.h * pentium pro / P6 model-specific MSR operations * * @remark Copyright 2002 OProfile authors * @remark Read the file COPYING * * @author John Levon * @author Philippe Elie * @author Graydon Hoare */ #include #include #include #include #include #include #include #include #include #include #include #include "op_x86_model.h" #include "op_counter.h" /* * Intel "Architectural Performance Monitoring" CPUID * detection/enumeration details: */ union cpuid10_eax { struct { unsigned int version_id:8; unsigned int num_counters:8; unsigned int bit_width:8; unsigned int mask_length:8; } split; unsigned int full; }; static int num_counters = 2; static int counter_width = 32; #define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) #define CTRL_READ(msr_content,msrs,c) do {rdmsrl((msrs->controls[(c)].addr), (msr_content));} while (0) #define CTRL_WRITE(msr_content,msrs,c) do {wrmsrl((msrs->controls[(c)].addr), (msr_content));} while (0) #define CTRL_SET_ACTIVE(n) (n |= (1ULL<<22)) #define CTRL_SET_INACTIVE(n) (n &= ~(1ULL<<22)) #define CTRL_CLEAR(x) (x &= (1ULL<<21)) #define CTRL_SET_ENABLE(val) (val |= 1ULL<<20) #define CTRL_SET_USR(val,u) (val |= ((u & 1ULL) << 16)) #define CTRL_SET_KERN(val,k) (val |= ((k & 1ULL) << 17)) #define CTRL_SET_UM(val, m) (val |= (m << 8)) #define CTRL_SET_EVENT(val, e) (val |= e) #define IS_ACTIVE(val) (val & (1ULL << 22) ) #define IS_ENABLE(val) (val & (1ULL << 20) ) static unsigned long reset_value[OP_MAX_COUNTER]; int ppro_has_global_ctrl = 0; static void ppro_fill_in_addresses(struct op_msrs * const msrs) { int i; for (i = 0; i < num_counters; i++) msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; for (i = 0; i < num_counters; i++) msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; } static void ppro_setup_ctrs(struct op_msrs const * const msrs) { uint64_t msr_content; int i; if (cpu_has_arch_perfmon) { union cpuid10_eax eax; eax.full = cpuid_eax(0xa); /* * For Core2 (family 6, model 15), don't reset the * counter width: */ if (!(eax.split.version_id == 0 && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15)) { if (counter_width < eax.split.bit_width) counter_width = eax.split.bit_width; } } /* clear all counters */ for (i = 0 ; i < num_counters; ++i) { CTRL_READ(msr_content, msrs, 
i); CTRL_CLEAR(msr_content); CTRL_WRITE(msr_content, msrs, i); } /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < num_counters; ++i) wrmsrl(msrs->counters[i].addr, ~0x0ULL); /* enable active counters */ for (i = 0; i < num_counters; ++i) { if (counter_config[i].enabled) { reset_value[i] = counter_config[i].count; wrmsrl(msrs->counters[i].addr, -reset_value[i]); CTRL_READ(msr_content, msrs, i); CTRL_CLEAR(msr_content); CTRL_SET_ENABLE(msr_content); CTRL_SET_USR(msr_content, counter_config[i].user); CTRL_SET_KERN(msr_content, counter_config[i].kernel); CTRL_SET_UM(msr_content, counter_config[i].unit_mask); CTRL_SET_EVENT(msr_content, counter_config[i].event); CTRL_WRITE(msr_content, msrs, i); } else { reset_value[i] = 0; } } } static int ppro_check_ctrs(unsigned int const cpu, struct op_msrs const * const msrs, struct cpu_user_regs * const regs) { u64 val; int i; int ovf = 0; unsigned long eip = regs->eip; int mode = xenoprofile_get_mode(current, regs); struct arch_msr_pair *msrs_content = vcpu_vpmu(current)->context; for (i = 0 ; i < num_counters; ++i) { if (!reset_value[i]) continue; rdmsrl(msrs->counters[i].addr, val); if (CTR_OVERFLOWED(val)) { xenoprof_log_event(current, regs, eip, mode, i); wrmsrl(msrs->counters[i].addr, -reset_value[i]); if ( is_passive(current->domain) && (mode != 2) && vpmu_is_set(vcpu_vpmu(current), VPMU_PASSIVE_DOMAIN_ALLOCATED) ) { if ( IS_ACTIVE(msrs_content[i].control) ) { msrs_content[i].counter = val; if ( IS_ENABLE(msrs_content[i].control) ) ovf = 2; } } if ( !ovf ) ovf = 1; } } /* Only P6 based Pentium M need to re-unmask the apic vector but it * doesn't hurt other P6 variant */ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); return ovf; } static void ppro_start(struct op_msrs const * const msrs) { uint64_t msr_content; int i; for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { CTRL_READ(msr_content, msrs, i); CTRL_SET_ACTIVE(msr_content); CTRL_WRITE(msr_content, msrs, i); } } /* Global Control MSR is enabled by default when system power on. * However, this may not hold true when xenoprof starts to run. 
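 *
 * On CPUs with architectural perfmon, a counter only runs if both its
 * EVNTSEL enable bit and its bit in IA32_PERF_GLOBAL_CTRL are set.
 * Illustrative sketch of the enable mask written below, not part of the
 * build and assuming counters 0 .. num_counters-1 should all run:
 *
 *     if ( ppro_has_global_ctrl )
 *         wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, (1ULL << num_counters) - 1);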
*/ if ( ppro_has_global_ctrl ) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, (1ULL<= MSR_IA32_PERFCTR0) && (msr_index < (MSR_IA32_PERFCTR0 + num_counters)) ) { *type = MSR_TYPE_ARCH_COUNTER; *index = msr_index - MSR_IA32_PERFCTR0; return 1; } if ( (msr_index >= MSR_P6_EVNTSEL0) && (msr_index < (MSR_P6_EVNTSEL0 + num_counters)) ) { *type = MSR_TYPE_ARCH_CTRL; *index = msr_index - MSR_P6_EVNTSEL0; return 1; } return 0; } static int ppro_allocate_msr(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); struct arch_msr_pair *msr_content; msr_content = xzalloc_array(struct arch_msr_pair, num_counters); if ( !msr_content ) goto out; vpmu->context = (void *)msr_content; vpmu_clear(vpmu); vpmu_set(vpmu, VPMU_PASSIVE_DOMAIN_ALLOCATED); return 1; out: printk(XENLOG_G_WARNING "Insufficient memory for oprofile," " oprofile is unavailable on dom%d vcpu%d\n", v->vcpu_id, v->domain->domain_id); return 0; } static void ppro_free_msr(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); if ( !vpmu_is_set(vpmu, VPMU_PASSIVE_DOMAIN_ALLOCATED) ) return; xfree(vpmu->context); vpmu_reset(vpmu, VPMU_PASSIVE_DOMAIN_ALLOCATED); } static void ppro_load_msr(struct vcpu *v, int type, int index, u64 *msr_content) { struct arch_msr_pair *msrs = vcpu_vpmu(v)->context; switch ( type ) { case MSR_TYPE_ARCH_COUNTER: *msr_content = msrs[index].counter; break; case MSR_TYPE_ARCH_CTRL: *msr_content = msrs[index].control; break; } } static void ppro_save_msr(struct vcpu *v, int type, int index, u64 msr_content) { struct arch_msr_pair *msrs = vcpu_vpmu(v)->context; switch ( type ) { case MSR_TYPE_ARCH_COUNTER: msrs[index].counter = msr_content; break; case MSR_TYPE_ARCH_CTRL: msrs[index].control = msr_content; break; } } /* * Architectural performance monitoring. * * Newer Intel CPUs (Core1+) have support for architectural * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. * The advantage of this is that it can be done without knowing about * the specific CPU. */ void arch_perfmon_setup_counters(void) { union cpuid10_eax eax; eax.full = cpuid_eax(0xa); /* Workaround for BIOS bugs in 6/15. 
Taken from perfmon2 */ if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { eax.split.version_id = 2; eax.split.num_counters = 2; eax.split.bit_width = 40; } num_counters = min_t(u8, eax.split.num_counters, OP_MAX_COUNTER); op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; op_ppro_spec.num_counters = num_counters; op_ppro_spec.num_controls = num_counters; } struct op_x86_model_spec __read_mostly op_ppro_spec = { .num_counters = 2, .num_controls = 2, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, .start = &ppro_start, .stop = &ppro_stop, .is_arch_pmu_msr = &ppro_is_arch_pmu_msr, .allocated_msr = &ppro_allocate_msr, .free_msr = &ppro_free_msr, .load_msr = &ppro_load_msr, .save_msr = &ppro_save_msr }; struct op_x86_model_spec __read_mostly op_arch_perfmon_spec = { /* num_counters/num_controls filled in at runtime */ .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, .start = &ppro_start, .stop = &ppro_stop, .is_arch_pmu_msr = &ppro_is_arch_pmu_msr, .allocated_msr = &ppro_allocate_msr, .free_msr = &ppro_free_msr, .load_msr = &ppro_load_msr, .save_msr = &ppro_save_msr }; xen-4.4.0/xen/arch/x86/oprofile/xenoprof.c0000664000175000017500000000546012307313555016427 0ustar smbsmb/* * Copyright (C) 2005 Hewlett-Packard Co. * written by Aravind Menon & Jose Renato Santos * (email: xenoprof@groups.hp.com) * * Copyright (c) 2006 Isaku Yamahata * VA Linux Systems Japan K.K. * x86 specific part */ #include #include #include #include #include #include #include "op_counter.h" int xenoprof_arch_counter(XEN_GUEST_HANDLE_PARAM(void) arg) { struct xenoprof_counter counter; if ( copy_from_guest(&counter, arg, 1) ) return -EFAULT; if ( counter.ind >= OP_MAX_COUNTER ) return -E2BIG; counter_config[counter.ind].count = counter.count; counter_config[counter.ind].enabled = counter.enabled; counter_config[counter.ind].event = counter.event; counter_config[counter.ind].kernel = counter.kernel; counter_config[counter.ind].user = counter.user; counter_config[counter.ind].unit_mask = counter.unit_mask; return 0; } int xenoprof_arch_ibs_counter(XEN_GUEST_HANDLE_PARAM(void) arg) { struct xenoprof_ibs_counter ibs_counter; if ( copy_from_guest(&ibs_counter, arg, 1) ) return -EFAULT; ibs_config.op_enabled = ibs_counter.op_enabled; ibs_config.fetch_enabled = ibs_counter.fetch_enabled; ibs_config.max_cnt_fetch = ibs_counter.max_cnt_fetch; ibs_config.max_cnt_op = ibs_counter.max_cnt_op; ibs_config.rand_en = ibs_counter.rand_en; ibs_config.dispatched_ops = ibs_counter.dispatched_ops; return 0; } int compat_oprof_arch_counter(XEN_GUEST_HANDLE_PARAM(void) arg) { struct compat_oprof_counter counter; if ( copy_from_guest(&counter, arg, 1) ) return -EFAULT; if ( counter.ind >= OP_MAX_COUNTER ) return -E2BIG; counter_config[counter.ind].count = counter.count; counter_config[counter.ind].enabled = counter.enabled; counter_config[counter.ind].event = counter.event; counter_config[counter.ind].kernel = counter.kernel; counter_config[counter.ind].user = counter.user; counter_config[counter.ind].unit_mask = counter.unit_mask; return 0; } int xenoprofile_get_mode(struct vcpu *curr, const struct cpu_user_regs *regs) { if ( !guest_mode(regs) ) return 2; if ( !is_hvm_vcpu(curr) ) return guest_kernel_mode(curr, regs); switch ( hvm_guest_x86_mode(curr) ) { struct segment_register ss; case 0: /* real mode */ return 1; case 1: /* vm86 
mode */ return 0; default: hvm_get_segment_register(curr, x86_seg_ss, &ss); return (ss.sel & 3) != 3; } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/oprofile/backtrace.c0000664000175000017500000001135412307313555016505 0ustar smbsmb/** * @file backtrace.c * * @remark Copyright 2002 OProfile authors * @remark Read the file COPYING * * @author John Levon * @author David Smith * Modified for Xen by Amitabha Roy * */ #include #include #include #include struct frame_head { struct frame_head * ebp; unsigned long ret; } __attribute__((packed)); typedef struct frame_head frame_head_t; DEFINE_XEN_GUEST_HANDLE(frame_head_t); struct frame_head_32bit { uint32_t ebp; uint32_t ret; } __attribute__((packed)); typedef struct frame_head_32bit frame_head32_t; DEFINE_COMPAT_HANDLE(frame_head32_t); static struct frame_head * dump_hypervisor_backtrace(struct vcpu *vcpu, const struct frame_head *head, int mode) { if (!xenoprof_add_trace(vcpu, head->ret, mode)) return 0; /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ if (head >= head->ebp) return NULL; return head->ebp; } static inline int is_32bit_vcpu(struct vcpu *vcpu) { if (is_hvm_vcpu(vcpu)) return !hvm_long_mode_enabled(vcpu); else return is_pv_32bit_vcpu(vcpu); } static struct frame_head * dump_guest_backtrace(struct vcpu *vcpu, const struct frame_head *head, int mode) { frame_head_t bufhead; if ( is_32bit_vcpu(vcpu) ) { __compat_handle_const_frame_head32_t guest_head = { .c = (unsigned long)head }; frame_head32_t bufhead32; /* Also check accessibility of one struct frame_head beyond */ if (!compat_handle_okay(guest_head, 2)) return 0; if (__copy_from_compat(&bufhead32, guest_head, 1)) return 0; bufhead.ebp = (struct frame_head *)(unsigned long)bufhead32.ebp; bufhead.ret = bufhead32.ret; } else { XEN_GUEST_HANDLE(const_frame_head_t) guest_head; XEN_GUEST_HANDLE_PARAM(const_frame_head_t) guest_head_param = const_guest_handle_from_ptr(head, frame_head_t); guest_head = guest_handle_from_param(guest_head_param, const_frame_head_t); /* Also check accessibility of one struct frame_head beyond */ if (!guest_handle_okay(guest_head, 2)) return 0; if (__copy_from_guest(&bufhead, guest_head, 1)) return 0; } if (!xenoprof_add_trace(vcpu, bufhead.ret, mode)) return 0; /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ if (head >= bufhead.ebp) return NULL; return bufhead.ebp; } /* * | | /\ Higher addresses * | | * --------------- stack base (address of current_thread_info) * | thread info | * . . * | stack | * --------------- saved regs->ebp value if valid (frame_head address) * . . * --------------- saved regs->rsp value if x86_64 * | | * --------------- struct pt_regs * stored on stack if 32-bit * | | * . . * | | * --------------- %esp * | | * | | \/ Lower addresses * * Thus, regs (or regs->rsp for x86_64) <-> stack base restricts the * valid(ish) ebp values. Note: (1) for x86_64, NMI and several other * exceptions use special stacks, maintained by the interrupt stack table * (IST). These stacks are set up in trap_init() in * arch/x86_64/kernel/traps.c. Thus, for x86_64, regs now does not point * to the kernel stack; instead, it points to some location on the NMI * stack. On the other hand, regs->rsp is the stack pointer saved when the * NMI occurred. 
(2) For 32-bit, regs->esp is not valid because the * processor does not save %esp on the kernel stack when interrupts occur * in the kernel mode. */ #if defined(CONFIG_FRAME_POINTER) static int valid_hypervisor_stack(const struct frame_head *head, const struct cpu_user_regs *regs) { unsigned long headaddr = (unsigned long)head; unsigned long stack = (unsigned long)regs->rsp; unsigned long stack_base = (stack & ~(STACK_SIZE - 1)) + STACK_SIZE; return headaddr > stack && headaddr < stack_base; } #else /* without fp, it's just junk */ static int valid_hypervisor_stack(const struct frame_head *head, const struct cpu_user_regs *regs) { return 0; } #endif void xenoprof_backtrace(struct vcpu *vcpu, const struct cpu_user_regs *regs, unsigned long depth, int mode) { const struct frame_head *head = (void *)regs->ebp; if (mode > 1) { while (depth-- && valid_hypervisor_stack(head, regs)) head = dump_hypervisor_backtrace(vcpu, head, mode); return; } while (depth-- && head) head = dump_guest_backtrace(vcpu, head, mode); } xen-4.4.0/xen/arch/x86/oprofile/nmi_int.c0000664000175000017500000002355212307313555016226 0ustar smbsmb/** * @file nmi_int.c * * @remark Copyright 2002 OProfile authors * @remark Read the file COPYING * * @author John Levon * * Modified for Xen: by Aravind Menon & Jose Renato Santos * These modifications are: * Copyright (C) 2005 Hewlett-Packard Co. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "op_counter.h" #include "op_x86_model.h" struct op_counter_config counter_config[OP_MAX_COUNTER]; struct op_ibs_config ibs_config; struct op_x86_model_spec const *__read_mostly model; static struct op_msrs cpu_msrs[NR_CPUS]; static unsigned long saved_lvtpc[NR_CPUS]; static char *cpu_type; static int passive_domain_msr_op_checks(unsigned int msr, int *typep, int *indexp) { struct vpmu_struct *vpmu = vcpu_vpmu(current); if ( model == NULL ) return 0; if ( model->is_arch_pmu_msr == NULL ) return 0; if ( !model->is_arch_pmu_msr(msr, typep, indexp) ) return 0; if ( !vpmu_is_set(vpmu, VPMU_PASSIVE_DOMAIN_ALLOCATED) ) if ( ! 
model->allocated_msr(current) ) return 0; return 1; } int passive_domain_do_rdmsr(unsigned int msr, uint64_t *msr_content) { int type, index; if ( !passive_domain_msr_op_checks(msr, &type, &index)) return 0; model->load_msr(current, type, index, msr_content); return 1; } int passive_domain_do_wrmsr(unsigned int msr, uint64_t msr_content) { int type, index; if ( !passive_domain_msr_op_checks(msr, &type, &index)) return 0; model->save_msr(current, type, index, msr_content); return 1; } void passive_domain_destroy(struct vcpu *v) { struct vpmu_struct *vpmu = vcpu_vpmu(v); if ( vpmu_is_set(vpmu, VPMU_PASSIVE_DOMAIN_ALLOCATED) ) model->free_msr(v); } static int nmi_callback(struct cpu_user_regs *regs, int cpu) { int xen_mode, ovf; ovf = model->check_ctrs(cpu, &cpu_msrs[cpu], regs); xen_mode = ring_0(regs); if ( ovf && is_active(current->domain) && !xen_mode ) send_guest_vcpu_virq(current, VIRQ_XENOPROF); if ( ovf == 2 ) current->nmi_pending = 1; return 1; } static void nmi_cpu_save_registers(struct op_msrs *msrs) { unsigned int const nr_ctrs = model->num_counters; unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; for (i = 0; i < nr_ctrs; ++i) { rdmsrl(counters[i].addr, counters[i].value); } for (i = 0; i < nr_ctrls; ++i) { rdmsrl(controls[i].addr, controls[i].value); } } static void nmi_save_registers(void * dummy) { int cpu = smp_processor_id(); struct op_msrs * msrs = &cpu_msrs[cpu]; model->fill_in_addresses(msrs); nmi_cpu_save_registers(msrs); } static void free_msrs(void) { int i; for (i = 0; i < nr_cpu_ids; ++i) { xfree(cpu_msrs[i].counters); cpu_msrs[i].counters = NULL; xfree(cpu_msrs[i].controls); cpu_msrs[i].controls = NULL; } } static int allocate_msrs(void) { int success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; int i; for_each_online_cpu (i) { cpu_msrs[i].counters = xmalloc_bytes(counters_size); if (!cpu_msrs[i].counters) { success = 0; break; } cpu_msrs[i].controls = xmalloc_bytes(controls_size); if (!cpu_msrs[i].controls) { success = 0; break; } } if (!success) free_msrs(); return success; } static void nmi_cpu_setup(void * dummy) { int cpu = smp_processor_id(); struct op_msrs * msrs = &cpu_msrs[cpu]; model->setup_ctrs(msrs); } int nmi_setup_events(void) { on_each_cpu(nmi_cpu_setup, NULL, 1); return 0; } int nmi_reserve_counters(void) { if (!allocate_msrs()) return -ENOMEM; /* We walk a thin line between law and rape here. * We need to be careful to install our NMI handler * without actually triggering any NMIs as this will * break the core code horrifically. 
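 *
 * The resulting order is: allocate the per-CPU shadow MSR arrays,
 * reserve the LAPIC NMI source, and only then snapshot the live counter
 * and control MSRs everywhere.  Illustrative summary of the sequence,
 * not part of the build:
 *
 *     if ( !allocate_msrs() )
 *         return -ENOMEM;
 *     if ( reserve_lapic_nmi() < 0 )
 *     {
 *         free_msrs();
 *         return -EBUSY;
 *     }
 *     on_each_cpu(nmi_save_registers, NULL, 1);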
*/ if (reserve_lapic_nmi() < 0) { free_msrs(); return -EBUSY; } /* We need to serialize save and setup for HT because the subset * of msrs are distinct for save and setup operations */ on_each_cpu(nmi_save_registers, NULL, 1); return 0; } int nmi_enable_virq(void) { set_nmi_callback(nmi_callback); return 0; } void nmi_disable_virq(void) { unset_nmi_callback(); } static void nmi_restore_registers(struct op_msrs * msrs) { unsigned int const nr_ctrs = model->num_counters; unsigned int const nr_ctrls = model->num_controls; struct op_msr * counters = msrs->counters; struct op_msr * controls = msrs->controls; unsigned int i; for (i = 0; i < nr_ctrls; ++i) { wrmsrl(controls[i].addr, controls[i].value); } for (i = 0; i < nr_ctrs; ++i) { wrmsrl(counters[i].addr, counters[i].value); } } static void nmi_cpu_shutdown(void * dummy) { int cpu = smp_processor_id(); struct op_msrs * msrs = &cpu_msrs[cpu]; nmi_restore_registers(msrs); } void nmi_release_counters(void) { on_each_cpu(nmi_cpu_shutdown, NULL, 1); release_lapic_nmi(); free_msrs(); } static void nmi_cpu_start(void * dummy) { int cpu = smp_processor_id(); struct op_msrs const * msrs = &cpu_msrs[cpu]; saved_lvtpc[cpu] = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); model->start(msrs); } int nmi_start(void) { on_each_cpu(nmi_cpu_start, NULL, 1); return 0; } static void nmi_cpu_stop(void * dummy) { unsigned int v; int cpu = smp_processor_id(); struct op_msrs const * msrs = &cpu_msrs[cpu]; model->stop(msrs); /* restoring APIC_LVTPC can trigger an apic error because the delivery * mode and vector nr combination can be illegal. That's by design: on * power on apic lvt contain a zero vector nr which are legal only for * NMI delivery mode. So inhibit apic err before restoring lvtpc */ if ( !(apic_read(APIC_LVTPC) & APIC_DM_NMI) || (apic_read(APIC_LVTPC) & APIC_LVT_MASKED) ) { printk("nmi_stop: APIC not good %ul\n", apic_read(APIC_LVTPC)); mdelay(5000); } v = apic_read(APIC_LVTERR); apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); apic_write(APIC_LVTPC, saved_lvtpc[cpu]); apic_write(APIC_LVTERR, v); } void nmi_stop(void) { on_each_cpu(nmi_cpu_stop, NULL, 1); } static int __init p4_init(char ** cpu_type) { __u8 cpu_model = current_cpu_data.x86_model; if ((cpu_model > 6) || (cpu_model == 5)) { printk("xenoprof: Initialization failed. 
" "Intel processor model %d for pentium 4 family is not " "supported\n", cpu_model); return 0; } switch (current_cpu_data.x86_num_siblings) { case 1: *cpu_type = "i386/p4"; model = &op_p4_spec; return 1; case 2: *cpu_type = "i386/p4-ht"; model = &op_p4_ht2_spec; return 1; } printk("Xenoprof ERROR: P4 HyperThreading detected with > 2 threads\n"); return 0; } static int force_arch_perfmon; static int force_cpu_type(const char *str) { if (!strcmp(str, "arch_perfmon")) { force_arch_perfmon = 1; printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); } return 0; } custom_param("cpu_type", force_cpu_type); static int __init ppro_init(char ** cpu_type) { __u8 cpu_model = current_cpu_data.x86_model; if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; switch (cpu_model) { case 14: *cpu_type = "i386/core"; break; case 15: *cpu_type = "i386/core_2"; ppro_has_global_ctrl = 1; break; default: /* Unknown */ return 0; } model = &op_ppro_spec; return 1; } static int __init arch_perfmon_init(char **cpu_type) { if (!cpu_has_arch_perfmon) return 0; *cpu_type = "i386/arch_perfmon"; model = &op_arch_perfmon_spec; arch_perfmon_setup_counters(); ppro_has_global_ctrl = 1; return 1; } static int __init nmi_init(void) { __u8 vendor = current_cpu_data.x86_vendor; __u8 family = current_cpu_data.x86; __u8 _model = current_cpu_data.x86_model; if (!cpu_has_apic) { printk("xenoprof: Initialization failed. No APIC\n"); return -ENODEV; } switch (vendor) { case X86_VENDOR_AMD: /* Needs to be at least an Athlon (or hammer in 32bit mode) */ switch (family) { default: printk("xenoprof: Initialization failed. " "AMD processor family %d is not " "supported\n", family); return -ENODEV; case 0xf: model = &op_athlon_spec; cpu_type = "x86-64/hammer"; break; case 0x10: model = &op_athlon_spec; cpu_type = "x86-64/family10"; ibs_init(); break; case 0x11: model = &op_athlon_spec; cpu_type = "x86-64/family11h"; break; case 0x12: model = &op_athlon_spec; cpu_type = "x86-64/family12h"; break; case 0x14: model = &op_athlon_spec; cpu_type = "x86-64/family14h"; break; case 0x15: model = &op_amd_fam15h_spec; cpu_type = "x86-64/family15h"; break; case 0x16: model = &op_athlon_spec; cpu_type = "x86-64/family16h"; break; } break; case X86_VENDOR_INTEL: switch (family) { /* Pentium IV */ case 0xf: p4_init(&cpu_type); break; /* A P6-class processor */ case 6: ppro_init(&cpu_type); break; default: break; } if (!cpu_type && !arch_perfmon_init(&cpu_type)) { printk("xenoprof: Initialization failed. " "Intel processor family %d model %d" "is not supported\n", family, _model); return -ENODEV; } break; default: printk("xenoprof: Initialization failed. " "Unsupported processor. 
Unknown vendor %d\n", vendor); return -ENODEV; } return 0; } __initcall(nmi_init); int xenoprof_arch_init(int *num_events, char *_cpu_type) { if (cpu_type == NULL) return -ENODEV; *num_events = model->num_counters; strlcpy(_cpu_type, cpu_type, XENOPROF_CPU_TYPE_SIZE); return 0; } xen-4.4.0/xen/arch/x86/oprofile/op_model_p4.c0000664000175000017500000004433612307313555016775 0ustar smbsmb/** * @file op_model_p4.c * P4 model-specific MSR operations * * @remark Copyright 2002 OProfile authors * @remark Read the file COPYING * * @author Graydon Hoare */ #include #include #include #include #include #include #include #include #include "op_x86_model.h" #include "op_counter.h" #define NUM_EVENTS 39 #define NUM_COUNTERS_NON_HT 8 #define NUM_ESCRS_NON_HT 45 #define NUM_CCCRS_NON_HT 18 #define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT) #define NUM_COUNTERS_HT2 4 #define NUM_ESCRS_HT2 23 #define NUM_CCCRS_HT2 9 #define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) static unsigned int num_counters = NUM_COUNTERS_NON_HT; /* this has to be checked dynamically since the hyper-threadedness of a chip is discovered at kernel boot-time. */ static inline void setup_num_counters(void) { if (boot_cpu_data.x86_num_siblings == 2) /* XXX */ num_counters = NUM_COUNTERS_HT2; } static int inline addr_increment(void) { return boot_cpu_data.x86_num_siblings == 2 ? 2 : 1; } /* tables to simulate simplified hardware view of p4 registers */ struct p4_counter_binding { int virt_counter; int counter_address; int cccr_address; }; struct p4_event_binding { int escr_select; /* value to put in CCCR */ int event_select; /* value to put in ESCR */ struct { int virt_counter; /* for this counter... */ int escr_address; /* use this ESCR */ } bindings[2]; }; /* nb: these CTR_* defines are a duplicate of defines in event/i386.p4*events. */ #define CTR_BPU_0 (1 << 0) #define CTR_MS_0 (1 << 1) #define CTR_FLAME_0 (1 << 2) #define CTR_IQ_4 (1 << 3) #define CTR_BPU_2 (1 << 4) #define CTR_MS_2 (1 << 5) #define CTR_FLAME_2 (1 << 6) #define CTR_IQ_5 (1 << 7) static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 }, { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 }, { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 }, { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 }, { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } }; #define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT /* All cccr we don't use. */ static int p4_unused_cccr[NUM_UNUSED_CCCRS] = { MSR_P4_BPU_CCCR1, MSR_P4_BPU_CCCR3, MSR_P4_MS_CCCR1, MSR_P4_MS_CCCR3, MSR_P4_FLAME_CCCR1, MSR_P4_FLAME_CCCR3, MSR_P4_IQ_CCCR0, MSR_P4_IQ_CCCR1, MSR_P4_IQ_CCCR2, MSR_P4_IQ_CCCR3 }; /* p4 event codes in libop/op_event.h are indices into this table. 
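 *
 * The code is 1-based: event code N selects p4_events[N - 1], and each
 * entry lists the virtual counters (with their matching ESCRs) that the
 * event may be bound to.  Illustrative sketch of the lookup performed
 * by pmc_setup_one_p4_counter(), not part of the build:
 *
 *     if ( counter_config[ctr].event <= 0 ||
 *          counter_config[ctr].event > NUM_EVENTS )
 *         return;
 *     ev = &p4_events[counter_config[ctr].event - 1];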
*/ static const struct p4_event_binding p4_events[NUM_EVENTS] = { { /* BRANCH_RETIRED */ 0x05, 0x06, { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, {CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, { /* MISPRED_BRANCH_RETIRED */ 0x04, 0x03, { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, { CTR_IQ_5, MSR_P4_CRU_ESCR1} } }, { /* TC_DELIVER_MODE */ 0x01, 0x01, { { CTR_MS_0, MSR_P4_TC_ESCR0}, { CTR_MS_2, MSR_P4_TC_ESCR1} } }, { /* BPU_FETCH_REQUEST */ 0x00, 0x03, { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, { CTR_BPU_2, MSR_P4_BPU_ESCR1} } }, { /* ITLB_REFERENCE */ 0x03, 0x18, { { CTR_BPU_0, MSR_P4_ITLB_ESCR0}, { CTR_BPU_2, MSR_P4_ITLB_ESCR1} } }, { /* MEMORY_CANCEL */ 0x05, 0x02, { { CTR_FLAME_0, MSR_P4_DAC_ESCR0}, { CTR_FLAME_2, MSR_P4_DAC_ESCR1} } }, { /* MEMORY_COMPLETE */ 0x02, 0x08, { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } }, { /* LOAD_PORT_REPLAY */ 0x02, 0x04, { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } }, { /* STORE_PORT_REPLAY */ 0x02, 0x05, { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } }, { /* MOB_LOAD_REPLAY */ 0x02, 0x03, { { CTR_BPU_0, MSR_P4_MOB_ESCR0}, { CTR_BPU_2, MSR_P4_MOB_ESCR1} } }, { /* PAGE_WALK_TYPE */ 0x04, 0x01, { { CTR_BPU_0, MSR_P4_PMH_ESCR0}, { CTR_BPU_2, MSR_P4_PMH_ESCR1} } }, { /* BSQ_CACHE_REFERENCE */ 0x07, 0x0c, { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, { CTR_BPU_2, MSR_P4_BSU_ESCR1} } }, { /* IOQ_ALLOCATION */ 0x06, 0x03, { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, { 0, 0 } } }, { /* IOQ_ACTIVE_ENTRIES */ 0x06, 0x1a, { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, { 0, 0 } } }, { /* FSB_DATA_ACTIVITY */ 0x06, 0x17, { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, { CTR_BPU_2, MSR_P4_FSB_ESCR1} } }, { /* BSQ_ALLOCATION */ 0x07, 0x05, { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, { 0, 0 } } }, { /* BSQ_ACTIVE_ENTRIES */ 0x07, 0x06, { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, { 0, 0 } } }, { /* X87_ASSIST */ 0x05, 0x03, { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, { CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, { /* SSE_INPUT_ASSIST */ 0x01, 0x34, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* PACKED_SP_UOP */ 0x01, 0x08, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* PACKED_DP_UOP */ 0x01, 0x0c, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* SCALAR_SP_UOP */ 0x01, 0x0a, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* SCALAR_DP_UOP */ 0x01, 0x0e, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* 64BIT_MMX_UOP */ 0x01, 0x02, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* 128BIT_MMX_UOP */ 0x01, 0x1a, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* X87_FP_UOP */ 0x01, 0x04, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* X87_SIMD_MOVES_UOP */ 0x01, 0x2e, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* MACHINE_CLEAR */ 0x05, 0x02, { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, { CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, { /* GLOBAL_POWER_EVENTS */ 0x06, 0x13 /* older manual says 0x05, newer 0x13 */, { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, { CTR_BPU_2, MSR_P4_FSB_ESCR1} } }, { /* TC_MS_XFER */ 0x00, 0x05, { { CTR_MS_0, MSR_P4_MS_ESCR0}, { CTR_MS_2, MSR_P4_MS_ESCR1} } }, { /* UOP_QUEUE_WRITES */ 0x00, 0x09, { { CTR_MS_0, MSR_P4_MS_ESCR0}, { CTR_MS_2, MSR_P4_MS_ESCR1} } }, { /* FRONT_END_EVENT */ 0x05, 0x08, { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, { CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, { /* EXECUTION_EVENT */ 0x05, 0x0c, { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, { 
CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, { /* REPLAY_EVENT */ 0x05, 0x09, { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, { CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, { /* INSTR_RETIRED */ 0x04, 0x02, { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, { CTR_IQ_5, MSR_P4_CRU_ESCR1} } }, { /* UOPS_RETIRED */ 0x04, 0x01, { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, { CTR_IQ_5, MSR_P4_CRU_ESCR1} } }, { /* UOP_TYPE */ 0x02, 0x02, { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, { CTR_IQ_5, MSR_P4_RAT_ESCR1} } }, { /* RETIRED_MISPRED_BRANCH_TYPE */ 0x02, 0x05, { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, { CTR_MS_2, MSR_P4_TBPU_ESCR1} } }, { /* RETIRED_BRANCH_TYPE */ 0x02, 0x04, { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, { CTR_MS_2, MSR_P4_TBPU_ESCR1} } } }; #define MISC_PMC_ENABLED_P(x) ((x) & 1ULL << 7) #define ESCR_RESERVED_BITS 0x80000003ULL #define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS) #define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1ULL) << 2)) #define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1ULL) << 3)) #define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1ULL))) #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1ULL) << 1)) #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3fULL) << 25)) #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffffULL) << 9)) #define ESCR_READ(escr,ev,i) do {rdmsrl(ev->bindings[(i)].escr_address, (escr));} while (0) #define ESCR_WRITE(escr,ev,i) do {wrmsrl(ev->bindings[(i)].escr_address, (escr));} while (0) #define CCCR_RESERVED_BITS 0x38030FFFULL #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) #define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000ULL) #define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07ULL) << 13)) #define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1ULL<<26)) #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1ULL<<27)) #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1ULL<<12)) #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1ULL<<12)) #define CCCR_READ(msr_content, i) do {rdmsrl(p4_counters[(i)].cccr_address, (msr_content));} while (0) #define CCCR_WRITE(msr_content, i) do {wrmsrl(p4_counters[(i)].cccr_address, (msr_content));} while (0) #define CCCR_OVF_P(cccr) ((cccr) & (1ULL<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1ULL<<31))) #define CTR_READ(msr_content,i) do {rdmsrl(p4_counters[(i)].counter_address, (msr_content));} while (0) #define CTR_WRITE(msr_content,i) do {wrmsrl(p4_counters[(i)].counter_address, -(msr_content));} while (0) #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000ULL)) /* this assigns a "stagger" to the current CPU, which is used throughout the code in this module as an extra array offset, to select the "even" or "odd" part of all the divided resources. */ static unsigned int get_stagger(void) { int cpu = smp_processor_id(); return (cpu != cpumask_first(per_cpu(cpu_sibling_mask, cpu))); } /* finally, mediate access to a real hardware counter by passing a "virtual" counter numer to this macro, along with your stagger setting. */ #define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger))) static unsigned long reset_value[NUM_COUNTERS_NON_HT]; static void p4_fill_in_addresses(struct op_msrs * const msrs) { unsigned int i; unsigned int addr, stag; setup_num_counters(); stag = get_stagger(); /* the counter registers we pay attention to */ for (i = 0; i < num_counters; ++i) { msrs->counters[i].addr = p4_counters[VIRT_CTR(stag, i)].counter_address; } /* FIXME: bad feeling, we don't save the 10 counters we don't use. 
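 *
 * On a hyper-threaded package the two siblings share the hardware, so a
 * per-thread "stagger" (0 or 1) offsets both the virtual counter number
 * and the MSR addresses, and addr_increment() becomes 2 so each thread
 * only touches every other register.  Equivalences, shown here only as
 * illustration:
 *
 *     VIRT_CTR(stag, i)  ==  (i) + num_counters * (stag)
 *     addr_increment()   ==  (boot_cpu_data.x86_num_siblings == 2) ? 2 : 1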
*/ /* 18 CCCR registers */ for (i = 0, addr = MSR_P4_BPU_CCCR0 + stag; addr <= MSR_P4_IQ_CCCR5; ++i, addr += addr_increment()) { msrs->controls[i].addr = addr; } /* 43 ESCR registers in three or four discontiguous group */ for (addr = MSR_P4_BSU_ESCR0 + stag; addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) { msrs->controls[i].addr = addr; } /* no IQ_ESCR0/1 on some models, we save a seconde time BSU_ESCR0/1 * to avoid special case in nmi_{save|restore}_registers() */ if (boot_cpu_data.x86_model >= 0x3) { for (addr = MSR_P4_BSU_ESCR0 + stag; addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) { msrs->controls[i].addr = addr; } } else { for (addr = MSR_P4_IQ_ESCR0 + stag; addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) { msrs->controls[i].addr = addr; } } for (addr = MSR_P4_RAT_ESCR0 + stag; addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { msrs->controls[i].addr = addr; } for (addr = MSR_P4_MS_ESCR0 + stag; addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { msrs->controls[i].addr = addr; } for (addr = MSR_P4_IX_ESCR0 + stag; addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { msrs->controls[i].addr = addr; } /* there are 2 remaining non-contiguously located ESCRs */ if (num_counters == NUM_COUNTERS_NON_HT) { /* standard non-HT CPUs handle both remaining ESCRs*/ msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; } else if (stag == 0) { /* HT CPUs give the first remainder to the even thread, as the 32nd control register */ msrs->controls[i++].addr = MSR_P4_CRU_ESCR4; } else { /* and two copies of the second to the odd thread, for the 22st and 23nd control registers */ msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; } } static void pmc_setup_one_p4_counter(unsigned int ctr) { int i; int const maxbind = 2; uint64_t cccr = 0; uint64_t escr = 0; unsigned int counter_bit; const struct p4_event_binding *ev = NULL; unsigned int stag; stag = get_stagger(); /* convert from counter *number* to counter *bit* */ counter_bit = 1 << VIRT_CTR(stag, ctr); /* find our event binding structure. */ if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { printk(KERN_ERR "oprofile: P4 event code %#lx out of range\n", counter_config[ctr].event); return; } ev = &(p4_events[counter_config[ctr].event - 1]); for (i = 0; i < maxbind; i++) { if (ev->bindings[i].virt_counter & counter_bit) { /* modify ESCR */ ESCR_READ(escr, ev, i); ESCR_CLEAR(escr); if (stag == 0) { ESCR_SET_USR_0(escr, counter_config[ctr].user); ESCR_SET_OS_0(escr, counter_config[ctr].kernel); } else { ESCR_SET_USR_1(escr, counter_config[ctr].user); ESCR_SET_OS_1(escr, counter_config[ctr].kernel); } ESCR_SET_EVENT_SELECT(escr, ev->event_select); ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); ESCR_WRITE(escr, ev, i); /* modify CCCR */ CCCR_READ(cccr, VIRT_CTR(stag, ctr)); CCCR_CLEAR(cccr); CCCR_SET_REQUIRED_BITS(cccr); CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); if (stag == 0) { CCCR_SET_PMI_OVF_0(cccr); } else { CCCR_SET_PMI_OVF_1(cccr); } CCCR_WRITE(cccr, VIRT_CTR(stag, ctr)); return; } } printk(KERN_ERR "oprofile: P4 event code %#lx no binding, stag %d ctr %d\n", counter_config[ctr].event, stag, ctr); } static void p4_setup_ctrs(struct op_msrs const * const msrs) { unsigned int i; uint64_t msr_content; unsigned int addr; unsigned int stag; stag = get_stagger(); rdmsrl(MSR_IA32_MISC_ENABLE, msr_content); if (! 
MISC_PMC_ENABLED_P(msr_content)) { printk(KERN_ERR "oprofile: P4 PMC not available\n"); return; } /* clear the cccrs we will use */ for (i = 0 ; i < num_counters ; i++) { rdmsrl(p4_counters[VIRT_CTR(stag, i)].cccr_address, msr_content); CCCR_CLEAR(msr_content); CCCR_SET_REQUIRED_BITS(msr_content); wrmsrl(p4_counters[VIRT_CTR(stag, i)].cccr_address, msr_content); } /* clear cccrs outside our concern */ for (i = stag ; i < NUM_UNUSED_CCCRS ; i += addr_increment()) { rdmsrl(p4_unused_cccr[i], msr_content); CCCR_CLEAR(msr_content); CCCR_SET_REQUIRED_BITS(msr_content); wrmsrl(p4_unused_cccr[i], msr_content); } /* clear all escrs (including those outside our concern) */ for (addr = MSR_P4_BSU_ESCR0 + stag; addr < MSR_P4_IQ_ESCR0; addr += addr_increment()) { wrmsrl(addr, 0x0ULL); } /* On older models clear also MSR_P4_IQ_ESCR0/1 */ if (boot_cpu_data.x86_model < 0x3) { wrmsrl(MSR_P4_IQ_ESCR0, 0x0ULL); wrmsrl(MSR_P4_IQ_ESCR1, 0x0ULL); } for (addr = MSR_P4_RAT_ESCR0 + stag; addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) { wrmsrl(addr, 0x0ULL); } for (addr = MSR_P4_MS_ESCR0 + stag; addr <= MSR_P4_TC_ESCR1; addr += addr_increment()){ wrmsrl(addr, 0x0ULL); } for (addr = MSR_P4_IX_ESCR0 + stag; addr <= MSR_P4_CRU_ESCR3; addr += addr_increment()){ wrmsrl(addr, 0x0ULL); } if (num_counters == NUM_COUNTERS_NON_HT) { wrmsrl(MSR_P4_CRU_ESCR4, 0x0ULL); wrmsrl(MSR_P4_CRU_ESCR5, 0x0ULL); } else if (stag == 0) { wrmsrl(MSR_P4_CRU_ESCR4, 0x0ULL); } else { wrmsrl(MSR_P4_CRU_ESCR5, 0x0ULL); } /* setup all counters */ for (i = 0 ; i < num_counters ; ++i) { if (counter_config[i].enabled) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); } else { reset_value[i] = 0; } } } static int p4_check_ctrs(unsigned int const cpu, struct op_msrs const * const msrs, struct cpu_user_regs * const regs) { unsigned long ctr, stag, real; uint64_t msr_content; int i; int ovf = 0; unsigned long eip = regs->eip; int mode = xenoprofile_get_mode(current, regs); stag = get_stagger(); for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; /* * there is some eccentricity in the hardware which * requires that we perform 2 extra corrections: * * - check both the CCCR:OVF flag for overflow and the * counter high bit for un-flagged overflows. * * - write the counter back twice to ensure it gets * updated properly. * * the former seems to be related to extra NMIs happening * during the current NMI; the latter is reported as errata * N15 in intel doc 249199-029, pentium 4 specification * update, though their suggested work-around does not * appear to solve the problem. 
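 *
 * Illustrative summary of the handling that follows, not part of the
 * build and with names abbreviated: the first quirk is covered by
 * testing both overflow indications, the second by reloading the
 * counter twice.
 *
 *     if ( CCCR_OVF_P(cccr) || CTR_OVERFLOW_P(ctr) )
 *     {
 *         xenoprof_log_event(current, regs, eip, mode, i);
 *         CTR_WRITE(reset_value[i], real);
 *         CCCR_CLEAR_OVF(cccr);
 *         CCCR_WRITE(cccr, real);
 *         CTR_WRITE(reset_value[i], real);
 *     }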
*/ real = VIRT_CTR(stag, i); CCCR_READ(msr_content, real); CTR_READ(ctr, real); if (CCCR_OVF_P(msr_content) || CTR_OVERFLOW_P(ctr)) { xenoprof_log_event(current, regs, eip, mode, i); CTR_WRITE(reset_value[i], real); CCCR_CLEAR_OVF(msr_content); CCCR_WRITE(msr_content, real); CTR_WRITE(reset_value[i], real); ovf = 1; } } /* P4 quirk: you have to re-unmask the apic vector */ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); return ovf; } static void p4_start(struct op_msrs const * const msrs) { unsigned int stag; uint64_t msr_content; int i; stag = get_stagger(); for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; CCCR_READ(msr_content, VIRT_CTR(stag, i)); CCCR_SET_ENABLE(msr_content); CCCR_WRITE(msr_content, VIRT_CTR(stag, i)); } } static void p4_stop(struct op_msrs const * const msrs) { unsigned int stag; uint64_t msr_content; int i; stag = get_stagger(); for (i = 0; i < num_counters; ++i) { CCCR_READ(msr_content, VIRT_CTR(stag, i)); CCCR_SET_DISABLE(msr_content); CCCR_WRITE(msr_content, VIRT_CTR(stag, i)); } } struct op_x86_model_spec const op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, .start = &p4_start, .stop = &p4_stop }; struct op_x86_model_spec const op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, .start = &p4_start, .stop = &p4_stop }; xen-4.4.0/xen/arch/x86/oprofile/op_model_athlon.c0000664000175000017500000003523212307313555017732 0ustar smbsmb/** * @file op_model_athlon.h * athlon / K7 model-specific MSR operations * * @remark Copyright 2002 OProfile authors * @remark Read the file COPYING * * @author John Levon * @author Philippe Elie * @author Graydon Hoare */ #include #include #include #include #include #include #include #include #include #include #include "op_x86_model.h" #include "op_counter.h" #define K7_NUM_COUNTERS 4 #define K7_NUM_CONTROLS 4 #define FAM15H_NUM_COUNTERS 6 #define FAM15H_NUM_CONTROLS 6 #define MAX_COUNTERS FAM15H_NUM_COUNTERS #define CTR_READ(msr_content,msrs,c) do {rdmsrl(msrs->counters[(c)].addr, (msr_content));} while (0) #define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1);} while (0) #define CTR_OVERFLOWED(n) (!((n) & (1ULL<<31))) #define CTRL_READ(msr_content,msrs,c) do {rdmsrl(msrs->controls[(c)].addr, (msr_content));} while (0) #define CTRL_WRITE(msr_content,msrs,c) do {wrmsrl(msrs->controls[(c)].addr, (msr_content));} while (0) #define CTRL_SET_ACTIVE(n) (n |= (1ULL<<22)) #define CTRL_SET_INACTIVE(n) (n &= ~(1ULL<<22)) #define CTRL_CLEAR(val) (val &= (1ULL<<21)) #define CTRL_SET_ENABLE(val) (val |= 1ULL<<20) #define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16)) #define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17)) #define CTRL_SET_UM(val, m) (val |= ((m & 0xff) << 8)) #define CTRL_SET_EVENT(val, e) (val |= (((e >> 8) & 0xf) | (e & 0xff))) #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 0x1ULL) << 41)) #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 0x1ULL) << 40)) static unsigned long reset_value[MAX_COUNTERS]; extern char svm_stgi_label[]; u32 ibs_caps = 0; static u64 ibs_op_ctl; /* IBS cpuid feature detection */ #define IBS_CPUID_FEATURES 0x8000001b /* IBS MSRs */ #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 
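/*
 * The MSRs above belong to IBS instruction-fetch sampling; the ones
 * below belong to IBS execution (op) sampling.  Illustrative sketch of
 * arming op sampling with the IbsOpCtl bits defined further down, not
 * part of the build; op_max_cnt is a hypothetical value already scaled
 * to the hardware's IbsOpMaxCnt units:
 *
 *     u64 ctl = op_max_cnt & IBS_OP_MAX_CNT;
 *     if ( (ibs_caps & IBS_CAPS_OPCNT) && ibs_config.dispatched_ops )
 *         ctl |= IBS_OP_CNT_CTL;
 *     wrmsrl(MSR_AMD64_IBSOPCTL, ctl | IBS_OP_ENABLE);
 *
 * handle_ibs() reads the sampled state back from IBSOPRIP and the
 * IBSOPDATA registers once IBS_OP_VAL is set.
 */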
#define MSR_AMD64_IBSOPCTL 0xc0011033 #define MSR_AMD64_IBSOPRIP 0xc0011034 #define MSR_AMD64_IBSOPDATA 0xc0011035 #define MSR_AMD64_IBSOPDATA2 0xc0011036 #define MSR_AMD64_IBSOPDATA3 0xc0011037 #define MSR_AMD64_IBSDCLINAD 0xc0011038 #define MSR_AMD64_IBSDCPHYSAD 0xc0011039 #define MSR_AMD64_IBSCTL 0xc001103a /* * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but * bit 0 is used to indicate the existence of IBS. */ #define IBS_CAPS_AVAIL (1LL<<0) #define IBS_CAPS_RDWROPCNT (1LL<<3) #define IBS_CAPS_OPCNT (1LL<<4) /* IBS randomization macros */ #define IBS_RANDOM_BITS 12 #define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) #define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) /* IbsFetchCtl bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) #define IBS_FETCH_CNT 0xFFFF0000ULL #define IBS_FETCH_MAX_CNT 0x0000FFFFULL /* IbsOpCtl bits */ #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) #define IBS_OP_MAX_CNT 0x0000FFFFULL /* IBS sample identifier */ #define IBS_FETCH_CODE 13 #define IBS_OP_CODE 14 #define clamp(val, min, max) ({ \ typeof(val) __val = (val); \ typeof(min) __min = (min); \ typeof(max) __max = (max); \ (void) (&__val == &__min); \ (void) (&__val == &__max); \ __val = __val < __min ? __min: __val; \ __val > __max ? __max: __val; }) /* * 16-bit Linear Feedback Shift Register (LFSR) */ static unsigned int lfsr_random(void) { static unsigned int lfsr_value = 0xF00D; unsigned int bit; /* Compute next bit to shift in */ bit = ((lfsr_value >> 0) ^ (lfsr_value >> 2) ^ (lfsr_value >> 3) ^ (lfsr_value >> 5)) & 0x0001; /* Advance to next register value */ lfsr_value = (lfsr_value >> 1) | (bit << 15); return lfsr_value; } /* * IBS software randomization * * The IBS periodic op counter is randomized in software. The lower 12 * bits of the 20 bit counter are randomized. IbsOpCurCnt is * initialized with a 12 bit random value. */ static inline u64 op_amd_randomize_ibs_op(u64 val) { unsigned int random = lfsr_random(); if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) /* * Work around if the hw can not write to IbsOpCurCnt * * Randomize the lower 8 bits of the 16 bit * IbsOpMaxCnt [15:0] value in the range of -128 to * +127 by adding/subtracting an offset to the * maximum count (IbsOpMaxCnt). * * To avoid over or underflows and protect upper bits * starting at bit 16, the initial value for * IbsOpMaxCnt must fit in the range from 0x0081 to * 0xff80. 
*/ val += (s8)(random >> 4); else val |= (u64)(random & IBS_RANDOM_MASK) << 32; return val; } static void athlon_fill_in_addresses(struct op_msrs * const msrs) { msrs->counters[0].addr = MSR_K7_PERFCTR0; msrs->counters[1].addr = MSR_K7_PERFCTR1; msrs->counters[2].addr = MSR_K7_PERFCTR2; msrs->counters[3].addr = MSR_K7_PERFCTR3; msrs->controls[0].addr = MSR_K7_EVNTSEL0; msrs->controls[1].addr = MSR_K7_EVNTSEL1; msrs->controls[2].addr = MSR_K7_EVNTSEL2; msrs->controls[3].addr = MSR_K7_EVNTSEL3; } static void fam15h_fill_in_addresses(struct op_msrs * const msrs) { msrs->counters[0].addr = MSR_AMD_FAM15H_PERFCTR0; msrs->counters[1].addr = MSR_AMD_FAM15H_PERFCTR1; msrs->counters[2].addr = MSR_AMD_FAM15H_PERFCTR2; msrs->counters[3].addr = MSR_AMD_FAM15H_PERFCTR3; msrs->counters[4].addr = MSR_AMD_FAM15H_PERFCTR4; msrs->counters[5].addr = MSR_AMD_FAM15H_PERFCTR5; msrs->controls[0].addr = MSR_AMD_FAM15H_EVNTSEL0; msrs->controls[1].addr = MSR_AMD_FAM15H_EVNTSEL1; msrs->controls[2].addr = MSR_AMD_FAM15H_EVNTSEL2; msrs->controls[3].addr = MSR_AMD_FAM15H_EVNTSEL3; msrs->controls[4].addr = MSR_AMD_FAM15H_EVNTSEL4; msrs->controls[5].addr = MSR_AMD_FAM15H_EVNTSEL5; } static void athlon_setup_ctrs(struct op_msrs const * const msrs) { uint64_t msr_content; int i; unsigned int const nr_ctrs = model->num_counters; unsigned int const nr_ctrls = model->num_controls; /* clear all counters */ for (i = 0 ; i < nr_ctrls; ++i) { CTRL_READ(msr_content, msrs, i); CTRL_CLEAR(msr_content); CTRL_WRITE(msr_content, msrs, i); } /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < nr_ctrs; ++i) { CTR_WRITE(1, msrs, i); } /* enable active counters */ for (i = 0; i < nr_ctrs; ++i) { if (counter_config[i].enabled) { reset_value[i] = counter_config[i].count; CTR_WRITE(counter_config[i].count, msrs, i); CTRL_READ(msr_content, msrs, i); CTRL_CLEAR(msr_content); CTRL_SET_ENABLE(msr_content); CTRL_SET_USR(msr_content, counter_config[i].user); CTRL_SET_KERN(msr_content, counter_config[i].kernel); CTRL_SET_UM(msr_content, counter_config[i].unit_mask); CTRL_SET_EVENT(msr_content, counter_config[i].event); CTRL_SET_HOST_ONLY(msr_content, 0); CTRL_SET_GUEST_ONLY(msr_content, 0); CTRL_WRITE(msr_content, msrs, i); } else { reset_value[i] = 0; } } } static inline void ibs_log_event(u64 data, struct cpu_user_regs * const regs, int mode) { struct vcpu *v = current; u32 temp = 0; temp = data & 0xFFFFFFFF; xenoprof_log_event(v, regs, temp, mode, 0); temp = (data >> 32) & 0xFFFFFFFF; xenoprof_log_event(v, regs, temp, mode, 0); } static inline int handle_ibs(int mode, struct cpu_user_regs * const regs) { u64 val, ctl; struct vcpu *v = current; if (!ibs_caps) return 1; if (ibs_config.fetch_enabled) { rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); if (ctl & IBS_FETCH_VAL) { rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); xenoprof_log_event(v, regs, IBS_FETCH_CODE, mode, 0); xenoprof_log_event(v, regs, val, mode, 0); ibs_log_event(val, regs, mode); ibs_log_event(ctl, regs, mode); rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); ibs_log_event(val, regs, mode); /* reenable the IRQ */ ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT); ctl |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } } if (ibs_config.op_enabled) { rdmsrl(MSR_AMD64_IBSOPCTL, ctl); if (ctl & IBS_OP_VAL) { rdmsrl(MSR_AMD64_IBSOPRIP, val); xenoprof_log_event(v, regs, IBS_OP_CODE, mode, 0); xenoprof_log_event(v, regs, val, mode, 0); ibs_log_event(val, regs, mode); rdmsrl(MSR_AMD64_IBSOPDATA, val); ibs_log_event(val, regs, mode); rdmsrl(MSR_AMD64_IBSOPDATA2, val); ibs_log_event(val, regs, 
mode); rdmsrl(MSR_AMD64_IBSOPDATA3, val); ibs_log_event(val, regs, mode); rdmsrl(MSR_AMD64_IBSDCLINAD, val); ibs_log_event(val, regs, mode); rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); ibs_log_event(val, regs, mode); /* reenable the IRQ */ ctl = op_amd_randomize_ibs_op(ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } return 1; } static int athlon_check_ctrs(unsigned int const cpu, struct op_msrs const * const msrs, struct cpu_user_regs * const regs) { uint64_t msr_content; int i; int ovf = 0; unsigned long eip = regs->eip; int mode = 0; struct vcpu *v = current; struct cpu_user_regs *guest_regs = guest_cpu_user_regs(); unsigned int const nr_ctrs = model->num_counters; if (!guest_mode(regs) && (regs->eip == (unsigned long)svm_stgi_label)) { /* SVM guest was running when NMI occurred */ ASSERT(is_hvm_vcpu(v)); eip = guest_regs->eip; mode = xenoprofile_get_mode(v, guest_regs); } else { eip = regs->eip; mode = xenoprofile_get_mode(v, regs); } for (i = 0 ; i < nr_ctrs; ++i) { CTR_READ(msr_content, msrs, i); if (CTR_OVERFLOWED(msr_content)) { xenoprof_log_event(current, regs, eip, mode, i); CTR_WRITE(reset_value[i], msrs, i); ovf = 1; } } ovf = handle_ibs(mode, regs); /* See op_model_ppro.c */ return ovf; } static inline void start_ibs(void) { u64 val = 0; if (!ibs_caps) return; if (ibs_config.fetch_enabled) { val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; val |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, val); } if (ibs_config.op_enabled) { ibs_op_ctl = ibs_config.max_cnt_op >> 4; if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { /* * IbsOpCurCnt not supported. See * op_amd_randomize_ibs_op() for details. */ ibs_op_ctl = clamp((unsigned long long)ibs_op_ctl, 0x0081ULL, 0xFF80ULL); } else { /* * The start value is randomized with a * positive offset, we need to compensate it * with the half of the randomized range. Also * avoid underflows. 
*/ ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, IBS_OP_MAX_CNT); } if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) ibs_op_ctl |= IBS_OP_CNT_CTL; ibs_op_ctl |= IBS_OP_ENABLE; val = op_amd_randomize_ibs_op(ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, val); } } static void athlon_start(struct op_msrs const * const msrs) { uint64_t msr_content; int i; unsigned int const nr_ctrs = model->num_counters; for (i = 0 ; i < nr_ctrs ; ++i) { if (reset_value[i]) { CTRL_READ(msr_content, msrs, i); CTRL_SET_ACTIVE(msr_content); CTRL_WRITE(msr_content, msrs, i); } } start_ibs(); } static void stop_ibs(void) { if (!ibs_caps) return; if (ibs_config.fetch_enabled) /* clear max count and enable */ wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); if (ibs_config.op_enabled) /* clear max count and enable */ wrmsrl(MSR_AMD64_IBSOPCTL, 0); } static void athlon_stop(struct op_msrs const * const msrs) { uint64_t msr_content; int i; unsigned int const nr_ctrs = model->num_counters; /* Subtle: stop on all counters to avoid race with * setting our pm callback */ for (i = 0 ; i < nr_ctrs ; ++i) { CTRL_READ(msr_content, msrs, i); CTRL_SET_INACTIVE(msr_content); CTRL_WRITE(msr_content, msrs, i); } stop_ibs(); } #define IBSCTL_LVTOFFSETVAL (1 << 8) #define APIC_EILVT_MSG_NMI 0x4 #define APIC_EILVT_LVTOFF_IBS 1 #define APIC_EILVTn(n) (0x500 + 0x10 * n) static inline void __init init_ibs_nmi_per_cpu(void *arg) { unsigned long reg; reg = (APIC_EILVT_LVTOFF_IBS << 4) + APIC_EILVTn(0); apic_write(reg, APIC_EILVT_MSG_NMI << 8); } #define PCI_VENDOR_ID_AMD 0x1022 #define PCI_DEVICE_ID_AMD_10H_NB_MISC 0x1203 #define IBSCTL 0x1cc static int __init init_ibs_nmi(void) { int bus, dev, func; u32 id, value; u16 vendor_id, dev_id; int nodes; /* per CPU setup */ on_each_cpu(init_ibs_nmi_per_cpu, NULL, 1); nodes = 0; for (bus = 0; bus < 256; bus++) { for (dev = 0; dev < 32; dev++) { for (func = 0; func < 8; func++) { id = pci_conf_read32(0, bus, dev, func, PCI_VENDOR_ID); vendor_id = id & 0xffff; dev_id = (id >> 16) & 0xffff; if ((vendor_id == PCI_VENDOR_ID_AMD) && (dev_id == PCI_DEVICE_ID_AMD_10H_NB_MISC)) { pci_conf_write32(0, bus, dev, func, IBSCTL, IBSCTL_LVTOFFSETVAL | APIC_EILVT_LVTOFF_IBS); value = pci_conf_read32(0, bus, dev, func, IBSCTL); if (value != (IBSCTL_LVTOFFSETVAL | APIC_EILVT_LVTOFF_IBS)) { printk("Xenoprofile: Failed to setup IBS LVT offset, " "IBSCTL = %#x\n", value); return 1; } nodes++; } } } } if (!nodes) { printk("Xenoprofile: No CPU node configured for IBS\n"); return 1; } return 0; } static void __init get_ibs_caps(void) { unsigned int max_level; if (!boot_cpu_has(X86_FEATURE_IBS)) return; /* check IBS cpuid feature flags */ max_level = cpuid_eax(0x80000000); if (max_level >= IBS_CPUID_FEATURES) ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); if (!(ibs_caps & IBS_CAPS_AVAIL)) /* cpuid flags not valid */ ibs_caps = 0; } void __init ibs_init(void) { get_ibs_caps(); if ( !ibs_caps ) return; if (init_ibs_nmi()) { ibs_caps = 0; return; } printk("Xenoprofile: AMD IBS detected (%#x)\n", (unsigned)ibs_caps); } struct op_x86_model_spec const op_athlon_spec = { .num_counters = K7_NUM_COUNTERS, .num_controls = K7_NUM_CONTROLS, .fill_in_addresses = &athlon_fill_in_addresses, .setup_ctrs = &athlon_setup_ctrs, .check_ctrs = &athlon_check_ctrs, .start = &athlon_start, .stop = &athlon_stop }; struct op_x86_model_spec const op_amd_fam15h_spec = { .num_counters = FAM15H_NUM_COUNTERS, .num_controls = FAM15H_NUM_CONTROLS, .fill_in_addresses = &fam15h_fill_in_addresses, .setup_ctrs = &athlon_setup_ctrs, .check_ctrs = &athlon_check_ctrs, 
.start = &athlon_start, .stop = &athlon_stop }; xen-4.4.0/xen/arch/x86/oprofile/op_counter.h0000664000175000017500000000150512307313555016745 0ustar smbsmb/** * @file op_counter.h * * @remark Copyright 2002 OProfile authors * @remark Read the file COPYING * * @author John Levon */ #ifndef OP_COUNTER_H #define OP_COUNTER_H #define OP_MAX_COUNTER 8 /* Per-perfctr configuration as set via * oprofilefs. */ struct op_counter_config { unsigned long count; unsigned long enabled; unsigned long event; unsigned long kernel; unsigned long user; unsigned long unit_mask; }; extern struct op_counter_config counter_config[]; /* AMD IBS configuration */ struct op_ibs_config { unsigned long op_enabled; unsigned long fetch_enabled; unsigned long max_cnt_fetch; unsigned long max_cnt_op; unsigned long rand_en; unsigned long dispatched_ops; }; extern struct op_ibs_config ibs_config; #endif /* OP_COUNTER_H */ xen-4.4.0/xen/arch/x86/percpu.c0000664000175000017500000000460212307313555014243 0ustar smbsmb#include #include #include #include #include #include unsigned long __per_cpu_offset[NR_CPUS]; /* * Force uses of per_cpu() with an invalid area to attempt to access the * middle of the non-canonical address space resulting in a #GP, rather than a * possible #PF at (NULL + a little) which has security implications in the * context of PV guests. */ #define INVALID_PERCPU_AREA (0x8000000000000000L - (long)__per_cpu_start) #define PERCPU_ORDER (get_order_from_bytes(__per_cpu_data_end-__per_cpu_start)) void __init percpu_init_areas(void) { unsigned int cpu; for ( cpu = 1; cpu < NR_CPUS; cpu++ ) __per_cpu_offset[cpu] = INVALID_PERCPU_AREA; } static int init_percpu_area(unsigned int cpu) { char *p; if ( __per_cpu_offset[cpu] != INVALID_PERCPU_AREA ) return -EBUSY; if ( (p = alloc_xenheap_pages(PERCPU_ORDER, 0)) == NULL ) return -ENOMEM; memset(p, 0, __per_cpu_data_end - __per_cpu_start); __per_cpu_offset[cpu] = p - __per_cpu_start; return 0; } struct free_info { unsigned int cpu; struct rcu_head rcu; }; static DEFINE_PER_CPU(struct free_info, free_info); static void _free_percpu_area(struct rcu_head *head) { struct free_info *info = container_of(head, struct free_info, rcu); unsigned int cpu = info->cpu; char *p = __per_cpu_start + __per_cpu_offset[cpu]; free_xenheap_pages(p, PERCPU_ORDER); __per_cpu_offset[cpu] = INVALID_PERCPU_AREA; } static void free_percpu_area(unsigned int cpu) { struct free_info *info = &per_cpu(free_info, cpu); info->cpu = cpu; call_rcu(&info->rcu, _free_percpu_area); } static int cpu_percpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; switch ( action ) { case CPU_UP_PREPARE: rc = init_percpu_area(cpu); break; case CPU_UP_CANCELED: case CPU_DEAD: free_percpu_area(cpu); break; default: break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_percpu_nfb = { .notifier_call = cpu_percpu_callback, .priority = 100 /* highest priority */ }; static int __init percpu_presmp_init(void) { register_cpu_notifier(&cpu_percpu_nfb); return 0; } presmp_initcall(percpu_presmp_init); xen-4.4.0/xen/arch/x86/numa.c0000664000175000017500000002427112307313555013711 0ustar smbsmb/* * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. 
* Adapted for Xen: Ryan Harper */ #include #include #include #include #include #include #include #include #include #include #include #include static int numa_setup(char *s); custom_param("numa", numa_setup); #ifndef Dprintk #define Dprintk(x...) #endif /* from proto.h */ #define round_up(x,y) ((((x)+(y))-1) & (~((y)-1))) struct node_data node_data[MAX_NUMNODES]; /* Mapping from pdx to node id */ int memnode_shift; static typeof(*memnodemap) _memnodemap[64]; unsigned long memnodemapsize; u8 *memnodemap; unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; /* * Keep BIOS's CPU2node information, should not be used for memory allocaion */ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; nodemask_t __read_mostly node_online_map = { { [0] = 1UL } }; int numa_off __devinitdata = 0; int acpi_numa __devinitdata; int srat_disabled(void) { return numa_off || acpi_numa < 0; } /* * Given a shift value, try to populate memnodemap[] * Returns : * 1 if OK * 0 if memnodmap[] too small (of shift too small) * -1 if node overlap or lost ram (shift too big) */ static int __init populate_memnodemap(const struct node *nodes, int numnodes, int shift, int *nodeids) { unsigned long spdx, epdx; int i, res = -1; memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap)); for (i = 0; i < numnodes; i++) { spdx = paddr_to_pdx(nodes[i].start); epdx = paddr_to_pdx(nodes[i].end - 1) + 1; if (spdx >= epdx) continue; if ((epdx >> shift) >= memnodemapsize) return 0; do { if (memnodemap[spdx >> shift] != NUMA_NO_NODE) return -1; if (!nodeids) memnodemap[spdx >> shift] = i; else memnodemap[spdx >> shift] = nodeids[i]; spdx += (1UL << shift); } while (spdx < epdx); res = 1; } return res; } static int __init allocate_cachealigned_memnodemap(void) { unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap)); unsigned long mfn = alloc_boot_pages(size, 1); if (!mfn) { printk(KERN_ERR "NUMA: Unable to allocate Memory to Node hash map\n"); memnodemapsize = 0; return -1; } memnodemap = mfn_to_virt(mfn); mfn <<= PAGE_SHIFT; size <<= PAGE_SHIFT; printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", mfn, mfn + size); memnodemapsize = size / sizeof(*memnodemap); return 0; } /* * The LSB of all start and end addresses in the node map is the value of the * maximum possible shift. 
*/ static int __init extract_lsb_from_nodes(const struct node *nodes, int numnodes) { int i, nodes_used = 0; unsigned long spdx, epdx; unsigned long bitfield = 0, memtop = 0; for (i = 0; i < numnodes; i++) { spdx = paddr_to_pdx(nodes[i].start); epdx = paddr_to_pdx(nodes[i].end - 1) + 1; if (spdx >= epdx) continue; bitfield |= spdx; nodes_used++; if (epdx > memtop) memtop = epdx; } if (nodes_used <= 1) i = BITS_PER_LONG - 1; else i = find_first_bit(&bitfield, sizeof(unsigned long)*8); memnodemapsize = (memtop >> i) + 1; return i; } int __init compute_hash_shift(struct node *nodes, int numnodes, int *nodeids) { int shift; shift = extract_lsb_from_nodes(nodes, numnodes); if (memnodemapsize <= ARRAY_SIZE(_memnodemap)) memnodemap = _memnodemap; else if (allocate_cachealigned_memnodemap()) return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { printk(KERN_INFO "Your memory is not aligned you need to " "rebuild your kernel with a bigger NODEMAPSIZE " "shift=%d\n", shift); return -1; } return shift; } /* initialize NODE_DATA given nodeid and start/end */ void __init setup_node_bootmem(int nodeid, u64 start, u64 end) { unsigned long start_pfn, end_pfn; start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; NODE_DATA(nodeid)->node_id = nodeid; NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; node_set_online(nodeid); } void __init numa_init_array(void) { int rr, i; /* There are unfortunately some poorly designed mainboards around that only connect memory to a single CPU. This breaks the 1:1 cpu->node mapping. To avoid this fill in the mapping for all possible CPUs, as the number of CPUs is not known yet. We round robin the existing nodes. */ rr = first_node(node_online_map); for (i = 0; i < nr_cpu_ids; i++) { if (cpu_to_node[i] != NUMA_NO_NODE) continue; numa_set_node(i, rr); rr = next_node(rr, node_online_map); if (rr == MAX_NUMNODES) rr = first_node(node_online_map); } } #ifdef CONFIG_NUMA_EMU static int numa_fake __initdata = 0; /* Numa emulation */ static int __init numa_emulation(u64 start_pfn, u64 end_pfn) { int i; struct node nodes[MAX_NUMNODES]; u64 sz = ((end_pfn - start_pfn)< 1) { u64 x = 1; while ((x << 1) < sz) x <<= 1; if (x < sz/2) printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); sz = x; } memset(&nodes,0,sizeof(nodes)); for (i = 0; i < numa_fake; i++) { nodes[i].start = (start_pfn<> 20); node_set_online(i); } memnode_shift = compute_hash_shift(nodes, numa_fake, NULL); if (memnode_shift < 0) { memnode_shift = 0; printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); return -1; } for_each_online_node(i) setup_node_bootmem(i, nodes[i].start, nodes[i].end); numa_init_array(); return 0; } #endif void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { int i; #ifdef CONFIG_NUMA_EMU if (numa_fake && !numa_emulation(start_pfn, end_pfn)) return; #endif #ifdef CONFIG_ACPI_NUMA if (!numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT, (u64)end_pfn << PAGE_SHIFT)) return; #endif printk(KERN_INFO "%s\n", numa_off ? 
"NUMA turned off" : "No NUMA configuration found"); printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n", (u64)start_pfn << PAGE_SHIFT, (u64)end_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ memnode_shift = BITS_PER_LONG - 1; memnodemap = _memnodemap; nodes_clear(node_online_map); node_set_online(0); for (i = 0; i < nr_cpu_ids; i++) numa_set_node(i, 0); cpumask_copy(&node_to_cpumask[0], cpumask_of(0)); setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT, (u64)end_pfn << PAGE_SHIFT); } __cpuinit void numa_add_cpu(int cpu) { cpumask_set_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); } void __cpuinit numa_set_node(int cpu, int node) { cpu_to_node[cpu] = node; } /* [numa=off] */ static __init int numa_setup(char *opt) { if (!strncmp(opt,"off",3)) numa_off = 1; if (!strncmp(opt,"on",2)) numa_off = 0; #ifdef CONFIG_NUMA_EMU if(!strncmp(opt, "fake=", 5)) { numa_off = 0; numa_fake = simple_strtoul(opt+5,NULL,0); ; if (numa_fake >= MAX_NUMNODES) numa_fake = MAX_NUMNODES; } #endif #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt,"noacpi",6)) { numa_off = 0; acpi_numa = -1; } #endif return 1; } /* * Setup early cpu_to_node. * * Populate cpu_to_node[] only if x86_cpu_to_apicid[], * and apicid_to_node[] tables have valid entries for a CPU. * This means we skip cpu_to_node[] initialisation for NUMA * emulation and faking node case (when running a kernel compiled * for NUMA on a non NUMA box), which is OK as cpu_to_node[] * is already initialized in a round robin manner at numa_init_array, * prior to this call, and this initialization is good enough * for the fake NUMA cases. */ void __init init_cpu_to_node(void) { int i, node; for (i = 0; i < nr_cpu_ids; i++) { u32 apicid = x86_cpu_to_apicid[i]; if (apicid == BAD_APICID) continue; node = apicid_to_node[apicid]; if ( node == NUMA_NO_NODE || !node_online(node) ) node = 0; numa_set_node(i, node); } } EXPORT_SYMBOL(cpu_to_node); EXPORT_SYMBOL(node_to_cpumask); EXPORT_SYMBOL(memnode_shift); EXPORT_SYMBOL(memnodemap); EXPORT_SYMBOL(node_data); static void dump_numa(unsigned char key) { s_time_t now = NOW(); int i; struct domain *d; struct page_info *page; unsigned int page_num_node[MAX_NUMNODES]; printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key, (u32)(now>>32), (u32)now); for_each_online_node(i) { paddr_t pa = (paddr_t)(NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT; printk("idx%d -> NODE%d start->%lu size->%lu free->%lu\n", i, NODE_DATA(i)->node_id, NODE_DATA(i)->node_start_pfn, NODE_DATA(i)->node_spanned_pages, avail_node_heap_pages(i)); /* sanity check phys_to_nid() */ printk("phys_to_nid(%"PRIpaddr") -> %d should be %d\n", pa, phys_to_nid(pa), NODE_DATA(i)->node_id); } for_each_online_cpu(i) printk("CPU%d -> NODE%d\n", i, cpu_to_node[i]); rcu_read_lock(&domlist_read_lock); printk("Memory location of each domain:\n"); for_each_domain(d) { printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages); for_each_online_node(i) page_num_node[i] = 0; spin_lock(&d->page_alloc_lock); page_list_for_each(page, &d->page_list) { i = phys_to_nid((paddr_t)page_to_mfn(page) << PAGE_SHIFT); page_num_node[i]++; } spin_unlock(&d->page_alloc_lock); for_each_online_node(i) printk(" Node %u: %u\n", i, page_num_node[i]); } rcu_read_unlock(&domlist_read_lock); } static struct keyhandler dump_numa_keyhandler = { .diagnostic = 1, .u.fn = dump_numa, .desc = "dump numa info" }; static __init int register_numa_trigger(void) { register_keyhandler('u', &dump_numa_keyhandler); return 0; } __initcall(register_numa_trigger); 
xen-4.4.0/xen/arch/x86/nmi.c0000664000175000017500000003513212307313555013532 0ustar smbsmb/* * linux/arch/i386/nmi.c * * NMI watchdog support on APIC systems * * Started by Ingo Molnar * * Fixes: * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. * Mikael Pettersson : Power Management for local APIC NMI watchdog. * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. * Pavel Machek and * Mikael Pettersson : PM converted to driver model. Disable/enable API. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int nmi_watchdog = NMI_NONE; static unsigned int nmi_hz = HZ; static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; static DEFINE_PER_CPU(struct timer, nmi_timer); static DEFINE_PER_CPU(unsigned int, nmi_timer_ticks); /* opt_watchdog: If true, run a watchdog NMI on each processor. */ bool_t __initdata opt_watchdog = 0; boolean_param("watchdog", opt_watchdog); /* opt_watchdog_timeout: Number of seconds to wait before panic. */ static unsigned int opt_watchdog_timeout = 5; static void parse_watchdog_timeout(char * s) { opt_watchdog_timeout = simple_strtoull(s, NULL, 0); opt_watchdog = !!opt_watchdog_timeout; } custom_param("watchdog_timeout", parse_watchdog_timeout); /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: * - it may be reserved by some other driver, or not * - when not reserved by some other driver, it may be used for * the NMI watchdog, or not * * This is maintained separately from nmi_active because the NMI * watchdog may also be driven from the I/O APIC timer. */ static DEFINE_SPINLOCK(lapic_nmi_owner_lock); static unsigned int lapic_nmi_owner; #define LAPIC_NMI_WATCHDOG (1<<0) #define LAPIC_NMI_RESERVED (1<<1) /* nmi_active: * +1: the lapic NMI watchdog is active, but can be disabled * 0: the lapic NMI watchdog has not been set up, and cannot * be enabled * -1: the lapic NMI watchdog is disabled, but can be enabled */ int nmi_active; #define K7_EVNTSEL_ENABLE (1 << 22) #define K7_EVNTSEL_INT (1 << 20) #define K7_EVNTSEL_OS (1 << 17) #define K7_EVNTSEL_USR (1 << 16) #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING #define P6_EVNTSEL0_ENABLE (1 << 22) #define P6_EVNTSEL_INT (1 << 20) #define P6_EVNTSEL_OS (1 << 17) #define P6_EVNTSEL_USR (1 << 16) #define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 #define CORE_EVENT_CPU_CLOCKS_NOT_HALTED 0x3c #define P4_ESCR_EVENT_SELECT(N) ((N)<<25) #define P4_CCCR_OVF_PMI0 (1<<26) #define P4_CCCR_OVF_PMI1 (1<<27) #define P4_CCCR_THRESHOLD(N) ((N)<<20) #define P4_CCCR_COMPLEMENT (1<<19) #define P4_CCCR_COMPARE (1<<18) #define P4_CCCR_REQUIRED (3<<16) #define P4_CCCR_ESCR_SELECT(N) ((N)<<13) #define P4_CCCR_ENABLE (1<<12) /* * Set up IQ_PERFCTR0 to behave like a clock, by having IQ_CCCR0 filter * CRU_ESCR0 (with any non-null event selector) through a complemented * max threshold. 
[IA32-Vol3, Section 14.9.9] */ #define P4_NMI_CRU_ESCR0 P4_ESCR_EVENT_SELECT(0x3F) #define P4_NMI_IQ_CCCR0 \ (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) static void __init wait_for_nmis(void *p) { mdelay((10*1000)/nmi_hz); /* wait 10 ticks */ } int __init check_nmi_watchdog (void) { static unsigned int __initdata prev_nmi_count[NR_CPUS]; int cpu; if ( !nmi_watchdog ) return 0; printk("Testing NMI watchdog --- "); for_each_online_cpu ( cpu ) prev_nmi_count[cpu] = nmi_count(cpu); local_irq_enable(); /* Wait for 10 ticks. Busy-wait on all CPUs: the LAPIC counter that * the NMI watchdog uses only runs while the core's not halted */ if ( nmi_watchdog == NMI_LOCAL_APIC ) smp_call_function(wait_for_nmis, NULL, 0); wait_for_nmis(NULL); for_each_online_cpu ( cpu ) { if ( nmi_count(cpu) - prev_nmi_count[cpu] <= 5 ) printk("CPU#%d stuck. ", cpu); else printk("CPU#%d okay. ", cpu); } printk("\n"); /* * Now that we know it works we can reduce NMI frequency to * something more reasonable; makes a difference in some configs. * There's a limit to how slow we can go because writing the perfctr * MSRs only sets the low 32 bits, with the top 8 bits sign-extended * from those, so it's not possible to set up a delay larger than * 2^31 cycles and smaller than (2^40 - 2^31) cycles. * (Intel SDM, section 18.22.2) */ if ( nmi_watchdog == NMI_LOCAL_APIC ) nmi_hz = max(1ul, cpu_khz >> 20); return 0; } static void nmi_timer_fn(void *unused) { this_cpu(nmi_timer_ticks)++; set_timer(&this_cpu(nmi_timer), NOW() + MILLISECS(1000)); } void disable_lapic_nmi_watchdog(void) { if (nmi_active <= 0) return; switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: wrmsr(MSR_K7_EVNTSEL0, 0, 0); break; case X86_VENDOR_INTEL: switch (boot_cpu_data.x86) { case 6: wrmsr(MSR_P6_EVNTSEL0, 0, 0); break; case 15: wrmsr(MSR_P4_IQ_CCCR0, 0, 0); wrmsr(MSR_P4_CRU_ESCR0, 0, 0); break; } break; } nmi_active = -1; /* tell do_nmi() and others that we're not active any more */ nmi_watchdog = NMI_NONE; } static void enable_lapic_nmi_watchdog(void) { if (nmi_active < 0) { nmi_watchdog = NMI_LOCAL_APIC; setup_apic_nmi_watchdog(); } } int reserve_lapic_nmi(void) { unsigned int old_owner; spin_lock(&lapic_nmi_owner_lock); old_owner = lapic_nmi_owner; lapic_nmi_owner |= LAPIC_NMI_RESERVED; spin_unlock(&lapic_nmi_owner_lock); if (old_owner & LAPIC_NMI_RESERVED) return -EBUSY; if (old_owner & LAPIC_NMI_WATCHDOG) disable_lapic_nmi_watchdog(); return 0; } void release_lapic_nmi(void) { unsigned int new_owner; spin_lock(&lapic_nmi_owner_lock); new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; lapic_nmi_owner = new_owner; spin_unlock(&lapic_nmi_owner_lock); if (new_owner & LAPIC_NMI_WATCHDOG) enable_lapic_nmi_watchdog(); } #define __pminit __devinit /* * Activate the NMI watchdog via the local APIC. * Original code written by Keith Owens. 
*/ static void __pminit clear_msr_range(unsigned int base, unsigned int n) { unsigned int i; for (i = 0; i < n; i++) wrmsr(base+i, 0, 0); } static inline void write_watchdog_counter(const char *descr) { u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); if(descr) Dprintk("setting %s to -%#"PRIx64"\n", descr, count); wrmsrl(nmi_perfctr_msr, 0 - count); } static void __pminit setup_k7_watchdog(void) { unsigned int evntsel; nmi_perfctr_msr = MSR_K7_PERFCTR0; clear_msr_range(MSR_K7_EVNTSEL0, 4); clear_msr_range(MSR_K7_PERFCTR0, 4); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS | K7_EVNTSEL_USR | K7_NMI_EVENT; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); write_watchdog_counter("K7_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); } static void __pminit setup_p6_watchdog(unsigned counter) { unsigned int evntsel; nmi_perfctr_msr = MSR_P6_PERFCTR0; clear_msr_range(MSR_P6_EVNTSEL0, 2); clear_msr_range(MSR_P6_PERFCTR0, 2); evntsel = P6_EVNTSEL_INT | P6_EVNTSEL_OS | P6_EVNTSEL_USR | counter; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); write_watchdog_counter("P6_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); } static int __pminit setup_p4_watchdog(void) { uint64_t misc_enable; rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL)) return 0; nmi_perfctr_msr = MSR_P4_IQ_PERFCTR0; nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; if ( boot_cpu_data.x86_num_siblings == 2 ) nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; if (!(misc_enable & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) clear_msr_range(0x3F1, 2); /* MSR 0x3F0 seems to have a default value of 0xFC00, but current docs doesn't fully define it, so leave it alone for now. */ if (boot_cpu_data.x86_model >= 0x3) { /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ clear_msr_range(0x3A0, 26); clear_msr_range(0x3BC, 3); } else { clear_msr_range(0x3A0, 31); } clear_msr_range(0x3C0, 6); clear_msr_range(0x3C8, 6); clear_msr_range(0x3E0, 2); clear_msr_range(MSR_P4_BPU_CCCR0, 18); clear_msr_range(MSR_P4_BPU_PERFCTR0, 18); wrmsrl(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0); wrmsrl(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE); write_watchdog_counter("P4_IQ_COUNTER0"); apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val); return 1; } void __pminit setup_apic_nmi_watchdog(void) { if (!nmi_watchdog) return; switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: switch (boot_cpu_data.x86) { case 6: case 0xf ... 0x17: setup_k7_watchdog(); break; default: return; } break; case X86_VENDOR_INTEL: switch (boot_cpu_data.x86) { case 6: setup_p6_watchdog((boot_cpu_data.x86_model < 14) ? 
P6_EVENT_CPU_CLOCKS_NOT_HALTED : CORE_EVENT_CPU_CLOCKS_NOT_HALTED); break; case 15: if (!setup_p4_watchdog()) return; break; default: return; } break; default: return; } lapic_nmi_owner = LAPIC_NMI_WATCHDOG; nmi_active = 1; } static int cpu_nmi_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch ( action ) { case CPU_UP_PREPARE: init_timer(&per_cpu(nmi_timer, cpu), nmi_timer_fn, NULL, cpu); set_timer(&per_cpu(nmi_timer, cpu), NOW()); break; case CPU_UP_CANCELED: case CPU_DEAD: kill_timer(&per_cpu(nmi_timer, cpu)); break; default: break; } return NOTIFY_DONE; } static struct notifier_block cpu_nmi_nfb = { .notifier_call = cpu_nmi_callback }; static DEFINE_PER_CPU(unsigned int, last_irq_sums); static DEFINE_PER_CPU(unsigned int, alert_counter); static atomic_t watchdog_disable_count = ATOMIC_INIT(1); void watchdog_disable(void) { atomic_inc(&watchdog_disable_count); } void watchdog_enable(void) { atomic_dec(&watchdog_disable_count); } bool_t watchdog_enabled(void) { return !atomic_read(&watchdog_disable_count); } int __init watchdog_setup(void) { unsigned int cpu; /* * Activate periodic heartbeats. We cannot do this earlier during * setup because the timer infrastructure is not available. */ for_each_online_cpu ( cpu ) cpu_nmi_callback(&cpu_nmi_nfb, CPU_UP_PREPARE, (void *)(long)cpu); register_cpu_notifier(&cpu_nmi_nfb); watchdog_enable(); return 0; } void nmi_watchdog_tick(struct cpu_user_regs * regs) { unsigned int sum = this_cpu(nmi_timer_ticks); if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() ) { /* * Ayiee, looks like this CPU is stuck ... wait for the timeout * before doing the oops ... */ this_cpu(alert_counter)++; if ( this_cpu(alert_counter) == opt_watchdog_timeout*nmi_hz ) { console_force_unlock(); printk("Watchdog timer detects that CPU%d is stuck!\n", smp_processor_id()); fatal_trap(TRAP_nmi, regs); } } else { this_cpu(last_irq_sums) = sum; this_cpu(alert_counter) = 0; } if ( nmi_perfctr_msr ) { if ( nmi_perfctr_msr == MSR_P4_IQ_PERFCTR0 ) { /* * P4 quirks: * - An overflown perfctr will assert its interrupt * until the OVF flag in its CCCR is cleared. * - LVTPC is masked on interrupt and must be * unmasked by the LVTPC handler. */ wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val); apic_write(APIC_LVTPC, APIC_DM_NMI); } else if ( nmi_perfctr_msr == MSR_P6_PERFCTR0 ) { /* * Only P6 based Pentium M need to re-unmask the apic vector but * it doesn't hurt other P6 variants. */ apic_write(APIC_LVTPC, APIC_DM_NMI); } write_watchdog_counter(NULL); } } /* * For some reason the destination shorthand for self is not valid * when used with the NMI delivery mode. This is documented in Tables * 8-3 and 8-4 in IA32 Reference Manual Volume 3. We send the IPI to * our own APIC ID explicitly which is valid. 
*/ void self_nmi(void) { unsigned long flags; u32 id = get_apic_id(); local_irq_save(flags); apic_wait_icr_idle(); apic_icr_write(APIC_DM_NMI | APIC_DEST_PHYSICAL, id); local_irq_restore(flags); } static void do_nmi_trigger(unsigned char key) { printk("Triggering NMI on APIC ID %x\n", get_apic_id()); self_nmi(); } static struct keyhandler nmi_trigger_keyhandler = { .u.fn = do_nmi_trigger, .desc = "trigger an NMI" }; static void do_nmi_stats(unsigned char key) { int i; struct domain *d; struct vcpu *v; printk("CPU\tNMI\n"); for_each_online_cpu ( i ) printk("%3d\t%3d\n", i, nmi_count(i)); if ( ((d = dom0) == NULL) || (d->vcpu == NULL) || ((v = d->vcpu[0]) == NULL) ) return; i = v->async_exception_mask & (1 << VCPU_TRAP_NMI); if ( v->nmi_pending || i ) printk("dom0 vpu0: NMI %s%s\n", v->nmi_pending ? "pending " : "", i ? "masked " : ""); else printk("dom0 vcpu0: NMI neither pending nor masked\n"); } static struct keyhandler nmi_stats_keyhandler = { .diagnostic = 1, .u.fn = do_nmi_stats, .desc = "NMI statistics" }; static __init int register_nmi_trigger(void) { register_keyhandler('N', &nmi_trigger_keyhandler); register_keyhandler('n', &nmi_stats_keyhandler); return 0; } __initcall(register_nmi_trigger); xen-4.4.0/xen/arch/x86/machine_kexec.c0000664000175000017500000001270012307313555015526 0ustar smbsmb/****************************************************************************** * machine_kexec.c * * Copyright (C) 2013 Citrix Systems R&D Ltd. * * Portions derived from Linux's arch/x86/kernel/machine_kexec_64.c. * * Copyright (C) 2002-2005 Eric Biederman * * Xen port written by: * - Simon 'Horms' Horman * - Magnus Damm * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ #include #include #include #include #include #include #include /* * Add a mapping for a page to the page tables used during kexec. 
*/ int machine_kexec_add_page(struct kexec_image *image, unsigned long vaddr, unsigned long maddr) { struct page_info *l4_page; struct page_info *l3_page; struct page_info *l2_page; struct page_info *l1_page; l4_pgentry_t *l4 = NULL; l3_pgentry_t *l3 = NULL; l2_pgentry_t *l2 = NULL; l1_pgentry_t *l1 = NULL; int ret = -ENOMEM; l4_page = image->aux_page; if ( !l4_page ) { l4_page = kimage_alloc_control_page(image, 0); if ( !l4_page ) goto out; image->aux_page = l4_page; } l4 = __map_domain_page(l4_page); l4 += l4_table_offset(vaddr); if ( !(l4e_get_flags(*l4) & _PAGE_PRESENT) ) { l3_page = kimage_alloc_control_page(image, 0); if ( !l3_page ) goto out; l4e_write(l4, l4e_from_page(l3_page, __PAGE_HYPERVISOR)); } else l3_page = l4e_get_page(*l4); l3 = __map_domain_page(l3_page); l3 += l3_table_offset(vaddr); if ( !(l3e_get_flags(*l3) & _PAGE_PRESENT) ) { l2_page = kimage_alloc_control_page(image, 0); if ( !l2_page ) goto out; l3e_write(l3, l3e_from_page(l2_page, __PAGE_HYPERVISOR)); } else l2_page = l3e_get_page(*l3); l2 = __map_domain_page(l2_page); l2 += l2_table_offset(vaddr); if ( !(l2e_get_flags(*l2) & _PAGE_PRESENT) ) { l1_page = kimage_alloc_control_page(image, 0); if ( !l1_page ) goto out; l2e_write(l2, l2e_from_page(l1_page, __PAGE_HYPERVISOR)); } else l1_page = l2e_get_page(*l2); l1 = __map_domain_page(l1_page); l1 += l1_table_offset(vaddr); l1e_write(l1, l1e_from_pfn(maddr >> PAGE_SHIFT, __PAGE_HYPERVISOR)); ret = 0; out: if ( l1 ) unmap_domain_page(l1); if ( l2 ) unmap_domain_page(l2); if ( l3 ) unmap_domain_page(l3); if ( l4 ) unmap_domain_page(l4); return ret; } int machine_kexec_load(struct kexec_image *image) { void *code_page; int ret; switch ( image->arch ) { case EM_386: case EM_X86_64: break; default: return -EINVAL; } code_page = __map_domain_page(image->control_code_page); memcpy(code_page, kexec_reloc, kexec_reloc_size); unmap_domain_page(code_page); /* * Add a mapping for the control code page to the same virtual * address as kexec_reloc. This allows us to keep running after * these page tables are loaded in kexec_reloc. */ ret = machine_kexec_add_page(image, (unsigned long)kexec_reloc, page_to_maddr(image->control_code_page)); if ( ret < 0 ) return ret; return 0; } void machine_kexec_unload(struct kexec_image *image) { /* no-op. kimage_free() frees all control pages. */ } void machine_reboot_kexec(struct kexec_image *image) { BUG_ON(smp_processor_id() != 0); smp_send_stop(); machine_kexec(image); BUG(); } void machine_kexec(struct kexec_image *image) { int i; unsigned long reloc_flags = 0; /* We are about to permenantly jump out of the Xen context into the kexec * purgatory code. We really dont want to be still servicing interupts. */ local_irq_disable(); /* Now regular interrupts are disabled, we need to reduce the impact * of interrupts not disabled by 'cli'. * * The NMI handlers have already been set up nmi_shootdown_cpus(). All * pcpus other than us have the nmi_crash handler, while we have the nop * handler. * * The MCE handlers touch extensive areas of Xen code and data. At this * point, there is nothing we can usefully do, so set the nop handler. */ for ( i = 0; i < nr_cpu_ids; i++ ) { if ( idt_tables[i] == NULL ) continue; _update_gate_addr_lower(&idt_tables[i][TRAP_machine_check], &trap_nop); } /* Explicitly enable NMIs on this CPU. Some crashdump kernels do * not like running with NMIs disabled. 
*/ enable_nmis(); if ( image->arch == EM_386 ) reloc_flags |= KEXEC_RELOC_FLAG_COMPAT; kexec_reloc(page_to_maddr(image->control_code_page), page_to_maddr(image->aux_page), image->head, image->entry_maddr, reloc_flags); } int machine_kexec_get(xen_kexec_range_t *range) { if (range->range != KEXEC_RANGE_MA_XEN) return -EINVAL; return machine_kexec_get_xen(range); } void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_SYMBOL(dom_xen); VMCOREINFO_SYMBOL(dom_io); VMCOREINFO_SYMBOL_ALIAS(pgd_l4, idle_pg_table); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/genapic/0000775000175000017500000000000012307313555014205 5ustar smbsmbxen-4.4.0/xen/arch/x86/genapic/Makefile0000664000175000017500000000013412307313555015643 0ustar smbsmbobj-y += bigsmp.o obj-y += x2apic.o obj-y += default.o obj-y += delivery.o obj-y += probe.o xen-4.4.0/xen/arch/x86/genapic/x2apic.c0000664000175000017500000001752112307313555015545 0ustar smbsmb/* * x2APIC driver. * * Copyright (c) 2008, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include static DEFINE_PER_CPU_READ_MOSTLY(u32, cpu_2_logical_apicid); static DEFINE_PER_CPU_READ_MOSTLY(cpumask_t *, cluster_cpus); static cpumask_t *cluster_cpus_spare; static DEFINE_PER_CPU(cpumask_var_t, scratch_mask); static inline u32 x2apic_cluster(unsigned int cpu) { return per_cpu(cpu_2_logical_apicid, cpu) >> 16; } static void init_apic_ldr_x2apic_phys(void) { } static void init_apic_ldr_x2apic_cluster(void) { unsigned int cpu, this_cpu = smp_processor_id(); per_cpu(cpu_2_logical_apicid, this_cpu) = apic_read(APIC_LDR); if ( per_cpu(cluster_cpus, this_cpu) ) { ASSERT(cpumask_test_cpu(this_cpu, per_cpu(cluster_cpus, this_cpu))); return; } per_cpu(cluster_cpus, this_cpu) = cluster_cpus_spare; for_each_online_cpu ( cpu ) { if (this_cpu == cpu || x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) continue; per_cpu(cluster_cpus, this_cpu) = per_cpu(cluster_cpus, cpu); break; } if ( per_cpu(cluster_cpus, this_cpu) == cluster_cpus_spare ) cluster_cpus_spare = NULL; cpumask_set_cpu(this_cpu, per_cpu(cluster_cpus, this_cpu)); } static void __init clustered_apic_check_x2apic(void) { } static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu) { return per_cpu(cluster_cpus, cpu); } static unsigned int cpu_mask_to_apicid_x2apic_cluster(const cpumask_t *cpumask) { unsigned int cpu = cpumask_any(cpumask); unsigned int dest = per_cpu(cpu_2_logical_apicid, cpu); const cpumask_t *cluster_cpus = per_cpu(cluster_cpus, cpu); for_each_cpu ( cpu, cluster_cpus ) if ( cpumask_test_cpu(cpu, cpumask) ) dest |= per_cpu(cpu_2_logical_apicid, cpu); return dest; } static void send_IPI_mask_x2apic_phys(const cpumask_t *cpumask, int vector) { unsigned int cpu; unsigned long flags; uint64_t 
msr_content; /* * Ensure that any synchronisation data written in program order by this * CPU is seen by notified remote CPUs. The WRMSR contained within * apic_icr_write() can otherwise be executed early. * * The reason mb() is sufficient here is subtle: the register arguments * to WRMSR must depend on a memory read executed after the barrier. This * is guaranteed by cpu_physical_id(), which reads from a global array (and * so cannot be hoisted above the barrier even by a clever compiler). */ mb(); local_irq_save(flags); for_each_cpu ( cpu, cpumask ) { if ( !cpu_online(cpu) || (cpu == smp_processor_id()) ) continue; msr_content = cpu_physical_id(cpu); msr_content = (msr_content << 32) | APIC_DM_FIXED | APIC_DEST_PHYSICAL | vector; apic_wrmsr(APIC_ICR, msr_content); } local_irq_restore(flags); } static void send_IPI_mask_x2apic_cluster(const cpumask_t *cpumask, int vector) { unsigned int cpu = smp_processor_id(); cpumask_t *ipimask = per_cpu(scratch_mask, cpu); const cpumask_t *cluster_cpus; unsigned long flags; mb(); /* See above for an explanation. */ local_irq_save(flags); cpumask_andnot(ipimask, &cpu_online_map, cpumask_of(cpu)); for ( cpumask_and(ipimask, cpumask, ipimask); !cpumask_empty(ipimask); cpumask_andnot(ipimask, ipimask, cluster_cpus) ) { uint64_t msr_content = 0; cluster_cpus = per_cpu(cluster_cpus, cpumask_first(ipimask)); for_each_cpu ( cpu, cluster_cpus ) { if ( !cpumask_test_cpu(cpu, ipimask) ) continue; msr_content |= per_cpu(cpu_2_logical_apicid, cpu); } BUG_ON(!msr_content); msr_content = (msr_content << 32) | APIC_DM_FIXED | APIC_DEST_LOGICAL | vector; apic_wrmsr(APIC_ICR, msr_content); } local_irq_restore(flags); } static const struct genapic apic_x2apic_phys = { APIC_INIT("x2apic_phys", NULL), .int_delivery_mode = dest_Fixed, .int_dest_mode = 0 /* physical delivery */, .init_apic_ldr = init_apic_ldr_x2apic_phys, .clustered_apic_check = clustered_apic_check_x2apic, .target_cpus = target_cpus_all, .vector_allocation_cpumask = vector_allocation_cpumask_phys, .cpu_mask_to_apicid = cpu_mask_to_apicid_phys, .send_IPI_mask = send_IPI_mask_x2apic_phys, .send_IPI_self = send_IPI_self_x2apic }; static const struct genapic apic_x2apic_cluster = { APIC_INIT("x2apic_cluster", NULL), .int_delivery_mode = dest_LowestPrio, .int_dest_mode = 1 /* logical delivery */, .init_apic_ldr = init_apic_ldr_x2apic_cluster, .clustered_apic_check = clustered_apic_check_x2apic, .target_cpus = target_cpus_all, .vector_allocation_cpumask = vector_allocation_cpumask_x2apic_cluster, .cpu_mask_to_apicid = cpu_mask_to_apicid_x2apic_cluster, .send_IPI_mask = send_IPI_mask_x2apic_cluster, .send_IPI_self = send_IPI_self_x2apic }; static int update_clusterinfo( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int err = 0; switch (action) { case CPU_UP_PREPARE: per_cpu(cpu_2_logical_apicid, cpu) = BAD_APICID; if ( !cluster_cpus_spare ) cluster_cpus_spare = xzalloc(cpumask_t); if ( !cluster_cpus_spare || !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) ) err = -ENOMEM; break; case CPU_UP_CANCELED: case CPU_DEAD: if ( per_cpu(cluster_cpus, cpu) ) { cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu)); if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) ) xfree(per_cpu(cluster_cpus, cpu)); } free_cpumask_var(per_cpu(scratch_mask, cpu)); break; } return !err ? 
NOTIFY_DONE : notifier_from_errno(err); } static struct notifier_block x2apic_cpu_nfb = { .notifier_call = update_clusterinfo }; static s8 __initdata x2apic_phys = -1; /* By default we use logical cluster mode. */ boolean_param("x2apic_phys", x2apic_phys); const struct genapic *__init apic_x2apic_probe(void) { if ( x2apic_phys < 0 ) x2apic_phys = !!(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL); if ( x2apic_phys ) return &apic_x2apic_phys; if ( !this_cpu(cluster_cpus) ) { update_clusterinfo(NULL, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); init_apic_ldr_x2apic_cluster(); register_cpu_notifier(&x2apic_cpu_nfb); } return &apic_x2apic_cluster; } void __init check_x2apic_preenabled(void) { u32 lo, hi; if ( !cpu_has_x2apic ) return; /* Check whether x2apic mode was already enabled by the BIOS. */ rdmsr(MSR_IA32_APICBASE, lo, hi); if ( lo & MSR_IA32_APICBASE_EXTD ) { printk("x2APIC mode is already enabled by BIOS.\n"); x2apic_enabled = 1; genapic = apic_x2apic_probe(); } } xen-4.4.0/xen/arch/x86/genapic/delivery.c0000664000175000017500000000306612307313555016201 0ustar smbsmb#include #include #include #include #include #include #include const cpumask_t *target_cpus_all(void) { return &cpu_online_map; } /* * LOGICAL FLAT DELIVERY MODE (multicast via bitmask to <= 8 logical APIC IDs). */ void init_apic_ldr_flat(void) { unsigned long val; apic_write_around(APIC_DFR, APIC_DFR_FLAT); val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; val |= SET_xAPIC_LOGICAL_ID(1UL << smp_processor_id()); apic_write_around(APIC_LDR, val); } void __init clustered_apic_check_flat(void) { printk("Enabling APIC mode: Flat. Using %d I/O APICs\n", nr_ioapics); } const cpumask_t *vector_allocation_cpumask_flat(int cpu) { return &cpu_online_map; } unsigned int cpu_mask_to_apicid_flat(const cpumask_t *cpumask) { return cpumask_bits(cpumask)[0]&0xFF; } /* * PHYSICAL DELIVERY MODE (unicast to physical APIC IDs). */ void init_apic_ldr_phys(void) { unsigned long val; apic_write_around(APIC_DFR, APIC_DFR_FLAT); /* A dummy logical ID should be fine. We only deliver in phys mode. */ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; apic_write_around(APIC_LDR, val); } void __init clustered_apic_check_phys(void) { printk("Enabling APIC mode: Phys. Using %d I/O APICs\n", nr_ioapics); } const cpumask_t *vector_allocation_cpumask_phys(int cpu) { return cpumask_of(cpu); } unsigned int cpu_mask_to_apicid_phys(const cpumask_t *cpumask) { /* As we are using single CPU as destination, pick only one CPU here */ return cpu_physical_id(cpumask_any(cpumask)); } xen-4.4.0/xen/arch/x86/genapic/probe.c0000664000175000017500000000501512307313555015461 0ustar smbsmb/* Copyright 2003 Andi Kleen, SuSE Labs. * Subject to the GNU Public License, v.2 * * Generic x86 APIC driver probe layer. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include extern const struct genapic apic_bigsmp; const struct genapic *__read_mostly genapic; const struct genapic *apic_probe[] __initdata = { &apic_bigsmp, &apic_default, /* must be last */ NULL, }; static bool_t __initdata cmdline_apic; void __init generic_bigsmp_probe(void) { /* * This routine is used to switch to bigsmp mode when * - There is no apic= option specified by the user * - generic_apic_probe() has choosen apic_default as the sub_arch * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support */ if (!cmdline_apic && genapic == &apic_default) if (apic_bigsmp.probe()) { genapic = &apic_bigsmp; printk(KERN_INFO "Overriding APIC driver with %s\n", genapic->name); } } static void __init genapic_apic_force(char *str) { int i; for (i = 0; apic_probe[i]; i++) if (!strcmp(apic_probe[i]->name, str)) genapic = apic_probe[i]; } custom_param("apic", genapic_apic_force); void __init generic_apic_probe(void) { int i, changed; record_boot_APIC_mode(); check_x2apic_preenabled(); cmdline_apic = changed = (genapic != NULL); for (i = 0; !changed && apic_probe[i]; i++) { if (apic_probe[i]->probe()) { changed = 1; genapic = apic_probe[i]; } } if (!changed) genapic = &apic_default; printk(KERN_INFO "Using APIC driver %s\n", genapic->name); } /* These functions can switch the APIC even after the initial ->probe() */ int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid) { int i; for (i = 0; apic_probe[i]; ++i) { if (apic_probe[i]->mps_oem_check(mpc,oem,productid)) { if (!cmdline_apic) { genapic = apic_probe[i]; printk(KERN_INFO "Switched to APIC driver `%s'.\n", genapic->name); } return 1; } } return 0; } int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) { int i; for (i = 0; apic_probe[i]; ++i) { if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { if (!cmdline_apic) { genapic = apic_probe[i]; printk(KERN_INFO "Switched to APIC driver `%s'.\n", genapic->name); } return 1; } } return 0; } xen-4.4.0/xen/arch/x86/genapic/bigsmp.c0000664000175000017500000000207212307313555015633 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #include #include static __init int force_bigsmp(struct dmi_system_id *d) { printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); def_to_bigsmp = 1; return 0; } static struct dmi_system_id __initdata bigsmp_dmi_table[] = { { force_bigsmp, "UNISYS ES7000-ONE", { DMI_MATCH(DMI_PRODUCT_NAME, "ES7000-ONE") }}, { } }; static __init int probe_bigsmp(void) { /* * We don't implement cluster mode, so force use of * physical mode in both cases. */ if (acpi_gbl_FADT.flags & (ACPI_FADT_APIC_CLUSTER | ACPI_FADT_APIC_PHYSICAL)) def_to_bigsmp = 1; else if (!def_to_bigsmp) dmi_check_system(bigsmp_dmi_table); return def_to_bigsmp; } const struct genapic apic_bigsmp = { APIC_INIT("bigsmp", probe_bigsmp), GENAPIC_PHYS }; xen-4.4.0/xen/arch/x86/genapic/default.c0000664000175000017500000000110012307313555015765 0ustar smbsmb/* * Default generic APIC driver. This handles upto 8 CPUs. */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* should be called last. 
*/ static __init int probe_default(void) { return 1; } const struct genapic apic_default = { APIC_INIT("default", probe_default), GENAPIC_FLAT }; xen-4.4.0/xen/arch/x86/domain_page.c0000664000175000017500000002413612307313555015214 0ustar smbsmb/****************************************************************************** * domain_page.h * * Allow temporary mapping of domain pages. * * Copyright (c) 2003-2006, Keir Fraser */ #include #include #include #include #include #include #include #include #include static struct vcpu *__read_mostly override; static inline struct vcpu *mapcache_current_vcpu(void) { /* In the common case we use the mapcache of the running VCPU. */ struct vcpu *v = override ?: current; /* * When current isn't properly set up yet, this is equivalent to * running in an idle vCPU (callers must check for NULL). */ if ( v == (struct vcpu *)0xfffff000 ) return NULL; /* * If guest_table is NULL, and we are running a paravirtualised guest, * then it means we are running on the idle domain's page table and must * therefore use its mapcache. */ if ( unlikely(pagetable_is_null(v->arch.guest_table)) && is_pv_vcpu(v) ) { /* If we really are idling, perform lazy context switch now. */ if ( (v = idle_vcpu[smp_processor_id()]) == current ) sync_local_execstate(); /* We must now be running on the idle page table. */ ASSERT(read_cr3() == __pa(idle_pg_table)); } return v; } void __init mapcache_override_current(struct vcpu *v) { override = v; } #define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER) #define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1) #define MAPCACHE_L1ENT(idx) \ __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))] void *map_domain_page(unsigned long mfn) { unsigned long flags; unsigned int idx, i; struct vcpu *v; struct mapcache_domain *dcache; struct mapcache_vcpu *vcache; struct vcpu_maphash_entry *hashent; #ifdef NDEBUG if ( mfn <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) ) return mfn_to_virt(mfn); #endif v = mapcache_current_vcpu(); if ( !v || !is_pv_vcpu(v) ) return mfn_to_virt(mfn); dcache = &v->domain->arch.pv_domain.mapcache; vcache = &v->arch.pv_vcpu.mapcache; if ( !dcache->inuse ) return mfn_to_virt(mfn); perfc_incr(map_domain_page_count); local_irq_save(flags); hashent = &vcache->hash[MAPHASH_HASHFN(mfn)]; if ( hashent->mfn == mfn ) { idx = hashent->idx; ASSERT(idx < dcache->entries); hashent->refcnt++; ASSERT(hashent->refcnt); ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == mfn); goto out; } spin_lock(&dcache->lock); /* Has some other CPU caused a wrap? We must flush if so. */ if ( unlikely(dcache->epoch != vcache->shadow_epoch) ) { vcache->shadow_epoch = dcache->epoch; if ( NEED_FLUSH(this_cpu(tlbflush_time), dcache->tlbflush_timestamp) ) { perfc_incr(domain_page_tlb_flush); flush_tlb_local(); } } idx = find_next_zero_bit(dcache->inuse, dcache->entries, dcache->cursor); if ( unlikely(idx >= dcache->entries) ) { unsigned long accum = 0, prev = 0; /* /First/, clean the garbage map and update the inuse list. */ for ( i = 0; i < BITS_TO_LONGS(dcache->entries); i++ ) { accum |= prev; dcache->inuse[i] &= ~xchg(&dcache->garbage[i], 0); prev = ~dcache->inuse[i]; } if ( accum | (prev & BITMAP_LAST_WORD_MASK(dcache->entries)) ) idx = find_first_zero_bit(dcache->inuse, dcache->entries); else { /* Replace a hash entry instead. 
*/ i = MAPHASH_HASHFN(mfn); do { hashent = &vcache->hash[i]; if ( hashent->idx != MAPHASHENT_NOTINUSE && !hashent->refcnt ) { idx = hashent->idx; ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == hashent->mfn); l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty()); hashent->idx = MAPHASHENT_NOTINUSE; hashent->mfn = ~0UL; break; } if ( ++i == MAPHASH_ENTRIES ) i = 0; } while ( i != MAPHASH_HASHFN(mfn) ); } BUG_ON(idx >= dcache->entries); /* /Second/, flush TLBs. */ perfc_incr(domain_page_tlb_flush); flush_tlb_local(); vcache->shadow_epoch = ++dcache->epoch; dcache->tlbflush_timestamp = tlbflush_current_time(); } set_bit(idx, dcache->inuse); dcache->cursor = idx + 1; spin_unlock(&dcache->lock); l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_pfn(mfn, __PAGE_HYPERVISOR)); out: local_irq_restore(flags); return (void *)MAPCACHE_VIRT_START + pfn_to_paddr(idx); } void unmap_domain_page(const void *ptr) { unsigned int idx; struct vcpu *v; struct mapcache_domain *dcache; unsigned long va = (unsigned long)ptr, mfn, flags; struct vcpu_maphash_entry *hashent; if ( va >= DIRECTMAP_VIRT_START ) return; ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END); v = mapcache_current_vcpu(); ASSERT(v && is_pv_vcpu(v)); dcache = &v->domain->arch.pv_domain.mapcache; ASSERT(dcache->inuse); idx = PFN_DOWN(va - MAPCACHE_VIRT_START); mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx)); hashent = &v->arch.pv_vcpu.mapcache.hash[MAPHASH_HASHFN(mfn)]; local_irq_save(flags); if ( hashent->idx == idx ) { ASSERT(hashent->mfn == mfn); ASSERT(hashent->refcnt); hashent->refcnt--; } else if ( !hashent->refcnt ) { if ( hashent->idx != MAPHASHENT_NOTINUSE ) { /* /First/, zap the PTE. */ ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(hashent->idx)) == hashent->mfn); l1e_write(&MAPCACHE_L1ENT(hashent->idx), l1e_empty()); /* /Second/, mark as garbage. */ set_bit(hashent->idx, dcache->garbage); } /* Add newly-freed mapping to the maphash. */ hashent->mfn = mfn; hashent->idx = idx; } else { /* /First/, zap the PTE. */ l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty()); /* /Second/, mark as garbage. 
*/ set_bit(idx, dcache->garbage); } local_irq_restore(flags); } void clear_domain_page(unsigned long mfn) { void *ptr = map_domain_page(mfn); clear_page(ptr); unmap_domain_page(ptr); } void copy_domain_page(unsigned long dmfn, unsigned long smfn) { const void *src = map_domain_page(smfn); void *dst = map_domain_page(dmfn); copy_page(dst, src); unmap_domain_page(dst); unmap_domain_page(src); } int mapcache_domain_init(struct domain *d) { struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache; unsigned int bitmap_pages; if ( !is_pv_domain(d) || is_idle_domain(d) ) return 0; #ifdef NDEBUG if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) ) return 0; #endif BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 + 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) > MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20)); bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long)); dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE; dcache->garbage = dcache->inuse + (bitmap_pages + 1) * PAGE_SIZE / sizeof(long); spin_lock_init(&dcache->lock); return create_perdomain_mapping(d, (unsigned long)dcache->inuse, 2 * bitmap_pages + 1, NIL(l1_pgentry_t *), NULL); } int mapcache_vcpu_init(struct vcpu *v) { struct domain *d = v->domain; struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache; unsigned long i; unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES; unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long)); if ( !is_pv_vcpu(v) || !dcache->inuse ) return 0; if ( ents > dcache->entries ) { /* Populate page tables. */ int rc = create_perdomain_mapping(d, MAPCACHE_VIRT_START, ents, NIL(l1_pgentry_t *), NULL); /* Populate bit maps. */ if ( !rc ) rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse, nr, NULL, NIL(struct page_info *)); if ( !rc ) rc = create_perdomain_mapping(d, (unsigned long)dcache->garbage, nr, NULL, NIL(struct page_info *)); if ( rc ) return rc; dcache->entries = ents; } /* Mark all maphash entries as not in use. 
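 * (A hedged sketch of the bitmap-sizing arithmetic used by
 *  mapcache_domain_init() above follows; the original source resumes after it.)
 */

/*
 * Illustrative sketch only -- not part of Xen. It just works through the
 * PFN_UP(BITS_TO_LONGS(entries) * sizeof(long)) calculation that sizes the
 * inuse and garbage bitmaps above. PAGE_SIZE and ENTRIES are hypothetical
 * example values, not the real MAPCACHE constants.
 */
#include <stdio.h>

#define PAGE_SIZE        4096UL
#define ENTRIES          (32UL * 256)       /* e.g. 32 vCPUs x 256 slots each */

#define BITS_PER_LONG    (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define PFN_UP(bytes)    (((bytes) + PAGE_SIZE - 1) / PAGE_SIZE)

int main(void)
{
    unsigned long longs = BITS_TO_LONGS(ENTRIES);
    unsigned long bytes = longs * sizeof(unsigned long);
    unsigned long pages = PFN_UP(bytes);

    /* Two bitmaps (inuse + garbage) are laid out, each rounded up to pages. */
    printf("%lu entries -> %lu longs -> %lu bytes -> %lu page(s) per bitmap\n",
           ENTRIES, longs, bytes, pages);
    printf("total bitmap pages for inuse + garbage: %lu\n", 2 * pages);
    return 0;
}

/* End of illustrative sketch; the original source continues below.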
*/ BUILD_BUG_ON(MAPHASHENT_NOTINUSE < MAPCACHE_ENTRIES); for ( i = 0; i < MAPHASH_ENTRIES; i++ ) { struct vcpu_maphash_entry *hashent = &v->arch.pv_vcpu.mapcache.hash[i]; hashent->mfn = ~0UL; /* never valid to map */ hashent->idx = MAPHASHENT_NOTINUSE; } return 0; } void *map_domain_page_global(unsigned long mfn) { ASSERT(!in_irq() && local_irq_is_enabled()); #ifdef NDEBUG if ( mfn <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) ) return mfn_to_virt(mfn); #endif return vmap(&mfn, 1); } void unmap_domain_page_global(const void *ptr) { unsigned long va = (unsigned long)ptr; if ( va >= DIRECTMAP_VIRT_START ) return; ASSERT(va >= VMAP_VIRT_START && va < VMAP_VIRT_END); vunmap(ptr); } /* Translate a map-domain-page'd address to the underlying MFN */ unsigned long domain_page_map_to_mfn(const void *ptr) { unsigned long va = (unsigned long)ptr; const l1_pgentry_t *pl1e; if ( va >= DIRECTMAP_VIRT_START ) return virt_to_mfn(ptr); if ( va >= VMAP_VIRT_START && va < VMAP_VIRT_END ) { pl1e = virt_to_xen_l1e(va); BUG_ON(!pl1e); } else { ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END); pl1e = &__linear_l1_table[l1_linear_offset(va)]; } return l1e_get_pfn(*pl1e); } xen-4.4.0/xen/arch/x86/i387.c0000664000175000017500000002146412307313555013444 0ustar smbsmb/* * linux/arch/i386/kernel/i387.c * * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes , May 2000 */ #include #include #include #include #include #include #include #include static void fpu_init(void) { unsigned long val; asm volatile ( "fninit" ); if ( cpu_has_xmm ) { /* load default value into MXCSR control/status register */ val = MXCSR_DEFAULT; asm volatile ( "ldmxcsr %0" : : "m" (val) ); } } /*******************************/ /* FPU Restore Functions */ /*******************************/ /* Restore x87 extended state */ static inline void fpu_xrstor(struct vcpu *v, uint64_t mask) { bool_t ok; ASSERT(v->arch.xsave_area); /* * XCR0 normally represents what guest OS set. In case of Xen itself, * we set the accumulated feature mask before doing save/restore. */ ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE); ASSERT(ok); xrstor(v, mask); ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE); ASSERT(ok); } /* Restor x87 FPU, MMX, SSE and SSE2 state */ static inline void fpu_fxrstor(struct vcpu *v) { const typeof(v->arch.xsave_area->fpu_sse) *fpu_ctxt = v->arch.fpu_ctxt; /* * AMD CPUs don't save/restore FDP/FIP/FOP unless an exception * is pending. Clear the x87 state here by setting it to fixed * values. The hypervisor data segment can be sometimes 0 and * sometimes new user value. Both should be ok. Use the FPU saved * data block as a safe address because it should be in L1. */ if ( !(fpu_ctxt->fsw & 0x0080) && boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) { asm volatile ( "fnclex\n\t" "ffree %%st(7)\n\t" /* clear stack tag */ "fildl %0" /* load to clear state */ : : "m" (*fpu_ctxt) ); } /* * FXRSTOR can fault if passed a corrupted data block. We handle this * possibility, which may occur if the block was passed to us by control * tools or through VCPUOP_initialise, by silently clearing the block. */ switch ( __builtin_expect(fpu_ctxt->x[FPU_WORD_SIZE_OFFSET], 8) ) { default: asm volatile ( /* See below for why the operands/constraints are this way. 
*/ "1: " REX64_PREFIX "fxrstor (%2)\n" ".section .fixup,\"ax\" \n" "2: push %%"__OP"ax \n" " push %%"__OP"cx \n" " push %%"__OP"di \n" " mov %2,%%"__OP"di \n" " mov %1,%%ecx \n" " xor %%eax,%%eax \n" " rep ; stosl \n" " pop %%"__OP"di \n" " pop %%"__OP"cx \n" " pop %%"__OP"ax \n" " jmp 1b \n" ".previous \n" _ASM_EXTABLE(1b, 2b) : : "m" (*fpu_ctxt), "i" (sizeof(*fpu_ctxt) / 4), "R" (fpu_ctxt) ); break; case 4: case 2: asm volatile ( "1: fxrstor %0 \n" ".section .fixup,\"ax\"\n" "2: push %%"__OP"ax \n" " push %%"__OP"cx \n" " push %%"__OP"di \n" " lea %0,%%"__OP"di \n" " mov %1,%%ecx \n" " xor %%eax,%%eax \n" " rep ; stosl \n" " pop %%"__OP"di \n" " pop %%"__OP"cx \n" " pop %%"__OP"ax \n" " jmp 1b \n" ".previous \n" _ASM_EXTABLE(1b, 2b) : : "m" (*fpu_ctxt), "i" (sizeof(*fpu_ctxt) / 4) ); break; } } /* Restore x87 extended state */ static inline void fpu_frstor(struct vcpu *v) { const char *fpu_ctxt = v->arch.fpu_ctxt; asm volatile ( "frstor %0" : : "m" (*fpu_ctxt) ); } /*******************************/ /* FPU Save Functions */ /*******************************/ static inline uint64_t vcpu_xsave_mask(const struct vcpu *v) { if ( v->fpu_dirtied ) return v->arch.nonlazy_xstate_used ? XSTATE_ALL : XSTATE_LAZY; return v->arch.nonlazy_xstate_used ? XSTATE_NONLAZY : 0; } /* Save x87 extended state */ static inline void fpu_xsave(struct vcpu *v) { bool_t ok; uint64_t mask = vcpu_xsave_mask(v); ASSERT(mask); ASSERT(v->arch.xsave_area); /* * XCR0 normally represents what guest OS set. In case of Xen itself, * we set the accumulated feature mask before doing save/restore. */ ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE); ASSERT(ok); xsave(v, mask); ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE); ASSERT(ok); } /* Save x87 FPU, MMX, SSE and SSE2 state */ static inline void fpu_fxsave(struct vcpu *v) { typeof(v->arch.xsave_area->fpu_sse) *fpu_ctxt = v->arch.fpu_ctxt; int word_size = cpu_has_fpu_sel ? 8 : 0; if ( !is_pv_32bit_vcpu(v) ) { /* * The only way to force fxsaveq on a wide range of gas versions. * On older versions the rex64 prefix works only if we force an * addressing mode that doesn't require extended registers. */ asm volatile ( REX64_PREFIX "fxsave (%1)" : "=m" (*fpu_ctxt) : "R" (fpu_ctxt) ); /* * AMD CPUs don't save/restore FDP/FIP/FOP unless an exception * is pending. */ if ( !(fpu_ctxt->fsw & 0x0080) && boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) return; if ( word_size > 0 && !((fpu_ctxt->fip.addr | fpu_ctxt->fdp.addr) >> 32) ) { struct ix87_env fpu_env; asm volatile ( "fnstenv %0" : "=m" (fpu_env) ); fpu_ctxt->fip.sel = fpu_env.fcs; fpu_ctxt->fdp.sel = fpu_env.fds; word_size = 4; } } else { asm volatile ( "fxsave %0" : "=m" (*fpu_ctxt) ); word_size = 4; } if ( word_size >= 0 ) fpu_ctxt->x[FPU_WORD_SIZE_OFFSET] = word_size; } /* Save x87 FPU state */ static inline void fpu_fsave(struct vcpu *v) { char *fpu_ctxt = v->arch.fpu_ctxt; /* FWAIT is required to make FNSAVE synchronous. */ asm volatile ( "fnsave %0 ; fwait" : "=m" (*fpu_ctxt) ); } /*******************************/ /* VCPU FPU Functions */ /*******************************/ /* Restore FPU state whenever VCPU is schduled in. */ void vcpu_restore_fpu_eager(struct vcpu *v) { ASSERT(!is_idle_vcpu(v)); /* save the nonlazy extended state which is not tracked by CR0.TS bit */ if ( v->arch.nonlazy_xstate_used ) { /* Avoid recursion */ clts(); fpu_xrstor(v, XSTATE_NONLAZY); stts(); } } /* * Restore FPU state when #NM is triggered. */ void vcpu_restore_fpu_lazy(struct vcpu *v) { ASSERT(!is_idle_vcpu(v)); /* Avoid recursion. 
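 * (A hedged sketch of the save-mask selection done by vcpu_xsave_mask()
 *  above follows; the original source resumes right after it.)
 */

/*
 * Illustrative sketch only -- not part of Xen. It mirrors the decision shape
 * of vcpu_xsave_mask() above: what must be written back depends on whether
 * the lazily-tracked state is dirty and whether any non-lazy state was used.
 * The DEMO_* mask values are made up for the example; the real values are
 * XCR0 feature bits (XSTATE_LAZY/XSTATE_NONLAZY/XSTATE_ALL).
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_LAZY     0x07ULL   /* stand-in for the lazily restored states */
#define DEMO_NONLAZY  0x18ULL   /* stand-in for states used outside #NM control */
#define DEMO_ALL      (DEMO_LAZY | DEMO_NONLAZY)

static uint64_t save_mask(int fpu_dirtied, int nonlazy_used)
{
    if (fpu_dirtied)
        return nonlazy_used ? DEMO_ALL : DEMO_LAZY;
    return nonlazy_used ? DEMO_NONLAZY : 0;
}

int main(void)
{
    printf("dirty, nonlazy used : %#llx\n", (unsigned long long)save_mask(1, 1));
    printf("dirty only          : %#llx\n", (unsigned long long)save_mask(1, 0));
    printf("nonlazy only        : %#llx\n", (unsigned long long)save_mask(0, 1));
    printf("nothing to save     : %#llx\n", (unsigned long long)save_mask(0, 0));
    return 0;
}

/* End of illustrative sketch; the original source continues below.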
*/ clts(); if ( v->fpu_dirtied ) return; if ( cpu_has_xsave ) fpu_xrstor(v, XSTATE_LAZY); else if ( v->fpu_initialised ) { if ( cpu_has_fxsr ) fpu_fxrstor(v); else fpu_frstor(v); } else fpu_init(); v->fpu_initialised = 1; v->fpu_dirtied = 1; } /* * On each context switch, save the necessary FPU info of VCPU being switch * out. It dispatches saving operation based on CPU's capability. */ void vcpu_save_fpu(struct vcpu *v) { if ( !v->fpu_dirtied && !v->arch.nonlazy_xstate_used ) return; ASSERT(!is_idle_vcpu(v)); /* This can happen, if a paravirtualised guest OS has set its CR0.TS. */ clts(); if ( cpu_has_xsave ) fpu_xsave(v); else if ( cpu_has_fxsr ) fpu_fxsave(v); else fpu_fsave(v); v->fpu_dirtied = 0; stts(); } /* Initialize FPU's context save area */ int vcpu_init_fpu(struct vcpu *v) { int rc = 0; /* Idle domain doesn't have FPU state allocated */ if ( is_idle_vcpu(v) ) goto done; if ( (rc = xstate_alloc_save_area(v)) != 0 ) return rc; if ( v->arch.xsave_area ) v->arch.fpu_ctxt = &v->arch.xsave_area->fpu_sse; else { v->arch.fpu_ctxt = _xzalloc(sizeof(v->arch.xsave_area->fpu_sse), 16); if ( !v->arch.fpu_ctxt ) { rc = -ENOMEM; goto done; } } done: return rc; } /* Free FPU's context save area */ void vcpu_destroy_fpu(struct vcpu *v) { if ( v->arch.xsave_area ) xstate_free_save_area(v); else xfree(v->arch.fpu_ctxt); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/io_apic.c0000664000175000017500000022373212307313555014357 0ustar smbsmb/* * Intel IO-APIC support for multi-Pentium hosts. * * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo * * Many thanks to Stig Venaas for trying out countless experimental * patches and reporting/debugging problems patiently! * * (c) 1999, Multiple IO-APIC support, developed by * Ken-ichi Yaku and * Hidemi Kishimoto , * further tested and cleaned up by Zach Brown * and Ingo Molnar * * Fixes * Maciej W. Rozycki : Bits for genuine 82489DX APICs; * thanks to Eric Gilmore * and Rolf G. Tews * for testing these extensively * Paul Diefenbaugh : Added full ACPI support */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); bool_t __read_mostly skip_ioapic_setup; bool_t __initdata ioapic_ack_new = 1; bool_t __initdata ioapic_ack_forced = 0; /* * # of IRQ routing registers */ int __read_mostly nr_ioapic_entries[MAX_IO_APICS]; int __read_mostly nr_ioapics; /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. 
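 * (A hedged sketch of the feature-cascade dispatch used by vcpu_save_fpu()
 *  above follows; the original source resumes right after it.)
 */

/*
 * Illustrative sketch only -- not part of Xen. It mirrors the dispatch shape
 * of vcpu_save_fpu() above: skip the work when nothing is dirty, otherwise
 * prefer XSAVE, then FXSAVE, then plain FNSAVE. struct cpu_features and the
 * three save stubs are hypothetical; real feature bits come from CPUID.
 */
#include <stdio.h>

struct cpu_features {
    int has_xsave;
    int has_fxsr;
};

static void save_xsave(void)  { printf("saved with XSAVE\n");  }
static void save_fxsave(void) { printf("saved with FXSAVE\n"); }
static void save_fsave(void)  { printf("saved with FNSAVE\n"); }

static void save_fpu(const struct cpu_features *cpu, int *dirty)
{
    if (!*dirty) {
        printf("nothing dirty, skipping save\n");
        return;
    }
    if (cpu->has_xsave)
        save_xsave();
    else if (cpu->has_fxsr)
        save_fxsave();
    else
        save_fsave();
    *dirty = 0;
}

int main(void)
{
    struct cpu_features modern = { .has_xsave = 1, .has_fxsr = 1 };
    struct cpu_features old    = { .has_xsave = 0, .has_fxsr = 0 };
    int dirty = 1;

    save_fpu(&modern, &dirty);  /* uses XSAVE, clears the dirty flag */
    save_fpu(&modern, &dirty);  /* no-op */
    dirty = 1;
    save_fpu(&old, &dirty);     /* falls back to FNSAVE */
    return 0;
}

/* End of illustrative sketch; the original source continues below.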
*/ #define MAX_PLUS_SHARED_IRQS nr_irqs_gsi #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + nr_irqs_gsi) #define ioapic_has_eoi_reg(apic) (mp_ioapics[(apic)].mpc_apicver >= 0x20) static int apic_pin_2_gsi_irq(int apic, int pin); static vmask_t *__read_mostly vector_map[MAX_IO_APICS]; static void share_vector_maps(unsigned int src, unsigned int dst) { unsigned int pin; if (vector_map[src] == vector_map[dst]) return; bitmap_or(vector_map[src]->_bits, vector_map[src]->_bits, vector_map[dst]->_bits, NR_VECTORS); for (pin = 0; pin < nr_ioapic_entries[dst]; ++pin) { int irq = apic_pin_2_gsi_irq(dst, pin); struct irq_desc *desc; if (irq < 0) continue; desc = irq_to_desc(irq); if (desc->arch.used_vectors == vector_map[dst]) desc->arch.used_vectors = vector_map[src]; } vector_map[dst] = vector_map[src]; } /* * This is performance-critical, we want to do it O(1) * * the indexing order of this array favors 1:1 mappings * between pins and IRQs. */ static struct irq_pin_list { int apic, pin; unsigned int next; } *__read_mostly irq_2_pin; static unsigned int irq_2_pin_free_entry; /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ static void add_pin_to_irq(unsigned int irq, int apic, int pin) { struct irq_pin_list *entry = irq_2_pin + irq; while (entry->next) { BUG_ON((entry->apic == apic) && (entry->pin == pin)); entry = irq_2_pin + entry->next; } BUG_ON((entry->apic == apic) && (entry->pin == pin)); if (entry->pin != -1) { if (irq_2_pin_free_entry >= PIN_MAP_SIZE) panic("io_apic.c: whoops"); entry->next = irq_2_pin_free_entry; entry = irq_2_pin + entry->next; irq_2_pin_free_entry = entry->next; entry->next = 0; } entry->apic = apic; entry->pin = pin; share_vector_maps(irq_2_pin[irq].apic, apic); } static void remove_pin_from_irq(unsigned int irq, int apic, int pin) { struct irq_pin_list *entry, *prev; for (entry = &irq_2_pin[irq]; ; entry = &irq_2_pin[entry->next]) { if ((entry->apic == apic) && (entry->pin == pin)) break; BUG_ON(!entry->next); } entry->pin = entry->apic = -1; if (entry != &irq_2_pin[irq]) { /* Removed entry is not at head of list. */ prev = &irq_2_pin[irq]; while (&irq_2_pin[prev->next] != entry) prev = &irq_2_pin[prev->next]; prev->next = entry->next; } else if (entry->next) { /* Removed entry is at head of multi-item list. */ prev = entry; entry = &irq_2_pin[entry->next]; *prev = *entry; entry->pin = entry->apic = -1; } else return; entry->next = irq_2_pin_free_entry; irq_2_pin_free_entry = entry - irq_2_pin; } /* * Reroute an IRQ to a different pin. 
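 * (A hedged sketch of the array-backed irq_2_pin lists used above follows;
 *  the original source resumes right after it.)
 */

/*
 * Illustrative sketch only -- not part of Xen. It shows the data structure
 * behind add_pin_to_irq() above: one array where the first NR_IRQS entries
 * are list heads, further pins for a shared IRQ are chained through spare
 * entries taken from a free list, and links are array indices (0 = end).
 * All names and sizes here are hypothetical miniatures.
 */
#include <stdio.h>

#define NR_IRQS   4      /* list heads */
#define MAP_SIZE  8      /* heads plus spare entries on a shared free list */

struct pin_entry {
    int apic, pin;
    unsigned int next;   /* index link; 0 means "end of list" */
};

static struct pin_entry map[MAP_SIZE];
static unsigned int free_entry = NR_IRQS;

static void init_map(void)
{
    unsigned int i;

    for (i = 0; i < MAP_SIZE; i++) {
        map[i].apic = map[i].pin = -1;
        map[i].next = (i >= NR_IRQS && i + 1 < MAP_SIZE) ? i + 1 : 0;
    }
}

/* Append (apic, pin) to irq's list; the head slot is used first, further
 * pins for the same irq come from the shared free list. */
static void add_pin(unsigned int irq, int apic, int pin)
{
    struct pin_entry *e = &map[irq];

    while (e->next)
        e = &map[e->next];

    if (e->pin != -1) {              /* tail already occupied */
        e->next = free_entry;
        free_entry = map[free_entry].next;
        e = &map[e->next];
        e->next = 0;
    }
    e->apic = apic;
    e->pin = pin;
}

static void dump_irq(unsigned int irq)
{
    const struct pin_entry *e = &map[irq];

    printf("IRQ%u:", irq);
    for (;;) {
        if (e->pin != -1)
            printf(" -> %d:%d", e->apic, e->pin);
        if (!e->next)
            break;
        e = &map[e->next];
    }
    printf("\n");
}

int main(void)
{
    init_map();
    add_pin(1, 0, 1);        /* 1:1 mapping lives in the head slot */
    add_pin(2, 0, 2);
    add_pin(2, 1, 5);        /* shared irq: second pin chained from free list */
    dump_irq(1);
    dump_irq(2);
    return 0;
}

/* End of illustrative sketch; the original source continues below.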
*/ static void __init replace_pin_at_irq(unsigned int irq, int oldapic, int oldpin, int newapic, int newpin) { struct irq_pin_list *entry = irq_2_pin + irq; while (1) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; share_vector_maps(oldapic, newapic); } if (!entry->next) break; entry = irq_2_pin + entry->next; } } vmask_t *io_apic_get_used_vector_map(unsigned int irq) { struct irq_pin_list *entry = irq_2_pin + irq; if (entry->pin == -1) return NULL; return vector_map[entry->apic]; } struct IO_APIC_route_entry **alloc_ioapic_entries(void) { int apic; struct IO_APIC_route_entry **ioapic_entries; ioapic_entries = xmalloc_array(struct IO_APIC_route_entry *, nr_ioapics); if (!ioapic_entries) return 0; for (apic = 0; apic < nr_ioapics; apic++) { ioapic_entries[apic] = xmalloc_array(struct IO_APIC_route_entry, nr_ioapic_entries[apic]); if (!ioapic_entries[apic] && nr_ioapic_entries[apic]) goto nomem; } return ioapic_entries; nomem: while (--apic >= 0) xfree(ioapic_entries[apic]); xfree(ioapic_entries); return 0; } union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; }; struct IO_APIC_route_entry __ioapic_read_entry( unsigned int apic, unsigned int pin, bool_t raw) { unsigned int (*read)(unsigned int, unsigned int) = raw ? __io_apic_read : io_apic_read; union entry_union eu; eu.w1 = (*read)(apic, 0x10 + 2 * pin); eu.w2 = (*read)(apic, 0x11 + 2 * pin); return eu.entry; } static struct IO_APIC_route_entry ioapic_read_entry( unsigned int apic, unsigned int pin, bool_t raw) { struct IO_APIC_route_entry entry; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); entry = __ioapic_read_entry(apic, pin, raw); spin_unlock_irqrestore(&ioapic_lock, flags); return entry; } void __ioapic_write_entry( unsigned int apic, unsigned int pin, bool_t raw, struct IO_APIC_route_entry e) { void (*write)(unsigned int, unsigned int, unsigned int) = raw ? __io_apic_write : io_apic_write; union entry_union eu = { .entry = e }; (*write)(apic, 0x11 + 2*pin, eu.w2); (*write)(apic, 0x10 + 2*pin, eu.w1); } static void ioapic_write_entry( unsigned int apic, unsigned int pin, bool_t raw, struct IO_APIC_route_entry e) { unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, raw, e); spin_unlock_irqrestore(&ioapic_lock, flags); } /* EOI an IO-APIC entry. Vector may be -1, indicating that it should be * worked out using the pin. This function expects that the ioapic_lock is * being held, and interrupts are disabled (or there is a good reason not * to), and that if both pin and vector are passed, that they refer to the * same redirection entry in the IO-APIC. */ static void __io_apic_eoi(unsigned int apic, unsigned int vector, unsigned int pin) { /* Prefer the use of the EOI register if available */ if ( ioapic_has_eoi_reg(apic) ) { /* If vector is unknown, read it from the IO-APIC */ if ( vector == IRQ_VECTOR_UNASSIGNED ) vector = __ioapic_read_entry(apic, pin, TRUE).vector; *(IO_APIC_BASE(apic)+16) = vector; } else { /* Else fake an EOI by switching to edge triggered mode * and back */ struct IO_APIC_route_entry entry; bool_t need_to_unmask = 0; entry = __ioapic_read_entry(apic, pin, TRUE); if ( ! 
entry.mask ) { /* If entry is not currently masked, mask it and make * a note to unmask it later */ entry.mask = 1; __ioapic_write_entry(apic, pin, TRUE, entry); need_to_unmask = 1; } /* Flip the trigger mode to edge and back */ entry.trigger = 0; __ioapic_write_entry(apic, pin, TRUE, entry); entry.trigger = 1; __ioapic_write_entry(apic, pin, TRUE, entry); if ( need_to_unmask ) { /* Unmask if neccesary */ entry.mask = 0; __ioapic_write_entry(apic, pin, TRUE, entry); } } } /* * Saves all the IO-APIC RTE's */ int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) { int apic, pin; if (!ioapic_entries) return -ENOMEM; for (apic = 0; apic < nr_ioapics; apic++) { if (!nr_ioapic_entries[apic]) continue; if (!ioapic_entries[apic]) return -ENOMEM; for (pin = 0; pin < nr_ioapic_entries[apic]; pin++) ioapic_entries[apic][pin] = __ioapic_read_entry(apic, pin, 1); } return 0; } /* * Mask all IO APIC entries. */ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) { int apic, pin; if (!ioapic_entries) return; for (apic = 0; apic < nr_ioapics; apic++) { if (!nr_ioapic_entries[apic]) continue; if (!ioapic_entries[apic]) break; for (pin = 0; pin < nr_ioapic_entries[apic]; pin++) { struct IO_APIC_route_entry entry; entry = ioapic_entries[apic][pin]; if (!entry.mask) { entry.mask = 1; ioapic_write_entry(apic, pin, 1, entry); } } } } /* * Restore IO APIC entries which was saved in ioapic_entries. */ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) { int apic, pin; if (!ioapic_entries) return -ENOMEM; for (apic = 0; apic < nr_ioapics; apic++) { if (!nr_ioapic_entries[apic]) continue; if (!ioapic_entries[apic]) return -ENOMEM; for (pin = 0; pin < nr_ioapic_entries[apic]; pin++) ioapic_write_entry(apic, pin, 1, ioapic_entries[apic][pin]); } return 0; } void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) { int apic; for (apic = 0; apic < nr_ioapics; apic++) xfree(ioapic_entries[apic]); xfree(ioapic_entries); } static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) { struct irq_pin_list *entry = irq_2_pin + irq; unsigned int pin, reg; for (;;) { pin = entry->pin; if (pin == -1) break; reg = io_apic_read(entry->apic, 0x10 + pin*2); reg &= ~disable; reg |= enable; io_apic_modify(entry->apic, 0x10 + pin*2, reg); if (!entry->next) break; entry = irq_2_pin + entry->next; } } /* mask = 1 */ static void __mask_IO_APIC_irq (unsigned int irq) { __modify_IO_APIC_irq(irq, 0x00010000, 0); } /* mask = 0 */ static void __unmask_IO_APIC_irq (unsigned int irq) { __modify_IO_APIC_irq(irq, 0, 0x00010000); } /* trigger = 0 */ static void __edge_IO_APIC_irq (unsigned int irq) { __modify_IO_APIC_irq(irq, 0, 0x00008000); } /* trigger = 1 */ static void __level_IO_APIC_irq (unsigned int irq) { __modify_IO_APIC_irq(irq, 0x00008000, 0); } static void mask_IO_APIC_irq(struct irq_desc *desc) { unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); __mask_IO_APIC_irq(desc->irq); spin_unlock_irqrestore(&ioapic_lock, flags); } static void unmask_IO_APIC_irq(struct irq_desc *desc) { unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); __unmask_IO_APIC_irq(desc->irq); spin_unlock_irqrestore(&ioapic_lock, flags); } static void __eoi_IO_APIC_irq(struct irq_desc *desc) { struct irq_pin_list *entry = irq_2_pin + desc->irq; unsigned int pin, vector = desc->arch.vector; for (;;) { pin = entry->pin; if (pin == -1) break; __io_apic_eoi(entry->apic, vector, pin); if (!entry->next) break; entry = irq_2_pin + entry->next; } } static 
void eoi_IO_APIC_irq(struct irq_desc *desc) { unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); __eoi_IO_APIC_irq(desc); spin_unlock_irqrestore(&ioapic_lock, flags); } static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; /* Check delivery_mode to be sure we're not clearing an SMI pin */ entry = __ioapic_read_entry(apic, pin, FALSE); if (entry.delivery_mode == dest_SMI) return; /* * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remoteIRR is set. */ if (!entry.mask) { entry.mask = 1; __ioapic_write_entry(apic, pin, FALSE, entry); } entry = __ioapic_read_entry(apic, pin, TRUE); if (entry.irr) { /* Make sure the trigger mode is set to level. */ if (!entry.trigger) { entry.trigger = 1; __ioapic_write_entry(apic, pin, TRUE, entry); } __io_apic_eoi(apic, entry.vector, pin); } /* * Disable it in the IO-APIC irq-routing table: */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; __ioapic_write_entry(apic, pin, TRUE, entry); entry = __ioapic_read_entry(apic, pin, TRUE); if (entry.irr) printk(KERN_ERR "IO-APIC%02x-%u: Unable to reset IRR\n", IO_APIC_ID(apic), pin); } static void clear_IO_APIC (void) { int apic, pin; for (apic = 0; apic < nr_ioapics; apic++) { for (pin = 0; pin < nr_ioapic_entries[apic]; pin++) clear_IO_APIC_pin(apic, pin); } } static void set_ioapic_affinity_irq(struct irq_desc *desc, const cpumask_t *mask) { unsigned long flags; unsigned int dest; int pin, irq; struct irq_pin_list *entry; irq = desc->irq; spin_lock_irqsave(&ioapic_lock, flags); dest = set_desc_affinity(desc, mask); if (dest != BAD_APICID) { if ( !x2apic_enabled ) dest = SET_APIC_LOGICAL_ID(dest); entry = irq_2_pin + irq; for (;;) { unsigned int data; pin = entry->pin; if (pin == -1) break; io_apic_write(entry->apic, 0x10 + 1 + pin*2, dest); data = io_apic_read(entry->apic, 0x10 + pin*2); data &= ~IO_APIC_REDIR_VECTOR_MASK; data |= desc->arch.vector & 0xFF; io_apic_modify(entry->apic, 0x10 + pin*2, data); if (!entry->next) break; entry = irq_2_pin + entry->next; } } spin_unlock_irqrestore(&ioapic_lock, flags); } /* * Find the IRQ entry number of a certain pin. 
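 * (A hedged sketch of the two-word union view of a redirection entry, as
 *  used by __ioapic_read_entry()/__ioapic_write_entry() earlier in this
 *  file, follows; the original source resumes right after it.)
 */

/*
 * Illustrative sketch only -- not part of Xen. It shows the entry_union
 * technique: one logical 64-bit redirection entry viewed as the two 32-bit
 * words the indirect register window exposes. The field names echo the code
 * above, but the widths/packing here are illustrative and compiler-dependent;
 * do not read the printed words as real IO-APIC register contents.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_rte {
    uint32_t vector          : 8;
    uint32_t delivery_mode   : 3;
    uint32_t dest_mode       : 1;
    uint32_t delivery_status : 1;
    uint32_t polarity        : 1;
    uint32_t irr             : 1;
    uint32_t trigger         : 1;
    uint32_t mask            : 1;
    uint32_t reserved_low    : 15;  /* pad the first word to 32 bits */
    uint32_t reserved_high   : 24;
    uint32_t dest            : 8;
};

union demo_entry_union {
    struct { uint32_t w1, w2; };    /* what the register window would carry */
    struct demo_rte entry;          /* what the rest of the code manipulates */
};

int main(void)
{
    union demo_entry_union eu = { .entry = { .vector = 0x31, .mask = 1,
                                             .trigger = 1, .dest = 0x0f } };

    printf("masked:   w1=%#010x w2=%#010x\n", (unsigned)eu.w1, (unsigned)eu.w2);
    eu.entry.mask = 0;              /* edit through the structured view */
    printf("unmasked: w1=%#010x w2=%#010x\n", (unsigned)eu.w1, (unsigned)eu.w2);
    return 0;
}

/* End of illustrative sketch; the original source continues below.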
*/ static int find_irq_entry(int apic, int pin, int type) { int i; for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].mpc_irqtype == type && (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && mp_irqs[i].mpc_dstirq == pin) return i; return -1; } /* * Find the pin to which IRQ[irq] (ISA) is connected */ static int __init find_isa_irq_pin(int irq, int type) { int i; for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || mp_bus_id_to_type[lbus] == MP_BUS_EISA || mp_bus_id_to_type[lbus] == MP_BUS_MCA || mp_bus_id_to_type[lbus] == MP_BUS_NEC98 ) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) return mp_irqs[i].mpc_dstirq; } return -1; } static int __init find_isa_irq_apic(int irq, int type) { int i; for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || mp_bus_id_to_type[lbus] == MP_BUS_EISA || mp_bus_id_to_type[lbus] == MP_BUS_MCA || mp_bus_id_to_type[lbus] == MP_BUS_NEC98 ) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) break; } if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { if (!nr_ioapic_entries[apic]) continue; if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) return apic; } } return -1; } /* * Find a specific PCI IRQ entry. * Not an __init, possibly needed by modules */ static int pin_2_irq(int idx, int apic, int pin); /* * This function currently is only a helper for the i386 smp boot process where * we need to reprogram the ioredtbls to cater for the cpus which have come online * so mask in all cases should simply be TARGET_CPUS */ void /*__init*/ setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; if (skip_ioapic_setup) return; for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { for (pin = 0; pin < nr_ioapic_entries[ioapic]; pin++) { struct irq_desc *desc; irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; irq = pin_2_irq(irq_entry, ioapic, pin); desc = irq_to_desc(irq); BUG_ON(cpumask_empty(desc->arch.cpu_mask)); set_ioapic_affinity_irq(desc, desc->arch.cpu_mask); } } } /* * EISA Edge/Level control register, ELCR */ static int EISA_ELCR(unsigned int irq) { if (platform_legacy_irq(irq)) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } apic_printk(APIC_VERBOSE, KERN_INFO "Broken MPtable reports ISA irq %d\n", irq); return 0; } /* EISA interrupts are always polarity zero and can be edge or level * trigger depending on the ELCR value. If an interrupt is listed as * EISA conforming in the MP table, that means its trigger type must * be read in from the ELCR */ #define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) #define default_EISA_polarity(idx) (0) /* ISA interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. */ #define default_ISA_trigger(idx) (0) #define default_ISA_polarity(idx) (0) /* PCI interrupts are always polarity one level triggered, * when listed as conforming in the MP table. */ #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) /* MCA interrupts are always polarity zero level triggered, * when listed as conforming in the MP table. */ #define default_MCA_trigger(idx) (1) #define default_MCA_polarity(idx) (0) /* NEC98 interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. 
*/ #define default_NEC98_trigger(idx) (0) #define default_NEC98_polarity(idx) (0) static int __init MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; int polarity; /* * Determine IRQ line polarity (high active or low active): */ switch (mp_irqs[idx].mpc_irqflag & 3) { case 0: /* conforms, ie. bus-type dependent polarity */ { switch (mp_bus_id_to_type[bus]) { case MP_BUS_ISA: /* ISA pin */ { polarity = default_ISA_polarity(idx); break; } case MP_BUS_EISA: /* EISA pin */ { polarity = default_EISA_polarity(idx); break; } case MP_BUS_PCI: /* PCI pin */ { polarity = default_PCI_polarity(idx); break; } case MP_BUS_MCA: /* MCA pin */ { polarity = default_MCA_polarity(idx); break; } case MP_BUS_NEC98: /* NEC 98 pin */ { polarity = default_NEC98_polarity(idx); break; } default: { printk(KERN_WARNING "broken BIOS!!\n"); polarity = 1; break; } } break; } case 1: /* high active */ { polarity = 0; break; } case 2: /* reserved */ { printk(KERN_WARNING "broken BIOS!!\n"); polarity = 1; break; } case 3: /* low active */ { polarity = 1; break; } default: /* invalid */ { printk(KERN_WARNING "broken BIOS!!\n"); polarity = 1; break; } } return polarity; } static int MPBIOS_trigger(int idx) { int bus = mp_irqs[idx].mpc_srcbus; int trigger; /* * Determine IRQ trigger mode (edge or level sensitive): */ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) { case 0: /* conforms, ie. bus-type dependent */ { switch (mp_bus_id_to_type[bus]) { case MP_BUS_ISA: /* ISA pin */ { trigger = default_ISA_trigger(idx); break; } case MP_BUS_EISA: /* EISA pin */ { trigger = default_EISA_trigger(idx); break; } case MP_BUS_PCI: /* PCI pin */ { trigger = default_PCI_trigger(idx); break; } case MP_BUS_MCA: /* MCA pin */ { trigger = default_MCA_trigger(idx); break; } case MP_BUS_NEC98: /* NEC 98 pin */ { trigger = default_NEC98_trigger(idx); break; } default: { printk(KERN_WARNING "broken BIOS!!\n"); trigger = 1; break; } } break; } case 1: /* edge */ { trigger = 0; break; } case 2: /* reserved */ { printk(KERN_WARNING "broken BIOS!!\n"); trigger = 1; break; } case 3: /* level */ { trigger = 1; break; } default: /* invalid */ { printk(KERN_WARNING "broken BIOS!!\n"); trigger = 0; break; } } return trigger; } static inline int irq_polarity(int idx) { return MPBIOS_polarity(idx); } static inline int irq_trigger(int idx) { return MPBIOS_trigger(idx); } static int pin_2_irq(int idx, int apic, int pin) { int irq, i; int bus = mp_irqs[idx].mpc_srcbus; /* * Debugging check, we are in big trouble if this message pops up! 
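 * (A hedged sketch of the MP-table flag decoding done by MPBIOS_polarity()
 *  and MPBIOS_trigger() above follows; the original source resumes after it.)
 */

/*
 * Illustrative sketch only -- not part of Xen. It condenses the decode above:
 * a 2-bit polarity field (bits 0..1) and a 2-bit trigger field (bits 2..3)
 * where 0 means "conforms to the bus type", 1/3 are explicit settings and 2
 * is reserved. Only two hypothetical bus types are modelled here; the real
 * code also handles EISA, MCA and NEC98.
 */
#include <stdio.h>

enum demo_bus { BUS_ISA, BUS_PCI };

/* Returns 0 = active high, 1 = active low, as in MPBIOS_polarity() above. */
static int decode_polarity(unsigned int irqflag, enum demo_bus bus)
{
    switch (irqflag & 3) {
    case 0:  return bus == BUS_PCI ? 1 : 0;   /* bus default: PCI low, ISA high */
    case 1:  return 0;                        /* explicitly active high */
    case 3:  return 1;                        /* explicitly active low */
    default: fprintf(stderr, "broken BIOS!!\n"); return 1;
    }
}

/* Returns 0 = edge, 1 = level, as in MPBIOS_trigger() above. */
static int decode_trigger(unsigned int irqflag, enum demo_bus bus)
{
    switch ((irqflag >> 2) & 3) {
    case 0:  return bus == BUS_PCI ? 1 : 0;   /* bus default: PCI level, ISA edge */
    case 1:  return 0;                        /* explicitly edge */
    case 3:  return 1;                        /* explicitly level */
    case 2:  fprintf(stderr, "broken BIOS!!\n"); return 1;
    default: fprintf(stderr, "broken BIOS!!\n"); return 0;
    }
}

int main(void)
{
    /* "Conforming" PCI entry: flags == 0 -> active low, level triggered. */
    printf("PCI conforming: polarity=%d trigger=%d\n",
           decode_polarity(0, BUS_PCI), decode_trigger(0, BUS_PCI));
    /* Explicit flags 0x5: active high, edge triggered. */
    printf("explicit high/edge: polarity=%d trigger=%d\n",
           decode_polarity(0x5, BUS_ISA), decode_trigger(0x5, BUS_ISA));
    return 0;
}

/* End of illustrative sketch; the original source continues below.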
*/ if (mp_irqs[idx].mpc_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); switch (mp_bus_id_to_type[bus]) { case MP_BUS_ISA: /* ISA pin */ case MP_BUS_EISA: case MP_BUS_MCA: case MP_BUS_NEC98: { irq = mp_irqs[idx].mpc_srcbusirq; break; } case MP_BUS_PCI: /* PCI pin */ { /* * PCI IRQs are mapped in order */ i = irq = 0; while (i < apic) irq += nr_ioapic_entries[i++]; irq += pin; break; } default: { printk(KERN_ERR "unknown bus type %d.\n",bus); irq = 0; break; } } return irq; } static inline int IO_APIC_irq_trigger(int irq) { int apic, idx, pin; for (apic = 0; apic < nr_ioapics; apic++) { for (pin = 0; pin < nr_ioapic_entries[apic]; pin++) { idx = find_irq_entry(apic,pin,mp_INT); if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) return irq_trigger(idx); } } /* * nonexistent IRQs are edge default */ return 0; } static struct hw_interrupt_type ioapic_level_type; static hw_irq_controller ioapic_edge_type; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 #define SET_DEST(x, y, value) \ do { if ( x2apic_enabled ) x = value; else y = value; } while(0) static inline void ioapic_register_intr(int irq, unsigned long trigger) { if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) irq_desc[irq].handler = &ioapic_level_type; else irq_desc[irq].handler = &ioapic_edge_type; } static void __init setup_IO_APIC_irqs(void) { struct IO_APIC_route_entry entry; int apic, pin, idx, irq, first_notcon = 1, vector; unsigned long flags; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); for (apic = 0; apic < nr_ioapics; apic++) { for (pin = 0; pin < nr_ioapic_entries[apic]; pin++) { struct irq_desc *desc; /* * add it to the IO-APIC irq-routing table: */ memset(&entry,0,sizeof(entry)); entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* enable IRQ */ idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { if (first_notcon) { apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); first_notcon = 0; } else apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); continue; } entry.trigger = irq_trigger(idx); entry.polarity = irq_polarity(idx); if (irq_trigger(idx)) { entry.trigger = 1; entry.mask = 1; } irq = pin_2_irq(idx, apic, pin); /* * skip adding the timer int on secondary nodes, which causes * a small but painful rift in the time-space continuum */ if (multi_timer_check(apic, irq)) continue; else add_pin_to_irq(irq, apic, pin); if (!IO_APIC_IRQ(irq)) continue; vector = assign_irq_vector(irq, NULL); BUG_ON(vector < 0); entry.vector = vector; ioapic_register_intr(irq, IOAPIC_AUTO); if (platform_legacy_irq(irq)) disable_8259A_irq(irq_to_desc(irq)); desc = irq_to_desc(irq); SET_DEST(entry.dest.dest32, entry.dest.logical.logical_dest, cpu_mask_to_apicid(desc->arch.cpu_mask)); spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, 0, entry); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } } if (!first_notcon) apic_printk(APIC_VERBOSE, " not connected.\n"); } /* * Set up the 8259A-master output pin: */ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) { struct IO_APIC_route_entry entry; memset(&entry,0,sizeof(entry)); disable_8259A_irq(irq_to_desc(0)); /* mask LVT0 */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); /* * We use logical delivery to get the timer IRQ * to the first CPU. 
*/ entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* unmask IRQ now */ SET_DEST(entry.dest.dest32, entry.dest.logical.logical_dest, cpu_mask_to_apicid(TARGET_CPUS)); entry.delivery_mode = INT_DELIVERY_MODE; entry.polarity = 0; entry.trigger = 0; entry.vector = vector; /* * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... */ irq_desc[0].handler = &ioapic_edge_type; /* * Add it to the IO-APIC irq-routing table: */ ioapic_write_entry(apic, pin, 0, entry); enable_8259A_irq(irq_to_desc(0)); } static inline void UNEXPECTED_IO_APIC(void) { } static void /*__init*/ __print_IO_APIC(void) { int apic, i; union IO_APIC_reg_00 reg_00; union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; union IO_APIC_reg_03 reg_03; unsigned long flags; printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", mp_ioapics[i].mpc_apicid, nr_ioapic_entries[i]); /* * We are a bit conservative about what we expect. We have to * know about every hardware change ASAP. */ printk(KERN_INFO "testing the IO APIC.......................\n"); for (apic = 0; apic < nr_ioapics; apic++) { if (!nr_ioapic_entries[apic]) continue; spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); if (reg_01.bits.version >= 0x20) reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); if (reg_00.bits.ID >= get_physical_broadcast()) UNEXPECTED_IO_APIC(); if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) UNEXPECTED_IO_APIC(); printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ (reg_01.bits.entries != 0x2E) && (reg_01.bits.entries != 0x3F) ) UNEXPECTED_IO_APIC(); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ ) UNEXPECTED_IO_APIC(); if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) UNEXPECTED_IO_APIC(); /* * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, * but the value of reg_02 is read as the previous read register * value, so ignore it if reg_02 == reg_01. */ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... 
: arbitration: %02X\n", reg_02.bits.arbitration); if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) UNEXPECTED_IO_APIC(); } /* * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 * or reg_03, but the value of reg_0[23] is read as the previous read * register value, so ignore it if reg_03 == reg_0[12]. */ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && reg_03.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); if (reg_03.bits.__reserved_1) UNEXPECTED_IO_APIC(); } printk(KERN_DEBUG ".... IRQ redirection table:\n"); printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" " Stat Dest Deli Vect: \n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, i, 0); printk(KERN_DEBUG " %02x %03X %02X ", i, entry.dest.logical.logical_dest, entry.dest.physical.physical_dest ); printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", entry.mask, entry.trigger, entry.irr, entry.polarity, entry.delivery_status, entry.dest_mode, entry.delivery_mode, entry.vector ); } } printk(KERN_INFO "Using vector-based indexing\n"); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < nr_irqs_gsi; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; printk(KERN_DEBUG "IRQ%d ", irq_to_desc(i)->arch.vector); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) break; entry = irq_2_pin + entry->next; } printk("\n"); } printk(KERN_INFO ".................................... done.\n"); return; } static void __init print_IO_APIC(void) { if (apic_verbosity != APIC_QUIET) __print_IO_APIC(); } static void _print_IO_APIC_keyhandler(unsigned char key) { __print_IO_APIC(); } static struct keyhandler print_IO_APIC_keyhandler = { .diagnostic = 1, .u.fn = _print_IO_APIC_keyhandler, .desc = "print ioapic info" }; static void __init enable_IO_APIC(void) { int i8259_apic, i8259_pin; int i, apic; /* Initialise dynamic irq_2_pin free list. */ irq_2_pin = xzalloc_array(struct irq_pin_list, PIN_MAP_SIZE); for (i = 0; i < PIN_MAP_SIZE; i++) irq_2_pin[i].pin = -1; for (i = irq_2_pin_free_entry = nr_irqs_gsi; i < PIN_MAP_SIZE; i++) irq_2_pin[i].next = i + 1; if (directed_eoi_enabled) { for (apic = 0; apic < nr_ioapics; apic++) { if (!nr_ioapic_entries[apic]) continue; vector_map[apic] = xzalloc(vmask_t); BUG_ON(!vector_map[apic]); } } else { vector_map[0] = xzalloc(vmask_t); BUG_ON(!vector_map[0]); for (apic = 1; apic < nr_ioapics; apic++) vector_map[apic] = vector_map[0]; } for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_entries[apic]; pin++) { struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin, 0); /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. */ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; goto found_i8259; } } } found_i8259: /* Look to see what if the MP table has reported the ExtINT */ /* If we could not find the appropriate pin by looking at the ioapic * the i8259 probably is not connected the ioapic but give the * mptable a chance anyway. 
*/ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); i8259_apic = find_isa_irq_apic(0, mp_ExtINT); /* Trust the MP table if nothing is setup in the hardware */ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); ioapic_i8259.pin = i8259_pin; ioapic_i8259.apic = i8259_apic; } /* Complain if the MP table and the hardware disagree */ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) { printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); } /* * Do not trust the IO-APIC being empty at bootup */ clear_IO_APIC(); } /* * Not an __init, needed by the reboot code */ void disable_IO_APIC(void) { /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); /* * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode * so legacy interrupts can be delivered. */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); entry.mask = 0; /* Enabled */ entry.trigger = 0; /* Edge */ entry.irr = 0; entry.polarity = 0; /* High */ entry.delivery_status = 0; entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; SET_DEST(entry.dest.dest32, entry.dest.physical.physical_dest, get_apic_id()); /* * Add it to the IO-APIC irq-routing table: */ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, 0, entry); } disconnect_bsp_APIC(ioapic_i8259.pin != -1); } /* * function to set the IO-APIC physical IDs based on the * values stored in the MPC table. * * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 */ static void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; static physid_mask_t __initdata phys_id_present_map; int apic; int i; unsigned char old_id; unsigned long flags; /* * Don't check I/O APIC IDs for xAPIC systems. They have * no meaning without the serial APIC bus. */ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) return; /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ ioapic_phys_id_map(&phys_id_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. */ for (apic = 0; apic < nr_ioapics; apic++) { if (!nr_ioapic_entries[apic]) continue; /* Read the register 0 value */ spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); old_id = mp_ioapics[apic].mpc_apicid; if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", apic, mp_ioapics[apic].mpc_apicid); printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; } /* * Sanity check, is the ID really free? Every APIC in a * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ if (check_apicid_used(&phys_id_present_map, mp_ioapics[apic].mpc_apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", apic, mp_ioapics[apic].mpc_apicid); for (i = 0; i < get_physical_broadcast(); i++) if (!physid_isset(i, phys_id_present_map)) break; if (i >= get_physical_broadcast()) panic("Max APIC ID exceeded"); printk(KERN_ERR "... fixing up to %d. 
(tell your hw vendor)\n", i); mp_ioapics[apic].mpc_apicid = i; } else { apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", mp_ioapics[apic].mpc_apicid); } set_apicid(mp_ioapics[apic].mpc_apicid, &phys_id_present_map); /* * We need to adjust the IRQ routing table * if the ID changed. */ if (old_id != mp_ioapics[apic].mpc_apicid) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].mpc_dstapic == old_id) mp_irqs[i].mpc_dstapic = mp_ioapics[apic].mpc_apicid; /* * Read the right value from the MPC table and * write it into the ID register. */ apic_printk(APIC_VERBOSE, KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", mp_ioapics[apic].mpc_apicid); reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0, reg_00.raw); spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check */ spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) printk("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); } } /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: * * - timer IRQ defaults to IO-APIC IRQ * - if this function detects that timer IRQs are defunct, then we fall * back to ISA timer IRQs */ static int __init timer_irq_works(void) { unsigned long t1, flags; t1 = pit0_ticks; mb(); local_save_flags(flags); local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); local_irq_restore(flags); /* * Expect a few ticks at least, to be sure some possible * glue logic does not lock up after one or two first * ticks in a non-ExtINT mode. Also the local APIC * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ mb(); if (pit0_ticks - t1 > 4) return 1; return 0; } /* * In the SMP+IOAPIC case it might happen that there are an unspecified * number of pending IRQ events unhandled. These cases are very rare, * so we 'resend' these IRQs via IPIs, to the same CPU. It's much * better to do it this way as thus we do not have to be aware of * 'pending' interrupts in the IRQ path, except at this point. */ /* * Edge triggered needs to resend any interrupt * that was delayed but this is now handled in the device * independent code. */ /* * Starting up a edge-triggered IO-APIC interrupt is * nasty - we need to make sure that we get the edge. * If it is already asserted for some reason, we need * return 1 to indicate that is was pending. * * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... */ static unsigned int startup_edge_ioapic_irq(struct irq_desc *desc) { int was_pending = 0; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); if (platform_legacy_irq(desc->irq)) { disable_8259A_irq(desc); if (i8259A_irq_pending(desc->irq)) was_pending = 1; } __unmask_IO_APIC_irq(desc->irq); spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } /* * Once we have recorded IRQ_PENDING already, we can mask the * interrupt for real. This prevents IRQ storms from unhandled * devices. 
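 * (A hedged sketch of the tick-counting heuristic used by timer_irq_works()
 *  above follows; the original source resumes right after it.)
 */

/*
 * Illustrative sketch only -- not part of Xen. It mirrors the idea of
 * timer_irq_works() above: snapshot a counter that the timer interrupt
 * handler would increment, wait long enough for about ten ticks, and demand
 * more than four of them so a single cached or spurious interrupt does not
 * count as "working". The counter and the wait are simulated here.
 */
#include <stdio.h>

static volatile unsigned long ticks;   /* stand-in for pit0_ticks */

static void simulate_wait(int interrupts_delivered)
{
    /* If the interrupt path worked, the handler would have run this often. */
    ticks += interrupts_delivered;
}

static int timer_irq_works_demo(int interrupts_delivered)
{
    unsigned long t1 = ticks;

    simulate_wait(interrupts_delivered);
    return (ticks - t1) > 4;           /* threshold follows the code above */
}

int main(void)
{
    printf("10 ticks delivered -> works=%d\n", timer_irq_works_demo(10));
    printf(" 1 stale tick only -> works=%d\n", timer_irq_works_demo(1));
    return 0;
}

/* End of illustrative sketch; the original source continues below.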
*/ static void ack_edge_ioapic_irq(struct irq_desc *desc) { irq_complete_move(desc); move_native_irq(desc); if ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_DISABLED)) mask_IO_APIC_irq(desc); ack_APIC_irq(); } /* * Level triggered interrupts can just be masked, * and shutting down and starting up the interrupt * is the same as enabling and disabling them -- except * with a startup need to return a "was pending" value. * * Level triggered interrupts are special because we * do not touch any IO-APIC register while handling * them. We ack the APIC in the end-IRQ handler, not * in the start-IRQ-handler. Protection against reentrance * from the same interrupt is still provided, both by the * generic IRQ layer and by the fact that an unacked local * APIC does not accept IRQs. */ static unsigned int startup_level_ioapic_irq(struct irq_desc *desc) { unmask_IO_APIC_irq(desc); return 0; /* don't check for pending */ } static void __init setup_ioapic_ack(char *s) { if ( !strcmp(s, "old") ) { ioapic_ack_new = 0; ioapic_ack_forced = 1; } else if ( !strcmp(s, "new") ) { ioapic_ack_new = 1; ioapic_ack_forced = 1; } else printk("Unknown ioapic_ack value specified: '%s'\n", s); } custom_param("ioapic_ack", setup_ioapic_ack); static bool_t io_apic_level_ack_pending(unsigned int irq) { struct irq_pin_list *entry; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); entry = &irq_2_pin[irq]; for (;;) { unsigned int reg; int pin; if (!entry) break; pin = entry->pin; if (pin == -1) continue; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ if (reg & IO_APIC_REDIR_REMOTE_IRR) { spin_unlock_irqrestore(&ioapic_lock, flags); return 1; } if (!entry->next) break; entry = irq_2_pin + entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); return 0; } static void mask_and_ack_level_ioapic_irq(struct irq_desc *desc) { unsigned long v; int i; irq_complete_move(desc); if ( !directed_eoi_enabled ) mask_IO_APIC_irq(desc); /* * It appears there is an erratum which affects at least version 0x11 * of I/O APIC (that's the 82093AA and cores integrated into various * chipsets). Under certain conditions a level-triggered interrupt is * erroneously delivered as edge-triggered one but the respective IRR * bit gets set nevertheless. As a result the I/O unit expects an EOI * message but it will never arrive and further interrupts are blocked * from the source. The exact reason is so far unknown, but the * phenomenon was observed when two consecutive interrupt requests * from a given source get delivered to the same CPU and the source is * temporarily disabled in between. * * A workaround is to simulate an EOI message manually. We achieve it * by setting the trigger mode to edge and then to level when the edge * trigger mode gets detected in the TMR of a local APIC for a * level-triggered interrupt. We mask the source for the time of the * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. 
--macro */ i = desc->arch.vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); ack_APIC_irq(); if ( directed_eoi_enabled ) return; if ((desc->status & IRQ_MOVE_PENDING) && !io_apic_level_ack_pending(desc->irq)) move_masked_irq(desc); if ( !(v & (1 << (i & 0x1f))) ) { spin_lock(&ioapic_lock); __edge_IO_APIC_irq(desc->irq); __level_IO_APIC_irq(desc->irq); spin_unlock(&ioapic_lock); } } static void end_level_ioapic_irq_old(struct irq_desc *desc, u8 vector) { if ( directed_eoi_enabled ) { if ( !(desc->status & (IRQ_DISABLED|IRQ_MOVE_PENDING)) ) { eoi_IO_APIC_irq(desc); return; } mask_IO_APIC_irq(desc); eoi_IO_APIC_irq(desc); if ( (desc->status & IRQ_MOVE_PENDING) && !io_apic_level_ack_pending(desc->irq) ) move_masked_irq(desc); } if ( !(desc->status & IRQ_DISABLED) ) unmask_IO_APIC_irq(desc); } static void end_level_ioapic_irq_new(struct irq_desc *desc, u8 vector) { /* * It appears there is an erratum which affects at least version 0x11 * of I/O APIC (that's the 82093AA and cores integrated into various * chipsets). Under certain conditions a level-triggered interrupt is * erroneously delivered as edge-triggered one but the respective IRR * bit gets set nevertheless. As a result the I/O unit expects an EOI * message but it will never arrive and further interrupts are blocked * from the source. The exact reason is so far unknown, but the * phenomenon was observed when two consecutive interrupt requests * from a given source get delivered to the same CPU and the source is * temporarily disabled in between. * * A workaround is to simulate an EOI message manually. We achieve it * by setting the trigger mode to edge and then to level when the edge * trigger mode gets detected in the TMR of a local APIC for a * level-triggered interrupt. We mask the source for the time of the * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro */ unsigned int v, i = desc->arch.vector; /* Manually EOI the old vector if we are moving to the new */ if ( vector && i != vector ) eoi_IO_APIC_irq(desc); v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); ack_APIC_irq(); if ( (desc->status & IRQ_MOVE_PENDING) && !io_apic_level_ack_pending(desc->irq) ) move_native_irq(desc); if (!(v & (1 << (i & 0x1f)))) { spin_lock(&ioapic_lock); __mask_IO_APIC_irq(desc->irq); __edge_IO_APIC_irq(desc->irq); __level_IO_APIC_irq(desc->irq); if ( !(desc->status & IRQ_DISABLED) ) __unmask_IO_APIC_irq(desc->irq); spin_unlock(&ioapic_lock); } } /* * Level and edge triggered IO-APIC interrupts need different handling, * so we use two separate IRQ descriptors. Edge triggered IRQs can be * handled with the level-triggered descriptor, but that one has slightly * more overhead. Level-triggered interrupts cannot be handled with the * edge-triggered handler, without risking IRQ storms and other ugly * races. 
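 * (A hedged sketch of keeping separate edge and level handler tables, as the
 *  ioapic_edge_type/ioapic_level_type split below does, follows; the
 *  original source resumes right after it.)
 */

/*
 * Illustrative sketch only -- not part of Xen. It shows the pattern of two
 * controller descriptors with different ack/end callbacks, selected per IRQ
 * by trigger mode the way ioapic_register_intr() does above. struct
 * demo_irq_ops and the callbacks are hypothetical miniatures of
 * hw_irq_controller; no real interrupt handling happens here.
 */
#include <stdio.h>

struct demo_irq_ops {
    const char *typename;
    void (*ack)(int irq);
    void (*end)(int irq);
};

static void edge_ack(int irq)  { printf("IRQ%d: edge ack (EOI immediately)\n", irq); }
static void edge_end(int irq)  { (void)irq; /* nothing to do for edge */ }
static void level_ack(int irq) { printf("IRQ%d: level ack (mask, EOI later)\n", irq); }
static void level_end(int irq) { printf("IRQ%d: level end (unmask)\n", irq); }

static const struct demo_irq_ops edge_ops  = { "demo-edge",  edge_ack,  edge_end  };
static const struct demo_irq_ops level_ops = { "demo-level", level_ack, level_end };

/* Mirrors ioapic_register_intr(): pick the handler table by trigger mode. */
static const struct demo_irq_ops *register_intr(int level_triggered)
{
    return level_triggered ? &level_ops : &edge_ops;
}

int main(void)
{
    const struct demo_irq_ops *ops = register_intr(1);

    printf("using %s handler\n", ops->typename);
    ops->ack(9);
    ops->end(9);

    ops = register_intr(0);
    printf("using %s handler\n", ops->typename);
    ops->ack(4);
    ops->end(4);
    return 0;
}

/* End of illustrative sketch; the original source continues below.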
*/ static hw_irq_controller ioapic_edge_type = { .typename = "IO-APIC-edge", .startup = startup_edge_ioapic_irq, .shutdown = irq_shutdown_none, .enable = unmask_IO_APIC_irq, .disable = irq_disable_none, .ack = ack_edge_ioapic_irq, .set_affinity = set_ioapic_affinity_irq, }; static struct hw_interrupt_type __read_mostly ioapic_level_type = { .typename = "IO-APIC-level", .startup = startup_level_ioapic_irq, .shutdown = mask_IO_APIC_irq, .enable = unmask_IO_APIC_irq, .disable = mask_IO_APIC_irq, .ack = mask_and_ack_level_ioapic_irq, .end = end_level_ioapic_irq_old, .set_affinity = set_ioapic_affinity_irq, }; static inline void init_IO_APIC_traps(void) { int irq; /* Xen: This is way simpler than the Linux implementation. */ for (irq = 0; platform_legacy_irq(irq); irq++) if (IO_APIC_IRQ(irq) && !irq_to_vector(irq)) make_8259A_irq(irq); } static void enable_lapic_irq(struct irq_desc *desc) { unsigned long v; v = apic_read(APIC_LVT0); apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); } static void disable_lapic_irq(struct irq_desc *desc) { unsigned long v; v = apic_read(APIC_LVT0); apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); } static void ack_lapic_irq(struct irq_desc *desc) { ack_APIC_irq(); } static hw_irq_controller lapic_irq_type = { .typename = "local-APIC-edge", .startup = NULL, /* startup_irq() not used for IRQ0 */ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ .enable = enable_lapic_irq, .disable = disable_lapic_irq, .ack = ack_lapic_irq, }; /* * This looks a bit hackish but it's about the only one way of sending * a few INTA cycles to 8259As and any associated glue logic. ICR does * not support the ExtINT mode, unfortunately. We need to send these * cycles as some i82489DX-based boards have glue logic that keeps the * 8259A interrupt line asserted until INTA. --macro */ static void __init unlock_ExtINT_logic(void) { int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; pin = find_isa_irq_pin(8, mp_INT); apic = find_isa_irq_apic(8, mp_INT); if ( pin == -1 || apic == -1 ) return; entry0 = ioapic_read_entry(apic, pin, 0); clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); entry1.dest_mode = 0; /* physical delivery */ entry1.mask = 0; /* unmask IRQ now */ SET_DEST(entry1.dest.dest32, entry1.dest.physical.physical_dest, hard_smp_processor_id()); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; entry1.trigger = 0; entry1.vector = 0; ioapic_write_entry(apic, pin, 0, entry1); save_control = CMOS_READ(RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, RTC_FREQ_SELECT); CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); i = 100; while (i-- > 0) { mdelay(10); if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) i -= 10; } CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); clear_IO_APIC_pin(apic, pin); ioapic_write_entry(apic, pin, 0, entry0); } /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. 
*/ static void __init check_timer(void) { int apic1, pin1, apic2, pin2; int vector, ret; unsigned long flags; cpumask_t mask_all; local_irq_save(flags); /* * get/set the timer IRQ vector: */ disable_8259A_irq(irq_to_desc(0)); vector = IRQ0_VECTOR; clear_irq_vector(0); cpumask_setall(&mask_all); if ((ret = bind_irq_vector(0, vector, &mask_all))) printk(KERN_ERR"..IRQ0 is not set correctly with ioapic!!!, err:%d\n", ret); irq_desc[0].status &= ~IRQ_DISABLED; /* * Subtle, code in do_timer_interrupt() expects an AEOI * mode for the 8259A whenever interrupts are routed * through I/O APICs. Also IRQ0 has to be enabled in * the 8259A which implies the virtual wire has to be * disabled in the local APIC. */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); /* XEN: Ripped out the legacy missed-tick logic, so below is not needed. */ /*timer_ack = 1;*/ /*enable_8259A_irq(irq_to_desc(0));*/ pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); if (pin1 != -1) { /* * Ok, does IRQ0 through the IOAPIC work? */ unmask_IO_APIC_irq(irq_to_desc(0)); if (timer_irq_works()) { local_irq_restore(flags); return; } clear_IO_APIC_pin(apic1, pin1); printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " "IO-APIC\n"); } printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); if (pin2 != -1) { printk("\n..... (found pin %d) ...", pin2); /* * legacy devices should be connected to IO APIC #0 */ setup_ExtINT_IRQ0_pin(apic2, pin2, vector); if (timer_irq_works()) { local_irq_restore(flags); printk("works.\n"); if (pin1 != -1) replace_pin_at_irq(0, apic1, pin1, apic2, pin2); else add_pin_to_irq(0, apic2, pin2); return; } /* * Cleanup, just in case ... */ clear_IO_APIC_pin(apic2, pin2); } printk(" failed.\n"); if (nmi_watchdog == NMI_IO_APIC) { printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = 0; } printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); disable_8259A_irq(irq_to_desc(0)); irq_desc[0].handler = &lapic_irq_type; apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(irq_to_desc(0)); if (timer_irq_works()) { local_irq_restore(flags); printk(" works.\n"); return; } apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); printk(" failed.\n"); printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); /*timer_ack = 0;*/ init_8259A(0); make_8259A_irq(0); apic_write_around(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); local_irq_restore(flags); if (timer_irq_works()) { printk(" works.\n"); return; } printk(" failed :(.\n"); panic("IO-APIC + timer doesn't work! Boot with apic_verbosity=debug " "and send a report. Then try booting with the 'noapic' option"); } /* * * IRQ's that are handled by the PIC in the MPS IOAPIC case. * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. * Linux doesn't really care, as it's not actually used * for any interrupt handling anyway. 
*/ #define PIC_IRQS (1 << PIC_CASCADE_IR) static struct IO_APIC_route_entry *ioapic_pm_state; static void __init ioapic_pm_state_alloc(void) { int i, nr_entry = 0; for (i = 0; i < nr_ioapics; i++) nr_entry += nr_ioapic_entries[i]; ioapic_pm_state = _xmalloc(sizeof(struct IO_APIC_route_entry)*nr_entry, sizeof(struct IO_APIC_route_entry)); BUG_ON(ioapic_pm_state == NULL); } void __init setup_IO_APIC(void) { enable_IO_APIC(); if (acpi_ioapic) io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ else io_apic_irqs = ~PIC_IRQS; printk("ENABLING IO-APIC IRQs\n"); printk(" -> Using %s ACK method\n", ioapic_ack_new ? "new" : "old"); if (ioapic_ack_new) { ioapic_level_type.ack = irq_complete_move; ioapic_level_type.end = end_level_ioapic_irq_new; } /* * Set up IO-APIC IRQ routing. */ if (!acpi_ioapic) setup_ioapic_ids_from_mpc(); sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); check_timer(); print_IO_APIC(); ioapic_pm_state_alloc(); register_keyhandler('z', &print_IO_APIC_keyhandler); } void ioapic_suspend(void) { struct IO_APIC_route_entry *entry = ioapic_pm_state; unsigned long flags; int apic, i; spin_lock_irqsave(&ioapic_lock, flags); for (apic = 0; apic < nr_ioapics; apic++) { for (i = 0; i < nr_ioapic_entries[apic]; i ++, entry ++ ) { *(((int *)entry) + 1) = __io_apic_read(apic, 0x11 + 2 * i); *(((int *)entry) + 0) = __io_apic_read(apic, 0x10 + 2 * i); } } spin_unlock_irqrestore(&ioapic_lock, flags); } void ioapic_resume(void) { struct IO_APIC_route_entry *entry = ioapic_pm_state; unsigned long flags; union IO_APIC_reg_00 reg_00; int i, apic; spin_lock_irqsave(&ioapic_lock, flags); for (apic = 0; apic < nr_ioapics; apic++){ if (!nr_ioapic_entries[apic]) continue; reg_00.raw = __io_apic_read(apic, 0); if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) { reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; __io_apic_write(apic, 0, reg_00.raw); } for (i = 0; i < nr_ioapic_entries[apic]; i++, entry++) { __io_apic_write(apic, 0x11+2*i, *(((int *)entry)+1)); __io_apic_write(apic, 0x10+2*i, *(((int *)entry)+0)); } } spin_unlock_irqrestore(&ioapic_lock, flags); } /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI_BOOT int __init io_apic_get_unique_id (int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; static physid_mask_t __initdata apic_id_map = PHYSID_MASK_NONE; unsigned long flags; int i = 0; /* * The P4 platform supports up to 256 APIC IDs on two separate APIC * buses (one for LAPICs, one for IOAPICs), where predecessors only * supports up to 16 on one shared APIC bus. * * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full * advantage of new APIC bus architecture. */ if (physids_empty(apic_id_map)) ioapic_phys_id_map(&apic_id_map); spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); if (apic_id >= get_physical_broadcast()) { printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " "%d\n", ioapic, apic_id, reg_00.bits.ID); apic_id = reg_00.bits.ID; } /* * Every APIC in a system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. 
*/ if (check_apicid_used(&apic_id_map, apic_id)) { for (i = 0; i < get_physical_broadcast(); i++) { if (!check_apicid_used(&apic_id_map, i)) break; } if (i == get_physical_broadcast()) panic("Max apic_id exceeded"); printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " "trying %d\n", ioapic, apic_id, i); apic_id = i; } set_apicid(apic_id, &apic_id_map); if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0, reg_00.raw); reg_00.raw = io_apic_read(ioapic, 0); spin_unlock_irqrestore(&ioapic_lock, flags); /* Sanity check */ if (reg_00.bits.ID != apic_id) { printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); return -1; } } apic_printk(APIC_VERBOSE, KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); return apic_id; } int __init io_apic_get_version (int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.entries; } int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) { struct irq_desc *desc = irq_to_desc(irq); struct IO_APIC_route_entry entry; unsigned long flags; int vector; if (!IO_APIC_IRQ(irq)) { printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n", ioapic, irq); return -EINVAL; } /* * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. * Note that we mask (disable) IRQs now -- these get enabled when the * corresponding device driver registers for this IRQ. */ memset(&entry,0,sizeof(entry)); entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; SET_DEST(entry.dest.dest32, entry.dest.logical.logical_dest, cpu_mask_to_apicid(TARGET_CPUS)); entry.trigger = edge_level; entry.polarity = active_high_low; entry.mask = 1; /* * IRQs < 16 are already in the irq_2_pin[] map */ if (!platform_legacy_irq(irq)) add_pin_to_irq(irq, ioapic, pin); vector = assign_irq_vector(irq, NULL); if (vector < 0) return vector; entry.vector = vector; apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " "(%d-%d -> %#x -> IRQ %d Mode:%i Active:%i)\n", ioapic, mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, edge_level, active_high_low); ioapic_register_intr(irq, edge_level); if (!ioapic && platform_legacy_irq(irq)) disable_8259A_irq(desc); spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(ioapic, pin, 0, entry); set_native_irq_info(irq, TARGET_CPUS); spin_unlock(&ioapic_lock); spin_lock(&desc->lock); if (!(desc->status & (IRQ_DISABLED | IRQ_GUEST))) desc->handler->startup(desc); spin_unlock_irqrestore(&desc->lock, flags); return 0; } #endif /*CONFIG_ACPI_BOOT*/ static int ioapic_physbase_to_id(unsigned long physbase) { int apic; for ( apic = 0; apic < nr_ioapics; apic++ ) { if ( !nr_ioapic_entries[apic] ) continue; if ( mp_ioapics[apic].mpc_apicaddr == physbase ) return apic; } return -EINVAL; } unsigned apic_gsi_base(int apic); static int apic_pin_2_gsi_irq(int apic, int pin) { int idx; if (apic < 0) return -EINVAL; idx = find_irq_entry(apic, pin, mp_INT); return idx >= 0 ? 
pin_2_irq(idx, apic, pin) : apic_gsi_base(apic) + pin; } int ioapic_guest_read(unsigned long physbase, unsigned int reg, u32 *pval) { int apic; unsigned long flags; if ( (apic = ioapic_physbase_to_id(physbase)) < 0 ) return apic; spin_lock_irqsave(&ioapic_lock, flags); *pval = io_apic_read(apic, reg); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; } #define WARN_BOGUS_WRITE(f, a...) \ dprintk(XENLOG_INFO, "\n%s: " \ "apic=%d, pin=%d, irq=%d\n" \ "%s: new_entry=%08x\n" \ "%s: " f, __FUNCTION__, apic, pin, irq, \ __FUNCTION__, *(u32 *)&rte, \ __FUNCTION__ , ##a ) int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 val) { int apic, pin, irq, ret, pirq; struct IO_APIC_route_entry rte = { 0 }; unsigned long flags; struct irq_desc *desc; if ( (apic = ioapic_physbase_to_id(physbase)) < 0 ) return apic; /* Only write to the first half of a route entry. */ if ( (reg < 0x10) || (reg & 1) ) return 0; pin = (reg - 0x10) >> 1; /* Write first half from guest; second half is target info. */ *(u32 *)&rte = val; /* * What about weird destination types? * SMI: Ignore? Ought to be set up by the BIOS. * NMI: Ignore? Watchdog functionality is Xen's concern. * INIT: Definitely ignore: probably a guest OS bug. * ExtINT: Ignore? Linux only asserts this at start of day. * For now, print a message and return an error. We can fix up on demand. */ if ( rte.delivery_mode > dest_LowestPrio ) { printk("ERROR: Attempt to write weird IOAPIC destination mode!\n"); printk(" APIC=%d/%d, lo-reg=%x\n", apic, pin, val); return -EINVAL; } /* * The guest does not know physical APIC arrangement (flat vs. cluster). * Apply genapic conventions for this platform. */ rte.delivery_mode = INT_DELIVERY_MODE; rte.dest_mode = INT_DEST_MODE; irq = apic_pin_2_gsi_irq(apic, pin); if ( irq < 0 ) return irq; desc = irq_to_desc(irq); /* * Since PHYSDEVOP_alloc_irq_vector is dummy, rte.vector is the pirq * which corresponds to this ioapic pin, retrieve it for building * pirq and irq mapping. Where the GSI is greater than 256, we assume * that dom0 pirq == irq. */ pirq = (irq >= 256) ? irq : rte.vector; if ( (pirq < 0) || (pirq >= dom0->nr_pirqs) ) return -EINVAL; if ( desc->action ) { spin_lock_irqsave(&ioapic_lock, flags); ret = io_apic_read(apic, 0x10 + 2 * pin); spin_unlock_irqrestore(&ioapic_lock, flags); rte.vector = desc->arch.vector; if ( *(u32*)&rte != ret ) WARN_BOGUS_WRITE("old_entry=%08x pirq=%d\n%s: " "Attempt to modify IO-APIC pin for in-use IRQ!\n", ret, pirq, __FUNCTION__); return 0; } if ( desc->arch.vector <= 0 || desc->arch.vector > LAST_DYNAMIC_VECTOR ) { int vector = desc->arch.vector; if ( vector < FIRST_HIPRIORITY_VECTOR ) add_pin_to_irq(irq, apic, pin); else desc->arch.vector = IRQ_VECTOR_UNASSIGNED; ret = assign_irq_vector(irq, NULL); if ( ret < 0 ) { if ( vector < FIRST_HIPRIORITY_VECTOR ) remove_pin_from_irq(irq, apic, pin); else desc->arch.vector = vector; return ret; } printk(XENLOG_INFO "allocated vector %02x for irq %d\n", ret, irq); } spin_lock(&dom0->event_lock); ret = map_domain_pirq(dom0, pirq, irq, MAP_PIRQ_TYPE_GSI, NULL); spin_unlock(&dom0->event_lock); if ( ret < 0 ) return ret; spin_lock_irqsave(&ioapic_lock, flags); /* Set the correct irq-handling type. */ desc->handler = rte.trigger ? &ioapic_level_type: &ioapic_edge_type; /* Mask iff level triggered. */ rte.mask = rte.trigger; /* Set the vector field to the real vector! 
*/ rte.vector = desc->arch.vector; SET_DEST(rte.dest.dest32, rte.dest.logical.logical_dest, cpu_mask_to_apicid(desc->arch.cpu_mask)); __ioapic_write_entry(apic, pin, 0, rte); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; } static const char * delivery_mode_2_str( const enum ioapic_irq_destination_types mode) { switch ( mode ) { case dest_Fixed: return "Fixed"; case dest_LowestPrio: return "LoPri"; case dest_SMI: return "SMI"; case dest_NMI: return "NMI"; case dest_INIT: return "INIT"; case dest_ExtINT: return "ExINT"; case dest__reserved_1: case dest__reserved_2: return "Resvd"; default: return "INVAL"; } } void dump_ioapic_irq_info(void) { struct irq_pin_list *entry; struct IO_APIC_route_entry rte; unsigned int irq, pin, printed = 0; if ( !irq_2_pin ) return; for ( irq = 0; irq < nr_irqs_gsi; irq++ ) { entry = &irq_2_pin[irq]; if ( entry->pin == -1 ) continue; if ( !printed++ ) printk("IO-APIC interrupt information:\n"); printk(" IRQ%3d Vec%3d:\n", irq, irq_to_vector(irq)); for ( ; ; ) { pin = entry->pin; printk(" Apic 0x%02x, Pin %2d: ", entry->apic, pin); rte = ioapic_read_entry(entry->apic, pin, 0); printk("vec=%02x delivery=%-5s dest=%c status=%d " "polarity=%d irr=%d trig=%c mask=%d dest_id:%d\n", rte.vector, delivery_mode_2_str(rte.delivery_mode), rte.dest_mode ? 'L' : 'P', rte.delivery_status, rte.polarity, rte.irr, rte.trigger ? 'L' : 'E', rte.mask, rte.dest.logical.logical_dest); if ( entry->next == 0 ) break; entry = &irq_2_pin[entry->next]; } } } static unsigned int __initdata max_gsi_irqs; integer_param("max_gsi_irqs", max_gsi_irqs); static __init bool_t bad_ioapic_register(unsigned int idx) { union IO_APIC_reg_00 reg_00 = { .raw = io_apic_read(idx, 0) }; union IO_APIC_reg_01 reg_01 = { .raw = io_apic_read(idx, 1) }; union IO_APIC_reg_02 reg_02 = { .raw = io_apic_read(idx, 2) }; if ( reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1 ) { printk(KERN_WARNING "I/O APIC %#x registers return all ones, skipping!\n", mp_ioapics[idx].mpc_apicaddr); return 1; } return 0; } void __init init_ioapic_mappings(void) { unsigned long ioapic_phys; unsigned int i, idx = FIX_IO_APIC_BASE_0; union IO_APIC_reg_01 reg_01; if ( smp_found_config ) nr_irqs_gsi = 0; for ( i = 0; i < nr_ioapics; i++ ) { if ( smp_found_config ) { ioapic_phys = mp_ioapics[i].mpc_apicaddr; if ( !ioapic_phys ) { printk(KERN_ERR "WARNING: bogus zero IO-APIC address " "found in MPTABLE, disabling IO/APIC support!\n"); smp_found_config = 0; skip_ioapic_setup = 1; goto fake_ioapic_page; } } else { fake_ioapic_page: ioapic_phys = __pa(alloc_xenheap_page()); clear_page(__va(ioapic_phys)); } set_fixmap_nocache(idx, ioapic_phys); apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", __fix_to_virt(idx), ioapic_phys); idx++; if ( bad_ioapic_register(i) ) { __set_fixmap(idx, 0, 0); continue; } if ( smp_found_config ) { /* The number of IO-APIC IRQ registers (== #pins): */ reg_01.raw = io_apic_read(i, 1); nr_ioapic_entries[i] = reg_01.bits.entries + 1; nr_irqs_gsi += nr_ioapic_entries[i]; if ( rangeset_add_singleton(mmio_ro_ranges, ioapic_phys >> PAGE_SHIFT) ) printk(KERN_ERR "Failed to mark IO-APIC page %lx read-only\n", ioapic_phys); } } nr_irqs_gsi = max(nr_irqs_gsi, highest_gsi() + 1); if ( max_gsi_irqs == 0 ) max_gsi_irqs = nr_irqs ? 
nr_irqs / 8 : PAGE_SIZE; else if ( nr_irqs != 0 && max_gsi_irqs > nr_irqs ) { printk(XENLOG_WARNING "\"max_gsi_irqs=\" cannot be specified larger" " than \"nr_irqs=\"\n"); max_gsi_irqs = nr_irqs; } if ( max_gsi_irqs < 16 ) max_gsi_irqs = 16; /* for PHYSDEVOP_pirq_eoi_gmfn guest assumptions */ if ( max_gsi_irqs > PAGE_SIZE * 8 ) max_gsi_irqs = PAGE_SIZE * 8; if ( !smp_found_config || skip_ioapic_setup || nr_irqs_gsi < 16 ) nr_irqs_gsi = 16; else if ( nr_irqs_gsi > max_gsi_irqs ) { printk(XENLOG_WARNING "Limiting to %u GSI IRQs (found %u)\n", max_gsi_irqs, nr_irqs_gsi); nr_irqs_gsi = max_gsi_irqs; } if ( nr_irqs == 0 ) nr_irqs = cpu_has_apic ? max(16U + num_present_cpus() * NR_DYNAMIC_VECTORS, 8 * nr_irqs_gsi) : nr_irqs_gsi; else if ( nr_irqs < 16 ) nr_irqs = 16; printk(XENLOG_INFO "IRQ limits: %u GSI, %u MSI/MSI-X\n", nr_irqs_gsi, nr_irqs - nr_irqs_gsi); } xen-4.4.0/xen/arch/x86/irq.c0000664000175000017500000020463312307313555013546 0ustar smbsmb/****************************************************************************** * arch/x86/irq.c * * Portions of this file are: * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void parse_irq_vector_map_param(char *s); /* opt_noirqbalance: If true, software IRQ balancing/affinity is disabled. */ bool_t __read_mostly opt_noirqbalance = 0; boolean_param("noirqbalance", opt_noirqbalance); unsigned int __read_mostly nr_irqs_gsi = 16; unsigned int __read_mostly nr_irqs; integer_param("nr_irqs", nr_irqs); /* This default may be changed by the AMD IOMMU code */ int __read_mostly opt_irq_vector_map = OPT_IRQ_VECTOR_MAP_DEFAULT; custom_param("irq_vector_map", parse_irq_vector_map_param); vmask_t global_used_vector_map; struct irq_desc __read_mostly *irq_desc = NULL; static DECLARE_BITMAP(used_vectors, NR_VECTORS); static DEFINE_SPINLOCK(vector_lock); DEFINE_PER_CPU(vector_irq_t, vector_irq); DEFINE_PER_CPU(struct cpu_user_regs *, __irq_regs); static LIST_HEAD(irq_ratelimit_list); static DEFINE_SPINLOCK(irq_ratelimit_lock); static struct timer irq_ratelimit_timer; /* irq_ratelimit: the max irq rate allowed in every 10ms, set 0 to disable */ static unsigned int __read_mostly irq_ratelimit_threshold = 10000; integer_param("irq_ratelimit", irq_ratelimit_threshold); static void __init parse_irq_vector_map_param(char *s) { char *ss; do { ss = strchr(s, ','); if ( ss ) *ss = '\0'; if ( !strcmp(s, "none")) opt_irq_vector_map=OPT_IRQ_VECTOR_MAP_NONE; else if ( !strcmp(s, "global")) opt_irq_vector_map=OPT_IRQ_VECTOR_MAP_GLOBAL; else if ( !strcmp(s, "per-device")) opt_irq_vector_map=OPT_IRQ_VECTOR_MAP_PERDEV; s = ss + 1; } while ( ss ); } /* Must be called when irq disabled */ void lock_vector_lock(void) { /* Used to the online set of cpus does not change * during assign_irq_vector. 
*/ spin_lock(&vector_lock); } void unlock_vector_lock(void) { spin_unlock(&vector_lock); } static void trace_irq_mask(u32 event, int irq, int vector, cpumask_t *mask) { struct { unsigned int irq:16, vec:16; unsigned int mask[6]; } d; d.irq = irq; d.vec = vector; memset(d.mask, 0, sizeof(d.mask)); memcpy(d.mask, mask, min(sizeof(d.mask), sizeof(cpumask_t))); trace_var(event, 1, sizeof(d), &d); } static int __init __bind_irq_vector(int irq, int vector, const cpumask_t *cpu_mask) { cpumask_t online_mask; int cpu; struct irq_desc *desc = irq_to_desc(irq); BUG_ON((unsigned)irq >= nr_irqs); BUG_ON((unsigned)vector >= NR_VECTORS); cpumask_and(&online_mask, cpu_mask, &cpu_online_map); if (cpumask_empty(&online_mask)) return -EINVAL; if ( (desc->arch.vector == vector) && cpumask_equal(desc->arch.cpu_mask, &online_mask) ) return 0; if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED ) return -EBUSY; trace_irq_mask(TRC_HW_IRQ_BIND_VECTOR, irq, vector, &online_mask); for_each_cpu(cpu, &online_mask) per_cpu(vector_irq, cpu)[vector] = irq; desc->arch.vector = vector; cpumask_copy(desc->arch.cpu_mask, &online_mask); if ( desc->arch.used_vectors ) { ASSERT(!test_bit(vector, desc->arch.used_vectors)); set_bit(vector, desc->arch.used_vectors); } desc->arch.used = IRQ_USED; return 0; } int __init bind_irq_vector(int irq, int vector, const cpumask_t *cpu_mask) { unsigned long flags; int ret; spin_lock_irqsave(&vector_lock, flags); ret = __bind_irq_vector(irq, vector, cpu_mask); spin_unlock_irqrestore(&vector_lock, flags); return ret; } /* * Dynamic irq allocate and deallocation for MSI */ int create_irq(int node) { int irq, ret; struct irq_desc *desc; for (irq = nr_irqs_gsi; irq < nr_irqs; irq++) { desc = irq_to_desc(irq); if (cmpxchg(&desc->arch.used, IRQ_UNUSED, IRQ_RESERVED) == IRQ_UNUSED) break; } if (irq >= nr_irqs) return -ENOSPC; ret = init_one_irq_desc(desc); if (!ret) { cpumask_t *mask = NULL; if (node != NUMA_NO_NODE && node >= 0) { mask = &node_to_cpumask(node); if (cpumask_empty(mask)) mask = NULL; } ret = assign_irq_vector(irq, mask); } if (ret < 0) { desc->arch.used = IRQ_UNUSED; irq = ret; } else if ( dom0 ) { ret = irq_permit_access(dom0, irq); if ( ret ) printk(XENLOG_G_ERR "Could not grant Dom0 access to IRQ%d (error %d)\n", irq, ret); } return irq; } void destroy_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; struct irqaction *action; BUG_ON(!MSI_IRQ(irq)); if ( dom0 ) { int err = irq_deny_access(dom0, irq); if ( err ) printk(XENLOG_G_ERR "Could not revoke Dom0 access to IRQ%u (error %d)\n", irq, err); } spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_DISABLED; desc->status &= ~IRQ_GUEST; desc->handler->shutdown(desc); action = desc->action; desc->action = NULL; desc->msi_desc = NULL; cpumask_setall(desc->affinity); spin_unlock_irqrestore(&desc->lock, flags); /* Wait to make sure it's not being used on another CPU */ do { smp_mb(); } while ( desc->status & IRQ_INPROGRESS ); spin_lock_irqsave(&desc->lock, flags); desc->handler = &no_irq_type; clear_irq_vector(irq); desc->arch.used_vectors = NULL; spin_unlock_irqrestore(&desc->lock, flags); xfree(action); } static void __clear_irq_vector(int irq) { int cpu, vector, old_vector; cpumask_t tmp_mask; struct irq_desc *desc = irq_to_desc(irq); BUG_ON(!desc->arch.vector); /* Always clear desc->arch.vector */ vector = desc->arch.vector; cpumask_and(&tmp_mask, desc->arch.cpu_mask, &cpu_online_map); for_each_cpu(cpu, &tmp_mask) { ASSERT( per_cpu(vector_irq, cpu)[vector] == irq ); per_cpu(vector_irq, 
cpu)[vector] = ~irq; } desc->arch.vector = IRQ_VECTOR_UNASSIGNED; cpumask_clear(desc->arch.cpu_mask); if ( desc->arch.used_vectors ) { ASSERT(test_bit(vector, desc->arch.used_vectors)); clear_bit(vector, desc->arch.used_vectors); } desc->arch.used = IRQ_UNUSED; trace_irq_mask(TRC_HW_IRQ_CLEAR_VECTOR, irq, vector, &tmp_mask); if ( likely(!desc->arch.move_in_progress) ) return; /* If we were in motion, also clear desc->arch.old_vector */ old_vector = desc->arch.old_vector; cpumask_and(&tmp_mask, desc->arch.old_cpu_mask, &cpu_online_map); for_each_cpu(cpu, &tmp_mask) { ASSERT( per_cpu(vector_irq, cpu)[old_vector] == irq ); TRACE_3D(TRC_HW_IRQ_MOVE_FINISH, irq, old_vector, cpu); per_cpu(vector_irq, cpu)[old_vector] = ~irq; } desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED; cpumask_clear(desc->arch.old_cpu_mask); if ( desc->arch.used_vectors ) { ASSERT(test_bit(old_vector, desc->arch.used_vectors)); clear_bit(old_vector, desc->arch.used_vectors); } desc->arch.move_in_progress = 0; } void clear_irq_vector(int irq) { unsigned long flags; spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); } int irq_to_vector(int irq) { int vector = -1; BUG_ON(irq >= nr_irqs || irq < 0); if (IO_APIC_IRQ(irq)) { vector = irq_to_desc(irq)->arch.vector; if (vector >= FIRST_LEGACY_VECTOR && vector <= LAST_LEGACY_VECTOR) vector = 0; } else if (MSI_IRQ(irq)) vector = irq_to_desc(irq)->arch.vector; else vector = LEGACY_VECTOR(irq); return vector; } int arch_init_one_irq_desc(struct irq_desc *desc) { if ( !zalloc_cpumask_var(&desc->arch.cpu_mask) ) return -ENOMEM; if ( !alloc_cpumask_var(&desc->arch.old_cpu_mask) ) { free_cpumask_var(desc->arch.cpu_mask); return -ENOMEM; } if ( !alloc_cpumask_var(&desc->arch.pending_mask) ) { free_cpumask_var(desc->arch.old_cpu_mask); free_cpumask_var(desc->arch.cpu_mask); return -ENOMEM; } desc->arch.vector = IRQ_VECTOR_UNASSIGNED; desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED; return 0; } int __init init_irq_data(void) { struct irq_desc *desc; int irq, vector; for (vector = 0; vector < NR_VECTORS; ++vector) this_cpu(vector_irq)[vector] = INT_MIN; irq_desc = xzalloc_array(struct irq_desc, nr_irqs); if ( !irq_desc ) return -ENOMEM; for (irq = 0; irq < nr_irqs_gsi; irq++) { desc = irq_to_desc(irq); desc->irq = irq; init_one_irq_desc(desc); } for (; irq < nr_irqs; irq++) irq_to_desc(irq)->irq = irq; /* Never allocate the hypercall vector or Linux/BSD fast-trap vector. 
*/ set_bit(LEGACY_SYSCALL_VECTOR, used_vectors); set_bit(HYPERCALL_VECTOR, used_vectors); /* IRQ_MOVE_CLEANUP_VECTOR used for clean up vectors */ set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); return 0; } static void __do_IRQ_guest(int vector); static void ack_none(struct irq_desc *desc) { ack_bad_irq(desc->irq); } hw_irq_controller no_irq_type = { "none", irq_startup_none, irq_shutdown_none, irq_enable_none, irq_disable_none, ack_none, }; static vmask_t *irq_get_used_vector_mask(int irq) { vmask_t *ret = NULL; if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_GLOBAL ) { struct irq_desc *desc = irq_to_desc(irq); ret = &global_used_vector_map; if ( desc->arch.used_vectors ) { printk(XENLOG_INFO "%s: Strange, unassigned irq %d already has used_vectors!\n", __func__, irq); } else { int vector; vector = irq_to_vector(irq); if ( vector > 0 ) { printk(XENLOG_INFO "%s: Strange, irq %d already assigned vector %d!\n", __func__, irq, vector); ASSERT(!test_bit(vector, ret)); set_bit(vector, ret); } } } else if ( IO_APIC_IRQ(irq) && opt_irq_vector_map != OPT_IRQ_VECTOR_MAP_NONE ) { ret = io_apic_get_used_vector_map(irq); } return ret; } static int __assign_irq_vector( int irq, struct irq_desc *desc, const cpumask_t *mask) { /* * NOTE! The local APIC isn't very good at handling * multiple interrupts at the same interrupt level. * As the interrupt level is determined by taking the * vector number and shifting that right by 4, we * want to spread these out a bit so that they don't * all fall in the same interrupt level. * * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ static int current_vector = FIRST_DYNAMIC_VECTOR, current_offset = 0; int cpu, err, old_vector; cpumask_t tmp_mask; vmask_t *irq_used_vectors = NULL; old_vector = irq_to_vector(irq); if (old_vector > 0) { cpumask_and(&tmp_mask, mask, &cpu_online_map); if (cpumask_intersects(&tmp_mask, desc->arch.cpu_mask)) { desc->arch.vector = old_vector; return 0; } } if ( desc->arch.move_in_progress || desc->arch.move_cleanup_count ) return -EAGAIN; err = -ENOSPC; /* This is the only place normal IRQs are ever marked * as "in use". If they're not in use yet, check to see * if we need to assign a global vector mask. */ if ( desc->arch.used == IRQ_USED ) { irq_used_vectors = desc->arch.used_vectors; } else irq_used_vectors = irq_get_used_vector_mask(irq); for_each_cpu(cpu, mask) { int new_cpu; int vector, offset; /* Only try and allocate irqs on cpus that are present. */ if (!cpu_online(cpu)) continue; cpumask_and(&tmp_mask, vector_allocation_cpumask(cpu), &cpu_online_map); vector = current_vector; offset = current_offset; next: vector += 8; if (vector > LAST_DYNAMIC_VECTOR) { /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; vector = FIRST_DYNAMIC_VECTOR + offset; } if (unlikely(current_vector == vector)) continue; if (test_bit(vector, used_vectors)) goto next; if (irq_used_vectors && test_bit(vector, irq_used_vectors) ) goto next; for_each_cpu(new_cpu, &tmp_mask) if (per_cpu(vector_irq, new_cpu)[vector] >= 0) goto next; /* Found one! 
 */
        current_vector = vector;
        current_offset = offset;
        if (old_vector > 0) {
            desc->arch.move_in_progress = 1;
            cpumask_copy(desc->arch.old_cpu_mask, desc->arch.cpu_mask);
            desc->arch.old_vector = desc->arch.vector;
        }
        trace_irq_mask(TRC_HW_IRQ_ASSIGN_VECTOR, irq, vector, &tmp_mask);
        for_each_cpu(new_cpu, &tmp_mask)
            per_cpu(vector_irq, new_cpu)[vector] = irq;
        desc->arch.vector = vector;
        cpumask_copy(desc->arch.cpu_mask, &tmp_mask);
        desc->arch.used = IRQ_USED;
        ASSERT((desc->arch.used_vectors == NULL)
               || (desc->arch.used_vectors == irq_used_vectors));
        desc->arch.used_vectors = irq_used_vectors;
        if ( desc->arch.used_vectors )
        {
            ASSERT(!test_bit(vector, desc->arch.used_vectors));
            set_bit(vector, desc->arch.used_vectors);
        }
        err = 0;
        break;
    }
    return err;
}

int assign_irq_vector(int irq, const cpumask_t *mask)
{
    int ret;
    unsigned long flags;
    struct irq_desc *desc = irq_to_desc(irq);

    BUG_ON(irq >= nr_irqs || irq <0);

    spin_lock_irqsave(&vector_lock, flags);
    ret = __assign_irq_vector(irq, desc, mask ?: TARGET_CPUS);
    if (!ret) {
        ret = desc->arch.vector;
        cpumask_copy(desc->affinity, desc->arch.cpu_mask);
    }
    spin_unlock_irqrestore(&vector_lock, flags);
    return ret;
}

/*
 * Initialize vector_irq on a new cpu. This function must be called
 * with vector_lock held.
 */
void __setup_vector_irq(int cpu)
{
    int irq, vector;

    /* Clear vector_irq */
    for (vector = 0; vector < NR_VECTORS; ++vector)
        per_cpu(vector_irq, cpu)[vector] = INT_MIN;
    /* Mark the in-use vectors */
    for (irq = 0; irq < nr_irqs; ++irq) {
        struct irq_desc *desc = irq_to_desc(irq);

        if (!irq_desc_initialized(desc) ||
            !cpumask_test_cpu(cpu, desc->arch.cpu_mask))
            continue;
        vector = irq_to_vector(irq);
        per_cpu(vector_irq, cpu)[vector] = irq;
    }
}

void move_masked_irq(struct irq_desc *desc)
{
    cpumask_t *pending_mask = desc->arch.pending_mask;

    if (likely(!(desc->status & IRQ_MOVE_PENDING)))
        return;

    desc->status &= ~IRQ_MOVE_PENDING;

    if (unlikely(cpumask_empty(pending_mask)))
        return;

    if (!desc->handler->set_affinity)
        return;

    /*
     * If there was a valid mask to work with, please do the disable,
     * re-program, enable sequence. This is *not* particularly important for
     * level-triggered interrupts, but in an edge-triggered case we might be
     * setting the RTE while an active trigger is coming in. This could cause
     * some ioapics to malfunction. Being paranoid, I guess!
     *
     * For correct operation this depends on the caller masking the irqs.
     */
    if ( likely(cpumask_intersects(pending_mask, &cpu_online_map)) )
        desc->handler->set_affinity(desc, pending_mask);

    cpumask_clear(pending_mask);
}

void move_native_irq(struct irq_desc *desc)
{
    if (likely(!(desc->status & IRQ_MOVE_PENDING)))
        return;

    if (unlikely(desc->status & IRQ_DISABLED))
        return;

    desc->handler->disable(desc);
    move_masked_irq(desc);
    desc->handler->enable(desc);
}

void irq_move_cleanup_interrupt(struct cpu_user_regs *regs)
{
    unsigned vector, me;

    ack_APIC_irq();

    me = smp_processor_id();
    for ( vector = FIRST_DYNAMIC_VECTOR;
          vector <= LAST_HIPRIORITY_VECTOR; vector++)
    {
        unsigned int irq;
        unsigned int irr;
        struct irq_desc *desc;
        irq = __get_cpu_var(vector_irq)[vector];

        if ((int)irq < 0)
            continue;

        if ( vector >= FIRST_LEGACY_VECTOR && vector <= LAST_LEGACY_VECTOR )
            continue;

        desc = irq_to_desc(irq);
        if (!desc)
            continue;

        spin_lock(&desc->lock);
        if (!desc->arch.move_cleanup_count)
            goto unlock;

        if ( vector == desc->arch.vector &&
             cpumask_test_cpu(me, desc->arch.cpu_mask) )
            goto unlock;

        irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
        /*
         * Check if the vector that needs to be cleaned up is
         * registered at the cpu's IRR.
If so, then this is not * the best time to clean it up. Lets clean it up in the * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR * to myself. */ if (irr & (1 << (vector % 32))) { send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); TRACE_3D(TRC_HW_IRQ_MOVE_CLEANUP_DELAY, irq, vector, smp_processor_id()); goto unlock; } TRACE_3D(TRC_HW_IRQ_MOVE_CLEANUP, irq, vector, smp_processor_id()); __get_cpu_var(vector_irq)[vector] = ~irq; desc->arch.move_cleanup_count--; if ( desc->arch.move_cleanup_count == 0 ) { desc->arch.old_vector = IRQ_VECTOR_UNASSIGNED; cpumask_clear(desc->arch.old_cpu_mask); if ( desc->arch.used_vectors ) { ASSERT(test_bit(vector, desc->arch.used_vectors)); clear_bit(vector, desc->arch.used_vectors); } } unlock: spin_unlock(&desc->lock); } } static void send_cleanup_vector(struct irq_desc *desc) { cpumask_t cleanup_mask; cpumask_and(&cleanup_mask, desc->arch.old_cpu_mask, &cpu_online_map); desc->arch.move_cleanup_count = cpumask_weight(&cleanup_mask); send_IPI_mask(&cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); desc->arch.move_in_progress = 0; } void irq_complete_move(struct irq_desc *desc) { unsigned vector, me; if (likely(!desc->arch.move_in_progress)) return; vector = (u8)get_irq_regs()->entry_vector; me = smp_processor_id(); if ( vector == desc->arch.vector && cpumask_test_cpu(me, desc->arch.cpu_mask) ) send_cleanup_vector(desc); } unsigned int set_desc_affinity(struct irq_desc *desc, const cpumask_t *mask) { unsigned int irq; int ret; unsigned long flags; cpumask_t dest_mask; if (!cpumask_intersects(mask, &cpu_online_map)) return BAD_APICID; irq = desc->irq; spin_lock_irqsave(&vector_lock, flags); ret = __assign_irq_vector(irq, desc, mask); spin_unlock_irqrestore(&vector_lock, flags); if (ret < 0) return BAD_APICID; cpumask_copy(desc->affinity, mask); cpumask_and(&dest_mask, mask, desc->arch.cpu_mask); return cpu_mask_to_apicid(&dest_mask); } /* For re-setting irq interrupt affinity for specific irq */ void irq_set_affinity(struct irq_desc *desc, const cpumask_t *mask) { if (!desc->handler->set_affinity) return; ASSERT(spin_is_locked(&desc->lock)); desc->status &= ~IRQ_MOVE_PENDING; wmb(); cpumask_copy(desc->arch.pending_mask, mask); wmb(); desc->status |= IRQ_MOVE_PENDING; } void pirq_set_affinity(struct domain *d, int pirq, const cpumask_t *mask) { unsigned long flags; struct irq_desc *desc = domain_spin_lock_irq_desc(d, pirq, &flags); if ( !desc ) return; irq_set_affinity(desc, mask); spin_unlock_irqrestore(&desc->lock, flags); } DEFINE_PER_CPU(unsigned int, irq_count); uint8_t alloc_hipriority_vector(void) { static uint8_t next = FIRST_HIPRIORITY_VECTOR; BUG_ON(next < FIRST_HIPRIORITY_VECTOR); BUG_ON(next > LAST_HIPRIORITY_VECTOR); return next++; } static void (*direct_apic_vector[NR_VECTORS])(struct cpu_user_regs *); void set_direct_apic_vector( uint8_t vector, void (*handler)(struct cpu_user_regs *)) { BUG_ON(direct_apic_vector[vector] != NULL); direct_apic_vector[vector] = handler; } void alloc_direct_apic_vector( uint8_t *vector, void (*handler)(struct cpu_user_regs *)) { static DEFINE_SPINLOCK(lock); spin_lock(&lock); if (*vector == 0) { *vector = alloc_hipriority_vector(); set_direct_apic_vector(*vector, handler); } spin_unlock(&lock); } void do_IRQ(struct cpu_user_regs *regs) { struct irqaction *action; uint32_t tsc_in; struct irq_desc *desc; unsigned int vector = (u8)regs->entry_vector; int irq = __get_cpu_var(vector_irq[vector]); struct cpu_user_regs *old_regs = set_irq_regs(regs); perfc_incr(irqs); this_cpu(irq_count)++; irq_enter(); if (irq < 0) { if 
(direct_apic_vector[vector] != NULL) { (*direct_apic_vector[vector])(regs); } else { const char *kind = ", LAPIC"; if ( apic_isr_read(vector) ) ack_APIC_irq(); else kind = ""; if ( ! ( vector >= FIRST_LEGACY_VECTOR && vector <= LAST_LEGACY_VECTOR && bogus_8259A_irq(vector - FIRST_LEGACY_VECTOR) ) ) { printk("CPU%u: No irq handler for vector %02x (IRQ %d%s)\n", smp_processor_id(), vector, irq, kind); desc = irq_to_desc(~irq); if ( ~irq < nr_irqs && irq_desc_initialized(desc) ) { spin_lock(&desc->lock); printk("IRQ%d a=%04lx[%04lx,%04lx] v=%02x[%02x] t=%s s=%08x\n", ~irq, *cpumask_bits(desc->affinity), *cpumask_bits(desc->arch.cpu_mask), *cpumask_bits(desc->arch.old_cpu_mask), desc->arch.vector, desc->arch.old_vector, desc->handler->typename, desc->status); spin_unlock(&desc->lock); } } TRACE_1D(TRC_HW_IRQ_UNMAPPED_VECTOR, vector); } goto out_no_unlock; } desc = irq_to_desc(irq); spin_lock(&desc->lock); desc->handler->ack(desc); if ( likely(desc->status & IRQ_GUEST) ) { if ( irq_ratelimit_timer.function && /* irq rate limiting enabled? */ unlikely(desc->rl_cnt++ >= irq_ratelimit_threshold) ) { s_time_t now = NOW(); if ( now < (desc->rl_quantum_start + MILLISECS(10)) ) { desc->handler->disable(desc); /* * If handler->disable doesn't actually mask the interrupt, a * disabled irq still can fire. This check also avoids possible * deadlocks if ratelimit_timer_fn runs at the same time. */ if ( likely(list_empty(&desc->rl_link)) ) { spin_lock(&irq_ratelimit_lock); if ( list_empty(&irq_ratelimit_list) ) set_timer(&irq_ratelimit_timer, now + MILLISECS(10)); list_add(&desc->rl_link, &irq_ratelimit_list); spin_unlock(&irq_ratelimit_lock); } goto out; } desc->rl_cnt = 0; desc->rl_quantum_start = now; } tsc_in = tb_init_done ? get_cycles() : 0; __do_IRQ_guest(irq); TRACE_3D(TRC_HW_IRQ_HANDLED, irq, tsc_in, get_cycles()); goto out_no_end; } desc->status &= ~IRQ_REPLAY; desc->status |= IRQ_PENDING; /* * Since we set PENDING, if another processor is handling a different * instance of this same irq, the other processor will take care of it. */ if ( desc->status & (IRQ_DISABLED | IRQ_INPROGRESS) ) goto out; desc->status |= IRQ_INPROGRESS; action = desc->action; while ( desc->status & IRQ_PENDING ) { desc->status &= ~IRQ_PENDING; spin_unlock_irq(&desc->lock); tsc_in = tb_init_done ? 
get_cycles() : 0; action->handler(irq, action->dev_id, regs); TRACE_3D(TRC_HW_IRQ_HANDLED, irq, tsc_in, get_cycles()); spin_lock_irq(&desc->lock); } desc->status &= ~IRQ_INPROGRESS; out: if ( desc->handler->end ) desc->handler->end(desc, vector); out_no_end: spin_unlock(&desc->lock); out_no_unlock: irq_exit(); set_irq_regs(old_regs); } static void irq_ratelimit_timer_fn(void *data) { struct irq_desc *desc, *tmp; unsigned long flags; spin_lock_irqsave(&irq_ratelimit_lock, flags); list_for_each_entry_safe ( desc, tmp, &irq_ratelimit_list, rl_link ) { spin_lock(&desc->lock); desc->handler->enable(desc); list_del(&desc->rl_link); INIT_LIST_HEAD(&desc->rl_link); spin_unlock(&desc->lock); } spin_unlock_irqrestore(&irq_ratelimit_lock, flags); } static int __init irq_ratelimit_init(void) { if ( irq_ratelimit_threshold ) init_timer(&irq_ratelimit_timer, irq_ratelimit_timer_fn, NULL, 0); return 0; } __initcall(irq_ratelimit_init); int __init request_irq(unsigned int irq, void (*handler)(int, void *, struct cpu_user_regs *), const char * devname, void *dev_id) { struct irqaction * action; int retval; /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). */ if (irq >= nr_irqs) return -EINVAL; if (!handler) return -EINVAL; action = xmalloc(struct irqaction); if (!action) return -ENOMEM; action->handler = handler; action->name = devname; action->dev_id = dev_id; action->free_on_release = 1; retval = setup_irq(irq, action); if (retval) xfree(action); return retval; } void __init release_irq(unsigned int irq) { struct irq_desc *desc; unsigned long flags; struct irqaction *action; desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock,flags); action = desc->action; desc->action = NULL; desc->status |= IRQ_DISABLED; desc->handler->shutdown(desc); spin_unlock_irqrestore(&desc->lock,flags); /* Wait to make sure it's not being used on another CPU */ do { smp_mb(); } while ( desc->status & IRQ_INPROGRESS ); if (action && action->free_on_release) xfree(action); } int __init setup_irq(unsigned int irq, struct irqaction *new) { struct irq_desc *desc; unsigned long flags; desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock,flags); if ( desc->action != NULL ) { spin_unlock_irqrestore(&desc->lock,flags); return -EBUSY; } desc->action = new; desc->status &= ~IRQ_DISABLED; desc->handler->startup(desc); spin_unlock_irqrestore(&desc->lock,flags); return 0; } /* * HANDLING OF GUEST-BOUND PHYSICAL IRQS */ #define IRQ_MAX_GUESTS 7 typedef struct { u8 nr_guests; u8 in_flight; u8 shareable; u8 ack_type; #define ACKTYPE_NONE 0 /* No final acknowledgement is required */ #define ACKTYPE_UNMASK 1 /* Unmask PIC hardware (from any CPU) */ #define ACKTYPE_EOI 2 /* EOI on the CPU that was interrupted */ cpumask_var_t cpu_eoi_map; /* CPUs that need to EOI this interrupt */ struct timer eoi_timer; struct domain *guest[IRQ_MAX_GUESTS]; } irq_guest_action_t; /* * Stack of interrupts awaiting EOI on each CPU. These must be popped in * order, as only the current highest-priority pending irq can be EOIed. */ struct pending_eoi { u32 ready:1; /* Ready for EOI now? 
*/ u32 irq:23; /* irq of the vector */ u32 vector:8; /* vector awaiting EOI */ }; static DEFINE_PER_CPU(struct pending_eoi, pending_eoi[NR_DYNAMIC_VECTORS]); #define pending_eoi_sp(p) ((p)[NR_DYNAMIC_VECTORS-1].vector) bool_t cpu_has_pending_apic_eoi(void) { return (pending_eoi_sp(this_cpu(pending_eoi)) != 0); } static inline void set_pirq_eoi(struct domain *d, unsigned int irq) { if ( !is_hvm_domain(d) && d->arch.pv_domain.pirq_eoi_map ) set_bit(irq, d->arch.pv_domain.pirq_eoi_map); } static inline void clear_pirq_eoi(struct domain *d, unsigned int irq) { if ( !is_hvm_domain(d) && d->arch.pv_domain.pirq_eoi_map ) clear_bit(irq, d->arch.pv_domain.pirq_eoi_map); } static void set_eoi_ready(void *data); static void irq_guest_eoi_timer_fn(void *data) { struct irq_desc *desc = data; unsigned int irq = desc - irq_desc; irq_guest_action_t *action; cpumask_t cpu_eoi_map; unsigned long flags; spin_lock_irqsave(&desc->lock, flags); if ( !(desc->status & IRQ_GUEST) ) goto out; action = (irq_guest_action_t *)desc->action; if ( action->ack_type != ACKTYPE_NONE ) { unsigned int i; for ( i = 0; i < action->nr_guests; i++ ) { struct domain *d = action->guest[i]; unsigned int pirq = domain_irq_to_pirq(d, irq); if ( test_and_clear_bool(pirq_info(d, pirq)->masked) ) action->in_flight--; } } if ( action->in_flight != 0 ) goto out; switch ( action->ack_type ) { case ACKTYPE_UNMASK: if ( desc->handler->end ) desc->handler->end(desc, 0); break; case ACKTYPE_EOI: cpumask_copy(&cpu_eoi_map, action->cpu_eoi_map); spin_unlock_irq(&desc->lock); on_selected_cpus(&cpu_eoi_map, set_eoi_ready, desc, 0); spin_lock_irq(&desc->lock); break; } out: spin_unlock_irqrestore(&desc->lock, flags); } static void __do_IRQ_guest(int irq) { struct irq_desc *desc = irq_to_desc(irq); irq_guest_action_t *action = (irq_guest_action_t *)desc->action; struct domain *d; int i, sp; struct pending_eoi *peoi = this_cpu(pending_eoi); unsigned int vector = (u8)get_irq_regs()->entry_vector; if ( unlikely(action->nr_guests == 0) ) { /* An interrupt may slip through while freeing an ACKTYPE_EOI irq. */ ASSERT(action->ack_type == ACKTYPE_EOI); ASSERT(desc->status & IRQ_DISABLED); if ( desc->handler->end ) desc->handler->end(desc, vector); return; } if ( action->ack_type == ACKTYPE_EOI ) { sp = pending_eoi_sp(peoi); ASSERT((sp == 0) || (peoi[sp-1].vector < vector)); ASSERT(sp < (NR_DYNAMIC_VECTORS-1)); peoi[sp].irq = irq; peoi[sp].vector = vector; peoi[sp].ready = 0; pending_eoi_sp(peoi) = sp+1; cpumask_set_cpu(smp_processor_id(), action->cpu_eoi_map); } for ( i = 0; i < action->nr_guests; i++ ) { struct pirq *pirq; d = action->guest[i]; pirq = pirq_info(d, domain_irq_to_pirq(d, irq)); if ( (action->ack_type != ACKTYPE_NONE) && !test_and_set_bool(pirq->masked) ) action->in_flight++; if ( !hvm_do_IRQ_dpci(d, pirq) ) send_guest_pirq(d, pirq); } if ( action->ack_type != ACKTYPE_NONE ) { stop_timer(&action->eoi_timer); migrate_timer(&action->eoi_timer, smp_processor_id()); set_timer(&action->eoi_timer, NOW() + MILLISECS(1)); } } /* * Retrieve Xen irq-descriptor corresponding to a domain-specific irq. * The descriptor is returned locked. This function is safe against changes * to the per-domain irq-to-vector mapping. */ struct irq_desc *domain_spin_lock_irq_desc( struct domain *d, int pirq, unsigned long *pflags) { const struct pirq *info = pirq_info(d, pirq); return info ? pirq_spin_lock_irq_desc(info, pflags) : NULL; } /* * Same with struct pirq already looked up. 
*/ struct irq_desc *pirq_spin_lock_irq_desc( const struct pirq *pirq, unsigned long *pflags) { struct irq_desc *desc; unsigned long flags; for ( ; ; ) { int irq = pirq->arch.irq; if ( irq <= 0 ) return NULL; desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); if ( irq == pirq->arch.irq ) break; spin_unlock_irqrestore(&desc->lock, flags); } if ( pflags ) *pflags = flags; return desc; } static int prepare_domain_irq_pirq(struct domain *d, int irq, int pirq, struct pirq **pinfo) { int err = radix_tree_insert(&d->arch.irq_pirq, irq, radix_tree_int_to_ptr(0)); struct pirq *info; if ( err && err != -EEXIST ) return err; info = pirq_get_info(d, pirq); if ( !info ) { if ( !err ) radix_tree_delete(&d->arch.irq_pirq, irq); return -ENOMEM; } *pinfo = info; return 0; } static void set_domain_irq_pirq(struct domain *d, int irq, struct pirq *pirq) { radix_tree_replace_slot( radix_tree_lookup_slot(&d->arch.irq_pirq, irq), radix_tree_int_to_ptr(pirq->pirq)); pirq->arch.irq = irq; } static void clear_domain_irq_pirq(struct domain *d, int irq, struct pirq *pirq) { pirq->arch.irq = 0; radix_tree_replace_slot( radix_tree_lookup_slot(&d->arch.irq_pirq, irq), radix_tree_int_to_ptr(0)); } static void cleanup_domain_irq_pirq(struct domain *d, int irq, struct pirq *pirq) { pirq_cleanup_check(pirq, d); radix_tree_delete(&d->arch.irq_pirq, irq); } int init_domain_irq_mapping(struct domain *d) { unsigned int i; int err = 0; radix_tree_init(&d->arch.irq_pirq); if ( is_hvm_domain(d) ) radix_tree_init(&d->arch.hvm_domain.emuirq_pirq); for ( i = 1; platform_legacy_irq(i); ++i ) { struct pirq *info; if ( IO_APIC_IRQ(i) ) continue; err = prepare_domain_irq_pirq(d, i, i, &info); if ( err ) break; set_domain_irq_pirq(d, i, info); } if ( err ) cleanup_domain_irq_mapping(d); return err; } void cleanup_domain_irq_mapping(struct domain *d) { radix_tree_destroy(&d->arch.irq_pirq, NULL); if ( is_hvm_domain(d) ) radix_tree_destroy(&d->arch.hvm_domain.emuirq_pirq, NULL); } struct pirq *alloc_pirq_struct(struct domain *d) { size_t sz = is_hvm_domain(d) ? sizeof(struct pirq) : offsetof(struct pirq, arch.hvm); struct pirq *pirq = xzalloc_bytes(sz); if ( pirq ) { if ( is_hvm_domain(d) ) { pirq->arch.hvm.emuirq = IRQ_UNBOUND; pt_pirq_init(d, &pirq->arch.hvm.dpci); } } return pirq; } void (pirq_cleanup_check)(struct pirq *pirq, struct domain *d) { /* * Check whether all fields have their default values, and delete * the entry from the tree if so. * * NB: Common parts were already checked. */ if ( pirq->arch.irq ) return; if ( is_hvm_domain(d) ) { if ( pirq->arch.hvm.emuirq != IRQ_UNBOUND ) return; if ( !pt_pirq_cleanup_check(&pirq->arch.hvm.dpci) ) return; } if ( radix_tree_delete(&d->pirq_tree, pirq->pirq) != pirq ) BUG(); } /* Flush all ready EOIs from the top of this CPU's pending-EOI stack. 
*/ static void flush_ready_eoi(void) { struct pending_eoi *peoi = this_cpu(pending_eoi); struct irq_desc *desc; int irq, sp; ASSERT(!local_irq_is_enabled()); sp = pending_eoi_sp(peoi); while ( (--sp >= 0) && peoi[sp].ready ) { irq = peoi[sp].irq; ASSERT(irq > 0); desc = irq_to_desc(irq); spin_lock(&desc->lock); if ( desc->handler->end ) desc->handler->end(desc, peoi[sp].vector); spin_unlock(&desc->lock); } pending_eoi_sp(peoi) = sp+1; } static void __set_eoi_ready(struct irq_desc *desc) { irq_guest_action_t *action = (irq_guest_action_t *)desc->action; struct pending_eoi *peoi = this_cpu(pending_eoi); int irq, sp; irq = desc - irq_desc; if ( !(desc->status & IRQ_GUEST) || (action->in_flight != 0) || !cpumask_test_and_clear_cpu(smp_processor_id(), action->cpu_eoi_map) ) return; sp = pending_eoi_sp(peoi); do { ASSERT(sp > 0); } while ( peoi[--sp].irq != irq ); ASSERT(!peoi[sp].ready); peoi[sp].ready = 1; } /* Mark specified IRQ as ready-for-EOI (if it really is) and attempt to EOI. */ static void set_eoi_ready(void *data) { struct irq_desc *desc = data; ASSERT(!local_irq_is_enabled()); spin_lock(&desc->lock); __set_eoi_ready(desc); spin_unlock(&desc->lock); flush_ready_eoi(); } void pirq_guest_eoi(struct pirq *pirq) { struct irq_desc *desc; ASSERT(local_irq_is_enabled()); desc = pirq_spin_lock_irq_desc(pirq, NULL); if ( desc ) desc_guest_eoi(desc, pirq); } void desc_guest_eoi(struct irq_desc *desc, struct pirq *pirq) { irq_guest_action_t *action; cpumask_t cpu_eoi_map; int irq; if ( !(desc->status & IRQ_GUEST) ) { spin_unlock_irq(&desc->lock); return; } action = (irq_guest_action_t *)desc->action; irq = desc - irq_desc; if ( unlikely(!test_and_clear_bool(pirq->masked)) || unlikely(--action->in_flight != 0) ) { spin_unlock_irq(&desc->lock); return; } if ( action->ack_type == ACKTYPE_UNMASK ) { ASSERT(cpumask_empty(action->cpu_eoi_map)); if ( desc->handler->end ) desc->handler->end(desc, 0); spin_unlock_irq(&desc->lock); return; } ASSERT(action->ack_type == ACKTYPE_EOI); cpumask_copy(&cpu_eoi_map, action->cpu_eoi_map); if ( cpumask_test_and_clear_cpu(smp_processor_id(), &cpu_eoi_map) ) { __set_eoi_ready(desc); spin_unlock(&desc->lock); flush_ready_eoi(); local_irq_enable(); } else { spin_unlock_irq(&desc->lock); } if ( !cpumask_empty(&cpu_eoi_map) ) on_selected_cpus(&cpu_eoi_map, set_eoi_ready, desc, 0); } int pirq_guest_unmask(struct domain *d) { unsigned int pirq = 0, n, i; struct pirq *pirqs[16]; do { n = radix_tree_gang_lookup(&d->pirq_tree, (void **)pirqs, pirq, ARRAY_SIZE(pirqs)); for ( i = 0; i < n; ++i ) { pirq = pirqs[i]->pirq; if ( pirqs[i]->masked && !evtchn_port_is_masked(d, evtchn_from_port(d, pirqs[i]->evtchn)) ) pirq_guest_eoi(pirqs[i]); } } while ( ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) ); return 0; } static int pirq_acktype(struct domain *d, int pirq) { struct irq_desc *desc; int irq; irq = domain_pirq_to_irq(d, pirq); if ( irq <= 0 ) return ACKTYPE_NONE; desc = irq_to_desc(irq); if ( desc->handler == &no_irq_type ) return ACKTYPE_NONE; /* * Edge-triggered IO-APIC and LAPIC interrupts need no final * acknowledgement: we ACK early during interrupt processing. */ if ( !strcmp(desc->handler->typename, "IO-APIC-edge") || !strcmp(desc->handler->typename, "local-APIC-edge") ) return ACKTYPE_NONE; /* * MSIs are treated as edge-triggered interrupts, except * when there is no proper way to mask them. */ if ( desc->msi_desc ) return msi_maskable_irq(desc->msi_desc) ? 
ACKTYPE_NONE : ACKTYPE_EOI; /* * Level-triggered IO-APIC interrupts need to be acknowledged on the CPU * on which they were received. This is because we tickle the LAPIC to EOI. */ if ( !strcmp(desc->handler->typename, "IO-APIC-level") ) return desc->handler->ack == irq_complete_move ? ACKTYPE_EOI : ACKTYPE_UNMASK; /* Legacy PIC interrupts can be acknowledged from any CPU. */ if ( !strcmp(desc->handler->typename, "XT-PIC") ) return ACKTYPE_UNMASK; printk("Unknown PIC type '%s' for IRQ %d\n", desc->handler->typename, irq); BUG(); return 0; } int pirq_shared(struct domain *d, int pirq) { struct irq_desc *desc; irq_guest_action_t *action; unsigned long flags; int shared; desc = domain_spin_lock_irq_desc(d, pirq, &flags); if ( desc == NULL ) return 0; action = (irq_guest_action_t *)desc->action; shared = ((desc->status & IRQ_GUEST) && (action->nr_guests > 1)); spin_unlock_irqrestore(&desc->lock, flags); return shared; } int pirq_guest_bind(struct vcpu *v, struct pirq *pirq, int will_share) { unsigned int irq; struct irq_desc *desc; irq_guest_action_t *action, *newaction = NULL; int rc = 0; WARN_ON(!spin_is_locked(&v->domain->event_lock)); BUG_ON(!local_irq_is_enabled()); retry: desc = pirq_spin_lock_irq_desc(pirq, NULL); if ( desc == NULL ) { rc = -EINVAL; goto out; } action = (irq_guest_action_t *)desc->action; irq = desc - irq_desc; if ( !(desc->status & IRQ_GUEST) ) { if ( desc->action != NULL ) { printk(XENLOG_G_INFO "Cannot bind IRQ%d to dom%d. In use by '%s'.\n", pirq->pirq, v->domain->domain_id, desc->action->name); rc = -EBUSY; goto unlock_out; } if ( newaction == NULL ) { spin_unlock_irq(&desc->lock); if ( (newaction = xmalloc(irq_guest_action_t)) != NULL && zalloc_cpumask_var(&newaction->cpu_eoi_map) ) goto retry; xfree(newaction); printk(XENLOG_G_INFO "Cannot bind IRQ%d to dom%d. Out of memory.\n", pirq->pirq, v->domain->domain_id); return -ENOMEM; } action = newaction; desc->action = (struct irqaction *)action; newaction = NULL; action->nr_guests = 0; action->in_flight = 0; action->shareable = will_share; action->ack_type = pirq_acktype(v->domain, pirq->pirq); init_timer(&action->eoi_timer, irq_guest_eoi_timer_fn, desc, 0); desc->status |= IRQ_GUEST; desc->status &= ~IRQ_DISABLED; desc->handler->startup(desc); /* Attempt to bind the interrupt target to the correct CPU. */ if ( !opt_noirqbalance && (desc->handler->set_affinity != NULL) ) desc->handler->set_affinity(desc, cpumask_of(v->processor)); } else if ( !will_share || !action->shareable ) { printk(XENLOG_G_INFO "Cannot bind IRQ%d to dom%d. %s.\n", pirq->pirq, v->domain->domain_id, will_share ? "Others do not share" : "Will not share with others"); rc = -EBUSY; goto unlock_out; } else if ( action->nr_guests == 0 ) { /* * Indicates that an ACKTYPE_EOI interrupt is being released. * Wait for that to happen before continuing. */ ASSERT(action->ack_type == ACKTYPE_EOI); ASSERT(desc->status & IRQ_DISABLED); spin_unlock_irq(&desc->lock); cpu_relax(); goto retry; } if ( action->nr_guests == IRQ_MAX_GUESTS ) { printk(XENLOG_G_INFO "Cannot bind IRQ%d to dom%d. 
" "Already at max share.\n", pirq->pirq, v->domain->domain_id); rc = -EBUSY; goto unlock_out; } action->guest[action->nr_guests++] = v->domain; if ( action->ack_type != ACKTYPE_NONE ) set_pirq_eoi(v->domain, pirq->pirq); else clear_pirq_eoi(v->domain, pirq->pirq); unlock_out: spin_unlock_irq(&desc->lock); out: if ( newaction != NULL ) { free_cpumask_var(newaction->cpu_eoi_map); xfree(newaction); } return rc; } static irq_guest_action_t *__pirq_guest_unbind( struct domain *d, struct pirq *pirq, struct irq_desc *desc) { unsigned int irq; irq_guest_action_t *action; cpumask_t cpu_eoi_map; int i; action = (irq_guest_action_t *)desc->action; irq = desc - irq_desc; if ( unlikely(action == NULL) ) { dprintk(XENLOG_G_WARNING, "dom%d: pirq %d: desc->action is NULL!\n", d->domain_id, pirq->pirq); return NULL; } BUG_ON(!(desc->status & IRQ_GUEST)); for ( i = 0; (i < action->nr_guests) && (action->guest[i] != d); i++ ) continue; BUG_ON(i == action->nr_guests); memmove(&action->guest[i], &action->guest[i+1], (action->nr_guests-i-1) * sizeof(action->guest[0])); action->nr_guests--; switch ( action->ack_type ) { case ACKTYPE_UNMASK: if ( test_and_clear_bool(pirq->masked) && (--action->in_flight == 0) && desc->handler->end ) desc->handler->end(desc, 0); break; case ACKTYPE_EOI: /* NB. If #guests == 0 then we clear the eoi_map later on. */ if ( test_and_clear_bool(pirq->masked) && (--action->in_flight == 0) && (action->nr_guests != 0) ) { cpumask_copy(&cpu_eoi_map, action->cpu_eoi_map); spin_unlock_irq(&desc->lock); on_selected_cpus(&cpu_eoi_map, set_eoi_ready, desc, 0); spin_lock_irq(&desc->lock); } break; } /* * The guest cannot re-bind to this IRQ until this function returns. So, * when we have flushed this IRQ from ->masked, it should remain flushed. */ BUG_ON(pirq->masked); if ( action->nr_guests != 0 ) return NULL; BUG_ON(action->in_flight != 0); /* Disabling IRQ before releasing the desc_lock avoids an IRQ storm. */ desc->status |= IRQ_DISABLED; desc->handler->disable(desc); /* * Mark any remaining pending EOIs as ready to flush. * NOTE: We will need to make this a stronger barrier if in future we allow * an interrupt vectors to be re-bound to a different PIC. In that case we * would need to flush all ready EOIs before returning as otherwise the * desc->handler could change and we would call the wrong 'end' hook. */ cpumask_copy(&cpu_eoi_map, action->cpu_eoi_map); if ( !cpumask_empty(&cpu_eoi_map) ) { BUG_ON(action->ack_type != ACKTYPE_EOI); spin_unlock_irq(&desc->lock); on_selected_cpus(&cpu_eoi_map, set_eoi_ready, desc, 1); spin_lock_irq(&desc->lock); } BUG_ON(!cpumask_empty(action->cpu_eoi_map)); desc->action = NULL; desc->status &= ~(IRQ_GUEST|IRQ_INPROGRESS); desc->handler->shutdown(desc); /* Caller frees the old guest descriptor block. 
*/ return action; } void pirq_guest_unbind(struct domain *d, struct pirq *pirq) { irq_guest_action_t *oldaction = NULL; struct irq_desc *desc; int irq = 0; WARN_ON(!spin_is_locked(&d->event_lock)); BUG_ON(!local_irq_is_enabled()); desc = pirq_spin_lock_irq_desc(pirq, NULL); if ( desc == NULL ) { irq = -pirq->arch.irq; BUG_ON(irq <= 0); desc = irq_to_desc(irq); spin_lock_irq(&desc->lock); clear_domain_irq_pirq(d, irq, pirq); } else { oldaction = __pirq_guest_unbind(d, pirq, desc); } spin_unlock_irq(&desc->lock); if ( oldaction != NULL ) { kill_timer(&oldaction->eoi_timer); free_cpumask_var(oldaction->cpu_eoi_map); xfree(oldaction); } else if ( irq > 0 ) cleanup_domain_irq_pirq(d, irq, pirq); } static int pirq_guest_force_unbind(struct domain *d, struct pirq *pirq) { struct irq_desc *desc; irq_guest_action_t *action, *oldaction = NULL; int i, bound = 0; WARN_ON(!spin_is_locked(&d->event_lock)); BUG_ON(!local_irq_is_enabled()); desc = pirq_spin_lock_irq_desc(pirq, NULL); BUG_ON(desc == NULL); if ( !(desc->status & IRQ_GUEST) ) goto out; action = (irq_guest_action_t *)desc->action; if ( unlikely(action == NULL) ) { dprintk(XENLOG_G_WARNING, "dom%d: pirq %d: desc->action is NULL!\n", d->domain_id, pirq->pirq); goto out; } for ( i = 0; (i < action->nr_guests) && (action->guest[i] != d); i++ ) continue; if ( i == action->nr_guests ) goto out; bound = 1; oldaction = __pirq_guest_unbind(d, pirq, desc); out: spin_unlock_irq(&desc->lock); if ( oldaction != NULL ) { kill_timer(&oldaction->eoi_timer); free_cpumask_var(oldaction->cpu_eoi_map); xfree(oldaction); } return bound; } static inline bool_t is_free_pirq(const struct domain *d, const struct pirq *pirq) { return !pirq || (!pirq->arch.irq && (!is_hvm_domain(d) || pirq->arch.hvm.emuirq == IRQ_UNBOUND)); } int get_free_pirq(struct domain *d, int type) { int i; ASSERT(spin_is_locked(&d->event_lock)); if ( type == MAP_PIRQ_TYPE_GSI ) { for ( i = 16; i < nr_irqs_gsi; i++ ) if ( is_free_pirq(d, pirq_info(d, i)) ) { pirq_get_info(d, i); return i; } } for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; i-- ) if ( is_free_pirq(d, pirq_info(d, i)) ) { pirq_get_info(d, i); return i; } return -ENOSPC; } int get_free_pirqs(struct domain *d, unsigned int nr) { unsigned int i, found = 0; ASSERT(spin_is_locked(&d->event_lock)); for ( i = d->nr_pirqs - 1; i >= nr_irqs_gsi; --i ) if ( is_free_pirq(d, pirq_info(d, i)) ) { pirq_get_info(d, i); if ( ++found == nr ) return i; } else found = 0; return -ENOSPC; } int map_domain_pirq( struct domain *d, int pirq, int irq, int type, void *data) { int ret = 0; int old_irq, old_pirq; struct pirq *info; struct irq_desc *desc; unsigned long flags; ASSERT(spin_is_locked(&d->event_lock)); if ( !irq_access_permitted(current->domain, irq)) return -EPERM; if ( pirq < 0 || pirq >= d->nr_pirqs || irq < 0 || irq >= nr_irqs ) { dprintk(XENLOG_G_ERR, "dom%d: invalid pirq %d or irq %d\n", d->domain_id, pirq, irq); return -EINVAL; } old_irq = domain_pirq_to_irq(d, pirq); old_pirq = domain_irq_to_pirq(d, irq); if ( (old_irq > 0 && (old_irq != irq) ) || (old_pirq && (old_pirq != pirq)) ) { dprintk(XENLOG_G_WARNING, "dom%d: pirq %d or irq %d already mapped\n", d->domain_id, pirq, irq); return 0; } ret = xsm_map_domain_irq(XSM_HOOK, d, irq, data); if ( ret ) { dprintk(XENLOG_G_ERR, "dom%d: could not permit access to irq %d mapping to pirq %d\n", d->domain_id, irq, pirq); return ret; } ret = irq_permit_access(d, irq); if ( ret ) { printk(XENLOG_G_ERR "dom%d: could not permit access to IRQ%d (pirq %d)\n", d->domain_id, irq, pirq); return ret; } ret = 
prepare_domain_irq_pirq(d, irq, pirq, &info); if ( ret ) goto revoke; desc = irq_to_desc(irq); if ( type == MAP_PIRQ_TYPE_MSI || type == MAP_PIRQ_TYPE_MULTI_MSI ) { struct msi_info *msi = (struct msi_info *)data; struct msi_desc *msi_desc; struct pci_dev *pdev; unsigned int nr = 0; ASSERT(spin_is_locked(&pcidevs_lock)); ret = -ENODEV; if ( !cpu_has_apic ) goto done; pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn); ret = pci_enable_msi(msi, &msi_desc); if ( ret ) { if ( ret > 0 ) { msi->entry_nr = ret; ret = -ENFILE; } goto done; } spin_lock_irqsave(&desc->lock, flags); if ( desc->handler != &no_irq_type ) { spin_unlock_irqrestore(&desc->lock, flags); dprintk(XENLOG_G_ERR, "dom%d: irq %d in use\n", d->domain_id, irq); pci_disable_msi(msi_desc); ret = -EBUSY; goto done; } while ( !(ret = setup_msi_irq(desc, msi_desc + nr)) ) { if ( opt_irq_vector_map == OPT_IRQ_VECTOR_MAP_PERDEV && !desc->arch.used_vectors ) { desc->arch.used_vectors = &pdev->arch.used_vectors; if ( desc->arch.vector != IRQ_VECTOR_UNASSIGNED ) { int vector = desc->arch.vector; ASSERT(!test_bit(vector, desc->arch.used_vectors)); set_bit(vector, desc->arch.used_vectors); } } if ( type == MAP_PIRQ_TYPE_MSI || msi_desc->msi_attrib.type != PCI_CAP_ID_MSI || ++nr == msi->entry_nr ) break; set_domain_irq_pirq(d, irq, info); spin_unlock_irqrestore(&desc->lock, flags); info = NULL; irq = create_irq(NUMA_NO_NODE); ret = irq >= 0 ? prepare_domain_irq_pirq(d, irq, pirq + nr, &info) : irq; if ( ret ) break; msi_desc[nr].irq = irq; if ( irq_permit_access(d, irq) != 0 ) printk(XENLOG_G_WARNING "dom%d: could not permit access to IRQ%d (pirq %d)\n", d->domain_id, irq, pirq); desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); if ( desc->handler != &no_irq_type ) { dprintk(XENLOG_G_ERR, "dom%d: irq %d (pirq %u) in use (%s)\n", d->domain_id, irq, pirq + nr, desc->handler->typename); ret = -EBUSY; break; } } if ( ret ) { spin_unlock_irqrestore(&desc->lock, flags); while ( nr-- ) { if ( irq >= 0 ) { if ( irq_deny_access(d, irq) ) printk(XENLOG_G_ERR "dom%d: could not revoke access to IRQ%d (pirq %d)\n", d->domain_id, irq, pirq); destroy_irq(irq); } if ( info ) cleanup_domain_irq_pirq(d, irq, info); info = pirq_info(d, pirq + nr); irq = info->arch.irq; } pci_disable_msi(msi_desc); goto done; } set_domain_irq_pirq(d, irq, info); spin_unlock_irqrestore(&desc->lock, flags); } else { spin_lock_irqsave(&desc->lock, flags); set_domain_irq_pirq(d, irq, info); spin_unlock_irqrestore(&desc->lock, flags); } done: if ( ret ) { cleanup_domain_irq_pirq(d, irq, info); revoke: if ( irq_deny_access(d, irq) ) printk(XENLOG_G_ERR "dom%d: could not revoke access to IRQ%d (pirq %d)\n", d->domain_id, irq, pirq); } return ret; } /* The pirq should have been unbound before this call. 
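 *
 * If it was not, unmap_domain_pirq() below falls back to
 * pirq_guest_force_unbind() and warns.  Illustrative calling sequence only,
 * with the locking that the function asserts and that free_domain_pirqs()
 * further down uses:
 *
 *   spin_lock(&pcidevs_lock);
 *   spin_lock(&d->event_lock);
 *   ret = unmap_domain_pirq(d, pirq);
 *   spin_unlock(&d->event_lock);
 *   spin_unlock(&pcidevs_lock);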
*/ int unmap_domain_pirq(struct domain *d, int pirq) { unsigned long flags; struct irq_desc *desc; int irq, ret = 0, rc; unsigned int i, nr = 1; bool_t forced_unbind; struct pirq *info; struct msi_desc *msi_desc = NULL; if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) return -EINVAL; ASSERT(spin_is_locked(&pcidevs_lock)); ASSERT(spin_is_locked(&d->event_lock)); info = pirq_info(d, pirq); if ( !info || (irq = info->arch.irq) <= 0 ) { dprintk(XENLOG_G_ERR, "dom%d: pirq %d not mapped\n", d->domain_id, pirq); ret = -EINVAL; goto done; } desc = irq_to_desc(irq); msi_desc = desc->msi_desc; if ( msi_desc && msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) { if ( msi_desc->msi_attrib.entry_nr ) { printk(XENLOG_G_ERR "dom%d: trying to unmap secondary MSI pirq %d\n", d->domain_id, pirq); ret = -EBUSY; goto done; } nr = msi_desc->msi.nvec; } ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, msi_desc); if ( ret ) goto done; forced_unbind = pirq_guest_force_unbind(d, info); if ( forced_unbind ) dprintk(XENLOG_G_WARNING, "dom%d: forcing unbind of pirq %d\n", d->domain_id, pirq); if ( msi_desc != NULL ) pci_disable_msi(msi_desc); spin_lock_irqsave(&desc->lock, flags); for ( i = 0; ; ) { BUG_ON(irq != domain_pirq_to_irq(d, pirq + i)); if ( !forced_unbind ) clear_domain_irq_pirq(d, irq, info); else { info->arch.irq = -irq; radix_tree_replace_slot( radix_tree_lookup_slot(&d->arch.irq_pirq, irq), radix_tree_int_to_ptr(-pirq)); } if ( msi_desc ) { desc->handler = &no_irq_type; desc->msi_desc = NULL; } if ( ++i == nr ) break; spin_unlock_irqrestore(&desc->lock, flags); if ( !forced_unbind ) cleanup_domain_irq_pirq(d, irq, info); rc = irq_deny_access(d, irq); if ( rc ) { printk(XENLOG_G_ERR "dom%d: could not deny access to IRQ%d (pirq %d)\n", d->domain_id, irq, pirq + i); ret = rc; } do { info = pirq_info(d, pirq + i); if ( info && (irq = info->arch.irq) > 0 ) break; printk(XENLOG_G_ERR "dom%d: MSI pirq %d not mapped\n", d->domain_id, pirq + i); } while ( ++i < nr ); if ( i == nr ) { desc = NULL; break; } desc = irq_to_desc(irq); BUG_ON(desc->msi_desc != msi_desc + i); spin_lock_irqsave(&desc->lock, flags); } if ( desc ) { spin_unlock_irqrestore(&desc->lock, flags); if ( !forced_unbind ) cleanup_domain_irq_pirq(d, irq, info); rc = irq_deny_access(d, irq); if ( rc ) { printk(XENLOG_G_ERR "dom%d: could not deny access to IRQ%d (pirq %d)\n", d->domain_id, irq, pirq + nr - 1); ret = rc; } } if (msi_desc) msi_free_irq(msi_desc); done: return ret; } void free_domain_pirqs(struct domain *d) { int i; spin_lock(&pcidevs_lock); spin_lock(&d->event_lock); for ( i = 0; i < d->nr_pirqs; i++ ) if ( domain_pirq_to_irq(d, i) > 0 ) unmap_domain_pirq(d, i); spin_unlock(&d->event_lock); spin_unlock(&pcidevs_lock); } static void dump_irqs(unsigned char key) { int i, irq, pirq; struct irq_desc *desc; irq_guest_action_t *action; struct evtchn *evtchn; struct domain *d; const struct pirq *info; unsigned long flags; char *ssid; printk("IRQ information:\n"); for ( irq = 0; irq < nr_irqs; irq++ ) { desc = irq_to_desc(irq); if ( !irq_desc_initialized(desc) || desc->handler == &no_irq_type ) continue; ssid = in_irq() ? 
NULL : xsm_show_irq_sid(irq); spin_lock_irqsave(&desc->lock, flags); cpumask_scnprintf(keyhandler_scratch, sizeof(keyhandler_scratch), desc->affinity); printk(" IRQ:%4d affinity:%s vec:%02x type=%-15s" " status=%08x ", irq, keyhandler_scratch, desc->arch.vector, desc->handler->typename, desc->status); if ( ssid ) printk("Z=%-25s ", ssid); if ( desc->status & IRQ_GUEST ) { action = (irq_guest_action_t *)desc->action; printk("in-flight=%d domain-list=", action->in_flight); for ( i = 0; i < action->nr_guests; i++ ) { d = action->guest[i]; pirq = domain_irq_to_pirq(d, irq); info = pirq_info(d, pirq); evtchn = evtchn_from_port(d, info->evtchn); printk("%u:%3d(%c%c%c)", d->domain_id, pirq, (evtchn_port_is_pending(d, evtchn) ? 'P' : '-'), (evtchn_port_is_masked(d, evtchn) ? 'M' : '-'), (info->masked ? 'M' : '-')); if ( i != action->nr_guests ) printk(","); } printk("\n"); } else if ( desc->action ) printk("%ps()\n", desc->action->handler); else printk("mapped, unbound\n"); spin_unlock_irqrestore(&desc->lock, flags); xfree(ssid); } printk("Direct vector information:\n"); for ( i = FIRST_DYNAMIC_VECTOR; i < NR_VECTORS; ++i ) if ( direct_apic_vector[i] ) printk(" %#02x -> %ps()\n", i, direct_apic_vector[i]); dump_ioapic_irq_info(); } static struct keyhandler dump_irqs_keyhandler = { .diagnostic = 1, .u.fn = dump_irqs, .desc = "dump interrupt bindings" }; static int __init setup_dump_irqs(void) { register_keyhandler('i', &dump_irqs_keyhandler); return 0; } __initcall(setup_dump_irqs); /* A cpu has been removed from cpu_online_mask. Re-set irq affinities. */ void fixup_irqs(void) { unsigned int irq, sp; static int warned; struct irq_desc *desc; irq_guest_action_t *action; struct pending_eoi *peoi; for ( irq = 0; irq < nr_irqs; irq++ ) { int break_affinity = 0; int set_affinity = 1; cpumask_t affinity; if ( irq == 2 ) continue; desc = irq_to_desc(irq); if ( !irq_desc_initialized(desc) ) continue; spin_lock(&desc->lock); cpumask_copy(&affinity, desc->affinity); if ( !desc->action || cpumask_subset(&affinity, &cpu_online_map) ) { spin_unlock(&desc->lock); continue; } cpumask_and(&affinity, &affinity, &cpu_online_map); if ( cpumask_empty(&affinity) ) { break_affinity = 1; cpumask_copy(&affinity, &cpu_online_map); } if ( desc->handler->disable ) desc->handler->disable(desc); if ( desc->handler->set_affinity ) desc->handler->set_affinity(desc, &affinity); else if ( !(warned++) ) set_affinity = 0; if ( desc->handler->enable ) desc->handler->enable(desc); spin_unlock(&desc->lock); if ( break_affinity && set_affinity ) printk("Broke affinity for irq %i\n", irq); else if ( !set_affinity ) printk("Cannot set affinity for irq %i\n", irq); } /* That doesn't seem sufficient. Give it 1ms. */ local_irq_enable(); mdelay(1); local_irq_disable(); /* Clean up cpu_eoi_map of every interrupt to exclude this CPU. */ for ( irq = 0; irq < nr_irqs; irq++ ) { desc = irq_to_desc(irq); if ( !(desc->status & IRQ_GUEST) ) continue; action = (irq_guest_action_t *)desc->action; cpumask_clear_cpu(smp_processor_id(), action->cpu_eoi_map); } /* Flush the interrupt EOI stack. 
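 *
 * Every slot on this CPU's pending_eoi[] stack is forced to ready = 1 and
 * flush_ready_eoi() then issues the outstanding ->end() hooks, so the CPU
 * being offlined leaves no deferred EOIs behind (see the loop that follows).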
*/ peoi = this_cpu(pending_eoi); for ( sp = 0; sp < pending_eoi_sp(peoi); sp++ ) peoi[sp].ready = 1; flush_ready_eoi(); } int map_domain_emuirq_pirq(struct domain *d, int pirq, int emuirq) { int old_emuirq = IRQ_UNBOUND, old_pirq = IRQ_UNBOUND; struct pirq *info; ASSERT(spin_is_locked(&d->event_lock)); if ( !is_hvm_domain(d) ) return -EINVAL; if ( pirq < 0 || pirq >= d->nr_pirqs || emuirq == IRQ_UNBOUND || emuirq >= (int) nr_irqs ) { dprintk(XENLOG_G_ERR, "dom%d: invalid pirq %d or emuirq %d\n", d->domain_id, pirq, emuirq); return -EINVAL; } old_emuirq = domain_pirq_to_emuirq(d, pirq); if ( emuirq != IRQ_PT ) old_pirq = domain_emuirq_to_pirq(d, emuirq); if ( (old_emuirq != IRQ_UNBOUND && (old_emuirq != emuirq) ) || (old_pirq != IRQ_UNBOUND && (old_pirq != pirq)) ) { dprintk(XENLOG_G_WARNING, "dom%d: pirq %d or emuirq %d already mapped\n", d->domain_id, pirq, emuirq); return 0; } info = pirq_get_info(d, pirq); if ( !info ) return -ENOMEM; /* do not store emuirq mappings for pt devices */ if ( emuirq != IRQ_PT ) { int err = radix_tree_insert(&d->arch.hvm_domain.emuirq_pirq, emuirq, radix_tree_int_to_ptr(pirq)); switch ( err ) { case 0: break; case -EEXIST: radix_tree_replace_slot( radix_tree_lookup_slot( &d->arch.hvm_domain.emuirq_pirq, emuirq), radix_tree_int_to_ptr(pirq)); break; default: pirq_cleanup_check(info, d); return err; } } info->arch.hvm.emuirq = emuirq; return 0; } int unmap_domain_pirq_emuirq(struct domain *d, int pirq) { int emuirq, ret = 0; struct pirq *info; if ( !is_hvm_domain(d) ) return -EINVAL; if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) return -EINVAL; ASSERT(spin_is_locked(&d->event_lock)); emuirq = domain_pirq_to_emuirq(d, pirq); if ( emuirq == IRQ_UNBOUND ) { dprintk(XENLOG_G_ERR, "dom%d: pirq %d not mapped\n", d->domain_id, pirq); ret = -EINVAL; goto done; } info = pirq_info(d, pirq); if ( info ) { info->arch.hvm.emuirq = IRQ_UNBOUND; pirq_cleanup_check(info, d); } if ( emuirq != IRQ_PT ) radix_tree_delete(&d->arch.hvm_domain.emuirq_pirq, emuirq); done: return ret; } bool_t hvm_domain_use_pirq(const struct domain *d, const struct pirq *pirq) { return is_hvm_domain(d) && pirq && pirq->arch.hvm.emuirq != IRQ_UNBOUND; } xen-4.4.0/xen/arch/x86/string.c0000664000175000017500000000253512307313555014256 0ustar smbsmb/****************************************************************************** * string.c * * These provide something for compiler-emitted string operations to link * against. 
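 *
 * Even though Xen never calls these by name, the compiler is free to lower
 * constructs such as (illustrative only; the exact lowering depends on the
 * compiler version and flags):
 *
 *   struct foo a = b;            a structure assignment
 *   char buf[256] = { 0 };       a large zero-initialisation
 *
 * into out-of-line calls to memcpy() and memset(), so freestanding
 * definitions must be available at link time.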
*/ #include #include #undef memcpy void *memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile ( " rep ; movs"__OS" ; " " mov %4,%3 ; " " rep ; movsb " : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n/BYTES_PER_LONG), "r" (n%BYTES_PER_LONG), "1" (dest), "2" (src) : "memory" ); return dest; } #undef memset void *memset(void *s, int c, size_t n) { long d0, d1; asm volatile ( "rep stosb" : "=&c" (d0), "=&D" (d1) : "a" (c), "1" (s), "0" (n) : "memory"); return s; } #undef memmove void *memmove(void *dest, const void *src, size_t n) { long d0, d1, d2; if ( dest < src ) return memcpy(dest, src, n); asm volatile ( " std ; " " rep movsb ; " " cld " : "=&c" (d0), "=&S" (d1), "=&D" (d2) : "0" (n), "1" (n-1+(const char *)src), "2" (n-1+(char *)dest) : "memory"); return dest; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/microcode.c0000664000175000017500000002651412307313555014717 0ustar smbsmb/* * Intel CPU Microcode Update Driver for Linux * * Copyright (C) 2000-2006 Tigran Aivazian * 2006 Shaohua Li * * This driver allows to upgrade microcode on Intel processors * belonging to IA-32 family - PentiumPro, Pentium II, * Pentium III, Xeon, Pentium 4, etc. * * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture * Software Developer's Manual * Order Number 253668 or free download from: * * http://developer.intel.com/design/pentium4/manuals/253668.htm * * For more information, go to http://www.urbanmyth.org/microcode * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static module_t __initdata ucode_mod; static void *(*__initdata ucode_mod_map)(const module_t *); static signed int __initdata ucode_mod_idx; static bool_t __initdata ucode_mod_forced; static cpumask_t __initdata init_mask; /* * If we scan the initramfs.cpio for the early microcode code * and find it, then 'ucode_blob' will contain the pointer * and the size of said blob. It is allocated from Xen's heap * memory. */ struct ucode_mod_blob { void *data; size_t size; }; static struct ucode_mod_blob __initdata ucode_blob; /* * By default we will NOT parse the multiboot modules to see if there is * cpio image with the microcode images. */ static bool_t __initdata ucode_scan; void __init microcode_set_module(unsigned int idx) { ucode_mod_idx = idx; ucode_mod_forced = 1; } /* * The format is '[|scan]'. Both options are optional. * If the EFI has forced which of the multiboot payloads is to be used, * no parsing will be attempted. */ static void __init parse_ucode(char *s) { if ( ucode_mod_forced ) /* Forced by EFI */ return; if ( !strncmp(s, "scan", 4) ) ucode_scan = 1; else ucode_mod_idx = simple_strtol(s, NULL, 0); } custom_param("ucode", parse_ucode); /* * 8MB ought to be enough. 
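 *
 * This is only a sanity bound: microcode_scan_module() below refuses to
 * copy a cpio payload larger than this into the xmalloc'd ucode_blob, so a
 * bogus or oversized module cannot exhaust heap memory needed for guests.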
*/ #define MAX_EARLY_CPIO_MICROCODE (8 << 20) void __init microcode_scan_module( unsigned long *module_map, const multiboot_info_t *mbi, void *(*bootmap)(const module_t *)) { module_t *mod = (module_t *)__va(mbi->mods_addr); uint64_t *_blob_start; unsigned long _blob_size; struct cpio_data cd; long offset; const char *p = NULL; int i; ucode_blob.size = 0; if ( !ucode_scan ) return; if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) p = "kernel/x86/microcode/AuthenticAMD.bin"; else if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) p = "kernel/x86/microcode/GenuineIntel.bin"; else return; /* * Try all modules and see whichever could be the microcode blob. */ for ( i = 1 /* Ignore dom0 kernel */; i < mbi->mods_count; i++ ) { if ( !test_bit(i, module_map) ) continue; _blob_start = bootmap(&mod[i]); _blob_size = mod[i].mod_end; if ( !_blob_start ) { printk("Could not map multiboot module #%d (size: %ld)\n", i, _blob_size); continue; } cd.data = NULL; cd.size = 0; cd = find_cpio_data(p, _blob_start, _blob_size, &offset /* ignore */); if ( cd.data ) { /* * This is an arbitrary check - it would be sad if the blob * consumed most of the memory and did not allow guests * to launch. */ if ( cd.size > MAX_EARLY_CPIO_MICROCODE ) { printk("Multiboot %d microcode payload too big! (%ld, we can do %d)\n", i, cd.size, MAX_EARLY_CPIO_MICROCODE); goto err; } ucode_blob.size = cd.size; ucode_blob.data = xmalloc_bytes(cd.size); if ( !ucode_blob.data ) cd.data = NULL; else memcpy(ucode_blob.data, cd.data, cd.size); } bootmap(NULL); if ( cd.data ) break; } return; err: bootmap(NULL); } void __init microcode_grab_module( unsigned long *module_map, const multiboot_info_t *mbi, void *(*map)(const module_t *)) { module_t *mod = (module_t *)__va(mbi->mods_addr); if ( ucode_mod_idx < 0 ) ucode_mod_idx += mbi->mods_count; if ( ucode_mod_idx <= 0 || ucode_mod_idx >= mbi->mods_count || !__test_and_clear_bit(ucode_mod_idx, module_map) ) goto scan; ucode_mod = mod[ucode_mod_idx]; ucode_mod_map = map; scan: if ( ucode_scan ) microcode_scan_module(module_map, mbi, map); } const struct microcode_ops *microcode_ops; static DEFINE_SPINLOCK(microcode_mutex); DEFINE_PER_CPU(struct ucode_cpu_info, ucode_cpu_info); struct microcode_info { unsigned int cpu; uint32_t buffer_size; int error; char buffer[1]; }; static void __microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); xfree(uci->mc.mc_valid); memset(uci, 0, sizeof(*uci)); } static void microcode_fini_cpu(int cpu) { spin_lock(µcode_mutex); __microcode_fini_cpu(cpu); spin_unlock(µcode_mutex); } int microcode_resume_cpu(int cpu) { int err; struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); struct cpu_signature nsig; unsigned int cpu2; if ( !microcode_ops ) return 0; spin_lock(µcode_mutex); err = microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig); if ( err ) { __microcode_fini_cpu(cpu); spin_unlock(µcode_mutex); return err; } if ( uci->mc.mc_valid ) { err = microcode_ops->microcode_resume_match(cpu, uci->mc.mc_valid); if ( err >= 0 ) { if ( err ) err = microcode_ops->apply_microcode(cpu); spin_unlock(µcode_mutex); return err; } } nsig = uci->cpu_sig; __microcode_fini_cpu(cpu); uci->cpu_sig = nsig; err = -EIO; for_each_online_cpu ( cpu2 ) { uci = &per_cpu(ucode_cpu_info, cpu2); if ( uci->mc.mc_valid && microcode_ops->microcode_resume_match(cpu, uci->mc.mc_valid) > 0 ) { err = microcode_ops->apply_microcode(cpu); break; } } __microcode_fini_cpu(cpu); spin_unlock(µcode_mutex); return err; } static int microcode_update_cpu(const void 
*buf, size_t size) { int err; unsigned int cpu = smp_processor_id(); struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); spin_lock(µcode_mutex); err = microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig); if ( likely(!err) ) err = microcode_ops->cpu_request_microcode(cpu, buf, size); else __microcode_fini_cpu(cpu); spin_unlock(µcode_mutex); return err; } static long do_microcode_update(void *_info) { struct microcode_info *info = _info; int error; BUG_ON(info->cpu != smp_processor_id()); error = microcode_update_cpu(info->buffer, info->buffer_size); if ( error ) info->error = error; info->cpu = cpumask_next(info->cpu, &cpu_online_map); if ( info->cpu < nr_cpu_ids ) return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info); error = info->error; xfree(info); return error; } int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long len) { int ret; struct microcode_info *info; if ( len != (uint32_t)len ) return -E2BIG; if ( microcode_ops == NULL ) return -EINVAL; info = xmalloc_bytes(sizeof(*info) + len); if ( info == NULL ) return -ENOMEM; ret = copy_from_guest(info->buffer, buf, len); if ( ret != 0 ) { xfree(info); return ret; } info->buffer_size = len; info->error = 0; info->cpu = cpumask_first(&cpu_online_map); if ( microcode_ops->start_update ) { ret = microcode_ops->start_update(); if ( ret != 0 ) { xfree(info); return ret; } } return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info); } static void __init _do_microcode_update(unsigned long data) { void *_data = (void *)data; size_t len = ucode_blob.size ? ucode_blob.size : ucode_mod.mod_end; microcode_update_cpu(_data, len); cpumask_set_cpu(smp_processor_id(), &init_mask); } static int __init microcode_init(void) { void *data; static struct tasklet __initdata tasklet; unsigned int cpu; if ( !microcode_ops ) return 0; if ( !ucode_mod.mod_end && !ucode_blob.size ) return 0; data = ucode_blob.size ? 
ucode_blob.data : ucode_mod_map(&ucode_mod); if ( !data ) return -ENOMEM; if ( microcode_ops->start_update && microcode_ops->start_update() != 0 ) goto out; softirq_tasklet_init(&tasklet, _do_microcode_update, (unsigned long)data); for_each_online_cpu ( cpu ) { tasklet_schedule_on_cpu(&tasklet, cpu); do { process_pending_softirqs(); } while ( !cpumask_test_cpu(cpu, &init_mask) ); } out: if ( ucode_blob.size ) xfree(data); else ucode_mod_map(NULL); return 0; } __initcall(microcode_init); static int microcode_percpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch ( action ) { case CPU_DEAD: microcode_fini_cpu(cpu); break; } return NOTIFY_DONE; } static struct notifier_block microcode_percpu_nfb = { .notifier_call = microcode_percpu_callback, }; static int __init microcode_presmp_init(void) { if ( microcode_ops ) { if ( ucode_mod.mod_end || ucode_blob.size ) { void *data; size_t len; int rc = 0; if ( ucode_blob.size ) { len = ucode_blob.size; data = ucode_blob.data; } else { len = ucode_mod.mod_end; data = ucode_mod_map(&ucode_mod); } if ( data ) rc = microcode_update_cpu(data, len); else rc = -ENOMEM; if ( !ucode_blob.size ) ucode_mod_map(NULL); if ( rc ) { if ( ucode_blob.size ) { xfree(ucode_blob.data); ucode_blob.size = 0; ucode_blob.data = NULL; } else ucode_mod.mod_end = 0; } } register_cpu_notifier(µcode_percpu_nfb); } return 0; } presmp_initcall(microcode_presmp_init); xen-4.4.0/xen/arch/x86/flushtlb.c0000664000175000017500000001140712307313555014571 0ustar smbsmb/****************************************************************************** * flushtlb.c * * TLB flushes are timestamped using a global virtual 'clock' which ticks * on any TLB flush on any processor. * * Copyright (c) 2003-2006, K A Fraser */ #include #include #include #include #include /* Debug builds: Wrap frequently to stress-test the wrap logic. */ #ifdef NDEBUG #define WRAP_MASK (0xFFFFFFFFU) #else #define WRAP_MASK (0x000003FFU) #endif u32 tlbflush_clock = 1U; DEFINE_PER_CPU(u32, tlbflush_time); /* * pre_flush(): Increment the virtual TLB-flush clock. Returns new clock value. * * This must happen *before* we flush the TLB. If we do it after, we race other * CPUs invalidating PTEs. For example, a page invalidated after the flush * might get the old timestamp, but this CPU can speculatively fetch the * mapping into its TLB after the flush but before inc'ing the clock. */ static u32 pre_flush(void) { u32 t, t1, t2; t = tlbflush_clock; do { t1 = t2 = t; /* Clock wrapped: someone else is leading a global TLB shootdown. */ if ( unlikely(t1 == 0) ) goto skip_clocktick; t2 = (t + 1) & WRAP_MASK; } while ( unlikely((t = cmpxchg(&tlbflush_clock, t1, t2)) != t1) ); /* Clock wrapped: we will lead a global TLB shootdown. */ if ( unlikely(t2 == 0) ) raise_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ); skip_clocktick: return t2; } /* * post_flush(): Update this CPU's timestamp with specified clock value. * * Note that this happens *after* flushing the TLB, as otherwise we can race a * NEED_FLUSH() test on another CPU. (e.g., other CPU sees the updated CPU * stamp and so does not force a synchronous TLB flush, but the flush in this * function hasn't yet occurred and so the TLB might be stale). The ordering * would only actually matter if this function were interruptible, and * something that abuses the stale mapping could exist in an interrupt * handler. In fact neither of these is the case, so really we are being ultra * paranoid. 
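 *
 * The intended usage pattern, as followed by write_cr3() and
 * flush_area_local() below, is (sketch):
 *
 *   t = pre_flush();       tick the global clock
 *   <perform the actual TLB flush>
 *   post_flush(t);         publish this CPU's new timestamp
 *
 * A stale per-CPU timestamp is therefore always pessimistic: another CPU
 * may request an unneeded flush, but can never skip a required one.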
*/ static void post_flush(u32 t) { this_cpu(tlbflush_time) = t; } void write_cr3(unsigned long cr3) { unsigned long flags; u32 t; /* This non-reentrant function is sometimes called in interrupt context. */ local_irq_save(flags); t = pre_flush(); hvm_flush_guest_tlbs(); #ifdef USER_MAPPINGS_ARE_GLOBAL { unsigned long cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" ); write_cr4(cr4); } #else asm volatile ( "mov %0, %%cr3" : : "r" (cr3) : "memory" ); #endif post_flush(t); local_irq_restore(flags); } void flush_area_local(const void *va, unsigned int flags) { const struct cpuinfo_x86 *c = ¤t_cpu_data; unsigned int order = (flags - 1) & FLUSH_ORDER_MASK; unsigned long irqfl; /* This non-reentrant function is sometimes called in interrupt context. */ local_irq_save(irqfl); if ( flags & (FLUSH_TLB|FLUSH_TLB_GLOBAL) ) { if ( order == 0 ) { /* * We don't INVLPG multi-page regions because the 2M/4M/1G * region may not have been mapped with a superpage. Also there * are various errata surrounding INVLPG usage on superpages, and * a full flush is in any case not *that* expensive. */ asm volatile ( "invlpg %0" : : "m" (*(const char *)(va)) : "memory" ); } else { u32 t = pre_flush(); hvm_flush_guest_tlbs(); #ifndef USER_MAPPINGS_ARE_GLOBAL if ( !(flags & FLUSH_TLB_GLOBAL) || !(read_cr4() & X86_CR4_PGE) ) { asm volatile ( "mov %0, %%cr3" : : "r" (read_cr3()) : "memory" ); } else #endif { unsigned long cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); barrier(); write_cr4(cr4); } post_flush(t); } } if ( flags & FLUSH_CACHE ) { unsigned long i, sz = 0; if ( order < (BITS_PER_LONG - PAGE_SHIFT) ) sz = 1UL << (order + PAGE_SHIFT); if ( c->x86_clflush_size && c->x86_cache_size && sz && ((sz >> 10) < c->x86_cache_size) ) { va = (const void *)((unsigned long)va & ~(sz - 1)); for ( i = 0; i < sz; i += c->x86_clflush_size ) asm volatile ( "clflush %0" : : "m" (((const char *)va)[i]) ); } else { wbinvd(); } } local_irq_restore(irqfl); } xen-4.4.0/xen/arch/x86/xen.lds.S0000664000175000017500000001033212307313555014275 0ustar smbsmb/* Excerpts written by Martin Mares */ /* Modified for i386/x86-64 Xen by Keir Fraser */ #include #include #include #include #undef ENTRY #undef ALIGN #ifdef EFI #define FORMAT "pei-x86-64" #undef __XEN_VIRT_START #define __XEN_VIRT_START __image_base__ ENTRY(efi_start) #else /* !EFI */ #define FORMAT "elf64-x86-64" ENTRY(start) #endif /* EFI */ OUTPUT_FORMAT(FORMAT, FORMAT, FORMAT) OUTPUT_ARCH(i386:x86-64) PHDRS { text PT_LOAD ; } SECTIONS { #if !defined(EFI) . = __XEN_VIRT_START; __image_base__ = .; #endif . = __XEN_VIRT_START + 0x100000; _start = .; .text : { _stext = .; /* Text and read-only data */ *(.text) *(.text.cold) *(.text.unlikely) *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ } :text = 0x9090 .rodata : { /* Bug frames table */ . = ALIGN(4); __start_bug_frames = .; *(.bug_frames.0) __stop_bug_frames_0 = .; *(.bug_frames.1) __stop_bug_frames_1 = .; *(.bug_frames.2) __stop_bug_frames_2 = .; *(.bug_frames.3) __stop_bug_frames_3 = .; *(.rodata) *(.rodata.*) } :text . = ALIGN(SMP_CACHE_BYTES); .data.read_mostly : { /* Exception table */ __start___ex_table = .; *(.ex_table) __stop___ex_table = .; /* Pre-exception table */ __start___pre_ex_table = .; *(.ex_table.pre) __stop___pre_ex_table = .; *(.data.read_mostly) *(.data.rel.ro) *(.data.rel.ro.*) } :text .data : { /* Data */ . = ALIGN(PAGE_SIZE); *(.data.page_aligned) *(.data) *(.data.rel) *(.data.rel.*) CONSTRUCTORS } :text #ifdef LOCK_PROFILE . 
= ALIGN(32); __lock_profile_start = .; .lockprofile.data : { *(.lockprofile.data) } :text __lock_profile_end = .; #endif . = ALIGN(PAGE_SIZE); /* Init code and data */ __init_begin = .; .init.text : { _sinittext = .; *(.init.text) _einittext = .; } :text .init.data : { *(.init.rodata) *(.init.rodata.str*) *(.init.data) *(.init.data.rel) *(.init.data.rel.*) . = ALIGN(4); __trampoline_rel_start = .; *(.trampoline_rel) __trampoline_rel_stop = .; __trampoline_seg_start = .; *(.trampoline_seg) __trampoline_seg_stop = .; . = ALIGN(8); __ctors_start = .; *(.ctors) __ctors_end = .; } :text . = ALIGN(32); .init.setup : { __setup_start = .; *(.init.setup) __setup_end = .; } :text .initcall.init : { __initcall_start = .; *(.initcallpresmp.init) __presmp_initcall_end = .; *(.initcall1.init) __initcall_end = .; } :text .xsm_initcall.init : { __xsm_initcall_start = .; *(.xsm_initcall.init) __xsm_initcall_end = .; } :text . = ALIGN(STACK_SIZE); __init_end = .; .bss : { /* BSS */ __bss_start = .; *(.bss.stack_aligned) . = ALIGN(PAGE_SIZE); *(.bss.page_aligned) *(.bss) . = ALIGN(SMP_CACHE_BYTES); __per_cpu_start = .; *(.bss.percpu) . = ALIGN(SMP_CACHE_BYTES); *(.bss.percpu.read_mostly) . = ALIGN(SMP_CACHE_BYTES); __per_cpu_data_end = .; } :text _end = . ; #ifdef EFI . = ALIGN(4); .reloc : { *(.reloc) } :text /* Trick the linker into setting the image size to exactly 16Mb. */ . = ALIGN(__section_alignment__); .pad : { . = ALIGN(0x1000000); } :text #else efi = .; #endif /* Sections to be discarded */ /DISCARD/ : { *(.exit.text) *(.exit.data) *(.exitcall.exit) *(.eh_frame) #ifdef EFI *(.comment) *(.comment.*) #endif } /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } .stab.excl 0 : { *(.stab.excl) } .stab.exclstr 0 : { *(.stab.exclstr) } .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } } ASSERT(kexec_reloc_size - kexec_reloc <= PAGE_SIZE, "kexec_reloc is too large") xen-4.4.0/xen/arch/x86/domain.c0000664000175000017500000016540312307313555014223 0ustar smbsmb/****************************************************************************** * arch/x86/domain.c * * x86-specific domain handling (e.g., register setup and context switching). */ /* * Copyright (C) 1995 Linus Torvalds * * Pentium III FXSR, SSE support * Gareth Hughes , May 2000 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include DEFINE_PER_CPU(struct vcpu *, curr_vcpu); DEFINE_PER_CPU(unsigned long, cr4); static void default_idle(void); void (*pm_idle) (void) __read_mostly = default_idle; void (*dead_idle) (void) __read_mostly = default_dead_idle; static void paravirt_ctxt_switch_from(struct vcpu *v); static void paravirt_ctxt_switch_to(struct vcpu *v); static void default_idle(void) { local_irq_disable(); if ( cpu_is_haltable(smp_processor_id()) ) safe_halt(); else local_irq_enable(); } void default_dead_idle(void) { /* * When going into S3, without flushing caches modified data may be * held by the CPUs spinning here indefinitely, and get discarded by * a subsequent INIT. 
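 *
 * Hence the ordering below: wbinvd() writes back and invalidates all dirty
 * cache lines first, and only then does the CPU park itself in the halt()
 * loop.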
*/ wbinvd(); for ( ; ; ) halt(); } static void play_dead(void) { local_irq_disable(); /* * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible, * as they may be freed at any time. In this case, heap corruption or * #PF can occur (when heap debugging is enabled). For example, even * printk() can involve tasklet scheduling, which touches per-cpu vars. * * Consider very carefully when adding code to *dead_idle. Most hypervisor * subsystems are unsafe to call. */ cpu_exit_clear(smp_processor_id()); (*dead_idle)(); } static void idle_loop(void) { for ( ; ; ) { if ( cpu_is_offline(smp_processor_id()) ) play_dead(); (*pm_idle)(); do_tasklet(); do_softirq(); } } void startup_cpu_idle_loop(void) { struct vcpu *v = current; ASSERT(is_idle_vcpu(v)); cpumask_set_cpu(v->processor, v->domain->domain_dirty_cpumask); cpumask_set_cpu(v->processor, v->vcpu_dirty_cpumask); reset_stack_and_jump(idle_loop); } static void continue_idle_domain(struct vcpu *v) { reset_stack_and_jump(idle_loop); } static void continue_nonidle_domain(struct vcpu *v) { check_wakeup_from_wait(); mark_regs_dirty(guest_cpu_user_regs()); reset_stack_and_jump(ret_from_intr); } void dump_pageframe_info(struct domain *d) { struct page_info *page; printk("Memory pages belonging to domain %u:\n", d->domain_id); if ( d->tot_pages >= 10 ) { printk(" DomPage list too long to display\n"); } else { spin_lock(&d->page_alloc_lock); page_list_for_each ( page, &d->page_list ) { printk(" DomPage %p: caf=%08lx, taf=%" PRtype_info "\n", _p(page_to_mfn(page)), page->count_info, page->u.inuse.type_info); } spin_unlock(&d->page_alloc_lock); } if ( has_hvm_container_domain(d) ) p2m_pod_dump_data(d); spin_lock(&d->page_alloc_lock); page_list_for_each ( page, &d->xenpage_list ) { printk(" XenPage %p: caf=%08lx, taf=%" PRtype_info "\n", _p(page_to_mfn(page)), page->count_info, page->u.inuse.type_info); } spin_unlock(&d->page_alloc_lock); } struct domain *alloc_domain_struct(void) { struct domain *d; /* * We pack the PDX of the domain structure into a 32-bit field within * the page_info structure. Hence the MEMF_bits() restriction. */ unsigned int bits = 32 + PAGE_SHIFT + pfn_pdx_hole_shift; BUILD_BUG_ON(sizeof(*d) > PAGE_SIZE); d = alloc_xenheap_pages(0, MEMF_bits(bits)); if ( d != NULL ) clear_page(d); return d; } void free_domain_struct(struct domain *d) { lock_profile_deregister_struct(LOCKPROF_TYPE_PERDOM, d); free_xenheap_page(d); } struct vcpu *alloc_vcpu_struct(void) { struct vcpu *v; /* * This structure contains embedded PAE PDPTEs, used when an HVM guest * runs on shadow pagetables outside of 64-bit mode. In this case the CPU * may require that the shadow CR3 points below 4GB, and hence the whole * structure must satisfy this restriction. Thus we specify MEMF_bits(32). 
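 *
 * Put differently, the allocation below is expected to satisfy (sketch)
 *
 *   virt_to_maddr(v) + sizeof(*v) <= (1UL << 32)
 *
 * which is what MEMF_bits(32) asks of the allocator, given that the
 * BUILD_BUG_ON() confirms the structure fits in a single page.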
*/ BUILD_BUG_ON(sizeof(*v) > PAGE_SIZE); v = alloc_xenheap_pages(0, MEMF_bits(32)); if ( v != NULL ) clear_page(v); return v; } void free_vcpu_struct(struct vcpu *v) { free_xenheap_page(v); } static DEFINE_PER_CPU(struct page_info *[ PFN_UP(sizeof(struct vcpu_guest_context))], vgc_pages); struct vcpu_guest_context *alloc_vcpu_guest_context(void) { unsigned int i, cpu = smp_processor_id(); enum fixed_addresses idx = FIX_VGC_BEGIN - cpu * PFN_UP(sizeof(struct vcpu_guest_context)); BUG_ON(per_cpu(vgc_pages[0], cpu) != NULL); for ( i = 0; i < PFN_UP(sizeof(struct vcpu_guest_context)); ++i ) { struct page_info *pg = alloc_domheap_page(NULL, 0); if ( unlikely(pg == NULL) ) { free_vcpu_guest_context(NULL); return NULL; } __set_fixmap(idx - i, page_to_mfn(pg), __PAGE_HYPERVISOR); per_cpu(vgc_pages[i], cpu) = pg; } return (void *)fix_to_virt(idx); } void free_vcpu_guest_context(struct vcpu_guest_context *vgc) { unsigned int i, cpu = smp_processor_id(); enum fixed_addresses idx = FIX_VGC_BEGIN - cpu * PFN_UP(sizeof(struct vcpu_guest_context)); BUG_ON(vgc && vgc != (void *)fix_to_virt(idx)); for ( i = 0; i < PFN_UP(sizeof(struct vcpu_guest_context)); ++i ) { if ( !per_cpu(vgc_pages[i], cpu) ) continue; __set_fixmap(idx - i, 0, 0); free_domheap_page(per_cpu(vgc_pages[i], cpu)); per_cpu(vgc_pages[i], cpu) = NULL; } } static int setup_compat_l4(struct vcpu *v) { struct page_info *pg; l4_pgentry_t *l4tab; int rc; pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v))); if ( pg == NULL ) return -ENOMEM; rc = setup_compat_arg_xlat(v); if ( rc ) { free_domheap_page(pg); return rc; } /* This page needs to look like a pagetable so that it can be shadowed */ pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1; l4tab = __map_domain_page(pg); clear_page(l4tab); init_guest_l4_table(l4tab, v->domain); unmap_domain_page(l4tab); v->arch.guest_table = pagetable_from_page(pg); v->arch.guest_table_user = v->arch.guest_table; return 0; } static void release_compat_l4(struct vcpu *v) { free_compat_arg_xlat(v); free_domheap_page(pagetable_get_page(v->arch.guest_table)); v->arch.guest_table = pagetable_null(); v->arch.guest_table_user = pagetable_null(); } static inline int may_switch_mode(struct domain *d) { return (!is_hvm_domain(d) && (d->tot_pages == 0)); } int switch_native(struct domain *d) { unsigned int vcpuid; if ( d == NULL ) return -EINVAL; if ( !may_switch_mode(d) ) return -EACCES; if ( !is_pv_32on64_domain(d) ) return 0; d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0; for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ ) { if (d->vcpu[vcpuid]) release_compat_l4(d->vcpu[vcpuid]); } return 0; } int switch_compat(struct domain *d) { unsigned int vcpuid; if ( d == NULL ) return -EINVAL; if ( is_pvh_domain(d) ) { printk(XENLOG_G_INFO "Xen currently does not support 32bit PVH guests\n"); return -EINVAL; } if ( !may_switch_mode(d) ) return -EACCES; if ( is_pv_32on64_domain(d) ) return 0; d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1; for ( vcpuid = 0; vcpuid < d->max_vcpus; vcpuid++ ) { if ( (d->vcpu[vcpuid] != NULL) && (setup_compat_l4(d->vcpu[vcpuid]) != 0) ) goto undo_and_fail; } domain_set_alloc_bitsize(d); return 0; undo_and_fail: d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0; while ( vcpuid-- != 0 ) { if ( d->vcpu[vcpuid] != NULL ) release_compat_l4(d->vcpu[vcpuid]); } return -ENOMEM; } int vcpu_initialise(struct vcpu *v) { struct domain *d = v->domain; int rc; v->arch.flags = TF_kernel_mode; rc = mapcache_vcpu_init(v); if ( rc ) return rc; paging_vcpu_init(v); if ( (rc = vcpu_init_fpu(v)) 
!= 0 ) return rc; vmce_init_vcpu(v); if ( has_hvm_container_domain(d) ) { rc = hvm_vcpu_initialise(v); goto done; } spin_lock_init(&v->arch.pv_vcpu.shadow_ldt_lock); if ( !is_idle_domain(d) ) { rc = create_perdomain_mapping(d, GDT_VIRT_START(v), 1 << GDT_LDT_VCPU_SHIFT, d->arch.pv_domain.gdt_ldt_l1tab, NULL); if ( rc ) goto done; BUILD_BUG_ON(NR_VECTORS * sizeof(*v->arch.pv_vcpu.trap_ctxt) > PAGE_SIZE); v->arch.pv_vcpu.trap_ctxt = xzalloc_array(struct trap_info, NR_VECTORS); if ( !v->arch.pv_vcpu.trap_ctxt ) { rc = -ENOMEM; goto done; } /* PV guests by default have a 100Hz ticker. */ v->periodic_period = MILLISECS(10); /* PV guests get an emulated PIT too for video BIOSes to use. */ if ( v->vcpu_id == 0 ) pit_init(v, cpu_khz); } v->arch.schedule_tail = continue_nonidle_domain; v->arch.ctxt_switch_from = paravirt_ctxt_switch_from; v->arch.ctxt_switch_to = paravirt_ctxt_switch_to; if ( is_idle_domain(d) ) { v->arch.schedule_tail = continue_idle_domain; v->arch.cr3 = __pa(idle_pg_table); } v->arch.pv_vcpu.ctrlreg[4] = real_cr4_to_pv_guest_cr4(mmu_cr4_features); rc = is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0; done: if ( rc ) { vcpu_destroy_fpu(v); if ( is_pv_domain(d) ) xfree(v->arch.pv_vcpu.trap_ctxt); } return rc; } void vcpu_destroy(struct vcpu *v) { if ( is_pv_32on64_vcpu(v) ) release_compat_l4(v); vcpu_destroy_fpu(v); if ( has_hvm_container_vcpu(v) ) hvm_vcpu_destroy(v); else xfree(v->arch.pv_vcpu.trap_ctxt); } int arch_domain_create(struct domain *d, unsigned int domcr_flags) { int i, paging_initialised = 0; int rc = -ENOMEM; d->arch.hvm_domain.hap_enabled = has_hvm_container_domain(d) && hvm_funcs.hap_supported && (domcr_flags & DOMCRF_hap); d->arch.hvm_domain.mem_sharing_enabled = 0; d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity); INIT_LIST_HEAD(&d->arch.pdev_list); d->arch.relmem = RELMEM_not_started; INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); if ( d->domain_id && !is_idle_domain(d) && cpu_has_amd_erratum(&boot_cpu_data, AMD_ERRATUM_121) ) { if ( !opt_allow_unsafe ) { printk(XENLOG_G_ERR "Xen does not allow DomU creation on this CPU" " for security reasons.\n"); return -EPERM; } printk(XENLOG_G_WARNING "Dom%d may compromise security on this CPU.\n", d->domain_id); } if ( has_hvm_container_domain(d) ) rc = create_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0, NULL, NULL); else if ( is_idle_domain(d) ) rc = 0; else { d->arch.pv_domain.gdt_ldt_l1tab = alloc_xenheap_pages(0, MEMF_node(domain_to_node(d))); if ( !d->arch.pv_domain.gdt_ldt_l1tab ) goto fail; clear_page(d->arch.pv_domain.gdt_ldt_l1tab); rc = create_perdomain_mapping(d, GDT_LDT_VIRT_START, GDT_LDT_MBYTES << (20 - PAGE_SHIFT), NULL, NULL); } if ( rc ) goto fail; mapcache_domain_init(d); HYPERVISOR_COMPAT_VIRT_START(d) = is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u; if ( (rc = paging_domain_init(d, domcr_flags)) != 0 ) goto fail; paging_initialised = 1; if ( !is_idle_domain(d) ) { d->arch.cpuids = xzalloc_array(cpuid_input_t, MAX_CPUID_INPUT); rc = -ENOMEM; if ( d->arch.cpuids == NULL ) goto fail; for ( i = 0; i < MAX_CPUID_INPUT; i++ ) { d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED; d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED; } d->arch.ioport_caps = rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex); rc = -ENOMEM; if ( d->arch.ioport_caps == NULL ) goto fail; /* * The shared_info machine address must fit in a 32-bit field within a * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32). 
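 *
 * I.e. the machine address of the page allocated below must lie below 4GB
 * so that a 32-bit guest can store it in the 32-bit shared_info field of
 * its start_info; MEMF_bits(32) expresses that constraint to the allocator.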
*/ if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL ) goto fail; clear_page(d->shared_info); share_xen_page_with_guest( virt_to_page(d->shared_info), d, XENSHARE_writable); if ( (rc = init_domain_irq_mapping(d)) != 0 ) goto fail; if ( (rc = iommu_domain_init(d)) != 0 ) goto fail; } spin_lock_init(&d->arch.e820_lock); if ( has_hvm_container_domain(d) ) { if ( (rc = hvm_domain_initialise(d)) != 0 ) { iommu_domain_destroy(d); goto fail; } } else /* 64-bit PV guest by default. */ d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0; /* initialize default tsc behavior in case tools don't */ tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0); spin_lock_init(&d->arch.vtsc_lock); return 0; fail: d->is_dying = DOMDYING_dead; cleanup_domain_irq_mapping(d); free_xenheap_page(d->shared_info); if ( paging_initialised ) paging_final_teardown(d); free_perdomain_mappings(d); if ( is_pv_domain(d) ) free_xenheap_page(d->arch.pv_domain.gdt_ldt_l1tab); return rc; } void arch_domain_destroy(struct domain *d) { if ( has_hvm_container_domain(d) ) hvm_domain_destroy(d); xfree(d->arch.e820); free_domain_pirqs(d); if ( !is_idle_domain(d) ) iommu_domain_destroy(d); paging_final_teardown(d); free_perdomain_mappings(d); if ( is_pv_domain(d) ) free_xenheap_page(d->arch.pv_domain.gdt_ldt_l1tab); free_xenheap_page(d->shared_info); cleanup_domain_irq_mapping(d); } unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4) { unsigned long hv_cr4_mask, hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4()); hv_cr4_mask = ~X86_CR4_TSD; if ( cpu_has_de ) hv_cr4_mask &= ~X86_CR4_DE; if ( cpu_has_fsgsbase && !is_pv_32bit_domain(v->domain) ) hv_cr4_mask &= ~X86_CR4_FSGSBASE; if ( cpu_has_xsave ) hv_cr4_mask &= ~X86_CR4_OSXSAVE; if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) ) gdprintk(XENLOG_WARNING, "Attempt to change CR4 flags %08lx -> %08lx\n", hv_cr4, guest_cr4); return (hv_cr4 & hv_cr4_mask) | (guest_cr4 & ~hv_cr4_mask); } #define xen_vcpu_guest_context vcpu_guest_context #define fpu_ctxt fpu_ctxt.x CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt); #undef fpu_ctxt #undef xen_vcpu_guest_context /* Called by XEN_DOMCTL_setvcpucontext and VCPUOP_initialise. */ int arch_set_info_guest( struct vcpu *v, vcpu_guest_context_u c) { struct domain *d = v->domain; unsigned long cr3_gfn; struct page_info *cr3_page; unsigned long flags, cr4; unsigned int i; int rc = 0, compat; /* The context is a compat-mode one if the target domain is compat-mode; * we expect the tools to DTRT even in compat-mode callers. */ compat = is_pv_32on64_domain(d); #define c(fld) (compat ? 
(c.cmp->fld) : (c.nat->fld)) flags = c(flags); if ( is_pv_vcpu(v) ) { if ( !compat ) { if ( !is_canonical_address(c.nat->user_regs.eip) || !is_canonical_address(c.nat->event_callback_eip) || !is_canonical_address(c.nat->syscall_callback_eip) || !is_canonical_address(c.nat->failsafe_callback_eip) ) return -EINVAL; fixup_guest_stack_selector(d, c.nat->user_regs.ss); fixup_guest_stack_selector(d, c.nat->kernel_ss); fixup_guest_code_selector(d, c.nat->user_regs.cs); for ( i = 0; i < ARRAY_SIZE(c.nat->trap_ctxt); i++ ) { if ( !is_canonical_address(c.nat->trap_ctxt[i].address) ) return -EINVAL; fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs); } if ( !__addr_ok(c.nat->ldt_base) ) return -EINVAL; } else { fixup_guest_stack_selector(d, c.cmp->user_regs.ss); fixup_guest_stack_selector(d, c.cmp->kernel_ss); fixup_guest_code_selector(d, c.cmp->user_regs.cs); fixup_guest_code_selector(d, c.cmp->event_callback_cs); fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs); for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); i++ ) fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs); } /* LDT safety checks. */ if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) || (c(ldt_ents) > 8192) ) return -EINVAL; } else if ( is_pvh_vcpu(v) ) { /* PVH 32bitfixme */ ASSERT(!compat); if ( c(ctrlreg[0]) || c(ctrlreg[1]) || c(ctrlreg[2]) || c(ctrlreg[4]) || c(ctrlreg[5]) || c(ctrlreg[6]) || c(ctrlreg[7]) || c(ldt_base) || c(ldt_ents) || c(user_regs.cs) || c(user_regs.ss) || c(user_regs.es) || c(user_regs.ds) || c(user_regs.fs) || c(user_regs.gs) || c(kernel_ss) || c(kernel_sp) || c.nat->gs_base_kernel || c.nat->gdt_ents || c.nat->fs_base || c.nat->gs_base_user ) return -EINVAL; } v->fpu_initialised = !!(flags & VGCF_I387_VALID); v->arch.flags &= ~TF_kernel_mode; if ( (flags & VGCF_in_kernel) || has_hvm_container_vcpu(v)/*???*/ ) v->arch.flags |= TF_kernel_mode; v->arch.vgc_flags = flags; if ( flags & VGCF_I387_VALID ) { memcpy(v->arch.fpu_ctxt, &c.nat->fpu_ctxt, sizeof(c.nat->fpu_ctxt)); if ( v->arch.xsave_area ) v->arch.xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE; } if ( !compat ) { memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs)); if ( is_pv_vcpu(v) ) memcpy(v->arch.pv_vcpu.trap_ctxt, c.nat->trap_ctxt, sizeof(c.nat->trap_ctxt)); } else { XLAT_cpu_user_regs(&v->arch.user_regs, &c.cmp->user_regs); for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i ) XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i, c.cmp->trap_ctxt + i); } if ( has_hvm_container_vcpu(v) ) { for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i ) v->arch.debugreg[i] = c(debugreg[i]); hvm_set_info_guest(v); if ( is_hvm_vcpu(v) || v->is_initialised ) goto out; /* NB: No need to use PV cr3 un-pickling macros */ cr3_gfn = c(ctrlreg[3]) >> PAGE_SHIFT; cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC); v->arch.cr3 = page_to_maddr(cr3_page); v->arch.hvm_vcpu.guest_cr[3] = c.nat->ctrlreg[3]; v->arch.guest_table = pagetable_from_page(cr3_page); ASSERT(paging_mode_enabled(d)); goto pvh_skip_pv_stuff; } init_int80_direct_trap(v); /* IOPL privileges are virtualised. */ v->arch.pv_vcpu.iopl = (v->arch.user_regs.eflags >> 12) & 3; v->arch.user_regs.eflags &= ~X86_EFLAGS_IOPL; /* Ensure real hardware interrupts are enabled. 
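 *
 * The guest's own notion of "interrupts enabled" lives in
 * vcpu_info(v, evtchn_upcall_mask); the hardware IF on the guest frame must
 * always be set so that Xen keeps receiving interrupts while the guest
 * runs, hence the unconditional
 *
 *   v->arch.user_regs.eflags |= X86_EFLAGS_IF;
 *
 * that follows.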
*/ v->arch.user_regs.eflags |= X86_EFLAGS_IF; if ( !v->is_initialised ) { if ( !compat && !(flags & VGCF_in_kernel) && !c.nat->ctrlreg[1] ) return -EINVAL; v->arch.pv_vcpu.ldt_base = c(ldt_base); v->arch.pv_vcpu.ldt_ents = c(ldt_ents); } else { unsigned long pfn = pagetable_get_pfn(v->arch.guest_table); bool_t fail; if ( !compat ) { fail = xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[3]; if ( pagetable_is_null(v->arch.guest_table_user) ) fail |= c.nat->ctrlreg[1] || !(flags & VGCF_in_kernel); else { pfn = pagetable_get_pfn(v->arch.guest_table_user); fail |= xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[1]; } } else { l4_pgentry_t *l4tab = map_domain_page(pfn); pfn = l4e_get_pfn(*l4tab); unmap_domain_page(l4tab); fail = compat_pfn_to_cr3(pfn) != c.cmp->ctrlreg[3]; } for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i ) fail |= v->arch.pv_vcpu.gdt_frames[i] != c(gdt_frames[i]); fail |= v->arch.pv_vcpu.gdt_ents != c(gdt_ents); fail |= v->arch.pv_vcpu.ldt_base != c(ldt_base); fail |= v->arch.pv_vcpu.ldt_ents != c(ldt_ents); if ( fail ) return -EOPNOTSUPP; } v->arch.pv_vcpu.kernel_ss = c(kernel_ss); v->arch.pv_vcpu.kernel_sp = c(kernel_sp); for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.ctrlreg); ++i ) v->arch.pv_vcpu.ctrlreg[i] = c(ctrlreg[i]); v->arch.pv_vcpu.event_callback_eip = c(event_callback_eip); v->arch.pv_vcpu.failsafe_callback_eip = c(failsafe_callback_eip); if ( !compat ) { v->arch.pv_vcpu.syscall_callback_eip = c.nat->syscall_callback_eip; v->arch.pv_vcpu.fs_base = c.nat->fs_base; v->arch.pv_vcpu.gs_base_kernel = c.nat->gs_base_kernel; v->arch.pv_vcpu.gs_base_user = c.nat->gs_base_user; } else { v->arch.pv_vcpu.event_callback_cs = c(event_callback_cs); v->arch.pv_vcpu.failsafe_callback_cs = c(failsafe_callback_cs); } v->arch.pv_vcpu.vm_assist = c(vm_assist); /* Only CR0.TS is modifiable by guest or admin. */ v->arch.pv_vcpu.ctrlreg[0] &= X86_CR0_TS; v->arch.pv_vcpu.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS; cr4 = v->arch.pv_vcpu.ctrlreg[4]; v->arch.pv_vcpu.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(v, cr4) : real_cr4_to_pv_guest_cr4(mmu_cr4_features); memset(v->arch.debugreg, 0, sizeof(v->arch.debugreg)); for ( i = 0; i < 8; i++ ) (void)set_debugreg(v, i, c(debugreg[i])); if ( v->is_initialised ) goto out; if ( v->vcpu_id == 0 ) d->vm_assist = c(vm_assist); rc = put_old_guest_table(current); if ( rc ) return rc; if ( !compat ) rc = (int)set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents); else { unsigned long gdt_frames[ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames)]; unsigned int n = (c.cmp->gdt_ents + 511) / 512; if ( n > ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames) ) return -EINVAL; for ( i = 0; i < n; ++i ) gdt_frames[i] = c.cmp->gdt_frames[i]; rc = (int)set_gdt(v, gdt_frames, c.cmp->gdt_ents); } if ( rc != 0 ) return rc; set_bit(_VPF_in_reset, &v->pause_flags); if ( !compat ) cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[3]); else cr3_gfn = compat_cr3_to_pfn(c.cmp->ctrlreg[3]); cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC); if ( !cr3_page ) rc = -EINVAL; else if ( paging_mode_refcounts(d) ) /* nothing */; else if ( cr3_page == v->arch.old_guest_table ) { v->arch.old_guest_table = NULL; put_page(cr3_page); } else { if ( !compat ) rc = put_old_guest_table(v); if ( !rc ) rc = get_page_type_preemptible(cr3_page, !compat ? 
PGT_root_page_table : PGT_l3_page_table); switch ( rc ) { case -EINTR: rc = -EAGAIN; case -EAGAIN: case 0: break; default: if ( cr3_page == current->arch.old_guest_table ) cr3_page = NULL; break; } } if ( rc ) /* handled below */; else if ( !compat ) { v->arch.guest_table = pagetable_from_page(cr3_page); if ( c.nat->ctrlreg[1] ) { cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[1]); cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC); if ( !cr3_page ) rc = -EINVAL; else if ( !paging_mode_refcounts(d) ) { rc = get_page_type_preemptible(cr3_page, PGT_root_page_table); switch ( rc ) { case -EINTR: rc = -EAGAIN; case -EAGAIN: v->arch.old_guest_table = pagetable_get_page(v->arch.guest_table); v->arch.guest_table = pagetable_null(); break; default: if ( cr3_page == current->arch.old_guest_table ) cr3_page = NULL; case 0: break; } } if ( !rc ) v->arch.guest_table_user = pagetable_from_page(cr3_page); } } else { l4_pgentry_t *l4tab; l4tab = map_domain_page(pagetable_get_pfn(v->arch.guest_table)); *l4tab = l4e_from_pfn(page_to_mfn(cr3_page), _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED); unmap_domain_page(l4tab); } if ( rc ) { if ( cr3_page ) put_page(cr3_page); destroy_gdt(v); return rc; } clear_bit(_VPF_in_reset, &v->pause_flags); pvh_skip_pv_stuff: if ( v->vcpu_id == 0 ) update_domain_wallclock_time(d); /* Don't redo final setup */ v->is_initialised = 1; if ( paging_mode_enabled(d) ) paging_update_paging_modes(v); update_cr3(v); out: if ( flags & VGCF_online ) clear_bit(_VPF_down, &v->pause_flags); else set_bit(_VPF_down, &v->pause_flags); return 0; #undef c } int arch_vcpu_reset(struct vcpu *v) { if ( is_pv_vcpu(v) ) { destroy_gdt(v); return vcpu_destroy_pagetables(v); } vcpu_end_shutdown_deferral(v); return 0; } long arch_do_vcpu_op( int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg) { long rc = 0; switch ( cmd ) { case VCPUOP_register_vcpu_time_memory_area: { struct vcpu_register_time_memory_area area; rc = -EFAULT; if ( copy_from_guest(&area, arg, 1) ) break; if ( !guest_handle_okay(area.addr.h, 1) ) break; rc = 0; v->arch.time_info_guest = area.addr.h; force_update_vcpu_system_time(v); break; } case VCPUOP_get_physid: { struct vcpu_get_physid cpu_id; rc = -EINVAL; if ( !is_pinned_vcpu(v) ) break; cpu_id.phys_id = (uint64_t)x86_cpu_to_apicid[v->vcpu_id] | ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32); rc = -EFAULT; if ( copy_to_guest(arg, &cpu_id, 1) ) break; rc = 0; break; } default: rc = -ENOSYS; break; } return rc; } #define loadsegment(seg,value) ({ \ int __r = 1; \ asm volatile ( \ "1: movl %k1,%%" #seg "\n2:\n" \ ".section .fixup,\"ax\"\n" \ "3: xorl %k0,%k0\n" \ " movl %k0,%%" #seg "\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE(1b, 3b) \ : "=r" (__r) : "r" (value), "0" (__r) );\ __r; }) /* * save_segments() writes a mask of segments which are dirty (non-zero), * allowing load_segments() to avoid some expensive segment loads and * MSR writes. */ static DEFINE_PER_CPU(unsigned int, dirty_segment_mask); #define DIRTY_DS 0x01 #define DIRTY_ES 0x02 #define DIRTY_FS 0x04 #define DIRTY_GS 0x08 #define DIRTY_FS_BASE 0x10 #define DIRTY_GS_BASE_USER 0x20 static void load_segments(struct vcpu *n) { struct cpu_user_regs *uregs = &n->arch.user_regs; int all_segs_okay = 1; unsigned int dirty_segment_mask, cpu = smp_processor_id(); /* Load and clear the dirty segment mask. */ dirty_segment_mask = per_cpu(dirty_segment_mask, cpu); per_cpu(dirty_segment_mask, cpu) = 0; /* Either selector != 0 ==> reload. 
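 *
 * In other words, for each of DS/ES/FS/GS the test below amounts to
 * (sketch):
 *
 *   dirty bit set (outgoing vcpu left a non-null selector)  -> reload
 *   incoming selector non-null                              -> reload
 *   both zero                                               -> nothing to load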
*/ if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) ) all_segs_okay &= loadsegment(ds, uregs->ds); /* Either selector != 0 ==> reload. */ if ( unlikely((dirty_segment_mask & DIRTY_ES) | uregs->es) ) all_segs_okay &= loadsegment(es, uregs->es); /* * Either selector != 0 ==> reload. * Also reload to reset FS_BASE if it was non-zero. */ if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) | uregs->fs) ) all_segs_okay &= loadsegment(fs, uregs->fs); /* * Either selector != 0 ==> reload. * Also reload to reset GS_BASE if it was non-zero. */ if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) | uregs->gs) ) { /* Reset GS_BASE with user %gs? */ if ( (dirty_segment_mask & DIRTY_GS) || !n->arch.pv_vcpu.gs_base_user ) all_segs_okay &= loadsegment(gs, uregs->gs); } if ( !is_pv_32on64_domain(n->domain) ) { /* This can only be non-zero if selector is NULL. */ if ( n->arch.pv_vcpu.fs_base ) wrfsbase(n->arch.pv_vcpu.fs_base); /* Most kernels have non-zero GS base, so don't bother testing. */ /* (This is also a serialising instruction, avoiding AMD erratum #88.) */ wrmsrl(MSR_SHADOW_GS_BASE, n->arch.pv_vcpu.gs_base_kernel); /* This can only be non-zero if selector is NULL. */ if ( n->arch.pv_vcpu.gs_base_user ) wrgsbase(n->arch.pv_vcpu.gs_base_user); /* If in kernel mode then switch the GS bases around. */ if ( (n->arch.flags & TF_kernel_mode) ) asm volatile ( "swapgs" ); } if ( unlikely(!all_segs_okay) ) { struct pv_vcpu *pv = &n->arch.pv_vcpu; struct cpu_user_regs *regs = guest_cpu_user_regs(); unsigned long *rsp = (n->arch.flags & TF_kernel_mode) ? (unsigned long *)regs->rsp : (unsigned long *)pv->kernel_sp; unsigned long cs_and_mask, rflags; if ( is_pv_32on64_domain(n->domain) ) { unsigned int *esp = ring_1(regs) ? (unsigned int *)regs->rsp : (unsigned int *)pv->kernel_sp; unsigned int cs_and_mask, eflags; int ret = 0; /* CS longword also contains full evtchn_upcall_mask. */ cs_and_mask = (unsigned short)regs->cs | ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16); /* Fold upcall mask into RFLAGS.IF. */ eflags = regs->_eflags & ~X86_EFLAGS_IF; eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9; if ( !ring_1(regs) ) { ret = put_user(regs->ss, esp-1); ret |= put_user(regs->_esp, esp-2); esp -= 2; } if ( ret | put_user(eflags, esp-1) | put_user(cs_and_mask, esp-2) | put_user(regs->_eip, esp-3) | put_user(uregs->gs, esp-4) | put_user(uregs->fs, esp-5) | put_user(uregs->es, esp-6) | put_user(uregs->ds, esp-7) ) { gdprintk(XENLOG_ERR, "Error while creating compat " "failsafe callback frame.\n"); domain_crash(n->domain); } if ( test_bit(_VGCF_failsafe_disables_events, &n->arch.vgc_flags) ) vcpu_info(n, evtchn_upcall_mask) = 1; regs->entry_vector |= TRAP_syscall; regs->_eflags &= 0xFFFCBEFFUL; regs->ss = FLAT_COMPAT_KERNEL_SS; regs->_esp = (unsigned long)(esp-7); regs->cs = FLAT_COMPAT_KERNEL_CS; regs->_eip = pv->failsafe_callback_eip; return; } if ( !(n->arch.flags & TF_kernel_mode) ) toggle_guest_mode(n); else regs->cs &= ~3; /* CS longword also contains full evtchn_upcall_mask. */ cs_and_mask = (unsigned long)regs->cs | ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32); /* Fold upcall mask into RFLAGS.IF. 
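 *
 * As in the compat path above, the RFLAGS image pushed on the failsafe
 * frame is (sketch)
 *
 *   (regs->rflags & ~X86_EFLAGS_IF) | (!evtchn_upcall_mask << 9)
 *
 * so the guest sees its virtual interrupt-enable state in the usual IF
 * position (bit 9).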
*/ rflags = regs->rflags & ~X86_EFLAGS_IF; rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9; if ( put_user(regs->ss, rsp- 1) | put_user(regs->rsp, rsp- 2) | put_user(rflags, rsp- 3) | put_user(cs_and_mask, rsp- 4) | put_user(regs->rip, rsp- 5) | put_user(uregs->gs, rsp- 6) | put_user(uregs->fs, rsp- 7) | put_user(uregs->es, rsp- 8) | put_user(uregs->ds, rsp- 9) | put_user(regs->r11, rsp-10) | put_user(regs->rcx, rsp-11) ) { gdprintk(XENLOG_ERR, "Error while creating failsafe " "callback frame.\n"); domain_crash(n->domain); } if ( test_bit(_VGCF_failsafe_disables_events, &n->arch.vgc_flags) ) vcpu_info(n, evtchn_upcall_mask) = 1; regs->entry_vector |= TRAP_syscall; regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF| X86_EFLAGS_NT|X86_EFLAGS_TF); regs->ss = FLAT_KERNEL_SS; regs->rsp = (unsigned long)(rsp-11); regs->cs = FLAT_KERNEL_CS; regs->rip = pv->failsafe_callback_eip; } } static void save_segments(struct vcpu *v) { struct cpu_user_regs *regs = &v->arch.user_regs; unsigned int dirty_segment_mask = 0; regs->ds = read_segment_register(ds); regs->es = read_segment_register(es); regs->fs = read_segment_register(fs); regs->gs = read_segment_register(gs); if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) ) { v->arch.pv_vcpu.fs_base = __rdfsbase(); if ( v->arch.flags & TF_kernel_mode ) v->arch.pv_vcpu.gs_base_kernel = __rdgsbase(); else v->arch.pv_vcpu.gs_base_user = __rdgsbase(); } if ( regs->ds ) dirty_segment_mask |= DIRTY_DS; if ( regs->es ) dirty_segment_mask |= DIRTY_ES; if ( regs->fs || is_pv_32on64_domain(v->domain) ) { dirty_segment_mask |= DIRTY_FS; v->arch.pv_vcpu.fs_base = 0; /* != 0 selector kills fs_base */ } else if ( v->arch.pv_vcpu.fs_base ) { dirty_segment_mask |= DIRTY_FS_BASE; } if ( regs->gs || is_pv_32on64_domain(v->domain) ) { dirty_segment_mask |= DIRTY_GS; v->arch.pv_vcpu.gs_base_user = 0; /* != 0 selector kills gs_base_user */ } else if ( v->arch.pv_vcpu.gs_base_user ) { dirty_segment_mask |= DIRTY_GS_BASE_USER; } this_cpu(dirty_segment_mask) = dirty_segment_mask; } #define switch_kernel_stack(v) ((void)0) static void paravirt_ctxt_switch_from(struct vcpu *v) { save_segments(v); /* * Disable debug breakpoints. We do this aggressively because if we switch * to an HVM guest we may load DR0-DR3 with values that can cause #DE * inside Xen, before we get a chance to reload DR7, and this cannot always * safely be handled. */ if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) ) write_debugreg(7, 0); } static void paravirt_ctxt_switch_to(struct vcpu *v) { unsigned long cr4; set_int80_direct_trap(v); switch_kernel_stack(v); cr4 = pv_guest_cr4_to_real_cr4(v); if ( unlikely(cr4 != read_cr4()) ) write_cr4(cr4); if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) ) { write_debugreg(0, v->arch.debugreg[0]); write_debugreg(1, v->arch.debugreg[1]); write_debugreg(2, v->arch.debugreg[2]); write_debugreg(3, v->arch.debugreg[3]); write_debugreg(6, v->arch.debugreg[6]); write_debugreg(7, v->arch.debugreg[7]); } if ( (v->domain->arch.tsc_mode == TSC_MODE_PVRDTSCP) && boot_cpu_has(X86_FEATURE_RDTSCP) ) write_rdtscp_aux(v->domain->arch.incarnation); } /* Update per-VCPU guest runstate shared memory area (if registered). 
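 *
 * For a domain with a 32-bit shared info the runstate is first translated
 * via XLAT_vcpu_runstate_info() into the compat layout before being copied;
 * otherwise it is copied directly.  A zero return causes
 * _update_runstate_area() to latch need_update_runstate_area (for PV vcpus
 * in user mode) so the update can be retried later.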
*/ bool_t update_runstate_area(const struct vcpu *v) { if ( guest_handle_is_null(runstate_guest(v)) ) return 1; if ( has_32bit_shinfo(v->domain) ) { struct compat_vcpu_runstate_info info; XLAT_vcpu_runstate_info(&info, &v->runstate); __copy_to_guest(v->runstate_guest.compat, &info, 1); return 1; } return __copy_to_guest(runstate_guest(v), &v->runstate, 1) != sizeof(v->runstate); } static void _update_runstate_area(struct vcpu *v) { if ( !update_runstate_area(v) && is_pv_vcpu(v) && !(v->arch.flags & TF_kernel_mode) ) v->arch.pv_vcpu.need_update_runstate_area = 1; } static inline int need_full_gdt(struct vcpu *v) { return (is_pv_vcpu(v) && !is_idle_vcpu(v)); } static void __context_switch(void) { struct cpu_user_regs *stack_regs = guest_cpu_user_regs(); unsigned int cpu = smp_processor_id(); struct vcpu *p = per_cpu(curr_vcpu, cpu); struct vcpu *n = current; struct desc_struct *gdt; struct desc_ptr gdt_desc; ASSERT(p != n); ASSERT(cpumask_empty(n->vcpu_dirty_cpumask)); if ( !is_idle_vcpu(p) ) { memcpy(&p->arch.user_regs, stack_regs, CTXT_SWITCH_STACK_BYTES); vcpu_save_fpu(p); p->arch.ctxt_switch_from(p); } /* * Mark this CPU in next domain's dirty cpumasks before calling * ctxt_switch_to(). This avoids a race on things like EPT flushing, * which is synchronised on that function. */ if ( p->domain != n->domain ) cpumask_set_cpu(cpu, n->domain->domain_dirty_cpumask); cpumask_set_cpu(cpu, n->vcpu_dirty_cpumask); if ( !is_idle_vcpu(n) ) { memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES); if ( cpu_has_xsave ) { u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE; if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) ) BUG(); } vcpu_restore_fpu_eager(n); n->arch.ctxt_switch_to(n); } gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) : per_cpu(compat_gdt_table, cpu); if ( need_full_gdt(n) ) { unsigned long mfn = virt_to_mfn(gdt); l1_pgentry_t *pl1e = gdt_ldt_ptes(n->domain, n); unsigned int i; for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ ) l1e_write(pl1e + FIRST_RESERVED_GDT_PAGE + i, l1e_from_pfn(mfn + i, __PAGE_HYPERVISOR)); } if ( need_full_gdt(p) && ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(n)) ) { gdt_desc.limit = LAST_RESERVED_GDT_BYTE; gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY); asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); } write_ptbase(n); if ( need_full_gdt(n) && ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(p)) ) { gdt_desc.limit = LAST_RESERVED_GDT_BYTE; gdt_desc.base = GDT_VIRT_START(n); asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); } if ( p->domain != n->domain ) cpumask_clear_cpu(cpu, p->domain->domain_dirty_cpumask); cpumask_clear_cpu(cpu, p->vcpu_dirty_cpumask); per_cpu(curr_vcpu, cpu) = n; } void context_switch(struct vcpu *prev, struct vcpu *next) { unsigned int cpu = smp_processor_id(); cpumask_t dirty_mask; ASSERT(local_irq_is_enabled()); cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask); /* Allow at most one CPU at a time to be dirty. */ ASSERT(cpumask_weight(&dirty_mask) <= 1); if ( unlikely(!cpumask_test_cpu(cpu, &dirty_mask) && !cpumask_empty(&dirty_mask)) ) { /* Other cpus call __sync_local_execstate from flush ipi handler. 
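The flush IPI therefore also makes the remote CPU save next's still-loaded register state via __context_switch() before we reload it here.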
*/ flush_tlb_mask(&dirty_mask); } if ( prev != next ) _update_runstate_area(prev); if ( is_hvm_vcpu(prev) ) { if (prev != next) vpmu_save(prev); if ( !list_empty(&prev->arch.hvm_vcpu.tm_list) ) pt_save_timer(prev); } local_irq_disable(); set_current(next); if ( (per_cpu(curr_vcpu, cpu) == next) || (is_idle_vcpu(next) && cpu_online(cpu)) ) { local_irq_enable(); } else { __context_switch(); if ( is_pv_vcpu(next) && (is_idle_vcpu(prev) || has_hvm_container_vcpu(prev) || is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) ) { uint64_t efer = read_efer(); if ( !(efer & EFER_SCE) ) write_efer(efer | EFER_SCE); } /* Re-enable interrupts before restoring state which may fault. */ local_irq_enable(); if ( is_pv_vcpu(next) ) { load_LDT(next); load_segments(next); } set_cpuid_faulting(is_pv_vcpu(next) && (next->domain->domain_id != 0)); } if (is_hvm_vcpu(next) && (prev != next) ) /* Must be done with interrupts enabled */ vpmu_load(next); context_saved(prev); if ( prev != next ) _update_runstate_area(next); /* Ensure that the vcpu has an up-to-date time base. */ update_vcpu_system_time(next); schedule_tail(next); BUG(); } void continue_running(struct vcpu *same) { schedule_tail(same); BUG(); } int __sync_local_execstate(void) { unsigned long flags; int switch_required; local_irq_save(flags); switch_required = (this_cpu(curr_vcpu) != current); if ( switch_required ) { ASSERT(current == idle_vcpu[smp_processor_id()]); __context_switch(); } local_irq_restore(flags); return switch_required; } void sync_local_execstate(void) { (void)__sync_local_execstate(); } void sync_vcpu_execstate(struct vcpu *v) { if ( cpumask_test_cpu(smp_processor_id(), v->vcpu_dirty_cpumask) ) sync_local_execstate(); /* Other cpus call __sync_local_execstate from flush ipi handler. */ flush_tlb_mask(v->vcpu_dirty_cpumask); } #define next_arg(fmt, args) ({ \ unsigned long __arg; \ switch ( *(fmt)++ ) \ { \ case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \ case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \ case 'h': __arg = (unsigned long)va_arg(args, void *); break; \ default: __arg = 0; BUG(); \ } \ __arg; \ }) void hypercall_cancel_continuation(void) { struct cpu_user_regs *regs = guest_cpu_user_regs(); struct mc_state *mcs = ¤t->mc_state; if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) { __clear_bit(_MCSF_call_preempted, &mcs->flags); } else { if ( is_pv_vcpu(current) ) regs->eip += 2; /* skip re-execute 'syscall' / 'int $xx' */ else current->arch.hvm_vcpu.hcall_preempted = 0; } } unsigned long hypercall_create_continuation( unsigned int op, const char *format, ...) { struct mc_state *mcs = ¤t->mc_state; struct cpu_user_regs *regs; const char *p = format; unsigned long arg; unsigned int i; va_list args; va_start(args, format); if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) { __set_bit(_MCSF_call_preempted, &mcs->flags); for ( i = 0; *p != '\0'; i++ ) mcs->call.args[i] = next_arg(p, args); if ( is_pv_32on64_domain(current->domain) ) { for ( ; i < 6; i++ ) mcs->call.args[i] = 0; } } else { regs = guest_cpu_user_regs(); regs->eax = op; /* Ensure the hypercall trap instruction is re-executed. */ if ( is_pv_vcpu(current) ) regs->eip -= 2; /* re-execute 'syscall' / 'int $xx' */ else current->arch.hvm_vcpu.hcall_preempted = 1; if ( is_pv_vcpu(current) ? 
!is_pv_32on64_vcpu(current) : (hvm_guest_x86_mode(current) == 8) ) { for ( i = 0; *p != '\0'; i++ ) { arg = next_arg(p, args); switch ( i ) { case 0: regs->rdi = arg; break; case 1: regs->rsi = arg; break; case 2: regs->rdx = arg; break; case 3: regs->r10 = arg; break; case 4: regs->r8 = arg; break; case 5: regs->r9 = arg; break; } } } else { if ( supervisor_mode_kernel ) regs->eip &= ~31; /* re-execute entire hypercall entry stub */ for ( i = 0; *p != '\0'; i++ ) { arg = next_arg(p, args); switch ( i ) { case 0: regs->ebx = arg; break; case 1: regs->ecx = arg; break; case 2: regs->edx = arg; break; case 3: regs->esi = arg; break; case 4: regs->edi = arg; break; case 5: regs->ebp = arg; break; } } } } va_end(args); return op; } int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...) { int rc = 0; struct mc_state *mcs = ¤t->mc_state; struct cpu_user_regs *regs; unsigned int i, cval = 0; unsigned long nval = 0; va_list args; BUG_ON(id && *id > 5); BUG_ON(id && (mask & (1U << *id))); va_start(args, mask); if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) { if ( !test_bit(_MCSF_call_preempted, &mcs->flags) ) { va_end(args); return 0; } for ( i = 0; i < 6; ++i, mask >>= 1 ) { if ( mask & 1 ) { nval = va_arg(args, unsigned long); cval = va_arg(args, unsigned int); if ( cval == nval ) mask &= ~1U; else BUG_ON(nval == (unsigned int)nval); } else if ( id && *id == i ) { *id = mcs->call.args[i]; id = NULL; } if ( (mask & 1) && mcs->call.args[i] == nval ) { mcs->call.args[i] = cval; ++rc; } else BUG_ON(mcs->call.args[i] != (unsigned int)mcs->call.args[i]); } } else { regs = guest_cpu_user_regs(); for ( i = 0; i < 6; ++i, mask >>= 1 ) { unsigned long *reg; switch ( i ) { case 0: reg = ®s->ebx; break; case 1: reg = ®s->ecx; break; case 2: reg = ®s->edx; break; case 3: reg = ®s->esi; break; case 4: reg = ®s->edi; break; case 5: reg = ®s->ebp; break; default: BUG(); reg = NULL; break; } if ( (mask & 1) ) { nval = va_arg(args, unsigned long); cval = va_arg(args, unsigned int); if ( cval == nval ) mask &= ~1U; else BUG_ON(nval == (unsigned int)nval); } else if ( id && *id == i ) { *id = *reg; id = NULL; } if ( (mask & 1) && *reg == nval ) { *reg = cval; ++rc; } else BUG_ON(*reg != (unsigned int)*reg); } } va_end(args); return rc; } static int relinquish_memory( struct domain *d, struct page_list_head *list, unsigned long type) { struct page_info *page; unsigned long x, y; int ret = 0; /* Use a recursive lock, as we may enter 'free_domheap_page'. */ spin_lock_recursive(&d->page_alloc_lock); while ( (page = page_list_remove_head(list)) ) { /* Grab a reference to the page so it won't disappear from under us. */ if ( unlikely(!get_page(page, d)) ) { /* Couldn't get a reference -- someone is freeing this page. */ page_list_add_tail(page, &d->arch.relmem_list); continue; } if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) ret = put_page_and_type_preemptible(page); switch ( ret ) { case 0: break; case -EAGAIN: case -EINTR: ret = -EAGAIN; page_list_add(page, list); set_bit(_PGT_pinned, &page->u.inuse.type_info); put_page(page); goto out; default: BUG(); } clear_superpage_mark(page); if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); /* * Forcibly invalidate top-most, still valid page tables at this point * to break circular 'linear page table' references as well as clean up * partially validated pages. This is okay because MMU structures are * not shared across domains and this domain is now dead. 
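* (The domain's vcpus are no longer running anywhere, so nothing can concurrently re-validate or start using these tables.)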
Thus top-most * valid tables are not in use so a non-zero count means circular * reference or partially validated. */ y = page->u.inuse.type_info; for ( ; ; ) { x = y; if ( likely((x & PGT_type_mask) != type) || likely(!(x & (PGT_validated|PGT_partial))) ) break; y = cmpxchg(&page->u.inuse.type_info, x, x & ~(PGT_validated|PGT_partial)); if ( likely(y == x) ) { /* No need for atomic update of type_info here: noone else updates it. */ switch ( ret = free_page_type(page, x, 1) ) { case 0: break; case -EINTR: page_list_add(page, list); page->u.inuse.type_info |= PGT_validated; if ( x & PGT_partial ) put_page(page); put_page(page); ret = -EAGAIN; goto out; case -EAGAIN: page_list_add(page, list); page->u.inuse.type_info |= PGT_partial; if ( x & PGT_partial ) put_page(page); goto out; default: BUG(); } if ( x & PGT_partial ) { page->u.inuse.type_info--; put_page(page); } break; } } /* Put the page on the list and /then/ potentially free it. */ page_list_add_tail(page, &d->arch.relmem_list); put_page(page); if ( hypercall_preempt_check() ) { ret = -EAGAIN; goto out; } } /* list is empty at this point. */ page_list_move(list, &d->arch.relmem_list); out: spin_unlock_recursive(&d->page_alloc_lock); return ret; } int domain_relinquish_resources(struct domain *d) { int ret; struct vcpu *v; BUG_ON(!cpumask_empty(d->domain_dirty_cpumask)); switch ( d->arch.relmem ) { case RELMEM_not_started: pci_release_devices(d); /* Tear down paging-assistance stuff. */ paging_teardown(d); /* Drop the in-use references to page-table bases. */ for_each_vcpu ( d, v ) { ret = vcpu_destroy_pagetables(v); if ( ret ) return ret; } if ( is_pv_domain(d) ) { for_each_vcpu ( d, v ) { /* * Relinquish GDT mappings. No need for explicit unmapping of * the LDT as it automatically gets squashed with the guest * mappings. */ destroy_gdt(v); } if ( d->arch.pv_domain.pirq_eoi_map != NULL ) { unmap_domain_page_global(d->arch.pv_domain.pirq_eoi_map); put_page_and_type( mfn_to_page(d->arch.pv_domain.pirq_eoi_map_mfn)); d->arch.pv_domain.pirq_eoi_map = NULL; d->arch.pv_domain.auto_unmask = 0; } } d->arch.relmem = RELMEM_shared; /* fallthrough */ case RELMEM_shared: if ( is_hvm_domain(d) ) { /* If the domain has shared pages, relinquish them allowing * for preemption. */ ret = relinquish_shared_pages(d); if ( ret ) return ret; } d->arch.relmem = RELMEM_xen; spin_lock(&d->page_alloc_lock); page_list_splice(&d->arch.relmem_list, &d->page_list); INIT_PAGE_LIST_HEAD(&d->arch.relmem_list); spin_unlock(&d->page_alloc_lock); /* Fallthrough. Relinquish every page of memory. 
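Each RELMEM_* state below is restartable: relinquish_memory() hands back -EAGAIN on preemption and d->arch.relmem records how far teardown has progressed, so the operation can simply be retried until it completes.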
*/ case RELMEM_xen: ret = relinquish_memory(d, &d->xenpage_list, ~0UL); if ( ret ) return ret; d->arch.relmem = RELMEM_l4; /* fallthrough */ case RELMEM_l4: ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table); if ( ret ) return ret; d->arch.relmem = RELMEM_l3; /* fallthrough */ case RELMEM_l3: ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table); if ( ret ) return ret; d->arch.relmem = RELMEM_l2; /* fallthrough */ case RELMEM_l2: ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table); if ( ret ) return ret; d->arch.relmem = RELMEM_done; /* fallthrough */ case RELMEM_done: break; default: BUG(); } if ( has_hvm_container_domain(d) ) hvm_domain_relinquish_resources(d); return 0; } void arch_dump_domain_info(struct domain *d) { paging_dump_domain_info(d); } void arch_dump_vcpu_info(struct vcpu *v) { paging_dump_vcpu_info(v); if ( is_hvm_vcpu(v) ) vpmu_dump(v); } void domain_cpuid( struct domain *d, unsigned int input, unsigned int sub_input, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { cpuid_input_t *cpuid; int i; for ( i = 0; i < MAX_CPUID_INPUT; i++ ) { cpuid = &d->arch.cpuids[i]; if ( (cpuid->input[0] == input) && ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) || (cpuid->input[1] == sub_input)) ) { *eax = cpuid->eax; *ebx = cpuid->ebx; *ecx = cpuid->ecx; *edx = cpuid->edx; /* * Do not advertise host's invariant TSC unless the TSC is * emulated, or the domain cannot migrate to other hosts. */ if ( (input == 0x80000007) && /* Advanced Power Management */ !d->disable_migrate && !d->arch.vtsc ) *edx &= ~(1u<<8); /* TSC Invariant */ return; } } *eax = *ebx = *ecx = *edx = 0; } void vcpu_kick(struct vcpu *v) { /* * NB1. 'pause_flags' and 'processor' must be checked /after/ update of * pending flag. These values may fluctuate (after all, we hold no * locks) but the key insight is that each change will cause * evtchn_upcall_pending to be polled. * * NB2. We save the running flag across the unblock to avoid a needless * IPI for domains that we IPI'd to unblock. */ bool_t running = v->is_running; vcpu_unblock(v); if ( running && (in_irq() || (v != current)) ) cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ); } void vcpu_mark_events_pending(struct vcpu *v) { int already_pending = test_and_set_bit( 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending)); if ( already_pending ) return; if ( has_hvm_container_vcpu(v) ) hvm_assert_evtchn_irq(v); else vcpu_kick(v); } static void vcpu_kick_softirq(void) { /* * Nothing to do here: we merely prevent notifiers from racing with checks * executed on return to guest context with interrupts enabled. See, for * example, xxx_intr_assist() executed on return to HVM guest context. */ } static int __init init_vcpu_kick_softirq(void) { open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq); return 0; } __initcall(init_vcpu_kick_softirq); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/x86_64/0000775000175000017500000000000012307313555013535 5ustar smbsmbxen-4.4.0/xen/arch/x86/x86_64/asm-offsets.c0000664000175000017500000001577612307313555016150 0ustar smbsmb/* * Generate definitions needed by assembly language modules. * This code generates raw asm output which is post-processed * to extract and format the required data. 
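* Each DEFINE() below emits a "->SYMBOL value" marker into the compiler's assembly output; the build post-processes those markers into #defines in asm-offsets.h.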
*/ #define COMPILE_OFFSETS #include #include #include #include #include #include #include #include #define DEFINE(_sym, _val) \ __asm__ __volatile__ ( "\n->" #_sym " %0 " #_val : : "i" (_val) ) #define BLANK() \ __asm__ __volatile__ ( "\n->" : : ) #define OFFSET(_sym, _str, _mem) \ DEFINE(_sym, offsetof(_str, _mem)); void __dummy__(void) { OFFSET(UREGS_r15, struct cpu_user_regs, r15); OFFSET(UREGS_r14, struct cpu_user_regs, r14); OFFSET(UREGS_r13, struct cpu_user_regs, r13); OFFSET(UREGS_r12, struct cpu_user_regs, r12); OFFSET(UREGS_rbp, struct cpu_user_regs, rbp); OFFSET(UREGS_rbx, struct cpu_user_regs, rbx); OFFSET(UREGS_r11, struct cpu_user_regs, r11); OFFSET(UREGS_r10, struct cpu_user_regs, r10); OFFSET(UREGS_r9, struct cpu_user_regs, r9); OFFSET(UREGS_r8, struct cpu_user_regs, r8); OFFSET(UREGS_rax, struct cpu_user_regs, rax); OFFSET(UREGS_rcx, struct cpu_user_regs, rcx); OFFSET(UREGS_rdx, struct cpu_user_regs, rdx); OFFSET(UREGS_rsi, struct cpu_user_regs, rsi); OFFSET(UREGS_rdi, struct cpu_user_regs, rdi); OFFSET(UREGS_error_code, struct cpu_user_regs, error_code); OFFSET(UREGS_entry_vector, struct cpu_user_regs, entry_vector); OFFSET(UREGS_saved_upcall_mask, struct cpu_user_regs, saved_upcall_mask); OFFSET(UREGS_rip, struct cpu_user_regs, rip); OFFSET(UREGS_cs, struct cpu_user_regs, cs); OFFSET(UREGS_eflags, struct cpu_user_regs, eflags); OFFSET(UREGS_rsp, struct cpu_user_regs, rsp); OFFSET(UREGS_ss, struct cpu_user_regs, ss); OFFSET(UREGS_ds, struct cpu_user_regs, ds); OFFSET(UREGS_es, struct cpu_user_regs, es); OFFSET(UREGS_fs, struct cpu_user_regs, fs); OFFSET(UREGS_gs, struct cpu_user_regs, gs); OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, es); DEFINE(UREGS_user_sizeof, sizeof(struct cpu_user_regs)); BLANK(); OFFSET(irq_caps_offset, struct domain, irq_caps); OFFSET(next_in_list_offset, struct domain, next_in_list); OFFSET(VCPU_processor, struct vcpu, processor); OFFSET(VCPU_domain, struct vcpu, domain); OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info); OFFSET(VCPU_trap_bounce, struct vcpu, arch.pv_vcpu.trap_bounce); OFFSET(VCPU_int80_bounce, struct vcpu, arch.pv_vcpu.int80_bounce); OFFSET(VCPU_thread_flags, struct vcpu, arch.flags); OFFSET(VCPU_event_addr, struct vcpu, arch.pv_vcpu.event_callback_eip); OFFSET(VCPU_event_sel, struct vcpu, arch.pv_vcpu.event_callback_cs); OFFSET(VCPU_failsafe_addr, struct vcpu, arch.pv_vcpu.failsafe_callback_eip); OFFSET(VCPU_failsafe_sel, struct vcpu, arch.pv_vcpu.failsafe_callback_cs); OFFSET(VCPU_syscall_addr, struct vcpu, arch.pv_vcpu.syscall_callback_eip); OFFSET(VCPU_syscall32_addr, struct vcpu, arch.pv_vcpu.syscall32_callback_eip); OFFSET(VCPU_syscall32_sel, struct vcpu, arch.pv_vcpu.syscall32_callback_cs); OFFSET(VCPU_syscall32_disables_events, struct vcpu, arch.pv_vcpu.syscall32_disables_events); OFFSET(VCPU_sysenter_addr, struct vcpu, arch.pv_vcpu.sysenter_callback_eip); OFFSET(VCPU_sysenter_sel, struct vcpu, arch.pv_vcpu.sysenter_callback_cs); OFFSET(VCPU_sysenter_disables_events, struct vcpu, arch.pv_vcpu.sysenter_disables_events); OFFSET(VCPU_trap_ctxt, struct vcpu, arch.pv_vcpu.trap_ctxt); OFFSET(VCPU_kernel_sp, struct vcpu, arch.pv_vcpu.kernel_sp); OFFSET(VCPU_kernel_ss, struct vcpu, arch.pv_vcpu.kernel_ss); OFFSET(VCPU_guest_context_flags, struct vcpu, arch.vgc_flags); OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending); OFFSET(VCPU_mce_pending, struct vcpu, mce_pending); OFFSET(VCPU_nmi_old_mask, struct vcpu, nmi_state.old_mask); OFFSET(VCPU_mce_old_mask, struct vcpu, mce_state.old_mask); OFFSET(VCPU_async_exception_mask, struct 
vcpu, async_exception_mask); DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI); DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE); DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events); DEFINE(_VGCF_syscall_disables_events, _VGCF_syscall_disables_events); BLANK(); OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa); OFFSET(VCPU_svm_vmcb, struct vcpu, arch.hvm_svm.vmcb); OFFSET(VCPU_svm_vmcb_in_sync, struct vcpu, arch.hvm_svm.vmcb_in_sync); BLANK(); OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched); OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode); OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate); OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask); OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]); BLANK(); OFFSET(VCPU_nhvm_guestmode, struct vcpu, arch.hvm_vcpu.nvcpu.nv_guestmode); OFFSET(VCPU_nhvm_p2m, struct vcpu, arch.hvm_vcpu.nvcpu.nv_p2m); OFFSET(VCPU_nsvm_hap_enabled, struct vcpu, arch.hvm_vcpu.nvcpu.u.nsvm.ns_hap_enabled); BLANK(); OFFSET(DOMAIN_is_32bit_pv, struct domain, arch.is_32bit_pv); BLANK(); OFFSET(VMCB_rax, struct vmcb_struct, rax); OFFSET(VMCB_rip, struct vmcb_struct, rip); OFFSET(VMCB_rsp, struct vmcb_struct, rsp); OFFSET(VMCB_rflags, struct vmcb_struct, rflags); BLANK(); OFFSET(VCPUINFO_upcall_pending, struct vcpu_info, evtchn_upcall_pending); OFFSET(VCPUINFO_upcall_mask, struct vcpu_info, evtchn_upcall_mask); BLANK(); OFFSET(COMPAT_VCPUINFO_upcall_pending, struct compat_vcpu_info, evtchn_upcall_pending); OFFSET(COMPAT_VCPUINFO_upcall_mask, struct compat_vcpu_info, evtchn_upcall_mask); BLANK(); OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); OFFSET(CPUINFO_processor_id, struct cpu_info, processor_id); OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info)); BLANK(); OFFSET(TRAPINFO_eip, struct trap_info, address); OFFSET(TRAPINFO_cs, struct trap_info, cs); OFFSET(TRAPINFO_flags, struct trap_info, flags); DEFINE(TRAPINFO_sizeof, sizeof(struct trap_info)); BLANK(); OFFSET(TRAPBOUNCE_error_code, struct trap_bounce, error_code); OFFSET(TRAPBOUNCE_flags, struct trap_bounce, flags); OFFSET(TRAPBOUNCE_cs, struct trap_bounce, cs); OFFSET(TRAPBOUNCE_eip, struct trap_bounce, eip); BLANK(); #if PERF_COUNTERS DEFINE(ASM_PERFC_hypercalls, PERFC_hypercalls); DEFINE(ASM_PERFC_exceptions, PERFC_exceptions); BLANK(); #endif DEFINE(IRQSTAT_shift, LOG_2(sizeof(irq_cpustat_t))); OFFSET(IRQSTAT_softirq_pending, irq_cpustat_t, __softirq_pending); BLANK(); OFFSET(CPUINFO86_ext_features, struct cpuinfo_x86, x86_capability[1]); BLANK(); OFFSET(MB_flags, multiboot_info_t, flags); OFFSET(MB_cmdline, multiboot_info_t, cmdline); } xen-4.4.0/xen/arch/x86/x86_64/Makefile0000664000175000017500000000064512307313555015202 0ustar smbsmbsubdir-y += compat obj-bin-y += entry.o obj-bin-y += gpr_switch.o obj-y += mm.o obj-y += traps.o obj-y += machine_kexec.o obj-y += pci.o obj-y += acpi_mmcfg.o obj-y += mmconf-fam10h.o obj-y += mmconfig_64.o obj-y += mmconfig-shared.o obj-y += compat.o obj-y += domain.o obj-y += physdev.o obj-y += platform_hypercall.o obj-y += cpu_idle.o obj-y += cpufreq.o obj-bin-y += kexec_reloc.o obj-$(crash_debug) += gdbstub.o xen-4.4.0/xen/arch/x86/x86_64/acpi_mmcfg.c0000664000175000017500000000713512307313555015774 0ustar smbsmb/* * acpi_mmconfig.c - Architecture-Specific Low-Level ACPI Boot Support * * Copyright (C) 2001, 2002 Paul Diefenbaugh * Copyright (C) 2001 Jun Nakajima * * 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * copied from Linux */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mmconfig.h" /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ struct acpi_mcfg_allocation *pci_mmcfg_config; int pci_mmcfg_config_num; static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, struct acpi_mcfg_allocation *cfg) { int year; if (cfg->address < 0xFFFFFFFF) return 0; if (!strcmp(mcfg->header.oem_id, "SGI") || !strcmp(mcfg->header.oem_id, "SGI2")) return 0; if (mcfg->header.revision >= 1 && dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year >= 2010) return 0; printk(KERN_ERR "MCFG region for %04x:%02x-%02x at %#"PRIx64 " (above 4GB) ignored\n", cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number, cfg->address); return -EINVAL; } int __init acpi_parse_mcfg(struct acpi_table_header *header) { struct acpi_table_mcfg *mcfg; unsigned long i; if (!header) return -EINVAL; mcfg = (struct acpi_table_mcfg *)header; /* how many config structures do we have */ pci_mmcfg_config_num = 0; i = header->length - sizeof(struct acpi_table_mcfg); while (i >= sizeof(struct acpi_mcfg_allocation)) { ++pci_mmcfg_config_num; i -= sizeof(struct acpi_mcfg_allocation); }; if (pci_mmcfg_config_num == 0) { printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); return -ENODEV; } pci_mmcfg_config = xmalloc_array(struct acpi_mcfg_allocation, pci_mmcfg_config_num); if (!pci_mmcfg_config) { printk(KERN_WARNING PREFIX "No memory for MCFG config tables\n"); return -ENOMEM; } memcpy(pci_mmcfg_config, &mcfg[1], pci_mmcfg_config_num * sizeof(*pci_mmcfg_config)); for (i = 0; i < pci_mmcfg_config_num; ++i) { if (acpi_mcfg_check_entry(mcfg, &pci_mmcfg_config[i])) { xfree(pci_mmcfg_config); pci_mmcfg_config_num = 0; return -ENODEV; } pci_add_segment(pci_mmcfg_config[i].pci_segment); } return 0; } xen-4.4.0/xen/arch/x86/x86_64/cpufreq.c0000664000175000017500000000441712307313555015354 0ustar smbsmb/****************************************************************************** * cpufreq.c -- adapt 32b compat guest to 64b hypervisor. * * Copyright (C) 2008, Liu Jinsong * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include #include #include #include #include #include DEFINE_XEN_GUEST_HANDLE(compat_processor_px_t); int compat_set_px_pminfo(uint32_t cpu, struct compat_processor_performance *perf) { struct xen_processor_performance *xen_perf; unsigned long xlat_page_current; xlat_malloc_init(xlat_page_current); xen_perf = xlat_malloc_array(xlat_page_current, struct xen_processor_performance, 1); if ( unlikely(xen_perf == NULL) ) return -EFAULT; #define XLAT_processor_performance_HNDL_states(_d_, _s_) do { \ XEN_GUEST_HANDLE(compat_processor_px_t) states; \ XEN_GUEST_HANDLE_PARAM(xen_processor_px_t) states_t; \ if ( unlikely(!compat_handle_okay((_s_)->states, (_s_)->state_count)) ) \ return -EFAULT; \ guest_from_compat_handle(states, (_s_)->states); \ states_t = guest_handle_cast(states, xen_processor_px_t); \ (_d_)->states = guest_handle_from_param(states_t, xen_processor_px_t); \ } while (0) XLAT_processor_performance(xen_perf, perf); #undef XLAT_processor_performance_HNDL_states return set_px_pminfo(cpu, xen_perf); } xen-4.4.0/xen/arch/x86/x86_64/machine_kexec.c0000664000175000017500000000117712307313555016472 0ustar smbsmb/****************************************************************************** * machine_kexec.c * * Xen port written by: * - Simon 'Horms' Horman * - Magnus Damm */ #include #include #include #include int machine_kexec_get_xen(xen_kexec_range_t *range) { range->start = virt_to_maddr(_start); range->size = virt_to_maddr(_end) - (unsigned long)range->start; return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/x86_64/mmconfig.h0000664000175000017500000000573112307313555015513 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
* * Author: Allen Kay - adapted from linux */ #define PCI_VENDOR_ID_INTEL 0x8086 #define PCI_DEVICE_ID_INTEL_E7520_MCH 0x3590 #define PCI_DEVICE_ID_INTEL_82945G_HB 0x2770 /* ioport ends */ #define PCI_PROBE_BIOS 0x0001 #define PCI_PROBE_CONF1 0x0002 #define PCI_PROBE_CONF2 0x0004 #define PCI_PROBE_MMCONF 0x0008 #define PCI_PROBE_MASK 0x000f #define PCI_PROBE_NOEARLY 0x0010 #define PCI_VENDOR_ID_AMD 0x1022 #define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 #define PCI_VENDOR_ID_NVIDIA 0x10de extern unsigned int pci_probe; /* * AMD Fam10h CPUs are buggy, and cannot access MMIO config space * on their northbrige except through the * %eax register. As such, you MUST * NOT use normal IOMEM accesses, you need to only use the magic mmio-config * accessor functions. * In fact just use pci_config_*, nothing else please. */ static inline unsigned char mmio_config_readb(void __iomem *pos) { u8 val; asm volatile("movb (%1),%%al" : "=a" (val) : "r" (pos)); return val; } static inline unsigned short mmio_config_readw(void __iomem *pos) { u16 val; asm volatile("movw (%1),%%ax" : "=a" (val) : "r" (pos)); return val; } static inline unsigned int mmio_config_readl(void __iomem *pos) { u32 val; asm volatile("movl (%1),%%eax" : "=a" (val) : "r" (pos)); return val; } static inline void mmio_config_writeb(void __iomem *pos, u8 val) { asm volatile("movb %%al,(%1)" :: "a" (val), "r" (pos) : "memory"); } static inline void mmio_config_writew(void __iomem *pos, u16 val) { asm volatile("movw %%ax,(%1)" :: "a" (val), "r" (pos) : "memory"); } static inline void mmio_config_writel(void __iomem *pos, u32 val) { asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory"); } /* external variable defines */ extern int pci_mmcfg_config_num; extern struct acpi_mcfg_allocation *pci_mmcfg_config; /* function prototypes */ int acpi_parse_mcfg(struct acpi_table_header *header); int pci_mmcfg_reserved(uint64_t address, unsigned int segment, unsigned int start_bus, unsigned int end_bus, unsigned int flags); int pci_mmcfg_arch_init(void); int pci_mmcfg_arch_enable(unsigned int); void pci_mmcfg_arch_disable(unsigned int); xen-4.4.0/xen/arch/x86/x86_64/compat/0000775000175000017500000000000012307313555015020 5ustar smbsmbxen-4.4.0/xen/arch/x86/x86_64/compat/Makefile0000664000175000017500000000002612307313555016456 0ustar smbsmbobj-bin-y += entry.o xen-4.4.0/xen/arch/x86/x86_64/compat/entry.S0000664000175000017500000004332712307313555016316 0ustar smbsmb/* * Compatibility hypercall routines. */ #include #include #include #include #include #include #include #include #include ENTRY(compat_hypercall) pushq $0 SAVE_VOLATILE type=TRAP_syscall compat=1 cmpb $0,untrusted_msi(%rip) UNLIKELY_START(ne, msi_check) movl $HYPERCALL_VECTOR,%edi call check_for_unexpected_msi LOAD_C_CLOBBERED UNLIKELY_END(msi_check) GET_CURRENT(%rbx) cmpl $NR_hypercalls,%eax jae compat_bad_hypercall #ifndef NDEBUG /* Deliberately corrupt parameter regs not used by this hypercall. 
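Unused argument slots are filled with 0xDEADBEEF (the rep stosq below) so that debug builds catch handlers which read arguments the guest never supplied.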
*/ pushq UREGS_rbx(%rsp); pushq %rcx; pushq %rdx; pushq %rsi; pushq %rdi pushq UREGS_rbp+5*8(%rsp) leaq compat_hypercall_args_table(%rip),%r10 movl %eax,%eax movl $6,%ecx subb (%r10,%rax,1),%cl movq %rsp,%rdi movl $0xDEADBEEF,%eax rep stosq popq %r8 ; popq %r9 ; xchgl %r8d,%r9d /* Args 5&6: zero extend */ popq %rdx; popq %rcx; xchgl %edx,%ecx /* Args 3&4: zero extend */ popq %rdi; popq %rsi; xchgl %edi,%esi /* Args 1&2: zero extend */ movl UREGS_rax(%rsp),%eax pushq %rax pushq UREGS_rip+8(%rsp) #define SHADOW_BYTES 16 /* Shadow EIP + shadow hypercall # */ #else /* Relocate argument registers and zero-extend to 64 bits. */ movl %eax,%eax /* Hypercall # */ xchgl %ecx,%esi /* Arg 2, Arg 4 */ movl %edx,%edx /* Arg 3 */ movl %edi,%r8d /* Arg 5 */ movl %ebp,%r9d /* Arg 6 */ movl UREGS_rbx(%rsp),%edi /* Arg 1 */ #define SHADOW_BYTES 0 /* No on-stack shadow state */ #endif cmpb $0,tb_init_done(%rip) UNLIKELY_START(ne, compat_trace) call __trace_hypercall_entry /* Restore the registers that __trace_hypercall_entry clobbered. */ movl UREGS_rax+SHADOW_BYTES(%rsp),%eax /* Hypercall # */ movl UREGS_rbx+SHADOW_BYTES(%rsp),%edi /* Arg 1 */ movl UREGS_rcx+SHADOW_BYTES(%rsp),%esi /* Arg 2 */ movl UREGS_rdx+SHADOW_BYTES(%rsp),%edx /* Arg 3 */ movl UREGS_rsi+SHADOW_BYTES(%rsp),%ecx /* Arg 4 */ movl UREGS_rdi+SHADOW_BYTES(%rsp),%r8d /* Arg 5 */ movl UREGS_rbp+SHADOW_BYTES(%rsp),%r9d /* Arg 6 */ #undef SHADOW_BYTES UNLIKELY_END(compat_trace) leaq compat_hypercall_table(%rip),%r10 PERFC_INCR(hypercalls, %rax, %rbx) callq *(%r10,%rax,8) #ifndef NDEBUG /* Deliberately corrupt parameter regs used by this hypercall. */ popq %r10 # Shadow RIP cmpq %r10,UREGS_rip+8(%rsp) popq %rcx # Shadow hypercall index jne compat_skip_clobber /* If RIP has changed then don't clobber. */ leaq compat_hypercall_args_table(%rip),%r10 movb (%r10,%rcx,1),%cl movl $0xDEADBEEF,%r10d testb %cl,%cl; jz compat_skip_clobber; movl %r10d,UREGS_rbx(%rsp) cmpb $2, %cl; jb compat_skip_clobber; movl %r10d,UREGS_rcx(%rsp) cmpb $3, %cl; jb compat_skip_clobber; movl %r10d,UREGS_rdx(%rsp) cmpb $4, %cl; jb compat_skip_clobber; movl %r10d,UREGS_rsi(%rsp) cmpb $5, %cl; jb compat_skip_clobber; movl %r10d,UREGS_rdi(%rsp) cmpb $6, %cl; jb compat_skip_clobber; movl %r10d,UREGS_rbp(%rsp) compat_skip_clobber: #endif movl %eax,UREGS_rax(%rsp) # save the return value /* %rbx: struct vcpu */ ENTRY(compat_test_all_events) ASSERT_NOT_IN_ATOMIC cli # tests must not race interrupts /*compat_test_softirqs:*/ movl VCPU_processor(%rbx),%eax shll $IRQSTAT_shift,%eax leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx cmpl $0,(%rcx,%rax,1) jne compat_process_softirqs testb $1,VCPU_mce_pending(%rbx) jnz compat_process_mce .Lcompat_test_guest_nmi: testb $1,VCPU_nmi_pending(%rbx) jnz compat_process_nmi compat_test_guest_events: movq VCPU_vcpu_info(%rbx),%rax movzwl COMPAT_VCPUINFO_upcall_pending(%rax),%eax decl %eax cmpl $0xfe,%eax ja compat_restore_all_guest /*compat_process_guest_events:*/ sti leaq VCPU_trap_bounce(%rbx),%rdx movl VCPU_event_addr(%rbx),%eax movl %eax,TRAPBOUNCE_eip(%rdx) movl VCPU_event_sel(%rbx),%eax movw %ax,TRAPBOUNCE_cs(%rdx) movb $TBF_INTERRUPT,TRAPBOUNCE_flags(%rdx) call compat_create_bounce_frame jmp compat_test_all_events ALIGN /* %rbx: struct vcpu */ compat_process_softirqs: sti andl $~TRAP_regs_partial,UREGS_entry_vector(%rsp) call do_softirq jmp compat_test_all_events ALIGN /* %rbx: struct vcpu */ compat_process_mce: testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%rbx) jnz .Lcompat_test_guest_nmi sti movb $0,VCPU_mce_pending(%rbx) call 
set_guest_machinecheck_trapbounce testl %eax,%eax jz compat_test_all_events movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the movb %dl,VCPU_mce_old_mask(%rbx) # iret hypercall orl $1 << VCPU_TRAP_MCE,%edx movb %dl,VCPU_async_exception_mask(%rbx) jmp compat_process_trap ALIGN /* %rbx: struct vcpu */ compat_process_nmi: testb $1 << VCPU_TRAP_NMI,VCPU_async_exception_mask(%rbx) jnz compat_test_guest_events sti movb $0,VCPU_nmi_pending(%rbx) call set_guest_nmi_trapbounce testl %eax,%eax jz compat_test_all_events movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the movb %dl,VCPU_nmi_old_mask(%rbx) # iret hypercall orl $1 << VCPU_TRAP_NMI,%edx movb %dl,VCPU_async_exception_mask(%rbx) /* FALLTHROUGH */ compat_process_trap: leaq VCPU_trap_bounce(%rbx),%rdx call compat_create_bounce_frame jmp compat_test_all_events compat_bad_hypercall: movl $-ENOSYS,UREGS_rax(%rsp) jmp compat_test_all_events /* %rbx: struct vcpu, interrupts disabled */ ENTRY(compat_restore_all_guest) ASSERT_INTERRUPTS_DISABLED RESTORE_ALL adj=8 compat=1 .Lft0: iretq .section .fixup,"ax" .Lfx0: sti SAVE_ALL movq UREGS_error_code(%rsp),%rsi movq %rsp,%rax andq $~0xf,%rsp pushq $__HYPERVISOR_DS # SS pushq %rax # RSP pushfq # RFLAGS pushq $__HYPERVISOR_CS # CS leaq .Ldf0(%rip),%rax pushq %rax # RIP pushq %rsi # error_code/entry_vector jmp handle_exception .Ldf0: GET_CURRENT(%rbx) jmp compat_test_all_events compat_failsafe_callback: GET_CURRENT(%rbx) leaq VCPU_trap_bounce(%rbx),%rdx movl VCPU_failsafe_addr(%rbx),%eax movl %eax,TRAPBOUNCE_eip(%rdx) movl VCPU_failsafe_sel(%rbx),%eax movw %ax,TRAPBOUNCE_cs(%rdx) movb $TBF_FAILSAFE,TRAPBOUNCE_flags(%rdx) btq $_VGCF_failsafe_disables_events,VCPU_guest_context_flags(%rbx) jnc 1f orb $TBF_INTERRUPT,TRAPBOUNCE_flags(%rdx) 1: call compat_create_bounce_frame jmp compat_test_all_events .previous _ASM_PRE_EXTABLE(.Lft0, .Lfx0) _ASM_EXTABLE(.Ldf0, compat_failsafe_callback) /* %rdx: trap_bounce, %rbx: struct vcpu */ ENTRY(compat_post_handle_exception) testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) jz compat_test_all_events .Lcompat_bounce_exception: call compat_create_bounce_frame movb $0,TRAPBOUNCE_flags(%rdx) jmp compat_test_all_events ENTRY(compat_syscall) cmpb $0,VCPU_syscall32_disables_events(%rbx) movzwl VCPU_syscall32_sel(%rbx),%esi movq VCPU_syscall32_addr(%rbx),%rax setne %cl leaq VCPU_trap_bounce(%rbx),%rdx testl $~3,%esi leal (,%rcx,TBF_INTERRUPT),%ecx UNLIKELY_START(z, compat_syscall_gpf) movq VCPU_trap_ctxt(%rbx),%rdi movl $TRAP_gp_fault,UREGS_entry_vector(%rsp) subl $2,UREGS_rip(%rsp) movl $0,TRAPBOUNCE_error_code(%rdx) movl TRAP_gp_fault * TRAPINFO_sizeof + TRAPINFO_eip(%rdi),%eax movzwl TRAP_gp_fault * TRAPINFO_sizeof + TRAPINFO_cs(%rdi),%esi testb $4,TRAP_gp_fault * TRAPINFO_sizeof + TRAPINFO_flags(%rdi) setnz %cl leal TBF_EXCEPTION|TBF_EXCEPTION_ERRCODE(,%rcx,TBF_INTERRUPT),%ecx UNLIKELY_END(compat_syscall_gpf) movq %rax,TRAPBOUNCE_eip(%rdx) movw %si,TRAPBOUNCE_cs(%rdx) movb %cl,TRAPBOUNCE_flags(%rdx) jmp .Lcompat_bounce_exception ENTRY(compat_sysenter) movq VCPU_trap_ctxt(%rbx),%rcx cmpb $TRAP_gp_fault,UREGS_entry_vector(%rsp) movzwl VCPU_sysenter_sel(%rbx),%eax movzwl TRAP_gp_fault * TRAPINFO_sizeof + TRAPINFO_cs(%rcx),%ecx cmovel %ecx,%eax testl $~3,%eax movl $FLAT_COMPAT_USER_SS,UREGS_ss(%rsp) cmovzl %ecx,%eax movw %ax,TRAPBOUNCE_cs(%rdx) call compat_create_bounce_frame jmp compat_test_all_events ENTRY(compat_int80_direct_trap) call compat_create_bounce_frame jmp compat_test_all_events /* CREATE A BASIC EXCEPTION FRAME ON GUEST OS (RING-1) STACK: */ /* 
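New guest stack pointer ends up at the lowest entry; ESP and SS are pushed only when the guest was not already in kernel (ring-1) context: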
{[ERRCODE,] EIP, CS, EFLAGS, [ESP, SS]} */ /* %rdx: trap_bounce, %rbx: struct vcpu */ /* On return only %rbx and %rdx are guaranteed non-clobbered. */ compat_create_bounce_frame: ASSERT_INTERRUPTS_ENABLED mov %fs,%edi testb $2,UREGS_cs+8(%rsp) jz 1f /* Push new frame at registered guest-OS stack base. */ movl VCPU_kernel_sp(%rbx),%esi .Lft1: mov VCPU_kernel_ss(%rbx),%fs subl $2*4,%esi movl UREGS_rsp+8(%rsp),%eax .Lft2: movl %eax,%fs:(%rsi) movl UREGS_ss+8(%rsp),%eax .Lft3: movl %eax,%fs:4(%rsi) jmp 2f 1: /* In kernel context already: push new frame at existing %rsp. */ movl UREGS_rsp+8(%rsp),%esi .Lft4: mov UREGS_ss+8(%rsp),%fs 2: movb TRAPBOUNCE_flags(%rdx),%cl subl $3*4,%esi movq VCPU_vcpu_info(%rbx),%rax pushq COMPAT_VCPUINFO_upcall_mask(%rax) testb $TBF_INTERRUPT,%cl setnz %ch # TBF_INTERRUPT -> set upcall mask orb %ch,COMPAT_VCPUINFO_upcall_mask(%rax) popq %rax shll $16,%eax # Bits 16-23: saved_upcall_mask movw UREGS_cs+8(%rsp),%ax # Bits 0-15: CS .Lft5: movl %eax,%fs:4(%rsi) # CS / saved_upcall_mask shrl $16,%eax testb %al,%al # Bits 0-7: saved_upcall_mask setz %ch # %ch == !saved_upcall_mask movl UREGS_eflags+8(%rsp),%eax andl $~X86_EFLAGS_IF,%eax addb %ch,%ch # Bit 9 (EFLAGS.IF) orb %ch,%ah # Fold EFLAGS.IF into %eax .Lft6: movl %eax,%fs:2*4(%rsi) # EFLAGS movl UREGS_rip+8(%rsp),%eax .Lft7: movl %eax,%fs:(%rsi) # EIP testb $TBF_EXCEPTION_ERRCODE,%cl jz 1f subl $4,%esi movl TRAPBOUNCE_error_code(%rdx),%eax .Lft8: movl %eax,%fs:(%rsi) # ERROR CODE 1: testb $TBF_FAILSAFE,%cl UNLIKELY_START(nz, compat_bounce_failsafe) subl $4*4,%esi movl %gs,%eax .Lft9: movl %eax,%fs:3*4(%rsi) # GS .Lft10: movl %edi,%fs:2*4(%rsi) # FS movl %es,%eax .Lft11: movl %eax,%fs:1*4(%rsi) # ES movl %ds,%eax .Lft12: movl %eax,%fs:0*4(%rsi) # DS UNLIKELY_END(compat_bounce_failsafe) /* Rewrite our stack frame and return to guest-OS mode. */ /* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */ andl $~(X86_EFLAGS_VM|X86_EFLAGS_RF|\ X86_EFLAGS_NT|X86_EFLAGS_TF),UREGS_eflags+8(%rsp) mov %fs,UREGS_ss+8(%rsp) movl %esi,UREGS_rsp+8(%rsp) .Lft13: mov %edi,%fs movzwl TRAPBOUNCE_cs(%rdx),%eax /* Null selectors (0-3) are not allowed. 
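The bounce frame's CS is loaded on the way back to the guest, so a 0-3 value cannot be tolerated; the domain is crashed synchronously instead.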
*/ testl $~3,%eax UNLIKELY_START(z, compat_bounce_null_selector) lea UNLIKELY_DISPATCH_LABEL(compat_bounce_null_selector)(%rip), %rdi jmp asm_domain_crash_synchronous /* Does not return */ __UNLIKELY_END(compat_bounce_null_selector) movl %eax,UREGS_cs+8(%rsp) movl TRAPBOUNCE_eip(%rdx),%eax movl %eax,UREGS_rip+8(%rsp) ret .section .fixup,"ax" .Lfx13: xorl %edi,%edi jmp .Lft13 .previous _ASM_EXTABLE(.Lft1, dom_crash_sync_extable) _ASM_EXTABLE(.Lft2, compat_crash_page_fault) _ASM_EXTABLE(.Lft3, compat_crash_page_fault_4) _ASM_EXTABLE(.Lft4, dom_crash_sync_extable) _ASM_EXTABLE(.Lft5, compat_crash_page_fault_4) _ASM_EXTABLE(.Lft6, compat_crash_page_fault_8) _ASM_EXTABLE(.Lft7, compat_crash_page_fault) _ASM_EXTABLE(.Lft8, compat_crash_page_fault) _ASM_EXTABLE(.Lft9, compat_crash_page_fault_12) _ASM_EXTABLE(.Lft10, compat_crash_page_fault_8) _ASM_EXTABLE(.Lft11, compat_crash_page_fault_4) _ASM_EXTABLE(.Lft12, compat_crash_page_fault) _ASM_EXTABLE(.Lft13, .Lfx13) compat_crash_page_fault_12: addl $4,%esi compat_crash_page_fault_8: addl $4,%esi compat_crash_page_fault_4: addl $4,%esi compat_crash_page_fault: .Lft14: mov %edi,%fs movl %esi,%edi call show_page_walk jmp dom_crash_sync_extable .section .fixup,"ax" .Lfx14: xorl %edi,%edi jmp .Lft14 .previous _ASM_EXTABLE(.Lft14, .Lfx14) .section .rodata, "a", @progbits ENTRY(compat_hypercall_table) .quad compat_set_trap_table /* 0 */ .quad do_mmu_update .quad compat_set_gdt .quad do_stack_switch .quad compat_set_callbacks .quad do_fpu_taskswitch /* 5 */ .quad do_sched_op_compat .quad compat_platform_op .quad do_set_debugreg .quad do_get_debugreg .quad compat_update_descriptor /* 10 */ .quad compat_ni_hypercall .quad compat_memory_op .quad compat_multicall .quad compat_update_va_mapping .quad compat_set_timer_op /* 15 */ .quad do_event_channel_op_compat .quad compat_xen_version .quad do_console_io .quad compat_physdev_op_compat .quad compat_grant_table_op /* 20 */ .quad compat_vm_assist .quad compat_update_va_mapping_otherdomain .quad compat_iret .quad compat_vcpu_op .quad compat_ni_hypercall /* 25 */ .quad compat_mmuext_op .quad do_xsm_op .quad compat_nmi_op .quad compat_sched_op .quad compat_callback_op /* 30 */ .quad compat_xenoprof_op .quad do_event_channel_op .quad compat_physdev_op .quad do_hvm_op .quad do_sysctl /* 35 */ .quad do_domctl .quad compat_kexec_op .quad do_tmem_op .rept __HYPERVISOR_arch_0-((.-compat_hypercall_table)/8) .quad compat_ni_hypercall .endr .quad do_mca /* 48 */ .rept NR_hypercalls-((.-compat_hypercall_table)/8) .quad compat_ni_hypercall .endr ENTRY(compat_hypercall_args_table) .byte 1 /* compat_set_trap_table */ /* 0 */ .byte 4 /* compat_mmu_update */ .byte 2 /* compat_set_gdt */ .byte 2 /* compat_stack_switch */ .byte 4 /* compat_set_callbacks */ .byte 1 /* compat_fpu_taskswitch */ /* 5 */ .byte 2 /* compat_sched_op_compat */ .byte 1 /* compat_platform_op */ .byte 2 /* compat_set_debugreg */ .byte 1 /* compat_get_debugreg */ .byte 4 /* compat_update_descriptor */ /* 10 */ .byte 0 /* compat_ni_hypercall */ .byte 2 /* compat_memory_op */ .byte 2 /* compat_multicall */ .byte 4 /* compat_update_va_mapping */ .byte 2 /* compat_set_timer_op */ /* 15 */ .byte 1 /* compat_event_channel_op_compat */ .byte 2 /* compat_xen_version */ .byte 3 /* compat_console_io */ .byte 1 /* compat_physdev_op_compat */ .byte 3 /* compat_grant_table_op */ /* 20 */ .byte 2 /* compat_vm_assist */ .byte 5 /* compat_update_va_mapping_otherdomain */ .byte 0 /* compat_iret */ .byte 3 /* compat_vcpu_op */ .byte 0 /* compat_ni_hypercall */ /* 25 */ .byte 4 /* 
compat_mmuext_op */ .byte 1 /* do_xsm_op */ .byte 2 /* compat_nmi_op */ .byte 2 /* compat_sched_op */ .byte 2 /* compat_callback_op */ /* 30 */ .byte 2 /* compat_xenoprof_op */ .byte 2 /* compat_event_channel_op */ .byte 2 /* compat_physdev_op */ .byte 2 /* do_hvm_op */ .byte 1 /* do_sysctl */ /* 35 */ .byte 1 /* do_domctl */ .byte 2 /* compat_kexec_op */ .byte 1 /* do_tmem_op */ .rept __HYPERVISOR_arch_0-(.-compat_hypercall_args_table) .byte 0 /* compat_ni_hypercall */ .endr .byte 1 /* do_mca */ .rept NR_hypercalls-(.-compat_hypercall_args_table) .byte 0 /* compat_ni_hypercall */ .endr xen-4.4.0/xen/arch/x86/x86_64/compat/mm.c0000664000175000017500000002565612307313555015613 0ustar smbsmb#include #include #include #include #include #include int compat_set_gdt(XEN_GUEST_HANDLE_PARAM(uint) frame_list, unsigned int entries) { unsigned int i, nr_pages = (entries + 511) / 512; unsigned long frames[16]; long ret; /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */ if ( entries > FIRST_RESERVED_GDT_ENTRY ) return -EINVAL; if ( !guest_handle_okay(frame_list, nr_pages) ) return -EFAULT; for ( i = 0; i < nr_pages; ++i ) { unsigned int frame; if ( __copy_from_guest(&frame, frame_list, 1) ) return -EFAULT; frames[i] = frame; guest_handle_add_offset(frame_list, 1); } domain_lock(current->domain); if ( (ret = set_gdt(current, frames, entries)) == 0 ) flush_tlb_local(); domain_unlock(current->domain); return ret; } int compat_update_descriptor(u32 pa_lo, u32 pa_hi, u32 desc_lo, u32 desc_hi) { return do_update_descriptor(pa_lo | ((u64)pa_hi << 32), desc_lo | ((u64)desc_hi << 32)); } int compat_arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg) { struct compat_machphys_mfn_list xmml; l2_pgentry_t l2e; unsigned long v; compat_pfn_t mfn; unsigned int i; int rc = 0; switch ( op ) { case XENMEM_set_memory_map: { struct compat_foreign_memory_map cmp; struct xen_foreign_memory_map *nat = COMPAT_ARG_XLAT_VIRT_BASE; if ( copy_from_guest(&cmp, arg, 1) ) return -EFAULT; #define XLAT_memory_map_HNDL_buffer(_d_, _s_) \ guest_from_compat_handle((_d_)->buffer, (_s_)->buffer) XLAT_foreign_memory_map(nat, &cmp); #undef XLAT_memory_map_HNDL_buffer rc = arch_memory_op(op, guest_handle_from_ptr(nat, void)); break; } case XENMEM_memory_map: case XENMEM_machine_memory_map: { struct compat_memory_map cmp; struct xen_memory_map *nat = COMPAT_ARG_XLAT_VIRT_BASE; if ( copy_from_guest(&cmp, arg, 1) ) return -EFAULT; #define XLAT_memory_map_HNDL_buffer(_d_, _s_) \ guest_from_compat_handle((_d_)->buffer, (_s_)->buffer) XLAT_memory_map(nat, &cmp); #undef XLAT_memory_map_HNDL_buffer rc = arch_memory_op(op, guest_handle_from_ptr(nat, void)); if ( rc < 0 ) break; #define XLAT_memory_map_HNDL_buffer(_d_, _s_) ((void)0) XLAT_memory_map(&cmp, nat); #undef XLAT_memory_map_HNDL_buffer if ( __copy_to_guest(arg, &cmp, 1) ) rc = -EFAULT; break; } case XENMEM_set_pod_target: case XENMEM_get_pod_target: { struct compat_pod_target cmp; struct xen_pod_target *nat = COMPAT_ARG_XLAT_VIRT_BASE; if ( copy_from_guest(&cmp, arg, 1) ) return -EFAULT; XLAT_pod_target(nat, &cmp); rc = arch_memory_op(op, guest_handle_from_ptr(nat, void)); if ( rc < 0 ) break; if ( rc == __HYPERVISOR_memory_op ) hypercall_xlat_continuation(NULL, 0x2, nat, arg); XLAT_pod_target(&cmp, nat); if ( __copy_to_guest(arg, &cmp, 1) ) { if ( rc == __HYPERVISOR_memory_op ) hypercall_cancel_continuation(); rc = -EFAULT; } break; } case XENMEM_machphys_mapping: { struct domain *d = current->domain; struct compat_machphys_mapping mapping = { .v_start = 
MACH2PHYS_COMPAT_VIRT_START(d), .v_end = MACH2PHYS_COMPAT_VIRT_END, .max_mfn = MACH2PHYS_COMPAT_NR_ENTRIES(d) - 1 }; if ( copy_to_guest(arg, &mapping, 1) ) rc = -EFAULT; break; } case XENMEM_machphys_mfn_list: { unsigned long limit; compat_pfn_t last_mfn; if ( copy_from_guest(&xmml, arg, 1) ) return -EFAULT; limit = (unsigned long)(compat_machine_to_phys_mapping + max_page); if ( limit > RDWR_COMPAT_MPT_VIRT_END ) limit = RDWR_COMPAT_MPT_VIRT_END; for ( i = 0, v = RDWR_COMPAT_MPT_VIRT_START, last_mfn = 0; (i != xmml.max_extents) && (v < limit); i++, v += 1 << L2_PAGETABLE_SHIFT ) { l2e = compat_idle_pg_table_l2[l2_table_offset(v)]; if ( l2e_get_flags(l2e) & _PAGE_PRESENT ) mfn = l2e_get_pfn(l2e); else mfn = last_mfn; ASSERT(mfn); if ( copy_to_compat_offset(xmml.extent_start, i, &mfn, 1) ) return -EFAULT; last_mfn = mfn; } xmml.nr_extents = i; if ( __copy_to_guest(arg, &xmml, 1) ) rc = -EFAULT; break; } case XENMEM_get_sharing_freed_pages: return mem_sharing_get_nr_saved_mfns(); case XENMEM_get_sharing_shared_pages: return mem_sharing_get_nr_shared_mfns(); case XENMEM_paging_op: case XENMEM_access_op: { xen_mem_event_op_t meo; if ( copy_from_guest(&meo, arg, 1) ) return -EFAULT; rc = do_mem_event_op(op, meo.domain, (void *) &meo); if ( !rc && __copy_to_guest(arg, &meo, 1) ) return -EFAULT; break; } case XENMEM_sharing_op: { xen_mem_sharing_op_t mso; if ( copy_from_guest(&mso, arg, 1) ) return -EFAULT; if ( mso.op == XENMEM_sharing_op_audit ) return mem_sharing_audit(); rc = do_mem_event_op(op, mso.domain, (void *) &mso); if ( !rc && __copy_to_guest(arg, &mso, 1) ) return -EFAULT; break; } default: rc = -ENOSYS; break; } return rc; } int compat_update_va_mapping(unsigned int va, u32 lo, u32 hi, unsigned int flags) { return do_update_va_mapping(va, lo | ((u64)hi << 32), flags); } int compat_update_va_mapping_otherdomain(unsigned long va, u32 lo, u32 hi, unsigned long flags, domid_t domid) { return do_update_va_mapping_otherdomain(va, lo | ((u64)hi << 32), flags, domid); } DEFINE_XEN_GUEST_HANDLE(mmuext_op_compat_t); int compat_mmuext_op(XEN_GUEST_HANDLE_PARAM(mmuext_op_compat_t) cmp_uops, unsigned int count, XEN_GUEST_HANDLE_PARAM(uint) pdone, unsigned int foreigndom) { unsigned int i, preempt_mask; int rc = 0; XEN_GUEST_HANDLE_PARAM(mmuext_op_t) nat_ops; if ( unlikely(count == MMU_UPDATE_PREEMPTED) && likely(guest_handle_is_null(cmp_uops)) ) { set_xen_guest_handle(nat_ops, NULL); return do_mmuext_op(nat_ops, count, pdone, foreigndom); } preempt_mask = count & MMU_UPDATE_PREEMPTED; count ^= preempt_mask; if ( unlikely(!guest_handle_okay(cmp_uops, count)) ) return -EFAULT; set_xen_guest_handle(nat_ops, COMPAT_ARG_XLAT_VIRT_BASE); for ( ; count; count -= i ) { mmuext_op_t *nat_op = nat_ops.p; unsigned int limit = COMPAT_ARG_XLAT_SIZE / sizeof(*nat_op); int err; for ( i = 0; i < min(limit, count); ++i ) { mmuext_op_compat_t cmp_op; enum XLAT_mmuext_op_arg1 arg1; enum XLAT_mmuext_op_arg2 arg2; if ( unlikely(__copy_from_guest(&cmp_op, cmp_uops, 1) != 0) ) { rc = -EFAULT; break; } switch ( cmp_op.cmd ) { case MMUEXT_PIN_L1_TABLE: case MMUEXT_PIN_L2_TABLE: case MMUEXT_PIN_L3_TABLE: case MMUEXT_PIN_L4_TABLE: case MMUEXT_UNPIN_TABLE: case MMUEXT_NEW_BASEPTR: case MMUEXT_CLEAR_PAGE: case MMUEXT_COPY_PAGE: arg1 = XLAT_mmuext_op_arg1_mfn; break; default: arg1 = XLAT_mmuext_op_arg1_linear_addr; break; case MMUEXT_NEW_USER_BASEPTR: rc = -EINVAL; case MMUEXT_TLB_FLUSH_LOCAL: case MMUEXT_TLB_FLUSH_MULTI: case MMUEXT_TLB_FLUSH_ALL: case MMUEXT_FLUSH_CACHE: arg1 = -1; break; } if ( rc ) break; switch ( 
cmp_op.cmd ) { case MMUEXT_SET_LDT: arg2 = XLAT_mmuext_op_arg2_nr_ents; break; case MMUEXT_TLB_FLUSH_MULTI: case MMUEXT_INVLPG_MULTI: arg2 = XLAT_mmuext_op_arg2_vcpumask; break; case MMUEXT_COPY_PAGE: arg2 = XLAT_mmuext_op_arg2_src_mfn; break; default: arg2 = -1; break; } #define XLAT_mmuext_op_HNDL_arg2_vcpumask(_d_, _s_) \ guest_from_compat_handle((_d_)->arg2.vcpumask, (_s_)->arg2.vcpumask) XLAT_mmuext_op(nat_op, &cmp_op); #undef XLAT_mmuext_op_HNDL_arg2_vcpumask if ( rc || i >= limit ) break; guest_handle_add_offset(cmp_uops, 1); ++nat_op; } err = do_mmuext_op(nat_ops, i | preempt_mask, pdone, foreigndom); if ( err ) { BUILD_BUG_ON(__HYPERVISOR_mmuext_op <= 0); if ( err == __HYPERVISOR_mmuext_op ) { struct cpu_user_regs *regs = guest_cpu_user_regs(); struct mc_state *mcs = ¤t->mc_state; unsigned int arg1 = !test_bit(_MCSF_in_multicall, &mcs->flags) ? regs->ecx : mcs->call.args[1]; unsigned int left = arg1 & ~MMU_UPDATE_PREEMPTED; BUG_ON(left == arg1 && left != i); BUG_ON(left > count); guest_handle_add_offset(nat_ops, i - left); guest_handle_subtract_offset(cmp_uops, left); left = 1; if ( arg1 != MMU_UPDATE_PREEMPTED ) { BUG_ON(!hypercall_xlat_continuation(&left, 0x01, nat_ops, cmp_uops)); if ( !test_bit(_MCSF_in_multicall, &mcs->flags) ) regs->_ecx += count - i; else mcs->compat_call.args[1] += count - i; } else BUG_ON(hypercall_xlat_continuation(&left, 0)); BUG_ON(left != arg1); } else BUG_ON(err > 0); rc = err; } if ( rc ) break; /* Force do_mmuext_op() to not start counting from zero again. */ preempt_mask = MMU_UPDATE_PREEMPTED; } return rc; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/x86_64/compat/traps.c0000664000175000017500000002507312307313555016324 0ustar smbsmb#include #include #include #include void compat_show_guest_stack(struct vcpu *v, struct cpu_user_regs *regs, int debug_stack_lines) { unsigned int i, *stack, addr, mask = STACK_SIZE; stack = (unsigned int *)(unsigned long)regs->_esp; printk("Guest stack trace from esp=%08lx:\n ", (unsigned long)stack); if ( !__compat_access_ok(v->domain, stack, sizeof(*stack)) ) { printk("Guest-inaccessible memory.\n"); return; } if ( v != current ) { struct vcpu *vcpu; unsigned long mfn; ASSERT(guest_kernel_mode(v, regs)); mfn = read_cr3() >> PAGE_SHIFT; for_each_vcpu( v->domain, vcpu ) if ( pagetable_get_pfn(vcpu->arch.guest_table) == mfn ) break; if ( !vcpu ) { stack = do_page_walk(v, (unsigned long)stack); if ( (unsigned long)stack < PAGE_SIZE ) { printk("Inaccessible guest memory.\n"); return; } mask = PAGE_SIZE; } } for ( i = 0; i < debug_stack_lines * 8; i++ ) { if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask ) break; if ( __get_user(addr, stack) ) { if ( i != 0 ) printk("\n "); printk("Fault while accessing guest memory."); i = 1; break; } if ( (i != 0) && ((i % 8) == 0) ) printk("\n "); printk(" %08x", addr); stack++; } if ( mask == PAGE_SIZE ) { BUILD_BUG_ON(PAGE_SIZE == STACK_SIZE); unmap_domain_page(stack); } if ( i == 0 ) printk("Stack empty."); printk("\n"); } unsigned int compat_iret(void) { struct cpu_user_regs *regs = guest_cpu_user_regs(); struct vcpu *v = current; u32 eflags; /* Trim stack pointer to 32 bits. */ regs->rsp = (u32)regs->rsp; /* Restore EAX (clobbered by hypercall). */ if ( unlikely(__get_user(regs->_eax, (u32 *)regs->rsp)) ) goto exit_and_crash; /* Restore CS and EIP. 
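They sit in slots 1 and 2 of the guest's iret frame at regs->rsp, just above the EAX slot restored above.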
*/ if ( unlikely(__get_user(regs->_eip, (u32 *)regs->rsp + 1)) || unlikely(__get_user(regs->cs, (u32 *)regs->rsp + 2)) ) goto exit_and_crash; /* * Fix up and restore EFLAGS. We fix up in a local staging area * to avoid firing the BUG_ON(IOPL) check in arch_get_info_guest. */ if ( unlikely(__get_user(eflags, (u32 *)regs->rsp + 3)) ) goto exit_and_crash; regs->_eflags = (eflags & ~X86_EFLAGS_IOPL) | X86_EFLAGS_IF; if ( unlikely(eflags & X86_EFLAGS_VM) ) { /* * Cannot return to VM86 mode: inject a GP fault instead. Note that * the GP fault is reported on the first VM86 mode instruction, not on * the IRET (which is why we can simply leave the stack frame as-is * (except for perhaps having to copy it), which in turn seems better * than teaching create_bounce_frame() to needlessly deal with vm86 * mode frames). */ const struct trap_info *ti; u32 x, ksp = v->arch.pv_vcpu.kernel_sp - 40; unsigned int i; int rc = 0; gdprintk(XENLOG_ERR, "VM86 mode unavailable (ksp:%08X->%08X)\n", regs->_esp, ksp); if ( ksp < regs->_esp ) { for (i = 1; i < 10; ++i) { rc |= __get_user(x, (u32 *)regs->rsp + i); rc |= __put_user(x, (u32 *)(unsigned long)ksp + i); } } else if ( ksp > regs->_esp ) { for (i = 9; i > 0; ++i) { rc |= __get_user(x, (u32 *)regs->rsp + i); rc |= __put_user(x, (u32 *)(unsigned long)ksp + i); } } if ( rc ) goto exit_and_crash; regs->_esp = ksp; regs->ss = v->arch.pv_vcpu.kernel_ss; ti = &v->arch.pv_vcpu.trap_ctxt[TRAP_gp_fault]; if ( TI_GET_IF(ti) ) eflags &= ~X86_EFLAGS_IF; regs->_eflags &= ~(X86_EFLAGS_VM|X86_EFLAGS_RF| X86_EFLAGS_NT|X86_EFLAGS_TF); if ( unlikely(__put_user(0, (u32 *)regs->rsp)) ) goto exit_and_crash; regs->_eip = ti->address; regs->cs = ti->cs; } else if ( unlikely(ring_0(regs)) ) goto exit_and_crash; else if ( !ring_1(regs) ) { /* Return to ring 2/3: restore ESP and SS. */ if ( __get_user(regs->ss, (u32 *)regs->rsp + 5) || __get_user(regs->_esp, (u32 *)regs->rsp + 4)) goto exit_and_crash; } else regs->_esp += 16; /* Restore upcall mask from supplied EFLAGS.IF. */ vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF); async_exception_cleanup(v); /* * The hypercall exit path will overwrite EAX with this return * value. 
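* (compat_hypercall stores the handler's return value back into UREGS_rax, so the EAX recovered from the guest frame above is what the guest eventually sees.)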
*/ return regs->_eax; exit_and_crash: gdprintk(XENLOG_ERR, "Fatal error\n"); domain_crash(v->domain); return 0; } static long compat_register_guest_callback( struct compat_callback_register *reg) { long ret = 0; struct vcpu *v = current; fixup_guest_code_selector(v->domain, reg->address.cs); switch ( reg->type ) { case CALLBACKTYPE_event: v->arch.pv_vcpu.event_callback_cs = reg->address.cs; v->arch.pv_vcpu.event_callback_eip = reg->address.eip; break; case CALLBACKTYPE_failsafe: v->arch.pv_vcpu.failsafe_callback_cs = reg->address.cs; v->arch.pv_vcpu.failsafe_callback_eip = reg->address.eip; if ( reg->flags & CALLBACKF_mask_events ) set_bit(_VGCF_failsafe_disables_events, &v->arch.vgc_flags); else clear_bit(_VGCF_failsafe_disables_events, &v->arch.vgc_flags); break; case CALLBACKTYPE_syscall32: v->arch.pv_vcpu.syscall32_callback_cs = reg->address.cs; v->arch.pv_vcpu.syscall32_callback_eip = reg->address.eip; v->arch.pv_vcpu.syscall32_disables_events = (reg->flags & CALLBACKF_mask_events) != 0; break; case CALLBACKTYPE_sysenter: v->arch.pv_vcpu.sysenter_callback_cs = reg->address.cs; v->arch.pv_vcpu.sysenter_callback_eip = reg->address.eip; v->arch.pv_vcpu.sysenter_disables_events = (reg->flags & CALLBACKF_mask_events) != 0; break; case CALLBACKTYPE_nmi: ret = register_guest_nmi_callback(reg->address.eip); break; default: ret = -ENOSYS; break; } return ret; } static long compat_unregister_guest_callback( struct compat_callback_unregister *unreg) { long ret; switch ( unreg->type ) { case CALLBACKTYPE_event: case CALLBACKTYPE_failsafe: case CALLBACKTYPE_syscall32: case CALLBACKTYPE_sysenter: ret = -EINVAL; break; case CALLBACKTYPE_nmi: ret = unregister_guest_nmi_callback(); break; default: ret = -ENOSYS; break; } return ret; } long compat_callback_op(int cmd, XEN_GUEST_HANDLE(void) arg) { long ret; switch ( cmd ) { case CALLBACKOP_register: { struct compat_callback_register reg; ret = -EFAULT; if ( copy_from_guest(®, arg, 1) ) break; ret = compat_register_guest_callback(®); } break; case CALLBACKOP_unregister: { struct compat_callback_unregister unreg; ret = -EFAULT; if ( copy_from_guest(&unreg, arg, 1) ) break; ret = compat_unregister_guest_callback(&unreg); } break; default: ret = -EINVAL; break; } return ret; } long compat_set_callbacks(unsigned long event_selector, unsigned long event_address, unsigned long failsafe_selector, unsigned long failsafe_address) { struct compat_callback_register event = { .type = CALLBACKTYPE_event, .address = { .cs = event_selector, .eip = event_address } }; struct compat_callback_register failsafe = { .type = CALLBACKTYPE_failsafe, .address = { .cs = failsafe_selector, .eip = failsafe_address } }; compat_register_guest_callback(&event); compat_register_guest_callback(&failsafe); return 0; } DEFINE_XEN_GUEST_HANDLE(trap_info_compat_t); int compat_set_trap_table(XEN_GUEST_HANDLE(trap_info_compat_t) traps) { struct compat_trap_info cur; struct trap_info *dst = current->arch.pv_vcpu.trap_ctxt; long rc = 0; /* If no table is presented then clear the entire virtual IDT. 
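 * A guest-side usage sketch, for illustration only; the wrapper name and
 * the selector/entry symbols are hypothetical, not code in this file:
 *
 *     struct trap_info traps[] = {
 *         { .vector = 14,   .flags = 0, .cs = KERNEL_CS, .address = (unsigned long)pf_entry },
 *         { .vector = 0x80, .flags = 0, .cs = KERNEL_CS, .address = (unsigned long)int80_entry },
 *         { 0 }                             // address == 0 terminates the list
 *     };
 *     HYPERVISOR_set_trap_table(traps);     // install entries one at a time
 *     HYPERVISOR_set_trap_table(NULL);      // a null handle wipes the whole virtual IDT
 *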
*/ if ( guest_handle_is_null(traps) ) { memset(dst, 0, NR_VECTORS * sizeof(*dst)); return 0; } for ( ; ; ) { if ( hypercall_preempt_check() ) { rc = hypercall_create_continuation( __HYPERVISOR_set_trap_table, "h", traps); break; } if ( copy_from_guest(&cur, traps, 1) ) { rc = -EFAULT; break; } if ( cur.address == 0 ) break; fixup_guest_code_selector(current->domain, cur.cs); XLAT_trap_info(dst + cur.vector, &cur); if ( cur.vector == 0x80 ) init_int80_direct_trap(current); guest_handle_add_offset(traps, 1); } return rc; } static void hypercall_page_initialise_ring1_kernel(void *hypercall_page) { char *p; int i; /* Fill in all the transfer points with template machine code. */ for ( i = 0; i < (PAGE_SIZE / 32); i++ ) { if ( i == __HYPERVISOR_iret ) continue; p = (char *)(hypercall_page + (i * 32)); *(u8 *)(p+ 0) = 0xb8; /* mov $,%eax */ *(u32 *)(p+ 1) = i; *(u16 *)(p+ 5) = (HYPERCALL_VECTOR << 8) | 0xcd; /* int $xx */ *(u8 *)(p+ 7) = 0xc3; /* ret */ } /* * HYPERVISOR_iret is special because it doesn't return and expects a * special stack frame. Guests jump at this transfer point instead of * calling it. */ p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32)); *(u8 *)(p+ 0) = 0x50; /* push %eax */ *(u8 *)(p+ 1) = 0xb8; /* mov $__HYPERVISOR_iret,%eax */ *(u32 *)(p+ 2) = __HYPERVISOR_iret; *(u16 *)(p+ 6) = (HYPERCALL_VECTOR << 8) | 0xcd; /* int $xx */ } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/x86_64/gpr_switch.S0000664000175000017500000000351412307313555016035 0ustar smbsmb/* * GPR context switch between host and guest. * Used by IO-port-access emulation stub. * * Copyright (c) 2006, Novell, Inc. */ #include #include ENTRY(host_to_guest_gpr_switch) movq (%rsp), %rcx movq %rdi, (%rsp) movq UREGS_rdx(%rdi), %rdx pushq %rbx movq UREGS_rax(%rdi), %rax movq UREGS_rbx(%rdi), %rbx pushq %rbp movq UREGS_rsi(%rdi), %rsi movq UREGS_rbp(%rdi), %rbp pushq %r12 movq UREGS_r8(%rdi), %r8 movq UREGS_r12(%rdi), %r12 pushq %r13 movq UREGS_r9(%rdi), %r9 movq UREGS_r13(%rdi), %r13 pushq %r14 movq UREGS_r10(%rdi), %r10 movq UREGS_r14(%rdi), %r14 pushq %r15 movq UREGS_r11(%rdi), %r11 movq UREGS_r15(%rdi), %r15 pushq %rcx /* dummy push, filled by guest_to_host_gpr_switch pointer */ pushq %rcx leaq guest_to_host_gpr_switch(%rip),%rcx movq %rcx,8(%rsp) movq UREGS_rcx(%rdi), %rcx movq UREGS_rdi(%rdi), %rdi ret ENTRY(guest_to_host_gpr_switch) pushq %rdi movq 7*8(%rsp), %rdi movq %rax, UREGS_rax(%rdi) popq UREGS_rdi(%rdi) movq %r15, UREGS_r15(%rdi) movq %r11, UREGS_r11(%rdi) popq %r15 movq %r14, UREGS_r14(%rdi) movq %r10, UREGS_r10(%rdi) popq %r14 movq %r13, UREGS_r13(%rdi) movq %r9, UREGS_r9(%rdi) popq %r13 movq %r12, UREGS_r12(%rdi) movq %r8, UREGS_r8(%rdi) popq %r12 movq %rbp, UREGS_rbp(%rdi) movq %rsi, UREGS_rsi(%rdi) popq %rbp movq %rbx, UREGS_rbx(%rdi) movq %rdx, UREGS_rdx(%rdi) popq %rbx movq %rcx, UREGS_rcx(%rdi) popq %rcx ret xen-4.4.0/xen/arch/x86/x86_64/domain.c0000664000175000017500000000306012307313555015147 0ustar smbsmb/****************************************************************************** * arch/x86/x86_64/domain.c * */ #include #include #include #include #include #define xen_vcpu_get_physid vcpu_get_physid CHECK_vcpu_get_physid; #undef xen_vcpu_get_physid int arch_compat_vcpu_op( int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg) { int rc = -ENOSYS; switch ( cmd ) { case VCPUOP_register_runstate_memory_area: { struct compat_vcpu_register_runstate_memory_area area; struct 
compat_vcpu_runstate_info info; area.addr.p = 0; rc = -EFAULT; if ( copy_from_guest(&area.addr.h, arg, 1) ) break; if ( area.addr.h.c != area.addr.p || !compat_handle_okay(area.addr.h, 1) ) break; rc = 0; guest_from_compat_handle(v->runstate_guest.compat, area.addr.h); if ( v == current ) { XLAT_vcpu_runstate_info(&info, &v->runstate); } else { struct vcpu_runstate_info runstate; vcpu_runstate_get(v, &runstate); XLAT_vcpu_runstate_info(&info, &runstate); } __copy_to_guest(v->runstate_guest.compat, &info, 1); break; } case VCPUOP_get_physid: rc = arch_do_vcpu_op(cmd, v, arg); break; } return rc; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/x86_64/entry.S0000664000175000017500000006617612307313555015042 0ustar smbsmb/* * Hypercall and fault low-level handling routines. * * Copyright (c) 2005, K A Fraser */ #include #include #include #include #include #include #include #include ALIGN /* %rbx: struct vcpu */ switch_to_kernel: leaq VCPU_trap_bounce(%rbx),%rdx /* TB_eip = (32-bit syscall && syscall32_addr) ? * syscall32_addr : syscall_addr */ xor %eax,%eax cmpw $FLAT_USER_CS32,UREGS_cs(%rsp) cmoveq VCPU_syscall32_addr(%rbx),%rax testq %rax,%rax cmovzq VCPU_syscall_addr(%rbx),%rax movq %rax,TRAPBOUNCE_eip(%rdx) /* TB_flags = VGCF_syscall_disables_events ? TBF_INTERRUPT : 0 */ btl $_VGCF_syscall_disables_events,VCPU_guest_context_flags(%rbx) setc %cl leal (,%rcx,TBF_INTERRUPT),%ecx movb %cl,TRAPBOUNCE_flags(%rdx) call create_bounce_frame andl $~X86_EFLAGS_DF,UREGS_eflags(%rsp) jmp test_all_events /* %rbx: struct vcpu, interrupts disabled */ restore_all_guest: ASSERT_INTERRUPTS_DISABLED RESTORE_ALL testw $TRAP_syscall,4(%rsp) jz iret_exit_to_guest /* Don't use SYSRET path if the return address is not canonical. */ movq 8(%rsp),%rcx sarq $47,%rcx incl %ecx cmpl $1,%ecx ja .Lforce_iret cmpw $FLAT_USER_CS32,16(%rsp)# CS movq 8(%rsp),%rcx # RIP movq 24(%rsp),%r11 # RFLAGS movq 32(%rsp),%rsp # RSP je 1f sysretq 1: sysretl .Lforce_iret: /* Mimic SYSRET behavior. */ movq 8(%rsp),%rcx # RIP movq 24(%rsp),%r11 # RFLAGS ALIGN /* No special register assumptions. */ iret_exit_to_guest: addq $8,%rsp .Lft0: iretq .section .fixup,"ax" .Lfx0: sti SAVE_ALL movq UREGS_error_code(%rsp),%rsi movq %rsp,%rax andq $~0xf,%rsp pushq $__HYPERVISOR_DS # SS pushq %rax # RSP pushfq # RFLAGS pushq $__HYPERVISOR_CS # CS leaq .Ldf0(%rip),%rax pushq %rax # RIP pushq %rsi # error_code/entry_vector jmp handle_exception .Ldf0: GET_CURRENT(%rbx) jmp test_all_events failsafe_callback: GET_CURRENT(%rbx) leaq VCPU_trap_bounce(%rbx),%rdx movq VCPU_failsafe_addr(%rbx),%rax movq %rax,TRAPBOUNCE_eip(%rdx) movb $TBF_FAILSAFE,TRAPBOUNCE_flags(%rdx) bt $_VGCF_failsafe_disables_events,VCPU_guest_context_flags(%rbx) jnc 1f orb $TBF_INTERRUPT,TRAPBOUNCE_flags(%rdx) 1: call create_bounce_frame jmp test_all_events .previous _ASM_PRE_EXTABLE(.Lft0, .Lfx0) _ASM_EXTABLE(.Ldf0, failsafe_callback) ALIGN /* No special register assumptions. */ restore_all_xen: RESTORE_ALL adj=8 iretq /* * When entering SYSCALL from kernel mode: * %rax = hypercall vector * %rdi, %rsi, %rdx, %r10, %r8, %9 = hypercall arguments * %rcx = SYSCALL-saved %rip * NB. We must move %r10 to %rcx for C function-calling ABI. * * When entering SYSCALL from user mode: * Vector directly to the registered arch.syscall_addr. * * Initial work is done by per-CPU stack trampolines. 
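 * (Illustrative aside, not part of the original description: a 64-bit PV
 *  guest's hypercall stub ultimately issues something like
 *
 *      movl $__HYPERVISOR_xen_version, %eax
 *      movl $XENVER_version, %edi
 *      syscall
 *
 *  with up to six arguments in %rdi, %rsi, %rdx, %r10, %r8, %r9 as listed
 *  above; the exact stub bytes come from the 64-bit hypercall page setup,
 *  which lives outside this file.)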
At this point %rsp * has been initialised to point at the correct Xen stack, and %rsp, %rflags * and %cs have been saved. All other registers are still to be saved onto * the stack, starting with %rip, and an appropriate %ss must be saved into * the space left by the trampoline. */ ENTRY(syscall_enter) sti movl $FLAT_KERNEL_SS,24(%rsp) pushq %rcx pushq $0 movq 24(%rsp),%r11 /* Re-load user RFLAGS into %r11 before saving */ SAVE_VOLATILE TRAP_syscall GET_CURRENT(%rbx) movq VCPU_domain(%rbx),%rcx testb $1,DOMAIN_is_32bit_pv(%rcx) jnz compat_syscall testb $TF_kernel_mode,VCPU_thread_flags(%rbx) jz switch_to_kernel /*hypercall:*/ movq %r10,%rcx cmpq $NR_hypercalls,%rax jae bad_hypercall #ifndef NDEBUG /* Deliberately corrupt parameter regs not used by this hypercall. */ pushq %rdi; pushq %rsi; pushq %rdx; pushq %rcx; pushq %r8 ; pushq %r9 leaq hypercall_args_table(%rip),%r10 movq $6,%rcx sub (%r10,%rax,1),%cl movq %rsp,%rdi movl $0xDEADBEEF,%eax rep stosq popq %r9 ; popq %r8 ; popq %rcx; popq %rdx; popq %rsi; popq %rdi movq UREGS_rax(%rsp),%rax pushq %rax pushq UREGS_rip+8(%rsp) #define SHADOW_BYTES 16 /* Shadow EIP + shadow hypercall # */ #else #define SHADOW_BYTES 0 /* No on-stack shadow state */ #endif cmpb $0,tb_init_done(%rip) UNLIKELY_START(ne, trace) call __trace_hypercall_entry /* Restore the registers that __trace_hypercall_entry clobbered. */ movq UREGS_rax+SHADOW_BYTES(%rsp),%rax /* Hypercall # */ movq UREGS_rdi+SHADOW_BYTES(%rsp),%rdi /* Arg 1 */ movq UREGS_rsi+SHADOW_BYTES(%rsp),%rsi /* Arg 2 */ movq UREGS_rdx+SHADOW_BYTES(%rsp),%rdx /* Arg 3 */ movq UREGS_r10+SHADOW_BYTES(%rsp),%rcx /* Arg 4 */ movq UREGS_r8 +SHADOW_BYTES(%rsp),%r8 /* Arg 5 */ movq UREGS_r9 +SHADOW_BYTES(%rsp),%r9 /* Arg 6 */ #undef SHADOW_BYTES UNLIKELY_END(trace) leaq hypercall_table(%rip),%r10 PERFC_INCR(hypercalls, %rax, %rbx) callq *(%r10,%rax,8) #ifndef NDEBUG /* Deliberately corrupt parameter regs used by this hypercall. */ popq %r10 # Shadow RIP cmpq %r10,UREGS_rip+8(%rsp) popq %rcx # Shadow hypercall index jne skip_clobber /* If RIP has changed then don't clobber. 
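 * (A changed RIP means the handler created a continuation: the saved RIP
 * was wound back so the hypercall is re-issued, and the saved argument
 * registers were rewritten with the continuation arguments, so scribbling
 * over them here would corrupt the re-execution.)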
*/ leaq hypercall_args_table(%rip),%r10 movb (%r10,%rcx,1),%cl movl $0xDEADBEEF,%r10d cmpb $1,%cl; jb skip_clobber; movq %r10,UREGS_rdi(%rsp) cmpb $2,%cl; jb skip_clobber; movq %r10,UREGS_rsi(%rsp) cmpb $3,%cl; jb skip_clobber; movq %r10,UREGS_rdx(%rsp) cmpb $4,%cl; jb skip_clobber; movq %r10,UREGS_r10(%rsp) cmpb $5,%cl; jb skip_clobber; movq %r10,UREGS_r8(%rsp) cmpb $6,%cl; jb skip_clobber; movq %r10,UREGS_r9(%rsp) skip_clobber: #endif movq %rax,UREGS_rax(%rsp) # save the return value /* %rbx: struct vcpu */ test_all_events: ASSERT_NOT_IN_ATOMIC cli # tests must not race interrupts /*test_softirqs:*/ movl VCPU_processor(%rbx),%eax shll $IRQSTAT_shift,%eax leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx cmpl $0,(%rcx,%rax,1) jne process_softirqs testb $1,VCPU_mce_pending(%rbx) jnz process_mce .Ltest_guest_nmi: testb $1,VCPU_nmi_pending(%rbx) jnz process_nmi test_guest_events: movq VCPU_vcpu_info(%rbx),%rax movzwl VCPUINFO_upcall_pending(%rax),%eax decl %eax cmpl $0xfe,%eax ja restore_all_guest /*process_guest_events:*/ sti leaq VCPU_trap_bounce(%rbx),%rdx movq VCPU_event_addr(%rbx),%rax movq %rax,TRAPBOUNCE_eip(%rdx) movb $TBF_INTERRUPT,TRAPBOUNCE_flags(%rdx) call create_bounce_frame jmp test_all_events ALIGN /* %rbx: struct vcpu */ process_softirqs: sti SAVE_PRESERVED call do_softirq jmp test_all_events ALIGN /* %rbx: struct vcpu */ process_mce: testb $1 << VCPU_TRAP_MCE,VCPU_async_exception_mask(%rbx) jnz .Ltest_guest_nmi sti movb $0,VCPU_mce_pending(%rbx) call set_guest_machinecheck_trapbounce test %eax,%eax jz test_all_events movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the movb %dl,VCPU_mce_old_mask(%rbx) # iret hypercall orl $1 << VCPU_TRAP_MCE,%edx movb %dl,VCPU_async_exception_mask(%rbx) jmp process_trap ALIGN /* %rbx: struct vcpu */ process_nmi: testb $1 << VCPU_TRAP_NMI,VCPU_async_exception_mask(%rbx) jnz test_guest_events sti movb $0,VCPU_nmi_pending(%rbx) call set_guest_nmi_trapbounce test %eax,%eax jz test_all_events movzbl VCPU_async_exception_mask(%rbx),%edx # save mask for the movb %dl,VCPU_nmi_old_mask(%rbx) # iret hypercall orl $1 << VCPU_TRAP_NMI,%edx movb %dl,VCPU_async_exception_mask(%rbx) /* FALLTHROUGH */ process_trap: leaq VCPU_trap_bounce(%rbx),%rdx call create_bounce_frame jmp test_all_events bad_hypercall: movq $-ENOSYS,UREGS_rax(%rsp) jmp test_all_events ENTRY(sysenter_entry) sti pushq $FLAT_USER_SS pushq $0 pushfq GLOBAL(sysenter_eflags_saved) pushq $3 /* ring 3 null cs */ pushq $0 /* null rip */ pushq $0 SAVE_VOLATILE TRAP_syscall GET_CURRENT(%rbx) cmpb $0,VCPU_sysenter_disables_events(%rbx) movq VCPU_sysenter_addr(%rbx),%rax setne %cl testl $X86_EFLAGS_NT,UREGS_eflags(%rsp) leaq VCPU_trap_bounce(%rbx),%rdx UNLIKELY_START(nz, sysenter_nt_set) pushfq andl $~X86_EFLAGS_NT,(%rsp) popfq xorl %eax,%eax UNLIKELY_END(sysenter_nt_set) testq %rax,%rax leal (,%rcx,TBF_INTERRUPT),%ecx UNLIKELY_START(z, sysenter_gpf) movq VCPU_trap_ctxt(%rbx),%rsi SAVE_PRESERVED movl $TRAP_gp_fault,UREGS_entry_vector(%rsp) movl %eax,TRAPBOUNCE_error_code(%rdx) movq TRAP_gp_fault * TRAPINFO_sizeof + TRAPINFO_eip(%rsi),%rax testb $4,TRAP_gp_fault * TRAPINFO_sizeof + TRAPINFO_flags(%rsi) setnz %cl leal TBF_EXCEPTION|TBF_EXCEPTION_ERRCODE(,%rcx,TBF_INTERRUPT),%ecx UNLIKELY_END(sysenter_gpf) movq VCPU_domain(%rbx),%rdi movq %rax,TRAPBOUNCE_eip(%rdx) movb %cl,TRAPBOUNCE_flags(%rdx) testb $1,DOMAIN_is_32bit_pv(%rdi) jnz compat_sysenter jmp .Lbounce_exception ENTRY(int80_direct_trap) pushq $0 SAVE_VOLATILE 0x80 cmpb $0,untrusted_msi(%rip) UNLIKELY_START(ne, msi_check) movl 
$0x80,%edi call check_for_unexpected_msi LOAD_C_CLOBBERED UNLIKELY_END(msi_check) GET_CURRENT(%rbx) /* Check that the callback is non-null. */ leaq VCPU_int80_bounce(%rbx),%rdx cmpb $0,TRAPBOUNCE_flags(%rdx) jz int80_slow_path movq VCPU_domain(%rbx),%rax testb $1,DOMAIN_is_32bit_pv(%rax) jnz compat_int80_direct_trap call create_bounce_frame jmp test_all_events int80_slow_path: /* * Setup entry vector and error code as if this was a GPF caused by an * IDT entry with DPL==0. */ movl $((0x80 << 3) | 0x2),UREGS_error_code(%rsp) SAVE_PRESERVED movl $TRAP_gp_fault,UREGS_entry_vector(%rsp) /* A GPF wouldn't have incremented the instruction pointer. */ subq $2,UREGS_rip(%rsp) jmp handle_exception_saved /* CREATE A BASIC EXCEPTION FRAME ON GUEST OS STACK: */ /* { RCX, R11, [DS-GS,] [CR2,] [ERRCODE,] RIP, CS, RFLAGS, RSP, SS } */ /* %rdx: trap_bounce, %rbx: struct vcpu */ /* On return only %rbx and %rdx are guaranteed non-clobbered. */ create_bounce_frame: ASSERT_INTERRUPTS_ENABLED testb $TF_kernel_mode,VCPU_thread_flags(%rbx) jnz 1f /* Push new frame at registered guest-OS stack base. */ pushq %rdx movq %rbx,%rdi call toggle_guest_mode popq %rdx movq VCPU_kernel_sp(%rbx),%rsi jmp 2f 1: /* In kernel context already: push new frame at existing %rsp. */ movq UREGS_rsp+8(%rsp),%rsi andb $0xfc,UREGS_cs+8(%rsp) # Indicate kernel context to guest. 2: andq $~0xf,%rsi # Stack frames are 16-byte aligned. movq $HYPERVISOR_VIRT_START,%rax cmpq %rax,%rsi movq $HYPERVISOR_VIRT_END+60,%rax sbb %ecx,%ecx # In +ve address space? Then okay. cmpq %rax,%rsi adc %ecx,%ecx # Above Xen private area? Then okay. UNLIKELY_START(g, create_bounce_frame_bad_sp) lea UNLIKELY_DISPATCH_LABEL(create_bounce_frame_bad_sp)(%rip), %rdi jmp asm_domain_crash_synchronous /* Does not return */ __UNLIKELY_END(create_bounce_frame_bad_sp) movb TRAPBOUNCE_flags(%rdx),%cl subq $40,%rsi movq UREGS_ss+8(%rsp),%rax .Lft2: movq %rax,32(%rsi) # SS movq UREGS_rsp+8(%rsp),%rax .Lft3: movq %rax,24(%rsi) # RSP movq VCPU_vcpu_info(%rbx),%rax pushq VCPUINFO_upcall_mask(%rax) testb $TBF_INTERRUPT,%cl setnz %ch # TBF_INTERRUPT -> set upcall mask orb %ch,VCPUINFO_upcall_mask(%rax) popq %rax shlq $32,%rax # Bits 32-39: saved_upcall_mask movw UREGS_cs+8(%rsp),%ax # Bits 0-15: CS .Lft4: movq %rax,8(%rsi) # CS / saved_upcall_mask shrq $32,%rax testb $0xFF,%al # Bits 0-7: saved_upcall_mask setz %ch # %ch == !saved_upcall_mask movl UREGS_eflags+8(%rsp),%eax andl $~X86_EFLAGS_IF,%eax addb %ch,%ch # Bit 9 (EFLAGS.IF) orb %ch,%ah # Fold EFLAGS.IF into %eax .Lft5: movq %rax,16(%rsi) # RFLAGS movq UREGS_rip+8(%rsp),%rax .Lft6: movq %rax,(%rsi) # RIP testb $TBF_EXCEPTION_ERRCODE,%cl jz 1f subq $8,%rsi movl TRAPBOUNCE_error_code(%rdx),%eax .Lft7: movq %rax,(%rsi) # ERROR CODE 1: testb $TBF_FAILSAFE,%cl UNLIKELY_START(nz, bounce_failsafe) subq $32,%rsi movl %gs,%eax .Lft8: movq %rax,24(%rsi) # GS movl %fs,%eax .Lft9: movq %rax,16(%rsi) # FS movl %es,%eax .Lft10: movq %rax,8(%rsi) # ES movl %ds,%eax .Lft11: movq %rax,(%rsi) # DS UNLIKELY_END(bounce_failsafe) subq $16,%rsi movq UREGS_r11+8(%rsp),%rax .Lft12: movq %rax,8(%rsi) # R11 movq UREGS_rcx+8(%rsp),%rax .Lft13: movq %rax,(%rsi) # RCX /* Rewrite our stack frame and return to guest-OS mode. */ /* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */ /* Also clear AC: alignment checks shouldn't trigger in kernel mode. 
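 * A rough C view of the 64-bit word stored at 8(%rsi) above, purely for
 * illustration (the names are ours, not Xen's):
 *
 *     union bounce_cs_slot {
 *         uint64_t raw;
 *         struct {
 *             uint64_t cs:16;                 // bits 0-15:  guest CS
 *             uint64_t unused:16;
 *             uint64_t saved_upcall_mask:8;   // bits 32-39: event mask at entry
 *         };
 *     };
 *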
*/ orl $TRAP_syscall,UREGS_entry_vector+8(%rsp) andl $~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|\ X86_EFLAGS_NT|X86_EFLAGS_TF),UREGS_eflags+8(%rsp) movq $FLAT_KERNEL_SS,UREGS_ss+8(%rsp) movq %rsi,UREGS_rsp+8(%rsp) movq $FLAT_KERNEL_CS,UREGS_cs+8(%rsp) movq TRAPBOUNCE_eip(%rdx),%rax testq %rax,%rax UNLIKELY_START(z, create_bounce_frame_bad_bounce_ip) lea UNLIKELY_DISPATCH_LABEL(create_bounce_frame_bad_bounce_ip)(%rip), %rdi jmp asm_domain_crash_synchronous /* Does not return */ __UNLIKELY_END(create_bounce_frame_bad_bounce_ip) movq %rax,UREGS_rip+8(%rsp) ret _ASM_EXTABLE(.Lft2, dom_crash_sync_extable) _ASM_EXTABLE(.Lft3, dom_crash_sync_extable) _ASM_EXTABLE(.Lft4, dom_crash_sync_extable) _ASM_EXTABLE(.Lft5, dom_crash_sync_extable) _ASM_EXTABLE(.Lft6, dom_crash_sync_extable) _ASM_EXTABLE(.Lft7, dom_crash_sync_extable) _ASM_EXTABLE(.Lft8, dom_crash_sync_extable) _ASM_EXTABLE(.Lft9, dom_crash_sync_extable) _ASM_EXTABLE(.Lft10, dom_crash_sync_extable) _ASM_EXTABLE(.Lft11, dom_crash_sync_extable) _ASM_EXTABLE(.Lft12, dom_crash_sync_extable) _ASM_EXTABLE(.Lft13, dom_crash_sync_extable) ENTRY(dom_crash_sync_extable) # Get out of the guest-save area of the stack. GET_STACK_BASE(%rax) leaq STACK_CPUINFO_FIELD(guest_cpu_user_regs)(%rax),%rsp # create_bounce_frame() temporarily clobbers CS.RPL. Fix up. __GET_CURRENT(%rax) movq VCPU_domain(%rax),%rax testb $1,DOMAIN_is_32bit_pv(%rax) setz %al leal (%rax,%rax,2),%eax orb %al,UREGS_cs(%rsp) xorl %edi,%edi jmp asm_domain_crash_synchronous /* Does not return */ /* No special register assumptions. */ ENTRY(ret_from_intr) GET_CURRENT(%rbx) testb $3,UREGS_cs(%rsp) jz restore_all_xen movq VCPU_domain(%rbx),%rax testb $1,DOMAIN_is_32bit_pv(%rax) jz test_all_events jmp compat_test_all_events ENTRY(page_fault) movl $TRAP_page_fault,4(%rsp) /* No special register assumptions. */ GLOBAL(handle_exception) SAVE_ALL handle_exception_saved: testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%rsp) jz exception_with_ints_disabled sti 1: movq %rsp,%rdi movzbl UREGS_entry_vector(%rsp),%eax leaq exception_table(%rip),%rdx GET_CURRENT(%rbx) PERFC_INCR(exceptions, %rax, %rbx) callq *(%rdx,%rax,8) testb $3,UREGS_cs(%rsp) jz restore_all_xen leaq VCPU_trap_bounce(%rbx),%rdx movq VCPU_domain(%rbx),%rax testb $1,DOMAIN_is_32bit_pv(%rax) jnz compat_post_handle_exception testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) jz test_all_events .Lbounce_exception: call create_bounce_frame movb $0,TRAPBOUNCE_flags(%rdx) jmp test_all_events /* No special register assumptions. */ exception_with_ints_disabled: testb $3,UREGS_cs(%rsp) # interrupts disabled outside Xen? jnz FATAL_exception_with_ints_disabled movq %rsp,%rdi call search_pre_exception_table testq %rax,%rax # no fixup code for faulting EIP? jz 1b movq %rax,UREGS_rip(%rsp) subq $8,UREGS_rsp(%rsp) # add ec/ev to previous stack frame testb $15,UREGS_rsp(%rsp) # return %rsp is now aligned? jz 1f # then there is a pad quadword already movq %rsp,%rsi subq $8,%rsp movq %rsp,%rdi movq $UREGS_kernel_sizeof/8,%rcx rep; movsq # make room for ec/ev 1: movq UREGS_error_code(%rsp),%rax # ec/ev movq %rax,UREGS_kernel_sizeof(%rsp) jmp restore_all_xen # return to fixup code /* No special register assumptions. 
*/ FATAL_exception_with_ints_disabled: movzbl UREGS_entry_vector(%rsp),%edi movq %rsp,%rsi call fatal_trap ud2 ENTRY(divide_error) pushq $0 movl $TRAP_divide_error,4(%rsp) jmp handle_exception ENTRY(coprocessor_error) pushq $0 movl $TRAP_copro_error,4(%rsp) jmp handle_exception ENTRY(simd_coprocessor_error) pushq $0 movl $TRAP_simd_error,4(%rsp) jmp handle_exception ENTRY(device_not_available) pushq $0 movl $TRAP_no_device,4(%rsp) jmp handle_exception ENTRY(debug) pushq $0 movl $TRAP_debug,4(%rsp) jmp handle_exception ENTRY(int3) pushq $0 movl $TRAP_int3,4(%rsp) jmp handle_exception ENTRY(overflow) pushq $0 movl $TRAP_overflow,4(%rsp) jmp handle_exception ENTRY(bounds) pushq $0 movl $TRAP_bounds,4(%rsp) jmp handle_exception ENTRY(invalid_op) pushq $0 movl $TRAP_invalid_op,4(%rsp) jmp handle_exception ENTRY(coprocessor_segment_overrun) pushq $0 movl $TRAP_copro_seg,4(%rsp) jmp handle_exception ENTRY(invalid_TSS) movl $TRAP_invalid_tss,4(%rsp) jmp handle_exception ENTRY(segment_not_present) movl $TRAP_no_segment,4(%rsp) jmp handle_exception ENTRY(stack_segment) movl $TRAP_stack_error,4(%rsp) jmp handle_exception ENTRY(general_protection) movl $TRAP_gp_fault,4(%rsp) jmp handle_exception ENTRY(alignment_check) movl $TRAP_alignment_check,4(%rsp) jmp handle_exception ENTRY(spurious_interrupt_bug) pushq $0 movl $TRAP_spurious_int,4(%rsp) jmp handle_exception ENTRY(double_fault) movl $TRAP_double_fault,4(%rsp) SAVE_ALL movq %rsp,%rdi call do_double_fault ud2 .pushsection .init.text, "ax", @progbits ENTRY(early_page_fault) SAVE_ALL movq %rsp,%rdi call do_early_page_fault jmp restore_all_xen .popsection ENTRY(nmi) pushq $0 movl $TRAP_nmi,4(%rsp) handle_ist_exception: SAVE_ALL testb $3,UREGS_cs(%rsp) jz 1f /* Interrupted guest context. Copy the context to stack bottom. */ GET_CPUINFO_FIELD(guest_cpu_user_regs,%rdi) movq %rsp,%rsi movl $UREGS_kernel_sizeof/8,%ecx movq %rdi,%rsp rep movsq 1: movq %rsp,%rdi movzbl UREGS_entry_vector(%rsp),%eax leaq exception_table(%rip),%rdx callq *(%rdx,%rax,8) cmpb $TRAP_nmi,UREGS_entry_vector(%rsp) jne ret_from_intr /* We want to get straight to the IRET on the NMI exit path. */ testb $3,UREGS_cs(%rsp) jz restore_all_xen GET_CURRENT(%rbx) /* Send an IPI to ourselves to cover for the lack of event checking. */ movl VCPU_processor(%rbx),%eax shll $IRQSTAT_shift,%eax leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx cmpl $0,(%rcx,%rax,1) je 1f movl $EVENT_CHECK_VECTOR,%edi call send_IPI_self 1: movq VCPU_domain(%rbx),%rax cmpb $0,DOMAIN_is_32bit_pv(%rax) je restore_all_guest jmp compat_restore_all_guest ENTRY(nmi_crash) pushq $0 movl $TRAP_nmi,4(%rsp) SAVE_ALL movq %rsp,%rdi callq do_nmi_crash /* Does not return */ ud2 ENTRY(machine_check) pushq $0 movl $TRAP_machine_check,4(%rsp) jmp handle_ist_exception /* Enable NMIs. No special register assumptions. Only %rax is not preserved. */ ENTRY(enable_nmis) movq %rsp, %rax /* Grab RSP before pushing */ /* Set up stack frame */ pushq $0 /* SS */ pushq %rax /* RSP */ pushfq /* RFLAGS */ pushq $__HYPERVISOR_CS /* CS */ leaq 1f(%rip),%rax pushq %rax /* RIP */ iretq /* Disable the hardware NMI latch */ 1: retq /* No op trap handler. Required for kexec crash path. 
*/ GLOBAL(trap_nop) iretq .section .rodata, "a", @progbits ENTRY(exception_table) .quad do_divide_error .quad do_debug .quad do_nmi .quad do_int3 .quad do_overflow .quad do_bounds .quad do_invalid_op .quad do_device_not_available .quad 0 # double_fault .quad do_coprocessor_segment_overrun .quad do_invalid_TSS .quad do_segment_not_present .quad do_stack_segment .quad do_general_protection .quad do_page_fault .quad do_spurious_interrupt_bug .quad do_coprocessor_error .quad do_alignment_check .quad do_machine_check .quad do_simd_coprocessor_error ENTRY(hypercall_table) .quad do_set_trap_table /* 0 */ .quad do_mmu_update .quad do_set_gdt .quad do_stack_switch .quad do_set_callbacks .quad do_fpu_taskswitch /* 5 */ .quad do_sched_op_compat .quad do_platform_op .quad do_set_debugreg .quad do_get_debugreg .quad do_update_descriptor /* 10 */ .quad do_ni_hypercall .quad do_memory_op .quad do_multicall .quad do_update_va_mapping .quad do_set_timer_op /* 15 */ .quad do_event_channel_op_compat .quad do_xen_version .quad do_console_io .quad do_physdev_op_compat .quad do_grant_table_op /* 20 */ .quad do_vm_assist .quad do_update_va_mapping_otherdomain .quad do_iret .quad do_vcpu_op .quad do_set_segment_base /* 25 */ .quad do_mmuext_op .quad do_xsm_op .quad do_nmi_op .quad do_sched_op .quad do_callback_op /* 30 */ .quad do_xenoprof_op .quad do_event_channel_op .quad do_physdev_op .quad do_hvm_op .quad do_sysctl /* 35 */ .quad do_domctl .quad do_kexec_op .quad do_tmem_op .rept __HYPERVISOR_arch_0-((.-hypercall_table)/8) .quad do_ni_hypercall .endr .quad do_mca /* 48 */ .rept NR_hypercalls-((.-hypercall_table)/8) .quad do_ni_hypercall .endr ENTRY(hypercall_args_table) .byte 1 /* do_set_trap_table */ /* 0 */ .byte 4 /* do_mmu_update */ .byte 2 /* do_set_gdt */ .byte 2 /* do_stack_switch */ .byte 3 /* do_set_callbacks */ .byte 1 /* do_fpu_taskswitch */ /* 5 */ .byte 2 /* do_sched_op_compat */ .byte 1 /* do_platform_op */ .byte 2 /* do_set_debugreg */ .byte 1 /* do_get_debugreg */ .byte 2 /* do_update_descriptor */ /* 10 */ .byte 0 /* do_ni_hypercall */ .byte 2 /* do_memory_op */ .byte 2 /* do_multicall */ .byte 3 /* do_update_va_mapping */ .byte 1 /* do_set_timer_op */ /* 15 */ .byte 1 /* do_event_channel_op_compat */ .byte 2 /* do_xen_version */ .byte 3 /* do_console_io */ .byte 1 /* do_physdev_op_compat */ .byte 3 /* do_grant_table_op */ /* 20 */ .byte 2 /* do_vm_assist */ .byte 4 /* do_update_va_mapping_otherdomain */ .byte 0 /* do_iret */ .byte 3 /* do_vcpu_op */ .byte 2 /* do_set_segment_base */ /* 25 */ .byte 4 /* do_mmuext_op */ .byte 1 /* do_xsm_op */ .byte 2 /* do_nmi_op */ .byte 2 /* do_sched_op */ .byte 2 /* do_callback_op */ /* 30 */ .byte 2 /* do_xenoprof_op */ .byte 2 /* do_event_channel_op */ .byte 2 /* do_physdev_op */ .byte 2 /* do_hvm_op */ .byte 1 /* do_sysctl */ /* 35 */ .byte 1 /* do_domctl */ .byte 2 /* do_kexec */ .byte 1 /* do_tmem_op */ .rept __HYPERVISOR_arch_0-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr .byte 1 /* do_mca */ /* 48 */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr xen-4.4.0/xen/arch/x86/x86_64/cpu_idle.c0000664000175000017500000000760112307313555015471 0ustar smbsmb/****************************************************************************** * cpu_idle.c -- adapt x86/acpi/cpu_idle.c to compat guest. 
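 * (Orientation sketch, ours: a 32-bit dom0 hands us a compat_processor_power
 *  whose embedded handles are 32 bits wide, so the code below rebuilds a
 *  native xen_processor_power on a translation scratch page before calling
 *  the common code, roughly
 *
 *      xen_power = xlat_malloc_array(xlat_page_current,
 *                                    struct xen_processor_power, 1);
 *      XLAT_processor_power(xen_power, power);   // widens handles/pointers
 *      return set_cx_pminfo(cpu, xen_power);
 *
 *  where xlat_malloc() is a bump allocator over that page, rounding sizes
 *  up to 64 bytes and returning NULL once the page is exhausted.)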
* * Copyright (C) 2007, 2008 Intel Corporation * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #define __XEN_TOOLS__ /* for using get_xen_guest_handle macro */ #include #include #include #include #include #include CHECK_processor_csd; DEFINE_XEN_GUEST_HANDLE(compat_processor_csd_t); DEFINE_XEN_GUEST_HANDLE(compat_processor_cx_t); void *xlat_malloc(unsigned long *xlat_page_current, size_t size) { void *ret; /* normalize size to be 64 * n */ size = (size + 0x3fUL) & ~0x3fUL; if ( unlikely(size > xlat_page_left_size(*xlat_page_current)) ) return NULL; ret = (void *) *xlat_page_current; *xlat_page_current += size; return ret; } static int copy_from_compat_state(xen_processor_cx_t *xen_state, compat_processor_cx_t *state) { #define XLAT_processor_cx_HNDL_dp(_d_, _s_) do { \ XEN_GUEST_HANDLE(compat_processor_csd_t) dps; \ XEN_GUEST_HANDLE_PARAM(xen_processor_csd_t) dps_param; \ if ( unlikely(!compat_handle_okay((_s_)->dp, (_s_)->dpcnt)) ) \ return -EFAULT; \ guest_from_compat_handle(dps, (_s_)->dp); \ dps_param = guest_handle_cast(dps, xen_processor_csd_t); \ (_d_)->dp = guest_handle_from_param(dps_param, xen_processor_csd_t); \ } while (0) XLAT_processor_cx(xen_state, state); #undef XLAT_processor_cx_HNDL_dp return 0; } long compat_set_cx_pminfo(uint32_t cpu, struct compat_processor_power *power) { struct xen_processor_power *xen_power; unsigned long xlat_page_current; xlat_malloc_init(xlat_page_current); xen_power = xlat_malloc_array(xlat_page_current, struct xen_processor_power, 1); if ( unlikely(xen_power == NULL) ) return -EFAULT; #define XLAT_processor_power_HNDL_states(_d_, _s_) do { \ xen_processor_cx_t *xen_states = NULL; \ \ if ( likely((_s_)->count > 0) ) \ { \ XEN_GUEST_HANDLE(compat_processor_cx_t) states; \ compat_processor_cx_t state; \ int i; \ \ xen_states = xlat_malloc_array(xlat_page_current, \ xen_processor_cx_t, (_s_)->count); \ if ( unlikely(xen_states == NULL) ) \ return -EFAULT; \ \ if ( unlikely(!compat_handle_okay((_s_)->states, (_s_)->count)) ) \ return -EFAULT; \ guest_from_compat_handle(states, (_s_)->states); \ \ for ( i = 0; i < _s_->count; i++ ) \ { \ if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) ) \ return -EFAULT; \ if ( unlikely(copy_from_compat_state(&xen_states[i], &state)) ) \ return -EFAULT; \ } \ } \ \ set_xen_guest_handle((_d_)->states, xen_states); \ } while (0) XLAT_processor_power(xen_power, power); #undef XLAT_processor_power_HNDL_states return set_cx_pminfo(cpu, xen_power); } xen-4.4.0/xen/arch/x86/x86_64/mmconf-fam10h.c0000664000175000017500000001123512307313555016234 0ustar smbsmb/* * AMD Family 10h mmconfig enablement (taken from Linux 2.6.36) */ #include #include #include #include #include #include #include #include #include #include 
#include "mmconfig.h" struct pci_hostbridge_probe { u32 bus; u32 slot; u32 vendor; u32 device; }; static u64 __cpuinitdata fam10h_pci_mmconf_base; static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, }; #define UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT) #define MASK (~(UNIT - 1)) #define SIZE (UNIT << 8) /* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ #define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) #define BASE_VALID(b) ((b) + SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40)) static void __init get_fam10h_pci_mmconf_base(void) { unsigned int i, j, bus, slot, hi_mmio_num; u32 address; u64 val, tom2, start, end; struct range { u64 start, end; } range[8]; for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { u32 id; u16 device; u16 vendor; bus = pci_probes[i].bus; slot = pci_probes[i].slot; id = pci_conf_read32(0, bus, slot, 0, PCI_VENDOR_ID); vendor = id & 0xffff; device = (id>>16) & 0xffff; if (pci_probes[i].vendor == vendor && pci_probes[i].device == device) break; } if (i >= ARRAY_SIZE(pci_probes)) return; /* SYS_CFG */ address = MSR_K8_SYSCFG; rdmsrl(address, val); /* TOP_MEM2 is not enabled? */ if (!(val & (1<<21))) { tom2 = 1ULL << 32; } else { /* TOP_MEM2 */ address = MSR_K8_TOP_MEM2; rdmsrl(address, val); tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); } /* * need to check if the range is in the high mmio range that is * above 4G */ for (hi_mmio_num = i = 0; i < 8; i++) { val = pci_conf_read32(0, bus, slot, 1, 0x80 + (i << 3)); if (!(val & 3)) continue; start = (val & 0xffffff00) << 8; /* 39:16 on 31:8*/ val = pci_conf_read32(0, bus, slot, 1, 0x84 + (i << 3)); end = ((val & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/ if (end < tom2) continue; for (j = hi_mmio_num; j; --j) { if (range[j - 1].start < start) break; range[j] = range[j - 1]; } range[j].start = start; range[j].end = end; hi_mmio_num++; } start = FAM10H_PCI_MMCONF_BASE; if (start <= tom2) start = (tom2 + 2 * UNIT - 1) & MASK; if (!hi_mmio_num) goto out; if (range[hi_mmio_num - 1].end < start) goto out; if (range[0].start > start + SIZE) goto out; /* need to find one window */ start = (range[0].start & MASK) - UNIT; if (start > tom2 && BASE_VALID(start)) goto out; start = (range[hi_mmio_num - 1].end + UNIT) & MASK; if (BASE_VALID(start)) goto out; /* need to find window between ranges */ for (i = 1; i < hi_mmio_num; i++) { start = (range[i - 1].end + UNIT) & MASK; end = range[i].start & MASK; if (end >= start + SIZE && BASE_VALID(start)) goto out; } return; out: if (e820_add_range(&e820, start, start + SIZE, E820_RESERVED)) fam10h_pci_mmconf_base = start; } void __cpuinit fam10h_check_enable_mmcfg(void) { u64 val; bool_t print = opt_cpu_info; if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) return; rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, val); /* try to make sure that AP's setting is identical to BSP setting */ if (val & FAM10H_MMIO_CONF_ENABLE) { u64 base = val & MASK; if (!fam10h_pci_mmconf_base) { fam10h_pci_mmconf_base = base; return; } if (fam10h_pci_mmconf_base == base) return; } /* * if it is not enabled, try to enable it and assume only one segment * with 256 buses */ /* only try to get setting from BSP */ if (!fam10h_pci_mmconf_base) { get_fam10h_pci_mmconf_base(); print = 1; } if (!fam10h_pci_mmconf_base) { pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; return; } if (print) printk(KERN_INFO "Enable MMCONFIG on AMD Fam10h at %"PRIx64"\n", fam10h_pci_mmconf_base); val &= ~((FAM10H_MMIO_CONF_BASE_MASK< * * This source code is licensed 
under the GNU General Public License, * Version 2. See the file COPYING for more details. */ #include #include #include #include #include #include .text .align PAGE_SIZE .code64 ENTRY(kexec_reloc) /* %rdi - code page maddr */ /* %rsi - page table maddr */ /* %rdx - indirection page maddr */ /* %rcx - entry maddr (%rbp) */ /* %r8 - flags */ movq %rcx, %rbp /* Setup stack. */ leaq (reloc_stack - kexec_reloc)(%rdi), %rsp /* Load reloc page table. */ movq %rsi, %cr3 /* Jump to identity mapped code. */ leaq (identity_mapped - kexec_reloc)(%rdi), %rax jmpq *%rax identity_mapped: /* * Set cr0 to a known state: * - Paging enabled * - Alignment check disabled * - Write protect disabled * - No task switch * - Don't do FP software emulation. * - Protected mode enabled */ movq %cr0, %rax andl $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax orl $(X86_CR0_PG | X86_CR0_PE), %eax movq %rax, %cr0 /* * Set cr4 to a known state: * - physical address extension enabled */ movl $X86_CR4_PAE, %eax movq %rax, %cr4 movq %rdx, %rdi call relocate_pages /* Need to switch to 32-bit mode? */ testq $KEXEC_RELOC_FLAG_COMPAT, %r8 jnz call_32_bit call_64_bit: /* Call the image entry point. This should never return. */ callq *%rbp ud2 call_32_bit: /* Setup IDT. */ lidt compat_mode_idt(%rip) /* Load compat GDT. */ leaq compat_mode_gdt(%rip), %rax movq %rax, (compat_mode_gdt_desc + 2)(%rip) lgdt compat_mode_gdt_desc(%rip) /* Relocate compatibility mode entry point address. */ leal compatibility_mode(%rip), %eax movl %eax, compatibility_mode_far(%rip) /* Enter compatibility mode. */ ljmp *compatibility_mode_far(%rip) relocate_pages: /* %rdi - indirection page maddr */ pushq %rbx cld movq %rdi, %rbx xorl %edi, %edi xorl %esi, %esi next_entry: /* top, read another word for the indirection page */ movq (%rbx), %rcx addq $8, %rbx is_dest: testb $IND_DESTINATION, %cl jz is_ind movq %rcx, %rdi andq $PAGE_MASK, %rdi jmp next_entry is_ind: testb $IND_INDIRECTION, %cl jz is_done movq %rcx, %rbx andq $PAGE_MASK, %rbx jmp next_entry is_done: testb $IND_DONE, %cl jnz done is_source: testb $IND_SOURCE, %cl jz is_zero movq %rcx, %rsi /* For every source page do a copy */ andq $PAGE_MASK, %rsi movl $(PAGE_SIZE / 8), %ecx rep movsq jmp next_entry is_zero: testb $IND_ZERO, %cl jz next_entry movl $(PAGE_SIZE / 8), %ecx /* Zero the destination page. */ xorl %eax, %eax rep stosq jmp next_entry done: popq %rbx ret .code32 compatibility_mode: /* Setup some sane segments. */ movl $0x0008, %eax movl %eax, %ds movl %eax, %es movl %eax, %fs movl %eax, %gs movl %eax, %ss /* Disable paging and therefore leave 64 bit mode. */ movl %cr0, %eax andl $~X86_CR0_PG, %eax movl %eax, %cr0 /* Disable long mode */ movl $MSR_EFER, %ecx rdmsr andl $~EFER_LME, %eax wrmsr /* Clear cr4 to disable PAE. */ xorl %eax, %eax movl %eax, %cr4 /* Call the image entry point. This should never return. */ call *%ebp ud2 .align 4 compatibility_mode_far: .long 0x00000000 /* set in call_32_bit above */ .word 0x0010 compat_mode_gdt_desc: .word (3*8)-1 .quad 0x0000000000000000 /* set in call_32_bit above */ .align 8 compat_mode_gdt: .quad 0x0000000000000000 /* null */ .quad 0x00cf92000000ffff /* 0x0008 ring 0 data */ .quad 0x00cf9a000000ffff /* 0x0010 ring 0 code, compatibility */ compat_mode_idt: .word 0 /* limit */ .long 0 /* base */ /* * 16 words of stack are more than enough. */ .fill 16,8,0 reloc_stack: .globl kexec_reloc_size kexec_reloc_size: .long . 
- kexec_reloc xen-4.4.0/xen/arch/x86/x86_64/compat.c0000664000175000017500000000132612307313555015166 0ustar smbsmb/****************************************************************************** * compat.c */ #include #include #include #include DEFINE_XEN_GUEST_HANDLE(physdev_op_compat_t); #define physdev_op compat_physdev_op #define physdev_op_t physdev_op_compat_t #define do_physdev_op compat_physdev_op #define do_physdev_op_compat(x) compat_physdev_op_compat(_##x) #define COMPAT #define _XEN_GUEST_HANDLE(t) XEN_GUEST_HANDLE(t) typedef int ret_t; #include "../compat.c" /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/x86_64/gdbstub.c0000664000175000017500000001075712307313555015345 0ustar smbsmb/* * x86_64 -specific gdb stub routines * * Copyright (C) 2007 Dan Doucette ddoucette@teradici.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #define GDB_REG64(r) gdb_write_to_packet_hex(r, sizeof(u64), ctx) #define GDB_REG32(r) gdb_write_to_packet_hex(r, sizeof(u32), ctx) void gdb_arch_read_reg_array(struct cpu_user_regs *regs, struct gdb_context *ctx) { GDB_REG64(regs->rax); GDB_REG64(regs->rbx); GDB_REG64(regs->rcx); GDB_REG64(regs->rdx); GDB_REG64(regs->rsi); GDB_REG64(regs->rdi); GDB_REG64(regs->rbp); GDB_REG64(regs->rsp); GDB_REG64(regs->r8); GDB_REG64(regs->r9); GDB_REG64(regs->r10); GDB_REG64(regs->r11); GDB_REG64(regs->r12); GDB_REG64(regs->r13); GDB_REG64(regs->r14); GDB_REG64(regs->r15); GDB_REG64(regs->rip); GDB_REG32(regs->eflags); GDB_REG32(regs->cs); GDB_REG32(regs->ss); GDB_REG32(regs->ds); GDB_REG32(regs->es); GDB_REG32(regs->fs); GDB_REG32(regs->gs); gdb_send_packet(ctx); } void gdb_arch_write_reg_array(struct cpu_user_regs *regs, const char* buf, struct gdb_context *ctx) { gdb_send_reply("", ctx); } void gdb_arch_read_reg(unsigned long regnum, struct cpu_user_regs *regs, struct gdb_context *ctx) { switch (regnum) { case 0: GDB_REG64(regs->rax); break; case 1: GDB_REG64(regs->rbx); break; case 2: GDB_REG64(regs->rcx); break; case 3: GDB_REG64(regs->rdx); break; case 4: GDB_REG64(regs->rsi); break; case 5: GDB_REG64(regs->rdi); break; case 6: GDB_REG64(regs->rbp); break; case 7: GDB_REG64(regs->rsp); break; case 8: GDB_REG64(regs->r8); break; case 9: GDB_REG64(regs->r9); break; case 10: GDB_REG64(regs->r10); break; case 11: GDB_REG64(regs->r11); break; case 12: GDB_REG64(regs->r12); break; case 13: GDB_REG64(regs->r13); break; case 14: GDB_REG64(regs->r14); break; case 15: GDB_REG64(regs->r15); break; case 16: GDB_REG64(regs->rip); break; case 17: GDB_REG32(regs->rflags); break; case 18: GDB_REG32(regs->cs); break; case 19: GDB_REG32(regs->ss); break; case 20: GDB_REG32(regs->ds); break; case 21: GDB_REG32(regs->es); break; case 22: GDB_REG32(regs->fs); break; case 23: GDB_REG32(regs->gs); break; default: 
GDB_REG64(0xbaadf00ddeadbeef); break; } gdb_send_packet(ctx); } void gdb_arch_write_reg(unsigned long regnum, unsigned long val, struct cpu_user_regs *regs, struct gdb_context *ctx) { switch (regnum) { case 0: regs->rax = val; break; case 1: regs->rbx = val; break; case 2: regs->rcx = val; break; case 3: regs->rdx = val; break; case 4: regs->rsi = val; break; case 5: regs->rdi = val; break; case 6: regs->rbp = val; break; case 7: regs->rsp = val; break; case 8: regs->r8 = val; break; case 9: regs->r9 = val; break; case 10: regs->r10 = val; break; case 11: regs->r11 = val; break; case 12: regs->r12 = val; break; case 13: regs->r13 = val; break; case 14: regs->r14 = val; break; case 15: regs->r15 = val; break; case 16: regs->rip = val; break; case 17: regs->rflags = (u32)val; break; case 18: regs->cs = (u16)val; break; case 19: regs->ss = (u16)val; break; case 20: regs->ds = (u16)val; break; case 21: regs->es = (u16)val; break; case 22: regs->fs = (u16)val; break; case 23: regs->gs = (u16)val; break; default: break; } gdb_send_reply("OK", ctx); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * End: */ xen-4.4.0/xen/arch/x86/x86_64/mm.c0000664000175000017500000013461612307313555014325 0ustar smbsmb/****************************************************************************** * arch/x86/x86_64/mm.c * * Modifications to Linux original are copyright (c) 2004, K A Fraser tr This * program is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., 59 * Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Parameters for PFN/MADDR compression. */ unsigned long __read_mostly max_pdx; unsigned long __read_mostly pfn_pdx_bottom_mask = ~0UL; unsigned long __read_mostly ma_va_bottom_mask = ~0UL; unsigned long __read_mostly pfn_top_mask = 0; unsigned long __read_mostly ma_top_mask = 0; unsigned long __read_mostly pfn_hole_mask = 0; unsigned int __read_mostly pfn_pdx_hole_shift = 0; unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START; /* Enough page directories to map into the bottom 1GB. 
*/ l3_pgentry_t __attribute__ ((__section__ (".bss.page_aligned"))) l3_bootmap[L3_PAGETABLE_ENTRIES]; l2_pgentry_t __attribute__ ((__section__ (".bss.page_aligned"))) l2_bootmap[L2_PAGETABLE_ENTRIES]; l2_pgentry_t *compat_idle_pg_table_l2; int __mfn_valid(unsigned long mfn) { return likely(mfn < max_page) && likely(!(mfn & pfn_hole_mask)) && likely(test_bit(pfn_to_pdx(mfn) / PDX_GROUP_COUNT, pdx_group_valid)); } void *do_page_walk(struct vcpu *v, unsigned long addr) { unsigned long mfn = pagetable_get_pfn(v->arch.guest_table); l4_pgentry_t l4e, *l4t; l3_pgentry_t l3e, *l3t; l2_pgentry_t l2e, *l2t; l1_pgentry_t l1e, *l1t; if ( !is_pv_vcpu(v) || !is_canonical_address(addr) ) return NULL; l4t = map_domain_page(mfn); l4e = l4t[l4_table_offset(addr)]; unmap_domain_page(l4t); if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return NULL; l3t = map_l3t_from_l4e(l4e); l3e = l3t[l3_table_offset(addr)]; unmap_domain_page(l3t); mfn = l3e_get_pfn(l3e); if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; if ( (l3e_get_flags(l3e) & _PAGE_PSE) ) { mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)); goto ret; } l2t = map_domain_page(mfn); l2e = l2t[l2_table_offset(addr)]; unmap_domain_page(l2t); mfn = l2e_get_pfn(l2e); if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; if ( (l2e_get_flags(l2e) & _PAGE_PSE) ) { mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)); goto ret; } l1t = map_domain_page(mfn); l1e = l1t[l1_table_offset(addr)]; unmap_domain_page(l1t); mfn = l1e_get_pfn(l1e); if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return NULL; ret: return map_domain_page(mfn) + (addr & ~PAGE_MASK); } void __init pfn_pdx_hole_setup(unsigned long mask) { unsigned int i, j, bottom_shift = 0, hole_shift = 0; /* * We skip the first MAX_ORDER bits, as we never want to compress them. * This guarantees that page-pointer arithmetic remains valid within * contiguous aligned ranges of 2^MAX_ORDER pages. Among others, our * buddy allocator relies on this assumption. 
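 * The net effect, sketched with the variables set up below (the real
 * conversion helpers live in the pdx/page headers, not in this file), is
 *
 *     pdx = (pfn & pfn_pdx_bottom_mask) |
 *           ((pfn & pfn_top_mask) >> pfn_pdx_hole_shift);
 *
 * i.e. the widest run of PFN bits that is zero across the whole memory map
 * is squeezed out before the value is used to index frame_table and friends.
 */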
*/ for ( j = MAX_ORDER-1; ; ) { i = find_next_zero_bit(&mask, BITS_PER_LONG, j); j = find_next_bit(&mask, BITS_PER_LONG, i); if ( j >= BITS_PER_LONG ) break; if ( j - i > hole_shift ) { hole_shift = j - i; bottom_shift = i; } } if ( !hole_shift ) return; printk(KERN_INFO "PFN compression on bits %u...%u\n", bottom_shift, bottom_shift + hole_shift - 1); pfn_pdx_hole_shift = hole_shift; pfn_pdx_bottom_mask = (1UL << bottom_shift) - 1; ma_va_bottom_mask = (PAGE_SIZE << bottom_shift) - 1; pfn_hole_mask = ((1UL << hole_shift) - 1) << bottom_shift; pfn_top_mask = ~(pfn_pdx_bottom_mask | pfn_hole_mask); ma_top_mask = pfn_top_mask << PAGE_SHIFT; } /* * Allocate page table pages for m2p table */ struct mem_hotadd_info { unsigned long spfn; unsigned long epfn; unsigned long cur; }; int hotadd_mem_valid(unsigned long pfn, struct mem_hotadd_info *info) { return (pfn < info->epfn && pfn >= info->spfn); } static unsigned long alloc_hotadd_mfn(struct mem_hotadd_info *info) { unsigned mfn; ASSERT((info->cur + ( 1UL << PAGETABLE_ORDER) < info->epfn) && info->cur >= info->spfn); mfn = info->cur; info->cur += (1UL << PAGETABLE_ORDER); return mfn; } #define M2P_NO_MAPPED 0 #define M2P_2M_MAPPED 1 #define M2P_1G_MAPPED 2 static int m2p_mapped(unsigned long spfn) { unsigned long va; l3_pgentry_t *l3_ro_mpt; l2_pgentry_t *l2_ro_mpt; va = RO_MPT_VIRT_START + spfn * sizeof(*machine_to_phys_mapping); l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]); switch ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & (_PAGE_PRESENT |_PAGE_PSE)) { case _PAGE_PSE|_PAGE_PRESENT: return M2P_1G_MAPPED; break; /* Check for next level */ case _PAGE_PRESENT: break; default: return M2P_NO_MAPPED; break; } l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]); if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT) return M2P_2M_MAPPED; return M2P_NO_MAPPED; } int share_hotadd_m2p_table(struct mem_hotadd_info *info) { unsigned long i, n, v, m2p_start_mfn = 0; l3_pgentry_t l3e; l2_pgentry_t l2e; /* M2P table is mappable read-only by privileged domains. 
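 * For orientation (a sketch using this file's variables): the M2P is a flat
 * array indexed by machine frame number,
 *
 *     machine_to_phys_mapping[mfn] == pfn;          // 8-byte entries at RO_MPT_VIRT_START
 *     compat_machine_to_phys_mapping[mfn] == pfn;   // 4-byte entries for 32-bit guests
 *
 * so the loops below only need to share the 2MB-mapped chunks whose backing
 * MFNs fall inside the hot-added range.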
*/ for ( v = RDWR_MPT_VIRT_START; v != RDWR_MPT_VIRT_END; v += n << PAGE_SHIFT ) { n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES; l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[ l3_table_offset(v)]; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) continue; if ( !(l3e_get_flags(l3e) & _PAGE_PSE) ) { n = L1_PAGETABLE_ENTRIES; l2e = l3e_to_l2e(l3e)[l2_table_offset(v)]; if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) continue; m2p_start_mfn = l2e_get_pfn(l2e); } else continue; for ( i = 0; i < n; i++ ) { struct page_info *page = mfn_to_page(m2p_start_mfn + i); if (hotadd_mem_valid(m2p_start_mfn + i, info)) share_xen_page_with_privileged_guests(page, XENSHARE_readonly); } } for ( v = RDWR_COMPAT_MPT_VIRT_START; v != RDWR_COMPAT_MPT_VIRT_END; v += 1 << L2_PAGETABLE_SHIFT ) { l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[ l3_table_offset(v)]; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) continue; l2e = l3e_to_l2e(l3e)[l2_table_offset(v)]; if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) continue; m2p_start_mfn = l2e_get_pfn(l2e); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { struct page_info *page = mfn_to_page(m2p_start_mfn + i); if (hotadd_mem_valid(m2p_start_mfn + i, info)) share_xen_page_with_privileged_guests(page, XENSHARE_readonly); } } return 0; } static void destroy_compat_m2p_mapping(struct mem_hotadd_info *info) { unsigned long i, va, rwva, pt_pfn; unsigned long smap = info->spfn, emap = info->spfn; l3_pgentry_t *l3_ro_mpt; l2_pgentry_t *l2_ro_mpt; if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) ) return; if ( emap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) ) emap = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2; l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]); ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]) & _PAGE_PRESENT); l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]); for ( i = smap; i < emap; ) { va = HIRO_COMPAT_MPT_VIRT_START + i * sizeof(*compat_machine_to_phys_mapping); rwva = RDWR_COMPAT_MPT_VIRT_START + i * sizeof(*compat_machine_to_phys_mapping); if ( l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT ) { pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]); if ( hotadd_mem_valid(pt_pfn, info) ) { destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT)); l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty()); } } i += 1UL < (L2_PAGETABLE_SHIFT - 2); } return; } void destroy_m2p_mapping(struct mem_hotadd_info *info) { l3_pgentry_t *l3_ro_mpt; unsigned long i, va, rwva; unsigned long smap = info->spfn, emap = info->epfn; l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]); /* * No need to clean m2p structure existing before the hotplug */ for (i = smap; i < emap;) { unsigned long pt_pfn; l2_pgentry_t *l2_ro_mpt; va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping); rwva = RDWR_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping); /* 1G mapping should not be created by mem hotadd */ if (!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT) || (l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE)) { i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) + (1UL << (L3_PAGETABLE_SHIFT - 3) ); continue; } l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]); if (!(l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)) { i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) + (1UL << (L2_PAGETABLE_SHIFT - 3)) ; continue; } pt_pfn = 
l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]); if ( hotadd_mem_valid(pt_pfn, info) ) { destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT)); l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]); l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty()); } i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) + (1UL << (L2_PAGETABLE_SHIFT - 3)); } destroy_compat_m2p_mapping(info); /* Brute-Force flush all TLB */ flush_tlb_all(); return; } /* * Allocate and map the compatibility mode machine-to-phys table. * spfn/epfn: the pfn ranges to be setup * free_s/free_e: the pfn ranges that is free still */ static int setup_compat_m2p_table(struct mem_hotadd_info *info) { unsigned long i, va, smap, emap, rwva, epfn = info->epfn, mfn; unsigned int n; l3_pgentry_t *l3_ro_mpt = NULL; l2_pgentry_t *l2_ro_mpt = NULL; int err = 0; smap = info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 2)) -1)); /* * Notice: For hot-added memory, only range below m2p_compat_vstart * will be filled up (assuming memory is discontinous when booting). */ if ((smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2)) ) return 0; if ( epfn > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) ) epfn = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2; emap = ( (epfn + ((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1 )) & ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) ); va = HIRO_COMPAT_MPT_VIRT_START + smap * sizeof(*compat_machine_to_phys_mapping); l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]); ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT); l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]); #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int)) #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \ sizeof(*compat_machine_to_phys_mapping)) BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \ sizeof(*compat_machine_to_phys_mapping)); for ( i = smap; i < emap; i += (1UL << (L2_PAGETABLE_SHIFT - 2)) ) { va = HIRO_COMPAT_MPT_VIRT_START + i * sizeof(*compat_machine_to_phys_mapping); rwva = RDWR_COMPAT_MPT_VIRT_START + i * sizeof(*compat_machine_to_phys_mapping); if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT) continue; for ( n = 0; n < CNT; ++n) if ( mfn_valid(i + n * PDX_GROUP_COUNT) ) break; if ( n == CNT ) continue; mfn = alloc_hotadd_mfn(info); err = map_pages_to_xen(rwva, mfn, 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR); if ( err ) break; /* Fill with INVALID_M2P_ENTRY. */ memset((void *)rwva, 0xFF, 1UL << L2_PAGETABLE_SHIFT); /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */ l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_from_pfn(mfn, _PAGE_PSE|_PAGE_PRESENT)); } #undef CNT #undef MFN return err; } /* * Allocate and map the machine-to-phys table. 
* The L3 for RO/RWRW MPT and the L2 for compatible MPT should be setup already */ static int setup_m2p_table(struct mem_hotadd_info *info) { unsigned long i, va, smap, emap; unsigned int n; l2_pgentry_t *l2_ro_mpt = NULL; l3_pgentry_t *l3_ro_mpt = NULL; int ret = 0; ASSERT(l4e_get_flags(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]) & _PAGE_PRESENT); l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]); smap = (info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1))); emap = ((info->epfn + ((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1 )) & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1)); va = RO_MPT_VIRT_START + smap * sizeof(*machine_to_phys_mapping); #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long)) #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \ sizeof(*machine_to_phys_mapping)) BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \ sizeof(*machine_to_phys_mapping)); i = smap; while ( i < emap ) { switch ( m2p_mapped(i) ) { case M2P_1G_MAPPED: i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) + (1UL << (L3_PAGETABLE_SHIFT - 3)); continue; case M2P_2M_MAPPED: i = (i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) + (1UL << (L2_PAGETABLE_SHIFT - 3)); continue; default: break; } va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping); for ( n = 0; n < CNT; ++n) if ( mfn_valid(i + n * PDX_GROUP_COUNT) ) break; if ( n < CNT ) { unsigned long mfn = alloc_hotadd_mfn(info); ret = map_pages_to_xen( RDWR_MPT_VIRT_START + i * sizeof(unsigned long), mfn, 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR); if ( ret ) goto error; /* Fill with INVALID_M2P_ENTRY. */ memset((void *)(RDWR_MPT_VIRT_START + i * sizeof(unsigned long)), 0xFF, 1UL << L2_PAGETABLE_SHIFT); ASSERT(!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE)); if ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT ) l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]) + l2_table_offset(va); else { l2_ro_mpt = alloc_xen_pagetable(); if ( !l2_ro_mpt ) { ret = -ENOMEM; goto error; } clear_page(l2_ro_mpt); l3e_write(&l3_ro_mpt[l3_table_offset(va)], l3e_from_paddr(__pa(l2_ro_mpt), __PAGE_HYPERVISOR | _PAGE_USER)); l2_ro_mpt += l2_table_offset(va); } /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */ l2e_write(l2_ro_mpt, l2e_from_pfn(mfn, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT)); } if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) ) l2_ro_mpt = NULL; i += ( 1UL << (L2_PAGETABLE_SHIFT - 3)); } #undef CNT #undef MFN ret = setup_compat_m2p_table(info); error: return ret; } void __init paging_init(void) { unsigned long i, mpt_size, va; unsigned int n, memflags; l3_pgentry_t *l3_ro_mpt; l2_pgentry_t *l2_ro_mpt = NULL; struct page_info *l1_pg, *l2_pg, *l3_pg; /* * We setup the L3s for 1:1 mapping if host support memory hotplug * to avoid sync the 1:1 mapping on page fault handler */ for ( va = DIRECTMAP_VIRT_START; va < DIRECTMAP_VIRT_END && (void *)va < __va(mem_hotplug); va += (1UL << L4_PAGETABLE_SHIFT) ) { if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) & _PAGE_PRESENT) ) { l3_pg = alloc_domheap_page(NULL, 0); if ( !l3_pg ) goto nomem; l3_ro_mpt = page_to_virt(l3_pg); clear_page(l3_ro_mpt); l4e_write(&idle_pg_table[l4_table_offset(va)], l4e_from_page(l3_pg, __PAGE_HYPERVISOR)); } } /* Create user-accessible L2 directory to map the MPT for guests. 
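 * The _PAGE_USER mappings built here are what let a 64-bit PV guest read
 * the M2P directly; schematically (guest-side view, not code in this file):
 *
 *     const unsigned long *m2p = (void *)MACH2PHYS_VIRT_START;   // guest's alias of RO_MPT_VIRT_START
 *     unsigned long pfn = m2p[mfn];
 *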
*/ if ( (l3_pg = alloc_domheap_page(NULL, 0)) == NULL ) goto nomem; l3_ro_mpt = page_to_virt(l3_pg); clear_page(l3_ro_mpt); l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)], l4e_from_page(l3_pg, __PAGE_HYPERVISOR | _PAGE_USER)); /* * Allocate and map the machine-to-phys table. * This also ensures L3 is present for fixmaps. */ mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1; mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL); #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long)) #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \ sizeof(*machine_to_phys_mapping)) BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \ sizeof(*machine_to_phys_mapping)); for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ ) { BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1)); va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT); memflags = MEMF_node(phys_to_nid(i << (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT))); if ( cpu_has_page1gb && !((unsigned long)l2_ro_mpt & ~PAGE_MASK) && (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) ) { unsigned int k, holes; for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k) { for ( n = 0; n < CNT; ++n) if ( mfn_valid(MFN(i + k) + n * PDX_GROUP_COUNT) ) break; if ( n == CNT ) ++holes; } if ( k == holes ) { i += (1UL << PAGETABLE_ORDER) - 1; continue; } if ( holes == 0 && (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER, memflags)) != NULL ) { map_pages_to_xen( RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT), page_to_mfn(l1_pg), 1UL << (2 * PAGETABLE_ORDER), PAGE_HYPERVISOR); memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x77, 1UL << L3_PAGETABLE_SHIFT); ASSERT(!l2_table_offset(va)); /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */ l3e_write(&l3_ro_mpt[l3_table_offset(va)], l3e_from_page(l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT)); i += (1UL << PAGETABLE_ORDER) - 1; continue; } } for ( n = 0; n < CNT; ++n) if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) ) break; if ( n == CNT ) l1_pg = NULL; else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, memflags)) == NULL ) goto nomem; else { map_pages_to_xen( RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT), page_to_mfn(l1_pg), 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR); /* Fill with INVALID_M2P_ENTRY. */ memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0xFF, 1UL << L2_PAGETABLE_SHIFT); } if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) ) { if ( (l2_pg = alloc_domheap_page(NULL, memflags)) == NULL ) goto nomem; l2_ro_mpt = page_to_virt(l2_pg); clear_page(l2_ro_mpt); l3e_write(&l3_ro_mpt[l3_table_offset(va)], l3e_from_page(l2_pg, __PAGE_HYPERVISOR | _PAGE_USER)); ASSERT(!l2_table_offset(va)); } /* NB. Cannot be GLOBAL as shadow_mode_translate reuses this area. */ if ( l1_pg ) l2e_write(l2_ro_mpt, l2e_from_page( l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT)); l2_ro_mpt++; } #undef CNT #undef MFN /* Create user-accessible L2 directory to map the MPT for compat guests. 
*/ BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) != l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)); l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset( HIRO_COMPAT_MPT_VIRT_START)]); if ( (l2_ro_mpt = alloc_xen_pagetable()) == NULL ) goto nomem; compat_idle_pg_table_l2 = l2_ro_mpt; clear_page(l2_ro_mpt); l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)], l3e_from_paddr(__pa(l2_ro_mpt), __PAGE_HYPERVISOR)); l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START); /* Allocate and map the compatibility mode machine-to-phys table. */ mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1)); if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START ) mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START; mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL); if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END ) m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size; #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int)) #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \ sizeof(*compat_machine_to_phys_mapping)) BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \ sizeof(*compat_machine_to_phys_mapping)); for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ ) { memflags = MEMF_node(phys_to_nid(i << (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT))); for ( n = 0; n < CNT; ++n) if ( mfn_valid(MFN(i) + n * PDX_GROUP_COUNT) ) break; if ( n == CNT ) continue; if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, memflags)) == NULL ) goto nomem; map_pages_to_xen( RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT), page_to_mfn(l1_pg), 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR); memset((void *)(RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55, 1UL << L2_PAGETABLE_SHIFT); /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */ l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT)); } #undef CNT #undef MFN machine_to_phys_mapping_valid = 1; /* Set up linear page table mapping. */ l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)], l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR)); return; nomem: panic("Not enough memory for m2p table"); } void __init zap_low_mappings(void) { BUG_ON(num_online_cpus() != 1); /* Remove aliased mapping of first 1:1 PML4 entry. */ l4e_write(&idle_pg_table[0], l4e_empty()); flush_local(FLUSH_TLB_GLOBAL); /* Replace with mapping of the boot trampoline only. 
*/ map_pages_to_xen(trampoline_phys, trampoline_phys >> PAGE_SHIFT, PFN_UP(trampoline_end - trampoline_start), __PAGE_HYPERVISOR); } int setup_compat_arg_xlat(struct vcpu *v) { return create_perdomain_mapping(v->domain, ARG_XLAT_START(v), PFN_UP(COMPAT_ARG_XLAT_SIZE), NULL, NIL(struct page_info *)); } void free_compat_arg_xlat(struct vcpu *v) { destroy_perdomain_mapping(v->domain, ARG_XLAT_START(v), PFN_UP(COMPAT_ARG_XLAT_SIZE)); } void cleanup_frame_table(struct mem_hotadd_info *info) { unsigned long sva, eva; l3_pgentry_t l3e; l2_pgentry_t l2e; unsigned long spfn, epfn; spfn = info->spfn; epfn = info->epfn; sva = (unsigned long)pdx_to_page(pfn_to_pdx(spfn)); eva = (unsigned long)pdx_to_page(pfn_to_pdx(epfn)); /* Intialize all page */ memset(mfn_to_page(spfn), -1, (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn)); while (sva < eva) { l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(sva)])[ l3_table_offset(sva)]; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_flags(l3e) & _PAGE_PSE) ) { sva = (sva & ~((1UL << L3_PAGETABLE_SHIFT) - 1)) + (1UL << L3_PAGETABLE_SHIFT); continue; } l2e = l3e_to_l2e(l3e)[l2_table_offset(sva)]; ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT); if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) == (_PAGE_PSE | _PAGE_PRESENT) ) { if (hotadd_mem_valid(l2e_get_pfn(l2e), info)) destroy_xen_mappings(sva & ~((1UL << L2_PAGETABLE_SHIFT) - 1), ((sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) + (1UL << L2_PAGETABLE_SHIFT) - 1)); sva = (sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) + (1UL << L2_PAGETABLE_SHIFT); continue; } ASSERT(l1e_get_flags(l2e_to_l1e(l2e)[l1_table_offset(sva)]) & _PAGE_PRESENT); sva = (sva & ~((1UL << PAGE_SHIFT) - 1)) + (1UL << PAGE_SHIFT); } /* Brute-Force flush all TLB */ flush_tlb_all(); } static int setup_frametable_chunk(void *start, void *end, struct mem_hotadd_info *info) { unsigned long s = (unsigned long)start; unsigned long e = (unsigned long)end; unsigned long mfn; int err; ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1))); ASSERT(!(e & ((1 << L2_PAGETABLE_SHIFT) - 1))); for ( ; s < e; s += (1UL << L2_PAGETABLE_SHIFT)) { mfn = alloc_hotadd_mfn(info); err = map_pages_to_xen(s, mfn, 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR); if ( err ) return err; } memset(start, -1, s - (unsigned long)start); return 0; } static int extend_frame_table(struct mem_hotadd_info *info) { unsigned long cidx, nidx, eidx, spfn, epfn; spfn = info->spfn; epfn = info->epfn; eidx = (pfn_to_pdx(epfn) + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT; nidx = cidx = pfn_to_pdx(spfn)/PDX_GROUP_COUNT; ASSERT( pfn_to_pdx(epfn) <= (DIRECTMAP_SIZE >> PAGE_SHIFT) && pfn_to_pdx(epfn) <= FRAMETABLE_NR ); if ( test_bit(cidx, pdx_group_valid) ) cidx = find_next_zero_bit(pdx_group_valid, eidx, cidx); if ( cidx >= eidx ) return 0; while ( cidx < eidx ) { int err; nidx = find_next_bit(pdx_group_valid, eidx, cidx); if ( nidx >= eidx ) nidx = eidx; err = setup_frametable_chunk(pdx_to_page(cidx * PDX_GROUP_COUNT ), pdx_to_page(nidx * PDX_GROUP_COUNT), info); if ( err ) return err; cidx = find_next_zero_bit(pdx_group_valid, eidx, nidx); } memset(mfn_to_page(spfn), 0, (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn)); return 0; } void __init subarch_init_memory(void) { unsigned long i, n, v, m2p_start_mfn; l3_pgentry_t l3e; l2_pgentry_t l2e; BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1)); BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1)); /* M2P table is mappable read-only by privileged domains. 
*/ for ( v = RDWR_MPT_VIRT_START; v != RDWR_MPT_VIRT_END; v += n << PAGE_SHIFT ) { n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES; l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[ l3_table_offset(v)]; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) continue; if ( !(l3e_get_flags(l3e) & _PAGE_PSE) ) { n = L1_PAGETABLE_ENTRIES; l2e = l3e_to_l2e(l3e)[l2_table_offset(v)]; if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) continue; m2p_start_mfn = l2e_get_pfn(l2e); } else { m2p_start_mfn = l3e_get_pfn(l3e); } for ( i = 0; i < n; i++ ) { struct page_info *page = mfn_to_page(m2p_start_mfn + i); share_xen_page_with_privileged_guests(page, XENSHARE_readonly); } } for ( v = RDWR_COMPAT_MPT_VIRT_START; v != RDWR_COMPAT_MPT_VIRT_END; v += 1 << L2_PAGETABLE_SHIFT ) { l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[ l3_table_offset(v)]; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) continue; l2e = l3e_to_l2e(l3e)[l2_table_offset(v)]; if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) continue; m2p_start_mfn = l2e_get_pfn(l2e); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { struct page_info *page = mfn_to_page(m2p_start_mfn + i); share_xen_page_with_privileged_guests(page, XENSHARE_readonly); } } } long subarch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg) { struct xen_machphys_mfn_list xmml; l3_pgentry_t l3e; l2_pgentry_t l2e; unsigned long v; xen_pfn_t mfn, last_mfn; unsigned int i; long rc = 0; switch ( op ) { case XENMEM_machphys_mfn_list: if ( copy_from_guest(&xmml, arg, 1) ) return -EFAULT; BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1)); BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1)); for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0; (i != xmml.max_extents) && (v < (unsigned long)(machine_to_phys_mapping + max_page)); i++, v += 1UL << L2_PAGETABLE_SHIFT ) { l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[ l3_table_offset(v)]; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) mfn = last_mfn; else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) ) { l2e = l3e_to_l2e(l3e)[l2_table_offset(v)]; if ( l2e_get_flags(l2e) & _PAGE_PRESENT ) mfn = l2e_get_pfn(l2e); else mfn = last_mfn; } else { mfn = l3e_get_pfn(l3e) + (l2_table_offset(v) << PAGETABLE_ORDER); } ASSERT(mfn); if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) ) return -EFAULT; last_mfn = mfn; } xmml.nr_extents = i; if ( __copy_to_guest(arg, &xmml, 1) ) return -EFAULT; break; case XENMEM_get_sharing_freed_pages: return mem_sharing_get_nr_saved_mfns(); case XENMEM_get_sharing_shared_pages: return mem_sharing_get_nr_shared_mfns(); case XENMEM_paging_op: case XENMEM_access_op: { xen_mem_event_op_t meo; if ( copy_from_guest(&meo, arg, 1) ) return -EFAULT; rc = do_mem_event_op(op, meo.domain, (void *) &meo); if ( !rc && __copy_to_guest(arg, &meo, 1) ) return -EFAULT; break; } case XENMEM_sharing_op: { xen_mem_sharing_op_t mso; if ( copy_from_guest(&mso, arg, 1) ) return -EFAULT; if ( mso.op == XENMEM_sharing_op_audit ) return mem_sharing_audit(); rc = do_mem_event_op(op, mso.domain, (void *) &mso); if ( !rc && __copy_to_guest(arg, &mso, 1) ) return -EFAULT; break; } default: rc = -ENOSYS; break; } return rc; } long do_stack_switch(unsigned long ss, unsigned long esp) { fixup_guest_stack_selector(current->domain, ss); current->arch.pv_vcpu.kernel_ss = ss; current->arch.pv_vcpu.kernel_sp = esp; return 0; } long do_set_segment_base(unsigned int which, unsigned long base) { struct vcpu *v = current; long ret = 0; switch ( which ) { case SEGBASE_FS: if ( wrmsr_safe(MSR_FS_BASE, base) ) ret = -EFAULT; else 
v->arch.pv_vcpu.fs_base = base; break; case SEGBASE_GS_USER: if ( wrmsr_safe(MSR_SHADOW_GS_BASE, base) ) ret = -EFAULT; else v->arch.pv_vcpu.gs_base_user = base; break; case SEGBASE_GS_KERNEL: if ( wrmsr_safe(MSR_GS_BASE, base) ) ret = -EFAULT; else v->arch.pv_vcpu.gs_base_kernel = base; break; case SEGBASE_GS_USER_SEL: __asm__ __volatile__ ( " swapgs \n" "1: movl %k0,%%gs \n" " "safe_swapgs" \n" ".section .fixup,\"ax\" \n" "2: xorl %k0,%k0 \n" " jmp 1b \n" ".previous \n" _ASM_EXTABLE(1b, 2b) : : "r" (base&0xffff) ); break; default: ret = -EINVAL; break; } return ret; } /* Returns TRUE if given descriptor is valid for GDT or LDT. */ int check_descriptor(const struct domain *dom, struct desc_struct *d) { u32 a = d->a, b = d->b; u16 cs; unsigned int dpl; /* A not-present descriptor will always fault, so is safe. */ if ( !(b & _SEGMENT_P) ) goto good; /* Check and fix up the DPL. */ dpl = (b >> 13) & 3; __fixup_guest_selector(dom, dpl); b = (b & ~_SEGMENT_DPL) | (dpl << 13); /* All code and data segments are okay. No base/limit checking. */ if ( (b & _SEGMENT_S) ) { if ( is_pv_32bit_domain(dom) ) { unsigned long base, limit; if ( b & _SEGMENT_L ) goto bad; /* * Older PAE Linux guests use segments which are limited to * 0xf6800000. Extend these to allow access to the larger read-only * M2P table available in 32on64 mode. */ base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16); limit = (b & 0xf0000) | (a & 0xffff); limit++; /* We add one because limit is inclusive. */ if ( (b & _SEGMENT_G) ) limit <<= 12; if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) ) { a |= 0x0000ffff; b |= 0x000f0000; } } goto good; } /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */ if ( (b & _SEGMENT_TYPE) == 0x000 ) goto good; /* Everything but a call gate is discarded here. */ if ( (b & _SEGMENT_TYPE) != 0xc00 ) goto bad; /* Validate the target code selector. */ cs = a >> 16; if ( !guest_gate_selector_okay(dom, cs) ) goto bad; /* * Force DPL to zero, causing a GP fault with its error code indicating * the gate in use, allowing emulation. This is necessary because with * native guests (kernel in ring 3) call gates cannot be used directly * to transition from user to kernel mode (and whether a gate is used * to enter the kernel can only be determined when the gate is being * used), and with compat guests call gates cannot be used at all as * there are only 64-bit ones. * Store the original DPL in the selector's RPL field. */ b &= ~_SEGMENT_DPL; cs = (cs & ~3) | dpl; a = (a & 0xffffU) | (cs << 16); /* Reserved bits must be zero. */ if ( b & (is_pv_32bit_domain(dom) ? 
0xe0 : 0xff) ) goto bad; good: d->a = a; d->b = b; return 1; bad: return 0; } int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs) { struct domain *d = current->domain; return mem_hotplug && guest_mode(regs) && is_pv_32bit_domain(d) && (addr >= HYPERVISOR_COMPAT_VIRT_START(d)) && (addr < MACH2PHYS_COMPAT_VIRT_END); } int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs) { struct domain *d = current->domain; l4_pgentry_t *pl4e = NULL; l4_pgentry_t l4e; l3_pgentry_t *pl3e = NULL; l3_pgentry_t l3e; l2_pgentry_t *pl2e = NULL; l2_pgentry_t l2e, idle_l2e; unsigned long mfn, idle_index; int ret = 0; if (!is_pv_32on64_domain(d)) return 0; if ( (addr < HYPERVISOR_COMPAT_VIRT_START(d)) || (addr >= MACH2PHYS_COMPAT_VIRT_END) ) return 0; mfn = (read_cr3()) >> PAGE_SHIFT; pl4e = map_domain_page(mfn); l4e = pl4e[0]; if (!(l4e_get_flags(l4e) & _PAGE_PRESENT)) goto unmap; mfn = l4e_get_pfn(l4e); /* We don't need get page type here since it is current CR3 */ pl3e = map_domain_page(mfn); l3e = pl3e[3]; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) goto unmap; mfn = l3e_get_pfn(l3e); pl2e = map_domain_page(mfn); l2e = pl2e[l2_table_offset(addr)]; if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT)) goto unmap; idle_index = (l2_table_offset(addr) - COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d))/ sizeof(l2_pgentry_t); idle_l2e = compat_idle_pg_table_l2[idle_index]; if (!(l2e_get_flags(idle_l2e) & _PAGE_PRESENT)) goto unmap; memcpy(&pl2e[l2_table_offset(addr)], &compat_idle_pg_table_l2[idle_index], sizeof(l2_pgentry_t)); ret = EXCRET_fault_fixed; unmap: if ( pl4e ) unmap_domain_page(pl4e); if ( pl3e ) unmap_domain_page(pl3e); if ( pl2e ) unmap_domain_page(pl2e); return ret; } void domain_set_alloc_bitsize(struct domain *d) { if ( !is_pv_32on64_domain(d) || (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) || d->arch.physaddr_bitsize > 0 ) return; d->arch.physaddr_bitsize = /* 2^n entries can be contained in guest's p2m mapping space */ fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1 /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */ + PAGE_SHIFT; } unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits) { if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) ) return bits; return min(d->arch.physaddr_bitsize, bits); } int transfer_pages_to_heap(struct mem_hotadd_info *info) { unsigned long i; struct page_info *pg; /* * Mark the allocated page before put free pages to buddy allocator * to avoid merge in free_heap_pages */ for (i = info->spfn; i < info->cur; i++) { pg = mfn_to_page(i); pg->count_info = PGC_state_inuse; } init_domheap_pages(pfn_to_paddr(info->cur), pfn_to_paddr(info->epfn)); return 0; } int mem_hotadd_check(unsigned long spfn, unsigned long epfn) { unsigned long s, e, length, sidx, eidx; if ( (spfn >= epfn) ) return 0; if (pfn_to_pdx(epfn) > FRAMETABLE_NR) return 0; if ( (spfn | epfn) & ((1UL << PAGETABLE_ORDER) - 1) ) return 0; if ( (spfn | epfn) & pfn_hole_mask ) return 0; /* Make sure the new range is not present now */ sidx = ((pfn_to_pdx(spfn) + PDX_GROUP_COUNT - 1) & ~(PDX_GROUP_COUNT - 1)) / PDX_GROUP_COUNT; eidx = (pfn_to_pdx(epfn - 1) & ~(PDX_GROUP_COUNT - 1)) / PDX_GROUP_COUNT; if (sidx >= eidx) return 0; s = find_next_zero_bit(pdx_group_valid, eidx, sidx); if ( s > eidx ) return 0; e = find_next_bit(pdx_group_valid, eidx, s); if ( e < eidx ) return 0; /* Caculate at most required m2p/compat m2p/frametable pages */ s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)); e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 3)) - 1) & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1); length = 
(e - s) * sizeof(unsigned long); s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1)); e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) & ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1); e = min_t(unsigned long, e, (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2); if ( e > s ) length += (e -s) * sizeof(unsigned int); s = pfn_to_pdx(spfn) & ~(PDX_GROUP_COUNT - 1); e = ( pfn_to_pdx(epfn) + (PDX_GROUP_COUNT - 1) ) & ~(PDX_GROUP_COUNT - 1); length += (e - s) * sizeof(struct page_info); if ((length >> PAGE_SHIFT) > (epfn - spfn)) return 0; return 1; } /* * A bit paranoid for memory allocation failure issue since * it may be reason for memory add */ int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm) { struct mem_hotadd_info info; int ret, node; unsigned long old_max = max_page, old_total = total_pages; unsigned long old_node_start, old_node_span, orig_online; unsigned long i; dprintk(XENLOG_INFO, "memory_add %lx ~ %lx with pxm %x\n", spfn, epfn, pxm); if ( !mem_hotadd_check(spfn, epfn) ) return -EINVAL; if ( (node = setup_node(pxm)) == -1 ) return -EINVAL; if ( !valid_numa_range(spfn << PAGE_SHIFT, epfn << PAGE_SHIFT, node) ) { dprintk(XENLOG_WARNING, "spfn %lx ~ epfn %lx pxm %x node %x" "is not numa valid", spfn, epfn, pxm, node); return -EINVAL; } i = virt_to_mfn(HYPERVISOR_VIRT_END - 1) + 1; if ( spfn < i ) { ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), spfn, min(epfn, i) - spfn, PAGE_HYPERVISOR); if ( ret ) return ret; } if ( i < epfn ) { if ( i < spfn ) i = spfn; ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), i, epfn - i, __PAGE_HYPERVISOR); if ( ret ) return ret; } old_node_start = NODE_DATA(node)->node_start_pfn; old_node_span = NODE_DATA(node)->node_spanned_pages; orig_online = node_online(node); if ( !orig_online ) { dprintk(XENLOG_WARNING, "node %x pxm %x is not online\n",node, pxm); NODE_DATA(node)->node_id = node; NODE_DATA(node)->node_start_pfn = spfn; NODE_DATA(node)->node_spanned_pages = epfn - node_start_pfn(node); node_set_online(node); }else { if (NODE_DATA(node)->node_start_pfn > spfn) NODE_DATA(node)->node_start_pfn = spfn; if (node_end_pfn(node) < epfn) NODE_DATA(node)->node_spanned_pages = epfn - node_start_pfn(node); } ret = -EINVAL; info.spfn = spfn; info.epfn = epfn; info.cur = spfn; ret = extend_frame_table(&info); if (ret) goto destroy_frametable; /* Set max_page as setup_m2p_table will use it*/ if (max_page < epfn) { max_page = epfn; max_pdx = pfn_to_pdx(max_page - 1) + 1; } total_pages += epfn - spfn; set_pdx_range(spfn, epfn); ret = setup_m2p_table(&info); if ( ret ) goto destroy_m2p; if ( !need_iommu(dom0) ) { for ( i = spfn; i < epfn; i++ ) if ( iommu_map_page(dom0, i, i, IOMMUF_readable|IOMMUF_writable) ) break; if ( i != epfn ) { while (i-- > old_max) iommu_unmap_page(dom0, i); goto destroy_m2p; } } /* We can't revert any more */ transfer_pages_to_heap(&info); share_hotadd_m2p_table(&info); return 0; destroy_m2p: destroy_m2p_mapping(&info); max_page = old_max; total_pages = old_total; max_pdx = pfn_to_pdx(max_page - 1) + 1; destroy_frametable: cleanup_frame_table(&info); destroy_xen_mappings((unsigned long)mfn_to_virt(spfn), (unsigned long)mfn_to_virt(epfn)); if ( !orig_online ) node_set_offline(node); NODE_DATA(node)->node_start_pfn = old_node_start; NODE_DATA(node)->node_spanned_pages = old_node_span; return ret; } #include "compat/mm.c" /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ 
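A minimal standalone sketch of the M2P index arithmetic used by setup_m2p_table(), setup_compat_m2p_table() and mem_hotadd_check() above -- not part of the Xen sources. It only shows how an MFN range is rounded out to whole 2MiB machine-to-phys chunks (eight-byte entries for the native table, four-byte entries for the compat table); the frame_table/PDX hole handling is deliberately omitted. The L2_PAGETABLE_SHIFT value, the example pfn range and the helpers chunk_round()/m2p_entry_offset() are illustrative assumptions, not Xen APIs.

#include <stdio.h>
#include <stdint.h>

#define L2_PAGETABLE_SHIFT 21  /* assumption: one L2 superpage maps 2MiB, as on x86-64 */
#define M2P_PER_CHUNK   (1UL << (L2_PAGETABLE_SHIFT - 3)) /* 8-byte native M2P entries per chunk */
#define CM2P_PER_CHUNK  (1UL << (L2_PAGETABLE_SHIFT - 2)) /* 4-byte compat M2P entries per chunk */

/* Round [spfn, epfn) outwards to whole chunks, mirroring the smap/emap rounding above. */
static void chunk_round(unsigned long spfn, unsigned long epfn, unsigned long per_chunk,
                        unsigned long *smap, unsigned long *emap)
{
    *smap = spfn & ~(per_chunk - 1);
    *emap = (epfn + per_chunk - 1) & ~(per_chunk - 1);
}

/* Byte offset of an MFN's entry from the start of the (RO or RW) M2P area. */
static unsigned long m2p_entry_offset(unsigned long mfn, unsigned long entry_size)
{
    return mfn * entry_size;
}

int main(void)
{
    unsigned long spfn = 0x184000UL, epfn = 0x1bc000UL; /* hypothetical hot-added MFN range */
    unsigned long smap, emap;

    chunk_round(spfn, epfn, M2P_PER_CHUNK, &smap, &emap);
    printf("native M2P: chunks span MFNs [%#lx, %#lx), first entry at byte offset %#lx\n",
           smap, emap, m2p_entry_offset(spfn, sizeof(unsigned long)));

    chunk_round(spfn, epfn, CM2P_PER_CHUNK, &smap, &emap);
    printf("compat M2P: chunks span MFNs [%#lx, %#lx), first entry at byte offset %#lx\n",
           smap, emap, m2p_entry_offset(spfn, sizeof(uint32_t)));

    return 0;
}

With these example numbers the native table rounds the range out to [0x180000, 0x1c0000) while the compat table rounds the same range out to [0x180000, 0x200000), which is why the two setup routines keep separate smap/emap values and why mem_hotadd_check() sizes the two tables independently.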
xen-4.4.0/xen/arch/x86/x86_64/traps.c0000664000175000017500000004666212307313555015050 0ustar smbsmb #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void print_xen_info(void) { char taint_str[TAINT_STRING_MAX_LEN]; printk("----[ Xen-%d.%d%s x86_64 debug=%c %s ]----\n", xen_major_version(), xen_minor_version(), xen_extra_version(), debug_build() ? 'y' : 'n', print_tainted(taint_str)); } enum context { CTXT_hypervisor, CTXT_pv_guest, CTXT_hvm_guest }; static void _show_registers( const struct cpu_user_regs *regs, unsigned long crs[8], enum context context, const struct vcpu *v) { static const char *const context_names[] = { [CTXT_hypervisor] = "hypervisor", [CTXT_pv_guest] = "pv guest", [CTXT_hvm_guest] = "hvm guest" }; printk("RIP: %04x:[<%016lx>]", regs->cs, regs->rip); if ( context == CTXT_hypervisor ) printk(" %pS", _p(regs->rip)); printk("\nRFLAGS: %016lx ", regs->rflags); if ( (context == CTXT_pv_guest) && v && v->vcpu_info ) printk("EM: %d ", !!vcpu_info(v, evtchn_upcall_mask)); printk("CONTEXT: %s\n", context_names[context]); printk("rax: %016lx rbx: %016lx rcx: %016lx\n", regs->rax, regs->rbx, regs->rcx); printk("rdx: %016lx rsi: %016lx rdi: %016lx\n", regs->rdx, regs->rsi, regs->rdi); printk("rbp: %016lx rsp: %016lx r8: %016lx\n", regs->rbp, regs->rsp, regs->r8); printk("r9: %016lx r10: %016lx r11: %016lx\n", regs->r9, regs->r10, regs->r11); if ( !(regs->entry_vector & TRAP_regs_partial) ) { printk("r12: %016lx r13: %016lx r14: %016lx\n", regs->r12, regs->r13, regs->r14); printk("r15: %016lx cr0: %016lx cr4: %016lx\n", regs->r15, crs[0], crs[4]); } else printk("cr0: %016lx cr4: %016lx\n", crs[0], crs[4]); printk("cr3: %016lx cr2: %016lx\n", crs[3], crs[2]); printk("ds: %04x es: %04x fs: %04x gs: %04x " "ss: %04x cs: %04x\n", regs->ds, regs->es, regs->fs, regs->gs, regs->ss, regs->cs); } void show_registers(struct cpu_user_regs *regs) { struct cpu_user_regs fault_regs = *regs; unsigned long fault_crs[8]; enum context context; struct vcpu *v = current; if ( has_hvm_container_vcpu(v) && guest_mode(regs) ) { struct segment_register sreg; context = CTXT_hvm_guest; fault_crs[0] = v->arch.hvm_vcpu.guest_cr[0]; fault_crs[2] = v->arch.hvm_vcpu.guest_cr[2]; fault_crs[3] = v->arch.hvm_vcpu.guest_cr[3]; fault_crs[4] = v->arch.hvm_vcpu.guest_cr[4]; hvm_get_segment_register(v, x86_seg_cs, &sreg); fault_regs.cs = sreg.sel; hvm_get_segment_register(v, x86_seg_ds, &sreg); fault_regs.ds = sreg.sel; hvm_get_segment_register(v, x86_seg_es, &sreg); fault_regs.es = sreg.sel; hvm_get_segment_register(v, x86_seg_fs, &sreg); fault_regs.fs = sreg.sel; hvm_get_segment_register(v, x86_seg_gs, &sreg); fault_regs.gs = sreg.sel; hvm_get_segment_register(v, x86_seg_ss, &sreg); fault_regs.ss = sreg.sel; } else { if ( guest_mode(regs) ) { context = CTXT_pv_guest; fault_crs[2] = arch_get_cr2(v); } else { context = CTXT_hypervisor; fault_crs[2] = read_cr2(); } fault_crs[0] = read_cr0(); fault_crs[3] = read_cr3(); fault_crs[4] = read_cr4(); fault_regs.ds = read_segment_register(ds); fault_regs.es = read_segment_register(es); fault_regs.fs = read_segment_register(fs); fault_regs.gs = read_segment_register(gs); } print_xen_info(); printk("CPU: %d\n", smp_processor_id()); _show_registers(&fault_regs, fault_crs, context, v); if ( this_cpu(ler_msr) && !guest_mode(regs) ) { u64 from, to; rdmsrl(this_cpu(ler_msr), from); 
rdmsrl(this_cpu(ler_msr) + 1, to); printk("ler: %016lx -> %016lx\n", from, to); } } void vcpu_show_registers(const struct vcpu *v) { const struct cpu_user_regs *regs = &v->arch.user_regs; unsigned long crs[8]; /* Only handle PV guests for now */ if ( !is_pv_vcpu(v) ) return; crs[0] = v->arch.pv_vcpu.ctrlreg[0]; crs[2] = arch_get_cr2(v); crs[3] = pagetable_get_paddr(guest_kernel_mode(v, regs) ? v->arch.guest_table : v->arch.guest_table_user); crs[4] = v->arch.pv_vcpu.ctrlreg[4]; _show_registers(regs, crs, CTXT_pv_guest, v); } void show_page_walk(unsigned long addr) { unsigned long pfn, mfn = read_cr3() >> PAGE_SHIFT; l4_pgentry_t l4e, *l4t; l3_pgentry_t l3e, *l3t; l2_pgentry_t l2e, *l2t; l1_pgentry_t l1e, *l1t; printk("Pagetable walk from %016lx:\n", addr); if ( !is_canonical_address(addr) ) return; l4t = map_domain_page(mfn); l4e = l4t[l4_table_offset(addr)]; unmap_domain_page(l4t); mfn = l4e_get_pfn(l4e); pfn = mfn_valid(mfn) && machine_to_phys_mapping_valid ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY; printk(" L4[0x%03lx] = %"PRIpte" %016lx\n", l4_table_offset(addr), l4e_get_intpte(l4e), pfn); if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) || !mfn_valid(mfn) ) return; l3t = map_domain_page(mfn); l3e = l3t[l3_table_offset(addr)]; unmap_domain_page(l3t); mfn = l3e_get_pfn(l3e); pfn = mfn_valid(mfn) && machine_to_phys_mapping_valid ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY; printk(" L3[0x%03lx] = %"PRIpte" %016lx%s\n", l3_table_offset(addr), l3e_get_intpte(l3e), pfn, (l3e_get_flags(l3e) & _PAGE_PSE) ? " (PSE)" : ""); if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_flags(l3e) & _PAGE_PSE) || !mfn_valid(mfn) ) return; l2t = map_domain_page(mfn); l2e = l2t[l2_table_offset(addr)]; unmap_domain_page(l2t); mfn = l2e_get_pfn(l2e); pfn = mfn_valid(mfn) && machine_to_phys_mapping_valid ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY; printk(" L2[0x%03lx] = %"PRIpte" %016lx %s\n", l2_table_offset(addr), l2e_get_intpte(l2e), pfn, (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : ""); if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_flags(l2e) & _PAGE_PSE) || !mfn_valid(mfn) ) return; l1t = map_domain_page(mfn); l1e = l1t[l1_table_offset(addr)]; unmap_domain_page(l1t); mfn = l1e_get_pfn(l1e); pfn = mfn_valid(mfn) && machine_to_phys_mapping_valid ? get_gpfn_from_mfn(mfn) : INVALID_M2P_ENTRY; printk(" L1[0x%03lx] = %"PRIpte" %016lx\n", l1_table_offset(addr), l1e_get_intpte(l1e), pfn); } void double_fault(void); void do_double_fault(struct cpu_user_regs *regs) { unsigned int cpu; unsigned long crs[8]; console_force_unlock(); asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) ); /* Find information saved during fault and dump it to the console. */ printk("*** DOUBLE FAULT ***\n"); print_xen_info(); crs[0] = read_cr0(); crs[2] = read_cr2(); crs[3] = read_cr3(); crs[4] = read_cr4(); regs->ds = read_segment_register(ds); regs->es = read_segment_register(es); regs->fs = read_segment_register(fs); regs->gs = read_segment_register(gs); printk("CPU: %d\n", cpu); _show_registers(regs, crs, CTXT_hypervisor, NULL); show_stack_overflow(cpu, regs); panic("DOUBLE FAULT -- system shutdown"); } void toggle_guest_mode(struct vcpu *v) { if ( is_pv_32bit_vcpu(v) ) return; if ( cpu_has_fsgsbase ) { if ( v->arch.flags & TF_kernel_mode ) v->arch.pv_vcpu.gs_base_kernel = __rdgsbase(); else v->arch.pv_vcpu.gs_base_user = __rdgsbase(); } v->arch.flags ^= TF_kernel_mode; asm volatile ( "swapgs" ); update_cr3(v); #ifdef USER_MAPPINGS_ARE_GLOBAL /* Don't flush user global mappings from the TLB. 
Don't tick TLB clock. */ asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" ); #else write_ptbase(v); #endif if ( !(v->arch.flags & TF_kernel_mode) ) return; if ( v->arch.pv_vcpu.need_update_runstate_area && update_runstate_area(v) ) v->arch.pv_vcpu.need_update_runstate_area = 0; if ( v->arch.pv_vcpu.pending_system_time.version && update_secondary_system_time(v, &v->arch.pv_vcpu.pending_system_time) ) v->arch.pv_vcpu.pending_system_time.version = 0; } unsigned long do_iret(void) { struct cpu_user_regs *regs = guest_cpu_user_regs(); struct iret_context iret_saved; struct vcpu *v = current; if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp, sizeof(iret_saved))) ) { gdprintk(XENLOG_ERR, "Fault while reading IRET context from " "guest stack\n"); goto exit_and_crash; } /* Returning to user mode? */ if ( (iret_saved.cs & 3) == 3 ) { if ( unlikely(pagetable_is_null(v->arch.guest_table_user)) ) { gdprintk(XENLOG_ERR, "Guest switching to user mode with no " "user page tables\n"); goto exit_and_crash; } toggle_guest_mode(v); } regs->rip = iret_saved.rip; regs->cs = iret_saved.cs | 3; /* force guest privilege */ regs->rflags = ((iret_saved.rflags & ~(X86_EFLAGS_IOPL|X86_EFLAGS_VM)) | X86_EFLAGS_IF); regs->rsp = iret_saved.rsp; regs->ss = iret_saved.ss | 3; /* force guest privilege */ if ( !(iret_saved.flags & VGCF_in_syscall) ) { regs->entry_vector &= ~TRAP_syscall; regs->r11 = iret_saved.r11; regs->rcx = iret_saved.rcx; } /* Restore upcall mask from supplied EFLAGS.IF. */ vcpu_info(v, evtchn_upcall_mask) = !(iret_saved.rflags & X86_EFLAGS_IF); async_exception_cleanup(v); /* Saved %rax gets written back to regs->rax in entry.S. */ return iret_saved.rax; exit_and_crash: gdprintk(XENLOG_ERR, "Fatal error\n"); domain_crash(v->domain); return 0; } static int write_stack_trampoline( char *stack, char *stack_bottom, uint16_t cs_seg) { /* movq %rsp, saversp(%rip) */ stack[0] = 0x48; stack[1] = 0x89; stack[2] = 0x25; *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16; /* leaq saversp(%rip), %rsp */ stack[7] = 0x48; stack[8] = 0x8d; stack[9] = 0x25; *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16; /* pushq %r11 */ stack[14] = 0x41; stack[15] = 0x53; /* pushq $ */ stack[16] = 0x68; *(u32 *)&stack[17] = cs_seg; /* movq $syscall_enter,%r11 */ stack[21] = 0x49; stack[22] = 0xbb; *(void **)&stack[23] = (void *)syscall_enter; /* jmpq *%r11 */ stack[31] = 0x41; stack[32] = 0xff; stack[33] = 0xe3; return 34; } void __devinit subarch_percpu_traps_init(void) { char *stack_bottom, *stack; int cpu = smp_processor_id(); if ( cpu == 0 ) { /* Specify dedicated interrupt stacks for NMI, #DF, and #MC. */ set_intr_gate(TRAP_double_fault, &double_fault); set_ist(&idt_table[TRAP_double_fault], IST_DF); set_ist(&idt_table[TRAP_nmi], IST_NMI); set_ist(&idt_table[TRAP_machine_check], IST_MCE); /* * The 32-on-64 hypercall entry vector is only accessible from ring 1. * Also note that this is a trap gate, not an interrupt gate. */ _set_gate(idt_table+HYPERCALL_VECTOR, 15, 1, &compat_hypercall); /* Fast trap for int80 (faster than taking the #GP-fixup path). */ _set_gate(idt_table+0x80, 15, 3, &int80_direct_trap); } stack_bottom = (char *)get_stack_bottom(); stack = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1)); /* IST_MAX IST pages + 1 syscall page + 1 guard page + primary stack. */ BUILD_BUG_ON((IST_MAX + 2) * PAGE_SIZE + PRIMARY_STACK_SIZE > STACK_SIZE); /* Machine Check handler has its own per-CPU 4kB stack. 
*/ this_cpu(init_tss).ist[IST_MCE-1] = (unsigned long)&stack[IST_MCE * PAGE_SIZE]; /* Double-fault handler has its own per-CPU 4kB stack. */ this_cpu(init_tss).ist[IST_DF-1] = (unsigned long)&stack[IST_DF * PAGE_SIZE]; /* NMI handler has its own per-CPU 4kB stack. */ this_cpu(init_tss).ist[IST_NMI-1] = (unsigned long)&stack[IST_NMI * PAGE_SIZE]; /* Trampoline for SYSCALL entry from long mode. */ stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */ wrmsrl(MSR_LSTAR, (unsigned long)stack); stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64); if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR ) { /* SYSENTER entry. */ wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom); wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry); wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0); } /* Trampoline for SYSCALL entry from compatibility mode. */ stack = (char *)L1_CACHE_ALIGN((unsigned long)stack); wrmsrl(MSR_CSTAR, (unsigned long)stack); stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32); /* Common SYSCALL parameters. */ wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS); wrmsr(MSR_SYSCALL_MASK, X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT| X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_TF, 0U); } void init_int80_direct_trap(struct vcpu *v) { struct trap_info *ti = &v->arch.pv_vcpu.trap_ctxt[0x80]; struct trap_bounce *tb = &v->arch.pv_vcpu.int80_bounce; tb->flags = TBF_EXCEPTION; tb->cs = ti->cs; tb->eip = ti->address; if ( null_trap_bounce(v, tb) ) tb->flags = 0; } static long register_guest_callback(struct callback_register *reg) { long ret = 0; struct vcpu *v = current; if ( !is_canonical_address(reg->address) ) return -EINVAL; switch ( reg->type ) { case CALLBACKTYPE_event: v->arch.pv_vcpu.event_callback_eip = reg->address; break; case CALLBACKTYPE_failsafe: v->arch.pv_vcpu.failsafe_callback_eip = reg->address; if ( reg->flags & CALLBACKF_mask_events ) set_bit(_VGCF_failsafe_disables_events, &v->arch.vgc_flags); else clear_bit(_VGCF_failsafe_disables_events, &v->arch.vgc_flags); break; case CALLBACKTYPE_syscall: v->arch.pv_vcpu.syscall_callback_eip = reg->address; if ( reg->flags & CALLBACKF_mask_events ) set_bit(_VGCF_syscall_disables_events, &v->arch.vgc_flags); else clear_bit(_VGCF_syscall_disables_events, &v->arch.vgc_flags); break; case CALLBACKTYPE_syscall32: v->arch.pv_vcpu.syscall32_callback_eip = reg->address; v->arch.pv_vcpu.syscall32_disables_events = !!(reg->flags & CALLBACKF_mask_events); break; case CALLBACKTYPE_sysenter: v->arch.pv_vcpu.sysenter_callback_eip = reg->address; v->arch.pv_vcpu.sysenter_disables_events = !!(reg->flags & CALLBACKF_mask_events); break; case CALLBACKTYPE_nmi: ret = register_guest_nmi_callback(reg->address); break; default: ret = -ENOSYS; break; } return ret; } static long unregister_guest_callback(struct callback_unregister *unreg) { long ret; switch ( unreg->type ) { case CALLBACKTYPE_event: case CALLBACKTYPE_failsafe: case CALLBACKTYPE_syscall: case CALLBACKTYPE_syscall32: case CALLBACKTYPE_sysenter: ret = -EINVAL; break; case CALLBACKTYPE_nmi: ret = unregister_guest_nmi_callback(); break; default: ret = -ENOSYS; break; } return ret; } long do_callback_op(int cmd, XEN_GUEST_HANDLE_PARAM(const_void) arg) { long ret; switch ( cmd ) { case CALLBACKOP_register: { struct callback_register reg; ret = -EFAULT; if ( copy_from_guest(®, arg, 1) ) break; ret = register_guest_callback(®); } break; case CALLBACKOP_unregister: { struct callback_unregister unreg; 
ret = -EFAULT; if ( copy_from_guest(&unreg, arg, 1) ) break; ret = unregister_guest_callback(&unreg); } break; default: ret = -ENOSYS; break; } return ret; } long do_set_callbacks(unsigned long event_address, unsigned long failsafe_address, unsigned long syscall_address) { struct callback_register event = { .type = CALLBACKTYPE_event, .address = event_address, }; struct callback_register failsafe = { .type = CALLBACKTYPE_failsafe, .address = failsafe_address, }; struct callback_register syscall = { .type = CALLBACKTYPE_syscall, .address = syscall_address, }; register_guest_callback(&event); register_guest_callback(&failsafe); register_guest_callback(&syscall); return 0; } static void hypercall_page_initialise_ring3_kernel(void *hypercall_page) { char *p; int i; /* Fill in all the transfer points with template machine code. */ for ( i = 0; i < (PAGE_SIZE / 32); i++ ) { if ( i == __HYPERVISOR_iret ) continue; p = (char *)(hypercall_page + (i * 32)); *(u8 *)(p+ 0) = 0x51; /* push %rcx */ *(u16 *)(p+ 1) = 0x5341; /* push %r11 */ *(u8 *)(p+ 3) = 0xb8; /* mov $<i>,%eax */ *(u32 *)(p+ 4) = i; *(u16 *)(p+ 8) = 0x050f; /* syscall */ *(u16 *)(p+10) = 0x5b41; /* pop %r11 */ *(u8 *)(p+12) = 0x59; /* pop %rcx */ *(u8 *)(p+13) = 0xc3; /* ret */ } /* * HYPERVISOR_iret is special because it doesn't return and expects a * special stack frame. Guests jump to this transfer point instead of * calling it. */ p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32)); *(u8 *)(p+ 0) = 0x51; /* push %rcx */ *(u16 *)(p+ 1) = 0x5341; /* push %r11 */ *(u8 *)(p+ 3) = 0x50; /* push %rax */ *(u8 *)(p+ 4) = 0xb8; /* mov $__HYPERVISOR_iret,%eax */ *(u32 *)(p+ 5) = __HYPERVISOR_iret; *(u16 *)(p+ 9) = 0x050f; /* syscall */ } #include "compat/traps.c" void hypercall_page_initialise(struct domain *d, void *hypercall_page) { memset(hypercall_page, 0xCC, PAGE_SIZE); if ( has_hvm_container_domain(d) ) hvm_hypercall_page_initialise(d, hypercall_page); else if ( !is_pv_32bit_domain(d) ) hypercall_page_initialise_ring3_kernel(hypercall_page); else hypercall_page_initialise_ring1_kernel(hypercall_page); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/x86_64/mmconfig_64.c0000664000175000017500000001442612307313555016020 0ustar smbsmb/* * mmconfig.c - Low-level direct PCI config space access via MMCONFIG * * This is a 64-bit optimized version that always keeps the full mmconfig * space mapped. This allows lockless config space operation.
* * copied from Linux */ #include #include #include #include #include #include #include #include #include #include "mmconfig.h" /* Static virtual mapping of the MMCONFIG aperture */ struct mmcfg_virt { struct acpi_mcfg_allocation *cfg; char __iomem *virt; }; static struct mmcfg_virt *pci_mmcfg_virt; static unsigned int mmcfg_pci_segment_shift; static char __iomem *get_virt(unsigned int seg, unsigned int *bus) { struct acpi_mcfg_allocation *cfg; int cfg_num; for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { cfg = pci_mmcfg_virt[cfg_num].cfg; if (cfg->pci_segment == seg && (cfg->start_bus_number <= *bus) && (cfg->end_bus_number >= *bus)) { *bus -= cfg->start_bus_number; return pci_mmcfg_virt[cfg_num].virt; } } /* Fall back to type 0 */ return NULL; } static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) { char __iomem *addr; addr = get_virt(seg, &bus); if (!addr) return NULL; return addr + ((bus << 20) | (devfn << 12)); } int pci_mmcfg_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { char __iomem *addr; /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) { err: *value = -1; return -EINVAL; } addr = pci_dev_base(seg, bus, devfn); if (!addr) goto err; switch (len) { case 1: *value = mmio_config_readb(addr + reg); break; case 2: *value = mmio_config_readw(addr + reg); break; case 4: *value = mmio_config_readl(addr + reg); break; } return 0; } int pci_mmcfg_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value) { char __iomem *addr; /* Why do we have this when nobody checks it. How about a BUG()!? -AK */ if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) return -EINVAL; addr = pci_dev_base(seg, bus, devfn); if (!addr) return -EINVAL; switch (len) { case 1: mmio_config_writeb(addr + reg, value); break; case 2: mmio_config_writew(addr + reg, value); break; case 4: mmio_config_writel(addr + reg, value); break; } return 0; } static void __iomem *mcfg_ioremap(const struct acpi_mcfg_allocation *cfg, unsigned long idx, unsigned int prot) { unsigned long virt, size; virt = PCI_MCFG_VIRT_START + (idx << mmcfg_pci_segment_shift) + (cfg->start_bus_number << 20); size = (cfg->end_bus_number - cfg->start_bus_number + 1) << 20; if (virt + size < virt || virt + size > PCI_MCFG_VIRT_END) return NULL; if (map_pages_to_xen(virt, (cfg->address >> PAGE_SHIFT) + (cfg->start_bus_number << (20 - PAGE_SHIFT)), size >> PAGE_SHIFT, prot)) return NULL; return (void __iomem *) virt; } void arch_pci_ro_device(int seg, int bdf) { unsigned int idx, bus = PCI_BUS(bdf); for (idx = 0; idx < pci_mmcfg_config_num; ++idx) { const struct acpi_mcfg_allocation *cfg = pci_mmcfg_virt[idx].cfg; unsigned long mfn = (cfg->address >> PAGE_SHIFT) + bdf; if (!pci_mmcfg_virt[idx].virt || cfg->pci_segment != seg || cfg->start_bus_number > bus || cfg->end_bus_number < bus) continue; if (rangeset_add_singleton(mmio_ro_ranges, mfn)) printk(XENLOG_ERR "%04x:%02x:%02x.%u: could not mark MCFG (mfn %#lx) read-only\n", cfg->pci_segment, bus, PCI_SLOT(bdf), PCI_FUNC(bdf), mfn); } } int pci_mmcfg_arch_enable(unsigned int idx) { const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg; const unsigned long *ro_map = pci_get_ro_map(cfg->pci_segment); if (pci_mmcfg_virt[idx].virt) return 0; pci_mmcfg_virt[idx].virt = mcfg_ioremap(cfg, idx, PAGE_HYPERVISOR_NOCACHE); if (!pci_mmcfg_virt[idx].virt) { printk(KERN_ERR "PCI: Cannot map MCFG 
aperture for segment %04x\n", cfg->pci_segment); return -ENOMEM; } printk(KERN_INFO "PCI: Using MCFG for segment %04x bus %02x-%02x\n", cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number); if (ro_map) { unsigned int bdf = PCI_BDF(cfg->start_bus_number, 0, 0); unsigned int end = PCI_BDF(cfg->end_bus_number, -1, -1); while ((bdf = find_next_bit(ro_map, end + 1, bdf)) <= end) { arch_pci_ro_device(cfg->pci_segment, bdf); if (bdf++ == end) break; } } return 0; } void pci_mmcfg_arch_disable(unsigned int idx) { const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg; pci_mmcfg_virt[idx].virt = NULL; /* * Don't use destroy_xen_mappings() here, or make sure that at least * the necessary L4 entries get populated (so that they get properly * propagated to guest domains' page tables). */ mcfg_ioremap(cfg, idx, 0); printk(KERN_WARNING "PCI: Not using MCFG for segment %04x bus %02x-%02x\n", cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number); } int __init pci_mmcfg_arch_init(void) { int i; if (pci_mmcfg_virt) return 0; pci_mmcfg_virt = xzalloc_array(struct mmcfg_virt, pci_mmcfg_config_num); if (pci_mmcfg_virt == NULL) { printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); pci_mmcfg_config_num = 0; return 0; } for (i = 0; i < pci_mmcfg_config_num; ++i) { pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; while (pci_mmcfg_config[i].end_bus_number >> mmcfg_pci_segment_shift) ++mmcfg_pci_segment_shift; } mmcfg_pci_segment_shift += 20; return 1; } xen-4.4.0/xen/arch/x86/x86_64/pci.c0000664000175000017500000000542312307313555014460 0ustar smbsmb/****************************************************************************** * pci.c * * Architecture-dependent PCI access functions. */ #include #include #include #define PCI_CONF_ADDRESS(bus, dev, func, reg) \ (0x80000000 | (bus << 16) | (dev << 11) | (func << 8) | (reg & ~3)) uint8_t pci_conf_read8( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) { u32 value; if ( seg || reg > 255 ) { pci_mmcfg_read(seg, bus, PCI_DEVFN(dev, func), reg, 1, &value); return value; } else { BUG_ON((bus > 255) || (dev > 31) || (func > 7)); return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 3, 1); } } uint16_t pci_conf_read16( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) { u32 value; if ( seg || reg > 255 ) { pci_mmcfg_read(seg, bus, PCI_DEVFN(dev, func), reg, 2, &value); return value; } else { BUG_ON((bus > 255) || (dev > 31) || (func > 7)); return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 2, 2); } } uint32_t pci_conf_read32( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg) { u32 value; if ( seg || reg > 255 ) { pci_mmcfg_read(seg, bus, PCI_DEVFN(dev, func), reg, 4, &value); return value; } else { BUG_ON((bus > 255) || (dev > 31) || (func > 7)); return pci_conf_read(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4); } } void pci_conf_write8( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, uint8_t data) { if ( seg || reg > 255 ) pci_mmcfg_write(seg, bus, PCI_DEVFN(dev, func), reg, 1, data); else { BUG_ON((bus > 255) || (dev > 31) || (func > 7)); pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 3, 1, data); } } void pci_conf_write16( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, uint16_t data) { if ( seg || reg > 255 ) pci_mmcfg_write(seg, bus, PCI_DEVFN(dev, func), reg, 2, data); else { 
BUG_ON((bus > 255) || (dev > 31) || (func > 7)); pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), reg & 2, 2, data); } } void pci_conf_write32( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, uint32_t data) { if ( seg || reg > 255 ) pci_mmcfg_write(seg, bus, PCI_DEVFN(dev, func), reg, 4, data); else { BUG_ON((bus > 255) || (dev > 31) || (func > 7)); pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4, data); } } xen-4.4.0/xen/arch/x86/x86_64/physdev.c0000664000175000017500000000463412307313555015372 0ustar smbsmb/****************************************************************************** * physdev.c */ #include #include #include #include #include #include #include #define do_physdev_op compat_physdev_op #define physdev_apic compat_physdev_apic #define physdev_apic_t physdev_apic_compat_t #define xen_physdev_eoi physdev_eoi CHECK_physdev_eoi; #undef xen_physdev_eoi #define physdev_pirq_eoi_gmfn compat_physdev_pirq_eoi_gmfn #define physdev_pirq_eoi_gmfn_t physdev_pirq_eoi_gmfn_compat_t #define physdev_set_iobitmap compat_physdev_set_iobitmap #define physdev_set_iobitmap_t physdev_set_iobitmap_compat_t #define xen_physdev_set_iopl physdev_set_iopl CHECK_physdev_set_iopl; #undef xen_physdev_set_iopl #define xen_physdev_irq physdev_irq CHECK_physdev_irq; #undef xen_physdev_irq #define xen_physdev_irq_status_query physdev_irq_status_query CHECK_physdev_irq_status_query; #undef xen_physdev_irq_status_query #define physdev_map_pirq_t physdev_map_pirq_compat_t #define xen_physdev_unmap_pirq physdev_unmap_pirq CHECK_physdev_unmap_pirq; #undef xen_physdev_unmap_pirq #define xen_physdev_manage_pci physdev_manage_pci CHECK_physdev_manage_pci; #undef xen_physdev_manage_pci #define xen_physdev_manage_pci_ext physdev_manage_pci_ext CHECK_physdev_manage_pci_ext; #undef xen_physdev_manage_pci_ext #define xen_physdev_restore_msi physdev_restore_msi CHECK_physdev_restore_msi; #undef xen_physdev_restore_msi #define xen_physdev_setup_gsi physdev_setup_gsi CHECK_physdev_setup_gsi; #undef xen_physdev_setup_gsi #define xen_physdev_get_free_pirq physdev_get_free_pirq CHECK_physdev_get_free_pirq; #undef xen_physdev_get_free_pirq #define xen_physdev_pci_mmcfg_reserved physdev_pci_mmcfg_reserved CHECK_physdev_pci_mmcfg_reserved; #undef xen_physdev_pci_mmcfg_reserved #define xen_physdev_pci_device_add physdev_pci_device_add CHECK_physdev_pci_device_add #undef xen_physdev_pci_device_add #define xen_physdev_pci_device physdev_pci_device CHECK_physdev_pci_device #undef xen_physdev_pci_device #define COMPAT #undef guest_handle_okay #define guest_handle_okay compat_handle_okay typedef int ret_t; #include "../physdev.c" /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/x86_64/mmconfig-shared.c0000664000175000017500000003060512307313555016750 0ustar smbsmb/* * mmconfig-shared.c - Low-level direct PCI config space access via * MMCONFIG - common code between i386 and x86-64. * * This code does: * - known chipset handling * - ACPI decoding and validation * * Per-architecture code takes care of the mappings and accesses * themselves. 
* * Author: Allen Kay - adapted to xen from Linux */ #include #include #include #include #include #include #include #include #include #include #include #include "mmconfig.h" unsigned int pci_probe = PCI_PROBE_CONF1 | PCI_PROBE_MMCONF; static void __init parse_mmcfg(char *s) { char *ss; do { ss = strchr(s, ','); if ( ss ) *ss = '\0'; if ( !parse_bool(s) ) pci_probe &= ~PCI_PROBE_MMCONF; else if ( !strcmp(s, "amd_fam10") || !strcmp(s, "amd-fam10") ) pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; s = ss + 1; } while ( ss ); } custom_param("mmcfg", parse_mmcfg); static const char __init *pci_mmcfg_e7520(void) { u32 win; win = pci_conf_read16(0, 0, 0, 0, 0xce); win = win & 0xf000; if(win == 0x0000 || win == 0xf000) pci_mmcfg_config_num = 0; else { pci_mmcfg_config_num = 1; pci_mmcfg_config = xzalloc(struct acpi_mcfg_allocation); if (!pci_mmcfg_config) return NULL; pci_mmcfg_config[0].address = win << 16; pci_mmcfg_config[0].pci_segment = 0; pci_mmcfg_config[0].start_bus_number = 0; pci_mmcfg_config[0].end_bus_number = 255; } return "Intel Corporation E7520 Memory Controller Hub"; } static const char __init *pci_mmcfg_intel_945(void) { u32 pciexbar, mask = 0, len = 0; pci_mmcfg_config_num = 1; pciexbar = pci_conf_read32(0, 0, 0, 0, 0x48); /* Enable bit */ if (!(pciexbar & 1)) pci_mmcfg_config_num = 0; /* Size bits */ switch ((pciexbar >> 1) & 3) { case 0: mask = 0xf0000000U; len = 0x10000000U; break; case 1: mask = 0xf8000000U; len = 0x08000000U; break; case 2: mask = 0xfc000000U; len = 0x04000000U; break; default: pci_mmcfg_config_num = 0; } /* Errata #2, things break when not aligned on a 256Mb boundary */ /* Can only happen in 64M/128M mode */ if ((pciexbar & mask) & 0x0fffffffU) pci_mmcfg_config_num = 0; /* Don't hit the APIC registers and their friends */ if ((pciexbar & mask) >= 0xf0000000U) pci_mmcfg_config_num = 0; if (pci_mmcfg_config_num) { pci_mmcfg_config = xzalloc(struct acpi_mcfg_allocation); if (!pci_mmcfg_config) return NULL; pci_mmcfg_config[0].address = pciexbar & mask; pci_mmcfg_config[0].pci_segment = 0; pci_mmcfg_config[0].start_bus_number = 0; pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1; } return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; } static const char __init *pci_mmcfg_amd_fam10h(void) { uint32_t address; uint64_t base, msr_content; int i; unsigned segnbits = 0, busnbits; if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) return NULL; address = MSR_FAM10H_MMIO_CONF_BASE; if (rdmsr_safe(address, msr_content)) return NULL; /* mmconfig is not enable */ if (!(msr_content & FAM10H_MMIO_CONF_ENABLE)) return NULL; base = msr_content & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & FAM10H_MMIO_CONF_BUSRANGE_MASK; /* * only handle bus 0 ? 
* need to skip it */ if (!busnbits) return NULL; if (busnbits > 8) { segnbits = busnbits - 8; busnbits = 8; } pci_mmcfg_config_num = (1 << segnbits); pci_mmcfg_config = xmalloc_array(struct acpi_mcfg_allocation, pci_mmcfg_config_num); if (!pci_mmcfg_config) return NULL; for (i = 0; i < (1 << segnbits); i++) { pci_mmcfg_config[i].address = base + ((unsigned long)i << 28); pci_mmcfg_config[i].pci_segment = i; pci_mmcfg_config[i].start_bus_number = 0; pci_mmcfg_config[i].end_bus_number = (1 << busnbits) - 1; pci_add_segment(i); } return "AMD Family 10h NB"; } static const char __init *pci_mmcfg_nvidia_mcp55(void) { static bool_t __initdata mcp55_checked; int bus, i; static const u32 extcfg_regnum = 0x90; static const u32 extcfg_enable_mask = 1<<31; static const u32 extcfg_start_mask = 0xff<<16; static const int extcfg_start_shift = 16; static const u32 extcfg_size_mask = 0x3<<28; static const int extcfg_size_shift = 28; static const int extcfg_sizebus[] = {0xff, 0x7f, 0x3f, 0x1f}; static const u32 extcfg_base_mask[] = {0x7ff8, 0x7ffc, 0x7ffe, 0x7fff}; static const int extcfg_base_lshift = 25; /* check if amd fam10h already took over */ if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked) return NULL; mcp55_checked = 1; for (i = bus = 0; bus < 256; bus++) { u32 l, extcfg; u16 vendor, device; l = pci_conf_read32(0, bus, 0, 0, 0); vendor = l & 0xffff; device = (l >> 16) & 0xffff; if (PCI_VENDOR_ID_NVIDIA != vendor || 0x0369 != device) continue; extcfg = pci_conf_read32(0, bus, 0, 0, extcfg_regnum); if (extcfg & extcfg_enable_mask) i++; } if (!i) return NULL; pci_mmcfg_config_num = i; pci_mmcfg_config = xmalloc_array(struct acpi_mcfg_allocation, pci_mmcfg_config_num); for (i = bus = 0; bus < 256; bus++) { u64 base; u32 l, extcfg; u16 vendor, device; int size_index; l = pci_conf_read32(0, bus, 0, 0, 0); vendor = l & 0xffff; device = (l >> 16) & 0xffff; if (PCI_VENDOR_ID_NVIDIA != vendor || 0x0369 != device) continue; extcfg = pci_conf_read32(0, bus, 0, 0, extcfg_regnum); if (!(extcfg & extcfg_enable_mask)) continue; if (i >= pci_mmcfg_config_num) break; size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift; base = extcfg & extcfg_base_mask[size_index]; /* base could be > 4G */ pci_mmcfg_config[i].address = base << extcfg_base_lshift; pci_mmcfg_config[i].pci_segment = 0; pci_mmcfg_config[i].start_bus_number = (extcfg & extcfg_start_mask) >> extcfg_start_shift; pci_mmcfg_config[i].end_bus_number = pci_mmcfg_config[i].start_bus_number + extcfg_sizebus[size_index]; i++; } if (bus == 256) return "nVidia MCP55"; pci_mmcfg_config_num = 0; xfree(pci_mmcfg_config); pci_mmcfg_config = NULL; return NULL; } struct pci_mmcfg_hostbridge_probe { u32 bus; u32 devfn; u32 vendor; u32 device; const char *(*probe)(void); }; static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 }, { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 }, { 0, PCI_DEVFN(0x18, 0), PCI_VENDOR_ID_AMD, 0x1200, pci_mmcfg_amd_fam10h }, { 0xff, PCI_DEVFN(0, 0), PCI_VENDOR_ID_AMD, 0x1200, pci_mmcfg_amd_fam10h }, { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_NVIDIA, 0x0369, pci_mmcfg_nvidia_mcp55 }, }; static int __init pci_mmcfg_check_hostbridge(void) { u32 l; u32 bus, devfn; u16 vendor, device; int i; const char *name; pci_mmcfg_config_num = 0; pci_mmcfg_config = NULL; name = NULL; for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) { bus = pci_mmcfg_probes[i].bus; devfn = 
pci_mmcfg_probes[i].devfn; l = pci_conf_read32(0, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), 0); vendor = l & 0xffff; device = (l >> 16) & 0xffff; if (pci_mmcfg_probes[i].vendor == vendor && pci_mmcfg_probes[i].device == device) name = pci_mmcfg_probes[i].probe(); } if (name) { printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n", name, pci_mmcfg_config_num ? "with" : "without"); } return name != NULL; } static int __init is_mmconf_reserved( u64 addr, u64 size, int i, typeof(pci_mmcfg_config[0]) *cfg) { u64 old_size = size; int valid = 0; while (!e820_all_mapped(addr, addr + size - 1, E820_RESERVED)) { size >>= 1; if (size < (16UL<<20)) break; } if (size >= (16UL<<20) || size == old_size) { printk(KERN_NOTICE "PCI: MCFG area at %lx reserved in E820\n", addr); valid = 1; if (old_size != size) { /* update end_bus_number */ cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1); printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", i, (unsigned long)cfg->address, cfg->pci_segment, (unsigned int)cfg->start_bus_number, (unsigned int)cfg->end_bus_number); } } return valid; } static bool_t __init pci_mmcfg_reject_broken(void) { typeof(pci_mmcfg_config[0]) *cfg; int i; bool_t valid = 1; if ((pci_mmcfg_config_num == 0) || (pci_mmcfg_config == NULL) || (pci_mmcfg_config[0].address == 0)) return 0; cfg = &pci_mmcfg_config[0]; for (i = 0; i < pci_mmcfg_config_num; i++) { u64 addr, size; cfg = &pci_mmcfg_config[i]; addr = cfg->start_bus_number; addr <<= 20; addr += cfg->address; size = cfg->end_bus_number + 1 - cfg->start_bus_number; size <<= 20; printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " "segment %04x buses %02x - %02x\n", i, (unsigned long)cfg->address, cfg->pci_segment, (unsigned int)cfg->start_bus_number, (unsigned int)cfg->end_bus_number); if (!is_mmconf_reserved(addr, size, i, cfg) || pci_mmcfg_arch_enable(i)) { pci_mmcfg_arch_disable(i); valid = 0; } } return valid; } void __init acpi_mmcfg_init(void) { bool_t valid = 1; /* MMCONFIG disabled */ if ((pci_probe & PCI_PROBE_MMCONF) == 0) return; /* MMCONFIG already enabled */ if (!(pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF)) return; if (pci_mmcfg_check_hostbridge()) { unsigned int i; pci_mmcfg_arch_init(); for (i = 0; i < pci_mmcfg_config_num; ++i) if (pci_mmcfg_arch_enable(i)) valid = 0; } else { acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); pci_mmcfg_arch_init(); valid = pci_mmcfg_reject_broken(); } if ((pci_mmcfg_config_num == 0) || (pci_mmcfg_config == NULL) || (pci_mmcfg_config[0].address == 0)) return; if (valid) pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; } int pci_mmcfg_reserved(uint64_t address, unsigned int segment, unsigned int start_bus, unsigned int end_bus, unsigned int flags) { unsigned int i; if (flags & ~XEN_PCI_MMCFG_RESERVED) return -EINVAL; for (i = 0; i < pci_mmcfg_config_num; ++i) { const typeof(pci_mmcfg_config[0]) *cfg = &pci_mmcfg_config[i]; if (cfg->pci_segment == segment && cfg->start_bus_number == start_bus && cfg->end_bus_number == end_bus) { if (cfg->address != address) { printk(KERN_WARNING "Base address presented for segment %04x bus %02x-%02x" " (%08" PRIx64 ") does not match previously obtained" " one (%08" PRIx64 ")\n", segment, start_bus, end_bus, address, cfg->address); return -EIO; } if (flags & XEN_PCI_MMCFG_RESERVED) return pci_mmcfg_arch_enable(i); pci_mmcfg_arch_disable(i); return 0; } } return -ENODEV; } xen-4.4.0/xen/arch/x86/x86_64/platform_hypercall.c0000664000175000017500000000243112307313555017570 0ustar 
smbsmb/****************************************************************************** * platform_hypercall.c */ #include #include #include DEFINE_XEN_GUEST_HANDLE(compat_platform_op_t); #define xen_platform_op compat_platform_op #define xen_platform_op_t compat_platform_op_t #define do_platform_op(x) compat_platform_op(_##x) #define efi_get_info efi_compat_get_info #define efi_runtime_call(x) efi_compat_runtime_call(x) #define xen_processor_performance compat_processor_performance #define set_px_pminfo compat_set_px_pminfo #define xen_processor_power compat_processor_power #define set_cx_pminfo compat_set_cx_pminfo #define xen_pf_pcpuinfo xenpf_pcpuinfo CHECK_pf_pcpuinfo; #undef xen_pf_pcpuinfo #define xen_pf_pcpu_version xenpf_pcpu_version CHECK_pf_pcpu_version; #undef xen_pf_pcpu_version #define xen_pf_enter_acpi_sleep xenpf_enter_acpi_sleep CHECK_pf_enter_acpi_sleep; #undef xen_pf_enter_acpi_sleep #define COMPAT #define _XEN_GUEST_HANDLE(t) XEN_GUEST_HANDLE(t) #define _XEN_GUEST_HANDLE_PARAM(t) XEN_GUEST_HANDLE_PARAM(t) typedef int ret_t; #include "../platform_hypercall.c" /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/crash.c0000664000175000017500000001544112307313555014050 0ustar smbsmb/****************************************************************************** * crash.c * * Based heavily on arch/i386/kernel/crash.c from Linux 2.6.16 * * Xen port written by: * - Simon 'Horms' Horman * - Magnus Damm */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static cpumask_t waiting_to_crash; static unsigned int crashing_cpu; static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done); /* This becomes the NMI handler for non-crashing CPUs, when Xen is crashing. */ void __attribute__((noreturn)) do_nmi_crash(struct cpu_user_regs *regs) { int cpu = smp_processor_id(); /* nmi_shootdown_cpus() should ensure that this assertion is correct. */ ASSERT(cpu != crashing_cpu); /* Save crash information and shut down CPU. Attempt only once. */ if ( !this_cpu(crash_save_done) ) { /* Disable the interrupt stack table for the MCE handler. This * prevents race conditions between clearing MCIP and receving a * new MCE, during which the exception frame would be clobbered * and the MCE handler fall into an infinite loop. We are soon * going to disable the NMI watchdog, so the loop would not be * caught. * * We do not need to change the NMI IST, as the nmi_crash * handler is immue to corrupt exception frames, by virtue of * being designed never to return. * * This update is safe from a security point of view, as this * pcpu is never going to try to sysret back to a PV vcpu. */ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); kexec_crash_save_cpu(); __stop_this_cpu(); this_cpu(crash_save_done) = 1; cpumask_clear_cpu(cpu, &waiting_to_crash); } /* Poor mans self_nmi(). __stop_this_cpu() has reverted the LAPIC * back to its boot state, so we are unable to rely on the regular * apic_* functions, due to 'x2apic_enabled' being possibly wrong. * (The likely scenario is that we have reverted from x2apic mode to * xapic, at which point #GPFs will occur if we use the apic_* * functions) * * The ICR and APIC ID of the LAPIC are still valid even during * software disable (Intel SDM Vol 3, 10.4.7.2). 
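* Note that the ICR write below targets our own APIC ID, i.e. this is a self-NMI.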
As a result, we * can deliberately queue up another NMI at the LAPIC which will not * be delivered as the hardware NMI latch is currently in effect. * This means that if NMIs become unlatched (e.g. following a * non-fatal MCE), the LAPIC will force us back here rather than * wandering back into regular Xen code. */ switch ( current_local_apic_mode() ) { u32 apic_id; case APIC_MODE_X2APIC: apic_id = apic_rdmsr(APIC_ID); apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL | ((u64)apic_id << 32)); break; case APIC_MODE_XAPIC: apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID)); while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY ) cpu_relax(); apic_mem_write(APIC_ICR2, apic_id << 24); apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL); break; default: break; } for ( ; ; ) halt(); } static void nmi_shootdown_cpus(void) { unsigned long msecs; int i, cpu = smp_processor_id(); disable_lapic_nmi_watchdog(); local_irq_disable(); crashing_cpu = cpu; local_irq_count(crashing_cpu) = 0; cpumask_andnot(&waiting_to_crash, &cpu_online_map, cpumask_of(cpu)); /* Change NMI trap handlers. Non-crashing pcpus get nmi_crash which * invokes do_nmi_crash (above), which cause them to write state and * fall into a loop. The crashing pcpu gets the nop handler to * cause it to return to this function ASAP. */ for ( i = 0; i < nr_cpu_ids; i++ ) { if ( idt_tables[i] == NULL ) continue; if ( i == cpu ) { /* * Disable the interrupt stack tables for this cpu's MCE and NMI * handlers, and alter the NMI handler to have no operation. * Disabling the stack tables prevents stack corruption race * conditions, while changing the handler helps prevent cascading * faults; we are certainly going to crash by this point. * * This update is safe from a security point of view, as this pcpu * is never going to try to sysret back to a PV vcpu. */ _set_gate_lower(&idt_tables[i][TRAP_nmi], 14, 0, &trap_nop); set_ist(&idt_tables[i][TRAP_machine_check], IST_NONE); } else { /* Do not update stack table for other pcpus. */ _update_gate_addr_lower(&idt_tables[i][TRAP_nmi], &nmi_crash); } } /* Ensure the new callback function is set before sending out the NMI. 
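The wmb() below makes the IDT updates above visible to the other CPUs before the IPI goes out.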
*/ wmb(); smp_send_nmi_allbutself(); msecs = 1000; /* Wait at most a second for the other cpus to stop */ while ( !cpumask_empty(&waiting_to_crash) && msecs ) { mdelay(1); msecs--; } /* Leave a hint of how well we did trying to shoot down the other cpus */ if ( cpumask_empty(&waiting_to_crash) ) printk("Shot down all CPUs\n"); else { cpulist_scnprintf(keyhandler_scratch, sizeof keyhandler_scratch, &waiting_to_crash); printk("Failed to shoot down CPUs {%s}\n", keyhandler_scratch); } /* Crash shutdown any IOMMU functionality as the crashdump kernel is not * happy when booting if interrupt/dma remapping is still enabled */ iommu_crash_shutdown(); __stop_this_cpu(); /* This is a bit of a hack due to the problems with the x2apic_enabled * variable, but we can't do any better without a significant refactoring * of the APIC code */ x2apic_enabled = (current_local_apic_mode() == APIC_MODE_X2APIC); disable_IO_APIC(); hpet_disable(); } void machine_crash_shutdown(void) { crash_xen_info_t *info; nmi_shootdown_cpus(); info = kexec_crash_save_info(); info->xen_phys_start = xen_phys_start; info->dom0_pfn_to_mfn_frame_list_list = arch_get_pfn_to_mfn_frame_list_list(dom0); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/e820.c0000664000175000017500000005211712307313555013427 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #include /* * opt_mem: Limit maximum address of physical RAM. * Any RAM beyond this address limit is ignored. */ static unsigned long long __initdata opt_mem; size_param("mem", opt_mem); /* * opt_availmem: Limit maximum usable amount of physical RAM. * Any RAM beyond this limited amount is ignored. */ static unsigned long long __initdata opt_availmem; size_param("availmem", opt_availmem); /* opt_nomtrr_check: Don't clip ram to highest cacheable MTRR. */ static s8 __initdata e820_mtrr_clip = -1; boolean_param("e820-mtrr-clip", e820_mtrr_clip); /* opt_e820_verbose: Be verbose about clipping, the original e820, &c */ static bool_t __initdata e820_verbose; boolean_param("e820-verbose", e820_verbose); struct e820map e820; /* * This function checks if the entire range is mapped with type. * * Note: this function only works correct if the e820 table is sorted and * not-overlapping, which is the case */ int __init e820_all_mapped(u64 start, u64 end, unsigned type) { int i; for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; if (type && ei->type != type) continue; /* is the region (part) in overlap with the current region ?*/ if (ei->addr >= end || ei->addr + ei->size <= start) continue; /* if the region is at the beginning of we move * start to the end of the region since it's ok until there */ if (ei->addr <= start) start = ei->addr + ei->size; /* * if start is now at or beyond end, we're done, full * coverage */ if (start >= end) return 1; } return 0; } static void __init add_memory_region(unsigned long long start, unsigned long long size, int type) { int x; /*if (!efi_enabled)*/ { x = e820.nr_map; if (x == E820MAX) { printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); return; } e820.map[x].addr = start; e820.map[x].size = size; e820.map[x].type = type; e820.nr_map++; } } /* add_memory_region */ static void __init print_e820_memory_map(struct e820entry *map, int entries) { int i; for (i = 0; i < entries; i++) { printk(" %016Lx - %016Lx ", (unsigned long long)(map[i].addr), (unsigned long long)(map[i].addr + map[i].size)); switch (map[i].type) { case E820_RAM: printk("(usable)\n"); break; case E820_RESERVED: printk("(reserved)\n"); break; case E820_ACPI: printk("(ACPI data)\n"); break; case E820_NVS: printk("(ACPI NVS)\n"); break; case E820_UNUSABLE: printk("(unusable)\n"); break; default: printk("type %u\n", map[i].type); break; } } } /* * Sanitize the BIOS e820 map. * * Some e820 responses include overlapping entries. The following * replaces the original e820 map with a new one, removing overlaps. * */ struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ unsigned long long addr; /* address for this change point */ }; static struct change_member change_point_list[2*E820MAX] __initdata; static struct change_member *change_point[2*E820MAX] __initdata; static struct e820entry *overlap_list[E820MAX] __initdata; static struct e820entry new_bios[E820MAX] __initdata; static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) { struct change_member *change_tmp; unsigned long current_type, last_type; unsigned long long last_addr; int chgidx, still_changing; int overlap_entries; int new_bios_entry; int old_nr, new_nr, chg_nr; int i; /* Visually we're performing the following (1,2,3,4 = memory types)... Sample memory map (w/overlaps): ____22__________________ ______________________4_ ____1111________________ _44_____________________ 11111111________________ ____________________33__ ___________44___________ __________33333_________ ______________22________ ___________________2222_ _________111111111______ _____________________11_ _________________4______ Sanitized equivalent (no overlap): 1_______________________ _44_____________________ ___1____________________ ____22__________________ ______11________________ _________1______________ __________3_____________ ___________44___________ _____________33_________ _______________2________ ________________1_______ _________________4______ ___________________2____ ____________________33__ ______________________4_ */ /* if there's only one memory region, don't bother */ if (*pnr_map < 2) return -1; old_nr = *pnr_map; /* bail out if we find any unreasonable addresses in bios map */ for (i=0; iaddr = biosmap[i].addr; change_point[chgidx++]->pbios = &biosmap[i]; change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; change_point[chgidx++]->pbios = &biosmap[i]; } } chg_nr = chgidx; /* true number of change-points */ /* sort change-point list by memory addresses (low -> high) */ still_changing = 1; while (still_changing) { still_changing = 0; for (i=1; i < chg_nr; i++) { /* if > , swap */ /* or, if current= & last=, swap */ if ((change_point[i]->addr < change_point[i-1]->addr) || ((change_point[i]->addr == change_point[i-1]->addr) && (change_point[i]->addr == change_point[i]->pbios->addr) && (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) ) { change_tmp = change_point[i]; change_point[i] = change_point[i-1]; change_point[i-1] = change_tmp; still_changing=1; } } } /* create a new bios memory map, removing overlaps */ overlap_entries=0; /* number of entries in the overlap table */ new_bios_entry=0; /* index for 
creating new bios map entries */ last_type = 0; /* start with undefined memory type */ last_addr = 0; /* start with 0 as last starting address */ /* loop through change-points, determining affect on the new bios map */ for (chgidx=0; chgidx < chg_nr; chgidx++) { /* keep track of all overlapping bios entries */ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) { /* add map entry to overlap list (> 1 entry implies an overlap) */ overlap_list[overlap_entries++]=change_point[chgidx]->pbios; } else { /* remove entry from list (order independent, so swap with last) */ for (i=0; ipbios) overlap_list[i] = overlap_list[overlap_entries-1]; } overlap_entries--; } /* if there are overlapping entries, decide which "type" to use */ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ current_type = 0; for (i=0; itype > current_type) current_type = overlap_list[i]->type; /* continue building up new bios map based on this information */ if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; /* move forward only if the new size was non-zero */ if (new_bios[new_bios_entry].size != 0) if (++new_bios_entry >= E820MAX) break; /* no more space left for new bios entries */ } if (current_type != 0) { new_bios[new_bios_entry].addr = change_point[chgidx]->addr; new_bios[new_bios_entry].type = current_type; last_addr=change_point[chgidx]->addr; } last_type = current_type; } } new_nr = new_bios_entry; /* retain count for new bios entries */ /* copy new bios mapping into original location */ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); *pnr_map = new_nr; return 0; } /* * Copy the BIOS e820 map into a safe place. * * Sanity-check it while we're at it.. * * If we're lucky and live on a modern system, the setup code * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. * * We check to see that the memory map contains at least 2 elements * before we'll use it, because the detection code in setup.S may * not be perfect and most every PC known to man has two memory * regions: one from 0 to 640k, and one from 1mb up. (The IBM * thinkpad 560x, for example, does not cooperate with the memory * detection code.) */ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) { /* Only one memory region (or negative)? Ignore it */ if (nr_map < 2) return -1; do { unsigned long long start = biosmap->addr; unsigned long long size = biosmap->size; unsigned long long end = start + size; unsigned long type = biosmap->type; /* Overflow in 64 bits? Ignore the memory map. */ if (start > end) return -1; /* * Some BIOSes claim RAM in the 640k - 1M region. * Not right. Fix it up. */ if (type == E820_RAM) { if (start < 0x100000ULL && end > 0xA0000ULL) { if (start < 0xA0000ULL) add_memory_region(start, 0xA0000ULL-start, type); if (end <= 0x100000ULL) continue; start = 0x100000ULL; size = end - start; } } add_memory_region(start, size, type); } while (biosmap++,--nr_map); return 0; } /* * Find the highest page frame number we have available */ static unsigned long __init find_max_pfn(void) { int i; unsigned long max_pfn = 0; #if 0 if (efi_enabled) { efi_memmap_walk(efi_find_max_pfn, &max_pfn); return; } #endif for (i = 0; i < e820.nr_map; i++) { unsigned long start, end; /* RAM? 
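Only E820_RAM entries contribute to max_pfn; all other types are skipped.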
*/ if (e820.map[i].type != E820_RAM) continue; start = PFN_UP(e820.map[i].addr); end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); if (start >= end) continue; if (end > max_pfn) max_pfn = end; } return max_pfn; } static void __init clip_to_limit(uint64_t limit, char *warnmsg) { int i; char _warnmsg[160]; uint64_t old_limit = 0; for ( ; ; ) { /* Find a RAM region needing clipping. */ for ( i = 0; i < e820.nr_map; i++ ) if ( (e820.map[i].type == E820_RAM) && ((e820.map[i].addr + e820.map[i].size) > limit) ) break; /* If none found, we are done. */ if ( i == e820.nr_map ) break; old_limit = max_t( uint64_t, old_limit, e820.map[i].addr + e820.map[i].size); /* We try to convert clipped RAM areas to E820_UNUSABLE. */ if ( e820_change_range_type(&e820, max(e820.map[i].addr, limit), e820.map[i].addr + e820.map[i].size, E820_RAM, E820_UNUSABLE) ) continue; /* * If the type change fails (e.g., not space in table) then we clip or * delete the region as appropriate. */ if ( e820.map[i].addr < limit ) { e820.map[i].size = limit - e820.map[i].addr; } else { memmove(&e820.map[i], &e820.map[i+1], (e820.nr_map - i - 1) * sizeof(struct e820entry)); e820.nr_map--; } } if ( old_limit ) { if ( warnmsg ) { snprintf(_warnmsg, sizeof(_warnmsg), warnmsg, (long)(limit>>30)); printk("WARNING: %s\n", _warnmsg); } printk("Truncating RAM from %lukB to %lukB\n", (unsigned long)(old_limit >> 10), (unsigned long)(limit >> 10)); } } /* Conservative estimate of top-of-RAM by looking for MTRR WB regions. */ #define MSR_MTRRphysBase(reg) (0x200 + 2 * (reg)) #define MSR_MTRRphysMask(reg) (0x200 + 2 * (reg) + 1) static uint64_t __init mtrr_top_of_ram(void) { uint32_t eax, ebx, ecx, edx; uint64_t mtrr_cap, mtrr_def, addr_mask, base, mask, top; unsigned int i, phys_bits = 36; /* By default we check only Intel systems. */ if ( e820_mtrr_clip == -1 ) { char vendor[13]; cpuid(0x00000000, &eax, (uint32_t *)&vendor[0], (uint32_t *)&vendor[8], (uint32_t *)&vendor[4]); vendor[12] = '\0'; e820_mtrr_clip = !strcmp(vendor, "GenuineIntel"); } if ( !e820_mtrr_clip ) return 0; if ( e820_verbose ) printk("Checking MTRR ranges...\n"); /* Does the CPU support architectural MTRRs? */ cpuid(0x00000001, &eax, &ebx, &ecx, &edx); if ( !test_bit(X86_FEATURE_MTRR & 31, &edx) ) return 0; /* Find the physical address size for this CPU. */ cpuid(0x80000000, &eax, &ebx, &ecx, &edx); if ( eax >= 0x80000008 ) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); phys_bits = (uint8_t)eax; } addr_mask = ((1ull << phys_bits) - 1) & ~((1ull << 12) - 1); rdmsrl(MSR_MTRRcap, mtrr_cap); rdmsrl(MSR_MTRRdefType, mtrr_def); if ( e820_verbose ) printk(" MTRR cap: %"PRIx64" type: %"PRIx64"\n", mtrr_cap, mtrr_def); /* MTRRs enabled, and default memory type is not writeback? */ if ( !test_bit(11, &mtrr_def) || ((uint8_t)mtrr_def == MTRR_TYPE_WRBACK) ) return 0; /* * Find end of highest WB-type range. This is a conservative estimate * of the highest WB address since overlapping UC/WT ranges dominate. 
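* Each valid (mask bit 11 set) variable-range MTRR of type WB contributes (base | ~mask) + 4k; e.g. base 0x6 with mask 0xf80000800 and 36 physical address bits yields a 2GiB top.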
*/ top = 0; for ( i = 0; i < (uint8_t)mtrr_cap; i++ ) { rdmsrl(MSR_MTRRphysBase(i), base); rdmsrl(MSR_MTRRphysMask(i), mask); if ( e820_verbose ) printk(" MTRR[%d]: base %"PRIx64" mask %"PRIx64"\n", i, base, mask); if ( !test_bit(11, &mask) || ((uint8_t)base != MTRR_TYPE_WRBACK) ) continue; base &= addr_mask; mask &= addr_mask; top = max_t(uint64_t, top, ((base | ~mask) & addr_mask) + PAGE_SIZE); } return top; } static void __init reserve_dmi_region(void) { u32 base, len; if ( (dmi_get_table(&base, &len) == 0) && ((base + len) > base) && reserve_e820_ram(&e820, base, base + len) ) printk("WARNING: DMI table located in E820 RAM %08x-%08x. Fixed.\n", base, base+len); } static void __init machine_specific_memory_setup( struct e820entry *raw, int *raw_nr) { unsigned long mpt_limit, ro_mpt_limit; uint64_t top_of_ram, size; int i; char nr = (char)*raw_nr; sanitize_e820_map(raw, &nr); *raw_nr = nr; (void)copy_e820_map(raw, nr); if ( opt_mem ) clip_to_limit(opt_mem, NULL); if ( opt_availmem ) { for ( i = size = 0; (i < e820.nr_map) && (size <= opt_availmem); i++ ) if ( e820.map[i].type == E820_RAM ) size += e820.map[i].size; if ( size > opt_availmem ) clip_to_limit( e820.map[i-1].addr + e820.map[i-1].size - (size-opt_availmem), NULL); } mpt_limit = ((RDWR_MPT_VIRT_END - RDWR_MPT_VIRT_START) / sizeof(unsigned long)) << PAGE_SHIFT; ro_mpt_limit = ((RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(unsigned long)) << PAGE_SHIFT; if ( mpt_limit > ro_mpt_limit ) mpt_limit = ro_mpt_limit; clip_to_limit(mpt_limit, "Only the first %lu GB of the physical " "memory map can be accessed by Xen."); reserve_dmi_region(); top_of_ram = mtrr_top_of_ram(); if ( top_of_ram ) clip_to_limit(top_of_ram, "MTRRs do not cover all of memory."); } /* This function relies on the passed in e820->map[] being sorted. */ int __init e820_add_range( struct e820map *e820, uint64_t s, uint64_t e, uint32_t type) { unsigned int i; for ( i = 0; i < e820->nr_map; ++i ) { uint64_t rs = e820->map[i].addr; uint64_t re = rs + e820->map[i].size; if ( rs == e && e820->map[i].type == type ) { e820->map[i].addr = s; return 1; } if ( re == s && e820->map[i].type == type && (i + 1 == e820->nr_map || e820->map[i + 1].addr >= e) ) { e820->map[i].size += e - s; return 1; } if ( rs >= e ) break; if ( re > s ) return 0; } if ( e820->nr_map >= ARRAY_SIZE(e820->map) ) { printk(XENLOG_WARNING "E820: overflow while adding region" " %"PRIx64"-%"PRIx64"\n", s, e); return 0; } memmove(e820->map + i + 1, e820->map + i, (e820->nr_map - i) * sizeof(*e820->map)); e820->nr_map++; e820->map[i].addr = s; e820->map[i].size = e - s; e820->map[i].type = type; return 1; } int __init e820_change_range_type( struct e820map *e820, uint64_t s, uint64_t e, uint32_t orig_type, uint32_t new_type) { uint64_t rs = 0, re = 0; int i; for ( i = 0; i < e820->nr_map; i++ ) { /* Have we found the e820 region that includes the specified range? 
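A match requires s >= rs && e <= re, i.e. the range must lie wholly within a single entry.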
*/ rs = e820->map[i].addr; re = rs + e820->map[i].size; if ( (s >= rs) && (e <= re) ) break; } if ( (i == e820->nr_map) || (e820->map[i].type != orig_type) ) return 0; if ( (s == rs) && (e == re) ) { e820->map[i].type = new_type; } else if ( (s == rs) || (e == re) ) { if ( (e820->nr_map + 1) > ARRAY_SIZE(e820->map) ) goto overflow; memmove(&e820->map[i+1], &e820->map[i], (e820->nr_map-i) * sizeof(e820->map[0])); e820->nr_map++; if ( s == rs ) { e820->map[i].size = e - s; e820->map[i].type = new_type; e820->map[i+1].addr = e; e820->map[i+1].size = re - e; } else { e820->map[i].size = s - rs; e820->map[i+1].addr = s; e820->map[i+1].size = e - s; e820->map[i+1].type = new_type; } } else { if ( (e820->nr_map + 2) > ARRAY_SIZE(e820->map) ) goto overflow; memmove(&e820->map[i+2], &e820->map[i], (e820->nr_map-i) * sizeof(e820->map[0])); e820->nr_map += 2; e820->map[i].size = s - rs; e820->map[i+1].addr = s; e820->map[i+1].size = e - s; e820->map[i+1].type = new_type; e820->map[i+2].addr = e; e820->map[i+2].size = re - e; } /* Finally, look for any opportunities to merge adjacent e820 entries. */ for ( i = 0; i < (e820->nr_map - 1); i++ ) { if ( (e820->map[i].type != e820->map[i+1].type) || ((e820->map[i].addr + e820->map[i].size) != e820->map[i+1].addr) ) continue; e820->map[i].size += e820->map[i+1].size; memmove(&e820->map[i+1], &e820->map[i+2], (e820->nr_map-i-2) * sizeof(e820->map[0])); e820->nr_map--; i--; } return 1; overflow: printk("Overflow in e820 while reserving region %"PRIx64"-%"PRIx64"\n", s, e); return 0; } /* Set E820_RAM area (@s,@e) as RESERVED in specified e820 map. */ int __init reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e) { return e820_change_range_type(e820, s, e, E820_RAM, E820_RESERVED); } unsigned long __init init_e820( const char *str, struct e820entry *raw, int *raw_nr) { if ( e820_verbose ) { printk("Initial %s RAM map:\n", str); print_e820_memory_map(raw, *raw_nr); } machine_specific_memory_setup(raw, raw_nr); printk("%s RAM map:\n", str); print_e820_memory_map(e820.map, e820.nr_map); return find_max_pfn(); } xen-4.4.0/xen/arch/x86/x86_emulate.c0000664000175000017500000000120312307313555015100 0ustar smbsmb/****************************************************************************** * x86_emulate.c * * Wrapper for generic x86 instruction decoder and emulator. * * Copyright (c) 2008, Citrix Systems, Inc. * * Authors: * Keir Fraser */ #include #include /* mark_regs_dirty() */ #include /* current_cpu_info */ #include /* cpu_has_amd_erratum() */ /* Avoid namespace pollution. 
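Xen's cmpxchg and cpuid macros would otherwise mangle the generic emulator's own uses of those names.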
*/ #undef cmpxchg #undef cpuid #define cpu_has_amd_erratum(nr) \ cpu_has_amd_erratum(¤t_cpu_data, AMD_ERRATUM_##nr) #include "x86_emulate/x86_emulate.c" xen-4.4.0/xen/arch/x86/extable.c0000664000175000017500000000453212307313555014373 0ustar smbsmb #include #include #include #include #include #include #define EX_FIELD(ptr, field) ((unsigned long)&(ptr)->field + (ptr)->field) static inline unsigned long ex_addr(const struct exception_table_entry *x) { return EX_FIELD(x, addr); } static inline unsigned long ex_cont(const struct exception_table_entry *x) { return EX_FIELD(x, cont); } static int __init cmp_ex(const void *a, const void *b) { const struct exception_table_entry *l = a, *r = b; unsigned long lip = ex_addr(l); unsigned long rip = ex_addr(r); /* avoid overflow */ if (lip > rip) return 1; if (lip < rip) return -1; return 0; } #ifndef swap_ex static void __init swap_ex(void *a, void *b, int size) { struct exception_table_entry *l = a, *r = b, tmp; long delta = b - a; tmp = *l; l->addr = r->addr + delta; l->cont = r->cont + delta; r->addr = tmp.addr - delta; r->cont = tmp.cont - delta; } #endif void __init sort_exception_tables(void) { sort(__start___ex_table, __stop___ex_table - __start___ex_table, sizeof(struct exception_table_entry), cmp_ex, swap_ex); sort(__start___pre_ex_table, __stop___pre_ex_table - __start___pre_ex_table, sizeof(struct exception_table_entry), cmp_ex, swap_ex); } static inline unsigned long search_one_table(const struct exception_table_entry *first, const struct exception_table_entry *last, unsigned long value) { const struct exception_table_entry *mid; long diff; while ( first <= last ) { mid = (last - first) / 2 + first; diff = ex_addr(mid) - value; if (diff == 0) return ex_cont(mid); else if (diff < 0) first = mid+1; else last = mid-1; } return 0; } unsigned long search_exception_table(unsigned long addr) { return search_one_table( __start___ex_table, __stop___ex_table-1, addr); } unsigned long search_pre_exception_table(struct cpu_user_regs *regs) { unsigned long addr = (unsigned long)regs->eip; unsigned long fixup = search_one_table( __start___pre_ex_table, __stop___pre_ex_table-1, addr); if ( fixup ) { dprintk(XENLOG_INFO, "Pre-exception: %p -> %p\n", _p(addr), _p(fixup)); perfc_incr(exception_fixed); } return fixup; } xen-4.4.0/xen/arch/x86/mpparse.c0000664000175000017500000006626112307313555014425 0ustar smbsmb/* * Intel Multiprocessor Specification 1.1 and 1.4 * compliant MP-table parsing routines. * * (c) 1995 Alan Cox, Building #3 * (c) 1998, 1999, 2000 Ingo Molnar * * Fixes * Erich Boleyn : MP v1.4 and additional changes. * Alan Cox : Added EBDA scanning * Ingo Molnar : various cleanups and rewrites * Maciej W. Rozycki: Bits for default MP configurations * Paul Diefenbaugh: Added full ACPI support */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Have we found an MP table */ bool_t __initdata smp_found_config; /* * Various Linux-internal data structures created from the * MP-table. 
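* apic_version[] is indexed by APIC ID, mp_bus_id_to_type[] by MP bus id, and mp_ioapics[] / mp_irqs[] hold the parsed I/O APIC and interrupt source entries.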
*/ unsigned char __read_mostly apic_version[MAX_APICS]; unsigned char __read_mostly mp_bus_id_to_type[MAX_MP_BUSSES]; /* I/O APIC entries */ struct mpc_config_ioapic __read_mostly mp_ioapics[MAX_IO_APICS]; /* # of MP IRQ source entries */ struct mpc_config_intsrc __read_mostly mp_irqs[MAX_IRQ_SOURCES]; /* MP IRQ source entries */ int __read_mostly mp_irq_entries; bool_t __read_mostly pic_mode; bool_t __read_mostly def_to_bigsmp = 0; unsigned long __read_mostly mp_lapic_addr; /* Processor that is doing the boot up */ unsigned int __read_mostly boot_cpu_physical_apicid = BAD_APICID; /* Internal processor count */ static unsigned int __devinitdata num_processors; static unsigned int __initdata disabled_cpus; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; void __init set_nr_cpu_ids(unsigned int max_cpus) { if (!max_cpus) max_cpus = num_processors + disabled_cpus; if (max_cpus > NR_CPUS) max_cpus = NR_CPUS; else if (!max_cpus) max_cpus = 1; printk(XENLOG_INFO "SMP: Allowing %u CPUs (%d hotplug CPUs)\n", max_cpus, max_t(int, max_cpus - num_processors, 0)); nr_cpu_ids = max_cpus; #ifndef nr_cpumask_bits nr_cpumask_bits = (max_cpus + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); printk(XENLOG_DEBUG "NR_CPUS:%u nr_cpumask_bits:%u\n", NR_CPUS, nr_cpumask_bits); #endif } /* * Intel MP BIOS table parsing routines: */ /* * Checksum an MP configuration block. */ static int __init mpf_checksum(unsigned char *mp, int len) { int sum = 0; while (len--) sum += *mp++; return sum & 0xFF; } /* Return xen's logical cpu_id of the new added cpu or <0 if error */ static int __devinit MP_processor_info_x(struct mpc_config_processor *m, u32 apicidx, bool_t hotplug) { int ver, apicid, cpu = 0; if (!(m->mpc_cpuflag & CPU_ENABLED)) { if (!hotplug) ++disabled_cpus; return -EINVAL; } apicid = mpc_apic_id(m, apicidx); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { Dprintk(" Bootup CPU\n"); boot_cpu_physical_apicid = apicid; } ver = m->mpc_apicver; /* * Validate version */ if (ver == 0x0) { printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " "fixing up to 0x10. (tell your hw vendor)\n", apicid); ver = 0x10; } apic_version[apicid] = ver; set_apicid(apicid, &phys_cpu_present_map); if (num_processors >= nr_cpu_ids) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %u reached." " Processor ignored.\n", nr_cpu_ids); return -ENOSPC; } if (num_processors >= 8 && hotplug && genapic == &apic_default) { printk(KERN_WARNING "WARNING: CPUs limit of 8 reached." " Processor ignored.\n"); return -ENOSPC; } /* Boot cpu has been marked present in smp_prepare_boot_cpu */ if (!(m->mpc_cpuflag & CPU_BOOTPROCESSOR)) { cpu = alloc_cpu_id(); if (cpu < 0) { printk(KERN_WARNING "WARNING: Can't alloc cpu_id." " Processor with apicid %i ignored\n", apicid); return cpu; } x86_cpu_to_apicid[cpu] = apicid; cpumask_set_cpu(cpu, &cpu_present_map); } if (++num_processors > 8) { /* * No need for processor or APIC checks: physical delivery * (bigsmp) mode should always work. */ def_to_bigsmp = 1; } return cpu; } static int __devinit MP_processor_info(struct mpc_config_processor *m) { return MP_processor_info_x(m, m->mpc_apicid, 0); } static void __init MP_bus_info (struct mpc_config_bus *m) { char str[7]; memcpy(str, m->mpc_bustype, 6); str[6] = 0; #if 0 /* size of mpc_busid (8 bits) makes this check unnecessary */ if (m->mpc_busid >= MAX_MP_BUSSES) { printk(KERN_WARNING "MP table busid value (%d) for bustype %s " " is too large, max. 
supported is %d\n", m->mpc_busid, str, MAX_MP_BUSSES - 1); return; } #endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; } else { printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); } } static void __init MP_ioapic_info (struct mpc_config_ioapic *m) { if (!(m->mpc_flags & MPC_APIC_USABLE)) return; printk(KERN_INFO "I/O APIC #%d Version %d at %#x.\n", m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", MAX_IO_APICS, nr_ioapics); panic("Recompile kernel with bigger MAX_IO_APICS"); } if (!m->mpc_apicaddr) { printk(KERN_ERR "WARNING: bogus zero I/O APIC address" " found in MP table, skipping!\n"); return; } mp_ioapics[nr_ioapics] = *m; nr_ioapics++; } static void __init MP_intsrc_info (struct mpc_config_intsrc *m) { mp_irqs [mp_irq_entries] = *m; Dprintk("Int: type %d, pol %d, trig %d, bus %d," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded"); } static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) { Dprintk("Lint: type %d, pol %d, trig %d, bus %d," " IRQ %02x, APIC ID %x, APIC LINT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); /* * Well it seems all SMP boards in existence * use ExtINT/LVT1 == LINT0 and * NMI/LVT2 == LINT1 - the following check * will show us if this assumptions is false. * Until then we do not have to add baggage. */ if ((m->mpc_irqtype == mp_ExtINT) && (m->mpc_destapiclint != 0)) BUG(); if ((m->mpc_irqtype == mp_NMI) && (m->mpc_destapiclint != 1)) BUG(); } /* * Read/parse the MPC */ static int __init smp_read_mpc(struct mp_config_table *mpc) { char str[16]; char oem[10]; int count=sizeof(*mpc); unsigned char *mpt=((unsigned char *)mpc)+count; if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { printk(KERN_ERR "SMP mptable: bad signature [%#x]!\n", *(u32 *)mpc->mpc_signature); return 0; } if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { printk(KERN_ERR "SMP mptable: checksum error!\n"); return 0; } if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", mpc->mpc_spec); return 0; } if (!mpc->mpc_lapic) { printk(KERN_ERR "SMP mptable: null local APIC address!\n"); return 0; } memcpy(oem,mpc->mpc_oem,8); oem[8]=0; printk(KERN_INFO "OEM ID: %s ",oem); memcpy(str,mpc->mpc_productid,12); str[12]=0; printk("Product ID: %s ",str); mps_oem_check(mpc, oem, str); printk("APIC at: %#x\n", mpc->mpc_lapic); /* * Save the local APIC address (it might be non-default) -- but only * if we're not using ACPI. */ if (!acpi_lapic) mp_lapic_addr = mpc->mpc_lapic; /* * Now process the configuration blocks. 
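* Each entry starts with a type byte; the switch below handles MP_PROCESSOR, MP_BUS, MP_IOAPIC, MP_INTSRC and MP_LINTSRC, and an unknown type ends the walk.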
*/ while (count < mpc->mpc_length) { switch(*mpt) { case MP_PROCESSOR: { struct mpc_config_processor *m= (struct mpc_config_processor *)mpt; /* ACPI may have already provided this data */ if (!acpi_lapic) MP_processor_info(m); mpt += sizeof(*m); count += sizeof(*m); break; } case MP_BUS: { struct mpc_config_bus *m= (struct mpc_config_bus *)mpt; MP_bus_info(m); mpt += sizeof(*m); count += sizeof(*m); break; } case MP_IOAPIC: { struct mpc_config_ioapic *m= (struct mpc_config_ioapic *)mpt; MP_ioapic_info(m); mpt+=sizeof(*m); count+=sizeof(*m); break; } case MP_INTSRC: { struct mpc_config_intsrc *m= (struct mpc_config_intsrc *)mpt; MP_intsrc_info(m); mpt+=sizeof(*m); count+=sizeof(*m); break; } case MP_LINTSRC: { struct mpc_config_lintsrc *m= (struct mpc_config_lintsrc *)mpt; MP_lintsrc_info(m); mpt+=sizeof(*m); count+=sizeof(*m); break; } default: { count = mpc->mpc_length; break; } } } clustered_apic_check(); if (!num_processors) printk(KERN_ERR "SMP mptable: no processors registered!\n"); return num_processors; } static int __init ELCR_trigger(unsigned int irq) { unsigned int port; port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } static void __init construct_default_ioirq_mptable(int mpc_default_type) { struct mpc_config_intsrc intsrc; int i; int ELCR_fallback = 0; intsrc.mpc_type = MP_INTSRC; intsrc.mpc_irqflag = 0; /* conforming */ intsrc.mpc_srcbus = 0; intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; intsrc.mpc_irqtype = mp_INT; /* * If true, we have an ISA/PCI system with no IRQ entries * in the MP table. To prevent the PCI interrupts from being set up * incorrectly, we try to use the ELCR. The sanity check to see if * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can * never be level sensitive, so we simply see if the ELCR agrees. * If it does, we assume it's valid. */ if (mpc_default_type == 5) { printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n"); else { printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); ELCR_fallback = 1; } } for (i = 0; platform_legacy_irq(i); i++) { switch (mpc_default_type) { case 2: if (i == 0 || i == 13) continue; /* IRQ0 & IRQ13 not connected */ /* fall through */ default: if (i == 2) continue; /* IRQ2 is never connected */ } if (ELCR_fallback) { /* * If the ELCR indicates a level-sensitive interrupt, we * copy that information over to the MP table in the * irqflag field (level sensitive, active high polarity). */ if (ELCR_trigger(i)) intsrc.mpc_irqflag = 13; else intsrc.mpc_irqflag = 0; } intsrc.mpc_srcbusirq = i; intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ MP_intsrc_info(&intsrc); } intsrc.mpc_irqtype = mp_ExtINT; intsrc.mpc_srcbusirq = 0; intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ MP_intsrc_info(&intsrc); } static inline void __init construct_default_ISA_mptable(int mpc_default_type) { struct mpc_config_processor processor; struct mpc_config_bus bus; struct mpc_config_ioapic ioapic; struct mpc_config_lintsrc lintsrc; int linttypes[2] = { mp_ExtINT, mp_NMI }; int i; /* * local APIC has default address */ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* * 2 CPUs, numbered 0 & 1. */ processor.mpc_type = MP_PROCESSOR; /* Either an integrated APIC or a discrete 82489DX. */ processor.mpc_apicver = mpc_default_type > 4 ? 
0x10 : 0x01; processor.mpc_cpuflag = CPU_ENABLED; processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; for (i = 0; i < 2; i++) { processor.mpc_apicid = i; MP_processor_info(&processor); } bus.mpc_type = MP_BUS; bus.mpc_busid = 0; switch (mpc_default_type) { default: printk("???\n"); printk(KERN_ERR "Unknown standard configuration %d\n", mpc_default_type); /* fall through */ case 1: case 5: memcpy(bus.mpc_bustype, "ISA ", 6); break; case 2: case 6: case 3: memcpy(bus.mpc_bustype, "EISA ", 6); break; case 4: case 7: memcpy(bus.mpc_bustype, "MCA ", 6); } MP_bus_info(&bus); if (mpc_default_type > 4) { bus.mpc_busid = 1; memcpy(bus.mpc_bustype, "PCI ", 6); MP_bus_info(&bus); } ioapic.mpc_type = MP_IOAPIC; ioapic.mpc_apicid = 2; ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; ioapic.mpc_flags = MPC_APIC_USABLE; ioapic.mpc_apicaddr = 0xFEC00000; MP_ioapic_info(&ioapic); /* * We set up most of the low 16 IO-APIC pins according to MPS rules. */ construct_default_ioirq_mptable(mpc_default_type); lintsrc.mpc_type = MP_LINTSRC; lintsrc.mpc_irqflag = 0; /* conforming */ lintsrc.mpc_srcbusid = 0; lintsrc.mpc_srcbusirq = 0; lintsrc.mpc_destapic = MP_APIC_ALL; for (i = 0; i < 2; i++) { lintsrc.mpc_irqtype = linttypes[i]; lintsrc.mpc_destapiclint = i; MP_lintsrc_info(&lintsrc); } } static __init void efi_unmap_mpf(void) { if (efi_enabled) __set_fixmap(FIX_EFI_MPF, 0, 0); } static struct intel_mp_floating *__initdata mpf_found; /* * Scan the memory blocks for an SMP configuration block. */ void __init get_smp_config (void) { struct intel_mp_floating *mpf = mpf_found; /* * ACPI supports both logical (e.g. Hyper-Threading) and physical * processors, where MPS only supports physical. */ if (acpi_lapic && acpi_ioapic) { efi_unmap_mpf(); printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); return; } else if (acpi_lapic) printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); if (mpf->mpf_feature2 & (1<<7)) { printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); pic_mode = 1; } else { printk(KERN_INFO " Virtual Wire compatibility mode.\n"); pic_mode = 0; } /* * Now see if we need to read further. */ if (mpf->mpf_feature1 != 0) { printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); construct_default_ISA_mptable(mpf->mpf_feature1); } else if (mpf->mpf_physptr) { /* * Read the physical hardware table. Anything here will * override the defaults. */ if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) { efi_unmap_mpf(); smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); return; } /* * If there are no explicit MP IRQ entries, then we are * broken. We set up most of the low 16 IO-APIC pins to * ISA defaults and hope it will work. */ if (!mp_irq_entries) { struct mpc_config_bus bus; printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); bus.mpc_type = MP_BUS; bus.mpc_busid = 0; memcpy(bus.mpc_bustype, "ISA ", 6); MP_bus_info(&bus); construct_default_ioirq_mptable(0); } } else BUG(); efi_unmap_mpf(); printk(KERN_INFO "Processors: %d\n", num_processors); /* * Only use the first configuration found. 
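* smp_scan_config()/efi_check_config() latch the first valid floating pointer and scanning stops there, so any later tables are never parsed.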
*/ } static int __init smp_scan_config (unsigned long base, unsigned long length) { unsigned int *bp = maddr_to_virt(base); struct intel_mp_floating *mpf; Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); if (sizeof(*mpf) != 16) printk("Error: MPF size\n"); while (length > 0) { mpf = (struct intel_mp_floating *)bp; if ((*bp == SMP_MAGIC_IDENT) && (mpf->mpf_length == 1) && !mpf_checksum((unsigned char *)bp, 16) && ((mpf->mpf_specification == 1) || (mpf->mpf_specification == 4)) ) { smp_found_config = 1; printk(KERN_INFO "found SMP MP-table at %08lx\n", virt_to_maddr(mpf)); #if 0 reserve_bootmem(virt_to_maddr(mpf), PAGE_SIZE); if (mpf->mpf_physptr) { /* * We cannot access to MPC table to compute * table size yet, as only few megabytes from * the bottom is mapped now. * PC-9800's MPC table places on the very last * of physical memory; so that simply reserving * PAGE_SIZE from mpg->mpf_physptr yields BUG() * in reserve_bootmem. */ unsigned long size = PAGE_SIZE; unsigned long end = max_low_pfn * PAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; reserve_bootmem(mpf->mpf_physptr, size); } #endif mpf_found = mpf; return 1; } bp += 4; length -= 16; } return 0; } static void __init efi_check_config(void) { struct intel_mp_floating *mpf; if (efi.mps == EFI_INVALID_TABLE_ADDR) return; __set_fixmap(FIX_EFI_MPF, PFN_DOWN(efi.mps), __PAGE_HYPERVISOR); mpf = (void *)fix_to_virt(FIX_EFI_MPF) + ((long)efi.mps & (PAGE_SIZE-1)); if (memcmp(mpf->mpf_signature, "_MP_", 4) == 0 && mpf->mpf_length == 1 && mpf_checksum((void *)mpf, 16) && (mpf->mpf_specification == 1 || mpf->mpf_specification == 4)) { smp_found_config = 1; printk(KERN_INFO "SMP MP-table at %08lx\n", efi.mps); mpf_found = mpf; } else efi_unmap_mpf(); } void __init find_smp_config (void) { unsigned int address; if (efi_enabled) { efi_check_config(); return; } /* * FIXME: Linux assumes you have 640K of base ram.. * this continues the error... * * 1) Scan the bottom 1K for a signature * 2) Scan the top 1K of base RAM * 3) Scan the 64K of bios */ if (smp_scan_config(0x0,0x400) || smp_scan_config(639*0x400,0x400) || smp_scan_config(0xF0000,0x10000)) return; /* * If it is an SMP machine we should know now, unless the * configuration is in an EISA/MCA bus machine with an * extended bios data area. * * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E, calculate and scan it here. * * NOTE! There are Linux loaders that will corrupt the EBDA * area, and as such this kind of SMP config may be less * trustworthy, simply because the SMP table may have been * stomped on during early boot. These loaders are buggy and * should be fixed. * * MP1.4 SPEC states to only scan first 1K of 4K EBDA. 
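* Hence only 0x400 bytes are passed to smp_scan_config() below.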
*/ address = get_bios_ebda(); if (address) smp_scan_config(address, 0x400); } /* -------------------------------------------------------------------------- ACPI-based MP Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI void __init mp_register_lapic_address ( u64 address) { if (!x2apic_enabled) { mp_lapic_addr = (unsigned long) address; set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); } if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = get_apic_id(); Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } int __devinit mp_register_lapic ( u32 id, bool_t enabled, bool_t hotplug) { struct mpc_config_processor processor; int boot_cpu = 0; if (MAX_APICS <= id) { printk(KERN_WARNING "Processor #%d invalid (max %d)\n", id, MAX_APICS); return -EINVAL; } if (id == boot_cpu_physical_apicid) boot_cpu = 1; processor.mpc_type = MP_PROCESSOR; processor.mpc_apicid = id; processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; return MP_processor_info_x(&processor, id, hotplug); } void mp_unregister_lapic(uint32_t apic_id, uint32_t cpu) { if (!cpu || (apic_id == boot_cpu_physical_apicid)) return; if (x86_cpu_to_apicid[cpu] != apic_id) return; physid_clear(apic_id, phys_cpu_present_map); x86_cpu_to_apicid[cpu] = BAD_APICID; cpumask_clear_cpu(cpu, &cpu_present_map); } #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 #define MP_MAX_IOAPIC_PIN 127 static struct mp_ioapic_routing { int gsi_base; int gsi_end; unsigned long pin_programmed[BITS_TO_LONGS(MP_MAX_IOAPIC_PIN + 1)]; } mp_ioapic_routing[MAX_IO_APICS]; static int mp_find_ioapic ( int gsi) { unsigned int i; /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { if ((gsi >= mp_ioapic_routing[i].gsi_base) && (gsi <= mp_ioapic_routing[i].gsi_end)) return i; } printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); return -1; } void __init mp_register_ioapic ( u8 id, u32 address, u32 gsi_base) { int idx = 0; int tmpid; if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " "(found %d)\n", MAX_IO_APICS, nr_ioapics); panic("Recompile kernel with bigger MAX_IO_APICS"); } if (!address) { printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" " found in MADT table, skipping!\n"); return; } idx = nr_ioapics++; mp_ioapics[idx].mpc_type = MP_IOAPIC; mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; mp_ioapics[idx].mpc_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) tmpid = io_apic_get_unique_id(idx, id); else tmpid = id; if (tmpid == -1) { nr_ioapics--; return; } mp_ioapics[idx].mpc_apicid = tmpid; mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 
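* Each mp_ioapic_routing[] slot records the [gsi_base, gsi_end] range served by one I/O APIC plus the pin_programmed bitmap consulted by mp_register_gsi().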
*/ mp_ioapic_routing[idx].gsi_base = gsi_base; mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); printk("IOAPIC[%d]: apic_id %d, version %d, address %#x, " "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); return; } unsigned __init highest_gsi(void) { unsigned x, res = 0; for (x = 0; x < nr_ioapics; x++) if (res < mp_ioapic_routing[x].gsi_end) res = mp_ioapic_routing[x].gsi_end; return res; } unsigned apic_gsi_base(int apic) { return mp_ioapic_routing[apic].gsi_base; } void __init mp_override_legacy_irq ( u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { struct mpc_config_intsrc intsrc; int ioapic = -1; int pin = -1; /* * Convert 'gsi' to 'ioapic.pin'. */ ioapic = mp_find_ioapic(gsi); if (ioapic < 0) return; pin = gsi - mp_ioapic_routing[ioapic].gsi_base; /* * TBD: This check is for faulty timer entries, where the override * erroneously sets the trigger to level, resulting in a HUGE * increase of timer interrupts! */ if ((bus_irq == 0) && (trigger == 3)) trigger = 1; intsrc.mpc_type = MP_INTSRC; intsrc.mpc_irqtype = mp_INT; intsrc.mpc_irqflag = (trigger << 2) | polarity; intsrc.mpc_srcbus = MP_ISA_BUS; intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ intsrc.mpc_dstirq = pin; /* INTIN# */ Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded"); return; } void __init mp_config_acpi_legacy_irqs (void) { struct mpc_config_intsrc intsrc; int i = 0; int ioapic = -1; /* * Fabricate the legacy ISA bus (bus #31). */ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); /* * Locate the IOAPIC that manages the ISA IRQs (0-15). */ ioapic = mp_find_ioapic(0); if (ioapic < 0) return; intsrc.mpc_type = MP_INTSRC; intsrc.mpc_irqflag = 0; /* Conforming */ intsrc.mpc_srcbus = MP_ISA_BUS; intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* * Use the default configuration for the IRQs 0-15. Unless * overriden by (MADT) interrupt source override entries. */ for (i = 0; platform_legacy_irq(i); i++) { int idx; for (idx = 0; idx < mp_irq_entries; idx++) { struct mpc_config_intsrc *irq = mp_irqs + idx; /* Do we already have a mapping for this ISA IRQ? 
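An MADT interrupt source override may already have routed this IRQ or claimed the same I/O APIC pin; in that case no identity mapping is added.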
*/ if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) break; /* Do we already have a mapping for this IOAPIC pin */ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && (irq->mpc_dstirq == i)) break; } if (idx != mp_irq_entries) { printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); continue; /* IRQ already used */ } intsrc.mpc_irqtype = mp_INT; intsrc.mpc_srcbusirq = i; /* Identity mapped */ intsrc.mpc_dstirq = i; Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded"); } } int mp_register_gsi (u32 gsi, int triggering, int polarity) { int ioapic; int ioapic_pin; struct irq_desc * desc; unsigned long flags; /* * Mapping between Global System Interrups, which * represent all possible interrupts, and IRQs * assigned to actual devices. */ #ifdef CONFIG_ACPI_BUS /* Don't set up the ACPI SCI because it's already set up */ if (acpi_fadt.sci_int == gsi) return gsi; #endif if (!nr_ioapics) { unsigned int port = 0x4d0 + (gsi >> 3); u8 val; if (!platform_legacy_irq(gsi)) return -EINVAL; val = inb(port); if (triggering) val |= 1 << (gsi & 7); else val &= ~(1 << (gsi & 7)); outb(val, port); return 0; } ioapic = mp_find_ioapic(gsi); if (ioapic < 0) { printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); return -EINVAL; } ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; desc = irq_to_desc(gsi); spin_lock_irqsave(&desc->lock, flags); if (!(desc->status & IRQ_DISABLED) && desc->handler != &no_irq_type) { spin_unlock_irqrestore(&desc->lock, flags); return -EEXIST; } spin_unlock_irqrestore(&desc->lock, flags); /* * Avoid pin reprogramming. PRTs typically include entries * with redundant pin->gsi mappings (but unique PCI devices); * we only program the IOAPIC on the first. 
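* Later requests for an already-programmed pin are rejected with -EEXIST via the pin_programmed bitmap below.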
*/ if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " "%d-%d\n", mp_ioapics[ioapic].mpc_apicid, ioapic_pin); return -EINVAL; } if (test_and_set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", mp_ioapics[ioapic].mpc_apicid, ioapic_pin); return -EEXIST; } return io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, triggering, polarity); } #endif /* CONFIG_X86_IO_APIC */ #endif /* CONFIG_ACPI */ xen-4.4.0/xen/arch/x86/acpi/0000775000175000017500000000000012307313555013513 5ustar smbsmbxen-4.4.0/xen/arch/x86/acpi/Makefile0000664000175000017500000000016712307313555015157 0ustar smbsmbsubdir-y += cpufreq obj-y += lib.o power.o suspend.o cpu_idle.o cpuidle_menu.o obj-bin-y += boot.init.o wakeup_prot.o xen-4.4.0/xen/arch/x86/acpi/wakeup_prot.S0000664000175000017500000000612312307313555016201 0ustar smbsmb .text #include #include #include #include #include #include #include .code64 #define GREG(x) %r##x #define SAVED_GREG(x) saved_r##x(%rip) #define DECLARE_GREG(x) saved_r##x: .quad 0 #define SAVE_GREG(x) movq GREG(x), SAVED_GREG(x) #define LOAD_GREG(x) movq SAVED_GREG(x), GREG(x) #define REF(x) x(%rip) ENTRY(do_suspend_lowlevel) SAVE_GREG(sp) SAVE_GREG(ax) SAVE_GREG(bx) SAVE_GREG(cx) SAVE_GREG(dx) SAVE_GREG(bp) SAVE_GREG(si) SAVE_GREG(di) SAVE_GREG(8) # save r8...r15 SAVE_GREG(9) SAVE_GREG(10) SAVE_GREG(11) SAVE_GREG(12) SAVE_GREG(13) SAVE_GREG(14) SAVE_GREG(15) pushfq; popq SAVED_GREG(flags) mov %cr8, GREG(ax) mov GREG(ax), REF(saved_cr8) mov %ss, REF(saved_ss) sgdt REF(saved_gdt) sidt REF(saved_idt) sldt REF(saved_ldt) mov %cr0, GREG(ax) mov GREG(ax), REF(saved_cr0) mov %cr3, GREG(ax) mov GREG(ax), REF(saved_cr3) call save_rest_processor_state mov $3, %rdi xor %eax, %eax /* enter sleep state physically */ call acpi_enter_sleep_state jmp __ret_point ENTRY(__ret_point) /* mmu_cr4_features contains latest cr4 setting */ mov REF(mmu_cr4_features), GREG(ax) mov GREG(ax), %cr4 mov REF(saved_cr3), GREG(ax) mov GREG(ax), %cr3 mov REF(saved_cr0), GREG(ax) mov GREG(ax), %cr0 lgdt REF(saved_gdt) lidt REF(saved_idt) lldt REF(saved_ldt) mov REF(saved_ss), %ss LOAD_GREG(sp) /* Reload code selector */ pushq $(__HYPERVISOR_CS64) leaq 1f(%rip),%rax pushq %rax lretq 1: mov REF(saved_cr8), %rax mov %rax, %cr8 pushq SAVED_GREG(flags) popfq call restore_rest_processor_state LOAD_GREG(bp) LOAD_GREG(ax) LOAD_GREG(bx) LOAD_GREG(cx) LOAD_GREG(dx) LOAD_GREG(si) LOAD_GREG(di) LOAD_GREG(8) # save r8...r15 LOAD_GREG(9) LOAD_GREG(10) LOAD_GREG(11) LOAD_GREG(12) LOAD_GREG(13) LOAD_GREG(14) LOAD_GREG(15) ret .data .align 16 GLOBAL(saved_magic) .long 0x9abcdef0 saved_ss: .word 0 .align 8 DECLARE_GREG(sp) DECLARE_GREG(bp) DECLARE_GREG(ax) DECLARE_GREG(bx) DECLARE_GREG(cx) DECLARE_GREG(dx) DECLARE_GREG(si) DECLARE_GREG(di) DECLARE_GREG(flags) DECLARE_GREG(8) DECLARE_GREG(9) DECLARE_GREG(10) DECLARE_GREG(11) DECLARE_GREG(12) DECLARE_GREG(13) DECLARE_GREG(14) DECLARE_GREG(15) saved_gdt: .quad 0,0 saved_idt: .quad 0,0 saved_ldt: .quad 0,0 saved_cr0: .quad 0 saved_cr3: .quad 0 saved_cr8: .quad 0 xen-4.4.0/xen/arch/x86/acpi/suspend.c0000664000175000017500000000605012307313555015341 0ustar smbsmb/* * Portions are: * Copyright (c) 2002 Pavel Machek * Copyright (c) 2001 Patrick Mochel */ #include #include #include #include #include #include #include #include #include #include #include static unsigned long saved_lstar, saved_cstar; static unsigned long saved_sysenter_esp, saved_sysenter_eip; static unsigned long saved_fs_base, 
saved_gs_base, saved_kernel_gs_base; static uint16_t saved_segs[4]; static uint64_t saved_xcr0; void save_rest_processor_state(void) { vcpu_save_fpu(current); asm volatile ( "movw %%ds,(%0); movw %%es,2(%0); movw %%fs,4(%0); movw %%gs,6(%0)" : : "r" (saved_segs) : "memory" ); saved_fs_base = rdfsbase(); saved_gs_base = rdgsbase(); rdmsrl(MSR_SHADOW_GS_BASE, saved_kernel_gs_base); rdmsrl(MSR_CSTAR, saved_cstar); rdmsrl(MSR_LSTAR, saved_lstar); if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR ) { rdmsrl(MSR_IA32_SYSENTER_ESP, saved_sysenter_esp); rdmsrl(MSR_IA32_SYSENTER_EIP, saved_sysenter_eip); } if ( cpu_has_xsave ) saved_xcr0 = get_xcr0(); } void restore_rest_processor_state(void) { struct vcpu *curr = current; load_TR(); /* Recover syscall MSRs */ wrmsrl(MSR_LSTAR, saved_lstar); wrmsrl(MSR_CSTAR, saved_cstar); wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS); wrmsr(MSR_SYSCALL_MASK, X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT| X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_TF, 0U); wrfsbase(saved_fs_base); wrgsbase(saved_gs_base); wrmsrl(MSR_SHADOW_GS_BASE, saved_kernel_gs_base); if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR ) { /* Recover sysenter MSRs */ wrmsrl(MSR_IA32_SYSENTER_ESP, saved_sysenter_esp); wrmsrl(MSR_IA32_SYSENTER_EIP, saved_sysenter_eip); wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0); } if ( !is_idle_vcpu(curr) ) { asm volatile ( "movw (%0),%%ds; movw 2(%0),%%es; movw 4(%0),%%fs" : : "r" (saved_segs) : "memory" ); do_set_segment_base(SEGBASE_GS_USER_SEL, saved_segs[3]); } if ( cpu_has_xsave && !set_xcr0(saved_xcr0) ) BUG(); /* Maybe load the debug registers. */ BUG_ON(!is_pv_vcpu(curr)); if ( !is_idle_vcpu(curr) && curr->arch.debugreg[7] ) { write_debugreg(0, curr->arch.debugreg[0]); write_debugreg(1, curr->arch.debugreg[1]); write_debugreg(2, curr->arch.debugreg[2]); write_debugreg(3, curr->arch.debugreg[3]); write_debugreg(6, curr->arch.debugreg[6]); write_debugreg(7, curr->arch.debugreg[7]); } /* Reload FPU state on next FPU use. */ stts(); if (cpu_has_pat) wrmsrl(MSR_IA32_CR_PAT, host_pat); mtrr_bp_restore(); } xen-4.4.0/xen/arch/x86/acpi/boot.c0000664000175000017500000004746712307313555014644 0ustar smbsmb/* * boot.c - Architecture-Specific Low-Level ACPI Boot Support * * Copyright (C) 2001, 2002 Paul Diefenbaugh * Copyright (C) 2001 Jun Nakajima * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CONFIG_HPET_TIMER #include /* for hpet_address */ #endif #include #include #define BAD_MADT_ENTRY(entry, end) ( \ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ ((struct acpi_subtable_header *)entry)->length != sizeof(*entry)) #define PREFIX "ACPI: " bool_t __initdata acpi_noirq; /* skip ACPI IRQ initialization */ bool_t __initdata acpi_ht = 1; /* enable HT */ bool_t __initdata acpi_lapic; bool_t __initdata acpi_ioapic; bool_t acpi_skip_timer_override __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif /* -------------------------------------------------------------------------- Boot-time Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_X86_LOCAL_APIC static int __init acpi_parse_madt(struct acpi_table_header *table) { struct acpi_table_madt *madt; madt = (struct acpi_table_madt *)table; if (madt->address) { acpi_lapic_addr = (u64) madt->address; printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", madt->address); } acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); return 0; } static int __init acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = container_of(header, struct acpi_madt_local_x2apic, header); bool_t enabled = 0; if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; acpi_table_print_madt_entry(header); /* Record local apic id only when enabled and fitting. */ if (processor->local_apic_id >= MAX_APICS || processor->uid >= MAX_MADT_ENTRIES) { printk("%sAPIC ID %#x and/or ACPI ID %#x beyond limit" " - processor ignored\n", processor->lapic_flags & ACPI_MADT_ENABLED ? KERN_WARNING "WARNING: " : KERN_INFO, processor->local_apic_id, processor->uid); /* * Must not return an error here, to prevent * acpi_table_parse_entries() from terminating early. */ return 0 /* -ENOSPC */; } if (processor->lapic_flags & ACPI_MADT_ENABLED) { x86_acpiid_to_apicid[processor->uid] = processor->local_apic_id; enabled = 1; } /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size * cpus_possible_map more accurately, to permit * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ mp_register_lapic(processor->local_apic_id, enabled, 0); return 0; } static int __init acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_local_apic *processor = container_of(header, struct acpi_madt_local_apic, header); bool_t enabled = 0; if (BAD_MADT_ENTRY(processor, end)) return -EINVAL; acpi_table_print_madt_entry(header); /* Record local apic id only when enabled */ if (processor->lapic_flags & ACPI_MADT_ENABLED) { x86_acpiid_to_apicid[processor->processor_id] = processor->id; enabled = 1; } /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size * cpus_possible_map more accurately, to permit * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. 
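 *
 * Illustrative sketch, not part of the original handlers: every MADT
 * subtable handler in this file follows the same validate-then-record
 * pattern built on the BAD_MADT_ENTRY() check defined near the top of the
 * file.  A hypothetical "foo" subtable handler (acpi_parse_foo and
 * struct acpi_madt_foo are made-up names) would look roughly like:
 *
 *   static int __init acpi_parse_foo(struct acpi_subtable_header *header,
 *                                    const unsigned long end)
 *   {
 *       struct acpi_madt_foo *foo =
 *           container_of(header, struct acpi_madt_foo, header);
 *
 *       if (BAD_MADT_ENTRY(foo, end))
 *           return -EINVAL;
 *       acpi_table_print_madt_entry(header);
 *       ...record the entry in Xen's own tables...
 *       return 0;
 *   }
 *
 * BAD_MADT_ENTRY() rejects NULL, truncated or mis-sized entries, and
 * returning 0 (rather than an error) lets acpi_table_parse_entries()
 * continue walking the remaining subtables.
 *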
*/ mp_register_lapic(processor->id, enabled, 0); return 0; } static int __init acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_local_apic_override *lapic_addr_ovr = container_of(header, struct acpi_madt_local_apic_override, header); if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) return -EINVAL; acpi_lapic_addr = lapic_addr_ovr->address; return 0; } static int __init acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic_nmi *x2apic_nmi = container_of(header, struct acpi_madt_local_x2apic_nmi, header); if (BAD_MADT_ENTRY(x2apic_nmi, end)) return -EINVAL; acpi_table_print_madt_entry(header); if (x2apic_nmi->lint != 1) printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); return 0; } static int __init acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_local_apic_nmi *lapic_nmi = container_of(header, struct acpi_madt_local_apic_nmi, header); if (BAD_MADT_ENTRY(lapic_nmi, end)) return -EINVAL; acpi_table_print_madt_entry(header); if (lapic_nmi->lint != 1) printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); return 0; } #endif /*CONFIG_X86_LOCAL_APIC */ #if defined(CONFIG_X86_IO_APIC) /*&& defined(CONFIG_ACPI_INTERPRETER)*/ static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_io_apic *ioapic = container_of(header, struct acpi_madt_io_apic, header); if (BAD_MADT_ENTRY(ioapic, end)) return -EINVAL; acpi_table_print_madt_entry(header); mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base); return 0; } static int __init acpi_parse_int_src_ovr(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_interrupt_override *intsrc = container_of(header, struct acpi_madt_interrupt_override, header); if (BAD_MADT_ENTRY(intsrc, end)) return -EINVAL; acpi_table_print_madt_entry(header); if (acpi_skip_timer_override && intsrc->source_irq == 0 && intsrc->global_irq == 2) { printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); return 0; } mp_override_legacy_irq(intsrc->source_irq, ACPI_MADT_GET_POLARITY(intsrc->inti_flags), ACPI_MADT_GET_TRIGGER(intsrc->inti_flags), intsrc->global_irq); return 0; } static int __init acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_madt_nmi_source *nmi_src = container_of(header, struct acpi_madt_nmi_source, header); if (BAD_MADT_ENTRY(nmi_src, end)) return -EINVAL; acpi_table_print_madt_entry(header); /* TBD: Support nimsrc entries? */ return 0; } #endif /* CONFIG_X86_IO_APIC */ #ifdef CONFIG_HPET_TIMER static int __init acpi_parse_hpet(struct acpi_table_header *table) { struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table; if (hpet_tbl->address.space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { printk(KERN_WARNING PREFIX "HPET timers must be located in " "memory.\n"); return -1; } /* * Some BIOSes provide multiple HPET tables. Sometimes this is a BIOS * bug; the intended way of supporting more than 1 HPET is to use AML * entries. * * If someone finds a real system with two genuine HPET tables, perhaps * they will be kind and implement support. Until then however, warn * that we will ignore subsequent tables. */ if (hpet_address) { printk(KERN_WARNING PREFIX "Found multiple HPET tables. 
Only using first\n"); return -1; } hpet_address = hpet_tbl->address.address; hpet_blockid = hpet_tbl->sequence; printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); return 0; } #else #define acpi_parse_hpet NULL #endif static int __init acpi_invalidate_bgrt(struct acpi_table_header *table) { struct acpi_table_bgrt *bgrt_tbl = container_of(table, struct acpi_table_bgrt, header); if (table->length < sizeof(*bgrt_tbl)) return -1; if (bgrt_tbl->version == 1 && bgrt_tbl->image_address && !page_is_ram_type(PFN_DOWN(bgrt_tbl->image_address), RAM_TYPE_CONVENTIONAL)) return 0; printk(KERN_INFO PREFIX "BGRT: invalidating v%d image at %#"PRIx64"\n", bgrt_tbl->version, bgrt_tbl->image_address); bgrt_tbl->image_address = 0; bgrt_tbl->status &= ~1; return 0; } #ifdef CONFIG_ACPI_SLEEP #define acpi_fadt_copy_address(dst, src, len) do { \ if (fadt->header.revision >= FADT2_REVISION_ID && \ fadt->header.length >= ACPI_FADT_V2_SIZE) \ acpi_sinfo.dst##_blk = fadt->x##src##_block; \ if (!acpi_sinfo.dst##_blk.address) { \ acpi_sinfo.dst##_blk.address = fadt->src##_block; \ acpi_sinfo.dst##_blk.space_id = ACPI_ADR_SPACE_SYSTEM_IO; \ acpi_sinfo.dst##_blk.bit_width = fadt->len##_length << 3; \ acpi_sinfo.dst##_blk.bit_offset = 0; \ acpi_sinfo.dst##_blk.access_width = fadt->len##_length; \ } \ } while (0) /* Get pm1x_cnt and pm1x_evt information for ACPI sleep */ static void __init acpi_fadt_parse_sleep_info(struct acpi_table_fadt *fadt) { struct acpi_table_facs *facs = NULL; uint64_t facs_pa; if (fadt->header.revision >= 5 && fadt->header.length >= ACPI_FADT_V5_SIZE) { acpi_sinfo.sleep_control = fadt->sleep_control; acpi_sinfo.sleep_status = fadt->sleep_status; printk(KERN_INFO PREFIX "v5 SLEEP INFO: control[%d:%"PRIx64"]," " status[%d:%"PRIx64"]\n", acpi_sinfo.sleep_control.space_id, acpi_sinfo.sleep_control.address, acpi_sinfo.sleep_status.space_id, acpi_sinfo.sleep_status.address); if ((fadt->sleep_control.address && (fadt->sleep_control.bit_offset || fadt->sleep_control.bit_width != fadt->sleep_control.access_width * 8)) || (fadt->sleep_status.address && (fadt->sleep_status.bit_offset || fadt->sleep_status.bit_width != fadt->sleep_status.access_width * 8))) { printk(KERN_WARNING PREFIX "Invalid sleep control/status register data:" " %#x:%#x:%#x %#x:%#x:%#x\n", fadt->sleep_control.bit_offset, fadt->sleep_control.bit_width, fadt->sleep_control.access_width, fadt->sleep_status.bit_offset, fadt->sleep_status.bit_width, fadt->sleep_status.access_width); fadt->sleep_control.address = 0; fadt->sleep_status.address = 0; } } if (fadt->flags & ACPI_FADT_HW_REDUCED) goto bad; acpi_fadt_copy_address(pm1a_cnt, pm1a_control, pm1_control); acpi_fadt_copy_address(pm1b_cnt, pm1b_control, pm1_control); acpi_fadt_copy_address(pm1a_evt, pm1a_event, pm1_event); acpi_fadt_copy_address(pm1b_evt, pm1b_event, pm1_event); printk(KERN_INFO PREFIX "SLEEP INFO: pm1x_cnt[%"PRIx64",%"PRIx64"], " "pm1x_evt[%"PRIx64",%"PRIx64"]\n", acpi_sinfo.pm1a_cnt_blk.address, acpi_sinfo.pm1b_cnt_blk.address, acpi_sinfo.pm1a_evt_blk.address, acpi_sinfo.pm1b_evt_blk.address); /* Now FACS... */ facs_pa = ((fadt->header.revision >= FADT2_REVISION_ID) ? 
fadt->Xfacs : (uint64_t)fadt->facs); if (fadt->facs && ((uint64_t)fadt->facs != facs_pa)) { printk(KERN_WARNING PREFIX "32/64X FACS address mismatch in FADT - " "%08x/%016"PRIx64", using 32\n", fadt->facs, facs_pa); facs_pa = (uint64_t)fadt->facs; } if (!facs_pa) goto bad; facs = (struct acpi_table_facs *) __acpi_map_table(facs_pa, sizeof(struct acpi_table_facs)); if (!facs) goto bad; if (strncmp(facs->signature, "FACS", 4)) { printk(KERN_ERR PREFIX "Invalid FACS signature %.4s\n", facs->signature); goto bad; } if (facs->length < 24) { printk(KERN_ERR PREFIX "Invalid FACS table length: %#x", facs->length); goto bad; } if (facs->length < 64) printk(KERN_WARNING PREFIX "FACS is shorter than ACPI spec allow: %#x", facs->length); acpi_sinfo.wakeup_vector = facs_pa + offsetof(struct acpi_table_facs, firmware_waking_vector); acpi_sinfo.vector_width = 32; printk(KERN_INFO PREFIX " wakeup_vec[%"PRIx64"], vec_size[%x]\n", acpi_sinfo.wakeup_vector, acpi_sinfo.vector_width); return; bad: memset(&acpi_sinfo, 0, offsetof(struct acpi_sleep_info, sleep_control)); memset(&acpi_sinfo.sleep_status + 1, 0, (long)(&acpi_sinfo + 1) - (long)(&acpi_sinfo.sleep_status + 1)); } #endif static int __init acpi_parse_fadt(struct acpi_table_header *table) { struct acpi_table_fadt *fadt = (struct acpi_table_fadt *)table; #ifdef CONFIG_ACPI_INTERPRETER /* initialize sci_int early for INT_SRC_OVR MADT parsing */ acpi_fadt.sci_int = fadt->sci_int; /* initialize rev and apic_phys_dest_mode for x86_64 genapic */ acpi_fadt.revision = fadt->revision; acpi_fadt.force_apic_physical_destination_mode = fadt->force_apic_physical_destination_mode; #endif #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ if (fadt->header.revision >= FADT2_REVISION_ID) { /* FADT rev. 2 */ if (fadt->xpm_timer_block.space_id == ACPI_ADR_SPACE_SYSTEM_IO) pmtmr_ioport = fadt->xpm_timer_block.address; /* * "X" fields are optional extensions to the original V1.0 * fields, so we must selectively expand V1.0 fields if the * corresponding X field is zero. */ if (!pmtmr_ioport) pmtmr_ioport = fadt->pm_timer_block; } else { /* FADT rev. 1 */ pmtmr_ioport = fadt->pm_timer_block; } if (pmtmr_ioport) printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", pmtmr_ioport); #endif acpi_smi_cmd = fadt->smi_command; acpi_enable_value = fadt->acpi_enable; acpi_disable_value = fadt->acpi_disable; #ifdef CONFIG_ACPI_SLEEP acpi_fadt_parse_sleep_info(fadt); #endif return 0; } #ifdef CONFIG_X86_LOCAL_APIC /* * Parse LAPIC entries in MADT * returns 0 on success, < 0 on error */ static int __init acpi_parse_madt_lapic_entries(void) { int count, x2count; if (!cpu_has_apic) return -ENODEV; /* * Note that the LAPIC address is obtained from the MADT (32-bit value) * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). 
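 *
 * For reference: the same "prefer the optional wider field, fall back to
 * the legacy one" rule used here for the LAPIC address is applied to the
 * FADT register blocks by the acpi_fadt_copy_address() macro earlier in
 * this file.  Its pm1a_cnt instantiation,
 * acpi_fadt_copy_address(pm1a_cnt, pm1a_control, pm1_control),
 * token-pastes to roughly:
 *
 *   if (fadt->header.revision >= FADT2_REVISION_ID &&
 *       fadt->header.length >= ACPI_FADT_V2_SIZE)
 *       acpi_sinfo.pm1a_cnt_blk = fadt->xpm1a_control_block;
 *   if (!acpi_sinfo.pm1a_cnt_blk.address) {
 *       acpi_sinfo.pm1a_cnt_blk.address      = fadt->pm1a_control_block;
 *       acpi_sinfo.pm1a_cnt_blk.space_id     = ACPI_ADR_SPACE_SYSTEM_IO;
 *       acpi_sinfo.pm1a_cnt_blk.bit_width    = fadt->pm1_control_length << 3;
 *       acpi_sinfo.pm1a_cnt_blk.bit_offset   = 0;
 *       acpi_sinfo.pm1a_cnt_blk.access_width = fadt->pm1_control_length;
 *   }
 *
 * i.e. the legacy 32-bit I/O-port block is used only when the extended
 * "X" block is absent or zero.
 *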
*/ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, acpi_parse_lapic_addr_ovr, 0); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n"); return count; } mp_register_lapic_address(acpi_lapic_addr); BUILD_BUG_ON(MAX_APICS != MAX_LOCAL_APIC); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_parse_lapic, MAX_APICS); x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, acpi_parse_x2apic, MAX_APICS); if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); /* TBD: Cleanup to allow fallback to MPS */ return -ENODEV; } else if (count < 0 || x2count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count < 0 ? count : x2count; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0); x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI, acpi_parse_x2apic_nmi, 0); if (count < 0 || x2count < 0) { printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count < 0 ? count : x2count; } return 0; } #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC /* * Parse IOAPIC related entries in MADT * returns 0 on success, < 0 on error */ static int __init acpi_parse_madt_ioapic_entries(void) { int count; /* * ACPI interpreter is required to complete interrupt setup, * so if it is off, don't enumerate the io-apics with ACPI. * If MPS is present, it will handle them, * otherwise the system will stay in PIC mode */ if (acpi_disabled || acpi_noirq) { return -ENODEV; } if (!cpu_has_apic) return -ENODEV; /* * if "noapic" boot option, don't look for IO-APICs */ if (skip_ioapic_setup) { printk(KERN_INFO PREFIX "Skipping IOAPIC probe " "due to 'noapic' option.\n"); return -ENODEV; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, MAX_IO_APICS); if (!count) { printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); return -ENODEV; } else if (count < 0) { printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); return count; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, MAX_IRQ_SOURCES); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } /* Fill in identity legacy mapings where no override */ mp_config_acpi_legacy_irqs(); count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, MAX_IRQ_SOURCES); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } return 0; } #else static inline int acpi_parse_madt_ioapic_entries(void) { return -1; } #endif /* !CONFIG_X86_IO_APIC */ static void __init acpi_process_madt(void) { #ifdef CONFIG_X86_LOCAL_APIC int error; if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { /* * Parse MADT LAPIC entries */ error = acpi_parse_madt_lapic_entries(); if (!error) { acpi_lapic = 1; generic_bigsmp_probe(); /* * Parse MADT IO-APIC entries */ error = acpi_parse_madt_ioapic_entries(); if (!error) { acpi_ioapic = 1; smp_found_config = 1; clustered_apic_check(); } } if (error == -EINVAL) { /* * Dell Precision Workstation 410, 610 come here. */ printk(KERN_ERR PREFIX "Invalid BIOS MADT, disabling ACPI\n"); disable_acpi(); } } #endif return; } /* * acpi_boot_table_init() and acpi_boot_init() * called from setup_arch(), always. * 1. checksums all tables * 2. enumerates lapics * 3. 
enumerates io-apics * * acpi_table_init() is separate to allow reading SRAT without * other side effects. * * side effects of acpi_boot_init: * acpi_lapic = 1 if LAPIC found * acpi_ioapic = 1 if IOAPIC found * if (acpi_lapic && acpi_ioapic) smp_found_config = 1; * ... * * return value: (currently ignored) * 0: success * !0: failure */ int __init acpi_boot_table_init(void) { int error; /* * If acpi_disabled, bail out * One exception: acpi=ht continues far enough to enumerate LAPICs */ if (acpi_disabled && !acpi_ht) return 1; /* * Initialize the ACPI boot-time table parser. */ error = acpi_table_init(); if (error) { disable_acpi(); return error; } return 0; } int __init acpi_boot_init(void) { /* * If acpi_disabled, bail out * One exception: acpi=ht continues far enough to enumerate LAPICs */ if (acpi_disabled && !acpi_ht) return 1; /* * set sci_int and PM timer address */ acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt); /* * Process the Multiple APIC Description Table (MADT), if present */ acpi_process_madt(); acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); acpi_dmar_init(); erst_init(); acpi_table_parse(ACPI_SIG_BGRT, acpi_invalidate_bgrt); return 0; } xen-4.4.0/xen/arch/x86/acpi/power.c0000664000175000017500000002543412307313555015023 0ustar smbsmb/* drivers/acpi/sleep/power.c - PM core functionality for Xen * * Copyrights from Linux side: * Copyright (c) 2000-2003 Patrick Mochel * Copyright (C) 2001-2003 Pavel Machek * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2004 David Shaohua Li * Copyright (c) 2005 Alexey Starikovskiy * * Slimmed with Xen specific support. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uint32_t system_reset_counter = 1; static char __initdata opt_acpi_sleep[20]; string_param("acpi_sleep", opt_acpi_sleep); static u8 sleep_states[ACPI_S_STATE_COUNT]; static DEFINE_SPINLOCK(pm_lock); struct acpi_sleep_info acpi_sinfo; void do_suspend_lowlevel(void); static int device_power_down(void) { console_suspend(); time_suspend(); i8259A_suspend(); ioapic_suspend(); iommu_suspend(); lapic_suspend(); return 0; } static void device_power_up(void) { lapic_resume(); iommu_resume(); ioapic_resume(); i8259A_resume(); time_resume(); console_resume(); } static void freeze_domains(void) { struct domain *d; rcu_read_lock(&domlist_read_lock); /* * Note that we iterate in order of domain-id. Hence we will pause dom0 * first which is required for correctness (as only dom0 can add domains to * the domain list). Otherwise we could miss concurrently-created domains. */ for_each_domain ( d ) domain_pause(d); rcu_read_unlock(&domlist_read_lock); } static void thaw_domains(void) { struct domain *d; rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) { restore_vcpu_affinity(d); domain_unpause(d); } rcu_read_unlock(&domlist_read_lock); } static void acpi_sleep_prepare(u32 state) { void *wakeup_vector_va; if ( state != ACPI_STATE_S3 ) return; wakeup_vector_va = __acpi_map_table( acpi_sinfo.wakeup_vector, sizeof(uint64_t)); /* TBoot will set resume vector itself (when it is safe to do so). 
*/ if ( tboot_in_measured_env() ) return; if ( acpi_sinfo.vector_width == 32 ) *(uint32_t *)wakeup_vector_va = bootsym_phys(wakeup_start); else *(uint64_t *)wakeup_vector_va = bootsym_phys(wakeup_start); } static void acpi_sleep_post(u32 state) {} /* Main interface to do xen specific suspend/resume */ static int enter_state(u32 state) { unsigned long flags; int error; unsigned long cr4; if ( (state <= ACPI_STATE_S0) || (state > ACPI_S_STATES_MAX) ) return -EINVAL; if ( !spin_trylock(&pm_lock) ) return -EBUSY; BUG_ON(system_state != SYS_STATE_active); system_state = SYS_STATE_suspend; printk(XENLOG_INFO "Preparing system for ACPI S%d state.\n", state); freeze_domains(); acpi_dmar_reinstate(); if ( (error = disable_nonboot_cpus()) ) { system_state = SYS_STATE_resume; goto enable_cpu; } cpufreq_del_cpu(0); hvm_cpu_down(); acpi_sleep_prepare(state); console_start_sync(); printk("Entering ACPI S%d state.\n", state); local_irq_save(flags); spin_debug_disable(); if ( (error = device_power_down()) ) { printk(XENLOG_ERR "Some devices failed to power down."); system_state = SYS_STATE_resume; goto done; } ACPI_FLUSH_CPU_CACHE(); switch ( state ) { case ACPI_STATE_S3: do_suspend_lowlevel(); system_reset_counter++; error = tboot_s3_resume(); break; case ACPI_STATE_S5: acpi_enter_sleep_state(ACPI_STATE_S5); break; default: error = -EINVAL; break; } system_state = SYS_STATE_resume; /* Restore CR4 and EFER from cached values. */ cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_MCE); if ( cpu_has_efer ) write_efer(read_efer()); device_power_up(); mcheck_init(&boot_cpu_data, 0); write_cr4(cr4); printk(XENLOG_INFO "Finishing wakeup from ACPI S%d state.\n", state); if ( (state == ACPI_STATE_S3) && error ) tboot_s3_error(error); done: spin_debug_enable(); local_irq_restore(flags); console_end_sync(); acpi_sleep_post(state); if ( hvm_cpu_up() ) BUG(); enable_cpu: cpufreq_add_cpu(0); microcode_resume_cpu(0); rcu_barrier(); mtrr_aps_sync_begin(); enable_nonboot_cpus(); mtrr_aps_sync_end(); adjust_vtd_irq_affinities(); acpi_dmar_zap(); thaw_domains(); system_state = SYS_STATE_active; spin_unlock(&pm_lock); return error; } static long enter_state_helper(void *data) { struct acpi_sleep_info *sinfo = (struct acpi_sleep_info *)data; return enter_state(sinfo->sleep_state); } /* * Dom0 issues this hypercall in place of writing pm1a_cnt. Xen then * takes over the control and put the system into sleep state really. 
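 *
 * Illustrative sketch only (not taken from any particular dom0
 * implementation): for a plain S3 request over the legacy PM1x path, the
 * argument structure might be filled in as below.  pm1a_cnt_val and
 * pm1b_cnt_val are placeholder names for the SLP_TYPx|SLP_EN values dom0
 * computed from the DSDT's \_S3 object and would otherwise have written
 * to PM1a_CNT/PM1b_CNT itself:
 *
 *   struct xenpf_enter_acpi_sleep sleep = {
 *       .val_a       = pm1a_cnt_val,
 *       .val_b       = pm1b_cnt_val,
 *       .sleep_state = 3,
 *       .flags       = 0,
 *   };
 *
 * val_b is 0 when there is no PM1b block, sleep_state 3 selects ACPI S3
 * (suspend to RAM), and a zero flags field selects this legacy path
 * rather than XENPF_ACPI_SLEEP_EXTENDED.
 *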
*/ int acpi_enter_sleep(struct xenpf_enter_acpi_sleep *sleep) { if ( sleep->flags & XENPF_ACPI_SLEEP_EXTENDED ) { if ( !acpi_sinfo.sleep_control.address || !acpi_sinfo.sleep_status.address ) return -EPERM; if ( sleep->flags & ~XENPF_ACPI_SLEEP_EXTENDED ) return -EINVAL; if ( sleep->val_a > ACPI_SLEEP_TYPE_MAX || (sleep->val_b != ACPI_SLEEP_TYPE_INVALID && sleep->val_b > ACPI_SLEEP_TYPE_MAX) ) return -ERANGE; acpi_sinfo.sleep_type_a = sleep->val_a; acpi_sinfo.sleep_type_b = sleep->val_b; acpi_sinfo.sleep_extended = 1; } else if ( !acpi_sinfo.pm1a_cnt_blk.address ) return -EPERM; /* Sanity check */ else if ( sleep->val_b && ((sleep->val_a ^ sleep->val_b) & ACPI_BITMASK_SLEEP_ENABLE) ) { gdprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting."); return -EINVAL; } else if ( sleep->flags ) return -EINVAL; else { acpi_sinfo.pm1a_cnt_val = sleep->val_a; acpi_sinfo.pm1b_cnt_val = sleep->val_b; acpi_sinfo.sleep_extended = 0; } acpi_sinfo.sleep_state = sleep->sleep_state; return continue_hypercall_on_cpu(0, enter_state_helper, &acpi_sinfo); } static int acpi_get_wake_status(void) { uint32_t val; acpi_status status; if ( acpi_sinfo.sleep_extended ) { status = acpi_hw_register_read(ACPI_REGISTER_SLEEP_STATUS, &val); return ACPI_FAILURE(status) ? 0 : val & ACPI_X_WAKE_STATUS; } /* Wake status is the 15th bit of PM1 status register. (ACPI spec 3.0) */ status = acpi_hw_register_read(ACPI_REGISTER_PM1_STATUS, &val); if ( ACPI_FAILURE(status) ) return 0; val &= ACPI_BITMASK_WAKE_STATUS; val >>= ACPI_BITPOSITION_WAKE_STATUS; return val; } static void tboot_sleep(u8 sleep_state) { uint32_t shutdown_type; #define TB_COPY_GAS(tbg, g) \ tbg.space_id = g.space_id; \ tbg.bit_width = g.bit_width; \ tbg.bit_offset = g.bit_offset; \ tbg.access_width = g.access_width; \ tbg.address = g.address; /* sizes are not same (due to packing) so copy each one */ TB_COPY_GAS(g_tboot_shared->acpi_sinfo.pm1a_cnt_blk, acpi_sinfo.pm1a_cnt_blk); TB_COPY_GAS(g_tboot_shared->acpi_sinfo.pm1b_cnt_blk, acpi_sinfo.pm1b_cnt_blk); TB_COPY_GAS(g_tboot_shared->acpi_sinfo.pm1a_evt_blk, acpi_sinfo.pm1a_evt_blk); TB_COPY_GAS(g_tboot_shared->acpi_sinfo.pm1b_evt_blk, acpi_sinfo.pm1b_evt_blk); g_tboot_shared->acpi_sinfo.pm1a_cnt_val = acpi_sinfo.pm1a_cnt_val; g_tboot_shared->acpi_sinfo.pm1b_cnt_val = acpi_sinfo.pm1b_cnt_val; g_tboot_shared->acpi_sinfo.wakeup_vector = acpi_sinfo.wakeup_vector; g_tboot_shared->acpi_sinfo.vector_width = acpi_sinfo.vector_width; g_tboot_shared->acpi_sinfo.kernel_s3_resume_vector = bootsym_phys(wakeup_start); switch ( sleep_state ) { case ACPI_STATE_S3: shutdown_type = TB_SHUTDOWN_S3; break; case ACPI_STATE_S4: shutdown_type = TB_SHUTDOWN_S4; break; case ACPI_STATE_S5: shutdown_type = TB_SHUTDOWN_S5; break; default: return; } tboot_shutdown(shutdown_type); } /* System is really put into sleep state by this stub */ acpi_status acpi_enter_sleep_state(u8 sleep_state) { acpi_status status; if ( tboot_in_measured_env() ) { tboot_sleep(sleep_state); printk(XENLOG_ERR "TBOOT failed entering s3 state\n"); return_ACPI_STATUS(AE_ERROR); } ACPI_FLUSH_CPU_CACHE(); if ( acpi_sinfo.sleep_extended ) { /* * Set the SLP_TYP and SLP_EN bits. * * Note: We only use the first value returned by the \_Sx method * (acpi_sinfo.sleep_type_a) - As per ACPI specification. 
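 *
 * Worked example, assuming the usual ACPICA values
 * ACPI_X_SLEEP_TYPE_POSITION = 2, ACPI_X_SLEEP_TYPE_MASK = 0x1c and
 * ACPI_X_SLEEP_ENABLE = 0x20 (SLP_TYP in bits 2-4, SLP_EN in bit 5 of the
 * hardware-reduced sleep control register): a firmware-reported
 * sleep_type_a of 5 yields
 *
 *   (5 << 2) & 0x1c = 0x14
 *   0x14 | 0x20     = 0x34
 *
 * and 0x34 is the value written to SLEEP_CONTROL just below.
 *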
*/ u8 sleep_type_value = ((acpi_sinfo.sleep_type_a << ACPI_X_SLEEP_TYPE_POSITION) & ACPI_X_SLEEP_TYPE_MASK) | ACPI_X_SLEEP_ENABLE; status = acpi_hw_register_write(ACPI_REGISTER_SLEEP_CONTROL, sleep_type_value); } else { status = acpi_hw_register_write(ACPI_REGISTER_PM1A_CONTROL, acpi_sinfo.pm1a_cnt_val); if ( !ACPI_FAILURE(status) && acpi_sinfo.pm1b_cnt_blk.address ) status = acpi_hw_register_write(ACPI_REGISTER_PM1B_CONTROL, acpi_sinfo.pm1b_cnt_val); } if ( ACPI_FAILURE(status) ) return_ACPI_STATUS(AE_ERROR); /* Wait until we enter sleep state, and spin until we wake */ while ( !acpi_get_wake_status() ) continue; return_ACPI_STATUS(AE_OK); } static int __init acpi_sleep_init(void) { int i; char *p = opt_acpi_sleep; while ( (p != NULL) && (*p != '\0') ) { if ( !strncmp(p, "s3_bios", 7) ) acpi_video_flags |= 1; if ( !strncmp(p, "s3_mode", 7) ) acpi_video_flags |= 2; p = strchr(p, ','); if ( p != NULL ) p += strspn(p, ", \t"); } printk(XENLOG_INFO "ACPI sleep modes:"); for ( i = 0; i < ACPI_S_STATE_COUNT; i++ ) { if ( i == ACPI_STATE_S3 ) { sleep_states[i] = 1; printk(" S%d", i); } else sleep_states[i] = 0; } printk("\n"); return 0; } __initcall(acpi_sleep_init); xen-4.4.0/xen/arch/x86/acpi/cpu_idle.c0000664000175000017500000010512112307313555015443 0ustar smbsmb/* * cpu_idle - xen idle state module derived from Linux * drivers/acpi/processor_idle.c & * arch/x86/kernel/acpi/cstate.c * * Copyright (C) 2001, 2002 Andy Grover * Copyright (C) 2001, 2002 Paul Diefenbaugh * Copyright (C) 2004, 2005 Dominik Brodowski * Copyright (C) 2004 Anil S Keshavamurthy * - Added processor hotplug support * Copyright (C) 2005 Venkatesh Pallipadi * - Added support for C3 on SMP * Copyright (C) 2007, 2008 Intel Corporation * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
* * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*#define DEBUG_PM_CX*/ #define GET_HW_RES_IN_NS(msr, val) \ do { rdmsrl(msr, val); val = tsc_ticks2ns(val); } while( 0 ) #define GET_PC2_RES(val) GET_HW_RES_IN_NS(0x60D, val) /* SNB only */ #define GET_PC3_RES(val) GET_HW_RES_IN_NS(0x3F8, val) #define GET_PC6_RES(val) GET_HW_RES_IN_NS(0x3F9, val) #define GET_PC7_RES(val) GET_HW_RES_IN_NS(0x3FA, val) #define GET_CC3_RES(val) GET_HW_RES_IN_NS(0x3FC, val) #define GET_CC6_RES(val) GET_HW_RES_IN_NS(0x3FD, val) #define GET_CC7_RES(val) GET_HW_RES_IN_NS(0x3FE, val) /* SNB only */ static void lapic_timer_nop(void) { } void (*__read_mostly lapic_timer_off)(void); void (*__read_mostly lapic_timer_on)(void); bool_t lapic_timer_init(void) { if ( boot_cpu_has(X86_FEATURE_ARAT) ) { lapic_timer_off = lapic_timer_nop; lapic_timer_on = lapic_timer_nop; } else if ( hpet_broadcast_is_available() ) { lapic_timer_off = hpet_broadcast_enter; lapic_timer_on = hpet_broadcast_exit; } else if ( pit_broadcast_is_available() ) { lapic_timer_off = pit_broadcast_enter; lapic_timer_on = pit_broadcast_exit; } else return 0; return 1; } static uint64_t (*__read_mostly tick_to_ns)(uint64_t) = acpi_pm_tick_to_ns; void (*__read_mostly pm_idle_save)(void); unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER - 1; integer_param("max_cstate", max_cstate); static bool_t __read_mostly local_apic_timer_c2_ok; boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok); struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS]; struct hw_residencies { uint64_t pc2; uint64_t pc3; uint64_t pc6; uint64_t pc7; uint64_t cc3; uint64_t cc6; uint64_t cc7; }; static void do_get_hw_residencies(void *arg) { struct cpuinfo_x86 *c = ¤t_cpu_data; struct hw_residencies *hw_res = arg; if ( c->x86_vendor != X86_VENDOR_INTEL || c->x86 != 6 ) return; switch ( c->x86_model ) { /* Sandy bridge */ case 0x2A: case 0x2D: /* Ivy bridge */ case 0x3A: case 0x3E: /* Haswell */ case 0x3C: case 0x3F: case 0x45: case 0x46: GET_PC2_RES(hw_res->pc2); GET_CC7_RES(hw_res->cc7); /* fall through */ /* Nehalem */ case 0x1A: case 0x1E: case 0x1F: case 0x2E: /* Westmere */ case 0x25: case 0x2C: case 0x2F: GET_PC3_RES(hw_res->pc3); GET_PC6_RES(hw_res->pc6); GET_PC7_RES(hw_res->pc7); GET_CC3_RES(hw_res->cc3); GET_CC6_RES(hw_res->cc6); break; } } static void get_hw_residencies(uint32_t cpu, struct hw_residencies *hw_res) { memset(hw_res, 0, sizeof(*hw_res)); if ( smp_processor_id() == cpu ) do_get_hw_residencies(hw_res); else on_selected_cpus(cpumask_of(cpu), do_get_hw_residencies, hw_res, 1); } static void print_hw_residencies(uint32_t cpu) { struct hw_residencies hw_res; get_hw_residencies(cpu, &hw_res); printk("PC2[%"PRId64"] PC3[%"PRId64"] PC6[%"PRId64"] PC7[%"PRId64"]\n", hw_res.pc2, hw_res.pc3, hw_res.pc6, hw_res.pc7); printk("CC3[%"PRId64"] CC6[%"PRId64"] CC7[%"PRId64"]\n", hw_res.cc3, hw_res.cc6,hw_res.cc7); } static char* acpi_cstate_method_name[] = { "NONE", "SYSIO", "FFH", "HALT" }; static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power) { uint32_t i, idle_usage = 0; uint64_t res, idle_res = 0; u32 usage; u8 last_state_idx; printk("==cpu%d==\n", cpu); last_state_idx = power->last_state ? 
power->last_state->idx : -1; printk("active state:\t\tC%d\n", last_state_idx); printk("max_cstate:\t\tC%d\n", max_cstate); printk("states:\n"); for ( i = 1; i < power->count; i++ ) { spin_lock_irq(&power->stat_lock); res = tick_to_ns(power->states[i].time); usage = power->states[i].usage; spin_unlock_irq(&power->stat_lock); idle_usage += usage; idle_res += res; printk((last_state_idx == i) ? " *" : " "); printk("C%d:\t", i); printk("type[C%d] ", power->states[i].type); printk("latency[%03d] ", power->states[i].latency); printk("usage[%08d] ", usage); printk("method[%5s] ", acpi_cstate_method_name[power->states[i].entry_method]); printk("duration[%"PRId64"]\n", res); } printk((last_state_idx == 0) ? " *" : " "); printk("C0:\tusage[%08d] duration[%"PRId64"]\n", idle_usage, NOW() - idle_res); print_hw_residencies(cpu); } static void dump_cx(unsigned char key) { unsigned int cpu; printk("'%c' pressed -> printing ACPI Cx structures\n", key); for_each_online_cpu ( cpu ) if (processor_powers[cpu]) print_acpi_power(cpu, processor_powers[cpu]); } static struct keyhandler dump_cx_keyhandler = { .diagnostic = 1, .u.fn = dump_cx, .desc = "dump ACPI Cx structures" }; static int __init cpu_idle_key_init(void) { register_keyhandler('c', &dump_cx_keyhandler); return 0; } __initcall(cpu_idle_key_init); static uint64_t get_stime_tick(void) { return (uint64_t)NOW(); } static uint64_t stime_ticks_elapsed(uint64_t t1, uint64_t t2) { return t2 - t1; } static uint64_t stime_tick_to_ns(uint64_t ticks) { return ticks; } static uint64_t get_acpi_pm_tick(void) { return (uint64_t)inl(pmtmr_ioport); } static uint64_t acpi_pm_ticks_elapsed(uint64_t t1, uint64_t t2) { if ( t2 >= t1 ) return (t2 - t1); else if ( !(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER) ) return (((0x00FFFFFF - t1) + t2 + 1) & 0x00FFFFFF); else return ((0xFFFFFFFF - t1) + t2 +1); } uint64_t (*__read_mostly cpuidle_get_tick)(void) = get_acpi_pm_tick; static uint64_t (*__read_mostly ticks_elapsed)(uint64_t, uint64_t) = acpi_pm_ticks_elapsed; /* * The bit is set iff cpu use monitor/mwait to enter C state * with this flag set, CPU can be waken up from C state * by writing to specific memory address, instead of sending an IPI. */ static cpumask_t cpuidle_mwait_flags; void cpuidle_wakeup_mwait(cpumask_t *mask) { cpumask_t target; unsigned int cpu; cpumask_and(&target, mask, &cpuidle_mwait_flags); /* CPU is MWAITing on the cpuidle_mwait_wakeup flag. */ for_each_cpu(cpu, &target) mwait_wakeup(cpu) = 0; cpumask_andnot(mask, mask, &target); } void mwait_idle_with_hints(unsigned int eax, unsigned int ecx) { unsigned int cpu = smp_processor_id(); s_time_t expires = per_cpu(timer_deadline, cpu); __monitor((void *)&mwait_wakeup(cpu), 0, 0); smp_mb(); /* * Timer deadline passing is the event on which we will be woken via * cpuidle_mwait_wakeup. So check it now that the location is armed. 
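 *
 * Illustrative note on why this ordering matters ("flag" stands for
 * &mwait_wakeup(cpu), "deadline" for this CPU's timer_deadline).
 * Checking the deadline only before arming the monitor would be racy:
 *
 *   racy:  if (NOW() < deadline) { __monitor(flag, 0, 0); __mwait(eax, ecx); }
 *
 * because a wakeup write to the monitored flag between the check and
 * __monitor() is never seen by mwait.  The code below instead arms the
 * monitor first, issues a barrier, then re-checks the deadline, so any
 * write to the flag from that point on either aborts the mwait early or
 * prevents entering it at all.
 *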
*/ if ( expires > NOW() || expires == 0 ) { cpumask_set_cpu(cpu, &cpuidle_mwait_flags); __mwait(eax, ecx); cpumask_clear_cpu(cpu, &cpuidle_mwait_flags); } if ( expires <= NOW() && expires > 0 ) raise_softirq(TIMER_SOFTIRQ); } static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK); } static void acpi_idle_do_entry(struct acpi_processor_cx *cx) { switch ( cx->entry_method ) { case ACPI_CSTATE_EM_FFH: /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); return; case ACPI_CSTATE_EM_SYSIO: /* IO port based C-state */ inb(cx->address); /* Dummy wait op - must do something useless after P_LVL2 read because chipsets cannot guarantee that STPCLK# signal gets asserted in time to freeze execution properly. */ inl(pmtmr_ioport); return; case ACPI_CSTATE_EM_HALT: safe_halt(); local_irq_disable(); return; } } static int acpi_idle_bm_check(void) { u32 bm_status = 0; acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); if ( bm_status ) acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); /* * TBD: PIIX4 Erratum #18: Note that BM_STS doesn't always reflect * the true state of bus mastering activity; forcing us to * manually check the BMIDEA bit of each IDE channel. */ return bm_status; } static struct { spinlock_t lock; unsigned int count; } c3_cpu_status = { .lock = SPIN_LOCK_UNLOCKED }; void trace_exit_reason(u32 *irq_traced) { if ( unlikely(tb_init_done) ) { int i, curbit; u32 irr_status[8] = { 0 }; /* Get local apic IRR register */ for ( i = 0; i < 8; i++ ) irr_status[i] = apic_read(APIC_IRR + (i << 4)); i = 0; curbit = find_first_bit((const unsigned long *)irr_status, 256); while ( i < 4 && curbit < 256 ) { irq_traced[i++] = curbit; curbit = find_next_bit((const unsigned long *)irr_status, 256, curbit + 1); } } } /* * "AAJ72. EOI Transaction May Not be Sent if Software Enters Core C6 During * an Interrupt Service Routine" * * There was an errata with some Core i7 processors that an EOI transaction * may not be sent if software enters core C6 during an interrupt service * routine. So we don't enter deep Cx state if there is an EOI pending. 
*/ bool_t errata_c6_eoi_workaround(void) { static bool_t fix_needed = -1; if ( unlikely(fix_needed == -1) ) { int model = boot_cpu_data.x86_model; fix_needed = (cpu_has_apic && !directed_eoi_enabled && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6) && ((model == 0x1a) || (model == 0x1e) || (model == 0x1f) || (model == 0x25) || (model == 0x2c) || (model == 0x2f))); } return (fix_needed && cpu_has_pending_apic_eoi()); } void update_idle_stats(struct acpi_processor_power *power, struct acpi_processor_cx *cx, uint64_t before, uint64_t after) { int64_t sleep_ticks = ticks_elapsed(before, after); /* Interrupts are disabled */ spin_lock(&power->stat_lock); cx->usage++; if ( sleep_ticks > 0 ) { power->last_residency = tick_to_ns(sleep_ticks) / 1000UL; cx->time += sleep_ticks; } spin_unlock(&power->stat_lock); } static void acpi_processor_idle(void) { struct acpi_processor_power *power = processor_powers[smp_processor_id()]; struct acpi_processor_cx *cx = NULL; int next_state; uint64_t t1, t2 = 0; u32 exp = 0, pred = 0; u32 irq_traced[4] = { 0 }; if ( max_cstate > 0 && power && !sched_has_urgent_vcpu() && (next_state = cpuidle_current_governor->select(power)) > 0 ) { cx = &power->states[next_state]; if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check && acpi_idle_bm_check() ) cx = power->safe_state; if ( cx->idx > max_cstate ) cx = &power->states[max_cstate]; menu_get_trace_data(&exp, &pred); } if ( !cx ) { if ( pm_idle_save ) pm_idle_save(); else safe_halt(); return; } cpufreq_dbs_timer_suspend(); sched_tick_suspend(); /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */ process_pending_softirqs(); /* * Interrupts must be disabled during bus mastering calculations and * for C2/C3 transitions. */ local_irq_disable(); if ( !cpu_is_haltable(smp_processor_id()) ) { local_irq_enable(); sched_tick_resume(); cpufreq_dbs_timer_resume(); return; } if ( (cx->type == ACPI_STATE_C3) && errata_c6_eoi_workaround() ) cx = power->safe_state; power->last_state = cx; /* * Sleep: * ------ * Invoke the current Cx state to put the processor to sleep. */ switch ( cx->type ) { case ACPI_STATE_C1: case ACPI_STATE_C2: if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok ) { /* Get start time (ticks) */ t1 = cpuidle_get_tick(); /* Trace cpu idle entry */ TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred); /* Invoke C2 */ acpi_idle_do_entry(cx); /* Get end time (ticks) */ t2 = cpuidle_get_tick(); trace_exit_reason(irq_traced); /* Trace cpu idle exit */ TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2, irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]); /* Update statistics */ update_idle_stats(power, cx, t1, t2); /* Re-enable interrupts */ local_irq_enable(); break; } case ACPI_STATE_C3: /* * Before invoking C3, be aware that TSC/APIC timer may be * stopped by H/W. Without carefully handling of TSC/APIC stop issues, * deep C state can't work correctly. */ /* preparing APIC stop */ lapic_timer_off(); /* Get start time (ticks) */ t1 = cpuidle_get_tick(); /* Trace cpu idle entry */ TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred); /* * disable bus master * bm_check implies we need ARB_DIS * !bm_check implies we need cache flush * bm_control implies whether we can do ARB_DIS * * That leaves a case where bm_check is set and bm_control is * not set. In that case we cannot do much, we enter C3 * without doing anything. 
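 *
 * The cases handled below, summarised:
 *
 *   bm_check  bm_control   action around the C3 entry
 *   --------  ----------   ---------------------------------------------
 *      0          -        flush caches (ACPI_FLUSH_CPU_CACHE) beforehand
 *      1          1        last CPU in sets ARB_DIS, first CPU out clears it
 *      1          0        enter C3 with no extra work
 *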
*/ if ( cx->type != ACPI_STATE_C3 ) /* nothing to be done here */; else if ( power->flags.bm_check && power->flags.bm_control ) { spin_lock(&c3_cpu_status.lock); if ( ++c3_cpu_status.count == num_online_cpus() ) { /* * All CPUs are trying to go to C3 * Disable bus master arbitration */ acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1); } spin_unlock(&c3_cpu_status.lock); } else if ( !power->flags.bm_check ) { /* SMP with no shared cache... Invalidate cache */ ACPI_FLUSH_CPU_CACHE(); } /* Invoke C3 */ acpi_idle_do_entry(cx); if ( (cx->type == ACPI_STATE_C3) && power->flags.bm_check && power->flags.bm_control ) { /* Enable bus master arbitration */ spin_lock(&c3_cpu_status.lock); if ( c3_cpu_status.count-- == num_online_cpus() ) acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0); spin_unlock(&c3_cpu_status.lock); } /* Get end time (ticks) */ t2 = cpuidle_get_tick(); /* recovering TSC */ cstate_restore_tsc(); trace_exit_reason(irq_traced); /* Trace cpu idle exit */ TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2, irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]); /* Update statistics */ update_idle_stats(power, cx, t1, t2); /* Re-enable interrupts */ local_irq_enable(); /* recovering APIC */ lapic_timer_on(); break; default: /* Now in C0 */ power->last_state = &power->states[0]; local_irq_enable(); sched_tick_resume(); cpufreq_dbs_timer_resume(); return; } /* Now in C0 */ power->last_state = &power->states[0]; sched_tick_resume(); cpufreq_dbs_timer_resume(); if ( cpuidle_current_governor->reflect ) cpuidle_current_governor->reflect(power); } void acpi_dead_idle(void) { struct acpi_processor_power *power; struct acpi_processor_cx *cx; if ( (power = processor_powers[smp_processor_id()]) == NULL ) goto default_halt; if ( (cx = &power->states[power->count-1]) == NULL ) goto default_halt; if ( cx->entry_method == ACPI_CSTATE_EM_FFH ) { void *mwait_ptr = &mwait_wakeup(smp_processor_id()); /* * Cache must be flushed as the last operation before sleeping. * Otherwise, CPU may still hold dirty data, breaking cache coherency, * leading to strange errors. */ wbinvd(); while ( 1 ) { /* * 1. The CLFLUSH is a workaround for erratum AAI65 for * the Xeon 7400 series. * 2. The WBINVD is insufficient due to the spurious-wakeup * case where we return around the loop. * 3. Unlike wbinvd, clflush is a light weight but not serializing * instruction, hence memory fence is necessary to make sure all * load/store visible before flush cache line. 
*/ mb(); clflush(mwait_ptr); __monitor(mwait_ptr, 0, 0); mb(); __mwait(cx->address, 0); } } else if ( current_cpu_data.x86_vendor == X86_VENDOR_AMD && cx->entry_method == ACPI_CSTATE_EM_SYSIO ) { /* Intel prefers not to use SYSIO */ /* Avoid references to shared data after the cache flush */ u32 address = cx->address; u32 pmtmr_ioport_local = pmtmr_ioport; wbinvd(); while ( 1 ) { inb(address); inl(pmtmr_ioport_local); } } default_halt: default_dead_idle(); } int cpuidle_init_cpu(unsigned int cpu) { struct acpi_processor_power *acpi_power; acpi_power = processor_powers[cpu]; if ( !acpi_power ) { unsigned int i; if ( cpu == 0 && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ) { cpuidle_get_tick = get_stime_tick; ticks_elapsed = stime_ticks_elapsed; tick_to_ns = stime_tick_to_ns; } acpi_power = xzalloc(struct acpi_processor_power); if ( !acpi_power ) return -ENOMEM; for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ ) acpi_power->states[i].idx = i; acpi_power->cpu = cpu; processor_powers[cpu] = acpi_power; } acpi_power->count = 2; acpi_power->states[1].type = ACPI_STATE_C1; acpi_power->states[1].entry_method = ACPI_CSTATE_EM_HALT; acpi_power->safe_state = &acpi_power->states[1]; spin_lock_init(&acpi_power->stat_lock); return 0; } static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx) { struct cpuinfo_x86 *c = ¤t_cpu_data; unsigned int eax, ebx, ecx, edx; unsigned int edx_part; unsigned int cstate_type; /* C-state type and not ACPI C-state type */ unsigned int num_cstate_subtype; int ret = 0; static unsigned long printed; if ( c->cpuid_level < CPUID_MWAIT_LEAF ) { printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n"); return -EFAULT; } cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); if ( opt_cpu_info ) printk(XENLOG_DEBUG "cpuid.MWAIT[eax=%x ebx=%x ecx=%x edx=%x]\n", eax, ebx, ecx, edx); /* Check whether this particular cx_type (in CST) is supported or not */ cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1; edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE); num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) ) ret = -ERANGE; /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */ else if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || !(ecx & CPUID5_ECX_INTERRUPT_BREAK) ) ret = -ENODEV; else if ( opt_cpu_info || cx->type >= BITS_PER_LONG || !test_and_set_bit(cx->type, &printed) ) printk(XENLOG_INFO "Monitor-Mwait will be used to enter C%d state\n", cx->type); return ret; } /* * Initialize bm_flags based on the CPU cache properties * On SMP it depends on cache configuration * - When cache is not shared among all CPUs, we flush cache * before entering C3. * - When cache is shared among all CPUs, we use bm_check * mechanism as in UP case * * This routine is called only after all the CPUs are online */ static void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags) { struct cpuinfo_x86 *c = ¤t_cpu_data; flags->bm_check = 0; if ( num_online_cpus() == 1 ) flags->bm_check = 1; else if ( (c->x86_vendor == X86_VENDOR_INTEL) || ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 0x15)) ) { /* * Today all MP CPUs that support C3 share cache. * And caches should not be flushed by software while * entering C3 type state. */ flags->bm_check = 1; } /* * On all recent platforms, ARB_DISABLE is a nop. 
* So, set bm_control to zero to indicate that ARB_DISABLE * is not required while entering C3 type state on * P4, Core and beyond CPUs */ if ( c->x86_vendor == X86_VENDOR_INTEL && (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)) ) flags->bm_control = 0; } #define VENDOR_INTEL (1) #define NATIVE_CSTATE_BEYOND_HALT (2) static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx) { static int bm_check_flag = -1; static int bm_control_flag = -1; switch ( cx->reg.space_id ) { case ACPI_ADR_SPACE_SYSTEM_IO: if ( cx->reg.address == 0 ) return -EINVAL; break; case ACPI_ADR_SPACE_FIXED_HARDWARE: if ( cx->reg.bit_width != VENDOR_INTEL || cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT ) return -EINVAL; /* assume all logical cpu has the same support for mwait */ if ( acpi_processor_ffh_cstate_probe(cx) ) return -EINVAL; break; default: return -ENODEV; } switch ( cx->type ) { case ACPI_STATE_C2: if ( local_apic_timer_c2_ok ) break; case ACPI_STATE_C3: if ( !lapic_timer_init() ) return -EINVAL; /* All the logic here assumes flags.bm_check is same across all CPUs */ if ( bm_check_flag < 0 ) { /* Determine whether bm_check is needed based on CPU */ acpi_processor_power_init_bm_check(&(power->flags)); } else { power->flags.bm_check = bm_check_flag; power->flags.bm_control = bm_control_flag; } if ( power->flags.bm_check ) { if ( !power->flags.bm_control ) { if ( power->flags.has_cst != 1 ) { /* bus mastering control is necessary */ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "C3 support requires BM control\n")); return -EINVAL; } else { /* Here we enter C3 without bus mastering */ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "C3 support without BM control\n")); } } /* * On older chipsets, BM_RLD needs to be set in order for Bus * Master activity to wake the system from C3, hence * acpi_set_register() is always being called once below. Newer * chipsets handle DMA during C3 automatically and BM_RLD is a * NOP. In either case, the proper way to handle BM_RLD is to * set it and leave it set. */ } else { /* * WBINVD should be set in fadt, for C3 state to be * supported on when bm_check is not required. 
*/ if ( !(acpi_gbl_FADT.flags & ACPI_FADT_WBINVD) ) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Cache invalidation should work properly" " for C3 to be enabled on SMP systems\n")); return -EINVAL; } } if ( bm_check_flag < 0 ) { bm_check_flag = power->flags.bm_check; bm_control_flag = power->flags.bm_control; acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, bm_check_flag); } break; } return 0; } static unsigned int latency_factor = 2; integer_param("idle_latency_factor", latency_factor); static void set_cx( struct acpi_processor_power *acpi_power, xen_processor_cx_t *xen_cx) { struct acpi_processor_cx *cx; if ( check_cx(acpi_power, xen_cx) != 0 ) return; switch ( xen_cx->type ) { case ACPI_STATE_C1: cx = &acpi_power->states[1]; break; default: if ( acpi_power->count >= ACPI_PROCESSOR_MAX_POWER ) { case ACPI_STATE_C0: printk(XENLOG_WARNING "CPU%u: C%d data ignored\n", acpi_power->cpu, xen_cx->type); return; } cx = &acpi_power->states[acpi_power->count]; cx->type = xen_cx->type; break; } cx->address = xen_cx->reg.address; switch ( xen_cx->reg.space_id ) { case ACPI_ADR_SPACE_FIXED_HARDWARE: if ( xen_cx->reg.bit_width == VENDOR_INTEL && xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT && boot_cpu_has(X86_FEATURE_MWAIT) ) cx->entry_method = ACPI_CSTATE_EM_FFH; else cx->entry_method = ACPI_CSTATE_EM_HALT; break; case ACPI_ADR_SPACE_SYSTEM_IO: if ( ioports_deny_access(dom0, cx->address, cx->address) ) printk(XENLOG_WARNING "Could not deny access to port %04x\n", cx->address); cx->entry_method = ACPI_CSTATE_EM_SYSIO; break; default: cx->entry_method = ACPI_CSTATE_EM_NONE; break; } cx->latency = xen_cx->latency; cx->target_residency = cx->latency * latency_factor; smp_wmb(); acpi_power->count++; if ( cx->type == ACPI_STATE_C1 || cx->type == ACPI_STATE_C2 ) acpi_power->safe_state = cx; } int get_cpu_id(u32 acpi_id) { int i; u32 apic_id; if ( acpi_id >= MAX_MADT_ENTRIES ) return -1; apic_id = x86_acpiid_to_apicid[acpi_id]; if ( apic_id == BAD_APICID ) return -1; for ( i = 0; i < nr_cpu_ids; i++ ) { if ( apic_id == x86_cpu_to_apicid[i] ) return i; } return -1; } #ifdef DEBUG_PM_CX static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power) { XEN_GUEST_HANDLE(xen_processor_cx_t) states; xen_processor_cx_t state; XEN_GUEST_HANDLE(xen_processor_csd_t) csd; xen_processor_csd_t dp; uint32_t i; printk("cpu%d cx acpi info:\n", cpu); printk("\tcount = %d\n", power->count); printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n" "\t pwr_setup_done[%d], bm_rld_set[%d]\n", power->flags.bm_control, power->flags.bm_check, power->flags.has_cst, power->flags.power_setup_done, power->flags.bm_rld_set); states = power->states; for ( i = 0; i < power->count; i++ ) { if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) ) return; printk("\tstates[%d]:\n", i); printk("\t\treg.space_id = %#x\n", state.reg.space_id); printk("\t\treg.bit_width = %#x\n", state.reg.bit_width); printk("\t\treg.bit_offset = %#x\n", state.reg.bit_offset); printk("\t\treg.access_size = %#x\n", state.reg.access_size); printk("\t\treg.address = %#"PRIx64"\n", state.reg.address); printk("\t\ttype = %d\n", state.type); printk("\t\tlatency = %d\n", state.latency); printk("\t\tpower = %d\n", state.power); csd = state.dp; printk("\t\tdp(@0x%p)\n", csd.p); if ( csd.p != NULL ) { if ( unlikely(copy_from_guest(&dp, csd, 1)) ) return; printk("\t\t\tdomain = %d\n", dp.domain); printk("\t\t\tcoord_type = %d\n", dp.coord_type); printk("\t\t\tnum = %d\n", dp.num); } } } #else #define print_cx_pminfo(c, p) #endif long set_cx_pminfo(uint32_t cpu, 
struct xen_processor_power *power) { XEN_GUEST_HANDLE(xen_processor_cx_t) states; xen_processor_cx_t xen_cx; struct acpi_processor_power *acpi_power; int cpu_id, i, ret; if ( unlikely(!guest_handle_okay(power->states, power->count)) ) return -EFAULT; if ( pm_idle_save && pm_idle != acpi_processor_idle ) return 0; print_cx_pminfo(cpu, power); /* map from acpi_id to cpu_id */ cpu_id = get_cpu_id(cpu); if ( cpu_id == -1 ) { static bool_t warn_once = 1; if ( warn_once || opt_cpu_info ) printk(XENLOG_WARNING "No CPU ID for APIC ID %#x\n", cpu); warn_once = 0; return -EINVAL; } ret = cpuidle_init_cpu(cpu_id); if ( ret < 0 ) return ret; acpi_power = processor_powers[cpu_id]; acpi_power->flags.bm_check = power->flags.bm_check; acpi_power->flags.bm_control = power->flags.bm_control; acpi_power->flags.has_cst = power->flags.has_cst; states = power->states; for ( i = 0; i < power->count; i++ ) { if ( unlikely(copy_from_guest_offset(&xen_cx, states, i, 1)) ) return -EFAULT; set_cx(acpi_power, &xen_cx); } if ( cpuidle_current_governor->enable && cpuidle_current_governor->enable(acpi_power) ) return -EFAULT; /* FIXME: C-state dependency is not supported by far */ if ( cpu_id == 0 ) { if ( pm_idle_save == NULL ) { pm_idle_save = pm_idle; pm_idle = acpi_processor_idle; } dead_idle = acpi_dead_idle; } return 0; } uint32_t pmstat_get_cx_nr(uint32_t cpuid) { return processor_powers[cpuid] ? processor_powers[cpuid]->count : 0; } int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat) { struct acpi_processor_power *power = processor_powers[cpuid]; uint64_t idle_usage = 0, idle_res = 0; uint64_t usage[ACPI_PROCESSOR_MAX_POWER], res[ACPI_PROCESSOR_MAX_POWER]; int i; struct hw_residencies hw_res; if ( power == NULL ) { stat->last = 0; stat->nr = 0; stat->idle_time = 0; return 0; } stat->last = power->last_state ? power->last_state->idx : 0; stat->idle_time = get_cpu_idle_time(cpuid); /* mimic the stat when detail info hasn't been registered by dom0 */ if ( pm_idle_save == NULL ) { stat->nr = 2; usage[1] = idle_usage = 1; res[1] = idle_res = stat->idle_time; memset(&hw_res, 0, sizeof(hw_res)); } else { stat->nr = power->count; for ( i = 1; i < power->count; i++ ) { spin_lock_irq(&power->stat_lock); usage[i] = power->states[i].usage; res[i] = tick_to_ns(power->states[i].time); spin_unlock_irq(&power->stat_lock); idle_usage += usage[i]; idle_res += res[i]; } get_hw_residencies(cpuid, &hw_res); } usage[0] = idle_usage; res[0] = NOW() - idle_res; if ( copy_to_guest(stat->triggers, usage, stat->nr) || copy_to_guest(stat->residencies, res, stat->nr) ) return -EFAULT; stat->pc2 = hw_res.pc2; stat->pc3 = hw_res.pc3; stat->pc6 = hw_res.pc6; stat->pc7 = hw_res.pc7; stat->cc3 = hw_res.cc3; stat->cc6 = hw_res.cc6; stat->cc7 = hw_res.cc7; return 0; } int pmstat_reset_cx_stat(uint32_t cpuid) { return 0; } void cpuidle_disable_deep_cstate(void) { if ( max_cstate > 1 ) { if ( local_apic_timer_c2_ok ) max_cstate = 2; else max_cstate = 1; } mb(); hpet_disable_legacy_broadcast(); } bool_t cpuidle_using_deep_cstate(void) { return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? 
2 : 1); } static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; /* Only hook on CPU_ONLINE because a dead cpu may utilize the info to * to enter deep C-state */ switch ( action ) { case CPU_ONLINE: (void)cpuidle_init_cpu(cpu); break; default: break; } return NOTIFY_DONE; } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; static int __init cpuidle_presmp_init(void) { void *cpu = (void *)(long)smp_processor_id(); if ( !xen_cpuidle ) return 0; mwait_idle_init(&cpu_nfb); cpu_nfb.notifier_call(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; } presmp_initcall(cpuidle_presmp_init); xen-4.4.0/xen/arch/x86/acpi/cpuidle_menu.c0000664000175000017500000002305112307313555016331 0ustar smbsmb/* * cpuidle_menu - menu governor for cpu idle, main idea come from Linux. * drivers/cpuidle/governors/menu.c * * Copyright (C) 2006-2007 Adam Belay * Copyright (C) 2007, 2008 Intel Corporation * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include #include #include #include #include #include #include #include #define BUCKETS 6 #define RESOLUTION 1024 #define DECAY 4 #define MAX_INTERESTING 50000 /* * Concepts and ideas behind the menu governor * * For the menu governor, there are 3 decision factors for picking a C * state: * 1) Energy break even point * 2) Performance impact * 3) Latency tolerance (TBD: from guest virtual C state) * These these three factors are treated independently. * * Energy break even point * ----------------------- * C state entry and exit have an energy cost, and a certain amount of time in * the C state is required to actually break even on this cost. CPUIDLE * provides us this duration in the "target_residency" field. So all that we * need is a good prediction of how long we'll be idle. Like the traditional * menu governor, we start with the actual known "next timer event" time. * * Since there are other source of wakeups (interrupts for example) than * the next timer event, this estimation is rather optimistic. To get a * more realistic estimate, a correction factor is applied to the estimate, * that is based on historic behavior. For example, if in the past the actual * duration always was 50% of the next timer tick, the correction factor will * be 0.5. * * menu uses a running average for this correction factor, however it uses a * set of factors, not just a single factor. This stems from the realization * that the ratio is dependent on the order of magnitude of the expected * duration; if we expect 500 milliseconds of idle time the likelihood of * getting an interrupt very early is much higher than if we expect 50 micro * seconds of idle time. 
* For this reason we keep an array of 6 independent factors, that gets * indexed based on the magnitude of the expected duration * * Limiting Performance Impact * --------------------------- * C states, especially those with large exit latencies, can have a real * noticable impact on workloads, which is not acceptable for most sysadmins, * and in addition, less performance has a power price of its own. * * As a general rule of thumb, menu assumes that the following heuristic * holds: * The busier the system, the less impact of C states is acceptable * * This rule-of-thumb is implemented using average interrupt interval: * If the exit latency times multiplier is longer than the average * interrupt interval, the C state is not considered a candidate * for selection due to a too high performance impact. So the smaller * the average interrupt interval is, the smaller C state latency should be * and thus the less likely a busy CPU will hit such a deep C state. * */ struct perf_factor{ s_time_t time_stamp; s_time_t duration; unsigned int irq_count_stamp; unsigned int irq_sum; }; struct menu_device { int last_state_idx; unsigned int expected_us; u64 predicted_us; unsigned int measured_us; unsigned int exit_us; unsigned int bucket; u64 correction_factor[BUCKETS]; struct perf_factor pf; }; static DEFINE_PER_CPU(struct menu_device, menu_devices); static inline int which_bucket(unsigned int duration) { int bucket = 0; if (duration < 10) return bucket; if (duration < 100) return bucket + 1; if (duration < 1000) return bucket + 2; if (duration < 10000) return bucket + 3; if (duration < 100000) return bucket + 4; return bucket + 5; } /* * Return the average interrupt interval to take I/O performance * requirements into account. The smaller the average interrupt * interval to be, the more busy I/O activity, and thus the higher * the barrier to go to an expensive C state. */ /* 5 milisec sampling period */ #define SAMPLING_PERIOD 5000000 /* for I/O interrupt, we give 8x multiplier compared to C state latency*/ #define IO_MULTIPLIER 8 static inline s_time_t avg_intr_interval_us(void) { struct menu_device *data = &__get_cpu_var(menu_devices); s_time_t duration, now; s_time_t avg_interval; unsigned int irq_sum; now = NOW(); duration = (data->pf.duration + (now - data->pf.time_stamp) * (DECAY - 1)) / DECAY; irq_sum = (data->pf.irq_sum + (this_cpu(irq_count) - data->pf.irq_count_stamp) * (DECAY - 1)) / DECAY; if (irq_sum == 0) /* no irq recently, so return a big enough interval: 1 sec */ avg_interval = 1000000; else avg_interval = duration / irq_sum / 1000; /* in us */ if ( duration >= SAMPLING_PERIOD){ data->pf.time_stamp = now; data->pf.duration = duration; data->pf.irq_count_stamp= this_cpu(irq_count); data->pf.irq_sum = irq_sum; } return avg_interval; } static unsigned int get_sleep_length_us(void) { s_time_t us = (this_cpu(timer_deadline) - NOW()) / 1000; /* * while us < 0 or us > (u32)-1, return a large u32, * choose (unsigned int)-2000 to avoid wrapping while added with exit * latency because the latency should not larger than 2ms */ return (us >> 32) ? 
(unsigned int)-2000 : (unsigned int)us; } static int menu_select(struct acpi_processor_power *power) { struct menu_device *data = &__get_cpu_var(menu_devices); int i; s_time_t io_interval; /* TBD: Change to 0 if C0(polling mode) support is added later*/ data->last_state_idx = CPUIDLE_DRIVER_STATE_START; data->exit_us = 0; /* determine the expected residency time, round up */ data->expected_us = get_sleep_length_us(); data->bucket = which_bucket(data->expected_us); io_interval = avg_intr_interval_us(); /* * if the correction factor is 0 (eg first time init or cpu hotplug * etc), we actually want to start out with a unity factor. */ if (data->correction_factor[data->bucket] == 0) data->correction_factor[data->bucket] = RESOLUTION * DECAY; /* Make sure to round up for half microseconds */ data->predicted_us = DIV_ROUND( data->expected_us * data->correction_factor[data->bucket], RESOLUTION * DECAY); /* find the deepest idle state that satisfies our constraints */ for ( i = CPUIDLE_DRIVER_STATE_START + 1; i < power->count; i++ ) { struct acpi_processor_cx *s = &power->states[i]; if (s->target_residency > data->predicted_us) break; if (s->latency * IO_MULTIPLIER > io_interval) break; /* TBD: we need to check the QoS requirment in future */ data->exit_us = s->latency; data->last_state_idx = i; } return data->last_state_idx; } static void menu_reflect(struct acpi_processor_power *power) { struct menu_device *data = &__get_cpu_var(menu_devices); unsigned int last_idle_us = power->last_residency; unsigned int measured_us; u64 new_factor; measured_us = last_idle_us; /* * We correct for the exit latency; we are assuming here that the * exit latency happens after the event that we're interested in. */ if (measured_us > data->exit_us) measured_us -= data->exit_us; /* update our correction ratio */ new_factor = data->correction_factor[data->bucket] * (DECAY - 1) / DECAY; if (data->expected_us > 0 && data->measured_us < MAX_INTERESTING) new_factor += RESOLUTION * measured_us / data->expected_us; else /* * we were idle so long that we count it as a perfect * prediction */ new_factor += RESOLUTION; /* * We don't want 0 as factor; we always want at least * a tiny bit of estimated time. */ if (new_factor == 0) new_factor = 1; data->correction_factor[data->bucket] = new_factor; } static int menu_enable_device(struct acpi_processor_power *power) { if (!cpu_online(power->cpu)) return -1; memset(&per_cpu(menu_devices, power->cpu), 0, sizeof(struct menu_device)); return 0; } static struct cpuidle_governor menu_governor = { .name = "menu", .rating = 20, .enable = menu_enable_device, .select = menu_select, .reflect = menu_reflect, }; struct cpuidle_governor *cpuidle_current_governor = &menu_governor; void menu_get_trace_data(u32 *expected, u32 *pred) { struct menu_device *data = &__get_cpu_var(menu_devices); *expected = data->expected_us; *pred = data->predicted_us; } xen-4.4.0/xen/arch/x86/acpi/lib.c0000664000175000017500000000701112307313555014424 0ustar smbsmb/* * lib.c - Architecture-Specific Low-Level ACPI Support * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include #include #include #include #include #include u32 __read_mostly acpi_smi_cmd; u8 __read_mostly acpi_enable_value; u8 __read_mostly acpi_disable_value; u32 __read_mostly x86_acpiid_to_apicid[MAX_MADT_ENTRIES] = {[0 ... MAX_MADT_ENTRIES - 1] = BAD_APICID }; /* * Important Safety Note: The fixed ACPI page numbers are *subtracted* * from the fixed base. That's why we start at FIX_ACPI_END and * count idx down while incrementing the phys address. */ char *__acpi_map_table(paddr_t phys, unsigned long size) { unsigned long base, offset, mapped_size; int idx; /* XEN: RAM holes above 1MB are not permanently mapped. */ if ((phys + size) <= (1 * 1024 * 1024)) return __va(phys); offset = phys & (PAGE_SIZE - 1); mapped_size = PAGE_SIZE - offset; set_fixmap(FIX_ACPI_END, phys); base = fix_to_virt(FIX_ACPI_END); /* * Most cases can be covered by the below. */ idx = FIX_ACPI_END; while (mapped_size < size) { if (--idx < FIX_ACPI_BEGIN) return NULL; /* cannot handle this */ phys += PAGE_SIZE; set_fixmap(idx, phys); mapped_size += PAGE_SIZE; } return ((char *) base + offset); } unsigned int acpi_get_processor_id(unsigned int cpu) { unsigned int acpiid, apicid; if ((apicid = x86_cpu_to_apicid[cpu]) == BAD_APICID) return INVALID_ACPIID; for (acpiid = 0; acpiid < ARRAY_SIZE(x86_acpiid_to_apicid); acpiid++) if (x86_acpiid_to_apicid[acpiid] == apicid) return acpiid; return INVALID_ACPIID; } static void get_mwait_ecx(void *info) { *(u32 *)info = cpuid_ecx(CPUID_MWAIT_LEAF); } int arch_acpi_set_pdc_bits(u32 acpi_id, u32 *pdc, u32 mask) { unsigned int cpu = get_cpu_id(acpi_id); struct cpuinfo_x86 *c; u32 ecx; if (!(acpi_id + 1)) c = &boot_cpu_data; else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -EINVAL; else c = cpu_data + cpu; pdc[2] |= ACPI_PDC_C_CAPABILITY_SMP & mask; if (cpu_has(c, X86_FEATURE_EST)) pdc[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP & mask; if (cpu_has(c, X86_FEATURE_ACPI)) pdc[2] |= ACPI_PDC_T_FFH & mask; /* * If mwait/monitor or its break-on-interrupt extension are * unsupported, Cx_FFH will be disabled. 
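 * The check below reads ECX of CPUID leaf 5 (CPUID_MWAIT_LEAF): unless
 * both the MWAIT-extensions flag and the break-on-interrupt flag are
 * reported, the C1 and C2/C3 FFH capability bits are cleared from the
 * _PDC word handed back to the firmware.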
*/ if (!cpu_has(c, X86_FEATURE_MWAIT) || c->cpuid_level < CPUID_MWAIT_LEAF) ecx = 0; else if (c == &boot_cpu_data || cpu == smp_processor_id()) ecx = cpuid_ecx(CPUID_MWAIT_LEAF); else on_selected_cpus(cpumask_of(cpu), get_mwait_ecx, &ecx, 1); if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) pdc[2] &= ~(ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH); return 0; } xen-4.4.0/xen/arch/x86/acpi/cpufreq/0000775000175000017500000000000012307313555015160 5ustar smbsmbxen-4.4.0/xen/arch/x86/acpi/cpufreq/Makefile0000664000175000017500000000004712307313555016621 0ustar smbsmbobj-y += cpufreq.o obj-y += powernow.o xen-4.4.0/xen/arch/x86/acpi/cpufreq/cpufreq.c0000664000175000017500000004467412307313555017010 0ustar smbsmb/* * cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.4 $) * * Copyright (C) 2001, 2002 Andy Grover * Copyright (C) 2001, 2002 Paul Diefenbaugh * Copyright (C) 2002 - 2004 Dominik Brodowski * Copyright (C) 2006 Denis Sadykov * * Feb 2008 - Liu Jinsong * porting acpi-cpufreq.c from Linux 2.6.23 to Xen hypervisor * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
* * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include enum { UNDEFINED_CAPABLE = 0, SYSTEM_INTEL_MSR_CAPABLE, SYSTEM_IO_CAPABLE, }; #define INTEL_MSR_RANGE (0xffffull) #define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1) struct acpi_cpufreq_data *cpufreq_drv_data[NR_CPUS]; static struct cpufreq_driver acpi_cpufreq_driver; static unsigned int __read_mostly acpi_pstate_strict; integer_param("acpi_pstate_strict", acpi_pstate_strict); static int check_est_cpu(unsigned int cpuid) { struct cpuinfo_x86 *cpu = &cpu_data[cpuid]; if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) return 0; return 1; } static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) { struct processor_performance *perf; int i; perf = data->acpi_data; for (i=0; istate_count; i++) { if (value == perf->states[i].status) return data->freq_table[i].frequency; } return 0; } static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data) { int i; struct processor_performance *perf; msr &= INTEL_MSR_RANGE; perf = data->acpi_data; for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { if (msr == perf->states[data->freq_table[i].index].status) return data->freq_table[i].frequency; } return data->freq_table[0].frequency; } static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) { switch (data->arch_cpu_flags) { case SYSTEM_INTEL_MSR_CAPABLE: return extract_msr(val, data); case SYSTEM_IO_CAPABLE: return extract_io(val, data); default: return 0; } } struct msr_addr { u32 reg; }; struct io_addr { u16 port; u8 bit_width; }; typedef union { struct msr_addr msr; struct io_addr io; } drv_addr_union; struct drv_cmd { unsigned int type; const cpumask_t *mask; drv_addr_union addr; u32 val; }; static void do_drv_read(void *drvcmd) { struct drv_cmd *cmd; cmd = (struct drv_cmd *)drvcmd; switch (cmd->type) { case SYSTEM_INTEL_MSR_CAPABLE: rdmsrl(cmd->addr.msr.reg, cmd->val); break; case SYSTEM_IO_CAPABLE: acpi_os_read_port((acpi_io_address)cmd->addr.io.port, &cmd->val, (u32)cmd->addr.io.bit_width); break; default: break; } } static void do_drv_write(void *drvcmd) { struct drv_cmd *cmd; uint64_t msr_content; cmd = (struct drv_cmd *)drvcmd; switch (cmd->type) { case SYSTEM_INTEL_MSR_CAPABLE: rdmsrl(cmd->addr.msr.reg, msr_content); msr_content = (msr_content & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE); wrmsrl(cmd->addr.msr.reg, msr_content); break; case SYSTEM_IO_CAPABLE: acpi_os_write_port((acpi_io_address)cmd->addr.io.port, cmd->val, (u32)cmd->addr.io.bit_width); break; default: break; } } static void drv_read(struct drv_cmd *cmd) { cmd->val = 0; ASSERT(cpumask_weight(cmd->mask) == 1); /* to reduce IPI for the sake of performance */ if (likely(cpumask_test_cpu(smp_processor_id(), cmd->mask))) do_drv_read((void *)cmd); else on_selected_cpus(cmd->mask, do_drv_read, cmd, 1); } static void drv_write(struct drv_cmd *cmd) { if (cpumask_equal(cmd->mask, cpumask_of(smp_processor_id()))) do_drv_write((void *)cmd); else on_selected_cpus(cmd->mask, do_drv_write, cmd, 1); } static u32 get_cur_val(const cpumask_t *mask) { struct cpufreq_policy *policy; struct processor_performance *perf; struct drv_cmd cmd; unsigned int cpu = smp_processor_id(); if (unlikely(cpumask_empty(mask))) return 0; if (!cpumask_test_cpu(cpu, mask)) cpu = cpumask_first(mask); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return 0; policy = 
per_cpu(cpufreq_cpu_policy, cpu); if (!policy || !cpufreq_drv_data[policy->cpu]) return 0; switch (cpufreq_drv_data[policy->cpu]->arch_cpu_flags) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; break; case SYSTEM_IO_CAPABLE: cmd.type = SYSTEM_IO_CAPABLE; perf = cpufreq_drv_data[policy->cpu]->acpi_data; cmd.addr.io.port = perf->control_register.address; cmd.addr.io.bit_width = perf->control_register.bit_width; break; default: return 0; } cmd.mask = cpumask_of(cpu); drv_read(&cmd); return cmd.val; } struct perf_pair { union { struct { uint32_t lo; uint32_t hi; } split; uint64_t whole; } aperf, mperf; }; static DEFINE_PER_CPU(struct perf_pair, gov_perf_pair); static DEFINE_PER_CPU(struct perf_pair, usr_perf_pair); static void read_measured_perf_ctrs(void *_readin) { struct perf_pair *readin = _readin; rdmsrl(MSR_IA32_APERF, readin->aperf.whole); rdmsrl(MSR_IA32_MPERF, readin->mperf.whole); } /* * Return the measured active (C0) frequency on this CPU since last call * to this function. * Input: cpu number * Return: Average CPU frequency in terms of max frequency (zero on error) * * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance * over a period of time, while CPU is in C0 state. * IA32_MPERF counts at the rate of max advertised frequency * IA32_APERF counts at the rate of actual CPU frequency * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and * no meaning should be associated with absolute values of these MSRs. */ unsigned int get_measured_perf(unsigned int cpu, unsigned int flag) { struct cpufreq_policy *policy; struct perf_pair readin, cur, *saved; unsigned int perf_percent; unsigned int retval; if (!cpu_online(cpu)) return 0; policy = per_cpu(cpufreq_cpu_policy, cpu); if (!policy || !policy->aperf_mperf) return 0; switch (flag) { case GOV_GETAVG: { saved = &per_cpu(gov_perf_pair, cpu); break; } case USR_GETAVG: { saved = &per_cpu(usr_perf_pair, cpu); break; } default: return 0; } if (cpu == smp_processor_id()) { read_measured_perf_ctrs((void *)&readin); } else { on_selected_cpus(cpumask_of(cpu), read_measured_perf_ctrs, &readin, 1); } cur.aperf.whole = readin.aperf.whole - saved->aperf.whole; cur.mperf.whole = readin.mperf.whole - saved->mperf.whole; saved->aperf.whole = readin.aperf.whole; saved->mperf.whole = readin.mperf.whole; if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) { int shift_count = 7; cur.aperf.whole >>= shift_count; cur.mperf.whole >>= shift_count; } if (cur.aperf.whole && cur.mperf.whole) perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole; else perf_percent = 0; retval = policy->cpuinfo.max_freq * perf_percent / 100; return retval; } static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { struct cpufreq_policy *policy; struct acpi_cpufreq_data *data; unsigned int freq; if (!cpu_online(cpu)) return 0; policy = per_cpu(cpufreq_cpu_policy, cpu); if (!policy) return 0; data = cpufreq_drv_data[policy->cpu]; if (unlikely(data == NULL || data->acpi_data == NULL || data->freq_table == NULL)) return 0; freq = extract_freq(get_cur_val(cpumask_of(cpu)), data); return freq; } static void feature_detect(void *info) { struct cpufreq_policy *policy = info; unsigned int eax, ecx; ecx = cpuid_ecx(6); if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY) { policy->aperf_mperf = 1; acpi_cpufreq_driver.getavg = get_measured_perf; } eax = cpuid_eax(6); if (eax & 0x2) { policy->turbo = CPUFREQ_TURBO_ENABLED; if (cpufreq_verbose) printk(XENLOG_INFO "CPU%u: Turbo Mode detected and 
enabled\n", smp_processor_id()); } } static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq, struct acpi_cpufreq_data *data) { unsigned int cur_freq; unsigned int i; for (i=0; i<100; i++) { cur_freq = extract_freq(get_cur_val(mask), data); if (cur_freq == freq) return 1; udelay(10); } return 0; } static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { struct acpi_cpufreq_data *data = cpufreq_drv_data[policy->cpu]; struct processor_performance *perf; struct cpufreq_freqs freqs; cpumask_t online_policy_cpus; struct drv_cmd cmd; unsigned int next_state = 0; /* Index into freq_table */ unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int j; int result = 0; if (unlikely(data == NULL || data->acpi_data == NULL || data->freq_table == NULL)) { return -ENODEV; } if (policy->turbo == CPUFREQ_TURBO_DISABLED) if (target_freq > policy->cpuinfo.second_max_freq) target_freq = policy->cpuinfo.second_max_freq; perf = data->acpi_data; result = cpufreq_frequency_table_target(policy, data->freq_table, target_freq, relation, &next_state); if (unlikely(result)) return -ENODEV; cpumask_and(&online_policy_cpus, &cpu_online_map, policy->cpus); next_perf_state = data->freq_table[next_state].index; if (perf->state == next_perf_state) { if (unlikely(policy->resume)) policy->resume = 0; else return 0; } switch (data->arch_cpu_flags) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; cmd.addr.msr.reg = MSR_IA32_PERF_CTL; cmd.val = (u32) perf->states[next_perf_state].control; break; case SYSTEM_IO_CAPABLE: cmd.type = SYSTEM_IO_CAPABLE; cmd.addr.io.port = perf->control_register.address; cmd.addr.io.bit_width = perf->control_register.bit_width; cmd.val = (u32) perf->states[next_perf_state].control; break; default: return -ENODEV; } if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) cmd.mask = &online_policy_cpus; else cmd.mask = cpumask_of(policy->cpu); freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; drv_write(&cmd); if (acpi_pstate_strict && !check_freqs(cmd.mask, freqs.new, data)) { printk(KERN_WARNING "Fail transfer to new freq %d\n", freqs.new); return -EAGAIN; } for_each_cpu(j, &online_policy_cpus) cpufreq_statistic_update(j, perf->state, next_perf_state); perf->state = next_perf_state; policy->cur = freqs.new; return result; } static int acpi_cpufreq_verify(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data; struct processor_performance *perf; if (!policy || !(data = cpufreq_drv_data[policy->cpu]) || !processor_pminfo[policy->cpu]) return -EINVAL; perf = &processor_pminfo[policy->cpu]->perf; cpufreq_verify_within_limits(policy, 0, perf->states[perf->platform_limit].core_frequency * 1000); return cpufreq_frequency_table_verify(policy, data->freq_table); } static unsigned long acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu) { struct processor_performance *perf = data->acpi_data; if (cpu_khz) { /* search the closest match to cpu_khz */ unsigned int i; unsigned long freq; unsigned long freqn = perf->states[0].core_frequency * 1000; for (i=0; i<(perf->state_count-1); i++) { freq = freqn; freqn = perf->states[i+1].core_frequency * 1000; if ((2 * cpu_khz) > (freqn + freq)) { perf->state = i; return freq; } } perf->state = perf->state_count-1; return freqn; } else { /* assume CPU is at P0... 
*/ perf->state = 0; return perf->states[0].core_frequency * 1000; } } static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) { unsigned int i; unsigned int valid_states = 0; unsigned int cpu = policy->cpu; struct acpi_cpufreq_data *data; unsigned int result = 0; struct cpuinfo_x86 *c = &cpu_data[policy->cpu]; struct processor_performance *perf; data = xzalloc(struct acpi_cpufreq_data); if (!data) return -ENOMEM; cpufreq_drv_data[cpu] = data; data->acpi_data = &processor_pminfo[cpu]->perf; perf = data->acpi_data; policy->shared_type = perf->shared_type; switch (perf->control_register.space_id) { case ACPI_ADR_SPACE_SYSTEM_IO: if (cpufreq_verbose) printk("xen_pminfo: @acpi_cpufreq_cpu_init," "SYSTEM IO addr space\n"); data->arch_cpu_flags = SYSTEM_IO_CAPABLE; break; case ACPI_ADR_SPACE_FIXED_HARDWARE: if (cpufreq_verbose) printk("xen_pminfo: @acpi_cpufreq_cpu_init," "HARDWARE addr space\n"); if (!check_est_cpu(cpu)) { result = -ENODEV; goto err_unreg; } data->arch_cpu_flags = SYSTEM_INTEL_MSR_CAPABLE; break; default: result = -ENODEV; goto err_unreg; } data->freq_table = xmalloc_array(struct cpufreq_frequency_table, (perf->state_count+1)); if (!data->freq_table) { result = -ENOMEM; goto err_unreg; } /* detect transition latency */ policy->cpuinfo.transition_latency = 0; for (i=0; istate_count; i++) { if ((perf->states[i].transition_latency * 1000) > policy->cpuinfo.transition_latency) policy->cpuinfo.transition_latency = perf->states[i].transition_latency * 1000; } policy->governor = cpufreq_opt_governor ? : CPUFREQ_DEFAULT_GOVERNOR; /* table init */ for (i=0; istate_count; i++) { if (i>0 && perf->states[i].core_frequency >= data->freq_table[valid_states-1].frequency / 1000) continue; data->freq_table[valid_states].index = i; data->freq_table[valid_states].frequency = perf->states[i].core_frequency * 1000; valid_states++; } data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; perf->state = 0; result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); if (result) goto err_freqfree; switch (perf->control_register.space_id) { case ACPI_ADR_SPACE_SYSTEM_IO: /* Current speed is unknown and not detectable by IO port */ policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu); break; case ACPI_ADR_SPACE_FIXED_HARDWARE: acpi_cpufreq_driver.get = get_cur_freq_on_cpu; policy->cur = get_cur_freq_on_cpu(cpu); break; default: break; } /* Check for APERF/MPERF support in hardware * also check for boost support */ if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) on_selected_cpus(cpumask_of(cpu), feature_detect, policy, 1); /* * the first call to ->target() should result in us actually * writing something to the appropriate registers. 
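 * Setting policy->resume here makes acpi_cpufreq_target() skip its
 * "already at the requested P-state" early return exactly once, so the
 * control MSR or IO port really gets programmed on the first request.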
*/ policy->resume = 1; return result; err_freqfree: xfree(data->freq_table); err_unreg: xfree(data); cpufreq_drv_data[cpu] = NULL; return result; } static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = cpufreq_drv_data[policy->cpu]; if (data) { cpufreq_drv_data[policy->cpu] = NULL; xfree(data->freq_table); xfree(data); } return 0; } static struct cpufreq_driver acpi_cpufreq_driver = { .name = "acpi-cpufreq", .verify = acpi_cpufreq_verify, .target = acpi_cpufreq_target, .init = acpi_cpufreq_cpu_init, .exit = acpi_cpufreq_cpu_exit, }; static int __init cpufreq_driver_init(void) { int ret = 0; if ((cpufreq_controller == FREQCTL_xen) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) ret = cpufreq_register_driver(&acpi_cpufreq_driver); else if ((cpufreq_controller == FREQCTL_xen) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) ret = powernow_register_driver(); return ret; } __initcall(cpufreq_driver_init); int cpufreq_cpu_init(unsigned int cpuid) { int ret; /* Currently we only handle Intel and AMD processor */ if ( (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) ) ret = cpufreq_add_cpu(cpuid); else ret = -EFAULT; return ret; } xen-4.4.0/xen/arch/x86/acpi/cpufreq/powernow.c0000664000175000017500000002662312307313555017215 0ustar smbsmb/* * powernow - AMD Architectural P-state Driver ($Revision: 1.4 $) * * Copyright (C) 2008 Mark Langsdorf * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
* * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1) #define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 #define CPB_CAPABLE 0x00000200 #define USE_HW_PSTATE 0x00000080 #define HW_PSTATE_MASK 0x00000007 #define HW_PSTATE_VALID_MASK 0x80000000 #define HW_PSTATE_MAX_MASK 0x000000f0 #define HW_PSTATE_MAX_SHIFT 4 #define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ #define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ #define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ #define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ #define MSR_HWCR_CPBDIS_MASK 0x02000000ULL #define ARCH_CPU_FLAG_RESUME 1 static struct cpufreq_driver powernow_cpufreq_driver; static void transition_pstate(void *pstate) { wrmsrl(MSR_PSTATE_CTRL, *(unsigned int *)pstate); } static void update_cpb(void *data) { struct cpufreq_policy *policy = (struct cpufreq_policy *)data; if (policy->turbo != CPUFREQ_TURBO_UNSUPPORTED) { uint64_t msr_content; rdmsrl(MSR_K8_HWCR, msr_content); if (policy->turbo == CPUFREQ_TURBO_ENABLED) msr_content &= ~MSR_HWCR_CPBDIS_MASK; else msr_content |= MSR_HWCR_CPBDIS_MASK; wrmsrl(MSR_K8_HWCR, msr_content); } } static int powernow_cpufreq_update (int cpuid, struct cpufreq_policy *policy) { if (!cpumask_test_cpu(cpuid, &cpu_online_map)) return -EINVAL; on_selected_cpus(cpumask_of(cpuid), update_cpb, policy, 1); return 0; } static int powernow_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { struct acpi_cpufreq_data *data = cpufreq_drv_data[policy->cpu]; struct processor_performance *perf; unsigned int next_state; /* Index into freq_table */ unsigned int next_perf_state; /* Index into perf table */ int result; if (unlikely(data == NULL || data->acpi_data == NULL || data->freq_table == NULL)) { return -ENODEV; } perf = data->acpi_data; result = cpufreq_frequency_table_target(policy, data->freq_table, target_freq, relation, &next_state); if (unlikely(result)) return result; next_perf_state = data->freq_table[next_state].index; if (perf->state == next_perf_state) { if (unlikely(data->arch_cpu_flags & ARCH_CPU_FLAG_RESUME)) data->arch_cpu_flags &= ~ARCH_CPU_FLAG_RESUME; else return 0; } if (policy->shared_type == CPUFREQ_SHARED_TYPE_HW && likely(policy->cpu == smp_processor_id())) { transition_pstate(&next_perf_state); cpufreq_statistic_update(policy->cpu, perf->state, next_perf_state); } else { cpumask_t online_policy_cpus; unsigned int cpu; cpumask_and(&online_policy_cpus, policy->cpus, &cpu_online_map); if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || unlikely(policy->cpu != smp_processor_id())) on_selected_cpus(&online_policy_cpus, transition_pstate, &next_perf_state, 1); else transition_pstate(&next_perf_state); for_each_cpu(cpu, &online_policy_cpus) cpufreq_statistic_update(cpu, perf->state, next_perf_state); } perf->state = next_perf_state; policy->cur = data->freq_table[next_state].frequency; return 0; } static void amd_fixup_frequency(struct xen_processor_px *px) { u32 hi, lo, fid, did; int index = px->control & 0x00000007; const struct cpuinfo_x86 *c = ¤t_cpu_data; if ((c->x86 != 0x10 || c->x86_model >= 10) && c->x86 != 0x11) return; rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); /* * MSR C001_0064+: * Bit 63: PstateEn. Read-write. If set, the P-state is valid. 
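 * The low word encodes CpuFid in bits 5:0 and CpuDid in bits 8:6; the
 * fixup below recomputes the core frequency as 100 * (fid + 0x10) >> did
 * MHz on family 0x10 (e.g. fid 0x0c with did 0 gives 2800 MHz, did 1
 * halves that to 1400 MHz) and as 100 * (fid + 8) >> did MHz on family
 * 0x11.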
*/ if (!(hi & (1U << 31))) return; fid = lo & 0x3f; did = (lo >> 6) & 7; if (c->x86 == 0x10) px->core_frequency = (100 * (fid + 16)) >> did; else px->core_frequency = (100 * (fid + 8)) >> did; } struct amd_cpu_data { struct processor_performance *perf; u32 max_hw_pstate; }; static void get_cpu_data(void *arg) { struct amd_cpu_data *data = arg; struct processor_performance *perf = data->perf; uint64_t msr_content; unsigned int i; rdmsrl(MSR_PSTATE_CUR_LIMIT, msr_content); data->max_hw_pstate = (msr_content & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; for (i = 0; i < perf->state_count && i <= data->max_hw_pstate; i++) amd_fixup_frequency(&perf->states[i]); } static int powernow_cpufreq_verify(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data; struct processor_performance *perf; if (!policy || !(data = cpufreq_drv_data[policy->cpu]) || !processor_pminfo[policy->cpu]) return -EINVAL; perf = &processor_pminfo[policy->cpu]->perf; cpufreq_verify_within_limits(policy, 0, perf->states[perf->platform_limit].core_frequency * 1000); return cpufreq_frequency_table_verify(policy, data->freq_table); } static void feature_detect(void *info) { struct cpufreq_policy *policy = info; unsigned int ecx, edx; ecx = cpuid_ecx(6); if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY) { policy->aperf_mperf = 1; powernow_cpufreq_driver.getavg = get_measured_perf; } edx = cpuid_edx(CPUID_FREQ_VOLT_CAPABILITIES); if ((edx & CPB_CAPABLE) == CPB_CAPABLE) { policy->turbo = CPUFREQ_TURBO_ENABLED; if (cpufreq_verbose) printk(XENLOG_INFO "CPU%u: Core Boost/Turbo detected and enabled\n", smp_processor_id()); } } static int powernow_cpufreq_cpu_init(struct cpufreq_policy *policy) { unsigned int i; unsigned int valid_states = 0; unsigned int cpu = policy->cpu; struct acpi_cpufreq_data *data; unsigned int result = 0; struct processor_performance *perf; struct amd_cpu_data info; struct cpuinfo_x86 *c = &cpu_data[policy->cpu]; data = xzalloc(struct acpi_cpufreq_data); if (!data) return -ENOMEM; cpufreq_drv_data[cpu] = data; data->acpi_data = &processor_pminfo[cpu]->perf; info.perf = perf = data->acpi_data; policy->shared_type = perf->shared_type; if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { cpumask_set_cpu(cpu, policy->cpus); if (cpumask_weight(policy->cpus) != 1) { printk(XENLOG_WARNING "Unsupported sharing type %d (%u CPUs)\n", policy->shared_type, cpumask_weight(policy->cpus)); result = -ENODEV; goto err_unreg; } } else { cpumask_copy(policy->cpus, cpumask_of(cpu)); } /* capability check */ if (perf->state_count <= 1) { printk("No P-States\n"); result = -ENODEV; goto err_unreg; } if (perf->control_register.space_id != perf->status_register.space_id) { result = -ENODEV; goto err_unreg; } data->freq_table = xmalloc_array(struct cpufreq_frequency_table, (perf->state_count+1)); if (!data->freq_table) { result = -ENOMEM; goto err_unreg; } /* detect transition latency */ policy->cpuinfo.transition_latency = 0; for (i=0; istate_count; i++) { if ((perf->states[i].transition_latency * 1000) > policy->cpuinfo.transition_latency) policy->cpuinfo.transition_latency = perf->states[i].transition_latency * 1000; } policy->governor = cpufreq_opt_governor ? 
: CPUFREQ_DEFAULT_GOVERNOR; on_selected_cpus(cpumask_of(cpu), get_cpu_data, &info, 1); /* table init */ for (i = 0; i < perf->state_count && i <= info.max_hw_pstate; i++) { if (i > 0 && perf->states[i].core_frequency >= data->freq_table[valid_states-1].frequency / 1000) continue; data->freq_table[valid_states].index = perf->states[i].control & HW_PSTATE_MASK; data->freq_table[valid_states].frequency = perf->states[i].core_frequency * 1000; valid_states++; } data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; perf->state = 0; result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); if (result) goto err_freqfree; if (c->cpuid_level >= 6) on_selected_cpus(cpumask_of(cpu), feature_detect, policy, 1); /* * the first call to ->target() should result in us actually * writing something to the appropriate registers. */ data->arch_cpu_flags |= ARCH_CPU_FLAG_RESUME; policy->cur = data->freq_table[i].frequency; return result; err_freqfree: xfree(data->freq_table); err_unreg: xfree(data); cpufreq_drv_data[cpu] = NULL; return result; } static int powernow_cpufreq_cpu_exit(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = cpufreq_drv_data[policy->cpu]; if (data) { cpufreq_drv_data[policy->cpu] = NULL; xfree(data->freq_table); xfree(data); } return 0; } static struct cpufreq_driver powernow_cpufreq_driver = { .verify = powernow_cpufreq_verify, .target = powernow_cpufreq_target, .init = powernow_cpufreq_cpu_init, .exit = powernow_cpufreq_cpu_exit, .update = powernow_cpufreq_update }; unsigned int __init powernow_register_driver() { unsigned int i, ret = 0; for_each_online_cpu(i) { struct cpuinfo_x86 *c = &cpu_data[i]; if (c->x86_vendor != X86_VENDOR_AMD) ret = -ENODEV; else { u32 eax, ebx, ecx, edx; cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE) ret = -ENODEV; } if (ret) return ret; } ret = cpufreq_register_driver(&powernow_cpufreq_driver); return ret; } xen-4.4.0/xen/arch/x86/setup.c0000664000175000017500000013472212307313555014114 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for opt_tmem only */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for bzimage_headroom */ #include /* for generic_apic_probe */ #include #include #include /* opt_nosmp: If true, secondary processors are ignored. */ static bool_t __initdata opt_nosmp; boolean_param("nosmp", opt_nosmp); /* maxcpus: maximum number of CPUs to activate. */ static unsigned int __initdata max_cpus; integer_param("maxcpus", max_cpus); /* smep: Enable/disable Supervisor Mode Execution Protection (default on). */ static bool_t __initdata disable_smep; invbool_param("smep", disable_smep); /* **** Linux config option: propagated to domain0. */ /* "acpi=off": Sisables both ACPI table parsing and interpreter. */ /* "acpi=force": Override the disable blacklist. */ /* "acpi=strict": Disables out-of-spec workarounds. */ /* "acpi=ht": Limit ACPI just to boot-time to enable HT. */ /* "acpi=noirq": Disables ACPI interrupt routing. */ static void parse_acpi_param(char *s); custom_param("acpi", parse_acpi_param); /* **** Linux config option: propagated to domain0. */ /* acpi_skip_timer_override: Skip IRQ0 overrides. 
*/ boolean_param("acpi_skip_timer_override", acpi_skip_timer_override); /* **** Linux config option: propagated to domain0. */ /* noapic: Disable IOAPIC setup. */ boolean_param("noapic", skip_ioapic_setup); /* **** Linux config option: propagated to domain0. */ /* xen_cpuidle: xen control cstate. */ s8 __read_mostly xen_cpuidle = -1; boolean_param("cpuidle", xen_cpuidle); #ifndef NDEBUG unsigned long __initdata highmem_start; size_param("highmem-start", highmem_start); #endif cpumask_t __read_mostly cpu_present_map; unsigned long __read_mostly xen_phys_start; unsigned long __read_mostly xen_virt_end; DEFINE_PER_CPU(struct tss_struct, init_tss); char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE]; struct cpuinfo_x86 __read_mostly boot_cpu_data = { 0, 0, 0, 0, -1 }; unsigned long __read_mostly mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE; bool_t __initdata acpi_disabled; bool_t __initdata acpi_force; static char __initdata acpi_param[10] = ""; static void __init parse_acpi_param(char *s) { /* Save the parameter so it can be propagated to domain0. */ safe_strcpy(acpi_param, s); /* Interpret the parameter for use within Xen. */ if ( !parse_bool(s) ) { disable_acpi(); } else if ( !strcmp(s, "force") ) { acpi_force = 1; acpi_ht = 1; acpi_disabled = 0; } else if ( !strcmp(s, "ht") ) { if ( !acpi_force ) disable_acpi(); acpi_ht = 1; } else if ( !strcmp(s, "noirq") ) { acpi_noirq_set(); } } #define EARLY_FAIL(f, a...) do { \ printk( f , ## a ); \ for ( ; ; ) halt(); \ } while (0) static const module_t *__initdata initial_images; static unsigned int __initdata nr_initial_images; unsigned long __init initial_images_nrpages(void) { unsigned long nr; unsigned int i; for ( nr = i = 0; i < nr_initial_images; ++i ) nr += PFN_UP(initial_images[i].mod_end); return nr; } void __init discard_initial_images(void) { unsigned int i; for ( i = 0; i < nr_initial_images; ++i ) { uint64_t start = (uint64_t)initial_images[i].mod_start << PAGE_SHIFT; init_domheap_pages(start, start + PAGE_ALIGN(initial_images[i].mod_end)); } nr_initial_images = 0; initial_images = NULL; } static void free_xen_data(char *s, char *e) { #ifndef MEMORY_GUARD init_xenheap_pages(__pa(s), __pa(e)); #endif memguard_guard_range(s, e-s); /* Also zap the mapping in the 1:1 area. */ memguard_guard_range(__va(__pa(s)), e-s); } extern char __init_begin[], __init_end[], __bss_start[]; static void __init init_idle_domain(void) { scheduler_init(); set_current(idle_vcpu[0]); this_cpu(curr_vcpu) = current; } void __devinit srat_detect_node(int cpu) { unsigned node; u32 apicid = x86_cpu_to_apicid[cpu]; node = apicid_to_node[apicid]; if ( node == NUMA_NO_NODE ) node = 0; node_set_online(node); numa_set_node(cpu, node); if ( opt_cpu_info && acpi_numa > 0 ) printk("CPU %d APIC %d -> Node %d\n", cpu, apicid, node); } /* * Sort CPUs by tuple. Fortunately this hierarchy is * reflected in the structure of modern APIC identifiers, so we sort based on * those. This is slightly complicated by the fact that the BSP must remain * CPU 0. Hence we do a variation on longest-prefix matching to do the best we * can while keeping CPU 0 static. */ static void __init normalise_cpu_order(void) { unsigned int i, j, min_cpu; uint32_t apicid, diff, min_diff; for_each_present_cpu ( i ) { apicid = x86_cpu_to_apicid[i]; min_diff = min_cpu = ~0u; /* * Find remaining CPU with longest-prefix match on APIC ID. * Among identical longest-prefix matches, pick the smallest APIC ID. 
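 * The loop below reduces diff = apicid XOR candidate to its highest set
 * bit by repeatedly clearing the lowest set bit, so a smaller surviving
 * diff means more leading APIC ID bits in common: against 0x10, candidate
 * 0x12 leaves 0x02 while candidate 0x20 leaves 0x20, so 0x12 is preferred.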
*/ for ( j = cpumask_next(i, &cpu_present_map); j < nr_cpu_ids; j = cpumask_next(j, &cpu_present_map) ) { diff = x86_cpu_to_apicid[j] ^ apicid; while ( diff & (diff-1) ) diff &= diff-1; if ( (diff < min_diff) || ((diff == min_diff) && (x86_cpu_to_apicid[j] < x86_cpu_to_apicid[min_cpu])) ) { min_diff = diff; min_cpu = j; } } /* If no match then there must be no CPUs remaining to consider. */ if ( min_cpu >= nr_cpu_ids ) { BUG_ON(cpumask_next(i, &cpu_present_map) < nr_cpu_ids); break; } /* Switch the best-matching CPU with the next CPU in logical order. */ j = cpumask_next(i, &cpu_present_map); apicid = x86_cpu_to_apicid[min_cpu]; x86_cpu_to_apicid[min_cpu] = x86_cpu_to_apicid[j]; x86_cpu_to_apicid[j] = apicid; } } #define BOOTSTRAP_MAP_BASE (16UL << 20) #define BOOTSTRAP_MAP_LIMIT (1UL << L3_PAGETABLE_SHIFT) /* * Ensure a given physical memory range is present in the bootstrap mappings. * Use superpage mappings to ensure that pagetable memory needn't be allocated. */ static void *__init bootstrap_map(const module_t *mod) { static unsigned long __initdata map_cur = BOOTSTRAP_MAP_BASE; uint64_t start, end, mask = (1L << L2_PAGETABLE_SHIFT) - 1; void *ret; if ( system_state != SYS_STATE_early_boot ) return mod ? mfn_to_virt(mod->mod_start) : NULL; if ( !mod ) { destroy_xen_mappings(BOOTSTRAP_MAP_BASE, BOOTSTRAP_MAP_LIMIT); map_cur = BOOTSTRAP_MAP_BASE; return NULL; } start = (uint64_t)mod->mod_start << PAGE_SHIFT; end = start + mod->mod_end; if ( start >= end ) return NULL; if ( end <= BOOTSTRAP_MAP_BASE ) return (void *)(unsigned long)start; ret = (void *)(map_cur + (unsigned long)(start & mask)); start &= ~mask; end = (end + mask) & ~mask; if ( end - start > BOOTSTRAP_MAP_LIMIT - map_cur ) return NULL; map_pages_to_xen(map_cur, start >> PAGE_SHIFT, (end - start) >> PAGE_SHIFT, PAGE_HYPERVISOR); map_cur += end - start; return ret; } static void *__init move_memory( uint64_t dst, uint64_t src, unsigned int size, bool_t keep) { unsigned int blksz = BOOTSTRAP_MAP_LIMIT - BOOTSTRAP_MAP_BASE; unsigned int mask = (1L << L2_PAGETABLE_SHIFT) - 1; if ( src + size > BOOTSTRAP_MAP_BASE ) blksz >>= 1; while ( size ) { module_t mod; unsigned int soffs = src & mask; unsigned int doffs = dst & mask; unsigned int sz; void *d, *s; mod.mod_start = (src - soffs) >> PAGE_SHIFT; mod.mod_end = soffs + size; if ( mod.mod_end > blksz ) mod.mod_end = blksz; sz = mod.mod_end - soffs; s = bootstrap_map(&mod); mod.mod_start = (dst - doffs) >> PAGE_SHIFT; mod.mod_end = doffs + size; if ( mod.mod_end > blksz ) mod.mod_end = blksz; if ( sz > mod.mod_end - doffs ) sz = mod.mod_end - doffs; d = bootstrap_map(&mod); memmove(d + doffs, s + soffs, sz); dst += sz; src += sz; size -= sz; if ( keep ) return size ? 
NULL : d + doffs; bootstrap_map(NULL); } return NULL; } static uint64_t __init consider_modules( uint64_t s, uint64_t e, uint32_t size, const module_t *mod, unsigned int nr_mods, unsigned int this_mod) { unsigned int i; if ( s > e || e - s < size ) return 0; for ( i = 0; i < nr_mods ; ++i ) { uint64_t start = (uint64_t)mod[i].mod_start << PAGE_SHIFT; uint64_t end = start + PAGE_ALIGN(mod[i].mod_end); if ( i == this_mod ) continue; if ( s < end && start < e ) { end = consider_modules(end, e, size, mod + i + 1, nr_mods - i - 1, this_mod - i - 1); if ( end ) return end; return consider_modules(s, start, size, mod + i + 1, nr_mods - i - 1, this_mod - i - 1); } } return e; } static void __init setup_max_pdx(unsigned long top_page) { max_pdx = pfn_to_pdx(top_page - 1) + 1; if ( max_pdx > (DIRECTMAP_SIZE >> PAGE_SHIFT) ) max_pdx = DIRECTMAP_SIZE >> PAGE_SHIFT; if ( max_pdx > FRAMETABLE_NR ) max_pdx = FRAMETABLE_NR; if ( max_pdx >= PAGE_LIST_NULL ) max_pdx = PAGE_LIST_NULL - 1; max_page = pdx_to_pfn(max_pdx - 1) + 1; } void set_pdx_range(unsigned long smfn, unsigned long emfn) { unsigned long idx, eidx; idx = pfn_to_pdx(smfn) / PDX_GROUP_COUNT; eidx = (pfn_to_pdx(emfn - 1) + PDX_GROUP_COUNT) / PDX_GROUP_COUNT; for ( ; idx < eidx; ++idx ) __set_bit(idx, pdx_group_valid); } /* A temporary copy of the e820 map that we can mess with during bootstrap. */ static struct e820map __initdata boot_e820; struct boot_video_info { u8 orig_x; /* 0x00 */ u8 orig_y; /* 0x01 */ u8 orig_video_mode; /* 0x02 */ u8 orig_video_cols; /* 0x03 */ u8 orig_video_lines; /* 0x04 */ u8 orig_video_isVGA; /* 0x05 */ u16 orig_video_points; /* 0x06 */ /* VESA graphic mode -- linear frame buffer */ u32 capabilities; /* 0x08 */ u16 lfb_linelength; /* 0x0c */ u16 lfb_width; /* 0x0e */ u16 lfb_height; /* 0x10 */ u16 lfb_depth; /* 0x12 */ u32 lfb_base; /* 0x14 */ u32 lfb_size; /* 0x18 */ u8 red_size; /* 0x1c */ u8 red_pos; /* 0x1d */ u8 green_size; /* 0x1e */ u8 green_pos; /* 0x1f */ u8 blue_size; /* 0x20 */ u8 blue_pos; /* 0x21 */ u8 rsvd_size; /* 0x22 */ u8 rsvd_pos; /* 0x23 */ u16 vesapm_seg; /* 0x24 */ u16 vesapm_off; /* 0x26 */ u16 vesa_attrib; /* 0x28 */ }; extern struct boot_video_info boot_vid_info; static void __init parse_video_info(void) { struct boot_video_info *bvi = &bootsym(boot_vid_info); /* The EFI loader fills vga_console_info directly. 
*/ if ( efi_enabled ) return; if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) ) { vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3; vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points; vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x; vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y; vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines; vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols; } else if ( bvi->orig_video_isVGA == 0x23 ) { vga_console_info.video_type = XEN_VGATYPE_VESA_LFB; vga_console_info.u.vesa_lfb.width = bvi->lfb_width; vga_console_info.u.vesa_lfb.height = bvi->lfb_height; vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength; vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth; vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base; vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size; vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos; vga_console_info.u.vesa_lfb.red_size = bvi->red_size; vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos; vga_console_info.u.vesa_lfb.green_size = bvi->green_size; vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos; vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size; vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos; vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size; vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities; vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib; } } static void __init kexec_reserve_area(struct e820map *e820) { unsigned long kdump_start = kexec_crash_area.start; unsigned long kdump_size = kexec_crash_area.size; static bool_t __initdata is_reserved = 0; kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK; if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved ) return; is_reserved = 1; if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) ) { printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at %#lx)" "\n", kdump_size >> 20, kdump_size >> 10, kdump_start); kexec_crash_area.start = kexec_crash_area.size = 0; } else { printk("Kdump: %luMB (%lukB) at %#lx\n", kdump_size >> 20, kdump_size >> 10, kdump_start); } } static void noinline init_done(void) { /* Free (or page-protect) the init areas. */ memset(__init_begin, 0xcc, __init_end - __init_begin); /* int3 poison */ free_xen_data(__init_begin, __init_end); printk("Freed %ldkB init memory.\n", (long)(__init_end-__init_begin)>>10); startup_cpu_idle_loop(); } static bool_t __init loader_is_grub2(const char *loader_name) { /* GRUB1="GNU GRUB 0.xx"; GRUB2="GRUB 1.xx" */ const char *p = strstr(loader_name, "GRUB "); return (p != NULL) && (p[5] != '0'); } static char * __init cmdline_cook(char *p, char *loader_name) { p = p ? : ""; /* Strip leading whitespace. */ while ( *p == ' ' ) p++; /* GRUB2 does not include image name as first item on command line. */ if ( loader_is_grub2(loader_name) ) return p; /* Strip image name plus whitespace. 
*/ while ( (*p != ' ') && (*p != '\0') ) p++; while ( *p == ' ' ) p++; return p; } void __init __start_xen(unsigned long mbi_p) { char *memmap_type = NULL; char *cmdline, *kextra, *loader; unsigned int initrdidx; multiboot_info_t *mbi = __va(mbi_p); module_t *mod = (module_t *)__va(mbi->mods_addr); unsigned long nr_pages, raw_max_page, modules_headroom, *module_map; int i, j, e820_warn = 0, bytes = 0; bool_t acpi_boot_table_init_done = 0; struct ns16550_defaults ns16550 = { .data_bits = 8, .parity = 'n', .stop_bits = 1 }; percpu_init_areas(); set_intr_gate(TRAP_page_fault, &early_page_fault); loader = (mbi->flags & MBI_LOADERNAME) ? (char *)__va(mbi->boot_loader_name) : "unknown"; /* Parse the command-line options. */ cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ? __va(mbi->cmdline) : NULL, loader); if ( (kextra = strstr(cmdline, " -- ")) != NULL ) { /* * Options after ' -- ' separator belong to dom0. * 1. Orphan dom0's options from Xen's command line. * 2. Skip all but final leading space from dom0's options. */ *kextra = '\0'; kextra += 3; while ( kextra[1] == ' ' ) kextra++; } cmdline_parse(cmdline); /* Must be after command line argument parsing and before * allocing any xenheap structures wanted in lower memory. */ kexec_early_calculations(); parse_video_info(); set_current((struct vcpu *)0xfffff000); /* debug sanity */ idle_vcpu[0] = current; set_processor_id(0); /* needed early, for smp_processor_id() */ if ( cpu_has_efer ) rdmsrl(MSR_EFER, this_cpu(efer)); asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) ); smp_prepare_boot_cpu(); /* We initialise the serial devices very early so we can get debugging. */ ns16550.io_base = 0x3f8; ns16550.irq = 4; ns16550_init(0, &ns16550); ns16550.io_base = 0x2f8; ns16550.irq = 3; ns16550_init(1, &ns16550); ehci_dbgp_init(); console_init_preirq(); printk("Bootloader: %s\n", loader); printk("Command line: %s\n", cmdline); printk("Video information:\n"); /* Print VGA display mode information. */ switch ( vga_console_info.video_type ) { case XEN_VGATYPE_TEXT_MODE_3: printk(" VGA is text mode %dx%d, font 8x%d\n", vga_console_info.u.text_mode_3.columns, vga_console_info.u.text_mode_3.rows, vga_console_info.u.text_mode_3.font_height); break; case XEN_VGATYPE_VESA_LFB: case XEN_VGATYPE_EFI_LFB: printk(" VGA is graphics mode %dx%d, %d bpp\n", vga_console_info.u.vesa_lfb.width, vga_console_info.u.vesa_lfb.height, vga_console_info.u.vesa_lfb.bits_per_pixel); break; default: printk(" No VGA detected\n"); break; } /* Print VBE/DDC EDID information. */ if ( bootsym(boot_edid_caps) != 0x1313 ) { u16 caps = bootsym(boot_edid_caps); printk(" VBE/DDC methods:%s%s%s; ", (caps & 1) ? " V1" : "", (caps & 2) ? " V2" : "", !(caps & 3) ? " none" : ""); printk("EDID transfer time: %d seconds\n", caps >> 8); if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 ) { printk(" EDID info not retrieved because "); if ( !(caps & 3) ) printk("no DDC retrieval method detected\n"); else if ( (caps >> 8) > 5 ) printk("takes longer than 5 seconds\n"); else printk("of reasons unknown\n"); } } printk("Disc information:\n"); printk(" Found %d MBR signatures\n", bootsym(boot_mbr_signature_nr)); printk(" Found %d EDD information structures\n", bootsym(boot_edd_info_nr)); /* Check that we have at least one Multiboot module. */ if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) ) EARLY_FAIL("dom0 kernel not specified. 
" "Check bootloader configuration.\n"); if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 ) EARLY_FAIL("Misaligned CPU0 stack.\n"); if ( efi_enabled ) { set_pdx_range(xen_phys_start >> PAGE_SHIFT, (xen_phys_start + BOOTSTRAP_MAP_BASE) >> PAGE_SHIFT); /* Clean up boot loader identity mappings. */ destroy_xen_mappings(xen_phys_start, xen_phys_start + BOOTSTRAP_MAP_BASE); /* Make boot page tables match non-EFI boot. */ l3_bootmap[l3_table_offset(BOOTSTRAP_MAP_BASE)] = l3e_from_paddr(__pa(l2_bootmap), __PAGE_HYPERVISOR); memmap_type = loader; } else if ( e820_raw_nr != 0 ) { memmap_type = "Xen-e820"; } else if ( mbi->flags & MBI_MEMMAP ) { memmap_type = "Multiboot-e820"; while ( (bytes < mbi->mmap_length) && (e820_raw_nr < E820MAX) ) { memory_map_t *map = __va(mbi->mmap_addr + bytes); /* * This is a gross workaround for a BIOS bug. Some bootloaders do * not write e820 map entries into pre-zeroed memory. This is * okay if the BIOS fills in all fields of the map entry, but * some broken BIOSes do not bother to write the high word of * the length field if the length is smaller than 4GB. We * detect and fix this by flagging sections below 4GB that * appear to be larger than 4GB in size. */ if ( (map->base_addr_high == 0) && (map->length_high != 0) ) { if ( !e820_warn ) { printk("WARNING: Buggy e820 map detected and fixed " "(truncated length fields).\n"); e820_warn = 1; } map->length_high = 0; } e820_raw[e820_raw_nr].addr = ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low; e820_raw[e820_raw_nr].size = ((u64)map->length_high << 32) | (u64)map->length_low; e820_raw[e820_raw_nr].type = map->type; e820_raw_nr++; bytes += map->size + 4; } } else if ( bootsym(lowmem_kb) ) { memmap_type = "Xen-e801"; e820_raw[0].addr = 0; e820_raw[0].size = bootsym(lowmem_kb) << 10; e820_raw[0].type = E820_RAM; e820_raw[1].addr = 0x100000; e820_raw[1].size = bootsym(highmem_kb) << 10; e820_raw[1].type = E820_RAM; e820_raw_nr = 2; } else if ( mbi->flags & MBI_MEMLIMITS ) { memmap_type = "Multiboot-e801"; e820_raw[0].addr = 0; e820_raw[0].size = mbi->mem_lower << 10; e820_raw[0].type = E820_RAM; e820_raw[1].addr = 0x100000; e820_raw[1].size = mbi->mem_upper << 10; e820_raw[1].type = E820_RAM; e820_raw_nr = 2; } else { EARLY_FAIL("Bootloader provided no memory information.\n"); } /* Sanitise the raw E820 map to produce a final clean version. */ max_page = raw_max_page = init_e820(memmap_type, e820_raw, &e820_raw_nr); /* Create a temporary copy of the E820 map. */ memcpy(&boot_e820, &e820, sizeof(e820)); /* Early kexec reservation (explicit static start address). */ nr_pages = 0; for ( i = 0; i < e820.nr_map; i++ ) if ( e820.map[i].type == E820_RAM ) nr_pages += e820.map[i].size >> PAGE_SHIFT; set_kexec_crash_area_size((u64)nr_pages << PAGE_SHIFT); kexec_reserve_area(&boot_e820); initial_images = mod; nr_initial_images = mbi->mods_count; /* * Iterate backwards over all superpage-aligned RAM regions. * * We require superpage alignment because the boot allocator is not yet * initialised. Hence we can only map superpages in the address range * 0 to BOOTSTRAP_DIRECTMAP_END, as this is guaranteed not to require * dynamic allocation of pagetables. * * As well as mapping superpages in that range, in preparation for * initialising the boot allocator, we also look for a region to which * we can relocate the dom0 kernel and other multiboot modules. Also, on * x86/64, we relocate Xen to higher memory. 
*/ for ( i = 0; !efi_enabled && i < mbi->mods_count; i++ ) { if ( mod[i].mod_start & (PAGE_SIZE - 1) ) EARLY_FAIL("Bootloader didn't honor module alignment request.\n"); mod[i].mod_end -= mod[i].mod_start; mod[i].mod_start >>= PAGE_SHIFT; mod[i].reserved = 0; } modules_headroom = bzimage_headroom(bootstrap_map(mod), mod->mod_end); bootstrap_map(NULL); #ifndef highmem_start /* Don't allow split below 4Gb. */ if ( highmem_start < GB(4) ) highmem_start = 0; else /* align to L3 entry boundary */ highmem_start &= ~((1UL << L3_PAGETABLE_SHIFT) - 1); #endif for ( i = boot_e820.nr_map-1; i >= 0; i-- ) { uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1; uint64_t end, limit = ARRAY_SIZE(l2_identmap) << L2_PAGETABLE_SHIFT; /* Superpage-aligned chunks from BOOTSTRAP_MAP_BASE. */ s = (boot_e820.map[i].addr + mask) & ~mask; e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask; s = max_t(uint64_t, s, BOOTSTRAP_MAP_BASE); if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) ) continue; if ( s < limit ) { end = min(e, limit); set_pdx_range(s >> PAGE_SHIFT, end >> PAGE_SHIFT); map_pages_to_xen((unsigned long)__va(s), s >> PAGE_SHIFT, (end - s) >> PAGE_SHIFT, PAGE_HYPERVISOR); } if ( e > min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START, 1UL << (PAGE_SHIFT + 32)) ) e = min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START, 1UL << (PAGE_SHIFT + 32)); #define reloc_size ((__pa(&_end) + mask) & ~mask) /* Is the region suitable for relocating Xen? */ if ( !xen_phys_start && e <= limit ) { /* Don't overlap with modules. */ end = consider_modules(s, e, reloc_size + mask, mod, mbi->mods_count, -1); end &= ~mask; } else end = 0; if ( end > s ) { l4_pgentry_t *pl4e; l3_pgentry_t *pl3e; l2_pgentry_t *pl2e; uint64_t load_start; int i, j, k; /* Select relocation address. */ e = end - reloc_size; xen_phys_start = e; bootsym(trampoline_xen_phys_start) = e; /* * Perform relocation to new physical address. * Before doing so we must sync static/global data with main memory * with a barrier(). After this we must *not* modify static/global * data until after we have switched to the relocated pagetables! */ load_start = (unsigned long)_start - XEN_VIRT_START; barrier(); move_memory(e + load_start, load_start, _end - _start, 1); /* Walk initial pagetables, relocating page directory entries. */ pl4e = __va(__pa(idle_pg_table)); for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ ) { if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) ) continue; *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) + xen_phys_start); pl3e = l4e_to_l3e(*pl4e); for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ ) { /* Not present, 1GB mapping, or already relocated? */ if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) || (l3e_get_flags(*pl3e) & _PAGE_PSE) || (l3e_get_pfn(*pl3e) > 0x1000) ) continue; *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) + xen_phys_start); pl2e = l3e_to_l2e(*pl3e); for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ ) { /* Not present, PSE, or already relocated? */ if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) || (l2e_get_flags(*pl2e) & _PAGE_PSE) || (l2e_get_pfn(*pl2e) > 0x1000) ) continue; *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) + xen_phys_start); } } } /* The only data mappings to be relocated are in the Xen area. 
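 * (l2_xenmap); the code below rewrites it in place, pointing the first
 * entry at a 2MB superpage starting at the new xen_phys_start and adding
 * xen_phys_start to every other present entry, before the stack is
 * re-synced and CR3 is switched to the relocated page tables.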
*/ pl2e = __va(__pa(l2_xenmap)); *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT, PAGE_HYPERVISOR | _PAGE_PSE); for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ ) { if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) continue; *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) + xen_phys_start); } /* Re-sync the stack and then switch to relocated pagetables. */ asm volatile ( "rep movsb ; " /* re-sync the stack */ "movq %%cr4,%%rsi ; " "andb $0x7f,%%sil ; " "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */ "movq %0,%%cr3 ; " /* CR3 == new pagetables */ "orb $0x80,%%sil ; " "movq %%rsi,%%cr4 " /* CR4.PGE == 1 */ : : "r" (__pa(idle_pg_table)), "S" (cpu0_stack), "D" (__va(__pa(cpu0_stack))), "c" (STACK_SIZE) : "memory" ); bootstrap_map(NULL); } /* Is the region suitable for relocating the multiboot modules? */ for ( j = mbi->mods_count - 1; j >= 0; j-- ) { unsigned long headroom = j ? 0 : modules_headroom; unsigned long size = PAGE_ALIGN(headroom + mod[j].mod_end); if ( mod[j].reserved ) continue; /* Don't overlap with other modules. */ end = consider_modules(s, e, size, mod, mbi->mods_count, j); if ( highmem_start && end > highmem_start ) continue; if ( s < end && (headroom || ((end - size) >> PAGE_SHIFT) > mod[j].mod_start) ) { move_memory(end - size + headroom, (uint64_t)mod[j].mod_start << PAGE_SHIFT, mod[j].mod_end, 0); mod[j].mod_start = (end - size) >> PAGE_SHIFT; mod[j].mod_end += headroom; mod[j].reserved = 1; } } /* Don't overlap with modules. */ e = consider_modules(s, e, PAGE_ALIGN(kexec_crash_area.size), mod, mbi->mods_count, -1); if ( !kexec_crash_area.start && (s < e) ) { e = (e - kexec_crash_area.size) & PAGE_MASK; kexec_crash_area.start = e; } } if ( modules_headroom && !mod->reserved ) EARLY_FAIL("Not enough memory to relocate the dom0 kernel image.\n"); for ( i = 0; i < mbi->mods_count; ++i ) { uint64_t s = (uint64_t)mod[i].mod_start << PAGE_SHIFT; reserve_e820_ram(&boot_e820, s, s + PAGE_ALIGN(mod[i].mod_end)); } if ( !xen_phys_start ) EARLY_FAIL("Not enough memory to relocate Xen.\n"); reserve_e820_ram(&boot_e820, efi_enabled ? mbi->mem_upper : __pa(&_start), __pa(&_end)); /* Late kexec reservation (dynamic start address). */ kexec_reserve_area(&boot_e820); setup_max_pdx(raw_max_page); if ( highmem_start ) xenheap_max_mfn(PFN_DOWN(highmem_start)); /* * Walk every RAM region and map it in its entirety (on x86/64, at least) * and notify it to the boot allocator. */ for ( i = 0; i < boot_e820.nr_map; i++ ) { uint64_t s, e, mask = PAGE_SIZE - 1; uint64_t map_s, map_e; /* Only page alignment required now. 
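 *
 * Here mask is only PAGE_SIZE - 1, so regions are trimmed to 4kB rather
 * than 2MB granularity, and the start is clamped to 1MB
 * (s = max_t(uint64_t, s, 1<<20)) so the first megabyte is never used.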
*/ s = (boot_e820.map[i].addr + mask) & ~mask; e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask; s = max_t(uint64_t, s, 1<<20); if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) ) continue; if ( !acpi_boot_table_init_done && s >= (1ULL << 32) && !acpi_boot_table_init() ) { acpi_boot_table_init_done = 1; srat_parse_regions(s); setup_max_pdx(raw_max_page); } if ( pfn_to_pdx((e - 1) >> PAGE_SHIFT) >= max_pdx ) { if ( pfn_to_pdx(s >> PAGE_SHIFT) >= max_pdx ) { for ( j = i - 1; ; --j ) { if ( boot_e820.map[j].type == E820_RAM ) break; ASSERT(j); } map_e = boot_e820.map[j].addr + boot_e820.map[j].size; for ( j = 0; j < mbi->mods_count; ++j ) { uint64_t end = pfn_to_paddr(mod[j].mod_start) + mod[j].mod_end; if ( map_e < end ) map_e = end; } if ( PFN_UP(map_e) < max_page ) { max_page = PFN_UP(map_e); max_pdx = pfn_to_pdx(max_page - 1) + 1; } printk(XENLOG_WARNING "Ignoring inaccessible memory range" " %013"PRIx64"-%013"PRIx64"\n", s, e); continue; } map_e = e; e = (pdx_to_pfn(max_pdx - 1) + 1ULL) << PAGE_SHIFT; printk(XENLOG_WARNING "Ignoring inaccessible memory range" " %013"PRIx64"-%013"PRIx64"\n", e, map_e); } set_pdx_range(s >> PAGE_SHIFT, e >> PAGE_SHIFT); /* Need to create mappings above BOOTSTRAP_MAP_BASE. */ map_s = max_t(uint64_t, s, BOOTSTRAP_MAP_BASE); map_e = min_t(uint64_t, e, ARRAY_SIZE(l2_identmap) << L2_PAGETABLE_SHIFT); /* Pass mapped memory to allocator /before/ creating new mappings. */ init_boot_pages(s, min(map_s, e)); s = map_s; if ( s < map_e ) { uint64_t mask = (1UL << L2_PAGETABLE_SHIFT) - 1; map_s = (s + mask) & ~mask; map_e &= ~mask; init_boot_pages(map_s, map_e); } if ( map_s > map_e ) map_s = map_e = s; /* Create new mappings /before/ passing memory to the allocator. */ if ( map_e < e ) { uint64_t limit = __pa(HYPERVISOR_VIRT_END - 1) + 1; uint64_t end = min(e, limit); if ( map_e < end ) { map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e), PFN_DOWN(end - map_e), PAGE_HYPERVISOR); init_boot_pages(map_e, end); map_e = end; } } if ( map_e < e ) { /* This range must not be passed to the boot allocator and * must also not be mapped with _PAGE_GLOBAL. */ map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e), PFN_DOWN(e - map_e), __PAGE_HYPERVISOR); } if ( s < map_s ) { map_pages_to_xen((unsigned long)__va(s), s >> PAGE_SHIFT, (map_s - s) >> PAGE_SHIFT, PAGE_HYPERVISOR); init_boot_pages(s, map_s); } } for ( i = 0; i < mbi->mods_count; ++i ) { set_pdx_range(mod[i].mod_start, mod[i].mod_start + PFN_UP(mod[i].mod_end)); map_pages_to_xen((unsigned long)mfn_to_virt(mod[i].mod_start), mod[i].mod_start, PFN_UP(mod[i].mod_end), PAGE_HYPERVISOR); } if ( kexec_crash_area.size ) { unsigned long s = PFN_DOWN(kexec_crash_area.start); unsigned long e = min(s + PFN_UP(kexec_crash_area.size), PFN_UP(__pa(HYPERVISOR_VIRT_END - 1))); if ( e > s ) map_pages_to_xen((unsigned long)__va(kexec_crash_area.start), s, e - s, PAGE_HYPERVISOR); } xen_virt_end = ((unsigned long)_end + (1UL << L2_PAGETABLE_SHIFT) - 1) & ~((1UL << L2_PAGETABLE_SHIFT) - 1); destroy_xen_mappings(xen_virt_end, XEN_VIRT_START + BOOTSTRAP_MAP_BASE); memguard_init(); nr_pages = 0; for ( i = 0; i < e820.nr_map; i++ ) if ( e820.map[i].type == E820_RAM ) nr_pages += e820.map[i].size >> PAGE_SHIFT; printk("System RAM: %luMB (%lukB)\n", nr_pages >> (20 - PAGE_SHIFT), nr_pages << (PAGE_SHIFT - 10)); total_pages = nr_pages; /* Sanity check for unwanted bloat of certain hypercall structures. 
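 *
 * Each BUILD_BUG_ON() below is a compile-time assertion.  For example,
 * sizeof(((struct xen_domctl *)0)->u) must equal
 * sizeof(((struct xen_domctl *)0)->u.pad), i.e. no member of a hypercall
 * argument union may outgrow its padding, and start_info/shared_info must
 * still fit within a single page.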
*/ BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) != sizeof(((struct xen_platform_op *)0)->u.pad)); BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) != sizeof(((struct xen_domctl *)0)->u.pad)); BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) != sizeof(((struct xen_sysctl *)0)->u.pad)); BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE); BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE); BUILD_BUG_ON(sizeof(struct vcpu_info) != 64); BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) != sizeof(((struct compat_platform_op *)0)->u.pad)); BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE); BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64); /* Check definitions in public headers match internal defs. */ BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START); BUILD_BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END); BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START); BUILD_BUG_ON(MACH2PHYS_VIRT_END != RO_MPT_VIRT_END); init_frametable(); if ( !acpi_boot_table_init_done ) acpi_boot_table_init(); acpi_numa_init(); numa_initmem_init(0, raw_max_page); end_boot_allocator(); system_state = SYS_STATE_boot; if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) ) { unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1); uint64_t mask = PAGE_SIZE - 1; if ( !highmem_start ) xenheap_max_mfn(limit); /* Pass the remaining memory to the allocator. */ for ( i = 0; i < boot_e820.nr_map; i++ ) { uint64_t s, e; if ( boot_e820.map[i].type != E820_RAM ) continue; s = (boot_e820.map[i].addr + mask) & ~mask; e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask; if ( PFN_DOWN(e) <= limit ) continue; if ( PFN_DOWN(s) <= limit ) s = pfn_to_paddr(limit + 1); init_domheap_pages(s, e); } if ( opt_tmem ) { printk(XENLOG_WARNING "TMEM physical RAM limit exceeded, disabling TMEM\n"); opt_tmem = 0; } } vm_init(); vesa_init(); softirq_init(); tasklet_subsys_init(); early_cpu_init(); paging_init(); tboot_probe(); /* Unmap the first page of CPU0's stack. */ memguard_guard_stack(cpu0_stack); open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period); if ( opt_watchdog ) nmi_watchdog = NMI_LOCAL_APIC; sort_exception_tables(); find_smp_config(); dmi_scan_machine(); generic_apic_probe(); acpi_boot_init(); if ( smp_found_config ) get_smp_config(); if ( opt_nosmp ) { max_cpus = 0; set_nr_cpu_ids(1); } else { set_nr_cpu_ids(max_cpus); max_cpus = nr_cpu_ids; } /* Low mappings were only needed for some BIOS table parsing. 
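 * (That is, for find_smp_config(), dmi_scan_machine() and acpi_boot_init()
 * above; zap_low_mappings() below tears the low 1:1 mappings down again
 * now that those have run.)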
*/ zap_low_mappings(); mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges", RANGESETF_prettyprint_hex); init_apic_mappings(); normalise_cpu_order(); init_cpu_to_node(); x2apic_bsp_setup(); init_IRQ(); module_map = xmalloc_array(unsigned long, BITS_TO_LONGS(mbi->mods_count)); bitmap_fill(module_map, mbi->mods_count); __clear_bit(0, module_map); /* Dom0 kernel is always first */ xsm_init(module_map, mbi, bootstrap_map); microcode_grab_module(module_map, mbi, bootstrap_map); timer_init(); init_idle_domain(); trap_init(); rcu_init(); early_time_init(); arch_init_memory(); identify_cpu(&boot_cpu_data); if ( cpu_has_fxsr ) set_in_cr4(X86_CR4_OSFXSR); if ( cpu_has_xmm ) set_in_cr4(X86_CR4_OSXMMEXCPT); if ( disable_smep ) setup_clear_cpu_cap(X86_FEATURE_SMEP); if ( cpu_has_smep ) set_in_cr4(X86_CR4_SMEP); if ( cpu_has_fsgsbase ) set_in_cr4(X86_CR4_FSGSBASE); local_irq_enable(); pt_pci_init(); vesa_mtrr_init(); acpi_mmcfg_init(); early_msi_init(); iommu_setup(); /* setup iommu if available */ smp_prepare_cpus(max_cpus); spin_debug_enable(); /* * Initialise higher-level timer functions. We do this fairly late * (after interrupts got enabled) because the time bases and scale * factors need to be updated regularly. */ init_xen_time(); initialize_keytable(); console_init_postirq(); do_presmp_initcalls(); for_each_present_cpu ( i ) { /* Set up cpu_to_node[]. */ srat_detect_node(i); /* Set up node_to_cpumask based on cpu_to_node[]. */ numa_add_cpu(i); if ( (num_online_cpus() < max_cpus) && !cpu_online(i) ) { int ret = cpu_up(i); if ( ret != 0 ) printk("Failed to bring up CPU %u (error %d)\n", i, ret); } } printk("Brought up %ld CPUs\n", (long)num_online_cpus()); smp_cpus_done(); do_initcalls(); if ( opt_watchdog ) watchdog_setup(); if ( !tboot_protect_mem_regions() ) panic("Could not protect TXT memory regions"); /* Create initial domain 0. */ dom0 = domain_create(0, DOMCRF_s3_integrity, 0); if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) ) panic("Error creating domain 0"); dom0->is_privileged = 1; dom0->target = NULL; /* Grab the DOM0 command line. */ cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL); if ( (cmdline != NULL) || (kextra != NULL) ) { static char __initdata dom0_cmdline[MAX_GUEST_CMDLINE]; cmdline = cmdline_cook(cmdline, loader); safe_strcpy(dom0_cmdline, cmdline); if ( kextra != NULL ) /* kextra always includes exactly one leading space. */ safe_strcat(dom0_cmdline, kextra); /* Append any extra parameters. */ if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") ) safe_strcat(dom0_cmdline, " noapic"); if ( acpi_skip_timer_override && !strstr(dom0_cmdline, "acpi_skip_timer_override") ) safe_strcat(dom0_cmdline, " acpi_skip_timer_override"); if ( (strlen(acpi_param) == 0) && acpi_disabled ) { printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n"); safe_strcpy(acpi_param, "off"); } if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") ) { safe_strcat(dom0_cmdline, " acpi="); safe_strcat(dom0_cmdline, acpi_param); } cmdline = dom0_cmdline; } if ( xen_cpuidle ) xen_processor_pmbits |= XEN_PROCESSOR_PM_CX; initrdidx = find_first_bit(module_map, mbi->mods_count); if ( bitmap_weight(module_map, mbi->mods_count) > 1 ) printk(XENLOG_WARNING "Multiple initrd candidates, picking module #%u\n", initrdidx); /* * We're going to setup domain0 using the module(s) that we stashed safely * above our heap. The second module, if present, is an initrd ramdisk. */ if ( construct_dom0(dom0, mod, modules_headroom, (initrdidx > 0) && (initrdidx < mbi->mods_count) ? 
mod + initrdidx : NULL, bootstrap_map, cmdline) != 0) panic("Could not set up DOM0 guest OS"); /* Scrub RAM that is still free and so may go to an unprivileged domain. */ scrub_heap_pages(); init_trace_bufs(); init_constructors(); console_endboot(); /* Hide UART from DOM0 if we're using it */ serial_endboot(); dmi_end_boot(); system_state = SYS_STATE_active; domain_unpause_by_systemcontroller(dom0); reset_stack_and_jump(init_done); } void arch_get_xen_caps(xen_capabilities_info_t *info) { /* Interface name is always xen-3.0-* for Xen-3.x. */ int major = 3, minor = 0; char s[32]; (*info)[0] = '\0'; snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor); safe_strcat(*info, s); snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor); safe_strcat(*info, s); if ( hvm_enabled ) { snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor); safe_strcat(*info, s); snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor); safe_strcat(*info, s); snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor); safe_strcat(*info, s); } } int __init xen_in_range(unsigned long mfn) { paddr_t start, end; int i; enum { region_s3, region_text, region_bss, nr_regions }; static struct { paddr_t s, e; } xen_regions[nr_regions] __initdata; /* initialize first time */ if ( !xen_regions[0].s ) { /* S3 resume code (and other real mode trampoline code) */ xen_regions[region_s3].s = bootsym_phys(trampoline_start); xen_regions[region_s3].e = bootsym_phys(trampoline_end); /* hypervisor code + data */ xen_regions[region_text].s =__pa(&_stext); xen_regions[region_text].e = __pa(&__init_begin); /* bss */ xen_regions[region_bss].s = __pa(&__bss_start); xen_regions[region_bss].e = __pa(&_end); } start = (paddr_t)mfn << PAGE_SHIFT; end = start + PAGE_SIZE; for ( i = 0; i < nr_regions; i++ ) if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) ) return 1; return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/dmi_scan.c0000664000175000017500000003542412307313555014530 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #include #include #include #define bt_ioremap(b,l) ((void *)__acpi_map_table(b,l)) #define bt_iounmap(b,l) ((void)0) #define memcpy_fromio memcpy #define alloc_bootmem(l) xmalloc_bytes(l) struct dmi_eps { char anchor[5]; /* "_DMI_" */ u8 checksum; u16 size; u32 address; u16 num_structures; u8 revision; } __attribute__((packed)); struct smbios_eps { char anchor[4]; /* "_SM_" */ u8 checksum; u8 length; u8 major, minor; u16 max_size; u8 revision; u8 _rsrvd_[5]; struct dmi_eps dmi; } __attribute__((packed)); struct dmi_header { u8 type; u8 length; u16 handle; }; #undef DMI_DEBUG #ifdef DMI_DEBUG #define dmi_printk(x) printk x #else #define dmi_printk(x) #endif static char * __init dmi_string(struct dmi_header *dm, u8 s) { char *bp=(char *)dm; bp+=dm->length; if(!s) return ""; s--; while(s>0 && *bp) { bp+=strlen(bp); bp++; s--; } return bp; } /* * We have to be cautious here. 
We have seen BIOSes with DMI pointers * pointing to completely the wrong place for example */ static int __init dmi_table(u32 base, int len, int num, void (*decode)(struct dmi_header *)) { u8 *buf; struct dmi_header *dm; u8 *data; int i=0; buf = bt_ioremap(base, len); if(buf==NULL) return -1; data = buf; /* * Stop when we see all the items the table claimed to have * OR we run off the end of the table (also happens) */ while(ilength; while(data-bufanchor, "_SM_", 4) && dmi_checksum(eps, eps->length) && memcmp(eps->dmi.anchor, "_DMI_", 5) == 0 && dmi_checksum(&eps->dmi, sizeof(eps->dmi))) { efi_dmi_address = eps->dmi.address; efi_dmi_size = eps->dmi.size; } } int __init dmi_get_table(u32 *base, u32 *len) { struct dmi_eps eps; char __iomem *p, *q; if (efi_enabled) { if (!efi_dmi_size) return -1; *base = efi_dmi_address; *len = efi_dmi_size; return 0; } p = maddr_to_virt(0xF0000); for (q = p; q < p + 0x10000; q += 16) { memcpy_fromio(&eps, q, 15); if (memcmp(eps.anchor, "_DMI_", 5) == 0 && dmi_checksum(&eps, sizeof(eps))) { *base = eps.address; *len = eps.size; return 0; } } return -1; } static int __init _dmi_iterate(const struct dmi_eps *dmi, const struct smbios_eps __iomem *smbios, void (*decode)(struct dmi_header *)) { u16 num = dmi->num_structures; u16 len = dmi->size; u32 base = dmi->address; /* * DMI version 0.0 means that the real version is taken from * the SMBIOS version, which we may not know at this point. */ if (dmi->revision) printk(KERN_INFO "DMI %d.%d present.\n", dmi->revision >> 4, dmi->revision & 0x0f); else if (!smbios) printk(KERN_INFO "DMI present.\n"); dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n", num, len)); dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", base)); return dmi_table(base, len, num, decode); } static int __init dmi_iterate(void (*decode)(struct dmi_header *)) { struct dmi_eps eps; char __iomem *p, *q; p = maddr_to_virt(0xF0000); for (q = p; q < p + 0x10000; q += 16) { memcpy_fromio(&eps, q, sizeof(eps)); if (memcmp(eps.anchor, "_DMI_", 5) == 0 && dmi_checksum(&eps, sizeof(eps))) return _dmi_iterate(&eps, NULL, decode); } return -1; } static int __init dmi_efi_iterate(void (*decode)(struct dmi_header *)) { struct smbios_eps eps; const struct smbios_eps __iomem *p; int ret = -1; if (efi.smbios == EFI_INVALID_TABLE_ADDR) return -1; p = bt_ioremap(efi.smbios, sizeof(eps)); if (!p) return -1; memcpy_fromio(&eps, p, sizeof(eps)); bt_iounmap(p, sizeof(eps)); if (memcmp(eps.anchor, "_SM_", 4)) return -1; p = bt_ioremap(efi.smbios, eps.length); if (!p) return -1; if (dmi_checksum(p, eps.length) && memcmp(eps.dmi.anchor, "_DMI_", 5) == 0 && dmi_checksum(&eps.dmi, sizeof(eps.dmi))) { printk(KERN_INFO "SMBIOS %d.%d present.\n", eps.major, eps.minor); ret = _dmi_iterate(&eps.dmi, p, decode); } bt_iounmap(p, eps.length); return ret; } static char *__initdata dmi_ident[DMI_STRING_MAX]; /* * Save a DMI string */ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string) { char *d = (char*)dm; char *p = dmi_string(dm, d[string]); if(p==NULL || *p == 0) return; if (dmi_ident[slot]) return; dmi_ident[slot] = alloc_bootmem(strlen(p)+1); if(dmi_ident[slot]) strlcpy(dmi_ident[slot], p, strlen(p)+1); else printk(KERN_ERR "dmi_save_ident: out of memory.\n"); } /* * Ugly compatibility crap. */ #define dmi_blacklist dmi_system_id #define NO_MATCH { DMI_NONE, NULL} #define MATCH DMI_MATCH /* * Toshiba keyboard likes to repeat keys when they are not repeated. 
*/ static __init int broken_toshiba_keyboard(struct dmi_blacklist *d) { printk(KERN_WARNING "Toshiba with broken keyboard detected. If your keyboard sometimes generates 3 keypresses instead of one, see http://davyd.ucc.asn.au/projects/toshiba/README\n"); return 0; } static int __init ich10_bios_quirk(struct dmi_system_id *d) { u32 port, smictl; if ( pci_conf_read16(0, 0, 0x1f, 0, PCI_VENDOR_ID) != 0x8086 ) return 0; switch ( pci_conf_read16(0, 0, 0x1f, 0, PCI_DEVICE_ID) ) { case 0x3a14: case 0x3a16: case 0x3a18: case 0x3a1a: port = (pci_conf_read16(0, 0, 0x1f, 0, 0x40) & 0xff80) + 0x30; smictl = inl(port); /* turn off LEGACY_USB{,2}_EN if enabled */ if ( smictl & 0x20008 ) outl(smictl & ~0x20008, port); break; } return 0; } #ifdef CONFIG_ACPI_SLEEP static __init int reset_videomode_after_s3(struct dmi_blacklist *d) { /* See acpi_wakeup.S */ acpi_video_flags |= 2; return 0; } #endif #ifdef CONFIG_ACPI_BOOT static __init __attribute__((unused)) int dmi_disable_acpi(struct dmi_blacklist *d) { if (!acpi_force) { printk(KERN_NOTICE "%s detected: acpi off\n",d->ident); disable_acpi(); } else { printk(KERN_NOTICE "Warning: DMI blacklist says broken, but acpi forced\n"); } return 0; } /* * Limit ACPI to CPU enumeration for HT */ static __init __attribute__((unused)) int force_acpi_ht(struct dmi_blacklist *d) { if (!acpi_force) { printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", d->ident); disable_acpi(); acpi_ht = 1; } else { printk(KERN_NOTICE "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); } return 0; } #endif /* * Process the DMI blacklists */ /* * This will be expanded over time to force things like the APM * interrupt mask settings according to the laptop */ static __initdata struct dmi_blacklist dmi_blacklist[]={ { broken_toshiba_keyboard, "Toshiba Satellite 4030cdt", { /* Keyboard generates spurious repeats */ MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), NO_MATCH, NO_MATCH, NO_MATCH } }, #ifdef CONFIG_ACPI_SLEEP { reset_videomode_after_s3, "Toshiba Satellite 4030cdt", { /* Reset video mode after returning from ACPI S3 sleep */ MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), NO_MATCH, NO_MATCH, NO_MATCH } }, #endif { ich10_bios_quirk, "Intel board & BIOS", /* * BIOS leaves legacy USB emulation enabled while * SMM can't properly handle it. 
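 *
 * ich10_bios_quirk() above handles this by computing the I/O port of what
 * appears to be the SMI enable register (config offset 0x40 of the LPC
 * device at 00:1f.0, masked and offset by 0x30) and clearing the
 * LEGACY_USB_EN / LEGACY_USB2_EN bits (the 0x20008 mask) if either is set.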
*/ { MATCH(DMI_BOARD_VENDOR, "Intel Corp"), MATCH(DMI_BIOS_VENDOR, "Intel Corp"), NO_MATCH, NO_MATCH } }, #ifdef CONFIG_ACPI_BOOT /* * If your system is blacklisted here, but you find that acpi=force * works for you, please contact acpi-devel@sourceforge.net */ /* * Boxes that need ACPI disabled */ { dmi_disable_acpi, "IBM Thinkpad", { MATCH(DMI_BOARD_VENDOR, "IBM"), MATCH(DMI_BOARD_NAME, "2629H1G"), NO_MATCH, NO_MATCH }}, /* * Boxes that need acpi=ht */ { force_acpi_ht, "FSC Primergy T850", { MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "DELL GX240", { MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "HP VISUALIZE NT Workstation", { MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "Compaq Workstation W8000", { MATCH(DMI_SYS_VENDOR, "Compaq"), MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "ASUS P4B266", { MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), MATCH(DMI_BOARD_NAME, "P4B266"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "ASUS P2B-DS", { MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), MATCH(DMI_BOARD_NAME, "P2B-DS"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "ASUS CUR-DLS", { MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), MATCH(DMI_BOARD_NAME, "CUR-DLS"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "ABIT i440BX-W83977", { MATCH(DMI_BOARD_VENDOR, "ABIT "), MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "IBM Bladecenter", { MATCH(DMI_BOARD_VENDOR, "IBM"), MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "IBM eServer xSeries 360", { MATCH(DMI_BOARD_VENDOR, "IBM"), MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "IBM eserver xSeries 330", { MATCH(DMI_BOARD_VENDOR, "IBM"), MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), NO_MATCH, NO_MATCH }}, { force_acpi_ht, "IBM eserver xSeries 440", { MATCH(DMI_BOARD_VENDOR, "IBM"), MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), NO_MATCH, NO_MATCH }}, #endif // CONFIG_ACPI_BOOT { NULL, } }; /* * Process a DMI table entry. Right now all we care about are the BIOS * and machine entries. For 2.5 we should pull the smbus controller info * out of here. 
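 *
 * Concretely, dmi_decode() below handles type 0 (BIOS: vendor, version and
 * release date at string indices 4, 5 and 8), type 1 (system: vendor,
 * product name and version at 4, 5 and 6) and type 2 (base board: vendor,
 * name and version at 4, 5 and 6), saving each via dmi_save_ident().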
*/ static void __init dmi_decode(struct dmi_header *dm) { #ifdef DMI_DEBUG u8 *data = (u8 *)dm; #endif switch(dm->type) { case 0: dmi_printk(("BIOS Vendor: %s\n", dmi_string(dm, data[4]))); dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); dmi_printk(("BIOS Version: %s\n", dmi_string(dm, data[5]))); dmi_save_ident(dm, DMI_BIOS_VERSION, 5); dmi_printk(("BIOS Release: %s\n", dmi_string(dm, data[8]))); dmi_save_ident(dm, DMI_BIOS_DATE, 8); break; case 1: dmi_printk(("System Vendor: %s\n", dmi_string(dm, data[4]))); dmi_save_ident(dm, DMI_SYS_VENDOR, 4); dmi_printk(("Product Name: %s\n", dmi_string(dm, data[5]))); dmi_save_ident(dm, DMI_PRODUCT_NAME, 5); dmi_printk(("Version: %s\n", dmi_string(dm, data[6]))); dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6); dmi_printk(("Serial Number: %s\n", dmi_string(dm, data[7]))); break; case 2: dmi_printk(("Board Vendor: %s\n", dmi_string(dm, data[4]))); dmi_save_ident(dm, DMI_BOARD_VENDOR, 4); dmi_printk(("Board Name: %s\n", dmi_string(dm, data[5]))); dmi_save_ident(dm, DMI_BOARD_NAME, 5); dmi_printk(("Board Version: %s\n", dmi_string(dm, data[6]))); dmi_save_ident(dm, DMI_BOARD_VERSION, 6); break; } } void __init dmi_scan_machine(void) { if ((!efi_enabled ? dmi_iterate(dmi_decode) : dmi_efi_iterate(dmi_decode)) == 0) dmi_check_system(dmi_blacklist); else printk(KERN_INFO "DMI not present.\n"); } /** * dmi_check_system - check system DMI data * @list: array of dmi_system_id structures to match against * * Walk the blacklist table running matching functions until someone * returns non zero or we hit the end. Callback function is called for * each successfull match. Returns the number of matches. */ int __init dmi_check_system(struct dmi_system_id *list) { int i, count = 0; struct dmi_system_id *d = list; while (d->ident) { for (i = 0; i < ARRAY_SIZE(d->matches); i++) { int s = d->matches[i].slot; if (s == DMI_NONE) continue; if (dmi_ident[s] && strstr(dmi_ident[s], d->matches[i].substr)) continue; /* No match */ goto fail; } if (d->callback && d->callback(d)) break; count++; fail: d++; } return count; } /** * dmi_get_date - parse a DMI date * @field: data index (see enum dmi_field) * @yearp: optional out parameter for the year * @monthp: optional out parameter for the month * @dayp: optional out parameter for the day * * The date field is assumed to be in the form resembling * [mm[/dd]]/yy[yy] and the result is stored in the out * parameters any or all of which can be omitted. * * If the field doesn't exist, all out parameters are set to zero * and false is returned. Otherwise, true is returned with any * invalid part of date set to zero. * * On return, year, month and day are guaranteed to be in the * range of [0,9999], [0,12] and [0,31] respectively. */ bool_t __init dmi_get_date(int field, int *yearp, int *monthp, int *dayp) { int year = 0, month = 0, day = 0; bool_t exists; const char *s, *e, *y; s = field < DMI_STRING_MAX ? dmi_ident[field] : NULL; exists = !!s; if (!exists) goto out; /* * Determine year first. We assume the date string resembles * mm/dd/yy[yy] but the original code extracted only the year * from the end. Keep the behavior in the spirit of no * surprises. 
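 *
 * For example, "03/14/2008" parses as year 2008, month 3, day 14.  A
 * two-digit year is biased so that "97" becomes 1997 while "05" becomes
 * 2005 (anything that would land before the 1996 SMBIOS 1.0 cutoff gets a
 * further +100), and a month outside 1..12 or a day above 31 is reported
 * as 0.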
*/ y = strrchr(s, '/'); if (!y) goto out; y++; year = simple_strtoul(y, &e, 10); if (y != e && year < 100) { /* 2-digit year */ year += 1900; if (year < 1996) /* no dates < spec 1.0 */ year += 100; } if (year > 9999) /* year should fit in %04d */ year = 0; /* parse the mm and dd */ month = simple_strtoul(s, &e, 10); if (s == e || *e != '/' || !month || month > 12) { month = 0; goto out; } s = e + 1; day = simple_strtoul(s, &e, 10); if (s == y || s == e || *e != '/' || day > 31) day = 0; out: if (yearp) *yearp = year; if (monthp) *monthp = month; if (dayp) *dayp = day; return exists; } void __init dmi_end_boot(void) { unsigned int i; for ( i = 0; i < DMI_STRING_MAX; ++i ) xfree(dmi_ident[i]); } xen-4.4.0/xen/arch/x86/compat.c0000664000175000017500000000171412307313555014231 0ustar smbsmb/****************************************************************************** * compat.c * * Implementations of legacy hypercalls. These call through to the new * hypercall after doing necessary argument munging. */ #include #include #include #ifndef COMPAT typedef long ret_t; #endif /* Legacy hypercall (as of 0x00030202). */ ret_t do_physdev_op_compat(XEN_GUEST_HANDLE(physdev_op_t) uop) { struct physdev_op op; if ( unlikely(copy_from_guest(&op, uop, 1) != 0) ) return -EFAULT; return do_physdev_op(op.cmd, guest_handle_from_ptr(&uop.p->u, void)); } #ifndef COMPAT /* Legacy hypercall (as of 0x00030202). */ long do_event_channel_op_compat(XEN_GUEST_HANDLE_PARAM(evtchn_op_t) uop) { struct evtchn_op op; if ( unlikely(copy_from_guest(&op, uop, 1) != 0) ) return -EFAULT; return do_event_channel_op(op.cmd, guest_handle_from_ptr(&uop.p->u, void)); } #endif xen-4.4.0/xen/arch/x86/apic.c0000664000175000017500000012227512307313555013670 0ustar smbsmb/* * based on linux-2.6.17.13/arch/i386/kernel/apic.c * * Local APIC handling, local APIC timers * * (c) 1999, 2000 Ingo Molnar * * Fixes * Maciej W. Rozycki : Bits for genuine 82489DX APICs; * thanks to Eric Gilmore * and Rolf G. Tews * for testing these extensively. * Maciej W. Rozycki : Various updates and fixes. * Mikael Pettersson : Power Management for UP-APIC. * Pavel Machek and * Mikael Pettersson : PM converted to driver model. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static bool_t tdt_enabled __read_mostly; static bool_t tdt_enable __initdata = 1; boolean_param("tdt", tdt_enable); static struct { int active; /* r/w apic fields */ unsigned int apic_id; unsigned int apic_taskpri; unsigned int apic_ldr; unsigned int apic_dfr; unsigned int apic_spiv; unsigned int apic_lvtt; unsigned int apic_lvtpc; unsigned int apic_lvtcmci; unsigned int apic_lvt0; unsigned int apic_lvt1; unsigned int apic_lvterr; unsigned int apic_tmict; unsigned int apic_tdcr; unsigned int apic_thmr; } apic_pm_state; /* * Knob to control our willingness to enable the local APIC. */ static s8 __initdata enable_local_apic; /* -1=force-disable, +1=force-enable */ /* * Debug level */ u8 __read_mostly apic_verbosity; static bool_t __initdata opt_x2apic = 1; boolean_param("x2apic", opt_x2apic); /* * Bootstrap processor local APIC boot mode - so we can undo our changes * to the APIC state. 
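 * This ends up as one of APIC_MODE_DISABLED, APIC_MODE_XAPIC or
 * APIC_MODE_X2APIC, as derived from the ENABLE/EXTD bits of
 * MSR_IA32_APICBASE by current_local_apic_mode(); disable_local_APIC()
 * consults it on the kexec path to restore whatever state the BIOS left.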
*/ static enum apic_mode apic_boot_mode = APIC_MODE_INVALID; bool_t __read_mostly x2apic_enabled = 0; bool_t __read_mostly directed_eoi_enabled = 0; static int modern_apic(void) { unsigned int lvr, version; /* AMD systems use old APIC versions, so check the CPU */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xf) return 1; lvr = apic_read(APIC_LVR); version = GET_APIC_VERSION(lvr); return version >= 0x14; } /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. */ void ack_bad_irq(unsigned int irq) { printk("unexpected IRQ trap at irq %02x\n", irq); /* * Currently unexpected vectors happen only on SMP and APIC. * We _must_ ack these because every local APIC has only N * irq slots per priority level, and a 'hanging, unacked' IRQ * holds up an irq slot - in excessive cases (when multiple * unexpected vectors occur) that might lock up the APIC * completely. * But only ack when the APIC is enabled -AK */ if (cpu_has_apic) ack_APIC_irq(); } void __init apic_intr_init(void) { smp_intr_init(); /* self generated IPI for local APIC timer */ set_direct_apic_vector(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI vectors for APIC spurious and error interrupts */ set_direct_apic_vector(SPURIOUS_APIC_VECTOR, spurious_interrupt); set_direct_apic_vector(ERROR_APIC_VECTOR, error_interrupt); /* Performance Counters Interrupt */ set_direct_apic_vector(PMU_APIC_VECTOR, pmu_apic_interrupt); } /* Using APIC to generate smp_local_timer_interrupt? */ static bool_t __read_mostly using_apic_timer; static bool_t __read_mostly enabled_via_apicbase; int get_physical_broadcast(void) { if (modern_apic()) return 0xff; else return 0xf; } int get_maxlvt(void) { unsigned int v, ver, maxlvt; v = apic_read(APIC_LVR); ver = GET_APIC_VERSION(v); /* 82489DXs do not report # of LVT entries. */ maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2; return maxlvt; } void clear_local_APIC(void) { int maxlvt; unsigned long v; maxlvt = get_maxlvt(); /* Work around AMD Erratum 411. This is a nice thing to do anyway. */ apic_write_around(APIC_TMICT, 0); /* * Masking an LVT entry on a P6 can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. */ if (maxlvt >= 3) { v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); } /* * Careful: we have to set masks only first to deassert * any level-triggered sources. 
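 * In practice each LVT below is first rewritten with APIC_LVT_MASKED or'ed
 * into its current value (so the programmed vector is preserved while the
 * entry is masked), and only afterwards overwritten with a bare
 * APIC_LVT_MASKED value when cleaning the state for other OSes.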
*/ v = apic_read(APIC_LVTT); apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT0); apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT1); apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); if (maxlvt >= 4) { v = apic_read(APIC_LVTPC); apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); } /* lets not touch this if we didn't frob it */ #ifdef CONFIG_X86_MCE_THERMAL if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif if (maxlvt >= 6) { v = apic_read(APIC_CMCI); apic_write_around(APIC_CMCI, v | APIC_LVT_MASKED); } /* * Clean APIC state for other OSs: */ apic_write_around(APIC_LVTT, APIC_LVT_MASKED); apic_write_around(APIC_LVT0, APIC_LVT_MASKED); apic_write_around(APIC_LVT1, APIC_LVT_MASKED); if (maxlvt >= 3) apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); if (maxlvt >= 4) apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); #ifdef CONFIG_X86_MCE_THERMAL if (maxlvt >= 5) apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); #endif if (maxlvt >= 6) apic_write_around(APIC_CMCI, APIC_LVT_MASKED); v = GET_APIC_VERSION(apic_read(APIC_LVR)); if (APIC_INTEGRATED(v)) { /* !82489DX */ if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } } void __init connect_bsp_APIC(void) { if (pic_mode) { /* * Do not trust the local APIC being empty at bootup. */ clear_local_APIC(); /* * PIC mode, enable APIC mode in the IMCR, i.e. * connect BSP's local APIC to INT and NMI lines. */ apic_printk(APIC_VERBOSE, "leaving PIC mode, " "enabling APIC mode.\n"); outb(0x70, 0x22); outb(0x01, 0x23); } enable_apic_mode(); } void disconnect_bsp_APIC(int virt_wire_setup) { if (pic_mode) { /* * Put the board back into PIC mode (has an effect * only on certain older boards). Note that APIC * interrupts, including IPIs, won't work beyond * this point! The only exception are INIT IPIs. */ apic_printk(APIC_VERBOSE, "disabling APIC mode, " "entering PIC mode.\n"); outb(0x70, 0x22); outb(0x00, 0x23); } else { /* Go back to Virtual Wire compatibility mode */ unsigned long value; /* For the spurious interrupt use vector F, and enable it */ value = apic_read(APIC_SPIV); value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; value |= 0xf; apic_write_around(APIC_SPIV, value); if (!virt_wire_setup) { /* For LVT0 make it edge triggered, active high, external and enabled */ value = apic_read(APIC_LVT0); value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); apic_write_around(APIC_LVT0, value); } else { /* Disable LVT0 */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED); } /* For LVT1 make it edge triggered, active high, nmi and enabled */ value = apic_read(APIC_LVT1); value &= ~( APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); apic_write_around(APIC_LVT1, value); } } void disable_local_APIC(void) { clear_local_APIC(); /* * Disable APIC (implies clearing of registers * for 82489DX!). 
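 * This amounts to clearing APIC_SPIV_APIC_ENABLED in the spurious vector
 * register; if the APIC had been enabled via the MSR, the ENABLE/EXTD bits
 * of MSR_IA32_APICBASE are cleared too, and on the kexec path the MSR is
 * then put back into whichever mode apic_boot_mode recorded.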
*/ apic_write_around(APIC_SPIV, apic_read(APIC_SPIV) & ~APIC_SPIV_APIC_ENABLED); if (enabled_via_apicbase) { uint64_t msr_content; rdmsrl(MSR_IA32_APICBASE, msr_content); wrmsrl(MSR_IA32_APICBASE, msr_content & ~(MSR_IA32_APICBASE_ENABLE|MSR_IA32_APICBASE_EXTD)); } if ( kexecing ) { uint64_t msr_content; rdmsrl(MSR_IA32_APICBASE, msr_content); msr_content &= ~(MSR_IA32_APICBASE_ENABLE|MSR_IA32_APICBASE_EXTD); wrmsrl(MSR_IA32_APICBASE, msr_content); switch ( apic_boot_mode ) { case APIC_MODE_DISABLED: break; /* Nothing to do - we did this above */ case APIC_MODE_XAPIC: msr_content |= MSR_IA32_APICBASE_ENABLE; wrmsrl(MSR_IA32_APICBASE, msr_content); break; case APIC_MODE_X2APIC: msr_content |= (MSR_IA32_APICBASE_ENABLE|MSR_IA32_APICBASE_EXTD); wrmsrl(MSR_IA32_APICBASE, msr_content); break; default: printk("Default case when reverting #%d lapic to boot state\n", smp_processor_id()); break; } } } /* * This is to verify that we're looking at a real local APIC. * Check these against your board if the CPUs aren't getting * started for no apparent reason. */ int __init verify_local_APIC(void) { unsigned int reg0, reg1; /* * The version register is read-only in a real APIC. */ reg0 = apic_read(APIC_LVR); apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); /* We don't try writing LVR in x2APIC mode since that incurs #GP. */ if ( !x2apic_enabled ) apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); reg1 = apic_read(APIC_LVR); apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); /* * The two version reads above should print the same * numbers. If the second one is different, then we * poke at a non-APIC. */ if (reg1 != reg0) return 0; /* * Check if the version looks reasonably. */ reg1 = GET_APIC_VERSION(reg0); if (reg1 == 0x00 || reg1 == 0xff) return 0; reg1 = get_maxlvt(); if (reg1 < 0x02 || reg1 == 0xff) return 0; /* * Detecting directed EOI on BSP: * If having directed EOI support in lapic, force to use ioapic_ack_old, * and enable the directed EOI for intr handling. */ if ( reg0 & APIC_LVR_DIRECTED_EOI ) { if ( ioapic_ack_new == 1 && ioapic_ack_forced == 1 ) printk("Not enabling directed EOI because ioapic_ack_new has been " "forced on the command line\n"); else { ioapic_ack_new = 0; directed_eoi_enabled = 1; printk("Enabled directed EOI with ioapic_ack_old on!\n"); } } /* * The ID register is read/write in a real APIC. */ reg0 = apic_read(APIC_ID); apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); /* * The next two are just to see if we have sane values. * They're only really relevant if we're in Virtual Wire * compatibility mode, but most boxes are anymore. */ reg0 = apic_read(APIC_LVT0); apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); reg1 = apic_read(APIC_LVT1); apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); return 1; } void __init sync_Arb_IDs(void) { /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not needed on AMD */ if (modern_apic()) return; /* * Wait for idle. */ apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* * An initial setup of the virtual wire mode. */ void __init init_bsp_APIC(void) { unsigned long value, ver; /* * Don't do the setup now if we have a SMP BIOS as the * through-I/O-APIC virtual wire mode might be active. */ if (smp_found_config || !cpu_has_apic) return; value = apic_read(APIC_LVR); ver = GET_APIC_VERSION(value); /* * Do not trust the local APIC being empty at bootup. */ clear_local_APIC(); /* * Enable APIC. 
*/ value = apic_read(APIC_SPIV); value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; /* This bit is reserved on P4/Xeon and should be cleared */ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) value &= ~APIC_SPIV_FOCUS_DISABLED; else value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; apic_write_around(APIC_SPIV, value); /* * Set up the virtual wire mode. */ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; if (!APIC_INTEGRATED(ver)) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); } static void apic_pm_activate(void) { apic_pm_state.active = 1; } static void __enable_x2apic(void) { uint64_t msr_content; rdmsrl(MSR_IA32_APICBASE, msr_content); if ( !(msr_content & MSR_IA32_APICBASE_EXTD) ) { msr_content |= MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD; msr_content = (uint32_t)msr_content; wrmsrl(MSR_IA32_APICBASE, msr_content); } } static void resume_x2apic(void) { struct IO_APIC_route_entry **ioapic_entries = NULL; ASSERT(x2apic_enabled); ioapic_entries = alloc_ioapic_entries(); if ( !ioapic_entries ) { printk("Allocate ioapic_entries failed\n"); goto out; } if ( save_IO_APIC_setup(ioapic_entries) ) { printk("Saving IO-APIC state failed\n"); goto out; } mask_8259A(); mask_IO_APIC_setup(ioapic_entries); iommu_enable_x2apic_IR(); __enable_x2apic(); restore_IO_APIC_setup(ioapic_entries); unmask_8259A(); out: if ( ioapic_entries ) free_ioapic_entries(ioapic_entries); } void __devinit setup_local_APIC(void) { unsigned long oldvalue, value, ver, maxlvt; int i, j; /* Pound the ESR really hard over the head with a big hammer - mbligh */ if (esr_disable) { apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); } value = apic_read(APIC_LVR); ver = GET_APIC_VERSION(value); BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); /* * Double-check whether this APIC is really registered. */ if (!apic_id_registered()) BUG(); /* * Intel recommends to set DFR, LDR and TPR before enabling * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ init_apic_ldr(); /* * Set Task Priority to reject any interrupts below FIRST_DYNAMIC_VECTOR. */ apic_write_around(APIC_TASKPRI, (FIRST_DYNAMIC_VECTOR & 0xF0) - 0x10); /* * After a crash, we no longer service the interrupts and a pending * interrupt from previous kernel might still have ISR bit set. * * Most probably by now CPU has serviced that pending interrupt and * it might not have done the ack_APIC_irq() because it thought, * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it * does not clear the ISR bit and cpu thinks it has already serivced * the interrupt. Hence a vector might get locked. It was noticed * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. */ for (i = APIC_ISR_NR - 1; i >= 0; i--) { value = apic_read(APIC_ISR + i*0x10); for (j = 31; j >= 0; j--) { if (value & (1< 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); oldvalue = apic_read(APIC_ESR); value = ERROR_APIC_VECTOR; // enables sending errors apic_write_around(APIC_LVTERR, value); /* * spec says clear errors after enabling vector. */ if (maxlvt > 3) apic_write(APIC_ESR, 0); value = apic_read(APIC_ESR); if (value != oldvalue) apic_printk(APIC_VERBOSE, "ESR value before enabling " "vector: %#lx after: %#lx\n", oldvalue, value); } else { if (esr_disable) /* * Something untraceble is creating bad interrupts on * secondary quads ... 
for the moment, just leave the * ESR disabled - we can't do anything useful with the * errors anyway - mbligh */ printk("Leaving ESR disabled.\n"); else printk("No ESR for 82489DX.\n"); } if (nmi_watchdog == NMI_LOCAL_APIC) setup_apic_nmi_watchdog(); apic_pm_activate(); } int lapic_suspend(void) { unsigned long flags; int maxlvt = get_maxlvt(); if (!apic_pm_state.active) return 0; apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); apic_pm_state.apic_ldr = apic_read(APIC_LDR); apic_pm_state.apic_dfr = apic_read(APIC_DFR); apic_pm_state.apic_spiv = apic_read(APIC_SPIV); apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); if (maxlvt >= 4) apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); if (maxlvt >= 6) { apic_pm_state.apic_lvtcmci = apic_read(APIC_CMCI); } apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); local_irq_save(flags); disable_local_APIC(); iommu_disable_x2apic_IR(); local_irq_restore(flags); return 0; } int lapic_resume(void) { uint64_t msr_content; unsigned long flags; int maxlvt; if (!apic_pm_state.active) return 0; local_irq_save(flags); /* * Make sure the APICBASE points to the right address * * FIXME! This will be wrong if we ever support suspend on * SMP! We'll need to do this as part of the CPU restore! */ if ( !x2apic_enabled ) { rdmsrl(MSR_IA32_APICBASE, msr_content); msr_content &= ~MSR_IA32_APICBASE_BASE; wrmsrl(MSR_IA32_APICBASE, msr_content | MSR_IA32_APICBASE_ENABLE | mp_lapic_addr); } else resume_x2apic(); maxlvt = get_maxlvt(); apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); apic_write(APIC_DFR, apic_pm_state.apic_dfr); apic_write(APIC_LDR, apic_pm_state.apic_ldr); apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); if (maxlvt >= 5) apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); if (maxlvt >= 6) { apic_write(APIC_CMCI, apic_pm_state.apic_lvtcmci); } if (maxlvt >= 4) apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); apic_write(APIC_TMICT, apic_pm_state.apic_tmict); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); local_irq_restore(flags); return 0; } /* * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. */ static void __init lapic_disable(char *str) { enable_local_apic = -1; setup_clear_cpu_cap(X86_FEATURE_APIC); } custom_param("nolapic", lapic_disable); static void __init lapic_enable(char *str) { enable_local_apic = 1; } custom_param("lapic", lapic_enable); static void __init apic_set_verbosity(char *str) { if (strcmp("debug", str) == 0) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", str) == 0) apic_verbosity = APIC_VERBOSE; else printk(KERN_WARNING "APIC Verbosity level %s not recognised" " use apic_verbosity=verbose or apic_verbosity=debug", str); } custom_param("apic_verbosity", apic_set_verbosity); static int __init detect_init_APIC (void) { uint64_t msr_content; u32 features; /* Disabled by kernel option? 
*/ if (enable_local_apic < 0) return -1; switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x17)) break; goto no_apic; case X86_VENDOR_INTEL: if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || (boot_cpu_data.x86 == 5 && cpu_has_apic)) break; goto no_apic; default: goto no_apic; } if (!cpu_has_apic) { /* * Over-ride BIOS and try to enable the local * APIC only if "lapic" specified. */ if (enable_local_apic <= 0) { printk("Local APIC disabled by BIOS -- " "you can enable it with \"lapic\"\n"); return -1; } /* * Some BIOSes disable the local APIC in the * APIC_BASE MSR. This can only be done in * software for Intel P6 or later and AMD K7 * (Model > 1) or later. */ rdmsrl(MSR_IA32_APICBASE, msr_content); if (!(msr_content & MSR_IA32_APICBASE_ENABLE)) { printk("Local APIC disabled by BIOS -- reenabling.\n"); msr_content &= ~MSR_IA32_APICBASE_BASE; msr_content |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; wrmsrl(MSR_IA32_APICBASE, msr_content | MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE); enabled_via_apicbase = 1; } } /* * The APIC feature bit should now be enabled * in `cpuid' */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { printk("Could not enable APIC!\n"); return -1; } set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* The BIOS may have set up the APIC at some other address */ rdmsrl(MSR_IA32_APICBASE, msr_content); if (msr_content & MSR_IA32_APICBASE_ENABLE) mp_lapic_addr = msr_content & MSR_IA32_APICBASE_BASE; if (nmi_watchdog != NMI_NONE) nmi_watchdog = NMI_LOCAL_APIC; printk("Found and enabled local APIC!\n"); apic_pm_activate(); return 0; no_apic: printk("No local APIC present or hardware disabled\n"); return -1; } void x2apic_ap_setup(void) { if ( x2apic_enabled ) __enable_x2apic(); } void __init x2apic_bsp_setup(void) { struct IO_APIC_route_entry **ioapic_entries = NULL; if ( !cpu_has_x2apic ) return; if ( !opt_x2apic ) { if ( !x2apic_enabled ) { printk("Not enabling x2APIC: disabled by cmdline.\n"); return; } printk("x2APIC: Already enabled by BIOS: Ignoring cmdline disable.\n"); } if ( !iommu_supports_eim() ) { if ( !x2apic_enabled ) { printk("Not enabling x2APIC: depends on iommu_supports_eim.\n"); return; } panic("x2APIC: already enabled by BIOS, but " "iommu_supports_eim failed"); } if ( (ioapic_entries = alloc_ioapic_entries()) == NULL ) { printk("Allocate ioapic_entries failed\n"); goto out; } if ( save_IO_APIC_setup(ioapic_entries) ) { printk("Saving IO-APIC state failed\n"); goto out; } mask_8259A(); mask_IO_APIC_setup(ioapic_entries); if ( iommu_enable_x2apic_IR() ) { if ( x2apic_enabled ) panic("Interrupt remapping could not be enabled while " "x2APIC is already enabled by BIOS"); printk(XENLOG_ERR "Failed to enable Interrupt Remapping: Will not enable x2APIC.\n"); goto restore_out; } force_iommu = 1; genapic = apic_x2apic_probe(); printk("Switched to APIC driver %s.\n", genapic->name); if ( !x2apic_enabled ) { x2apic_enabled = 1; __enable_x2apic(); } restore_out: restore_IO_APIC_setup(ioapic_entries); unmask_8259A(); out: if ( ioapic_entries ) free_ioapic_entries(ioapic_entries); } void __init init_apic_mappings(void) { unsigned long apic_phys; if ( x2apic_enabled ) goto __next; /* * If no local APIC can be found then set up a fake all * zeroes page to simulate the local APIC and another * one for the IO-APIC. 
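 * In practice that means allocating one zeroed xenheap page and mapping it
 * uncached at FIX_APIC_BASE via set_fixmap_nocache(), so subsequent
 * apic_read()/apic_write() accesses hit harmless memory; otherwise the
 * real mp_lapic_addr is mapped there instead.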
*/ if (!smp_found_config && detect_init_APIC()) { apic_phys = __pa(alloc_xenheap_page()); clear_page(__va(apic_phys)); } else apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys); __next: /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = get_apic_id(); x86_cpu_to_apicid[0] = get_apic_id(); init_ioapic_mappings(); } /***************************************************************************** * APIC calibration * * The APIC is programmed in bus cycles. * Timeout values should specified in real time units. * The "cheapest" time source is the cyclecounter. * * Thus, we need a mappings from: bus cycles <- cycle counter <- system time * * The calibration is currently a bit shoddy since it requires the external * timer chip to generate periodic timer interupts. *****************************************************************************/ /* used for system time scaling */ static u32 __read_mostly bus_scale; /* scaling factor: ns -> bus cycles */ /* * The timer chip is already set up at HZ interrupts per second here, * but we do not accept timer interrupts yet. We only allow the BP * to calibrate. */ static unsigned int __init get_8254_timer_count(void) { /*extern spinlock_t i8253_lock;*/ /*unsigned long flags;*/ unsigned int count; /*spin_lock_irqsave(&i8253_lock, flags);*/ outb_p(0x00, PIT_MODE); count = inb_p(PIT_CH0); count |= inb_p(PIT_CH0) << 8; /*spin_unlock_irqrestore(&i8253_lock, flags);*/ return count; } /* next tick in 8254 can be caught by catching timer wraparound */ static void __init wait_8254_wraparound(void) { unsigned int curr_count, prev_count; curr_count = get_8254_timer_count(); do { prev_count = curr_count; curr_count = get_8254_timer_count(); /* workaround for broken Mercury/Neptune */ if (prev_count >= curr_count + 0x100) curr_count = get_8254_timer_count(); } while (prev_count >= curr_count); } /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call * this function twice on the boot CPU, once with a bogus timeout * value, second time for real. The other (noncalibrating) CPUs * call this function only once, with the real, calibrated value. * * We do reads before writes even if unnecessary, to get around the * P5 APIC double write bug. */ #define APIC_DIVISOR 1 static void __setup_APIC_LVTT(unsigned int clocks) { unsigned int lvtt_value, tmp_value, ver; ver = GET_APIC_VERSION(apic_read(APIC_LVR)); /* NB. Xen uses local APIC timer in one-shot mode. */ lvtt_value = /*APIC_TIMER_MODE_PERIODIC |*/ LOCAL_TIMER_VECTOR; if (!APIC_INTEGRATED(ver)) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); if ( tdt_enabled ) { lvtt_value &= (~APIC_TIMER_MODE_MASK); lvtt_value |= APIC_TIMER_MODE_TSC_DEADLINE; } apic_write_around(APIC_LVTT, lvtt_value); tmp_value = apic_read(APIC_TDCR); apic_write_around(APIC_TDCR, (tmp_value | APIC_TDR_DIV_1)); apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); } static void __devinit setup_APIC_timer(void) { unsigned long flags; local_irq_save(flags); __setup_APIC_LVTT(0); local_irq_restore(flags); } /* * In this function we calibrate APIC bus clocks to the external * timer. Unfortunately we cannot use jiffies and the timer irq * to calibrate, since some later bootup code depends on getting * the first irq? Ugh. 
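 * (What the code below does instead, roughly: it lets the PIT run at HZ,
 * uses wait_8254_wraparound() to delimit LOOPS = HZ/10 PIT periods, and
 * samples the APIC_TMCCT down-counter at both ends, so that
 * result = (tt1 - tt2) * APIC_DIVISOR / LOOPS is APIC bus cycles per tick.
 * From that, bus_scale is derived such that a timeout of N nanoseconds
 * corresponds to about (bus_scale * N) >> 18 bus cycles, which is the
 * conversion reprogram_timer() performs later on.)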
* * We want to do the calibration only once since we * want to have local timer irqs syncron. CPUs connected * by the same APIC bus have the very same bus frequency. * And we want to have irqs off anyways, no accidental * APIC irq that way. */ static int __init calibrate_APIC_clock(void) { unsigned long long t1 = 0, t2 = 0; long tt1, tt2; long result; int i; unsigned long bus_freq; /* KAF: pointer-size avoids compile warns. */ u32 bus_cycle; /* length of one bus cycle in pico-seconds */ const int LOOPS = HZ/10; apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n"); /* * Put whatever arbitrary (but long enough) timeout * value into the APIC clock, we just want to get the * counter running for calibration. */ __setup_APIC_LVTT(1000000000); /* * The timer chip counts down to zero. Let's wait * for a wraparound to start exact measurement: * (the current tick might have been already half done) */ wait_8254_wraparound(); /* * We wrapped around just now. Let's start: */ if (cpu_has_tsc) rdtscll(t1); tt1 = apic_read(APIC_TMCCT); /* * Let's wait LOOPS wraprounds: */ for (i = 0; i < LOOPS; i++) wait_8254_wraparound(); tt2 = apic_read(APIC_TMCCT); if (cpu_has_tsc) rdtscll(t2); /* * The APIC bus clock counter is 32 bits only, it * might have overflown, but note that we use signed * longs, thus no extra care needed. * * underflown to be exact, as the timer counts down ;) */ result = (tt1-tt2)*APIC_DIVISOR/LOOPS; if (cpu_has_tsc) apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", ((long)(t2-t1)/LOOPS)/(1000000/HZ), ((long)(t2-t1)/LOOPS)%(1000000/HZ)); apic_printk(APIC_VERBOSE, "..... host bus clock speed is " "%ld.%04ld MHz.\n", result/(1000000/HZ), result%(1000000/HZ)); /* set up multipliers for accurate timer code */ bus_freq = result*HZ; bus_cycle = (u32) (1000000000000LL/bus_freq); /* in pico seconds */ bus_scale = (1000*262144)/bus_cycle; apic_printk(APIC_VERBOSE, "..... bus_scale = %#x\n", bus_scale); /* reset APIC to zero timeout value */ __setup_APIC_LVTT(0); return result; } void __init setup_boot_APIC_clock(void) { unsigned long flags; apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); using_apic_timer = 1; local_irq_save(flags); calibrate_APIC_clock(); if ( tdt_enable && boot_cpu_has(X86_FEATURE_TSC_DEADLINE) ) { printk(KERN_DEBUG "TSC deadline timer enabled\n"); tdt_enabled = 1; } setup_APIC_timer(); local_irq_restore(flags); } void __devinit setup_secondary_APIC_clock(void) { setup_APIC_timer(); } void disable_APIC_timer(void) { if (using_apic_timer) { unsigned long v; /* Work around AMD Erratum 411. This is a nice thing to do anyway. */ apic_write_around(APIC_TMICT, 0); v = apic_read(APIC_LVTT); apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); } } void enable_APIC_timer(void) { if (using_apic_timer) { unsigned long v; v = apic_read(APIC_LVTT); apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); } } #undef APIC_DIVISOR /* * reprogram_timer: Reprogram the APIC timer. * Timeout is a Xen system time (nanoseconds since boot); 0 disables the timer. * Returns 1 on success; 0 if the timeout is too soon or is in the past. */ int reprogram_timer(s_time_t timeout) { s_time_t expire; u32 apic_tmict = 0; /* No local APIC: timer list is polled via the PIT interrupt. */ if ( !cpu_has_apic ) return 1; if ( tdt_enabled ) { wrmsrl(MSR_IA32_TSC_DEADLINE, timeout ? 
stime2tsc(timeout) : 0); return 1; } if ( timeout && ((expire = timeout - NOW()) > 0) ) apic_tmict = min_t(u64, (bus_scale * expire) >> 18, UINT_MAX); apic_write(APIC_TMICT, (unsigned long)apic_tmict); return apic_tmict || !timeout; } void apic_timer_interrupt(struct cpu_user_regs * regs) { ack_APIC_irq(); perfc_incr(apic_timer); raise_softirq(TIMER_SOFTIRQ); } static DEFINE_PER_CPU(bool_t, state_dump_pending); void smp_send_state_dump(unsigned int cpu) { /* We overload the spurious interrupt handler to handle the dump. */ per_cpu(state_dump_pending, cpu) = 1; send_IPI_mask(cpumask_of(cpu), SPURIOUS_APIC_VECTOR); } /* * Spurious interrupts should _never_ happen with our APIC/SMP architecture. */ void spurious_interrupt(struct cpu_user_regs *regs) { /* * Check if this is a vectored interrupt (most likely, as this is probably * a request to dump local CPU state). Vectored interrupts are ACKed; * spurious interrupts are not. */ if (apic_isr_read(SPURIOUS_APIC_VECTOR)) { ack_APIC_irq(); if (this_cpu(state_dump_pending)) { this_cpu(state_dump_pending) = 0; dump_execstate(regs); goto out; } } /* see sw-dev-man vol 3, chapter 7.4.13.5 */ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should " "never happen.\n", smp_processor_id()); out: ; } /* * This interrupt should never happen with our APIC/SMP architecture */ void error_interrupt(struct cpu_user_regs *regs) { unsigned long v, v1; /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); v1 = apic_read(APIC_ESR); ack_APIC_irq(); /* Here is what the APIC error bits mean: 0: Send CS error 1: Receive CS error 2: Send accept error 3: Receive accept error 4: Reserved 5: Send illegal vector 6: Received illegal vector 7: Illegal register address */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); } /* * This interrupt handles performance counters interrupt */ void pmu_apic_interrupt(struct cpu_user_regs *regs) { ack_APIC_irq(); vpmu_do_interrupt(regs); } /* * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ int __init APIC_init_uniprocessor (void) { if (enable_local_apic < 0) clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); if (!smp_found_config && !cpu_has_apic) { skip_ioapic_setup = 1; return -1; } /* * Complain if the BIOS pretends there is one. */ if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); skip_ioapic_setup = 1; return -1; } verify_local_APIC(); connect_bsp_APIC(); /* * Hack: In case of kdump, after a crash, kernel might be booting * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid * might be zero if read from MP tables. Get it from LAPIC. 
*/ #ifdef CONFIG_CRASH_DUMP boot_cpu_physical_apicid = get_apic_id(); #endif physids_clear(phys_cpu_present_map); physid_set(boot_cpu_physical_apicid, phys_cpu_present_map); setup_local_APIC(); if (nmi_watchdog == NMI_LOCAL_APIC) check_nmi_watchdog(); #ifdef CONFIG_X86_IO_APIC if (smp_found_config) if (!skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); #endif setup_boot_APIC_clock(); return 0; } static const char * __init apic_mode_to_str(const enum apic_mode mode) { switch ( mode ) { case APIC_MODE_INVALID: return "invalid"; case APIC_MODE_DISABLED: return "disabled"; case APIC_MODE_XAPIC: return "xapic"; case APIC_MODE_X2APIC: return "x2apic"; default: return "unrecognised"; } } /* Needs to be called during startup. It records the state the BIOS * leaves the local APIC so we can undo upon kexec. */ void __init record_boot_APIC_mode(void) { /* Sanity check - we should only ever run once, but could possibly * be called several times */ if ( APIC_MODE_INVALID != apic_boot_mode ) return; apic_boot_mode = current_local_apic_mode(); apic_printk(APIC_DEBUG, "APIC boot state is '%s'\n", apic_mode_to_str(apic_boot_mode)); } /* Look at the bits in MSR_IA32_APICBASE and work out which * APIC mode we are in */ enum apic_mode current_local_apic_mode(void) { u64 msr_contents; rdmsrl(MSR_IA32_APICBASE, msr_contents); /* Reading EXTD bit from the MSR is only valid if CPUID * says so, else reserved */ if ( boot_cpu_has(X86_FEATURE_X2APIC) && (msr_contents & MSR_IA32_APICBASE_EXTD) ) return APIC_MODE_X2APIC; /* EN bit should always be valid as long as we can read the MSR */ if ( msr_contents & MSR_IA32_APICBASE_ENABLE ) return APIC_MODE_XAPIC; return APIC_MODE_DISABLED; } void check_for_unexpected_msi(unsigned int vector) { BUG_ON(apic_isr_read(vector)); } xen-4.4.0/xen/arch/x86/domain_build.c0000664000175000017500000011754612307313555015407 0ustar smbsmb/****************************************************************************** * domain_build.c * * Copyright (c) 2002-2005, K A Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for bzimage_parse */ #include #include static long __initdata dom0_nrpages; static long __initdata dom0_min_nrpages; static long __initdata dom0_max_nrpages = LONG_MAX; /* * dom0_mem=[min:,][max:,][] * * : The minimum amount of memory which should be allocated for dom0. * : The maximum amount of memory which should be allocated for dom0. * : The precise amount of memory to allocate for dom0. * * Notes: * 1. is clamped from below by and from above by available * memory and * 2. is clamped from above by available memory and * 3. is ignored if it is greater than * 4. If is not specified, it is calculated as follows: * "All of memory is allocated to domain 0, minus 1/16th which is reserved * for uses such as DMA buffers (the reservation is clamped to 128MB)." * * Each value can be specified as positive or negative: * If +ve: The specified amount is an absolute value. * If -ve: The specified amount is subtracted from total available memory. */ static long __init parse_amt(const char *s, const char **ps) { long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT; return (*s == '-') ? 
-pages : pages; } static void __init parse_dom0_mem(const char *s) { do { if ( !strncmp(s, "min:", 4) ) dom0_min_nrpages = parse_amt(s+4, &s); else if ( !strncmp(s, "max:", 4) ) dom0_max_nrpages = parse_amt(s+4, &s); else dom0_nrpages = parse_amt(s, &s); if ( *s != ',' ) break; } while ( *s++ == ',' ); } custom_param("dom0_mem", parse_dom0_mem); static unsigned int __initdata opt_dom0_max_vcpus_min = 1; static unsigned int __initdata opt_dom0_max_vcpus_max = UINT_MAX; static void __init parse_dom0_max_vcpus(const char *s) { if (*s == '-') /* -M */ opt_dom0_max_vcpus_max = simple_strtoul(s + 1, &s, 0); else /* N, N-, or N-M */ { opt_dom0_max_vcpus_min = simple_strtoul(s, &s, 0); if (*s++ == '\0') /* N */ opt_dom0_max_vcpus_max = opt_dom0_max_vcpus_min; else if (*s != '\0') /* N-M */ opt_dom0_max_vcpus_max = simple_strtoul(s, &s, 0); } } custom_param("dom0_max_vcpus", parse_dom0_max_vcpus); struct vcpu *__init alloc_dom0_vcpu0(void) { unsigned max_vcpus; max_vcpus = num_cpupool_cpus(cpupool0); if ( opt_dom0_max_vcpus_min > max_vcpus ) max_vcpus = opt_dom0_max_vcpus_min; if ( opt_dom0_max_vcpus_max < max_vcpus ) max_vcpus = opt_dom0_max_vcpus_max; if ( max_vcpus > MAX_VIRT_CPUS ) max_vcpus = MAX_VIRT_CPUS; dom0->vcpu = xzalloc_array(struct vcpu *, max_vcpus); if ( !dom0->vcpu ) return NULL; dom0->max_vcpus = max_vcpus; return alloc_vcpu(dom0, 0, 0); } static bool_t __initdata opt_dom0_shadow; boolean_param("dom0_shadow", opt_dom0_shadow); static char __initdata opt_dom0_ioports_disable[200] = ""; string_param("dom0_ioports_disable", opt_dom0_ioports_disable); /* Allow ring-3 access in long mode as guest cannot use ring 1 ... */ #define BASE_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER) #define L1_PROT (BASE_PROT|_PAGE_GUEST_KERNEL) /* ... except for compatibility mode guests. */ #define COMPAT_L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) #define L2_PROT (BASE_PROT|_PAGE_DIRTY) #define L3_PROT (BASE_PROT|_PAGE_DIRTY) #define L4_PROT (BASE_PROT|_PAGE_DIRTY) #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) #define round_pgdown(_p) ((_p)&PAGE_MASK) static struct page_info * __init alloc_chunk( struct domain *d, unsigned long max_pages) { static unsigned int __initdata last_order = MAX_ORDER; static unsigned int __initdata memflags = MEMF_no_dma; struct page_info *page; unsigned int order = get_order_from_pages(max_pages), free_order; if ( order > last_order ) order = last_order; else if ( max_pages & (max_pages - 1) ) --order; while ( (page = alloc_domheap_pages(d, order, memflags)) == NULL ) if ( order-- == 0 ) break; if ( page ) last_order = order; else if ( memflags ) { /* * Allocate up to 2MB at a time: It prevents allocating very large * chunks from DMA pools before the >4GB pool is fully depleted. */ last_order = 21 - PAGE_SHIFT; memflags = 0; return alloc_chunk(d, max_pages); } /* * Make a reasonable attempt at finding a smaller chunk at a higher * address, to avoid allocating from low memory as much as possible. 
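 * (The retry loop below is skipped while MEMF_no_dma is still in force; once
 * memflags has been cleared by the fallback above, it re-tries the allocation
 * at successively smaller orders and, whenever the new chunk sits at a higher
 * machine address, frees the lower chunk and keeps the higher one.)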
*/ for ( free_order = order; !memflags && page && order--; ) { struct page_info *pg2; if ( d->tot_pages + (1 << order) > d->max_pages ) continue; pg2 = alloc_domheap_pages(d, order, 0); if ( pg2 > page ) { free_domheap_pages(page, free_order); page = pg2; free_order = order; } else if ( pg2 ) free_domheap_pages(pg2, order); } return page; } static unsigned long __init compute_dom0_nr_pages( struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len) { unsigned long avail = avail_domheap_pages() + initial_images_nrpages(); unsigned long nr_pages = dom0_nrpages; unsigned long min_pages = dom0_min_nrpages; unsigned long max_pages = dom0_max_nrpages; /* Reserve memory for further dom0 vcpu-struct allocations... */ avail -= (d->max_vcpus - 1UL) << get_order_from_bytes(sizeof(struct vcpu)); /* ...and compat_l4's, if needed. */ if ( is_pv_32on64_domain(d) ) avail -= d->max_vcpus - 1; /* Reserve memory for iommu_dom0_init() (rough estimate). */ if ( iommu_enabled ) { unsigned int s; for ( s = 9; s < BITS_PER_LONG; s += 9 ) avail -= max_pdx >> s; } /* * If domain 0 allocation isn't specified, reserve 1/16th of available * memory for things like DMA buffers. This reservation is clamped to * a maximum of 128MB. */ if ( nr_pages == 0 ) nr_pages = -min(avail / 16, 128UL << (20 - PAGE_SHIFT)); /* Negative memory specification means "all memory - specified amount". */ if ( (long)nr_pages < 0 ) nr_pages += avail; if ( (long)min_pages < 0 ) min_pages += avail; if ( (long)max_pages < 0 ) max_pages += avail; /* Clamp dom0 memory according to min/max limits and available memory. */ nr_pages = max(nr_pages, min_pages); nr_pages = min(nr_pages, max_pages); nr_pages = min(nr_pages, avail); if ( (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) && ((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) ) { /* * Legacy Linux kernels (i.e. such without a XEN_ELFNOTE_INIT_P2M * note) require that there is enough virtual space beyond the initial * allocation to set up their initial page tables. This space is * roughly the same size as the p2m table, so make sure the initial * allocation doesn't consume more than about half the space that's * available between params.virt_base and the address space end. */ unsigned long vstart, vend, end; size_t sizeof_long = is_pv_32bit_domain(d) ? sizeof(int) : sizeof(long); vstart = parms->virt_base; vend = round_pgup(parms->virt_kend); if ( !parms->elf_notes[XEN_ELFNOTE_MOD_START_PFN].data.num ) vend += round_pgup(initrd_len); end = vend + nr_pages * sizeof_long; if ( end > vstart ) end += end - vstart; if ( end <= vstart || (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) ) { end = sizeof_long >= sizeof(end) ? 
0 : 1UL << (8 * sizeof_long); nr_pages = (end - vend) / (2 * sizeof_long); if ( dom0_min_nrpages > 0 && nr_pages < min_pages ) nr_pages = min_pages; printk("Dom0 memory clipped to %lu pages\n", nr_pages); } } d->max_pages = min_t(unsigned long, max_pages, UINT_MAX); return nr_pages; } static void __init process_dom0_ioports_disable(void) { unsigned long io_from, io_to; char *t, *s = opt_dom0_ioports_disable; const char *u; if ( *s == '\0' ) return; while ( (t = strsep(&s, ",")) != NULL ) { io_from = simple_strtoul(t, &u, 16); if ( u == t ) { parse_error: printk("Invalid ioport range <%s> " "in dom0_ioports_disable, skipping\n", t); continue; } if ( *u == '\0' ) io_to = io_from; else if ( *u == '-' ) io_to = simple_strtoul(u + 1, &u, 16); else goto parse_error; if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) ) goto parse_error; printk("Disabling dom0 access to ioport range %04lx-%04lx\n", io_from, io_to); if ( ioports_deny_access(dom0, io_from, io_to) != 0 ) BUG(); } } int __init construct_dom0( struct domain *d, const module_t *image, unsigned long image_headroom, module_t *initrd, void *(*bootstrap_map)(const module_t *), char *cmdline) { int i, cpu, rc, compatible, compat32, order, machine; struct cpu_user_regs *regs; unsigned long pfn, mfn; unsigned long nr_pages; unsigned long nr_pt_pages; unsigned long alloc_spfn; unsigned long alloc_epfn; unsigned long initrd_pfn = -1, initrd_mfn = 0; unsigned long count; struct page_info *page = NULL; start_info_t *si; struct vcpu *v = d->vcpu[0]; unsigned long long value; char *image_base = bootstrap_map(image); unsigned long image_len = image->mod_end; char *image_start = image_base + image_headroom; unsigned long initrd_len = initrd ? initrd->mod_end : 0; l4_pgentry_t *l4tab = NULL, *l4start = NULL; l3_pgentry_t *l3tab = NULL, *l3start = NULL; l2_pgentry_t *l2tab = NULL, *l2start = NULL; l1_pgentry_t *l1tab = NULL, *l1start = NULL; /* * This fully describes the memory layout of the initial domain. All * *_start address are page-aligned, except v_start (and v_end) which are * superpage-aligned. */ struct elf_binary elf; struct elf_dom_parms parms; unsigned long vkern_start; unsigned long vkern_end; unsigned long vinitrd_start; unsigned long vinitrd_end; unsigned long vphysmap_start; unsigned long vphysmap_end; unsigned long vstartinfo_start; unsigned long vstartinfo_end; unsigned long vstack_start; unsigned long vstack_end; unsigned long vpt_start; unsigned long vpt_end; unsigned long v_start; unsigned long v_end; /* Machine address of next candidate page-table page. */ paddr_t mpt_alloc; /* Sanity! */ BUG_ON(d->domain_id != 0); BUG_ON(d->vcpu[0] == NULL); BUG_ON(v->is_initialised); printk("*** LOADING DOMAIN 0 ***\n"); d->max_pages = ~0U; if ( (rc = bzimage_parse(image_base, &image_start, &image_len)) != 0 ) return rc; if ( (rc = elf_init(&elf, image_start, image_len)) != 0 ) return rc; #ifdef VERBOSE elf_set_verbose(&elf); #endif elf_parse_binary(&elf); if ( (rc = elf_xen_parse(&elf, &parms)) != 0 ) goto out; /* compatibility check */ compatible = 0; compat32 = 0; machine = elf_uval(&elf, elf.ehdr, e_machine); printk(" Xen kernel: 64-bit, lsb, compat32\n"); if (elf_32bit(&elf) && parms.pae == PAEKERN_bimodal) parms.pae = PAEKERN_extended_cr3; if (elf_32bit(&elf) && parms.pae && machine == EM_386) { compat32 = 1; compatible = 1; } if (elf_64bit(&elf) && machine == EM_X86_64) compatible = 1; printk(" Dom0 kernel: %s%s, %s, paddr %#" PRIx64 " -> %#" PRIx64 "\n", elf_64bit(&elf) ? "64-bit" : "32-bit", parms.pae ? 
", PAE" : "", elf_msb(&elf) ? "msb" : "lsb", elf.pstart, elf.pend); if ( elf.bsd_symtab_pstart ) printk(" Dom0 symbol map %#" PRIx64 " -> %#" PRIx64 "\n", elf.bsd_symtab_pstart, elf.bsd_symtab_pend); if ( !compatible ) { printk("Mismatch between Xen and DOM0 kernel\n"); rc = -EINVAL; goto out; } if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE && !test_bit(XENFEAT_dom0, parms.f_supported) ) { printk("Kernel does not support Dom0 operation\n"); rc = -EINVAL; goto out; } if ( compat32 ) { d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1; v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0]; if ( setup_compat_arg_xlat(v) != 0 ) BUG(); } nr_pages = compute_dom0_nr_pages(d, &parms, initrd_len); if ( parms.pae == PAEKERN_extended_cr3 ) set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist); if ( (parms.virt_hv_start_low != UNSET_ADDR) && elf_32bit(&elf) ) { unsigned long mask = (1UL << L2_PAGETABLE_SHIFT) - 1; value = (parms.virt_hv_start_low + mask) & ~mask; BUG_ON(!is_pv_32bit_domain(d)); if ( value > __HYPERVISOR_COMPAT_VIRT_START ) panic("Domain 0 expects too high a hypervisor start address"); HYPERVISOR_COMPAT_VIRT_START(d) = max_t(unsigned int, m2p_compat_vstart, value); } if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) ) { printk(XENLOG_WARNING "P2M table base ignored\n"); parms.p2m_base = UNSET_ADDR; } domain_set_alloc_bitsize(d); /* * Why do we need this? The number of page-table frames depends on the * size of the bootstrap address space. But the size of the address space * depends on the number of page-table frames (since each one is mapped * read-only). We have a pair of simultaneous equations in two unknowns, * which we solve by exhaustive search. */ v_start = parms.virt_base; vkern_start = parms.virt_kstart; vkern_end = parms.virt_kend; if ( parms.elf_notes[XEN_ELFNOTE_MOD_START_PFN].data.num ) { vinitrd_start = vinitrd_end = 0; vphysmap_start = round_pgup(vkern_end); } else { vinitrd_start = round_pgup(vkern_end); vinitrd_end = vinitrd_start + initrd_len; vphysmap_start = round_pgup(vinitrd_end); } vphysmap_end = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ? sizeof(unsigned long) : sizeof(unsigned int))); if ( parms.p2m_base != UNSET_ADDR ) vphysmap_end = vphysmap_start; vstartinfo_start = round_pgup(vphysmap_end); vstartinfo_end = (vstartinfo_start + sizeof(struct start_info) + sizeof(struct dom0_vga_console_info)); vpt_start = round_pgup(vstartinfo_end); for ( nr_pt_pages = 2; ; nr_pt_pages++ ) { vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE); vstack_start = vpt_end; vstack_end = vstack_start + PAGE_SIZE; v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1); if ( (v_end - vstack_end) < (512UL << 10) ) v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */ #define NR(_l,_h,_s) \ (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \ ((_l) & ~((1UL<<(_s))-1))) >> (_s)) if ( (!is_pv_32on64_domain(d) + /* # L4 */ NR(v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */ (!is_pv_32on64_domain(d) ? 
NR(v_start, v_end, L3_PAGETABLE_SHIFT) : /* # L2 */ 4) + /* # compat L2 */ NR(v_start, v_end, L2_PAGETABLE_SHIFT)) /* # L1 */ <= nr_pt_pages ) break; } count = v_end - v_start; if ( vinitrd_start ) count -= PAGE_ALIGN(initrd_len); order = get_order_from_bytes(count); if ( (1UL << order) + PFN_UP(initrd_len) > nr_pages ) panic("Domain 0 allocation is too small for kernel image"); if ( parms.p2m_base != UNSET_ADDR ) { vphysmap_start = parms.p2m_base; vphysmap_end = vphysmap_start + nr_pages * sizeof(unsigned long); } page = alloc_domheap_pages(d, order, 0); if ( page == NULL ) panic("Not enough RAM for domain 0 allocation"); alloc_spfn = page_to_mfn(page); alloc_epfn = alloc_spfn + d->tot_pages; if ( initrd_len ) { initrd_pfn = vinitrd_start ? (vinitrd_start - v_start) >> PAGE_SHIFT : d->tot_pages; initrd_mfn = mfn = initrd->mod_start; count = PFN_UP(initrd_len); if ( d->arch.physaddr_bitsize && ((mfn + count - 1) >> (d->arch.physaddr_bitsize - PAGE_SHIFT)) ) { order = get_order_from_pages(count); page = alloc_domheap_pages(d, order, 0); if ( !page ) panic("Not enough RAM for domain 0 initrd"); for ( count = -count; order--; ) if ( count & (1UL << order) ) { free_domheap_pages(page, order); page += 1UL << order; } memcpy(page_to_virt(page), mfn_to_virt(initrd->mod_start), initrd_len); mpt_alloc = (paddr_t)initrd->mod_start << PAGE_SHIFT; init_domheap_pages(mpt_alloc, mpt_alloc + PAGE_ALIGN(initrd_len)); initrd->mod_start = initrd_mfn = page_to_mfn(page); } else { while ( count-- ) if ( assign_pages(d, mfn_to_page(mfn++), 0, 0) ) BUG(); } initrd->mod_end = 0; } printk("PHYSICAL MEMORY ARRANGEMENT:\n" " Dom0 alloc.: %"PRIpaddr"->%"PRIpaddr, pfn_to_paddr(alloc_spfn), pfn_to_paddr(alloc_epfn)); if ( d->tot_pages < nr_pages ) printk(" (%lu pages to be allocated)", nr_pages - d->tot_pages); if ( initrd ) { mpt_alloc = (paddr_t)initrd->mod_start << PAGE_SHIFT; printk("\n Init. ramdisk: %"PRIpaddr"->%"PRIpaddr, mpt_alloc, mpt_alloc + initrd_len); } printk("\nVIRTUAL MEMORY ARRANGEMENT:\n" " Loaded kernel: %p->%p\n" " Init. ramdisk: %p->%p\n" " Phys-Mach map: %p->%p\n" " Start info: %p->%p\n" " Page tables: %p->%p\n" " Boot stack: %p->%p\n" " TOTAL: %p->%p\n", _p(vkern_start), _p(vkern_end), _p(vinitrd_start), _p(vinitrd_end), _p(vphysmap_start), _p(vphysmap_end), _p(vstartinfo_start), _p(vstartinfo_end), _p(vpt_start), _p(vpt_end), _p(vstack_start), _p(vstack_end), _p(v_start), _p(v_end)); printk(" ENTRY ADDRESS: %p\n", _p(parms.virt_entry)); mpt_alloc = (vpt_start - v_start) + pfn_to_paddr(alloc_spfn); if ( vinitrd_start ) mpt_alloc -= PAGE_ALIGN(initrd_len); /* Overlap with Xen protected area? */ if ( !is_pv_32on64_domain(d) ? ((v_start < HYPERVISOR_VIRT_END) && (v_end > HYPERVISOR_VIRT_START)) : (v_end > HYPERVISOR_COMPAT_VIRT_START(d)) ) { printk("DOM0 image overlaps with Xen private area.\n"); rc = -EINVAL; goto out; } if ( is_pv_32on64_domain(d) ) { v->arch.pv_vcpu.failsafe_callback_cs = FLAT_COMPAT_KERNEL_CS; v->arch.pv_vcpu.event_callback_cs = FLAT_COMPAT_KERNEL_CS; } /* WARNING: The new domain must have its 'processor' field filled in! 
*/ if ( !is_pv_32on64_domain(d) ) { maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l4_page_table; l4start = l4tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; } else { page = alloc_domheap_page(NULL, 0); if ( !page ) panic("Not enough RAM for domain 0 PML4"); page->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1; l4start = l4tab = page_to_virt(page); maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table; l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; } clear_page(l4tab); init_guest_l4_table(l4tab, d); v->arch.guest_table = pagetable_from_paddr(__pa(l4start)); if ( is_pv_32on64_domain(d) ) v->arch.guest_table_user = v->arch.guest_table; l4tab += l4_table_offset(v_start); pfn = alloc_spfn; for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ ) { if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) { maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table; l1start = l1tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; clear_page(l1tab); if ( count == 0 ) l1tab += l1_table_offset(v_start); if ( !((unsigned long)l2tab & (PAGE_SIZE-1)) ) { maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table; l2start = l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; clear_page(l2tab); if ( count == 0 ) l2tab += l2_table_offset(v_start); if ( !((unsigned long)l3tab & (PAGE_SIZE-1)) ) { if ( count || !l3start ) { maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table; l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; } l3tab = l3start; clear_page(l3tab); if ( count == 0 ) l3tab += l3_table_offset(v_start); *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT); l4tab++; } *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT); l3tab++; } *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT); l2tab++; } if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) ) mfn = pfn++; else mfn = initrd_mfn++; *l1tab = l1e_from_pfn(mfn, (!is_pv_32on64_domain(d) ? L1_PROT : COMPAT_L1_PROT)); l1tab++; page = mfn_to_page(mfn); if ( (page->u.inuse.type_info == 0) && !get_page_and_type(page, d, PGT_writable_page) ) BUG(); } if ( is_pv_32on64_domain(d) ) { /* Ensure the first four L3 entries are all populated. */ for ( i = 0, l3tab = l3start; i < 4; ++i, ++l3tab ) { if ( !l3e_get_intpte(*l3tab) ) { maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l2_page_table; l2tab = __va(mpt_alloc); mpt_alloc += PAGE_SIZE; clear_page(l2tab); *l3tab = l3e_from_paddr(__pa(l2tab), L3_PROT); } if ( i == 3 ) l3e_get_page(*l3tab)->u.inuse.type_info |= PGT_pae_xen_l2; } /* Install read-only guest visible MPT mapping. */ l2tab = l3e_to_l2e(l3start[3]); memcpy(&l2tab[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)], &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)], COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab)); } /* Pages that are part of page tables must be read only. */ l4tab = l4start + l4_table_offset(vpt_start); l3start = l3tab = l4e_to_l3e(*l4tab); l3tab += l3_table_offset(vpt_start); l2start = l2tab = l3e_to_l2e(*l3tab); l2tab += l2_table_offset(vpt_start); l1start = l1tab = l2e_to_l1e(*l2tab); l1tab += l1_table_offset(vpt_start); for ( count = 0; count < nr_pt_pages; count++ ) { l1e_remove_flags(*l1tab, _PAGE_RW); page = mfn_to_page(l1e_get_pfn(*l1tab)); /* Read-only mapping + PGC_allocated + page-table page. */ page->count_info = PGC_allocated | 3; page->u.inuse.type_info |= PGT_validated | 1; /* Top-level p.t. is pinned. */ if ( (page->u.inuse.type_info & PGT_type_mask) == (!is_pv_32on64_domain(d) ? 
PGT_l4_page_table : PGT_l3_page_table) ) { page->count_info += 1; page->u.inuse.type_info += 1 | PGT_pinned; } /* Iterate. */ if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) ) { if ( !((unsigned long)++l2tab & (PAGE_SIZE - 1)) ) { if ( !((unsigned long)++l3tab & (PAGE_SIZE - 1)) ) l3start = l3tab = l4e_to_l3e(*++l4tab); l2start = l2tab = l3e_to_l2e(*l3tab); } l1start = l1tab = l2e_to_l1e(*l2tab); } } /* Mask all upcalls... */ for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ ) shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1; printk("Dom0 has maximum %u VCPUs\n", d->max_vcpus); cpu = cpumask_first(cpupool0->cpu_valid); for ( i = 1; i < d->max_vcpus; i++ ) { cpu = cpumask_cycle(cpu, cpupool0->cpu_valid); (void)alloc_vcpu(d, i, cpu); } /* Set up CR3 value for write_ptbase */ if ( paging_mode_enabled(d) ) paging_update_paging_modes(v); else update_cr3(v); /* We run on dom0's page tables for the final part of the build process. */ write_ptbase(v); mapcache_override_current(v); /* Copy the OS image and free temporary buffer. */ elf.dest_base = (void*)vkern_start; elf.dest_size = vkern_end - vkern_start; rc = elf_load_binary(&elf); if ( rc < 0 ) { printk("Failed to load the kernel binary\n"); goto out; } bootstrap_map(NULL); if ( UNSET_ADDR != parms.virt_hypercall ) { if ( (parms.virt_hypercall < v_start) || (parms.virt_hypercall >= v_end) ) { mapcache_override_current(NULL); write_ptbase(current); printk("Invalid HYPERCALL_PAGE field in ELF notes.\n"); rc = -1; goto out; } hypercall_page_initialise( d, (void *)(unsigned long)parms.virt_hypercall); } /* Free temporary buffers. */ discard_initial_images(); /* Set up start info area. */ si = (start_info_t *)vstartinfo_start; clear_page(si); si->nr_pages = nr_pages; si->shared_info = virt_to_maddr(d->shared_info); si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN; if ( !vinitrd_start && initrd_len ) si->flags |= SIF_MOD_START_PFN; si->flags |= (xen_processor_pmbits << 8) & SIF_PM_MASK; si->pt_base = vpt_start; si->nr_pt_frames = nr_pt_pages; si->mfn_list = vphysmap_start; snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s", elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : ""); count = d->tot_pages; l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table)); l3tab = NULL; l2tab = NULL; l1tab = NULL; /* Set up the phys->machine table if not part of the initial mapping. */ if ( parms.p2m_base != UNSET_ADDR ) { unsigned long va = vphysmap_start; if ( v_start <= vphysmap_end && vphysmap_start <= v_end ) panic("DOM0 P->M table overlaps initial mapping"); while ( va < vphysmap_end ) { if ( d->tot_pages + ((round_pgup(vphysmap_end) - va) >> PAGE_SHIFT) + 3 > nr_pages ) panic("Dom0 allocation too small for initial P->M table"); if ( l1tab ) { unmap_domain_page(l1tab); l1tab = NULL; } if ( l2tab ) { unmap_domain_page(l2tab); l2tab = NULL; } if ( l3tab ) { unmap_domain_page(l3tab); l3tab = NULL; } l4tab = l4start + l4_table_offset(va); if ( !l4e_get_intpte(*l4tab) ) { page = alloc_domheap_page(d, 0); if ( !page ) break; /* No mapping, PGC_allocated + page-table page. 
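 * (The "2" below is the general reference count: one reference for
 * PGC_allocated and one for the use as a page-table page; the "1" in
 * type_info is the matching type count.  Compare the count of 3 used
 * earlier, where a read-only guest mapping contributes a third reference.)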
*/ page->count_info = PGC_allocated | 2; page->u.inuse.type_info = PGT_l3_page_table | PGT_validated | 1; l3tab = __map_domain_page(page); clear_page(l3tab); *l4tab = l4e_from_page(page, L4_PROT); } else l3tab = map_domain_page(l4e_get_pfn(*l4tab)); l3tab += l3_table_offset(va); if ( !l3e_get_intpte(*l3tab) ) { if ( cpu_has_page1gb && !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) && vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) && (page = alloc_domheap_pages(d, L3_PAGETABLE_SHIFT - PAGE_SHIFT, 0)) != NULL ) { *l3tab = l3e_from_page(page, L1_PROT|_PAGE_DIRTY|_PAGE_PSE); va += 1UL << L3_PAGETABLE_SHIFT; continue; } if ( (page = alloc_domheap_page(d, 0)) == NULL ) break; /* No mapping, PGC_allocated + page-table page. */ page->count_info = PGC_allocated | 2; page->u.inuse.type_info = PGT_l2_page_table | PGT_validated | 1; l2tab = __map_domain_page(page); clear_page(l2tab); *l3tab = l3e_from_page(page, L3_PROT); } else l2tab = map_domain_page(l3e_get_pfn(*l3tab)); l2tab += l2_table_offset(va); if ( !l2e_get_intpte(*l2tab) ) { if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) && vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) && (page = alloc_domheap_pages(d, L2_PAGETABLE_SHIFT - PAGE_SHIFT, 0)) != NULL ) { *l2tab = l2e_from_page(page, L1_PROT|_PAGE_DIRTY|_PAGE_PSE); if ( opt_allow_superpage ) get_superpage(page_to_mfn(page), d); va += 1UL << L2_PAGETABLE_SHIFT; continue; } if ( (page = alloc_domheap_page(d, 0)) == NULL ) break; /* No mapping, PGC_allocated + page-table page. */ page->count_info = PGC_allocated | 2; page->u.inuse.type_info = PGT_l1_page_table | PGT_validated | 1; l1tab = __map_domain_page(page); clear_page(l1tab); *l2tab = l2e_from_page(page, L2_PROT); } else l1tab = map_domain_page(l2e_get_pfn(*l2tab)); l1tab += l1_table_offset(va); BUG_ON(l1e_get_intpte(*l1tab)); page = alloc_domheap_page(d, 0); if ( !page ) break; *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY); va += PAGE_SIZE; va &= PAGE_MASK; } if ( !page ) panic("Not enough RAM for DOM0 P->M table"); } if ( l1tab ) unmap_domain_page(l1tab); if ( l2tab ) unmap_domain_page(l2tab); if ( l3tab ) unmap_domain_page(l3tab); unmap_domain_page(l4start); /* Write the phys->machine and machine->phys table entries. 
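 * (The loop below fills in both directions of the translation: the guest's
 * phys-to-machine array at vphysmap_start gets p2m[pfn] = mfn, and the
 * global machine-to-phys table is updated via set_gpfn_from_mfn(mfn, pfn).
 * process_pending_softirqs() is called every 2^20 pages so a large dom0
 * build does not starve the rest of the system.)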
*/ for ( pfn = 0; pfn < count; pfn++ ) { mfn = pfn + alloc_spfn; if ( pfn >= initrd_pfn ) { if ( pfn < initrd_pfn + PFN_UP(initrd_len) ) mfn = initrd->mod_start + (pfn - initrd_pfn); else mfn -= PFN_UP(initrd_len); } #ifndef NDEBUG #define REVERSE_START ((v_end - v_start) >> PAGE_SHIFT) if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) ) mfn = alloc_epfn - (pfn - REVERSE_START); #endif if ( !is_pv_32on64_domain(d) ) ((unsigned long *)vphysmap_start)[pfn] = mfn; else ((unsigned int *)vphysmap_start)[pfn] = mfn; set_gpfn_from_mfn(mfn, pfn); if (!(pfn & 0xfffff)) process_pending_softirqs(); } si->first_p2m_pfn = pfn; si->nr_p2m_frames = d->tot_pages - count; page_list_for_each ( page, &d->page_list ) { mfn = page_to_mfn(page); BUG_ON(SHARED_M2P(get_gpfn_from_mfn(mfn))); if ( get_gpfn_from_mfn(mfn) >= count ) { BUG_ON(is_pv_32bit_domain(d)); if ( !page->u.inuse.type_info && !get_page_and_type(page, d, PGT_writable_page) ) BUG(); ((unsigned long *)vphysmap_start)[pfn] = mfn; set_gpfn_from_mfn(mfn, pfn); ++pfn; if (!(pfn & 0xfffff)) process_pending_softirqs(); } } BUG_ON(pfn != d->tot_pages); #ifndef NDEBUG alloc_epfn += PFN_UP(initrd_len) + si->nr_p2m_frames; #endif while ( pfn < nr_pages ) { if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL ) panic("Not enough RAM for DOM0 reservation"); while ( pfn < d->tot_pages ) { mfn = page_to_mfn(page); #ifndef NDEBUG #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn))) #endif if ( !is_pv_32on64_domain(d) ) ((unsigned long *)vphysmap_start)[pfn] = mfn; else ((unsigned int *)vphysmap_start)[pfn] = mfn; set_gpfn_from_mfn(mfn, pfn); #undef pfn page++; pfn++; if (!(pfn & 0xfffff)) process_pending_softirqs(); } } if ( initrd_len != 0 ) { si->mod_start = vinitrd_start ?: initrd_pfn; si->mod_len = initrd_len; } memset(si->cmd_line, 0, sizeof(si->cmd_line)); if ( cmdline != NULL ) strlcpy((char *)si->cmd_line, cmdline, sizeof(si->cmd_line)); if ( fill_console_start_info((void *)(si + 1)) ) { si->console.dom0.info_off = sizeof(struct start_info); si->console.dom0.info_size = sizeof(struct dom0_vga_console_info); } if ( is_pv_32on64_domain(d) ) xlat_start_info(si, XLAT_start_info_console_dom0); /* Return to idle domain's page tables. */ mapcache_override_current(NULL); write_ptbase(current); update_domain_wallclock_time(d); v->is_initialised = 1; clear_bit(_VPF_down, &v->pause_flags); /* * Initial register values: * DS,ES,FS,GS = FLAT_KERNEL_DS * CS:EIP = FLAT_KERNEL_CS:start_pc * SS:ESP = FLAT_KERNEL_SS:start_stack * ESI = start_info * [EAX,EBX,ECX,EDX,EDI,EBP are zero] */ regs = &v->arch.user_regs; regs->ds = regs->es = regs->fs = regs->gs = !is_pv_32on64_domain(d) ? FLAT_KERNEL_DS : FLAT_COMPAT_KERNEL_DS; regs->ss = (!is_pv_32on64_domain(d) ? FLAT_KERNEL_SS : FLAT_COMPAT_KERNEL_SS); regs->cs = (!is_pv_32on64_domain(d) ? 
FLAT_KERNEL_CS : FLAT_COMPAT_KERNEL_CS); regs->eip = parms.virt_entry; regs->esp = vstack_end; regs->esi = vstartinfo_start; regs->eflags = X86_EFLAGS_IF; if ( opt_dom0_shadow ) if ( paging_enable(d, PG_SH_enable) == 0 ) paging_update_paging_modes(v); if ( supervisor_mode_kernel ) { v->arch.pv_vcpu.kernel_ss &= ~3; v->arch.user_regs.ss &= ~3; v->arch.user_regs.es &= ~3; v->arch.user_regs.ds &= ~3; v->arch.user_regs.fs &= ~3; v->arch.user_regs.gs &= ~3; printk("Dom0 runs in ring 0 (supervisor mode)\n"); if ( !test_bit(XENFEAT_supervisor_mode_kernel, parms.f_supported) ) panic("Dom0 does not support supervisor-mode execution"); } else { if ( test_bit(XENFEAT_supervisor_mode_kernel, parms.f_required) ) panic("Dom0 requires supervisor-mode execution"); } rc = 0; /* DOM0 is permitted full I/O capabilities. */ rc |= ioports_permit_access(dom0, 0, 0xFFFF); rc |= iomem_permit_access(dom0, 0UL, ~0UL); rc |= irqs_permit_access(dom0, 1, nr_irqs_gsi - 1); /* * Modify I/O port access permissions. */ /* Master Interrupt Controller (PIC). */ rc |= ioports_deny_access(dom0, 0x20, 0x21); /* Slave Interrupt Controller (PIC). */ rc |= ioports_deny_access(dom0, 0xA0, 0xA1); /* Interval Timer (PIT). */ rc |= ioports_deny_access(dom0, 0x40, 0x43); /* PIT Channel 2 / PC Speaker Control. */ rc |= ioports_deny_access(dom0, 0x61, 0x61); /* ACPI PM Timer. */ if ( pmtmr_ioport ) rc |= ioports_deny_access(dom0, pmtmr_ioport, pmtmr_ioport + 3); /* PCI configuration space (NB. 0xcf8 has special treatment). */ rc |= ioports_deny_access(dom0, 0xcfc, 0xcff); /* Command-line I/O ranges. */ process_dom0_ioports_disable(); /* * Modify I/O memory access permissions. */ /* Local APIC. */ if ( mp_lapic_addr != 0 ) { mfn = paddr_to_pfn(mp_lapic_addr); rc |= iomem_deny_access(dom0, mfn, mfn); } /* I/O APICs. */ for ( i = 0; i < nr_ioapics; i++ ) { mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr); if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) ) rc |= iomem_deny_access(dom0, mfn, mfn); } /* MSI range. */ rc |= iomem_deny_access(dom0, paddr_to_pfn(MSI_ADDR_BASE_LO), paddr_to_pfn(MSI_ADDR_BASE_LO + MSI_ADDR_DEST_ID_MASK)); /* HyperTransport range. */ if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) rc |= iomem_deny_access(dom0, paddr_to_pfn(0xfdULL << 32), paddr_to_pfn((1ULL << 40) - 1)); /* Remove access to E820_UNUSABLE I/O regions above 1MB. 
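 * (sfn is clamped up to pfn 0x100, i.e. the 1MB boundary, so regions lying
 * entirely below 1MB fail the sfn <= efn test and keep their access.)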
*/ for ( i = 0; i < e820.nr_map; i++ ) { unsigned long sfn, efn; sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul); efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1); if ( (e820.map[i].type == E820_UNUSABLE) && (e820.map[i].size != 0) && (sfn <= efn) ) rc |= iomem_deny_access(dom0, sfn, efn); } BUG_ON(rc != 0); if ( elf_check_broken(&elf) ) printk(" Xen warning: dom0 kernel broken ELF: %s\n", elf_check_broken(&elf)); iommu_dom0_init(dom0); return 0; out: if ( elf_check_broken(&elf) ) printk(" Xen dom0 kernel broken ELF: %s\n", elf_check_broken(&elf)); return rc; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/boot/0000775000175000017500000000000012307313555013542 5ustar smbsmbxen-4.4.0/xen/arch/x86/boot/Makefile0000664000175000017500000000013212307313555015176 0ustar smbsmbobj-bin-y += head.o head.o: reloc.S %.S: %.c $(MAKE) -f build32.mk $@ reloc.S: head.S xen-4.4.0/xen/arch/x86/boot/edd.S0000664000175000017500000001474112307313555014431 0ustar smbsmb/****************************************************************************** * edd.S * * BIOS Enhanced Disk Drive support * * Copyright (C) 2002, 2003, 2004 Dell, Inc. * by Matt Domsch October 2002 * conformant to T13 Committee www.t13.org * projects 1572D, 1484D, 1386D, 1226DT * disk signature read by Matt Domsch * and Andrew Wilks September 2003, June 2004 * legacy CHS retrieval by Patrick J. LoPresti * March 2004 * Command line option parsing, Matt Domsch, November 2004 * * Updated and ported for Xen by Keir Fraser June 2007 */ #include .code16 /* Offset of disc signature in the MBR. */ #define EDD_MBR_SIG_OFFSET 0x1B8 get_edd: cmpb $2, bootsym(opt_edd) # edd=off ? je edd_done cmpb $1, bootsym(opt_edd) # edd=skipmbr ? je edd_start # Read the first sector of each BIOS disk device and store the 4-byte signature edd_mbr_sig_start: movb $0x80, %dl # from device 80 movw $bootsym(boot_mbr_signature),%bx # store buffer ptr in bx edd_mbr_sig_read: pushw %bx movb $0x02, %ah # 0x02 Read Sectors movb $1, %al # read 1 sector movb $0, %dh # at head 0 movw $1, %cx # cylinder 0, sector 0 pushw %es pushw %ds popw %es movw $bootsym(boot_edd_info), %bx # disk's data goes into info pushw %dx # work around buggy BIOSes stc # work around buggy BIOSes int $0x13 sti # work around buggy BIOSes popw %dx popw %es popw %bx jc edd_mbr_sig_done # on failure, we're done. cmpb $0, %ah # some BIOSes do not set CF jne edd_mbr_sig_done # on failure, we're done. cmpw $0xaa55, bootsym(boot_edd_info)+0x1fe jne .Ledd_mbr_sig_next movl bootsym(boot_edd_info)+EDD_MBR_SIG_OFFSET,%eax movb %dl, (%bx) # store BIOS drive number movl %eax, 4(%bx) # store signature from MBR incb bootsym(boot_mbr_signature_nr) # note that we stored something addw $8, %bx # increment sig buffer ptr .Ledd_mbr_sig_next: incb %dl # increment to next device jz edd_mbr_sig_done cmpb $EDD_MBR_SIG_MAX,bootsym(boot_mbr_signature_nr) jb edd_mbr_sig_read edd_mbr_sig_done: # Do the BIOS Enhanced Disk Drive calls # This consists of two calls: # int 13h ah=41h "Check Extensions Present" # int 13h ah=48h "Get Device Parameters" # int 13h ah=08h "Legacy Get Device Parameters" # # A buffer of size EDD_INFO_MAX*(EDDEXTSIZE+EDDPARMSIZE) is reserved at # boot_edd_info, the first four bytes of which are used to store the device # number, interface support map and version results from fn41. The next four # bytes are used to store the legacy cylinders, heads, and sectors from fn08. 
# The following 74 bytes are used to store the results from fn48. # This code is sensitive to the size of the structs in edd.h edd_start: /* ds:si points at fn48 results. Fn41 results go immediately before. */ movw $bootsym(boot_edd_info)+EDDEXTSIZE, %si movb $0x80, %dl # BIOS device 0x80 edd_check_ext: movb $0x41, %ah # 0x41 Check Extensions Present movw $0x55AA, %bx # magic int $0x13 # make the call jc edd_done # no more BIOS devices cmpw $0xAA55, %bx # is magic right? jne edd_next # nope, next... movb %dl, %ds:-8(%si) # store device number movb %ah, %ds:-7(%si) # store version movw %cx, %ds:-6(%si) # store extensions incb bootsym(boot_edd_info_nr) # note that we stored something edd_get_device_params: movw $EDDPARMSIZE, %ds:(%si) # put size movw $0x0, %ds:2(%si) # work around buggy BIOSes movb $0x48, %ah # 0x48 Get Device Parameters int $0x13 # make the call # Don't check for fail return # it doesn't matter. edd_get_legacy_chs: xorw %ax, %ax movw %ax, %ds:-4(%si) movw %ax, %ds:-2(%si) # Ralf Brown's Interrupt List says to set ES:DI to # 0000h:0000h "to guard against BIOS bugs" pushw %es movw %ax, %es movw %ax, %di pushw %dx # legacy call clobbers %dl movb $0x08, %ah # 0x08 Legacy Get Device Params int $0x13 # make the call jc edd_legacy_done # failed movb %cl, %al # Low 6 bits are max andb $0x3F, %al # sector number movb %al, %ds:-1(%si) # Record max sect movb %dh, %ds:-2(%si) # Record max head number movb %ch, %al # Low 8 bits of max cyl shr $6, %cl movb %cl, %ah # High 2 bits of max cyl movw %ax, %ds:-4(%si) edd_legacy_done: popw %dx popw %es movw %si, %ax # increment si addw $EDDPARMSIZE+EDDEXTSIZE, %ax movw %ax, %si edd_next: incb %dl # increment to next device cmpb $EDD_INFO_MAX,bootsym(boot_edd_info_nr) jb edd_check_ext edd_done: ret opt_edd: .byte 0 # edd=on/off/skipmbr GLOBAL(boot_edd_info_nr) .byte 0 GLOBAL(boot_mbr_signature_nr) .byte 0 GLOBAL(boot_mbr_signature) .fill EDD_MBR_SIG_MAX*8,1,0 GLOBAL(boot_edd_info) .fill 512,1,0 # big enough for a disc sector xen-4.4.0/xen/arch/x86/boot/mem.S0000664000175000017500000000552312307313555014451 0ustar smbsmb .code16 #define SMAP 0x534d4150 #define E820MAX 128 get_memory_map: .Lmeme820: xorl %ebx, %ebx # continuation counter movw $bootsym(e820map), %di # point into the whitelist # so we can have the bios # directly write into it. 1: movl $0x0000e820, %eax # e820, upper word zeroed movl $SMAP,%edx # ascii 'SMAP' movl $20,%ecx # size of the e820rec pushw %ds # data record. popw %es int $0x15 jc .Lmem88 cmpl $SMAP,%eax # check the return is `SMAP' jne .Lmem88 movb bootsym(e820nr),%al # up to 128 entries cmpb $E820MAX,%al jae .Lmem88 incb bootsym(e820nr) movw %di,%ax addw $20,%ax movw %ax,%di cmpl $0,%ebx # check to see if jne 1b # %ebx is set to EOF .Lmem88: movb $0x88, %ah int $0x15 movw %ax,bootsym(highmem_kb) .Lmeme801: stc # fix to work around buggy xorw %cx,%cx # BIOSes which don't clear/set xorw %dx,%dx # carry on pass/error of # e801h memory size call # or merely pass cx,dx though # without changing them. movw $0xe801, %ax int $0x15 jc .Lint12 cmpw $0x0, %cx # Kludge to handle BIOSes jne 1f # which report their extended cmpw $0x0, %dx # memory in AX/BX rather than jne 1f # CX/DX. The spec I have read movw %ax, %cx # seems to indicate AX/BX movw %bx, %dx # are more reasonable anyway... 
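        # At this point %cx holds the KB of memory between 1MB and 16MB and
        # %dx the number of 64KB blocks above 16MB (possibly copied from
        # %ax/%bx by the kludge above).  The code below converts the block
        # count to KB and accumulates both values into highmem_kb.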
1: andl $0xffff,%edx # clear sign extend shll $6,%edx # and go from 64k to 1k chunks movl %edx,bootsym(highmem_kb) # store extended memory size andl $0xffff,%ecx # clear sign extend addl %ecx,bootsym(highmem_kb) # and add lower memory into .Lint12: int $0x12 movw %ax,bootsym(lowmem_kb) ret GLOBAL(e820map) .fill E820MAX*20,1,0 GLOBAL(e820nr) .long 0 GLOBAL(lowmem_kb) .long 0 GLOBAL(highmem_kb) .long 0 xen-4.4.0/xen/arch/x86/boot/build32.mk0000664000175000017500000000164512307313555015345 0ustar smbsmboverride XEN_TARGET_ARCH=x86_32 CFLAGS = include $(XEN_ROOT)/Config.mk $(call cc-options-add,CFLAGS,CC,$(EMBEDDED_EXTRA_CFLAGS)) CFLAGS += -Werror -fno-builtin -msoft-float CFLAGS := $(filter-out -flto,$(CFLAGS)) # NB. awk invocation is a portable alternative to 'head -n -1' %.S: %.bin (od -v -t x $< | tr -s ' ' | awk 'NR > 1 {print s} {s=$$0}' | \ sed 's/ /,0x/g' | sed 's/,0x$$//' | sed 's/^[0-9]*,/ .long /') >$@ %.bin: %.lnk $(OBJCOPY) -O binary $< $@ %.lnk: %.o $(LD) $(LDFLAGS_DIRECT) -N -Ttext 0 -o $@ $< %.o: %.c $(CC) $(CFLAGS) -c -fpic $< -o $@ $(OBJDUMP) -h $@ | sed -n '/[0-9]/{s,00*,0,g;p}' |\ while read idx name sz rest; do \ case "$$name" in \ .data|.data.*|.rodata|.rodata.*|.bss|.bss.*) \ test $$sz != 0 || continue; \ echo "Error: non-empty $$name: 0x$$sz" >&2; \ exit $$(expr $$idx + 1);; \ esac; \ done reloc.o: $(BASEDIR)/include/asm-x86/config.h .PRECIOUS: %.bin %.lnk xen-4.4.0/xen/arch/x86/boot/wakeup.S0000664000175000017500000001207412307313555015166 0ustar smbsmb .code16 #define wakesym(sym) (sym - wakeup_start) .align 16 ENTRY(wakeup_start) cli cld # setup data segment movw %cs, %ax movw %ax, %ds movw %ax, %ss # A stack required for BIOS call movw $wakesym(wakeup_stack), %sp pushl $0 # Kill dangerous flag early popfl # check magic number movl wakesym(real_magic), %eax cmpl $0x12345678, %eax jne bogus_real_magic # for acpi_sleep=s3_bios testl $1, wakesym(video_flags) jz 1f lcall $0xc000, $3 movw %cs, %ax # In case messed by BIOS movw %ax, %ds movw %ax, %ss # Need this? How to ret if clobbered? 1: # for acpi_sleep=s3_mode testl $2, wakesym(video_flags) jz 1f movl wakesym(video_mode), %eax call mode_setw 1: # Show some progress if VGA is resumed movw $0xb800, %ax movw %ax, %fs movw $0x0e00 + 'L', %fs:(0x10) # boot trampoline is under 1M, and shift its start into # %fs to reference symbols in that area mov wakesym(trampoline_seg), %fs lidt %fs:bootsym(idt_48) lgdt %fs:bootsym(gdt_48) movw $1, %ax lmsw %ax # Turn on CR0.PE ljmpl $BOOT_CS32, $bootsym_rel(wakeup_32, 6) /* This code uses an extended set of video mode numbers. These include: * Aliases for standard modes * NORMAL_VGA (-1) * EXTENDED_VGA (-2) * ASK_VGA (-3) * Video modes numbered by menu position -- NOT RECOMMENDED because of lack * of compatibility when extending the table. These are between 0x00 and 0xff. 
*/ #define VIDEO_FIRST_MENU 0x0000 /* Standard BIOS video modes (BIOS number + 0x0100) */ #define VIDEO_FIRST_BIOS 0x0100 /* VESA BIOS video modes (VESA number + 0x0200) */ #define VIDEO_FIRST_VESA 0x0200 /* Video7 special modes (BIOS number + 0x0900) */ #define VIDEO_FIRST_V7 0x0900 # Setting of user mode (AX=mode ID) => CF=success mode_setw: movw %ax, %bx cmpb $VIDEO_FIRST_VESA>>8, %ah jnc check_vesaw decb %ah setbadw: clc ret check_vesaw: subb $VIDEO_FIRST_VESA>>8, %bh orw $0x4000, %bx # Use linear frame buffer movw $0x4f02, %ax # VESA BIOS mode set call int $0x10 cmpw $0x004f, %ax # AL=4f if implemented jnz _setbadw # AH=0 if OK stc ret _setbadw: jmp setbadw bogus_real_magic: movw $0x0e00 + 'B', %fs:(0x12) jmp bogus_real_magic .align 4 real_magic: .long 0x12345678 GLOBAL(video_mode) .long 0 GLOBAL(video_flags) .long 0 trampoline_seg: .word 0 .pushsection .trampoline_seg, "a" .long trampoline_seg - . .popsection .code32 # Now in protect mode, with paging disabled # Add offset for any reference to xen specific symbols wakeup_32: /* Set up segment registers and initial stack for protected mode */ mov $BOOT_DS, %eax mov %eax, %ds mov %eax, %ss mov $bootsym_rel(wakeup_stack, 4, %esp) # check saved magic again mov $sym_phys(saved_magic), %eax add bootsym_rel(trampoline_xen_phys_start, 4, %eax) mov (%eax), %eax cmp $0x9abcdef0, %eax jne bogus_saved_magic /* fpu init? */ /* Initialise CR4. */ mov $X86_CR4_PAE, %ecx mov %ecx, %cr4 /* Load pagetable base register */ mov $sym_phys(idle_pg_table),%eax add bootsym_rel(trampoline_xen_phys_start,4,%eax) mov %eax,%cr3 /* Will cpuid feature change after resume? */ /* Set up EFER (Extended Feature Enable Register). */ mov bootsym_rel(cpuid_ext_features,4,%edi) test $0x20100800,%edi /* SYSCALL/SYSRET, No Execute, Long Mode? */ jz .Lskip_eferw movl $MSR_EFER,%ecx rdmsr btsl $_EFER_LME,%eax /* Long Mode */ btsl $_EFER_SCE,%eax /* SYSCALL/SYSRET */ btl $20,%edi /* No Execute? */ jnc 1f btsl $_EFER_NX,%eax /* No Execute */ 1: wrmsr .Lskip_eferw: wbinvd /* Enable paging and flush prefetch queue */ mov $0x80050033,%eax /* hi-to-lo: PG,AM,WP,NE,ET,MP,PE */ mov %eax,%cr0 jmp 1f 1: /* Now in compatibility mode. Long-jump to 64-bit mode */ ljmp $BOOT_CS64, $bootsym_rel(wakeup_64,6) .code64 wakeup_64: /* Jump to high mappings and the higher-level wakeup code. */ movq ret_point(%rip), %rbx jmp *%rbx ret_point: .quad __ret_point bogus_saved_magic: movw $0x0e00 + 'S', 0xb8014 jmp bogus_saved_magic .align 16 .fill PAGE_SIZE,1,0 wakeup_stack: xen-4.4.0/xen/arch/x86/boot/trampoline.S0000664000175000017500000001434512307313555016047 0ustar smbsmb .code16 /* NB. bootsym() is only usable in real mode, or via BOOT_PSEUDORM_DS. */ #undef bootsym #define bootsym(s) ((s)-trampoline_start) #define bootsym_rel(sym, off, opnd...) 
\ bootsym(sym),##opnd; \ 111:; \ .pushsection .trampoline_rel, "a"; \ .long 111b - (off) - .; \ .popsection #define bootsym_segrel(sym, off) \ $0,$bootsym(sym); \ 111:; \ .pushsection .trampoline_seg, "a"; \ .long 111b - (off) - .; \ .popsection GLOBAL(trampoline_realmode_entry) mov %cs,%ax mov %ax,%ds movb $0xA5,bootsym(trampoline_cpu_started) cld cli lidt bootsym(idt_48) lgdt bootsym(gdt_48) mov $1,%bl # EBX != 0 indicates we are an AP xor %ax, %ax inc %ax lmsw %ax # CR0.PE = 1 (enter protected mode) ljmpl $BOOT_CS32,$bootsym_rel(trampoline_protmode_entry,6) idt_48: .word 0, 0, 0 # base = limit = 0 gdt_48: .word 6*8-1 .long bootsym_rel(trampoline_gdt,4) trampoline_gdt: /* 0x0000: unused */ .quad 0x0000000000000000 /* 0x0008: ring 0 code, 32-bit mode */ .quad 0x00cf9a000000ffff /* 0x0010: ring 0 code, 64-bit mode */ .quad 0x00af9a000000ffff /* 0x0018: ring 0 data */ .quad 0x00cf92000000ffff /* 0x0020: real-mode code @ BOOT_TRAMPOLINE */ .long 0x0000ffff .long 0x00009a00 /* 0x0028: real-mode data @ BOOT_TRAMPOLINE */ .long 0x0000ffff .long 0x00009200 .pushsection .trampoline_rel, "a" .long trampoline_gdt + BOOT_PSEUDORM_CS + 2 - . .long trampoline_gdt + BOOT_PSEUDORM_DS + 2 - . .popsection GLOBAL(cpuid_ext_features) .long 0 GLOBAL(trampoline_xen_phys_start) .long 0 GLOBAL(trampoline_cpu_started) .byte 0 .code32 trampoline_protmode_entry: /* Set up a few descriptors: on entry only CS is guaranteed good. */ mov $BOOT_DS,%eax mov %eax,%ds mov %eax,%es /* Set up FPU. */ fninit /* Initialise CR4. */ mov $X86_CR4_PAE,%ecx mov %ecx,%cr4 /* Load pagetable base register. */ mov $sym_phys(idle_pg_table),%eax add bootsym_rel(trampoline_xen_phys_start,4,%eax) mov %eax,%cr3 /* Set up EFER (Extended Feature Enable Register). */ mov bootsym_rel(cpuid_ext_features,4,%edi) test $0x20100800,%edi /* SYSCALL/SYSRET, No Execute, Long Mode? */ jz .Lskip_efer movl $MSR_EFER,%ecx rdmsr btsl $_EFER_LME,%eax /* Long Mode */ btsl $_EFER_SCE,%eax /* SYSCALL/SYSRET */ btl $20,%edi /* No Execute? */ jnc 1f btsl $_EFER_NX,%eax /* No Execute */ 1: wrmsr .Lskip_efer: mov $0x80050033,%eax /* hi-to-lo: PG,AM,WP,NE,ET,MP,PE */ mov %eax,%cr0 jmp 1f 1: /* Now in compatibility mode. Long-jump into 64-bit mode. */ ljmp $BOOT_CS64,$bootsym_rel(start64,6) .code64 start64: /* Jump to high mappings. */ mov high_start(%rip),%rax jmpq *%rax high_start: .quad __high_start .code32 trampoline_boot_cpu_entry: cmpb $0,bootsym_rel(skip_realmode,5) jnz .Lskip_realmode /* Load pseudo-real-mode segments. */ mov $BOOT_PSEUDORM_DS,%eax mov %eax,%ds mov %eax,%es mov %eax,%fs mov %eax,%gs mov %eax,%ss /* Switch to pseudo-rm CS, enter real mode, and flush insn queue. */ mov %cr0,%eax dec %eax ljmp $BOOT_PSEUDORM_CS,$bootsym(1f) .code16 1: mov %eax,%cr0 # CR0.PE = 0 (leave protected mode) /* Load proper real-mode values into %cs, %ds, %es and %ss. */ ljmp bootsym_segrel(1f,2) 1: mov %cs,%ax mov %ax,%ds mov %ax,%es mov %ax,%ss /* Initialise stack pointer and IDT, and enable irqs. */ xor %esp,%esp lidt bootsym(rm_idt) sti /* * Declare that our target operating mode is long mode. * Initialise 32-bit registers since some buggy BIOSes depend on it. */ xor %ecx,%ecx xor %edx,%edx xor %esi,%esi xor %edi,%edi xor %ebp,%ebp movl $0xec00,%eax # declare target operating mode movl $0x0002,%ebx # long mode int $0x15 /* * Do real-mode work: * 1. Get memory map. * 2. Get Enhanced Disk Drive (EDD) information. * 3. Set video mode. * 4. Get keyboard shift flags. 
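 * The code below performs these four steps in order: three helper calls
 * (get_memory_map, get_edd, video) followed by an INT 16h whose result is
 * stored in kbd_shift_flags.  The helpers live in the .S files included at
 * the end of this file and leave their results in low-memory buffers such
 * as e820map/e820nr and boot_edd_info.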
*/ call get_memory_map call get_edd call video mov $0x0200,%ax int $0x16 mov %al,bootsym(kbd_shift_flags) /* Disable irqs before returning to protected mode. */ cli /* Reset GDT and IDT. Some BIOSes clobber GDTR. */ lidt bootsym(idt_48) lgdt bootsym(gdt_48) /* Enter protected mode, and flush insn queue. */ xor %ax,%ax inc %ax lmsw %ax # CR0.PE = 1 (enter protected mode) /* Load proper protected-mode values into all segment registers. */ ljmpl $BOOT_CS32,$bootsym_rel(1f,6) .code32 1: mov $BOOT_DS,%eax mov %eax,%ds mov %eax,%es mov %eax,%fs mov %eax,%gs mov %eax,%ss .Lskip_realmode: /* EBX == 0 indicates we are the BP (Boot Processor). */ xor %ebx,%ebx /* Jump to the common bootstrap entry point. */ jmp trampoline_protmode_entry skip_realmode: .byte 0 GLOBAL(kbd_shift_flags) .byte 0 rm_idt: .word 256*4-1, 0, 0 #include "mem.S" #include "edd.S" #include "video.S" #include "wakeup.S" xen-4.4.0/xen/arch/x86/boot/reloc.c0000664000175000017500000000547312307313555015023 0ustar smbsmb/****************************************************************************** * reloc.c * * 32-bit flat memory-map routines for relocating Multiboot structures * and modules. This is most easily done early with paging disabled. * * Copyright (c) 2009, Citrix Systems, Inc. * * Authors: * Keir Fraser */ /* entered with %eax = BOOT_TRAMPOLINE */ asm ( " .text \n" " .globl _start \n" "_start: \n" " call 1f \n" "1: pop %ebx \n" " mov %eax,alloc-1b(%ebx) \n" " jmp reloc \n" ); /* This is our data. Because the code must be relocatable, no BSS is * allowed. All data is accessed PC-relative with inline assembly. */ asm ( "alloc: \n" " .long 0 \n" ); typedef unsigned int u32; #include "../../../include/xen/multiboot.h" static void *reloc_mbi_struct(void *old, unsigned int bytes) { void *new; asm( " call 1f \n" "1: pop %%edx \n" " mov alloc-1b(%%edx),%0 \n" " sub %1,%0 \n" " and $~15,%0 \n" " mov %0,alloc-1b(%%edx) \n" " mov %0,%%edi \n" " rep movsb \n" : "=&r" (new), "+c" (bytes), "+S" (old) : : "edx", "edi"); return new; } static char *reloc_mbi_string(char *old) { char *p; for ( p = old; *p != '\0'; p++ ) continue; return reloc_mbi_struct(old, p - old + 1); } multiboot_info_t *reloc(multiboot_info_t *mbi_old) { multiboot_info_t *mbi = reloc_mbi_struct(mbi_old, sizeof(*mbi)); int i; if ( mbi->flags & MBI_CMDLINE ) mbi->cmdline = (u32)reloc_mbi_string((char *)mbi->cmdline); if ( mbi->flags & MBI_MODULES ) { module_t *mods = reloc_mbi_struct( (module_t *)mbi->mods_addr, mbi->mods_count * sizeof(module_t)); mbi->mods_addr = (u32)mods; for ( i = 0; i < mbi->mods_count; i++ ) { if ( mods[i].string ) mods[i].string = (u32)reloc_mbi_string((char *)mods[i].string); } } if ( mbi->flags & MBI_MEMMAP ) mbi->mmap_addr = (u32)reloc_mbi_struct( (memory_map_t *)mbi->mmap_addr, mbi->mmap_length); if ( mbi->flags & MBI_LOADERNAME ) mbi->boot_loader_name = (u32)reloc_mbi_string( (char *)mbi->boot_loader_name); /* Mask features we don't understand or don't relocate. */ mbi->flags &= (MBI_MEMLIMITS | MBI_BOOTDEV | MBI_CMDLINE | MBI_MODULES | MBI_MEMMAP | MBI_LOADERNAME); return mbi; } xen-4.4.0/xen/arch/x86/boot/cmdline.S0000664000175000017500000002243012307313555015302 0ustar smbsmb/****************************************************************************** * cmdline.S * * Early command-line parsing. */ .code32 #include "video.h" # NB. String pointer on stack is modified to point past parsed digits. 
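# For reference, the routine is equivalent to the following C sketch.  This
# is illustrative only and not part of the original source; digit_value() is
# a hypothetical helper mapping '0'-'9'/'A'-'F'/'a'-'f' to 0-15 and anything
# else to a value >= 16.
#
#   unsigned int latoi(const char **s)
#   {
#       unsigned int base = 10, v = 0, d;
#
#       if ( **s == '0' )
#       {
#           base = 8; ++*s;                          /* leading '0' => octal */
#           if ( **s == 'x' ) { base = 16; ++*s; }   /* '0x' => hex */
#       }
#       while ( (d = digit_value(**s)) < base )
#       {
#           v = v * base + d;                        /* accumulate one digit */
#           ++*s;                                    /* caller's pointer advances */
#       }
#       return v;
#   }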
.Latoi: push %ebx push %ecx push %edx push %esi xor %ebx,%ebx /* %ebx = accumulator */ mov $10,%ecx /* %ecx = base (default base 10) */ mov 16+4(%esp),%esi /* %esi = pointer into ascii string. */ lodsb cmpb $'0',%al jne 2f mov $8,%ecx /* Prefix '0' => octal (base 8) */ lodsb cmpb $'x',%al jne 2f mov $16,%ecx /* Prefix '0x' => hex (base 16) */ 1: lodsb 2: sub $'0',%al jb 4f cmp $9,%al jbe 3f sub $'A'-'0'-10,%al jb 4f cmp $15,%al jbe 3f sub $'a'-'A',%al jb 4f 3: cmp %cl,%al jae 4f movzbl %al,%eax xchg %eax,%ebx mul %ecx xchg %eax,%ebx add %eax,%ebx jmp 1b 4: mov %ebx,%eax dec %esi mov %esi,16+4(%esp) pop %esi pop %edx pop %ecx pop %ebx ret .Lstrstr: push %ecx push %edx push %esi push %edi xor %eax,%eax xor %ecx,%ecx not %ecx mov 16+4(%esp),%esi mov 16+8(%esp),%edi repne scasb not %ecx dec %ecx mov %ecx,%edx 1: mov 16+8(%esp),%edi mov %esi,%eax mov %edx,%ecx repe cmpsb je 2f xchg %eax,%esi inc %esi cmpb $0,-1(%eax) jne 1b xor %eax,%eax 2: pop %edi pop %esi pop %edx pop %ecx ret .Lstr_prefix: push %esi push %edi mov 8+4(%esp),%esi /* 1st arg is prefix string */ mov 8+8(%esp),%edi /* 2nd arg is main string */ 1: lodsb test %al,%al jz 2f scasb je 1b sbb %eax,%eax or $1,%al jmp 3f 2: xor %eax,%eax 3: pop %edi pop %esi ret .Lstrlen: push %ecx push %esi push %edi xor %eax,%eax xor %ecx,%ecx not %ecx mov 12+4(%esp),%edi repne scasb not %ecx dec %ecx mov %ecx,%eax pop %edi pop %esi pop %ecx ret .Lfind_option: mov 4(%esp),%eax dec %eax push %ebx 1: pushl 4+8(%esp) inc %eax push %eax call .Lstrstr add $8,%esp test %eax,%eax jz 3f cmp %eax,4+4(%esp) je 2f cmpb $' ',-1(%eax) jne 1b 2: mov %eax,%ebx pushl 4+8(%esp) call .Lstrlen add $4,%esp xadd %eax,%ebx /* NUL check (as $'\0' == 0x30 in GAS) */ cmpb $0,(%ebx) je 3f cmpb $' ',(%ebx) je 3f cmpb $'=',(%ebx) jne 1b 3: pop %ebx ret cmdline_parse_early: pusha /* Bail if there is no command line to parse. */ mov sym_phys(multiboot_ptr),%ebx mov MB_flags(%ebx),%eax test $4,%al jz .Lcmdline_exit mov MB_cmdline(%ebx),%eax test %eax,%eax jz .Lcmdline_exit /* Check for 'no-real-mode' command-line option. */ pushl $sym_phys(.Lno_rm_opt) pushl MB_cmdline(%ebx) call .Lfind_option test %eax,%eax setnz %al or %al,sym_phys(skip_realmode) /* Check for 'tboot=' command-line option. */ movl $sym_phys(.Ltboot_opt),4(%esp) call .Lfind_option test %eax,%eax setnz %al or %al,sym_phys(skip_realmode) /* tboot= implies no-real-mode */ .Lparse_edd: /* Check for 'edd=' command-line option. */ movl $sym_phys(.Ledd_opt),4(%esp) call .Lfind_option test %eax,%eax jz .Lparse_edid cmpb $'=',3(%eax) jne .Lparse_edid add $4,%eax movb $2,sym_phys(opt_edd) /* opt_edd=2: edd=off */ cmpw $0x666f,(%eax) /* 0x666f == "of" */ je .Lparse_edid decb sym_phys(opt_edd) /* opt_edd=1: edd=skipmbr */ cmpw $0x6b73,(%eax) /* 0x6b73 == "sk" */ je .Lparse_edid decb sym_phys(opt_edd) /* opt_edd=0: edd=on (default) */ .Lparse_edid: /* Check for 'edid=' command-line option. */ movl $sym_phys(.Ledid_opt),4(%esp) call .Lfind_option test %eax,%eax jz .Lparse_vga cmpb $'=',4(%eax) jne .Lparse_vga add $5,%eax mov %eax,%ebx push %ebx pushl $sym_phys(.Ledid_force) call .Lstr_prefix add $8,%esp movb $2,sym_phys(opt_edid) /* opt_edid=2: edid=force */ test %eax,%eax jz .Lparse_vga push %ebx pushl $sym_phys(.Ledid_no) call .Lstr_prefix add $8,%esp decb sym_phys(opt_edid) /* opt_edid=1: edid=no */ test %eax,%eax jz .Lparse_vga decb sym_phys(opt_edid) /* opt_edid=0: default */ .Lparse_vga: /* Check for 'vga=' command-line option. 
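 * The parser below recognises vga=text-80x<rows>,
 * vga=gfx-<width>x<height>x<depth>, vga=mode-<number> and vga=current;
 * any other value leaves the default ASK_VGA setting, which brings up the
 * interactive mode menu at boot.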
*/ movl $sym_phys(.Lvga_opt),4(%esp) call .Lfind_option add $8,%esp test %eax,%eax jz .Lcmdline_exit cmpb $'=',3(%eax) jne .Lcmdline_exit add $4,%eax /* Found the 'vga=' option. Default option is to display vga menu. */ movw $ASK_VGA,sym_phys(boot_vid_mode) /* Check for 'vga=text-80x. */ mov %eax,%ebx push %ebx pushl $sym_phys(.Lvga_text80) call .Lstr_prefix add $8,%esp test %eax,%eax jnz .Lparse_vga_gfx /* We have 'vga=text-80x'. */ add $8,%ebx push %ebx call .Latoi add $4,%esp mov %ax,%bx lea sym_phys(.Lvga_text_modes),%esi 1: lodsw test %ax,%ax jz .Lcmdline_exit cmp %ax,%bx lodsw jne 1b mov %ax,sym_phys(boot_vid_mode) jmp .Lcmdline_exit .Lparse_vga_gfx: /* Check for 'vga=gfx-xx'. */ push %ebx pushl $sym_phys(.Lvga_gfx) call .Lstr_prefix add $8,%esp test %eax,%eax jnz .Lparse_vga_mode /* We have 'vga=gfx-xx'. */ /* skip 'gfx-' */ add $4,%ebx /* parse */ push %ebx call .Latoi pop %esi mov %ax,sym_phys(vesa_size)+0 /* skip 'x' */ lodsb cmpb $'x',%al jne .Lcmdline_exit /* parse */ push %esi call .Latoi pop %esi mov %ax,sym_phys(vesa_size)+2 /* skip 'x' */ lodsb cmpb $'x',%al jne .Lcmdline_exit /* parse */ push %esi call .Latoi pop %esi mov %ax,sym_phys(vesa_size)+4 /* commit to vesa mode */ movw $VIDEO_VESA_BY_SIZE,sym_phys(boot_vid_mode) jmp .Lcmdline_exit .Lparse_vga_mode: /* Check for 'vga=mode-'. */ push %ebx pushl $sym_phys(.Lvga_mode) call .Lstr_prefix add $8,%esp test %eax,%eax jnz .Lparse_vga_current /* We have 'vga=mode-'. */ add $5,%ebx push %ebx call .Latoi add $4,%esp mov %ax,sym_phys(boot_vid_mode) jmp .Lcmdline_exit .Lparse_vga_current: /* Check for 'vga=current'. */ push %ebx pushl $sym_phys(.Lvga_current) call .Lstr_prefix add $8,%esp test %eax,%eax jnz .Lcmdline_exit /* We have 'vga=current'. */ movw $VIDEO_CURRENT_MODE,sym_phys(boot_vid_mode) .Lcmdline_exit: popa ret .Lvga_text_modes: /* rows, mode_number */ .word 25,VIDEO_80x25 .word 50,VIDEO_80x50 .word 43,VIDEO_80x43 .word 28,VIDEO_80x28 .word 30,VIDEO_80x30 .word 34,VIDEO_80x34 .word 60,VIDEO_80x60 .word 0 .Lvga_opt: .asciz "vga" .Lvga_text80: .asciz "text-80x" .Lvga_gfx: .asciz "gfx-" .Lvga_mode: .asciz "mode-" .Lvga_current: .asciz "current" .Lno_rm_opt: .asciz "no-real-mode" .Ltboot_opt: .asciz "tboot" .Ledid_opt: .asciz "edid" .Ledid_force: .asciz "force" .Ledid_no: .asciz "no" .Ledd_opt: .asciz "edd" xen-4.4.0/xen/arch/x86/boot/head.S0000664000175000017500000001625312307313555014576 0ustar smbsmb#include #include #include #include #include #include #include #include .text .code32 #define sym_phys(sym) ((sym) - __XEN_VIRT_START) #define BOOT_CS32 0x0008 #define BOOT_CS64 0x0010 #define BOOT_DS 0x0018 #define BOOT_PSEUDORM_CS 0x0020 #define BOOT_PSEUDORM_DS 0x0028 ENTRY(start) jmp __start .align 4 /*** MULTIBOOT HEADER ****/ #define MULTIBOOT_HEADER_FLAGS (MULTIBOOT_HEADER_MODS_ALIGNED | \ MULTIBOOT_HEADER_WANT_MEMORY) /* Magic number indicating a Multiboot header. */ .long MULTIBOOT_HEADER_MAGIC /* Flags to bootloader (see Multiboot spec). */ .long MULTIBOOT_HEADER_FLAGS /* Checksum: must be the negated sum of the first two fields. */ .long -(MULTIBOOT_HEADER_MAGIC + MULTIBOOT_HEADER_FLAGS) .section .init.text, "ax" .Lbad_cpu_msg: .asciz "ERR: Not a 64-bit CPU!" .Lbad_ldr_msg: .asciz "ERR: Not a Multiboot bootloader!" 
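# bad_cpu and not_multiboot below load %esi with the address of the matching
# message above and enter print_err, which emits each character twice: once
# to COM1 (port 0x3f8, polling the Line Status Register for THR-empty first)
# and once to the VGA text buffer at 0xB8000 with attribute 0x07, then spins
# forever when the terminating NUL is reached.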
bad_cpu: mov $(sym_phys(.Lbad_cpu_msg)),%esi # Error message jmp print_err not_multiboot: mov $(sym_phys(.Lbad_ldr_msg)),%esi # Error message print_err: mov $0xB8000,%edi # VGA framebuffer 1: mov (%esi),%bl test %bl,%bl # Terminate on '\0' sentinel 2: je 2b mov $0x3f8+5,%dx # UART Line Status Register 3: in %dx,%al test $0x20,%al # Test THR Empty flag je 3b mov $0x3f8+0,%dx # UART Transmit Holding Register mov %bl,%al out %al,%dx # Send a character over the serial line movsb # Write a character to the VGA framebuffer mov $7,%al stosb # Write an attribute to the VGA framebuffer jmp 1b gdt_boot_descr: .word 6*8-1 .long sym_phys(trampoline_gdt) __start: cld cli /* Initialise GDT and basic data segments. */ lgdt %cs:sym_phys(gdt_boot_descr) mov $BOOT_DS,%ecx mov %ecx,%ds mov %ecx,%es mov %ecx,%ss /* Check for Multiboot bootloader */ cmp $0x2BADB002,%eax jne not_multiboot /* Set up trampoline segment 64k below EBDA */ movzwl 0x40e,%eax /* EBDA segment */ cmp $0xa000,%eax /* sanity check (high) */ jae 0f cmp $0x4000,%eax /* sanity check (low) */ jae 1f 0: movzwl 0x413,%eax /* use base memory size on failure */ shl $10-4,%eax 1: /* * Compare the value in the BDA with the information from the * multiboot structure (if available) and use the smallest. */ testb $MBI_MEMLIMITS,(%ebx) jz 2f /* not available? BDA value will be fine */ mov 4(%ebx),%edx cmp $0x100,%edx /* is the multiboot value too small? */ jb 2f /* if so, do not use it */ shl $10-4,%edx cmp %eax,%edx /* compare with BDA value */ cmovb %edx,%eax /* and use the smaller */ 2: /* Reserve 64kb for the trampoline */ sub $0x1000,%eax /* From arch/x86/smpboot.c: start_eip had better be page-aligned! */ xor %al, %al shl $4, %eax mov %eax,sym_phys(trampoline_phys) /* Save the Multiboot info struct (after relocation) for later use. */ mov $sym_phys(cpu0_stack)+1024,%esp push %ebx call reloc mov %eax,sym_phys(multiboot_ptr) /* Initialize BSS (no nasty surprises!) */ mov $sym_phys(__bss_start),%edi mov $sym_phys(_end),%ecx sub %edi,%ecx xor %eax,%eax rep stosb /* Interrogate CPU extended features via CPUID. */ mov $0x80000000,%eax cpuid xor %edx,%edx cmp $0x80000000,%eax # any function > 0x80000000? jbe 1f mov $0x80000001,%eax cpuid 1: mov %edx,sym_phys(cpuid_ext_features) mov %edx,sym_phys(boot_cpu_data)+CPUINFO86_ext_features /* Check for availability of long mode. */ bt $29,%edx jnc bad_cpu /* Initialise L2 boot-map page table entries (16MB). */ mov $sym_phys(l2_bootmap),%edx mov $PAGE_HYPERVISOR|_PAGE_PSE,%eax mov $8,%ecx 1: mov %eax,(%edx) add $8,%edx add $(1<= 0xa0 && pfn < 0xc0 .long (pfn << PAGE_SHIFT) | PAGE_HYPERVISOR_NOCACHE | MAP_SMALL_PAGES .else .long (pfn << PAGE_SHIFT) | PAGE_HYPERVISOR | MAP_SMALL_PAGES .endif .long 0 pfn = pfn + 1 .endr .size l1_identmap, . - l1_identmap xen-4.4.0/xen/arch/x86/boot/video.S0000664000175000017500000007372312307313555015010 0ustar smbsmb/****************************************************************************** * video.S * * Display adapter & video mode setup, version 2.13 (14-May-99) * * Copyright (C) 1995 -- 1998 Martin Mares * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson * * Rewritten to use GNU 'as' by Chris Noe May 1999 * * Updated and ported for Xen by Keir Fraser June 2007 */ .code16 #include "video.h" /* Scratch space layout: trampoline_end to trampoline_end+0x1000. 
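A rough C model of how head.S above picks the trampoline base: start from the EBDA segment word in the BDA at 0x40e, fall back to the base-memory size at 0x413 if that value looks implausible, clamp against the Multiboot mem_lower field when present, then reserve 64 KiB and page-align. Values stay in 16-byte paragraphs until the final shift; the helper name is illustrative and MBI_MEMLIMITS stands for the Multiboot "memory info" flag bit.

#include <stdint.h>

#define MBI_MEMLIMITS 1u                         // multiboot mem_lower/mem_upper valid

static uint32_t trampoline_base(uint16_t bda_ebda_seg,   // BDA word at 0x40e
                                uint16_t bda_base_kib,   // BDA word at 0x413
                                uint32_t mbi_flags,
                                uint32_t mbi_mem_lower)  // in KiB
{
    uint32_t paras = bda_ebda_seg;

    if (paras >= 0xa000 || paras < 0x4000)       // EBDA segment implausible?
        paras = (uint32_t)bda_base_kib << (10 - 4);   // KiB -> paragraphs

    if ((mbi_flags & MBI_MEMLIMITS) && mbi_mem_lower >= 0x100) {
        uint32_t mb = mbi_mem_lower << (10 - 4);
        if (mb < paras)                          // use the smaller of the two
            paras = mb;
    }

    paras -= 0x1000;                             // reserve 64 KiB (0x1000 paragraphs)
    paras &= ~0xffu;                             // page-align (256 paragraphs = 4 KiB)
    return paras << 4;                           // paragraphs -> bytes
}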
*/ #define modelist bootsym(trampoline_end) /* 2kB (256 entries) */ #define vesa_glob_info (modelist + 0x800) /* 1kB */ #define vesa_mode_info (vesa_glob_info + 0x400) /* 1kB */ /* Retrieve Extended Display Identification Data. */ #define CONFIG_FIRMWARE_EDID /* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ #undef CONFIG_VIDEO_400_HACK /* Positions of various video parameters passed to the kernel */ /* (see also include/linux/tty.h) */ #define PARAM_CURSOR_POS 0x00 #define PARAM_VIDEO_MODE 0x02 #define PARAM_VIDEO_COLS 0x03 #define PARAM_VIDEO_LINES 0x04 #define PARAM_HAVE_VGA 0x05 #define PARAM_FONT_POINTS 0x06 #define PARAM_CAPABILITIES 0x08 #define PARAM_LFB_LINELENGTH 0x0c #define PARAM_LFB_WIDTH 0x0e #define PARAM_LFB_HEIGHT 0x10 #define PARAM_LFB_DEPTH 0x12 #define PARAM_LFB_BASE 0x14 #define PARAM_LFB_SIZE 0x18 #define PARAM_LFB_COLORS 0x1c #define PARAM_VESAPM_SEG 0x24 #define PARAM_VESAPM_OFF 0x26 #define PARAM_VESA_ATTRIB 0x28 #define _param(param) bootsym(boot_vid_info)+(param) video: xorw %ax, %ax movw %ax, %gs # GS is zero cld call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA) cmpb $0,_param(PARAM_HAVE_VGA) je 1f # Bail if there's no VGA movw bootsym(boot_vid_mode), %ax # User selected video mode cmpw $ASK_VGA, %ax # Bring up the menu jz vid2 call mode_set # Set the mode jc vid1 leaw bootsym(badmdt), %si # Invalid mode ID call prtstr vid2: call mode_menu vid1: call store_edid call mode_params # Store mode parameters 1: ret # Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel. basic_detect: movb $0, _param(PARAM_HAVE_VGA) movb $0x12, %ah # Check EGA/VGA movb $0x10, %bl int $0x10 cmpb $0x10, %bl # No, it's a CGA/MDA/HGA card. je basret movw $0x1a00, %ax # Check EGA or VGA? int $0x10 cmpb $0x1a, %al # 1a means VGA... jne basret # anything else is EGA. incb _param(PARAM_HAVE_VGA) # We've detected a VGA basret: ret # Store the video mode parameters for later usage by the kernel. # This is done by asking the BIOS except for the rows/columns # parameters in the default 80x25 mode -- these are set directly, # because some very obscure BIOSes supply insane values. mode_params: cmpb $0, bootsym(graphic_mode) jnz mopar_gr movb $0x03, %ah # Read cursor position xorb %bh, %bh int $0x10 movw %dx, _param(PARAM_CURSOR_POS) movb $0x0f, %ah # Read page/mode/width int $0x10 movw %ax, _param(PARAM_VIDEO_MODE) # Video mode and screen width movw %gs:(0x485), %ax # Font size movw %ax, _param(PARAM_FONT_POINTS) # (valid only on EGA/VGA) movw bootsym(force_size), %ax # Forced size? orw %ax, %ax jz mopar1 movb %ah, _param(PARAM_VIDEO_COLS) movb %al, _param(PARAM_VIDEO_LINES) ret mopar1: movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS incb %al # location of max lines. 
mopar2: movb %al, _param(PARAM_VIDEO_LINES) ret # Fetching of VESA frame buffer parameters mopar_gr: leaw vesa_mode_info, %di movb $0x23, _param(PARAM_HAVE_VGA) movw 16(%di), %ax movw %ax, _param(PARAM_LFB_LINELENGTH) movw 18(%di), %ax movw %ax, _param(PARAM_LFB_WIDTH) movw 20(%di), %ax movw %ax, _param(PARAM_LFB_HEIGHT) movb 25(%di), %al movb $0, %ah movw %ax, _param(PARAM_LFB_DEPTH) movl 40(%di), %eax movl %eax, _param(PARAM_LFB_BASE) movl 31(%di), %eax movl %eax, _param(PARAM_LFB_COLORS) movl 35(%di), %eax movl %eax, _param(PARAM_LFB_COLORS+4) movw 0(%di), %ax movw %ax, _param(PARAM_VESA_ATTRIB) # get video mem size leaw vesa_glob_info, %di xorl %eax, %eax movw 18(%di), %ax movl %eax, _param(PARAM_LFB_SIZE) # store mode capabilities movl 10(%di), %eax movl %eax, _param(PARAM_CAPABILITIES) # switching the DAC to 8-bit is for <= 8 bpp only movw _param(PARAM_LFB_DEPTH), %ax cmpw $8, %ax jg dac_done # get DAC switching capability xorl %eax, %eax movb 10(%di), %al testb $1, %al jz dac_set # attempt to switch DAC to 8-bit movw $0x4f08, %ax movw $0x0800, %bx int $0x10 cmpw $0x004f, %ax jne dac_set movb %bh, bootsym(dac_size) # store actual DAC size dac_set: # set color size to DAC size movb bootsym(dac_size), %al movb %al, _param(PARAM_LFB_COLORS+0) movb %al, _param(PARAM_LFB_COLORS+2) movb %al, _param(PARAM_LFB_COLORS+4) movb %al, _param(PARAM_LFB_COLORS+6) # set color offsets to 0 movb $0, _param(PARAM_LFB_COLORS+1) movb $0, _param(PARAM_LFB_COLORS+3) movb $0, _param(PARAM_LFB_COLORS+5) movb $0, _param(PARAM_LFB_COLORS+7) dac_done: # get protected mode interface informations movw $0x4f0a, %ax xorw %bx, %bx xorw %di, %di int $0x10 cmp $0x004f, %ax jnz no_pm movw %es, _param(PARAM_VESAPM_SEG) movw %di, _param(PARAM_VESAPM_OFF) no_pm: pushw %ds popw %es ret # The video mode menu mode_menu: leaw bootsym(keymsg), %si # "Return/Space/Timeout" message call prtstr call flush nokey: call getkt cmpb $0x0d, %al # ENTER ? je listm # yes - manual mode selection cmpb $0x20, %al # SPACE ? je defmd1 # no - repeat call beep jmp nokey defmd1: ret # No mode chosen? Default 80x25 listm: call mode_table # List mode table listm0: leaw bootsym(name_bann), %si # Print adapter name call prtstr movw bootsym(card_name), %si orw %si, %si jnz an2 leaw bootsym(vga_name), %si jmp an1 an2: call prtstr leaw bootsym(svga_name), %si an1: call prtstr leaw bootsym(listhdr), %si # Table header call prtstr movb $0x30, %dl # DL holds mode number leaw modelist, %si lm1: cmpw $ASK_VGA, (%si) # End? jz lm2 movb %dl, %al # Menu selection number call prtchr call prtsp2 lodsw call prthw # Mode ID call prtsp2 lodsw call prtdec # Width movb $0x78, %al # the letter 'x' call prtchr lodsw call prtdec # Height testb $0xff,(%si) jnz 1f push %si leaw bootsym(textmode), %si call prtstr pop %si lodsw jmp 2f 1: movb $0x78, %al # the letter 'x' call prtchr lodsw call prtdec # Depth 2: movb $0x0d, %al # New line call prtchr movb $0x0a, %al call prtchr incb %dl # Next character cmpb $'z'+1, %dl jnz skip_bail leaw bootsym(menu_bail_msg), %si call prtstr jmp lm2 skip_bail: cmpb $'i', %dl jnz skip_pause push %si push %dx leaw bootsym(menu_more_msg), %si # '' call prtstr call flush 1: call getkey cmpb $0x20, %al # SPACE ? jne 1b # yes - manual mode selection leaw bootsym(crlft), %si call prtstr pop %dx pop %si skip_pause: cmpb $'9'+1, %dl jnz lm1 movb $'a', %dl jmp lm1 lm2: leaw bootsym(prompt), %si # Mode prompt call prtstr leaw bootsym(edit_buf), %di # Editor buffer lm3: call getkey cmpb $0x0d, %al # Enter? jz lment cmpb $0x08, %al # Backspace? 
jz lmbs cmpb $0x20, %al # Printable? jc lm3 cmpw $bootsym(edit_buf)+4, %di # Enough space? jz lm3 stosb call prtchr jmp lm3 lmbs: cmpw $bootsym(edit_buf), %di # Backspace jz lm3 decw %di movb $0x08, %al call prtchr call prtspc movb $0x08, %al call prtchr jmp lm3 lment: movb $0, (%di) leaw bootsym(crlft), %si call prtstr leaw bootsym(edit_buf), %si cmpb $0, (%si) # Empty string = default mode jz lmdef cmpb $0, 1(%si) # One character = menu selection jz mnusel cmpw $0x656d, (%si) # 'me' jnz lmhx cmpw $0x756e, 2(%si) # 'nu' jnz lmhx jmp listm lmhx: xorw %bx, %bx # Else => mode ID in hex lmhex: lodsb orb %al, %al jz lmuse1 subb $0x30, %al jc lmbad cmpb $10, %al jc lmhx1 subb $7, %al andb $0xdf, %al cmpb $10, %al jc lmbad cmpb $16, %al jnc lmbad lmhx1: shlw $4, %bx orb %al, %bl jmp lmhex lmuse1: movw %bx, %ax jmp lmuse mnusel: lodsb # Menu selection xorb %ah, %ah subb $0x30, %al jc lmbad cmpb $10, %al jc lmuse cmpb $0x61-0x30, %al jc lmbad subb $0x61-0x30-10, %al cmpb $36, %al jnc lmbad lmuse: call mode_set jc lmdef lmbad: leaw bootsym(unknt), %si call prtstr jmp mode_menu lmdef: ret _setrec: jmp setrec # Ugly... _set_80x25: jmp set_80x25 # Setting of user mode (AX=mode ID) => CF=success mode_set: movw %ax, bootsym(boot_vid_mode) movw %ax, %bx cmpw $VIDEO_VESA_BY_SIZE, %ax je setvesabysize testb $VIDEO_RECALC>>8, %ah jnz _setrec cmpb $VIDEO_FIRST_SPECIAL>>8, %ah jz setspc cmpb $VIDEO_FIRST_VESA>>8, %ah jnc check_vesa orb %ah, %ah jnz setbad jmp setmenu setbad: clc ret setspc: xorb %bh, %bh # Set special mode cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl jnc setbad addw %bx, %bx jmp *bootsym(spec_inits)(%bx) setmenu: orb %al, %al # 80x25 is an exception jz _set_80x25 pushw %bx # Set mode chosen from menu call mode_table # Build the mode table popw %ax shlw $3, %ax addw %ax, %si cmpw %di, %si jnc setbad movw (%si), %ax # Fetch mode ID jmp mode_set check_vesa: leaw vesa_glob_info, %di movw $0x4f00, %ax int $0x10 cmpw $0x004f, %ax jnz setbad leaw vesa_mode_info, %di subb $VIDEO_FIRST_VESA>>8, %bh movw %bx, %cx # Get mode information structure movw $0x4f01, %ax int $0x10 addb $VIDEO_FIRST_VESA>>8, %bh cmpw $0x004f, %ax jnz setbad movb (%di), %al # Check mode attributes. andb $0x99, %al cmpb $0x99, %al jnz _setbad # Doh! No linear frame buffer. subb $VIDEO_FIRST_VESA>>8, %bh orw $0x4000, %bx # Use linear frame buffer movw $0x4f02, %ax # VESA BIOS mode set call int $0x10 cmpw $0x004f, %ax # AL=4f if implemented jnz _setbad # AH=0 if OK movb $1, bootsym(graphic_mode) # flag graphic mode stc ret _setbad: jmp setbad # Ugly... # Recalculate vertical display end registers -- this fixes various # inconsistencies of extended modes on many adapters. Called when # the VIDEO_RECALC flag is set in the mode ID. setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode call mode_set jnc rct3 movw %gs:(0x485), %ax # Font size in pixels movb %gs:(0x484), %bl # Number of rows incb %bl mulb %bl # Number of visible decw %ax # scan lines - 1 movw $0x3d4, %dx movw %ax, %bx movb $0x12, %al # Lower 8 bits movb %bl, %ah outw %ax, %dx movb $0x07, %al # Bits 8 and 9 in the overflow register call inidx xchgb %al, %ah andb $0xbd, %ah shrb %bh jnc rct1 orb $0x02, %ah rct1: shrb %bh jnc rct2 orb $0x40, %ah rct2: movb $0x07, %al outw %ax, %dx stc rct3: ret inidx: outb %al, %dx # Read from indexed VGA register incw %dx # AL=index, DX=index reg port -> AL=data inb %dx, %al decw %dx ret setvesabysize: call mode_table leaw modelist,%si 1: add $8,%si cmpw $ASK_VGA,-8(%si) # End? 
je _setbad movw -6(%si),%ax cmpw %ax,bootsym(vesa_size)+0 jne 1b movw -4(%si),%ax cmpw %ax,bootsym(vesa_size)+2 jne 1b movw -2(%si),%ax cmpw %ax,bootsym(vesa_size)+4 jne 1b movw -8(%si),%ax movw %ax,%bx movw %ax,bootsym(boot_vid_mode) jmp check_vesa # Table of routines for setting of the special modes. spec_inits: .word bootsym(set_80x25) .word bootsym(set_8pixel) .word bootsym(set_80x43) .word bootsym(set_80x28) .word bootsym(set_current) .word bootsym(set_80x30) .word bootsym(set_80x34) .word bootsym(set_80x60) # Set the 80x25 mode. If already set, do nothing. set_80x25: movw $0x5019, bootsym(force_size) # Override possibly broken BIOS use_80x25: movw $0x1202, %ax # Force 400 scan lines movb $0x30, %bl int $0x10 movw $0x0003, %ax # Mode 3 int $0x10 stc ret # Set the 80x50/80x43 8-pixel mode. Simple BIOS calls. set_8pixel: call use_80x25 # The base is 80x25 set_8pt: movw $0x1112, %ax # Use 8x8 font xorb %bl, %bl int $0x10 movw $0x1200, %ax # Use alternate print screen movb $0x20, %bl int $0x10 movw $0x1201, %ax # Turn off cursor emulation movb $0x34, %bl int $0x10 movb $0x01, %ah # Define cursor scan lines 6-7 movw $0x0607, %cx int $0x10 stc ret # Set the 80x28 mode. This mode works on all VGA's, because it's a standard # 80x25 mode with 14-point fonts instead of 16-point. set_80x28: call use_80x25 # The base is 80x25 set14: movw $0x1111, %ax # Use 9x14 font xorb %bl, %bl int $0x10 movb $0x01, %ah # Define cursor scan lines 11-12 movw $0x0b0c, %cx int $0x10 set_current: stc ret # Set the 80x43 mode. This mode is works on all VGA's. # It's a 350-scanline mode with 8-pixel font. set_80x43: movw $0x1201, %ax # Set 350 scans movb $0x30, %bl int $0x10 movw $0x0003, %ax # Reset video mode int $0x10 jmp set_8pt # Use 8-pixel font # Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font. set_80x30: call use_80x25 # Start with real 80x25 movw $0x3cc, %dx # Get CRTC port inb %dx, %al movb $0xd4, %dl rorb %al # Mono or color? jc set48a movb $0xb4, %dl set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7) call outidx movw $0x0b06, %ax # Vertical total call outidx movw $0x3e07, %ax # (Vertical) overflow call outidx movw $0xea10, %ax # Vertical sync start call outidx movw $0xdf12, %ax # Vertical display end call outidx movw $0xe715, %ax # Vertical blank start call outidx movw $0x0416, %ax # Vertical blank end call outidx pushw %dx movb $0xcc, %dl # Misc output register (read) inb %dx, %al movb $0xc2, %dl # (write) andb $0x0d, %al # Preserve clock select bits and color bit orb $0xe2, %al # Set correct sync polarity outb %al, %dx popw %dx movw $0x501e, bootsym(force_size) stc # That's all. ret # Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font. set_80x34: call set_80x30 # Set 480 scans call set14 # And 14-pt font movw $0xdb12, %ax # VGA vertical display end movw $0x5022, bootsym(force_size) setvde: call outidx stc ret # Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font. set_80x60: call set_80x30 # Set 480 scans call set_8pt # And 8-pt font movw $0xdf12, %ax # VGA vertical display end movw $0x503c, bootsym(force_size) jmp setvde # Write to indexed VGA register (AL=index, AH=data, DX=index reg. port) outidx: outb %al, %dx pushw %ax movb %ah, %al incw %dx outb %al, %dx decw %dx popw %ax ret # Build the table of video modes (stored after the setup.S code at the # `modelist' label. 
Each video mode record looks like: # .word MODE-ID (our special mode ID (see above)) # .byte rows (number of rows) # .byte columns (number of columns) # Returns address of the end of the table in DI, the end is marked # with a ASK_VGA ID. mode_table: movw bootsym(mt_end), %di # Already filled? orw %di, %di jnz mtab1 leaw modelist, %di # Store standard modes: movw $VIDEO_80x25,(%di) # The 80x25 mode (ALL) movw $0x50,2(%di) movw $0x19,4(%di) movw $0x00,6(%di) addw $8,%di leaw bootsym(vga_modes), %si # All modes for std VGA movw $vga_modes_end-vga_modes, %cx rep movsb call vesa_modes # Detect VESA VGA modes movw $ASK_VGA, (%di) # End marker movw %di, bootsym(mt_end) mtab1: leaw modelist, %si # SI=mode list, DI=list end ret0: ret # Modes usable on all standard VGAs vga_modes: .word VIDEO_80x50, 0x50,0x32,0 # 80x50 .word VIDEO_80x43, 0x50,0x2b,0 # 80x43 .word VIDEO_80x28, 0x50,0x1c,0 # 80x28 .word VIDEO_80x30, 0x50,0x1e,0 # 80x30 .word VIDEO_80x34, 0x50,0x22,0 # 80x34 .word VIDEO_80x60, 0x50,0x3c,0 # 80x60 vga_modes_end: # Detect VESA modes. vesa_modes: movw %di, %bp # BP=original mode table end leaw vesa_glob_info, %di movw $0x4f00, %ax # VESA Get card info call int $0x10 movw %di, %si movw %bp, %di cmpw $0x004f, %ax # Successful? jnz ret0 cmpw $0x4556, (%si) # 'VE' jnz ret0 cmpw $0x4153, 2(%si) # 'SA' jnz ret0 movw $bootsym(vesa_name), bootsym(card_name) # Set name to "VESA VGA" pushw %gs lgsw 0xe(%si), %si # GS:SI=mode list movw $128, %cx # Iteration limit vesa1: gs; lodsw cmpw $0xffff, %ax # End of the table? jz vesar cmpw $0x0080, %ax # Check validity of mode ID jc vesa2 orb %ah, %ah # Valid IDs 0x0000-0x007f/0x0100-0x07ff jz vesan # Certain BIOSes report 0x80-0xff! cmpw $0x0800, %ax jnc vesae vesa2: pushw %cx movw %ax, %cx # Get mode information structure movw $0x4f01, %ax int $0x10 movw %cx, %bx # BX=mode number addb $VIDEO_FIRST_VESA>>8, %bh popw %cx cmpw $0x004f, %ax jnz vesan # Don't report errors (buggy BIOSES) movb (%di), %al # Check capabilities. andb $0x9b, %al # LFB gfx mode in color? cmpb $0x9b, %al jnz vesan movw %bx, (%di) # Store mode number movw 0x12(%di), %bx # Width movw %bx, 2(%di) movw 0x14(%di), %bx # Height movw %bx, 4(%di) xorw %bx, %bx movb 0x19(%di), %bl # Depth movw %bx, 6(%di) addw $8, %di # The mode is valid. Store it. vesan: loop vesa1 # Next mode. Limit exceeded => error vesae: leaw bootsym(vesaer), %si call prtstr movw %bp, %di # Discard already found modes. vesar: popw %gs ret # Read a key and return the ASCII code in al, scan code in ah getkey: xorb %ah, %ah int $0x16 ret # Read a key with a timeout of 30 seconds. # The hardware clock is used to get the time. getkt: call gettime addb $30, %al # Wait 30 seconds cmpb $60, %al jl lminute subb $60, %al lminute: movb %al, %cl again: movb $0x01, %ah int $0x16 jnz getkey # key pressed, so get it call gettime cmpb %cl, %al jne again movb $0x20, %al # timeout, return `space' ret # Flush the keyboard buffer flush: movb $0x01, %ah int $0x16 jz empty xorb %ah, %ah int $0x16 jmp flush empty: ret # Print hexadecimal number. 
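Each entry that vesa_modes above appends to the mode list is an 8-byte record, and a candidate mode is only accepted when its attribute byte satisfies (attr & 0x9b) == 0x9b, i.e. a supported colour graphics mode with a linear frame buffer. A compact C restatement of that record layout and filter; the type and function names are illustrative.

#include <stdint.h>

struct boot_vid_mode_rec {        // mirrors the 8-byte modelist records
    uint16_t mode;                // mode ID (VESA numbers offset by VIDEO_FIRST_VESA)
    uint16_t width;               // columns (text) or pixels (graphics)
    uint16_t height;              // rows or pixels
    uint16_t depth;               // 0 for text modes, bits per pixel otherwise
};

static int vesa_mode_usable(uint8_t mode_attr)
{
    return (mode_attr & 0x9b) == 0x9b;
}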
prthw: pushw %ax movb %ah, %al call prthb popw %ax prthb: pushw %ax shrb $4, %al call prthn popw %ax andb $0x0f, %al prthn: cmpb $0x0a, %al jc prth1 addb $0x07, %al prth1: addb $0x30, %al jmp prtchr # Print decimal number in ax prtdec: pushw %ax pushw %cx pushw %dx xorw %dx, %dx movw $0x0a, %cx divw %cx testw %ax, %ax jz skip10 cmpw $0x09, %ax jbe lt100 call prtdec jmp skip10 lt100: addb $0x30, %al call prtchr skip10: movb %dl, %al addb $0x30, %al call prtchr popw %dx popw %cx popw %ax ret # Routine to print asciiz string at ds:si prtstr: lodsb andb %al, %al jz fin call prtchr jmp prtstr fin: ret # Space printing prtsp2: call prtspc # Print double space prtspc: movb $0x20, %al # Print single space (note: fall-thru) # Part of above routine, this one just prints ascii al prtchr: pushw %ax pushw %cx movw $7,%bx movw $0x01, %cx movb $0x0e, %ah int $0x10 popw %cx popw %ax ret beep: movb $0x07, %al jmp prtchr # Read the cmos clock. Return the seconds in al gettime: pushw %cx movb $0x02, %ah int $0x1a movb %dh, %al # %dh contains the seconds andb $0x0f, %al movb %dh, %ah movb $0x04, %cl shrb %cl, %ah aad popw %cx ret store_edid: #ifdef CONFIG_FIRMWARE_EDID pushw %ax pushw %bx pushw %cx pushw %dx pushw %di cmpb $1, bootsym(opt_edid) # EDID disabled on cmdline (edid=no)? je .Lno_edid leaw vesa_glob_info, %di movw $0x4f00, %ax int $0x10 cmpw $0x004f, %ax jne .Lno_edid cmpw $0x0200, 4(%di) # only do EDID on >= VBE2.0 jb .Lno_edid xorw %di, %di # Report Capability pushw %di popw %es # ES:DI must be 0:0 movw $0x4f15, %ax xorw %bx, %bx xorw %cx, %cx int $0x10 pushw %ds popw %es cmpw $0x004f, %ax # Call failed? jne .Lno_edid movw %bx, bootsym(boot_edid_caps) cmpb $2, bootsym(opt_edid) # EDID forced on cmdline (edid=force)? je .Lforce_edid /* EDID not forced on cmdline, so perform further sanity checks. */ testb $3,%bl # No DDC capabilities? jz .Lno_edid cmpb $5,%bh # Longer than 5s to read EDID? ja .Lno_edid .Lforce_edid: movw $0x4f15, %ax # do VBE/DDC movw $0x01, %bx movw $0x00, %cx movw $0x00, %dx movw $bootsym(boot_edid_info), %di int $0x10 .Lno_edid: popw %di # restore all registers popw %dx popw %cx popw %bx popw %ax #endif ret opt_edid: .byte 0 # EDID parsing option (force/no/default) mt_end: .word 0 # End of video mode table if built edit_buf: .space 6 # Line editor buffer card_name: .word 0 # Pointer to adapter name graphic_mode: .byte 0 # Graphic mode with a linear frame buffer dac_size: .byte 6 # DAC bit depth # Status messages keymsg: .ascii "Press to see video modes available," .byte 0x0d, 0x0a .ascii " to continue or wait 30 secs" .byte 0x0d, 0x0a, 0 listhdr: .byte 0x0d, 0x0a .ascii "MODE-KEY MODE-ID WIDTHxHEIGHTxDEPTH" crlft: .byte 0x0d, 0x0a, 0 prompt: .byte 0x0d, 0x0a .asciz "Enter mode number or 'menu': " unknt: .ascii "Unknown mode ID. Try again." .byte 0x0d, 0x0a, 0 badmdt: .ascii "You passed an undefined mode number." .byte 0x0d, 0x0a, 0 vesaer: .ascii "Error: Scanning of VESA modes failed. Please " .ascii "report to ." .byte 0x0d, 0x0a, 0 textmode: .asciz " (text)" menu_more_msg: .asciz "" menu_bail_msg: .ascii "" .byte 0x0d, 0x0a, 0 svga_name: .ascii " " vga_name: .asciz "VGA" vesa_name: .asciz "VESA" name_bann: .asciz "Video adapter: " force_size: .word 0 # Use this size instead of the one in BIOS vars vesa_size: .word 0,0,0 # width x depth x height /* If we don't run at all, assume basic video mode 3 at 80x25. 
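store_edid above decides whether to pull EDID over VBE/DDC: 'edid=no' skips it entirely, 'edid=force' reads unconditionally, and the default only reads when the BIOS is at least VBE 2.0, reports some DDC capability, and claims one EDID block transfers in no more than five seconds. A side-effect-free sketch of just that decision; the enum and function names are illustrative, and the caller is assumed to have already made the DDC capability call (int 0x10, AX=0x4f15, BL=0).

#include <stdint.h>

enum edid_opt { EDID_DEFAULT = 0, EDID_NO = 1, EDID_FORCE = 2 };

static int should_read_edid(enum edid_opt opt,
                            uint16_t vbe_version,   // e.g. 0x0200 for VBE 2.0
                            uint8_t ddc_caps,       // BL returned by the capability call
                            uint8_t ddc_seconds)    // BH: seconds per EDID block
{
    if (opt == EDID_NO)
        return 0;
    if (vbe_version < 0x0200)       // only trust VBE 2.0+ implementations
        return 0;
    if (opt == EDID_FORCE)
        return 1;
    if (!(ddc_caps & 3))            // no DDC capability reported
        return 0;
    if (ddc_seconds > 5)            // reading would stall boot too long
        return 0;
    return 1;
}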
*/ GLOBAL(boot_vid_mode) .word VIDEO_80x25 GLOBAL(boot_vid_info) .byte 0, 0 /* orig_x, orig_y */ .byte 3 /* text mode 3 */ .byte 80, 25 /* 80x25 */ .byte 1 /* isVGA */ .word 16 /* 8x16 font */ .fill 0x28,1,0 GLOBAL(boot_edid_info) .fill 128,1,0x13 GLOBAL(boot_edid_caps) .word 0x1313 xen-4.4.0/xen/arch/x86/boot/video.h0000664000175000017500000000156712307313555015032 0ustar smbsmb#ifndef __BOOT_VIDEO_H__ #define __BOOT_VIDEO_H__ /* * Video modes numbered by menu position -- NOT RECOMMENDED because of lack * of compatibility when extending the table. These are between 0x00 and 0xff. */ #define VIDEO_FIRST_MENU 0x0000 /* VESA BIOS video modes (VESA number + 0x0200) */ #define VIDEO_FIRST_VESA 0x0200 /* Special video modes */ #define VIDEO_FIRST_SPECIAL 0x0f00 #define VIDEO_80x25 0x0f00 #define VIDEO_80x50 0x0f01 #define VIDEO_80x43 0x0f02 #define VIDEO_80x28 0x0f03 #define VIDEO_CURRENT_MODE 0x0f04 #define VIDEO_80x30 0x0f05 #define VIDEO_80x34 0x0f06 #define VIDEO_80x60 0x0f07 #define VIDEO_LAST_SPECIAL 0x0f08 #define ASK_VGA 0xfffd #define VIDEO_VESA_BY_SIZE 0xffff /* The "recalculate timings" flag */ #define VIDEO_RECALC 0x8000 #endif /* __BOOT_VIDEO_H__ */ xen-4.4.0/xen/arch/x86/boot/x86_64.S0000664000175000017500000001440712307313555014632 0ustar smbsmb .code64 /* Install relocated data selectors. */ lgdt gdt_descr(%rip) mov $(__HYPERVISOR_DS64),%ecx mov %ecx,%ds mov %ecx,%es mov %ecx,%fs mov %ecx,%gs mov %ecx,%ss /* Enable full CR4 features. */ mov mmu_cr4_features(%rip),%rcx mov %rcx,%cr4 mov stack_start(%rip),%rsp or $(STACK_SIZE-CPUINFO_sizeof),%rsp /* Reset EFLAGS (subsumes CLI and CLD). */ pushq $0 popf /* Reload code selector. */ pushq $(__HYPERVISOR_CS64) leaq 1f(%rip),%rax pushq %rax lretq 1: lidt idt_descr(%rip) test %ebx,%ebx jnz start_secondary /* Initialise IDT with simple error defaults. */ leaq ignore_int(%rip),%rcx movl %ecx,%eax andl $0xFFFF0000,%eax orl $0x00008E00,%eax shlq $32,%rax movl %ecx,%edx andl $0x0000FFFF,%edx orl $(__HYPERVISOR_CS64<<16),%edx orq %rdx,%rax shrq $32,%rcx movl %ecx,%edx leaq idt_table(%rip),%rdi movl $256,%ecx 1: movq %rax,(%rdi) movq %rdx,8(%rdi) addq $16,%rdi loop 1b /* Pass off the Multiboot info structure to C land. */ mov multiboot_ptr(%rip),%edi call __start_xen ud2 /* Force a panic (invalid opcode). */ /* This is the default interrupt handler. 
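The loop above stamps all 256 IDT slots with the same 16-byte interrupt gate for ignore_int: the handler offset split across three fields, the 64-bit hypervisor code selector, and type byte 0x8E (present, DPL 0, interrupt gate). A C sketch of the same packing into the two quadwords the asm stores; the struct name is illustrative and the selector is passed in rather than hard-coded to __HYPERVISOR_CS64.

#include <stdint.h>

struct idt_gate64 {
    uint64_t lo;    // offset[15:0] | sel<<16 | 0x8E00<<32 | offset[31:16]<<48
    uint64_t hi;    // offset[63:32] in the low half, upper half reserved (zero)
};

static struct idt_gate64 make_intr_gate(uint64_t handler, uint16_t sel)
{
    struct idt_gate64 g;

    g.lo = (handler & 0xffffULL)
         | ((uint64_t)sel << 16)
         | (0x8e00ULL << 32)                    // IST=0, type/attr = 0x8E
         | ((handler & 0xffff0000ULL) << 32);   // offset[31:16] -> bits 63:48
    g.hi = handler >> 32;                       // offset[63:32]; top half zero
    return g;
}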
*/ int_msg: .asciz "Unknown interrupt (cr2=%016lx)\n" hex_msg: .asciz " %016lx" ignore_int: SAVE_ALL movq %cr2,%rsi leaq int_msg(%rip),%rdi xorl %eax,%eax call printk movq %rsp,%rbp 0: movq (%rbp),%rsi addq $8,%rbp leaq hex_msg(%rip),%rdi xorl %eax,%eax call printk testq $0xff8,%rbp jnz 0b 1: jmp 1b /*** DESCRIPTOR TABLES ***/ .data .align 8 multiboot_ptr: .long 0 .word 0 GLOBAL(gdt_descr) .word LAST_RESERVED_GDT_BYTE .quad boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE .word 0,0,0 GLOBAL(idt_descr) .word 256*16-1 .quad idt_table GLOBAL(stack_start) .quad cpu0_stack .section .data.page_aligned, "aw", @progbits .align PAGE_SIZE, 0 GLOBAL(boot_cpu_gdt_table) .quad 0x0000000000000000 /* unused */ .quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */ .quad 0x00cf92000000ffff /* 0xe010 ring 0 data */ .quad 0x0000000000000000 /* reserved */ .quad 0x00cffa000000ffff /* 0xe023 ring 3 code, compatibility */ .quad 0x00cff2000000ffff /* 0xe02b ring 3 data */ .quad 0x00affa000000ffff /* 0xe033 ring 3 code, 64-bit mode */ .quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */ .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0 .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */ .align PAGE_SIZE, 0 /* NB. Even rings != 0 get access to the full 4Gb, as only the */ /* (compatibility) machine->physical mapping table lives there. */ GLOBAL(boot_cpu_compat_gdt_table) .quad 0x0000000000000000 /* unused */ .quad 0x00af9a000000ffff /* 0xe008 ring 0 code, 64-bit mode */ .quad 0x00cf92000000ffff /* 0xe010 ring 0 data */ .quad 0x00cfba000000ffff /* 0xe019 ring 1 code, compatibility */ .quad 0x00cfb2000000ffff /* 0xe021 ring 1 data */ .quad 0x00cffa000000ffff /* 0xe02b ring 3 code, compatibility */ .quad 0x00cff2000000ffff /* 0xe033 ring 3 data */ .quad 0x00cf9a000000ffff /* 0xe038 ring 0 code, compatibility */ .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0 .quad 0x0000910000000000 /* per-CPU entry (limit == cpu) */ .align PAGE_SIZE, 0 GLOBAL(__page_tables_start) /* Mapping of first 16 megabytes of memory. */ GLOBAL(l2_identmap) .quad sym_phys(l1_identmap) + __PAGE_HYPERVISOR pfn = 0 .rept 7 pfn = pfn + (1 << PAGETABLE_ORDER) .quad (pfn << PAGE_SHIFT) | PAGE_HYPERVISOR | _PAGE_PSE .endr .fill 4 * L2_PAGETABLE_ENTRIES - 8, 8, 0 .size l2_identmap, . - l2_identmap GLOBAL(l2_xenmap) idx = 0 .rept 8 .quad sym_phys(__image_base__) + (idx << L2_PAGETABLE_SHIFT) + (PAGE_HYPERVISOR | _PAGE_PSE) idx = idx + 1 .endr .fill L2_PAGETABLE_ENTRIES - 8, 8, 0 .size l2_xenmap, . - l2_xenmap l2_fixmap: idx = 0 .rept L2_PAGETABLE_ENTRIES .if idx == l2_table_offset(FIXADDR_TOP - 1) .quad sym_phys(l1_fixmap) + __PAGE_HYPERVISOR .else .quad 0 .endif idx = idx + 1 .endr .size l2_fixmap, . - l2_fixmap GLOBAL(l3_identmap) idx = 0 .rept 4 .quad sym_phys(l2_identmap) + (idx << PAGE_SHIFT) + __PAGE_HYPERVISOR idx = idx + 1 .endr .fill L3_PAGETABLE_ENTRIES - 4, 8, 0 .size l3_identmap, . - l3_identmap l3_xenmap: idx = 0 .rept L3_PAGETABLE_ENTRIES .if idx == l3_table_offset(XEN_VIRT_START) .quad sym_phys(l2_xenmap) + __PAGE_HYPERVISOR .elseif idx == l3_table_offset(FIXADDR_TOP - 1) .quad sym_phys(l2_fixmap) + __PAGE_HYPERVISOR .else .quad 0 .endif idx = idx + 1 .endr .size l3_xenmap, . - l3_xenmap /* Top-level master (and idle-domain) page directory. 
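l2_identmap above covers the first 16 MB of physical memory: slot 0 references l1_identmap so the low 2 MB uses 4 KiB entries (letting the legacy VGA range be mapped uncacheable), while slots 1 through 7 are 2 MB superpages. A C sketch of how those seven superpage entries are formed; the flag values below are placeholders standing in for the real PAGE_HYPERVISOR and _PAGE_PSE definitions.

#include <stdint.h>

#define SKETCH_PAGE_SHIFT        12
#define SKETCH_PAGETABLE_ORDER    9            // 512 entries per level
#define SKETCH_PAGE_HYPERVISOR   0x63ULL       // placeholder: present|rw|accessed|dirty
#define SKETCH_PAGE_PSE          0x80ULL       // 2 MB superpage bit

static void fill_l2_identmap_superpages(uint64_t *l2)   // l2 table of 512 entries
{
    unsigned long pfn = 0;
    unsigned int i;

    // Slot 0 (the l1_identmap pointer for the low 2 MB) is set elsewhere.
    for (i = 1; i < 8; i++) {                  // slots 1..7 cover 2 MB .. 16 MB
        pfn += 1UL << SKETCH_PAGETABLE_ORDER;  // advance by 512 small pages = 2 MB
        l2[i] = ((uint64_t)pfn << SKETCH_PAGE_SHIFT)
              | SKETCH_PAGE_HYPERVISOR | SKETCH_PAGE_PSE;
    }
}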
*/ GLOBAL(idle_pg_table) .quad sym_phys(l3_bootmap) + __PAGE_HYPERVISOR idx = 1 .rept L4_PAGETABLE_ENTRIES - 1 .if idx == l4_table_offset(DIRECTMAP_VIRT_START) .quad sym_phys(l3_identmap) + __PAGE_HYPERVISOR .elseif idx == l4_table_offset(XEN_VIRT_START) .quad sym_phys(l3_xenmap) + __PAGE_HYPERVISOR .else .quad 0 .endif idx = idx + 1 .endr .size idle_pg_table, . - idle_pg_table GLOBAL(__page_tables_end) xen-4.4.0/xen/arch/x86/boot/mkelf32.c0000664000175000017500000003127012307313555015154 0ustar smbsmb/****************************************************************************** * mkelf32.c * * Usage: elf-prefix * * Converts an Elf32 or Elf64 executable binary into a simple Elf32 * image comprising a single chunk to be loaded at . */ #include #include #include #include #include #include #include #include #include #define u8 uint8_t #define u16 uint16_t #define u32 uint32_t #define u64 uint64_t #define s8 int8_t #define s16 int16_t #define s32 int32_t #define s64 int64_t #include "../../../include/xen/elfstructs.h" #define DYNAMICALLY_FILLED 0 #define RAW_OFFSET 128 static Elf32_Ehdr out_ehdr = { { ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3, /* EI_MAG{0-3} */ ELFCLASS32, /* EI_CLASS */ ELFDATA2LSB, /* EI_DATA */ EV_CURRENT, /* EI_VERSION */ 0, 0, 0, 0, 0, 0, 0, 0, 0 }, /* e_ident */ ET_EXEC, /* e_type */ EM_386, /* e_machine */ EV_CURRENT, /* e_version */ DYNAMICALLY_FILLED, /* e_entry */ sizeof(Elf32_Ehdr), /* e_phoff */ DYNAMICALLY_FILLED, /* e_shoff */ 0, /* e_flags */ sizeof(Elf32_Ehdr), /* e_ehsize */ sizeof(Elf32_Phdr), /* e_phentsize */ 1, /* e_phnum */ sizeof(Elf32_Shdr), /* e_shentsize */ 3, /* e_shnum */ 2 /* e_shstrndx */ }; static Elf32_Phdr out_phdr = { PT_LOAD, /* p_type */ RAW_OFFSET, /* p_offset */ DYNAMICALLY_FILLED, /* p_vaddr */ DYNAMICALLY_FILLED, /* p_paddr */ DYNAMICALLY_FILLED, /* p_filesz */ DYNAMICALLY_FILLED, /* p_memsz */ PF_R|PF_W|PF_X, /* p_flags */ 64 /* p_align */ }; static u8 out_shstrtab[] = "\0.text\0.shstrtab"; static Elf32_Shdr out_shdr[] = { { 0 }, { 1, /* sh_name */ SHT_PROGBITS, /* sh_type */ SHF_WRITE|SHF_ALLOC|SHF_EXECINSTR, /* sh_flags */ DYNAMICALLY_FILLED, /* sh_addr */ RAW_OFFSET, /* sh_offset */ DYNAMICALLY_FILLED, /* sh_size */ 0, /* sh_link */ 0, /* sh_info */ 64, /* sh_addralign */ 0 /* sh_entsize */ }, { 7, /* sh_name */ SHT_STRTAB, /* sh_type */ 0, /* sh_flags */ 0, /* sh_addr */ DYNAMICALLY_FILLED, /* sh_offset */ sizeof(out_shstrtab), /* sh_size */ 0, /* sh_link */ 0, /* sh_info */ 1, /* sh_addralign */ 0 /* sh_entsize */ } }; /* Some system header files define these macros and pollute our namespace. 
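The structures above are mostly static; main() later fills the DYNAMICALLY_FILLED slots so the output file is laid out as: ELF header, one program header, padding up to RAW_OFFSET (128 bytes), the raw payload, the three section headers, then the section-name string table. A small sketch of those offset calculations, with illustrative names; shdr_bytes stands for sizeof(out_shdr).

#include <stdint.h>

struct mkelf32_layout {
    uint32_t payload_off;   // where the converted image's bytes start (== RAW_OFFSET)
    uint32_t e_shoff;       // section headers follow the payload
    uint32_t shstrtab_off;  // string table follows the three section headers
};

static struct mkelf32_layout compute_layout(uint32_t raw_offset,
                                            uint32_t dat_siz,
                                            uint32_t shdr_bytes)
{
    struct mkelf32_layout l;

    l.payload_off  = raw_offset;
    l.e_shoff      = raw_offset + dat_siz;
    l.shstrtab_off = raw_offset + dat_siz + shdr_bytes;
    return l;
}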
*/ #undef swap16 #undef swap32 #undef swap64 #define swap16(_v) ((((u16)(_v)>>8)&0xff)|(((u16)(_v)&0xff)<<8)) #define swap32(_v) (((u32)swap16((u16)(_v))<<16)|(u32)swap16((u32)((_v)>>16))) #define swap64(_v) (((u64)swap32((u32)(_v))<<32)|(u64)swap32((u32)((_v)>>32))) static int big_endian; static void endianadjust_ehdr32(Elf32_Ehdr *eh) { if ( !big_endian ) return; eh->e_type = swap16(eh->e_type); eh->e_machine = swap16(eh->e_machine); eh->e_version = swap32(eh->e_version); eh->e_entry = swap32(eh->e_entry); eh->e_phoff = swap32(eh->e_phoff); eh->e_shoff = swap32(eh->e_shoff); eh->e_flags = swap32(eh->e_flags); eh->e_ehsize = swap16(eh->e_ehsize); eh->e_phentsize = swap16(eh->e_phentsize); eh->e_phnum = swap16(eh->e_phnum); eh->e_shentsize = swap16(eh->e_shentsize); eh->e_shnum = swap16(eh->e_shnum); eh->e_shstrndx = swap16(eh->e_shstrndx); } static void endianadjust_ehdr64(Elf64_Ehdr *eh) { if ( !big_endian ) return; eh->e_type = swap16(eh->e_type); eh->e_machine = swap16(eh->e_machine); eh->e_version = swap32(eh->e_version); eh->e_entry = swap64(eh->e_entry); eh->e_phoff = swap64(eh->e_phoff); eh->e_shoff = swap64(eh->e_shoff); eh->e_flags = swap32(eh->e_flags); eh->e_ehsize = swap16(eh->e_ehsize); eh->e_phentsize = swap16(eh->e_phentsize); eh->e_phnum = swap16(eh->e_phnum); eh->e_shentsize = swap16(eh->e_shentsize); eh->e_shnum = swap16(eh->e_shnum); eh->e_shstrndx = swap16(eh->e_shstrndx); } static void endianadjust_phdr32(Elf32_Phdr *ph) { if ( !big_endian ) return; ph->p_type = swap32(ph->p_type); ph->p_offset = swap32(ph->p_offset); ph->p_vaddr = swap32(ph->p_vaddr); ph->p_paddr = swap32(ph->p_paddr); ph->p_filesz = swap32(ph->p_filesz); ph->p_memsz = swap32(ph->p_memsz); ph->p_flags = swap32(ph->p_flags); ph->p_align = swap32(ph->p_align); } static void endianadjust_phdr64(Elf64_Phdr *ph) { if ( !big_endian ) return; ph->p_type = swap32(ph->p_type); ph->p_flags = swap32(ph->p_flags); ph->p_offset = swap64(ph->p_offset); ph->p_vaddr = swap64(ph->p_vaddr); ph->p_paddr = swap64(ph->p_paddr); ph->p_filesz = swap64(ph->p_filesz); ph->p_memsz = swap64(ph->p_memsz); ph->p_align = swap64(ph->p_align); } static void endianadjust_shdr32(Elf32_Shdr *sh) { if ( !big_endian ) return; sh->sh_name = swap32(sh->sh_name); sh->sh_type = swap32(sh->sh_type); sh->sh_flags = swap32(sh->sh_flags); sh->sh_addr = swap32(sh->sh_addr); sh->sh_offset = swap32(sh->sh_offset); sh->sh_size = swap32(sh->sh_size); sh->sh_link = swap32(sh->sh_link); sh->sh_info = swap32(sh->sh_info); sh->sh_addralign = swap32(sh->sh_addralign); sh->sh_entsize = swap32(sh->sh_entsize); } static void do_write(int fd, void *data, int len) { int done, left = len; char *p = data; while ( left != 0 ) { if ( (done = write(fd, p, left)) == -1 ) { if ( errno == EINTR ) continue; fprintf(stderr, "Error writing output image: %d (%s).\n", errno, strerror(errno)); exit(1); } left -= done; p += done; } } static void do_read(int fd, void *data, int len) { int done, left = len; char *p = data; while ( left != 0 ) { if ( (done = read(fd, p, left)) == -1 ) { if ( errno == EINTR ) continue; fprintf(stderr, "Error reading input image: %d (%s).\n", errno, strerror(errno)); exit(1); } left -= done; p += done; } } int main(int argc, char **argv) { u64 final_exec_addr; u32 loadbase, dat_siz, mem_siz; char *inimage, *outimage; int infd, outfd; char buffer[1024]; int bytes, todo, i; Elf32_Ehdr in32_ehdr; Elf32_Phdr in32_phdr; Elf64_Ehdr in64_ehdr; Elf64_Phdr in64_phdr; if ( argc != 5 ) { fprintf(stderr, "Usage: mkelf32 " " \n"); return 1; } inimage = 
argv[1]; outimage = argv[2]; loadbase = strtoul(argv[3], NULL, 16); final_exec_addr = strtoull(argv[4], NULL, 16); infd = open(inimage, O_RDONLY); if ( infd == -1 ) { fprintf(stderr, "Failed to open input image '%s': %d (%s).\n", inimage, errno, strerror(errno)); return 1; } do_read(infd, &in32_ehdr, sizeof(in32_ehdr)); if ( !IS_ELF(in32_ehdr) || (in32_ehdr.e_ident[EI_DATA] != ELFDATA2LSB) ) { fprintf(stderr, "Input image must be a little-endian Elf image.\n"); return 1; } big_endian = (*(u16 *)in32_ehdr.e_ident == ((ELFMAG0 << 8) | ELFMAG1)); endianadjust_ehdr32(&in32_ehdr); switch ( in32_ehdr.e_ident[EI_CLASS] ) { case ELFCLASS32: if ( in32_ehdr.e_phentsize != sizeof(in32_phdr) ) { fprintf(stderr, "Bad program header size (%d != %d).\n", (int)in32_ehdr.e_phentsize, (int)sizeof(in32_phdr)); return 1; } if ( in32_ehdr.e_phnum != 1 ) { fprintf(stderr, "Expect precisely 1 program header; found %d.\n", (int)in32_ehdr.e_phnum); return 1; } (void)lseek(infd, in32_ehdr.e_phoff, SEEK_SET); do_read(infd, &in32_phdr, sizeof(in32_phdr)); endianadjust_phdr32(&in32_phdr); (void)lseek(infd, in32_phdr.p_offset, SEEK_SET); dat_siz = (u32)in32_phdr.p_filesz; /* Do not use p_memsz: it does not include BSS alignment padding. */ /*mem_siz = (u32)in32_phdr.p_memsz;*/ mem_siz = (u32)(final_exec_addr - in32_phdr.p_vaddr); break; case ELFCLASS64: (void)lseek(infd, 0, SEEK_SET); do_read(infd, &in64_ehdr, sizeof(in64_ehdr)); endianadjust_ehdr64(&in64_ehdr); if ( in64_ehdr.e_phentsize != sizeof(in64_phdr) ) { fprintf(stderr, "Bad program header size (%d != %d).\n", (int)in64_ehdr.e_phentsize, (int)sizeof(in64_phdr)); return 1; } if ( in64_ehdr.e_phnum != 1 ) { fprintf(stderr, "Expect precisly 1 program header; found %d.\n", (int)in64_ehdr.e_phnum); return 1; } (void)lseek(infd, in64_ehdr.e_phoff, SEEK_SET); do_read(infd, &in64_phdr, sizeof(in64_phdr)); endianadjust_phdr64(&in64_phdr); (void)lseek(infd, in64_phdr.p_offset, SEEK_SET); dat_siz = (u32)in64_phdr.p_filesz; /* Do not use p_memsz: it does not include BSS alignment padding. */ /*mem_siz = (u32)in64_phdr.p_memsz;*/ mem_siz = (u32)(final_exec_addr - in64_phdr.p_vaddr); break; default: fprintf(stderr, "Input image must be a 32- or 64-bit Elf image.\n"); return 1; } /* * End the image on a page boundary. This gets round alignment bugs * in the boot- or chain-loader (e.g., kexec on the XenoBoot CD). */ mem_siz += -(loadbase + mem_siz) & 0xfff; out_ehdr.e_entry = loadbase; out_ehdr.e_shoff = RAW_OFFSET + dat_siz; out_phdr.p_vaddr = loadbase; out_phdr.p_paddr = loadbase; out_phdr.p_filesz = dat_siz; out_phdr.p_memsz = mem_siz; out_shdr[1].sh_addr = loadbase; out_shdr[1].sh_size = dat_siz; out_shdr[2].sh_offset = RAW_OFFSET + dat_siz + sizeof(out_shdr); outfd = open(outimage, O_WRONLY|O_CREAT|O_TRUNC, 0775); if ( outfd == -1 ) { fprintf(stderr, "Failed to open output image '%s': %d (%s).\n", outimage, errno, strerror(errno)); return 1; } endianadjust_ehdr32(&out_ehdr); do_write(outfd, &out_ehdr, sizeof(out_ehdr)); endianadjust_phdr32(&out_phdr); do_write(outfd, &out_phdr, sizeof(out_phdr)); if ( (bytes = RAW_OFFSET - sizeof(out_ehdr) - sizeof(out_phdr)) < 0 ) { fprintf(stderr, "Header overflow.\n"); return 1; } do_write(outfd, buffer, bytes); for ( bytes = 0; bytes < dat_siz; bytes += todo ) { todo = ((dat_siz - bytes) > sizeof(buffer)) ? 
sizeof(buffer) : (dat_siz - bytes); do_read(infd, buffer, todo); do_write(outfd, buffer, todo); } for ( i = 0; i < (sizeof(out_shdr) / sizeof(out_shdr[0])); i++ ) endianadjust_shdr32(&out_shdr[i]); do_write(outfd, &out_shdr[0], sizeof(out_shdr)); do_write(outfd, out_shstrtab, sizeof(out_shstrtab)); do_write(outfd, buffer, 4-((sizeof(out_shstrtab)+dat_siz)&3)); close(infd); close(outfd); return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/shutdown.c0000664000175000017500000004312712307313555014625 0ustar smbsmb/****************************************************************************** * arch/x86/shutdown.c * * x86-specific shutdown handling. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include enum reboot_type { BOOT_TRIPLE = 't', BOOT_KBD = 'k', BOOT_ACPI = 'a', BOOT_CF9 = 'p', }; static long no_idt[2]; static int reboot_mode; /* * reboot=b[ios] | t[riple] | k[bd] | n[o] [, [w]arm | [c]old] * warm Don't set the cold reboot flag * cold Set the cold reboot flag * triple Force a triple fault (init) * kbd Use the keyboard controller. cold reset (default) * acpi Use the RESET_REG in the FADT * pci Use the so-called "PCI reset register", CF9 */ static enum reboot_type reboot_type = BOOT_ACPI; static void __init set_reboot_type(char *str) { for ( ; ; ) { switch ( *str ) { case 'n': /* no reboot */ opt_noreboot = 1; break; case 'w': /* "warm" reboot (no memory testing etc) */ reboot_mode = 0x1234; break; case 'c': /* "cold" reboot (with memory testing etc) */ reboot_mode = 0x0; break; case 'a': case 'k': case 't': case 'p': reboot_type = *str; break; } if ( (str = strchr(str, ',')) == NULL ) break; str++; } } custom_param("reboot", set_reboot_type); static inline void kb_wait(void) { int i; for ( i = 0; i < 0x10000; i++ ) if ( (inb_p(0x64) & 0x02) == 0 ) break; } static void __attribute__((noreturn)) __machine_halt(void *unused) { local_irq_disable(); for ( ; ; ) halt(); } void machine_halt(void) { watchdog_disable(); console_start_sync(); local_irq_enable(); smp_call_function(__machine_halt, NULL, 0); __machine_halt(NULL); } static int __init override_reboot(struct dmi_system_id *d) { enum reboot_type type = (long)d->driver_data; if ( reboot_type != type ) { static const char *__initdata msg[] = { [BOOT_KBD] = "keyboard controller", [BOOT_CF9] = "PCI", }; reboot_type = type; ASSERT(type >= 0 && type < ARRAY_SIZE(msg) && msg[type]); printk("%s series board detected. 
Selecting %s reboot method.\n", d->ident, msg[type]); } return 0; } static struct dmi_system_id __initdata reboot_dmi_table[] = { { /* Handle problems with rebooting on Dell E520's */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell E520", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"), }, }, { /* Handle problems with rebooting on Dell 1300's */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell PowerEdge 1300", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"), }, }, { /* Handle problems with rebooting on Dell 300's */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell PowerEdge 300", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), }, }, { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell OptiPlex 745", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), }, }, { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell OptiPlex 745", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), DMI_MATCH(DMI_BOARD_NAME, "0MM599"), }, }, { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell OptiPlex 745", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), DMI_MATCH(DMI_BOARD_NAME, "0KW626"), }, }, { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell OptiPlex 330", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"), DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell OptiPlex 360", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell OptiPlex 760", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), DMI_MATCH(DMI_BOARD_NAME, "0G919G"), }, }, { /* Handle problems with rebooting on Dell 2400's */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell PowerEdge 2400", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), }, }, { /* Handle problems with rebooting on Dell T5400's */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell Precision T5400", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), }, }, { /* Handle problems with rebooting on Dell T7400's */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell Precision T7400", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation 
T7400"), }, }, { /* Handle problems with rebooting on HP laptops */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "HP Compaq Laptop", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), }, }, { /* Handle problems with rebooting on Dell XPS710 */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell XPS710", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), }, }, { /* Handle problems with rebooting on Dell DXP061 */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Dell DXP061", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), }, }, { /* Handle problems with rebooting on Sony VGN-Z540N */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Sony VGN-Z540N", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), }, }, { /* Handle problems with rebooting on ASUS P4S800 */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "ASUS P4S800", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, { /* Handle reboot issue on Acer Aspire one */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_KBD, .ident = "Acer Aspire One A110", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Acer"), DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, { /* Handle problems with rebooting on Apple MacBook5 */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Apple MacBook5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, { /* Handle problems with rebooting on Apple MacBookPro5 */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Apple MacBookPro5", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, { /* Handle problems with rebooting on Apple Macmini3,1 */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Apple Macmini3,1", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), }, }, { /* Handle problems with rebooting on the iMac9,1. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Apple iMac9,1", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), }, }, { /* Handle problems with rebooting on the Latitude E6320. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell Latitude E6320", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), }, }, { /* Handle problems with rebooting on the Latitude E5420. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell Latitude E5420", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), }, }, { /* Handle problems with rebooting on the Latitude E6220. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell Latitude E6220", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6220"), }, }, { /* Handle problems with rebooting on the Latitude E6420. 
*/ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell Latitude E6420", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), }, }, { /* Handle problems with rebooting on the OptiPlex 990. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell OptiPlex 990", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), }, }, { /* Handle problems with rebooting on the Precision M6600. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell OptiPlex 990", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), }, }, { /* Handle problems with rebooting on the Latitude E6520. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell Latitude E6520", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6520"), }, }, { /* Handle problems with rebooting on the OptiPlex 790. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell OptiPlex 790", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 790"), }, }, { /* Handle problems with rebooting on the OptiPlex 990. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell OptiPlex 990", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), }, }, { /* Handle problems with rebooting on the OptiPlex 390. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell OptiPlex 390", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 390"), }, }, { /* Handle problems with rebooting on the Latitude E6320. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell Latitude E6320", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), }, }, { /* Handle problems with rebooting on the Latitude E6420. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell Latitude E6420", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), }, }, { /* Handle problems with rebooting on the Latitude E6520. */ .callback = override_reboot, .driver_data = (void *)(long)BOOT_CF9, .ident = "Dell Latitude E6520", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6520"), }, }, { } }; static int __init reboot_init(void) { dmi_check_system(reboot_dmi_table); return 0; } __initcall(reboot_init); static void __machine_restart(void *pdelay) { machine_restart(*(unsigned int *)pdelay); } void machine_restart(unsigned int delay_millisecs) { unsigned int i, attempt; enum reboot_type orig_reboot_type = reboot_type; watchdog_disable(); console_start_sync(); spin_debug_disable(); local_irq_enable(); /* Ensure we are the boot CPU. */ if ( get_apic_id() != boot_cpu_physical_apicid ) { /* Send IPI to the boot CPU (logical cpu 0). */ on_selected_cpus(cpumask_of(0), __machine_restart, &delay_millisecs, 0); for ( ; ; ) halt(); } /* * We may be called from an interrupt context, and various functions we * may need to call (alloc_domheap_pages, map_domain_page, ...) assert that * they are not called from interrupt context. This hack keeps them happy. 
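The retry loop further down in machine_restart never gives up: each pass tries the current method and then rewrites reboot_type to pick the next one, giving the Windows-style ACPI, KBD, ACPI, KBD prelude before settling into the keyboard-controller / triple-fault ping-pong. A pure-function sketch of that fallback choice, mirroring the switch below; the function name is illustrative.

enum reboot_type_sketch {
    SK_BOOT_TRIPLE = 't', SK_BOOT_KBD = 'k', SK_BOOT_ACPI = 'a', SK_BOOT_CF9 = 'p'
};

static enum reboot_type_sketch next_reboot_type(enum reboot_type_sketch cur,
                                                enum reboot_type_sketch orig,
                                                unsigned int attempt)
{
    switch (cur) {
    case SK_BOOT_KBD:
        // Second attempt on an ACPI-capable platform retries ACPI once more.
        return (attempt == 1 && orig == SK_BOOT_ACPI) ? SK_BOOT_ACPI : SK_BOOT_TRIPLE;
    case SK_BOOT_TRIPLE:
    case SK_BOOT_ACPI:
        return SK_BOOT_KBD;
    case SK_BOOT_CF9:
        return SK_BOOT_ACPI;
    }
    return SK_BOOT_KBD;   // not reached
}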
*/ local_irq_count(0) = 0; smp_send_stop(); mdelay(delay_millisecs); if ( tboot_in_measured_env() ) { acpi_dmar_reinstate(); tboot_shutdown(TB_SHUTDOWN_REBOOT); } efi_reset_system(reboot_mode != 0); /* Rebooting needs to touch the page at absolute address 0. */ *((unsigned short *)__va(0x472)) = reboot_mode; for ( attempt = 0; ; attempt++ ) { switch ( reboot_type ) { case BOOT_KBD: /* Pulse the keyboard reset line. */ for ( i = 0; i < 100; i++ ) { kb_wait(); udelay(50); outb(0xfe,0x64); /* pulse reset low */ udelay(50); } /* * If this platform supports ACPI reset, we follow a Windows-style * reboot attempt sequence: * ACPI -> KBD -> ACPI -> KBD * After this we revert to our usual sequence: * KBD -> TRIPLE -> KBD -> TRIPLE -> KBD -> ... */ reboot_type = (((attempt == 1) && (orig_reboot_type == BOOT_ACPI)) ? BOOT_ACPI : BOOT_TRIPLE); break; case BOOT_TRIPLE: asm volatile ( "lidt %0 ; int3" : "=m" (no_idt) ); reboot_type = BOOT_KBD; break; case BOOT_ACPI: acpi_reboot(); reboot_type = BOOT_KBD; break; case BOOT_CF9: { u8 cf9 = inb(0xcf9) & ~6; outb(cf9|2, 0xcf9); /* Request hard reset */ udelay(50); outb(cf9|6, 0xcf9); /* Actually do the reset */ udelay(50); } reboot_type = BOOT_ACPI; break; } } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/microcode_intel.c0000664000175000017500000003020412307313555016101 0ustar smbsmb/* * Intel CPU Microcode Update Driver for Linux * * Copyright (C) 2000-2006 Tigran Aivazian * 2006 Shaohua Li * * This driver allows to upgrade microcode on Intel processors * belonging to IA-32 family - PentiumPro, Pentium II, * Pentium III, Xeon, Pentium 4, etc. * * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture * Software Developer's Manual * Order Number 253668 or free download from: * * http://developer.intel.com/design/pentium4/manuals/253668.htm * * For more information, go to http://www.urbanmyth.org/microcode * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #define pr_debug(x...) ((void)0) struct microcode_header_intel { unsigned int hdrver; unsigned int rev; unsigned int date; unsigned int sig; unsigned int cksum; unsigned int ldrver; unsigned int pf; unsigned int datasize; unsigned int totalsize; unsigned int reserved[3]; }; struct microcode_intel { struct microcode_header_intel hdr; unsigned int bits[0]; }; /* microcode format is extended from prescott processors */ struct extended_signature { unsigned int sig; unsigned int pf; unsigned int cksum; }; struct extended_sigtable { unsigned int count; unsigned int cksum; unsigned int reserved[3]; struct extended_signature sigs[0]; }; #define DEFAULT_UCODE_DATASIZE (2000) #define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) #define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.totalsize ? \ ((struct microcode_intel *)mc)->hdr.totalsize : \ DEFAULT_UCODE_TOTALSIZE) #define get_datasize(mc) \ (((struct microcode_intel *)mc)->hdr.datasize ? 
\ ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) #define sigmatch(s1, s2, p1, p2) \ (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ static DEFINE_SPINLOCK(microcode_update_lock); static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data[cpu_num]; uint64_t msr_content; BUG_ON(cpu_num != smp_processor_id()); memset(csig, 0, sizeof(*csig)); if ( (c->x86_vendor != X86_VENDOR_INTEL) || (c->x86 < 6) || cpu_has(c, X86_FEATURE_IA64) ) { printk(KERN_ERR "microcode: CPU%d not a capable Intel " "processor\n", cpu_num); return -1; } csig->sig = cpuid_eax(0x00000001); if ( (c->x86_model >= 5) || (c->x86 > 6) ) { /* get processor flags from MSR 0x17 */ rdmsrl(MSR_IA32_PLATFORM_ID, msr_content); csig->pf = 1 << ((msr_content >> 50) & 7); } wrmsrl(MSR_IA32_UCODE_REV, 0x0ULL); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsrl(MSR_IA32_UCODE_REV, msr_content); csig->rev = (uint32_t)(msr_content >> 32); pr_debug("microcode: collect_cpu_info : sig=%#x, pf=%#x, rev=%#x\n", csig->sig, csig->pf, csig->rev); return 0; } static inline int microcode_update_match( int cpu_num, const struct microcode_header_intel *mc_header, int sig, int pf) { struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu_num); return (sigmatch(sig, uci->cpu_sig.sig, pf, uci->cpu_sig.pf) && (mc_header->rev > uci->cpu_sig.rev)); } static int microcode_sanity_check(void *mc) { struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; struct extended_signature *ext_sig; unsigned long total_size, data_size, ext_table_size; int sum, orig_sum, ext_sigcount = 0, i; total_size = get_totalsize(mc_header); data_size = get_datasize(mc_header); if ( (data_size + MC_HEADER_SIZE) > total_size ) { printk(KERN_ERR "microcode: error! " "Bad data size in microcode data file\n"); return -EINVAL; } if ( (mc_header->ldrver != 1) || (mc_header->hdrver != 1) ) { printk(KERN_ERR "microcode: error! " "Unknown microcode update format\n"); return -EINVAL; } ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if ( ext_table_size ) { if ( (ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE) ) { printk(KERN_ERR "microcode: error! " "Small exttable size in microcode data file\n"); return -EINVAL; } ext_header = mc + MC_HEADER_SIZE + data_size; if ( ext_table_size != exttable_size(ext_header) ) { printk(KERN_ERR "microcode: error! 
" "Bad exttable size in microcode data file\n"); return -EFAULT; } ext_sigcount = ext_header->count; } /* check extended table checksum */ if ( ext_table_size ) { int ext_table_sum = 0; int *ext_tablep = (int *)ext_header; i = ext_table_size / DWSIZE; while ( i-- ) ext_table_sum += ext_tablep[i]; if ( ext_table_sum ) { printk(KERN_WARNING "microcode: aborting, " "bad extended signature table checksum\n"); return -EINVAL; } } /* calculate the checksum */ orig_sum = 0; i = (MC_HEADER_SIZE + data_size) / DWSIZE; while ( i-- ) orig_sum += ((int *)mc)[i]; if ( orig_sum ) { printk(KERN_ERR "microcode: aborting, bad checksum\n"); return -EINVAL; } if ( !ext_table_size ) return 0; /* check extended signature checksum */ for ( i = 0; i < ext_sigcount; i++ ) { ext_sig = (void *)ext_header + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i; sum = orig_sum - (mc_header->sig + mc_header->pf + mc_header->cksum) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if ( sum ) { printk(KERN_ERR "microcode: aborting, bad checksum\n"); return -EINVAL; } } return 0; } /* * return 0 - no update found * return 1 - found update * return < 0 - error */ static int get_matching_microcode(const void *mc, int cpu) { struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); const struct microcode_header_intel *mc_header = mc; const struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); int ext_sigcount, i; struct extended_signature *ext_sig; void *new_mc; if ( microcode_update_match(cpu, mc_header, mc_header->sig, mc_header->pf) ) goto find; if ( total_size <= (get_datasize(mc_header) + MC_HEADER_SIZE) ) return 0; ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; ext_sigcount = ext_header->count; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; for ( i = 0; i < ext_sigcount; i++ ) { if ( microcode_update_match(cpu, mc_header, ext_sig->sig, ext_sig->pf) ) goto find; ext_sig++; } return 0; find: pr_debug("microcode: CPU%d found a matching microcode update with" " version %#x (current=%#x)\n", cpu, mc_header->rev, uci->cpu_sig.rev); new_mc = xmalloc_bytes(total_size); if ( new_mc == NULL ) { printk(KERN_ERR "microcode: error! Can not allocate memory\n"); return -ENOMEM; } memcpy(new_mc, mc, total_size); xfree(uci->mc.mc_intel); uci->mc.mc_intel = new_mc; return 1; } static int apply_microcode(int cpu) { unsigned long flags; uint64_t msr_content; unsigned int val[2]; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu_num); /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); if ( uci->mc.mc_intel == NULL ) return -EINVAL; /* serialize access to the physical write to MSR 0x79 */ spin_lock_irqsave(µcode_update_lock, flags); /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)uci->mc.mc_intel->bits); wrmsrl(MSR_IA32_UCODE_REV, 0x0ULL); /* see notes above for revision 1.07. 
Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsrl(MSR_IA32_UCODE_REV, msr_content); val[1] = (uint32_t)(msr_content >> 32); spin_unlock_irqrestore(µcode_update_lock, flags); if ( val[1] != uci->mc.mc_intel->hdr.rev ) { printk(KERN_ERR "microcode: CPU%d update from revision " "%#x to %#x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); return -EIO; } printk(KERN_INFO "microcode: CPU%d updated from revision " "%#x to %#x, date = %04x-%02x-%02x \n", cpu_num, uci->cpu_sig.rev, val[1], uci->mc.mc_intel->hdr.date & 0xffff, uci->mc.mc_intel->hdr.date >> 24, (uci->mc.mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; return 0; } static long get_next_ucode_from_buffer(void **mc, const u8 *buf, unsigned long size, long offset) { struct microcode_header_intel *mc_header; unsigned long total_size; /* No more data */ if ( offset >= size ) return 0; mc_header = (struct microcode_header_intel *)(buf + offset); total_size = get_totalsize(mc_header); if ( (offset + total_size) > size ) { printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); return -EINVAL; } *mc = xmalloc_bytes(total_size); if ( *mc == NULL ) { printk(KERN_ERR "microcode: error! Can not allocate memory\n"); return -ENOMEM; } memcpy(*mc, (const void *)(buf + offset), total_size); return offset + total_size; } static int cpu_request_microcode(int cpu, const void *buf, size_t size) { long offset = 0; int error = 0; void *mc; unsigned int matching_count = 0; /* We should bind the task to the CPU */ BUG_ON(cpu != raw_smp_processor_id()); while ( (offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) > 0 ) { error = microcode_sanity_check(mc); if ( error ) break; error = get_matching_microcode(mc, cpu); if ( error < 0 ) break; /* * It's possible the data file has multiple matching ucode, * lets keep searching till the latest version */ if ( error == 1 ) { matching_count++; error = 0; } xfree(mc); } if ( offset > 0 ) xfree(mc); if ( offset < 0 ) error = offset; if ( !error && matching_count ) apply_microcode(cpu); return error; } static int microcode_resume_match(int cpu, const void *mc) { return get_matching_microcode(mc, cpu); } static const struct microcode_ops microcode_intel_ops = { .microcode_resume_match = microcode_resume_match, .cpu_request_microcode = cpu_request_microcode, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode, }; static __init int microcode_init_intel(void) { if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) microcode_ops = µcode_intel_ops; return 0; } presmp_initcall(microcode_init_intel); xen-4.4.0/xen/arch/x86/gdbstub.c0000664000175000017500000000434212307313555014400 0ustar smbsmb/* * x86-specific gdb stub routines * based on x86 cdb(xen/arch/x86/cdb.c), but Extensively modified. * * Copyright (C) 2006 Isaku Yamahata * VA Linux Systems Japan. K.K. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include u16 gdb_arch_signal_num(struct cpu_user_regs *regs, unsigned long cookie) { return 5; /* TRAP signal. see include/gdb/signals.h */ } /* * Use __copy_*_user to make us page-fault safe, but not otherwise restrict * our access to the full virtual address space. */ unsigned int gdb_arch_copy_from_user(void *dest, const void *src, unsigned len) { return __copy_from_user(dest, src, len); } unsigned int gdb_arch_copy_to_user(void *dest, const void *src, unsigned len) { return __copy_to_user(dest, src, len); } void gdb_arch_print_state(struct cpu_user_regs *regs) { /* XXX */ } void gdb_arch_enter(struct cpu_user_regs *regs) { /* nothing */ } void gdb_arch_exit(struct cpu_user_regs *regs) { /* nothing */ } void gdb_arch_resume(struct cpu_user_regs *regs, unsigned long addr, unsigned long type, struct gdb_context *ctx) { if ( addr != -1UL ) regs->eip = addr; regs->eflags &= ~X86_EFLAGS_TF; /* Set eflags.RF to ensure we do not re-enter. */ regs->eflags |= X86_EFLAGS_RF; /* Set the trap flag if we are single stepping. */ if ( type == GDB_STEP ) regs->eflags |= X86_EFLAGS_TF; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * End: */ xen-4.4.0/xen/arch/x86/time.c0000664000175000017500000015275312307313555013716 0ustar smbsmb/****************************************************************************** * arch/x86/time.c * * Per-CPU time calibration and management. * * Copyright (c) 2002-2005, K A Fraser * * Portions from Linux are: * Copyright (c) 1991, 1992, 1995 Linus Torvalds */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for early_time_init */ #include /* opt_clocksource: Force clocksource to one of: pit, hpet, acpi. */ static char __initdata opt_clocksource[10]; string_param("clocksource", opt_clocksource); unsigned long __read_mostly cpu_khz; /* CPU clock frequency in kHz. */ DEFINE_SPINLOCK(rtc_lock); unsigned long pit0_ticks; static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */ static DEFINE_SPINLOCK(wc_lock); struct cpu_time { u64 local_tsc_stamp; s_time_t stime_local_stamp; s_time_t stime_master_stamp; struct time_scale tsc_scale; }; struct platform_timesource { char *id; char *name; u64 frequency; u64 (*read_counter)(void); int (*init)(struct platform_timesource *); void (*resume)(struct platform_timesource *); int counter_bits; }; static DEFINE_PER_CPU(struct cpu_time, cpu_time); /* Calibrate all CPUs to platform timer every EPOCH. */ #define EPOCH MILLISECS(1000) static struct timer calibration_timer; /* * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter. * Otherwise overflow happens too quickly (~50ms) for us to guarantee that * softirq handling will happen in time. * * The pit_lock protects the 16- and 32-bit stamp fields as well as the */ static DEFINE_SPINLOCK(pit_lock); static u16 pit_stamp16; static u32 pit_stamp32; static bool_t __read_mostly using_pit; /* * 32-bit division of integer dividend and integer divisor yielding * 32-bit fractional quotient. 
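 *
 * The quotient is a 0.32 fixed-point fraction, i.e. the value returned is
 * (dividend << 32) / divisor; for example div_frac(1, 4) == 0x40000000,
 * which is 0.25 with the radix point just above bit 31.
 *
 * A minimal C-only sketch of the same operation, kept for illustration
 * only (the helper name div_frac_sketch is hypothetical and unused):
 */
#if 0   /* illustrative only */
static inline u32 div_frac_sketch(u32 dividend, u32 divisor)
{
    /* Requires dividend < divisor, as div_frac() below asserts. */
    return (u32)(((u64)dividend << 32) / divisor);
}
#endif
/*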
*/ static inline u32 div_frac(u32 dividend, u32 divisor) { u32 quotient, remainder; ASSERT(dividend < divisor); asm ( "divl %4" : "=a" (quotient), "=d" (remainder) : "0" (0), "1" (dividend), "r" (divisor) ); return quotient; } /* * 32-bit multiplication of multiplicand and fractional multiplier * yielding 32-bit product (radix point at same position as in multiplicand). */ static inline u32 mul_frac(u32 multiplicand, u32 multiplier) { u32 product_int, product_frac; asm ( "mul %3" : "=a" (product_frac), "=d" (product_int) : "0" (multiplicand), "r" (multiplier) ); return product_int; } /* * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. */ static inline u64 scale_delta(u64 delta, struct time_scale *scale) { u64 product; if ( scale->shift < 0 ) delta >>= -scale->shift; else delta <<= scale->shift; asm ( "mulq %2 ; shrd $32,%1,%0" : "=a" (product), "=d" (delta) : "rm" (delta), "0" ((u64)scale->mul_frac) ); return product; } #define _TS_MUL_FRAC_IDENTITY 0x80000000UL /* Compute the reciprocal of the given time_scale. */ static inline struct time_scale scale_reciprocal(struct time_scale scale) { struct time_scale reciprocal; u32 dividend; ASSERT(scale.mul_frac != 0); dividend = _TS_MUL_FRAC_IDENTITY; reciprocal.shift = 1 - scale.shift; while ( unlikely(dividend >= scale.mul_frac) ) { dividend >>= 1; reciprocal.shift++; } asm ( "divl %4" : "=a" (reciprocal.mul_frac), "=d" (dividend) : "0" (0), "1" (dividend), "r" (scale.mul_frac) ); return reciprocal; } /* * cpu_mask that denotes the CPUs that needs timer interrupt coming in as * IPIs in place of local APIC timers */ static cpumask_t pit_broadcast_mask; static void smp_send_timer_broadcast_ipi(void) { int cpu = smp_processor_id(); cpumask_t mask; cpumask_and(&mask, &cpu_online_map, &pit_broadcast_mask); if ( cpumask_test_cpu(cpu, &mask) ) { cpumask_clear_cpu(cpu, &mask); raise_softirq(TIMER_SOFTIRQ); } if ( !cpumask_empty(&mask) ) { cpumask_raise_softirq(&mask, TIMER_SOFTIRQ); } } static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) { ASSERT(local_irq_is_enabled()); if ( hpet_legacy_irq_tick() ) return; /* Only for start-of-day interruopt tests in io_apic.c. */ (*(volatile unsigned long *)&pit0_ticks)++; /* Rough hack to allow accurate timers to sort-of-work with no APIC. */ if ( !cpu_has_apic ) raise_softirq(TIMER_SOFTIRQ); if ( xen_cpuidle ) smp_send_timer_broadcast_ipi(); /* Emulate a 32-bit PIT counter. */ if ( using_pit ) { u16 count; spin_lock_irq(&pit_lock); outb(0x80, PIT_MODE); count = inb(PIT_CH2); count |= inb(PIT_CH2) << 8; pit_stamp32 += (u16)(pit_stamp16 - count); pit_stamp16 = count; spin_unlock_irq(&pit_lock); } } static struct irqaction __read_mostly irq0 = { timer_interrupt, "timer", NULL }; /* ------ Calibrate the TSC ------- * Return processor ticks per second / CALIBRATE_FRAC. */ #define CLOCK_TICK_RATE 1193182 /* system crystal frequency (Hz) */ #define CALIBRATE_FRAC 20 /* calibrate over 50ms */ #define CALIBRATE_LATCH ((CLOCK_TICK_RATE+(CALIBRATE_FRAC/2))/CALIBRATE_FRAC) static u64 init_pit_and_calibrate_tsc(void) { u64 start, end; unsigned long count; /* Set PIT channel 0 to HZ Hz. 
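 * (LATCH below is the channel-0 reload value: CLOCK_TICK_RATE / HZ, rounded
 * to the nearest integer.)  Channel 2 is then loaded with CALIBRATE_LATCH so
 * that its countdown lasts 1/CALIBRATE_FRAC of a second (50ms); the TSC
 * delta measured across that countdown, multiplied by CALIBRATE_FRAC, is the
 * TSC frequency in Hz that this function returns.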
*/ #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ) outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ outb(LATCH >> 8, PIT_CH0); /* MSB */ /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Now let's take care of CTC channel 2 * * Set the Gate high, program CTC channel 2 for mode 0, (interrupt on * terminal count mode), binary count, load 5 * LATCH count, (LSB and MSB) * to begin countdown. */ outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */ outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */ outb(CALIBRATE_LATCH >> 8, PIT_CH2); /* MSB of count */ rdtscll(start); for ( count = 0; (inb(0x61) & 0x20) == 0; count++ ) continue; rdtscll(end); /* Error if the CTC doesn't behave itself. */ if ( count == 0 ) return 0; return ((end - start) * (u64)CALIBRATE_FRAC); } static void set_time_scale(struct time_scale *ts, u64 ticks_per_sec) { u64 tps64 = ticks_per_sec; u32 tps32; int shift = 0; ASSERT(tps64 != 0); while ( tps64 > (MILLISECS(1000)*2) ) { tps64 >>= 1; shift--; } tps32 = (u32)tps64; while ( tps32 <= (u32)MILLISECS(1000) ) { tps32 <<= 1; shift++; } ts->mul_frac = div_frac(MILLISECS(1000), tps32); ts->shift = shift; } static char *freq_string(u64 freq) { static char s[20]; unsigned int x, y; y = (unsigned int)do_div(freq, 1000000) / 1000; x = (unsigned int)freq; snprintf(s, sizeof(s), "%u.%03uMHz", x, y); return s; } /************************************************************ * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT) */ static u64 read_pit_count(void) { u16 count16; u32 count32; unsigned long flags; spin_lock_irqsave(&pit_lock, flags); outb(0x80, PIT_MODE); count16 = inb(PIT_CH2); count16 |= inb(PIT_CH2) << 8; count32 = pit_stamp32 + (u16)(pit_stamp16 - count16); spin_unlock_irqrestore(&pit_lock, flags); return count32; } static int __init init_pit(struct platform_timesource *pts) { using_pit = 1; return 1; } static struct platform_timesource __initdata plt_pit = { .id = "pit", .name = "PIT", .frequency = CLOCK_TICK_RATE, .read_counter = read_pit_count, .counter_bits = 32, .init = init_pit }; /************************************************************ * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET) */ static u64 read_hpet_count(void) { return hpet_read32(HPET_COUNTER); } static int __init init_hpet(struct platform_timesource *pts) { u64 hpet_rate = hpet_setup(); if ( hpet_rate == 0 ) return 0; pts->frequency = hpet_rate; return 1; } static void resume_hpet(struct platform_timesource *pts) { hpet_resume(NULL); } static struct platform_timesource __initdata plt_hpet = { .id = "hpet", .name = "HPET", .read_counter = read_hpet_count, .counter_bits = 32, .init = init_hpet, .resume = resume_hpet }; /************************************************************ * PLATFORM TIMER 3: ACPI PM TIMER */ u32 __read_mostly pmtmr_ioport; /* ACPI PM timer ticks at 3.579545 MHz. 
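 * (the NTSC colour-burst frequency; the counter is treated as 24 bits wide,
 * hence counter_bits = 24 in plt_pmtimer below, so it wraps roughly every
 * 4.7 seconds and relies on plt_overflow() to extend it.)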
*/ #define ACPI_PM_FREQUENCY 3579545 static u64 read_pmtimer_count(void) { return inl(pmtmr_ioport); } static int __init init_pmtimer(struct platform_timesource *pts) { if ( pmtmr_ioport == 0 ) return 0; return 1; } static struct platform_timesource __initdata plt_pmtimer = { .id = "acpi", .name = "ACPI PM Timer", .frequency = ACPI_PM_FREQUENCY, .read_counter = read_pmtimer_count, .counter_bits = 24, .init = init_pmtimer }; static struct time_scale __read_mostly pmt_scale; static struct time_scale __read_mostly pmt_scale_r; static __init int init_pmtmr_scale(void) { set_time_scale(&pmt_scale, ACPI_PM_FREQUENCY); pmt_scale_r = scale_reciprocal(pmt_scale); return 0; } __initcall(init_pmtmr_scale); uint64_t acpi_pm_tick_to_ns(uint64_t ticks) { return scale_delta(ticks, &pmt_scale); } uint64_t ns_to_acpi_pm_tick(uint64_t ns) { return scale_delta(ns, &pmt_scale_r); } /************************************************************ * GENERIC PLATFORM TIMER INFRASTRUCTURE */ /* details of chosen timesource */ static struct platform_timesource __read_mostly plt_src; /* hardware-width mask */ static u64 __read_mostly plt_mask; /* ns between calls to plt_overflow() */ static u64 __read_mostly plt_overflow_period; /* scale: platform counter -> nanosecs */ static struct time_scale __read_mostly plt_scale; /* Protected by platform_timer_lock. */ static DEFINE_SPINLOCK(platform_timer_lock); static s_time_t stime_platform_stamp; /* System time at below platform time */ static u64 platform_timer_stamp; /* Platform time at above system time */ static u64 plt_stamp64; /* 64-bit platform counter stamp */ static u64 plt_stamp; /* hardware-width platform counter stamp */ static struct timer plt_overflow_timer; static s_time_t __read_platform_stime(u64 platform_time) { u64 diff = platform_time - platform_timer_stamp; ASSERT(spin_is_locked(&platform_timer_lock)); return (stime_platform_stamp + scale_delta(diff, &plt_scale)); } static void plt_overflow(void *unused) { int i; u64 count; s_time_t now, plt_now, plt_wrap; spin_lock_irq(&platform_timer_lock); count = plt_src.read_counter(); plt_stamp64 += (count - plt_stamp) & plt_mask; plt_stamp = count; now = NOW(); plt_wrap = __read_platform_stime(plt_stamp64); for ( i = 0; i < 10; i++ ) { plt_now = plt_wrap; plt_wrap = __read_platform_stime(plt_stamp64 + plt_mask + 1); if ( ABS(plt_wrap - now) > ABS(plt_now - now) ) break; plt_stamp64 += plt_mask + 1; } if ( i != 0 ) { static bool_t warned_once; if ( !test_and_set_bool(warned_once) ) printk("Platform timer appears to have unexpectedly wrapped " "%u%s times.\n", i, (i == 10) ? 
" or more" : ""); } spin_unlock_irq(&platform_timer_lock); set_timer(&plt_overflow_timer, NOW() + plt_overflow_period); } static s_time_t read_platform_stime(void) { u64 count; s_time_t stime; ASSERT(!local_irq_is_enabled()); spin_lock(&platform_timer_lock); count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask); stime = __read_platform_stime(count); spin_unlock(&platform_timer_lock); return stime; } static void platform_time_calibration(void) { u64 count; s_time_t stamp; unsigned long flags; spin_lock_irqsave(&platform_timer_lock, flags); count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask); stamp = __read_platform_stime(count); stime_platform_stamp = stamp; platform_timer_stamp = count; spin_unlock_irqrestore(&platform_timer_lock, flags); } static void resume_platform_timer(void) { /* Timer source can be reset when backing from S3 to S0 */ if ( plt_src.resume ) plt_src.resume(&plt_src); plt_stamp64 = platform_timer_stamp; plt_stamp = plt_src.read_counter(); } static void __init init_platform_timer(void) { static struct platform_timesource * __initdata plt_timers[] = { &plt_hpet, &plt_pmtimer, &plt_pit }; struct platform_timesource *pts = NULL; int i, rc = -1; if ( opt_clocksource[0] != '\0' ) { for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ ) { pts = plt_timers[i]; if ( !strcmp(opt_clocksource, pts->id) ) { rc = pts->init(pts); break; } } if ( rc <= 0 ) printk("WARNING: %s clocksource '%s'.\n", (rc == 0) ? "Could not initialise" : "Unrecognised", opt_clocksource); } if ( rc <= 0 ) { for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ ) { pts = plt_timers[i]; if ( (rc = pts->init(pts)) > 0 ) break; } } BUG_ON(rc <= 0); plt_mask = (u64)~0ull >> (64 - pts->counter_bits); set_time_scale(&plt_scale, pts->frequency); plt_overflow_period = scale_delta( 1ull << (pts->counter_bits-1), &plt_scale); init_timer(&plt_overflow_timer, plt_overflow, NULL, 0); plt_src = *pts; plt_overflow(NULL); platform_timer_stamp = plt_stamp64; stime_platform_stamp = NOW(); printk("Platform timer is %s %s\n", freq_string(pts->frequency), pts->name); } u64 stime2tsc(s_time_t stime) { struct cpu_time *t; struct time_scale sys_to_tsc; s_time_t stime_delta; t = &this_cpu(cpu_time); sys_to_tsc = scale_reciprocal(t->tsc_scale); stime_delta = stime - t->stime_local_stamp; if ( stime_delta < 0 ) stime_delta = 0; return t->local_tsc_stamp + scale_delta(stime_delta, &sys_to_tsc); } void cstate_restore_tsc(void) { if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ) return; write_tsc(stime2tsc(read_platform_stime())); } /*************************************************************************** * CMOS Timer functions ***************************************************************************/ /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. * * [For the Julian calendar (which was used in Russia before 1917, * Britain & colonies before 1752, anywhere else before 1582, * and is still in use by some communities) leave out the * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). * * WARNING: this function will overflow on 2106-02-07 06:28:16 on * machines were long is 32-bit! 
(However, as time_t is signed, we * will already get problems at other places on 2038-01-19 03:14:08) */ unsigned long mktime (unsigned int year, unsigned int mon, unsigned int day, unsigned int hour, unsigned int min, unsigned int sec) { /* 1..12 -> 11,12,1..10: put Feb last since it has a leap day. */ if ( 0 >= (int) (mon -= 2) ) { mon += 12; year -= 1; } return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)+ year*365 - 719499 )*24 + hour /* now have hours */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } static unsigned long __get_cmos_time(void) { unsigned int year, mon, day, hour, min, sec; sec = CMOS_READ(RTC_SECONDS); min = CMOS_READ(RTC_MINUTES); hour = CMOS_READ(RTC_HOURS); day = CMOS_READ(RTC_DAY_OF_MONTH); mon = CMOS_READ(RTC_MONTH); year = CMOS_READ(RTC_YEAR); if ( !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD ) { BCD_TO_BIN(sec); BCD_TO_BIN(min); BCD_TO_BIN(hour); BCD_TO_BIN(day); BCD_TO_BIN(mon); BCD_TO_BIN(year); } if ( (year += 1900) < 1970 ) year += 100; return mktime(year, mon, day, hour, min, sec); } static unsigned long get_cmos_time(void) { unsigned long res, flags; int i; if ( efi_enabled ) { res = efi_get_time(); if ( res ) return res; } if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) ) panic("System without CMOS RTC must be booted from EFI"); spin_lock_irqsave(&rtc_lock, flags); /* read RTC exactly on falling edge of update flag */ for ( i = 0 ; i < 1000000 ; i++ ) /* may take up to 1 second... */ if ( (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) ) break; for ( i = 0 ; i < 1000000 ; i++ ) /* must try at least 2.228 ms */ if ( !(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) ) break; res = __get_cmos_time(); spin_unlock_irqrestore(&rtc_lock, flags); return res; } /*************************************************************************** * System Time ***************************************************************************/ s_time_t get_s_time(void) { struct cpu_time *t = &this_cpu(cpu_time); u64 tsc, delta; s_time_t now; rdtscll(tsc); delta = tsc - t->local_tsc_stamp; now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale); return now; } uint64_t tsc_ticks2ns(uint64_t ticks) { struct cpu_time *t = &this_cpu(cpu_time); return scale_delta(ticks, &t->tsc_scale); } /* Explicitly OR with 1 just in case version number gets out of sync. 
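 *
 * The version field follows a seqlock-like protocol: version_update_begin()
 * leaves it odd while the record is being rewritten and version_update_end()
 * makes it even again, so guests must retry if they observe an odd value or
 * if the value changes while they are copying the record.
 *
 * A sketch of the reader loop a guest might use (the names "info" and
 * "snap" are illustrative only):
 */
#if 0   /* illustrative only */
    do {
        version = info->version;
        rmb();
        snap = *info;               /* copy the vcpu_time_info record */
        rmb();
    } while ( (version & 1) || (version != info->version) );
#endif
/*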
*/ #define version_update_begin(v) (((v)+1)|1) #define version_update_end(v) ((v)+1) static void __update_vcpu_system_time(struct vcpu *v, int force) { struct cpu_time *t; struct vcpu_time_info *u, _u; struct domain *d = v->domain; s_time_t tsc_stamp = 0; if ( v->vcpu_info == NULL ) return; t = &this_cpu(cpu_time); u = &vcpu_info(v, time); if ( d->arch.vtsc ) { s_time_t stime = t->stime_local_stamp; if ( is_hvm_domain(d) ) { struct pl_time *pl = &v->domain->arch.hvm_domain.pl_time; stime += pl->stime_offset + v->arch.hvm_vcpu.stime_offset; if ( stime >= 0 ) tsc_stamp = gtime_to_gtsc(d, stime); else tsc_stamp = -gtime_to_gtsc(d, -stime); } else tsc_stamp = gtime_to_gtsc(d, stime); } else { tsc_stamp = t->local_tsc_stamp; } memset(&_u, 0, sizeof(_u)); if ( d->arch.vtsc ) { _u.tsc_timestamp = tsc_stamp; _u.system_time = t->stime_local_stamp; _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac; _u.tsc_shift = d->arch.vtsc_to_ns.shift; } else { _u.tsc_timestamp = t->local_tsc_stamp; _u.system_time = t->stime_local_stamp; _u.tsc_to_system_mul = t->tsc_scale.mul_frac; _u.tsc_shift = (s8)t->tsc_scale.shift; } if ( is_hvm_domain(d) ) _u.tsc_timestamp += v->arch.hvm_vcpu.cache_tsc_offset; /* Don't bother unless timestamp record has changed or we are forced. */ _u.version = u->version; /* make versions match for memcmp test */ if ( !force && !memcmp(u, &_u, sizeof(_u)) ) return; /* 1. Update guest kernel version. */ _u.version = u->version = version_update_begin(u->version); wmb(); /* 2. Update all other guest kernel fields. */ *u = _u; wmb(); /* 3. Update guest kernel version. */ u->version = version_update_end(u->version); if ( !update_secondary_system_time(v, &_u) && is_pv_domain(d) && !is_pv_32bit_domain(d) && !(v->arch.flags & TF_kernel_mode) ) v->arch.pv_vcpu.pending_system_time = _u; } bool_t update_secondary_system_time(const struct vcpu *v, struct vcpu_time_info *u) { XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v->arch.time_info_guest; if ( guest_handle_is_null(user_u) ) return 1; /* 1. Update userspace version. */ if ( __copy_field_to_guest(user_u, u, version) == sizeof(u->version) ) return 0; wmb(); /* 2. Update all other userspace fields. */ __copy_to_guest(user_u, u, 1); wmb(); /* 3. Update userspace version. */ u->version = version_update_end(u->version); __copy_field_to_guest(user_u, u, version); return 1; } void update_vcpu_system_time(struct vcpu *v) { __update_vcpu_system_time(v, 0); } void force_update_vcpu_system_time(struct vcpu *v) { __update_vcpu_system_time(v, 1); } void update_domain_wallclock_time(struct domain *d) { uint32_t *wc_version; spin_lock(&wc_lock); wc_version = &shared_info(d, wc_version); *wc_version = version_update_begin(*wc_version); wmb(); shared_info(d, wc_sec) = wc_sec + d->time_offset_seconds; shared_info(d, wc_nsec) = wc_nsec; wmb(); *wc_version = version_update_end(*wc_version); spin_unlock(&wc_lock); } static void update_domain_rtc(void) { struct domain *d; rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) if ( is_hvm_domain(d) ) rtc_update_clock(d); rcu_read_unlock(&domlist_read_lock); } void domain_set_time_offset(struct domain *d, int32_t time_offset_seconds) { d->time_offset_seconds = time_offset_seconds; if ( is_hvm_domain(d) ) rtc_update_clock(d); update_domain_wallclock_time(d); } int cpu_frequency_change(u64 freq) { struct cpu_time *t = &this_cpu(cpu_time); u64 curr_tsc; /* Sanity check: CPU frequency allegedly dropping below 1MHz? 
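 * Almost certainly a units mix-up by the caller, since freq is expected in
 * Hz; reject it rather than install a nonsensical TSC scale.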
*/ if ( freq < 1000000u ) { printk(XENLOG_WARNING "Rejecting CPU frequency change " "to %"PRIu64" Hz\n", freq); return -EINVAL; } local_irq_disable(); /* Platform time /first/, as we may be delayed by platform_timer_lock. */ t->stime_master_stamp = read_platform_stime(); /* TSC-extrapolated time may be bogus after frequency change. */ /*t->stime_local_stamp = get_s_time();*/ t->stime_local_stamp = t->stime_master_stamp; rdtscll(curr_tsc); t->local_tsc_stamp = curr_tsc; set_time_scale(&t->tsc_scale, freq); local_irq_enable(); update_vcpu_system_time(current); /* A full epoch should pass before we check for deviation. */ if ( smp_processor_id() == 0 ) { set_timer(&calibration_timer, NOW() + EPOCH); platform_time_calibration(); } return 0; } /* Set clock to after 00:00:00 UTC, 1 January, 1970. */ void do_settime(unsigned long secs, unsigned long nsecs, u64 system_time_base) { u64 x; u32 y, _wc_sec, _wc_nsec; struct domain *d; x = (secs * 1000000000ULL) + (u64)nsecs - system_time_base; y = do_div(x, 1000000000); spin_lock(&wc_lock); wc_sec = _wc_sec = (u32)x; wc_nsec = _wc_nsec = (u32)y; spin_unlock(&wc_lock); rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) update_domain_wallclock_time(d); rcu_read_unlock(&domlist_read_lock); } /* Per-CPU communication between rendezvous IRQ and softirq handler. */ struct cpu_calibration { u64 local_tsc_stamp; s_time_t stime_local_stamp; s_time_t stime_master_stamp; }; static DEFINE_PER_CPU(struct cpu_calibration, cpu_calibration); /* Softirq handler for per-CPU time calibration. */ static void local_time_calibration(void) { struct cpu_time *t = &this_cpu(cpu_time); struct cpu_calibration *c = &this_cpu(cpu_calibration); /* * System timestamps, extrapolated from local and master oscillators, * taken during this calibration and the previous calibration. */ s_time_t prev_local_stime, curr_local_stime; s_time_t prev_master_stime, curr_master_stime; /* TSC timestamps taken during this calibration and prev calibration. */ u64 prev_tsc, curr_tsc; /* * System time and TSC ticks elapsed during the previous calibration * 'epoch'. These values are down-shifted to fit in 32 bits. */ u64 stime_elapsed64, tsc_elapsed64; u32 stime_elapsed32, tsc_elapsed32; /* The accumulated error in the local estimate. */ u64 local_stime_err; /* Error correction to slow down a fast local clock. */ u32 error_factor = 0; /* Calculated TSC shift to ensure 32-bit scale multiplier. */ int tsc_shift = 0; /* The overall calibration scale multiplier. */ u32 calibration_mul_frac; if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ) { /* Atomically read cpu_calibration struct and write cpu_time struct. */ local_irq_disable(); t->local_tsc_stamp = c->local_tsc_stamp; t->stime_local_stamp = c->stime_master_stamp; t->stime_master_stamp = c->stime_master_stamp; local_irq_enable(); update_vcpu_system_time(current); goto out; } prev_tsc = t->local_tsc_stamp; prev_local_stime = t->stime_local_stamp; prev_master_stime = t->stime_master_stamp; /* Disabling IRQs ensures we atomically read cpu_calibration struct. 
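 * (The struct is only written by the calibration rendezvous running in IRQ
 * context on this same CPU, so masking local interrupts is sufficient and
 * no lock is needed.)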
*/ local_irq_disable(); curr_tsc = c->local_tsc_stamp; curr_local_stime = c->stime_local_stamp; curr_master_stime = c->stime_master_stamp; local_irq_enable(); #if 0 printk("PRE%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64"\n", smp_processor_id(), prev_tsc, prev_local_stime, prev_master_stime); printk("CUR%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64 " -> %"PRId64"\n", smp_processor_id(), curr_tsc, curr_local_stime, curr_master_stime, curr_master_stime - curr_local_stime); #endif /* Local time warps forward if it lags behind master time. */ if ( curr_local_stime < curr_master_stime ) curr_local_stime = curr_master_stime; stime_elapsed64 = curr_master_stime - prev_master_stime; tsc_elapsed64 = curr_tsc - prev_tsc; /* * Weirdness can happen if we lose sync with the platform timer. * We could be smarter here: resync platform timer with local timer? */ if ( ((s64)stime_elapsed64 < (EPOCH / 2)) ) goto out; /* * Calculate error-correction factor. This only slows down a fast local * clock (slow clocks are warped forwards). The scale factor is clamped * to >= 0.5. */ if ( curr_local_stime != curr_master_stime ) { local_stime_err = curr_local_stime - curr_master_stime; if ( local_stime_err > EPOCH ) local_stime_err = EPOCH; error_factor = div_frac(EPOCH, EPOCH + (u32)local_stime_err); } /* * We require 0 < stime_elapsed < 2^31. * This allows us to binary shift a 32-bit tsc_elapsed such that: * stime_elapsed < tsc_elapsed <= 2*stime_elapsed */ while ( ((u32)stime_elapsed64 != stime_elapsed64) || ((s32)stime_elapsed64 < 0) ) { stime_elapsed64 >>= 1; tsc_elapsed64 >>= 1; } /* stime_master_diff now fits in a 32-bit word. */ stime_elapsed32 = (u32)stime_elapsed64; /* tsc_elapsed <= 2*stime_elapsed */ while ( tsc_elapsed64 > (stime_elapsed32 * 2) ) { tsc_elapsed64 >>= 1; tsc_shift--; } /* Local difference must now fit in 32 bits. */ ASSERT((u32)tsc_elapsed64 == tsc_elapsed64); tsc_elapsed32 = (u32)tsc_elapsed64; /* tsc_elapsed > stime_elapsed */ ASSERT(tsc_elapsed32 != 0); while ( tsc_elapsed32 <= stime_elapsed32 ) { tsc_elapsed32 <<= 1; tsc_shift++; } calibration_mul_frac = div_frac(stime_elapsed32, tsc_elapsed32); if ( error_factor != 0 ) calibration_mul_frac = mul_frac(calibration_mul_frac, error_factor); #if 0 printk("---%d: %08x %08x %d\n", smp_processor_id(), error_factor, calibration_mul_frac, tsc_shift); #endif /* Record new timestamp information, atomically w.r.t. interrupts. */ local_irq_disable(); t->tsc_scale.mul_frac = calibration_mul_frac; t->tsc_scale.shift = tsc_shift; t->local_tsc_stamp = curr_tsc; t->stime_local_stamp = curr_local_stime; t->stime_master_stamp = curr_master_stime; local_irq_enable(); update_vcpu_system_time(current); out: if ( smp_processor_id() == 0 ) { set_timer(&calibration_timer, NOW() + EPOCH); platform_time_calibration(); } } /* * TSC Reliability check */ /* * The Linux original version of this function is * Copyright (c) 2006, Red Hat, Inc., Ingo Molnar */ static void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp) { #define rdtsc_barrier() mb() static DEFINE_SPINLOCK(sync_lock); static cycles_t last_tsc; cycles_t start, now, prev, end; int i; rdtsc_barrier(); start = get_cycles(); rdtsc_barrier(); /* The measurement runs for 20 msecs: */ end = start + tsc_khz * 20ULL; now = start; for ( i = 0; ; i++ ) { /* * We take the global lock, measure TSC, save the * previous TSC that was measured (possibly on * another CPU) and update the previous TSC timestamp. 
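 * If the TSC read here is ever behind the value another CPU stored just
 * before us, the TSCs are not synchronised; the largest backwards step
 * observed is accumulated in *max_warp further down.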
*/ spin_lock(&sync_lock); prev = last_tsc; rdtsc_barrier(); now = get_cycles(); rdtsc_barrier(); last_tsc = now; spin_unlock(&sync_lock); /* * Be nice every now and then (and also check whether measurement is * done [we also insert a 10 million loops safety exit, so we dont * lock up in case the TSC readout is totally broken]): */ if ( unlikely(!(i & 7)) ) { if ( (now > end) || (i > 10000000) ) break; cpu_relax(); /*touch_nmi_watchdog();*/ } /* * Outside the critical section we can now see whether we saw a * time-warp of the TSC going backwards: */ if ( unlikely(prev > now) ) { spin_lock(&sync_lock); if ( *max_warp < prev - now ) *max_warp = prev - now; spin_unlock(&sync_lock); } } } static unsigned long tsc_max_warp, tsc_check_count; static cpumask_t tsc_check_cpumask; static void tsc_check_slave(void *unused) { unsigned int cpu = smp_processor_id(); local_irq_disable(); while ( !cpumask_test_cpu(cpu, &tsc_check_cpumask) ) mb(); check_tsc_warp(cpu_khz, &tsc_max_warp); cpumask_clear_cpu(cpu, &tsc_check_cpumask); local_irq_enable(); } static void tsc_check_reliability(void) { unsigned int cpu = smp_processor_id(); static DEFINE_SPINLOCK(lock); spin_lock(&lock); tsc_check_count++; smp_call_function(tsc_check_slave, NULL, 0); cpumask_andnot(&tsc_check_cpumask, &cpu_online_map, cpumask_of(cpu)); local_irq_disable(); check_tsc_warp(cpu_khz, &tsc_max_warp); local_irq_enable(); while ( !cpumask_empty(&tsc_check_cpumask) ) cpu_relax(); spin_unlock(&lock); } /* * Rendezvous for all CPUs in IRQ context. * Master CPU snapshots the platform timer. * All CPUS snapshot their local TSC and extrapolation of system time. */ struct calibration_rendezvous { cpumask_t cpu_calibration_map; atomic_t semaphore; s_time_t master_stime; u64 master_tsc_stamp; }; /* * Keep TSCs in sync when they run at the same rate, but may stop in * deep-sleep C states. */ static void time_calibration_tsc_rendezvous(void *_r) { int i; struct cpu_calibration *c = &this_cpu(cpu_calibration); struct calibration_rendezvous *r = _r; unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map); /* Loop to get rid of cache effects on TSC skew. */ for ( i = 4; i >= 0; i-- ) { if ( smp_processor_id() == 0 ) { while ( atomic_read(&r->semaphore) != (total_cpus - 1) ) mb(); if ( r->master_stime == 0 ) { r->master_stime = read_platform_stime(); rdtscll(r->master_tsc_stamp); } atomic_inc(&r->semaphore); if ( i == 0 ) write_tsc(r->master_tsc_stamp); while ( atomic_read(&r->semaphore) != (2*total_cpus - 1) ) mb(); atomic_set(&r->semaphore, 0); } else { atomic_inc(&r->semaphore); while ( atomic_read(&r->semaphore) < total_cpus ) mb(); if ( i == 0 ) write_tsc(r->master_tsc_stamp); atomic_inc(&r->semaphore); while ( atomic_read(&r->semaphore) > total_cpus ) mb(); } } rdtscll(c->local_tsc_stamp); c->stime_local_stamp = get_s_time(); c->stime_master_stamp = r->master_stime; raise_softirq(TIME_CALIBRATE_SOFTIRQ); } /* Ordinary rendezvous function which does not modify TSC values. 
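 * CPU0 waits for all other CPUs to check in, takes a single platform-timer
 * snapshot, and every CPU then records its local TSC and system time
 * against that master timestamp.  Contrast with
 * time_calibration_tsc_rendezvous() above, which additionally rewrites the
 * TSCs to keep them synchronised across deep C-states.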
*/ static void time_calibration_std_rendezvous(void *_r) { struct cpu_calibration *c = &this_cpu(cpu_calibration); struct calibration_rendezvous *r = _r; unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map); if ( smp_processor_id() == 0 ) { while ( atomic_read(&r->semaphore) != (total_cpus - 1) ) cpu_relax(); r->master_stime = read_platform_stime(); mb(); /* write r->master_stime /then/ signal */ atomic_inc(&r->semaphore); } else { atomic_inc(&r->semaphore); while ( atomic_read(&r->semaphore) != total_cpus ) cpu_relax(); mb(); /* receive signal /then/ read r->master_stime */ } rdtscll(c->local_tsc_stamp); c->stime_local_stamp = get_s_time(); c->stime_master_stamp = r->master_stime; raise_softirq(TIME_CALIBRATE_SOFTIRQ); } static void (*time_calibration_rendezvous_fn)(void *) = time_calibration_std_rendezvous; static void time_calibration(void *unused) { struct calibration_rendezvous r = { .semaphore = ATOMIC_INIT(0) }; cpumask_copy(&r.cpu_calibration_map, &cpu_online_map); /* @wait=1 because we must wait for all cpus before freeing @r. */ on_selected_cpus(&r.cpu_calibration_map, time_calibration_rendezvous_fn, &r, 1); } void init_percpu_time(void) { struct cpu_time *t = &this_cpu(cpu_time); unsigned long flags; s_time_t now; /* Initial estimate for TSC rate. */ this_cpu(cpu_time).tsc_scale = per_cpu(cpu_time, 0).tsc_scale; local_irq_save(flags); rdtscll(t->local_tsc_stamp); now = read_platform_stime(); local_irq_restore(flags); t->stime_master_stamp = now; t->stime_local_stamp = now; } /* * On certain older Intel CPUs writing the TSC MSR clears the upper 32 bits. * Obviously we must not use write_tsc() on such CPUs. * * Additionally, AMD specifies that being able to write the TSC MSR is not an * architectural feature (but, other than their manual says, also cannot be * determined from CPUID bits). */ static void __init tsc_check_writability(void) { const char *what = NULL; uint64_t tsc; /* * If all CPUs are reported as synchronised and in sync, we never write * the TSCs (except unavoidably, when a CPU is physically hot-plugged). * Hence testing for writability is pointless and even harmful. */ if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) return; rdtscll(tsc); if ( wrmsr_safe(MSR_IA32_TSC, 0) == 0 ) { uint64_t tmp, tmp2; rdtscll(tmp2); write_tsc(tsc | (1ULL << 32)); rdtscll(tmp); if ( ABS((s64)tmp - (s64)tmp2) < (1LL << 31) ) what = "only partially"; } else { what = "not"; } /* Nothing to do if the TSC is fully writable. */ if ( !what ) { /* * Paranoia - write back original TSC value. However, APs get synced * with BSP as they are brought up, so this doesn't much matter. */ write_tsc(tsc); return; } printk(XENLOG_WARNING "TSC %s writable\n", what); /* time_calibration_tsc_rendezvous() must not be used */ setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC); /* cstate_restore_tsc() must not be used (or do nothing) */ if ( !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ) cpuidle_disable_deep_cstate(); /* synchronize_tsc_slave() must do nothing */ disable_tsc_sync = 1; } /* Late init function, after all cpus have booted */ static int __init verify_tsc_reliability(void) { if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) { /* * Sadly, despite processor vendors' best design guidance efforts, on * some systems, cpus may come out of reset improperly synchronized. * So we must verify there is no warp and we can't do that until all * CPUs are booted. 
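 * tsc_check_reliability() runs the warp test on all online CPUs
 * concurrently; any non-zero tsc_max_warp seen here clears the
 * TSC_RELIABLE feature flag again below.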
*/ tsc_check_reliability(); if ( tsc_max_warp ) { printk("%s: TSC warp detected, disabling TSC_RELIABLE\n", __func__); setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE); } } return 0; } __initcall(verify_tsc_reliability); /* Late init function (after interrupts are enabled). */ int __init init_xen_time(void) { tsc_check_writability(); /* If we have constant-rate TSCs then scale factor can be shared. */ if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ) { /* If TSCs are not marked as 'reliable', re-sync during rendezvous. */ if ( !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous; } open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration); /* System time (get_s_time()) starts ticking from now. */ rdtscll(this_cpu(cpu_time).local_tsc_stamp); /* NB. get_cmos_time() can take over one second to execute. */ do_settime(get_cmos_time(), 0, NOW()); init_platform_timer(); init_percpu_time(); init_timer(&calibration_timer, time_calibration, NULL, 0); set_timer(&calibration_timer, NOW() + EPOCH); return 0; } /* Early init function. */ void __init early_time_init(void) { u64 tmp = init_pit_and_calibrate_tsc(); set_time_scale(&this_cpu(cpu_time).tsc_scale, tmp); do_div(tmp, 1000); cpu_khz = (unsigned long)tmp; printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); setup_irq(0, &irq0); } /* keep pit enabled for pit_broadcast working while cpuidle enabled */ static int _disable_pit_irq(void(*hpet_broadcast_setup)(void)) { int ret = 1; if ( using_pit || !cpu_has_apic ) return -1; /* * If we do not rely on PIT CH0 then we can use HPET for one-shot timer * emulation when entering deep C states. * XXX dom0 may rely on RTC interrupt delivery, so only enable * hpet_broadcast if FSB mode available or if force_hpet_broadcast. */ if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_ARAT) ) { hpet_broadcast_setup(); if ( !hpet_broadcast_is_available() ) { if ( xen_cpuidle > 0 ) { printk("%ps() failed, turning to PIT broadcast\n", hpet_broadcast_setup); return -1; } ret = 0; } } /* Disable PIT CH0 timer interrupt. */ outb_p(0x30, PIT_MODE); outb_p(0, PIT_CH0); outb_p(0, PIT_CH0); return ret; } static int __init disable_pit_irq(void) { if ( !_disable_pit_irq(hpet_broadcast_init) ) { xen_cpuidle = 0; printk("CPUIDLE: disabled due to no HPET. " "Force enable with 'cpuidle'.\n"); } return 0; } __initcall(disable_pit_irq); void pit_broadcast_enter(void) { cpumask_set_cpu(smp_processor_id(), &pit_broadcast_mask); } void pit_broadcast_exit(void) { int cpu = smp_processor_id(); if ( cpumask_test_and_clear_cpu(cpu, &pit_broadcast_mask) ) reprogram_timer(this_cpu(timer_deadline)); } int pit_broadcast_is_available(void) { return cpuidle_using_deep_cstate(); } void send_timer_event(struct vcpu *v) { send_guest_vcpu_virq(v, VIRQ_TIMER); } /* Return secs after 00:00:00 localtime, 1 January, 1970. */ unsigned long get_localtime(struct domain *d) { return wc_sec + (wc_nsec + NOW()) / 1000000000ULL + d->time_offset_seconds; } /* Return microsecs after 00:00:00 localtime, 1 January, 1970. */ uint64_t get_localtime_us(struct domain *d) { return ((wc_sec + d->time_offset_seconds) * 1000000000ULL + wc_nsec + NOW()) / 1000UL; } unsigned long get_sec(void) { return wc_sec + (wc_nsec + NOW()) / 1000000000ULL; } /* "cmos_utc_offset" is the difference between UTC time and CMOS time. 
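 * It is captured in time_suspend() as (Xen wallclock - RTC reading) and
 * added back to the RTC reading in time_resume(), so the wallclock is
 * restored correctly even when the RTC is not kept in UTC.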
*/ static long cmos_utc_offset; /* in seconds */ int time_suspend(void) { if ( smp_processor_id() == 0 ) { cmos_utc_offset = -get_cmos_time(); cmos_utc_offset += (wc_sec + (wc_nsec + NOW()) / 1000000000ULL); kill_timer(&calibration_timer); /* Sync platform timer stamps. */ platform_time_calibration(); } /* Better to cancel calibration timer for accuracy. */ clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id())); return 0; } int time_resume(void) { init_pit_and_calibrate_tsc(); resume_platform_timer(); if ( !_disable_pit_irq(hpet_broadcast_resume) ) BUG(); init_percpu_time(); set_timer(&calibration_timer, NOW() + EPOCH); do_settime(get_cmos_time() + cmos_utc_offset, 0, NOW()); update_vcpu_system_time(current); update_domain_rtc(); return 0; } int dom0_pit_access(struct ioreq *ioreq) { /* Is Xen using Channel 2? Then disallow direct dom0 access. */ if ( using_pit ) return 0; switch ( ioreq->addr ) { case PIT_CH2: if ( ioreq->dir == IOREQ_READ ) ioreq->data = inb(PIT_CH2); else outb(ioreq->data, PIT_CH2); return 1; case PIT_MODE: if ( ioreq->dir == IOREQ_READ ) return 0; /* urk! */ switch ( ioreq->data & 0xc0 ) { case 0xc0: /* Read Back */ if ( ioreq->data & 0x08 ) /* Select Channel 2? */ outb(ioreq->data & 0xf8, PIT_MODE); if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */ return 1; /* no - we're done */ /* Filter Channel 2 and reserved bit 0. */ ioreq->data &= ~0x09; return 0; /* emulate ch0/1 readback */ case 0x80: /* Select Counter 2 */ outb(ioreq->data, PIT_MODE); return 1; } case 0x61: if ( ioreq->dir == IOREQ_READ ) ioreq->data = inb(0x61); else outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61); return 1; } return 0; } struct tm wallclock_time(void) { uint64_t seconds; if ( !wc_sec ) return (struct tm) { 0 }; seconds = NOW() + (wc_sec * 1000000000ull) + wc_nsec; do_div(seconds, 1000000000); return gmtime(seconds); } /* * PV SoftTSC Emulation. */ /* * tsc=unstable: Override all tests; assume TSC is unreliable. * tsc=skewed: Assume TSCs are individually reliable, but skewed across CPUs. */ static void __init tsc_parse(const char *s) { if ( !strcmp(s, "unstable") ) { setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC); setup_clear_cpu_cap(X86_FEATURE_NONSTOP_TSC); setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE); } else if ( !strcmp(s, "skewed") ) { setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE); } } custom_param("tsc", tsc_parse); u64 gtime_to_gtsc(struct domain *d, u64 time) { if ( !is_hvm_domain(d) ) time = max_t(s64, time - d->arch.vtsc_offset, 0); return scale_delta(time, &d->arch.ns_to_vtsc); } u64 gtsc_to_gtime(struct domain *d, u64 tsc) { u64 time = scale_delta(tsc, &d->arch.vtsc_to_ns); if ( !is_hvm_domain(d) ) time += d->arch.vtsc_offset; return time; } void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs, int rdtscp) { s_time_t now = get_s_time(); struct domain *d = v->domain; spin_lock(&d->arch.vtsc_lock); if ( guest_kernel_mode(v, regs) ) d->arch.vtsc_kerncount++; else d->arch.vtsc_usercount++; if ( (int64_t)(now - d->arch.vtsc_last) > 0 ) d->arch.vtsc_last = now; else now = ++d->arch.vtsc_last; spin_unlock(&d->arch.vtsc_lock); now = gtime_to_gtsc(d, now); regs->eax = (uint32_t)now; regs->edx = (uint32_t)(now >> 32); if ( rdtscp ) regs->ecx = (d->arch.tsc_mode == TSC_MODE_PVRDTSCP) ? 
d->arch.incarnation : 0; } int host_tsc_is_safe(void) { return boot_cpu_has(X86_FEATURE_TSC_RELIABLE); } void cpuid_time_leaf(uint32_t sub_idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { struct domain *d = current->domain; uint64_t offset; switch ( sub_idx ) { case 0: /* features */ *eax = ( ( (!!d->arch.vtsc) << 0 ) | ( (!!host_tsc_is_safe()) << 1 ) | ( (!!boot_cpu_has(X86_FEATURE_RDTSCP)) << 2 ) | 0 ); *ebx = d->arch.tsc_mode; *ecx = d->arch.tsc_khz; *edx = d->arch.incarnation; break; case 1: /* scale and offset */ if ( !d->arch.vtsc ) offset = d->arch.vtsc_offset; else /* offset already applied to value returned by virtual rdtscp */ offset = 0; *eax = (uint32_t)offset; *ebx = (uint32_t)(offset >> 32); *ecx = d->arch.vtsc_to_ns.mul_frac; *edx = (s8)d->arch.vtsc_to_ns.shift; break; case 2: /* physical cpu_khz */ *eax = cpu_khz; *ebx = *ecx = *edx = 0; break; default: *eax = *ebx = *ecx = *edx = 0; } } /* * called to collect tsc-related data only for save file or live * migrate; called after last rdtsc is done on this incarnation */ void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec, uint32_t *gtsc_khz, uint32_t *incarnation) { *incarnation = d->arch.incarnation; *tsc_mode = d->arch.tsc_mode; switch ( *tsc_mode ) { case TSC_MODE_NEVER_EMULATE: *elapsed_nsec = *gtsc_khz = 0; break; case TSC_MODE_ALWAYS_EMULATE: *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; *gtsc_khz = d->arch.tsc_khz; break; case TSC_MODE_DEFAULT: if ( d->arch.vtsc ) { *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; *gtsc_khz = d->arch.tsc_khz; } else { uint64_t tsc = 0; rdtscll(tsc); *elapsed_nsec = scale_delta(tsc,&d->arch.vtsc_to_ns); *gtsc_khz = cpu_khz; } break; case TSC_MODE_PVRDTSCP: if ( d->arch.vtsc ) { *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; *gtsc_khz = cpu_khz; } else { uint64_t tsc = 0; rdtscll(tsc); *elapsed_nsec = (scale_delta(tsc,&d->arch.vtsc_to_ns) - d->arch.vtsc_offset); *gtsc_khz = 0; /* ignored by tsc_set_info */ } break; } if ( (int64_t)*elapsed_nsec < 0 ) *elapsed_nsec = 0; } /* * This may be called as many as three times for a domain, once when the * hypervisor creates the domain, once when the toolstack creates the * domain and, if restoring/migrating, once when saved/migrated values * are restored. Care must be taken that, if multiple calls occur, * only the last "sticks" and all are completed before the guest executes * an rdtsc instruction */ void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec, uint32_t gtsc_khz, uint32_t incarnation) { if ( is_idle_domain(d) || (d->domain_id == 0) ) { d->arch.vtsc = 0; return; } if ( is_pvh_domain(d) ) { /* * PVH fixme: support more tsc modes. * * NB: The reason this is disabled here appears to be with * additional support required to do the PV RDTSC emulation. * Since we're no longer taking the PV emulation path for * anything, we may be able to remove this restriction. * * pvhfixme: Experiments show that "default" works for PVH, * but "always_emulate" does not for some reason. Figure out * why. */ switch ( tsc_mode ) { case TSC_MODE_NEVER_EMULATE: break; default: printk(XENLOG_WARNING "PVH currently does not support tsc emulation. 
Setting timer_mode = never_emulate\n"); /* FALLTHRU */ case TSC_MODE_DEFAULT: tsc_mode = TSC_MODE_NEVER_EMULATE; break; } } switch ( d->arch.tsc_mode = tsc_mode ) { case TSC_MODE_NEVER_EMULATE: d->arch.vtsc = 0; break; case TSC_MODE_ALWAYS_EMULATE: d->arch.vtsc = 1; d->arch.vtsc_offset = get_s_time() - elapsed_nsec; d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz; set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 ); d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns); break; case TSC_MODE_DEFAULT: d->arch.vtsc = 1; d->arch.vtsc_offset = get_s_time() - elapsed_nsec; d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz; set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 ); /* use native TSC if initial host has safe TSC, has not migrated * yet and tsc_khz == cpu_khz */ if ( host_tsc_is_safe() && incarnation == 0 && d->arch.tsc_khz == cpu_khz ) d->arch.vtsc = 0; else d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns); break; case TSC_MODE_PVRDTSCP: d->arch.vtsc = boot_cpu_has(X86_FEATURE_RDTSCP) && host_tsc_is_safe() ? 0 : 1; d->arch.tsc_khz = cpu_khz; set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 ); d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns); if ( d->arch.vtsc ) d->arch.vtsc_offset = get_s_time() - elapsed_nsec; else { /* when using native TSC, offset is nsec relative to power-on * of physical machine */ uint64_t tsc = 0; rdtscll(tsc); d->arch.vtsc_offset = scale_delta(tsc,&d->arch.vtsc_to_ns) - elapsed_nsec; } break; } d->arch.incarnation = incarnation + 1; if ( is_hvm_domain(d) ) hvm_set_rdtsc_exiting(d, d->arch.vtsc); } /* vtsc may incur measurable performance degradation, diagnose with this */ static void dump_softtsc(unsigned char key) { struct domain *d; int domcnt = 0; tsc_check_reliability(); if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) printk("TSC marked as reliable, " "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count); else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) ) { printk("TSC has constant rate, "); if (max_cstate <= 2 && tsc_max_warp == 0) printk("no deep Cstates, passed warp test, deemed reliable, "); else printk("deep Cstates possible, so not reliable, "); printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count); } else printk("TSC not marked as either constant or reliable, " "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count); for_each_domain ( d ) { if ( d->domain_id == 0 && d->arch.tsc_mode == TSC_MODE_DEFAULT ) continue; printk("dom%u%s: mode=%d",d->domain_id, is_hvm_domain(d) ? 
"(hvm)" : "", d->arch.tsc_mode); if ( d->arch.vtsc_offset ) printk(",ofs=%#"PRIx64, d->arch.vtsc_offset); if ( d->arch.tsc_khz ) printk(",khz=%"PRIu32, d->arch.tsc_khz); if ( d->arch.incarnation ) printk(",inc=%"PRIu32, d->arch.incarnation); if ( !(d->arch.vtsc_kerncount | d->arch.vtsc_usercount) ) { printk("\n"); continue; } if ( is_hvm_domain(d) ) printk(",vtsc count: %"PRIu64" total\n", d->arch.vtsc_kerncount); else printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n", d->arch.vtsc_kerncount, d->arch.vtsc_usercount); domcnt++; } if ( !domcnt ) printk("No domains have emulated TSC\n"); } static struct keyhandler dump_softtsc_keyhandler = { .diagnostic = 1, .u.fn = dump_softtsc, .desc = "dump softtsc stats" }; static int __init setup_dump_softtsc(void) { register_keyhandler('s', &dump_softtsc_keyhandler); return 0; } __initcall(setup_dump_softtsc); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/msi.c0000664000175000017500000011360412307313555013540 0ustar smbsmb/* * File: msi.c * Purpose: PCI Message Signaled Interrupt (MSI) * * Copyright (C) 2003-2004 Intel * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static s8 __read_mostly use_msi = -1; boolean_param("msi", use_msi); /* bitmap indicate which fixed map is free */ static DEFINE_SPINLOCK(msix_fixmap_lock); static DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES); static DEFINE_PER_CPU(cpumask_var_t, scratch_mask); static int msix_fixmap_alloc(void) { int i, rc = -ENOMEM; spin_lock(&msix_fixmap_lock); for ( i = 0; i < FIX_MSIX_MAX_PAGES; i++ ) if ( !test_bit(i, &msix_fixmap_pages) ) break; if ( i == FIX_MSIX_MAX_PAGES ) goto out; rc = FIX_MSIX_IO_RESERV_BASE + i; set_bit(i, &msix_fixmap_pages); out: spin_unlock(&msix_fixmap_lock); return rc; } static void msix_fixmap_free(int idx) { spin_lock(&msix_fixmap_lock); if ( idx >= FIX_MSIX_IO_RESERV_BASE ) clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages); spin_unlock(&msix_fixmap_lock); } static int msix_get_fixmap(struct arch_msix *msix, u64 table_paddr, u64 entry_paddr) { long nr_page; int idx; nr_page = (entry_paddr >> PAGE_SHIFT) - (table_paddr >> PAGE_SHIFT); if ( nr_page < 0 || nr_page >= MAX_MSIX_TABLE_PAGES ) return -EINVAL; spin_lock(&msix->table_lock); if ( msix->table_refcnt[nr_page]++ == 0 ) { idx = msix_fixmap_alloc(); if ( idx < 0 ) { msix->table_refcnt[nr_page]--; goto out; } set_fixmap_nocache(idx, entry_paddr); msix->table_idx[nr_page] = idx; } else idx = msix->table_idx[nr_page]; out: spin_unlock(&msix->table_lock); return idx; } static void msix_put_fixmap(struct arch_msix *msix, int idx) { int i; spin_lock(&msix->table_lock); for ( i = 0; i < MAX_MSIX_TABLE_PAGES; i++ ) { if ( msix->table_idx[i] == idx ) break; } if ( i == MAX_MSIX_TABLE_PAGES ) goto out; if ( --msix->table_refcnt[i] == 0 ) { __set_fixmap(idx, 0, 0); msix_fixmap_free(idx); msix->table_idx[i] = 0; } out: spin_unlock(&msix->table_lock); } /* * MSI message composition */ void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg *msg) { unsigned dest; memset(msg, 0, sizeof(*msg)); if ( !cpumask_intersects(cpu_mask, &cpu_online_map) ) { dprintk(XENLOG_ERR,"%s, compose msi message error!!\n", __func__); return; } if ( vector ) { cpumask_t 
*mask = this_cpu(scratch_mask); cpumask_and(mask, cpu_mask, &cpu_online_map); dest = cpu_mask_to_apicid(mask); msg->address_hi = MSI_ADDR_BASE_HI; msg->address_lo = MSI_ADDR_BASE_LO | ((INT_DEST_MODE == 0) ? MSI_ADDR_DESTMODE_PHYS: MSI_ADDR_DESTMODE_LOGIC) | ((INT_DELIVERY_MODE != dest_LowestPrio) ? MSI_ADDR_REDIRECTION_CPU: MSI_ADDR_REDIRECTION_LOWPRI) | MSI_ADDR_DEST_ID(dest); msg->dest32 = dest; msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | ((INT_DELIVERY_MODE != dest_LowestPrio) ? MSI_DATA_DELIVERY_FIXED: MSI_DATA_DELIVERY_LOWPRI) | MSI_DATA_VECTOR(vector); } } static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { switch ( entry->msi_attrib.type ) { case PCI_CAP_ID_MSI: { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; u16 data, seg = dev->seg; u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); msg->address_lo = pci_conf_read32(seg, bus, slot, func, msi_lower_address_reg(pos)); if ( entry->msi_attrib.is_64 ) { msg->address_hi = pci_conf_read32(seg, bus, slot, func, msi_upper_address_reg(pos)); data = pci_conf_read16(seg, bus, slot, func, msi_data_reg(pos, 1)); } else { msg->address_hi = 0; data = pci_conf_read16(seg, bus, slot, func, msi_data_reg(pos, 0)); } msg->data = data; break; } case PCI_CAP_ID_MSIX: { void __iomem *base; base = entry->mask_base; msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); break; } default: BUG(); } if ( iommu_intremap ) iommu_read_msi_from_ire(entry, msg); } static int write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { entry->msg = *msg; if ( iommu_intremap ) { int rc; ASSERT(msg != &entry->msg); rc = iommu_update_ire_from_msi(entry, msg); if ( rc ) return rc; } switch ( entry->msi_attrib.type ) { case PCI_CAP_ID_MSI: { struct pci_dev *dev = entry->dev; int pos = entry->msi_attrib.pos; u16 seg = dev->seg; u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); int nr = entry->msi_attrib.entry_nr; ASSERT((msg->data & (entry[-nr].msi.nvec - 1)) == nr); if ( nr ) return 0; pci_conf_write32(seg, bus, slot, func, msi_lower_address_reg(pos), msg->address_lo); if ( entry->msi_attrib.is_64 ) { pci_conf_write32(seg, bus, slot, func, msi_upper_address_reg(pos), msg->address_hi); pci_conf_write16(seg, bus, slot, func, msi_data_reg(pos, 1), msg->data); } else pci_conf_write16(seg, bus, slot, func, msi_data_reg(pos, 0), msg->data); break; } case PCI_CAP_ID_MSIX: { void __iomem *base; base = entry->mask_base; writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); break; } default: BUG(); } return 0; } void set_msi_affinity(struct irq_desc *desc, const cpumask_t *mask) { struct msi_msg msg; unsigned int dest; struct msi_desc *msi_desc = desc->msi_desc; dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID || !msi_desc) return; ASSERT(spin_is_locked(&desc->lock)); memset(&msg, 0, sizeof(msg)); read_msi_msg(msi_desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(desc->arch.vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); msg.dest32 = dest; write_msi_msg(msi_desc, &msg); } void __msi_set_enable(u16 seg, u8 bus, u8 slot, u8 func, int pos, int enable) { u16 control = pci_conf_read16(seg, bus, slot, func, pos + 
PCI_MSI_FLAGS); control &= ~PCI_MSI_FLAGS_ENABLE; if ( enable ) control |= PCI_MSI_FLAGS_ENABLE; pci_conf_write16(seg, bus, slot, func, pos + PCI_MSI_FLAGS, control); } static void msi_set_enable(struct pci_dev *dev, int enable) { int pos; u16 seg = dev->seg; u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI); if ( pos ) __msi_set_enable(seg, bus, slot, func, pos, enable); } static void msix_set_enable(struct pci_dev *dev, int enable) { int pos; u16 control, seg = dev->seg; u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); if ( pos ) { control = pci_conf_read16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS); control &= ~PCI_MSIX_FLAGS_ENABLE; if ( enable ) control |= PCI_MSIX_FLAGS_ENABLE; pci_conf_write16(seg, bus, slot, func, pos + PCI_MSIX_FLAGS, control); } } int msi_maskable_irq(const struct msi_desc *entry) { BUG_ON(!entry); return entry->msi_attrib.type != PCI_CAP_ID_MSI || entry->msi_attrib.maskbit; } static void msi_set_mask_bit(struct irq_desc *desc, int flag) { struct msi_desc *entry = desc->msi_desc; ASSERT(spin_is_locked(&desc->lock)); BUG_ON(!entry || !entry->dev); switch (entry->msi_attrib.type) { case PCI_CAP_ID_MSI: if (entry->msi_attrib.maskbit) { u32 mask_bits; u16 seg = entry->dev->seg; u8 bus = entry->dev->bus; u8 slot = PCI_SLOT(entry->dev->devfn); u8 func = PCI_FUNC(entry->dev->devfn); mask_bits = pci_conf_read32(seg, bus, slot, func, entry->msi.mpos); mask_bits &= ~((u32)1 << entry->msi_attrib.entry_nr); mask_bits |= (u32)flag << entry->msi_attrib.entry_nr; pci_conf_write32(seg, bus, slot, func, entry->msi.mpos, mask_bits); } break; case PCI_CAP_ID_MSIX: { int offset = PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET; writel(flag, entry->mask_base + offset); readl(entry->mask_base + offset); break; } default: BUG(); break; } entry->msi_attrib.masked = !!flag; } static int msi_get_mask_bit(const struct msi_desc *entry) { switch (entry->msi_attrib.type) { case PCI_CAP_ID_MSI: if (!entry->dev || !entry->msi_attrib.maskbit) break; return (pci_conf_read32(entry->dev->seg, entry->dev->bus, PCI_SLOT(entry->dev->devfn), PCI_FUNC(entry->dev->devfn), entry->msi.mpos) >> entry->msi_attrib.entry_nr) & 1; case PCI_CAP_ID_MSIX: return readl(entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) & 1; } return -1; } void mask_msi_irq(struct irq_desc *desc) { msi_set_mask_bit(desc, 1); } void unmask_msi_irq(struct irq_desc *desc) { msi_set_mask_bit(desc, 0); } static unsigned int startup_msi_irq(struct irq_desc *desc) { unmask_msi_irq(desc); return 0; } void ack_nonmaskable_msi_irq(struct irq_desc *desc) { irq_complete_move(desc); move_native_irq(desc); } static void ack_maskable_msi_irq(struct irq_desc *desc) { ack_nonmaskable_msi_irq(desc); ack_APIC_irq(); /* ACKTYPE_NONE */ } void end_nonmaskable_msi_irq(struct irq_desc *desc, u8 vector) { ack_APIC_irq(); /* ACKTYPE_EOI */ } /* * IRQ chip for MSI PCI/PCI-X/PCI-Express devices, * which implement the MSI or MSI-X capability structure. */ static hw_irq_controller pci_msi_maskable = { .typename = "PCI-MSI/-X", .startup = startup_msi_irq, .shutdown = mask_msi_irq, .enable = unmask_msi_irq, .disable = mask_msi_irq, .ack = ack_maskable_msi_irq, .set_affinity = set_msi_affinity }; /* As above, but without having masking capability. 
*/ static hw_irq_controller pci_msi_nonmaskable = { .typename = "PCI-MSI", .startup = irq_startup_none, .shutdown = irq_shutdown_none, .enable = irq_enable_none, .disable = irq_disable_none, .ack = ack_nonmaskable_msi_irq, .end = end_nonmaskable_msi_irq, .set_affinity = set_msi_affinity }; static struct msi_desc *alloc_msi_entry(unsigned int nr) { struct msi_desc *entry; entry = xmalloc_array(struct msi_desc, nr); if ( !entry ) return NULL; INIT_LIST_HEAD(&entry->list); while ( nr-- ) { entry[nr].dev = NULL; entry[nr].remap_index = -1; } return entry; } int setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc) { return __setup_msi_irq(desc, msidesc, msi_maskable_irq(msidesc) ? &pci_msi_maskable : &pci_msi_nonmaskable); } int __setup_msi_irq(struct irq_desc *desc, struct msi_desc *msidesc, hw_irq_controller *handler) { struct msi_msg msg; desc->msi_desc = msidesc; desc->handler = handler; msi_compose_msg(desc->arch.vector, desc->arch.cpu_mask, &msg); return write_msi_msg(msidesc, &msg); } int msi_free_irq(struct msi_desc *entry) { unsigned int nr = entry->msi.nvec; if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) { unsigned long start; start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1); msix_put_fixmap(entry->dev->msix, virt_to_fix(start)); nr = 1; } while ( nr-- ) { destroy_irq(entry[nr].irq); /* Free the unused IRTE if intr remap enabled */ if ( iommu_intremap ) iommu_update_ire_from_msi(entry + nr, NULL); } list_del(&entry->list); xfree(entry); return 0; } static struct msi_desc *find_msi_entry(struct pci_dev *dev, int irq, int cap_id) { struct msi_desc *entry; list_for_each_entry( entry, &dev->msi_list, list ) { if ( entry->msi_attrib.type == cap_id && (irq == -1 || entry->irq == irq) ) return entry; } return NULL; } /** * msi_capability_init - configure device's MSI capability structure * @dev: pointer to the pci_dev data structure of MSI device function * * Setup the MSI capability structure of device function with a single * MSI irq, regardless of device function is capable of handling * multiple messages. A return of zero indicates the successful setup * of an entry zero with the new MSI irq or non-zero for otherwise. 
**/ static int msi_capability_init(struct pci_dev *dev, int irq, struct msi_desc **desc, unsigned int nvec) { struct msi_desc *entry; int pos; unsigned int i, maxvec, mpos; u16 control, seg = dev->seg; u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); ASSERT(spin_is_locked(&pcidevs_lock)); pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSI); control = pci_conf_read16(seg, bus, slot, func, msi_control_reg(pos)); maxvec = multi_msi_capable(control); if ( nvec > maxvec ) return maxvec; control &= ~PCI_MSI_FLAGS_QSIZE; multi_msi_enable(control, nvec); /* MSI Entry Initialization */ msi_set_enable(dev, 0); /* Ensure msi is disabled as I set it up */ entry = alloc_msi_entry(nvec); if ( !entry ) return -ENOMEM; mpos = msi_mask_bits_reg(pos, is_64bit_address(control)); for ( i = 0; i < nvec; ++i ) { entry[i].msi_attrib.type = PCI_CAP_ID_MSI; entry[i].msi_attrib.is_64 = is_64bit_address(control); entry[i].msi_attrib.entry_nr = i; entry[i].msi_attrib.maskbit = is_mask_bit_support(control); entry[i].msi_attrib.masked = 1; entry[i].msi_attrib.pos = pos; if ( entry[i].msi_attrib.maskbit ) entry[i].msi.mpos = mpos; entry[i].msi.nvec = 0; entry[i].dev = dev; } entry->msi.nvec = nvec; entry->irq = irq; if ( entry->msi_attrib.maskbit ) { u32 maskbits; /* All MSIs are unmasked by default, Mask them all */ maskbits = pci_conf_read32(seg, bus, slot, func, mpos); maskbits |= ~(u32)0 >> (32 - maxvec); pci_conf_write32(seg, bus, slot, func, mpos, maskbits); } list_add_tail(&entry->list, &dev->msi_list); *desc = entry; /* Restore the original MSI enabled bits */ pci_conf_write16(seg, bus, slot, func, msi_control_reg(pos), control); return 0; } static u64 read_pci_mem_bar(u16 seg, u8 bus, u8 slot, u8 func, u8 bir, int vf) { u8 limit; u32 addr, base = PCI_BASE_ADDRESS_0; u64 disp = 0; if ( vf >= 0 ) { struct pci_dev *pdev = pci_get_pdev(seg, bus, PCI_DEVFN(slot, func)); unsigned int pos = pci_find_ext_capability(seg, bus, PCI_DEVFN(slot, func), PCI_EXT_CAP_ID_SRIOV); u16 ctrl = pci_conf_read16(seg, bus, slot, func, pos + PCI_SRIOV_CTRL); u16 num_vf = pci_conf_read16(seg, bus, slot, func, pos + PCI_SRIOV_NUM_VF); u16 offset = pci_conf_read16(seg, bus, slot, func, pos + PCI_SRIOV_VF_OFFSET); u16 stride = pci_conf_read16(seg, bus, slot, func, pos + PCI_SRIOV_VF_STRIDE); if ( !pdev || !pos || !(ctrl & PCI_SRIOV_CTRL_VFE) || !(ctrl & PCI_SRIOV_CTRL_MSE) || !num_vf || !offset || (num_vf > 1 && !stride) || bir >= PCI_SRIOV_NUM_BARS || !pdev->vf_rlen[bir] ) return 0; base = pos + PCI_SRIOV_BAR; vf -= PCI_BDF(bus, slot, func) + offset; if ( vf < 0 || (vf && vf % stride) ) return 0; if ( stride ) { if ( vf % stride ) return 0; vf /= stride; } if ( vf >= num_vf ) return 0; BUILD_BUG_ON(ARRAY_SIZE(pdev->vf_rlen) != PCI_SRIOV_NUM_BARS); disp = vf * pdev->vf_rlen[bir]; limit = PCI_SRIOV_NUM_BARS; } else switch ( pci_conf_read8(seg, bus, slot, func, PCI_HEADER_TYPE) & 0x7f ) { case PCI_HEADER_TYPE_NORMAL: limit = 6; break; case PCI_HEADER_TYPE_BRIDGE: limit = 2; break; case PCI_HEADER_TYPE_CARDBUS: limit = 1; break; default: return 0; } if ( bir >= limit ) return 0; addr = pci_conf_read32(seg, bus, slot, func, base + bir * 4); if ( (addr & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO ) return 0; if ( (addr & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64 ) { addr &= PCI_BASE_ADDRESS_MEM_MASK; if ( ++bir >= limit ) return 0; return addr + disp + ((u64)pci_conf_read32(seg, bus, slot, func, base + bir * 4) << 32); } return (addr & PCI_BASE_ADDRESS_MEM_MASK) + 
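/*
 * Illustrative note, not part of the original source: at this point
 * @addr holds a 32-bit memory BAR, so the function returns its masked
 * base plus @disp.  @disp is zero for a physical function; for an
 * SR-IOV virtual function it is the VF's displacement into the PF's
 * VF BAR, vf_index * vf_rlen[bir], as computed above.  As a worked
 * example with a hypothetical vf_rlen[bir] of 16KiB, VF index 3 would
 * map 0xC000 bytes past the VF BAR base.  64-bit BARs were handled by
 * the branch above, which folds in the upper dword read from the next
 * BAR slot.
 */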
disp; } /** * msix_capability_init - configure device's MSI-X capability * @dev: pointer to the pci_dev data structure of MSI-X device function * @entries: pointer to an array of struct msix_entry entries * @nvec: number of @entries * * Setup the MSI-X capability structure of device function with the requested * number MSI-X irqs. A return of zero indicates the successful setup of * requested MSI-X entries with allocated irqs or non-zero for otherwise. **/ static int msix_capability_init(struct pci_dev *dev, struct msi_info *msi, struct msi_desc **desc, unsigned int nr_entries) { struct arch_msix *msix = dev->msix; struct msi_desc *entry = NULL; int pos, vf; u16 control; u64 table_paddr; u32 table_offset; u8 bir, pbus, pslot, pfunc; u16 seg = dev->seg; u8 bus = dev->bus; u8 slot = PCI_SLOT(dev->devfn); u8 func = PCI_FUNC(dev->devfn); ASSERT(spin_is_locked(&pcidevs_lock)); pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */ if ( desc ) { entry = alloc_msi_entry(1); if ( !entry ) return -ENOMEM; ASSERT(msi); } /* Locate MSI-X table region */ table_offset = pci_conf_read32(seg, bus, slot, func, msix_table_offset_reg(pos)); bir = (u8)(table_offset & PCI_MSIX_BIRMASK); table_offset &= ~PCI_MSIX_BIRMASK; if ( !dev->info.is_virtfn ) { pbus = bus; pslot = slot; pfunc = func; vf = -1; } else { pbus = dev->info.physfn.bus; pslot = PCI_SLOT(dev->info.physfn.devfn); pfunc = PCI_FUNC(dev->info.physfn.devfn); vf = PCI_BDF2(dev->bus, dev->devfn); } table_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf); WARN_ON(msi && msi->table_base != table_paddr); if ( !table_paddr ) { if ( !msi || !msi->table_base ) { xfree(entry); return -ENXIO; } table_paddr = msi->table_base; } table_paddr += table_offset; if ( !msix->used_entries ) { u64 pba_paddr; u32 pba_offset; msix->nr_entries = nr_entries; msix->table.first = PFN_DOWN(table_paddr); msix->table.last = PFN_DOWN(table_paddr + nr_entries * PCI_MSIX_ENTRY_SIZE - 1); WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->table.first, msix->table.last)); pba_offset = pci_conf_read32(seg, bus, slot, func, msix_pba_offset_reg(pos)); bir = (u8)(pba_offset & PCI_MSIX_BIRMASK); pba_paddr = read_pci_mem_bar(seg, pbus, pslot, pfunc, bir, vf); WARN_ON(!pba_paddr); pba_paddr += pba_offset & ~PCI_MSIX_BIRMASK; msix->pba.first = PFN_DOWN(pba_paddr); msix->pba.last = PFN_DOWN(pba_paddr + BITS_TO_LONGS(nr_entries) - 1); WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, msix->pba.first, msix->pba.last)); } if ( entry ) { /* Map MSI-X table region */ u64 entry_paddr = table_paddr + msi->entry_nr * PCI_MSIX_ENTRY_SIZE; int idx = msix_get_fixmap(msix, table_paddr, entry_paddr); void __iomem *base; if ( idx < 0 ) { xfree(entry); return idx; } base = (void *)(fix_to_virt(idx) + ((unsigned long)entry_paddr & (PAGE_SIZE - 1))); /* Mask interrupt here */ writel(1, base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); entry->msi_attrib.type = PCI_CAP_ID_MSIX; entry->msi_attrib.is_64 = 1; entry->msi_attrib.entry_nr = msi->entry_nr; entry->msi_attrib.maskbit = 1; entry->msi_attrib.masked = 1; entry->msi_attrib.pos = pos; entry->irq = msi->irq; entry->dev = dev; entry->mask_base = base; list_add_tail(&entry->list, &dev->msi_list); *desc = entry; } if ( !msix->used_entries ) { if ( rangeset_add_range(mmio_ro_ranges, msix->table.first, msix->table.last) ) WARN(); if ( rangeset_add_range(mmio_ro_ranges, msix->pba.first, msix->pba.last) ) WARN(); 
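/*
 * Illustrative note, not part of the original source: the
 * rangeset_add_range() calls above publish the MFNs backing the MSI-X
 * vector table and PBA in mmio_ro_ranges; get_page_from_l1e() in mm.c
 * (later in this tree) downgrades guest mappings of such MFNs to
 * read-only, so a guest cannot scribble over the live MSI-X table
 * behind Xen's back.  The page ranges themselves were computed earlier
 * in this function along the lines of:
 *
 *   first = PFN_DOWN(table_paddr);
 *   last  = PFN_DOWN(table_paddr + nr_entries * PCI_MSIX_ENTRY_SIZE - 1);
 *
 * e.g. a 128-entry table (16 bytes per entry) spans one 4K page when
 * it does not cross a page boundary, two pages otherwise.
 */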
if ( dev->domain ) p2m_change_entry_type_global(dev->domain, p2m_mmio_direct, p2m_mmio_direct); if ( desc && (!dev->domain || !paging_mode_translate(dev->domain)) ) { struct domain *d = dev->domain; if ( !d ) for_each_domain(d) if ( !paging_mode_translate(d) && (iomem_access_permitted(d, msix->table.first, msix->table.last) || iomem_access_permitted(d, msix->pba.first, msix->pba.last)) ) break; if ( d ) { if ( !is_hardware_domain(d) && msix->warned != d->domain_id ) { msix->warned = d->domain_id; printk(XENLOG_ERR "Potentially insecure use of MSI-X on %04x:%02x:%02x.%u by Dom%d\n", seg, bus, slot, func, d->domain_id); } /* XXX How to deal with existing mappings? */ } } } WARN_ON(msix->nr_entries != nr_entries); WARN_ON(msix->table.first != (table_paddr >> PAGE_SHIFT)); ++msix->used_entries; /* Restore MSI-X enabled bits */ pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control); return 0; } /** * pci_enable_msi - configure device's MSI capability structure * @dev: pointer to the pci_dev data structure of MSI device function * * Setup the MSI capability structure of device function with * a single MSI irq upon its software driver call to request for * MSI mode enabled on its hardware device function. A return of zero * indicates the successful setup of an entry zero with the new MSI * irq or non-zero for otherwise. **/ static int __pci_enable_msi(struct msi_info *msi, struct msi_desc **desc) { struct pci_dev *pdev; struct msi_desc *old_desc; ASSERT(spin_is_locked(&pcidevs_lock)); pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn); if ( !pdev ) return -ENODEV; old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSI); if ( old_desc ) { dprintk(XENLOG_WARNING, "irq %d has already mapped to MSI on " "device %04x:%02x:%02x.%01x\n", msi->irq, msi->seg, msi->bus, PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); *desc = old_desc; return 0; } old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSIX); if ( old_desc ) { dprintk(XENLOG_WARNING, "MSI-X is already in use on " "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus, PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); pci_disable_msi(old_desc); } return msi_capability_init(pdev, msi->irq, desc, msi->entry_nr); } static void __pci_disable_msi(struct msi_desc *entry) { struct pci_dev *dev; dev = entry->dev; msi_set_enable(dev, 0); BUG_ON(list_empty(&dev->msi_list)); } /** * pci_enable_msix - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function * @entries: pointer to an array of MSI-X entries * @nvec: number of MSI-X irqs requested for allocation by device driver * * Setup the MSI-X capability structure of device function with the number * of requested irqs upon its software driver call to request for * MSI-X mode enabled on its hardware device function. A return of zero * indicates the successful configuration of MSI-X capability structure * with new allocated MSI-X irqs. A return of < 0 indicates a failure. * Or a return of > 0 indicates that driver request is exceeding the number * of irqs available. Driver should use the returned value to re-send * its request. 
**/ static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc) { int status, pos, nr_entries; struct pci_dev *pdev; u16 control; u8 slot = PCI_SLOT(msi->devfn); u8 func = PCI_FUNC(msi->devfn); struct msi_desc *old_desc; ASSERT(spin_is_locked(&pcidevs_lock)); pdev = pci_get_pdev(msi->seg, msi->bus, msi->devfn); if ( !pdev ) return -ENODEV; pos = pci_find_cap_offset(msi->seg, msi->bus, slot, func, PCI_CAP_ID_MSIX); control = pci_conf_read16(msi->seg, msi->bus, slot, func, msix_control_reg(pos)); nr_entries = multi_msix_capable(control); if (msi->entry_nr >= nr_entries) return -EINVAL; old_desc = find_msi_entry(pdev, msi->irq, PCI_CAP_ID_MSIX); if ( old_desc ) { dprintk(XENLOG_WARNING, "irq %d has already mapped to MSIX on " "device %04x:%02x:%02x.%01x\n", msi->irq, msi->seg, msi->bus, PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); *desc = old_desc; return 0; } old_desc = find_msi_entry(pdev, -1, PCI_CAP_ID_MSI); if ( old_desc ) { dprintk(XENLOG_WARNING, "MSI is already in use on " "device %04x:%02x:%02x.%01x\n", msi->seg, msi->bus, PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); pci_disable_msi(old_desc); } status = msix_capability_init(pdev, msi, desc, nr_entries); return status; } static void _pci_cleanup_msix(struct arch_msix *msix) { if ( !--msix->used_entries ) { if ( rangeset_remove_range(mmio_ro_ranges, msix->table.first, msix->table.last) ) WARN(); if ( rangeset_remove_range(mmio_ro_ranges, msix->pba.first, msix->pba.last) ) WARN(); } } static void __pci_disable_msix(struct msi_desc *entry) { struct pci_dev *dev; int pos; u16 control, seg; u8 bus, slot, func; dev = entry->dev; seg = dev->seg; bus = dev->bus; slot = PCI_SLOT(dev->devfn); func = PCI_FUNC(dev->devfn); pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); msix_set_enable(dev, 0); BUG_ON(list_empty(&dev->msi_list)); writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); pci_conf_write16(seg, bus, slot, func, msix_control_reg(pos), control); _pci_cleanup_msix(dev->msix); } int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off) { int rc; struct pci_dev *pdev; u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn); unsigned int pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); if ( !use_msi ) return 0; if ( !pos ) return -ENODEV; spin_lock(&pcidevs_lock); pdev = pci_get_pdev(seg, bus, devfn); if ( !pdev ) rc = -ENODEV; else if ( pdev->msix->used_entries != !!off ) rc = -EBUSY; else if ( off ) { _pci_cleanup_msix(pdev->msix); rc = 0; } else { u16 control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); rc = msix_capability_init(pdev, NULL, NULL, multi_msix_capable(control)); } spin_unlock(&pcidevs_lock); return rc; } /* * Notice: only construct the msi_desc * no change to irq_desc here, and the interrupt is masked */ int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc) { ASSERT(spin_is_locked(&pcidevs_lock)); if ( !use_msi ) return -EPERM; return msi->table_base ? 
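/*
 * Illustrative note, not part of the original source: the conditional
 * completed below dispatches on msi->table_base - a non-zero MSI-X
 * table base indicates the caller is asking for an MSI-X vector,
 * otherwise classic MSI is set up.  In either case pcidevs_lock must
 * be held (see the ASSERT above) and the returned msi_desc is left
 * masked; unmasking happens later via the irq_desc handler
 * (startup_msi_irq()).
 */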
__pci_enable_msix(msi, desc) : __pci_enable_msi(msi, desc); } /* * Device only, no irq_desc */ void pci_disable_msi(struct msi_desc *msi_desc) { if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) __pci_disable_msi(msi_desc); else if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSIX ) __pci_disable_msix(msi_desc); } static void msi_free_irqs(struct pci_dev* dev) { struct msi_desc *entry, *tmp; list_for_each_entry_safe( entry, tmp, &dev->msi_list, list ) { pci_disable_msi(entry); msi_free_irq(entry); } } void pci_cleanup_msi(struct pci_dev *pdev) { /* Disable MSI and/or MSI-X */ msi_set_enable(pdev, 0); msix_set_enable(pdev, 0); msi_free_irqs(pdev); } int pci_restore_msi_state(struct pci_dev *pdev) { unsigned long flags; int irq; int ret; struct msi_desc *entry, *tmp; struct irq_desc *desc; struct msi_msg msg; ASSERT(spin_is_locked(&pcidevs_lock)); if ( !use_msi ) return -EOPNOTSUPP; if ( !pdev ) return -EINVAL; ret = xsm_resource_setup_pci(XSM_PRIV, (pdev->seg << 16) | (pdev->bus << 8) | pdev->devfn); if ( ret ) return ret; list_for_each_entry_safe( entry, tmp, &pdev->msi_list, list ) { unsigned int i = 0, nr = 1; irq = entry->irq; desc = &irq_desc[irq]; spin_lock_irqsave(&desc->lock, flags); ASSERT(desc->msi_desc == entry); if (desc->msi_desc != entry) { bogus: dprintk(XENLOG_ERR, "Restore MSI for %04x:%02x:%02x:%u entry %u not set?\n", pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), i); spin_unlock_irqrestore(&desc->lock, flags); return -EINVAL; } if ( entry->msi_attrib.type == PCI_CAP_ID_MSI ) { msi_set_enable(pdev, 0); nr = entry->msi.nvec; } else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) msix_set_enable(pdev, 0); msg = entry->msg; write_msi_msg(entry, &msg); for ( i = 0; ; ) { msi_set_mask_bit(desc, entry[i].msi_attrib.masked); if ( !--nr ) break; spin_unlock_irqrestore(&desc->lock, flags); desc = &irq_desc[entry[++i].irq]; spin_lock_irqsave(&desc->lock, flags); if ( desc->msi_desc != entry + i ) goto bogus; } spin_unlock_irqrestore(&desc->lock, flags); if ( entry->msi_attrib.type == PCI_CAP_ID_MSI ) { unsigned int cpos = msi_control_reg(entry->msi_attrib.pos); u16 control = pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), cpos); control &= ~PCI_MSI_FLAGS_QSIZE; multi_msi_enable(control, entry->msi.nvec); pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), cpos, control); msi_set_enable(pdev, 1); } else if ( entry->msi_attrib.type == PCI_CAP_ID_MSIX ) msix_set_enable(pdev, 1); } return 0; } unsigned int pci_msix_get_table_len(struct pci_dev *pdev) { int pos; u16 control, seg = pdev->seg; u8 bus, slot, func; unsigned int len; bus = pdev->bus; slot = PCI_SLOT(pdev->devfn); func = PCI_FUNC(pdev->devfn); pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX); if ( !pos || !use_msi ) return 0; control = pci_conf_read16(seg, bus, slot, func, msix_control_reg(pos)); len = msix_table_size(control) * PCI_MSIX_ENTRY_SIZE; return len; } static int msi_cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch ( action ) { case CPU_UP_PREPARE: if ( !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) ) return notifier_from_errno(ENOMEM); break; case CPU_UP_CANCELED: case CPU_DEAD: free_cpumask_var(per_cpu(scratch_mask, cpu)); break; default: break; } return NOTIFY_DONE; } static struct notifier_block msi_cpu_nfb = { .notifier_call = msi_cpu_callback }; void __init early_msi_init(void) { if ( use_msi < 0 ) use_msi = 
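/*
 * Illustrative note, not part of the original source: use_msi is a
 * tri-state initialised to -1 at the top of this file via
 * boolean_param("msi", use_msi).  -1 means no explicit "msi=" option
 * was given on the Xen command line, in which case the assignment
 * completed below derives the default from the ACPI FADT: MSI stays
 * enabled unless the firmware sets the ACPI_FADT_NO_MSI boot flag.
 * An explicit "msi=0" or "msi=1" would already have replaced the -1
 * and is left untouched here.
 */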
!(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI); if ( !use_msi ) return; register_cpu_notifier(&msi_cpu_nfb); if ( msi_cpu_callback(&msi_cpu_nfb, CPU_UP_PREPARE, NULL) & NOTIFY_STOP_MASK ) BUG(); } static void dump_msi(unsigned char key) { unsigned int irq; printk("MSI information:\n"); for ( irq = 0; irq < nr_irqs; irq++ ) { struct irq_desc *desc = irq_to_desc(irq); const struct msi_desc *entry; u32 addr, data, dest32; signed char mask; struct msi_attrib attr; unsigned long flags; const char *type = "???"; if ( !irq_desc_initialized(desc) ) continue; spin_lock_irqsave(&desc->lock, flags); entry = desc->msi_desc; if ( !entry ) { spin_unlock_irqrestore(&desc->lock, flags); continue; } switch ( entry->msi_attrib.type ) { case PCI_CAP_ID_MSI: type = "MSI"; break; case PCI_CAP_ID_MSIX: type = "MSI-X"; break; case 0: switch ( entry->msi_attrib.pos ) { case MSI_TYPE_HPET: type = "HPET"; break; case MSI_TYPE_IOMMU: type = "IOMMU"; break; } break; } data = entry->msg.data; addr = entry->msg.address_lo; dest32 = entry->msg.dest32; attr = entry->msi_attrib; if ( entry->msi_attrib.type ) mask = msi_get_mask_bit(entry); else mask = -1; spin_unlock_irqrestore(&desc->lock, flags); if ( mask >= 0 ) mask += '0'; else mask = '?'; printk(" %-6s%4u vec=%02x%7s%6s%3sassert%5s%7s" " dest=%08x mask=%d/%d/%c\n", type, irq, (data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT, data & MSI_DATA_DELIVERY_LOWPRI ? "lowest" : "fixed", data & MSI_DATA_TRIGGER_LEVEL ? "level" : "edge", data & MSI_DATA_LEVEL_ASSERT ? "" : "de", addr & MSI_ADDR_DESTMODE_LOGIC ? "log" : "phys", addr & MSI_ADDR_REDIRECTION_LOWPRI ? "lowest" : "cpu", dest32, attr.maskbit, attr.masked, mask); } } static struct keyhandler dump_msi_keyhandler = { .diagnostic = 1, .u.fn = dump_msi, .desc = "dump MSI state" }; static int __init msi_setup_keyhandler(void) { register_keyhandler('M', &dump_msi_keyhandler); return 0; } __initcall(msi_setup_keyhandler); xen-4.4.0/xen/arch/x86/xstate.c0000664000175000017500000002662712307313555014270 0ustar smbsmb/* * arch/x86/xstate.c * * x86 extended state operations * */ #include #include #include #include #include #include #include #include bool_t __read_mostly cpu_has_xsaveopt; /* * Maximum size (in byte) of the XSAVE/XRSTOR save area required by all * the supported and enabled features on the processor, including the * XSAVE.HEADER. We only enable XCNTXT_MASK that we have known. */ static u32 __read_mostly xsave_cntxt_size; /* A 64-bit bitmask of the XSAVE/XRSTOR features supported by processor. */ u64 xfeature_mask; /* Cached xcr0 for fast read */ static DEFINE_PER_CPU(uint64_t, xcr0); /* Because XCR0 is cached for each CPU, xsetbv() is not exposed. Users should * use set_xcr0() instead. */ static inline bool_t xsetbv(u32 index, u64 xfeatures) { u32 hi = xfeatures >> 32; u32 lo = (u32)xfeatures; asm volatile ( "1: .byte 0x0f,0x01,0xd1\n" "3: \n" ".section .fixup,\"ax\" \n" "2: xor %0,%0 \n" " jmp 3b \n" ".previous \n" _ASM_EXTABLE(1b, 2b) : "+a" (lo) : "c" (index), "d" (hi)); return lo != 0; } bool_t set_xcr0(u64 xfeatures) { if ( !xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures) ) return 0; this_cpu(xcr0) = xfeatures; return 1; } uint64_t get_xcr0(void) { return this_cpu(xcr0); } void xsave(struct vcpu *v, uint64_t mask) { struct xsave_struct *ptr = v->arch.xsave_area; uint32_t hmask = mask >> 32; uint32_t lmask = mask; int word_size = mask & XSTATE_FP ? (cpu_has_fpu_sel ? 
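/*
 * Illustrative note, not part of the original source: the conditional
 * completed below encodes, roughly, which x87 save image fixups the
 * code that follows must apply:
 *   -1  XSTATE_FP is clear in @mask, so no FPU state is being saved
 *       and nothing needs fixing up;
 *    0  FPU state is saved but the CPU lacks FCS/FDS selector support,
 *       so the selector fields cannot be relied upon;
 *    8  selector support is present; the 64-bit save form is used,
 *       and the value may later be adjusted to 4 where the 32-bit
 *       form applies (32-bit PV vCPUs, or when FIP/FDP fit in 32 bits).
 * The final value is stashed at FPU_WORD_SIZE_OFFSET in the save image
 * so that xrstor() below can pick the matching restore form.
 */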
8 : 0) : -1; if ( word_size <= 0 || !is_pv_32bit_vcpu(v) ) { typeof(ptr->fpu_sse.fip.sel) fcs = ptr->fpu_sse.fip.sel; typeof(ptr->fpu_sse.fdp.sel) fds = ptr->fpu_sse.fdp.sel; if ( cpu_has_xsaveopt ) { /* * xsaveopt may not write the FPU portion even when the respective * mask bit is set. For the check further down to work we hence * need to put the save image back into the state that it was in * right after the previous xsaveopt. */ if ( word_size > 0 && (ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET] == 4 || ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET] == 2) ) { ptr->fpu_sse.fip.sel = 0; ptr->fpu_sse.fdp.sel = 0; } asm volatile ( ".byte 0x48,0x0f,0xae,0x37" : "=m" (*ptr) : "a" (lmask), "d" (hmask), "D" (ptr) ); } else asm volatile ( ".byte 0x48,0x0f,0xae,0x27" : "=m" (*ptr) : "a" (lmask), "d" (hmask), "D" (ptr) ); if ( !(mask & ptr->xsave_hdr.xstate_bv & XSTATE_FP) || /* * AMD CPUs don't save/restore FDP/FIP/FOP unless an exception * is pending. */ (!(ptr->fpu_sse.fsw & 0x0080) && boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ) { if ( cpu_has_xsaveopt && word_size > 0 ) { ptr->fpu_sse.fip.sel = fcs; ptr->fpu_sse.fdp.sel = fds; } return; } if ( word_size > 0 && !((ptr->fpu_sse.fip.addr | ptr->fpu_sse.fdp.addr) >> 32) ) { struct ix87_env fpu_env; asm volatile ( "fnstenv %0" : "=m" (fpu_env) ); ptr->fpu_sse.fip.sel = fpu_env.fcs; ptr->fpu_sse.fdp.sel = fpu_env.fds; word_size = 4; } } else { if ( cpu_has_xsaveopt ) asm volatile ( ".byte 0x0f,0xae,0x37" : "=m" (*ptr) : "a" (lmask), "d" (hmask), "D" (ptr) ); else asm volatile ( ".byte 0x0f,0xae,0x27" : "=m" (*ptr) : "a" (lmask), "d" (hmask), "D" (ptr) ); word_size = 4; } if ( word_size >= 0 ) ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET] = word_size; } void xrstor(struct vcpu *v, uint64_t mask) { uint32_t hmask = mask >> 32; uint32_t lmask = mask; struct xsave_struct *ptr = v->arch.xsave_area; /* * AMD CPUs don't save/restore FDP/FIP/FOP unless an exception * is pending. Clear the x87 state here by setting it to fixed * values. The hypervisor data segment can be sometimes 0 and * sometimes new user value. Both should be ok. Use the FPU saved * data block as a safe address because it should be in L1. */ if ( (mask & ptr->xsave_hdr.xstate_bv & XSTATE_FP) && !(ptr->fpu_sse.fsw & 0x0080) && boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) asm volatile ( "fnclex\n\t" /* clear exceptions */ "ffree %%st(7)\n\t" /* clear stack tag */ "fildl %0" /* load to clear state */ : : "m" (ptr->fpu_sse) ); /* * XRSTOR can fault if passed a corrupted data block. We handle this * possibility, which may occur if the block was passed to us by control * tools or through VCPUOP_initialise, by silently clearing the block. 
*/ switch ( __builtin_expect(ptr->fpu_sse.x[FPU_WORD_SIZE_OFFSET], 8) ) { default: asm volatile ( "1: .byte 0x48,0x0f,0xae,0x2f\n" ".section .fixup,\"ax\" \n" "2: mov %5,%%ecx \n" " xor %1,%1 \n" " rep stosb \n" " lea %2,%0 \n" " mov %3,%1 \n" " jmp 1b \n" ".previous \n" _ASM_EXTABLE(1b, 2b) : "+&D" (ptr), "+&a" (lmask) : "m" (*ptr), "g" (lmask), "d" (hmask), "m" (xsave_cntxt_size) : "ecx" ); break; case 4: case 2: asm volatile ( "1: .byte 0x0f,0xae,0x2f\n" ".section .fixup,\"ax\" \n" "2: mov %5,%%ecx \n" " xor %1,%1 \n" " rep stosb \n" " lea %2,%0 \n" " mov %3,%1 \n" " jmp 1b \n" ".previous \n" _ASM_EXTABLE(1b, 2b) : "+&D" (ptr), "+&a" (lmask) : "m" (*ptr), "g" (lmask), "d" (hmask), "m" (xsave_cntxt_size) : "ecx" ); break; } } bool_t xsave_enabled(const struct vcpu *v) { if ( !cpu_has_xsave ) return 0; ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE); ASSERT(v->arch.xsave_area); return !!v->arch.xcr0_accum; } int xstate_alloc_save_area(struct vcpu *v) { struct xsave_struct *save_area; if ( !cpu_has_xsave || is_idle_vcpu(v) ) return 0; BUG_ON(xsave_cntxt_size < XSTATE_AREA_MIN_SIZE); /* XSAVE/XRSTOR requires the save area be 64-byte-boundary aligned. */ save_area = _xzalloc(xsave_cntxt_size, 64); if ( save_area == NULL ) return -ENOMEM; /* * Set the memory image to default values, but don't force the context * to be loaded from memory (i.e. keep save_area->xsave_hdr.xstate_bv * clear). */ save_area->fpu_sse.fcw = FCW_DEFAULT; save_area->fpu_sse.mxcsr = MXCSR_DEFAULT; v->arch.xsave_area = save_area; v->arch.xcr0 = 0; v->arch.xcr0_accum = 0; return 0; } void xstate_free_save_area(struct vcpu *v) { xfree(v->arch.xsave_area); v->arch.xsave_area = NULL; } /* Collect the information of processor's extended state */ void xstate_init(bool_t bsp) { u32 eax, ebx, ecx, edx, min_size; u64 feature_mask; if ( boot_cpu_data.cpuid_level < XSTATE_CPUID ) { BUG_ON(!bsp); setup_clear_cpu_cap(X86_FEATURE_XSAVE); return; } cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); BUG_ON((eax & XSTATE_FP_SSE) != XSTATE_FP_SSE); BUG_ON((eax & XSTATE_YMM) && !(eax & XSTATE_SSE)); feature_mask = (((u64)edx << 32) | eax) & XCNTXT_MASK; /* FP/SSE, XSAVE.HEADER, YMM */ min_size = XSTATE_AREA_MIN_SIZE; if ( eax & XSTATE_YMM ) min_size += XSTATE_YMM_SIZE; BUG_ON(ecx < min_size); /* * Set CR4_OSXSAVE and run "cpuid" to get xsave_cntxt_size. */ set_in_cr4(X86_CR4_OSXSAVE); if ( !set_xcr0(feature_mask) ) BUG(); if ( bsp ) { xfeature_mask = feature_mask; /* * xsave_cntxt_size is the max size required by enabled features. * We know FP/SSE and YMM about eax, and nothing about edx at present. */ xsave_cntxt_size = xstate_ctxt_size(feature_mask); printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n", __func__, xsave_cntxt_size, xfeature_mask); } else { BUG_ON(xfeature_mask != feature_mask); BUG_ON(xsave_cntxt_size != xstate_ctxt_size(feature_mask)); } /* Check XSAVEOPT feature. 
*/ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); if ( bsp ) cpu_has_xsaveopt = !!(eax & XSTATE_FEATURE_XSAVEOPT); else BUG_ON(!cpu_has_xsaveopt != !(eax & XSTATE_FEATURE_XSAVEOPT)); } unsigned int xstate_ctxt_size(u64 xcr0) { u32 ebx = 0; if ( xcr0 ) { u64 act_xcr0 = get_xcr0(); u32 eax, ecx, edx; bool_t ok = set_xcr0(xcr0); ASSERT(ok); cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); ASSERT(ebx <= ecx); ok = set_xcr0(act_xcr0); ASSERT(ok); } return ebx; } int validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, u64 xfeat_mask) { if ( (xcr0_accum & ~xfeat_mask) || (xstate_bv & ~xcr0_accum) || (xcr0 & ~xcr0_accum) || !(xcr0 & XSTATE_FP) || ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) || ((xcr0_accum & XSTATE_YMM) && !(xcr0_accum & XSTATE_SSE)) ) return -EINVAL; if ( xcr0_accum & ~xfeature_mask ) return -EOPNOTSUPP; return 0; } int handle_xsetbv(u32 index, u64 new_bv) { struct vcpu *curr = current; u64 mask; if ( index != XCR_XFEATURE_ENABLED_MASK ) return -EOPNOTSUPP; if ( (new_bv & ~xfeature_mask) || !(new_bv & XSTATE_FP) ) return -EINVAL; if ( (new_bv & XSTATE_YMM) && !(new_bv & XSTATE_SSE) ) return -EINVAL; if ( !set_xcr0(new_bv) ) return -EFAULT; mask = new_bv & ~curr->arch.xcr0_accum; curr->arch.xcr0 = new_bv; curr->arch.xcr0_accum |= new_bv; mask &= curr->fpu_dirtied ? ~XSTATE_FP_SSE : XSTATE_NONLAZY; if ( mask ) { unsigned long cr0 = read_cr0(); clts(); if ( curr->fpu_dirtied ) asm ( "stmxcsr %0" : "=m" (curr->arch.xsave_area->fpu_sse.mxcsr) ); xrstor(curr, mask); if ( cr0 & X86_CR0_TS ) write_cr0(cr0); } return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/delay.c0000664000175000017500000000130412307313555014037 0ustar smbsmb/* * Precise Delay Loops for i386 * * Copyright (C) 1993 Linus Torvalds * Copyright (C) 1997 Martin Mares * * The __delay function must _NOT_ be inlined as its execution time * depends wildly on alignment on many x86 processors. The additional * jump magic is needed to get the timing stable on all the CPU's * we have to worry about. */ #include #include #include #include #include void __udelay(unsigned long usecs) { unsigned long ticks = usecs * (cpu_khz / 1000); unsigned long s, e; rdtscl(s); do { rep_nop(); rdtscl(e); } while ((e-s) < ticks); } xen-4.4.0/xen/arch/x86/mm.c0000664000175000017500000055163512307313555013373 0ustar smbsmb/****************************************************************************** * arch/x86/mm.c * * Copyright (c) 2002-2005 K A Fraser * Copyright (c) 2004 Christian Limpach * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * A description of the x86 page table API: * * Domains trap to do_mmu_update with a list of update requests. * This is a list of (ptr, val) pairs, where the requested operation * is *ptr = val. 
* * Reference counting of pages: * ---------------------------- * Each page has two refcounts: tot_count and type_count. * * TOT_COUNT is the obvious reference count. It counts all uses of a * physical page frame by a domain, including uses as a page directory, * a page table, or simple mappings via a PTE. This count prevents a * domain from releasing a frame back to the free pool when it still holds * a reference to it. * * TYPE_COUNT is more subtle. A frame can be put to one of three * mutually-exclusive uses: it might be used as a page directory, or a * page table, or it may be mapped writable by the domain [of course, a * frame may not be used in any of these three ways!]. * So, type_count is a count of the number of times a frame is being * referred to in its current incarnation. Therefore, a page can only * change its type when its type count is zero. * * Pinning the page type: * ---------------------- * The type of a page can be pinned/unpinned with the commands * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is, * pinning is not reference counted, so it can't be nested). * This is useful to prevent a page's type count falling to zero, at which * point safety checks would need to be carried out next time the count * is increased again. * * A further note on writable page mappings: * ----------------------------------------- * For simplicity, the count of writable mappings for a page may not * correspond to reality. The 'writable count' is incremented for every * PTE which maps the page with the _PAGE_RW flag set. However, for * write access to be possible the page directory entry must also have * its _PAGE_RW bit set. We do not check this as it complicates the * reference counting considerably [consider the case of multiple * directory entries referencing a single page table, some with the RW * bit set, others not -- it starts getting a bit messy]. * In normal use, this simplification shouldn't be a problem. * However, the logic can be added if required. * * One more note on read-only page mappings: * ----------------------------------------- * We want domains to be able to map pages for read-only access. The * main reason is that page tables and directories should be readable * by a domain, but it would not be safe for them to be writable. * However, domains have free access to rings 1 & 2 of the Intel * privilege model. In terms of page protection, these are considered * to be part of 'supervisor mode'. The WP bit in CR0 controls whether * read-only restrictions are respected in supervisor mode -- if the * bit is clear then any mapped page is writable. * * We get round this by always setting the WP bit and disallowing * updates to it. This is very unlikely to cause a problem for guest * OS's, which will generally use the WP bit to simplify copy-on-write * implementation (in that case, OS wants a fault when it writes to * an application-supplied buffer). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Mapping of the fixmap space needed early. */ l1_pgentry_t __attribute__ ((__section__ (".bss.page_aligned"))) l1_fixmap[L1_PAGETABLE_ENTRIES]; #define MEM_LOG(_f, _a...) 
gdprintk(XENLOG_WARNING , _f "\n" , ## _a) /* * PTE updates can be done with ordinary writes except: * 1. Debug builds get extra checking by using CMPXCHG[8B]. */ #if !defined(NDEBUG) #define PTE_UPDATE_WITH_CMPXCHG #endif paddr_t __read_mostly mem_hotplug; /* Private domain structs for DOMID_XEN and DOMID_IO. */ struct domain *dom_xen, *dom_io, *dom_cow; /* Frame table size in pages. */ unsigned long max_page; unsigned long total_pages; unsigned long __read_mostly pdx_group_valid[BITS_TO_LONGS( (FRAMETABLE_NR + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT)] = { [0] = 1 }; bool_t __read_mostly machine_to_phys_mapping_valid = 0; struct rangeset *__read_mostly mmio_ro_ranges; #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT) bool_t __read_mostly opt_allow_superpage; boolean_param("allowsuperpage", opt_allow_superpage); static void put_superpage(unsigned long mfn); static uint32_t base_disallow_mask; #define L1_DISALLOW_MASK (base_disallow_mask | _PAGE_GNTTAB) #define L2_DISALLOW_MASK (base_disallow_mask & ~_PAGE_PSE) #define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ? \ base_disallow_mask : \ 0xFFFFF198U) #define L4_DISALLOW_MASK (base_disallow_mask) #ifdef USER_MAPPINGS_ARE_GLOBAL /* Global bit is allowed to be set on L1 PTEs. Intended for user mappings. */ #undef L1_DISALLOW_MASK #define L1_DISALLOW_MASK ((base_disallow_mask | _PAGE_GNTTAB) & ~_PAGE_GLOBAL) #endif #define l1_disallow_mask(d) \ ((d != dom_io) && \ (rangeset_is_empty((d)->iomem_caps) && \ rangeset_is_empty((d)->arch.ioport_caps) && \ !has_arch_pdevs(d) && \ is_pv_domain(d)) ? \ L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS)) static void __init init_frametable_chunk(void *start, void *end) { unsigned long s = (unsigned long)start; unsigned long e = (unsigned long)end; unsigned long step, mfn; ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1))); for ( ; s < e; s += step << PAGE_SHIFT ) { step = 1UL << (cpu_has_page1gb && !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ? L3_PAGETABLE_SHIFT - PAGE_SHIFT : L2_PAGETABLE_SHIFT - PAGE_SHIFT); /* * The hardcoded 4 below is arbitrary - just pick whatever you think * is reasonable to waste as a trade-off for using a large page. */ while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) ) step >>= PAGETABLE_ORDER; do { mfn = alloc_boot_pages(step, step); } while ( !mfn && (step >>= PAGETABLE_ORDER) ); if ( !mfn ) panic("Not enough memory for frame table"); map_pages_to_xen(s, mfn, step, PAGE_HYPERVISOR); } memset(start, 0, end - start); memset(end, -1, s - e); } static void __init init_spagetable(void) { BUILD_BUG_ON(XEN_VIRT_END > SPAGETABLE_VIRT_START); init_frametable_chunk(spage_table, mem_hotplug ? spage_table + SPAGETABLE_NR : pdx_to_spage(max_pdx - 1) + 1); } void __init init_frametable(void) { unsigned int sidx, eidx, nidx; unsigned int max_idx = (max_pdx + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT; struct page_info *end_pg, *top_pg; BUILD_BUG_ON(XEN_VIRT_END > FRAMETABLE_VIRT_START); BUILD_BUG_ON(FRAMETABLE_VIRT_START & ((1UL << L2_PAGETABLE_SHIFT) - 1)); for ( sidx = 0; ; sidx = nidx ) { eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx); nidx = find_next_bit(pdx_group_valid, max_idx, eidx); if ( nidx >= max_idx ) break; init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT), pdx_to_page(eidx * PDX_GROUP_COUNT)); } end_pg = pdx_to_page(max_pdx - 1) + 1; top_pg = mem_hotplug ? 
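/*
 * Illustrative note, not part of the original source: end_pg marks the
 * end of the frame-table entries needed for RAM present at boot
 * (max_pdx), while the conditional completed below rounds top_pg up to
 * the end of the last valid PDX group when memory hotplug is in use,
 * so the extra struct page_info slots (less than one PDX group's
 * worth) are mapped ahead of time.  The memset(end_pg, -1, ...) that
 * follows fills that not-yet-present tail with all-ones so its entries
 * read as invalid until memory is actually added.
 */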
pdx_to_page(max_idx * PDX_GROUP_COUNT - 1) + 1 : end_pg; init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT), top_pg); memset(end_pg, -1, (unsigned long)top_pg - (unsigned long)end_pg); if (opt_allow_superpage) init_spagetable(); } #ifndef NDEBUG static unsigned int __read_mostly root_pgt_pv_xen_slots = ROOT_PAGETABLE_PV_XEN_SLOTS; static l4_pgentry_t __read_mostly split_l4e; #else #define root_pgt_pv_xen_slots ROOT_PAGETABLE_PV_XEN_SLOTS #endif void __init arch_init_memory(void) { unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn; /* Basic guest-accessible flags: PRESENT, R/W, USER, A/D, AVAIL[0,1,2] */ base_disallow_mask = ~(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER| _PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_AVAIL); /* Allow guest access to the NX flag if hardware supports it. */ if ( cpu_has_nx ) base_disallow_mask &= ~_PAGE_NX_BIT; /* On x86/64, range [62:52] is available for guest software use. */ base_disallow_mask &= ~get_pte_flags((intpte_t)0x7ff << 52); /* * Initialise our DOMID_XEN domain. * Any Xen-heap pages that we will allow to be mapped will have * their domain field set to dom_xen. * Hidden PCI devices will also be associated with this domain * (but be [partly] controlled by Dom0 nevertheless). */ dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0); BUG_ON(IS_ERR(dom_xen)); INIT_LIST_HEAD(&dom_xen->arch.pdev_list); /* * Initialise our DOMID_IO domain. * This domain owns I/O pages that are within the range of the page_info * array. Mappings occur at the priv of the caller. */ dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0); BUG_ON(IS_ERR(dom_io)); /* * Initialise our COW domain. * This domain owns sharable pages. */ dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0); BUG_ON(IS_ERR(dom_cow)); /* First 1MB of RAM is historically marked as I/O. */ for ( i = 0; i < 0x100; i++ ) share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable); /* Any areas not specified as RAM by the e820 map are considered I/O. */ for ( i = 0, pfn = 0; pfn < max_page; i++ ) { while ( (i < e820.nr_map) && (e820.map[i].type != E820_RAM) && (e820.map[i].type != E820_UNUSABLE) ) i++; if ( i >= e820.nr_map ) { /* No more RAM regions: mark as I/O right to end of memory map. */ rstart_pfn = rend_pfn = max_page; } else { /* Mark as I/O just up as far as next RAM region. */ rstart_pfn = min_t(unsigned long, max_page, PFN_UP(e820.map[i].addr)); rend_pfn = max_t(unsigned long, rstart_pfn, PFN_DOWN(e820.map[i].addr + e820.map[i].size)); } /* * Make sure any Xen mappings of RAM holes above 1MB are blown away. * In particular this ensures that RAM holes are respected even in * the statically-initialised 1-16MB mapping area. */ iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT)); ioend_pfn = min(rstart_pfn, 16UL << (20 - PAGE_SHIFT)); if ( iostart_pfn < ioend_pfn ) destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn), (unsigned long)mfn_to_virt(ioend_pfn)); /* Mark as I/O up to next RAM region. */ for ( ; pfn < rstart_pfn; pfn++ ) { if ( !mfn_valid(pfn) ) continue; share_xen_page_with_guest( mfn_to_page(pfn), dom_io, XENSHARE_writable); } /* Skip the RAM region. 
*/ pfn = rend_pfn; } subarch_init_memory(); efi_init_memory(); mem_sharing_init(); #ifndef NDEBUG if ( highmem_start ) { unsigned long split_va = (unsigned long)__va(highmem_start); if ( split_va < HYPERVISOR_VIRT_END && split_va - 1 == (unsigned long)__va(highmem_start - 1) ) { root_pgt_pv_xen_slots = l4_table_offset(split_va) - ROOT_PAGETABLE_FIRST_XEN_SLOT; ASSERT(root_pgt_pv_xen_slots < ROOT_PAGETABLE_PV_XEN_SLOTS); if ( l4_table_offset(split_va) == l4_table_offset(split_va - 1) ) { l3_pgentry_t *l3tab = alloc_xen_pagetable(); if ( l3tab ) { const l3_pgentry_t *l3idle = l4e_to_l3e(idle_pg_table[l4_table_offset(split_va)]); for ( i = 0; i < l3_table_offset(split_va); ++i ) l3tab[i] = l3idle[i]; for ( ; i <= L3_PAGETABLE_ENTRIES; ++i ) l3tab[i] = l3e_empty(); split_l4e = l4e_from_pfn(virt_to_mfn(l3tab), __PAGE_HYPERVISOR); } else ++root_pgt_pv_xen_slots; } } } #endif } int page_is_ram_type(unsigned long mfn, unsigned long mem_type) { uint64_t maddr = pfn_to_paddr(mfn); int i; for ( i = 0; i < e820.nr_map; i++ ) { switch ( e820.map[i].type ) { case E820_RAM: if ( mem_type & RAM_TYPE_CONVENTIONAL ) break; continue; case E820_RESERVED: if ( mem_type & RAM_TYPE_RESERVED ) break; continue; case E820_UNUSABLE: if ( mem_type & RAM_TYPE_UNUSABLE ) break; continue; case E820_ACPI: case E820_NVS: if ( mem_type & RAM_TYPE_ACPI ) break; continue; default: /* unknown */ continue; } /* Test the range. */ if ( (e820.map[i].addr <= maddr) && ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) ) return 1; } return 0; } unsigned long domain_get_maximum_gpfn(struct domain *d) { if ( has_hvm_container_domain(d) ) return p2m_get_hostp2m(d)->max_mapped_pfn; /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */ return (arch_get_max_pfn(d) ?: 1) - 1; } void share_xen_page_with_guest( struct page_info *page, struct domain *d, int readonly) { if ( page_get_owner(page) == d ) return; set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY); spin_lock(&d->page_alloc_lock); /* The incremented type count pins as writable or read-only. */ page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page); page->u.inuse.type_info |= PGT_validated | 1; page_set_owner(page, d); wmb(); /* install valid domain ptr before updating refcnt. */ ASSERT((page->count_info & ~PGC_xen_heap) == 0); /* Only add to the allocation list if the domain isn't dying. */ if ( !d->is_dying ) { page->count_info |= PGC_allocated | 1; if ( unlikely(d->xenheap_pages++ == 0) ) get_knownalive_domain(d); page_list_add_tail(page, &d->xenpage_list); } spin_unlock(&d->page_alloc_lock); } void share_xen_page_with_privileged_guests( struct page_info *page, int readonly) { share_xen_page_with_guest(page, dom_xen, readonly); } void make_cr3(struct vcpu *v, unsigned long mfn) { v->arch.cr3 = mfn << PAGE_SHIFT; } void write_ptbase(struct vcpu *v) { write_cr3(v->arch.cr3); } /* * Should be called after CR3 is updated. * * Uses values found in vcpu->arch.(guest_table and guest_table_user), and * for HVM guests, arch.monitor_table and hvm's guest CR3. * * Update ref counts to shadow tables appropriately. 
*/ void update_cr3(struct vcpu *v) { unsigned long cr3_mfn=0; if ( paging_mode_enabled(v->domain) ) { paging_update_cr3(v); return; } if ( !(v->arch.flags & TF_kernel_mode) ) cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user); else cr3_mfn = pagetable_get_pfn(v->arch.guest_table); make_cr3(v, cr3_mfn); } static void invalidate_shadow_ldt(struct vcpu *v, int flush) { l1_pgentry_t *pl1e; int i; unsigned long pfn; struct page_info *page; BUG_ON(unlikely(in_irq())); spin_lock(&v->arch.pv_vcpu.shadow_ldt_lock); if ( v->arch.pv_vcpu.shadow_ldt_mapcnt == 0 ) goto out; v->arch.pv_vcpu.shadow_ldt_mapcnt = 0; pl1e = gdt_ldt_ptes(v->domain, v); for ( i = 16; i < 32; i++ ) { pfn = l1e_get_pfn(pl1e[i]); if ( pfn == 0 ) continue; l1e_write(&pl1e[i], l1e_empty()); page = mfn_to_page(pfn); ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page); ASSERT_PAGE_IS_DOMAIN(page, v->domain); put_page_and_type(page); } /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */ if ( flush ) flush_tlb_mask(v->vcpu_dirty_cpumask); out: spin_unlock(&v->arch.pv_vcpu.shadow_ldt_lock); } static int alloc_segdesc_page(struct page_info *page) { struct desc_struct *descs; int i; descs = __map_domain_page(page); for ( i = 0; i < 512; i++ ) if ( unlikely(!check_descriptor(page_get_owner(page), &descs[i])) ) goto fail; unmap_domain_page(descs); return 0; fail: unmap_domain_page(descs); return -EINVAL; } /* Map shadow page at offset @off. */ int map_ldt_shadow_page(unsigned int off) { struct vcpu *v = current; struct domain *d = v->domain; unsigned long gmfn; struct page_info *page; l1_pgentry_t l1e, nl1e; unsigned long gva = v->arch.pv_vcpu.ldt_base + (off << PAGE_SHIFT); int okay; BUG_ON(unlikely(in_irq())); if ( is_pv_32bit_domain(d) ) gva = (u32)gva; guest_get_eff_kern_l1e(v, gva, &l1e); if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) ) return 0; gmfn = l1e_get_pfn(l1e); page = get_page_from_gfn(d, gmfn, NULL, P2M_ALLOC); if ( unlikely(!page) ) return 0; okay = get_page_type(page, PGT_seg_desc_page); if ( unlikely(!okay) ) { put_page(page); return 0; } nl1e = l1e_from_pfn(page_to_mfn(page), l1e_get_flags(l1e) | _PAGE_RW); spin_lock(&v->arch.pv_vcpu.shadow_ldt_lock); l1e_write(&gdt_ldt_ptes(d, v)[off + 16], nl1e); v->arch.pv_vcpu.shadow_ldt_mapcnt++; spin_unlock(&v->arch.pv_vcpu.shadow_ldt_lock); return 1; } static int get_page_from_pagenr(unsigned long page_nr, struct domain *d) { struct page_info *page = mfn_to_page(page_nr); if ( unlikely(!mfn_valid(page_nr)) || unlikely(!get_page(page, d)) ) { MEM_LOG("Could not get page ref for pfn %lx", page_nr); return 0; } return 1; } static int get_page_and_type_from_pagenr(unsigned long page_nr, unsigned long type, struct domain *d, int partial, int preemptible) { struct page_info *page = mfn_to_page(page_nr); int rc; if ( likely(partial >= 0) && unlikely(!get_page_from_pagenr(page_nr, d)) ) return -EINVAL; rc = (preemptible ? get_page_type_preemptible(page, type) : (get_page_type(page, type) ? 0 : -EINVAL)); if ( unlikely(rc) && partial >= 0 && (!preemptible || page != current->arch.old_guest_table) ) put_page(page); return rc; } static void put_data_page( struct page_info *page, int writeable) { if ( writeable ) put_page_and_type(page); else put_page(page); } /* * We allow root tables to map each other (a.k.a. linear page tables). It * needs some special care with reference counts and access permissions: * 1. The mapping entry must be read-only, or the guest may get write access * to its own PTEs. * 2. 
We must only bump the reference counts for an *already validated* * L2 table, or we can end up in a deadlock in get_page_type() by waiting * on a validation that is required to complete that validation. * 3. We only need to increment the reference counts for the mapped page * frame if it is mapped by a different root table. This is sufficient and * also necessary to allow validation of a root table mapping itself. */ #define define_get_linear_pagetable(level) \ static int \ get_##level##_linear_pagetable( \ level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \ { \ unsigned long x, y; \ struct page_info *page; \ unsigned long pfn; \ \ if ( (level##e_get_flags(pde) & _PAGE_RW) ) \ { \ MEM_LOG("Attempt to create linear p.t. with write perms"); \ return 0; \ } \ \ if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \ { \ /* Make sure the mapped frame belongs to the correct domain. */ \ if ( unlikely(!get_page_from_pagenr(pfn, d)) ) \ return 0; \ \ /* \ * Ensure that the mapped frame is an already-validated page table. \ * If so, atomically increment the count (checking for overflow). \ */ \ page = mfn_to_page(pfn); \ y = page->u.inuse.type_info; \ do { \ x = y; \ if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \ unlikely((x & (PGT_type_mask|PGT_validated)) != \ (PGT_##level##_page_table|PGT_validated)) ) \ { \ put_page(page); \ return 0; \ } \ } \ while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \ } \ \ return 1; \ } int is_iomem_page(unsigned long mfn) { struct page_info *page; if ( !mfn_valid(mfn) ) return 1; /* Caller must know that it is an iomem page, or a reference is held. */ page = mfn_to_page(mfn); ASSERT((page->count_info & PGC_count_mask) != 0); return (page_get_owner(page) == dom_io); } static int update_xen_mappings(unsigned long mfn, unsigned long cacheattr) { int err = 0; bool_t alias = mfn >= PFN_DOWN(xen_phys_start) && mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START); unsigned long xen_va = XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT); if ( unlikely(alias) && cacheattr ) err = map_pages_to_xen(xen_va, mfn, 1, 0); if ( !err ) err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1, PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr)); if ( unlikely(alias) && !cacheattr && !err ) err = map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR); return err; } int get_page_from_l1e( l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner) { unsigned long mfn = l1e_get_pfn(l1e); struct page_info *page = mfn_to_page(mfn); uint32_t l1f = l1e_get_flags(l1e); struct vcpu *curr = current; struct domain *real_pg_owner; bool_t write; if ( !(l1f & _PAGE_PRESENT) ) return 0; if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) ) { MEM_LOG("Bad L1 flags %x", l1f & l1_disallow_mask(l1e_owner)); return -EINVAL; } if ( !mfn_valid(mfn) || (real_pg_owner = page_get_owner_and_reference(page)) == dom_io ) { /* Only needed the reference to confirm dom_io ownership. */ if ( mfn_valid(mfn) ) put_page(page); /* DOMID_IO reverts to caller for privilege checks. */ if ( pg_owner == dom_io ) pg_owner = curr->domain; if ( !iomem_access_permitted(pg_owner, mfn, mfn) ) { if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */ { MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx", pg_owner->domain_id, mfn); return -EPERM; } return -EINVAL; } if ( pg_owner != l1e_owner && !iomem_access_permitted(l1e_owner, mfn, mfn) ) { if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? 
*/ { MEM_LOG("Dom%u attempted to map I/O space %08lx in dom%u to dom%u", curr->domain->domain_id, mfn, pg_owner->domain_id, l1e_owner->domain_id); return -EPERM; } return -EINVAL; } if ( !(l1f & _PAGE_RW) || !rangeset_contains_singleton(mmio_ro_ranges, mfn) ) return 0; dprintk(XENLOG_G_WARNING, "d%d: Forcing read-only access to MFN %lx\n", l1e_owner->domain_id, mfn); return 1; } if ( unlikely( (real_pg_owner != pg_owner) && (real_pg_owner != dom_cow) ) ) { /* * Let privileged domains transfer the right to map their target * domain's pages. This is used to allow stub-domain pvfb export to * dom0, until pvfb supports granted mappings. At that time this * minor hack can go away. */ if ( (real_pg_owner == NULL) || (pg_owner == l1e_owner) || xsm_priv_mapping(XSM_TARGET, pg_owner, real_pg_owner) ) { MEM_LOG("pg_owner %d l1e_owner %d, but real_pg_owner %d", pg_owner->domain_id, l1e_owner->domain_id, real_pg_owner?real_pg_owner->domain_id:-1); goto could_not_pin; } pg_owner = real_pg_owner; } /* Extra paranoid check for shared memory. Writable mappings * disallowed (unshare first!) */ if ( (l1f & _PAGE_RW) && (real_pg_owner == dom_cow) ) goto could_not_pin; /* Foreign mappings into guests in shadow external mode don't * contribute to writeable mapping refcounts. (This allows the * qemu-dm helper process in dom0 to map the domain's memory without * messing up the count of "real" writable mappings.) */ write = (l1f & _PAGE_RW) && ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)); if ( write && !get_page_type(page, PGT_writable_page) ) { MEM_LOG("Could not get page type PGT_writable_page"); goto could_not_pin; } if ( pte_flags_to_cacheattr(l1f) != ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) ) { unsigned long x, nx, y = page->count_info; unsigned long cacheattr = pte_flags_to_cacheattr(l1f); int err; if ( is_xen_heap_page(page) ) { if ( write ) put_page_type(page); put_page(page); MEM_LOG("Attempt to change cache attributes of Xen heap page"); return -EACCES; } do { x = y; nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base); } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); err = update_xen_mappings(mfn, cacheattr); if ( unlikely(err) ) { cacheattr = y & PGC_cacheattr_mask; do { x = y; nx = (x & ~PGC_cacheattr_mask) | cacheattr; } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); if ( write ) put_page_type(page); put_page(page); MEM_LOG("Error updating mappings for mfn %lx (pfn %lx," " from L1 entry %" PRIpte ") for %d", mfn, get_gpfn_from_mfn(mfn), l1e_get_intpte(l1e), l1e_owner->domain_id); return err; } } return 0; could_not_pin: MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte " for l1e_owner=%d, pg_owner=%d", mfn, get_gpfn_from_mfn(mfn), l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id); if ( real_pg_owner != NULL ) put_page(page); return -EBUSY; } /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 
*/ define_get_linear_pagetable(l2); static int get_page_from_l2e( l2_pgentry_t l2e, unsigned long pfn, struct domain *d) { unsigned long mfn = l2e_get_pfn(l2e); int rc; if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) return 1; if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) { MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK); return -EINVAL; } if ( !(l2e_get_flags(l2e) & _PAGE_PSE) ) { rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0); if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) rc = 0; return rc; } if ( !opt_allow_superpage ) { MEM_LOG("Attempt to map superpage without allowsuperpage " "flag in hypervisor"); return -EINVAL; } if ( mfn & (L1_PAGETABLE_ENTRIES-1) ) { MEM_LOG("Unaligned superpage map attempt mfn %lx", mfn); return -EINVAL; } return get_superpage(mfn, d); } define_get_linear_pagetable(l3); static int get_page_from_l3e( l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial) { int rc; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) return 1; if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) ) { MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d)); return -EINVAL; } rc = get_page_and_type_from_pagenr( l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, 1); if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) ) rc = 0; return rc; } define_get_linear_pagetable(l4); static int get_page_from_l4e( l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial) { int rc; if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return 1; if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) { MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK); return -EINVAL; } rc = get_page_and_type_from_pagenr( l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, 1); if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) ) rc = 0; return rc; } #ifdef USER_MAPPINGS_ARE_GLOBAL #define adjust_guest_l1e(pl1e, d) \ do { \ if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \ likely(!is_pv_32on64_domain(d)) ) \ { \ /* _PAGE_GUEST_KERNEL page cannot have the Global bit set. */ \ if ( (l1e_get_flags((pl1e)) & (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL)) \ == (_PAGE_GUEST_KERNEL|_PAGE_GLOBAL) ) \ MEM_LOG("Global bit is set to kernel page %lx", \ l1e_get_pfn((pl1e))); \ if ( !(l1e_get_flags((pl1e)) & _PAGE_USER) ) \ l1e_add_flags((pl1e), (_PAGE_GUEST_KERNEL|_PAGE_USER)); \ if ( !(l1e_get_flags((pl1e)) & _PAGE_GUEST_KERNEL) ) \ l1e_add_flags((pl1e), (_PAGE_GLOBAL|_PAGE_USER)); \ } \ } while ( 0 ) #else #define adjust_guest_l1e(pl1e, d) \ do { \ if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) && \ likely(!is_pv_32on64_domain(d)) ) \ l1e_add_flags((pl1e), _PAGE_USER); \ } while ( 0 ) #endif #define adjust_guest_l2e(pl2e, d) \ do { \ if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) && \ likely(!is_pv_32on64_domain(d)) ) \ l2e_add_flags((pl2e), _PAGE_USER); \ } while ( 0 ) #define adjust_guest_l3e(pl3e, d) \ do { \ if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \ l3e_add_flags((pl3e), likely(!is_pv_32on64_domain(d)) ? 
\ _PAGE_USER : \ _PAGE_USER|_PAGE_RW); \ } while ( 0 ) #define adjust_guest_l4e(pl4e, d) \ do { \ if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) && \ likely(!is_pv_32on64_domain(d)) ) \ l4e_add_flags((pl4e), _PAGE_USER); \ } while ( 0 ) #define unadjust_guest_l3e(pl3e, d) \ do { \ if ( unlikely(is_pv_32on64_domain(d)) && \ likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) ) \ l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); \ } while ( 0 ) void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner) { unsigned long pfn = l1e_get_pfn(l1e); struct page_info *page; struct domain *pg_owner; struct vcpu *v; if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(pfn) ) return; page = mfn_to_page(pfn); pg_owner = page_get_owner(page); /* * Check if this is a mapping that was established via a grant reference. * If it was then we should not be here: we require that such mappings are * explicitly destroyed via the grant-table interface. * * The upshot of this is that the guest can end up with active grants that * it cannot destroy (because it no longer has a PTE to present to the * grant-table interface). This can lead to subtle hard-to-catch bugs, * hence a special grant PTE flag can be enabled to catch the bug early. * * (Note that the undestroyable active grants are not a security hole in * Xen. All active grants can safely be cleaned up when the domain dies.) */ if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) && !l1e_owner->is_shutting_down && !l1e_owner->is_dying ) { MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte, l1e_get_intpte(l1e)); domain_crash(l1e_owner); } /* Remember we didn't take a type-count of foreign writable mappings * to paging-external domains */ if ( (l1e_get_flags(l1e) & _PAGE_RW) && ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) ) { put_page_and_type(page); } else { /* We expect this is rare so we blow the entire shadow LDT. */ if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == PGT_seg_desc_page)) && unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) && (l1e_owner == pg_owner) ) { for_each_vcpu ( pg_owner, v ) invalidate_shadow_ldt(v, 1); } put_page(page); } } /* * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. * Note also that this automatically deals correctly with linear p.t.'s. 
*/ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) { if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) ) return 1; if ( l2e_get_flags(l2e) & _PAGE_PSE ) put_superpage(l2e_get_pfn(l2e)); else put_page_and_type(l2e_get_page(l2e)); return 0; } static int __put_page_type(struct page_info *, int preemptible); static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, int partial, bool_t defer) { struct page_info *pg; if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) ) return 1; if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) ) { unsigned long mfn = l3e_get_pfn(l3e); int writeable = l3e_get_flags(l3e) & _PAGE_RW; ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))); do { put_data_page(mfn_to_page(mfn), writeable); } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) ); return 0; } pg = l3e_get_page(l3e); if ( unlikely(partial > 0) ) { ASSERT(!defer); return __put_page_type(pg, 1); } if ( defer ) { current->arch.old_guest_table = pg; return 0; } return put_page_and_type_preemptible(pg); } static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, int partial, bool_t defer) { if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && (l4e_get_pfn(l4e) != pfn) ) { struct page_info *pg = l4e_get_page(l4e); if ( unlikely(partial > 0) ) { ASSERT(!defer); return __put_page_type(pg, 1); } if ( defer ) { current->arch.old_guest_table = pg; return 0; } return put_page_and_type_preemptible(pg); } return 1; } static int alloc_l1_table(struct page_info *page) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l1_pgentry_t *pl1e; unsigned int i; int ret = 0; pl1e = map_domain_page(pfn); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { if ( is_guest_l1_slot(i) ) switch ( ret = get_page_from_l1e(pl1e[i], d, d) ) { default: goto fail; case 0: break; case 1: l1e_remove_flags(pl1e[i], _PAGE_RW); break; } adjust_guest_l1e(pl1e[i], d); } unmap_domain_page(pl1e); return 0; fail: MEM_LOG("Failure in alloc_l1_table: entry %d", i); while ( i-- > 0 ) if ( is_guest_l1_slot(i) ) put_page_from_l1e(pl1e[i], d); unmap_domain_page(pl1e); return ret; } static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e) { struct page_info *page; l3_pgentry_t l3e3; if ( !is_pv_32bit_domain(d) ) return 1; pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK); /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */ l3e3 = pl3e[3]; if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) ) { MEM_LOG("PAE L3 3rd slot is empty"); return 0; } /* * The Xen-private mappings include linear mappings. The L2 thus cannot * be shared by multiple L3 tables. The test here is adequate because: * 1. Cannot appear in slots != 3 because get_page_type() checks the * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3 * 2. Cannot appear in another page table's L3: * a. alloc_l3_table() calls this function and this check will fail * b. 
mod_l3_entry() disallows updates to slot 3 in an existing table */ page = l3e_get_page(l3e3); BUG_ON(page->u.inuse.type_info & PGT_pinned); BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0); BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2)); if ( (page->u.inuse.type_info & PGT_count_mask) != 1 ) { MEM_LOG("PAE L3 3rd slot is shared"); return 0; } return 1; } static int alloc_l2_table(struct page_info *page, unsigned long type, int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l2_pgentry_t *pl2e; unsigned int i; int rc = 0; pl2e = map_domain_page(pfn); for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ ) { if ( preemptible && i > page->nr_validated_ptes && hypercall_preempt_check() ) { page->nr_validated_ptes = i; rc = -EAGAIN; break; } if ( !is_guest_l2_slot(d, type, i) || (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 ) continue; if ( rc < 0 ) { MEM_LOG("Failure in alloc_l2_table: entry %d", i); while ( i-- > 0 ) if ( is_guest_l2_slot(d, type, i) ) put_page_from_l2e(pl2e[i], pfn); break; } adjust_guest_l2e(pl2e[i], d); } if ( rc >= 0 && (type & PGT_pae_xen_l2) ) { /* Xen private mappings. */ memcpy(&pl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)], &compat_idle_pg_table_l2[ l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)], COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*pl2e)); } unmap_domain_page(pl2e); return rc > 0 ? 0 : rc; } static int alloc_l3_table(struct page_info *page) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l3_pgentry_t *pl3e; unsigned int i; int rc = 0, partial = page->partial_pte; pl3e = map_domain_page(pfn); /* * PAE guests allocate full pages, but aren't required to initialize * more than the first four entries; when running in compatibility * mode, however, the full page is visible to the MMU, and hence all * 512 entries must be valid/verified, which is most easily achieved * by clearing them out. */ if ( is_pv_32on64_domain(d) ) memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++, partial = 0 ) { if ( is_pv_32bit_domain(d) && (i == 3) ) { if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) || (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ) rc = -EINVAL; else rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]), PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1); } else if ( !is_guest_l3_slot(i) || (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 ) continue; if ( rc == -EAGAIN ) { page->nr_validated_ptes = i; page->partial_pte = partial ?: 1; } else if ( rc == -EINTR && i ) { page->nr_validated_ptes = i; page->partial_pte = 0; rc = -EAGAIN; } if ( rc < 0 ) break; adjust_guest_l3e(pl3e[i], d); } if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) ) rc = -EINVAL; if ( rc < 0 && rc != -EAGAIN && rc != -EINTR ) { MEM_LOG("Failure in alloc_l3_table: entry %d", i); if ( i ) { page->nr_validated_ptes = i; page->partial_pte = 0; current->arch.old_guest_table = page; } while ( i-- > 0 ) { if ( !is_guest_l3_slot(i) ) continue; unadjust_guest_l3e(pl3e[i], d); } } unmap_domain_page(pl3e); return rc > 0 ? 0 : rc; } void init_guest_l4_table(l4_pgentry_t l4tab[], const struct domain *d) { /* Xen private mappings. 
*/ memcpy(&l4tab[ROOT_PAGETABLE_FIRST_XEN_SLOT], &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT], root_pgt_pv_xen_slots * sizeof(l4_pgentry_t)); #ifndef NDEBUG if ( l4e_get_intpte(split_l4e) ) l4tab[ROOT_PAGETABLE_FIRST_XEN_SLOT + root_pgt_pv_xen_slots] = split_l4e; #endif l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] = l4e_from_pfn(domain_page_map_to_mfn(l4tab), __PAGE_HYPERVISOR); l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] = l4e_from_page(d->arch.perdomain_l3_pg, __PAGE_HYPERVISOR); } static int alloc_l4_table(struct page_info *page) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l4_pgentry_t *pl4e = map_domain_page(pfn); unsigned int i; int rc = 0, partial = page->partial_pte; for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++, partial = 0 ) { if ( !is_guest_l4_slot(d, i) || (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 ) continue; if ( rc == -EAGAIN ) { page->nr_validated_ptes = i; page->partial_pte = partial ?: 1; } else if ( rc < 0 ) { if ( rc != -EINTR ) MEM_LOG("Failure in alloc_l4_table: entry %d", i); if ( i ) { page->nr_validated_ptes = i; page->partial_pte = 0; if ( rc == -EINTR ) rc = -EAGAIN; else { if ( current->arch.old_guest_table ) page->nr_validated_ptes++; current->arch.old_guest_table = page; } } } if ( rc < 0 ) { unmap_domain_page(pl4e); return rc; } adjust_guest_l4e(pl4e[i], d); } init_guest_l4_table(pl4e, d); unmap_domain_page(pl4e); return rc > 0 ? 0 : rc; } static void free_l1_table(struct page_info *page) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l1_pgentry_t *pl1e; unsigned int i; pl1e = map_domain_page(pfn); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) if ( is_guest_l1_slot(i) ) put_page_from_l1e(pl1e[i], d); unmap_domain_page(pl1e); } static int free_l2_table(struct page_info *page, int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l2_pgentry_t *pl2e; unsigned int i = page->nr_validated_ptes - 1; int err = 0; pl2e = map_domain_page(pfn); ASSERT(page->nr_validated_ptes); do { if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) && put_page_from_l2e(pl2e[i], pfn) == 0 && preemptible && i && hypercall_preempt_check() ) { page->nr_validated_ptes = i; err = -EAGAIN; } } while ( !err && i-- ); unmap_domain_page(pl2e); if ( !err ) page->u.inuse.type_info &= ~PGT_pae_xen_l2; return err; } static int free_l3_table(struct page_info *page) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l3_pgentry_t *pl3e; int rc = 0, partial = page->partial_pte; unsigned int i = page->nr_validated_ptes - !partial; pl3e = map_domain_page(pfn); do { if ( is_guest_l3_slot(i) ) { rc = put_page_from_l3e(pl3e[i], pfn, partial, 0); if ( rc < 0 ) break; partial = 0; if ( rc > 0 ) continue; unadjust_guest_l3e(pl3e[i], d); } } while ( i-- ); unmap_domain_page(pl3e); if ( rc == -EAGAIN ) { page->nr_validated_ptes = i; page->partial_pte = partial ?: -1; } else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) { page->nr_validated_ptes = i + 1; page->partial_pte = 0; rc = -EAGAIN; } return rc > 0 ? 
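        /*
         * Return-value convention: put_page_from_l3e() yields 1 when the
         * slot needed no cleanup (entry not present, or a self-referencing
         * linear mapping), 0 when a reference was actually dropped, and a
         * negative value on error or preemption.  Folding positive values
         * into 0 here means the caller, free_page_type(), only ever sees
         * 0, -EAGAIN or another negative error code.
         */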
0 : rc; } static int free_l4_table(struct page_info *page) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l4_pgentry_t *pl4e = map_domain_page(pfn); int rc = 0, partial = page->partial_pte; unsigned int i = page->nr_validated_ptes - !partial; do { if ( is_guest_l4_slot(d, i) ) rc = put_page_from_l4e(pl4e[i], pfn, partial, 0); if ( rc < 0 ) break; partial = 0; } while ( i-- ); if ( rc == -EAGAIN ) { page->nr_validated_ptes = i; page->partial_pte = partial ?: -1; } else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) { page->nr_validated_ptes = i + 1; page->partial_pte = 0; rc = -EAGAIN; } unmap_domain_page(pl4e); return rc > 0 ? 0 : rc; } int page_lock(struct page_info *page) { unsigned long x, nx; do { while ( (x = page->u.inuse.type_info) & PGT_locked ) cpu_relax(); nx = x + (1 | PGT_locked); if ( !(x & PGT_validated) || !(x & PGT_count_mask) || !(nx & PGT_count_mask) ) return 0; } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); return 1; } void page_unlock(struct page_info *page) { unsigned long x, nx, y = page->u.inuse.type_info; do { x = y; nx = x - (1 | PGT_locked); } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x ); } /* How to write an entry to the guest pagetables. * Returns 0 for failure (pointer not valid), 1 for success. */ static inline int update_intpte(intpte_t *p, intpte_t old, intpte_t new, unsigned long mfn, struct vcpu *v, int preserve_ad) { int rv = 1; #ifndef PTE_UPDATE_WITH_CMPXCHG if ( !preserve_ad ) { rv = paging_write_guest_entry(v, p, new, _mfn(mfn)); } else #endif { intpte_t t = old; for ( ; ; ) { intpte_t _new = new; if ( preserve_ad ) _new |= old & (_PAGE_ACCESSED | _PAGE_DIRTY); rv = paging_cmpxchg_guest_entry(v, p, &t, _new, _mfn(mfn)); if ( unlikely(rv == 0) ) { MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte ": saw %" PRIpte, old, _new, t); break; } if ( t == old ) break; /* Allowed to change in Accessed/Dirty flags only. */ BUG_ON((t ^ old) & ~(intpte_t)(_PAGE_ACCESSED|_PAGE_DIRTY)); old = t; } } return rv; } /* Macro that wraps the appropriate type-changes around update_intpte(). * Arguments are: type, ptr, old, new, mfn, vcpu */ #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \ update_intpte(&_t ## e_get_intpte(*(_p)), \ _t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \ (_m), (_v), (_ad)) /* Update the L1 entry at pl1e to new value nl1e. */ static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, unsigned long gl1mfn, int preserve_ad, struct vcpu *pt_vcpu, struct domain *pg_dom) { l1_pgentry_t ol1e; struct domain *pt_dom = pt_vcpu->domain; int rc = 0; if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ) return -EFAULT; if ( unlikely(paging_mode_refcounts(pt_dom)) ) { if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad) ) return 0; return -EBUSY; } if ( l1e_get_flags(nl1e) & _PAGE_PRESENT ) { /* Translate foreign guest addresses. */ struct page_info *page = NULL; if ( paging_mode_translate(pg_dom) ) { page = get_page_from_gfn(pg_dom, l1e_get_pfn(nl1e), NULL, P2M_ALLOC); if ( !page ) return -EINVAL; nl1e = l1e_from_pfn(page_to_mfn(page), l1e_get_flags(nl1e)); } if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) ) { MEM_LOG("Bad L1 flags %x", l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)); if ( page ) put_page(page); return -EINVAL; } /* Fast path for identical mapping, r/w and presence. 
*/ if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) ) { adjust_guest_l1e(nl1e, pt_dom); if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad) ) { if ( page ) put_page(page); return 0; } if ( page ) put_page(page); return -EBUSY; } switch ( rc = get_page_from_l1e(nl1e, pt_dom, pg_dom) ) { default: if ( page ) put_page(page); return rc; case 0: break; case 1: l1e_remove_flags(nl1e, _PAGE_RW); rc = 0; break; } if ( page ) put_page(page); adjust_guest_l1e(nl1e, pt_dom); if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad)) ) { ol1e = nl1e; rc = -EBUSY; } } else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu, preserve_ad)) ) { return -EBUSY; } put_page_from_l1e(ol1e, pt_dom); return rc; } /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ static int mod_l2_entry(l2_pgentry_t *pl2e, l2_pgentry_t nl2e, unsigned long pfn, int preserve_ad, struct vcpu *vcpu) { l2_pgentry_t ol2e; struct domain *d = vcpu->domain; struct page_info *l2pg = mfn_to_page(pfn); unsigned long type = l2pg->u.inuse.type_info; int rc = 0; if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) ) { MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e); return -EPERM; } if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) ) return -EFAULT; if ( l2e_get_flags(nl2e) & _PAGE_PRESENT ) { if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) ) { MEM_LOG("Bad L2 flags %x", l2e_get_flags(nl2e) & L2_DISALLOW_MASK); return -EINVAL; } /* Fast path for identical mapping and presence. */ if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT) ) { adjust_guest_l2e(nl2e, d); if ( UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad) ) return 0; return -EBUSY; } if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) ) return rc; adjust_guest_l2e(nl2e, d); if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad)) ) { ol2e = nl2e; rc = -EBUSY; } } else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad)) ) { return -EBUSY; } put_page_from_l2e(ol2e, pfn); return rc; } /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */ static int mod_l3_entry(l3_pgentry_t *pl3e, l3_pgentry_t nl3e, unsigned long pfn, int preserve_ad, struct vcpu *vcpu) { l3_pgentry_t ol3e; struct domain *d = vcpu->domain; int rc = 0; if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) ) { MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e); return -EINVAL; } /* * Disallow updates to final L3 slot. It contains Xen mappings, and it * would be a pain to ensure they remain continuously valid throughout. */ if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) ) return -EINVAL; if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) ) return -EFAULT; if ( l3e_get_flags(nl3e) & _PAGE_PRESENT ) { if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) ) { MEM_LOG("Bad L3 flags %x", l3e_get_flags(nl3e) & l3_disallow_mask(d)); return -EINVAL; } /* Fast path for identical mapping and presence. */ if ( !l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT) ) { adjust_guest_l3e(nl3e, d); rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad); return rc ? 
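            /*
             * Fast path: the new L3 entry references the same frame and
             * does not change _PAGE_PRESENT, so the references taken at
             * validation time remain correct.  UPDATE_ENTRY() returns
             * non-zero on success, hence the mapping of its result onto
             * 0 / -EFAULT here.
             */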
0 : -EFAULT; } rc = get_page_from_l3e(nl3e, pfn, d, 0); if ( unlikely(rc < 0) ) return rc; rc = 0; adjust_guest_l3e(nl3e, d); if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad)) ) { ol3e = nl3e; rc = -EFAULT; } } else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad)) ) { return -EFAULT; } if ( likely(rc == 0) ) if ( !create_pae_xen_mappings(d, pl3e) ) BUG(); put_page_from_l3e(ol3e, pfn, 0, 1); return rc; } /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */ static int mod_l4_entry(l4_pgentry_t *pl4e, l4_pgentry_t nl4e, unsigned long pfn, int preserve_ad, struct vcpu *vcpu) { struct domain *d = vcpu->domain; l4_pgentry_t ol4e; int rc = 0; if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) ) { MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e); return -EINVAL; } if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) ) return -EFAULT; if ( l4e_get_flags(nl4e) & _PAGE_PRESENT ) { if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) ) { MEM_LOG("Bad L4 flags %x", l4e_get_flags(nl4e) & L4_DISALLOW_MASK); return -EINVAL; } /* Fast path for identical mapping and presence. */ if ( !l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT) ) { adjust_guest_l4e(nl4e, d); rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad); return rc ? 0 : -EFAULT; } rc = get_page_from_l4e(nl4e, pfn, d, 0); if ( unlikely(rc < 0) ) return rc; rc = 0; adjust_guest_l4e(nl4e, d); if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad)) ) { ol4e = nl4e; rc = -EFAULT; } } else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad)) ) { return -EFAULT; } put_page_from_l4e(ol4e, pfn, 0, 1); return rc; } static int cleanup_page_cacheattr(struct page_info *page) { uint32_t cacheattr = (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base; if ( likely(cacheattr == 0) ) return 0; page->count_info &= ~PGC_cacheattr_mask; BUG_ON(is_xen_heap_page(page)); return update_xen_mappings(page_to_mfn(page), 0); } void put_page(struct page_info *page) { unsigned long nx, x, y = page->count_info; do { ASSERT((y & PGC_count_mask) != 0); x = y; nx = x - 1; } while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) ); if ( unlikely((nx & PGC_count_mask) == 0) ) { if ( cleanup_page_cacheattr(page) == 0 ) free_domheap_page(page); else MEM_LOG("Leaking pfn %lx", page_to_mfn(page)); } } struct domain *page_get_owner_and_reference(struct page_info *page) { unsigned long x, y = page->count_info; do { x = y; /* * Count == 0: Page is not allocated, so we cannot take a reference. * Count == -1: Reference count would wrap, which is invalid. * Count == -2: Remaining unused ref is reserved for get_page_light(). 
*/ if ( unlikely(((x + 2) & PGC_count_mask) <= 2) ) return NULL; } while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x ); return page_get_owner(page); } int get_page(struct page_info *page, struct domain *domain) { struct domain *owner = page_get_owner_and_reference(page); if ( likely(owner == domain) ) return 1; if ( owner != NULL ) put_page(page); if ( !paging_mode_refcounts(domain) && !domain->is_dying ) gdprintk(XENLOG_INFO, "Error pfn %lx: rd=%p, od=%p, caf=%08lx, taf=%" PRtype_info "\n", page_to_mfn(page), domain, owner, page->count_info, page->u.inuse.type_info); return 0; } /* * Special version of get_page() to be used exclusively when * - a page is known to already have a non-zero reference count * - the page does not need its owner to be checked * - it will not be called more than once without dropping the thus * acquired reference again. * Due to get_page() reserving one reference, this call cannot fail. */ static void get_page_light(struct page_info *page) { unsigned long x, nx, y = page->count_info; do { x = y; nx = x + 1; BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */ BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */ y = cmpxchg(&page->count_info, x, nx); } while ( unlikely(y != x) ); } static int alloc_page_type(struct page_info *page, unsigned long type, int preemptible) { struct domain *owner = page_get_owner(page); int rc; /* A page table is dirtied when its type count becomes non-zero. */ if ( likely(owner != NULL) ) paging_mark_dirty(owner, page_to_mfn(page)); switch ( type & PGT_type_mask ) { case PGT_l1_page_table: rc = alloc_l1_table(page); break; case PGT_l2_page_table: rc = alloc_l2_table(page, type, preemptible); break; case PGT_l3_page_table: ASSERT(preemptible); rc = alloc_l3_table(page); break; case PGT_l4_page_table: ASSERT(preemptible); rc = alloc_l4_table(page); break; case PGT_seg_desc_page: rc = alloc_segdesc_page(page); break; default: printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n", type, page->u.inuse.type_info, page->count_info); rc = -EINVAL; BUG(); } /* No need for atomic update of type_info here: noone else updates it. */ wmb(); switch ( rc ) { case 0: page->u.inuse.type_info |= PGT_validated; break; case -EINTR: ASSERT((page->u.inuse.type_info & (PGT_count_mask|PGT_validated|PGT_partial)) == 1); page->u.inuse.type_info &= ~PGT_count_mask; break; default: ASSERT(rc < 0); MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %" PRtype_info ": caf=%08lx taf=%" PRtype_info, page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)), type, page->count_info, page->u.inuse.type_info); if ( page != current->arch.old_guest_table ) page->u.inuse.type_info = 0; else { ASSERT((page->u.inuse.type_info & (PGT_count_mask | PGT_validated)) == 1); case -EAGAIN: get_page_light(page); page->u.inuse.type_info |= PGT_partial; } break; } return rc; } int free_page_type(struct page_info *page, unsigned long type, int preemptible) { struct domain *owner = page_get_owner(page); unsigned long gmfn; int rc; if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) ) { /* A page table is dirtied when its type count becomes zero. 
*/ paging_mark_dirty(owner, page_to_mfn(page)); if ( shadow_mode_refcounts(owner) ) return 0; gmfn = mfn_to_gmfn(owner, page_to_mfn(page)); ASSERT(VALID_M2P(gmfn)); /* Page sharing not supported for shadowed domains */ if(!SHARED_M2P(gmfn)) shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn)); } if ( !(type & PGT_partial) ) { page->nr_validated_ptes = 1U << PAGETABLE_ORDER; page->partial_pte = 0; } switch ( type & PGT_type_mask ) { case PGT_l1_page_table: free_l1_table(page); rc = 0; break; case PGT_l2_page_table: rc = free_l2_table(page, preemptible); break; case PGT_l3_page_table: ASSERT(preemptible); rc = free_l3_table(page); break; case PGT_l4_page_table: ASSERT(preemptible); rc = free_l4_table(page); break; default: MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page)); rc = -EINVAL; BUG(); } return rc; } static int __put_final_page_type( struct page_info *page, unsigned long type, int preemptible) { int rc = free_page_type(page, type, preemptible); /* No need for atomic update of type_info here: noone else updates it. */ if ( rc == 0 ) { /* * Record TLB information for flush later. We do not stamp page tables * when running in shadow mode: * 1. Pointless, since it's the shadow pt's which must be tracked. * 2. Shadow mode reuses this field for shadowed page tables to * store flags info -- we don't want to conflict with that. */ if ( !(shadow_mode_enabled(page_get_owner(page)) && (page->count_info & PGC_page_table)) ) page->tlbflush_timestamp = tlbflush_current_time(); wmb(); page->u.inuse.type_info--; } else if ( rc == -EINTR ) { ASSERT((page->u.inuse.type_info & (PGT_count_mask|PGT_validated|PGT_partial)) == 1); if ( !(shadow_mode_enabled(page_get_owner(page)) && (page->count_info & PGC_page_table)) ) page->tlbflush_timestamp = tlbflush_current_time(); wmb(); page->u.inuse.type_info |= PGT_validated; } else { BUG_ON(rc != -EAGAIN); wmb(); get_page_light(page); page->u.inuse.type_info |= PGT_partial; } return rc; } static int __put_page_type(struct page_info *page, int preemptible) { unsigned long nx, x, y = page->u.inuse.type_info; int rc = 0; for ( ; ; ) { x = y; nx = x - 1; ASSERT((x & PGT_count_mask) != 0); if ( unlikely((nx & PGT_count_mask) == 0) ) { if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && likely(nx & (PGT_validated|PGT_partial)) ) { /* * Page-table pages must be unvalidated when count is zero. The * 'free' is safe because the refcnt is non-zero and validated * bit is clear => other ops will spin or fail. */ nx = x & ~(PGT_validated|PGT_partial); if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ) continue; /* We cleared the 'valid bit' so we do the clean up. */ rc = __put_final_page_type(page, x, preemptible); if ( x & PGT_partial ) put_page(page); break; } /* * Record TLB information for flush later. We do not stamp page * tables when running in shadow mode: * 1. Pointless, since it's the shadow pt's which must be tracked. * 2. Shadow mode reuses this field for shadowed page tables to * store flags info -- we don't want to conflict with that. 
*/ if ( !(shadow_mode_enabled(page_get_owner(page)) && (page->count_info & PGC_page_table)) ) page->tlbflush_timestamp = tlbflush_current_time(); } if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) break; if ( preemptible && hypercall_preempt_check() ) return -EINTR; } return rc; } static int __get_page_type(struct page_info *page, unsigned long type, int preemptible) { unsigned long nx, x, y = page->u.inuse.type_info; int rc = 0; ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); for ( ; ; ) { x = y; nx = x + 1; if ( unlikely((nx & PGT_count_mask) == 0) ) { MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page)); return -EINVAL; } else if ( unlikely((x & PGT_count_mask) == 0) ) { struct domain *d = page_get_owner(page); /* Normally we should never let a page go from type count 0 * to type count 1 when it is shadowed. One exception: * out-of-sync shadowed pages are allowed to become * writeable. */ if ( d && shadow_mode_enabled(d) && (page->count_info & PGC_page_table) && !((page->shadow_flags & (1u<<29)) && type == PGT_writable_page) ) shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page))); ASSERT(!(x & PGT_pae_xen_l2)); if ( (x & PGT_type_mask) != type ) { /* * On type change we check to flush stale TLB entries. This * may be unnecessary (e.g., page was GDT/LDT) but those * circumstances should be very rare. */ cpumask_t mask; cpumask_copy(&mask, d->domain_dirty_cpumask); /* Don't flush if the timestamp is old enough */ tlbflush_filter(mask, page->tlbflush_timestamp); if ( unlikely(!cpumask_empty(&mask)) && /* Shadow mode: track only writable pages. */ (!shadow_mode_enabled(page_get_owner(page)) || ((nx & PGT_type_mask) == PGT_writable_page)) ) { perfc_incr(need_flush_tlb_flush); flush_tlb_mask(&mask); } /* We lose existing type and validity. */ nx &= ~(PGT_type_mask | PGT_validated); nx |= type; /* No special validation needed for writable pages. */ /* Page tables and GDT/LDT need to be scanned for validity. */ if ( type == PGT_writable_page || type == PGT_shared_page ) nx |= PGT_validated; } } else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) { /* Don't log failure if it could be a recursive-mapping attempt. */ if ( ((x & PGT_type_mask) == PGT_l2_page_table) && (type == PGT_l1_page_table) ) return -EINVAL; if ( ((x & PGT_type_mask) == PGT_l3_page_table) && (type == PGT_l2_page_table) ) return -EINVAL; if ( ((x & PGT_type_mask) == PGT_l4_page_table) && (type == PGT_l3_page_table) ) return -EINVAL; MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") " "for mfn %lx (pfn %lx)", x, type, page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page))); return -EINVAL; } else if ( unlikely(!(x & PGT_validated)) ) { if ( !(x & PGT_partial) ) { /* Someone else is updating validation of this page. Wait... */ while ( (y = page->u.inuse.type_info) == x ) { if ( preemptible && hypercall_preempt_check() ) return -EINTR; cpu_relax(); } continue; } /* Type ref count was left at 1 when PGT_partial got set. */ ASSERT((x & PGT_count_mask) == 1); nx = x & ~PGT_partial; } if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) break; if ( preemptible && hypercall_preempt_check() ) return -EINTR; } if ( unlikely((x & PGT_type_mask) != type) ) { /* Special pages should not be accessible from devices. 
*/
        struct domain *d = page_get_owner(page);

        if ( d && is_pv_domain(d) && unlikely(need_iommu(d)) )
        {
            if ( (x & PGT_type_mask) == PGT_writable_page )
                iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
            else if ( type == PGT_writable_page )
                iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
                               page_to_mfn(page),
                               IOMMUF_readable|IOMMUF_writable);
        }
    }

    if ( unlikely(!(nx & PGT_validated)) )
    {
        if ( !(x & PGT_partial) )
        {
            page->nr_validated_ptes = 0;
            page->partial_pte = 0;
        }
        rc = alloc_page_type(page, type, preemptible);
    }

    if ( (x & PGT_partial) && !(nx & PGT_partial) )
        put_page(page);

    return rc;
}

void put_page_type(struct page_info *page)
{
    int rc = __put_page_type(page, 0);
    ASSERT(rc == 0);
    (void)rc;
}

int get_page_type(struct page_info *page, unsigned long type)
{
    int rc = __get_page_type(page, type, 0);
    if ( likely(rc == 0) )
        return 1;
    ASSERT(rc != -EINTR && rc != -EAGAIN);
    return 0;
}

int put_page_type_preemptible(struct page_info *page)
{
    return __put_page_type(page, 1);
}

int get_page_type_preemptible(struct page_info *page, unsigned long type)
{
    ASSERT(!current->arch.old_guest_table);
    return __get_page_type(page, type, 1);
}

static int get_spage_pages(struct page_info *page, struct domain *d)
{
    int i;

    for (i = 0; i < (1<<PAGETABLE_ORDER); i++, page++)
    {
        if (!get_page_and_type(page, d, PGT_writable_page))
        {
            while (--i >= 0)
                put_page_and_type(--page);
            return 0;
        }
    }
    return 1;
}

static void put_spage_pages(struct page_info *page)
{
    int i;

    for (i = 0; i < (1<<PAGETABLE_ORDER); i++, page++)
        put_page_and_type(page);
}

static int mark_superpage(struct spage_info *spage, struct domain *d)
{
    unsigned long x, nx, y = spage->type_info;
    int pages_done = 0;

    ASSERT(opt_allow_superpage);
    do {
        x = y;
        nx = x + 1;
        if ( (x & SGT_type_mask) == SGT_mark )
        {
            MEM_LOG("Duplicate superpage mark attempt mfn %lx",
                    spage_to_mfn(spage));
            if ( pages_done )
                put_spage_pages(spage_to_page(spage));
            return -EINVAL;
        }
        if ( (x & SGT_type_mask) == SGT_dynamic )
        {
            if ( pages_done )
            {
                put_spage_pages(spage_to_page(spage));
                pages_done = 0;
            }
        }
        else if ( !pages_done )
        {
            if ( !get_spage_pages(spage_to_page(spage), d) )
            {
                MEM_LOG("Superpage type conflict in mark attempt mfn %lx",
                        spage_to_mfn(spage));
                return -EINVAL;
            }
            pages_done = 1;
        }
        nx = (nx & ~SGT_type_mask) | SGT_mark;
    } while ( (y = cmpxchg(&spage->type_info, x, nx)) != x );

    return 0;
}

static int unmark_superpage(struct spage_info *spage)
{
    unsigned long x, nx, y = spage->type_info;
    unsigned long do_pages = 0;

    ASSERT(opt_allow_superpage);
    do {
        x = y;
        nx = x - 1;
        if ( (x & SGT_type_mask) != SGT_mark )
        {
            MEM_LOG("Attempt to unmark unmarked superpage mfn %lx",
                    spage_to_mfn(spage));
            return -EINVAL;
        }
        if ( (nx & SGT_count_mask) == 0 )
        {
            nx = (nx & ~SGT_type_mask) | SGT_none;
            do_pages = 1;
        }
        else
        {
            nx = (nx & ~SGT_type_mask) | SGT_dynamic;
        }
    } while ( (y = cmpxchg(&spage->type_info, x, nx)) != x );

    if ( do_pages )
        put_spage_pages(spage_to_page(spage));

    return 0;
}

void clear_superpage_mark(struct page_info *page)
{
    struct spage_info *spage;

    if ( !opt_allow_superpage )
        return;

    spage = page_to_spage(page);
    if ((spage->type_info & SGT_type_mask) == SGT_mark)
        unmark_superpage(spage);
}

int get_superpage(unsigned long mfn, struct domain *d)
{
    struct spage_info *spage;
    unsigned long x, nx, y;
    int pages_done = 0;

    ASSERT(opt_allow_superpage);
    spage = mfn_to_spage(mfn);
    y = spage->type_info;
    do {
        x = y;
        nx = x + 1;
        if ( (x & SGT_type_mask) != SGT_none )
        {
            if ( pages_done )
            {
                put_spage_pages(spage_to_page(spage));
                pages_done = 0;
            }
        }
        else
        {
            if ( !get_spage_pages(spage_to_page(spage), d) )
            {
                MEM_LOG("Type conflict on superpage mapping mfn %lx",
                        spage_to_mfn(spage));
                return -EINVAL;
            }
            pages_done = 1;
            nx = (nx & ~SGT_type_mask) | SGT_dynamic;
        }
    } while ( (y = cmpxchg(&spage->type_info, x, nx)) != x );

    return 0;
}

static void put_superpage(unsigned
long mfn) { struct spage_info *spage; unsigned long x, nx, y; unsigned long do_pages = 0; if ( !opt_allow_superpage ) { put_spage_pages(mfn_to_page(mfn)); return; } spage = mfn_to_spage(mfn); y = spage->type_info; do { x = y; nx = x - 1; if ((x & SGT_type_mask) == SGT_dynamic) { if ((nx & SGT_count_mask) == 0) { nx = (nx & ~SGT_type_mask) | SGT_none; do_pages = 1; } } } while ((y = cmpxchg(&spage->type_info, x, nx)) != x); if (do_pages) put_spage_pages(spage_to_page(spage)); return; } int put_old_guest_table(struct vcpu *v) { int rc; if ( !v->arch.old_guest_table ) return 0; switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table) ) { case -EINTR: case -EAGAIN: return -EAGAIN; } v->arch.old_guest_table = NULL; return rc; } int vcpu_destroy_pagetables(struct vcpu *v) { unsigned long mfn = pagetable_get_pfn(v->arch.guest_table); struct page_info *page; l4_pgentry_t *l4tab = NULL; int rc = put_old_guest_table(v); if ( rc ) return rc; if ( is_pv_32on64_vcpu(v) ) { l4tab = map_domain_page(mfn); mfn = l4e_get_pfn(*l4tab); } if ( mfn ) { page = mfn_to_page(mfn); if ( paging_mode_refcounts(v->domain) ) put_page(page); else rc = put_page_and_type_preemptible(page); } if ( l4tab ) { if ( !rc ) l4e_write(l4tab, l4e_empty()); unmap_domain_page(l4tab); } else if ( !rc ) { v->arch.guest_table = pagetable_null(); /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */ mfn = pagetable_get_pfn(v->arch.guest_table_user); if ( mfn ) { page = mfn_to_page(mfn); if ( paging_mode_refcounts(v->domain) ) put_page(page); else rc = put_page_and_type_preemptible(page); } if ( !rc ) v->arch.guest_table_user = pagetable_null(); } v->arch.cr3 = 0; return rc; } int new_guest_cr3(unsigned long mfn) { struct vcpu *curr = current; struct domain *d = curr->domain; int rc; unsigned long old_base_mfn; if ( is_pv_32on64_domain(d) ) { unsigned long gt_mfn = pagetable_get_pfn(curr->arch.guest_table); l4_pgentry_t *pl4e = map_domain_page(gt_mfn); rc = paging_mode_refcounts(d) ? -EINVAL /* Old code was broken, but what should it be? */ : mod_l4_entry( pl4e, l4e_from_pfn( mfn, (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)), gt_mfn, 0, curr); unmap_domain_page(pl4e); switch ( rc ) { case 0: break; case -EINTR: case -EAGAIN: return -EAGAIN; default: MEM_LOG("Error while installing new compat baseptr %lx", mfn); return rc; } invalidate_shadow_ldt(curr, 0); write_ptbase(curr); return 0; } rc = put_old_guest_table(curr); if ( unlikely(rc) ) return rc; old_base_mfn = pagetable_get_pfn(curr->arch.guest_table); /* * This is particularly important when getting restarted after the * previous attempt got preempted in the put-old-MFN phase. */ if ( old_base_mfn == mfn ) { write_ptbase(curr); return 0; } rc = paging_mode_refcounts(d) ? (get_page_from_pagenr(mfn, d) ? 
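        /*
         * Take a reference on the new base page table: for domains whose
         * paging mode manages reference counts a plain general reference
         * suffices; otherwise the frame must also be validated as a root
         * (L4) page table.  The latter is done preemptibly, so -EINTR and
         * -EAGAIN are handled by the switch statement below.
         */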
0 : -EINVAL) : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 1); switch ( rc ) { case 0: break; case -EINTR: case -EAGAIN: return -EAGAIN; default: MEM_LOG("Error while installing new baseptr %lx", mfn); return rc; } invalidate_shadow_ldt(curr, 0); curr->arch.guest_table = pagetable_from_pfn(mfn); update_cr3(curr); write_ptbase(curr); if ( likely(old_base_mfn != 0) ) { struct page_info *page = mfn_to_page(old_base_mfn); if ( paging_mode_refcounts(d) ) put_page(page); else switch ( rc = put_page_and_type_preemptible(page) ) { case -EINTR: rc = -EAGAIN; case -EAGAIN: curr->arch.old_guest_table = page; break; default: BUG_ON(rc); break; } } return rc; } static struct domain *get_pg_owner(domid_t domid) { struct domain *pg_owner = NULL, *curr = current->domain; if ( likely(domid == DOMID_SELF) ) { pg_owner = rcu_lock_current_domain(); goto out; } if ( unlikely(domid == curr->domain_id) ) { MEM_LOG("Cannot specify itself as foreign domain"); goto out; } if ( unlikely(paging_mode_translate(curr)) ) { MEM_LOG("Cannot mix foreign mappings with translated domains"); goto out; } switch ( domid ) { case DOMID_IO: pg_owner = rcu_lock_domain(dom_io); break; case DOMID_XEN: pg_owner = rcu_lock_domain(dom_xen); break; default: if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL ) { MEM_LOG("Unknown domain '%u'", domid); break; } break; } out: return pg_owner; } static void put_pg_owner(struct domain *pg_owner) { rcu_unlock_domain(pg_owner); } static inline int vcpumask_to_pcpumask( struct domain *d, XEN_GUEST_HANDLE_PARAM(const_void) bmap, cpumask_t *pmask) { unsigned int vcpu_id, vcpu_bias, offs; unsigned long vmask; struct vcpu *v; bool_t is_native = !is_pv_32on64_domain(d); cpumask_clear(pmask); for ( vmask = 0, offs = 0; ; ++offs) { vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32); if ( vcpu_bias >= d->max_vcpus ) return 0; if ( unlikely(is_native ? copy_from_guest_offset(&vmask, bmap, offs, 1) : copy_from_guest_offset((unsigned int *)&vmask, bmap, offs, 1)) ) { cpumask_clear(pmask); return -EFAULT; } while ( vmask ) { vcpu_id = find_first_set_bit(vmask); vmask &= ~(1UL << vcpu_id); vcpu_id += vcpu_bias; if ( (vcpu_id >= d->max_vcpus) ) return 0; if ( ((v = d->vcpu[vcpu_id]) != NULL) ) cpumask_or(pmask, pmask, v->vcpu_dirty_cpumask); } } } long do_mmuext_op( XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops, unsigned int count, XEN_GUEST_HANDLE_PARAM(uint) pdone, unsigned int foreigndom) { struct mmuext_op op; unsigned long type; unsigned int i, done = 0; struct vcpu *curr = current; struct domain *d = curr->domain; struct domain *pg_owner; int okay, rc = put_old_guest_table(curr); if ( unlikely(rc) ) { if ( likely(rc == -EAGAIN) ) rc = hypercall_create_continuation( __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone, foreigndom); return rc; } if ( unlikely(count == MMU_UPDATE_PREEMPTED) && likely(guest_handle_is_null(uops)) ) { /* See the curr->arch.old_guest_table related * hypercall_create_continuation() below. 
*/ return (int)foreigndom; } if ( unlikely(count & MMU_UPDATE_PREEMPTED) ) { count &= ~MMU_UPDATE_PREEMPTED; if ( unlikely(!guest_handle_is_null(pdone)) ) (void)copy_from_guest(&done, pdone, 1); } else perfc_incr(calls_to_mmuext_op); if ( unlikely(!guest_handle_okay(uops, count)) ) return -EFAULT; if ( (pg_owner = get_pg_owner(foreigndom)) == NULL ) return -ESRCH; rc = xsm_mmuext_op(XSM_TARGET, d, pg_owner); if ( rc ) { put_pg_owner(pg_owner); return rc; } for ( i = 0; i < count; i++ ) { if ( curr->arch.old_guest_table || hypercall_preempt_check() ) { rc = -EAGAIN; break; } if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) ) { MEM_LOG("Bad __copy_from_guest"); rc = -EFAULT; break; } okay = 1; switch ( op.cmd ) { case MMUEXT_PIN_L1_TABLE: type = PGT_l1_page_table; goto pin_page; case MMUEXT_PIN_L2_TABLE: type = PGT_l2_page_table; goto pin_page; case MMUEXT_PIN_L3_TABLE: type = PGT_l3_page_table; goto pin_page; case MMUEXT_PIN_L4_TABLE: if ( is_pv_32bit_domain(pg_owner) ) break; type = PGT_l4_page_table; pin_page: { struct page_info *page; /* Ignore pinning of invalid paging levels. */ if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) ) break; if ( paging_mode_refcounts(pg_owner) ) break; page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC); if ( unlikely(!page) ) { okay = 0; break; } rc = get_page_type_preemptible(page, type); okay = !rc; if ( unlikely(!okay) ) { if ( rc == -EINTR ) rc = -EAGAIN; else if ( rc != -EAGAIN ) MEM_LOG("Error while pinning mfn %lx", page_to_mfn(page)); if ( page != curr->arch.old_guest_table ) put_page(page); break; } if ( (rc = xsm_memory_pin_page(XSM_HOOK, d, pg_owner, page)) != 0 ) okay = 0; else if ( unlikely(test_and_set_bit(_PGT_pinned, &page->u.inuse.type_info)) ) { MEM_LOG("Mfn %lx already pinned", page_to_mfn(page)); okay = 0; } if ( unlikely(!okay) ) goto pin_drop; /* A page is dirtied when its pin status is set. */ paging_mark_dirty(pg_owner, page_to_mfn(page)); /* We can race domain destruction (domain_relinquish_resources). */ if ( unlikely(pg_owner != d) ) { int drop_ref; spin_lock(&pg_owner->page_alloc_lock); drop_ref = (pg_owner->is_dying && test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)); spin_unlock(&pg_owner->page_alloc_lock); if ( drop_ref ) { pin_drop: if ( type == PGT_l1_page_table ) put_page_and_type(page); else curr->arch.old_guest_table = page; } } break; } case MMUEXT_UNPIN_TABLE: { struct page_info *page; if ( paging_mode_refcounts(pg_owner) ) break; page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC); if ( unlikely(!page) ) { okay = 0; MEM_LOG("Mfn %lx bad domain", op.arg1.mfn); break; } if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) { okay = 0; put_page(page); MEM_LOG("Mfn %lx not pinned", op.arg1.mfn); break; } switch ( rc = put_page_and_type_preemptible(page) ) { case -EINTR: case -EAGAIN: curr->arch.old_guest_table = page; rc = 0; break; default: BUG_ON(rc); break; } put_page(page); /* A page is dirtied when its pin status is cleared. */ paging_mark_dirty(pg_owner, page_to_mfn(page)); break; } case MMUEXT_NEW_BASEPTR: if ( paging_mode_translate(d) ) okay = 0; else { rc = new_guest_cr3(op.arg1.mfn); okay = !rc; } break; case MMUEXT_NEW_USER_BASEPTR: { unsigned long old_mfn; if ( paging_mode_translate(current->domain) ) { okay = 0; break; } old_mfn = pagetable_get_pfn(curr->arch.guest_table_user); /* * This is particularly important when getting restarted after the * previous attempt got preempted in the put-old-MFN phase. 
*/ if ( old_mfn == op.arg1.mfn ) break; if ( op.arg1.mfn != 0 ) { if ( paging_mode_refcounts(d) ) okay = get_page_from_pagenr(op.arg1.mfn, d); else { rc = get_page_and_type_from_pagenr( op.arg1.mfn, PGT_root_page_table, d, 0, 1); okay = !rc; } if ( unlikely(!okay) ) { if ( rc == -EINTR ) rc = -EAGAIN; else if ( rc != -EAGAIN ) MEM_LOG("Error while installing new mfn %lx", op.arg1.mfn); break; } } curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn); if ( old_mfn != 0 ) { struct page_info *page = mfn_to_page(old_mfn); if ( paging_mode_refcounts(d) ) put_page(page); else switch ( rc = put_page_and_type_preemptible(page) ) { case -EINTR: rc = -EAGAIN; case -EAGAIN: curr->arch.old_guest_table = page; okay = 0; break; default: BUG_ON(rc); break; } } break; } case MMUEXT_TLB_FLUSH_LOCAL: flush_tlb_local(); break; case MMUEXT_INVLPG_LOCAL: if ( !paging_mode_enabled(d) || paging_invlpg(curr, op.arg1.linear_addr) != 0 ) flush_tlb_one_local(op.arg1.linear_addr); break; case MMUEXT_TLB_FLUSH_MULTI: case MMUEXT_INVLPG_MULTI: { cpumask_t pmask; if ( unlikely(vcpumask_to_pcpumask(d, guest_handle_to_param(op.arg2.vcpumask, const_void), &pmask)) ) { okay = 0; break; } if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI ) flush_tlb_mask(&pmask); else flush_tlb_one_mask(&pmask, op.arg1.linear_addr); break; } case MMUEXT_TLB_FLUSH_ALL: flush_tlb_mask(d->domain_dirty_cpumask); break; case MMUEXT_INVLPG_ALL: flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr); break; case MMUEXT_FLUSH_CACHE: if ( unlikely(!cache_flush_permitted(d)) ) { MEM_LOG("Non-physdev domain tried to FLUSH_CACHE."); okay = 0; } else { wbinvd(); } break; case MMUEXT_FLUSH_CACHE_GLOBAL: if ( unlikely(foreigndom != DOMID_SELF) ) okay = 0; else if ( likely(cache_flush_permitted(d)) ) { unsigned int cpu; cpumask_t mask; cpumask_clear(&mask); for_each_online_cpu(cpu) if ( !cpumask_intersects(&mask, per_cpu(cpu_sibling_mask, cpu)) ) cpumask_set_cpu(cpu, &mask); flush_mask(&mask, FLUSH_CACHE); } else { MEM_LOG("Non-physdev domain tried to FLUSH_CACHE_GLOBAL"); okay = 0; } break; case MMUEXT_SET_LDT: { unsigned long ptr = op.arg1.linear_addr; unsigned long ents = op.arg2.nr_ents; if ( paging_mode_external(d) ) { MEM_LOG("ignoring SET_LDT hypercall from external domain"); okay = 0; } else if ( ((ptr & (PAGE_SIZE - 1)) != 0) || !__addr_ok(ptr) || (ents > 8192) ) { okay = 0; MEM_LOG("Bad args to SET_LDT: ptr=%lx, ents=%lx", ptr, ents); } else if ( (curr->arch.pv_vcpu.ldt_ents != ents) || (curr->arch.pv_vcpu.ldt_base != ptr) ) { invalidate_shadow_ldt(curr, 0); flush_tlb_local(); curr->arch.pv_vcpu.ldt_base = ptr; curr->arch.pv_vcpu.ldt_ents = ents; load_LDT(curr); } break; } case MMUEXT_CLEAR_PAGE: { struct page_info *page; page = get_page_from_gfn(d, op.arg1.mfn, NULL, P2M_ALLOC); if ( !page || !get_page_type(page, PGT_writable_page) ) { if ( page ) put_page(page); MEM_LOG("Error while clearing mfn %lx", op.arg1.mfn); okay = 0; break; } /* A page is dirtied when it's being cleared. 
*/ paging_mark_dirty(d, page_to_mfn(page)); clear_domain_page(page_to_mfn(page)); put_page_and_type(page); break; } case MMUEXT_COPY_PAGE: { struct page_info *src_page, *dst_page; src_page = get_page_from_gfn(d, op.arg2.src_mfn, NULL, P2M_ALLOC); if ( unlikely(!src_page) ) { okay = 0; MEM_LOG("Error while copying from mfn %lx", op.arg2.src_mfn); break; } dst_page = get_page_from_gfn(d, op.arg1.mfn, NULL, P2M_ALLOC); okay = (dst_page && get_page_type(dst_page, PGT_writable_page)); if ( unlikely(!okay) ) { put_page(src_page); if ( dst_page ) put_page(dst_page); MEM_LOG("Error while copying to mfn %lx", op.arg1.mfn); break; } /* A page is dirtied when it's being copied to. */ paging_mark_dirty(d, page_to_mfn(dst_page)); copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page)); put_page_and_type(dst_page); put_page(src_page); break; } case MMUEXT_MARK_SUPER: { unsigned long mfn; struct spage_info *spage; mfn = op.arg1.mfn; if ( mfn & (L1_PAGETABLE_ENTRIES-1) ) { MEM_LOG("Unaligned superpage reference mfn %lx", mfn); okay = 0; break; } if ( !opt_allow_superpage ) { MEM_LOG("Superpages disallowed"); okay = 0; rc = -ENOSYS; break; } spage = mfn_to_spage(mfn); okay = (mark_superpage(spage, d) >= 0); break; } case MMUEXT_UNMARK_SUPER: { unsigned long mfn; struct spage_info *spage; mfn = op.arg1.mfn; if ( mfn & (L1_PAGETABLE_ENTRIES-1) ) { MEM_LOG("Unaligned superpage reference mfn %lx", mfn); okay = 0; break; } if ( !opt_allow_superpage ) { MEM_LOG("Superpages disallowed"); okay = 0; rc = -ENOSYS; break; } spage = mfn_to_spage(mfn); okay = (unmark_superpage(spage) >= 0); break; } default: MEM_LOG("Invalid extended pt command %#x", op.cmd); rc = -ENOSYS; okay = 0; break; } if ( unlikely(!okay) ) { rc = rc ? rc : -EINVAL; break; } guest_handle_add_offset(uops, 1); } if ( rc == -EAGAIN ) { ASSERT(i < count); rc = hypercall_create_continuation( __HYPERVISOR_mmuext_op, "hihi", uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); } else if ( curr->arch.old_guest_table ) { XEN_GUEST_HANDLE_PARAM(void) null; ASSERT(rc || i == count); set_xen_guest_handle(null, NULL); /* * In order to have a way to communicate the final return value to * our continuation, we pass this in place of "foreigndom", building * on the fact that this argument isn't needed anymore. */ rc = hypercall_create_continuation( __HYPERVISOR_mmuext_op, "hihi", null, MMU_UPDATE_PREEMPTED, null, rc); } put_pg_owner(pg_owner); perfc_add(num_mmuext_ops, i); /* Add incremental work we have done to the @done output parameter. */ if ( unlikely(!guest_handle_is_null(pdone)) ) { done += i; copy_to_guest(pdone, &done, 1); } return rc; } long do_mmu_update( XEN_GUEST_HANDLE_PARAM(mmu_update_t) ureqs, unsigned int count, XEN_GUEST_HANDLE_PARAM(uint) pdone, unsigned int foreigndom) { struct mmu_update req; void *va; unsigned long gpfn, gmfn, mfn; struct page_info *page; unsigned int cmd, i = 0, done = 0, pt_dom; struct vcpu *curr = current, *v = curr; struct domain *d = v->domain, *pt_owner = d, *pg_owner; struct domain_mmap_cache mapcache; uint32_t xsm_needed = 0; uint32_t xsm_checked = 0; int rc = put_old_guest_table(curr); if ( unlikely(rc) ) { if ( likely(rc == -EAGAIN) ) rc = hypercall_create_continuation( __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone, foreigndom); return rc; } if ( unlikely(count == MMU_UPDATE_PREEMPTED) && likely(guest_handle_is_null(ureqs)) ) { /* See the curr->arch.old_guest_table related * hypercall_create_continuation() below. 
*/ return (int)foreigndom; } if ( unlikely(count & MMU_UPDATE_PREEMPTED) ) { count &= ~MMU_UPDATE_PREEMPTED; if ( unlikely(!guest_handle_is_null(pdone)) ) (void)copy_from_guest(&done, pdone, 1); } else perfc_incr(calls_to_mmu_update); if ( unlikely(!guest_handle_okay(ureqs, count)) ) return -EFAULT; if ( (pt_dom = foreigndom >> 16) != 0 ) { /* Pagetables belong to a foreign domain (PFD). */ if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL ) return -EINVAL; if ( pt_owner == d ) rcu_unlock_domain(pt_owner); else if ( !pt_owner->vcpu || (v = pt_owner->vcpu[0]) == NULL ) { rc = -EINVAL; goto out; } } if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL ) { rc = -ESRCH; goto out; } domain_mmap_cache_init(&mapcache); for ( i = 0; i < count; i++ ) { if ( curr->arch.old_guest_table || hypercall_preempt_check() ) { rc = -EAGAIN; break; } if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) ) { MEM_LOG("Bad __copy_from_guest"); rc = -EFAULT; break; } cmd = req.ptr & (sizeof(l1_pgentry_t)-1); switch ( cmd ) { /* * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table. * MMU_UPDATE_PT_PRESERVE_AD: As above but also preserve (OR) * current A/D bits. */ case MMU_NORMAL_PT_UPDATE: case MMU_PT_UPDATE_PRESERVE_AD: { p2m_type_t p2mt; xsm_needed |= XSM_MMU_NORMAL_UPDATE; if ( get_pte_flags(req.val) & _PAGE_PRESENT ) { xsm_needed |= XSM_MMU_UPDATE_READ; if ( get_pte_flags(req.val) & _PAGE_RW ) xsm_needed |= XSM_MMU_UPDATE_WRITE; } if ( xsm_needed != xsm_checked ) { rc = xsm_mmu_update(XSM_TARGET, d, pt_owner, pg_owner, xsm_needed); if ( rc ) break; xsm_checked = xsm_needed; } rc = -EINVAL; req.ptr -= cmd; gmfn = req.ptr >> PAGE_SHIFT; page = get_page_from_gfn(pt_owner, gmfn, &p2mt, P2M_ALLOC); if ( p2m_is_paged(p2mt) ) { ASSERT(!page); p2m_mem_paging_populate(pg_owner, gmfn); rc = -ENOENT; break; } if ( unlikely(!page) ) { MEM_LOG("Could not get page for normal update"); break; } mfn = page_to_mfn(page); va = map_domain_page_with_cache(mfn, &mapcache); va = (void *)((unsigned long)va + (unsigned long)(req.ptr & ~PAGE_MASK)); if ( page_lock(page) ) { switch ( page->u.inuse.type_info & PGT_type_mask ) { case PGT_l1_page_table: { l1_pgentry_t l1e = l1e_from_intpte(req.val); p2m_type_t l1e_p2mt = p2m_ram_rw; struct page_info *target = NULL; p2m_query_t q = (l1e_get_flags(l1e) & _PAGE_RW) ? P2M_UNSHARE : P2M_ALLOC; if ( paging_mode_translate(pg_owner) ) target = get_page_from_gfn(pg_owner, l1e_get_pfn(l1e), &l1e_p2mt, q); if ( p2m_is_paged(l1e_p2mt) ) { if ( target ) put_page(target); p2m_mem_paging_populate(pg_owner, l1e_get_pfn(l1e)); rc = -ENOENT; break; } else if ( p2m_ram_paging_in == l1e_p2mt && !target ) { rc = -ENOENT; break; } /* If we tried to unshare and failed */ else if ( (q & P2M_UNSHARE) && p2m_is_shared(l1e_p2mt) ) { /* We could not have obtained a page ref. */ ASSERT(target == NULL); /* And mem_sharing_notify has already been called. 
*/ rc = -ENOMEM; break; } rc = mod_l1_entry(va, l1e, mfn, cmd == MMU_PT_UPDATE_PRESERVE_AD, v, pg_owner); if ( target ) put_page(target); } break; case PGT_l2_page_table: rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn, cmd == MMU_PT_UPDATE_PRESERVE_AD, v); break; case PGT_l3_page_table: rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn, cmd == MMU_PT_UPDATE_PRESERVE_AD, v); break; case PGT_l4_page_table: rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn, cmd == MMU_PT_UPDATE_PRESERVE_AD, v); break; case PGT_writable_page: perfc_incr(writable_mmu_updates); if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) ) rc = 0; break; } page_unlock(page); if ( rc == -EINTR ) rc = -EAGAIN; } else if ( get_page_type(page, PGT_writable_page) ) { perfc_incr(writable_mmu_updates); if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) ) rc = 0; put_page_type(page); } unmap_domain_page_with_cache(va, &mapcache); put_page(page); } break; case MMU_MACHPHYS_UPDATE: mfn = req.ptr >> PAGE_SHIFT; gpfn = req.val; xsm_needed |= XSM_MMU_MACHPHYS_UPDATE; if ( xsm_needed != xsm_checked ) { rc = xsm_mmu_update(XSM_TARGET, d, NULL, pg_owner, xsm_needed); if ( rc ) break; xsm_checked = xsm_needed; } if ( unlikely(!get_page_from_pagenr(mfn, pg_owner)) ) { MEM_LOG("Could not get page for mach->phys update"); rc = -EINVAL; break; } if ( unlikely(paging_mode_translate(pg_owner)) ) { MEM_LOG("Mach-phys update on auto-translate guest"); rc = -EINVAL; break; } set_gpfn_from_mfn(mfn, gpfn); paging_mark_dirty(pg_owner, mfn); put_page(mfn_to_page(mfn)); break; default: MEM_LOG("Invalid page update command %x", cmd); rc = -ENOSYS; break; } if ( unlikely(rc) ) break; guest_handle_add_offset(ureqs, 1); } if ( rc == -EAGAIN ) { ASSERT(i < count); rc = hypercall_create_continuation( __HYPERVISOR_mmu_update, "hihi", ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); } else if ( curr->arch.old_guest_table ) { XEN_GUEST_HANDLE_PARAM(void) null; ASSERT(rc || i == count); set_xen_guest_handle(null, NULL); /* * In order to have a way to communicate the final return value to * our continuation, we pass this in place of "foreigndom", building * on the fact that this argument isn't needed anymore. */ rc = hypercall_create_continuation( __HYPERVISOR_mmu_update, "hihi", null, MMU_UPDATE_PREEMPTED, null, rc); } put_pg_owner(pg_owner); domain_mmap_cache_destroy(&mapcache); perfc_add(num_page_updates, i); out: if ( pt_owner && (pt_owner != d) ) rcu_unlock_domain(pt_owner); /* Add incremental work we have done to the @done output parameter. 
*/ if ( unlikely(!guest_handle_is_null(pdone)) ) { done += i; copy_to_guest(pdone, &done, 1); } return rc; } static int create_grant_pte_mapping( uint64_t pte_addr, l1_pgentry_t nl1e, struct vcpu *v) { int rc = GNTST_okay; void *va; unsigned long gmfn, mfn; struct page_info *page; l1_pgentry_t ol1e; struct domain *d = v->domain; adjust_guest_l1e(nl1e, d); gmfn = pte_addr >> PAGE_SHIFT; page = get_page_from_gfn(d, gmfn, NULL, P2M_ALLOC); if ( unlikely(!page) ) { MEM_LOG("Could not get page for normal update"); return GNTST_general_error; } mfn = page_to_mfn(page); va = map_domain_page(mfn); va = (void *)((unsigned long)va + ((unsigned long)pte_addr & ~PAGE_MASK)); if ( !page_lock(page) ) { rc = GNTST_general_error; goto failed; } if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table ) { page_unlock(page); rc = GNTST_general_error; goto failed; } ol1e = *(l1_pgentry_t *)va; if ( !UPDATE_ENTRY(l1, (l1_pgentry_t *)va, ol1e, nl1e, mfn, v, 0) ) { page_unlock(page); rc = GNTST_general_error; goto failed; } page_unlock(page); if ( !paging_mode_refcounts(d) ) put_page_from_l1e(ol1e, d); failed: unmap_domain_page(va); put_page(page); return rc; } static int destroy_grant_pte_mapping( uint64_t addr, unsigned long frame, struct domain *d) { int rc = GNTST_okay; void *va; unsigned long gmfn, mfn; struct page_info *page; l1_pgentry_t ol1e; gmfn = addr >> PAGE_SHIFT; page = get_page_from_gfn(d, gmfn, NULL, P2M_ALLOC); if ( unlikely(!page) ) { MEM_LOG("Could not get page for normal update"); return GNTST_general_error; } mfn = page_to_mfn(page); va = map_domain_page(mfn); va = (void *)((unsigned long)va + ((unsigned long)addr & ~PAGE_MASK)); if ( !page_lock(page) ) { rc = GNTST_general_error; goto failed; } if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table ) { page_unlock(page); rc = GNTST_general_error; goto failed; } ol1e = *(l1_pgentry_t *)va; /* Check that the virtual address supplied is actually mapped to frame. */ if ( unlikely(l1e_get_pfn(ol1e) != frame) ) { page_unlock(page); MEM_LOG("PTE entry %lx for address %"PRIx64" doesn't match frame %lx", (unsigned long)l1e_get_intpte(ol1e), addr, frame); rc = GNTST_general_error; goto failed; } /* Delete pagetable entry. */ if ( unlikely(!UPDATE_ENTRY (l1, (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, d->vcpu[0] /* Change if we go to per-vcpu shadows. */, 0)) ) { page_unlock(page); MEM_LOG("Cannot delete PTE entry at %p", va); rc = GNTST_general_error; goto failed; } page_unlock(page); failed: unmap_domain_page(va); put_page(page); return rc; } static int create_grant_va_mapping( unsigned long va, l1_pgentry_t nl1e, struct vcpu *v) { l1_pgentry_t *pl1e, ol1e; struct domain *d = v->domain; unsigned long gl1mfn; struct page_info *l1pg; int okay; adjust_guest_l1e(nl1e, d); pl1e = guest_map_l1e(v, va, &gl1mfn); if ( !pl1e ) { MEM_LOG("Could not find L1 PTE for address %lx", va); return GNTST_general_error; } if ( !get_page_from_pagenr(gl1mfn, current->domain) ) { guest_unmap_l1e(v, pl1e); return GNTST_general_error; } l1pg = mfn_to_page(gl1mfn); if ( !page_lock(l1pg) ) { put_page(l1pg); guest_unmap_l1e(v, pl1e); return GNTST_general_error; } if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table ) { page_unlock(l1pg); put_page(l1pg); guest_unmap_l1e(v, pl1e); return GNTST_general_error; } ol1e = *pl1e; okay = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0); page_unlock(l1pg); put_page(l1pg); guest_unmap_l1e(v, pl1e); if ( okay && !paging_mode_refcounts(d) ) put_page_from_l1e(ol1e, d); return okay ? 
GNTST_okay : GNTST_general_error; } static int replace_grant_va_mapping( unsigned long addr, unsigned long frame, l1_pgentry_t nl1e, struct vcpu *v) { l1_pgentry_t *pl1e, ol1e; unsigned long gl1mfn; struct page_info *l1pg; int rc = 0; pl1e = guest_map_l1e(v, addr, &gl1mfn); if ( !pl1e ) { MEM_LOG("Could not find L1 PTE for address %lx", addr); return GNTST_general_error; } if ( !get_page_from_pagenr(gl1mfn, current->domain) ) { rc = GNTST_general_error; goto out; } l1pg = mfn_to_page(gl1mfn); if ( !page_lock(l1pg) ) { rc = GNTST_general_error; put_page(l1pg); goto out; } if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table ) { rc = GNTST_general_error; goto unlock_and_out; } ol1e = *pl1e; /* Check that the virtual address supplied is actually mapped to frame. */ if ( unlikely(l1e_get_pfn(ol1e) != frame) ) { MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx", l1e_get_pfn(ol1e), addr, frame); rc = GNTST_general_error; goto unlock_and_out; } /* Delete pagetable entry. */ if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, v, 0)) ) { MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e); rc = GNTST_general_error; goto unlock_and_out; } unlock_and_out: page_unlock(l1pg); put_page(l1pg); out: guest_unmap_l1e(v, pl1e); return rc; } static int destroy_grant_va_mapping( unsigned long addr, unsigned long frame, struct vcpu *v) { return replace_grant_va_mapping(addr, frame, l1e_empty(), v); } static int create_grant_p2m_mapping(uint64_t addr, unsigned long frame, unsigned int flags, unsigned int cache_flags) { p2m_type_t p2mt; int rc; if ( cache_flags || (flags & ~GNTMAP_readonly) != GNTMAP_host_map ) return GNTST_general_error; if ( flags & GNTMAP_readonly ) p2mt = p2m_grant_map_ro; else p2mt = p2m_grant_map_rw; rc = guest_physmap_add_entry(current->domain, addr >> PAGE_SHIFT, frame, PAGE_ORDER_4K, p2mt); if ( rc ) return GNTST_general_error; else return GNTST_okay; } int create_grant_host_mapping(uint64_t addr, unsigned long frame, unsigned int flags, unsigned int cache_flags) { l1_pgentry_t pte; uint32_t grant_pte_flags; if ( paging_mode_external(current->domain) ) return create_grant_p2m_mapping(addr, frame, flags, cache_flags); grant_pte_flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_GNTTAB; if ( cpu_has_nx ) grant_pte_flags |= _PAGE_NX_BIT; pte = l1e_from_pfn(frame, grant_pte_flags); if ( (flags & GNTMAP_application_map) ) l1e_add_flags(pte,_PAGE_USER); if ( !(flags & GNTMAP_readonly) ) l1e_add_flags(pte,_PAGE_RW); l1e_add_flags(pte, ((flags >> _GNTMAP_guest_avail0) * _PAGE_AVAIL0) & _PAGE_AVAIL); l1e_add_flags(pte, cacheattr_to_pte_flags(cache_flags >> 5)); if ( flags & GNTMAP_contains_pte ) return create_grant_pte_mapping(addr, pte, current); return create_grant_va_mapping(addr, pte, current); } static int replace_grant_p2m_mapping( uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags) { unsigned long gfn = (unsigned long)(addr >> PAGE_SHIFT); p2m_type_t type; mfn_t old_mfn; struct domain *d = current->domain; if ( new_addr != 0 || (flags & GNTMAP_contains_pte) ) return GNTST_general_error; old_mfn = get_gfn(d, gfn, &type); if ( !p2m_is_grant(type) || mfn_x(old_mfn) != frame ) { put_gfn(d, gfn); gdprintk(XENLOG_WARNING, "replace_grant_p2m_mapping: old mapping invalid (type %d, mfn %lx, frame %lx)\n", type, mfn_x(old_mfn), frame); return GNTST_general_error; } guest_physmap_remove_page(d, gfn, frame, PAGE_ORDER_4K); put_gfn(d, gfn); return GNTST_okay; } int replace_grant_host_mapping( uint64_t addr, unsigned long 
frame, uint64_t new_addr, unsigned int flags) { struct vcpu *curr = current; l1_pgentry_t *pl1e, ol1e; unsigned long gl1mfn; struct page_info *l1pg; int rc; if ( paging_mode_external(current->domain) ) return replace_grant_p2m_mapping(addr, frame, new_addr, flags); if ( flags & GNTMAP_contains_pte ) { if ( !new_addr ) return destroy_grant_pte_mapping(addr, frame, curr->domain); MEM_LOG("Unsupported grant table operation"); return GNTST_general_error; } if ( !new_addr ) return destroy_grant_va_mapping(addr, frame, curr); pl1e = guest_map_l1e(curr, new_addr, &gl1mfn); if ( !pl1e ) { MEM_LOG("Could not find L1 PTE for address %lx", (unsigned long)new_addr); return GNTST_general_error; } if ( !get_page_from_pagenr(gl1mfn, current->domain) ) { guest_unmap_l1e(curr, pl1e); return GNTST_general_error; } l1pg = mfn_to_page(gl1mfn); if ( !page_lock(l1pg) ) { put_page(l1pg); guest_unmap_l1e(curr, pl1e); return GNTST_general_error; } if ( (l1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table ) { page_unlock(l1pg); put_page(l1pg); guest_unmap_l1e(curr, pl1e); return GNTST_general_error; } ol1e = *pl1e; if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, l1e_empty(), gl1mfn, curr, 0)) ) { page_unlock(l1pg); put_page(l1pg); MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e); guest_unmap_l1e(curr, pl1e); return GNTST_general_error; } page_unlock(l1pg); put_page(l1pg); guest_unmap_l1e(curr, pl1e); rc = replace_grant_va_mapping(addr, frame, ol1e, curr); if ( rc && !paging_mode_refcounts(curr->domain) ) put_page_from_l1e(ol1e, curr->domain); return rc; } int donate_page( struct domain *d, struct page_info *page, unsigned int memflags) { spin_lock(&d->page_alloc_lock); if ( is_xen_heap_page(page) || (page_get_owner(page) != NULL) ) goto fail; if ( d->is_dying ) goto fail; if ( page->count_info & ~(PGC_allocated | 1) ) goto fail; if ( !(memflags & MEMF_no_refcount) ) { if ( d->tot_pages >= d->max_pages ) goto fail; domain_adjust_tot_pages(d, 1); } page->count_info = PGC_allocated | 1; page_set_owner(page, d); page_list_add_tail(page,&d->page_list); spin_unlock(&d->page_alloc_lock); return 0; fail: spin_unlock(&d->page_alloc_lock); MEM_LOG("Bad donate %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info, (void *)page_to_mfn(page), d, d->domain_id, page_get_owner(page), page->count_info, page->u.inuse.type_info); return -1; } int steal_page( struct domain *d, struct page_info *page, unsigned int memflags) { unsigned long x, y; bool_t drop_dom_ref = 0; spin_lock(&d->page_alloc_lock); if ( is_xen_heap_page(page) || (page_get_owner(page) != d) ) goto fail; /* * We require there is just one reference (PGC_allocated). We temporarily * drop this reference now so that we can safely swizzle the owner. */ y = page->count_info; do { x = y; if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) ) goto fail; y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask); } while ( y != x ); /* Swizzle the owner then reinstate the PGC_allocated reference. */ page_set_owner(page, NULL); y = page->count_info; do { x = y; BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated); } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x ); /* Unlink from original owner. 
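If this was the last page accounted to the domain, domain_adjust_tot_pages() returns zero and the matching domain reference is dropped once the lock has been released.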
*/ if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) ) drop_dom_ref = 1; page_list_del(page, &d->page_list); spin_unlock(&d->page_alloc_lock); if ( unlikely(drop_dom_ref) ) put_domain(d); return 0; fail: spin_unlock(&d->page_alloc_lock); MEM_LOG("Bad page %p: ed=%p(%u), sd=%p, caf=%08lx, taf=%" PRtype_info, (void *)page_to_mfn(page), d, d->domain_id, page_get_owner(page), page->count_info, page->u.inuse.type_info); return -1; } static int __do_update_va_mapping( unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner) { l1_pgentry_t val = l1e_from_intpte(val64); struct vcpu *v = current; struct domain *d = v->domain; struct page_info *gl1pg; l1_pgentry_t *pl1e; unsigned long bmap_ptr, gl1mfn; cpumask_t pmask; int rc; perfc_incr(calls_to_update_va); rc = xsm_update_va_mapping(XSM_TARGET, d, pg_owner, val); if ( rc ) return rc; rc = -EINVAL; pl1e = guest_map_l1e(v, va, &gl1mfn); if ( unlikely(!pl1e || !get_page_from_pagenr(gl1mfn, d)) ) goto out; gl1pg = mfn_to_page(gl1mfn); if ( !page_lock(gl1pg) ) { put_page(gl1pg); goto out; } if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table ) { page_unlock(gl1pg); put_page(gl1pg); goto out; } rc = mod_l1_entry(pl1e, val, gl1mfn, 0, v, pg_owner); page_unlock(gl1pg); put_page(gl1pg); out: if ( pl1e ) guest_unmap_l1e(v, pl1e); switch ( flags & UVMF_FLUSHTYPE_MASK ) { case UVMF_TLB_FLUSH: switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) ) { case UVMF_LOCAL: flush_tlb_local(); break; case UVMF_ALL: flush_tlb_mask(d->domain_dirty_cpumask); break; default: rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr, void), &pmask); flush_tlb_mask(&pmask); break; } break; case UVMF_INVLPG: switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) ) { case UVMF_LOCAL: if ( !paging_mode_enabled(d) || (paging_invlpg(v, va) != 0) ) flush_tlb_one_local(va); break; case UVMF_ALL: flush_tlb_one_mask(d->domain_dirty_cpumask, va); break; default: rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr, void), &pmask); flush_tlb_one_mask(&pmask, va); break; } break; } return rc; } long do_update_va_mapping(unsigned long va, u64 val64, unsigned long flags) { return __do_update_va_mapping(va, val64, flags, current->domain); } long do_update_va_mapping_otherdomain(unsigned long va, u64 val64, unsigned long flags, domid_t domid) { struct domain *pg_owner; int rc; if ( (pg_owner = get_pg_owner(domid)) == NULL ) return -ESRCH; rc = __do_update_va_mapping(va, val64, flags, pg_owner); put_pg_owner(pg_owner); return rc; } /************************* * Descriptor Tables */ void destroy_gdt(struct vcpu *v) { l1_pgentry_t *pl1e; int i; unsigned long pfn; v->arch.pv_vcpu.gdt_ents = 0; pl1e = gdt_ldt_ptes(v->domain, v); for ( i = 0; i < FIRST_RESERVED_GDT_PAGE; i++ ) { if ( (pfn = l1e_get_pfn(pl1e[i])) != 0 ) put_page_and_type(mfn_to_page(pfn)); l1e_write(&pl1e[i], l1e_empty()); v->arch.pv_vcpu.gdt_frames[i] = 0; } } long set_gdt(struct vcpu *v, unsigned long *frames, unsigned int entries) { struct domain *d = v->domain; l1_pgentry_t *pl1e; /* NB. There are 512 8-byte entries per GDT page. */ int i, nr_pages = (entries + 511) / 512; unsigned long mfn, *pfns; if ( entries > FIRST_RESERVED_GDT_ENTRY ) return -EINVAL; pfns = xmalloc_array(unsigned long, nr_pages); if ( !pfns ) return -ENOMEM; /* Check the pages in the new GDT. 
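Each frame must translate to a real page and accept the PGT_seg_desc_page type; on any failure every reference taken so far is dropped and -EINVAL is returned.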
*/ for ( i = 0; i < nr_pages; i++ ) { struct page_info *page; pfns[i] = frames[i]; page = get_page_from_gfn(d, frames[i], NULL, P2M_ALLOC); if ( !page ) goto fail; if ( !get_page_type(page, PGT_seg_desc_page) ) { put_page(page); goto fail; } mfn = frames[i] = page_to_mfn(page); } /* Tear down the old GDT. */ destroy_gdt(v); /* Install the new GDT. */ v->arch.pv_vcpu.gdt_ents = entries; pl1e = gdt_ldt_ptes(d, v); for ( i = 0; i < nr_pages; i++ ) { v->arch.pv_vcpu.gdt_frames[i] = frames[i]; l1e_write(&pl1e[i], l1e_from_pfn(frames[i], __PAGE_HYPERVISOR)); } xfree(pfns); return 0; fail: while ( i-- > 0 ) { put_page_and_type(mfn_to_page(frames[i])); } xfree(pfns); return -EINVAL; } long do_set_gdt(XEN_GUEST_HANDLE_PARAM(xen_ulong_t) frame_list, unsigned int entries) { int nr_pages = (entries + 511) / 512; unsigned long frames[16]; struct vcpu *curr = current; long ret; /* Rechecked in set_gdt, but ensures a sane limit for copy_from_user(). */ if ( entries > FIRST_RESERVED_GDT_ENTRY ) return -EINVAL; if ( copy_from_guest(frames, frame_list, nr_pages) ) return -EFAULT; domain_lock(curr->domain); if ( (ret = set_gdt(curr, frames, entries)) == 0 ) flush_tlb_local(); domain_unlock(curr->domain); return ret; } long do_update_descriptor(u64 pa, u64 desc) { struct domain *dom = current->domain; unsigned long gmfn = pa >> PAGE_SHIFT; unsigned long mfn; unsigned int offset; struct desc_struct *gdt_pent, d; struct page_info *page; long ret = -EINVAL; offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct); *(u64 *)&d = desc; page = get_page_from_gfn(dom, gmfn, NULL, P2M_ALLOC); if ( (((unsigned int)pa % sizeof(struct desc_struct)) != 0) || !page || !check_descriptor(dom, &d) ) { if ( page ) put_page(page); return -EINVAL; } mfn = page_to_mfn(page); /* Check if the given frame is in use in an unsafe context. */ switch ( page->u.inuse.type_info & PGT_type_mask ) { case PGT_seg_desc_page: if ( unlikely(!get_page_type(page, PGT_seg_desc_page)) ) goto out; break; default: if ( unlikely(!get_page_type(page, PGT_writable_page)) ) goto out; break; } paging_mark_dirty(dom, mfn); /* All is good so make the update. */ gdt_pent = map_domain_page(mfn); write_atomic((uint64_t *)&gdt_pent[offset], *(uint64_t *)&d); unmap_domain_page(gdt_pent); put_page_type(page); ret = 0; /* success */ out: put_page(page); return ret; } typedef struct e820entry e820entry_t; DEFINE_XEN_GUEST_HANDLE(e820entry_t); struct memory_map_context { unsigned int n; unsigned long s; struct xen_memory_map map; }; static int handle_iomem_range(unsigned long s, unsigned long e, void *p) { struct memory_map_context *ctxt = p; if ( s > ctxt->s ) { e820entry_t ent; XEN_GUEST_HANDLE_PARAM(e820entry_t) buffer_param; XEN_GUEST_HANDLE(e820entry_t) buffer; if ( ctxt->n + 1 >= ctxt->map.nr_entries ) return -EINVAL; ent.addr = (uint64_t)ctxt->s << PAGE_SHIFT; ent.size = (uint64_t)(s - ctxt->s) << PAGE_SHIFT; ent.type = E820_RESERVED; buffer_param = guest_handle_cast(ctxt->map.buffer, e820entry_t); buffer = guest_handle_from_param(buffer_param, e820entry_t); if ( __copy_to_guest_offset(buffer, ctxt->n, &ent, 1) ) return -EFAULT; ctxt->n++; } ctxt->s = e + 1; return 0; } int xenmem_add_to_physmap_one( struct domain *d, unsigned int space, domid_t foreign_domid, unsigned long idx, xen_pfn_t gpfn) { struct page_info *page = NULL; unsigned long gfn = 0; /* gcc ... 
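initialised only to placate the compiler; it is assigned and consumed solely in the XENMAPSPACE_gmfn paths below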
*/ unsigned long prev_mfn, mfn = 0, old_gpfn; int rc; p2m_type_t p2mt; switch ( space ) { case XENMAPSPACE_shared_info: if ( idx == 0 ) mfn = virt_to_mfn(d->shared_info); break; case XENMAPSPACE_grant_table: spin_lock(&d->grant_table->lock); if ( d->grant_table->gt_version == 0 ) d->grant_table->gt_version = 1; if ( d->grant_table->gt_version == 2 && (idx & XENMAPIDX_grant_table_status) ) { idx &= ~XENMAPIDX_grant_table_status; if ( idx < nr_status_frames(d->grant_table) ) mfn = virt_to_mfn(d->grant_table->status[idx]); } else { if ( (idx >= nr_grant_frames(d->grant_table)) && (idx < max_nr_grant_frames) ) gnttab_grow_table(d, idx + 1); if ( idx < nr_grant_frames(d->grant_table) ) mfn = virt_to_mfn(d->grant_table->shared_raw[idx]); } spin_unlock(&d->grant_table->lock); break; case XENMAPSPACE_gmfn_range: case XENMAPSPACE_gmfn: { p2m_type_t p2mt; gfn = idx; idx = mfn_x(get_gfn_unshare(d, idx, &p2mt)); /* If the page is still shared, exit early */ if ( p2m_is_shared(p2mt) ) { put_gfn(d, gfn); return -ENOMEM; } if ( !get_page_from_pagenr(idx, d) ) break; mfn = idx; page = mfn_to_page(mfn); break; } default: break; } if ( !paging_mode_translate(d) || (mfn == 0) ) { if ( page ) put_page(page); if ( space == XENMAPSPACE_gmfn || space == XENMAPSPACE_gmfn_range ) put_gfn(d, gfn); return -EINVAL; } /* Remove previously mapped page if it was present. */ prev_mfn = mfn_x(get_gfn(d, gpfn, &p2mt)); if ( mfn_valid(prev_mfn) ) { if ( is_xen_heap_mfn(prev_mfn) ) /* Xen heap frames are simply unhooked from this phys slot. */ guest_physmap_remove_page(d, gpfn, prev_mfn, PAGE_ORDER_4K); else /* Normal domain memory is freed, to avoid leaking memory. */ guest_remove_page(d, gpfn); } /* In the XENMAPSPACE_gmfn case we still hold a ref on the old page. */ put_gfn(d, gpfn); /* Unmap from old location, if any. */ old_gpfn = get_gpfn_from_mfn(mfn); ASSERT( old_gpfn != SHARED_M2P_ENTRY ); if ( space == XENMAPSPACE_gmfn || space == XENMAPSPACE_gmfn_range ) ASSERT( old_gpfn == gfn ); if ( old_gpfn != INVALID_M2P_ENTRY ) guest_physmap_remove_page(d, old_gpfn, mfn, PAGE_ORDER_4K); /* Map at new location. */ rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K); /* In the XENMAPSPACE_gmfn, we took a ref of the gfn at the top */ if ( space == XENMAPSPACE_gmfn || space == XENMAPSPACE_gmfn_range ) put_gfn(d, gfn); if ( page ) put_page(page); return rc; } long arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg) { int rc; switch ( op ) { case XENMEM_set_memory_map: { struct xen_foreign_memory_map fmap; struct domain *d; struct e820entry *e820; if ( copy_from_guest(&fmap, arg, 1) ) return -EFAULT; if ( fmap.map.nr_entries > E820MAX ) return -EINVAL; d = rcu_lock_domain_by_any_id(fmap.domid); if ( d == NULL ) return -ESRCH; rc = xsm_domain_memory_map(XSM_TARGET, d); if ( rc ) { rcu_unlock_domain(d); return rc; } if ( is_hvm_domain(d) ) { rcu_unlock_domain(d); return -EPERM; } e820 = xmalloc_array(e820entry_t, fmap.map.nr_entries); if ( e820 == NULL ) { rcu_unlock_domain(d); return -ENOMEM; } if ( copy_from_guest(e820, fmap.map.buffer, fmap.map.nr_entries) ) { xfree(e820); rcu_unlock_domain(d); return -EFAULT; } spin_lock(&d->arch.e820_lock); xfree(d->arch.e820); d->arch.e820 = e820; d->arch.nr_e820 = fmap.map.nr_entries; spin_unlock(&d->arch.e820_lock); rcu_unlock_domain(d); return rc; } case XENMEM_memory_map: { struct xen_memory_map map; struct domain *d = current->domain; if ( copy_from_guest(&map, arg, 1) ) return -EFAULT; spin_lock(&d->arch.e820_lock); /* Backwards compatibility. 
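If no e820 map has been registered for the domain, return -ENOSYS rather than an empty map.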
*/ if ( (d->arch.nr_e820 == 0) || (d->arch.e820 == NULL) ) { spin_unlock(&d->arch.e820_lock); return -ENOSYS; } map.nr_entries = min(map.nr_entries, d->arch.nr_e820); if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) || __copy_to_guest(arg, &map, 1) ) { spin_unlock(&d->arch.e820_lock); return -EFAULT; } spin_unlock(&d->arch.e820_lock); return 0; } case XENMEM_machine_memory_map: { struct memory_map_context ctxt; XEN_GUEST_HANDLE(e820entry_t) buffer; XEN_GUEST_HANDLE_PARAM(e820entry_t) buffer_param; unsigned int i; rc = xsm_machine_memory_map(XSM_PRIV); if ( rc ) return rc; if ( copy_from_guest(&ctxt.map, arg, 1) ) return -EFAULT; if ( ctxt.map.nr_entries < e820.nr_map + 1 ) return -EINVAL; buffer_param = guest_handle_cast(ctxt.map.buffer, e820entry_t); buffer = guest_handle_from_param(buffer_param, e820entry_t); if ( !guest_handle_okay(buffer, ctxt.map.nr_entries) ) return -EFAULT; for ( i = 0, ctxt.n = 0, ctxt.s = 0; i < e820.nr_map; ++i, ++ctxt.n ) { unsigned long s = PFN_DOWN(e820.map[i].addr); if ( s ) { rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s, s - 1, handle_iomem_range, &ctxt); if ( !rc ) rc = handle_iomem_range(s, s, &ctxt); if ( rc ) return rc; } if ( ctxt.map.nr_entries <= ctxt.n + (e820.nr_map - i) ) return -EINVAL; if ( __copy_to_guest_offset(buffer, ctxt.n, e820.map + i, 1) ) return -EFAULT; ctxt.s = PFN_UP(e820.map[i].addr + e820.map[i].size); } if ( ctxt.s ) { rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s, ~0UL, handle_iomem_range, &ctxt); if ( !rc && ctxt.s ) rc = handle_iomem_range(~0UL, ~0UL, &ctxt); if ( rc ) return rc; } ctxt.map.nr_entries = ctxt.n; if ( __copy_to_guest(arg, &ctxt.map, 1) ) return -EFAULT; return 0; } case XENMEM_machphys_mapping: { struct xen_machphys_mapping mapping = { .v_start = MACH2PHYS_VIRT_START, .v_end = MACH2PHYS_VIRT_END, .max_mfn = MACH2PHYS_NR_ENTRIES - 1 }; if ( !mem_hotplug && current->domain == dom0 ) mapping.max_mfn = max_page - 1; if ( copy_to_guest(arg, &mapping, 1) ) return -EFAULT; return 0; } case XENMEM_set_pod_target: case XENMEM_get_pod_target: { xen_pod_target_t target; struct domain *d; struct p2m_domain *p2m; if ( copy_from_guest(&target, arg, 1) ) return -EFAULT; d = rcu_lock_domain_by_any_id(target.domid); if ( d == NULL ) return -ESRCH; if ( op == XENMEM_set_pod_target ) rc = xsm_set_pod_target(XSM_PRIV, d); else rc = xsm_get_pod_target(XSM_PRIV, d); if ( rc != 0 ) goto pod_target_out_unlock; if ( op == XENMEM_set_pod_target ) { if ( target.target_pages > d->max_pages ) { rc = -EINVAL; goto pod_target_out_unlock; } rc = p2m_pod_set_mem_target(d, target.target_pages); } if ( rc == -EAGAIN ) { rc = hypercall_create_continuation( __HYPERVISOR_memory_op, "lh", op, arg); } else if ( rc >= 0 ) { p2m = p2m_get_hostp2m(d); target.tot_pages = d->tot_pages; target.pod_cache_pages = p2m->pod.count; target.pod_entries = p2m->pod.entry_count; if ( __copy_to_guest(arg, &target, 1) ) { rc= -EFAULT; goto pod_target_out_unlock; } } pod_target_out_unlock: rcu_unlock_domain(d); return rc; } default: return subarch_memory_op(op, arg); } return 0; } /************************* * Writable Pagetables */ struct ptwr_emulate_ctxt { struct x86_emulate_ctxt ctxt; unsigned long cr2; l1_pgentry_t pte; }; static int ptwr_emulated_read( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { unsigned int rc; unsigned long addr = offset; if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 ) { propagate_page_fault(addr + bytes - rc, 0); /* 
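error code 0 marks this as a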
read fault */ return X86EMUL_EXCEPTION; } return X86EMUL_OKAY; } static int ptwr_emulated_update( unsigned long addr, paddr_t old, paddr_t val, unsigned int bytes, unsigned int do_cmpxchg, struct ptwr_emulate_ctxt *ptwr_ctxt) { unsigned long mfn; unsigned long unaligned_addr = addr; struct page_info *page; l1_pgentry_t pte, ol1e, nl1e, *pl1e; struct vcpu *v = current; struct domain *d = v->domain; /* Only allow naturally-aligned stores within the original %cr2 page. */ if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) ) { MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)", ptwr_ctxt->cr2, addr, bytes); return X86EMUL_UNHANDLEABLE; } /* Turn a sub-word access into a full-word access. */ if ( bytes != sizeof(paddr_t) ) { paddr_t full; unsigned int rc, offset = addr & (sizeof(paddr_t)-1); /* Align address; read full word. */ addr &= ~(sizeof(paddr_t)-1); if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 ) { propagate_page_fault(addr+sizeof(paddr_t)-rc, 0); /* read fault */ return X86EMUL_EXCEPTION; } /* Mask out bits provided by caller. */ full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8)); /* Shift the caller value and OR in the missing bits. */ val &= (((paddr_t)1 << (bytes*8)) - 1); val <<= (offset)*8; val |= full; /* Also fill in missing parts of the cmpxchg old value. */ old &= (((paddr_t)1 << (bytes*8)) - 1); old <<= (offset)*8; old |= full; } pte = ptwr_ctxt->pte; mfn = l1e_get_pfn(pte); page = mfn_to_page(mfn); /* We are looking only for read-only mappings of p.t. pages. */ ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT); ASSERT(mfn_valid(mfn)); ASSERT((page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table); ASSERT((page->u.inuse.type_info & PGT_count_mask) != 0); ASSERT(page_get_owner(page) == d); /* Check the new PTE. */ nl1e = l1e_from_intpte(val); switch ( get_page_from_l1e(nl1e, d, d) ) { default: if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) ) { /* * If this is an upper-half write to a PAE PTE then we assume that * the guest has simply got the two writes the wrong way round. We * zap the PRESENT bit on the assumption that the bottom half will * be written immediately after we return to the guest. */ gdprintk(XENLOG_DEBUG, "ptwr_emulate: fixing up invalid PAE PTE %" PRIpte"\n", l1e_get_intpte(nl1e)); l1e_remove_flags(nl1e, _PAGE_PRESENT); } else { MEM_LOG("ptwr_emulate: could not get_page_from_l1e()"); return X86EMUL_UNHANDLEABLE; } break; case 0: break; case 1: l1e_remove_flags(nl1e, _PAGE_RW); break; } adjust_guest_l1e(nl1e, d); /* Checked successfully: do the update (write or cmpxchg). */ pl1e = map_domain_page(mfn); pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK)); if ( do_cmpxchg ) { int okay; intpte_t t = old; ol1e = l1e_from_intpte(old); okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e), &t, l1e_get_intpte(nl1e), _mfn(mfn)); okay = (okay && t == old); if ( !okay ) { unmap_domain_page(pl1e); put_page_from_l1e(nl1e, d); return X86EMUL_CMPXCHG_FAILED; } } else { ol1e = *pl1e; if ( !UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, mfn, v, 0) ) BUG(); } trace_ptwr_emulation(addr, nl1e); unmap_domain_page(pl1e); /* Finally, drop the old PTE. 
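The reference acquired for the new entry by get_page_from_l1e() is kept; releasing the old entry's reference keeps the page refcounts balanced.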
*/ put_page_from_l1e(ol1e, d); return X86EMUL_OKAY; } static int ptwr_emulated_write( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { paddr_t val = 0; if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) ) { MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)", offset, bytes); return X86EMUL_UNHANDLEABLE; } memcpy(&val, p_data, bytes); return ptwr_emulated_update( offset, 0, val, bytes, 0, container_of(ctxt, struct ptwr_emulate_ctxt, ctxt)); } static int ptwr_emulated_cmpxchg( enum x86_segment seg, unsigned long offset, void *p_old, void *p_new, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { paddr_t old = 0, new = 0; if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) ) { MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)", offset, bytes); return X86EMUL_UNHANDLEABLE; } memcpy(&old, p_old, bytes); memcpy(&new, p_new, bytes); return ptwr_emulated_update( offset, old, new, bytes, 1, container_of(ctxt, struct ptwr_emulate_ctxt, ctxt)); } static const struct x86_emulate_ops ptwr_emulate_ops = { .read = ptwr_emulated_read, .insn_fetch = ptwr_emulated_read, .write = ptwr_emulated_write, .cmpxchg = ptwr_emulated_cmpxchg, }; /* Write page fault handler: check if guest is trying to modify a PTE. */ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr, struct cpu_user_regs *regs) { struct domain *d = v->domain; struct page_info *page; l1_pgentry_t pte; struct ptwr_emulate_ctxt ptwr_ctxt; int rc; /* Attempt to read the PTE that maps the VA being accessed. */ guest_get_eff_l1e(v, addr, &pte); /* We are looking only for read-only mappings of p.t. pages. */ if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) || !get_page_from_pagenr(l1e_get_pfn(pte), d) ) goto bail; page = l1e_get_page(pte); if ( !page_lock(page) ) { put_page(page); goto bail; } if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table ) { page_unlock(page); put_page(page); goto bail; } ptwr_ctxt.ctxt.regs = regs; ptwr_ctxt.ctxt.force_writeback = 0; ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size = is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG; ptwr_ctxt.cr2 = addr; ptwr_ctxt.pte = pte; rc = x86_emulate(&ptwr_ctxt.ctxt, &ptwr_emulate_ops); page_unlock(page); put_page(page); if ( rc == X86EMUL_UNHANDLEABLE ) goto bail; perfc_incr(ptwr_emulations); return EXCRET_fault_fixed; bail: return 0; } /************************* * fault handling for read-only MMIO pages */ struct mmio_ro_emulate_ctxt { struct x86_emulate_ctxt ctxt; unsigned long cr2; }; static int mmio_ro_emulated_read( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { return X86EMUL_UNHANDLEABLE; } static int mmio_ro_emulated_write( enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { struct mmio_ro_emulate_ctxt *mmio_ro_ctxt = container_of(ctxt, struct mmio_ro_emulate_ctxt, ctxt); /* Only allow naturally-aligned stores at the original %cr2 address. */ if ( ((bytes | offset) & (bytes - 1)) || offset != mmio_ro_ctxt->cr2 ) { MEM_LOG("mmio_ro_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)", mmio_ro_ctxt->cr2, offset, bytes); return X86EMUL_UNHANDLEABLE; } return X86EMUL_OKAY; } static const struct x86_emulate_ops mmio_ro_emulate_ops = { .read = mmio_ro_emulated_read, .insn_fetch = ptwr_emulated_read, .write = mmio_ro_emulated_write, }; /* Check if guest is trying to modify a r/o MMIO page. 
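Such writes are emulated as no-ops: the write handler above discards the data, so the faulting instruction completes and the guest continues.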
*/ int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr, struct cpu_user_regs *regs) { l1_pgentry_t pte; unsigned long mfn; unsigned int addr_size = is_pv_32on64_domain(v->domain) ? 32 : BITS_PER_LONG; struct mmio_ro_emulate_ctxt mmio_ro_ctxt = { .ctxt.regs = regs, .ctxt.addr_size = addr_size, .ctxt.sp_size = addr_size, .cr2 = addr }; int rc; /* Attempt to read the PTE that maps the VA being accessed. */ guest_get_eff_l1e(v, addr, &pte); /* We are looking only for read-only mappings of MMIO pages. */ if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) ) return 0; mfn = l1e_get_pfn(pte); if ( mfn_valid(mfn) ) { struct page_info *page = mfn_to_page(mfn); struct domain *owner = page_get_owner_and_reference(page); if ( owner ) put_page(page); if ( owner != dom_io ) return 0; } if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) ) return 0; rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_ro_emulate_ops); return rc != X86EMUL_UNHANDLEABLE ? EXCRET_fault_fixed : 0; } void *alloc_xen_pagetable(void) { if ( system_state != SYS_STATE_early_boot ) { void *ptr = alloc_xenheap_page(); BUG_ON(!dom0 && !ptr); return ptr; } return mfn_to_virt(alloc_boot_pages(1, 1)); } void free_xen_pagetable(void *v) { if ( system_state != SYS_STATE_early_boot ) free_xenheap_page(v); } static DEFINE_SPINLOCK(map_pgdir_lock); static l3_pgentry_t *virt_to_xen_l3e(unsigned long v) { l4_pgentry_t *pl4e; pl4e = &idle_pg_table[l4_table_offset(v)]; if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) ) { bool_t locking = system_state > SYS_STATE_boot; l3_pgentry_t *pl3e = alloc_xen_pagetable(); if ( !pl3e ) return NULL; clear_page(pl3e); if ( locking ) spin_lock(&map_pgdir_lock); if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) ) { l4e_write(pl4e, l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR)); pl3e = NULL; } if ( locking ) spin_unlock(&map_pgdir_lock); if ( pl3e ) free_xen_pagetable(pl3e); } return l4e_to_l3e(*pl4e) + l3_table_offset(v); } static l2_pgentry_t *virt_to_xen_l2e(unsigned long v) { l3_pgentry_t *pl3e; pl3e = virt_to_xen_l3e(v); if ( !pl3e ) return NULL; if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { bool_t locking = system_state > SYS_STATE_boot; l2_pgentry_t *pl2e = alloc_xen_pagetable(); if ( !pl2e ) return NULL; clear_page(pl2e); if ( locking ) spin_lock(&map_pgdir_lock); if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR)); pl2e = NULL; } if ( locking ) spin_unlock(&map_pgdir_lock); if ( pl2e ) free_xen_pagetable(pl2e); } BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE); return l3e_to_l2e(*pl3e) + l2_table_offset(v); } l1_pgentry_t *virt_to_xen_l1e(unsigned long v) { l2_pgentry_t *pl2e; pl2e = virt_to_xen_l2e(v); if ( !pl2e ) return NULL; if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { bool_t locking = system_state > SYS_STATE_boot; l1_pgentry_t *pl1e = alloc_xen_pagetable(); if ( !pl1e ) return NULL; clear_page(pl1e); if ( locking ) spin_lock(&map_pgdir_lock); if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { l2e_write(pl2e, l2e_from_paddr(__pa(pl1e), __PAGE_HYPERVISOR)); pl1e = NULL; } if ( locking ) spin_unlock(&map_pgdir_lock); if ( pl1e ) free_xen_pagetable(pl1e); } BUG_ON(l2e_get_flags(*pl2e) & _PAGE_PSE); return l2e_to_l1e(*pl2e) + l1_table_offset(v); } /* Convert to from superpage-mapping flags for map_pages_to_xen(). */ #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f)) #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? 
((f) & ~_PAGE_PSE) : (f)) /* * map_pages_to_xen() can be called with interrupts disabled during * early bootstrap. In this case it is safe to use flush_area_local() * and avoid locking because only the local CPU is online. */ #define flush_area(v,f) (!local_irq_is_enabled() ? \ flush_area_local((const void *)v, f) : \ flush_area_all((const void *)v, f)) int map_pages_to_xen( unsigned long virt, unsigned long mfn, unsigned long nr_mfns, unsigned int flags) { bool_t locking = system_state > SYS_STATE_boot; l2_pgentry_t *pl2e, ol2e; l1_pgentry_t *pl1e, ol1e; unsigned int i; #define flush_flags(oldf) do { \ unsigned int o_ = (oldf); \ if ( (o_) & _PAGE_GLOBAL ) \ flush_flags |= FLUSH_TLB_GLOBAL; \ if ( (flags & _PAGE_PRESENT) && \ (((o_) ^ flags) & PAGE_CACHE_ATTRS) ) \ flush_flags |= FLUSH_CACHE; \ } while (0) while ( nr_mfns != 0 ) { l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt); if ( !pl3e ) return -ENOMEM; ol3e = *pl3e; if ( cpu_has_page1gb && !(((virt >> PAGE_SHIFT) | mfn) & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) && nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) && !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) ) { /* 1GB-page mapping. */ l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags))); if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) ) { unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER); if ( l3e_get_flags(ol3e) & _PAGE_PSE ) { flush_flags(lNf_to_l1f(l3e_get_flags(ol3e))); flush_area(virt, flush_flags); } else { pl2e = l3e_to_l2e(ol3e); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) { ol2e = pl2e[i]; if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) ) continue; if ( l2e_get_flags(ol2e) & _PAGE_PSE ) flush_flags(lNf_to_l1f(l2e_get_flags(ol2e))); else { unsigned int j; pl1e = l2e_to_l1e(ol2e); for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ ) flush_flags(l1e_get_flags(pl1e[j])); } } flush_area(virt, flush_flags); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) { ol2e = pl2e[i]; if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) && !(l2e_get_flags(ol2e) & _PAGE_PSE) ) free_xen_pagetable(l2e_to_l1e(ol2e)); } free_xen_pagetable(pl2e); } } virt += 1UL << L3_PAGETABLE_SHIFT; mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); continue; } if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) && (l3e_get_flags(ol3e) & _PAGE_PSE) ) { unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER); /* Skip this PTE if there is no change. */ if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES - 1)) + (l2_table_offset(virt) << PAGETABLE_ORDER) + l1_table_offset(virt) == mfn) && ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) & ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 ) { /* We can skip to end of L3 superpage if we got a match. 
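i is the number of 4k pages left up to the next 1GB boundary, clamped to nr_mfns below, so the loop resumes at the first frame that may still need remapping.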
*/ i = (1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)); if ( i > nr_mfns ) i = nr_mfns; virt += i << PAGE_SHIFT; mfn += i; nr_mfns -= i; continue; } pl2e = alloc_xen_pagetable(); if ( pl2e == NULL ) return -ENOMEM; for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) l2e_write(pl2e + i, l2e_from_pfn(l3e_get_pfn(ol3e) + (i << PAGETABLE_ORDER), l3e_get_flags(ol3e))); if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL ) flush_flags |= FLUSH_TLB_GLOBAL; if ( locking ) spin_lock(&map_pgdir_lock); if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && (l3e_get_flags(*pl3e) & _PAGE_PSE) ) { l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e), __PAGE_HYPERVISOR)); pl2e = NULL; } if ( locking ) spin_unlock(&map_pgdir_lock); flush_area(virt, flush_flags); if ( pl2e ) free_xen_pagetable(pl2e); } pl2e = virt_to_xen_l2e(virt); if ( !pl2e ) return -ENOMEM; if ( ((((virt>>PAGE_SHIFT) | mfn) & ((1<<PAGETABLE_ORDER)-1)) == 0) && (nr_mfns >= (1<<PAGETABLE_ORDER)) && !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) ) { /* Super-page mapping. */ ol2e = *pl2e; l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags))); if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) ) { unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER); if ( l2e_get_flags(ol2e) & _PAGE_PSE ) { flush_flags(lNf_to_l1f(l2e_get_flags(ol2e))); flush_area(virt, flush_flags); } else { pl1e = l2e_to_l1e(ol2e); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) flush_flags(l1e_get_flags(pl1e[i])); flush_area(virt, flush_flags); free_xen_pagetable(pl1e); } } virt += 1UL << L2_PAGETABLE_SHIFT; mfn += 1UL << PAGETABLE_ORDER; nr_mfns -= 1UL << PAGETABLE_ORDER; } else { /* Normal page mapping. */ if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { pl1e = virt_to_xen_l1e(virt); if ( pl1e == NULL ) return -ENOMEM; } else if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) { unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER); /* Skip this PTE if there is no change. */ if ( ((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) + l1_table_offset(virt) == mfn) && ((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) & ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 ) { /* We can skip to end of L2 superpage if we got a match. */ i = (1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - (mfn & ((1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)); if ( i > nr_mfns ) i = nr_mfns; virt += i << L1_PAGETABLE_SHIFT; mfn += i; nr_mfns -= i; goto check_l3; } pl1e = alloc_xen_pagetable(); if ( pl1e == NULL ) return -ENOMEM; for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) l1e_write(&pl1e[i], l1e_from_pfn(l2e_get_pfn(*pl2e) + i, lNf_to_l1f(l2e_get_flags(*pl2e)))); if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL ) flush_flags |= FLUSH_TLB_GLOBAL; if ( locking ) spin_lock(&map_pgdir_lock); if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && (l2e_get_flags(*pl2e) & _PAGE_PSE) ) { l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e), __PAGE_HYPERVISOR)); pl1e = NULL; } if ( locking ) spin_unlock(&map_pgdir_lock); flush_area(virt, flush_flags); if ( pl1e ) free_xen_pagetable(pl1e); } pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt); ol1e = *pl1e; l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags)); if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) ) { unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0); flush_flags(l1e_get_flags(ol1e)); flush_area(virt, flush_flags); } virt += 1UL << L1_PAGETABLE_SHIFT; mfn += 1UL; nr_mfns -= 1UL; if ( (flags == PAGE_HYPERVISOR) && ((nr_mfns == 0) || ((((virt >> PAGE_SHIFT) | mfn) & ((1 << PAGETABLE_ORDER) - 1)) == 0)) ) { unsigned long base_mfn; pl1e = l2e_to_l1e(*pl2e); if ( locking ) spin_lock(&map_pgdir_lock); base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ ) if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) || (l1e_get_flags(*pl1e) != flags) ) break; if ( i == L1_PAGETABLE_ENTRIES ) { ol2e = *pl2e; l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn, l1f_to_lNf(flags))); if ( locking ) spin_unlock(&map_pgdir_lock); flush_area(virt - PAGE_SIZE, FLUSH_TLB_GLOBAL | FLUSH_ORDER(PAGETABLE_ORDER)); free_xen_pagetable(l2e_to_l1e(ol2e)); } else if ( locking ) spin_unlock(&map_pgdir_lock); } } check_l3: if ( cpu_has_page1gb && (flags == PAGE_HYPERVISOR) && ((nr_mfns == 0) || !(((virt >> PAGE_SHIFT) | mfn) & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) ) { unsigned long base_mfn; if ( locking ) spin_lock(&map_pgdir_lock); ol3e = *pl3e; pl2e = l3e_to_l2e(ol3e); base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES - 1); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ ) if ( (l2e_get_pfn(*pl2e) != (base_mfn + (i << PAGETABLE_ORDER))) || (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) ) break; if ( i == L2_PAGETABLE_ENTRIES ) { l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn, l1f_to_lNf(flags))); if ( locking ) spin_unlock(&map_pgdir_lock); flush_area(virt - PAGE_SIZE, FLUSH_TLB_GLOBAL | FLUSH_ORDER(2*PAGETABLE_ORDER)); free_xen_pagetable(l3e_to_l2e(ol3e)); } else if (
locking ) spin_unlock(&map_pgdir_lock); } } #undef flush_flags return 0; } void destroy_xen_mappings(unsigned long s, unsigned long e) { bool_t locking = system_state > SYS_STATE_boot; l2_pgentry_t *pl2e; l1_pgentry_t *pl1e; unsigned int i; unsigned long v = s; ASSERT((s & ~PAGE_MASK) == 0); ASSERT((e & ~PAGE_MASK) == 0); while ( v < e ) { l3_pgentry_t *pl3e = virt_to_xen_l3e(v); if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { v += 1UL << L3_PAGETABLE_SHIFT; v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1); continue; } if ( l3e_get_flags(*pl3e) & _PAGE_PSE ) { if ( l2_table_offset(v) == 0 && l1_table_offset(v) == 0 && ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) ) { /* PAGE1GB: whole superpage is destroyed. */ l3e_write_atomic(pl3e, l3e_empty()); v += 1UL << L3_PAGETABLE_SHIFT; continue; } /* PAGE1GB: shatter the superpage and fall through. */ pl2e = alloc_xen_pagetable(); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) l2e_write(pl2e + i, l2e_from_pfn(l3e_get_pfn(*pl3e) + (i << PAGETABLE_ORDER), l3e_get_flags(*pl3e))); if ( locking ) spin_lock(&map_pgdir_lock); if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && (l3e_get_flags(*pl3e) & _PAGE_PSE) ) { l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e), __PAGE_HYPERVISOR)); pl2e = NULL; } if ( locking ) spin_unlock(&map_pgdir_lock); if ( pl2e ) free_xen_pagetable(pl2e); } pl2e = virt_to_xen_l2e(v); if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { v += 1UL << L2_PAGETABLE_SHIFT; v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1); continue; } if ( l2e_get_flags(*pl2e) & _PAGE_PSE ) { if ( (l1_table_offset(v) == 0) && ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) ) { /* PSE: whole superpage is destroyed. */ l2e_write_atomic(pl2e, l2e_empty()); v += 1UL << L2_PAGETABLE_SHIFT; } else { /* PSE: shatter the superpage and try again. */ pl1e = alloc_xen_pagetable(); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) l1e_write(&pl1e[i], l1e_from_pfn(l2e_get_pfn(*pl2e) + i, l2e_get_flags(*pl2e) & ~_PAGE_PSE)); if ( locking ) spin_lock(&map_pgdir_lock); if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && (l2e_get_flags(*pl2e) & _PAGE_PSE) ) { l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e), __PAGE_HYPERVISOR)); pl1e = NULL; } if ( locking ) spin_unlock(&map_pgdir_lock); if ( pl1e ) free_xen_pagetable(pl1e); } } else { /* Ordinary 4kB mapping. */ pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v); l1e_write_atomic(pl1e, l1e_empty()); v += PAGE_SIZE; /* If we are done with the L2E, check if it is now empty. */ if ( (v != e) && (l1_table_offset(v) != 0) ) continue; pl1e = l2e_to_l1e(*pl2e); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) if ( l1e_get_intpte(pl1e[i]) != 0 ) break; if ( i == L1_PAGETABLE_ENTRIES ) { /* Empty: zap the L2E and free the L1 page. */ l2e_write_atomic(pl2e, l2e_empty()); flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */ free_xen_pagetable(pl1e); } } /* If we are done with the L3E, check if it is now empty. */ if ( (v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0) ) continue; pl2e = l3e_to_l2e(*pl3e); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) if ( l2e_get_intpte(pl2e[i]) != 0 ) break; if ( i == L2_PAGETABLE_ENTRIES ) { /* Empty: zap the L3E and free the L2 page. 
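The global TLB flush must happen before the L2 table is freed so that no CPU can still be walking stale entries in it.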
*/ l3e_write_atomic(pl3e, l3e_empty()); flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */ free_xen_pagetable(pl2e); } } flush_area(NULL, FLUSH_TLB_GLOBAL); } #undef flush_area void __set_fixmap( enum fixed_addresses idx, unsigned long mfn, unsigned long flags) { BUG_ON(idx >= __end_of_fixed_addresses); map_pages_to_xen(fix_to_virt(idx), mfn, 1, flags); } void *__init arch_vmap_virt_end(void) { return (void *)fix_to_virt(__end_of_fixed_addresses); } void __iomem *ioremap(paddr_t pa, size_t len) { unsigned long pfn = PFN_DOWN(pa); void *va; WARN_ON(page_is_ram_type(pfn, RAM_TYPE_CONVENTIONAL)); /* The low first Mb is always mapped. */ if ( !((pa + len - 1) >> 20) ) va = __va(pa); else { unsigned int offs = pa & (PAGE_SIZE - 1); unsigned int nr = PFN_UP(offs + len); va = __vmap(&pfn, nr, 1, 1, PAGE_HYPERVISOR_NOCACHE) + offs; } return (void __force __iomem *)va; } int create_perdomain_mapping(struct domain *d, unsigned long va, unsigned int nr, l1_pgentry_t **pl1tab, struct page_info **ppg) { struct page_info *pg; l3_pgentry_t *l3tab; l2_pgentry_t *l2tab; l1_pgentry_t *l1tab; unsigned int memf = MEMF_node(domain_to_node(d)); int rc = 0; ASSERT(va >= PERDOMAIN_VIRT_START && va < PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS)); if ( !d->arch.perdomain_l3_pg ) { pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); if ( !pg ) return -ENOMEM; l3tab = __map_domain_page(pg); clear_page(l3tab); d->arch.perdomain_l3_pg = pg; if ( !nr ) { unmap_domain_page(l3tab); return 0; } } else if ( !nr ) return 0; else l3tab = __map_domain_page(d->arch.perdomain_l3_pg); ASSERT(!l3_table_offset(va ^ (va + nr * PAGE_SIZE - 1))); if ( !(l3e_get_flags(l3tab[l3_table_offset(va)]) & _PAGE_PRESENT) ) { pg = alloc_domheap_page(NULL, memf); if ( !pg ) { unmap_domain_page(l3tab); return -ENOMEM; } l2tab = __map_domain_page(pg); clear_page(l2tab); l3tab[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR); } else l2tab = map_domain_page(l3e_get_pfn(l3tab[l3_table_offset(va)])); unmap_domain_page(l3tab); if ( !pl1tab && !ppg ) { unmap_domain_page(l2tab); return 0; } for ( l1tab = NULL; !rc && nr--; ) { l2_pgentry_t *pl2e = l2tab + l2_table_offset(va); if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { if ( pl1tab && !IS_NIL(pl1tab) ) { l1tab = alloc_xenheap_pages(0, memf); if ( !l1tab ) { rc = -ENOMEM; break; } ASSERT(!pl1tab[l2_table_offset(va)]); pl1tab[l2_table_offset(va)] = l1tab; pg = virt_to_page(l1tab); } else { pg = alloc_domheap_page(NULL, memf); if ( !pg ) { rc = -ENOMEM; break; } l1tab = __map_domain_page(pg); } clear_page(l1tab); *pl2e = l2e_from_page(pg, __PAGE_HYPERVISOR); } else if ( !l1tab ) l1tab = map_domain_page(l2e_get_pfn(*pl2e)); if ( ppg && !(l1e_get_flags(l1tab[l1_table_offset(va)]) & _PAGE_PRESENT) ) { pg = alloc_domheap_page(NULL, memf); if ( pg ) { clear_domain_page(page_to_mfn(pg)); if ( !IS_NIL(ppg) ) *ppg++ = pg; l1tab[l1_table_offset(va)] = l1e_from_page(pg, __PAGE_HYPERVISOR | _PAGE_AVAIL0); l2e_add_flags(*pl2e, _PAGE_AVAIL0); } else rc = -ENOMEM; } va += PAGE_SIZE; if ( rc || !nr || !l1_table_offset(va) ) { /* Note that this is a no-op for the alloc_xenheap_page() case. 
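(An L1 table from alloc_xenheap_pages() lives in the permanent 1:1 mapping, so there is nothing for unmap_domain_page() to undo.)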
*/ unmap_domain_page(l1tab); l1tab = NULL; } } ASSERT(!l1tab); unmap_domain_page(l2tab); return rc; } void destroy_perdomain_mapping(struct domain *d, unsigned long va, unsigned int nr) { const l3_pgentry_t *l3tab, *pl3e; ASSERT(va >= PERDOMAIN_VIRT_START && va < PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS)); ASSERT(!l3_table_offset(va ^ (va + nr * PAGE_SIZE - 1))); if ( !d->arch.perdomain_l3_pg ) return; l3tab = __map_domain_page(d->arch.perdomain_l3_pg); pl3e = l3tab + l3_table_offset(va); if ( l3e_get_flags(*pl3e) & _PAGE_PRESENT ) { const l2_pgentry_t *l2tab = map_domain_page(l3e_get_pfn(*pl3e)); const l2_pgentry_t *pl2e = l2tab + l2_table_offset(va); unsigned int i = l1_table_offset(va); while ( nr ) { if ( l2e_get_flags(*pl2e) & _PAGE_PRESENT ) { l1_pgentry_t *l1tab = map_domain_page(l2e_get_pfn(*pl2e)); for ( ; nr && i < L1_PAGETABLE_ENTRIES; --nr, ++i ) { if ( (l1e_get_flags(l1tab[i]) & (_PAGE_PRESENT | _PAGE_AVAIL0)) == (_PAGE_PRESENT | _PAGE_AVAIL0) ) free_domheap_page(l1e_get_page(l1tab[i])); l1tab[i] = l1e_empty(); } unmap_domain_page(l1tab); } else if ( nr + i < L1_PAGETABLE_ENTRIES ) break; else nr -= L1_PAGETABLE_ENTRIES - i; ++pl2e; i = 0; } unmap_domain_page(l2tab); } unmap_domain_page(l3tab); } void free_perdomain_mappings(struct domain *d) { l3_pgentry_t *l3tab = __map_domain_page(d->arch.perdomain_l3_pg); unsigned int i; for ( i = 0; i < PERDOMAIN_SLOTS; ++i) if ( l3e_get_flags(l3tab[i]) & _PAGE_PRESENT ) { struct page_info *l2pg = l3e_get_page(l3tab[i]); l2_pgentry_t *l2tab = __map_domain_page(l2pg); unsigned int j; for ( j = 0; j < L2_PAGETABLE_ENTRIES; ++j ) if ( l2e_get_flags(l2tab[j]) & _PAGE_PRESENT ) { struct page_info *l1pg = l2e_get_page(l2tab[j]); if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 ) { l1_pgentry_t *l1tab = __map_domain_page(l1pg); unsigned int k; for ( k = 0; k < L1_PAGETABLE_ENTRIES; ++k ) if ( (l1e_get_flags(l1tab[k]) & (_PAGE_PRESENT | _PAGE_AVAIL0)) == (_PAGE_PRESENT | _PAGE_AVAIL0) ) free_domheap_page(l1e_get_page(l1tab[k])); unmap_domain_page(l1tab); } if ( is_xen_heap_page(l1pg) ) free_xenheap_page(page_to_virt(l1pg)); else free_domheap_page(l1pg); } unmap_domain_page(l2tab); free_domheap_page(l2pg); } unmap_domain_page(l3tab); free_domheap_page(d->arch.perdomain_l3_pg); } #ifdef MEMORY_GUARD void memguard_init(void) { unsigned long start = max_t(unsigned long, xen_phys_start, 1UL << 20); map_pages_to_xen( (unsigned long)__va(start), start >> PAGE_SHIFT, (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT, __PAGE_HYPERVISOR|MAP_SMALL_PAGES); BUG_ON(start != xen_phys_start); map_pages_to_xen( XEN_VIRT_START, start >> PAGE_SHIFT, (__pa(&_end) + PAGE_SIZE - 1 - start) >> PAGE_SHIFT, __PAGE_HYPERVISOR|MAP_SMALL_PAGES); } static void __memguard_change_range(void *p, unsigned long l, int guard) { unsigned long _p = (unsigned long)p; unsigned long _l = (unsigned long)l; unsigned int flags = __PAGE_HYPERVISOR | MAP_SMALL_PAGES; /* Ensure we are dealing with a page-aligned whole number of pages. 
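The guard is applied by remapping whole 4k pages with or without _PAGE_PRESENT, so sub-page ranges cannot be expressed.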
*/ ASSERT((_p&~PAGE_MASK) == 0); ASSERT((_l&~PAGE_MASK) == 0); if ( guard ) flags &= ~_PAGE_PRESENT; map_pages_to_xen( _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags); } void memguard_guard_range(void *p, unsigned long l) { __memguard_change_range(p, l, 1); } void memguard_unguard_range(void *p, unsigned long l) { __memguard_change_range(p, l, 0); } #endif void memguard_guard_stack(void *p) { BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE); p = (void *)((unsigned long)p + STACK_SIZE - PRIMARY_STACK_SIZE - PAGE_SIZE); memguard_guard_range(p, PAGE_SIZE); } void memguard_unguard_stack(void *p) { p = (void *)((unsigned long)p + STACK_SIZE - PRIMARY_STACK_SIZE - PAGE_SIZE); memguard_unguard_range(p, PAGE_SIZE); } void arch_dump_shared_mem_info(void) { printk("Shared frames %u -- Saved frames %u\n", mem_sharing_get_nr_shared_mfns(), mem_sharing_get_nr_saved_mfns()); } const unsigned long *__init get_platform_badpages(unsigned int *array_size) { u32 igd_id; static unsigned long __initdata bad_pages[] = { 0x20050000, 0x20110000, 0x20130000, 0x20138000, 0x40004000, }; *array_size = ARRAY_SIZE(bad_pages); igd_id = pci_conf_read32(0, 0, 2, 0, 0); if ( !IS_SNB_GFX(igd_id) ) return NULL; return bad_pages; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/cpu/0000775000175000017500000000000012307313555013366 5ustar smbsmbxen-4.4.0/xen/arch/x86/cpu/Makefile0000664000175000017500000000023312307313555015024 0ustar smbsmbsubdir-y += mcheck subdir-y += mtrr obj-y += amd.o obj-y += centaur.o obj-y += common.o obj-y += intel.o obj-y += intel_cacheinfo.o obj-y += mwait-idle.o xen-4.4.0/xen/arch/x86/cpu/cpu.h0000664000175000017500000000122512307313555014326 0ustar smbsmb/* attempt to consolidate cpu attributes */ struct cpu_dev { char * c_vendor; /* some have two possibilities for cpuid string */ char * c_ident[2]; void (*c_init)(struct cpuinfo_x86 * c); void (*c_identify)(struct cpuinfo_x86 * c); }; extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; extern bool_t opt_arat; extern unsigned int opt_cpuid_mask_ecx, opt_cpuid_mask_edx; extern unsigned int opt_cpuid_mask_xsave_eax; extern unsigned int opt_cpuid_mask_ext_ecx, opt_cpuid_mask_ext_edx; extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); extern void early_intel_workaround(struct cpuinfo_x86 *c); xen-4.4.0/xen/arch/x86/cpu/centaur.c0000664000175000017500000000345112307313555015176 0ustar smbsmb#include #include #include #include #include #include #include #include "cpu.h" #define ACE_PRESENT (1 << 6) #define ACE_ENABLED (1 << 7) #define ACE_FCR (1 << 28) /* MSR_VIA_FCR */ #define RNG_PRESENT (1 << 2) #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ static void __init init_c3(struct cpuinfo_x86 *c) { uint64_t msr_content; /* Test for Centaur Extended Feature Flags presence */ if (cpuid_eax(0xC0000000) >= 0xC0000001) { u32 tmp = cpuid_edx(0xC0000001); /* enable ACE unit, if present and disabled */ if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { rdmsrl(MSR_VIA_FCR, msr_content); /* enable ACE unit */ wrmsrl(MSR_VIA_FCR, msr_content | ACE_FCR); printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n"); } /* enable RNG unit, if present and disabled */ if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { rdmsrl(MSR_VIA_RNG, msr_content); /* enable RNG unit */ wrmsrl(MSR_VIA_RNG, msr_content | RNG_ENABLE); printk(KERN_INFO "CPU: Enabled h/w RNG\n"); } /* 
store Centaur Extended Feature Flags as * word 5 of the CPU capability bit array */ c->x86_capability[5] = cpuid_edx(0xC0000001); } if (c->x86 == 0x6 && c->x86_model >= 0xf) { c->x86_cache_alignment = c->x86_clflush_size * 2; set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); } get_model_name(c); display_cacheinfo(c); } static void __init init_centaur(struct cpuinfo_x86 *c) { if (c->x86 == 6) init_c3(c); } static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_init = init_centaur, }; int __init centaur_init_cpu(void) { cpu_devs[X86_VENDOR_CENTAUR] = ¢aur_cpu_dev; return 0; } xen-4.4.0/xen/arch/x86/cpu/common.c0000664000175000017500000003652112307313555015031 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cpu.h" static bool_t __cpuinitdata use_xsave = 1; boolean_param("xsave", use_xsave); bool_t __devinitdata opt_arat = 1; boolean_param("arat", opt_arat); unsigned int __devinitdata opt_cpuid_mask_ecx = ~0u; integer_param("cpuid_mask_ecx", opt_cpuid_mask_ecx); unsigned int __devinitdata opt_cpuid_mask_edx = ~0u; integer_param("cpuid_mask_edx", opt_cpuid_mask_edx); unsigned int __devinitdata opt_cpuid_mask_xsave_eax = ~0u; integer_param("cpuid_mask_xsave_eax", opt_cpuid_mask_xsave_eax); unsigned int __devinitdata opt_cpuid_mask_ext_ecx = ~0u; integer_param("cpuid_mask_ext_ecx", opt_cpuid_mask_ext_ecx); unsigned int __devinitdata opt_cpuid_mask_ext_edx = ~0u; integer_param("cpuid_mask_ext_edx", opt_cpuid_mask_ext_edx); struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; unsigned int paddr_bits __read_mostly = 36; /* * Default host IA32_CR_PAT value to cover all memory types. * BIOS usually sets it to 0x07040600070406. */ u64 host_pat = 0x050100070406; static unsigned int __cpuinitdata cleared_caps[NCAPINTS]; void __init setup_clear_cpu_cap(unsigned int cap) { __clear_bit(cap, boot_cpu_data.x86_capability); __set_bit(cap, cleared_caps); } static void default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... */ /* Check if at least it has cpuid */ BUG_ON(c->cpuid_level == -1); __clear_bit(X86_FEATURE_SEP, c->x86_capability); } static struct cpu_dev default_cpu = { .c_init = default_init, .c_vendor = "Unknown", }; static struct cpu_dev * this_cpu = &default_cpu; bool_t opt_cpu_info; boolean_param("cpuinfo", opt_cpu_info); int __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; if (cpuid_eax(0x80000000) < 0x80000004) return 0; v = (unsigned int *) c->x86_model_id; cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); c->x86_model_id[48] = 0; /* Intel chips right-justify this string for some dumb reason; undo that brain damage */ p = q = &c->x86_model_id[0]; while ( *p == ' ' ) p++; if ( p != q ) { while ( *p ) *q++ = *p++; while ( q <= &c->x86_model_id[48] ) *q++ = '\0'; /* Zero-pad the rest */ } return 1; } void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, ecx, edx, l2size; n = cpuid_eax(0x80000000); if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); if (opt_cpu_info) printk("CPU: L1 I cache %dK (%d bytes/line)," " D cache %dK (%d bytes/line)\n", edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); c->x86_cache_size=(ecx>>24)+(edx>>24); } if (n < 0x80000006) /* Some chips just has a large L1. 
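Such parts do not implement leaf 0x80000006, so x86_cache_size keeps the combined L1 figure computed above.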
*/ return; ecx = cpuid_ecx(0x80000006); l2size = ecx >> 16; c->x86_cache_size = l2size; if (opt_cpu_info) printk("CPU: L2 Cache: %dK (%d bytes/line)\n", l2size, ecx & 0xFF); } static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) { char *v = c->x86_vendor_id; int i; static int printed; for (i = 0; i < X86_VENDOR_NUM; i++) { if (cpu_devs[i]) { if (!strcmp(v,cpu_devs[i]->c_ident[0]) || (cpu_devs[i]->c_ident[1] && !strcmp(v,cpu_devs[i]->c_ident[1]))) { c->x86_vendor = i; if (!early) this_cpu = cpu_devs[i]; return; } } } if (!printed) { printed++; printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); printk(KERN_ERR "CPU: Your system may be unstable.\n"); } c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; } /* Do minimum CPU detection early. Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. The others are not touched to avoid unwanted side effects. WARNING: this function is only called on the BP. Don't add code here that is supposed to run on all CPUs. */ static void __init early_cpu_detect(void) { struct cpuinfo_x86 *c = &boot_cpu_data; u32 cap4, tfms, cap0, misc; c->x86_cache_alignment = 32; /* Get vendor name */ cpuid(0x00000000, &c->cpuid_level, (int *)&c->x86_vendor_id[0], (int *)&c->x86_vendor_id[8], (int *)&c->x86_vendor_id[4]); get_cpu_vendor(c, 1); cpuid(0x00000001, &tfms, &misc, &cap4, &cap0); c->x86 = (tfms >> 8) & 15; c->x86_model = (tfms >> 4) & 15; if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; c->x86_mask = tfms & 15; cap0 &= ~cleared_caps[0]; cap4 &= ~cleared_caps[4]; if (cap0 & (1<<19)) c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; /* Leaf 0x1 capabilities filled in early for Xen. */ c->x86_capability[0] = cap0; c->x86_capability[4] = cap4; } static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { u32 tfms, xlvl, capability, excap, ebx; /* Get vendor name */ cpuid(0x00000000, &c->cpuid_level, (int *)&c->x86_vendor_id[0], (int *)&c->x86_vendor_id[8], (int *)&c->x86_vendor_id[4]); get_cpu_vendor(c, 0); /* Initialize the standard set of capabilities */ /* Note that the vendor-specific code below might override */ /* Intel-defined flags: level 0x00000001 */ cpuid(0x00000001, &tfms, &ebx, &excap, &capability); c->x86_capability[0] = capability; c->x86_capability[4] = excap; c->x86 = (tfms >> 8) & 15; c->x86_model = (tfms >> 4) & 15; if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; c->x86_mask = tfms & 15; if ( cpu_has(c, X86_FEATURE_CLFLSH) ) c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); if ( (xlvl & 0xffff0000) == 0x80000000 ) { if ( xlvl >= 0x80000001 ) { c->x86_capability[1] = cpuid_edx(0x80000001); c->x86_capability[6] = cpuid_ecx(0x80000001); } if ( xlvl >= 0x80000004 ) get_model_name(c); /* Default name */ if ( xlvl >= 0x80000008 ) paddr_bits = cpuid_eax(0x80000008) & 0xff; } /* Intel-defined flags: level 0x00000007 */ if ( c->cpuid_level >= 0x00000007 ) { u32 dummy; cpuid_count(0x00000007, 0, &dummy, &ebx, &dummy, &dummy); c->x86_capability[X86_FEATURE_FSGSBASE / 32] = ebx; } early_intel_workaround(c); #ifdef CONFIG_X86_HT c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif } /* * This does the hard work of actually picking apart the CPU stuff... 
*/ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; c->x86_cache_size = -1; c->x86_vendor = X86_VENDOR_UNKNOWN; c->cpuid_level = -1; /* CPUID not detected */ c->x86_model = c->x86_mask = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; c->x86_num_siblings = 1; c->x86_clflush_size = 0; c->phys_proc_id = BAD_APICID; c->cpu_core_id = BAD_APICID; c->compute_unit_id = BAD_APICID; memset(&c->x86_capability, 0, sizeof c->x86_capability); generic_identify(c); #ifdef NOISY_CAPS printk(KERN_DEBUG "CPU: After generic identify, caps:"); for (i = 0; i < NCAPINTS; i++) printk(" %08x", c->x86_capability[i]); printk("\n"); #endif if (this_cpu->c_identify) { this_cpu->c_identify(c); #ifdef NOISY_CAPS printk(KERN_DEBUG "CPU: After vendor identify, caps:"); for (i = 0; i < NCAPINTS; i++) printk(" %08x", c->x86_capability[i]); printk("\n"); #endif } /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are * features a certain CPU supports which CPUID doesn't * tell us, CPUID claiming incorrect flags, or other bugs, * we handle them here. * * At the end of this section, c->x86_capability better * indicate the features this CPU genuinely supports! */ if (this_cpu->c_init) this_cpu->c_init(c); /* Initialize xsave/xrstor features */ if ( !use_xsave ) clear_bit(X86_FEATURE_XSAVE, boot_cpu_data.x86_capability); if ( cpu_has_xsave ) xstate_init(c == &boot_cpu_data); /* * The vendor-specific functions might have changed features. Now * we do "generic changes." */ for (i = 0 ; i < NCAPINTS ; ++i) c->x86_capability[i] &= ~cleared_caps[i]; /* If the model name is still unset, do table lookup. */ if ( !c->x86_model_id[0] ) { /* Last resort... */ snprintf(c->x86_model_id, sizeof(c->x86_model_id), "%02x/%02x", c->x86_vendor, c->x86_model); } /* Now the feature flags better reflect actual CPU features! */ #ifdef NOISY_CAPS printk(KERN_DEBUG "CPU: After all inits, caps:"); for (i = 0; i < NCAPINTS; i++) printk(" %08x", c->x86_capability[i]); printk("\n"); #endif /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are * common between the CPUs. The first time this routine gets * executed, c == &boot_cpu_data. */ if ( c != &boot_cpu_data ) { /* AND the already accumulated flags with these */ for ( i = 0 ; i < NCAPINTS ; i++ ) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; mcheck_init(c, 0); } else { mcheck_init(c, 1); mtrr_bp_init(); } } /* cpuid returns the value latched in the HW at reset, not the APIC ID * register's value. For any box whose BIOS changes APIC IDs, like * clustered APIC systems, we must use hard_smp_processor_id. * * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. */ static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) { return hard_smp_processor_id() >> index_msb; } /* leaf 0xb SMT level */ #define SMT_LEVEL 0 /* leaf 0xb sub-leaf types */ #define INVALID_TYPE 0 #define SMT_TYPE 1 #define CORE_TYPE 2 #define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) /* * Check for extended topology enumeration cpuid leaf 0xb and if it * exists, use it for cpu topology detection. 
*/ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx, sub_index; unsigned int ht_mask_width, core_plus_mask_width; unsigned int core_select_mask, core_level_siblings; unsigned int initial_apicid; if ( c->cpuid_level < 0xb ) return; cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); /* Check if the cpuid leaf 0xb is actually implemented */ if ( ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE) ) return; set_bit(X86_FEATURE_XTOPOLOGY, c->x86_capability); initial_apicid = edx; /* Populate HT related information from sub-leaf level 0 */ core_level_siblings = c->x86_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; do { cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); /* Check for the Core type in the implemented sub leaves */ if ( LEAFB_SUBTYPE(ecx) == CORE_TYPE ) { core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); break; } sub_index++; } while ( LEAFB_SUBTYPE(ecx) != INVALID_TYPE ); core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; c->cpu_core_id = phys_pkg_id(initial_apicid, ht_mask_width) & core_select_mask; c->phys_proc_id = phys_pkg_id(initial_apicid, core_plus_mask_width); c->apicid = phys_pkg_id(initial_apicid, 0); c->x86_max_cores = (core_level_siblings / c->x86_num_siblings); if ( opt_cpu_info ) { printk("CPU: Physical Processor ID: %d\n", c->phys_proc_id); if ( c->x86_max_cores > 1 ) printk("CPU: Processor Core ID: %d\n", c->cpu_core_id); } } #ifdef CONFIG_X86_HT void __cpuinit detect_ht(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; int index_msb, core_bits; cpuid(1, &eax, &ebx, &ecx, &edx); c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY) || cpu_has(c, X86_FEATURE_XTOPOLOGY)) return; c->x86_num_siblings = (ebx & 0xff0000) >> 16; if (c->x86_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); } else if (c->x86_num_siblings > 1 ) { if (c->x86_num_siblings > nr_cpu_ids) { printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", c->x86_num_siblings); c->x86_num_siblings = 1; return; } index_msb = get_count_order(c->x86_num_siblings); c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); if (opt_cpu_info) printk("CPU: Physical Processor ID: %d\n", c->phys_proc_id); c->x86_num_siblings = c->x86_num_siblings / c->x86_max_cores; index_msb = get_count_order(c->x86_num_siblings) ; core_bits = get_count_order(c->x86_max_cores); c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & ((1 << core_bits) - 1); if (opt_cpu_info && c->x86_max_cores > 1) printk("CPU: Processor Core ID: %d\n", c->cpu_core_id); } } #endif void __cpuinit print_cpu_info(unsigned int cpu) { const struct cpuinfo_x86 *c = cpu_data + cpu; const char *vendor = NULL; if (!opt_cpu_info) return; printk("CPU%u: ", cpu); if (c->x86_vendor < X86_VENDOR_NUM) vendor = this_cpu->c_vendor; else vendor = c->x86_vendor_id; if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) printk("%s ", vendor); if (!c->x86_model_id[0]) printk("%d86", c->x86); else printk("%s", c->x86_model_id); printk(" stepping %02x\n", c->x86_mask); } static cpumask_t cpu_initialized; /* This is hacky. :) * We're emulating future behavior. * In the future, the cpu-specific init functions will be called implicitly * via the magic of initcalls. * They will insert themselves into the cpu_devs structure. 
* Then, when cpu_init() is called, we can just iterate over that array. */ void __init early_cpu_init(void) { intel_cpu_init(); amd_init_cpu(); centaur_init_cpu(); early_cpu_detect(); } /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. */ void __cpuinit cpu_init(void) { int cpu = smp_processor_id(); struct tss_struct *t = &this_cpu(init_tss); struct desc_ptr gdt_desc = { .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY), .limit = LAST_RESERVED_GDT_BYTE }; if (cpumask_test_and_set_cpu(cpu, &cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); for (;;) local_irq_enable(); } if (opt_cpu_info) printk("Initializing CPU#%d\n", cpu); if (cpu_has_pat) wrmsrl(MSR_IA32_CR_PAT, host_pat); /* Install correct page table. */ write_ptbase(current); asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); /* No nested task. */ asm volatile ("pushf ; andw $0xbfff,(%"__OP"sp) ; popf" ); /* Ensure FPU gets initialised for each domain. */ stts(); /* Set up and load the per-CPU TSS and LDT. */ t->bitmap = IOBMP_INVALID_OFFSET; /* Bottom-of-stack must be 16-byte aligned! */ BUG_ON((get_stack_bottom() & 15) != 0); t->rsp0 = get_stack_bottom(); load_TR(); asm volatile ( "lldt %%ax" : : "a" (0) ); /* Clear all 6 debug registers: */ #define CD(register) asm volatile ( "mov %0,%%db" #register : : "r"(0UL) ); CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); #undef CD } void cpu_uninit(unsigned int cpu) { cpumask_clear_cpu(cpu, &cpu_initialized); } xen-4.4.0/xen/arch/x86/cpu/amd.c0000664000175000017500000004075612307313555014307 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #include /* amd_init_cpu */ #include #include #include "cpu.h" /* * Pre-canned values for overriding the CPUID features * and extended features masks. * * Currently supported processors: * * "fam_0f_rev_c" * "fam_0f_rev_d" * "fam_0f_rev_e" * "fam_0f_rev_f" * "fam_0f_rev_g" * "fam_10_rev_b" * "fam_10_rev_c" * "fam_11_rev_b" */ static char opt_famrev[14]; string_param("cpuid_mask_cpu", opt_famrev); /* 1 = allow, 0 = don't allow guest creation, -1 = don't allow boot */ s8 __read_mostly opt_allow_unsafe; boolean_param("allow_unsafe", opt_allow_unsafe); static inline void wrmsr_amd(unsigned int index, unsigned int lo, unsigned int hi) { asm volatile ( "wrmsr" : /* No outputs */ : "c" (index), "a" (lo), "d" (hi), "D" (0x9c5a203a) ); } static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo, unsigned int *hi) { int err; asm volatile("1: rdmsr\n2:\n" ".section .fixup,\"ax\"\n" "3: movl %6,%2\n" " jmp 2b\n" ".previous\n" _ASM_EXTABLE(1b, 3b) : "=a" (*lo), "=d" (*hi), "=r" (err) : "c" (msr), "D" (0x9c5a203a), "2" (0), "i" (-EFAULT)); return err; } static inline int wrmsr_amd_safe(unsigned int msr, unsigned int lo, unsigned int hi) { int err; asm volatile("1: wrmsr\n2:\n" ".section .fixup,\"ax\"\n" "3: movl %6,%0\n" " jmp 2b\n" ".previous\n" _ASM_EXTABLE(1b, 3b) : "=r" (err) : "c" (msr), "a" (lo), "d" (hi), "D" (0x9c5a203a), "0" (0), "i" (-EFAULT)); return err; } /* * Mask the features and extended features returned by CPUID. Parameters are * set from the boot line via two methods: * * 1) Specific processor revision string * 2) User-defined masks * * The processor revision string parameter has precedene. 
*/ static void __devinit set_cpuidmask(const struct cpuinfo_x86 *c) { static unsigned int feat_ecx, feat_edx; static unsigned int extfeat_ecx, extfeat_edx; static enum { not_parsed, no_mask, set_mask } status; if (status == no_mask) return; if (status == set_mask) goto setmask; ASSERT((status == not_parsed) && (smp_processor_id() == 0)); status = no_mask; if (~(opt_cpuid_mask_ecx & opt_cpuid_mask_edx & opt_cpuid_mask_ext_ecx & opt_cpuid_mask_ext_edx)) { feat_ecx = opt_cpuid_mask_ecx; feat_edx = opt_cpuid_mask_edx; extfeat_ecx = opt_cpuid_mask_ext_ecx; extfeat_edx = opt_cpuid_mask_ext_edx; } else if (*opt_famrev == '\0') { return; } else if (!strcmp(opt_famrev, "fam_0f_rev_c")) { feat_ecx = AMD_FEATURES_K8_REV_C_ECX; feat_edx = AMD_FEATURES_K8_REV_C_EDX; extfeat_ecx = AMD_EXTFEATURES_K8_REV_C_ECX; extfeat_edx = AMD_EXTFEATURES_K8_REV_C_EDX; } else if (!strcmp(opt_famrev, "fam_0f_rev_d")) { feat_ecx = AMD_FEATURES_K8_REV_D_ECX; feat_edx = AMD_FEATURES_K8_REV_D_EDX; extfeat_ecx = AMD_EXTFEATURES_K8_REV_D_ECX; extfeat_edx = AMD_EXTFEATURES_K8_REV_D_EDX; } else if (!strcmp(opt_famrev, "fam_0f_rev_e")) { feat_ecx = AMD_FEATURES_K8_REV_E_ECX; feat_edx = AMD_FEATURES_K8_REV_E_EDX; extfeat_ecx = AMD_EXTFEATURES_K8_REV_E_ECX; extfeat_edx = AMD_EXTFEATURES_K8_REV_E_EDX; } else if (!strcmp(opt_famrev, "fam_0f_rev_f")) { feat_ecx = AMD_FEATURES_K8_REV_F_ECX; feat_edx = AMD_FEATURES_K8_REV_F_EDX; extfeat_ecx = AMD_EXTFEATURES_K8_REV_F_ECX; extfeat_edx = AMD_EXTFEATURES_K8_REV_F_EDX; } else if (!strcmp(opt_famrev, "fam_0f_rev_g")) { feat_ecx = AMD_FEATURES_K8_REV_G_ECX; feat_edx = AMD_FEATURES_K8_REV_G_EDX; extfeat_ecx = AMD_EXTFEATURES_K8_REV_G_ECX; extfeat_edx = AMD_EXTFEATURES_K8_REV_G_EDX; } else if (!strcmp(opt_famrev, "fam_10_rev_b")) { feat_ecx = AMD_FEATURES_FAM10h_REV_B_ECX; feat_edx = AMD_FEATURES_FAM10h_REV_B_EDX; extfeat_ecx = AMD_EXTFEATURES_FAM10h_REV_B_ECX; extfeat_edx = AMD_EXTFEATURES_FAM10h_REV_B_EDX; } else if (!strcmp(opt_famrev, "fam_10_rev_c")) { feat_ecx = AMD_FEATURES_FAM10h_REV_C_ECX; feat_edx = AMD_FEATURES_FAM10h_REV_C_EDX; extfeat_ecx = AMD_EXTFEATURES_FAM10h_REV_C_ECX; extfeat_edx = AMD_EXTFEATURES_FAM10h_REV_C_EDX; } else if (!strcmp(opt_famrev, "fam_11_rev_b")) { feat_ecx = AMD_FEATURES_FAM11h_REV_B_ECX; feat_edx = AMD_FEATURES_FAM11h_REV_B_EDX; extfeat_ecx = AMD_EXTFEATURES_FAM11h_REV_B_ECX; extfeat_edx = AMD_EXTFEATURES_FAM11h_REV_B_EDX; } else { printk("Invalid processor string: %s\n", opt_famrev); printk("CPUID will not be masked\n"); return; } /* Setting bits in the CPUID mask MSR that are not set in the * unmasked CPUID response can cause those bits to be set in the * masked response. Avoid that by explicitly masking in software. */ feat_ecx &= cpuid_ecx(0x00000001); feat_edx &= cpuid_edx(0x00000001); extfeat_ecx &= cpuid_ecx(0x80000001); extfeat_edx &= cpuid_edx(0x80000001); status = set_mask; printk("Writing CPUID feature mask ECX:EDX -> %08Xh:%08Xh\n", feat_ecx, feat_edx); printk("Writing CPUID extended feature mask ECX:EDX -> %08Xh:%08Xh\n", extfeat_ecx, extfeat_edx); setmask: /* FIXME check if processor supports CPUID masking */ /* AMD processors prior to family 10h required a 32-bit password */ if (c->x86 >= 0x10) { wrmsr(MSR_K8_FEATURE_MASK, feat_edx, feat_ecx); wrmsr(MSR_K8_EXT_FEATURE_MASK, extfeat_edx, extfeat_ecx); } else { wrmsr_amd(MSR_K8_FEATURE_MASK, feat_edx, feat_ecx); wrmsr_amd(MSR_K8_EXT_FEATURE_MASK, extfeat_edx, extfeat_ecx); } } /* * Check for the presence of an AMD erratum. Arguments are defined in amd.h * for each known erratum. 
Return 1 if erratum is found. */ int cpu_has_amd_erratum(const struct cpuinfo_x86 *cpu, int osvw_id, ...) { va_list ap; u32 range; u32 ms; if (cpu->x86_vendor != X86_VENDOR_AMD) return 0; if (osvw_id >= 0 && cpu_has(cpu, X86_FEATURE_OSVW)) { u64 osvw_len; rdmsrl(MSR_AMD_OSVW_ID_LENGTH, osvw_len); if (osvw_id < osvw_len) { u64 osvw_bits; rdmsrl(MSR_AMD_OSVW_STATUS + (osvw_id >> 6), osvw_bits); return (osvw_bits >> (osvw_id & 0x3f)) & 1; } } /* OSVW unavailable or ID unknown, match family-model-stepping range */ va_start(ap, osvw_id); ms = (cpu->x86_model << 4) | cpu->x86_mask; while ((range = va_arg(ap, int))) { if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && (ms >= AMD_MODEL_RANGE_START(range)) && (ms <= AMD_MODEL_RANGE_END(range))) { va_end(ap); return 1; } } va_end(ap); return 0; } /* Can this system suffer from TSC drift due to C1 clock ramping? */ static int c1_ramping_may_cause_clock_drift(struct cpuinfo_x86 *c) { if (cpuid_edx(0x80000007) & (1<<8)) { /* * CPUID.AdvPowerMgmtInfo.TscInvariant * EDX bit 8, 8000_0007 * Invariant TSC on 8th Gen or newer, use it * (assume all cores have invariant TSC) */ return 0; } return 1; } /* * Disable C1-Clock ramping if enabled in PMM7.CpuLowPwrEnh on 8th-generation * cores only. Assume BIOS has setup all Northbridges equivalently. */ static void disable_c1_ramping(void) { u8 pmm7; int node, nr_nodes; /* Read the number of nodes from the first Northbridge. */ nr_nodes = ((pci_conf_read32(0, 0, 0x18, 0x0, 0x60)>>4)&0x07)+1; for (node = 0; node < nr_nodes; node++) { /* PMM7: bus=0, dev=0x18+node, function=0x3, register=0x87. */ pmm7 = pci_conf_read8(0, 0, 0x18+node, 0x3, 0x87); /* Invalid read means we've updated every Northbridge. */ if (pmm7 == 0xFF) break; pmm7 &= 0xFC; /* clear pmm7[1:0] */ pci_conf_write8(0, 0, 0x18+node, 0x3, 0x87, pmm7); printk ("AMD: Disabling C1 Clock Ramping Node #%x\n", node); } } int force_mwait __cpuinitdata; static void disable_c1e(void *unused) { uint64_t msr_content; /* * Disable C1E mode, as the APIC timer stops in that mode. * The MSR does not exist in all FamilyF CPUs (only Rev F and above), * but we safely catch the #GP in that case. */ if ((rdmsr_safe(MSR_K8_ENABLE_C1E, msr_content) == 0) && (msr_content & (3ULL << 27)) && (wrmsr_safe(MSR_K8_ENABLE_C1E, msr_content & ~(3ULL << 27)) != 0)) printk(KERN_ERR "Failed to disable C1E on CPU#%u (%16"PRIx64")\n", smp_processor_id(), msr_content); } static void check_disable_c1e(unsigned int port, u8 value) { /* C1E is sometimes enabled during entry to ACPI mode. */ if ((port == acpi_smi_cmd) && (value == acpi_enable_value)) on_each_cpu(disable_c1e, NULL, 1); } /* * BIOS is expected to clear MtrrFixDramModEn bit. According to AMD BKDG : * "The MtrrFixDramModEn bit should be set to 1 during BIOS initalization of * the fixed MTRRs, then cleared to 0 for operation." */ static void check_syscfg_dram_mod_en(void) { uint64_t syscfg; static bool_t printed = 0; if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0x0f))) return; rdmsrl(MSR_K8_SYSCFG, syscfg); if (!(syscfg & K8_MTRRFIXRANGE_DRAM_MODIFY)) return; if (!test_and_set_bool(printed)) printk(KERN_ERR "MTRR: SYSCFG[MtrrFixDramModEn] not " "cleared by BIOS, clearing this bit\n"); syscfg &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; wrmsrl(MSR_K8_SYSCFG, syscfg); } static void __devinit amd_get_topology(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT int cpu; unsigned bits; if (c->x86_max_cores <= 1) return; /* * On a AMD multi core setup the lower bits of the APIC id * distingush the cores. 
*/ cpu = smp_processor_id(); bits = (cpuid_ecx(0x80000008) >> 12) & 0xf; if (bits == 0) { while ((1 << bits) < c->x86_max_cores) bits++; } /* Low order bits define the core id */ c->cpu_core_id = c->phys_proc_id & ((1<phys_proc_id >>= bits; /* Collect compute unit ID if available */ if (cpu_has(c, X86_FEATURE_TOPOEXT)) { u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); c->compute_unit_id = ebx & 0xFF; c->x86_num_siblings = ((ebx >> 8) & 0x3) + 1; } if (opt_cpu_info) printk("CPU %d(%d) -> Processor %d, %s %d\n", cpu, c->x86_max_cores, c->phys_proc_id, cpu_has(c, X86_FEATURE_TOPOEXT) ? "Compute Unit" : "Core", cpu_has(c, X86_FEATURE_TOPOEXT) ? c->compute_unit_id : c->cpu_core_id); #endif } static void __devinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; unsigned long long value; /* Disable TLB flush filter by setting HWCR.FFDIS on K8 * bit 6 of msr C001_0015 * * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ if (c->x86 == 15) { rdmsrl(MSR_K7_HWCR, value); value |= 1 << 6; wrmsrl(MSR_K7_HWCR, value); } /* * FIXME: We should handle the K5 here. Set up the write * range and also turn on MSR 83 bits 4 and 31 (write alloc, * no bus pipeline) */ /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ clear_bit(0*32+31, c->x86_capability); if (c->x86 == 0xf && c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { /* * Some BIOSes incorrectly force this feature, but only K8 * revision D (model = 0x14) and later actually support it. * (AMD Erratum #110, docId: 25759). */ unsigned int lo, hi; clear_bit(X86_FEATURE_LAHF_LM, c->x86_capability); if (!rdmsr_amd_safe(0xc001100d, &lo, &hi)) { hi &= ~1; wrmsr_amd_safe(0xc001100d, lo, hi); } } switch(c->x86) { case 0xf ... 0x17: disable_c1e(NULL); if (acpi_smi_cmd && (acpi_enable_value | acpi_disable_value)) pv_post_outb_hook = check_disable_c1e; break; } display_cacheinfo(c); if (cpuid_eax(0x80000000) >= 0x80000008) { c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; } if (cpuid_eax(0x80000000) >= 0x80000007) { c->x86_power = cpuid_edx(0x80000007); if (c->x86_power & (1<<8)) { set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability); if (c->x86 != 0x11) set_bit(X86_FEATURE_TSC_RELIABLE, c->x86_capability); } } /* re-enable TopologyExtensions if switched off by BIOS */ if ((c->x86 == 0x15) && (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && !cpu_has(c, X86_FEATURE_TOPOEXT) && !rdmsr_safe(MSR_K8_EXT_FEATURE_MASK, value)) { value |= 1ULL << 54; wrmsr_safe(MSR_K8_EXT_FEATURE_MASK, value); rdmsrl(MSR_K8_EXT_FEATURE_MASK, value); if (value & (1ULL << 54)) { set_bit(X86_FEATURE_TOPOEXT, c->x86_capability); printk(KERN_INFO "CPU: Re-enabling disabled " "Topology Extensions Support\n"); } } /* * The way access filter has a performance penalty on some workloads. * Disable it on the affected CPUs. */ if (c->x86 == 0x15 && c->x86_model >= 0x02 && c->x86_model < 0x20 && !rdmsr_safe(MSR_AMD64_IC_CFG, value) && (value & 0x1e) != 0x1e) wrmsr_safe(MSR_AMD64_IC_CFG, value | 0x1e); amd_get_topology(c); /* Pointless to use MWAIT on Family10 as it does not deep sleep. 
*/ if (c->x86 >= 0x10 && !force_mwait) clear_bit(X86_FEATURE_MWAIT, c->x86_capability); if (!cpu_has_amd_erratum(c, AMD_ERRATUM_121)) opt_allow_unsafe = 1; else if (opt_allow_unsafe < 0) panic("Xen will not boot on this CPU for security reasons" "Pass \"allow_unsafe\" if you're trusting all your" " (PV) guest kernels.\n"); else if (!opt_allow_unsafe && c == &boot_cpu_data) printk(KERN_WARNING "*** Xen will not allow creation of DomU-s on" " this CPU for security reasons. ***\n" KERN_WARNING "*** Pass \"allow_unsafe\" if you're trusting" " all your (PV) guest kernels. ***\n"); if (c->x86 == 0x16 && c->x86_model <= 0xf) { if (c == &boot_cpu_data) { l = pci_conf_read32(0, 0, 0x18, 0x3, 0x58); h = pci_conf_read32(0, 0, 0x18, 0x3, 0x5c); if ((l & 0x1f) | (h & 0x1)) printk(KERN_WARNING "Applying workaround for erratum 792: %s%s%s\n", (l & 0x1f) ? "clearing D18F3x58[4:0]" : "", ((l & 0x1f) && (h & 0x1)) ? " and " : "", (h & 0x1) ? "clearing D18F3x5C[0]" : ""); if (l & 0x1f) pci_conf_write32(0, 0, 0x18, 0x3, 0x58, l & ~0x1f); if (h & 0x1) pci_conf_write32(0, 0, 0x18, 0x3, 0x5c, h & ~0x1); } rdmsrl(MSR_AMD64_LS_CFG, value); if (!(value & (1 << 15))) { static bool_t warned; if (c == &boot_cpu_data || opt_cpu_info || !test_and_set_bool(warned)) printk(KERN_WARNING "CPU%u: Applying workaround for erratum 793\n", smp_processor_id()); wrmsrl(MSR_AMD64_LS_CFG, value | (1 << 15)); } } /* AMD CPUs do not support SYSENTER outside of legacy mode. */ clear_bit(X86_FEATURE_SEP, c->x86_capability); if (c->x86 == 0x10) { /* do this for boot cpu */ if (c == &boot_cpu_data) check_enable_amd_mmconf_dmi(); fam10h_check_enable_mmcfg(); /* * On family 10h BIOS may not have properly enabled WC+ * support, causing it to be converted to CD memtype. This may * result in performance degradation for certain nested-paging * guests. Prevent this conversion by clearing bit 24 in * MSR_F10_BU_CFG2. */ rdmsrl(MSR_F10_BU_CFG2, value); value &= ~(1ULL << 24); wrmsrl(MSR_F10_BU_CFG2, value); } /* * Family 0x12 and above processors have APIC timer * running in deep C states. */ if ( opt_arat && c->x86 > 0x11 ) set_bit(X86_FEATURE_ARAT, c->x86_capability); /* * Prior to Family 0x14, perf counters are not reset during warm reboot. * We have to reset them manually. */ if (nmi_watchdog != NMI_LOCAL_APIC && c->x86 < 0x14) { wrmsrl(MSR_K7_PERFCTR0, 0); wrmsrl(MSR_K7_PERFCTR1, 0); wrmsrl(MSR_K7_PERFCTR2, 0); wrmsrl(MSR_K7_PERFCTR3, 0); } if (cpuid_edx(0x80000007) & (1 << 10)) { rdmsr(MSR_K7_HWCR, l, h); l |= (1 << 27); /* Enable read-only APERF/MPERF bit */ wrmsr(MSR_K7_HWCR, l, h); } /* Prevent TSC drift in non single-processor, single-core platforms. */ if ((smp_processor_id() == 1) && c1_ramping_may_cause_clock_drift(c)) disable_c1_ramping(); set_cpuidmask(c); check_syscfg_dram_mod_en(); } static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, .c_init = init_amd, }; int __init amd_init_cpu(void) { cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev; return 0; } xen-4.4.0/xen/arch/x86/cpu/mwait-idle.c0000664000175000017500000003515312307313555015575 0ustar smbsmb/* * mwait_idle.c - native hardware idle loop for modern processors * * Copyright (c) 2010, Intel Corporation. * Len Brown * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. 
* * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ /* * mwait_idle is a cpuidle driver that loads on specific processors * in lieu of the legacy ACPI processor_idle driver. The intent is to * make Linux more efficient on these processors, as mwait_idle knows * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs. */ /* * Design Assumptions * * All CPUs have same idle states as boot CPU * * Chipset BM_STS (bus master status) bit is a NOP * for preventing entry into deep C-states */ /* * Known limitations * * The driver currently initializes for_each_online_cpu() upon load. * It it unaware of subsequent processors hot-added to the system. * This means that if you boot with maxcpus=n and later online * processors above n, those processors will use C1 only. * * ACPI has a .suspend hack to turn off deep C-states during suspend * to avoid complications with the lapic timer workaround. * Have not seen issues with suspend, but may need same workaround here. */ /* un-comment DEBUG to enable pr_debug() statements */ #define DEBUG #include #include #include #include #include #include #include #include #include #include #define MWAIT_IDLE_VERSION "0.4" #undef PREFIX #define PREFIX "mwait-idle: " #ifdef DEBUG # define pr_debug(fmt...) printk(KERN_DEBUG fmt) #else # define pr_debug(fmt...) #endif static __initdata bool_t no_mwait_idle; invbool_param("mwait-idle", no_mwait_idle); static unsigned int mwait_substates; #define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF /* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */ static unsigned int lapic_timer_reliable_states = (1 << 1); struct idle_cpu { const struct cpuidle_state *state_table; /* * Hardware C-state auto-demotion may not always be optimal. * Indicate which enable bits to clear here. */ unsigned long auto_demotion_disable_flags; bool_t disable_promotion_to_c1e; }; static const struct idle_cpu *icpu; static const struct cpuidle_state { char name[16]; unsigned int flags; unsigned int exit_latency; /* in US */ unsigned int target_residency; /* in US */ } *cpuidle_state_table; /* * Set this flag for states where the HW flushes the TLB for us * and so we don't need cross-calls to keep it consistent. * If this flag is set, SW flushes the TLB, so even if the * HW doesn't do the flushing, this flag is safe to use. */ #define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 /* * MWAIT takes an 8-bit "hint" in EAX "suggesting" * the C-state (top nibble) and sub-state (bottom nibble) * 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc. * * We store the hint at the top of our "flags" for each state. */ #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF) #define MWAIT2flg(eax) ((eax & 0xFF) << 24) #define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) #define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK) /* * States are indexed by the cstate number, * which is also the index into the MWAIT hint array. * Thus C0 is a dummy. 
*/ static const struct cpuidle_state nehalem_cstates[] = { { .name = "C1-NHM", .flags = MWAIT2flg(0x00), .exit_latency = 3, .target_residency = 6, }, { .name = "C1E-NHM", .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, }, { .name = "C3-NHM", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 20, .target_residency = 80, }, { .name = "C6-NHM", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 200, .target_residency = 800, }, {} }; static const struct cpuidle_state snb_cstates[] = { { .name = "C1-SNB", .flags = MWAIT2flg(0x00), .exit_latency = 2, .target_residency = 2, }, { .name = "C1E-SNB", .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, }, { .name = "C3-SNB", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 80, .target_residency = 211, }, { .name = "C6-SNB", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 104, .target_residency = 345, }, { .name = "C7-SNB", .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 109, .target_residency = 345, }, {} }; static const struct cpuidle_state ivb_cstates[] = { { .name = "C1-IVB", .flags = MWAIT2flg(0x00), .exit_latency = 1, .target_residency = 1, }, { .name = "C1E-IVB", .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, }, { .name = "C3-IVB", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 59, .target_residency = 156, }, { .name = "C6-IVB", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 80, .target_residency = 300, }, { .name = "C7-IVB", .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 87, .target_residency = 300, }, {} }; static const struct cpuidle_state hsw_cstates[] = { { .name = "C1-HSW", .flags = MWAIT2flg(0x00), .exit_latency = 2, .target_residency = 2, }, { .name = "C1E-HSW", .flags = MWAIT2flg(0x01), .exit_latency = 10, .target_residency = 20, }, { .name = "C3-HSW", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 33, .target_residency = 100, }, { .name = "C6-HSW", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, .target_residency = 400, }, { .name = "C7s-HSW", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, .target_residency = 500, }, { .name = "C8-HSW", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, .target_residency = 900, }, { .name = "C9-HSW", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, .target_residency = 1800, }, { .name = "C10-HSW", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, .target_residency = 7700, }, {} }; static const struct cpuidle_state atom_cstates[] = { { .name = "C1E-ATM", .flags = MWAIT2flg(0x00), .exit_latency = 10, .target_residency = 20, }, { .name = "C2-ATM", .flags = MWAIT2flg(0x10), .exit_latency = 20, .target_residency = 80, }, { .name = "C4-ATM", .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 100, .target_residency = 400, }, { .name = "C6-ATM", .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 140, .target_residency = 560, }, {} }; static void mwait_idle(void) { unsigned int cpu = smp_processor_id(); struct acpi_processor_power *power = processor_powers[cpu]; struct acpi_processor_cx *cx = NULL; unsigned int eax, next_state, cstate; u64 before, after; u32 exp = 0, pred = 0, irq_traced[4] = { 0 }; if (max_cstate > 0 && power && !sched_has_urgent_vcpu() && (next_state = 
cpuidle_current_governor->select(power)) > 0) { do { cx = &power->states[next_state]; } while (cx->type > max_cstate && --next_state); if (!next_state) cx = NULL; menu_get_trace_data(&exp, &pred); } if (!cx) { if (pm_idle_save) pm_idle_save(); else safe_halt(); return; } cpufreq_dbs_timer_suspend(); sched_tick_suspend(); /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */ process_pending_softirqs(); /* Interrupts must be disabled for C2 and higher transitions. */ local_irq_disable(); if (!cpu_is_haltable(cpu)) { local_irq_enable(); sched_tick_resume(); cpufreq_dbs_timer_resume(); return; } power->last_state = cx; eax = cx->address; cstate = ((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; #if 0 /* XXX Can we/do we need to do something similar on Xen? */ /* * leave_mm() to avoid costly and often unnecessary wakeups * for flushing the user TLB's associated with the active mm. */ if (cpuidle_state_table[].flags & CPUIDLE_FLAG_TLB_FLUSHED) leave_mm(cpu); #endif if (!(lapic_timer_reliable_states & (1 << cstate))) lapic_timer_off(); before = cpuidle_get_tick(); TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, before, exp, pred); if (cpu_is_haltable(cpu)) mwait_idle_with_hints(eax, MWAIT_ECX_INTERRUPT_BREAK); after = cpuidle_get_tick(); cstate_restore_tsc(); trace_exit_reason(irq_traced); TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, after, irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]); update_idle_stats(power, cx, before, after); local_irq_enable(); if (!(lapic_timer_reliable_states & (1 << cstate))) lapic_timer_on(); /* Now back in C0. */ power->last_state = &power->states[0]; sched_tick_resume(); cpufreq_dbs_timer_resume(); if ( cpuidle_current_governor->reflect ) cpuidle_current_governor->reflect(power); } static void auto_demotion_disable(void *dummy) { u64 msr_bits; rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); msr_bits &= ~(icpu->auto_demotion_disable_flags); wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); } static void c1e_promotion_disable(void *dummy) { u64 msr_bits; rdmsrl(MSR_IA32_POWER_CTL, msr_bits); msr_bits &= ~0x2; wrmsrl(MSR_IA32_POWER_CTL, msr_bits); } static const struct idle_cpu idle_cpu_nehalem = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, .disable_promotion_to_c1e = 1, }; static const struct idle_cpu idle_cpu_atom = { .state_table = atom_cstates, }; static const struct idle_cpu idle_cpu_lincroft = { .state_table = atom_cstates, .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, }; static const struct idle_cpu idle_cpu_snb = { .state_table = snb_cstates, .disable_promotion_to_c1e = 1, }; static const struct idle_cpu idle_cpu_ivb = { .state_table = ivb_cstates, .disable_promotion_to_c1e = 1, }; static const struct idle_cpu idle_cpu_hsw = { .state_table = hsw_cstates, .disable_promotion_to_c1e = 1, }; #define ICPU(model, cpu) { 6, model, &idle_cpu_##cpu } static struct intel_idle_id { unsigned int family, model; const struct idle_cpu *data; } intel_idle_ids[] __initdata = { ICPU(0x1a, nehalem), ICPU(0x1e, nehalem), ICPU(0x1f, nehalem), ICPU(0x25, nehalem), ICPU(0x2c, nehalem), ICPU(0x2e, nehalem), ICPU(0x2f, nehalem), ICPU(0x1c, atom), ICPU(0x26, lincroft), ICPU(0x2a, snb), ICPU(0x2d, snb), ICPU(0x3a, ivb), ICPU(0x3e, ivb), ICPU(0x3c, hsw), ICPU(0x3f, hsw), ICPU(0x45, hsw), ICPU(0x46, hsw), {} }; static int __init mwait_idle_probe(void) { unsigned int eax, ebx, ecx; const struct intel_idle_id *id; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || !boot_cpu_has(X86_FEATURE_MWAIT) || 
boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) return -ENODEV; for (id = intel_idle_ids; id->family; ++id) if (id->family == boot_cpu_data.x86 && id->model == boot_cpu_data.x86_model) break; if (!id->family) { pr_debug(PREFIX "does not run on family %d model %d\n", boot_cpu_data.x86, boot_cpu_data.x86_model); return -ENODEV; } cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || !(ecx & CPUID5_ECX_INTERRUPT_BREAK) || !mwait_substates) return -ENODEV; if (!max_cstate || no_mwait_idle) { pr_debug(PREFIX "disabled\n"); return -EPERM; } pr_debug(PREFIX "MWAIT substates: %#x\n", mwait_substates); icpu = id->data; cpuidle_state_table = icpu->state_table; if (boot_cpu_has(X86_FEATURE_ARAT)) lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; pr_debug(PREFIX "v" MWAIT_IDLE_VERSION " model %#x\n", boot_cpu_data.x86_model); pr_debug(PREFIX "lapic_timer_reliable_states %#x\n", lapic_timer_reliable_states); return 0; } static int mwait_idle_cpu_init(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu, cstate; struct acpi_processor_power *dev = processor_powers[cpu]; switch (action) { default: return NOTIFY_DONE; case CPU_UP_PREPARE: cpuidle_init_cpu(cpu); return NOTIFY_DONE; case CPU_ONLINE: if (!dev) return NOTIFY_DONE; break; } dev->count = 1; for (cstate = 0; cpuidle_state_table[cstate].target_residency; ++cstate) { unsigned int num_substates, hint, state, substate; struct acpi_processor_cx *cx; hint = flg2MWAIT(cpuidle_state_table[cstate].flags); state = MWAIT_HINT2CSTATE(hint) + 1; substate = MWAIT_HINT2SUBSTATE(hint); if (state > max_cstate) { printk(PREFIX "max C-state %u reached\n", max_cstate); break; } /* Does the state exist in CPUID.MWAIT? */ num_substates = (mwait_substates >> (state * 4)) & MWAIT_SUBSTATE_MASK; /* if sub-state in table is not enumerated by CPUID */ if (substate >= num_substates) continue; if (dev->count >= ACPI_PROCESSOR_MAX_POWER) { printk(PREFIX "max C-state count of %u reached\n", ACPI_PROCESSOR_MAX_POWER); break; } if (state > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && !pm_idle_save) setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE); cx = dev->states + dev->count; cx->type = state; cx->address = hint; cx->entry_method = ACPI_CSTATE_EM_FFH; cx->latency = cpuidle_state_table[cstate].exit_latency; cx->target_residency = cpuidle_state_table[cstate].target_residency; dev->count++; } if (icpu->auto_demotion_disable_flags) on_selected_cpus(cpumask_of(cpu), auto_demotion_disable, NULL, 1); if (icpu->disable_promotion_to_c1e) on_selected_cpus(cpumask_of(cpu), c1e_promotion_disable, NULL, 1); return NOTIFY_DONE; } int __init mwait_idle_init(struct notifier_block *nfb) { int err; if (pm_idle_save) return -ENODEV; err = mwait_idle_probe(); if (!err && !boot_cpu_has(X86_FEATURE_ARAT)) { hpet_broadcast_init(); if (xen_cpuidle < 0 && !hpet_broadcast_is_available()) err = -ENODEV; else if(!lapic_timer_init()) err = -EINVAL; if (err) pr_debug(PREFIX "not used (%d)\n", err); } if (!err) { nfb->notifier_call = mwait_idle_cpu_init; mwait_idle_cpu_init(nfb, CPU_UP_PREPARE, NULL); pm_idle_save = pm_idle; pm_idle = mwait_idle; dead_idle = acpi_dead_idle; } return err; } xen-4.4.0/xen/arch/x86/cpu/mcheck/0000775000175000017500000000000012307313555014620 5ustar smbsmbxen-4.4.0/xen/arch/x86/cpu/mcheck/mctelem.c0000664000175000017500000003567012307313555016425 0ustar smbsmb/* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation, version 2 of the * License. */ /* * mctelem.c - x86 Machine Check Telemetry Transport */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mce.h" struct mctelem_ent { struct mctelem_ent *mcte_next; /* next in chronological order */ struct mctelem_ent *mcte_prev; /* previous in chronological order */ uint32_t mcte_flags; /* See MCTE_F_* below */ uint32_t mcte_refcnt; /* Reference count */ void *mcte_data; /* corresponding data payload */ }; #define MCTE_F_HOME_URGENT 0x0001U /* free to urgent freelist */ #define MCTE_F_HOME_NONURGENT 0x0002U /* free to nonurgent freelist */ #define MCTE_F_CLASS_URGENT 0x0004U /* in use - urgent errors */ #define MCTE_F_CLASS_NONURGENT 0x0008U /* in use - nonurgent errors */ #define MCTE_F_STATE_FREE 0x0010U /* on a freelist */ #define MCTE_F_STATE_UNCOMMITTED 0x0020U /* reserved; on no list */ #define MCTE_F_STATE_COMMITTED 0x0040U /* on a committed list */ #define MCTE_F_STATE_PROCESSING 0x0080U /* on a processing list */ #define MCTE_F_MASK_HOME (MCTE_F_HOME_URGENT | MCTE_F_HOME_NONURGENT) #define MCTE_F_MASK_CLASS (MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT) #define MCTE_F_MASK_STATE (MCTE_F_STATE_FREE | \ MCTE_F_STATE_UNCOMMITTED | \ MCTE_F_STATE_COMMITTED | \ MCTE_F_STATE_PROCESSING) #define MCTE_HOME(tep) ((tep)->mcte_flags & MCTE_F_MASK_HOME) #define MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS) #define MCTE_SET_CLASS(tep, new) do { \ (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \ (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0) #define MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE) #define MCTE_TRANSITION_STATE(tep, old, new) do { \ BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \ (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \ (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0) #define MC_URGENT_NENT 10 #define MC_NONURGENT_NENT 20 #define MC_NCLASSES (MC_NONURGENT + 1) #define COOKIE2MCTE(c) ((struct mctelem_ent *)(c)) #define MCTE2COOKIE(tep) ((mctelem_cookie_t)(tep)) static struct mc_telem_ctl { /* Linked lists that thread the array members together. * * The free lists are singly-linked via mcte_next, and we allocate * from them by atomically unlinking an element from the head. * Consumed entries are returned to the head of the free list. * When an entry is reserved off the free list it is not linked * on any list until it is committed or dismissed. * * The committed list grows at the head and we do not maintain a * tail pointer; insertions are performed atomically. The head * thus has the most-recently committed telemetry, i.e. the * list is in reverse chronological order. The committed list * is singly-linked via mcte_prev pointers, and mcte_next is NULL. * When we move telemetry from the committed list to the processing * list we atomically unlink the committed list and keep a pointer * to the head of that list; we then traverse the list following * mcte_prev and fill in mcte_next to doubly-link the list, and then * append the tail of the list onto the processing list. If we panic * during this manipulation of the committed list we still have * the pointer to its head so we can recover all entries during * the panic flow (albeit in reverse chronological order). * * The processing list is updated in a controlled context, and * we can lock it for updates. 
The head of the processing list * always has the oldest telemetry, and we append (as above) * at the tail of the processing list. */ struct mctelem_ent *mctc_free[MC_NCLASSES]; struct mctelem_ent *mctc_committed[MC_NCLASSES]; struct mctelem_ent *mctc_processing_head[MC_NCLASSES]; struct mctelem_ent *mctc_processing_tail[MC_NCLASSES]; /* * Telemetry array */ struct mctelem_ent *mctc_elems; } mctctl; struct mc_telem_cpu_ctl { /* * Per-CPU processing lists, used for deferred (softirq) * processing of telemetry. @pending is indexed by the * CPU that the telemetry belongs to. @processing is indexed * by the CPU that is processing the telemetry. */ struct mctelem_ent *pending; struct mctelem_ent *processing; }; static DEFINE_PER_CPU(struct mc_telem_cpu_ctl, mctctl); /* Lock protecting all processing lists */ static DEFINE_SPINLOCK(processing_lock); static void mctelem_xchg_head(struct mctelem_ent **headp, struct mctelem_ent **linkp, struct mctelem_ent *new) { for (;;) { struct mctelem_ent *old; *linkp = old = *headp; if (cmpxchgptr(headp, old, new) == old) break; } } void mctelem_defer(mctelem_cookie_t cookie) { struct mctelem_ent *tep = COOKIE2MCTE(cookie); mctelem_xchg_head(&this_cpu(mctctl.pending), &tep->mcte_next, tep); } void mctelem_process_deferred(unsigned int cpu, int (*fn)(mctelem_cookie_t)) { struct mctelem_ent *tep; struct mctelem_ent *head, *prev; int ret; /* * First, unhook the list of telemetry structures, and * hook it up to the processing list head for this CPU. */ mctelem_xchg_head(&per_cpu(mctctl.pending, cpu), &this_cpu(mctctl.processing), NULL); head = this_cpu(mctctl.processing); /* * Then, fix up the list to include prev pointers, to make * things a little easier, as the list must be traversed in * chronological order, which is backward from the order they * are in. */ for (tep = head, prev = NULL; tep != NULL; tep = tep->mcte_next) { tep->mcte_prev = prev; prev = tep; } /* * Now walk the list of telemetry structures, handling each * one of them. Unhooking the structure here does not need to * be atomic, as this list is only accessed from a softirq * context; the MCE handler does not touch it. */ for (tep = prev; tep != NULL; tep = prev) { prev = tep->mcte_prev; tep->mcte_next = tep->mcte_prev = NULL; ret = fn(MCTE2COOKIE(tep)); if (prev != NULL) prev->mcte_next = NULL; tep->mcte_prev = tep->mcte_next = NULL; if (ret != 0) mctelem_commit(MCTE2COOKIE(tep)); else mctelem_dismiss(MCTE2COOKIE(tep)); } } int mctelem_has_deferred(unsigned int cpu) { if (per_cpu(mctctl.pending, cpu) != NULL) return 1; return 0; } /* Free an entry to its native free list; the entry must not be linked on * any list. */ static void mctelem_free(struct mctelem_ent *tep) { mctelem_class_t target = MCTE_HOME(tep) == MCTE_F_HOME_URGENT ? MC_URGENT : MC_NONURGENT; BUG_ON(tep->mcte_refcnt != 0); BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE); tep->mcte_prev = NULL; mctelem_xchg_head(&mctctl.mctc_free[target], &tep->mcte_next, tep); } /* Increment the reference count of an entry that is not linked on to * any list and which only the caller has a pointer to. */ static void mctelem_hold(struct mctelem_ent *tep) { tep->mcte_refcnt++; } /* Increment the reference count on an entry that is linked at the head of * a processing list. The caller is responsible for locking the list. */ static void mctelem_processing_hold(struct mctelem_ent *tep) { int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ? 
MC_URGENT : MC_NONURGENT; BUG_ON(tep != mctctl.mctc_processing_head[which]); tep->mcte_refcnt++; } /* Decrement the reference count on an entry that is linked at the head of * a processing list. The caller is responsible for locking the list. */ static void mctelem_processing_release(struct mctelem_ent *tep) { int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ? MC_URGENT : MC_NONURGENT; BUG_ON(tep != mctctl.mctc_processing_head[which]); if (--tep->mcte_refcnt == 0) { MCTE_TRANSITION_STATE(tep, PROCESSING, FREE); mctctl.mctc_processing_head[which] = tep->mcte_next; mctelem_free(tep); } } void mctelem_init(int reqdatasz) { static int called = 0; static int datasz = 0, realdatasz = 0; char *datarr; int i; BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2); /* Called from mcheck_init for all processors; initialize for the * first call only (no race here since the boot cpu completes * init before others start up). */ if (++called == 1) { realdatasz = reqdatasz; datasz = (reqdatasz & ~0xf) + 0x10; /* 16 byte roundup */ } else { BUG_ON(reqdatasz != realdatasz); return; } if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent, MC_URGENT_NENT + MC_NONURGENT_NENT)) == NULL || (datarr = xmalloc_bytes((MC_URGENT_NENT + MC_NONURGENT_NENT) * datasz)) == NULL) { if (mctctl.mctc_elems) xfree(mctctl.mctc_elems); printk("Allocations for MCA telemetry failed\n"); return; } for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) { struct mctelem_ent *tep, **tepp; tep = mctctl.mctc_elems + i; tep->mcte_flags = MCTE_F_STATE_FREE; tep->mcte_refcnt = 0; tep->mcte_data = datarr + i * datasz; if (i < MC_URGENT_NENT) { tepp = &mctctl.mctc_free[MC_URGENT]; tep->mcte_flags |= MCTE_F_HOME_URGENT; } else { tepp = &mctctl.mctc_free[MC_NONURGENT]; tep->mcte_flags |= MCTE_F_HOME_NONURGENT; } tep->mcte_next = *tepp; tep->mcte_prev = NULL; *tepp = tep; } } /* incremented non-atomically when reserve fails */ static int mctelem_drop_count; /* Reserve a telemetry entry, or return NULL if none available. * If we return an entry then the caller must subsequently call exactly one of * mctelem_unreserve or mctelem_commit for that entry. */ mctelem_cookie_t mctelem_reserve(mctelem_class_t which) { struct mctelem_ent **freelp; struct mctelem_ent *oldhead, *newhead; mctelem_class_t target = (which == MC_URGENT) ? MC_URGENT : MC_NONURGENT; freelp = &mctctl.mctc_free[target]; for (;;) { if ((oldhead = *freelp) == NULL) { if (which == MC_URGENT && target == MC_URGENT) { /* raid the non-urgent freelist */ target = MC_NONURGENT; freelp = &mctctl.mctc_free[target]; continue; } else { mctelem_drop_count++; return (NULL); } } newhead = oldhead->mcte_next; if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) { struct mctelem_ent *tep = oldhead; mctelem_hold(tep); MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED); tep->mcte_next = NULL; tep->mcte_prev = NULL; if (which == MC_URGENT) MCTE_SET_CLASS(tep, URGENT); else MCTE_SET_CLASS(tep, NONURGENT); return MCTE2COOKIE(tep); } } } void *mctelem_dataptr(mctelem_cookie_t cookie) { struct mctelem_ent *tep = COOKIE2MCTE(cookie); return tep->mcte_data; } /* Release a previously reserved entry back to the freelist without * submitting it for logging. The entry must not be linked on to any * list - that's how mctelem_reserve handed it out. */ void mctelem_dismiss(mctelem_cookie_t cookie) { struct mctelem_ent *tep = COOKIE2MCTE(cookie); tep->mcte_refcnt--; MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE); mctelem_free(tep); } /* Commit an entry with completed telemetry for logging. 
The caller must * not reference the entry after this call. Note that we add entries * at the head of the committed list, so that list therefore has entries * in reverse chronological order. */ void mctelem_commit(mctelem_cookie_t cookie) { struct mctelem_ent *tep = COOKIE2MCTE(cookie); mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ? MC_URGENT : MC_NONURGENT; BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL); MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED); mctelem_xchg_head(&mctctl.mctc_committed[target], &tep->mcte_prev, tep); } /* Move telemetry from committed list to processing list, reversing the * list into chronological order. The processing list has been * locked by the caller, and may be non-empty. We append the * reversed committed list on to the tail of the processing list. * The committed list may grow even while we run, so use atomic * operations to swap NULL to the freelist head. * * Note that "chronological order" means the order in which producers * won additions to the processing list, which may not reflect the * strict chronological order of the associated events if events are * closely spaced in time and contend for the processing list at once. */ static struct mctelem_ent *dangling[MC_NCLASSES]; static void mctelem_append_processing(mctelem_class_t which) { mctelem_class_t target = which == MC_URGENT ? MC_URGENT : MC_NONURGENT; struct mctelem_ent **commlp = &mctctl.mctc_committed[target]; struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target]; struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target]; struct mctelem_ent *tep, *ltep; /* Check for an empty list; no race since we hold the processing lock */ if (*commlp == NULL) return; /* Atomically unlink the committed list, and keep a pointer to * the list we unlink in a well-known location so it can be * picked up in panic code should we panic between this unlink * and the append to the processing list. */ mctelem_xchg_head(commlp, &dangling[target], NULL); if (dangling[target] == NULL) return; /* Traverse the list following the previous pointers (reverse * chronological order). For each entry fill in the next pointer * and transition the element state. */ for (tep = dangling[target], ltep = NULL; tep != NULL; tep = tep->mcte_prev) { MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING); tep->mcte_next = ltep; ltep = tep; } /* ltep points to the head of a chronologically ordered linked * list of telemetry entries ending at the most recent entry * dangling[target] if mcte_next is followed; tack this on to * the processing list. */ if (*proclhp == NULL) { *proclhp = ltep; *procltp = dangling[target]; } else { (*procltp)->mcte_next = ltep; ltep->mcte_prev = *procltp; *procltp = dangling[target]; } wmb(); dangling[target] = NULL; wmb(); } mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which) { mctelem_class_t target = (which == MC_URGENT) ? 
MC_URGENT : MC_NONURGENT; struct mctelem_ent *tep; spin_lock(&processing_lock); mctelem_append_processing(target); if ((tep = mctctl.mctc_processing_head[target]) == NULL) { spin_unlock(&processing_lock); return NULL; } mctelem_processing_hold(tep); wmb(); spin_unlock(&processing_lock); return MCTE2COOKIE(tep); } void mctelem_consume_oldest_end(mctelem_cookie_t cookie) { struct mctelem_ent *tep = COOKIE2MCTE(cookie); spin_lock(&processing_lock); mctelem_processing_release(tep); wmb(); spin_unlock(&processing_lock); } void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie) { mctelem_class_t target = (which == MC_URGENT) ? MC_URGENT : MC_NONURGENT; struct mctelem_ent *tep = COOKIE2MCTE(cookie); if (tep == NULL) return; spin_lock(&processing_lock); if (tep == mctctl.mctc_processing_head[target]) mctelem_processing_release(tep); wmb(); spin_unlock(&processing_lock); } xen-4.4.0/xen/arch/x86/cpu/mcheck/Makefile0000664000175000017500000000036712307313555016266 0ustar smbsmbobj-y += amd_nonfatal.o obj-y += amd_k8.o obj-y += amd_f10.o obj-y += mce_amd.o obj-y += mcaction.o obj-y += barrier.o obj-y += mctelem.o obj-y += mce.o obj-y += mce-apei.o obj-y += mce_intel.o obj-y += non-fatal.o obj-y += util.o obj-y += vmce.o xen-4.4.0/xen/arch/x86/cpu/mcheck/mce.h0000664000175000017500000001605212307313555015541 0ustar smbsmb#ifndef _MCE_H #define _MCE_H #include #include #include #include #include #include #include #include "x86_mca.h" #include "mctelem.h" #define MCE_QUIET 0 #define MCE_VERBOSE 1 /* !only for developer debug as printk is unsafe in MCE context */ #define MCE_CRITICAL 2 extern int mce_verbosity; /* Define the default level of machine check related print. * When set mce_verbosity=verbose, all mce debug information * will be printed, otherwise, those information will not be * printed. */ #define mce_printk(v, s, a...) do { \ if ((v) <= mce_verbosity) \ printk(s, ##a); \ } while (0) enum mcheck_type { mcheck_unset = -1, mcheck_none, mcheck_amd_famXX, mcheck_amd_k8, mcheck_intel }; extern uint8_t cmci_apic_vector; /* Init functions */ enum mcheck_type amd_mcheck_init(struct cpuinfo_x86 *c); enum mcheck_type intel_mcheck_init(struct cpuinfo_x86 *c, bool_t bsp); void intel_mcheck_timer(struct cpuinfo_x86 *c); void mce_intel_feature_init(struct cpuinfo_x86 *c); void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c); uint64_t mce_cap_init(void); extern unsigned int firstbank; struct mcinfo_extended *intel_get_extended_msrs( struct mcinfo_global *mig, struct mc_info *mi); int mce_available(struct cpuinfo_x86 *c); unsigned int mce_firstbank(struct cpuinfo_x86 *c); /* Helper functions used for collecting error telemetry */ struct mc_info *x86_mcinfo_getptr(void); void mc_panic(char *s); void x86_mc_get_cpu_info(unsigned, uint32_t *, uint16_t *, uint16_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *); /* Register a handler for machine check exceptions. */ typedef void (*x86_mce_vector_t)(struct cpu_user_regs *, long); extern void x86_mce_vector_register(x86_mce_vector_t); /* Common generic MCE handler that implementations may nominate * via x86_mce_vector_register. */ extern void mcheck_cmn_handler(struct cpu_user_regs *, long, struct mca_banks *, struct mca_banks *); /* Register a handler for judging whether mce is recoverable. 
*/ typedef int (*mce_recoverable_t)(uint64_t status); extern void mce_recoverable_register(mce_recoverable_t); /* Read an MSR, checking for an interposed value first */ extern struct intpose_ent *intpose_lookup(unsigned int, uint64_t, uint64_t *); extern bool_t intpose_inval(unsigned int, uint64_t); static inline uint64_t mca_rdmsr(unsigned int msr) { uint64_t val; if (intpose_lookup(smp_processor_id(), msr, &val) == NULL) rdmsrl(msr, val); return val; } /* Write an MSR, invalidating any interposed value */ #define mca_wrmsr(msr, val) do { \ if ( !intpose_inval(smp_processor_id(), msr) ) \ wrmsrl(msr, val); \ } while ( 0 ) /* Utility function to "logout" all architectural MCA telemetry from the MCA * banks of the current processor. A cookie is returned which may be * uses to reference the data so logged (the cookie can be NULL if * no logout structures were available). The caller can also pass a pointer * to a structure which will be completed with some summary information * of the MCA data observed in the logout operation. */ enum mca_source { MCA_POLLER, MCA_CMCI_HANDLER, MCA_RESET, MCA_MCE_SCAN }; struct mca_summary { uint32_t errcnt; /* number of banks with valid errors */ int ripv; /* meaningful on #MC */ int eipv; /* meaningful on #MC */ bool_t uc; /* UC flag */ bool_t pcc; /* PCC flag */ bool_t recoverable; /* software error recoverable flag */ }; DECLARE_PER_CPU(struct mca_banks *, poll_bankmask); DECLARE_PER_CPU(struct mca_banks *, no_cmci_banks); DECLARE_PER_CPU(struct mca_banks *, mce_clear_banks); extern bool_t cmci_support; extern bool_t is_mc_panic; extern bool_t mce_broadcast; extern void mcheck_mca_clearbanks(struct mca_banks *); extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, struct mca_banks *, struct mca_summary *, struct mca_banks *); /* Register a callback to be made during bank telemetry logout. * This callback is only available to those machine check handlers * that call to the common mcheck_cmn_handler or who use the common * telemetry logout function mcheck_mca_logout in error polling. * * This can be used to collect additional information (typically non- * architectural) provided by newer CPU families/models without the need * to duplicate the whole handler resulting in various handlers each with * its own tweaks and bugs. The callback receives an struct mc_info pointer * which it can use with x86_mcinfo_add to add additional telemetry, * the current MCA bank number we are reading telemetry from, and the * MCi_STATUS value for that bank. 
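/*
 * Sketch of a bank-logout callback of the kind described in the comment
 * above: it receives the struct mc_info being assembled, the bank number
 * and that bank's MCi_STATUS value, and may append extra, typically
 * non-architectural, telemetry.  It is loosely modelled on
 * amd_f10_handler() in amd_f10.c; the bank number and the MSR captured
 * here are placeholders, not a real model-specific layout.
 */
static struct mcinfo_extended *
example_logout_callback(struct mc_info *mi, uint16_t bank, uint64_t status)
{
    struct mcinfo_extended *ext;

    if ( mi == NULL || bank != 4 || !(status & MCi_STATUS_VAL) )
        return NULL;

    ext = x86_mcinfo_reserve(mi, sizeof(*ext));
    if ( ext == NULL )
        return NULL;                        /* mc_info buffer is full */

    ext->common.type = MC_TYPE_EXTENDED;
    ext->common.size = sizeof(*ext);
    ext->mc_msrs = 1;
    ext->mc_msr[0].reg = MSR_IA32_MCG_CAP;  /* stand-in for a model MSR */
    ext->mc_msr[0].value = mca_rdmsr(MSR_IA32_MCG_CAP);

    return ext;
}
/* Hooked in once from vendor init code:
 *     x86_mce_callback_register(example_logout_callback);
 */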
*/ /* Register a handler for judging whether the bank need to be cleared */ typedef int (*mce_need_clearbank_t)(enum mca_source who, u64 status); extern void mce_need_clearbank_register(mce_need_clearbank_t); typedef struct mcinfo_extended *(*x86_mce_callback_t) (struct mc_info *, uint16_t, uint64_t); extern void x86_mce_callback_register(x86_mce_callback_t); void *x86_mcinfo_add(struct mc_info *mi, void *mcinfo); void *x86_mcinfo_reserve(struct mc_info *mi, int size); void x86_mcinfo_dump(struct mc_info *mi); static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: if (msr >= MSR_IA32_MC0_CTL2 && msr < MSR_IA32_MCx_CTL2(v->arch.vmce.mcg_cap & MCG_CAP_COUNT) ) return 1; break; case X86_VENDOR_AMD: switch (msr) { case MSR_F10_MC4_MISC1: case MSR_F10_MC4_MISC2: case MSR_F10_MC4_MISC3: return 1; } break; } return 0; } static inline int mce_bank_msr(const struct vcpu *v, uint32_t msr) { if ( (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(v->arch.vmce.mcg_cap & MCG_CAP_COUNT)) || mce_vendor_bank_msr(v, msr) ) return 1; return 0; } /* MC softirq */ void mce_handler_init(void); extern const struct mca_error_handler *mce_dhandlers; extern const struct mca_error_handler *mce_uhandlers; extern unsigned int mce_dhandler_num; extern unsigned int mce_uhandler_num; /* Fields are zero when not available */ struct mce { uint64_t status; uint64_t misc; uint64_t addr; uint64_t mcgstatus; uint64_t ip; uint64_t tsc; /* cpu time stamp counter */ uint64_t time; /* wall time_t when error was detected */ uint8_t cpuvendor; /* cpu vendor as encoded in system.h */ uint8_t inject_flags; /* software inject flags */ uint16_t pad; uint32_t cpuid; /* CPUID 1 EAX */ uint8_t cs; /* code segment */ uint8_t bank; /* machine check bank */ uint8_t cpu; /* cpu number; obsolete; use extcpu now */ uint8_t finished; /* entry is valid */ uint32_t extcpu; /* linux cpu number that detected the error */ uint32_t socketid; /* CPU socket ID */ uint32_t apicid; /* CPU initial apic ID */ uint64_t mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ }; extern int apei_write_mce(struct mce *m); #endif /* _MCE_H */ xen-4.4.0/xen/arch/x86/cpu/mcheck/mce_amd.c0000664000175000017500000001114412307313555016352 0ustar smbsmb/* * common MCA implementation for AMD CPUs. * Copyright (c) 2012 Advanced Micro Devices, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include "mce.h" #include "x86_mca.h" #include "mce_amd.h" #include "mcaction.h" #include "mce_quirks.h" #define ANY -1 static const struct mce_quirkdata mce_amd_quirks[] = { { 0xf /* cpu family */, ANY /* all models */, ANY /* all steppings */, MCEQUIRK_K8_GART }, { 0x10 /* cpu family */, ANY /* all models */, ANY /* all steppings */, MCEQUIRK_F10_GART }, }; /* Error Code Types */ enum mc_ec_type { MC_EC_TLB_TYPE = 0x0010, MC_EC_MEM_TYPE = 0x0100, MC_EC_BUS_TYPE = 0x0800, }; enum mc_ec_type mc_ec2type(uint16_t errorcode) { if ( errorcode & MC_EC_BUS_TYPE ) return MC_EC_BUS_TYPE; if ( errorcode & MC_EC_MEM_TYPE ) return MC_EC_MEM_TYPE; if ( errorcode & MC_EC_TLB_TYPE ) return MC_EC_TLB_TYPE; /* Unreached */ BUG(); return 0; } int mc_amd_recoverable_scan(uint64_t status) { int ret = 0; enum mc_ec_type ectype; uint16_t errorcode; if ( !(status & MCi_STATUS_UC) ) return 1; errorcode = status & (MCi_STATUS_MCA | MCi_STATUS_MSEC); ectype = mc_ec2type(errorcode); switch ( ectype ) { case MC_EC_BUS_TYPE: /* value in addr MSR is physical */ /* should run cpu offline action */ break; case MC_EC_MEM_TYPE: /* value in addr MSR is physical */ ret = 1; /* run memory page offline action */ break; case MC_EC_TLB_TYPE: /* value in addr MSR is virtual */ /* should run tlb flush action and retry */ break; } return ret; } int mc_amd_addrcheck(uint64_t status, uint64_t misc, int addrtype) { enum mc_ec_type ectype; uint16_t errorcode; errorcode = status & (MCi_STATUS_MCA | MCi_STATUS_MSEC); ectype = mc_ec2type(errorcode); switch (ectype) { case MC_EC_BUS_TYPE: /* value in addr MSR is physical */ case MC_EC_MEM_TYPE: /* value in addr MSR is physical */ return (addrtype == MC_ADDR_PHYSICAL); case MC_EC_TLB_TYPE: /* value in addr MSR is virtual */ return (addrtype == MC_ADDR_VIRTUAL); } /* unreached */ BUG(); return 0; } /* MC quirks */ enum mcequirk_amd_flags mcequirk_lookup_amd_quirkdata(struct cpuinfo_x86 *c) { int i; BUG_ON(c->x86_vendor != X86_VENDOR_AMD); for ( i = 0; i < ARRAY_SIZE(mce_amd_quirks); i++ ) { if ( c->x86 != mce_amd_quirks[i].cpu_family ) continue; if ( (mce_amd_quirks[i].cpu_model != ANY) && (mce_amd_quirks[i].cpu_model != c->x86_model) ) continue; if ( (mce_amd_quirks[i].cpu_stepping != ANY) && (mce_amd_quirks[i].cpu_stepping != c->x86_mask) ) continue; return mce_amd_quirks[i].quirk; } return 0; } int mcequirk_amd_apply(enum mcequirk_amd_flags flags) { uint64_t val; switch ( flags ) { case MCEQUIRK_K8_GART: /* * Enable error reporting for all errors except for GART * TBL walk error reporting, which trips off incorrectly * with AGP GART & 3ware & Cerberus. */ wrmsrl(MSR_IA32_MCx_CTL(4), ~(1ULL << 10)); wrmsrl(MSR_IA32_MCx_STATUS(4), 0ULL); break; case MCEQUIRK_F10_GART: if ( rdmsr_safe(MSR_AMD64_MCx_MASK(4), val) == 0 ) wrmsr_safe(MSR_AMD64_MCx_MASK(4), val | (1 << 10)); break; } return 0; } enum mcheck_type amd_mcheck_init(struct cpuinfo_x86 *ci) { enum mcheck_type rc = mcheck_none; switch ( ci->x86 ) { default: /* Assume that machine check support is available. * The minimum provided support is at least the K8. */ case 0xf: rc = amd_k8_mcheck_init(ci); break; case 0x10 ... 
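/*
 * Worked illustration of the error-code classification above:
 * mc_ec2type() tests the bus, memory and TLB type bits in that priority
 * order, so a composite error code carrying several type bits resolves
 * to the highest-priority category.  This is a stand-alone, host-side
 * restatement with made-up error codes (the real function BUG()s on a
 * code with no type bit set instead of returning 0).
 */
#include <assert.h>
#include <stdint.h>

#define EC_TLB 0x0010u   /* MC_EC_TLB_TYPE */
#define EC_MEM 0x0100u   /* MC_EC_MEM_TYPE */
#define EC_BUS 0x0800u   /* MC_EC_BUS_TYPE */

static unsigned int classify(uint16_t ec)
{
    if ( ec & EC_BUS )
        return EC_BUS;
    if ( ec & EC_MEM )
        return EC_MEM;
    if ( ec & EC_TLB )
        return EC_TLB;
    return 0;
}

int main(void)
{
    assert(classify(0x0010) == EC_TLB);   /* pure TLB error              */
    assert(classify(0x0110) == EC_MEM);   /* memory bit outranks TLB bit */
    assert(classify(0x0910) == EC_BUS);   /* bus bit outranks everything */
    return 0;
}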
0x17: rc = amd_f10_mcheck_init(ci); break; } return rc; } xen-4.4.0/xen/arch/x86/cpu/mcheck/non-fatal.c0000664000175000017500000000531112307313555016643 0ustar smbsmb/* * Non Fatal Machine Check Exception Reporting * * (C) Copyright 2002 Dave Jones. * * This file contains routines to check for non-fatal MCEs every 15s * */ #include #include #include #include #include #include #include #include #include #include #include #include #include "mce.h" #include "vmce.h" static struct timer mce_timer; #define MCE_PERIOD MILLISECS(8000) #define MCE_PERIOD_MIN MILLISECS(2000) #define MCE_PERIOD_MAX MILLISECS(16000) static uint64_t period = MCE_PERIOD; static int adjust = 0; static int variable_period = 1; static void mce_checkregs (void *info) { mctelem_cookie_t mctc; struct mca_summary bs; static uint64_t dumpcount = 0; mctc = mcheck_mca_logout(MCA_POLLER, __get_cpu_var(poll_bankmask), &bs, NULL); if (bs.errcnt && mctc != NULL) { adjust++; /* If Dom0 enabled the VIRQ_MCA event, then notify it. * Otherwise, if dom0 has had plenty of time to register * the virq handler but still hasn't then dump telemetry * to the Xen console. The call count may be incremented * on multiple cpus at once and is indicative only - just * a simple-minded attempt to avoid spamming the console * for corrected errors in early startup. */ if (dom0_vmce_enabled()) { mctelem_commit(mctc); send_global_virq(VIRQ_MCA); } else if (++dumpcount >= 10) { x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc)); mctelem_dismiss(mctc); } else { mctelem_dismiss(mctc); } } else if (mctc != NULL) { mctelem_dismiss(mctc); } } static void mce_work_fn(void *data) { on_each_cpu(mce_checkregs, NULL, 1); if (variable_period) { if (adjust) period /= (adjust + 1); else period *= 2; if (period > MCE_PERIOD_MAX) period = MCE_PERIOD_MAX; if (period < MCE_PERIOD_MIN) period = MCE_PERIOD_MIN; } set_timer(&mce_timer, NOW() + period); adjust = 0; } static int __init init_nonfatal_mce_checker(void) { struct cpuinfo_x86 *c = &boot_cpu_data; /* Check for MCE support */ if (mce_disabled || !mce_available(c)) return -ENODEV; if ( __get_cpu_var(poll_bankmask) == NULL ) return -EINVAL; /* * Check for non-fatal errors every MCE_RATE s */ switch (c->x86_vendor) { case X86_VENDOR_AMD: /* Assume we are on K8 or newer AMD CPU here */ amd_nonfatal_mcheck_init(c); break; case X86_VENDOR_INTEL: init_timer(&mce_timer, mce_work_fn, NULL, 0); set_timer(&mce_timer, NOW() + MCE_PERIOD); break; } printk(KERN_INFO "mcheck_poll: Machine check polling timer started.\n"); return 0; } __initcall(init_nonfatal_mce_checker); xen-4.4.0/xen/arch/x86/cpu/mcheck/mce_quirks.h0000664000175000017500000000273012307313555017135 0ustar smbsmb/* * MCA quirks * Copyright (c) 2009 Advanced Micro Devices, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
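/*
 * The poll interval logic in non-fatal.c above adapts to the error rate:
 * each pass shrinks the interval in proportion to how many CPUs reported
 * corrected errors, otherwise doubles it, clamped to the configured
 * bounds.  A stand-alone restatement of that single step; the constants
 * mirror MCE_PERIOD_MIN/MCE_PERIOD_MAX in milliseconds and next_period
 * is an illustrative name.
 */
#include <stdint.h>

#define PERIOD_MIN 2000u    /* ms, as MCE_PERIOD_MIN */
#define PERIOD_MAX 16000u   /* ms, as MCE_PERIOD_MAX */

static uint64_t next_period(uint64_t period, unsigned int errors_seen)
{
    if ( errors_seen )
        period /= errors_seen + 1;   /* errors seen: poll more often */
    else
        period *= 2;                 /* quiet pass: back off */

    if ( period > PERIOD_MAX )
        period = PERIOD_MAX;
    if ( period < PERIOD_MIN )
        period = PERIOD_MIN;

    return period;
}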
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _MCE_QUIRK_H #define _MCE_QUIRK_H #include struct mce_quirkdata { int32_t cpu_family; int16_t cpu_model; int16_t cpu_stepping; uint32_t quirk; }; /* use a binary flag if multiple quirks apply * to one CPU family/model */ enum mcequirk_amd_flags { MCEQUIRK_K8_GART = 2, MCEQUIRK_F10_GART }; enum mcequirk_intel_flags { MCEQUIRK_DUMMY = 0x1, /* nothing known yet */ }; enum mcequirk_amd_flags mcequirk_lookup_amd_quirkdata(struct cpuinfo_x86 *c); int mcequirk_amd_apply(enum mcequirk_amd_flags flags); enum mcequirk_intel_flags mcequirk_lookup_intel_quirkdata(struct cpuinfo_x86 *c); int mcequirk_intel_apply(enum mcequirk_intel_flags flags); #endif /* _MCE_QUIRK_H */ xen-4.4.0/xen/arch/x86/cpu/mcheck/vmce.h0000664000175000017500000000137012307313555015724 0ustar smbsmb#ifndef _MCHECK_VMCE_H #define _MCHECK_VMCE_H #include "x86_mca.h" int vmce_init(struct cpuinfo_x86 *c); #define dom0_vmce_enabled() (dom0 && dom0->max_vcpus && dom0->vcpu[0] \ && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) int unmmap_broken_page(struct domain *d, mfn_t mfn, unsigned long gfn); int vmce_intel_rdmsr(const struct vcpu *, uint32_t msr, uint64_t *val); int vmce_intel_wrmsr(struct vcpu *, uint32_t msr, uint64_t val); int vmce_amd_rdmsr(const struct vcpu *, uint32_t msr, uint64_t *val); int vmce_amd_wrmsr(struct vcpu *, uint32_t msr, uint64_t val); int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d, uint64_t gstatus); #define VMCE_INJECT_BROADCAST (-1) int inject_vmce(struct domain *d, int vcpu); #endif xen-4.4.0/xen/arch/x86/cpu/mcheck/barrier.c0000664000175000017500000000226212307313555016414 0ustar smbsmb#include "barrier.h" #include "util.h" #include "mce.h" void mce_barrier_init(struct mce_softirq_barrier *bar) { atomic_set(&bar->val, 0); atomic_set(&bar->ingen, 0); atomic_set(&bar->outgen, 0); } void mce_barrier_dec(struct mce_softirq_barrier *bar) { atomic_inc(&bar->outgen); wmb(); atomic_dec(&bar->val); } void mce_barrier_enter(struct mce_softirq_barrier *bar) { int gen; if (!mce_broadcast) return; atomic_inc(&bar->ingen); gen = atomic_read(&bar->outgen); mb(); atomic_inc(&bar->val); while ( atomic_read(&bar->val) != num_online_cpus() && atomic_read(&bar->outgen) == gen ) { mb(); mce_panic_check(); } } void mce_barrier_exit(struct mce_softirq_barrier *bar) { int gen; if ( !mce_broadcast ) return; atomic_inc(&bar->outgen); gen = atomic_read(&bar->ingen); mb(); atomic_dec(&bar->val); while ( atomic_read(&bar->val) != 0 && atomic_read(&bar->ingen) == gen ) { mb(); mce_panic_check(); } } void mce_barrier(struct mce_softirq_barrier *bar) { mce_barrier_enter(bar); mce_barrier_exit(bar); } xen-4.4.0/xen/arch/x86/cpu/mcheck/mctelem.h0000664000175000017500000000620212307313555016417 0ustar smbsmb/* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation, version 2 of the * License. */ #ifndef _MCTELEM_H #define _MCTELEM_H #include #include #include /* Helper functions used for collecting error telemetry. * * mctelem_init preallocates a number of data areas for use during * machine check data "logout". 
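/*
 * Sketch of how a struct mce_quirkdata table with ANY wildcards is
 * matched against the running CPU, mirroring
 * mcequirk_lookup_amd_quirkdata() in mce_amd.c (which is also where the
 * ANY definition lives); the table contents here are illustrative only.
 */
#define ANY -1

static const struct mce_quirkdata example_quirks[] = {
    { 0xf,  ANY, ANY, MCEQUIRK_K8_GART  },
    { 0x10, ANY, ANY, MCEQUIRK_F10_GART },
};

static uint32_t example_lookup_quirk(int family, int model, int stepping)
{
    unsigned int i;

    for ( i = 0; i < ARRAY_SIZE(example_quirks); i++ )
    {
        const struct mce_quirkdata *q = &example_quirks[i];

        if ( q->cpu_family != family )
            continue;
        if ( q->cpu_model != ANY && q->cpu_model != model )
            continue;
        if ( q->cpu_stepping != ANY && q->cpu_stepping != stepping )
            continue;
        return q->quirk;
    }
    return 0;    /* no quirk applies to this CPU */
}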
Two classes are distinguished - * urgent uses, intended for use from machine check exception handlers, * and non-urgent uses intended for use from error pollers. * Associated with each logout entry of whatever class is a data area * sized per the single argument to mctelem_init. mctelem_init should be * called from MCA init code before anybody has the chance to change the * machine check vector with mcheck_mca_logout or to use mcheck_mca_logout. * * To reserve an entry of a given class for use in logout, call * mctelem_reserve (or use the common handler functions which do all this * for you). This returns an opaque cookie, or NULL if no elements are * available. Elements are reserved with an atomic operation so no deadlock * will occur if, for example, a machine check exception interrupts a * scheduled error poll. The implementation will raid free non-urgent * entries if all urgent entries are in use when an urgent request is received. * Once an entry is reserved the caller must eventually perform exactly * one of two actions: mctelem_commit or mctelem_dismiss. * * On mctelem_commit the entry is appended to a processing list; mctelem_dismiss * frees the element without processing. After either call the cookie * must not be referenced again. * * To consume committed telemetry call mctelem_consume_oldest_begin * which will return a cookie referencing the oldest (first committed) * entry of the requested class. Access the associated data using * mctelem_dataptr and when finished use mctelem_consume_oldest_end - in the * begin .. end bracket you are guaranteed that the entry can't be freed * even if it is ack'd elsewhere). Once the ultimate consumer of the * telemetry has processed it to stable storage it should acknowledge * the telemetry quoting the cookie id, at which point we will free * the element from the processing list. */ typedef struct mctelem_cookie *mctelem_cookie_t; typedef enum mctelem_class { MC_URGENT, MC_NONURGENT } mctelem_class_t; extern void mctelem_init(int); extern mctelem_cookie_t mctelem_reserve(mctelem_class_t); extern void *mctelem_dataptr(mctelem_cookie_t); extern void mctelem_commit(mctelem_cookie_t); extern void mctelem_dismiss(mctelem_cookie_t); extern mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t); extern void mctelem_consume_oldest_end(mctelem_cookie_t); extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t); extern void mctelem_defer(mctelem_cookie_t); extern void mctelem_process_deferred(unsigned int, int (*)(mctelem_cookie_t)); int mctelem_has_deferred(unsigned int); #endif xen-4.4.0/xen/arch/x86/cpu/mcheck/mce_amd.h0000664000175000017500000000044112307313555016355 0ustar smbsmb#ifndef _MCHECK_AMD_H #define _MCHECK_AMD_H enum mcheck_type amd_k8_mcheck_init(struct cpuinfo_x86 *c); enum mcheck_type amd_f10_mcheck_init(struct cpuinfo_x86 *c); int mc_amd_recoverable_scan(uint64_t status); int mc_amd_addrcheck(uint64_t status, uint64_t misc, int addrtype); #endif xen-4.4.0/xen/arch/x86/cpu/mcheck/amd_f10.c0000664000175000017500000000744112307313555016201 0ustar smbsmb/* * MCA implementation for AMD Family10 CPUs * Copyright (c) 2007 Advanced Micro Devices, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
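/*
 * Usage sketch for the telemetry lifecycle documented above, using the
 * mctelem.h interfaces (plus x86_mcinfo_dump() from mce.h as a stand-in
 * consumer): a producer - poller or #MC handler - reserves, fills and
 * either commits or dismisses an entry; the consumer drains committed
 * entries oldest-first inside a begin/end bracket and finally frees each
 * entry by acking its cookie.  The function names and the "keep"
 * decision are illustrative.
 */
static void producer_example(bool_t keep)
{
    mctelem_cookie_t c = mctelem_reserve(MC_NONURGENT);

    if ( c == NULL )
        return;                 /* no free entry - the event is dropped */

    /* fill the data area returned by mctelem_dataptr(c), e.g. with a
     * struct mc_info assembled via the x86_mcinfo_* helpers */

    if ( keep )
        mctelem_commit(c);      /* queue for consumption; cookie now off-limits */
    else
        mctelem_dismiss(c);     /* return the entry to the free list */
}

static void consumer_example(void)
{
    mctelem_cookie_t c;

    while ( (c = mctelem_consume_oldest_begin(MC_NONURGENT)) != NULL )
    {
        /* within the begin/end bracket the entry cannot be freed */
        x86_mcinfo_dump(mctelem_dataptr(c));
        mctelem_consume_oldest_end(c);

        /* processed to stable storage - free it by acking the cookie */
        mctelem_ack(MC_NONURGENT, c);
    }
}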
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* K8 common MCA documentation published at * * AMD64 Architecture Programmer's Manual Volume 2: * System Programming * Publication # 24593 Revision: 3.12 * Issue Date: September 2006 */ /* Family10 MCA documentation published at * * BIOS and Kernel Developer's Guide * For AMD Family 10h Processors * Publication # 31116 Revision: 1.08 * Isse Date: June 10, 2007 */ #include #include #include #include "mce.h" #include "mce_quirks.h" #include "x86_mca.h" #include "mce_amd.h" #include "mcaction.h" static struct mcinfo_extended * amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status) { struct mcinfo_extended *mc_ext; /* Family 0x10 introduced additional MSR that belong to the * northbridge bank (4). */ if (mi == NULL || bank != 4) return NULL; if (!(status & MCi_STATUS_VAL)) return NULL; if (!(status & MCi_STATUS_MISCV)) return NULL; mc_ext = x86_mcinfo_reserve(mi, sizeof(*mc_ext)); if (!mc_ext) { mi->flags |= MCINFO_FLAGS_UNCOMPLETE; return NULL; } mc_ext->common.type = MC_TYPE_EXTENDED; mc_ext->common.size = sizeof(*mc_ext); mc_ext->mc_msrs = 3; mc_ext->mc_msr[0].reg = MSR_F10_MC4_MISC1; mc_ext->mc_msr[1].reg = MSR_F10_MC4_MISC2; mc_ext->mc_msr[2].reg = MSR_F10_MC4_MISC3; mc_ext->mc_msr[0].value = mca_rdmsr(MSR_F10_MC4_MISC1); mc_ext->mc_msr[1].value = mca_rdmsr(MSR_F10_MC4_MISC2); mc_ext->mc_msr[2].value = mca_rdmsr(MSR_F10_MC4_MISC3); return mc_ext; } /* AMD Family10 machine check */ enum mcheck_type amd_f10_mcheck_init(struct cpuinfo_x86 *c) { enum mcequirk_amd_flags quirkflag = mcequirk_lookup_amd_quirkdata(c); if (amd_k8_mcheck_init(c) == mcheck_none) return mcheck_none; if (quirkflag == MCEQUIRK_F10_GART) mcequirk_amd_apply(quirkflag); x86_mce_callback_register(amd_f10_handler); mce_recoverable_register(mc_amd_recoverable_scan); mce_register_addrcheck(mc_amd_addrcheck); return mcheck_amd_famXX; } /* amd specific MCA MSR */ int vmce_amd_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) { switch (msr) { case MSR_F10_MC4_MISC1: /* DRAM error type */ v->arch.vmce.bank[1].mci_misc = val; mce_printk(MCE_VERBOSE, "MCE: wr msr %#"PRIx64"\n", val); break; case MSR_F10_MC4_MISC2: /* Link error type */ case MSR_F10_MC4_MISC3: /* L3 cache error type */ /* ignore write: we do not emulate link and l3 cache errors * to the guest. */ mce_printk(MCE_VERBOSE, "MCE: wr msr %#"PRIx64"\n", val); break; default: return 0; } return 1; } int vmce_amd_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) { switch (msr) { case MSR_F10_MC4_MISC1: /* DRAM error type */ *val = v->arch.vmce.bank[1].mci_misc; mce_printk(MCE_VERBOSE, "MCE: rd msr %#"PRIx64"\n", *val); break; case MSR_F10_MC4_MISC2: /* Link error type */ case MSR_F10_MC4_MISC3: /* L3 cache error type */ /* we do not emulate link and l3 cache * errors to the guest. 
*/ *val = 0; mce_printk(MCE_VERBOSE, "MCE: rd msr %#"PRIx64"\n", *val); break; default: return 0; } return 1; } xen-4.4.0/xen/arch/x86/cpu/mcheck/util.h0000664000175000017500000000012312307313555015742 0ustar smbsmb#ifndef _MCHECK_UTIL_H #define _MCHECK_UTIL_H void mce_panic_check(void); #endif xen-4.4.0/xen/arch/x86/cpu/mcheck/barrier.h0000664000175000017500000000246712307313555016430 0ustar smbsmb#ifndef _MCHECK_BARRIER_H #define _MCHECK_BARRIER_H #include /* MCE handling */ struct mce_softirq_barrier { atomic_t val; atomic_t ingen; atomic_t outgen; }; /* * Initialize a barrier. Just set it to 0. */ void mce_barrier_init(struct mce_softirq_barrier *); /* * This function will need to be used when offlining a CPU in the * recovery actions. * * Decrement a barrier only. Needed for cases where the CPU * in question can't do it itself (e.g. it is being offlined). */ void mce_barrier_dec(struct mce_softirq_barrier *); /* * Increment the generation number and the value. The generation number * is incremented when entering a barrier. This way, it can be checked * on exit if a CPU is trying to re-enter the barrier. This can happen * if the first CPU to make it out immediately exits or re-enters, while * another CPU that is still in the loop becomes otherwise occupied * (e.g. it needs to service an interrupt, etc), missing the value * it's waiting for. * * These barrier functions should always be paired, so that the * counter value will reach 0 again after all CPUs have exited. */ void mce_barrier_enter(struct mce_softirq_barrier *); void mce_barrier_exit(struct mce_softirq_barrier *); void mce_barrier(struct mce_softirq_barrier *); #endif /* _MCHECK_BARRIER_H */ xen-4.4.0/xen/arch/x86/cpu/mcheck/amd_k8.c0000664000175000017500000000621712307313555016135 0ustar smbsmb/* * MCA implementation for AMD K8 CPUs * Copyright (c) 2007 Advanced Micro Devices, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
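/*
 * Usage sketch for the barrier pair documented above, following the
 * pattern of mcheck_cmn_handler()/mce_softirq() in mce.c: every CPU
 * involved in a broadcast #MC runs the same sequence of phases, and each
 * enter/exit pair keeps those phases in lockstep.  mce_barrier_init()
 * must have run once beforehand, and both calls are no-ops unless
 * mce_broadcast is set; the phase comments are placeholders.
 */
static struct mce_softirq_barrier example_bar; /* mce_barrier_init(&example_bar) once */

static void rendezvous_example(void)
{
    mce_barrier_enter(&example_bar);
    /* phase 1: every CPU scans and logs its own banks */
    mce_barrier_exit(&example_bar);

    mce_barrier_enter(&example_bar);
    /* phase 2: a single elected CPU evaluates the combined result */
    mce_barrier_exit(&example_bar);
}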
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* K8 common MCA documentation published at * * AMD64 Architecture Programmer's Manual Volume 2: * System Programming * Publication # 24593 Revision: 3.12 * Issue Date: September 2006 * * URL: * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf */ /* The related documentation for K8 Revisions A - E is: * * BIOS and Kernel Developer's Guide for * AMD Athlon 64 and AMD Opteron Processors * Publication # 26094 Revision: 3.30 * Issue Date: February 2006 * * URL: * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF */ /* The related documentation for K8 Revisions F - G is: * * BIOS and Kernel Developer's Guide for * AMD NPT Family 0Fh Processors * Publication # 32559 Revision: 3.04 * Issue Date: December 2006 * * URL: * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf */ #include #include #include #include #include #include #include #include #include #include #include #include #include "mce.h" #include "mce_quirks.h" /* Machine Check Handler for AMD K8 family series */ static void k8_machine_check(struct cpu_user_regs *regs, long error_code) { mcheck_cmn_handler(regs, error_code, mca_allbanks, __get_cpu_var(mce_clear_banks)); } static int k8_need_clearbank_scan(enum mca_source who, uint64_t status) { if (who != MCA_MCE_SCAN) return 1; /* * For fatal error, it shouldn't be cleared so that sticky bank * have a chance to be handled after reboot by polling. */ if ((status & MCi_STATUS_UC) && (status & MCi_STATUS_PCC)) return 0; return 1; } /* AMD K8 machine check */ enum mcheck_type amd_k8_mcheck_init(struct cpuinfo_x86 *c) { uint32_t i; enum mcequirk_amd_flags quirkflag; quirkflag = mcequirk_lookup_amd_quirkdata(c); mce_handler_init(); x86_mce_vector_register(k8_machine_check); mce_need_clearbank_register(k8_need_clearbank_scan); for (i = 0; i < nr_mce_banks; i++) { if (quirkflag == MCEQUIRK_K8_GART && i == 4) { mcequirk_amd_apply(quirkflag); } else { /* Enable error reporting of all errors */ wrmsrl(MSR_IA32_MCx_CTL(i), 0xffffffffffffffffULL); wrmsrl(MSR_IA32_MCx_STATUS(i), 0x0ULL); } } return mcheck_amd_k8; } xen-4.4.0/xen/arch/x86/cpu/mcheck/mce.c0000664000175000017500000014103212307313555015531 0ustar smbsmb/* * mce.c - x86 Machine Check Exception Reporting * (c) 2002 Alan Cox , Dave Jones */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* for do_mca */ #include #include #include #include #include "mce.h" #include "barrier.h" #include "mcaction.h" #include "util.h" #include "vmce.h" bool_t __read_mostly mce_disabled; invbool_param("mce", mce_disabled); bool_t __read_mostly mce_broadcast = 0; bool_t is_mc_panic; unsigned int __read_mostly nr_mce_banks; unsigned int __read_mostly firstbank; uint8_t __read_mostly cmci_apic_vector; DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, poll_bankmask); DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, no_cmci_banks); DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_clear_banks); static void intpose_init(void); static void mcinfo_clear(struct mc_info *); struct mca_banks *mca_allbanks; #define SEG_PL(segsel) ((segsel) & 0x3) #define _MC_MSRINJ_F_REQ_HWCR_WREN (1 << 16) #if 0 static int x86_mcerr(const char *msg, int err) { gdprintk(XENLOG_WARNING, "x86_mcerr: %s, returning %d\n", msg != 
NULL ? msg : "", err); return err; } #else #define x86_mcerr(msg, err) (err) #endif int mce_verbosity; static void __init mce_set_verbosity(char *str) { if (strcmp("verbose", str) == 0) mce_verbosity = MCE_VERBOSE; else printk(KERN_DEBUG "Machine Check verbosity level %s not recognised" "use mce_verbosity=verbose", str); } custom_param("mce_verbosity", mce_set_verbosity); /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code) { printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); } static x86_mce_vector_t _machine_check_vector = unexpected_machine_check; void x86_mce_vector_register(x86_mce_vector_t hdlr) { _machine_check_vector = hdlr; wmb(); } /* Call the installed machine check handler for this CPU setup. */ void machine_check_vector(struct cpu_user_regs *regs, long error_code) { _machine_check_vector(regs, error_code); } /* Init machine check callback handler * It is used to collect additional information provided by newer * CPU families/models without the need to duplicate the whole handler. * This avoids having many handlers doing almost nearly the same and each * with its own tweaks ands bugs. */ static x86_mce_callback_t mc_callback_bank_extended = NULL; void x86_mce_callback_register(x86_mce_callback_t cbfunc) { mc_callback_bank_extended = cbfunc; } /* Machine check recoverable judgement callback handler * It is used to judge whether an UC error is recoverable by software */ static mce_recoverable_t mc_recoverable_scan = NULL; void mce_recoverable_register(mce_recoverable_t cbfunc) { mc_recoverable_scan = cbfunc; } struct mca_banks *mcabanks_alloc(void) { struct mca_banks *mb; mb = xmalloc(struct mca_banks); if (!mb) return NULL; mb->bank_map = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_mce_banks)); if (!mb->bank_map) { xfree(mb); return NULL; } mb->num = nr_mce_banks; return mb; } void mcabanks_free(struct mca_banks *banks) { if (banks == NULL) return; if (banks->bank_map) xfree(banks->bank_map); xfree(banks); } static void mcabank_clear(int banknum) { uint64_t status; status = mca_rdmsr(MSR_IA32_MCx_STATUS(banknum)); if (status & MCi_STATUS_ADDRV) mca_wrmsr(MSR_IA32_MCx_ADDR(banknum), 0x0ULL); if (status & MCi_STATUS_MISCV) mca_wrmsr(MSR_IA32_MCx_MISC(banknum), 0x0ULL); mca_wrmsr(MSR_IA32_MCx_STATUS(banknum), 0x0ULL); } /* Judging whether to Clear Machine Check error bank callback handler * According to Intel latest MCA OS Recovery Writer's Guide, * whether the error MCA bank needs to be cleared is decided by the mca_source * and MCi_status bit value. */ static mce_need_clearbank_t mc_need_clearbank_scan = NULL; void mce_need_clearbank_register(mce_need_clearbank_t cbfunc) { mc_need_clearbank_scan = cbfunc; } static struct mce_softirq_barrier mce_inside_bar, mce_severity_bar; static struct mce_softirq_barrier mce_trap_bar; /* * mce_logout_lock should only be used in the trap handler, * while MCIP has not been cleared yet in the global status * register. Other use is not safe, since an MCE trap can * happen at any moment, which would cause lock recursion. 
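/*
 * Sketch of how the struct mca_banks bitmaps allocated above are
 * typically built and consulted (compare mca_cap_init() and
 * set_poll_bankmask() later in this file); it assumes the mcabanks_set /
 * mcabanks_clear / mcabanks_test helpers declared alongside
 * struct mca_banks, and the "skip bank 0" choice is just an example.
 */
static struct mca_banks *build_example_mask(void)
{
    struct mca_banks *mb = mcabanks_alloc();   /* sized from nr_mce_banks */
    unsigned int i;

    if ( mb == NULL )
        return NULL;

    for ( i = 0; i < nr_mce_banks; i++ )
        mcabanks_set(i, mb);                   /* start with every bank */

    mcabanks_clear(0, mb);                     /* e.g. exclude bank 0 (cf. mce_firstbank) */

    return mb;                                 /* release with mcabanks_free() */
}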
*/ static DEFINE_SPINLOCK(mce_logout_lock); static atomic_t severity_cpu = ATOMIC_INIT(-1); static atomic_t found_error = ATOMIC_INIT(0); static cpumask_t mce_fatal_cpus; const struct mca_error_handler *__read_mostly mce_dhandlers; const struct mca_error_handler *__read_mostly mce_uhandlers; unsigned int __read_mostly mce_dhandler_num; unsigned int __read_mostly mce_uhandler_num; static void mca_init_bank(enum mca_source who, struct mc_info *mi, int bank) { struct mcinfo_bank *mib; if (!mi) return; mib = x86_mcinfo_reserve(mi, sizeof(*mib)); if (!mib) { mi->flags |= MCINFO_FLAGS_UNCOMPLETE; return; } mib->mc_status = mca_rdmsr(MSR_IA32_MCx_STATUS(bank)); mib->common.type = MC_TYPE_BANK; mib->common.size = sizeof (struct mcinfo_bank); mib->mc_bank = bank; if (mib->mc_status & MCi_STATUS_MISCV) mib->mc_misc = mca_rdmsr(MSR_IA32_MCx_MISC(bank)); if (mib->mc_status & MCi_STATUS_ADDRV) mib->mc_addr = mca_rdmsr(MSR_IA32_MCx_ADDR(bank)); if ((mib->mc_status & MCi_STATUS_MISCV) && (mib->mc_status & MCi_STATUS_ADDRV) && (mc_check_addr(mib->mc_status, mib->mc_misc, MC_ADDR_PHYSICAL)) && (who == MCA_POLLER || who == MCA_CMCI_HANDLER) && (mfn_valid(paddr_to_pfn(mib->mc_addr)))) { struct domain *d; d = maddr_get_owner(mib->mc_addr); if (d) mib->mc_domid = d->domain_id; } if (who == MCA_CMCI_HANDLER) { mib->mc_ctrl2 = mca_rdmsr(MSR_IA32_MC0_CTL2 + bank); rdtscll(mib->mc_tsc); } } static int mca_init_global(uint32_t flags, struct mcinfo_global *mig) { uint64_t status; int cpu_nr; struct vcpu *v = current; struct domain *d; /* Set global information */ mig->common.type = MC_TYPE_GLOBAL; mig->common.size = sizeof (struct mcinfo_global); status = mca_rdmsr(MSR_IA32_MCG_STATUS); mig->mc_gstatus = status; mig->mc_domid = mig->mc_vcpuid = -1; mig->mc_flags = flags; cpu_nr = smp_processor_id(); /* Retrieve detector information */ x86_mc_get_cpu_info(cpu_nr, &mig->mc_socketid, &mig->mc_coreid, &mig->mc_core_threadid, &mig->mc_apicid, NULL, NULL, NULL); /* This is really meaningless */ if (v != NULL && ((d = v->domain) != NULL)) { mig->mc_domid = d->domain_id; mig->mc_vcpuid = v->vcpu_id; } else { mig->mc_domid = -1; mig->mc_vcpuid = -1; } return 0; } /* Utility function to perform MCA bank telemetry readout and to push that * telemetry towards an interested dom0 for logging and diagnosis. * The caller - #MC handler or MCA poll function - must arrange that we * do not migrate cpus. */ /* XXFM Could add overflow counting? */ /* Add out_param clear_bank for Machine Check Handler Caller. * For Intel latest CPU, whether to clear the error bank status needs to * be judged by the callback function defined above. */ mctelem_cookie_t mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask, struct mca_summary *sp, struct mca_banks *clear_bank) { uint64_t gstatus, status; struct mcinfo_global *mig = NULL; /* on stack */ mctelem_cookie_t mctc = NULL; bool_t uc = 0, pcc = 0, recover = 1, need_clear = 1; uint32_t mc_flags = 0; struct mc_info *mci = NULL; mctelem_class_t which = MC_URGENT; /* XXXgcc */ int errcnt = 0; int i; gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS); switch (who) { case MCA_MCE_SCAN: mc_flags = MC_FLAG_MCE; which = MC_URGENT; break; case MCA_POLLER: case MCA_RESET: mc_flags = MC_FLAG_POLLED; which = MC_NONURGENT; break; case MCA_CMCI_HANDLER: mc_flags = MC_FLAG_CMCI; which = MC_NONURGENT; break; default: BUG(); } /* If no mc_recovery_scan callback handler registered, * this error is not recoverable */ recover = (mc_recoverable_scan) ? 
1 : 0; for (i = 0; i < nr_mce_banks; i++) { /* Skip bank if corresponding bit in bankmask is clear */ if (!mcabanks_test(i, bankmask)) continue; status = mca_rdmsr(MSR_IA32_MCx_STATUS(i)); if (!(status & MCi_STATUS_VAL)) continue; /* this bank has no valid telemetry */ /* For Intel Latest CPU CMCI/MCE Handler caller, we need to * decide whether to clear bank by MCi_STATUS bit value such as * OVER/UC/EN/PCC/S/AR */ if ( mc_need_clearbank_scan ) need_clear = mc_need_clearbank_scan(who, status); /* If this is the first bank with valid MCA DATA, then * try to reserve an entry from the urgent/nonurgent queue * depending on whether we are called from an exception or * a poller; this can fail (for example dom0 may not * yet have consumed past telemetry). */ if (errcnt++ == 0) { if ( (mctc = mctelem_reserve(which)) != NULL ) { mci = mctelem_dataptr(mctc); mcinfo_clear(mci); mig = x86_mcinfo_reserve(mci, sizeof(*mig)); /* mc_info should at least hold up the global information */ ASSERT(mig); mca_init_global(mc_flags, mig); /* A hook here to get global extended msrs */ { if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) intel_get_extended_msrs(mig, mci); } } } /* flag for uncorrected errors */ if (!uc && ((status & MCi_STATUS_UC) != 0)) uc = 1; /* flag processor context corrupt */ if (!pcc && ((status & MCi_STATUS_PCC) != 0)) pcc = 1; if (recover && uc) /* uc = 1, recover = 1, we need not panic. */ recover = mc_recoverable_scan(status); mca_init_bank(who, mci, i); if (mc_callback_bank_extended) mc_callback_bank_extended(mci, i, status); /* By default, need_clear = 1 */ if (who != MCA_MCE_SCAN && need_clear) /* Clear bank */ mcabank_clear(i); else if ( who == MCA_MCE_SCAN && need_clear) mcabanks_set(i, clear_bank); wmb(); } if (mig && errcnt > 0) { if (pcc) mig->mc_flags |= MC_FLAG_UNCORRECTABLE; else if (uc) mig->mc_flags |= MC_FLAG_RECOVERABLE; else mig->mc_flags |= MC_FLAG_CORRECTABLE; } if (sp) { sp->errcnt = errcnt; sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0; sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0; sp->uc = uc; sp->pcc = pcc; sp->recoverable = recover; } return mci != NULL ? mctc : NULL; /* may be NULL */ } static void mce_spin_lock(spinlock_t *lk) { while (!spin_trylock(lk)) { cpu_relax(); mce_panic_check(); } } static void mce_spin_unlock(spinlock_t *lk) { spin_unlock(lk); } static enum mce_result mce_action(struct cpu_user_regs *regs, mctelem_cookie_t mctc); /* * Return: * -1: if system can't be recovered * 0: Continue to next step */ static int mce_urgent_action(struct cpu_user_regs *regs, mctelem_cookie_t mctc) { uint64_t gstatus; if ( mctc == NULL) return 0; gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS); /* * FIXME: When RIPV = EIPV = 0, it's a little bit tricky. It may be an * asynchronic error, currently we have no way to precisely locate * whether the error occur at guest or hypervisor. * To avoid handling error in wrong way, we treat it as unrecovered. * * Another unrecovered case is RIPV = 0 while in hypervisor * since Xen is not pre-emptible. */ if ( !(gstatus & MCG_STATUS_RIPV) && (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)) ) return -1; return mce_action(regs, mctc) == MCER_RESET ? -1 : 0; } /* Shared #MC handler. 
*/ void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code, struct mca_banks *bankmask, struct mca_banks *clear_bank) { uint64_t gstatus; mctelem_cookie_t mctc = NULL; struct mca_summary bs; mce_spin_lock(&mce_logout_lock); if (clear_bank != NULL) { memset( clear_bank->bank_map, 0x0, sizeof(long) * BITS_TO_LONGS(clear_bank->num)); } mctc = mcheck_mca_logout(MCA_MCE_SCAN, bankmask, &bs, clear_bank); if (bs.errcnt) { /* * Uncorrected errors must be dealt with in softirq context. */ if (bs.uc || bs.pcc) { add_taint(TAINT_MACHINE_CHECK); if (mctc != NULL) mctelem_defer(mctc); /* * For PCC=1 and can't be recovered, context is lost, so * reboot now without clearing the banks, and deal with * the telemetry after reboot (the MSRs are sticky) */ if (bs.pcc || !bs.recoverable) cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus); } else { if (mctc != NULL) mctelem_commit(mctc); } atomic_set(&found_error, 1); /* The last CPU will be take check/clean-up etc */ atomic_set(&severity_cpu, smp_processor_id()); mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n", *((unsigned long*)clear_bank), smp_processor_id()); if (clear_bank != NULL) mcheck_mca_clearbanks(clear_bank); } else { if (mctc != NULL) mctelem_dismiss(mctc); } mce_spin_unlock(&mce_logout_lock); mce_barrier_enter(&mce_trap_bar); if ( mctc != NULL && mce_urgent_action(regs, mctc)) cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus); mce_barrier_exit(&mce_trap_bar); /* * Wait until everybody has processed the trap. */ mce_barrier_enter(&mce_trap_bar); if (atomic_read(&severity_cpu) == smp_processor_id()) { /* According to SDM, if no error bank found on any cpus, * something unexpected happening, we can't do any * recovery job but to reset the system. */ if (atomic_read(&found_error) == 0) mc_panic("MCE: No CPU found valid MCE, need reset"); if (!cpumask_empty(&mce_fatal_cpus)) { char *ebufp, ebuf[96] = "MCE: Fatal error happened on CPUs "; ebufp = ebuf + strlen(ebuf); cpumask_scnprintf(ebufp, 95 - strlen(ebuf), &mce_fatal_cpus); mc_panic(ebuf); } atomic_set(&found_error, 0); } mce_barrier_exit(&mce_trap_bar); /* Clear flags after above fatal check */ mce_barrier_enter(&mce_trap_bar); gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS); if ((gstatus & MCG_STATUS_MCIP) != 0) { mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step"); mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP); } mce_barrier_exit(&mce_trap_bar); raise_softirq(MACHINE_CHECK_SOFTIRQ); } void mcheck_mca_clearbanks(struct mca_banks *bankmask) { int i; for (i = 0; i < nr_mce_banks; i++) { if (!mcabanks_test(i, bankmask)) continue; mcabank_clear(i); } } /*check the existence of Machine Check*/ int mce_available(struct cpuinfo_x86 *c) { return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } /* * Check if bank 0 is usable for MCE. It isn't for Intel P6 family * before model 0x1a. */ unsigned int mce_firstbank(struct cpuinfo_x86 *c) { if (c->x86 == 6) { if (c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a) return 1; } return 0; } int show_mca_info(int inited, struct cpuinfo_x86 *c) { static enum mcheck_type g_type = mcheck_unset; if (inited != g_type) { char prefix[20]; static const char *const type_str[] = { [mcheck_amd_famXX] = "AMD", [mcheck_amd_k8] = "AMD K8", [mcheck_intel] = "Intel" }; snprintf(prefix, ARRAY_SIZE(prefix), g_type != mcheck_unset ? 
XENLOG_WARNING "CPU%i: " : XENLOG_INFO, smp_processor_id()); BUG_ON(inited >= ARRAY_SIZE(type_str)); switch (inited) { default: printk("%s%s machine check reporting enabled\n", prefix, type_str[inited]); break; case mcheck_amd_famXX: printk("%s%s Fam%xh machine check reporting enabled\n", prefix, type_str[inited], c->x86); break; case mcheck_none: printk("%sNo machine check initialization\n", prefix); break; } g_type = inited; } return 0; } static void set_poll_bankmask(struct cpuinfo_x86 *c) { int cpu = smp_processor_id(); struct mca_banks *mb; mb = per_cpu(poll_bankmask, cpu); BUG_ON(!mb); if (cmci_support && !mce_disabled) { mb->num = per_cpu(no_cmci_banks, cpu)->num; bitmap_copy(mb->bank_map, per_cpu(no_cmci_banks, cpu)->bank_map, nr_mce_banks); } else { bitmap_copy(mb->bank_map, mca_allbanks->bank_map, nr_mce_banks); if (mce_firstbank(c)) mcabanks_clear(0, mb); } } /* The perbank ctl/status init is platform specific because of AMD's quirk */ int mca_cap_init(void) { uint64_t msr_content; rdmsrl(MSR_IA32_MCG_CAP, msr_content); if (msr_content & MCG_CTL_P) /* Control register present ? */ wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL); if (nr_mce_banks && (msr_content & MCG_CAP_COUNT) != nr_mce_banks) { dprintk(XENLOG_WARNING, "Different bank number on cpu %x\n", smp_processor_id()); return -ENODEV; } nr_mce_banks = msr_content & MCG_CAP_COUNT; if (!nr_mce_banks) { printk(XENLOG_INFO "CPU%u: No MCE banks present. " "Machine check support disabled\n", smp_processor_id()); return -ENODEV; } /* mcabanks_alloc depends on nr_mce_banks */ if (!mca_allbanks) { int i; mca_allbanks = mcabanks_alloc(); for ( i = 0; i < nr_mce_banks; i++) mcabanks_set(i, mca_allbanks); } return mca_allbanks ? 0:-ENOMEM; } static void cpu_bank_free(unsigned int cpu) { struct mca_banks *poll = per_cpu(poll_bankmask, cpu); struct mca_banks *clr = per_cpu(mce_clear_banks, cpu); mcabanks_free(poll); mcabanks_free(clr); } static int cpu_bank_alloc(unsigned int cpu) { struct mca_banks *poll = mcabanks_alloc(); struct mca_banks *clr = mcabanks_alloc(); if ( !poll || !clr ) { mcabanks_free(poll); mcabanks_free(clr); return -ENOMEM; } per_cpu(poll_bankmask, cpu) = poll; per_cpu(mce_clear_banks, cpu) = clr; return 0; } static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; switch ( action ) { case CPU_UP_PREPARE: rc = cpu_bank_alloc(cpu); break; case CPU_UP_CANCELED: case CPU_DEAD: cpu_bank_free(cpu); break; default: break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c, bool_t bsp) { enum mcheck_type inited = mcheck_none; if (mce_disabled == 1) { dprintk(XENLOG_INFO, "MCE support disabled by bootparam\n"); return; } if (!mce_available(c)) { printk(XENLOG_INFO "CPU%i: No machine check support available\n", smp_processor_id()); return; } /*Hardware Enable */ if (mca_cap_init()) return; /* Early MCE initialisation for BSP. 
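/*
 * The cpu_callback notifier above is the usual Xen pattern for keeping
 * per-CPU state in step with CPU hotplug: allocate in CPU_UP_PREPARE
 * (where a failure can still veto the bring-up) and free in
 * CPU_UP_CANCELED / CPU_DEAD.  A minimal sketch for a hypothetical
 * per-CPU buffer, not part of the real MCE code:
 */
static DEFINE_PER_CPU(unsigned char *, example_buf);

static int example_cpu_callback(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        per_cpu(example_buf, cpu) = xzalloc_array(unsigned char, 64);
        if ( per_cpu(example_buf, cpu) == NULL )
            rc = -ENOMEM;              /* vetoes the CPU coming online */
        break;
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        xfree(per_cpu(example_buf, cpu));
        per_cpu(example_buf, cpu) = NULL;
        break;
    }

    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block example_cpu_nfb = {
    .notifier_call = example_cpu_callback
};
/* registered once from the BSP: register_cpu_notifier(&example_cpu_nfb); */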
*/ if ( bsp && cpu_bank_alloc(smp_processor_id()) ) BUG(); switch (c->x86_vendor) { case X86_VENDOR_AMD: inited = amd_mcheck_init(c); break; case X86_VENDOR_INTEL: switch (c->x86) { case 6: case 15: inited = intel_mcheck_init(c, bsp); break; } break; default: break; } show_mca_info(inited, c); if (inited == mcheck_none || inited == mcheck_unset) goto out; intpose_init(); mctelem_init(sizeof(struct mc_info)); /* Turn on MCE now */ set_in_cr4(X86_CR4_MCE); if ( bsp ) register_cpu_notifier(&cpu_nfb); set_poll_bankmask(c); return; out: if ( bsp ) { cpu_bank_free(smp_processor_id()); mcabanks_free(mca_allbanks); mca_allbanks = NULL; } } static void mcinfo_clear(struct mc_info *mi) { memset(mi, 0, sizeof(struct mc_info)); x86_mcinfo_nentries(mi) = 0; } void *x86_mcinfo_reserve(struct mc_info *mi, int size) { int i; unsigned long end1, end2; struct mcinfo_common *mic_base, *mic_index; mic_index = mic_base = x86_mcinfo_first(mi); /* go to first free entry */ for (i = 0; i < x86_mcinfo_nentries(mi); i++) { mic_index = x86_mcinfo_next(mic_index); } /* check if there is enough size */ end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info)); end2 = (unsigned long)((uint8_t *)mic_index + size); if (end1 < end2) { mce_printk(MCE_CRITICAL, "mcinfo_add: No space left in mc_info\n"); return NULL; } /* there's enough space. add entry. */ x86_mcinfo_nentries(mi)++; return memset(mic_index, 0, size); } void *x86_mcinfo_add(struct mc_info *mi, void *mcinfo) { struct mcinfo_common *mic, *buf; mic = (struct mcinfo_common *)mcinfo; buf = x86_mcinfo_reserve(mi, mic->size); if ( !buf ) mce_printk(MCE_CRITICAL, "mcinfo_add: No space left in mc_info\n"); else memcpy(buf, mic, mic->size); return buf; } static void x86_mcinfo_apei_save( struct mcinfo_global *mc_global, struct mcinfo_bank *mc_bank) { struct mce m; memset(&m, 0, sizeof(struct mce)); m.cpu = mc_global->mc_coreid; m.cpuvendor = boot_cpu_data.x86_vendor; m.cpuid = cpuid_eax(1); m.socketid = mc_global->mc_socketid; m.apicid = mc_global->mc_apicid; m.mcgstatus = mc_global->mc_gstatus; m.status = mc_bank->mc_status; m.misc = mc_bank->mc_misc; m.addr = mc_bank->mc_addr; m.bank = mc_bank->mc_bank; apei_write_mce(&m); } /* Dump machine check information in a format, * mcelog can parse. This is used only when * Dom0 does not take the notification. 
*/ void x86_mcinfo_dump(struct mc_info *mi) { struct mcinfo_common *mic = NULL; struct mcinfo_global *mc_global; struct mcinfo_bank *mc_bank; /* first print the global info */ x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL); if (mic == NULL) return; mc_global = (struct mcinfo_global *)mic; if (mc_global->mc_flags & MC_FLAG_MCE) { printk(XENLOG_WARNING "CPU%d: Machine Check Exception: %16"PRIx64"\n", mc_global->mc_coreid, mc_global->mc_gstatus); } else if (mc_global->mc_flags & MC_FLAG_CMCI) { printk(XENLOG_WARNING "CMCI occurred on CPU %d.\n", mc_global->mc_coreid); } else if (mc_global->mc_flags & MC_FLAG_POLLED) { printk(XENLOG_WARNING "POLLED occurred on CPU %d.\n", mc_global->mc_coreid); } /* then the bank information */ x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */ do { if (mic == NULL) return; if (mic->type != MC_TYPE_BANK) goto next; mc_bank = (struct mcinfo_bank *)mic; printk(XENLOG_WARNING "Bank %d: %16"PRIx64, mc_bank->mc_bank, mc_bank->mc_status); if (mc_bank->mc_status & MCi_STATUS_MISCV) printk("[%16"PRIx64"]", mc_bank->mc_misc); if (mc_bank->mc_status & MCi_STATUS_ADDRV) printk(" at %16"PRIx64, mc_bank->mc_addr); printk("\n"); if (is_mc_panic) x86_mcinfo_apei_save(mc_global, mc_bank); next: mic = x86_mcinfo_next(mic); /* next entry */ if ((mic == NULL) || (mic->size == 0)) break; } while (1); } static void do_mc_get_cpu_info(void *v) { int cpu = smp_processor_id(); int cindex, cpn; struct cpuinfo_x86 *c; xen_mc_logical_cpu_t *log_cpus, *xcp; uint32_t junk, ebx; log_cpus = v; c = &cpu_data[cpu]; cindex = 0; cpn = cpu - 1; /* * Deal with sparse masks, condensed into a contig array. */ while (cpn >= 0) { if (cpu_online(cpn)) cindex++; cpn--; } xcp = &log_cpus[cindex]; c = &cpu_data[cpu]; xcp->mc_cpunr = cpu; x86_mc_get_cpu_info(cpu, &xcp->mc_chipid, &xcp->mc_coreid, &xcp->mc_threadid, &xcp->mc_apicid, &xcp->mc_ncores, &xcp->mc_ncores_active, &xcp->mc_nthreads); xcp->mc_cpuid_level = c->cpuid_level; xcp->mc_family = c->x86; xcp->mc_vendor = c->x86_vendor; xcp->mc_model = c->x86_model; xcp->mc_step = c->x86_mask; xcp->mc_cache_size = c->x86_cache_size; xcp->mc_cache_alignment = c->x86_cache_alignment; memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid); memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid); memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps); /* * This part needs to run on the CPU itself. 
*/ xcp->mc_nmsrvals = __MC_NMSRS; xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP; rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value); if (c->cpuid_level >= 1) { cpuid(1, &junk, &ebx, &junk, &junk); xcp->mc_clusterid = (ebx >> 24) & 0xff; } else xcp->mc_clusterid = hard_smp_processor_id(); } void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid, uint16_t *threadid, uint32_t *apicid, unsigned *ncores, unsigned *ncores_active, unsigned *nthreads) { struct cpuinfo_x86 *c; *apicid = cpu_physical_id(cpu); c = &cpu_data[cpu]; if (c->apicid == BAD_APICID) { *chipid = cpu; *coreid = 0; *threadid = 0; if (ncores != NULL) *ncores = 1; if (ncores_active != NULL) *ncores_active = 1; if (nthreads != NULL) *nthreads = 1; } else { *chipid = c->phys_proc_id; if (c->x86_max_cores > 1) *coreid = c->cpu_core_id; else *coreid = 0; *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1); if (ncores != NULL) *ncores = c->x86_max_cores; if (ncores_active != NULL) *ncores_active = c->booted_cores; if (nthreads != NULL) *nthreads = c->x86_num_siblings; } } #define INTPOSE_NENT 50 static struct intpose_ent { unsigned int cpu_nr; uint64_t msr; uint64_t val; } intpose_arr[INTPOSE_NENT]; static void intpose_init(void) { static int done; int i; if (done++ > 0) return; for (i = 0; i < INTPOSE_NENT; i++) { intpose_arr[i].cpu_nr = -1; } } struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr, uint64_t *valp) { int i; for (i = 0; i < INTPOSE_NENT; i++) { if (intpose_arr[i].cpu_nr == cpu_nr && intpose_arr[i].msr == msr) { if (valp != NULL) *valp = intpose_arr[i].val; return &intpose_arr[i]; } } return NULL; } static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val) { struct intpose_ent *ent; int i; if ((ent = intpose_lookup(cpu_nr, msr, NULL)) != NULL) { ent->val = val; return; } for (i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++) { if (ent->cpu_nr == -1) { ent->cpu_nr = cpu_nr; ent->msr = msr; ent->val = val; return; } } printk("intpose_add: interpose array full - request dropped\n"); } bool_t intpose_inval(unsigned int cpu_nr, uint64_t msr) { struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL); if ( !ent ) return 0; ent->cpu_nr = -1; return 1; } #define IS_MCA_BANKREG(r) \ ((r) >= MSR_IA32_MC0_CTL && \ (r) <= MSR_IA32_MCx_MISC(nr_mce_banks - 1) && \ ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* excludes MCi_CTL */ static int x86_mc_msrinject_verify(struct xen_mc_msrinject *mci) { struct cpuinfo_x86 *c; int i, errs = 0; c = &cpu_data[smp_processor_id()]; for (i = 0; i < mci->mcinj_count; i++) { uint64_t reg = mci->mcinj_msr[i].reg; const char *reason = NULL; if (IS_MCA_BANKREG(reg)) { if (c->x86_vendor == X86_VENDOR_AMD) { /* On AMD we can set MCi_STATUS_WREN in the * HWCR MSR to allow non-zero writes to banks * MSRs not to #GP. The injector in dom0 * should set that bit, but we detect when it * is necessary and set it as a courtesy to * avoid #GP in the hypervisor. */ mci->mcinj_flags |= _MC_MSRINJ_F_REQ_HWCR_WREN; continue; } else { /* No alternative but to interpose, so require * that the injector specified as such. 
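/*
 * Sketch of the interposition round trip implemented by the table above
 * and used by XEN_MC_msrinject with MC_MSRINJ_F_INTERPOSE: a value is
 * parked for (cpu, MSR), mca_rdmsr() then returns it without touching
 * hardware, and the next mca_wrmsr() to that MSR simply drops the entry
 * (the hardware write is skipped because an entry was present).  The MSR
 * and value are illustrative; intpose_add() is local to this file.
 */
static void intpose_roundtrip_example(void)
{
    unsigned int cpu = smp_processor_id();
    uint64_t fake = 0xdead0000beef0000ULL;

    intpose_add(cpu, MSR_IA32_MCG_STATUS, fake);

    /* the read is satisfied from the interpose table */
    ASSERT(mca_rdmsr(MSR_IA32_MCG_STATUS) == fake);

    /* invalidates the entry; no wrmsrl() reaches the hardware here */
    mca_wrmsr(MSR_IA32_MCG_STATUS, 0);

    /* subsequent reads go to the real MSR again */
}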
*/ if (!(mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE)) { reason = "must specify interposition"; } } } else { switch (reg) { /* MSRs acceptable on all x86 cpus */ case MSR_IA32_MCG_STATUS: break; case MSR_F10_MC4_MISC1: case MSR_F10_MC4_MISC2: case MSR_F10_MC4_MISC3: if (c->x86_vendor != X86_VENDOR_AMD) reason = "only supported on AMD"; else if (c->x86 < 0x10) reason = "only supported on AMD Fam10h+"; break; /* MSRs that the HV will take care of */ case MSR_K8_HWCR: if (c->x86_vendor == X86_VENDOR_AMD) reason = "HV will operate HWCR"; else reason ="only supported on AMD"; break; default: reason = "not a recognized MCA MSR"; break; } } if (reason != NULL) { printk("HV MSR INJECT ERROR: MSR %#Lx %s\n", (unsigned long long)mci->mcinj_msr[i].reg, reason); errs++; } } return !errs; } static uint64_t x86_mc_hwcr_wren(void) { uint64_t old; rdmsrl(MSR_K8_HWCR, old); if (!(old & K8_HWCR_MCi_STATUS_WREN)) { uint64_t new = old | K8_HWCR_MCi_STATUS_WREN; wrmsrl(MSR_K8_HWCR, new); } return old; } static void x86_mc_hwcr_wren_restore(uint64_t hwcr) { if (!(hwcr & K8_HWCR_MCi_STATUS_WREN)) wrmsrl(MSR_K8_HWCR, hwcr); } static void x86_mc_msrinject(void *data) { struct xen_mc_msrinject *mci = data; struct mcinfo_msr *msr; uint64_t hwcr = 0; int intpose; int i; if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN) hwcr = x86_mc_hwcr_wren(); intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0; for (i = 0, msr = &mci->mcinj_msr[0]; i < mci->mcinj_count; i++, msr++) { printk("HV MSR INJECT (%s) target %u actual %u MSR %#Lx <-- %#Lx\n", intpose ? "interpose" : "hardware", mci->mcinj_cpunr, smp_processor_id(), (unsigned long long)msr->reg, (unsigned long long)msr->value); if (intpose) intpose_add(mci->mcinj_cpunr, msr->reg, msr->value); else wrmsrl(msr->reg, msr->value); } if (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN) x86_mc_hwcr_wren_restore(hwcr); } /*ARGSUSED*/ static void x86_mc_mceinject(void *data) { printk("Simulating #MC on cpu %d\n", smp_processor_id()); __asm__ __volatile__("int $0x12"); } #if BITS_PER_LONG == 64 #define ID2COOKIE(id) ((mctelem_cookie_t)(id)) #define COOKIE2ID(c) ((uint64_t)(c)) #elif defined(BITS_PER_LONG) #error BITS_PER_LONG has unexpected value #else #error BITS_PER_LONG definition absent #endif # include # define xen_mcinfo_msr mcinfo_msr CHECK_mcinfo_msr; # undef xen_mcinfo_msr # undef CHECK_mcinfo_msr # define CHECK_mcinfo_msr struct mcinfo_msr # define xen_mcinfo_common mcinfo_common CHECK_mcinfo_common; # undef xen_mcinfo_common # undef CHECK_mcinfo_common # define CHECK_mcinfo_common struct mcinfo_common CHECK_FIELD_(struct, mc_fetch, flags); CHECK_FIELD_(struct, mc_fetch, fetch_id); # define CHECK_compat_mc_fetch struct mc_fetch CHECK_FIELD_(struct, mc_physcpuinfo, ncpus); # define CHECK_compat_mc_physcpuinfo struct mc_physcpuinfo #define CHECK_compat_mc_inject_v2 struct mc_inject_v2 CHECK_mc; # undef CHECK_compat_mc_fetch # undef CHECK_compat_mc_physcpuinfo # define xen_mc_info mc_info CHECK_mc_info; # undef xen_mc_info # define xen_mcinfo_global mcinfo_global CHECK_mcinfo_global; # undef xen_mcinfo_global # define xen_mcinfo_bank mcinfo_bank CHECK_mcinfo_bank; # undef xen_mcinfo_bank # define xen_mcinfo_extended mcinfo_extended CHECK_mcinfo_extended; # undef xen_mcinfo_extended # define xen_mcinfo_recovery mcinfo_recovery # define xen_cpu_offline_action cpu_offline_action # define xen_page_offline_action page_offline_action CHECK_mcinfo_recovery; # undef xen_cpu_offline_action # undef xen_page_offline_action # undef xen_mcinfo_recovery /* Machine Check Architecture 
Hypercall */ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_mc_t) u_xen_mc) { long ret = 0; struct xen_mc curop, *op = &curop; struct vcpu *v = current; union { struct xen_mc_fetch *nat; struct compat_mc_fetch *cmp; } mc_fetch; union { struct xen_mc_physcpuinfo *nat; struct compat_mc_physcpuinfo *cmp; } mc_physcpuinfo; uint32_t flags, cmdflags; int nlcpu; xen_mc_logical_cpu_t *log_cpus = NULL; mctelem_cookie_t mctc; mctelem_class_t which; unsigned int target; struct xen_mc_msrinject *mc_msrinject; struct xen_mc_mceinject *mc_mceinject; ret = xsm_do_mca(XSM_PRIV); if ( ret ) return x86_mcerr(NULL, ret); if ( copy_from_guest(op, u_xen_mc, 1) ) return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT); if ( op->interface_version != XEN_MCA_INTERFACE_VERSION ) return x86_mcerr("do_mca: interface version mismatch", -EACCES); switch (op->cmd) { case XEN_MC_fetch: mc_fetch.nat = &op->u.mc_fetch; cmdflags = mc_fetch.nat->flags; switch (cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT)) { case XEN_MC_NONURGENT: which = MC_NONURGENT; break; case XEN_MC_URGENT: which = MC_URGENT; break; default: return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL); } flags = XEN_MC_OK; if (cmdflags & XEN_MC_ACK) { mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id); mctelem_ack(which, cookie); } else { if (!is_pv_32on64_vcpu(v) ? guest_handle_is_null(mc_fetch.nat->data) : compat_handle_is_null(mc_fetch.cmp->data)) return x86_mcerr("do_mca fetch: guest buffer " "invalid", -EINVAL); if ((mctc = mctelem_consume_oldest_begin(which))) { struct mc_info *mcip = mctelem_dataptr(mctc); if (!is_pv_32on64_vcpu(v) ? copy_to_guest(mc_fetch.nat->data, mcip, 1) : copy_to_compat(mc_fetch.cmp->data, mcip, 1)) { ret = -EFAULT; flags |= XEN_MC_FETCHFAILED; mc_fetch.nat->fetch_id = 0; } else { mc_fetch.nat->fetch_id = COOKIE2ID(mctc); } mctelem_consume_oldest_end(mctc); } else { /* There is no data */ flags |= XEN_MC_NODATA; mc_fetch.nat->fetch_id = 0; } mc_fetch.nat->flags = flags; if (copy_to_guest(u_xen_mc, op, 1) != 0) ret = -EFAULT; } break; case XEN_MC_notifydomain: return x86_mcerr("do_mca notify unsupported", -EINVAL); case XEN_MC_physcpuinfo: mc_physcpuinfo.nat = &op->u.mc_physcpuinfo; nlcpu = num_online_cpus(); if (!is_pv_32on64_vcpu(v) ? !guest_handle_is_null(mc_physcpuinfo.nat->info) : !compat_handle_is_null(mc_physcpuinfo.cmp->info)) { if (mc_physcpuinfo.nat->ncpus <= 0) return x86_mcerr("do_mca cpuinfo: ncpus <= 0", -EINVAL); nlcpu = min(nlcpu, (int)mc_physcpuinfo.nat->ncpus); log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu); if (log_cpus == NULL) return x86_mcerr("do_mca cpuinfo", -ENOMEM); on_each_cpu(do_mc_get_cpu_info, log_cpus, 1); if (!is_pv_32on64_vcpu(v) ? 
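/*
 * Sketch of the dom0-side loop that the XEN_MC_fetch / XEN_MC_ACK logic
 * above serves.  The flags/fetch_id protocol is the one implemented
 * here; the hypercall wrapper name (mca_hypercall) and the buffer
 * plumbing are illustrative guest-side assumptions, not Xen code.
 */
static void fetch_and_ack_example(void)
{
    struct xen_mc mc;
    static struct mc_info buf;

    for ( ; ; )
    {
        memset(&mc, 0, sizeof(mc));
        mc.cmd = XEN_MC_fetch;
        mc.interface_version = XEN_MCA_INTERFACE_VERSION;
        mc.u.mc_fetch.flags = XEN_MC_NONURGENT;
        set_xen_guest_handle(mc.u.mc_fetch.data, &buf);

        if ( mca_hypercall(&mc) != 0 ||
             (mc.u.mc_fetch.flags & XEN_MC_NODATA) )
            break;                       /* nothing (more) to consume */

        /* ... parse and log buf, a struct mc_info ... */

        /* free the entry in Xen once it is safely recorded; fetch_id
         * still carries the cookie returned by the fetch */
        mc.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
        mca_hypercall(&mc);
    }
}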
copy_to_guest(mc_physcpuinfo.nat->info, log_cpus, nlcpu) : copy_to_compat(mc_physcpuinfo.cmp->info, log_cpus, nlcpu)) ret = -EFAULT; xfree(log_cpus); } mc_physcpuinfo.nat->ncpus = nlcpu; if (copy_to_guest(u_xen_mc, op, 1)) return x86_mcerr("do_mca cpuinfo", -EFAULT); break; case XEN_MC_msrinject: if (nr_mce_banks == 0) return x86_mcerr("do_mca inject", -ENODEV); mc_msrinject = &op->u.mc_msrinject; target = mc_msrinject->mcinj_cpunr; if (target >= nr_cpu_ids) return x86_mcerr("do_mca inject: bad target", -EINVAL); if (!cpu_online(target)) return x86_mcerr("do_mca inject: target offline", -EINVAL); if (mc_msrinject->mcinj_count == 0) return 0; if (!x86_mc_msrinject_verify(mc_msrinject)) return x86_mcerr("do_mca inject: illegal MSR", -EINVAL); add_taint(TAINT_ERROR_INJECT); on_selected_cpus(cpumask_of(target), x86_mc_msrinject, mc_msrinject, 1); break; case XEN_MC_mceinject: if (nr_mce_banks == 0) return x86_mcerr("do_mca #MC", -ENODEV); mc_mceinject = &op->u.mc_mceinject; target = mc_mceinject->mceinj_cpunr; if (target >= nr_cpu_ids) return x86_mcerr("do_mca #MC: bad target", -EINVAL); if (!cpu_online(target)) return x86_mcerr("do_mca #MC: target offline", -EINVAL); add_taint(TAINT_ERROR_INJECT); if ( mce_broadcast ) on_each_cpu(x86_mc_mceinject, mc_mceinject, 1); else on_selected_cpus(cpumask_of(target), x86_mc_mceinject, mc_mceinject, 1); break; case XEN_MC_inject_v2: { const cpumask_t *cpumap; cpumask_var_t cmv; if (nr_mce_banks == 0) return x86_mcerr("do_mca #MC", -ENODEV); if ( op->u.mc_inject_v2.flags & XEN_MC_INJECT_CPU_BROADCAST ) cpumap = &cpu_online_map; else { ret = xenctl_bitmap_to_cpumask(&cmv, &op->u.mc_inject_v2.cpumap); if ( ret ) break; cpumap = cmv; if ( !cpumask_intersects(cpumap, &cpu_online_map) ) { free_cpumask_var(cmv); ret = x86_mcerr("No online CPU passed\n", -EINVAL); break; } if ( !cpumask_subset(cpumap, &cpu_online_map) ) dprintk(XENLOG_INFO, "Not all required CPUs are online\n"); } switch (op->u.mc_inject_v2.flags & XEN_MC_INJECT_TYPE_MASK) { case XEN_MC_INJECT_TYPE_MCE: if ( mce_broadcast && !cpumask_equal(cpumap, &cpu_online_map) ) printk("Not trigger MCE on all CPUs, may HANG!\n"); on_selected_cpus(cpumap, x86_mc_mceinject, NULL, 1); break; case XEN_MC_INJECT_TYPE_CMCI: if ( !cmci_apic_vector ) ret = x86_mcerr( "No CMCI supported in platform\n", -EINVAL); else { if ( cpumask_test_cpu(smp_processor_id(), cpumap) ) send_IPI_self(cmci_apic_vector); send_IPI_mask(cpumap, cmci_apic_vector); } break; default: ret = x86_mcerr("Wrong mca type\n", -EINVAL); break; } if (cpumap != &cpu_online_map) free_cpumask_var(cmv); break; } default: return x86_mcerr("do_mca: bad command", -EINVAL); } return ret; } int mcinfo_dumpped; static int x86_mcinfo_dump_panic(mctelem_cookie_t mctc) { struct mc_info *mcip = mctelem_dataptr(mctc); x86_mcinfo_dump(mcip); mcinfo_dumpped++; return 0; } /* XXX shall we dump commited mc_info?? */ static void mc_panic_dump(void) { int cpu; dprintk(XENLOG_ERR, "Begin dump mc_info\n"); for_each_online_cpu(cpu) mctelem_process_deferred(cpu, x86_mcinfo_dump_panic); dprintk(XENLOG_ERR, "End dump mc_info, %x mcinfo dumped\n", mcinfo_dumpped); } void mc_panic(char *s) { is_mc_panic = 1; console_force_unlock(); printk("Fatal machine check: %s\n", s); printk("\n" "****************************************\n" "\n" " The processor has reported a hardware error which cannot\n" " be recovered from. 
Xen will now reboot the machine.\n"); mc_panic_dump(); panic("HARDWARE ERROR"); } /* Machine Check owner judge algorithm: * When error happens, all cpus serially read its msr banks. * The first CPU who fetches the error bank's info will clear * this bank. Later readers can't get any information again. * The first CPU is the actual mce_owner * * For Fatal (pcc=1) error, it might cause machine crash * before we're able to log. For avoiding log missing, we adopt two * round scanning: * Round1: simply scan. If found pcc = 1 or ripv = 0, simply reset. * All MCE banks are sticky, when boot up, MCE polling mechanism * will help to collect and log those MCE errors. * Round2: Do all MCE processing logic as normal. */ /* Maybe called in MCE context, no lock, no printk */ static enum mce_result mce_action(struct cpu_user_regs *regs, mctelem_cookie_t mctc) { struct mc_info *local_mi; enum mce_result bank_result = MCER_NOERROR; enum mce_result worst_result = MCER_NOERROR; struct mcinfo_common *mic = NULL; struct mca_binfo binfo; const struct mca_error_handler *handlers = mce_dhandlers; unsigned int i, handler_num = mce_dhandler_num; /* When in mce context, regs is valid */ if (regs) { handler_num = mce_uhandler_num; handlers = mce_uhandlers; } /* At least a default handler should be registerd */ ASSERT(handler_num); local_mi = (struct mc_info*)mctelem_dataptr(mctc); x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL); if (mic == NULL) { printk(KERN_ERR "MCE: get local buffer entry failed\n "); return MCER_CONTINUE; } memset(&binfo, 0, sizeof(binfo)); binfo.mig = (struct mcinfo_global *)mic; binfo.mi = local_mi; /* Processing bank information */ x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK); for ( ; bank_result != MCER_RESET && mic && mic->size; mic = x86_mcinfo_next(mic) ) { if (mic->type != MC_TYPE_BANK) { continue; } binfo.mib = (struct mcinfo_bank*)mic; binfo.bank = binfo.mib->mc_bank; bank_result = MCER_NOERROR; for ( i = 0; i < handler_num; i++ ) { if (handlers[i].owned_error(binfo.mib->mc_status)) { handlers[i].recovery_handler(&binfo, &bank_result, regs); if (worst_result < bank_result) worst_result = bank_result; break; } } ASSERT(i != handler_num); } return worst_result; } /* * Called from mctelem_process_deferred. Return 1 if the telemetry * should be committed for dom0 consumption, 0 if it should be * dismissed. */ static int mce_delayed_action(mctelem_cookie_t mctc) { enum mce_result result; int ret = 0; result = mce_action(NULL, mctc); switch (result) { case MCER_RESET: dprintk(XENLOG_ERR, "MCE delayed action failed\n"); is_mc_panic = 1; x86_mcinfo_dump(mctelem_dataptr(mctc)); panic("MCE: Software recovery failed for the UCR"); break; case MCER_RECOVERED: dprintk(XENLOG_INFO, "MCE: Error is successfully recovered\n"); ret = 1; break; case MCER_CONTINUE: dprintk(XENLOG_INFO, "MCE: Error can't be recovered, " "system is tainted\n"); x86_mcinfo_dump(mctelem_dataptr(mctc)); ret = 1; break; default: ret = 0; break; } return ret; } /* Softirq Handler for this MCE# processing */ static void mce_softirq(void) { int cpu = smp_processor_id(); unsigned int workcpu; mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu); mce_barrier_enter(&mce_inside_bar); /* * Everybody is here. Now let's see who gets to do the * recovery work. Right now we just see if there's a CPU * that did not have any problems, and pick that one. * * First, just set a default value: the last CPU who reaches this * will overwrite the value and become the default. 
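* Then, between the severity barriers, any CPU that has no deferred
* telemetry writes its own id again, so a CPU that logged nothing is
* preferred as the handler; only if every CPU has deferred errors does
* the last default writer end up doing the work.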
*/ atomic_set(&severity_cpu, cpu); mce_barrier_enter(&mce_severity_bar); if (!mctelem_has_deferred(cpu)) atomic_set(&severity_cpu, cpu); mce_barrier_exit(&mce_severity_bar); /* We choose severity_cpu for further processing */ if (atomic_read(&severity_cpu) == cpu) { mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu); /* Step1: Fill DOM0 LOG buffer, vMCE injection buffer and * vMCE MSRs virtualization buffer */ for_each_online_cpu(workcpu) { mctelem_process_deferred(workcpu, mce_delayed_action); } /* Step2: Send Log to DOM0 through vIRQ */ if (dom0_vmce_enabled()) { mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n"); send_global_virq(VIRQ_MCA); } } mce_barrier_exit(&mce_inside_bar); } /* Machine Check owner judge algorithm: * When error happens, all cpus serially read its msr banks. * The first CPU who fetches the error bank's info will clear * this bank. Later readers can't get any infor again. * The first CPU is the actual mce_owner * * For Fatal (pcc=1) error, it might cause machine crash * before we're able to log. For avoiding log missing, we adopt two * round scanning: * Round1: simply scan. If found pcc = 1 or ripv = 0, simply reset. * All MCE banks are sticky, when boot up, MCE polling mechanism * will help to collect and log those MCE errors. * Round2: Do all MCE processing logic as normal. */ void mce_handler_init(void) { if (smp_processor_id() != 0) return; /* callback register, do we really need so many callback? */ /* mce handler data initialization */ mce_barrier_init(&mce_inside_bar); mce_barrier_init(&mce_severity_bar); mce_barrier_init(&mce_trap_bar); spin_lock_init(&mce_logout_lock); open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq); } xen-4.4.0/xen/arch/x86/cpu/mcheck/mcaction.c0000664000175000017500000001004712307313555016563 0ustar smbsmb#include #include #include "mcaction.h" #include "vmce.h" #include "mce.h" static struct mcinfo_recovery * mci_action_add_pageoffline(int bank, struct mc_info *mi, uint64_t mfn, uint32_t status) { struct mcinfo_recovery *rec; if (!mi) return NULL; rec = x86_mcinfo_reserve(mi, sizeof(*rec)); if (!rec) { mi->flags |= MCINFO_FLAGS_UNCOMPLETE; return NULL; } rec->common.type = MC_TYPE_RECOVERY; rec->common.size = sizeof(*rec); rec->mc_bank = bank; rec->action_types = MC_ACTION_PAGE_OFFLINE; rec->action_info.page_retire.mfn = mfn; rec->action_info.page_retire.status = status; return rec; } mce_check_addr_t mc_check_addr = NULL; void mce_register_addrcheck(mce_check_addr_t cbfunc) { mc_check_addr = cbfunc; } void mc_memerr_dhandler(struct mca_binfo *binfo, enum mce_result *result, struct cpu_user_regs *regs) { struct mcinfo_bank *bank = binfo->mib; struct mcinfo_global *global = binfo->mig; struct domain *d; unsigned long mfn, gfn; uint32_t status; int vmce_vcpuid; if (!mc_check_addr(bank->mc_status, bank->mc_misc, MC_ADDR_PHYSICAL)) { dprintk(XENLOG_WARNING, "No physical address provided for memory error\n"); return; } mfn = bank->mc_addr >> PAGE_SHIFT; if (offline_page(mfn, 1, &status)) { dprintk(XENLOG_WARNING, "Failed to offline page %lx for MCE error\n", mfn); return; } mci_action_add_pageoffline(binfo->bank, binfo->mi, mfn, status); /* This is free page */ if (status & PG_OFFLINE_OFFLINED) *result = MCER_RECOVERED; else if (status & PG_OFFLINE_AGAIN) *result = MCER_CONTINUE; else if (status & PG_OFFLINE_PENDING) { /* This page has owner */ if (status & PG_OFFLINE_OWNED) { bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT; mce_printk(MCE_QUIET, "MCE: This error page is ownded" " by DOM %d\n", bank->mc_domid); /* XXX: 
Cannot handle shared pages yet * (this should identify all domains and gfn mapping to * the mfn in question) */ BUG_ON( bank->mc_domid == DOMID_COW ); if ( bank->mc_domid != DOMID_XEN ) { d = get_domain_by_id(bank->mc_domid); ASSERT(d); gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT); if ( unmmap_broken_page(d, _mfn(mfn), gfn) ) { printk("Unmap broken memory %lx for DOM%d failed\n", mfn, d->domain_id); goto vmce_failed; } bank->mc_addr = gfn << PAGE_SHIFT | (bank->mc_addr & (PAGE_SIZE -1 )); if ( fill_vmsr_data(bank, d, global->mc_gstatus) == -1 ) { mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d " "failed\n", bank->mc_domid); goto vmce_failed; } if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) vmce_vcpuid = VMCE_INJECT_BROADCAST; else vmce_vcpuid = global->mc_vcpuid; /* We will inject vMCE to DOMU*/ if ( inject_vmce(d, vmce_vcpuid) < 0 ) { mce_printk(MCE_QUIET, "inject vMCE to DOM%d" " failed\n", d->domain_id); goto vmce_failed; } /* Impacted domain go on with domain's recovery job * if the domain has its own MCA handler. * For xen, it has contained the error and finished * its own recovery job. */ *result = MCER_RECOVERED; put_domain(d); return; vmce_failed: put_domain(d); domain_crash(d); } } } } xen-4.4.0/xen/arch/x86/cpu/mcheck/mce_intel.c0000664000175000017500000006127312307313555016734 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mce.h" #include "x86_mca.h" #include "barrier.h" #include "util.h" #include "vmce.h" #include "mcaction.h" static DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_banks_owned); bool_t __read_mostly cmci_support = 0; static bool_t __read_mostly ser_support = 0; static bool_t __read_mostly mce_force_broadcast; boolean_param("mce_fb", mce_force_broadcast); static int __read_mostly nr_intel_ext_msrs; /* Intel SDM define bit15~bit0 of IA32_MCi_STATUS as the MC error code */ #define INTEL_MCCOD_MASK 0xFFFF /* * Currently Intel SDM define 2 kinds of srao errors: * 1). Memory scrubbing error, error code = 0xC0 ~ 0xCF * 2). L3 explicit writeback error, error code = 0x17A */ #define INTEL_SRAO_MEM_SCRUB 0xC0 ... 0xCF #define INTEL_SRAO_L3_EWB 0x17A /* * Currently Intel SDM define 2 kinds of srar errors: * 1). Data Load error, error code = 0x134 * 2). Instruction Fetch error, error code = 0x150 */ #define INTEL_SRAR_DATA_LOAD 0x134 #define INTEL_SRAR_INSTR_FETCH 0x150 #ifdef CONFIG_X86_MCE_THERMAL static void intel_thermal_interrupt(struct cpu_user_regs *regs) { uint64_t msr_content; unsigned int cpu = smp_processor_id(); static DEFINE_PER_CPU(s_time_t, next); ack_APIC_irq(); if (NOW() < per_cpu(next, cpu)) return; per_cpu(next, cpu) = NOW() + MILLISECS(5000); rdmsrl(MSR_IA32_THERM_STATUS, msr_content); if (msr_content & 0x1) { printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu); printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", cpu); add_taint(TAINT_MACHINE_CHECK); } else { printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); } } /* Thermal monitoring depends on APIC, ACPI and clock modulation */ static int intel_thermal_supported(struct cpuinfo_x86 *c) { if (!cpu_has_apic) return 0; if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) return 0; return 1; } static u32 __read_mostly lvtthmr_init; static void __init mcheck_intel_therm_init(void) { /* * This function is only called on boot CPU. 
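* It runs from intel_mcheck_init() on the BSP, before any AP is brought
* online, so APIC_LVTTHMR still holds whatever the BIOS programmed.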
Save the init thermal * LVT value on BSP and use that value to restore APs' thermal LVT * entry BIOS programmed later */ if (intel_thermal_supported(&boot_cpu_data)) lvtthmr_init = apic_read(APIC_LVTTHMR); } /* P4/Xeon Thermal regulation detect and init */ static void intel_init_thermal(struct cpuinfo_x86 *c) { uint64_t msr_content; uint32_t val; int tm2 = 0; unsigned int cpu = smp_processor_id(); static uint8_t thermal_apic_vector; if (!intel_thermal_supported(c)) return; /* -ENODEV */ /* first check if its enabled already, in which case there might * be some SMM goo which handles it, so we can't even put a handler * since it might be delivered via SMI already -zwanem. */ rdmsrl(MSR_IA32_MISC_ENABLE, msr_content); val = lvtthmr_init; /* * The initial value of thermal LVT entries on all APs always reads * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI * sequence to them and LVT registers are reset to 0s except for * the mask bits which are set to 1s when APs receive INIT IPI. * If BIOS takes over the thermal interrupt and sets its interrupt * delivery mode to SMI (not fixed), it restores the value that the * BIOS has programmed on AP based on BSP's info we saved (since BIOS * is required to set the same value for all threads/cores). */ if ((val & APIC_MODE_MASK) != APIC_DM_FIXED || (val & APIC_VECTOR_MASK) > 0xf) apic_write(APIC_LVTTHMR, val); if ((msr_content & (1ULL<<3)) && (val & APIC_MODE_MASK) == APIC_DM_SMI) { if (c == &boot_cpu_data) printk(KERN_DEBUG "Thermal monitoring handled by SMI\n"); return; /* -EBUSY */ } if (cpu_has(c, X86_FEATURE_TM2) && (msr_content & (1ULL << 13))) tm2 = 1; /* check whether a vector already exists, temporarily masked? */ if (val & APIC_VECTOR_MASK) { if (c == &boot_cpu_data) printk(KERN_DEBUG "Thermal LVT vector (%#x) already installed\n", val & APIC_VECTOR_MASK); return; /* -EBUSY */ } alloc_direct_apic_vector(&thermal_apic_vector, intel_thermal_interrupt); /* The temperature transition interrupt handler setup */ val = thermal_apic_vector; /* our delivery vector */ val |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ apic_write_around(APIC_LVTTHMR, val); rdmsrl(MSR_IA32_THERM_INTERRUPT, msr_content); wrmsrl(MSR_IA32_THERM_INTERRUPT, msr_content | 0x03); rdmsrl(MSR_IA32_MISC_ENABLE, msr_content); wrmsrl(MSR_IA32_MISC_ENABLE, msr_content | (1ULL<<3)); apic_write_around(APIC_LVTTHMR, val & ~APIC_LVT_MASKED); if (opt_cpu_info) printk(KERN_INFO "CPU%u: Thermal monitoring enabled (%s)\n", cpu, tm2 ? 
"TM2" : "TM1"); return; } #endif /* CONFIG_X86_MCE_THERMAL */ /* Intel MCE handler */ static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr) { if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr) && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs ) { ext->mc_msr[ext->mc_msrs].reg = msr; rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value); ++ext->mc_msrs; } } struct mcinfo_extended * intel_get_extended_msrs(struct mcinfo_global *mig, struct mc_info *mi) { struct mcinfo_extended *mc_ext; int i; /* * According to spec, processor _support_ 64 bit will always * have MSR beyond IA32_MCG_MISC */ if (!mi|| !mig || nr_intel_ext_msrs == 0 || !(mig->mc_gstatus & MCG_STATUS_EIPV)) return NULL; mc_ext = x86_mcinfo_reserve(mi, sizeof(*mc_ext)); if (!mc_ext) { mi->flags |= MCINFO_FLAGS_UNCOMPLETE; return NULL; } /* this function will called when CAP(9).MCG_EXT_P = 1 */ mc_ext->common.type = MC_TYPE_EXTENDED; mc_ext->common.size = sizeof(struct mcinfo_extended); for (i = MSR_IA32_MCG_EAX; i <= MSR_IA32_MCG_MISC; i++) intel_get_extended_msr(mc_ext, i); for (i = MSR_IA32_MCG_R8; i <= MSR_IA32_MCG_R15; i++) intel_get_extended_msr(mc_ext, i); return mc_ext; } enum intel_mce_type { intel_mce_invalid, intel_mce_fatal, intel_mce_corrected, intel_mce_ucr_ucna, intel_mce_ucr_srao, intel_mce_ucr_srar, }; static enum intel_mce_type intel_check_mce_type(uint64_t status) { if (!(status & MCi_STATUS_VAL)) return intel_mce_invalid; if (status & MCi_STATUS_PCC) return intel_mce_fatal; /* Corrected error? */ if (!(status & MCi_STATUS_UC)) return intel_mce_corrected; if (!ser_support) return intel_mce_fatal; if (status & MCi_STATUS_S) { if (status & MCi_STATUS_AR) { if (status & MCi_STATUS_OVER) return intel_mce_fatal; else return intel_mce_ucr_srar; } else return intel_mce_ucr_srao; } else return intel_mce_ucr_ucna; /* Any type not included abovoe ? 
*/ return intel_mce_fatal; } static void intel_memerr_dhandler( struct mca_binfo *binfo, enum mce_result *result, struct cpu_user_regs *regs) { mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n"); mc_memerr_dhandler(binfo, result, regs); } static int intel_srar_check(uint64_t status) { return ( intel_check_mce_type(status) == intel_mce_ucr_srar ); } static int intel_checkaddr(uint64_t status, uint64_t misc, int addrtype) { if (!(status & MCi_STATUS_ADDRV) || !(status & MCi_STATUS_MISCV) || ((misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) ) { /* addr is virtual */ return (addrtype == MC_ADDR_VIRTUAL); } return (addrtype == MC_ADDR_PHYSICAL); } static void intel_srar_dhandler( struct mca_binfo *binfo, enum mce_result *result, struct cpu_user_regs *regs) { uint64_t status = binfo->mib->mc_status; /* For unknown srar error code, reset system */ *result = MCER_RESET; switch ( status & INTEL_MCCOD_MASK ) { case INTEL_SRAR_DATA_LOAD: case INTEL_SRAR_INSTR_FETCH: intel_memerr_dhandler(binfo, result, regs); break; default: break; } } static int intel_srao_check(uint64_t status) { return ( intel_check_mce_type(status) == intel_mce_ucr_srao ); } static void intel_srao_dhandler( struct mca_binfo *binfo, enum mce_result *result, struct cpu_user_regs *regs) { uint64_t status = binfo->mib->mc_status; /* For unknown srao error code, no action required */ *result = MCER_CONTINUE; if ( status & MCi_STATUS_VAL ) { switch ( status & INTEL_MCCOD_MASK ) { case INTEL_SRAO_MEM_SCRUB: case INTEL_SRAO_L3_EWB: intel_memerr_dhandler(binfo, result, regs); break; default: break; } } } static int intel_default_check(uint64_t status) { return 1; } static void intel_default_mce_dhandler( struct mca_binfo *binfo, enum mce_result *result, struct cpu_user_regs * regs) { uint64_t status = binfo->mib->mc_status; enum intel_mce_type type; type = intel_check_mce_type(status); if (type == intel_mce_fatal) *result = MCER_RESET; else *result = MCER_CONTINUE; } static const struct mca_error_handler intel_mce_dhandlers[] = { {intel_srao_check, intel_srao_dhandler}, {intel_srar_check, intel_srar_dhandler}, {intel_default_check, intel_default_mce_dhandler} }; static void intel_default_mce_uhandler( struct mca_binfo *binfo, enum mce_result *result, struct cpu_user_regs *regs) { uint64_t status = binfo->mib->mc_status; enum intel_mce_type type; type = intel_check_mce_type(status); switch (type) { case intel_mce_fatal: *result = MCER_RESET; break; default: *result = MCER_CONTINUE; break; } } static const struct mca_error_handler intel_mce_uhandlers[] = { {intel_default_check, intel_default_mce_uhandler} }; static void intel_machine_check(struct cpu_user_regs * regs, long error_code) { mcheck_cmn_handler(regs, error_code, mca_allbanks, __get_cpu_var(mce_clear_banks)); } /* According to MCA OS writer guide, CMCI handler need to clear bank when * 1) CE (UC = 0) * 2) ser_support = 1, Superious error, OVER = 0, EN = 0, [UC = 1] * 3) ser_support = 1, UCNA, OVER = 0, S = 1, AR = 0, PCC = 0, [UC = 1, EN = 1] * MCA handler need to clear bank when * 1) ser_support = 1, Superious error, OVER = 0, EN = 0, UC = 1 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1] * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1] */ static int intel_need_clearbank_scan(enum mca_source who, u64 status) { if ( who == MCA_CMCI_HANDLER) { /* CMCI need clear bank */ if ( !(status & MCi_STATUS_UC) ) return 1; /* Spurious need clear bank */ else if ( ser_support && !(status & MCi_STATUS_OVER) && !(status & MCi_STATUS_EN) ) return 1; /* 
UCNA OVER = 0 need clear bank */ else if ( ser_support && !(status & MCi_STATUS_OVER) && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)) return 1; /* Only Log, no clear */ else return 0; } else if ( who == MCA_MCE_SCAN) { if ( !ser_support ) return 0; /* * For fatal error, it shouldn't be cleared so that sticky bank * have chance to be handled after reboot by polling */ if ( (status & MCi_STATUS_UC) && (status & MCi_STATUS_PCC) ) return 0; /* Spurious need clear bank */ else if ( !(status & MCi_STATUS_OVER) && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN)) return 1; /* SRAR OVER=0 clear bank. OVER = 1 have caused reset */ else if ( (status & MCi_STATUS_UC) && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR ) && !(status & MCi_STATUS_OVER) ) return 1; /* SRAO need clear bank */ else if ( !(status & MCi_STATUS_AR) && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC)) return 1; else return 0; } return 1; } /* MCE continues/is recoverable when * 1) CE UC = 0 * 2) Supious ser_support = 1, OVER = 0, En = 0 [UC = 1] * 3) SRAR ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC =1, EN = 1] * 4) SRAO ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1] * 5) UCNA ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0, [UC = 1] */ static int intel_recoverable_scan(uint64_t status) { if ( !(status & MCi_STATUS_UC ) ) return 1; else if ( ser_support && !(status & MCi_STATUS_EN) && !(status & MCi_STATUS_OVER) ) return 1; /* SRAR error */ else if ( ser_support && !(status & MCi_STATUS_OVER) && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR) && (status & MCi_STATUS_EN) ) return 1; /* SRAO error */ else if (ser_support && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR) && (status & MCi_STATUS_EN)) return 1; /* UCNA error */ else if (ser_support && !(status & MCi_STATUS_OVER) && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)) return 1; return 0; } /* CMCI */ static DEFINE_SPINLOCK(cmci_discover_lock); /* * Discover bank sharing using the algorithm recommended in the SDM. */ static int do_cmci_discover(int i) { unsigned msr = MSR_IA32_MCx_CTL2(i); u64 val; rdmsrl(msr, val); /* Some other CPU already owns this bank. */ if (val & CMCI_EN) { mcabanks_clear(i, __get_cpu_var(mce_banks_owned)); goto out; } val &= ~CMCI_THRESHOLD_MASK; wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD); rdmsrl(msr, val); if (!(val & CMCI_EN)) { /* This bank does not support CMCI. Polling timer has to handle it. */ mcabanks_set(i, __get_cpu_var(no_cmci_banks)); return 0; } mcabanks_set(i, __get_cpu_var(mce_banks_owned)); out: mcabanks_clear(i, __get_cpu_var(no_cmci_banks)); return 1; } static void cmci_discover(void) { unsigned long flags; int i; mctelem_cookie_t mctc; struct mca_summary bs; mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%d\n", smp_processor_id()); spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < nr_mce_banks; i++) if (!mcabanks_test(i, __get_cpu_var(mce_banks_owned))) do_cmci_discover(i); spin_unlock_irqrestore(&cmci_discover_lock, flags); /* In case CMCI happended when do owner change. * If CMCI happened yet not processed immediately, * MCi_status (error_count bit 38~52) is not cleared, * the CMCI interrupt will never be triggered again. 
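* To close that window, scan the banks this CPU now owns immediately
* after discovery: anything already latched is logged here, and the
* need_clearbank rules above decide whether the bank status is reset.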
*/ mctc = mcheck_mca_logout( MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL); if (bs.errcnt && mctc != NULL) { if (dom0_vmce_enabled()) { mctelem_commit(mctc); send_global_virq(VIRQ_MCA); } else { x86_mcinfo_dump(mctelem_dataptr(mctc)); mctelem_dismiss(mctc); } } else if (mctc != NULL) mctelem_dismiss(mctc); mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n", smp_processor_id(), *((unsigned long *)__get_cpu_var(mce_banks_owned)->bank_map), *((unsigned long *)__get_cpu_var(no_cmci_banks)->bank_map)); } /* * Define an owner for each bank. Banks can be shared between CPUs * and to avoid reporting events multiple times always set up one * CPU as owner. * * The assignment has to be redone when CPUs go offline and * any of the owners goes away. Also pollers run in parallel so we * have to be careful to update the banks in a way that doesn't * lose or duplicate events. */ static void mce_set_owner(void) { if (!cmci_support || mce_disabled == 1) return; cmci_discover(); } static void __cpu_mcheck_distribute_cmci(void *unused) { cmci_discover(); } static void cpu_mcheck_distribute_cmci(void) { if (cmci_support && !mce_disabled) on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0); } static void clear_cmci(void) { int i; if (!cmci_support || mce_disabled == 1) return; mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%d\n", smp_processor_id()); for (i = 0; i < nr_mce_banks; i++) { unsigned msr = MSR_IA32_MCx_CTL2(i); u64 val; if (!mcabanks_test(i, __get_cpu_var(mce_banks_owned))) continue; rdmsrl(msr, val); if (val & (CMCI_EN|CMCI_THRESHOLD_MASK)) wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK)); mcabanks_clear(i, __get_cpu_var(mce_banks_owned)); } } static void cpu_mcheck_disable(void) { clear_in_cr4(X86_CR4_MCE); if (cmci_support && !mce_disabled) clear_cmci(); } static void cmci_interrupt(struct cpu_user_regs *regs) { mctelem_cookie_t mctc; struct mca_summary bs; ack_APIC_irq(); mctc = mcheck_mca_logout( MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL); if (bs.errcnt && mctc != NULL) { if (dom0_vmce_enabled()) { mctelem_commit(mctc); mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n"); send_global_virq(VIRQ_MCA); } else { x86_mcinfo_dump(mctelem_dataptr(mctc)); mctelem_dismiss(mctc); } } else if (mctc != NULL) mctelem_dismiss(mctc); } static void intel_init_cmci(struct cpuinfo_x86 *c) { u32 l, apic; int cpu = smp_processor_id(); if (!mce_available(c) || !cmci_support) { if (opt_cpu_info) mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu); return; } apic = apic_read(APIC_CMCI); if ( apic & APIC_VECTOR_MASK ) { mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n", cpu, ( apic & APIC_VECTOR_MASK )); return; } alloc_direct_apic_vector(&cmci_apic_vector, cmci_interrupt); apic = cmci_apic_vector; apic |= (APIC_DM_FIXED | APIC_LVT_MASKED); apic_write_around(APIC_CMCI, apic); l = apic_read(APIC_CMCI); apic_write_around(APIC_CMCI, l & ~APIC_LVT_MASKED); mce_set_owner(); } /* MCA */ static int mce_is_broadcast(struct cpuinfo_x86 *c) { if (mce_force_broadcast) return 1; /* According to Intel SDM Dec, 2009, 15.10.4.1, For processors with * DisplayFamily_DisplayModel encoding of 06H_EH and above, * a MCA signal is broadcast to all logical processors in the system */ if (c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 && c->x86_model >= 0xe) return 1; return 0; } /* Check and init MCA */ static void intel_init_mca(struct cpuinfo_x86 *c) { bool_t broadcast, cmci = 0, ser = 0; int ext_num = 0, first; uint64_t 
msr_content; broadcast = mce_is_broadcast(c); rdmsrl(MSR_IA32_MCG_CAP, msr_content); if ((msr_content & MCG_CMCI_P) && cpu_has_apic) cmci = 1; /* Support Software Error Recovery */ if (msr_content & MCG_SER_P) ser = 1; if (msr_content & MCG_EXT_P) ext_num = (msr_content >> MCG_EXT_CNT) & 0xff; first = mce_firstbank(c); if (smp_processor_id() == 0) { dprintk(XENLOG_INFO, "MCA Capability: BCAST %x SER %x" " CMCI %x firstbank %x extended MCE MSR %x\n", broadcast, ser, cmci, first, ext_num); mce_broadcast = broadcast; cmci_support = cmci; ser_support = ser; nr_intel_ext_msrs = ext_num; firstbank = first; } else if (cmci != cmci_support || ser != ser_support || broadcast != mce_broadcast || first != firstbank || ext_num != nr_intel_ext_msrs) { dprintk(XENLOG_WARNING, "CPU %u has different MCA capability (%x,%x,%x,%x,%x)" " than BSP, may cause undetermined result!!!\n", smp_processor_id(), broadcast, ser, cmci, first, ext_num); } } static void intel_mce_post_reset(void) { mctelem_cookie_t mctc; struct mca_summary bs; mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL); /* in the boot up stage, print out and also log in DOM0 boot process */ if (bs.errcnt && mctc != NULL) { x86_mcinfo_dump(mctelem_dataptr(mctc)); mctelem_commit(mctc); } return; } static void intel_init_mce(void) { uint64_t msr_content; int i; intel_mce_post_reset(); /* clear all banks */ for (i = firstbank; i < nr_mce_banks; i++) { /* Some banks are shared across cores, use MCi_CTRL to judge whether * this bank has been initialized by other cores already. */ rdmsrl(MSR_IA32_MCx_CTL(i), msr_content); if (!msr_content) { /* if ctl is 0, this bank is never initialized */ mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i); wrmsrl(MSR_IA32_MCx_CTL(i), 0xffffffffffffffffULL); wrmsrl(MSR_IA32_MCx_STATUS(i), 0x0ULL); } } if (firstbank) /* if cmci enabled, firstbank = 0 */ wrmsrl(MSR_IA32_MC0_STATUS, 0x0ULL); x86_mce_vector_register(intel_machine_check); mce_recoverable_register(intel_recoverable_scan); mce_need_clearbank_register(intel_need_clearbank_scan); mce_register_addrcheck(intel_checkaddr); mce_dhandlers = intel_mce_dhandlers; mce_dhandler_num = ARRAY_SIZE(intel_mce_dhandlers); mce_uhandlers = intel_mce_uhandlers; mce_uhandler_num = ARRAY_SIZE(intel_mce_uhandlers); } static void cpu_mcabank_free(unsigned int cpu) { struct mca_banks *cmci = per_cpu(no_cmci_banks, cpu); struct mca_banks *owned = per_cpu(mce_banks_owned, cpu); mcabanks_free(cmci); mcabanks_free(owned); } static int cpu_mcabank_alloc(unsigned int cpu) { struct mca_banks *cmci = mcabanks_alloc(); struct mca_banks *owned = mcabanks_alloc(); if (!cmci || !owned) goto out; per_cpu(no_cmci_banks, cpu) = cmci; per_cpu(mce_banks_owned, cpu) = owned; return 0; out: mcabanks_free(cmci); mcabanks_free(owned); return -ENOMEM; } static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; switch ( action ) { case CPU_UP_PREPARE: rc = cpu_mcabank_alloc(cpu); break; case CPU_DYING: cpu_mcheck_disable(); break; case CPU_UP_CANCELED: case CPU_DEAD: cpu_mcheck_distribute_cmci(); cpu_mcabank_free(cpu); break; default: break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; /* p4/p6 family have similar MCA initialization process */ enum mcheck_type intel_mcheck_init(struct cpuinfo_x86 *c, bool_t bsp) { if ( bsp ) { /* Early MCE initialisation for BSP. 
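* Allocate CPU0's MCA bank bitmaps, register the CPU notifier so that
* secondary CPUs get theirs from CPU_UP_PREPARE, and capture the BIOS
* thermal LVT value for later reuse on the APs.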
*/ if ( cpu_mcabank_alloc(0) ) BUG(); register_cpu_notifier(&cpu_nfb); mcheck_intel_therm_init(); } intel_init_mca(c); mce_handler_init(); intel_init_mce(); intel_init_cmci(c); #ifdef CONFIG_X86_MCE_THERMAL intel_init_thermal(c); #endif return mcheck_intel; } /* intel specific MCA MSR */ int vmce_intel_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) { unsigned int bank = msr - MSR_IA32_MC0_CTL2; if ( bank < GUEST_MC_BANK_NUM ) { v->arch.vmce.bank[bank].mci_ctl2 = val; mce_printk(MCE_VERBOSE, "MCE: wr MC%u_CTL2 %#"PRIx64"\n", bank, val); } return 1; } int vmce_intel_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) { unsigned int bank = msr - MSR_IA32_MC0_CTL2; if ( bank < GUEST_MC_BANK_NUM ) { *val = v->arch.vmce.bank[bank].mci_ctl2; mce_printk(MCE_VERBOSE, "MCE: rd MC%u_CTL2 %#"PRIx64"\n", bank, *val); } return 1; } xen-4.4.0/xen/arch/x86/cpu/mcheck/util.c0000664000175000017500000000030312307313555015735 0ustar smbsmb #include #include "util.h" #include "mce.h" void mce_panic_check(void) { if ( is_mc_panic ) { local_irq_enable(); for ( ; ; ) halt(); } } xen-4.4.0/xen/arch/x86/cpu/mcheck/mcaction.h0000664000175000017500000000073512307313555016573 0ustar smbsmb#ifndef _MCHECK_ACTION_H #define _MCHECK_ACTION_H #include #include "x86_mca.h" void mc_memerr_dhandler(struct mca_binfo *binfo, enum mce_result *result, struct cpu_user_regs *regs); #define MC_ADDR_PHYSICAL 0 #define MC_ADDR_VIRTUAL 1 typedef int (*mce_check_addr_t)(uint64_t status, uint64_t misc, int addr_type); extern void mce_register_addrcheck(mce_check_addr_t); extern mce_check_addr_t mc_check_addr; #endif xen-4.4.0/xen/arch/x86/cpu/mcheck/vmce.c0000664000175000017500000003141412307313555015721 0ustar smbsmb/* * vmce.c - provide software emulated vMCE support to guest * * Copyright (C) 2010, 2011 Jiang, Yunhong * Copyright (C) 2012, 2013 Liu, Jinsong * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mce.h" #include "x86_mca.h" #include "vmce.h" /* * MCG_SER_P: software error recovery supported * MCG_TES_P: to avoid MCi_status bit56:53 model specific * MCG_CMCI_P: expose CMCI capability but never really inject it to guest, * for sake of performance since guest not polling periodically */ #define INTEL_GUEST_MCG_CAP (MCG_SER_P | \ MCG_TES_P | \ MCG_CMCI_P | \ GUEST_MC_BANK_NUM) #define AMD_GUEST_MCG_CAP GUEST_MC_BANK_NUM void vmce_init_vcpu(struct vcpu *v) { int i; /* global MCA MSRs init */ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) v->arch.vmce.mcg_cap = INTEL_GUEST_MCG_CAP; else v->arch.vmce.mcg_cap = AMD_GUEST_MCG_CAP; v->arch.vmce.mcg_status = 0; /* per-bank MCA MSRs init */ for ( i = 0; i < GUEST_MC_BANK_NUM; i++ ) memset(&v->arch.vmce.bank[i], 0, sizeof(struct vmce_bank)); spin_lock_init(&v->arch.vmce.lock); } int vmce_restore_vcpu(struct vcpu *v, const struct hvm_vmce_vcpu *ctxt) { unsigned long guest_mcg_cap; if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) guest_mcg_cap = INTEL_GUEST_MCG_CAP; else guest_mcg_cap = AMD_GUEST_MCG_CAP; if ( ctxt->caps & ~guest_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P ) { dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities" " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n", has_hvm_container_vcpu(v) ? "HVM" : "PV", ctxt->caps, v->domain->domain_id, v->vcpu_id, guest_mcg_cap & ~MCG_CAP_COUNT); return -EPERM; } v->arch.vmce.mcg_cap = ctxt->caps; v->arch.vmce.bank[0].mci_ctl2 = ctxt->mci_ctl2_bank0; v->arch.vmce.bank[1].mci_ctl2 = ctxt->mci_ctl2_bank1; return 0; } /* * For historic version reason, bank number may greater than GUEST_MC_BANK_NUM, * when migrating from old vMCE version to new vMCE. 
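* Reads of such out-of-range banks still succeed rather than injecting
* #GP: MCi_CTL reads as all ones and MCi_STATUS/ADDR/MISC read as zero.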
*/ static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) { int ret = 1; unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4; *val = 0; switch ( msr & (MSR_IA32_MC0_CTL | 3) ) { case MSR_IA32_MC0_CTL: /* stick all 1's to MCi_CTL */ *val = ~0UL; mce_printk(MCE_VERBOSE, "MCE: rd MC%u_CTL %#"PRIx64"\n", bank, *val); break; case MSR_IA32_MC0_STATUS: if ( bank < GUEST_MC_BANK_NUM ) { *val = v->arch.vmce.bank[bank].mci_status; if ( *val ) mce_printk(MCE_VERBOSE, "MCE: rd MC%u_STATUS %#"PRIx64"\n", bank, *val); } break; case MSR_IA32_MC0_ADDR: if ( bank < GUEST_MC_BANK_NUM ) { *val = v->arch.vmce.bank[bank].mci_addr; if ( *val ) mce_printk(MCE_VERBOSE, "MCE: rd MC%u_ADDR %#"PRIx64"\n", bank, *val); } break; case MSR_IA32_MC0_MISC: if ( bank < GUEST_MC_BANK_NUM ) { *val = v->arch.vmce.bank[bank].mci_misc; if ( *val ) mce_printk(MCE_VERBOSE, "MCE: rd MC%u_MISC %#"PRIx64"\n", bank, *val); } break; default: switch ( boot_cpu_data.x86_vendor ) { case X86_VENDOR_INTEL: ret = vmce_intel_rdmsr(v, msr, val); break; case X86_VENDOR_AMD: ret = vmce_amd_rdmsr(v, msr, val); break; default: ret = 0; break; } break; } return ret; } /* * < 0: Unsupported and will #GP fault to guest * = 0: Not handled, should be handled by other components * > 0: Success */ int vmce_rdmsr(uint32_t msr, uint64_t *val) { struct vcpu *cur = current; int ret = 1; *val = 0; spin_lock(&cur->arch.vmce.lock); switch ( msr ) { case MSR_IA32_MCG_STATUS: *val = cur->arch.vmce.mcg_status; if (*val) mce_printk(MCE_VERBOSE, "MCE: rd MCG_STATUS %#"PRIx64"\n", *val); break; case MSR_IA32_MCG_CAP: *val = cur->arch.vmce.mcg_cap; mce_printk(MCE_VERBOSE, "MCE: rd MCG_CAP %#"PRIx64"\n", *val); break; case MSR_IA32_MCG_CTL: if ( cur->arch.vmce.mcg_cap & MCG_CTL_P ) *val = ~0ULL; mce_printk(MCE_VERBOSE, "MCE: rd MCG_CTL %#"PRIx64"\n", *val); break; default: ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0; break; } spin_unlock(&cur->arch.vmce.lock); return ret; } /* * For historic version reason, bank number may greater than GUEST_MC_BANK_NUM, * when migratie from old vMCE version to new vMCE. */ static int bank_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) { int ret = 1; unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4; switch ( msr & (MSR_IA32_MC0_CTL | 3) ) { case MSR_IA32_MC0_CTL: /* * if guest crazy clear any bit of MCi_CTL, * treat it as not implement and ignore write change it. 
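* In other words the write is silently dropped and the guest keeps
* seeing all ones when it reads MCi_CTL back.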
*/ break; case MSR_IA32_MC0_STATUS: mce_printk(MCE_VERBOSE, "MCE: wr MC%u_STATUS %#"PRIx64"\n", bank, val); if ( val ) ret = -1; else if ( bank < GUEST_MC_BANK_NUM ) v->arch.vmce.bank[bank].mci_status = val; break; case MSR_IA32_MC0_ADDR: mce_printk(MCE_VERBOSE, "MCE: wr MC%u_ADDR %#"PRIx64"\n", bank, val); if ( val ) ret = -1; else if ( bank < GUEST_MC_BANK_NUM ) v->arch.vmce.bank[bank].mci_addr = val; break; case MSR_IA32_MC0_MISC: mce_printk(MCE_VERBOSE, "MCE: wr MC%u_MISC %#"PRIx64"\n", bank, val); if ( val ) ret = -1; else if ( bank < GUEST_MC_BANK_NUM ) v->arch.vmce.bank[bank].mci_misc = val; break; default: switch ( boot_cpu_data.x86_vendor ) { case X86_VENDOR_INTEL: ret = vmce_intel_wrmsr(v, msr, val); break; case X86_VENDOR_AMD: ret = vmce_amd_wrmsr(v, msr, val); break; default: ret = 0; break; } break; } return ret; } /* * < 0: Unsupported and will #GP fault to guest * = 0: Not handled, should be handled by other components * > 0: Success */ int vmce_wrmsr(uint32_t msr, uint64_t val) { struct vcpu *cur = current; int ret = 1; spin_lock(&cur->arch.vmce.lock); switch ( msr ) { case MSR_IA32_MCG_CTL: /* If MCG_CTL exists then stick to all 1's, else ignore. */ break; case MSR_IA32_MCG_STATUS: cur->arch.vmce.mcg_status = val; mce_printk(MCE_VERBOSE, "MCE: wr MCG_STATUS %"PRIx64"\n", val); break; case MSR_IA32_MCG_CAP: /* * According to Intel SDM, IA32_MCG_CAP is a read-only register, * the effect of writing to the IA32_MCG_CAP is undefined. Here we * treat writing as 'write not change'. Guest would not surprise. */ mce_printk(MCE_VERBOSE, "MCE: MCG_CAP is r/o\n"); break; default: ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0; break; } spin_unlock(&cur->arch.vmce.lock); return ret; } static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; int err = 0; for_each_vcpu( d, v ) { struct hvm_vmce_vcpu ctxt = { .caps = v->arch.vmce.mcg_cap, .mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2, .mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2 }; err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt); if ( err ) break; } return err; } static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h) { unsigned int vcpuid = hvm_load_instance(h); struct vcpu *v; struct hvm_vmce_vcpu ctxt; int err; if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n", d->domain_id, vcpuid); err = -EINVAL; } else err = hvm_load_entry_zeroextend(VMCE_VCPU, h, &ctxt); return err ?: vmce_restore_vcpu(v, &ctxt); } HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt, vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU); /* * for Intel MCE, broadcast vMCE to all vcpus * for AMD MCE, only inject vMCE to vcpu0 * * @ d, domain to which would inject vmce * @ vcpu, * -1 (VMCE_INJECT_BROADCAST), broadcast vMCE to all vcpus * >= 0, vcpu, the vMCE is injected to */ int inject_vmce(struct domain *d, int vcpu) { struct vcpu *v; int ret = -ESRCH; for_each_vcpu ( d, v ) { if ( vcpu != VMCE_INJECT_BROADCAST && vcpu != v->vcpu_id ) continue; if ( (has_hvm_container_domain(d) || guest_has_trap_callback(d, v->vcpu_id, TRAP_machine_check)) && !test_and_set_bool(v->mce_pending) ) { mce_printk(MCE_VERBOSE, "MCE: inject vMCE to d%d:v%d\n", d->domain_id, v->vcpu_id); vcpu_kick(v); ret = 0; } else { mce_printk(MCE_QUIET, "Failed to inject vMCE to d%d:v%d\n", d->domain_id, v->vcpu_id); ret = -EBUSY; break; } if ( vcpu != VMCE_INJECT_BROADCAST ) break; } return ret; } int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct 
domain *d, uint64_t gstatus) { struct vcpu *v = d->vcpu[0]; if ( mc_bank->mc_domid != (uint16_t)~0 ) { if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP ) { mce_printk(MCE_QUIET, "MCE: guest has not handled previous" " vMCE yet!\n"); return -1; } spin_lock(&v->arch.vmce.lock); v->arch.vmce.mcg_status = gstatus; /* * 1. Skip bank 0 to avoid 'bank 0 quirk' of old processors * 2. Filter MCi_STATUS MSCOD model specific error code to guest */ v->arch.vmce.bank[1].mci_status = mc_bank->mc_status & MCi_STATUS_MSCOD_MASK; v->arch.vmce.bank[1].mci_addr = mc_bank->mc_addr; v->arch.vmce.bank[1].mci_misc = mc_bank->mc_misc; spin_unlock(&v->arch.vmce.lock); } return 0; } /* It's said some ram is setup as mmio_direct for UC cache attribute */ #define P2M_UNMAP_TYPES (p2m_to_mask(p2m_ram_rw) \ | p2m_to_mask(p2m_ram_logdirty) \ | p2m_to_mask(p2m_ram_ro) \ | p2m_to_mask(p2m_mmio_direct)) /* * Currently all CPUs are redenzevous at the MCE softirq handler, no * need to consider paging p2m type * Currently only support HVM guest with EPT paging mode * XXX following situation missed: * PoD, Foreign mapped, Granted, Shared */ int unmmap_broken_page(struct domain *d, mfn_t mfn, unsigned long gfn) { mfn_t r_mfn; p2m_type_t pt; int rc; /* Always trust dom0's MCE handler will prevent future access */ if ( d == dom0 ) return 0; if (!mfn_valid(mfn_x(mfn))) return -EINVAL; if ( !has_hvm_container_domain(d) || !paging_mode_hap(d) ) return -ENOSYS; rc = -1; r_mfn = get_gfn_query(d, gfn, &pt); if ( p2m_to_mask(pt) & P2M_UNMAP_TYPES) { ASSERT(mfn_x(r_mfn) == mfn_x(mfn)); p2m_change_type(d, gfn, pt, p2m_ram_broken); rc = 0; } put_gfn(d, gfn); return rc; } xen-4.4.0/xen/arch/x86/cpu/mcheck/amd_nonfatal.c0000664000175000017500000001600612307313555017412 0ustar smbsmb/* * MCA implementation for AMD CPUs * Copyright (c) 2007 Advanced Micro Devices, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* K8 common MCA documentation published at * * AMD64 Architecture Programmer's Manual Volume 2: * System Programming * Publication # 24593 Revision: 3.12 * Issue Date: September 2006 * * URL: * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf */ /* The related documentation for K8 Revisions A - E is: * * BIOS and Kernel Developer's Guide for * AMD Athlon 64 and AMD Opteron Processors * Publication # 26094 Revision: 3.30 * Issue Date: February 2006 * * URL: * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF */ /* The related documentation for K8 Revisions F - G is: * * BIOS and Kernel Developer's Guide for * AMD NPT Family 0Fh Processors * Publication # 32559 Revision: 3.04 * Issue Date: December 2006 * * URL: * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf */ #include #include #include #include #include #include #include #include #include #include #include "mce.h" #include "vmce.h" static struct timer mce_timer; #define MCE_PERIOD MILLISECS(10000) #define MCE_MIN MILLISECS(2000) #define MCE_MAX MILLISECS(30000) static s_time_t period = MCE_PERIOD; static int hw_threshold = 0; static int adjust = 0; static int variable_period = 1; /* The polling service routine: * Collects information of correctable errors and notifies * Dom0 via an event. */ static void mce_amd_checkregs(void *info) { mctelem_cookie_t mctc; struct mca_summary bs; mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs, NULL); if (bs.errcnt && mctc != NULL) { static uint64_t dumpcount = 0; /* If Dom0 enabled the VIRQ_MCA event, then notify it. * Otherwise, if dom0 has had plenty of time to register * the virq handler but still hasn't then dump telemetry * to the Xen console. The call count may be incremented * on multiple cpus at once and is indicative only - just * a simple-minded attempt to avoid spamming the console * for corrected errors in early startup. */ if (dom0_vmce_enabled()) { mctelem_commit(mctc); send_global_virq(VIRQ_MCA); } else if (++dumpcount >= 10) { x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc)); mctelem_dismiss(mctc); } else { mctelem_dismiss(mctc); } } else if (mctc != NULL) { mctelem_dismiss(mctc); } /* adjust is global and all cpus may attempt to increment it without * synchronisation, so they race and the final adjust count * (number of cpus seeing any error) is approximate. We can * guarantee that if any cpu observes an error that the * adjust count is at least 1. */ if (bs.errcnt) adjust++; } /* polling service routine invoker: * Adjust poll frequency at runtime. No error means slow polling frequency, * an error means higher polling frequency. * It uses hw threshold register introduced in AMD K8 RevF to detect * multiple correctable errors between two polls. In that case, * increase polling frequency higher than normal. */ static void mce_amd_work_fn(void *data) { on_each_cpu(mce_amd_checkregs, data, 1); if (adjust > 0) { if (!dom0_vmce_enabled()) { /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */ printk("MCE: polling routine found correctable error. 
" " Use mcelog to parse above error output.\n"); } } if (hw_threshold) { uint64_t value; uint32_t counter; value = mca_rdmsr(MSR_IA32_MCx_MISC(4)); /* Only the error counter field is of interest * Bit field is described in AMD K8 BKDG chapter 6.4.5.5 */ counter = (value & 0xFFF00000000ULL) >> 32U; /* HW does not count *all* kinds of correctable errors. * Thus it is possible, that the polling routine finds an * correctable error even if the HW reports nothing. */ if (counter > 0) { /* HW reported correctable errors, * the polling routine did not find... */ if (adjust == 0) { printk("CPU counter reports %"PRIu32 " correctable hardware error%s that %s" " not reported by the status MSRs\n", counter, (counter == 1 ? "" : "s"), (counter == 1 ? "was" : "were")); } /* subtract 1 to not double count the error * from the polling service routine */ adjust += (counter - 1); /* Restart counter */ /* No interrupt, reset counter value */ value &= ~(0x60FFF00000000ULL); /* Counter enable */ value |= (1ULL << 51); mca_wrmsr(MSR_IA32_MCx_MISC(4), value); wmb(); } } if (variable_period && adjust > 0) { /* Increase polling frequency */ adjust++; /* adjust == 1 must have an effect */ period /= adjust; } else if (variable_period) { /* Decrease polling frequency */ period *= 2; } if (variable_period && period > MCE_MAX) { /* limit: Poll at least every 30s */ period = MCE_MAX; } if (variable_period && period < MCE_MIN) { /* limit: Poll every 2s. * When this is reached an uncorrectable error * is expected to happen, if Dom0 does nothing. */ period = MCE_MIN; } set_timer(&mce_timer, NOW() + period); adjust = 0; } void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c) { if (c->x86_vendor != X86_VENDOR_AMD) return; /* Assume we are on K8 or newer AMD CPU here */ /* The threshold bitfields in MSR_IA32_MC4_MISC has * been introduced along with the SVME feature bit. */ if (variable_period && cpu_has(c, X86_FEATURE_SVM)) { uint64_t value; /* hw threshold registers present */ hw_threshold = 1; rdmsrl(MSR_IA32_MCx_MISC(4), value); if (value & (1ULL << 61)) { /* Locked bit */ /* Locked by BIOS. Not available for use */ hw_threshold = 0; } if (!(value & (1ULL << 63))) { /* Valid bit */ /* No CtrP present */ hw_threshold = 0; } else { if (!(value & (1ULL << 62))) { /* Counter Bit */ /* No counter field present */ hw_threshold = 0; } } if (hw_threshold) { /* No interrupt, reset counter value */ value &= ~(0x60FFF00000000ULL); /* Counter enable */ value |= (1ULL << 51); wrmsrl(MSR_IA32_MCx_MISC(4), value); /* serialize */ wmb(); printk(XENLOG_INFO "MCA: Use hw thresholding to adjust polling frequency\n"); } } init_timer(&mce_timer, mce_amd_work_fn, NULL, 0); set_timer(&mce_timer, NOW() + period); return; } xen-4.4.0/xen/arch/x86/cpu/mcheck/x86_mca.h0000664000175000017500000001220312307313555016234 0ustar smbsmb/* * MCA implementation for AMD CPUs * Copyright (c) 2007-2012 Advanced Micro Devices, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef X86_MCA_H #define X86_MCA_H #include /* The MCA/MCE MSRs should not be used anywhere else. * They are cpu family/model specific and are only for use * in terms of machine check handling. * So we define them here rather in . */ /* Bitfield of the MSR_IA32_MCG_CAP register */ #define MCG_CAP_COUNT 0x00000000000000ffULL #define MCG_CTL_P (1ULL<<8) #define MCG_EXT_P (1ULL<<9) /* Intel specific */ #define MCG_CMCI_P (1ULL<<10) /* Intel specific */ #define MCG_TES_P (1ULL<<11) /* Intel specific */ #define MCG_EXT_CNT 16 /* Intel specific */ #define MCG_SER_P (1ULL<<24) /* Intel specific */ /* Other bits are reserved */ /* Bitfield of the MSR_IA32_MCG_STATUS register */ #define MCG_STATUS_RIPV 0x0000000000000001ULL #define MCG_STATUS_EIPV 0x0000000000000002ULL #define MCG_STATUS_MCIP 0x0000000000000004ULL /* Bits 3-63 are reserved */ /* Bitfield of MSR_K8_MCi_STATUS registers */ /* MCA error code */ #define MCi_STATUS_MCA 0x000000000000ffffULL /* model-specific error code */ #define MCi_STATUS_MSEC 0x00000000ffff0000ULL /* Other information */ #define MCi_STATUS_OTHER 0x01ffffff00000000ULL /* Action Required flag */ #define MCi_STATUS_AR 0x0080000000000000ULL /* Intel specific */ /* Signaling flag */ #define MCi_STATUS_S 0x0100000000000000ULL /* Intel specific */ /* processor context corrupt */ #define MCi_STATUS_PCC 0x0200000000000000ULL /* MSR_K8_MCi_ADDR register valid */ #define MCi_STATUS_ADDRV 0x0400000000000000ULL /* MSR_K8_MCi_MISC register valid */ #define MCi_STATUS_MISCV 0x0800000000000000ULL /* error condition enabled */ #define MCi_STATUS_EN 0x1000000000000000ULL /* uncorrected error */ #define MCi_STATUS_UC 0x2000000000000000ULL /* status register overflow */ #define MCi_STATUS_OVER 0x4000000000000000ULL /* valid */ #define MCi_STATUS_VAL 0x8000000000000000ULL /* Bitfield of MSi_STATUS_OTHER field */ /* reserved bits */ #define MCi_STATUS_OTHER_RESERVED1 0x00001fff00000000ULL /* uncorrectable ECC error */ #define MCi_STATUS_OTEHR_UC_ECC 0x0000200000000000ULL /* correctable ECC error */ #define MCi_STATUS_OTHER_C_ECC 0x0000400000000000ULL /* ECC syndrome of an ECC error */ #define MCi_STATUS_OTHER_ECC_SYNDROME 0x007f800000000000ULL /* reserved bits */ #define MCi_STATUS_OTHER_RESERVED2 0x0180000000000000ULL /* Bitfield of MSR_K8_HWCR register */ #define K8_HWCR_MCi_STATUS_WREN (1ULL << 18) /*Intel Specific bitfield*/ #define CMCI_THRESHOLD 0x2 #define MCi_MISC_ADDRMOD_MASK (0x7UL << 6) #define MCi_MISC_PHYSMOD (0x2UL << 6) #include struct mca_banks { int num; unsigned long *bank_map; }; static inline void mcabanks_clear(int bit, struct mca_banks *banks) { if (!banks || !banks->bank_map || bit >= banks->num) return ; clear_bit(bit, banks->bank_map); } static inline void mcabanks_set(int bit, struct mca_banks* banks) { if (!banks || !banks->bank_map || bit >= banks->num) return; set_bit(bit, banks->bank_map); } static inline int mcabanks_test(int bit, struct mca_banks* banks) { if (!banks || !banks->bank_map || bit >= banks->num) return 0; return test_bit(bit, banks->bank_map); } struct mca_banks *mcabanks_alloc(void); void mcabanks_free(struct mca_banks *banks); extern struct mca_banks *mca_allbanks; /* Keep bank so that we can get status even if mib is NULL */ struct mca_binfo { int bank; struct mcinfo_global *mig; struct mcinfo_bank *mib; struct mc_info *mi; struct 
cpu_user_regs *regs; }; enum mce_result { MCER_NOERROR, MCER_RECOVERED, /* Not recovered, but can continue */ MCER_CONTINUE, MCER_RESET, }; struct mca_error_handler { /* Assume corresponding recovery action could be uniquely * identified by mca_code. Otherwise, we might need to have * a seperate function to decode the corresponding actions * for the particular mca error later. */ int (*owned_error)(uint64_t status); void (*recovery_handler)(struct mca_binfo *binfo, enum mce_result *result, struct cpu_user_regs *regs); }; /* Global variables */ extern bool_t mce_disabled; #endif /* X86_MCA_H */ xen-4.4.0/xen/arch/x86/cpu/mcheck/mce-apei.c0000664000175000017500000000735212307313555016453 0ustar smbsmb/* * Bridge between MCE and APEI * * On some machine, corrected memory errors are reported via APEI * generic hardware error source (GHES) instead of corrected Machine * Check. These corrected memory errors can be reported to user space * through /dev/mcelog via faking a corrected Machine Check, so that * the error memory page can be offlined by /sbin/mcelog if the error * count for one page is beyond the threshold. * * For fatal MCE, save MCE record into persistent storage via ERST, so * that the MCE record can be logged after reboot via ERST. * * Copyright 2010 Intel Corp. * Author: Huang Ying * Ported by: Liu, Jinsong * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include "mce.h" #define CPER_CREATOR_MCE \ UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ 0x64, 0x90, 0xb8, 0x9d) #define CPER_SECTION_TYPE_MCE \ UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ 0x04, 0x4a, 0x38, 0xfc) #pragma pack(1) /* * CPER specification (in UEFI specification 2.3 appendix N) requires * byte-packed. 
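* That is what the surrounding pack(1) pragma and the __packed attribute
* enforce: no compiler padding between the record header, the section
* descriptor and the MCE payload.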
*/ struct cper_mce_record { struct cper_record_header hdr; struct cper_section_descriptor sec_hdr; struct mce mce; } __packed; /* Reset to default packing */ #pragma pack() int apei_write_mce(struct mce *m) { struct cper_mce_record rcd; if (!m) return -EINVAL; memset(&rcd, 0, sizeof(rcd)); memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); rcd.hdr.revision = CPER_RECORD_REV; rcd.hdr.signature_end = CPER_SIG_END; rcd.hdr.section_count = 1; rcd.hdr.error_severity = CPER_SER_FATAL; /* timestamp, platform_id, partition_id are all invalid */ rcd.hdr.validation_bits = 0; rcd.hdr.record_length = sizeof(rcd); rcd.hdr.creator_id = CPER_CREATOR_MCE; rcd.hdr.notification_type = CPER_NOTIFY_MCE; rcd.hdr.record_id = cper_next_record_id(); rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; rcd.sec_hdr.section_length = sizeof(rcd.mce); rcd.sec_hdr.revision = CPER_SEC_REV; /* fru_id and fru_text is invalid */ rcd.sec_hdr.validation_bits = 0; rcd.sec_hdr.flags = CPER_SEC_PRIMARY; rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; rcd.sec_hdr.section_severity = CPER_SER_FATAL; memcpy(&rcd.mce, m, sizeof(*m)); return erst_write(&rcd.hdr); } size_t apei_read_mce(struct mce *m, u64 *record_id) { struct cper_mce_record rcd; size_t len; if (!m || !record_id) return -EINVAL; len = erst_read_next(&rcd.hdr, sizeof(rcd)); if (len <= 0) return len; /* Can not skip other records in storage via ERST unless clear them */ else if (len != sizeof(rcd) || uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { printk(KERN_WARNING "MCE-APEI: Can not skip the unknown record in ERST"); return -EIO; } memcpy(m, &rcd.mce, sizeof(*m)); *record_id = rcd.hdr.record_id; return sizeof(*m); } /* Check whether there is record in ERST */ int apei_check_mce(void) { return erst_get_record_count(); } int apei_clear_mce(u64 record_id) { return erst_clear(record_id); } xen-4.4.0/xen/arch/x86/cpu/intel.c0000664000175000017500000001507412307313555014654 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cpu.h" #define select_idle_routine(x) ((void)0) static unsigned int probe_intel_cpuid_faulting(void) { uint64_t x; return !rdmsr_safe(MSR_INTEL_PLATFORM_INFO, x) && (x & (1u<<31)); } static DEFINE_PER_CPU(bool_t, cpuid_faulting_enabled); void set_cpuid_faulting(bool_t enable) { uint32_t hi, lo; if (!cpu_has_cpuid_faulting || this_cpu(cpuid_faulting_enabled) == enable ) return; rdmsr(MSR_INTEL_MISC_FEATURES_ENABLES, lo, hi); lo &= ~1; if (enable) lo |= 1; wrmsr(MSR_INTEL_MISC_FEATURES_ENABLES, lo, hi); this_cpu(cpuid_faulting_enabled) = enable; } /* * opt_cpuid_mask_ecx/edx: cpuid.1[ecx, edx] feature mask. * For example, E8400[Intel Core 2 Duo Processor series] ecx = 0x0008E3FD, * edx = 0xBFEBFBFF when executing CPUID.EAX = 1 normally. If you want to * 'rev down' to E8400, you can set these values in these Xen boot parameters. 
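 * As an added illustration (values taken from the E8400 example above, not a
 * recommendation): assuming the usual command-line options corresponding to
 * these opt_* variables, the boot parameters would look like
 *
 *     cpuid_mask_ecx=0x0008E3FD cpuid_mask_edx=0xBFEBFBFF
 *
 * so that CPUID.EAX=1 reports only the masked feature bits to guests.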
*/ static void __devinit set_cpuidmask(const struct cpuinfo_x86 *c) { u32 eax, edx; const char *extra = ""; if (!~(opt_cpuid_mask_ecx & opt_cpuid_mask_edx & opt_cpuid_mask_ext_ecx & opt_cpuid_mask_ext_edx & opt_cpuid_mask_xsave_eax)) return; /* Only family 6 supports this feature */ switch ((c->x86 == 6) * c->x86_model) { case 0x17: if ((c->x86_mask & 0x0f) < 4) break; /* fall through */ case 0x1d: wrmsr(MSR_INTEL_CPUID_FEATURE_MASK, opt_cpuid_mask_ecx, opt_cpuid_mask_edx); if (~(opt_cpuid_mask_ext_ecx & opt_cpuid_mask_ext_edx)) extra = "extended "; else if (~opt_cpuid_mask_xsave_eax) extra = "xsave "; else return; break; /* * CPU supports this feature if the processor signature meets the following: * (CPUID.(EAX=01h):EAX) > 000106A2h, or * (CPUID.(EAX=01h):EAX) == 000106Exh, 0002065xh, 000206Cxh, 000206Exh, or 000206Fxh * */ case 0x1a: if ((c->x86_mask & 0x0f) <= 2) break; /* fall through */ case 0x1e: case 0x1f: case 0x25: case 0x2c: case 0x2e: case 0x2f: wrmsr(MSR_INTEL_CPUID1_FEATURE_MASK, opt_cpuid_mask_ecx, opt_cpuid_mask_edx); wrmsr(MSR_INTEL_CPUID80000001_FEATURE_MASK, opt_cpuid_mask_ext_ecx, opt_cpuid_mask_ext_edx); if (!~opt_cpuid_mask_xsave_eax) return; extra = "xsave "; break; case 0x2a: case 0x2d: wrmsr(MSR_INTEL_CPUID1_FEATURE_MASK_V2, opt_cpuid_mask_ecx, opt_cpuid_mask_edx); rdmsr(MSR_INTEL_CPUIDD_01_FEATURE_MASK, eax, edx); wrmsr(MSR_INTEL_CPUIDD_01_FEATURE_MASK, opt_cpuid_mask_xsave_eax, edx); wrmsr(MSR_INTEL_CPUID80000001_FEATURE_MASK_V2, opt_cpuid_mask_ext_ecx, opt_cpuid_mask_ext_edx); return; } printk(XENLOG_ERR "Cannot set CPU %sfeature mask on CPU#%d\n", extra, smp_processor_id()); } void __devinit early_intel_workaround(struct cpuinfo_x86 *c) { if (c->x86_vendor != X86_VENDOR_INTEL) return; /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ if (c->x86 == 15 && c->x86_cache_alignment == 64) c->x86_cache_alignment = 128; /* Unmask CPUID levels if masked: */ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { u64 misc_enable; rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); c->cpuid_level = cpuid_eax(0); if (opt_cpu_info || c == &boot_cpu_data) printk(KERN_INFO "revised cpuid level: %d\n", c->cpuid_level); } } /* CPUID workaround for Intel 0F33/0F34 CPU */ if (boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 3 && (boot_cpu_data.x86_mask == 3 || boot_cpu_data.x86_mask == 4)) paddr_bits = 36; } /* * P4 Xeon errata 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ static void __devinit Intel_errata_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { rdmsr (MSR_IA32_MISC_ENABLE, lo, hi); if ((lo & (1<<9)) == 0) { printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); lo |= (1<<9); /* Disable hw prefetching */ wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); } } } /* * find out the number of processor cores on the die */ static int __devinit num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx; if (c->cpuid_level < 4) return 1; /* Intel has a non-standard dependency on %ecx for this CPUID level. 
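   Added note for clarity (not in the original comment): subleaf 0 of leaf 4
   reports "addressable cores per package - 1" in EAX[31:26], so the decode
   performed just below is effectively

       cores = ((eax >> 26) & 0x3f) + 1;

   guarded by the cache-type field in EAX[4:0] being non-zero.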
*/ cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); if (eax & 0x1f) return ((eax >> 26) + 1); else return 1; } static void __devinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; /* Detect the extended topology information if available */ detect_extended_topology(c); select_idle_routine(c); l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); /* Check for version and the number of counters */ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability); } if ( !cpu_has(c, X86_FEATURE_XTOPOLOGY) ) { c->x86_max_cores = num_cpu_cores(c); detect_ht(c); } if (c == &boot_cpu_data && c->x86 == 6) { if (probe_intel_cpuid_faulting()) set_bit(X86_FEATURE_CPUID_FAULTING, c->x86_capability); } else if (boot_cpu_has(X86_FEATURE_CPUID_FAULTING)) { BUG_ON(!probe_intel_cpuid_faulting()); set_bit(X86_FEATURE_CPUID_FAULTING, c->x86_capability); } if (!cpu_has_cpuid_faulting) set_cpuidmask(c); /* Work around errata */ Intel_errata_workarounds(c); if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); if (cpuid_edx(0x80000007) & (1u<<8)) { set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability); set_bit(X86_FEATURE_TSC_RELIABLE, c->x86_capability); } if ( opt_arat && ( c->cpuid_level >= 0x00000006 ) && ( cpuid_eax(0x00000006) & (1u<<2) ) ) set_bit(X86_FEATURE_ARAT, c->x86_capability); } static struct cpu_dev intel_cpu_dev __cpuinitdata = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, .c_init = init_intel, }; int __init intel_cpu_init(void) { cpu_devs[X86_VENDOR_INTEL] = &intel_cpu_dev; return 0; } // arch_initcall(intel_cpu_init); xen-4.4.0/xen/arch/x86/cpu/mtrr/0000775000175000017500000000000012307313555014352 5ustar smbsmbxen-4.4.0/xen/arch/x86/cpu/mtrr/Makefile0000664000175000017500000000004312307313555016007 0ustar smbsmbobj-y += generic.o obj-y += main.o xen-4.4.0/xen/arch/x86/cpu/mtrr/generic.c0000664000175000017500000003123612307313555016137 0ustar smbsmb/* This only handles 32bit MTRR on 32bit hosts. 
This is strictly wrong because MTRRs can span upto 40 bits (36bits on most modern x86) */ #include #include #include #include #include #include #include #include #include #include "mtrr.h" struct fixed_range_block { int base_msr; /* start address of an MTRR block */ int ranges; /* number of MTRRs in this block */ }; static struct fixed_range_block fixed_range_blocks[] = { { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ {} }; static unsigned long smp_changes_mask; struct mtrr_state mtrr_state = {}; /* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) { rdmsrl(MTRRphysBase_MSR(index), vr->base); rdmsrl(MTRRphysMask_MSR(index), vr->mask); } static void get_fixed_ranges(mtrr_type * frs) { uint64_t *p = (uint64_t *) frs; int i; rdmsrl(MTRRfix64K_00000_MSR, p[0]); for (i = 0; i < 2; i++) rdmsrl(MTRRfix16K_80000_MSR + i, p[1 + i]); for (i = 0; i < 8; i++) rdmsrl(MTRRfix4K_C0000_MSR + i, p[3 + i]); } void mtrr_save_fixed_ranges(void *info) { if (cpu_has_mtrr) get_fixed_ranges(mtrr_state.fixed_ranges); } /* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { unsigned int i; struct mtrr_var_range *vrs; uint64_t msr_content; if (!mtrr_state.var_ranges) { mtrr_state.var_ranges = xmalloc_array(struct mtrr_var_range, num_var_ranges); if (!mtrr_state.var_ranges) return; } vrs = mtrr_state.var_ranges; rdmsrl(MTRRcap_MSR, msr_content); mtrr_state.have_fixed = (msr_content >> 8) & 1; for (i = 0; i < num_var_ranges; i++) get_mtrr_var_range(i, &vrs[i]); if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); rdmsrl(MTRRdefType_MSR, msr_content); mtrr_state.def_type = (msr_content & 0xff); mtrr_state.enabled = (msr_content & 0xc00) >> 10; /* Store mtrr_cap for HVM MTRR virtualisation. */ rdmsrl(MTRRcap_MSR, mtrr_state.mtrr_cap); } /* Some BIOS's are fucked and don't set all MTRRs the same! */ void __init mtrr_state_warn(void) { unsigned long mask = smp_changes_mask; if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n"); printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); printk(KERN_INFO "mtrr: corrected configuration.\n"); } /* Doesn't attempt to pass an error out to MTRR users because it's quite complicated in some cases and probably not worth it because the best error handling is to ignore it. */ void mtrr_wrmsr(unsigned int msr, uint64_t msr_content) { if (wrmsr_safe(msr, msr_content) < 0) printk(KERN_ERR "MTRR: CPU %u: Writing MSR %x to %"PRIx64" failed\n", smp_processor_id(), msr, msr_content); /* Cache overlap status for efficient HVM MTRR virtualisation. */ mtrr_state.overlapped = is_var_mtrr_overlapped(&mtrr_state); } /** * Checks and updates an fixed-range MTRR if it differs from the value it * should have. If K8 extenstions are wanted, update the K8 SYSCFG MSR also. * see AMD publication no. 
24593, chapter 7.8.1, page 233 for more information * \param msr MSR address of the MTTR which should be checked and updated * \param changed pointer which indicates whether the MTRR needed to be changed * \param msrwords pointer to the MSR values which the MSR should have */ static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) { uint64_t msr_content, val; rdmsrl(msr, msr_content); val = ((uint64_t)msrwords[1] << 32) | msrwords[0]; if (msr_content != val) { mtrr_wrmsr(msr, val); *changed = TRUE; } } int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free MTRR. The starting (base) address of the region. The size (in bytes) of the region. [RETURNS] The index of the region on success, else -1 on error. */ { int i, max; mtrr_type ltype; unsigned long lbase, lsize; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); if (lsize == 0) return i; } return -ENOSPC; } static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { uint64_t _mask, _base; rdmsrl(MTRRphysMask_MSR(reg), _mask); if ((_mask & 0x800) == 0) { /* Invalid (i.e. free) range */ *base = 0; *size = 0; *type = 0; return; } rdmsrl(MTRRphysBase_MSR(reg), _base); /* Work out the shifted address mask. */ _mask = size_or_mask | (_mask >> PAGE_SHIFT); /* This works correctly if size is a power of two, i.e. a contiguous range. */ *size = -(uint32_t)_mask; *base = _base >> PAGE_SHIFT; *type = _base & 0xff; } /** * Checks and updates the fixed-range MTRRs if they differ from the saved set * \param frs pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ static int set_fixed_ranges(mtrr_type * frs) { unsigned long long *saved = (unsigned long long *) frs; int changed = FALSE; int block=-1, range; while (fixed_range_blocks[++block].ranges) for (range=0; range < fixed_range_blocks[block].ranges; range++) set_fixed_range(fixed_range_blocks[block].base_msr + range, &changed, (unsigned int *) saved++); return changed; } /* Set the MSR pair relating to a var range. Returns TRUE if changes are made */ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { uint32_t lo, hi, base_lo, base_hi, mask_lo, mask_hi; uint64_t msr_content; int changed = FALSE; rdmsrl(MTRRphysBase_MSR(index), msr_content); lo = (uint32_t)msr_content; hi = (uint32_t)(msr_content >> 32); base_lo = (uint32_t)vr->base; base_hi = (uint32_t)(vr->base >> 32); lo &= 0xfffff0ffUL; base_lo &= 0xfffff0ffUL; hi &= size_and_mask >> (32 - PAGE_SHIFT); base_hi &= size_and_mask >> (32 - PAGE_SHIFT); if ((base_lo != lo) || (base_hi != hi)) { mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base); changed = TRUE; } rdmsrl(MTRRphysMask_MSR(index), msr_content); lo = (uint32_t)msr_content; hi = (uint32_t)(msr_content >> 32); mask_lo = (uint32_t)vr->mask; mask_hi = (uint32_t)(vr->mask >> 32); lo &= 0xfffff800UL; mask_lo &= 0xfffff800UL; hi &= size_and_mask >> (32 - PAGE_SHIFT); mask_hi &= size_and_mask >> (32 - PAGE_SHIFT); if ((mask_lo != lo) || (mask_hi != hi)) { mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask); changed = TRUE; } return changed; } static uint64_t deftype; static unsigned long set_mtrr_state(void) /* [SUMMARY] Set the MTRR state for this CPU. The MTRR state information to read. Some relevant CPU context. [NOTE] The CPU must already be in a safe state for MTRR changes. 
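[EXAMPLE] (added illustration, not in the original comment) MTRRdefType keeps
the default memory type in bits 7:0 and the fixed-range/global enable bits in
bits 10 and 11, which is why the code below masks with ~0xcff and shifts the
saved enable bits left by 10:

    deftype = (deftype & ~0xcff) | mtrr_state.def_type
                                 | (mtrr_state.enabled << 10);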
[RETURNS] 0 if no changes made, else a mask indication what was changed. */ { unsigned int i; unsigned long change_mask = 0; for (i = 0; i < num_var_ranges; i++) if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) change_mask |= MTRR_CHANGE_MASK_VARIABLE; if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) change_mask |= MTRR_CHANGE_MASK_FIXED; /* Set_mtrr_restore restores the old value of MTRRdefType, so to set it we fiddle with the saved value */ if ((deftype & 0xff) != mtrr_state.def_type || ((deftype & 0xc00) >> 10) != mtrr_state.enabled) { deftype = (deftype & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } return change_mask; } static unsigned long cr4 = 0; static DEFINE_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they * would run extremely slow and would only increase the pain. The caller must * ensure that local interrupts are disabled and are reenabled after post_set() * has been called. */ static void prepare_set(void) { unsigned long cr0; /* Note that this is not ideal, since the cache is only flushed/disabled for this CPU while the MTRRs are changed, but changing this requires more invasive changes to the way the kernel boots */ spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | 0x40000000; /* set CD flag */ write_cr0(cr0); wbinvd(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ if ( cpu_has_pge ) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ flush_tlb_local(); /* Save MTRR state */ rdmsrl(MTRRdefType_MSR, deftype); /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MTRRdefType_MSR, deftype & ~0xcff); } static void post_set(void) { /* Flush TLBs (no need to flush caches - they are disabled) */ flush_tlb_local(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MTRRdefType_MSR, deftype); /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); /* Restore value of CR4 */ if ( cpu_has_pge ) write_cr4(cr4); spin_unlock(&set_atomicity_lock); } static void generic_set_all(void) { unsigned long mask, count; unsigned long flags; local_irq_save(flags); prepare_set(); /* Actually set the state */ mask = set_mtrr_state(); post_set(); local_irq_restore(flags); /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; } } static void generic_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) /* [SUMMARY] Set variable MTRR register on the local CPU. The register to set. The base address of the region. The size of the region. If this is 0 the region is disabled. The type of the region. If TRUE, do the change safely. If FALSE, safety measures should be done externally. [RETURNS] Nothing. */ { unsigned long flags; struct mtrr_var_range *vr; vr = &mtrr_state.var_ranges[reg]; local_irq_save(flags); prepare_set(); if (size == 0) { /* The invalid bit is kept in the mask, so we simply clear the relevant mask register to disable a range. 
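   For the non-zero case handled in the else branch just below, a worked
   example may help (assumed values, PAGE_SHIFT = 12, not taken from the
   original source): a 256 MiB write-back range at 1 GiB, i.e. base = 0x40000
   and size = 0x10000 pages, is encoded as

       base_lo = (0x40000 << 12) | MTRR_TYPE_WRBACK            ->  0x40000006
       mask_lo = (-0x10000 << 12) | 0x800  (low 32 bits kept)  ->  0xf0000800

   so PhysBase carries address|type and PhysMask carries the length mask with
   the valid bit (bit 11) set, matching the writes performed below.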
*/ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0); memset(vr, 0, sizeof(struct mtrr_var_range)); } else { uint32_t base_lo, base_hi, mask_lo, mask_hi; base_lo = base << PAGE_SHIFT | type; base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT); mask_lo = -size << PAGE_SHIFT | 0x800; mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT); vr->base = ((uint64_t)base_hi << 32) | base_lo; vr->mask = ((uint64_t)mask_hi << 32) | mask_lo; mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base); mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask); } post_set(); local_irq_restore(flags); } int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { unsigned long lbase, last; /* For Intel PPro stepping <= 7, must be 4 MiB aligned and not touch 0x70000000->0x7003FFFF */ if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { printk(KERN_WARNING "mtrr: base(%#lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } /* Check upper bits of base and last are equal and lower bits are 0 for base and 1 for last */ last = base + size - 1; for (lbase = base; !(lbase & 1) && (last & 1); lbase = lbase >> 1, last = last >> 1) ; if (lbase != last) { printk(KERN_WARNING "mtrr: base(%#lx000) is not aligned on a size(%#lx000) boundary\n", base, size); return -EINVAL; } return 0; } static int generic_have_wrcomb(void) { unsigned long config; rdmsrl(MTRRcap_MSR, config); return (config & (1ULL << 10)); } int positive_have_wrcomb(void) { return 1; } /* generic structure... */ const struct mtrr_ops generic_mtrr_ops = { .use_intel_if = 1, .set_all = generic_set_all, .get = generic_get_mtrr, .get_free_region = generic_get_free_region, .set = generic_set_mtrr, .validate_add_page = generic_validate_add_page, .have_wrcomb = generic_have_wrcomb, }; xen-4.4.0/xen/arch/x86/cpu/mtrr/mtrr.h0000664000175000017500000000447712307313555015523 0ustar smbsmb/* * local mtrr defines. 
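 * Added note (not in the original header): variable-range MTRR n is the MSR
 * pair 0x200 + 2n and 0x200 + 2n + 1, so for instance
 *
 *     MTRRphysBase_MSR(3) == 0x206 and MTRRphysMask_MSR(3) == 0x207,
 *
 * which is what the accessor macros defined below expand to.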
*/ #ifndef TRUE #define TRUE 1 #define FALSE 0 #endif #define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff #define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) #define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 #define MTRRfix4K_C8000_MSR 0x269 #define MTRRfix4K_D0000_MSR 0x26a #define MTRRfix4K_D8000_MSR 0x26b #define MTRRfix4K_E0000_MSR 0x26c #define MTRRfix4K_E8000_MSR 0x26d #define MTRRfix4K_F0000_MSR 0x26e #define MTRRfix4K_F8000_MSR 0x26f #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 struct mtrr_ops { u32 vendor; u32 use_intel_if; // void (*init)(void); void (*set)(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); void (*set_all)(void); void (*get)(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type * type); int (*get_free_region)(unsigned long base, unsigned long size, int replace_reg); int (*validate_add_page)(unsigned long base, unsigned long size, unsigned int type); int (*have_wrcomb)(void); }; extern int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg); extern int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type); extern const struct mtrr_ops generic_mtrr_ops; extern int positive_have_wrcomb(void); /* library functions for processor-specific routines */ struct set_mtrr_context { unsigned long flags; unsigned long cr4val; uint64_t deftype; u32 ccr3; }; void set_mtrr_done(struct set_mtrr_context *ctxt); void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); void get_mtrr_state(void); extern void set_mtrr_ops(const struct mtrr_ops *); extern u64 size_or_mask, size_and_mask; extern const struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) extern unsigned int num_var_ranges; void mtrr_state_warn(void); void mtrr_wrmsr(unsigned int msr, uint64_t msr_content); extern int amd_init_mtrr(void); extern int cyrix_init_mtrr(void); xen-4.4.0/xen/arch/x86/cpu/mtrr/main.c0000664000175000017500000004362012307313555015447 0ustar smbsmb/* Generic MTRR (Memory Type Range Register) driver. Copyright (C) 1997-2000 Richard Gooch Copyright (c) 2002 Patrick Mochel This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. Richard Gooch may be reached by email at rgooch@atnf.csiro.au The postal address is: Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. Source: "Pentium Pro Family Developer's Manual, Volume 3: Operating System Writer's Guide" (Intel document number 242692), section 11.11.7 This was cleaned and made readable by Patrick Mochel on 6-7 March 2002. 
Source: Intel Architecture Software Developers Manual, Volume 3: System Programming Guide; Section 9.11. (1997 edition - PPro). */ #include #include #include #include #include #include #include #include #include #include "mtrr.h" /* No blocking mutexes in Xen. Spin instead. */ #define DEFINE_MUTEX(_m) DEFINE_SPINLOCK(_m) #define mutex_lock(_m) spin_lock(_m) #define mutex_unlock(_m) spin_unlock(_m) #define dump_stack() ((void)0) #define get_cpu() smp_processor_id() #define put_cpu() do {} while(0) u32 __read_mostly num_var_ranges = 0; unsigned int *__read_mostly usage_table; static DEFINE_MUTEX(mtrr_mutex); u64 __read_mostly size_or_mask; u64 __read_mostly size_and_mask; const struct mtrr_ops *__read_mostly mtrr_if = NULL; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); static const char *const mtrr_strings[MTRR_NUM_TYPES] = { "uncachable", /* 0 */ "write-combining", /* 1 */ "?", /* 2 */ "?", /* 3 */ "write-through", /* 4 */ "write-protect", /* 5 */ "write-back", /* 6 */ }; static const char *mtrr_attrib_to_str(int x) { return (x <= 6) ? mtrr_strings[x] : "?"; } /* Returns non-zero if we have the write-combining memory type */ static int have_wrcomb(void) { return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); } /* This function returns the number of variable MTRRs */ static void __init set_num_var_ranges(void) { unsigned long config = 0; if (use_intel()) { rdmsrl(MTRRcap_MSR, config); } else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) config = 8; num_var_ranges = config & 0xff; } static void __init init_table(void) { int i, max; max = num_var_ranges; if ((usage_table = xmalloc_array(unsigned int, max)) == NULL) { printk(KERN_ERR "mtrr: could not allocate\n"); return; } for (i = 0; i < max; i++) usage_table[i] = 1; } struct set_mtrr_data { atomic_t count; atomic_t gate; unsigned long smp_base; unsigned long smp_size; unsigned int smp_reg; mtrr_type smp_type; }; /* As per the IA32 SDM vol-3: 10.11.8 MTRR Considerations in MP Systems section * MTRRs updates must to be synchronized across all the processors. * This flags avoids multiple cpu synchronization while booting each cpu. * At the boot & resume time, this flag is turned on in mtrr_aps_sync_begin(). * Using this flag the mtrr initialization (and the all cpus sync up) in the * mtrr_ap_init() is avoided while booting each cpu. * After all the cpus have came up, then mtrr_aps_sync_end() synchronizes all * the cpus and updates mtrrs on all of them. Then this flag is turned off. */ int hold_mtrr_updates_on_aps; static void ipi_handler(void *info) /* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. [RETURNS] Nothing. */ { struct set_mtrr_data *data = info; unsigned long flags; local_irq_save(flags); atomic_dec(&data->count); while(!atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ if (data->smp_reg == ~0U) /* update all mtrr registers */ /* At the cpu hot-add time this will reinitialize mtrr * registres on the existing cpus. It is ok. 
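 * Added summary of the surrounding handler (illustrative, not in the original
 * comment): the slave side of the rendezvous with set_mtrr() is
 *
 *     atomic_dec(&data->count);              announce "IRQs are off here"
 *     while (!atomic_read(&data->gate)) ;    wait for the master's go-ahead
 *     ... perform the MTRR update ...
 *     atomic_dec(&data->count);              announce "update done"
 *
 * before finally waiting for the gate to drop again, as coded above and below.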
*/ mtrr_if->set_all(); else /* single mtrr register update */ mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); atomic_dec(&data->count); while(atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); local_irq_restore(flags); } static inline int types_compatible(mtrr_type type1, mtrr_type type2) { return type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE || (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH); } /** * set_mtrr - update mtrrs on all processors * @reg: mtrr in question * @base: mtrr base * @size: mtrr size * @type: mtrr type * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: * * 1. Send IPI to do the following: * 2. Disable Interrupts * 3. Wait for all procs to do so * 4. Enter no-fill cache mode * 5. Flush caches * 6. Clear PGE bit * 7. Flush all TLBs * 8. Disable all range registers * 9. Update the MTRRs * 10. Enable all range registers * 11. Flush all TLBs and caches again * 12. Enter normal cache mode and reenable caching * 13. Set PGE * 14. Wait for buddies to catch up * 15. Enable interrupts. * * What does that mean for us? Well, first we set data.count to the number * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait * until it hits 0 and proceed. We set the data.gate flag and reset data.count. * Meanwhile, they are waiting for that flag to be set. Once it's set, each * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it * differently, so we call mtrr_if->set() callback and let them take care of it. * When they're done, they again decrement data->count and wait for data.gate to * be reset. * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. * Everyone then enables interrupts and we all continue on. * * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. */ static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { cpumask_t allbutself; unsigned int nr_cpus; struct set_mtrr_data data; unsigned long flags; cpumask_andnot(&allbutself, &cpu_online_map, cpumask_of(smp_processor_id())); nr_cpus = cpumask_weight(&allbutself); data.smp_reg = reg; data.smp_base = base; data.smp_size = size; data.smp_type = type; atomic_set(&data.count, nr_cpus); atomic_set(&data.gate,0); /* Start the ball rolling on other CPUs */ on_selected_cpus(&allbutself, ipi_handler, &data, 0); local_irq_save(flags); while (atomic_read(&data.count)) cpu_relax(); /* ok, reset count and toggle gate */ atomic_set(&data.count, nr_cpus); smp_wmb(); atomic_set(&data.gate,1); /* do our MTRR business */ /* HACK! * We use this same function to initialize the mtrrs on boot. * The state of the boot cpu's mtrrs has been saved, and we want * to replicate across all the APs. * If we're doing that @reg is set to something special... */ if (reg == ~0U) /* update all mtrr registers */ /* at boot or resume time, this will reinitialize the mtrrs on * the bp. It is ok. 
*/ mtrr_if->set_all(); else /* update the single mtrr register */ mtrr_if->set(reg,base,size,type); /* wait for the others */ while (atomic_read(&data.count)) cpu_relax(); atomic_set(&data.count, nr_cpus); smp_wmb(); atomic_set(&data.gate,0); /* * Wait here for everyone to have seen the gate change * So we're the last ones to touch 'data' */ while (atomic_read(&data.count)) cpu_relax(); local_irq_restore(flags); } /** * mtrr_add_page - Add a memory type region * @base: Physical base address of region in pages (in units of 4 kB!) * @size: Physical size of region in pages (4 kB) * @type: Type of MTRR desired * @increment: If this is true do usage counting on the region * * Memory type region registers control the caching on newer Intel and * non Intel processors. This function allows drivers to request an * MTRR is added. The details and hardware specifics of each processor's * implementation are hidden from the caller, but nevertheless the * caller should expect to need to provide a power of two size on an * equivalent power of two boundary. * * If the region cannot be added either because all regions are in use * or the CPU cannot support it a negative value is returned. On success * the register number for this entry is returned, but should be treated * as a cookie only. * * On a multiprocessor machine the changes are made to all processors. * This is required on x86 by the Intel processors. * * The available types are * * %MTRR_TYPE_UNCACHABLE - No caching * * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * * BUGS: Needs a quiet flag for the cases where drivers do not mind * failures and do not wish system log messages to be sent. */ int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, char increment) { int i, replace, error; mtrr_type ltype; unsigned long lbase, lsize; if (!mtrr_if) return -ENXIO; if ((error = mtrr_if->validate_add_page(base,size,type))) return error; if (type >= MTRR_NUM_TYPES) { printk(KERN_WARNING "mtrr: type: %u invalid\n", type); return -EINVAL; } /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { printk(KERN_WARNING "mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { printk(KERN_WARNING "mtrr: zero sized request\n"); return -EINVAL; } if ((base | (base + size - 1)) >> (paddr_bits - PAGE_SHIFT)) { printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } error = -EINVAL; replace = -1; /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) continue; /* At this point we know there is some kind of overlap/enclosure */ if (base < lbase || base + size - 1 > lbase + lsize - 1) { if (base <= lbase && base + size - 1 >= lbase + lsize - 1) { /* New region encloses an existing region */ if (type == ltype) { replace = replace == -1 ? 
i : -2; continue; } else if (types_compatible(type, ltype)) continue; } printk(KERN_WARNING "mtrr: %#lx000,%#lx000 overlaps existing" " %#lx000,%#lx000\n", base, size, lbase, lsize); goto out; } /* New region is enclosed by an existing region */ if (ltype != type) { if (types_compatible(type, ltype)) continue; printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", base, size, mtrr_attrib_to_str(ltype), mtrr_attrib_to_str(type)); goto out; } if (increment) ++usage_table[i]; error = i; goto out; } /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); if (likely(replace < 0)) usage_table[i] = 1; else { usage_table[i] = usage_table[replace] + !!increment; if (unlikely(replace != i)) { set_mtrr(replace, 0, 0, 0); usage_table[replace] = 0; } } } else printk(KERN_INFO "mtrr: no more MTRRs available\n"); error = i; out: mutex_unlock(&mtrr_mutex); return error; } static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { printk(KERN_WARNING "mtrr: size and base must be multiples of 4 kiB\n"); printk(KERN_DEBUG "mtrr: size: %#lx base: %#lx\n", size, base); dump_stack(); return -1; } return 0; } /** * mtrr_add - Add a memory type region * @base: Physical base address of region * @size: Physical size of region * @type: Type of MTRR desired * @increment: If this is true do usage counting on the region * * Memory type region registers control the caching on newer Intel and * non Intel processors. This function allows drivers to request an * MTRR is added. The details and hardware specifics of each processor's * implementation are hidden from the caller, but nevertheless the * caller should expect to need to provide a power of two size on an * equivalent power of two boundary. * * If the region cannot be added either because all regions are in use * or the CPU cannot support it a negative value is returned. On success * the register number for this entry is returned, but should be treated * as a cookie only. * * On a multiprocessor machine the changes are made to all processors. * This is required on x86 by the Intel processors. * * The available types are * * %MTRR_TYPE_UNCACHABLE - No caching * * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * * BUGS: Needs a quiet flag for the cases where drivers do not mind * failures and do not wish system log messages to be sent. */ int __init mtrr_add(unsigned long base, unsigned long size, unsigned int type, char increment) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } /** * mtrr_del_page - delete a memory type region * @reg: Register returned by mtrr_add * @base: Physical base address * @size: Size of region * * If register is supplied then base and size are ignored. This is * how drivers should call it. * * Releases an MTRR region. If the usage count drops to zero the * register is freed and the region returns to default state. * On success the register is returned, on failure a negative error * code. 
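 * A hedged usage sketch (the region values are assumed, purely illustrative):
 *
 *     int reg = mtrr_add(0xf0000000UL, 0x1000000UL, MTRR_TYPE_WRCOMB, 1);
 *     if (reg >= 0)
 *         mtrr_del(reg, 0, 0);
 *
 * i.e. hand back the cookie from mtrr_add() and let base/size be ignored, as
 * described above.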
*/ int mtrr_del_page(int reg, unsigned long base, unsigned long size) { int i, max; mtrr_type ltype; unsigned long lbase, lsize; int error = -EINVAL; if (!mtrr_if) return -ENXIO; max = num_var_ranges; mutex_lock(&mtrr_mutex); if (reg < 0) { /* Search for existing MTRR */ for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); if (lbase == base && lsize == size) { reg = i; break; } } if (reg < 0) { printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, size); goto out; } } if (reg >= max) { printk(KERN_WARNING "mtrr: register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); goto out; } if (usage_table[reg] < 1) { printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); goto out; } if (--usage_table[reg] < 1) set_mtrr(reg, 0, 0, 0); error = reg; out: mutex_unlock(&mtrr_mutex); return error; } /** * mtrr_del - delete a memory type region * @reg: Register returned by mtrr_add * @base: Physical base address * @size: Size of region * * If register is supplied then base and size are ignored. This is * how drivers should call it. * * Releases an MTRR region. If the usage count drops to zero the * register is freed and the region returns to default state. * On success the register is returned, on failure a negative error * code. */ int __init mtrr_del(int reg, unsigned long base, unsigned long size) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } /* The suspend/resume methods are only for CPU without MTRR. CPU using generic * MTRR driver doesn't require this */ struct mtrr_value { mtrr_type ltype; unsigned long lbase; unsigned long lsize; }; /** * mtrr_bp_init - initialize mtrrs on the boot CPU * * This needs to be called early; before any of the other CPUs are * initialized (i.e. before smp_init()). * */ void __init mtrr_bp_init(void) { if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; size_or_mask = ~((1ULL << (paddr_bits - PAGE_SHIFT)) - 1); size_and_mask = ~size_or_mask & 0xfffff00000ULL; } if (mtrr_if) { set_num_var_ranges(); init_table(); if (use_intel()) get_mtrr_state(); } } void mtrr_ap_init(void) { if (!mtrr_if || !use_intel() || hold_mtrr_updates_on_aps) return; /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, * but this routine will be called in cpu boot time, holding the lock * breaks it. This routine is called in two cases: 1.very earily time * of software resume, when there absolutely isn't mtrr entry changes; * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to * prevent mtrr entry changes */ set_mtrr(~0U, 0, 0, 0); } /** * Save current fixed-range MTRR state of the BSP */ void mtrr_save_state(void) { int cpu = get_cpu(); if (cpu == 0) mtrr_save_fixed_ranges(NULL); else on_selected_cpus(cpumask_of(0), mtrr_save_fixed_ranges, NULL, 1); put_cpu(); } void mtrr_aps_sync_begin(void) { if (!use_intel()) return; hold_mtrr_updates_on_aps = 1; } void mtrr_aps_sync_end(void) { if (!use_intel()) return; set_mtrr(~0U, 0, 0, 0); hold_mtrr_updates_on_aps = 0; } void mtrr_bp_restore(void) { if (!use_intel()) return; mtrr_if->set_all(); } static int __init mtrr_init_finialize(void) { if (!mtrr_if) return 0; if (use_intel()) mtrr_state_warn(); return 0; } __initcall(mtrr_init_finialize); xen-4.4.0/xen/arch/x86/cpu/intel_cacheinfo.c0000664000175000017500000002276312307313555016656 0ustar smbsmb/* * Routines to indentify caches on Intel CPU. 
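 * Added note (not part of the upstream comment): with the deterministic
 * cache parameters of CPUID leaf 4, the size of each cache is reconstructed
 * from the reported geometry as
 *
 *     size = (sets + 1) * (line_size + 1) * (partitions + 1) * (ways + 1)
 *
 * which is exactly the computation done in cpuid4_cache_lookup() below.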
* * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) * Ashok Raj : Work with CPU hotplug infrastructure. * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ #include #include #include #include #include #define LVL_1_INST 1 #define LVL_1_DATA 2 #define LVL_2 3 #define LVL_3 4 #define LVL_TRACE 5 struct _cache_table { unsigned char descriptor; char cache_type; short size; }; /* all the cache descriptor types we care about (no TLB or trace cache entries) */ static struct _cache_table cache_table[] __cpuinitdata = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */ { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */ { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */ { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */ { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */ { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */ { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */ { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ { 
0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */ { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ { 0x00, 0, 0} }; enum _cache_type { CACHE_TYPE_NULL = 0, CACHE_TYPE_DATA = 1, CACHE_TYPE_INST = 2, CACHE_TYPE_UNIFIED = 3 }; union _cpuid4_leaf_eax { struct { enum _cache_type type:5; unsigned int level:3; unsigned int is_self_initializing:1; unsigned int is_fully_associative:1; unsigned int reserved:4; unsigned int num_threads_sharing:12; unsigned int num_cores_on_die:6; } split; u32 full; }; union _cpuid4_leaf_ebx { struct { unsigned int coherency_line_size:12; unsigned int physical_line_partition:10; unsigned int ways_of_associativity:10; } split; u32 full; }; union _cpuid4_leaf_ecx { struct { unsigned int number_of_sets:32; } split; u32 full; }; struct _cpuid4_info { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; }; unsigned short num_cache_leaves; static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned edx; cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); if (eax.split.type == CACHE_TYPE_NULL) return -EIO; /* better error ? */ this_leaf->eax = eax; this_leaf->ebx = ebx; this_leaf->ecx = ecx; this_leaf->size = (ecx.split.number_of_sets + 1) * (ebx.split.coherency_line_size + 1) * (ebx.split.physical_line_partition + 1) * (ebx.split.ways_of_associativity + 1); return 0; } static int __cpuinit find_num_cache_leaves(void) { unsigned int eax, ebx, ecx, edx; union _cpuid4_leaf_eax cache_eax; int i = -1; do { ++i; /* Do cpuid(4) loop to find out num_cache_leaves */ cpuid_count(4, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; } while (cache_eax.split.type != CACHE_TYPE_NULL); return i; } unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; if (c->cpuid_level > 3) { static int is_initialized; if (is_initialized == 0) { /* Init num_cache_leaves from boot CPU */ num_cache_leaves = find_num_cache_leaves(); is_initialized++; } /* * Whenever possible use cpuid(4), deterministic cache * parameters cpuid leaf to find the cache details */ for (i = 0; i < num_cache_leaves; i++) { struct _cpuid4_info this_leaf; int retval; retval = cpuid4_cache_lookup(i, &this_leaf); if (retval >= 0) { switch(this_leaf.eax.split.level) { case 1: if (this_leaf.eax.split.type == CACHE_TYPE_DATA) new_l1d = this_leaf.size/1024; else if (this_leaf.eax.split.type == CACHE_TYPE_INST) new_l1i = this_leaf.size/1024; break; case 2: new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l2_id = c->apicid >> index_msb; break; case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l3_id = c->apicid >> index_msb; break; default: break; } } } } /* * Don't use cpuid2 if cpuid4 is 
supported. For P4, we use cpuid2 for * trace cache */ if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { /* supports eax=2 call */ int i, j, n; int regs[4]; unsigned char *dp = (unsigned char *)regs; int only_trace = 0; if (num_cache_leaves != 0 && c->x86 == 15) only_trace = 1; /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; for ( i = 0 ; i < n ; i++ ) { cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); /* If bit 31 is set, this is an unknown format */ for ( j = 0 ; j < 3 ; j++ ) { if ( regs[j] < 0 ) regs[j] = 0; } /* Byte 0 is level count, not a descriptor */ for ( j = 1 ; j < 16 ; j++ ) { unsigned char des = dp[j]; unsigned char k = 0; /* look up this descriptor in the table */ while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { if (only_trace && cache_table[k].cache_type != LVL_TRACE) break; switch (cache_table[k].cache_type) { case LVL_1_INST: l1i += cache_table[k].size; break; case LVL_1_DATA: l1d += cache_table[k].size; break; case LVL_2: l2 += cache_table[k].size; break; case LVL_3: l3 += cache_table[k].size; break; case LVL_TRACE: trace += cache_table[k].size; break; } break; } k++; } } } } if (new_l1d) l1d = new_l1d; if (new_l1i) l1i = new_l1i; if (new_l2) { l2 = new_l2; } if (new_l3) { l3 = new_l3; } if (opt_cpu_info) { if (trace) printk("CPU: Trace cache: %dK uops", trace); else if ( l1i ) printk("CPU: L1 I cache: %dK", l1i); if (l1d) printk(", L1 D cache: %dK\n", l1d); else printk("\n"); if (l2) printk("CPU: L2 cache: %dK\n", l2); if (l3) printk("CPU: L3 cache: %dK\n", l3); } c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); return l2; } xen-4.4.0/xen/arch/x86/smpboot.c0000664000175000017500000006756612307313555014452 0ustar smbsmb/* * x86 SMP booting functions * * This inherits a great deal from Linux's SMP boot code: * (c) 1995 Alan Cox, Building #3 * (c) 1998, 1999, 2000 Ingo Molnar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define setup_trampoline() (bootsym_phys(trampoline_realmode_entry)) unsigned long __read_mostly trampoline_phys; /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_mask); /* representing HT and core siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask); cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); struct cpuinfo_x86 cpu_data[NR_CPUS]; u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = BAD_APICID }; static int cpu_error; static enum cpu_state { CPU_STATE_DYING, /* slave -> master: I am dying */ CPU_STATE_DEAD, /* slave -> master: I am completely dead */ CPU_STATE_INIT, /* master -> slave: Early bringup phase 1 */ CPU_STATE_CALLOUT, /* master -> slave: Early bringup phase 2 */ CPU_STATE_CALLIN, /* slave -> master: Completed phase 2 */ CPU_STATE_ONLINE /* master -> slave: Go fully online now. */ } cpu_state; #define set_cpu_state(state) do { mb(); cpu_state = (state); } while (0) void *stack_base[NR_CPUS]; static void smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = cpu_data + id; *c = boot_cpu_data; if ( id != 0 ) identify_cpu(c); /* * Certain Athlons might work (for various values of 'work') in SMP * but they are not certified as MP capable. */ if ( (c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6) ) { /* Athlon 660/661 is valid. */ if ( (c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)) ) goto valid_k7; /* Duron 670 is valid */ if ( (c->x86_model==7) && (c->x86_mask==0) ) goto valid_k7; /* * Athlon 662, Duron 671, and Athlon >model 7 have capability bit. * It's worth noting that the A5 stepping (662) of some Athlon XP's * have the MP bit set. * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more. */ if ( ((c->x86_model==6) && (c->x86_mask>=2)) || ((c->x86_model==7) && (c->x86_mask>=1)) || (c->x86_model> 7) ) if (cpu_has_mp) goto valid_k7; /* If we get here, it's not a certified SMP capable AMD system. */ add_taint(TAINT_UNSAFE_SMP); } valid_k7: ; } /* * TSC's upper 32 bits can't be written in earlier CPUs (before * Prescott), there is no way to resync one AP against BP. */ bool_t disable_tsc_sync; static atomic_t tsc_count; static uint64_t tsc_value; static cpumask_t tsc_sync_cpu_mask; static void synchronize_tsc_master(unsigned int slave) { unsigned int i; if ( disable_tsc_sync ) return; if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) && !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) ) return; for ( i = 1; i <= 5; i++ ) { rdtscll(tsc_value); wmb(); atomic_inc(&tsc_count); while ( atomic_read(&tsc_count) != (i<<1) ) cpu_relax(); } atomic_set(&tsc_count, 0); cpumask_clear_cpu(slave, &tsc_sync_cpu_mask); } static void synchronize_tsc_slave(unsigned int slave) { unsigned int i; if ( disable_tsc_sync ) return; if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) && !cpumask_test_cpu(slave, &tsc_sync_cpu_mask) ) return; for ( i = 1; i <= 5; i++ ) { while ( atomic_read(&tsc_count) != ((i<<1)-1) ) cpu_relax(); rmb(); /* * If a CPU has been physically hotplugged, we may as well write * to its TSC in spite of X86_FEATURE_TSC_RELIABLE. The platform does * not sync up a new CPU's TSC for us. */ __write_tsc(tsc_value); atomic_inc(&tsc_count); } } static void smp_callin(void) { unsigned int cpu = smp_processor_id(); int i, rc; /* Wait 2s total for startup. */ Dprintk("Waiting for CALLOUT.\n"); for ( i = 0; cpu_state != CPU_STATE_CALLOUT; i++ ) { BUG_ON(i >= 200); cpu_relax(); mdelay(10); } /* * The boot CPU has finished the init stage and is spinning on cpu_state * update until we finish. We are free to set up this CPU: first the APIC. */ Dprintk("CALLIN, before setup_local_APIC().\n"); x2apic_ap_setup(); setup_local_APIC(); /* Save our processor parameters. */ smp_store_cpu_info(cpu); if ( (rc = hvm_cpu_up()) != 0 ) { printk("CPU%d: Failed to initialise HVM. Not coming online.\n", cpu); cpu_error = rc; clear_local_APIC(); spin_debug_enable(); cpu_exit_clear(cpu); (*dead_idle)(); } /* Allow the master to continue. 
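   Added for orientation (not in the original comment): the bring-up handshake
   walks the cpu_state values roughly as

       master:  INIT -> CALLOUT ...................... ONLINE
       slave :                   -> CALLIN (set below)

   with the master only advancing to ONLINE after it has observed CALLIN, so
   the store below is the AP's half of the CALLOUT/CALLIN exchange.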
*/ set_cpu_state(CPU_STATE_CALLIN); synchronize_tsc_slave(cpu); /* And wait for our final Ack. */ while ( cpu_state != CPU_STATE_ONLINE ) cpu_relax(); } static int booting_cpu; /* CPUs for which sibling maps can be computed. */ static cpumask_t cpu_sibling_setup_map; static void link_thread_siblings(int cpu1, int cpu2) { cpumask_set_cpu(cpu1, per_cpu(cpu_sibling_mask, cpu2)); cpumask_set_cpu(cpu2, per_cpu(cpu_sibling_mask, cpu1)); cpumask_set_cpu(cpu1, per_cpu(cpu_core_mask, cpu2)); cpumask_set_cpu(cpu2, per_cpu(cpu_core_mask, cpu1)); } static void set_cpu_sibling_map(int cpu) { int i; struct cpuinfo_x86 *c = cpu_data; cpumask_set_cpu(cpu, &cpu_sibling_setup_map); if ( c[cpu].x86_num_siblings > 1 ) { for_each_cpu ( i, &cpu_sibling_setup_map ) { if ( cpu_has(c, X86_FEATURE_TOPOEXT) ) { if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) && (c[cpu].compute_unit_id == c[i].compute_unit_id) ) link_thread_siblings(cpu, i); } else if ( (c[cpu].phys_proc_id == c[i].phys_proc_id) && (c[cpu].cpu_core_id == c[i].cpu_core_id) ) { link_thread_siblings(cpu, i); } } } else { cpumask_set_cpu(cpu, per_cpu(cpu_sibling_mask, cpu)); } if ( c[cpu].x86_max_cores == 1 ) { cpumask_copy(per_cpu(cpu_core_mask, cpu), per_cpu(cpu_sibling_mask, cpu)); c[cpu].booted_cores = 1; return; } for_each_cpu ( i, &cpu_sibling_setup_map ) { if ( c[cpu].phys_proc_id == c[i].phys_proc_id ) { cpumask_set_cpu(i, per_cpu(cpu_core_mask, cpu)); cpumask_set_cpu(cpu, per_cpu(cpu_core_mask, i)); /* * Does this new cpu bringup a new core? */ if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 ) { /* * for each core in package, increment * the booted_cores for this new cpu */ if ( cpumask_first(per_cpu(cpu_sibling_mask, i)) == i ) c[cpu].booted_cores++; /* * increment the core count for all * the other cpus in this package */ if ( i != cpu ) c[i].booted_cores++; } else if ( (i != cpu) && !c[cpu].booted_cores ) { c[cpu].booted_cores = c[i].booted_cores; } } } } static void construct_percpu_idt(unsigned int cpu) { unsigned char idt_load[10]; *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1; *(unsigned long *)(&idt_load[2]) = (unsigned long)idt_tables[cpu]; __asm__ __volatile__ ( "lidt %0" : "=m" (idt_load) ); } void start_secondary(void *unused) { /* * Dont put anything before smp_callin(), SMP booting is so fragile that we * want to limit the things done here to the most necessary things. */ unsigned int cpu = booting_cpu; set_processor_id(cpu); set_current(idle_vcpu[cpu]); this_cpu(curr_vcpu) = idle_vcpu[cpu]; if ( cpu_has_efer ) rdmsrl(MSR_EFER, this_cpu(efer)); asm volatile ( "mov %%cr4,%0" : "=r" (this_cpu(cr4)) ); /* * Just as during early bootstrap, it is convenient here to disable * spinlock checking while we have IRQs disabled. This allows us to * acquire IRQ-unsafe locks when it would otherwise be disallowed. * * It is safe because the race we are usually trying to avoid involves * a group of CPUs rendezvousing in an IPI handler, where one cannot * join because it is spinning with IRQs disabled waiting to acquire a * lock held by another in the rendezvous group (the lock must be an * IRQ-unsafe lock since the CPU took the IPI after acquiring it, and * hence had IRQs enabled). This is a deadlock scenario. * * However, no CPU can be involved in rendezvous until it is online, * hence no such group can be waiting for this CPU until it is * visible in cpu_online_map. Hence such a deadlock is not possible. 
*/ spin_debug_disable(); percpu_traps_init(); cpu_init(); smp_callin(); /* * At this point, boot CPU has fully initialised the IDT. It is * now safe to make ourselves a private copy. */ construct_percpu_idt(cpu); setup_secondary_APIC_clock(); /* * low-memory mappings have been cleared, flush them from * the local TLBs too. */ flush_tlb_local(); /* This must be done before setting cpu_online_map */ spin_debug_enable(); set_cpu_sibling_map(cpu); notify_cpu_starting(cpu); wmb(); /* * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. */ lock_vector_lock(); __setup_vector_irq(cpu); cpumask_set_cpu(cpu, &cpu_online_map); unlock_vector_lock(); init_percpu_time(); /* We can take interrupts now: we're officially "up". */ local_irq_enable(); mtrr_ap_init(); microcode_resume_cpu(cpu); wmb(); startup_cpu_idle_loop(); } extern void *stack_start; static int wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) { unsigned long send_status = 0, accept_status = 0; int maxlvt, timeout, num_starts, i; /* * Be paranoid about clearing APIC errors. */ if ( APIC_INTEGRATED(apic_version[phys_apicid]) ) { apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } Dprintk("Asserting INIT.\n"); /* * Turn INIT on target chip via IPI */ apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); if ( !x2apic_enabled ) { Dprintk("Waiting for send to finish...\n"); timeout = 0; do { Dprintk("+"); udelay(100); send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; } while ( send_status && (timeout++ < 1000) ); mdelay(10); Dprintk("Deasserting INIT.\n"); apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); Dprintk("Waiting for send to finish...\n"); timeout = 0; do { Dprintk("+"); udelay(100); send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; } while ( send_status && (timeout++ < 1000) ); } else if ( tboot_in_measured_env() ) { /* * With tboot AP is actually spinning in a mini-guest before * receiving INIT. Upon receiving INIT ipi, AP need time to VMExit, * update VMCS to tracking SIPIs and VMResume. * * While AP is in root mode handling the INIT the CPU will drop * any SIPIs */ udelay(10); } /* * Should we send STARTUP IPIs ? * * Determine this based on the APIC version. * If we don't have an integrated APIC, don't send the STARTUP IPIs. */ num_starts = APIC_INTEGRATED(apic_version[phys_apicid]) ? 2 : 0; /* Run STARTUP IPI loop. */ Dprintk("#startup loops: %d.\n", num_starts); maxlvt = get_maxlvt(); for ( i = 0; i < num_starts; i++ ) { Dprintk("Sending STARTUP #%d.\n", i+1); apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); Dprintk("After apic_write.\n"); /* * STARTUP IPI * Boot on the stack */ apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid); if ( !x2apic_enabled ) { /* Give the other CPU some time to accept the IPI. */ udelay(300); Dprintk("Startup point 1.\n"); Dprintk("Waiting for send to finish...\n"); timeout = 0; do { Dprintk("+"); udelay(100); send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; } while ( send_status && (timeout++ < 1000) ); /* Give the other CPU some time to accept the IPI. */ udelay(200); } /* Due to the Pentium erratum 3AP. 
*/ if ( maxlvt > 3 ) { apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); } accept_status = (apic_read(APIC_ESR) & 0xEF); if ( send_status || accept_status ) break; } Dprintk("After Startup.\n"); if ( send_status ) printk("APIC never delivered???\n"); if ( accept_status ) printk("APIC delivery error (%lx).\n", accept_status); return (send_status | accept_status); } int alloc_cpu_id(void) { cpumask_t tmp_map; int cpu; cpumask_complement(&tmp_map, &cpu_present_map); cpu = cpumask_first(&tmp_map); return (cpu < nr_cpu_ids) ? cpu : -ENODEV; } static int do_boot_cpu(int apicid, int cpu) { int timeout, boot_error = 0, rc = 0; unsigned long start_eip; /* * Save current MTRR state in case it was changed since early boot * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: */ mtrr_save_state(); booting_cpu = cpu; /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); /* So we see what's up */ if ( opt_cpu_info ) printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); stack_start = stack_base[cpu]; /* This grunge runs the startup process for the targeted processor. */ set_cpu_state(CPU_STATE_INIT); Dprintk("Setting warm reset code and vector.\n"); smpboot_setup_warm_reset_vector(start_eip); /* Starting actual IPI sequence... */ if ( !tboot_in_measured_env() || tboot_wake_ap(apicid, start_eip) ) boot_error = wakeup_secondary_cpu(apicid, start_eip); if ( !boot_error ) { /* Allow AP to start initializing. */ set_cpu_state(CPU_STATE_CALLOUT); Dprintk("After Callout %d.\n", cpu); /* Wait 5s total for a response. */ for ( timeout = 0; timeout < 50000; timeout++ ) { if ( cpu_state != CPU_STATE_CALLOUT ) break; udelay(100); } if ( cpu_state == CPU_STATE_CALLIN ) { /* number CPUs logically, starting from 1 (BSP is 0) */ Dprintk("OK.\n"); print_cpu_info(cpu); synchronize_tsc_master(cpu); Dprintk("CPU has booted.\n"); } else if ( cpu_state == CPU_STATE_DEAD ) { rmb(); rc = cpu_error; } else { boot_error = 1; mb(); if ( bootsym(trampoline_cpu_started) == 0xA5 ) /* trampoline started but...? 
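 * (trampoline_cpu_started is presumed to be set to 0xA5 by the real-mode
 * trampoline code itself, so reading 0xA5 here suggests the AP executed
 * the trampoline but never made it as far as smp_callin())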
*/ printk("Stuck ??\n"); else /* trampoline code not run */ printk("Not responding.\n"); } } if ( boot_error ) { cpu_exit_clear(cpu); rc = -EIO; } /* mark "stuck" area as not stuck */ bootsym(trampoline_cpu_started) = 0; mb(); smpboot_restore_warm_reset_vector(); return rc; } void cpu_exit_clear(unsigned int cpu) { cpu_uninit(cpu); set_cpu_state(CPU_STATE_DEAD); } static void cpu_smpboot_free(unsigned int cpu) { unsigned int order; free_cpumask_var(per_cpu(cpu_sibling_mask, cpu)); free_cpumask_var(per_cpu(cpu_core_mask, cpu)); order = get_order_from_pages(NR_RESERVED_GDT_PAGES); free_xenheap_pages(per_cpu(gdt_table, cpu), order); free_xenheap_pages(per_cpu(compat_gdt_table, cpu), order); order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t)); free_xenheap_pages(idt_tables[cpu], order); idt_tables[cpu] = NULL; if ( stack_base[cpu] != NULL ) { memguard_unguard_stack(stack_base[cpu]); free_xenheap_pages(stack_base[cpu], STACK_ORDER); stack_base[cpu] = NULL; } } static int cpu_smpboot_alloc(unsigned int cpu) { unsigned int order; struct desc_struct *gdt; stack_base[cpu] = alloc_xenheap_pages(STACK_ORDER, 0); if ( stack_base[cpu] == NULL ) goto oom; memguard_guard_stack(stack_base[cpu]); order = get_order_from_pages(NR_RESERVED_GDT_PAGES); per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(order, MEMF_node(cpu_to_node(cpu))); if ( gdt == NULL ) goto oom; memcpy(gdt, boot_cpu_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE); BUILD_BUG_ON(NR_CPUS > 0x10000); gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; per_cpu(compat_gdt_table, cpu) = gdt = alloc_xenheap_pages(order, MEMF_node(cpu_to_node(cpu))); if ( gdt == NULL ) goto oom; memcpy(gdt, boot_cpu_compat_gdt_table, NR_RESERVED_GDT_PAGES * PAGE_SIZE); gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu; order = get_order_from_bytes(IDT_ENTRIES * sizeof(idt_entry_t)); idt_tables[cpu] = alloc_xenheap_pages(order, MEMF_node(cpu_to_node(cpu))); if ( idt_tables[cpu] == NULL ) goto oom; memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES * sizeof(idt_entry_t)); if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) && zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) ) return 0; oom: cpu_smpboot_free(cpu); return -ENOMEM; } static int cpu_smpboot_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; switch ( action ) { case CPU_UP_PREPARE: rc = cpu_smpboot_alloc(cpu); break; case CPU_UP_CANCELED: case CPU_DEAD: cpu_smpboot_free(cpu); break; default: break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_smpboot_nfb = { .notifier_call = cpu_smpboot_callback }; void __init smp_prepare_cpus(unsigned int max_cpus) { register_cpu_notifier(&cpu_smpboot_nfb); mtrr_aps_sync_begin(); /* Setup boot CPU information */ smp_store_cpu_info(0); /* Final full version of the data */ print_cpu_info(0); boot_cpu_physical_apicid = get_apic_id(); x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; stack_base[0] = stack_start; if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, 0)) || !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) ) panic("No memory for boot CPU sibling/core maps"); set_cpu_sibling_map(0); /* * If we couldn't find an SMP configuration at boot time, * get out of here now! 
*/ if ( !smp_found_config && !acpi_lapic ) { printk(KERN_NOTICE "SMP motherboard not detected.\n"); init_uniprocessor: physids_clear(phys_cpu_present_map); physid_set(0, phys_cpu_present_map); if (APIC_init_uniprocessor()) printk(KERN_NOTICE "Local APIC not detected." " Using dummy APIC emulation.\n"); return; } /* * Should not be necessary because the MP table should list the boot * CPU too, but we do it for the sake of robustness anyway. * Makes no sense to do this check in clustered apic mode, so skip it */ if ( !check_apicid_present(boot_cpu_physical_apicid) ) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", boot_cpu_physical_apicid); physid_set(hard_smp_processor_id(), phys_cpu_present_map); } /* If we couldn't find a local APIC, then get out of here now! */ if ( APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic ) { printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); goto init_uniprocessor; } verify_local_APIC(); connect_bsp_APIC(); setup_local_APIC(); smpboot_setup_io_apic(); setup_boot_APIC_clock(); } void __init smp_prepare_boot_cpu(void) { cpumask_set_cpu(smp_processor_id(), &cpu_online_map); cpumask_set_cpu(smp_processor_id(), &cpu_present_map); } static void remove_siblinginfo(int cpu) { int sibling; struct cpuinfo_x86 *c = cpu_data; for_each_cpu ( sibling, per_cpu(cpu_core_mask, cpu) ) { cpumask_clear_cpu(cpu, per_cpu(cpu_core_mask, sibling)); /* Last thread sibling in this cpu core going down. */ if ( cpumask_weight(per_cpu(cpu_sibling_mask, cpu)) == 1 ) c[sibling].booted_cores--; } for_each_cpu(sibling, per_cpu(cpu_sibling_mask, cpu)) cpumask_clear_cpu(cpu, per_cpu(cpu_sibling_mask, sibling)); cpumask_clear(per_cpu(cpu_sibling_mask, cpu)); cpumask_clear(per_cpu(cpu_core_mask, cpu)); c[cpu].phys_proc_id = BAD_APICID; c[cpu].cpu_core_id = BAD_APICID; c[cpu].compute_unit_id = BAD_APICID; cpumask_clear_cpu(cpu, &cpu_sibling_setup_map); } void __cpu_disable(void) { int cpu = smp_processor_id(); set_cpu_state(CPU_STATE_DYING); local_irq_disable(); clear_local_APIC(); /* Allow any queued timer interrupts to get serviced */ local_irq_enable(); mdelay(1); local_irq_disable(); time_suspend(); remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ cpumask_clear_cpu(cpu, cpupool0->cpu_valid); cpumask_clear_cpu(cpu, &cpu_online_map); fixup_irqs(); if ( cpu_disable_scheduler(cpu) ) BUG(); } void __cpu_die(unsigned int cpu) { /* We don't do anything here: idle task is faking death itself. */ unsigned int i = 0; enum cpu_state seen_state; while ( (seen_state = cpu_state) != CPU_STATE_DEAD ) { BUG_ON(seen_state != CPU_STATE_DYING); mdelay(100); cpu_relax(); process_pending_softirqs(); if ( (++i % 10) == 0 ) printk(KERN_ERR "CPU %u still not dead...\n", cpu); } } int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm) { int node, cpu = -1; dprintk(XENLOG_DEBUG, "cpu_add apic_id %x acpi_id %x pxm %x\n", apic_id, acpi_id, pxm); if ( (acpi_id >= MAX_MADT_ENTRIES) || (apic_id >= MAX_APICS) || (pxm >= 256) ) return -EINVAL; if ( !cpu_hotplug_begin() ) return -EBUSY; /* Detect if the cpu has been added before */ if ( x86_acpiid_to_apicid[acpi_id] != BAD_APICID ) { cpu = (x86_acpiid_to_apicid[acpi_id] != apic_id) ? 
-EINVAL : -EEXIST; goto out; } if ( physid_isset(apic_id, phys_cpu_present_map) ) { cpu = -EEXIST; goto out; } if ( (cpu = mp_register_lapic(apic_id, 1, 1)) < 0 ) goto out; x86_acpiid_to_apicid[acpi_id] = apic_id; if ( !srat_disabled() ) { if ( (node = setup_node(pxm)) < 0 ) { dprintk(XENLOG_WARNING, "Setup node failed for pxm %x\n", pxm); x86_acpiid_to_apicid[acpi_id] = BAD_APICID; mp_unregister_lapic(apic_id, cpu); cpu = node; goto out; } apicid_to_node[apic_id] = node; } /* Physically added CPUs do not have synchronised TSC. */ if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) ) { static bool_t once_only; if ( !test_and_set_bool(once_only) ) printk(XENLOG_WARNING " ** New physical CPU %u may have skewed TSC and hence " "break assumed cross-CPU TSC coherency.\n" " ** Consider using boot parameter \"tsc=skewed\" " "which forces TSC emulation where appropriate.\n", cpu); cpumask_set_cpu(cpu, &tsc_sync_cpu_mask); } srat_detect_node(cpu); numa_add_cpu(cpu); dprintk(XENLOG_INFO, "Add CPU %x with index %x\n", apic_id, cpu); out: cpu_hotplug_done(); return cpu; } int __cpu_up(unsigned int cpu) { int apicid, ret; if ( (apicid = x86_cpu_to_apicid[cpu]) == BAD_APICID ) return -ENODEV; if ( (ret = do_boot_cpu(apicid, cpu)) != 0 ) return ret; set_cpu_state(CPU_STATE_ONLINE); while ( !cpu_online(cpu) ) { cpu_relax(); process_pending_softirqs(); } return 0; } void __init smp_cpus_done(void) { /* * Don't taint if we are running SMP kernel on a single non-MP * approved Athlon */ if ( tainted & TAINT_UNSAFE_SMP ) { if ( num_online_cpus() > 1 ) printk(KERN_INFO "WARNING: This combination of AMD " "processors is not suitable for SMP.\n"); else tainted &= ~TAINT_UNSAFE_SMP; } if ( nmi_watchdog == NMI_LOCAL_APIC ) check_nmi_watchdog(); setup_ioapic_dest(); mtrr_save_state(); mtrr_aps_sync_end(); } void __init smp_intr_init(void) { int irq, vector, seridx, cpu = smp_processor_id(); /* * IRQ0 must be given a fixed assignment and initialized, * because it's used before the IO-APIC is set up. */ irq_to_desc(0)->arch.vector = IRQ0_VECTOR; /* * Also ensure serial interrupts are high priority. We do not * want them to be blocked by unacknowledged guest-bound interrupts. */ for ( seridx = 0; seridx <= SERHND_IDX; seridx++ ) { if ( (irq = serial_irq(seridx)) < 0 ) continue; vector = alloc_hipriority_vector(); per_cpu(vector_irq, cpu)[vector] = irq; irq_to_desc(irq)->arch.vector = vector; cpumask_copy(irq_to_desc(irq)->arch.cpu_mask, &cpu_online_map); } /* Direct IPI vectors. 
*/ set_direct_apic_vector(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_direct_apic_vector(EVENT_CHECK_VECTOR, event_check_interrupt); set_direct_apic_vector(INVALIDATE_TLB_VECTOR, invalidate_interrupt); set_direct_apic_vector(CALL_FUNCTION_VECTOR, call_function_interrupt); } xen-4.4.0/xen/arch/x86/bitops.c0000664000175000017500000000541712307313555014252 0ustar smbsmb #include #include unsigned int __find_first_bit( const unsigned long *addr, unsigned int size) { unsigned long d0, d1, res; asm volatile ( "1: xor %%eax,%%eax\n\t" /* also ensures ZF==1 if size==0 */ " repe; scas"__OS"\n\t" " je 2f\n\t" " bsf -"STR(BITS_PER_LONG/8)"(%2),%0\n\t" " jz 1b\n\t" " lea -"STR(BITS_PER_LONG/8)"(%2),%2\n\t" "2: sub %%ebx,%%edi\n\t" " shl $3,%%edi\n\t" " add %%edi,%%eax" : "=&a" (res), "=&c" (d0), "=&D" (d1) : "1" (BITS_TO_LONGS(size)), "2" (addr), "b" ((int)(long)addr) : "memory" ); return res; } unsigned int __find_next_bit( const unsigned long *addr, unsigned int size, unsigned int offset) { const unsigned long *p = addr + (offset / BITS_PER_LONG); unsigned int set, bit = offset & (BITS_PER_LONG - 1); ASSERT(offset <= size); if ( bit != 0 ) { /* Look for a bit in the first word. */ set = __scanbit(*p >> bit, BITS_PER_LONG - bit); if ( set < (BITS_PER_LONG - bit) ) return (offset + set); offset += BITS_PER_LONG - bit; p++; } if ( offset >= size ) return size; /* Search remaining full words for a bit. */ set = __find_first_bit(p, size - offset); return (offset + set); } unsigned int __find_first_zero_bit( const unsigned long *addr, unsigned int size) { unsigned long d0, d1, d2, res; asm volatile ( "1: xor %%eax,%%eax ; not %3\n\t" /* rAX == ~0ul */ " xor %%edx,%%edx\n\t" /* also ensures ZF==1 if size==0 */ " repe; scas"__OS"\n\t" " je 2f\n\t" " xor -"STR(BITS_PER_LONG/8)"(%2),%3\n\t" " jz 1b\n\t" " bsf %3,%0\n\t" " lea -"STR(BITS_PER_LONG/8)"(%2),%2\n\t" "2: sub %%ebx,%%edi\n\t" " shl $3,%%edi\n\t" " add %%edi,%%edx" : "=&d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) : "1" (BITS_TO_LONGS(size)), "2" (addr), "b" ((int)(long)addr) : "memory" ); return res; } unsigned int __find_next_zero_bit( const unsigned long *addr, unsigned int size, unsigned int offset) { const unsigned long *p = addr + (offset / BITS_PER_LONG); unsigned int set, bit = offset & (BITS_PER_LONG - 1); ASSERT(offset <= size); if ( bit != 0 ) { /* Look for zero in the first word. */ set = __scanbit(~(*p >> bit), BITS_PER_LONG - bit); if ( set < (BITS_PER_LONG - bit) ) return (offset + set); offset += BITS_PER_LONG - bit; p++; } if ( offset >= size ) return size; /* Search remaining full words for a zero. */ set = __find_first_zero_bit(p, size - offset); return (offset + set); } xen-4.4.0/xen/arch/x86/traps.c0000664000175000017500000033561412307313555014110 0ustar smbsmb/****************************************************************************** * arch/x86/traps.c * * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Copyright (C) 1991, 1992 Linus Torvalds * * Pentium III FXSR, SSE support * Gareth Hughes , May 2000 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * opt_nmi: one of 'ignore', 'dom0', or 'fatal'. * fatal: Xen prints diagnostic message and then hangs. * dom0: The NMI is virtualised to DOM0. * ignore: The NMI error is cleared and ignored. */ #ifdef NDEBUG static char __read_mostly opt_nmi[10] = "dom0"; #else static char __read_mostly opt_nmi[10] = "fatal"; #endif string_param("nmi", opt_nmi); DEFINE_PER_CPU(u64, efer); static DEFINE_PER_CPU(unsigned long, last_extable_addr); DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr); DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table); DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table); /* Master table, used by CPU0. */ idt_entry_t idt_table[IDT_ENTRIES]; /* Pointer to the IDT of every CPU. */ idt_entry_t *idt_tables[NR_CPUS] __read_mostly; void (*ioemul_handle_quirk)( u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs); static int debug_stack_lines = 20; integer_param("debug_stack_lines", debug_stack_lines); static bool_t __devinitdata opt_ler; boolean_param("ler", opt_ler); #define stack_words_per_line 4 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp) static void show_guest_stack(struct vcpu *v, struct cpu_user_regs *regs) { int i; unsigned long *stack, addr; unsigned long mask = STACK_SIZE; /* Avoid HVM as we don't know what the stack looks like. */ if ( is_hvm_vcpu(v) ) return; if ( is_pv_32on64_vcpu(v) ) { compat_show_guest_stack(v, regs, debug_stack_lines); return; } if ( vm86_mode(regs) ) { stack = (unsigned long *)((regs->ss << 4) + (regs->esp & 0xffff)); printk("Guest stack trace from ss:sp = %04x:%04x (VM86)\n ", regs->ss, (uint16_t)(regs->esp & 0xffff)); } else { stack = (unsigned long *)regs->esp; printk("Guest stack trace from "__OP"sp=%p:\n ", stack); } if ( !access_ok(stack, sizeof(*stack)) ) { printk("Guest-inaccessible memory.\n"); return; } if ( v != current ) { struct vcpu *vcpu; ASSERT(guest_kernel_mode(v, regs)); vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL; if ( !vcpu ) { stack = do_page_walk(v, (unsigned long)stack); if ( (unsigned long)stack < PAGE_SIZE ) { printk("Inaccessible guest memory.\n"); return; } mask = PAGE_SIZE; } } for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ ) { if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask ) break; if ( __get_user(addr, stack) ) { if ( i != 0 ) printk("\n "); printk("Fault while accessing guest memory."); i = 1; break; } if ( (i != 0) && ((i % stack_words_per_line) == 0) ) printk("\n "); printk(" %p", _p(addr)); stack++; } if ( mask == PAGE_SIZE ) { BUILD_BUG_ON(PAGE_SIZE == STACK_SIZE); unmap_domain_page(stack); } if ( i == 0 ) printk("Stack empty."); printk("\n"); } #if !defined(CONFIG_FRAME_POINTER) /* * Stack trace from pointers found in stack, unaided by frame pointers. 
For * caller convenience, this has the same prototype as its alternative, and * simply ignores the base pointer parameter. */ static void _show_trace(unsigned long sp, unsigned long __maybe_unused bp) { unsigned long *stack = (unsigned long *)sp, addr; unsigned long *bottom = (unsigned long *)get_printable_stack_bottom(sp); while ( stack <= bottom ) { addr = *stack++; if ( is_active_kernel_text(addr) ) printk(" [<%p>] %pS\n", _p(addr), _p(addr)); } } #else /* Stack trace from frames in the stack, using frame pointers */ static void _show_trace(unsigned long sp, unsigned long bp) { unsigned long *frame, next, addr; /* Bounds for range of valid frame pointer. */ unsigned long low = sp, high = get_printable_stack_bottom(sp); /* The initial frame pointer. */ next = bp; for ( ; ; ) { /* Valid frame pointer? */ if ( (next < low) || (next >= high) ) { /* * Exception stack frames have a different layout, denoted by an * inverted frame pointer. */ next = ~next; if ( (next < low) || (next >= high) ) break; frame = (unsigned long *)next; next = frame[0]; addr = frame[(offsetof(struct cpu_user_regs, eip) - offsetof(struct cpu_user_regs, ebp)) / BYTES_PER_LONG]; } else { /* Ordinary stack frame. */ frame = (unsigned long *)next; next = frame[0]; addr = frame[1]; } printk(" [<%p>] %pS\n", _p(addr), _p(addr)); low = (unsigned long)&frame[2]; } } #endif static void show_trace(const struct cpu_user_regs *regs) { unsigned long *sp = ESP_BEFORE_EXCEPTION(regs); printk("Xen call trace:\n"); /* * If RIP looks sensible, or the top of the stack doesn't, print RIP at * the top of the stack trace. */ if ( is_active_kernel_text(regs->rip) || !is_active_kernel_text(*sp) ) printk(" [<%p>] %pS\n", _p(regs->rip), _p(regs->rip)); /* * Else RIP looks bad but the top of the stack looks good. Perhaps we * followed a wild function pointer? Lets assume the top of the stack is a * return address; print it and skip past so _show_trace() doesn't print * it again. */ else { printk(" [<%p>] %pS\n", _p(*sp), _p(*sp)); sp++; } _show_trace((unsigned long)sp, regs->rbp); printk("\n"); } void show_stack(struct cpu_user_regs *regs) { unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), addr; int i; if ( guest_mode(regs) ) return show_guest_stack(current, regs); printk("Xen stack trace from "__OP"sp=%p:\n ", stack); for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ ) { if ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) == 0 ) break; if ( (i != 0) && ((i % stack_words_per_line) == 0) ) printk("\n "); addr = *stack++; printk(" %p", _p(addr)); } if ( i == 0 ) printk("Stack empty."); printk("\n"); show_trace(regs); } void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs) { #ifdef MEMORY_GUARD unsigned long esp = regs->rsp; unsigned long esp_top, esp_bottom; esp_bottom = (esp | (STACK_SIZE - 1)) + 1; esp_top = esp_bottom - PRIMARY_STACK_SIZE; printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n", (void *)esp_top, (void *)esp_bottom, (void *)esp, (void *)per_cpu(init_tss, cpu).esp0); /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */ if ( ((unsigned long)(esp - esp_top) > 512) && ((unsigned long)(esp_top - esp) > 512) ) { printk("No stack overflow detected. 
Skipping stack trace.\n"); return; } if ( esp < esp_top ) esp = esp_top; printk("Xen stack overflow (dumping trace %p-%p):\n", (void *)esp, (void *)esp_bottom); _show_trace(esp, regs->rbp); printk("\n"); #endif } void show_execution_state(struct cpu_user_regs *regs) { show_registers(regs); show_stack(regs); } void vcpu_show_execution_state(struct vcpu *v) { printk("*** Dumping Dom%d vcpu#%d state: ***\n", v->domain->domain_id, v->vcpu_id); if ( v == current ) { show_execution_state(guest_cpu_user_regs()); return; } vcpu_pause(v); /* acceptably dangerous */ vcpu_show_registers(v); if ( guest_kernel_mode(v, &v->arch.user_regs) ) show_guest_stack(v, &v->arch.user_regs); vcpu_unpause(v); } static char *trapstr(int trapnr) { static char *strings[] = { "divide error", "debug", "nmi", "bkpt", "overflow", "bounds", "invalid opcode", "device not available", "double fault", "coprocessor segment", "invalid tss", "segment not found", "stack error", "general protection fault", "page fault", "spurious interrupt", "coprocessor error", "alignment check", "machine check", "simd error" }; if ( (trapnr < 0) || (trapnr >= ARRAY_SIZE(strings)) ) return "???"; return strings[trapnr]; } /* * This is called for faults at very unexpected times (e.g., when interrupts * are disabled). In such situations we can't do much that is safe. We try to * print out some tracing and then we just spin. */ void fatal_trap(int trapnr, struct cpu_user_regs *regs) { static DEFINE_PER_CPU(char, depth); /* * In some cases, we can end up in a vicious cycle of fatal_trap()s * within fatal_trap()s. We give the problem a couple of iterations to * bottom out, and then we just panic. */ if ( ++this_cpu(depth) < 3 ) { watchdog_disable(); console_start_sync(); show_execution_state(regs); if ( trapnr == TRAP_page_fault ) { unsigned long cr2 = read_cr2(); printk("Faulting linear address: %p\n", _p(cr2)); show_page_walk(cr2); } } panic("FATAL TRAP: vector = %d (%s)\n" "[error_code=%04x] %s", trapnr, trapstr(trapnr), regs->error_code, (regs->eflags & X86_EFLAGS_IF) ? 
"" : ", IN INTERRUPT CONTEXT"); } static void do_guest_trap( int trapnr, const struct cpu_user_regs *regs, int use_error_code) { struct vcpu *v = current; struct trap_bounce *tb; const struct trap_info *ti; trace_pv_trap(trapnr, regs->eip, use_error_code, regs->error_code); tb = &v->arch.pv_vcpu.trap_bounce; ti = &v->arch.pv_vcpu.trap_ctxt[trapnr]; tb->flags = TBF_EXCEPTION; tb->cs = ti->cs; tb->eip = ti->address; if ( use_error_code ) { tb->flags |= TBF_EXCEPTION_ERRCODE; tb->error_code = regs->error_code; } if ( TI_GET_IF(ti) ) tb->flags |= TBF_INTERRUPT; if ( unlikely(null_trap_bounce(v, tb)) ) gdprintk(XENLOG_WARNING, "Unhandled %s fault/trap [#%d] " "on VCPU %d [ec=%04x]\n", trapstr(trapnr), trapnr, v->vcpu_id, regs->error_code); } static void instruction_done( struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch) { regs->eip = eip; regs->eflags &= ~X86_EFLAGS_RF; if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) ) { current->arch.debugreg[6] |= bpmatch | 0xffff0ff0; if ( regs->eflags & X86_EFLAGS_TF ) current->arch.debugreg[6] |= 0x4000; do_guest_trap(TRAP_debug, regs, 0); } } static unsigned int check_guest_io_breakpoint(struct vcpu *v, unsigned int port, unsigned int len) { unsigned int width, i, match = 0; unsigned long start; if ( !(v->arch.debugreg[5]) || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ) return 0; for ( i = 0; i < 4; i++ ) { if ( !(v->arch.debugreg[5] & (3 << (i * DR_ENABLE_SIZE))) ) continue; start = v->arch.debugreg[i]; width = 0; switch ( (v->arch.debugreg[7] >> (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc ) { case DR_LEN_1: width = 1; break; case DR_LEN_2: width = 2; break; case DR_LEN_4: width = 4; break; case DR_LEN_8: width = 8; break; } if ( (start < (port + len)) && ((start + width) > port) ) match |= 1 << i; } return match; } /* * Called from asm to set up the MCE trapbounce info. * Returns 0 if no callback is set up, else 1. */ int set_guest_machinecheck_trapbounce(void) { struct vcpu *v = current; struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce; do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0); tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */ return !null_trap_bounce(v, tb); } /* * Called from asm to set up the NMI trapbounce info. * Returns 0 if no callback is set up, else 1. 
*/ int set_guest_nmi_trapbounce(void) { struct vcpu *v = current; struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce; do_guest_trap(TRAP_nmi, guest_cpu_user_regs(), 0); tb->flags &= ~TBF_EXCEPTION; /* not needed for NMI delivery path */ return !null_trap_bounce(v, tb); } static inline void do_trap( int trapnr, struct cpu_user_regs *regs, int use_error_code) { struct vcpu *curr = current; unsigned long fixup; DEBUGGER_trap_entry(trapnr, regs); if ( guest_mode(regs) ) { do_guest_trap(trapnr, regs, use_error_code); return; } if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { dprintk(XENLOG_ERR, "Trap %d: %p -> %p\n", trapnr, _p(regs->eip), _p(fixup)); this_cpu(last_extable_addr) = regs->eip; regs->eip = fixup; return; } if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) && has_hvm_container_vcpu(curr) && curr->arch.hvm_vcpu.fpu_exception_callback ) { curr->arch.hvm_vcpu.fpu_exception_callback( curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs); return; } DEBUGGER_trap_fatal(trapnr, regs); show_execution_state(regs); panic("FATAL TRAP: vector = %d (%s)\n" "[error_code=%04x]", trapnr, trapstr(trapnr), regs->error_code); } #define DO_ERROR_NOCODE(trapnr, name) \ void do_##name(struct cpu_user_regs *regs) \ { \ do_trap(trapnr, regs, 0); \ } #define DO_ERROR(trapnr, name) \ void do_##name(struct cpu_user_regs *regs) \ { \ do_trap(trapnr, regs, 1); \ } DO_ERROR_NOCODE(TRAP_divide_error, divide_error) DO_ERROR_NOCODE(TRAP_overflow, overflow) DO_ERROR_NOCODE(TRAP_bounds, bounds) DO_ERROR_NOCODE(TRAP_copro_seg, coprocessor_segment_overrun) DO_ERROR( TRAP_invalid_tss, invalid_TSS) DO_ERROR( TRAP_no_segment, segment_not_present) DO_ERROR( TRAP_stack_error, stack_segment) DO_ERROR_NOCODE(TRAP_copro_error, coprocessor_error) DO_ERROR( TRAP_alignment_check, alignment_check) DO_ERROR_NOCODE(TRAP_simd_error, simd_coprocessor_error) /* Returns 0 if not handled, and non-0 for success. */ int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val) { struct domain *d = current->domain; /* Optionally shift out of the way of Viridian architectural MSRs. */ uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000; switch ( idx - base ) { case 0: /* Write hypercall page MSR. Read as zero. */ { *val = 0; return 1; } } return 0; } /* Returns 1 if handled, 0 if not and -Exx for error. */ int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val) { struct domain *d = current->domain; /* Optionally shift out of the way of Viridian architectural MSRs. */ uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000; switch ( idx - base ) { case 0: /* Write hypercall page */ { void *hypercall_page; unsigned long gmfn = val >> PAGE_SHIFT; unsigned int page_index = val & (PAGE_SIZE - 1); struct page_info *page; p2m_type_t t; if ( page_index > 0 ) { gdprintk(XENLOG_WARNING, "wrmsr hypercall page index %#x unsupported\n", page_index); return 0; } page = get_page_from_gfn(d, gmfn, &t, P2M_ALLOC); if ( !page || !get_page_type(page, PGT_writable_page) ) { if ( page ) put_page(page); if ( p2m_is_paging(t) ) { p2m_mem_paging_populate(d, gmfn); return -EAGAIN; } gdprintk(XENLOG_WARNING, "Bad GMFN %lx (MFN %lx) to MSR %08x\n", gmfn, page ? 
page_to_mfn(page) : -1UL, base); return 0; } hypercall_page = __map_domain_page(page); hypercall_page_initialise(d, hypercall_page); unmap_domain_page(hypercall_page); put_page_and_type(page); return 1; } } return 0; } int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { struct domain *d = current->domain; /* Optionally shift out of the way of Viridian architectural leaves. */ uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000; uint32_t limit; idx -= base; /* * Some Solaris PV drivers fail if max > base + 2. Help them out by * hiding the PVRDTSCP leaf if PVRDTSCP is disabled. */ limit = (d->arch.tsc_mode < TSC_MODE_PVRDTSCP) ? 2 : 3; if ( idx > limit ) return 0; switch ( idx ) { case 0: *eax = base + limit; /* Largest leaf */ *ebx = XEN_CPUID_SIGNATURE_EBX; *ecx = XEN_CPUID_SIGNATURE_ECX; *edx = XEN_CPUID_SIGNATURE_EDX; break; case 1: *eax = (xen_major_version() << 16) | xen_minor_version(); *ebx = 0; /* Reserved */ *ecx = 0; /* Reserved */ *edx = 0; /* Reserved */ break; case 2: *eax = 1; /* Number of hypercall-transfer pages */ *ebx = 0x40000000; /* MSR base address */ if ( is_viridian_domain(d) ) *ebx = 0x40000200; *ecx = 0; /* Features 1 */ *edx = 0; /* Features 2 */ if ( is_pv_vcpu(current) ) *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD; break; case 3: *eax = *ebx = *ecx = *edx = 0; cpuid_time_leaf( sub_idx, eax, ebx, ecx, edx ); break; default: BUG(); } return 1; } void pv_cpuid(struct cpu_user_regs *regs) { uint32_t a, b, c, d; a = regs->eax; b = regs->ebx; c = regs->ecx; d = regs->edx; if ( current->domain->domain_id != 0 ) { unsigned int cpuid_leaf = a, sub_leaf = c; if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) ) domain_cpuid(current->domain, a, c, &a, &b, &c, &d); switch ( cpuid_leaf ) { case 0xd: { unsigned int _eax, _ebx, _ecx, _edx; /* EBX value of main leaf 0 depends on enabled xsave features */ if ( sub_leaf == 0 && current->arch.xcr0 ) { /* reset EBX to default value first */ b = XSTATE_AREA_MIN_SIZE; for ( sub_leaf = 2; sub_leaf < 63; sub_leaf++ ) { if ( !(current->arch.xcr0 & (1ULL << sub_leaf)) ) continue; domain_cpuid(current->domain, cpuid_leaf, sub_leaf, &_eax, &_ebx, &_ecx, &_edx); if ( (_eax + _ebx) > b ) b = _eax + _ebx; } } break; } } goto out; } asm ( "cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "0" (a), "1" (b), "2" (c), "3" (d) ); if ( (regs->eax & 0x7fffffff) == 0x00000001 ) { /* Modify Feature Information. */ __clear_bit(X86_FEATURE_VME, &d); if ( !cpu_has_apic ) __clear_bit(X86_FEATURE_APIC, &d); __clear_bit(X86_FEATURE_PSE, &d); __clear_bit(X86_FEATURE_PGE, &d); __clear_bit(X86_FEATURE_PSE36, &d); } switch ( (uint32_t)regs->eax ) { case 0x00000001: /* Modify Feature Information. 
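 *
 * The "% 32" in the __clear_bit()/__set_bit() calls below relies on the
 * X86_FEATURE_* constants encoding (word * 32 + bit); taking the value
 * modulo 32 leaves just the bit position within the single 32-bit
 * register image (c or d) being edited, e.g.
 *
 *     __clear_bit(X86_FEATURE_MWAIT % 32, &c);
 *
 * clears the MWAIT bit in the guest-visible ECX word.
 *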
*/ if ( !cpu_has_sep ) __clear_bit(X86_FEATURE_SEP, &d); __clear_bit(X86_FEATURE_DS, &d); __clear_bit(X86_FEATURE_ACC, &d); __clear_bit(X86_FEATURE_PBE, &d); if ( is_pvh_vcpu(current) ) __clear_bit(X86_FEATURE_MTRR, &d); __clear_bit(X86_FEATURE_DTES64 % 32, &c); __clear_bit(X86_FEATURE_MWAIT % 32, &c); __clear_bit(X86_FEATURE_DSCPL % 32, &c); __clear_bit(X86_FEATURE_VMXE % 32, &c); __clear_bit(X86_FEATURE_SMXE % 32, &c); __clear_bit(X86_FEATURE_TM2 % 32, &c); if ( is_pv_32bit_vcpu(current) ) __clear_bit(X86_FEATURE_CX16 % 32, &c); __clear_bit(X86_FEATURE_XTPR % 32, &c); __clear_bit(X86_FEATURE_PDCM % 32, &c); __clear_bit(X86_FEATURE_PCID % 32, &c); __clear_bit(X86_FEATURE_DCA % 32, &c); if ( !cpu_has_xsave ) { __clear_bit(X86_FEATURE_XSAVE % 32, &c); __clear_bit(X86_FEATURE_AVX % 32, &c); } if ( !cpu_has_apic ) __clear_bit(X86_FEATURE_X2APIC % 32, &c); __set_bit(X86_FEATURE_HYPERVISOR % 32, &c); break; case 0x00000007: if ( regs->ecx == 0 ) b &= (cpufeat_mask(X86_FEATURE_BMI1) | cpufeat_mask(X86_FEATURE_HLE) | cpufeat_mask(X86_FEATURE_AVX2) | cpufeat_mask(X86_FEATURE_BMI2) | cpufeat_mask(X86_FEATURE_ERMS) | cpufeat_mask(X86_FEATURE_RTM) | cpufeat_mask(X86_FEATURE_FSGSBASE)); else b = 0; a = c = d = 0; break; case 0x0000000d: /* XSAVE */ if ( !cpu_has_xsave ) goto unsupported; break; case 0x80000001: /* Modify Feature Information. */ if ( is_pv_32bit_vcpu(current) ) { __clear_bit(X86_FEATURE_LM % 32, &d); __clear_bit(X86_FEATURE_LAHF_LM % 32, &c); } if ( is_pv_32on64_vcpu(current) && boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) __clear_bit(X86_FEATURE_SYSCALL % 32, &d); __clear_bit(X86_FEATURE_PAGE1GB % 32, &d); __clear_bit(X86_FEATURE_RDTSCP % 32, &d); __clear_bit(X86_FEATURE_SVM % 32, &c); if ( !cpu_has_apic ) __clear_bit(X86_FEATURE_EXTAPIC % 32, &c); __clear_bit(X86_FEATURE_OSVW % 32, &c); __clear_bit(X86_FEATURE_IBS % 32, &c); __clear_bit(X86_FEATURE_SKINIT % 32, &c); __clear_bit(X86_FEATURE_WDT % 32, &c); __clear_bit(X86_FEATURE_LWP % 32, &c); __clear_bit(X86_FEATURE_NODEID_MSR % 32, &c); __clear_bit(X86_FEATURE_TOPOEXT % 32, &c); break; case 0x00000005: /* MONITOR/MWAIT */ case 0x0000000a: /* Architectural Performance Monitor Features */ case 0x0000000b: /* Extended Topology Enumeration */ case 0x8000000a: /* SVM revision and features */ case 0x8000001b: /* Instruction Based Sampling */ case 0x8000001c: /* Light Weight Profiling */ case 0x8000001e: /* Extended topology reporting */ unsupported: a = b = c = d = 0; break; default: (void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d); break; } out: regs->eax = a; regs->ebx = b; regs->ecx = c; regs->edx = d; } static int emulate_invalid_rdtscp(struct cpu_user_regs *regs) { char opcode[3]; unsigned long eip, rc; struct vcpu *v = current; eip = regs->eip; if ( (rc = copy_from_user(opcode, (char *)eip, sizeof(opcode))) != 0 ) { propagate_page_fault(eip + sizeof(opcode) - rc, 0); return EXCRET_fault_fixed; } if ( memcmp(opcode, "\xf\x1\xf9", sizeof(opcode)) ) return 0; eip += sizeof(opcode); pv_soft_rdtsc(v, regs, 1); instruction_done(regs, eip, 0); return EXCRET_fault_fixed; } static int emulate_forced_invalid_op(struct cpu_user_regs *regs) { char sig[5], instr[2]; unsigned long eip, rc; eip = regs->eip; /* Check for forced emulation signature: ud2 ; .ascii "xen". */ if ( (rc = copy_from_user(sig, (char *)eip, sizeof(sig))) != 0 ) { propagate_page_fault(eip + sizeof(sig) - rc, 0); return EXCRET_fault_fixed; } if ( memcmp(sig, "\xf\xbxen", sizeof(sig)) ) return 0; eip += sizeof(sig); /* We only emulate CPUID. 
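 *
 * A guest reaches this path with the 7-byte forced-emulation sequence
 * checked above and below:
 *
 *     ud2              0x0f 0x0b
 *     .ascii "xen"
 *     cpuid            0x0f 0xa2
 *
 * After pv_cpuid() runs, instruction_done() advances %eip past the
 * whole sequence.
 *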
*/ if ( ( rc = copy_from_user(instr, (char *)eip, sizeof(instr))) != 0 ) { propagate_page_fault(eip + sizeof(instr) - rc, 0); return EXCRET_fault_fixed; } if ( memcmp(instr, "\xf\xa2", sizeof(instr)) ) return 0; eip += sizeof(instr); pv_cpuid(regs); instruction_done(regs, eip, 0); trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip); return EXCRET_fault_fixed; } void do_invalid_op(struct cpu_user_regs *regs) { const struct bug_frame *bug; u8 bug_insn[2]; const char *prefix = "", *filename, *predicate, *eip = (char *)regs->eip; unsigned long fixup; int id, lineno; static const struct bug_frame *const stop_frames[] = { __stop_bug_frames_0, __stop_bug_frames_1, __stop_bug_frames_2, __stop_bug_frames_3, NULL }; DEBUGGER_trap_entry(TRAP_invalid_op, regs); if ( likely(guest_mode(regs)) ) { if ( !emulate_invalid_rdtscp(regs) && !emulate_forced_invalid_op(regs) ) do_guest_trap(TRAP_invalid_op, regs, 0); return; } if ( (!is_kernel_text(eip) && (system_state > SYS_STATE_boot || !is_kernel_inittext(eip))) || __copy_from_user(bug_insn, eip, sizeof(bug_insn)) || memcmp(bug_insn, "\xf\xb", sizeof(bug_insn)) ) goto die; for ( bug = __start_bug_frames, id = 0; stop_frames[id]; ++bug ) { while ( unlikely(bug == stop_frames[id]) ) ++id; if ( bug_loc(bug) == eip ) break; } if ( !stop_frames[id] ) goto die; eip += sizeof(bug_insn); if ( id == BUGFRAME_run_fn ) { void (*fn)(struct cpu_user_regs *) = bug_ptr(bug); fn(regs); regs->eip = (unsigned long)eip; return; } /* WARN, BUG or ASSERT: decode the filename pointer and line number. */ filename = bug_ptr(bug); if ( !is_kernel(filename) ) goto die; fixup = strlen(filename); if ( fixup > 50 ) { filename += fixup - 47; prefix = "..."; } lineno = bug_line(bug); switch ( id ) { case BUGFRAME_warn: printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno); show_execution_state(regs); regs->eip = (unsigned long)eip; return; case BUGFRAME_bug: printk("Xen BUG at %s%s:%d\n", prefix, filename, lineno); DEBUGGER_trap_fatal(TRAP_invalid_op, regs); show_execution_state(regs); panic("Xen BUG at %s%s:%d", prefix, filename, lineno); case BUGFRAME_assert: /* ASSERT: decode the predicate string pointer. 
*/ predicate = bug_msg(bug); if ( !is_kernel(predicate) ) predicate = ""; printk("Assertion '%s' failed at %s%s:%d\n", predicate, prefix, filename, lineno); DEBUGGER_trap_fatal(TRAP_invalid_op, regs); show_execution_state(regs); panic("Assertion '%s' failed at %s%s:%d", predicate, prefix, filename, lineno); } die: if ( (fixup = search_exception_table(regs->eip)) != 0 ) { this_cpu(last_extable_addr) = regs->eip; regs->eip = fixup; return; } DEBUGGER_trap_fatal(TRAP_invalid_op, regs); show_execution_state(regs); panic("FATAL TRAP: vector = %d (invalid opcode)", TRAP_invalid_op); } void do_int3(struct cpu_user_regs *regs) { DEBUGGER_trap_entry(TRAP_int3, regs); if ( !guest_mode(regs) ) { debugger_trap_fatal(TRAP_int3, regs); return; } do_guest_trap(TRAP_int3, regs, 0); } void do_machine_check(struct cpu_user_regs *regs) { machine_check_vector(regs, regs->error_code); } static void reserved_bit_page_fault( unsigned long addr, struct cpu_user_regs *regs) { printk("d%d:v%d: reserved bit in page table (ec=%04X)\n", current->domain->domain_id, current->vcpu_id, regs->error_code); show_page_walk(addr); show_execution_state(regs); } struct trap_bounce *propagate_page_fault(unsigned long addr, u16 error_code) { struct trap_info *ti; struct vcpu *v = current; struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce; if ( unlikely(!is_canonical_address(addr)) ) { ti = &v->arch.pv_vcpu.trap_ctxt[TRAP_gp_fault]; tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE; tb->error_code = 0; tb->cs = ti->cs; tb->eip = ti->address; if ( TI_GET_IF(ti) ) tb->flags |= TBF_INTERRUPT; return tb; } v->arch.pv_vcpu.ctrlreg[2] = addr; arch_set_cr2(v, addr); /* Re-set error_code.user flag appropriately for the guest. */ error_code &= ~PFEC_user_mode; if ( !guest_kernel_mode(v, guest_cpu_user_regs()) ) error_code |= PFEC_user_mode; trace_pv_page_fault(addr, error_code); ti = &v->arch.pv_vcpu.trap_ctxt[TRAP_page_fault]; tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE; tb->error_code = error_code; tb->cs = ti->cs; tb->eip = ti->address; if ( TI_GET_IF(ti) ) tb->flags |= TBF_INTERRUPT; if ( unlikely(null_trap_bounce(v, tb)) ) { printk("d%d:v%d: unhandled page fault (ec=%04X)\n", v->domain->domain_id, v->vcpu_id, error_code); show_page_walk(addr); } if ( unlikely(error_code & PFEC_reserved_bit) ) reserved_bit_page_fault(addr, guest_cpu_user_regs()); return NULL; } static int handle_gdt_ldt_mapping_fault( unsigned long offset, struct cpu_user_regs *regs) { struct vcpu *curr = current; /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */ unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1; unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT); /* * If the fault is in another vcpu's area, it cannot be due to * a GDT/LDT descriptor load. Thus we can reasonably exit immediately, and * indeed we have to since map_ldt_shadow_page() works correctly only on * accesses to a vcpu's own area. */ if ( vcpu_area != curr->vcpu_id ) return 0; /* Byte offset within the gdt/ldt sub-area. */ offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL; if ( likely(is_ldt_area) ) { /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */ if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) ) { if ( guest_mode(regs) ) trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT, regs->eip, offset); } else { struct trap_bounce *tb; /* In hypervisor mode? Leave it to the #PF handler to fix up. */ if ( !guest_mode(regs) ) return 0; /* In guest mode? Propagate fault to guest, with adjusted %cr2. 
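 * (propagate_page_fault() only hands back a trap_bounce pointer when the
 * computed linear address is non-canonical and the fault is converted to
 * #GP; in that case the error code written below in effect encodes the
 * faulting selector with the TI bit, bit 2, set to flag the LDT)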
*/ tb = propagate_page_fault(curr->arch.pv_vcpu.ldt_base + offset, regs->error_code); if ( tb ) tb->error_code = ((u16)offset & ~3) | 4; } } else { /* GDT fault: handle the fault as #GP(selector). */ regs->error_code = (u16)offset & ~7; (void)do_general_protection(regs); } return EXCRET_fault_fixed; } #define IN_HYPERVISOR_RANGE(va) \ (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END)) enum pf_type { real_fault, smep_fault, spurious_fault }; static enum pf_type __page_fault_type( unsigned long addr, unsigned int error_code) { unsigned long mfn, cr3 = read_cr3(); l4_pgentry_t l4e, *l4t; l3_pgentry_t l3e, *l3t; l2_pgentry_t l2e, *l2t; l1_pgentry_t l1e, *l1t; unsigned int required_flags, disallowed_flags, page_user; /* * We do not take spurious page faults in IRQ handlers as we do not * modify page tables in IRQ context. We therefore bail here because * map_domain_page() is not IRQ-safe. */ if ( in_irq() ) return real_fault; /* Reserved bit violations are never spurious faults. */ if ( error_code & PFEC_reserved_bit ) return real_fault; required_flags = _PAGE_PRESENT; if ( error_code & PFEC_write_access ) required_flags |= _PAGE_RW; if ( error_code & PFEC_user_mode ) required_flags |= _PAGE_USER; disallowed_flags = 0; if ( error_code & PFEC_insn_fetch ) disallowed_flags |= _PAGE_NX_BIT; page_user = _PAGE_USER; mfn = cr3 >> PAGE_SHIFT; l4t = map_domain_page(mfn); l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]); mfn = l4e_get_pfn(l4e); unmap_domain_page(l4t); if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) || (l4e_get_flags(l4e) & disallowed_flags) ) return real_fault; page_user &= l4e_get_flags(l4e); l3t = map_domain_page(mfn); l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]); mfn = l3e_get_pfn(l3e); unmap_domain_page(l3t); if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) || (l3e_get_flags(l3e) & disallowed_flags) ) return real_fault; page_user &= l3e_get_flags(l3e); if ( l3e_get_flags(l3e) & _PAGE_PSE ) goto leaf; l2t = map_domain_page(mfn); l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]); mfn = l2e_get_pfn(l2e); unmap_domain_page(l2t); if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) || (l2e_get_flags(l2e) & disallowed_flags) ) return real_fault; page_user &= l2e_get_flags(l2e); if ( l2e_get_flags(l2e) & _PAGE_PSE ) goto leaf; l1t = map_domain_page(mfn); l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]); mfn = l1e_get_pfn(l1e); unmap_domain_page(l1t); if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) || (l1e_get_flags(l1e) & disallowed_flags) ) return real_fault; page_user &= l1e_get_flags(l1e); leaf: /* * Supervisor Mode Execution Protection (SMEP): * Disallow supervisor execution from user-accessible mappings */ if ( (read_cr4() & X86_CR4_SMEP) && page_user && ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) ) return smep_fault; return spurious_fault; } static enum pf_type spurious_page_fault( unsigned long addr, unsigned int error_code) { unsigned long flags; enum pf_type pf_type; /* * Disabling interrupts prevents TLB flushing, and hence prevents * page tables from becoming invalid under our feet during the walk. */ local_irq_save(flags); pf_type = __page_fault_type(addr, error_code); local_irq_restore(flags); return pf_type; } static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs) { struct vcpu *v = current; struct domain *d = v->domain; /* No fixups in interrupt context or when interrupts are disabled. 
*/ if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) ) return 0; /* Faults from external-mode guests are handled by shadow/hap */ if ( paging_mode_external(d) && guest_mode(regs) ) { int ret = paging_fault(addr, regs); if ( ret == EXCRET_fault_fixed ) trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr); return ret; } if ( !(regs->error_code & PFEC_page_present) && (pagefault_by_memadd(addr, regs)) ) return handle_memadd_fault(addr, regs); if ( unlikely(IN_HYPERVISOR_RANGE(addr)) ) { if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) && (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) ) return handle_gdt_ldt_mapping_fault( addr - GDT_LDT_VIRT_START, regs); return 0; } if ( guest_kernel_mode(v, regs) && !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) && (regs->error_code & PFEC_write_access) ) { if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && /* Do not check if access-protection fault since the page may legitimately be not present in shadow page tables */ (paging_mode_enabled(d) || (regs->error_code & PFEC_page_present)) && ptwr_do_page_fault(v, addr, regs) ) return EXCRET_fault_fixed; if ( is_hardware_domain(d) && (regs->error_code & PFEC_page_present) && mmio_ro_do_page_fault(v, addr, regs) ) return EXCRET_fault_fixed; } /* For non-external shadowed guests, we fix up both their own * pagefaults and Xen's, since they share the pagetables. */ if ( paging_mode_enabled(d) && !paging_mode_external(d) ) { int ret = paging_fault(addr, regs); if ( ret == EXCRET_fault_fixed ) trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr); return ret; } return 0; } /* * #PF error code: * Bit 0: Protection violation (=1) ; Page not present (=0) * Bit 1: Write access * Bit 2: User mode (=1) ; Supervisor mode (=0) * Bit 3: Reserved bit violation * Bit 4: Instruction fetch */ void do_page_fault(struct cpu_user_regs *regs) { unsigned long addr, fixup; unsigned int error_code; enum pf_type pf_type; addr = read_cr2(); /* fixup_page_fault() might change regs->error_code, so cache it here. */ error_code = regs->error_code; DEBUGGER_trap_entry(TRAP_page_fault, regs); perfc_incr(page_faults); if ( unlikely(fixup_page_fault(addr, regs) != 0) ) return; if ( unlikely(!guest_mode(regs)) ) { pf_type = spurious_page_fault(addr, error_code); BUG_ON(pf_type == smep_fault); if ( pf_type != real_fault ) return; if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { perfc_incr(copy_user_faults); if ( unlikely(regs->error_code & PFEC_reserved_bit) ) reserved_bit_page_fault(addr, regs); this_cpu(last_extable_addr) = regs->eip; regs->eip = fixup; return; } DEBUGGER_trap_fatal(TRAP_page_fault, regs); show_execution_state(regs); show_page_walk(addr); panic("FATAL PAGE FAULT\n" "[error_code=%04x]\n" "Faulting linear address: %p", error_code, _p(addr)); } if ( unlikely(current->domain->arch.suppress_spurious_page_faults) ) { pf_type = spurious_page_fault(addr, error_code); if ( pf_type == smep_fault ) { gdprintk(XENLOG_ERR, "Fatal SMEP fault\n"); domain_crash(current->domain); } if ( pf_type != real_fault ) return; } propagate_page_fault(addr, regs->error_code); } /* * Early #PF handler to print CR2, error code, and stack. * * We also deal with spurious faults here, even though they should never happen * during early boot (an issue was seen once, but was most likely a hardware * problem). 
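 *
 * "Stuck" detection below is deliberately crude: the handler only reports
 * and halts after observing the same %eip/%cr2 pair roughly 1000 times in
 * a row; any change to either value resets the counter.
 *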
*/ void __init do_early_page_fault(struct cpu_user_regs *regs) { static int stuck; static unsigned long prev_eip, prev_cr2; unsigned long cr2 = read_cr2(); BUG_ON(smp_processor_id() != 0); if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) ) { prev_eip = regs->eip; prev_cr2 = cr2; stuck = 0; return; } if ( stuck++ == 1000 ) { unsigned long *stk = (unsigned long *)regs; printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n", regs->cs, _p(regs->eip), _p(cr2), regs->error_code); show_page_walk(cr2); printk("Stack dump: "); while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 ) printk("%p ", _p(*stk++)); for ( ; ; ) halt(); } } long do_fpu_taskswitch(int set) { struct vcpu *v = current; if ( set ) { v->arch.pv_vcpu.ctrlreg[0] |= X86_CR0_TS; stts(); } else { v->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS; if ( v->fpu_dirtied ) clts(); } return 0; } static int read_descriptor(unsigned int sel, const struct vcpu *v, const struct cpu_user_regs * regs, unsigned long *base, unsigned long *limit, unsigned int *ar, unsigned int vm86attr) { struct desc_struct desc; if ( !vm86_mode(regs) ) { if ( sel < 4) desc.b = desc.a = 0; else if ( __get_user(desc, (const struct desc_struct *)(!(sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v)) + (sel >> 3)) ) return 0; if ( !(vm86attr & _SEGMENT_CODE) ) desc.b &= ~_SEGMENT_L; } else { desc.a = (sel << 20) | 0xffff; desc.b = vm86attr | (sel >> 12); } *ar = desc.b & 0x00f0ff00; if ( !(desc.b & _SEGMENT_L) ) { *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) + (desc.b & 0xff000000)); *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000); if ( desc.b & _SEGMENT_G ) *limit = ((*limit + 1) << 12) - 1; #ifndef NDEBUG if ( !vm86_mode(regs) && (sel > 3) ) { unsigned int a, l; unsigned char valid; asm volatile ( "larl %2,%0 ; setz %1" : "=r" (a), "=qm" (valid) : "rm" (sel)); BUG_ON(valid && ((a & 0x00f0ff00) != *ar)); asm volatile ( "lsll %2,%0 ; setz %1" : "=r" (l), "=qm" (valid) : "rm" (sel)); BUG_ON(valid && (l != *limit)); } #endif } else { *base = 0UL; *limit = ~0UL; } return 1; } static int read_gate_descriptor(unsigned int gate_sel, const struct vcpu *v, unsigned int *sel, unsigned long *off, unsigned int *ar) { struct desc_struct desc; const struct desc_struct *pdesc; pdesc = (const struct desc_struct *) (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v)) + (gate_sel >> 3); if ( (gate_sel < 4) || ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) || __get_user(desc, pdesc) ) return 0; *sel = (desc.a >> 16) & 0x0000fffc; *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000); *ar = desc.b & 0x0000ffff; /* * check_descriptor() clears the DPL field and stores the * guest requested DPL in the selector's RPL field. */ if ( *ar & _SEGMENT_DPL ) return 0; *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL; if ( !is_pv_32bit_vcpu(v) ) { if ( (*ar & 0x1f00) != 0x0c00 || (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) || __get_user(desc, pdesc + 1) || (desc.b & 0x1f00) ) return 0; *off |= (unsigned long)desc.a << 32; return 1; } switch ( *ar & 0x1f00 ) { case 0x0400: *off &= 0xffff; break; case 0x0c00: break; default: return 0; } return 1; } /* Has the guest requested sufficient permission for this I/O access? */ static int guest_io_okay( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { /* If in user mode, switch to kernel mode just to read I/O bitmap. 
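 *
 * A rough sketch of the check below, assuming the usual TSS-style I/O
 * bitmap semantics (one bit per port, set bit = access denied): iopl is
 * consulted first, then two bytes of v->arch.pv_vcpu.iobmp are copied
 * from guest memory at offset port >> 3, and the access is allowed only
 * if every bit covering ports port .. port+bytes-1 reads as clear
 * ("required"/"allowed" below are illustrative names only):
 *
 *     required = ((1 << bytes) - 1) << (port & 7);
 *     allowed  = !(x.mask & required);
 *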
*/ int user_mode = !(v->arch.flags & TF_kernel_mode); #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v) if ( !vm86_mode(regs) && (v->arch.pv_vcpu.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) ) return 1; if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) ) { union { uint8_t bytes[2]; uint16_t mask; } x; /* * Grab permission bytes from guest space. Inaccessible bytes are * read as 0xff (no access allowed). */ TOGGLE_MODE(); switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp, port>>3, 2) ) { default: x.bytes[0] = ~0; case 1: x.bytes[1] = ~0; case 0: break; } TOGGLE_MODE(); if ( (x.mask & (((1<domain, port, port + bytes - 1); } static int pci_cfg_ok(struct domain *d, int write, int size) { uint32_t machine_bdf; uint16_t start, end; if (!is_hardware_domain(d)) return 0; machine_bdf = (d->arch.pci_cf8 >> 8) & 0xFFFF; if ( write ) { const unsigned long *ro_map = pci_get_ro_map(0); if ( ro_map && test_bit(machine_bdf, ro_map) ) return 0; } start = d->arch.pci_cf8 & 0xFF; /* AMD extended configuration space access? */ if ( (d->arch.pci_cf8 & 0x0F000000) && boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 ) { uint64_t msr_val; if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) ) return 0; if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) ) start |= (d->arch.pci_cf8 >> 16) & 0xF00; } end = start + size - 1; if (xsm_pci_config_permission(XSM_HOOK, d, machine_bdf, start, end, write)) return 0; return 1; } uint32_t guest_io_read( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { uint32_t data = 0; unsigned int shift = 0; if ( admin_io_okay(port, bytes, v, regs) ) { switch ( bytes ) { case 1: return inb(port); case 2: return inw(port); case 4: return inl(port); } } while ( bytes != 0 ) { unsigned int size = 1; uint32_t sub_data = ~0; if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) { sub_data = pv_pit_handler(port, 0, 0); } else if ( (port == RTC_PORT(0)) ) { sub_data = v->domain->arch.cmos_idx; } else if ( (port == RTC_PORT(1)) && ioports_access_permitted(v->domain, RTC_PORT(0), RTC_PORT(1)) ) { unsigned long flags; spin_lock_irqsave(&rtc_lock, flags); outb(v->domain->arch.cmos_idx & 0x7f, RTC_PORT(0)); sub_data = inb(RTC_PORT(1)); spin_unlock_irqrestore(&rtc_lock, flags); } else if ( (port == 0xcf8) && (bytes == 4) ) { size = 4; sub_data = v->domain->arch.pci_cf8; } else if ( (port & 0xfffc) == 0xcfc ) { size = min(bytes, 4 - (port & 3)); if ( size == 3 ) size = 2; if ( pci_cfg_ok(v->domain, 0, size) ) sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size); } if ( size == 4 ) return sub_data; data |= (sub_data & ((1u << (size * 8)) - 1)) << shift; shift += size * 8; port += size; bytes -= size; } return data; } void guest_io_write( unsigned int port, unsigned int bytes, uint32_t data, struct vcpu *v, struct cpu_user_regs *regs) { if ( admin_io_okay(port, bytes, v, regs) ) { switch ( bytes ) { case 1: outb((uint8_t)data, port); if ( pv_post_outb_hook ) pv_post_outb_hook(port, (uint8_t)data); break; case 2: outw((uint16_t)data, port); break; case 4: outl(data, port); break; } return; } while ( bytes != 0 ) { unsigned int size = 1; if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) { pv_pit_handler(port, (uint8_t)data, 1); } else if ( (port == RTC_PORT(0)) ) { v->domain->arch.cmos_idx = data; } else if ( (port == RTC_PORT(1)) && ioports_access_permitted(v->domain, RTC_PORT(0), RTC_PORT(1)) ) { unsigned long flags; if ( pv_rtc_handler ) 
pv_rtc_handler(v->domain->arch.cmos_idx & 0x7f, data); spin_lock_irqsave(&rtc_lock, flags); outb(v->domain->arch.cmos_idx & 0x7f, RTC_PORT(0)); outb(data, RTC_PORT(1)); spin_unlock_irqrestore(&rtc_lock, flags); } else if ( (port == 0xcf8) && (bytes == 4) ) { size = 4; v->domain->arch.pci_cf8 = data; } else if ( (port & 0xfffc) == 0xcfc ) { size = min(bytes, 4 - (port & 3)); if ( size == 3 ) size = 2; if ( pci_cfg_ok(v->domain, 1, size) ) pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data); } if ( size == 4 ) return; port += size; bytes -= size; data >>= size * 8; } } /* I/O emulation support. Helper routines for, and type of, the stack stub.*/ void host_to_guest_gpr_switch(struct cpu_user_regs *) __attribute__((__regparm__(1))); unsigned long guest_to_host_gpr_switch(unsigned long) __attribute__((__regparm__(1))); void (*pv_post_outb_hook)(unsigned int port, u8 value); static inline uint64_t guest_misc_enable(uint64_t val) { val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL | MSR_IA32_MISC_ENABLE_MONITOR_ENABLE); val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL | MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | MSR_IA32_MISC_ENABLE_XTPR_DISABLE; return val; } /* Instruction fetch with error handling. */ #define insn_fetch(type, base, eip, limit) \ ({ unsigned long _rc, _ptr = (base) + (eip); \ type _x; \ if ( ad_default < 8 ) \ _ptr = (unsigned int)_ptr; \ if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \ goto fail; \ if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \ { \ propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \ goto skip; \ } \ (eip) += sizeof(_x); _x; }) #define read_sreg(regs, sr) read_segment_register(sr) static int is_cpufreq_controller(struct domain *d) { return ((cpufreq_controller == FREQCTL_dom0_kernel) && (d->domain_id == 0)); } #include "x86_64/mmconfig.h" static int emulate_privileged_op(struct cpu_user_regs *regs) { struct vcpu *v = current; unsigned long *reg, eip = regs->eip; u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0; enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none; int rc; unsigned int port, i, data_sel, ar, data, bpmatch = 0; unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0; #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \ ? regs->reg \ : ad_bytes == 4 \ ? (u32)regs->reg \ : (u16)regs->reg) #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \ ? regs->reg = (val) \ : ad_bytes == 4 \ ? (*(u32 *)®s->reg = (val)) \ : (*(u16 *)®s->reg = (val))) unsigned long code_base, code_limit; char io_emul_stub[32]; void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1))); uint64_t val, msr_content; if ( !read_descriptor(regs->cs, v, regs, &code_base, &code_limit, &ar, _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) ) goto fail; op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2; ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default; if ( !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || !(ar & _SEGMENT_CODE) ) goto fail; /* emulating only opcodes not allowing SS to be default */ data_sel = read_sreg(regs, ds); /* Legacy prefixes. */ for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) ) { switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) ) { case 0x66: /* operand-size override */ opsize_prefix = 1; op_bytes = op_default ^ 6; /* switch between 2/4 bytes */ continue; case 0x67: /* address-size override */ ad_bytes = ad_default != 4 ? 
4 : 2; /* switch to 2/4 bytes */ continue; case 0x2e: /* CS override */ data_sel = regs->cs; continue; case 0x3e: /* DS override */ data_sel = read_sreg(regs, ds); continue; case 0x26: /* ES override */ data_sel = read_sreg(regs, es); continue; case 0x64: /* FS override */ data_sel = read_sreg(regs, fs); lm_ovr = lm_seg_fs; continue; case 0x65: /* GS override */ data_sel = read_sreg(regs, gs); lm_ovr = lm_seg_gs; continue; case 0x36: /* SS override */ data_sel = regs->ss; continue; case 0xf0: /* LOCK */ lock = 1; continue; case 0xf2: /* REPNE/REPNZ */ case 0xf3: /* REP/REPE/REPZ */ rep_prefix = 1; continue; default: if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 ) { rex = opcode; continue; } break; } break; } /* REX prefix. */ if ( rex & 8 ) /* REX.W */ op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */ modrm_reg = (rex & 4) << 1; /* REX.R */ /* REX.X does not need to be decoded. */ modrm_rm = (rex & 1) << 3; /* REX.B */ if ( opcode == 0x0f ) goto twobyte_opcode; if ( lock ) goto fail; /* Input/Output String instructions. */ if ( (opcode >= 0x6c) && (opcode <= 0x6f) ) { unsigned long data_base, data_limit; if ( rep_prefix && (rd_ad(ecx) == 0) ) goto done; if ( !(opcode & 2) ) { data_sel = read_sreg(regs, es); lm_ovr = lm_seg_none; } if ( !(ar & _SEGMENT_L) ) { if ( !read_descriptor(data_sel, v, regs, &data_base, &data_limit, &ar, _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL| _SEGMENT_P) ) goto fail; if ( !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || (opcode & 2 ? (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) : (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) ) goto fail; } else { switch ( lm_ovr ) { default: data_base = 0UL; break; case lm_seg_fs: data_base = rdfsbase(); break; case lm_seg_gs: data_base = rdgsbase(); break; } data_limit = ~0UL; ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P; } port = (u16)regs->edx; continue_io_string: switch ( opcode ) { case 0x6c: /* INSB */ op_bytes = 1; case 0x6d: /* INSW/INSL */ if ( (data_limit < (op_bytes - 1)) || (rd_ad(edi) > (data_limit - (op_bytes - 1))) || !guest_io_okay(port, op_bytes, v, regs) ) goto fail; data = guest_io_read(port, op_bytes, v, regs); if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 ) { propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc, PFEC_write_access); return EXCRET_fault_fixed; } wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF) ? -op_bytes : op_bytes)); break; case 0x6e: /* OUTSB */ op_bytes = 1; case 0x6f: /* OUTSW/OUTSL */ if ( (data_limit < (op_bytes - 1)) || (rd_ad(esi) > (data_limit - (op_bytes - 1))) || !guest_io_okay(port, op_bytes, v, regs) ) goto fail; if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes)) != 0 ) { propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0); return EXCRET_fault_fixed; } guest_io_write(port, op_bytes, data, v, regs); wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF) ? -op_bytes : op_bytes)); break; } bpmatch = check_guest_io_breakpoint(v, port, op_bytes); if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) ) { if ( !bpmatch && !hypercall_preempt_check() ) goto continue_io_string; eip = regs->eip; } goto done; } /* * Very likely to be an I/O instruction (IN/OUT). * Build an on-stack stub to execute the instruction with full guest * GPR context. This is needed for some systems which (ab)use IN/OUT * to communicate with BIOS code in system-management mode. 
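 *
 * The 16-byte stub assembled just below has this layout (byte offsets):
 *
 *      0: 48 b9 <imm64>   movabs $host_to_guest_gpr_switch, %rcx
 *     10: ff d1           call   *%rcx
 *     12: 66 or 90        operand-size prefix for 16-bit accesses, else NOP
 *     13: <opcode>        the IN/OUT opcode byte being emulated
 *     14: <imm8> or 90    port immediate for the imm8 forms, else NOP
 *     15: c3              ret, which lands in guest_to_host_gpr_switch()
 *
 * host_to_guest_gpr_switch() installs the guest's general-purpose register
 * values before the I/O instruction runs, and the closing ret undoes the
 * switch so the results are copied back into the saved guest state.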
*/ /* movq $host_to_guest_gpr_switch,%rcx */ io_emul_stub[0] = 0x48; io_emul_stub[1] = 0xb9; *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch; /* callq *%rcx */ io_emul_stub[10] = 0xff; io_emul_stub[11] = 0xd1; /* data16 or nop */ io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66; /* */ io_emul_stub[13] = opcode; /* imm8 or nop */ io_emul_stub[14] = 0x90; /* ret (jumps to guest_to_host_gpr_switch) */ io_emul_stub[15] = 0xc3; /* Handy function-typed pointer to the stub. */ io_emul = (void *)io_emul_stub; if ( ioemul_handle_quirk ) ioemul_handle_quirk(opcode, &io_emul_stub[12], regs); /* I/O Port and Interrupt Flag instructions. */ switch ( opcode ) { case 0xe4: /* IN imm8,%al */ op_bytes = 1; case 0xe5: /* IN imm8,%eax */ port = insn_fetch(u8, code_base, eip, code_limit); io_emul_stub[14] = port; /* imm8 */ exec_in: if ( !guest_io_okay(port, op_bytes, v, regs) ) goto fail; if ( admin_io_okay(port, op_bytes, v, regs) ) { mark_regs_dirty(regs); io_emul(regs); } else { if ( op_bytes == 4 ) regs->eax = 0; else regs->eax &= ~((1u << (op_bytes * 8)) - 1); regs->eax |= guest_io_read(port, op_bytes, v, regs); } bpmatch = check_guest_io_breakpoint(v, port, op_bytes); goto done; case 0xec: /* IN %dx,%al */ op_bytes = 1; case 0xed: /* IN %dx,%eax */ port = (u16)regs->edx; goto exec_in; case 0xe6: /* OUT %al,imm8 */ op_bytes = 1; case 0xe7: /* OUT %eax,imm8 */ port = insn_fetch(u8, code_base, eip, code_limit); io_emul_stub[14] = port; /* imm8 */ exec_out: if ( !guest_io_okay(port, op_bytes, v, regs) ) goto fail; if ( admin_io_okay(port, op_bytes, v, regs) ) { mark_regs_dirty(regs); io_emul(regs); if ( (op_bytes == 1) && pv_post_outb_hook ) pv_post_outb_hook(port, regs->eax); } else { guest_io_write(port, op_bytes, regs->eax, v, regs); } bpmatch = check_guest_io_breakpoint(v, port, op_bytes); goto done; case 0xee: /* OUT %al,%dx */ op_bytes = 1; case 0xef: /* OUT %eax,%dx */ port = (u16)regs->edx; goto exec_out; case 0xfa: /* CLI */ case 0xfb: /* STI */ if ( v->arch.pv_vcpu.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) ) goto fail; /* * This is just too dangerous to allow, in my opinion. Consider if the * caller then tries to reenable interrupts using POPF: we can't trap * that and we'll end up with hard-to-debug lockups. Fast & loose will * do for us. :-) */ /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/ goto done; } /* No decode of this single-byte opcode. */ goto fail; twobyte_opcode: /* * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9), * and CPUID (0xa2), are executable only from guest kernel mode * (virtual ring 0). 
*/ opcode = insn_fetch(u8, code_base, eip, code_limit); if ( !guest_kernel_mode(v, regs) && (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) ) goto fail; if ( lock && (opcode & ~3) != 0x20 ) goto fail; switch ( opcode ) { case 0x1: /* RDTSCP and XSETBV */ switch ( insn_fetch(u8, code_base, eip, code_limit) ) { case 0xf9: /* RDTSCP */ if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) && !guest_kernel_mode(v, regs) ) goto fail; pv_soft_rdtsc(v, regs, 1); break; case 0xd1: /* XSETBV */ { u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32); if ( lock || rep_prefix || opsize_prefix || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) ) { do_guest_trap(TRAP_invalid_op, regs, 0); goto skip; } if ( !guest_kernel_mode(v, regs) ) goto fail; if ( handle_xsetbv(regs->ecx, new_xfeature) ) goto fail; break; } default: goto fail; } break; case 0x06: /* CLTS */ (void)do_fpu_taskswitch(0); break; case 0x09: /* WBINVD */ /* Ignore the instruction if unprivileged. */ if ( !cache_flush_permitted(v->domain) ) /* Non-physdev domain attempted WBINVD; ignore for now since newer linux uses this in some start-of-day timing loops */ ; else wbinvd(); break; case 0x20: /* MOV CR?, */ opcode = insn_fetch(u8, code_base, eip, code_limit); if ( opcode < 0xc0 ) goto fail; modrm_reg += ((opcode >> 3) & 7) + (lock << 3); modrm_rm |= (opcode >> 0) & 7; reg = decode_register(modrm_rm, regs, 0); switch ( modrm_reg ) { case 0: /* Read CR0 */ *reg = (read_cr0() & ~X86_CR0_TS) | v->arch.pv_vcpu.ctrlreg[0]; break; case 2: /* Read CR2 */ *reg = v->arch.pv_vcpu.ctrlreg[2]; break; case 3: /* Read CR3 */ { unsigned long mfn; if ( !is_pv_32on64_vcpu(v) ) { mfn = pagetable_get_pfn(v->arch.guest_table); *reg = xen_pfn_to_cr3(mfn_to_gmfn( v->domain, mfn)); } else { l4_pgentry_t *pl4e = map_domain_page(pagetable_get_pfn(v->arch.guest_table)); mfn = l4e_get_pfn(*pl4e); unmap_domain_page(pl4e); *reg = compat_pfn_to_cr3(mfn_to_gmfn( v->domain, mfn)); } /* PTs should not be shared */ BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow); } break; case 4: /* Read CR4 */ *reg = v->arch.pv_vcpu.ctrlreg[4]; break; default: goto fail; } break; case 0x21: /* MOV DR?, */ { unsigned long res; opcode = insn_fetch(u8, code_base, eip, code_limit); if ( opcode < 0xc0 ) goto fail; modrm_reg += ((opcode >> 3) & 7) + (lock << 3); modrm_rm |= (opcode >> 0) & 7; reg = decode_register(modrm_rm, regs, 0); if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 ) goto fail; *reg = res; break; } case 0x22: /* MOV ,CR? */ opcode = insn_fetch(u8, code_base, eip, code_limit); if ( opcode < 0xc0 ) goto fail; modrm_reg += ((opcode >> 3) & 7) + (lock << 3); modrm_rm |= (opcode >> 0) & 7; reg = decode_register(modrm_rm, regs, 0); switch ( modrm_reg ) { case 0: /* Write CR0 */ if ( (*reg ^ read_cr0()) & ~X86_CR0_TS ) { gdprintk(XENLOG_WARNING, "Attempt to change unmodifiable CR0 flags.\n"); goto fail; } (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS)); break; case 2: /* Write CR2 */ v->arch.pv_vcpu.ctrlreg[2] = *reg; arch_set_cr2(v, *reg); break; case 3: {/* Write CR3 */ unsigned long gfn; struct page_info *page; gfn = !is_pv_32on64_vcpu(v) ? 
xen_cr3_to_pfn(*reg) : compat_cr3_to_pfn(*reg); page = get_page_from_gfn(v->domain, gfn, NULL, P2M_ALLOC); if ( page ) { rc = new_guest_cr3(page_to_mfn(page)); put_page(page); } else rc = -EINVAL; switch ( rc ) { case 0: break; case -EAGAIN: /* retry after preemption */ goto skip; default: /* not okay */ goto fail; } break; } case 4: /* Write CR4 */ v->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(v, *reg); write_cr4(pv_guest_cr4_to_real_cr4(v)); break; default: goto fail; } break; case 0x23: /* MOV ,DR? */ opcode = insn_fetch(u8, code_base, eip, code_limit); if ( opcode < 0xc0 ) goto fail; modrm_reg += ((opcode >> 3) & 7) + (lock << 3); modrm_rm |= (opcode >> 0) & 7; reg = decode_register(modrm_rm, regs, 0); if ( do_set_debugreg(modrm_reg, *reg) != 0 ) goto fail; break; case 0x30: /* WRMSR */ { uint32_t eax = regs->eax; uint32_t edx = regs->edx; msr_content = ((uint64_t)edx << 32) | eax; switch ( (u32)regs->ecx ) { case MSR_FS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; wrfsbase(msr_content); v->arch.pv_vcpu.fs_base = msr_content; break; case MSR_GS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; wrgsbase(msr_content); v->arch.pv_vcpu.gs_base_kernel = msr_content; break; case MSR_SHADOW_GS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; if ( wrmsr_safe(MSR_SHADOW_GS_BASE, msr_content) ) goto fail; v->arch.pv_vcpu.gs_base_user = msr_content; break; case MSR_K7_FID_VID_STATUS: case MSR_K7_FID_VID_CTL: case MSR_K8_PSTATE_LIMIT: case MSR_K8_PSTATE_CTRL: case MSR_K8_PSTATE_STATUS: case MSR_K8_PSTATE0: case MSR_K8_PSTATE1: case MSR_K8_PSTATE2: case MSR_K8_PSTATE3: case MSR_K8_PSTATE4: case MSR_K8_PSTATE5: case MSR_K8_PSTATE6: case MSR_K8_PSTATE7: case MSR_K8_HWCR: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) goto fail; if ( !is_cpufreq_controller(v->domain) ) break; if ( wrmsr_safe(regs->ecx, msr_content) != 0 ) goto fail; break; case MSR_AMD64_NB_CFG: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) goto fail; if ( !is_hardware_domain(v->domain) || !is_pinned_vcpu(v) ) break; if ( (rdmsr_safe(MSR_AMD64_NB_CFG, val) != 0) || (eax != (uint32_t)val) || ((edx ^ (val >> 32)) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) ) goto invalid; if ( wrmsr_safe(MSR_AMD64_NB_CFG, msr_content) != 0 ) goto fail; break; case MSR_FAM10H_MMIO_CONF_BASE: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) goto fail; if ( !is_hardware_domain(v->domain) || !is_pinned_vcpu(v) ) break; if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) != 0) ) goto fail; if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ? 
val != msr_content : ((val ^ msr_content) & ~( FAM10H_MMIO_CONF_ENABLE | (FAM10H_MMIO_CONF_BUSRANGE_MASK << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | ((u64)FAM10H_MMIO_CONF_BASE_MASK << FAM10H_MMIO_CONF_BASE_SHIFT))) ) goto invalid; if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, msr_content) != 0 ) goto fail; break; case MSR_IA32_UCODE_REV: if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) goto fail; if ( !is_hardware_domain(v->domain) || !is_pinned_vcpu(v) ) break; if ( rdmsr_safe(regs->ecx, val) ) goto fail; if ( msr_content ) goto invalid; break; case MSR_IA32_MISC_ENABLE: if ( rdmsr_safe(regs->ecx, val) ) goto fail; val = guest_misc_enable(val); if ( msr_content != val ) goto invalid; break; case MSR_IA32_MPERF: case MSR_IA32_APERF: if (( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) && ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) ) goto fail; if ( !is_cpufreq_controller(v->domain) ) break; if ( wrmsr_safe(regs->ecx, msr_content ) != 0 ) goto fail; break; case MSR_IA32_PERF_CTL: if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) goto fail; if ( !is_cpufreq_controller(v->domain) ) break; if ( wrmsr_safe(regs->ecx, msr_content) != 0 ) goto fail; break; case MSR_IA32_THERM_CONTROL: case MSR_IA32_ENERGY_PERF_BIAS: if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) goto fail; if ( !is_hardware_domain(v->domain) || !is_pinned_vcpu(v) ) break; if ( wrmsr_safe(regs->ecx, msr_content) != 0 ) goto fail; break; default: if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) == 1 ) break; rc = vmce_wrmsr(regs->ecx, msr_content); if ( rc < 0 ) goto fail; if ( rc ) break; if ( (rdmsr_safe(regs->ecx, val) != 0) || (msr_content != val) ) invalid: gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from " "0x%016"PRIx64" to 0x%016"PRIx64".\n", _p(regs->ecx), val, msr_content); break; } break; } case 0x31: /* RDTSC */ if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) && !guest_kernel_mode(v, regs) ) goto fail; if ( v->domain->arch.vtsc ) pv_soft_rdtsc(v, regs, 0); else rdtsc(regs->eax, regs->edx); break; case 0x32: /* RDMSR */ switch ( (u32)regs->ecx ) { case MSR_FS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; val = cpu_has_fsgsbase ? __rdfsbase() : v->arch.pv_vcpu.fs_base; goto rdmsr_writeback; case MSR_GS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; val = cpu_has_fsgsbase ? 
__rdgsbase() : v->arch.pv_vcpu.gs_base_kernel; goto rdmsr_writeback; case MSR_SHADOW_GS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; regs->eax = v->arch.pv_vcpu.gs_base_user & 0xFFFFFFFFUL; regs->edx = v->arch.pv_vcpu.gs_base_user >> 32; break; case MSR_K7_FID_VID_CTL: case MSR_K7_FID_VID_STATUS: case MSR_K8_PSTATE_LIMIT: case MSR_K8_PSTATE_CTRL: case MSR_K8_PSTATE_STATUS: case MSR_K8_PSTATE0: case MSR_K8_PSTATE1: case MSR_K8_PSTATE2: case MSR_K8_PSTATE3: case MSR_K8_PSTATE4: case MSR_K8_PSTATE5: case MSR_K8_PSTATE6: case MSR_K8_PSTATE7: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) goto fail; if ( !is_cpufreq_controller(v->domain) ) { regs->eax = regs->edx = 0; break; } goto rdmsr_normal; case MSR_IA32_UCODE_REV: BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL); if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) { if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) ) goto fail; sync_core(); } goto rdmsr_normal; case MSR_IA32_MISC_ENABLE: if ( rdmsr_safe(regs->ecx, msr_content) ) goto fail; msr_content = guest_misc_enable(msr_content); regs->eax = (uint32_t)msr_content; regs->edx = (uint32_t)(msr_content >> 32); break; default: if ( rdmsr_hypervisor_regs(regs->ecx, &val) ) { rdmsr_writeback: regs->eax = (uint32_t)val; regs->edx = (uint32_t)(val >> 32); break; } rc = vmce_rdmsr(regs->ecx, &val); if ( rc < 0 ) goto fail; if ( rc ) goto rdmsr_writeback; case MSR_EFER: rdmsr_normal: /* Everyone can read the MSR space. */ /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n", _p(regs->ecx));*/ if ( rdmsr_safe(regs->ecx, msr_content) ) goto fail; regs->eax = (uint32_t)msr_content; regs->edx = (uint32_t)(msr_content >> 32); break; } break; case 0xa2: /* CPUID */ pv_cpuid(regs); break; default: goto fail; } #undef wr_ad #undef rd_ad done: instruction_done(regs, eip, bpmatch); skip: return EXCRET_fault_fixed; fail: return 0; } static inline int check_stack_limit(unsigned int ar, unsigned int limit, unsigned int esp, unsigned int decr) { return (((esp - decr) < (esp - 1)) && (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit)); } static void emulate_gate_op(struct cpu_user_regs *regs) { struct vcpu *v = current; unsigned int sel, ar, dpl, nparm, opnd_sel; unsigned int op_default, op_bytes, ad_default, ad_bytes; unsigned long off, eip, opnd_off, base, limit; int jump; /* Check whether this fault is due to the use of a call gate. */ if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) || (((ar >> 13) & 3) < (regs->cs & 3)) || ((ar & _SEGMENT_TYPE) != 0xc00) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } if ( !(ar & _SEGMENT_P) ) { do_guest_trap(TRAP_no_segment, regs, 1); return; } dpl = (ar >> 13) & 3; nparm = ar & 0x1f; /* * Decode instruction (and perhaps operand) to determine RPL, * whether this is a jump or a call, and the call return offset. */ if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) || !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || !(ar & _SEGMENT_CODE) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2; ad_default = ad_bytes = op_default; opnd_sel = opnd_off = 0; jump = -1; for ( eip = regs->eip; eip - regs->_eip < 10; ) { switch ( insn_fetch(u8, base, eip, limit) ) { case 0x66: /* operand-size override */ op_bytes = op_default ^ 6; /* switch between 2/4 bytes */ continue; case 0x67: /* address-size override */ ad_bytes = ad_default != 4 ? 
4 : 2; /* switch to 2/4 bytes */ continue; case 0x2e: /* CS override */ opnd_sel = regs->cs; ASSERT(opnd_sel); continue; case 0x3e: /* DS override */ opnd_sel = read_sreg(regs, ds); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x26: /* ES override */ opnd_sel = read_sreg(regs, es); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x64: /* FS override */ opnd_sel = read_sreg(regs, fs); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x65: /* GS override */ opnd_sel = read_sreg(regs, gs); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x36: /* SS override */ opnd_sel = regs->ss; if ( !opnd_sel ) opnd_sel = dpl; continue; case 0xea: ++jump; /* FALLTHROUGH */ case 0x9a: ++jump; opnd_sel = regs->cs; opnd_off = eip; ad_bytes = ad_default; eip += op_bytes + 2; break; case 0xff: { unsigned int modrm; switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 ) { case 0x28: case 0x68: case 0xa8: ++jump; /* FALLTHROUGH */ case 0x18: case 0x58: case 0x98: ++jump; if ( ad_bytes != 2 ) { if ( (modrm & 7) == 4 ) { unsigned int sib; sib = insn_fetch(u8, base, eip, limit); modrm = (modrm & ~7) | (sib & 7); if ( (sib >>= 3) != 4 ) opnd_off = *(unsigned long *) decode_register(sib & 7, regs, 0); opnd_off <<= sib >> 3; } if ( (modrm & 7) != 5 || (modrm & 0xc0) ) opnd_off += *(unsigned long *) decode_register(modrm & 7, regs, 0); else modrm |= 0x87; if ( !opnd_sel ) { switch ( modrm & 7 ) { default: opnd_sel = read_sreg(regs, ds); break; case 4: case 5: opnd_sel = regs->ss; break; } } } else { switch ( modrm & 7 ) { case 0: case 1: case 7: opnd_off = regs->ebx; break; case 6: if ( !(modrm & 0xc0) ) modrm |= 0x80; else case 2: case 3: { opnd_off = regs->ebp; if ( !opnd_sel ) opnd_sel = regs->ss; } break; } if ( !opnd_sel ) opnd_sel = read_sreg(regs, ds); switch ( modrm & 7 ) { case 0: case 2: case 4: opnd_off += regs->esi; break; case 1: case 3: case 5: opnd_off += regs->edi; break; } } switch ( modrm & 0xc0 ) { case 0x40: opnd_off += insn_fetch(s8, base, eip, limit); break; case 0x80: opnd_off += insn_fetch(s32, base, eip, limit); break; } if ( ad_bytes == 4 ) opnd_off = (unsigned int)opnd_off; else if ( ad_bytes == 2 ) opnd_off = (unsigned short)opnd_off; break; } } break; } break; } if ( jump < 0 ) { fail: do_guest_trap(TRAP_gp_fault, regs, 1); skip: return; } if ( (opnd_sel != regs->cs && !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) || !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } opnd_off += op_bytes; #define ad_default ad_bytes opnd_sel = insn_fetch(u16, base, opnd_off, limit); #undef ad_default ASSERT((opnd_sel & ~3) == regs->error_code); if ( dpl < (opnd_sel & 3) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) || !(ar & _SEGMENT_S) || !(ar & _SEGMENT_CODE) || (!jump || (ar & _SEGMENT_EC) ? 
((ar >> 13) & 3) > (regs->cs & 3) : ((ar >> 13) & 3) != (regs->cs & 3)) ) { regs->error_code = sel; do_guest_trap(TRAP_gp_fault, regs, 1); return; } if ( !(ar & _SEGMENT_P) ) { regs->error_code = sel; do_guest_trap(TRAP_no_segment, regs, 1); return; } if ( off > limit ) { regs->error_code = 0; do_guest_trap(TRAP_gp_fault, regs, 1); return; } if ( !jump ) { unsigned int ss, esp, *stkp; int rc; #define push(item) do \ { \ --stkp; \ esp -= 4; \ rc = __put_user(item, stkp); \ if ( rc ) \ { \ propagate_page_fault((unsigned long)(stkp + 1) - rc, \ PFEC_write_access); \ return; \ } \ } while ( 0 ) if ( ((ar >> 13) & 3) < (regs->cs & 3) ) { sel |= (ar >> 13) & 3; /* Inner stack known only for kernel ring. */ if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } esp = v->arch.pv_vcpu.kernel_sp; ss = v->arch.pv_vcpu.kernel_ss; if ( (ss & 3) != (sel & 3) || !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) || ((ar >> 13) & 3) != (sel & 3) || !(ar & _SEGMENT_S) || (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR) ) { regs->error_code = ss & ~3; do_guest_trap(TRAP_invalid_tss, regs, 1); return; } if ( !(ar & _SEGMENT_P) || !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) ) { regs->error_code = ss & ~3; do_guest_trap(TRAP_stack_error, regs, 1); return; } stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp); if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } push(regs->ss); push(regs->esp); if ( nparm ) { const unsigned int *ustkp; if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) || ((ar >> 13) & 3) != (regs->cs & 3) || !(ar & _SEGMENT_S) || (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR) || !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) ) return do_guest_trap(TRAP_gp_fault, regs, 1); ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4); if ( !compat_access_ok(ustkp - nparm, nparm * 4) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } do { unsigned int parm; --ustkp; rc = __get_user(parm, ustkp); if ( rc ) { propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0); return; } push(parm); } while ( --nparm ); } } else { sel |= (regs->cs & 3); esp = regs->esp; ss = regs->ss; if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) || ((ar >> 13) & 3) != (sel & 3) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } if ( !check_stack_limit(ar, limit, esp, 2 * 4) ) { regs->error_code = 0; do_guest_trap(TRAP_stack_error, regs, 1); return; } stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp); if ( !compat_access_ok(stkp - 2, 2 * 4) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } } push(regs->cs); push(eip); #undef push regs->esp = esp; regs->ss = ss; } else sel |= (regs->cs & 3); regs->cs = sel; instruction_done(regs, off, 0); } void do_general_protection(struct cpu_user_regs *regs) { struct vcpu *v = current; unsigned long fixup; DEBUGGER_trap_entry(TRAP_gp_fault, regs); if ( regs->error_code & 1 ) goto hardware_gp; if ( !guest_mode(regs) ) goto gp_in_kernel; /* * Cunning trick to allow arbitrary "INT n" handling. * * We set DPL == 0 on all vectors in the IDT. This prevents any INT * instruction from trapping to the appropriate vector, when that might not * be expected by Xen or the guest OS. 
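 * (Architecturally, a software INT n compares CPL against the gate's DPL
 * and, with DPL == 0, guest code fails that check; the CPU then raises
 * #GP with error code (n << 3) | 2 - IDT bit set, external bit clear -
 * which is exactly the pattern tested for below.)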
For example, that entry might be for * a fault handler (unlike traps, faults don't increment EIP), or might * expect an error code on the stack (which a software trap never * provides), or might be a hardware interrupt handler that doesn't like * being called spuriously. * * Instead, a GPF occurs with the faulting IDT vector in the error code. * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is * clear to indicate that it's a software fault, not hardware. * * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is * okay because they can only be triggered by an explicit DPL-checked * instruction. The DPL specified by the guest OS for these vectors is NOT * CHECKED!! */ if ( (regs->error_code & 3) == 2 ) { /* This fault must be due to instruction. */ const struct trap_info *ti; unsigned char vector = regs->error_code >> 3; ti = &v->arch.pv_vcpu.trap_ctxt[vector]; if ( permit_softint(TI_GET_DPL(ti), v, regs) ) { regs->eip += 2; do_guest_trap(vector, regs, 0); return; } } else if ( is_pv_32on64_vcpu(v) && regs->error_code ) { emulate_gate_op(regs); return; } /* Emulate some simple privileged and I/O instructions. */ if ( (regs->error_code == 0) && emulate_privileged_op(regs) ) { trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip); return; } /* Pass on GPF as is. */ do_guest_trap(TRAP_gp_fault, regs, 1); return; gp_in_kernel: if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n", regs->error_code, _p(regs->eip), _p(fixup)); this_cpu(last_extable_addr) = regs->eip; regs->eip = fixup; return; } DEBUGGER_trap_fatal(TRAP_gp_fault, regs); hardware_gp: show_execution_state(regs); panic("GENERAL PROTECTION FAULT\n[error_code=%04x]", regs->error_code); } static DEFINE_PER_CPU(struct softirq_trap, softirq_trap); static void nmi_mce_softirq(void) { int cpu = smp_processor_id(); struct softirq_trap *st = &per_cpu(softirq_trap, cpu); BUG_ON(st == NULL); BUG_ON(st->vcpu == NULL); /* Set the tmp value unconditionally, so that * the check in the iret hypercall works. */ cpumask_copy(st->vcpu->cpu_affinity_tmp, st->vcpu->cpu_affinity); if ((cpu != st->processor) || (st->processor != st->vcpu->processor)) { /* We are on a different physical cpu. * Make sure to wakeup the vcpu on the * specified processor. */ vcpu_set_affinity(st->vcpu, cpumask_of(st->processor)); /* Affinity is restored in the iret hypercall. */ } /* Only used to defer wakeup of domain/vcpu to * a safe (non-NMI/MCE) context. */ vcpu_kick(st->vcpu); st->vcpu = NULL; } static void pci_serr_softirq(void) { printk("\n\nNMI - PCI system error (SERR)\n"); outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */ } void async_exception_cleanup(struct vcpu *curr) { int trap; if ( !curr->async_exception_mask ) return; /* Restore affinity. */ if ( !cpumask_empty(curr->cpu_affinity_tmp) && !cpumask_equal(curr->cpu_affinity_tmp, curr->cpu_affinity) ) { vcpu_set_affinity(curr, curr->cpu_affinity_tmp); cpumask_clear(curr->cpu_affinity_tmp); } if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) ) trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE); else for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap ) if ( (curr->async_exception_mask ^ curr->async_exception_state(trap).old_mask) == (1 << trap) ) break; ASSERT(trap <= VCPU_TRAP_LAST); /* Restore previous asynchronous exception mask. 
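 *
 * 'trap' was chosen above as follows: if at most one asynchronous
 * exception (NMI or MCE) is outstanding, __scanbit() yields its index
 * directly; otherwise the loop picks the trap whose saved old_mask
 * differs from the current mask in exactly that trap's own bit, which
 * in effect identifies the most recently injected one.  Its saved mask
 * is what gets restored here.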
*/ curr->async_exception_mask = curr->async_exception_state(trap).old_mask; } static void nmi_dom0_report(unsigned int reason_idx) { struct domain *d = dom0; if ( (d == NULL) || (d->vcpu == NULL) || (d->vcpu[0] == NULL) ) return; set_bit(reason_idx, nmi_reason(d)); send_guest_trap(d, 0, TRAP_nmi); } static void pci_serr_error(struct cpu_user_regs *regs) { outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */ switch ( opt_nmi[0] ) { case 'd': /* 'dom0' */ nmi_dom0_report(_XEN_NMIREASON_pci_serr); case 'i': /* 'ignore' */ /* Would like to print a diagnostic here but can't call printk() from NMI context -- raise a softirq instead. */ raise_softirq(PCI_SERR_SOFTIRQ); break; default: /* 'fatal' */ console_force_unlock(); printk("\n\nNMI - PCI system error (SERR)\n"); fatal_trap(TRAP_nmi, regs); } } static void io_check_error(struct cpu_user_regs *regs) { switch ( opt_nmi[0] ) { case 'd': /* 'dom0' */ nmi_dom0_report(_XEN_NMIREASON_io_error); case 'i': /* 'ignore' */ break; default: /* 'fatal' */ console_force_unlock(); printk("\n\nNMI - I/O ERROR\n"); fatal_trap(TRAP_nmi, regs); } outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */ mdelay(1); outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */ } static void unknown_nmi_error(struct cpu_user_regs *regs, unsigned char reason) { switch ( opt_nmi[0] ) { case 'd': /* 'dom0' */ nmi_dom0_report(_XEN_NMIREASON_unknown); case 'i': /* 'ignore' */ break; default: /* 'fatal' */ console_force_unlock(); printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); printk("Do you have a strange power saving mode enabled?\n"); fatal_trap(TRAP_nmi, regs); } } static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu) { return 0; } static nmi_callback_t nmi_callback = dummy_nmi_callback; void do_nmi(struct cpu_user_regs *regs) { unsigned int cpu = smp_processor_id(); unsigned char reason; ++nmi_count(cpu); if ( nmi_callback(regs, cpu) ) return; if ( nmi_watchdog ) nmi_watchdog_tick(regs); /* Only the BSP gets external NMIs from the system. */ if ( cpu == 0 ) { reason = inb(0x61); if ( reason & 0x80 ) pci_serr_error(regs); if ( reason & 0x40 ) io_check_error(regs); if ( !(reason & 0xc0) && !nmi_watchdog ) unknown_nmi_error(regs, reason); } } void set_nmi_callback(nmi_callback_t callback) { nmi_callback = callback; } void unset_nmi_callback(void) { nmi_callback = dummy_nmi_callback; } void do_device_not_available(struct cpu_user_regs *regs) { struct vcpu *curr = current; BUG_ON(!guest_mode(regs)); vcpu_restore_fpu_lazy(curr); if ( curr->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS ) { do_guest_trap(TRAP_no_device, regs, 0); curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS; } else TRACE_0D(TRC_PV_MATH_STATE_RESTORE); return; } u64 read_efer(void) { return this_cpu(efer); } void write_efer(u64 val) { this_cpu(efer) = val; wrmsrl(MSR_EFER, val); } static void ler_enable(void) { u64 debugctl; if ( !this_cpu(ler_msr) ) return; rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR); } void do_debug(struct cpu_user_regs *regs) { struct vcpu *v = current; DEBUGGER_trap_entry(TRAP_debug, regs); if ( !guest_mode(regs) ) { if ( regs->eflags & X86_EFLAGS_TF ) { /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. 
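 *
 * If the single-step trap hits inside that window (rip between
 * sysenter_entry and sysenter_eflags_saved) we simply resume without
 * touching TF: the guest's EFLAGS value has not been captured yet, so
 * clearing TF now would silently lose the guest's own trap flag.  Only
 * at the point where the flags have been saved is it safe to clear TF
 * in the live frame.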
*/ if ( (regs->rip >= (unsigned long)sysenter_entry) && (regs->rip <= (unsigned long)sysenter_eflags_saved) ) { if ( regs->rip == (unsigned long)sysenter_eflags_saved ) regs->eflags &= ~X86_EFLAGS_TF; goto out; } if ( !debugger_trap_fatal(TRAP_debug, regs) ) { WARN_ON(1); regs->eflags &= ~X86_EFLAGS_TF; } } else { /* * We ignore watchpoints when they trigger within Xen. This may * happen when a buffer is passed to us which previously had a * watchpoint set on it. No need to bump EIP; the only faulting * trap is an instruction breakpoint, which can't happen to us. */ WARN_ON(!search_exception_table(regs->eip)); } goto out; } /* Save debug status register where guest OS can peek at it */ v->arch.debugreg[6] = read_debugreg(6); ler_enable(); do_guest_trap(TRAP_debug, regs, 0); return; out: ler_enable(); return; } void do_spurious_interrupt_bug(struct cpu_user_regs *regs) { } static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr) { int i; /* Keep secondary tables in sync with IRQ updates. */ for ( i = 1; i < nr_cpu_ids; i++ ) if ( idt_tables[i] != NULL ) _set_gate(&idt_tables[i][n], 14, dpl, addr); _set_gate(&idt_table[n], 14, dpl, addr); } static void set_swint_gate(unsigned int n, void *addr) { __set_intr_gate(n, 3, addr); } void set_intr_gate(unsigned int n, void *addr) { __set_intr_gate(n, 0, addr); } void load_TR(void) { struct tss_struct *tss = &this_cpu(init_tss); struct desc_ptr old_gdt, tss_gdt = { .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY), .limit = LAST_RESERVED_GDT_BYTE }; _set_tssldt_desc( this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, (unsigned long)tss, offsetof(struct tss_struct, __cacheline_filler) - 1, 9); _set_tssldt_desc( this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, (unsigned long)tss, offsetof(struct tss_struct, __cacheline_filler) - 1, 11); /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */ asm volatile ( "sgdt %0; lgdt %2; ltr %w1; lgdt %0" : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" ); } void __devinit percpu_traps_init(void) { subarch_percpu_traps_init(); if ( !opt_ler ) return; switch ( boot_cpu_data.x86_vendor ) { case X86_VENDOR_INTEL: switch ( boot_cpu_data.x86 ) { case 6: this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP; break; case 15: this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP; break; } break; case X86_VENDOR_AMD: switch ( boot_cpu_data.x86 ) { case 6: case 0xf ... 0x17: this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP; break; } break; } ler_enable(); } void __init trap_init(void) { /* * Note that interrupt gates are always used, rather than trap gates. We * must have interrupts disabled until DS/ES/FS/GS are saved because the * first activation must have the "bad" value(s) for these registers and * we may lose them if another activation is installed before they are * saved. The page-fault handler also needs interrupts disabled until %cr2 * has been read and saved on the stack. 
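 *
 * Accordingly __set_intr_gate() below always installs type-14 (interrupt)
 * gates, which clear IF on entry; a type-15 trap gate would leave
 * interrupts enabled and reopen the races described above.  The two
 * set_swint_gate() vectors differ only in DPL (3 rather than 0), so that
 * an explicit INT3/INTO from guest user space enters Xen directly instead
 * of being turned into #GP by the DPL check.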
*/ set_intr_gate(TRAP_divide_error,÷_error); set_intr_gate(TRAP_debug,&debug); set_intr_gate(TRAP_nmi,&nmi); set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */ set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */ set_intr_gate(TRAP_bounds,&bounds); set_intr_gate(TRAP_invalid_op,&invalid_op); set_intr_gate(TRAP_no_device,&device_not_available); set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun); set_intr_gate(TRAP_invalid_tss,&invalid_TSS); set_intr_gate(TRAP_no_segment,&segment_not_present); set_intr_gate(TRAP_stack_error,&stack_segment); set_intr_gate(TRAP_gp_fault,&general_protection); set_intr_gate(TRAP_page_fault,&page_fault); set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug); set_intr_gate(TRAP_copro_error,&coprocessor_error); set_intr_gate(TRAP_alignment_check,&alignment_check); set_intr_gate(TRAP_machine_check,&machine_check); set_intr_gate(TRAP_simd_error,&simd_coprocessor_error); /* CPU0 uses the master IDT. */ idt_tables[0] = idt_table; this_cpu(gdt_table) = boot_cpu_gdt_table; this_cpu(compat_gdt_table) = boot_cpu_compat_gdt_table; percpu_traps_init(); cpu_init(); open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq); open_softirq(PCI_SERR_SOFTIRQ, pci_serr_softirq); } long register_guest_nmi_callback(unsigned long address) { struct vcpu *v = current; struct domain *d = v->domain; struct trap_info *t = &v->arch.pv_vcpu.trap_ctxt[TRAP_nmi]; if ( !is_canonical_address(address) ) return -EINVAL; t->vector = TRAP_nmi; t->flags = 0; t->cs = (is_pv_32on64_domain(d) ? FLAT_COMPAT_KERNEL_CS : FLAT_KERNEL_CS); t->address = address; TI_SET_IF(t, 1); /* * If no handler was registered we can 'lose the NMI edge'. Re-assert it * now. */ if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) ) v->nmi_pending = 1; return 0; } long unregister_guest_nmi_callback(void) { struct vcpu *v = current; struct trap_info *t = &v->arch.pv_vcpu.trap_ctxt[TRAP_nmi]; memset(t, 0, sizeof(*t)); return 0; } int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr) { struct vcpu *v; struct trap_info *t; BUG_ON(d == NULL); BUG_ON(vcpuid >= d->max_vcpus); /* Sanity check - XXX should be more fine grained. */ BUG_ON(trap_nr >= NR_VECTORS); v = d->vcpu[vcpuid]; t = &v->arch.pv_vcpu.trap_ctxt[trap_nr]; return (t->address != 0); } int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr) { struct vcpu *v; struct softirq_trap *st = &per_cpu(softirq_trap, smp_processor_id()); BUG_ON(d == NULL); BUG_ON(vcpuid >= d->max_vcpus); v = d->vcpu[vcpuid]; switch (trap_nr) { case TRAP_nmi: if ( cmpxchgptr(&st->vcpu, NULL, v) ) return -EBUSY; if ( !test_and_set_bool(v->nmi_pending) ) { st->domain = d; st->processor = v->processor; /* not safe to wake up a vcpu here */ raise_softirq(NMI_MCE_SOFTIRQ); return 0; } st->vcpu = NULL; break; case TRAP_machine_check: if ( cmpxchgptr(&st->vcpu, NULL, v) ) return -EBUSY; /* We are called by the machine check (exception or polling) handlers * on the physical CPU that reported a machine check error. */ if ( !test_and_set_bool(v->mce_pending) ) { st->domain = d; st->vcpu = v; st->processor = v->processor; /* not safe to wake up a vcpu here */ raise_softirq(NMI_MCE_SOFTIRQ); return 0; } st->vcpu = NULL; break; } /* delivery failed */ return -EIO; } long do_set_trap_table(XEN_GUEST_HANDLE_PARAM(const_trap_info_t) traps) { struct trap_info cur; struct vcpu *curr = current; struct trap_info *dst = curr->arch.pv_vcpu.trap_ctxt; long rc = 0; /* If no table is presented then clear the entire virtual IDT. 
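 *
 * Otherwise entries are copied in one at a time and an entry with
 * address == 0 terminates the list, so a guest typically registers its
 * handlers with something along these lines (guest-side sketch only,
 * with a made-up handler name, not code from this file):
 *
 *     static struct trap_info table[] = {
 *         { 14, 0, FLAT_KERNEL_CS, (unsigned long)xen_page_fault },
 *         ...
 *         { 0, 0, 0, 0 }   terminating entry
 *     };
 *     HYPERVISOR_set_trap_table(table);
 *
 * Vector 0x80 additionally re-initialises the int80 direct-trap bounce,
 * keeping the legacy int $0x80 system-call path on its fast path.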
*/ if ( guest_handle_is_null(traps) ) { memset(dst, 0, NR_VECTORS * sizeof(*dst)); init_int80_direct_trap(curr); return 0; } for ( ; ; ) { if ( hypercall_preempt_check() ) { rc = hypercall_create_continuation( __HYPERVISOR_set_trap_table, "h", traps); break; } if ( copy_from_guest(&cur, traps, 1) ) { rc = -EFAULT; break; } if ( cur.address == 0 ) break; if ( !is_canonical_address(cur.address) ) return -EINVAL; fixup_guest_code_selector(curr->domain, cur.cs); memcpy(&dst[cur.vector], &cur, sizeof(cur)); if ( cur.vector == 0x80 ) init_int80_direct_trap(curr); guest_handle_add_offset(traps, 1); } return rc; } long set_debugreg(struct vcpu *v, int reg, unsigned long value) { int i; struct vcpu *curr = current; switch ( reg ) { case 0: if ( !access_ok(value, sizeof(long)) ) return -EPERM; if ( v == curr ) write_debugreg(0, value); break; case 1: if ( !access_ok(value, sizeof(long)) ) return -EPERM; if ( v == curr ) write_debugreg(1, value); break; case 2: if ( !access_ok(value, sizeof(long)) ) return -EPERM; if ( v == curr ) write_debugreg(2, value); break; case 3: if ( !access_ok(value, sizeof(long)) ) return -EPERM; if ( v == curr ) write_debugreg(3, value); break; case 6: /* * DR6: Bits 4-11,16-31 reserved (set to 1). * Bit 12 reserved (set to 0). */ value &= 0xffffefff; /* reserved bits => 0 */ value |= 0xffff0ff0; /* reserved bits => 1 */ if ( v == curr ) write_debugreg(6, value); break; case 7: /* * DR7: Bit 10 reserved (set to 1). * Bits 11-12,14-15 reserved (set to 0). */ value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */ value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */ /* * Privileged bits: * GD (bit 13): must be 0. */ if ( value & DR_GENERAL_DETECT ) return -EPERM; /* DR7.{G,L}E = 0 => debugging disabled for this domain. */ if ( value & DR7_ACTIVE_MASK ) { unsigned int io_enable = 0; for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE ) { if ( ((value >> i) & 3) == DR_IO ) { if ( !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ) return -EPERM; io_enable |= value & (3 << ((i - 16) >> 1)); } } /* Guest DR5 is a handy stash for I/O intercept information. */ v->arch.debugreg[5] = io_enable; value &= ~io_enable; /* * If DR7 was previously clear then we need to load all other * debug registers at this point as they were not restored during * context switch. */ if ( (v == curr) && !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) ) { write_debugreg(0, v->arch.debugreg[0]); write_debugreg(1, v->arch.debugreg[1]); write_debugreg(2, v->arch.debugreg[2]); write_debugreg(3, v->arch.debugreg[3]); write_debugreg(6, v->arch.debugreg[6]); } } if ( v == curr ) write_debugreg(7, value); break; default: return -EINVAL; } v->arch.debugreg[reg] = value; return 0; } long do_set_debugreg(int reg, unsigned long value) { return set_debugreg(current, reg, value); } unsigned long do_get_debugreg(int reg) { struct vcpu *curr = current; switch ( reg ) { case 0 ... 3: case 6: return curr->arch.debugreg[reg]; case 7: return (curr->arch.debugreg[7] | curr->arch.debugreg[5]); case 4 ... 5: return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ? 
curr->arch.debugreg[reg + 2] : 0); } return -EINVAL; } void asm_domain_crash_synchronous(unsigned long addr) { if ( addr == 0 ) addr = this_cpu(last_extable_addr); printk("domain_crash_sync called from entry.S: fault at %p %pS\n", _p(addr), _p(addr)); __domain_crash_synchronous(); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/sysctl.c0000664000175000017500000000533412307313555014271 0ustar smbsmb/****************************************************************************** * Arch-specific sysctl.c * * System management operations. For use by node control stack. * * Copyright (c) 2002-2006, K Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) long cpu_up_helper(void *data) { int cpu = (unsigned long)data; int ret = cpu_up(cpu); if ( ret == -EBUSY ) { /* On EBUSY, flush RCU work and have one more go. */ rcu_barrier(); ret = cpu_up(cpu); } return ret; } long cpu_down_helper(void *data) { int cpu = (unsigned long)data; int ret = cpu_down(cpu); if ( ret == -EBUSY ) { /* On EBUSY, flush RCU work and have one more go. */ rcu_barrier(); ret = cpu_down(cpu); } return ret; } void arch_do_physinfo(xen_sysctl_physinfo_t *pi) { memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4); if ( hvm_enabled ) pi->capabilities |= XEN_SYSCTL_PHYSCAP_hvm; if ( iommu_enabled ) pi->capabilities |= XEN_SYSCTL_PHYSCAP_hvm_directio; } long arch_do_sysctl( struct xen_sysctl *sysctl, XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl) { long ret = 0; switch ( sysctl->cmd ) { case XEN_SYSCTL_cpu_hotplug: { unsigned int cpu = sysctl->u.cpu_hotplug.cpu; switch ( sysctl->u.cpu_hotplug.op ) { case XEN_SYSCTL_CPU_HOTPLUG_ONLINE: ret = xsm_resource_plug_core(XSM_HOOK); if ( ret ) break; ret = continue_hypercall_on_cpu( 0, cpu_up_helper, (void *)(unsigned long)cpu); break; case XEN_SYSCTL_CPU_HOTPLUG_OFFLINE: ret = xsm_resource_unplug_core(XSM_HOOK); if ( ret ) break; ret = continue_hypercall_on_cpu( 0, cpu_down_helper, (void *)(unsigned long)cpu); break; default: ret = -EINVAL; break; } } break; default: ret = -ENOSYS; break; } return ret; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/microcode_amd.c0000664000175000017500000002706212307313555015537 0ustar smbsmb/* * AMD CPU Microcode Update Driver for Linux * Copyright (C) 2008 Advanced Micro Devices Inc. * * Author: Peter Oruba * * Based on work by: * Tigran Aivazian * * This driver allows to upgrade microcode on AMD * family 0x10 and 0x11 processors. * * Licensed unter the terms of the GNU General Public * License version 2. See file COPYING for details. 
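 *
 * Input format: the blob handed in by the toolstack is an AMD microcode
 * "container" - a 32-bit UCODE_MAGIC word, then a CPU equivalence table
 * section, then one or more patch sections.  Every section is framed by
 * a struct mpbhdr (type, len, data[]); install_equiv_cpu_table() and
 * get_ucode_from_buffer_amd() below walk those sections in order, and
 * each patch which microcode_fits() accepts for the current CPU's
 * equivalence ID is applied as it is found.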
*/ #include #include #include #include #include #include #include #include #include #include #include struct equiv_cpu_entry { uint32_t installed_cpu; uint32_t fixed_errata_mask; uint32_t fixed_errata_compare; uint16_t equiv_cpu; uint16_t reserved; } __attribute__((packed)); struct microcode_header_amd { uint32_t data_code; uint32_t patch_id; uint8_t mc_patch_data_id[2]; uint8_t mc_patch_data_len; uint8_t init_flag; uint32_t mc_patch_data_checksum; uint32_t nb_dev_id; uint32_t sb_dev_id; uint16_t processor_rev_id; uint8_t nb_rev_id; uint8_t sb_rev_id; uint8_t bios_api_rev; uint8_t reserved1[3]; uint32_t match_reg[8]; } __attribute__((packed)); #define UCODE_MAGIC 0x00414d44 #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 struct microcode_amd { void *mpb; size_t mpb_size; struct equiv_cpu_entry *equiv_cpu_table; size_t equiv_cpu_table_size; }; struct mpbhdr { uint32_t type; uint32_t len; uint8_t data[]; }; /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); /* See comment in start_update() for cases when this routine fails */ static int collect_cpu_info(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data[cpu]; memset(csig, 0, sizeof(*csig)); if ( (c->x86_vendor != X86_VENDOR_AMD) || (c->x86 < 0x10) ) { printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", cpu); return -EINVAL; } rdmsrl(MSR_AMD_PATCHLEVEL, csig->rev); printk(KERN_DEBUG "microcode: CPU%d collect_cpu_info: patch_id=%#x\n", cpu, csig->rev); return 0; } static bool_t microcode_fits(const struct microcode_amd *mc_amd, int cpu) { struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); const struct microcode_header_amd *mc_header = mc_amd->mpb; const struct equiv_cpu_entry *equiv_cpu_table = mc_amd->equiv_cpu_table; unsigned int current_cpu_id; unsigned int equiv_cpu_id = 0x0; unsigned int i; /* We should bind the task to the CPU */ BUG_ON(cpu != raw_smp_processor_id()); current_cpu_id = cpuid_eax(0x00000001); for ( i = 0; equiv_cpu_table[i].installed_cpu != 0; i++ ) { if ( current_cpu_id == equiv_cpu_table[i].installed_cpu ) { equiv_cpu_id = equiv_cpu_table[i].equiv_cpu & 0xffff; break; } } if ( !equiv_cpu_id ) return 0; if ( (mc_header->processor_rev_id) != equiv_cpu_id ) return 0; if ( mc_header->patch_id <= uci->cpu_sig.rev ) return 0; printk(KERN_DEBUG "microcode: CPU%d found a matching microcode " "update with version %#x (current=%#x)\n", cpu, mc_header->patch_id, uci->cpu_sig.rev); return 1; } static int apply_microcode(int cpu) { unsigned long flags; struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); uint32_t rev; struct microcode_amd *mc_amd = uci->mc.mc_amd; struct microcode_header_amd *hdr; /* We should bind the task to the CPU */ BUG_ON(raw_smp_processor_id() != cpu); if ( mc_amd == NULL ) return -EINVAL; hdr = mc_amd->mpb; if ( hdr == NULL ) return -EINVAL; spin_lock_irqsave(µcode_update_lock, flags); wrmsrl(MSR_AMD_PATCHLOADER, (unsigned long)hdr); /* get patch id after patching */ rdmsrl(MSR_AMD_PATCHLEVEL, rev); spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ if ( rev != hdr->patch_id ) { printk(KERN_ERR "microcode: CPU%d update from revision " "%#x to %#x failed\n", cpu, hdr->patch_id, rev); return -EIO; } printk(KERN_WARNING "microcode: CPU%d updated from revision %#x to %#x\n", cpu, uci->cpu_sig.rev, hdr->patch_id); uci->cpu_sig.rev = rev; return 0; } static int get_ucode_from_buffer_amd( struct microcode_amd *mc_amd, const void *buf, 
size_t bufsize, size_t *offset) { const uint8_t *bufp = buf; size_t off; const struct mpbhdr *mpbuf; off = *offset; /* No more data */ if ( off >= bufsize ) { printk(KERN_ERR "microcode: Microcode buffer overrun\n"); return -EINVAL; } mpbuf = (const struct mpbhdr *)&bufp[off]; if ( mpbuf->type != UCODE_UCODE_TYPE ) { printk(KERN_ERR "microcode: Wrong microcode payload type field\n"); return -EINVAL; } if ( (off + mpbuf->len) > bufsize ) { printk(KERN_ERR "microcode: Bad data in microcode data file\n"); return -EINVAL; } if ( mc_amd->mpb_size < mpbuf->len ) { if ( mc_amd->mpb ) { xfree(mc_amd->mpb); mc_amd->mpb_size = 0; } mc_amd->mpb = xmalloc_bytes(mpbuf->len); if ( mc_amd->mpb == NULL ) return -ENOMEM; mc_amd->mpb_size = mpbuf->len; } memcpy(mc_amd->mpb, mpbuf->data, mpbuf->len); *offset = off + mpbuf->len + 8; printk(KERN_DEBUG "microcode: CPU%d size %zu, block size %u offset %zu equivID %#x rev %#x\n", raw_smp_processor_id(), bufsize, mpbuf->len, off, ((struct microcode_header_amd *)mc_amd->mpb)->processor_rev_id, ((struct microcode_header_amd *)mc_amd->mpb)->patch_id); return 0; } static int install_equiv_cpu_table( struct microcode_amd *mc_amd, const uint32_t *buf, size_t *offset) { const struct mpbhdr *mpbuf = (const struct mpbhdr *)&buf[1]; /* No more data */ if ( mpbuf->len + 12 >= *offset ) return -EINVAL; if ( mpbuf->type != UCODE_EQUIV_CPU_TABLE_TYPE ) { printk(KERN_ERR "microcode: Wrong microcode equivalent cpu table type field\n"); return -EINVAL; } if ( mpbuf->len == 0 ) { printk(KERN_ERR "microcode: Wrong microcode equivalent cpu table length\n"); return -EINVAL; } mc_amd->equiv_cpu_table = xmalloc_bytes(mpbuf->len); if ( !mc_amd->equiv_cpu_table ) { printk(KERN_ERR "microcode: Cannot allocate memory for equivalent cpu table\n"); return -ENOMEM; } memcpy(mc_amd->equiv_cpu_table, mpbuf->data, mpbuf->len); mc_amd->equiv_cpu_table_size = mpbuf->len; *offset = mpbuf->len + 12; /* add header length */ return 0; } static int cpu_request_microcode(int cpu, const void *buf, size_t bufsize) { struct microcode_amd *mc_amd, *mc_old; size_t offset = bufsize; size_t last_offset, applied_offset = 0; int error = 0, save_error = 1; struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); /* We should bind the task to the CPU */ BUG_ON(cpu != raw_smp_processor_id()); if ( *(const uint32_t *)buf != UCODE_MAGIC ) { printk(KERN_ERR "microcode: Wrong microcode patch file magic\n"); error = -EINVAL; goto out; } mc_amd = xmalloc(struct microcode_amd); if ( !mc_amd ) { printk(KERN_ERR "microcode: Cannot allocate memory for microcode patch\n"); error = -ENOMEM; goto out; } error = install_equiv_cpu_table(mc_amd, buf, &offset); if ( error ) { xfree(mc_amd); printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); error = -EINVAL; goto out; } mc_old = uci->mc.mc_amd; /* implicitely validates uci->mc.mc_valid */ uci->mc.mc_amd = mc_amd; /* * It's possible the data file has multiple matching ucode, * lets keep searching till the latest version */ mc_amd->mpb = NULL; mc_amd->mpb_size = 0; last_offset = offset; while ( (error = get_ucode_from_buffer_amd(mc_amd, buf, bufsize, &offset)) == 0 ) { if ( microcode_fits(mc_amd, cpu) ) { error = apply_microcode(cpu); if ( error ) break; applied_offset = last_offset; } last_offset = offset; if ( offset >= bufsize ) break; } /* On success keep the microcode patch for * re-apply on resume. 
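 *
 * applied_offset records where in the buffer the last successfully
 * applied patch begins, so the call below re-parses just that patch
 * into mc_amd; that copy is what microcode_resume_match() matches and
 * re-installs when the CPU is brought back up (e.g. after S3 resume).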
*/ if ( applied_offset ) { save_error = get_ucode_from_buffer_amd( mc_amd, buf, bufsize, &applied_offset); if ( save_error ) error = save_error; } if ( save_error ) { xfree(mc_amd); uci->mc.mc_amd = mc_old; } else xfree(mc_old); out: svm_host_osvw_init(); /* * In some cases we may return an error even if processor's microcode has * been updated. For example, the first patch in a container file is loaded * successfully but subsequent container file processing encounters a * failure. */ return error; } static int microcode_resume_match(int cpu, const void *mc) { struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); struct microcode_amd *mc_amd = uci->mc.mc_amd; const struct microcode_amd *src = mc; if ( !microcode_fits(src, cpu) ) return 0; if ( src != mc_amd ) { if ( mc_amd ) { xfree(mc_amd->equiv_cpu_table); xfree(mc_amd->mpb); xfree(mc_amd); } mc_amd = xmalloc(struct microcode_amd); uci->mc.mc_amd = mc_amd; if ( !mc_amd ) return -ENOMEM; mc_amd->equiv_cpu_table = xmalloc_bytes(src->equiv_cpu_table_size); if ( !mc_amd->equiv_cpu_table ) goto err1; mc_amd->mpb = xmalloc_bytes(src->mpb_size); if ( !mc_amd->mpb ) goto err2; mc_amd->equiv_cpu_table_size = src->equiv_cpu_table_size; mc_amd->mpb_size = src->mpb_size; memcpy(mc_amd->mpb, src->mpb, src->mpb_size); memcpy(mc_amd->equiv_cpu_table, src->equiv_cpu_table, src->equiv_cpu_table_size); } return 1; err2: xfree(mc_amd->equiv_cpu_table); err1: xfree(mc_amd); uci->mc.mc_amd = NULL; return -ENOMEM; } static int start_update(void) { /* * We assume here that svm_host_osvw_init() will be called on each cpu (from * cpu_request_microcode()). * * Note that if collect_cpu_info() returns an error then * cpu_request_microcode() will not invoked thus leaving OSVW bits not * updated. Currently though collect_cpu_info() will not fail on processors * supporting OSVW so we will not deal with this possibility. 
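 *
 * The reset/init pairing matters because a microcode update can change
 * which errata the hardware itself works around: the OSVW state is
 * cleared here, before any patch is applied, and recomputed by
 * svm_host_osvw_init() at the end of cpu_request_microcode() on each
 * CPU once the new patch has been loaded.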
*/ svm_host_osvw_reset(); return 0; } static const struct microcode_ops microcode_amd_ops = { .microcode_resume_match = microcode_resume_match, .cpu_request_microcode = cpu_request_microcode, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode, .start_update = start_update, }; static __init int microcode_init_amd(void) { if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) microcode_ops = µcode_amd_ops; return 0; } presmp_initcall(microcode_init_amd); xen-4.4.0/xen/arch/x86/efi/0000775000175000017500000000000012307313555013342 5ustar smbsmbxen-4.4.0/xen/arch/x86/efi/Makefile0000664000175000017500000000104012307313555014775 0ustar smbsmbCFLAGS += -fshort-wchar obj-y += stub.o create = test -e $(1) || touch -t 199901010000 $(1) efi := $(filter y,$(x86_64)$(shell rm -f disabled)) efi := $(if $(efi),$(shell $(CC) $(filter-out $(CFLAGS-y) .%.d,$(CFLAGS)) -c check.c 2>disabled && echo y)) efi := $(if $(efi),$(shell $(LD) -mi386pep --subsystem=10 -o check.efi check.o 2>disabled && echo y)) efi := $(if $(efi),$(shell rm disabled)y,$(shell $(call create,boot.init.o); $(call create,runtime.o))) extra-$(efi) += boot.init.o relocs-dummy.o runtime.o compat.o stub.o: $(extra-y) xen-4.4.0/xen/arch/x86/efi/relocs-dummy.S0000664000175000017500000000035312307313555016107 0ustar smbsmb#include .section .reloc, "a", @progbits .balign 4 GLOBAL(__base_relocs_start) .long 0 .long 8 GLOBAL(__base_relocs_end) .globl VIRT_START, ALT_START .equ VIRT_START, XEN_VIRT_START .equ ALT_START, XEN_VIRT_END xen-4.4.0/xen/arch/x86/efi/stub.c0000664000175000017500000000131412307313555014462 0ustar smbsmb#include #include #include #include #ifndef efi_enabled const bool_t efi_enabled = 0; #endif void __init efi_init_memory(void) { } unsigned long efi_get_time(void) { BUG(); return 0; } void efi_halt_system(void) { } void efi_reset_system(bool_t warm) { } int efi_get_info(uint32_t idx, union xenpf_efi_info *info) { return -ENOSYS; } int efi_compat_get_info(uint32_t idx, union compat_pf_efi_info *) __attribute__((__alias__("efi_get_info"))); int efi_runtime_call(struct xenpf_efi_runtime_call *op) { return -ENOSYS; } int efi_compat_runtime_call(struct compat_pf_efi_runtime_call *) __attribute__((__alias__("efi_runtime_call"))); xen-4.4.0/xen/arch/x86/efi/mkreloc.c0000664000175000017500000002441612307313555015151 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include struct mz_hdr { uint16_t signature; #define MZ_SIGNATURE 0x5a4d uint16_t last_page_size; uint16_t page_count; uint16_t relocation_count; uint16_t header_paras; uint16_t min_paras; uint16_t max_paras; uint16_t entry_ss; uint16_t entry_sp; uint16_t checksum; uint16_t entry_ip; uint16_t entry_cs; uint16_t relocations; uint16_t overlay; uint8_t reserved[32]; uint32_t extended_header_base; }; struct pe_hdr { uint32_t signature; #define PE_SIGNATURE 0x00004550 uint16_t cpu; uint16_t section_count; int32_t timestamp; uint32_t symbols_file_offset; uint32_t symbol_count; uint16_t opt_hdr_size; uint16_t flags; struct { uint16_t magic; #define PE_MAGIC_EXE32 0x010b #define PE_MAGIC_EXE32PLUS 0x020b uint8_t linker_major, linker_minor; uint32_t code_size, data_size, bss_size; uint32_t entry_rva, code_rva, data_rva; } opt_hdr; }; #define PE_PAGE_SIZE 0x1000 #define PE_BASE_RELOC_ABS 0 #define PE_BASE_RELOC_HIGHLOW 3 #define PE_BASE_RELOC_DIR64 10 struct coff_section { char name[8]; uint32_t size; uint32_t rva; uint32_t file_size; uint32_t file_offset; uint32_t relocation_file_offset; uint32_t line_number_file_offset; 
uint16_t relocation_count; uint16_t line_number_count; uint32_t flags; #define COFF_SECTION_BSS 0x00000080 #define COFF_SECTION_DISCARDABLE 0x02000000 }; static void usage(const char *cmd, int rc) { fprintf(rc ? stderr : stdout, "Usage: %s \n", cmd); exit(rc); } static unsigned int load(const char *name, int *handle, struct coff_section **sections, uint_fast64_t *image_base, uint32_t *image_size, unsigned int *width) { int in = open(name, O_RDONLY); struct mz_hdr mz_hdr; struct pe_hdr pe_hdr; uint32_t base; if ( in < 0 || read(in, &mz_hdr, sizeof(mz_hdr)) != sizeof(mz_hdr) ) { perror(name); exit(2); } if ( mz_hdr.signature != MZ_SIGNATURE || mz_hdr.relocations < sizeof(mz_hdr) || !mz_hdr.extended_header_base ) { fprintf(stderr, "%s: Wrong DOS file format\n", name); exit(2); } if ( lseek(in, mz_hdr.extended_header_base, SEEK_SET) < 0 || read(in, &pe_hdr, sizeof(pe_hdr)) != sizeof(pe_hdr) || read(in, &base, sizeof(base)) != sizeof(base) || /* * Luckily the image size field lives at the * same offset for both formats. */ lseek(in, 24, SEEK_CUR) < 0 || read(in, image_size, sizeof(*image_size)) != sizeof(*image_size) ) { perror(name); exit(3); } switch ( (pe_hdr.signature == PE_SIGNATURE && pe_hdr.opt_hdr_size > sizeof(pe_hdr.opt_hdr)) * pe_hdr.opt_hdr.magic ) { case PE_MAGIC_EXE32: *width = 32; *image_base = base; break; case PE_MAGIC_EXE32PLUS: *width = 64; *image_base = ((uint64_t)base << 32) | pe_hdr.opt_hdr.data_rva; break; default: fprintf(stderr, "%s: Wrong PE file format\n", name); exit(3); } *sections = malloc(pe_hdr.section_count * sizeof(**sections)); if ( !*sections ) { perror(NULL); exit(4); } if ( lseek(in, mz_hdr.extended_header_base + offsetof(struct pe_hdr, opt_hdr) + pe_hdr.opt_hdr_size, SEEK_SET) < 0 || read(in, *sections, pe_hdr.section_count * sizeof(**sections)) != pe_hdr.section_count * sizeof(**sections) ) { perror(name); exit(4); } *handle = in; return pe_hdr.section_count; } static long page_size; static const void *map_section(const struct coff_section *sec, int in, const char *name) { const char *ptr; unsigned long offs; if ( !page_size ) page_size = sysconf(_SC_PAGESIZE); offs = sec->file_offset & (page_size - 1); ptr = mmap(0, offs + sec->file_size, PROT_READ, MAP_PRIVATE, in, sec->file_offset - offs); if ( ptr == MAP_FAILED ) { perror(name); exit(6); } return ptr + offs; } static void unmap_section(const void *ptr, const struct coff_section *sec) { unsigned long offs = sec->file_offset & (page_size - 1); munmap((char *)ptr - offs, offs + sec->file_size); } static void diff_sections(const unsigned char *ptr1, const unsigned char *ptr2, const struct coff_section *sec, int_fast64_t diff, unsigned int width, uint_fast64_t base, uint_fast64_t end) { static uint_fast32_t cur_rva, reloc_size; unsigned int disp = 0; uint_fast32_t i; if ( !sec ) { reloc_size += reloc_size & 2; if ( reloc_size ) printf("\t.balign 4\n" "\t.equ rva_%08" PRIxFAST32 "_relocs, %#08" PRIxFAST32 "\n", cur_rva, reloc_size); return; } while ( !(diff & (((int_fast64_t)1 << ((disp + 1) * CHAR_BIT)) - 1)) ) ++disp; for ( i = 0; i < sec->file_size; ++i ) { uint_fast32_t rva; union { uint32_t u32; uint64_t u64; } val1, val2; int_fast64_t delta; unsigned int reloc = (width == 4 ? PE_BASE_RELOC_HIGHLOW : PE_BASE_RELOC_DIR64); if ( ptr1[i] == ptr2[i] ) continue; if ( i < disp || i + width - disp > sec->file_size ) { fprintf(stderr, "Bogus difference at %s:%08" PRIxFAST32 "\n", sec->name, i); exit(3); } memcpy(&val1, ptr1 + i - disp, width); memcpy(&val2, ptr2 + i - disp, width); delta = width == 4 ? 
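/*
 * A minimal standalone sketch of the offset rounding map_section() above
 * performs: mmap() requires a page-aligned file offset, so the mapping is
 * started at the containing page boundary and the in-page remainder is added
 * back to the returned pointer; unmap_section() undoes the same adjustment
 * before calling munmap(). The map_at() helper and the main() below are
 * illustrative only, with error handling trimmed.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static const void *map_at(int fd, unsigned long file_offset, unsigned long size)
{
    long page_size = sysconf(_SC_PAGESIZE);
    unsigned long slack = file_offset & (page_size - 1);     /* in-page part */
    const char *p = mmap(NULL, slack + size, PROT_READ, MAP_PRIVATE,
                         fd, file_offset - slack);           /* aligned start */

    return p == MAP_FAILED ? NULL : p + slack;               /* caller's view */
}

int main(int argc, char **argv)
{
    int fd = open(argv[0], O_RDONLY);            /* any readable file will do */
    const unsigned char *p = map_at(fd, 1, 3);

    (void)argc;
    if ( p )
        printf("%02x %02x %02x\n", p[0], p[1], p[2]);  /* "45 4c 46" on ELF */
    return 0;
}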
val2.u32 - val1.u32 : val2.u64 - val1.u64; if ( delta != diff ) { fprintf(stderr, "Difference at %s:%08" PRIxFAST32 " is %#" PRIxFAST64 " (expected %#" PRIxFAST64 ")\n", sec->name, i, delta, diff); continue; } if ( width == 8 && (val1.u64 < base || val1.u64 > end) ) reloc = PE_BASE_RELOC_HIGHLOW; rva = (sec->rva + i - disp) & ~(PE_PAGE_SIZE - 1); if ( rva > cur_rva ) { reloc_size += reloc_size & 2; if ( reloc_size ) printf("\t.equ rva_%08" PRIxFAST32 "_relocs," " %#08" PRIxFAST32 "\n", cur_rva, reloc_size); printf("\t.balign 4\n" "\t.long %#08" PRIxFAST32 "," " rva_%08" PRIxFAST32 "_relocs\n", rva, rva); cur_rva = rva; reloc_size = 8; } else if ( rva != cur_rva ) { fprintf(stderr, "Cannot handle decreasing RVA (at %s:%08" PRIxFAST32 ")\n", sec->name, i); exit(3); } printf("\t.word (%u << 12) | 0x%03" PRIxFAST32 "\n", reloc, sec->rva + i - disp - rva); reloc_size += 2; i += width - disp - 1; } } int main(int argc, char *argv[]) { int in1, in2; unsigned int i, nsec, width1, width2; uint_fast64_t base1, base2; uint32_t size1, size2; struct coff_section *sec1, *sec2; if ( argc == 1 || !strcmp(argv[1], "-?") || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help") ) usage(*argv, argc == 1); if ( argc != 3 ) usage(*argv, 1); nsec = load(argv[1], &in1, &sec1, &base1, &size1, &width1); if ( nsec != load(argv[2], &in2, &sec2, &base2, &size2, &width2) ) { fputs("Mismatched section counts\n", stderr); return 5; } if ( width1 != width2 ) { fputs("Mismatched image types\n", stderr); return 5; } width1 >>= 3; if ( base1 == base2 ) { fputs("Images must have different base addresses\n", stderr); return 5; } if ( size1 != size2 ) { fputs("Images must have identical sizes\n", stderr); return 5; } puts("\t.section .reloc, \"a\", @progbits\n" "\t.balign 4\n" "\t.globl __base_relocs_start, __base_relocs_end\n" "__base_relocs_start:"); for ( i = 0; i < nsec; ++i ) { const void *ptr1, *ptr2; if ( memcmp(sec1[i].name, sec2[i].name, sizeof(sec1[i].name)) || sec1[i].rva != sec2[i].rva || sec1[i].size != sec2[i].size || sec1[i].file_size != sec2[i].file_size || sec1[i].flags != sec2[i].flags ) { fprintf(stderr, "Mismatched section %u parameters\n", i); return 5; } if ( !sec1[i].size || (sec1[i].flags & (COFF_SECTION_DISCARDABLE|COFF_SECTION_BSS)) ) continue; /* * Don't generate relocations for sections that definitely * aren't used by the boot loader code. */ if ( memcmp(sec1[i].name, ".initcal", sizeof(sec1[i].name)) == 0 || memcmp(sec1[i].name, ".init.se", sizeof(sec1[i].name)) == 0 || memcmp(sec1[i].name, ".lockpro", sizeof(sec1[i].name)) == 0 ) continue; if ( !sec1[i].rva ) { fprintf(stderr, "Can't handle section %u with zero RVA\n", i); return 3; } if ( sec1[i].file_size > sec1[i].size ) { sec1[i].file_size = sec1[i].size; sec2[i].file_size = sec2[i].size; } ptr1 = map_section(sec1 + i, in1, argv[1]); ptr2 = map_section(sec2 + i, in2, argv[2]); diff_sections(ptr1, ptr2, sec1 + i, base2 - base1, width1, base1, base1 + size1); unmap_section(ptr1, sec1 + i); unmap_section(ptr2, sec2 + i); } diff_sections(NULL, NULL, NULL, 0, 0, 0, 0); puts("__base_relocs_end:"); close(in1); close(in2); return 0; } xen-4.4.0/xen/arch/x86/efi/boot.c0000664000175000017500000015653012307313555014463 0ustar smbsmb#include "efi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #if EFI_PAGE_SIZE != PAGE_SIZE # error Cannot use xen/pfn.h here! 
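/*
 * A minimal standalone sketch of the PE .reloc layout that the mkreloc tool
 * above emits (the ".long rva, size" block header plus ".word (type << 12) |
 * offset" entries) and that relocate_image() in boot.c further below walks:
 * each block covers one 4k page, the top 4 bits of an entry select the
 * relocation type (3 = HIGHLOW, 10 = DIR64, matching the PE_BASE_RELOC_*
 * values above) and the low 12 bits give the offset within that page. The
 * decoder and sample data below are illustrative, not Xen code.
 */
#include <stdint.h>
#include <stdio.h>

struct reloc_block {           /* in-image block layout */
    uint32_t rva;              /* page-aligned RVA the block covers */
    uint32_t size;             /* block size in bytes, header included */
    uint16_t entries[];        /* (type << 12) | offset-within-page */
};

static void describe_entries(uint32_t page_rva, const uint16_t *entries,
                             unsigned int n)
{
    for ( unsigned int i = 0; i < n; ++i )
        printf("type %u at rva %#x\n",
               (unsigned int)(entries[i] >> 12),
               (unsigned int)(page_rva + (entries[i] & 0xfffu)));
}

int main(void)
{
    /* A DIR64 fixup at offset 0x10 and a HIGHLOW fixup at offset 0x200
     * of the page at RVA 0x1000. */
    const uint16_t entries[] = { (10u << 12) | 0x010, (3u << 12) | 0x200 };

    describe_entries(0x1000, entries, 2);
    return 0;
}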
#endif #include #include #include #include #include #define __ASSEMBLY__ /* avoid pulling in ACPI stuff (conflicts with EFI) */ #include #undef __ASSEMBLY__ #include #include /* Using SetVirtualAddressMap() is incompatible with kexec: */ #undef USE_SET_VIRTUAL_ADDRESS_MAP #define SHIM_LOCK_PROTOCOL_GUID \ { 0x605dab50, 0xe046, 0x4300, {0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23} } typedef EFI_STATUS (/* _not_ EFIAPI */ *EFI_SHIM_LOCK_VERIFY) ( IN VOID *Buffer, IN UINT32 Size); typedef struct { EFI_SHIM_LOCK_VERIFY Verify; } EFI_SHIM_LOCK_PROTOCOL; extern char start[]; extern u32 cpuid_ext_features; union string { CHAR16 *w; char *s; const char *cs; }; struct file { UINTN size; union { EFI_PHYSICAL_ADDRESS addr; void *ptr; }; }; static EFI_BOOT_SERVICES *__initdata efi_bs; static EFI_HANDLE __initdata efi_ih; static SIMPLE_TEXT_OUTPUT_INTERFACE *__initdata StdOut; static SIMPLE_TEXT_OUTPUT_INTERFACE *__initdata StdErr; static UINT32 __initdata mdesc_ver; static struct file __initdata cfg; static struct file __initdata kernel; static struct file __initdata ramdisk; static struct file __initdata ucode; static struct file __initdata xsm; static multiboot_info_t __initdata mbi = { .flags = MBI_MODULES | MBI_LOADERNAME }; static module_t __initdata mb_modules[3]; static CHAR16 __initdata newline[] = L"\r\n"; #define PrintStr(s) StdOut->OutputString(StdOut, s) #define PrintErr(s) StdErr->OutputString(StdErr, s) static CHAR16 *__init FormatDec(UINT64 Val, CHAR16 *Buffer) { if ( Val >= 10 ) Buffer = FormatDec(Val / 10, Buffer); *Buffer = (CHAR16)(L'0' + Val % 10); return Buffer + 1; } static CHAR16 *__init FormatHex(UINT64 Val, UINTN Width, CHAR16 *Buffer) { if ( Width > 1 || Val >= 0x10 ) Buffer = FormatHex(Val >> 4, Width ? Width - 1 : 0, Buffer); *Buffer = (CHAR16)((Val &= 0xf) < 10 ? L'0' + Val : L'a' + Val - 10); return Buffer + 1; } static void __init DisplayUint(UINT64 Val, INTN Width) { CHAR16 PrintString[32], *end; if (Width < 0) end = FormatDec(Val, PrintString); else { PrintStr(L"0x"); end = FormatHex(Val, Width, PrintString); } *end = 0; PrintStr(PrintString); } static CHAR16 *__init wstrcpy(CHAR16 *d, const CHAR16 *s) { CHAR16 *r = d; while ( (*d++ = *s++) != 0 ) ; return r; } static int __init wstrcmp(const CHAR16 *s1, const CHAR16 *s2) { while ( *s1 && *s1 == *s2 ) { ++s1; ++s2; } return *s1 - *s2; } static int __init wstrncmp(const CHAR16 *s1, const CHAR16 *s2, UINTN n) { while ( n && *s1 && *s1 == *s2 ) { --n; ++s1; ++s2; } return n ? 
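/*
 * A minimal standalone sketch of the recursive digit emission used by
 * FormatDec()/FormatHex() above: recursing on the high-order part before
 * storing the current digit writes the most significant digit first, so the
 * buffer is filled in reading order and needs no reverse pass; the caller
 * terminates the string, as DisplayUint() does. Plain char variant,
 * illustrative only.
 */
#include <stdio.h>

static char *format_dec(unsigned long long val, char *buf)
{
    if ( val >= 10 )
        buf = format_dec(val / 10, buf);   /* higher digits first */
    *buf = '0' + val % 10;
    return buf + 1;                        /* next free position */
}

int main(void)
{
    char buf[32];

    *format_dec(40400, buf) = '\0';        /* caller adds the terminator */
    puts(buf);                             /* prints "40400" */
    return 0;
}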
*s1 - *s2 : 0; } static CHAR16 *__init s2w(union string *str) { const char *s = str->s; CHAR16 *w; void *ptr; if ( efi_bs->AllocatePool(EfiLoaderData, (strlen(s) + 1) * sizeof(*w), &ptr) != EFI_SUCCESS ) return NULL; w = str->w = ptr; do { *w = *s++; } while ( *w++ ); return str->w; } static char *__init w2s(const union string *str) { const CHAR16 *w = str->w; char *s = str->s; do { if ( *w > 0x007f ) return NULL; *s = *w++; } while ( *s++ ); return str->s; } static bool_t __init match_guid(const EFI_GUID *guid1, const EFI_GUID *guid2) { return guid1->Data1 == guid2->Data1 && guid1->Data2 == guid2->Data2 && guid1->Data3 == guid2->Data3 && !memcmp(guid1->Data4, guid2->Data4, sizeof(guid1->Data4)); } static void __init __attribute__((__noreturn__)) blexit(const CHAR16 *str) { if ( str ) PrintStr((CHAR16 *)str); PrintStr(newline); if ( cfg.addr ) efi_bs->FreePages(cfg.addr, PFN_UP(cfg.size)); if ( kernel.addr ) efi_bs->FreePages(kernel.addr, PFN_UP(kernel.size)); if ( ramdisk.addr ) efi_bs->FreePages(ramdisk.addr, PFN_UP(ramdisk.size)); if ( ucode.addr ) efi_bs->FreePages(ucode.addr, PFN_UP(ucode.size)); if ( xsm.addr ) efi_bs->FreePages(xsm.addr, PFN_UP(xsm.size)); efi_bs->Exit(efi_ih, EFI_SUCCESS, 0, NULL); for( ; ; ); /* not reached */ } /* generic routine for printing error messages */ static void __init PrintErrMesg(const CHAR16 *mesg, EFI_STATUS ErrCode) { StdOut = StdErr; PrintErr((CHAR16 *)mesg); PrintErr(L": "); switch (ErrCode) { case EFI_NOT_FOUND: mesg = L"Not found"; break; case EFI_NO_MEDIA: mesg = L"The device has no media"; break; case EFI_MEDIA_CHANGED: mesg = L"Media changed"; break; case EFI_DEVICE_ERROR: mesg = L"Device error"; break; case EFI_VOLUME_CORRUPTED: mesg = L"Volume corrupted"; break; case EFI_ACCESS_DENIED: mesg = L"Access denied"; break; case EFI_OUT_OF_RESOURCES: mesg = L"Out of resources"; break; case EFI_VOLUME_FULL: mesg = L"Volume is full"; break; case EFI_SECURITY_VIOLATION: mesg = L"Security violation"; break; case EFI_CRC_ERROR: mesg = L"CRC error"; break; case EFI_COMPROMISED_DATA: mesg = L"Compromised data"; break; default: PrintErr(L"ErrCode: "); DisplayUint(ErrCode, 0); mesg = NULL; break; } blexit(mesg); } static void __init place_string(u32 *addr, const char *s) { static char *__initdata alloc = start; if ( s && *s ) { size_t len1 = strlen(s) + 1; const char *old = (char *)(long)*addr; size_t len2 = *addr ? strlen(old) + 1 : 0; alloc -= len1 + len2; /* * Insert new string before already existing one. This is needed * for options passed on the command line to override options from * the configuration file. 
*/ memcpy(alloc, s, len1); if ( *addr ) { alloc[len1 - 1] = ' '; memcpy(alloc + len1, old, len2); } } *addr = (long)alloc; } static unsigned int __init get_argv(unsigned int argc, CHAR16 **argv, CHAR16 *cmdline, UINTN cmdsize) { CHAR16 *ptr = (CHAR16 *)(argv + argc + 1), *prev = NULL; bool_t prev_sep = TRUE; for ( ; cmdsize > sizeof(*cmdline) && *cmdline; cmdsize -= sizeof(*cmdline), ++cmdline ) { bool_t cur_sep = *cmdline == L' ' || *cmdline == L'\t'; if ( !prev_sep ) { if ( cur_sep ) ++ptr; else if ( argv ) { *ptr = *cmdline; *++ptr = 0; } } else if ( !cur_sep ) { if ( !argv ) ++argc; else if ( prev && wstrcmp(prev, L"--") == 0 ) { union string rest = { .w = cmdline }; --argv; place_string(&mbi.cmdline, w2s(&rest)); break; } else { *argv++ = prev = ptr; *ptr = *cmdline; *++ptr = 0; } } prev_sep = cur_sep; } if ( argv ) *argv = NULL; return argc; } static EFI_FILE_HANDLE __init get_parent_handle(EFI_LOADED_IMAGE *loaded_image, CHAR16 **leaf) { static EFI_GUID __initdata fs_protocol = SIMPLE_FILE_SYSTEM_PROTOCOL; EFI_FILE_HANDLE dir_handle; EFI_DEVICE_PATH *dp; CHAR16 *pathend, *ptr; EFI_STATUS ret; do { EFI_FILE_IO_INTERFACE *fio; /* Get the file system interface. */ ret = efi_bs->HandleProtocol(loaded_image->DeviceHandle, &fs_protocol, (void **)&fio); if ( EFI_ERROR(ret) ) blexit(L"Couldn't obtain the File System Protocol Interface"); ret = fio->OpenVolume(fio, &dir_handle); } while ( ret == EFI_MEDIA_CHANGED ); if ( ret != EFI_SUCCESS ) blexit(L"OpenVolume failure"); #define buffer ((CHAR16 *)keyhandler_scratch) #define BUFFERSIZE sizeof(keyhandler_scratch) for ( dp = loaded_image->FilePath, *buffer = 0; DevicePathType(dp) != END_DEVICE_PATH_TYPE; dp = (void *)dp + DevicePathNodeLength(dp) ) { FILEPATH_DEVICE_PATH *fp; if ( DevicePathType(dp) != MEDIA_DEVICE_PATH || DevicePathSubType(dp) != MEDIA_FILEPATH_DP ) blexit(L"Unsupported device path component"); if ( *buffer ) { EFI_FILE_HANDLE new_handle; ret = dir_handle->Open(dir_handle, &new_handle, buffer, EFI_FILE_MODE_READ, 0); if ( ret != EFI_SUCCESS ) { PrintErr(L"Open failed for "); PrintErrMesg(buffer, ret); } dir_handle->Close(dir_handle); dir_handle = new_handle; } fp = (void *)dp; if ( BUFFERSIZE < DevicePathNodeLength(dp) - sizeof(*dp) + sizeof(*buffer) ) blexit(L"Increase BUFFERSIZE"); memcpy(buffer, fp->PathName, DevicePathNodeLength(dp) - sizeof(*dp)); buffer[(DevicePathNodeLength(dp) - sizeof(*dp)) / sizeof(*buffer)] = 0; } for ( ptr = buffer, pathend = NULL; *ptr; ++ptr ) if ( *ptr == L'\\' ) pathend = ptr; if ( pathend ) { *pathend = 0; *leaf = pathend + 1; if ( *buffer ) { EFI_FILE_HANDLE new_handle; ret = dir_handle->Open(dir_handle, &new_handle, buffer, EFI_FILE_MODE_READ, 0); if ( ret != EFI_SUCCESS ) { PrintErr(L"Open failed for "); PrintErrMesg(buffer, ret); } dir_handle->Close(dir_handle); dir_handle = new_handle; } } else *leaf = buffer; #undef BUFFERSIZE #undef buffer return dir_handle; } static CHAR16 *__init point_tail(CHAR16 *fn) { CHAR16 *tail = NULL; for ( ; ; ++fn ) switch ( *fn ) { case 0: return tail; case L'.': case L'-': case L'_': tail = fn; break; } } static bool_t __init read_file(EFI_FILE_HANDLE dir_handle, CHAR16 *name, struct file *file) { EFI_FILE_HANDLE FileHandle = NULL; UINT64 size; EFI_STATUS ret; CHAR16 *what = NULL; if ( !name ) PrintErrMesg(L"No filename", EFI_OUT_OF_RESOURCES); ret = dir_handle->Open(dir_handle, &FileHandle, name, EFI_FILE_MODE_READ, 0); if ( file == &cfg && ret == EFI_NOT_FOUND ) return 0; if ( EFI_ERROR(ret) ) what = L"Open"; else ret = 
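/*
 * A minimal standalone sketch of the prepend behaviour of place_string()
 * above: each new fragment goes in front of whatever is already there, so a
 * string placed earlier (the command-line tail captured by get_argv()) ends
 * up later in the final command line than the configuration-file options
 * placed afterwards, and thus takes precedence, while the image name placed
 * last ends up as the prefix. The heap-based prepend() below is illustrative
 * only; boot.c carves the space below the image instead of using malloc().
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *prepend(char *existing, const char *s)
{
    size_t len1 = strlen(s), len2 = existing ? strlen(existing) : 0;
    char *out = malloc(len1 + 1 + len2 + 1);

    if ( !out )
        return existing;                   /* error handling trimmed */
    memcpy(out, s, len1);
    if ( existing )
    {
        out[len1] = ' ';
        memcpy(out + len1 + 1, existing, len2 + 1);
        free(existing);
    }
    else
        out[len1] = '\0';
    return out;
}

int main(void)
{
    char *cmdline = prepend(NULL, "loglvl=info");      /* command-line tail */
    cmdline = prepend(cmdline, "loglvl=all noreboot"); /* config "options=" */
    cmdline = prepend(cmdline, "xen");                 /* image name, last */
    puts(cmdline);              /* "xen loglvl=all noreboot loglvl=info" */
    free(cmdline);
    return 0;
}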
FileHandle->SetPosition(FileHandle, -1); if ( EFI_ERROR(ret) ) what = what ?: L"Seek"; else ret = FileHandle->GetPosition(FileHandle, &size); if ( EFI_ERROR(ret) ) what = what ?: L"Get size"; else ret = FileHandle->SetPosition(FileHandle, 0); if ( EFI_ERROR(ret) ) what = what ?: L"Seek"; else { file->addr = min(1UL << (32 + PAGE_SHIFT), HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START); ret = efi_bs->AllocatePages(AllocateMaxAddress, EfiLoaderData, PFN_UP(size), &file->addr); } if ( EFI_ERROR(ret) ) { file->addr = 0; what = what ?: L"Allocation"; } else { if ( file != &cfg ) { PrintStr(name); PrintStr(L": "); DisplayUint(file->addr, 2 * sizeof(file->addr)); PrintStr(L"-"); DisplayUint(file->addr + size, 2 * sizeof(file->addr)); PrintStr(newline); mb_modules[mbi.mods_count].mod_start = file->addr >> PAGE_SHIFT; mb_modules[mbi.mods_count].mod_end = size; ++mbi.mods_count; } file->size = size; ret = FileHandle->Read(FileHandle, &file->size, file->ptr); if ( !EFI_ERROR(ret) && file->size != size ) ret = EFI_ABORTED; if ( EFI_ERROR(ret) ) what = L"Read"; } if ( FileHandle ) FileHandle->Close(FileHandle); if ( what ) { PrintErr(what); PrintErr(L" failed for "); PrintErrMesg(name, ret); } return 1; } static void __init pre_parse(const struct file *cfg) { char *ptr = cfg->ptr, *end = ptr + cfg->size; bool_t start = 1, comment = 0; for ( ; ptr < end; ++ptr ) { if ( iscntrl(*ptr) ) { comment = 0; start = 1; *ptr = 0; } else if ( comment || (start && isspace(*ptr)) ) *ptr = 0; else if ( *ptr == '#' || (start && *ptr == ';') ) { comment = 1; *ptr = 0; } else start = 0; } if ( cfg->size && end[-1] ) PrintStr(L"No newline at end of config file," " last line will be ignored.\r\n"); } static char *__init get_value(const struct file *cfg, const char *section, const char *item) { char *ptr = cfg->ptr, *end = ptr + cfg->size; size_t slen = section ? 
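/*
 * A minimal standalone sketch of the configuration-file handling done by
 * pre_parse() above and get_value() just below: pre_parse() turns newlines,
 * comments ('#' anywhere, ';' at the start of a line) and leading blanks
 * into NUL bytes, leaving a sequence of NUL-terminated "[section]" markers
 * and "key=value" strings that can be scanned with strlen() hops. The
 * simplified lookup() and the sample buffer below are illustrative, not the
 * actual Xen parser (get_value() additionally stops once the matched section
 * ends).
 */
#include <stdio.h>
#include <string.h>

static const char *lookup(const char *buf, size_t size,
                          const char *section, const char *key)
{
    const char *p = buf, *end = buf + size;
    size_t slen = strlen(section), klen = strlen(key);
    int in_section = 0;

    for ( ; p < end; p += strlen(p) + 1 )
    {
        if ( !*p )
            continue;                      /* blank or commented-out line */
        if ( *p == '[' )
            in_section = !strncmp(p + 1, section, slen) && p[1 + slen] == ']';
        else if ( in_section && !strncmp(p, key, klen) && p[klen] == '=' )
            return p + klen + 1;
    }
    return NULL;
}

int main(void)
{
    /* Pre-parsed image of a hypothetical xen.cfg. */
    static const char cfg[] =
        "[global]\0default=xen\0[xen]\0kernel=vmlinuz root=/dev/sda1\0";
    const char *v = lookup(cfg, sizeof(cfg), "xen", "kernel");

    puts(v ? v : "not found");             /* "vmlinuz root=/dev/sda1" */
    return 0;
}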
strlen(section) : 0, ilen = strlen(item); bool_t match = !slen; for ( ; ptr < end; ++ptr ) { switch ( *ptr ) { case 0: continue; case '[': if ( !slen ) break; if ( match ) return NULL; match = strncmp(++ptr, section, slen) == 0 && ptr[slen] == ']'; break; default: if ( match && strncmp(ptr, item, ilen) == 0 && ptr[ilen] == '=' ) return ptr + ilen + 1; break; } ptr += strlen(ptr); } return NULL; } static void __init split_value(char *s) { while ( *s && isspace(*s) ) ++s; place_string(&mb_modules[mbi.mods_count].string, s); while ( *s && !isspace(*s) ) ++s; *s = 0; } static void __init edd_put_string(u8 *dst, size_t n, const char *src) { while ( n-- && *src ) *dst++ = *src++; if ( *src ) PrintErrMesg(L"Internal error populating EDD info", EFI_BUFFER_TOO_SMALL); while ( n-- ) *dst++ = ' '; } #define edd_put_string(d, s) edd_put_string(d, ARRAY_SIZE(d), s) static void __init setup_efi_pci(void) { EFI_STATUS status; EFI_HANDLE *handles; static EFI_GUID __initdata pci_guid = EFI_PCI_IO_PROTOCOL; UINTN i, nr_pci, size = 0; struct efi_pci_rom *last = NULL; status = efi_bs->LocateHandle(ByProtocol, &pci_guid, NULL, &size, NULL); if ( status == EFI_BUFFER_TOO_SMALL ) status = efi_bs->AllocatePool(EfiLoaderData, size, (void **)&handles); if ( !EFI_ERROR(status) ) status = efi_bs->LocateHandle(ByProtocol, &pci_guid, NULL, &size, handles); if ( EFI_ERROR(status) ) size = 0; nr_pci = size / sizeof(*handles); for ( i = 0; i < nr_pci; ++i ) { EFI_PCI_IO *pci = NULL; u64 attributes; struct efi_pci_rom *rom, *va; UINTN segment, bus, device, function; status = efi_bs->HandleProtocol(handles[i], &pci_guid, (void **)&pci); if ( EFI_ERROR(status) || !pci || !pci->RomImage || !pci->RomSize ) continue; status = pci->Attributes(pci, EfiPciIoAttributeOperationGet, 0, &attributes); if ( EFI_ERROR(status) || !(attributes & EFI_PCI_IO_ATTRIBUTE_EMBEDDED_ROM) || EFI_ERROR(pci->GetLocation(pci, &segment, &bus, &device, &function)) ) continue; DisplayUint(segment, 4); PrintStr(L":"); DisplayUint(bus, 2); PrintStr(L":"); DisplayUint(device, 2); PrintStr(L"."); DisplayUint(function, 1); PrintStr(L": ROM: "); DisplayUint(pci->RomSize, 0); PrintStr(L" bytes at "); DisplayUint((UINTN)pci->RomImage, 0); PrintStr(newline); size = pci->RomSize + sizeof(*rom); status = efi_bs->AllocatePool(EfiRuntimeServicesData, size, (void **)&rom); if ( EFI_ERROR(status) ) continue; rom->next = NULL; rom->size = pci->RomSize; status = pci->Pci.Read(pci, EfiPciIoWidthUint16, PCI_VENDOR_ID, 1, &rom->vendor); if ( !EFI_ERROR(status) ) status = pci->Pci.Read(pci, EfiPciIoWidthUint16, PCI_DEVICE_ID, 1, &rom->devid); if ( EFI_ERROR(status) ) { efi_bs->FreePool(rom); continue; } rom->segment = segment; rom->bus = bus; rom->devfn = (device << 3) | function; memcpy(rom->data, pci->RomImage, pci->RomSize); va = (void *)rom + DIRECTMAP_VIRT_START; if ( last ) last->next = va; else efi_pci_roms = va; last = rom; } efi_bs->FreePool(handles); } static int __init set_color(u32 mask, int bpp, u8 *pos, u8 *sz) { if ( bpp < 0 ) return bpp; if ( !mask ) return -EINVAL; for ( *pos = 0; !(mask & 1); ++*pos ) mask >>= 1; for ( *sz = 0; mask & 1; ++sz) mask >>= 1; if ( mask ) return -EINVAL; return max(*pos + *sz, bpp); } extern const intpte_t __page_tables_start[], __page_tables_end[]; #define in_page_tables(v) ((intpte_t *)(v) >= __page_tables_start && \ (intpte_t *)(v) < __page_tables_end) #define PE_BASE_RELOC_ABS 0 #define PE_BASE_RELOC_HIGHLOW 3 #define PE_BASE_RELOC_DIR64 10 extern const struct pe_base_relocs { u32 rva; u32 size; u16 entries[]; } 
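/*
 * A minimal standalone sketch of the mask decoding set_color() above performs
 * for PixelBitMask framebuffer modes: shift out the trailing zero bits to
 * find the field position, then count the contiguous run of ones to find the
 * field size, and reject masks that are not a single contiguous run. Note
 * that the 4.4.0 loop increments the pointer ("++sz") where incrementing the
 * pointee ("++*sz") looks intended; the sketch below uses the latter.
 * Illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

static int mask_to_field(uint32_t mask, uint8_t *pos, uint8_t *sz)
{
    if ( !mask )
        return -1;
    for ( *pos = 0; !(mask & 1); ++*pos )
        mask >>= 1;
    for ( *sz = 0; mask & 1; ++*sz )
        mask >>= 1;
    return mask ? -1 : 0;                  /* non-contiguous mask rejected */
}

int main(void)
{
    uint8_t pos, sz;

    if ( !mask_to_field(0x0000ff00, &pos, &sz) )
        printf("pos=%u size=%u\n", (unsigned int)pos, (unsigned int)sz);
    return 0;                              /* prints "pos=8 size=8" */
}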
__base_relocs_start[], __base_relocs_end[]; static void __init relocate_image(unsigned long delta) { const struct pe_base_relocs *base_relocs; for ( base_relocs = __base_relocs_start; base_relocs < __base_relocs_end; ) { unsigned int i, n; n = (base_relocs->size - sizeof(*base_relocs)) / sizeof(*base_relocs->entries); for ( i = 0; i < n; ++i ) { unsigned long addr = xen_phys_start + base_relocs->rva + (base_relocs->entries[i] & 0xfff); switch ( base_relocs->entries[i] >> 12 ) { case PE_BASE_RELOC_ABS: break; case PE_BASE_RELOC_HIGHLOW: if ( delta ) { *(u32 *)addr += delta; if ( in_page_tables(addr) ) *(u32 *)addr += xen_phys_start; } break; case PE_BASE_RELOC_DIR64: if ( delta ) { *(u64 *)addr += delta; if ( in_page_tables(addr) ) *(intpte_t *)addr += xen_phys_start; } break; default: blexit(L"Unsupported relocation type"); } } base_relocs = (const void *)(base_relocs->entries + i + (i & 1)); } } extern const s32 __trampoline_rel_start[], __trampoline_rel_stop[]; extern const s32 __trampoline_seg_start[], __trampoline_seg_stop[]; static void __init relocate_trampoline(unsigned long phys) { const s32 *trampoline_ptr; trampoline_phys = phys; /* Apply relocations to trampoline. */ for ( trampoline_ptr = __trampoline_rel_start; trampoline_ptr < __trampoline_rel_stop; ++trampoline_ptr ) *(u32 *)(*trampoline_ptr + (long)trampoline_ptr) += phys; for ( trampoline_ptr = __trampoline_seg_start; trampoline_ptr < __trampoline_seg_stop; ++trampoline_ptr ) *(u16 *)(*trampoline_ptr + (long)trampoline_ptr) = phys >> 4; } void EFIAPI __init __attribute__((__noreturn__)) efi_start(EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) { static EFI_GUID __initdata loaded_image_guid = LOADED_IMAGE_PROTOCOL; static EFI_GUID __initdata gop_guid = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID; static EFI_GUID __initdata bio_guid = BLOCK_IO_PROTOCOL; static EFI_GUID __initdata devp_guid = DEVICE_PATH_PROTOCOL; static EFI_GUID __initdata shim_lock_guid = SHIM_LOCK_PROTOCOL_GUID; EFI_LOADED_IMAGE *loaded_image; EFI_STATUS status; unsigned int i, argc; CHAR16 **argv, *file_name, *cfg_file_name = NULL; UINTN cols, rows, depth, size, map_key, info_size, gop_mode = ~0; EFI_HANDLE *handles = NULL; EFI_SHIM_LOCK_PROTOCOL *shim_lock; EFI_GRAPHICS_OUTPUT_PROTOCOL *gop = NULL; EFI_GRAPHICS_OUTPUT_MODE_INFORMATION *mode_info; EFI_FILE_HANDLE dir_handle; union string section = { NULL }, name; struct e820entry *e; u64 efer; bool_t base_video = 0; efi_ih = ImageHandle; efi_bs = SystemTable->BootServices; efi_rs = SystemTable->RuntimeServices; efi_ct = SystemTable->ConfigurationTable; efi_num_ct = SystemTable->NumberOfTableEntries; efi_version = SystemTable->Hdr.Revision; efi_fw_vendor = SystemTable->FirmwareVendor; efi_fw_revision = SystemTable->FirmwareRevision; StdOut = SystemTable->ConOut; StdErr = SystemTable->StdErr ?: StdOut; status = efi_bs->HandleProtocol(ImageHandle, &loaded_image_guid, (void **)&loaded_image); if ( status != EFI_SUCCESS ) PrintErrMesg(L"No Loaded Image Protocol", status); xen_phys_start = (UINTN)loaded_image->ImageBase; if ( (xen_phys_start + loaded_image->ImageSize - 1) >> 32 ) blexit(L"Xen must be loaded below 4Gb."); if ( xen_phys_start & ((1 << L2_PAGETABLE_SHIFT) - 1) ) blexit(L"Xen must be loaded at a 2Mb boundary."); trampoline_xen_phys_start = xen_phys_start; /* Get the file system interface. 
*/ dir_handle = get_parent_handle(loaded_image, &file_name); argc = get_argv(0, NULL, loaded_image->LoadOptions, loaded_image->LoadOptionsSize); if ( argc > 0 && efi_bs->AllocatePool(EfiLoaderData, (argc + 1) * sizeof(*argv) + loaded_image->LoadOptionsSize, (void **)&argv) == EFI_SUCCESS ) get_argv(argc, argv, loaded_image->LoadOptions, loaded_image->LoadOptionsSize); else argc = 0; for ( i = 1; i < argc; ++i ) { CHAR16 *ptr = argv[i]; if ( !ptr ) break; if ( *ptr == L'/' || *ptr == L'-' ) { if ( wstrcmp(ptr + 1, L"basevideo") == 0 ) base_video = 1; else if ( wstrncmp(ptr + 1, L"cfg=", 4) == 0 ) cfg_file_name = ptr + 5; else if ( i + 1 < argc && wstrcmp(ptr + 1, L"cfg") == 0 ) cfg_file_name = argv[++i]; else if ( wstrcmp(ptr + 1, L"help") == 0 || (ptr[1] == L'?' && !ptr[2]) ) { PrintStr(L"Xen EFI Loader options:\r\n"); PrintStr(L"-basevideo retain current video mode\r\n"); PrintStr(L"-cfg= specify configuration file\r\n"); PrintStr(L"-help, -? display this help\r\n"); blexit(NULL); } else { PrintStr(L"WARNING: Unknown command line option '"); PrintStr(ptr); PrintStr(L"' ignored\r\n"); } } else section.w = ptr; } if ( !base_video ) { unsigned int best; for ( i = 0, size = 0, best = StdOut->Mode->Mode; i < StdOut->Mode->MaxMode; ++i ) { if ( StdOut->QueryMode(StdOut, i, &cols, &rows) == EFI_SUCCESS && cols * rows > size ) { size = cols * rows; best = i; } } if ( best != StdOut->Mode->Mode ) StdOut->SetMode(StdOut, best); } PrintStr(L"Xen " __stringify(XEN_VERSION) "." __stringify(XEN_SUBVERSION) XEN_EXTRAVERSION " (c/s " XEN_CHANGESET ") EFI loader\r\n"); relocate_image(0); if ( StdOut->QueryMode(StdOut, StdOut->Mode->Mode, &cols, &rows) == EFI_SUCCESS ) { vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3; vga_console_info.u.text_mode_3.columns = cols; vga_console_info.u.text_mode_3.rows = rows; vga_console_info.u.text_mode_3.font_height = 16; } size = 0; status = efi_bs->LocateHandle(ByProtocol, &gop_guid, NULL, &size, NULL); if ( status == EFI_BUFFER_TOO_SMALL ) status = efi_bs->AllocatePool(EfiLoaderData, size, (void **)&handles); if ( !EFI_ERROR(status) ) status = efi_bs->LocateHandle(ByProtocol, &gop_guid, NULL, &size, handles); if ( EFI_ERROR(status) ) size = 0; for ( i = 0; i < size / sizeof(*handles); ++i ) { status = efi_bs->HandleProtocol(handles[i], &gop_guid, (void **)&gop); if ( EFI_ERROR(status) ) continue; status = gop->QueryMode(gop, gop->Mode->Mode, &info_size, &mode_info); if ( !EFI_ERROR(status) ) break; } if ( handles ) efi_bs->FreePool(handles); if ( EFI_ERROR(status) ) gop = NULL; /* Read and parse the config file. 
*/ if ( !cfg_file_name ) { CHAR16 *tail; while ( (tail = point_tail(file_name)) != NULL ) { wstrcpy(tail, L".cfg"); if ( read_file(dir_handle, file_name, &cfg) ) break; *tail = 0; } if ( !tail ) blexit(L"No configuration file found."); PrintStr(L"Using configuration file '"); PrintStr(file_name); PrintStr(L"'\r\n"); } else if ( !read_file(dir_handle, cfg_file_name, &cfg) ) blexit(L"Configuration file not found."); pre_parse(&cfg); if ( section.w ) w2s(§ion); else section.s = get_value(&cfg, "global", "default"); for ( ; ; ) { name.s = get_value(&cfg, section.s, "kernel"); if ( name.s ) break; name.s = get_value(&cfg, "global", "chain"); if ( !name.s ) break; efi_bs->FreePages(cfg.addr, PFN_UP(cfg.size)); cfg.addr = 0; if ( !read_file(dir_handle, s2w(&name), &cfg) ) { PrintStr(L"Chained configuration file '"); PrintStr(name.w); efi_bs->FreePool(name.w); blexit(L"'not found."); } pre_parse(&cfg); efi_bs->FreePool(name.w); } if ( !name.s ) blexit(L"No Dom0 kernel image specified."); split_value(name.s); read_file(dir_handle, s2w(&name), &kernel); efi_bs->FreePool(name.w); if ( !EFI_ERROR(efi_bs->LocateProtocol(&shim_lock_guid, NULL, (void **)&shim_lock)) && shim_lock->Verify(kernel.ptr, kernel.size) != EFI_SUCCESS ) blexit(L"Dom0 kernel image could not be verified."); name.s = get_value(&cfg, section.s, "ramdisk"); if ( name.s ) { split_value(name.s); read_file(dir_handle, s2w(&name), &ramdisk); efi_bs->FreePool(name.w); } name.s = get_value(&cfg, section.s, "ucode"); if ( !name.s ) name.s = get_value(&cfg, "global", "ucode"); if ( name.s ) { microcode_set_module(mbi.mods_count); split_value(name.s); read_file(dir_handle, s2w(&name), &ucode); efi_bs->FreePool(name.w); } name.s = get_value(&cfg, section.s, "xsm"); if ( name.s ) { split_value(name.s); read_file(dir_handle, s2w(&name), &xsm); efi_bs->FreePool(name.w); } name.s = get_value(&cfg, section.s, "options"); if ( name.s ) place_string(&mbi.cmdline, name.s); /* Insert image name last, as it gets prefixed to the other options. 
*/ if ( argc ) { name.w = *argv; w2s(&name); } else name.s = "xen"; place_string(&mbi.cmdline, name.s); cols = rows = depth = 0; if ( !base_video ) { name.cs = get_value(&cfg, section.s, "video"); if ( !name.cs ) name.cs = get_value(&cfg, "global", "video"); if ( name.cs && !strncmp(name.cs, "gfx-", 4) ) { cols = simple_strtoul(name.cs + 4, &name.cs, 10); if ( *name.cs == 'x' ) rows = simple_strtoul(name.cs + 1, &name.cs, 10); if ( *name.cs == 'x' ) depth = simple_strtoul(name.cs + 1, &name.cs, 10); if ( *name.cs ) cols = rows = depth = 0; } } efi_bs->FreePages(cfg.addr, PFN_UP(cfg.size)); cfg.addr = 0; dir_handle->Close(dir_handle); if ( gop && !base_video ) { for ( i = size = 0; i < gop->Mode->MaxMode; ++i ) { unsigned int bpp = 0; status = gop->QueryMode(gop, i, &info_size, &mode_info); if ( EFI_ERROR(status) ) continue; switch ( mode_info->PixelFormat ) { case PixelBitMask: bpp = hweight32(mode_info->PixelInformation.RedMask | mode_info->PixelInformation.GreenMask | mode_info->PixelInformation.BlueMask); break; case PixelRedGreenBlueReserved8BitPerColor: case PixelBlueGreenRedReserved8BitPerColor: bpp = 24; break; default: continue; } if ( cols == mode_info->HorizontalResolution && rows == mode_info->VerticalResolution && (!depth || bpp == depth) ) { gop_mode = i; break; } if ( !cols && !rows && mode_info->HorizontalResolution * mode_info->VerticalResolution > size ) { size = mode_info->HorizontalResolution * mode_info->VerticalResolution; gop_mode = i; } } } if ( mbi.cmdline ) mbi.flags |= MBI_CMDLINE; /* * These must not be initialized statically, since the value must * not get relocated when processing base relocations below. */ mbi.boot_loader_name = (long)"EFI"; mbi.mods_addr = (long)mb_modules; place_string(&mbi.mem_upper, NULL); /* Collect EDD info. 
*/ BUILD_BUG_ON(offsetof(struct edd_info, edd_device_params) != EDDEXTSIZE); BUILD_BUG_ON(sizeof(struct edd_device_params) != EDDPARMSIZE); size = 0; status = efi_bs->LocateHandle(ByProtocol, &bio_guid, NULL, &size, NULL); if ( status == EFI_BUFFER_TOO_SMALL ) status = efi_bs->AllocatePool(EfiLoaderData, size, (void **)&handles); if ( !EFI_ERROR(status) ) status = efi_bs->LocateHandle(ByProtocol, &bio_guid, NULL, &size, handles); if ( EFI_ERROR(status) ) size = 0; for ( i = 0; i < size / sizeof(*handles); ++i ) { EFI_BLOCK_IO *bio; EFI_DEV_PATH_PTR devp; struct edd_info *info = boot_edd_info + boot_edd_info_nr; struct edd_device_params *params = &info->edd_device_params; enum { root, acpi, pci, ctrlr } state = root; status = efi_bs->HandleProtocol(handles[i], &bio_guid, (void **)&bio); if ( EFI_ERROR(status) || bio->Media->RemovableMedia || bio->Media->LogicalPartition ) continue; if ( boot_edd_info_nr < EDD_INFO_MAX ) { info->device = 0x80 + boot_edd_info_nr; /* fake */ info->version = 0x11; params->length = offsetof(struct edd_device_params, dpte_ptr); params->number_of_sectors = bio->Media->LastBlock + 1; params->bytes_per_sector = bio->Media->BlockSize; params->dpte_ptr = ~0; } ++boot_edd_info_nr; status = efi_bs->HandleProtocol(handles[i], &devp_guid, (void **)&devp); if ( EFI_ERROR(status) ) continue; for ( ; !IsDevicePathEnd(devp.DevPath); devp.DevPath = NextDevicePathNode(devp.DevPath) ) { switch ( DevicePathType(devp.DevPath) ) { const u8 *p; case ACPI_DEVICE_PATH: if ( state != root || boot_edd_info_nr > EDD_INFO_MAX ) break; switch ( DevicePathSubType(devp.DevPath) ) { case ACPI_DP: if ( devp.Acpi->HID != EISA_PNP_ID(0xA03) && devp.Acpi->HID != EISA_PNP_ID(0xA08) ) break; params->interface_path.pci.bus = devp.Acpi->UID; state = acpi; break; case EXPANDED_ACPI_DP: /* XXX */ break; } break; case HARDWARE_DEVICE_PATH: if ( state != acpi || DevicePathSubType(devp.DevPath) != HW_PCI_DP || boot_edd_info_nr > EDD_INFO_MAX ) break; state = pci; edd_put_string(params->host_bus_type, "PCI"); params->interface_path.pci.slot = devp.Pci->Device; params->interface_path.pci.function = devp.Pci->Function; break; case MESSAGING_DEVICE_PATH: if ( state != pci || boot_edd_info_nr > EDD_INFO_MAX ) break; state = ctrlr; switch ( DevicePathSubType(devp.DevPath) ) { case MSG_ATAPI_DP: edd_put_string(params->interface_type, "ATAPI"); params->interface_path.pci.channel = devp.Atapi->PrimarySecondary; params->device_path.atapi.device = devp.Atapi->SlaveMaster; params->device_path.atapi.lun = devp.Atapi->Lun; break; case MSG_SCSI_DP: edd_put_string(params->interface_type, "SCSI"); params->device_path.scsi.id = devp.Scsi->Pun; params->device_path.scsi.lun = devp.Scsi->Lun; break; case MSG_FIBRECHANNEL_DP: edd_put_string(params->interface_type, "FIBRE"); params->device_path.fibre.wwid = devp.FibreChannel->WWN; params->device_path.fibre.lun = devp.FibreChannel->Lun; break; case MSG_1394_DP: edd_put_string(params->interface_type, "1394"); params->device_path.i1394.eui = devp.F1394->Guid; break; case MSG_USB_DP: case MSG_USB_CLASS_DP: edd_put_string(params->interface_type, "USB"); break; case MSG_I2O_DP: edd_put_string(params->interface_type, "I2O"); params->device_path.i2o.identity_tag = devp.I2O->Tid; break; default: continue; } info->version = 0x30; params->length = sizeof(struct edd_device_params); params->key = 0xbedd; params->device_path_info_length = sizeof(struct edd_device_params) - offsetof(struct edd_device_params, key); for ( p = (const u8 *)¶ms->key; p < ¶ms->checksum; ++p ) params->checksum -= 
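/*
 * A minimal standalone sketch of the EDD checksum fix-up in the loop just
 * above: every byte from the "key" field up to, but not including, the
 * "checksum" field is subtracted from the checksum, so that summing the
 * device-path information area including the checksum yields 0 modulo 256.
 * The dp_info layout below is a made-up stand-in, not the real
 * edd_device_params structure.
 */
#include <stdint.h>
#include <stdio.h>

struct dp_info {
    uint16_t key;              /* 0xbedd marker, as in the code above */
    uint8_t  len;
    uint8_t  payload[4];
    uint8_t  checksum;         /* last byte of the checksummed area */
};

static void fix_checksum(struct dp_info *d)
{
    const uint8_t *p;

    d->checksum = 0;
    for ( p = (const uint8_t *)&d->key; p < &d->checksum; ++p )
        d->checksum -= *p;     /* two's complement of the byte sum */
}

int main(void)
{
    struct dp_info d = { .key = 0xbedd, .len = 7, .payload = { 1, 2, 3, 4 } };
    uint8_t sum = 0;

    fix_checksum(&d);
    for ( const uint8_t *p = (const uint8_t *)&d; p <= &d.checksum; ++p )
        sum += *p;
    printf("sum=%u\n", (unsigned int)sum);   /* prints "sum=0" */
    return 0;
}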
*p; break; case MEDIA_DEVICE_PATH: if ( DevicePathSubType(devp.DevPath) == MEDIA_HARDDRIVE_DP && devp.HardDrive->MBRType == MBR_TYPE_PCAT && boot_mbr_signature_nr < EDD_MBR_SIG_MAX ) { struct mbr_signature *sig = boot_mbr_signature + boot_mbr_signature_nr; sig->device = 0x80 + boot_edd_info_nr; /* fake */ memcpy(&sig->signature, devp.HardDrive->Signature, sizeof(sig->signature)); ++boot_mbr_signature_nr; } break; } } } if ( handles ) efi_bs->FreePool(handles); if ( boot_edd_info_nr > EDD_INFO_MAX ) boot_edd_info_nr = EDD_INFO_MAX; /* XXX Collect EDID info. */ if ( cpuid_eax(0x80000000) > 0x80000000 ) { cpuid_ext_features = cpuid_edx(0x80000001); boot_cpu_data.x86_capability[1] = cpuid_ext_features; } /* Obtain basic table pointers. */ for ( i = 0; i < efi_num_ct; ++i ) { static EFI_GUID __initdata acpi2_guid = ACPI_20_TABLE_GUID; static EFI_GUID __initdata acpi_guid = ACPI_TABLE_GUID; static EFI_GUID __initdata mps_guid = MPS_TABLE_GUID; static EFI_GUID __initdata smbios_guid = SMBIOS_TABLE_GUID; if ( match_guid(&acpi2_guid, &efi_ct[i].VendorGuid) ) efi.acpi20 = (long)efi_ct[i].VendorTable; if ( match_guid(&acpi_guid, &efi_ct[i].VendorGuid) ) efi.acpi = (long)efi_ct[i].VendorTable; if ( match_guid(&mps_guid, &efi_ct[i].VendorGuid) ) efi.mps = (long)efi_ct[i].VendorTable; if ( match_guid(&smbios_guid, &efi_ct[i].VendorGuid) ) efi.smbios = (long)efi_ct[i].VendorTable; } if (efi.smbios != EFI_INVALID_TABLE_ADDR) dmi_efi_get_table((void *)(long)efi.smbios); /* Collect PCI ROM contents. */ setup_efi_pci(); /* Get snapshot of variable store parameters. */ status = (efi_rs->Hdr.Revision >> 16) >= 2 ? efi_rs->QueryVariableInfo(EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS, &efi_boot_max_var_store_size, &efi_boot_remain_var_store_size, &efi_boot_max_var_size) : EFI_INCOMPATIBLE_VERSION; if ( EFI_ERROR(status) ) { efi_boot_max_var_store_size = 0; efi_boot_remain_var_store_size = 0; efi_boot_max_var_size = status; PrintStr(L"Warning: Could not query variable store: "); DisplayUint(status, 0); PrintStr(newline); } /* Allocate space for trampoline (in first Mb). */ cfg.addr = 0x100000; cfg.size = trampoline_end - trampoline_start; status = efi_bs->AllocatePages(AllocateMaxAddress, EfiLoaderData, PFN_UP(cfg.size), &cfg.addr); if ( status == EFI_SUCCESS ) relocate_trampoline(cfg.addr); else { cfg.addr = 0; PrintStr(L"Trampoline space cannot be allocated; will try fallback.\r\n"); } /* Initialise L2 identity-map and boot-map page table entries (16MB). */ for ( i = 0; i < 8; ++i ) { unsigned int slot = (xen_phys_start >> L2_PAGETABLE_SHIFT) + i; paddr_t addr = slot << L2_PAGETABLE_SHIFT; l2_identmap[slot] = l2e_from_paddr(addr, PAGE_HYPERVISOR|_PAGE_PSE); slot &= L2_PAGETABLE_ENTRIES - 1; l2_bootmap[slot] = l2e_from_paddr(addr, __PAGE_HYPERVISOR|_PAGE_PSE); } /* Initialise L3 boot-map page directory entries. */ l3_bootmap[l3_table_offset(xen_phys_start)] = l3e_from_paddr((UINTN)l2_bootmap, __PAGE_HYPERVISOR); l3_bootmap[l3_table_offset(xen_phys_start + (8 << L2_PAGETABLE_SHIFT) - 1)] = l3e_from_paddr((UINTN)l2_bootmap, __PAGE_HYPERVISOR); if ( gop ) { int bpp = 0; /* Set graphics mode. */ if ( gop_mode < gop->Mode->MaxMode && gop_mode != gop->Mode->Mode ) gop->SetMode(gop, gop_mode); /* Get graphics and frame buffer info. 
*/ status = gop->QueryMode(gop, gop->Mode->Mode, &info_size, &mode_info); if ( !EFI_ERROR(status) ) switch ( mode_info->PixelFormat ) { case PixelRedGreenBlueReserved8BitPerColor: vga_console_info.u.vesa_lfb.red_pos = 0; vga_console_info.u.vesa_lfb.red_size = 8; vga_console_info.u.vesa_lfb.green_pos = 8; vga_console_info.u.vesa_lfb.green_size = 8; vga_console_info.u.vesa_lfb.blue_pos = 16; vga_console_info.u.vesa_lfb.blue_size = 8; vga_console_info.u.vesa_lfb.rsvd_pos = 24; vga_console_info.u.vesa_lfb.rsvd_size = 8; bpp = 32; break; case PixelBlueGreenRedReserved8BitPerColor: vga_console_info.u.vesa_lfb.red_pos = 16; vga_console_info.u.vesa_lfb.red_size = 8; vga_console_info.u.vesa_lfb.green_pos = 8; vga_console_info.u.vesa_lfb.green_size = 8; vga_console_info.u.vesa_lfb.blue_pos = 0; vga_console_info.u.vesa_lfb.blue_size = 8; vga_console_info.u.vesa_lfb.rsvd_pos = 24; vga_console_info.u.vesa_lfb.rsvd_size = 8; bpp = 32; break; case PixelBitMask: bpp = set_color(mode_info->PixelInformation.RedMask, bpp, &vga_console_info.u.vesa_lfb.red_pos, &vga_console_info.u.vesa_lfb.red_size); bpp = set_color(mode_info->PixelInformation.GreenMask, bpp, &vga_console_info.u.vesa_lfb.green_pos, &vga_console_info.u.vesa_lfb.green_size); bpp = set_color(mode_info->PixelInformation.BlueMask, bpp, &vga_console_info.u.vesa_lfb.blue_pos, &vga_console_info.u.vesa_lfb.blue_size); bpp = set_color(mode_info->PixelInformation.ReservedMask, bpp, &vga_console_info.u.vesa_lfb.rsvd_pos, &vga_console_info.u.vesa_lfb.rsvd_size); if ( bpp > 0 ) break; /* fall through */ default: PrintErr(L"Current graphics mode is unsupported!"); status = EFI_UNSUPPORTED; break; } if ( !EFI_ERROR(status) ) { vga_console_info.video_type = XEN_VGATYPE_EFI_LFB; vga_console_info.u.vesa_lfb.gbl_caps = 2; /* possibly non-VGA */ vga_console_info.u.vesa_lfb.width = mode_info->HorizontalResolution; vga_console_info.u.vesa_lfb.height = mode_info->VerticalResolution; vga_console_info.u.vesa_lfb.bits_per_pixel = bpp; vga_console_info.u.vesa_lfb.bytes_per_line = (mode_info->PixelsPerScanLine * bpp + 7) >> 3; vga_console_info.u.vesa_lfb.lfb_base = gop->Mode->FrameBufferBase; vga_console_info.u.vesa_lfb.lfb_size = (gop->Mode->FrameBufferSize + 0xffff) >> 16; } } status = efi_bs->GetMemoryMap(&efi_memmap_size, NULL, &map_key, &efi_mdesc_size, &mdesc_ver); mbi.mem_upper -= efi_memmap_size; mbi.mem_upper &= -__alignof__(EFI_MEMORY_DESCRIPTOR); if ( mbi.mem_upper < xen_phys_start ) blexit(L"Out of static memory"); efi_memmap = (void *)(long)mbi.mem_upper; status = efi_bs->GetMemoryMap(&efi_memmap_size, efi_memmap, &map_key, &efi_mdesc_size, &mdesc_ver); if ( EFI_ERROR(status) ) blexit(L"Cannot obtain memory map"); /* Populate E820 table and check trampoline area availability. 
*/ e = e820map - 1; for ( i = 0; i < efi_memmap_size; i += efi_mdesc_size ) { EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i; u64 len = desc->NumberOfPages << EFI_PAGE_SHIFT; u32 type; switch ( desc->Type ) { default: type = E820_RESERVED; break; case EfiConventionalMemory: case EfiBootServicesCode: case EfiBootServicesData: if ( !trampoline_phys && desc->PhysicalStart + len <= 0x100000 && len >= cfg.size && desc->PhysicalStart + len > cfg.addr ) cfg.addr = (desc->PhysicalStart + len - cfg.size) & PAGE_MASK; /* fall through */ case EfiLoaderCode: case EfiLoaderData: if ( desc->Attribute & EFI_MEMORY_WB ) type = E820_RAM; else case EfiUnusableMemory: type = E820_UNUSABLE; break; case EfiACPIReclaimMemory: type = E820_ACPI; break; case EfiACPIMemoryNVS: type = E820_NVS; break; } if ( e820nr && type == e->type && desc->PhysicalStart == e->addr + e->size ) e->size += len; else if ( !len || e820nr >= E820MAX ) continue; else { ++e; e->addr = desc->PhysicalStart; e->size = len; e->type = type; ++e820nr; } } if ( !trampoline_phys ) { if ( !cfg.addr ) blexit(L"No memory for trampoline"); relocate_trampoline(cfg.addr); } status = efi_bs->ExitBootServices(ImageHandle, map_key); if ( EFI_ERROR(status) ) PrintErrMesg(L"Cannot exit boot services", status); /* Adjust pointers into EFI. */ efi_ct = (void *)efi_ct + DIRECTMAP_VIRT_START; #ifdef USE_SET_VIRTUAL_ADDRESS_MAP efi_rs = (void *)efi_rs + DIRECTMAP_VIRT_START; #endif efi_memmap = (void *)efi_memmap + DIRECTMAP_VIRT_START; efi_fw_vendor = (void *)efi_fw_vendor + DIRECTMAP_VIRT_START; relocate_image(__XEN_VIRT_START - xen_phys_start); memcpy((void *)trampoline_phys, trampoline_start, cfg.size); /* Set system registers and transfer control. */ asm volatile("pushq $0\n\tpopfq"); rdmsrl(MSR_EFER, efer); efer |= EFER_SCE; if ( cpuid_ext_features & (1 << (X86_FEATURE_NX & 0x1f)) ) efer |= EFER_NX; wrmsrl(MSR_EFER, efer); write_cr0(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | X86_CR0_PG); asm volatile ( "mov %[cr4], %%cr4\n\t" "mov %[cr3], %%cr3\n\t" "movabs $__start_xen, %[rip]\n\t" "lidt idt_descr(%%rip)\n\t" "lgdt gdt_descr(%%rip)\n\t" "mov stack_start(%%rip), %%rsp\n\t" "mov %[ds], %%ss\n\t" "mov %[ds], %%ds\n\t" "mov %[ds], %%es\n\t" "mov %[ds], %%fs\n\t" "mov %[ds], %%gs\n\t" "movl %[cs], 8(%%rsp)\n\t" "mov %[rip], (%%rsp)\n\t" "lretq %[stkoff]-16" : [rip] "=&r" (efer/* any dead 64-bit variable */) : [cr3] "r" (idle_pg_table), [cr4] "r" (mmu_cr4_features), [cs] "ir" (__HYPERVISOR_CS), [ds] "r" (__HYPERVISOR_DS), [stkoff] "i" (STACK_SIZE - sizeof(struct cpu_info)), "D" (&mbi) : "memory" ); for( ; ; ); /* not reached */ } #ifndef USE_SET_VIRTUAL_ADDRESS_MAP static __init void copy_mapping(unsigned long mfn, unsigned long end, bool_t (*is_valid)(unsigned long smfn, unsigned long emfn)) { unsigned long next; for ( ; mfn < end; mfn = next ) { l4_pgentry_t l4e = efi_l4_pgtable[l4_table_offset(mfn << PAGE_SHIFT)]; l3_pgentry_t *l3src, *l3dst; unsigned long va = (unsigned long)mfn_to_virt(mfn); next = mfn + (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)); if ( !is_valid(mfn, min(next, end)) ) continue; if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) { l3dst = alloc_xen_pagetable(); BUG_ON(!l3dst); clear_page(l3dst); efi_l4_pgtable[l4_table_offset(mfn << PAGE_SHIFT)] = l4e_from_paddr(virt_to_maddr(l3dst), __PAGE_HYPERVISOR); } else l3dst = l4e_to_l3e(l4e); l3src = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]); l3dst[l3_table_offset(mfn << PAGE_SHIFT)] = l3src[l3_table_offset(va)]; } } static bool_t __init ram_range_valid(unsigned 
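/*
 * A minimal standalone sketch of the EFI-to-E820 type translation applied by
 * the map population loop above: loader, boot-services and conventional
 * memory become RAM only when write-back cacheable (otherwise unusable),
 * ACPI regions keep their own types, and everything else is reported as
 * reserved; adjacent ranges of equal type are then merged. The enum values
 * below are illustrative stand-ins for Xen's E820_* constants, and the case
 * labels use the usual UEFI memory-type numbering.
 */
#include <stdint.h>
#include <stdio.h>

enum { RAM = 1, RESERVED = 2, ACPI = 3, NVS = 4, UNUSABLE = 5 };

static uint32_t efi_to_e820(uint32_t efi_type, int write_back)
{
    switch ( efi_type )
    {
    case 1: case 2:            /* EfiLoaderCode / EfiLoaderData */
    case 3: case 4:            /* EfiBootServicesCode / EfiBootServicesData */
    case 7:                    /* EfiConventionalMemory */
        return write_back ? RAM : UNUSABLE;
    case 8:                    /* EfiUnusableMemory */
        return UNUSABLE;
    case 9:                    /* EfiACPIReclaimMemory */
        return ACPI;
    case 10:                   /* EfiACPIMemoryNVS */
        return NVS;
    default:
        return RESERVED;
    }
}

int main(void)
{
    printf("%u %u\n", efi_to_e820(7, 1), efi_to_e820(9, 0));   /* "1 3" */
    return 0;
}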
long smfn, unsigned long emfn) { unsigned long sz = pfn_to_pdx(emfn - 1) / PDX_GROUP_COUNT + 1; return !(smfn & pfn_hole_mask) && find_next_bit(pdx_group_valid, sz, pfn_to_pdx(smfn) / PDX_GROUP_COUNT) < sz; } static bool_t __init rt_range_valid(unsigned long smfn, unsigned long emfn) { return 1; } #endif #define INVALID_VIRTUAL_ADDRESS (0xBAAADUL << \ (EFI_PAGE_SHIFT + BITS_PER_LONG - 32)) void __init efi_init_memory(void) { unsigned int i; #ifndef USE_SET_VIRTUAL_ADDRESS_MAP struct rt_extra { struct rt_extra *next; unsigned long smfn, emfn; unsigned int prot; } *extra, *extra_head = NULL; #endif printk(XENLOG_INFO "EFI memory map:\n"); for ( i = 0; i < efi_memmap_size; i += efi_mdesc_size ) { EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i; u64 len = desc->NumberOfPages << EFI_PAGE_SHIFT; unsigned long smfn, emfn; unsigned int prot = PAGE_HYPERVISOR; printk(XENLOG_INFO " %013" PRIx64 "-%013" PRIx64 " type=%u attr=%016" PRIx64 "\n", desc->PhysicalStart, desc->PhysicalStart + len - 1, desc->Type, desc->Attribute); if ( !(desc->Attribute & EFI_MEMORY_RUNTIME) ) continue; desc->VirtualStart = INVALID_VIRTUAL_ADDRESS; smfn = PFN_DOWN(desc->PhysicalStart); emfn = PFN_UP(desc->PhysicalStart + len); if ( desc->Attribute & EFI_MEMORY_WB ) /* nothing */; else if ( desc->Attribute & EFI_MEMORY_WT ) prot |= _PAGE_PWT | MAP_SMALL_PAGES; else if ( desc->Attribute & EFI_MEMORY_WC ) prot |= _PAGE_PAT | MAP_SMALL_PAGES; else if ( desc->Attribute & (EFI_MEMORY_UC | EFI_MEMORY_UCE) ) prot |= _PAGE_PWT | _PAGE_PCD | MAP_SMALL_PAGES; else { printk(XENLOG_ERR "Unknown cachability for MFNs %#lx-%#lx\n", smfn, emfn - 1); continue; } if ( desc->Attribute & EFI_MEMORY_WP ) prot &= _PAGE_RW; if ( desc->Attribute & EFI_MEMORY_XP ) prot |= _PAGE_NX_BIT; if ( pfn_to_pdx(emfn - 1) < (DIRECTMAP_SIZE >> PAGE_SHIFT) && !(smfn & pfn_hole_mask) && !((smfn ^ (emfn - 1)) & ~pfn_pdx_bottom_mask) ) { if ( (unsigned long)mfn_to_virt(emfn - 1) >= HYPERVISOR_VIRT_END ) prot &= ~_PAGE_GLOBAL; if ( map_pages_to_xen((unsigned long)mfn_to_virt(smfn), smfn, emfn - smfn, prot) == 0 ) desc->VirtualStart = (unsigned long)maddr_to_virt(desc->PhysicalStart); else printk(XENLOG_ERR "Could not map MFNs %#lx-%#lx\n", smfn, emfn - 1); } #ifndef USE_SET_VIRTUAL_ADDRESS_MAP else if ( !((desc->PhysicalStart + len - 1) >> (VADDR_BITS - 1)) && (extra = xmalloc(struct rt_extra)) != NULL ) { extra->smfn = smfn; extra->emfn = emfn; extra->prot = prot & ~_PAGE_GLOBAL; extra->next = extra_head; extra_head = extra; desc->VirtualStart = desc->PhysicalStart; } #endif else { #ifdef USE_SET_VIRTUAL_ADDRESS_MAP /* XXX allocate e.g. down from FIXADDR_START */ #endif printk(XENLOG_ERR "No mapping for MFNs %#lx-%#lx\n", smfn, emfn - 1); } } #ifdef USE_SET_VIRTUAL_ADDRESS_MAP efi_rs->SetVirtualAddressMap(efi_memmap_size, efi_mdesc_size, mdesc_ver, efi_memmap); #else /* Set up 1:1 page tables to do runtime calls in "physical" mode. */ efi_l4_pgtable = alloc_xen_pagetable(); BUG_ON(!efi_l4_pgtable); clear_page(efi_l4_pgtable); copy_mapping(0, max_page, ram_range_valid); /* Insert non-RAM runtime mappings inside the direct map. */ for ( i = 0; i < efi_memmap_size; i += efi_mdesc_size ) { const EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i; if ( (desc->Attribute & EFI_MEMORY_RUNTIME) && desc->VirtualStart != INVALID_VIRTUAL_ADDRESS && desc->VirtualStart != desc->PhysicalStart ) copy_mapping(PFN_DOWN(desc->PhysicalStart), PFN_UP(desc->PhysicalStart + (desc->NumberOfPages << EFI_PAGE_SHIFT)), rt_range_valid); } /* Insert non-RAM runtime mappings outside of the direct map. 
*/ while ( (extra = extra_head) != NULL ) { unsigned long addr = extra->smfn << PAGE_SHIFT; l4_pgentry_t l4e = efi_l4_pgtable[l4_table_offset(addr)]; l3_pgentry_t *pl3e; l2_pgentry_t *pl2e; l1_pgentry_t *l1t; if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) { pl3e = alloc_xen_pagetable(); BUG_ON(!pl3e); clear_page(pl3e); efi_l4_pgtable[l4_table_offset(addr)] = l4e_from_paddr(virt_to_maddr(pl3e), __PAGE_HYPERVISOR); } else pl3e = l4e_to_l3e(l4e); pl3e += l3_table_offset(addr); if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) { pl2e = alloc_xen_pagetable(); BUG_ON(!pl2e); clear_page(pl2e); *pl3e = l3e_from_paddr(virt_to_maddr(pl2e), __PAGE_HYPERVISOR); } else { BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE); pl2e = l3e_to_l2e(*pl3e); } pl2e += l2_table_offset(addr); if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) { l1t = alloc_xen_pagetable(); BUG_ON(!l1t); clear_page(l1t); *pl2e = l2e_from_paddr(virt_to_maddr(l1t), __PAGE_HYPERVISOR); } else { BUG_ON(l2e_get_flags(*pl2e) & _PAGE_PSE); l1t = l2e_to_l1e(*pl2e); } for ( i = l1_table_offset(addr); i < L1_PAGETABLE_ENTRIES && extra->smfn < extra->emfn; ++i, ++extra->smfn ) l1t[i] = l1e_from_pfn(extra->smfn, extra->prot); if ( extra->smfn == extra->emfn ) { extra_head = extra->next; xfree(extra); } } /* Insert Xen mappings. */ for ( i = l4_table_offset(HYPERVISOR_VIRT_START); i < l4_table_offset(DIRECTMAP_VIRT_END); ++i ) efi_l4_pgtable[i] = idle_pg_table[i]; #endif } xen-4.4.0/xen/arch/x86/efi/compat.c0000664000175000017500000000167612307313555015003 0ustar smbsmb#include #include #define efi_get_info efi_compat_get_info #define xenpf_efi_info compat_pf_efi_info #define efi_runtime_call efi_compat_runtime_call #define xenpf_efi_runtime_call compat_pf_efi_runtime_call #define xenpf_efi_guid compat_pf_efi_guid #define xenpf_efi_time compat_pf_efi_time #define COMPAT #undef DEFINE_XEN_GUEST_HANDLE #define DEFINE_XEN_GUEST_HANDLE DEFINE_COMPAT_HANDLE #undef XEN_GUEST_HANDLE #define XEN_GUEST_HANDLE COMPAT_HANDLE #undef guest_handle_okay #define guest_handle_okay compat_handle_okay #undef guest_handle_cast #define guest_handle_cast compat_handle_cast #undef __copy_from_guest #define __copy_from_guest __copy_from_compat #undef copy_from_guest_offset #define copy_from_guest_offset copy_from_compat_offset #undef copy_to_guest #define copy_to_guest copy_to_compat #undef __copy_to_guest_offset #define __copy_to_guest_offset __copy_to_compat_offset #include "runtime.c" xen-4.4.0/xen/arch/x86/efi/check.c0000664000175000017500000000007612307313555014566 0ustar smbsmbint __attribute__((__ms_abi__)) test(int i) { return i; } xen-4.4.0/xen/arch/x86/efi/runtime.c0000664000175000017500000003465612307313555015207 0ustar smbsmb#include "efi.h" #include #include #include #include #include #include DEFINE_XEN_GUEST_HANDLE(CHAR16); #ifndef COMPAT # include const bool_t efi_enabled = 1; unsigned int __read_mostly efi_num_ct; EFI_CONFIGURATION_TABLE *__read_mostly efi_ct; unsigned int __read_mostly efi_version; unsigned int __read_mostly efi_fw_revision; const CHAR16 *__read_mostly efi_fw_vendor; EFI_RUNTIME_SERVICES *__read_mostly efi_rs; static DEFINE_SPINLOCK(efi_rs_lock); UINTN __read_mostly efi_memmap_size; UINTN __read_mostly efi_mdesc_size; void *__read_mostly efi_memmap; UINT64 __read_mostly efi_boot_max_var_store_size; UINT64 __read_mostly efi_boot_remain_var_store_size; UINT64 __read_mostly efi_boot_max_var_size; struct efi __read_mostly efi = { .acpi = EFI_INVALID_TABLE_ADDR, .acpi20 = EFI_INVALID_TABLE_ADDR, .mps = EFI_INVALID_TABLE_ADDR, .smbios = 
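/*
 * A minimal standalone sketch of the build technique compat.c above relies
 * on: redefine the externally visible names and the guest-handle helpers,
 * then include the native implementation (runtime.c) so the same source is
 * compiled a second time as the 32-bit compat flavour. The WIDTH/do_call
 * names and the notional impl.c below are hypothetical, purely to show the
 * pattern.
 */
#include <stdio.h>

#ifndef do_call                 /* native pass: keep the default names */
# define WIDTH   64
# define do_call do_call_native
#endif

int do_call(int op)             /* one body, compiled under either name */
{
    printf("%d-bit flavour, op %d\n", WIDTH, op);
    return 0;
}

/* A compat wrapper in the style of compat.c would contain only
 *   #define WIDTH   32
 *   #define do_call do_call_compat
 *   #include "impl.c"
 * producing do_call_compat() from the very same body. */

int main(void)
{
    return do_call(3);          /* resolves to do_call_native(3) here */
}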
EFI_INVALID_TABLE_ADDR, }; l4_pgentry_t *__read_mostly efi_l4_pgtable; const struct efi_pci_rom *__read_mostly efi_pci_roms; unsigned long efi_rs_enter(void) { unsigned long cr3 = read_cr3(); spin_lock(&efi_rs_lock); /* prevent fixup_page_fault() from doing anything */ irq_enter(); if ( is_pv_vcpu(current) && !is_idle_vcpu(current) ) { struct desc_ptr gdt_desc = { .limit = LAST_RESERVED_GDT_BYTE, .base = (unsigned long)(per_cpu(gdt_table, smp_processor_id()) - FIRST_RESERVED_GDT_ENTRY) }; asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); } write_cr3(virt_to_maddr(efi_l4_pgtable)); return cr3; } void efi_rs_leave(unsigned long cr3) { write_cr3(cr3); if ( is_pv_vcpu(current) && !is_idle_vcpu(current) ) { struct desc_ptr gdt_desc = { .limit = LAST_RESERVED_GDT_BYTE, .base = GDT_VIRT_START(current) }; asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); } irq_exit(); spin_unlock(&efi_rs_lock); } unsigned long efi_get_time(void) { EFI_TIME time; EFI_STATUS status; unsigned long cr3 = efi_rs_enter(), flags; spin_lock_irqsave(&rtc_lock, flags); status = efi_rs->GetTime(&time, NULL); spin_unlock_irqrestore(&rtc_lock, flags); efi_rs_leave(cr3); if ( EFI_ERROR(status) ) return 0; return mktime(time.Year, time.Month, time.Day, time.Hour, time.Minute, time.Second); } void efi_halt_system(void) { EFI_STATUS status; unsigned long cr3 = efi_rs_enter(); status = efi_rs->ResetSystem(EfiResetShutdown, EFI_SUCCESS, 0, NULL); efi_rs_leave(cr3); printk(XENLOG_WARNING "EFI: could not halt system (%#lx)\n", status); } void efi_reset_system(bool_t warm) { EFI_STATUS status; unsigned long cr3 = efi_rs_enter(); status = efi_rs->ResetSystem(warm ? EfiResetWarm : EfiResetCold, EFI_SUCCESS, 0, NULL); efi_rs_leave(cr3); printk(XENLOG_WARNING "EFI: could not reset system (%#lx)\n", status); } #endif int efi_get_info(uint32_t idx, union xenpf_efi_info *info) { unsigned int i, n; switch ( idx ) { case XEN_FW_EFI_VERSION: info->version = efi_version; break; case XEN_FW_EFI_RT_VERSION: { unsigned long cr3 = efi_rs_enter(); info->version = efi_rs->Hdr.Revision; efi_rs_leave(cr3); break; } case XEN_FW_EFI_CONFIG_TABLE: info->cfg.addr = __pa(efi_ct); info->cfg.nent = efi_num_ct; break; case XEN_FW_EFI_VENDOR: info->vendor.revision = efi_fw_revision; n = info->vendor.bufsz / sizeof(*efi_fw_vendor); if ( !guest_handle_okay(guest_handle_cast(info->vendor.name, CHAR16), n) ) return -EFAULT; for ( i = 0; i < n; ++i ) { if ( __copy_to_guest_offset(info->vendor.name, i, efi_fw_vendor + i, 1) ) return -EFAULT; if ( !efi_fw_vendor[i] ) break; } break; case XEN_FW_EFI_MEM_INFO: for ( i = 0; i < efi_memmap_size; i += efi_mdesc_size ) { EFI_MEMORY_DESCRIPTOR *desc = efi_memmap + i; u64 len = desc->NumberOfPages << EFI_PAGE_SHIFT; if ( info->mem.addr >= desc->PhysicalStart && info->mem.addr < desc->PhysicalStart + len ) { info->mem.type = desc->Type; info->mem.attr = desc->Attribute; if ( info->mem.addr + info->mem.size < info->mem.addr || info->mem.addr + info->mem.size > desc->PhysicalStart + len ) info->mem.size = desc->PhysicalStart + len - info->mem.addr; return 0; } } return -ESRCH; case XEN_FW_EFI_PCI_ROM: { const struct efi_pci_rom *ent; for ( ent = efi_pci_roms; ent; ent = ent->next ) if ( info->pci_rom.segment == ent->segment && info->pci_rom.bus == ent->bus && info->pci_rom.devfn == ent->devfn && info->pci_rom.vendor == ent->vendor && info->pci_rom.devid == ent->devid ) { info->pci_rom.address = __pa(ent->data); info->pci_rom.size = ent->size; return 0; } return -ESRCH; } default: return -EINVAL; } return 0; } static long 
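/*
 * A minimal standalone sketch of the bracket the runtime-service calls in
 * runtime.c go through: efi_rs_enter() above takes the lock, switches to the
 * EFI 1:1 page tables and hands back the previous CR3, and the caller must
 * thread that value to efi_rs_leave() once the firmware call returns. The
 * cr3_t type, the constants and rs_enter()/rs_leave() below are pretend
 * stand-ins used only to show the save/switch/restore shape.
 */
#include <stdio.h>

typedef unsigned long cr3_t;

static cr3_t current_cr3 = 0x1000;     /* pretend current page-table root */
static const cr3_t efi_cr3 = 0x2000;   /* pretend EFI 1:1 mapping root */

static cr3_t rs_enter(void)            /* cf. efi_rs_enter() above */
{
    cr3_t old = current_cr3;
    /* real code also takes efi_rs_lock, enters irq context, reloads the GDT */
    current_cr3 = efi_cr3;
    return old;
}

static void rs_leave(cr3_t old)        /* cf. efi_rs_leave() above */
{
    current_cr3 = old;
    /* real code restores the guest GDT, exits irq context, drops the lock */
}

int main(void)
{
    cr3_t saved = rs_enter();

    printf("service runs with cr3=%#lx\n", current_cr3);   /* 0x2000 */
    rs_leave(saved);
    printf("back on cr3=%#lx\n", current_cr3);              /* 0x1000 */
    return 0;
}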
gwstrlen(XEN_GUEST_HANDLE_PARAM(CHAR16) str) { unsigned long len; for ( len = 0; ; ++len ) { CHAR16 c; if ( copy_from_guest_offset(&c, str, len, 1) ) return -EFAULT; if ( !c ) break; } return len; } static inline EFI_TIME *cast_time(struct xenpf_efi_time *time) { #define chk_fld(F, f) \ BUILD_BUG_ON(sizeof(cast_time(NULL)->F) != sizeof(time->f) || \ offsetof(EFI_TIME, F) != offsetof(struct xenpf_efi_time, f)) chk_fld(Year, year); chk_fld(Month, month); chk_fld(Day, day); chk_fld(Hour, hour); chk_fld(Minute, min); chk_fld(Second, sec); chk_fld(Nanosecond, ns); chk_fld(TimeZone, tz); chk_fld(Daylight, daylight); #undef chk_fld return (void *)time; } static inline EFI_GUID *cast_guid(struct xenpf_efi_guid *guid) { #define chk_fld(n) \ BUILD_BUG_ON(sizeof(cast_guid(NULL)->Data##n) != sizeof(guid->data##n) || \ offsetof(EFI_GUID, Data##n) != \ offsetof(struct xenpf_efi_guid, data##n)) chk_fld(1); chk_fld(2); chk_fld(3); chk_fld(4); #undef chk_fld return (void *)guid; } int efi_runtime_call(struct xenpf_efi_runtime_call *op) { unsigned long cr3, flags; EFI_STATUS status = EFI_NOT_STARTED; int rc = 0; switch ( op->function ) { case XEN_EFI_get_time: { EFI_TIME_CAPABILITIES caps; if ( op->misc ) return -EINVAL; cr3 = efi_rs_enter(); spin_lock_irqsave(&rtc_lock, flags); status = efi_rs->GetTime(cast_time(&op->u.get_time.time), &caps); spin_unlock_irqrestore(&rtc_lock, flags); efi_rs_leave(cr3); if ( !EFI_ERROR(status) ) { op->u.get_time.resolution = caps.Resolution; op->u.get_time.accuracy = caps.Accuracy; if ( caps.SetsToZero ) op->misc = XEN_EFI_GET_TIME_SET_CLEARS_NS; } } break; case XEN_EFI_set_time: if ( op->misc ) return -EINVAL; cr3 = efi_rs_enter(); spin_lock_irqsave(&rtc_lock, flags); status = efi_rs->SetTime(cast_time(&op->u.set_time)); spin_unlock_irqrestore(&rtc_lock, flags); efi_rs_leave(cr3); break; case XEN_EFI_get_wakeup_time: { BOOLEAN enabled, pending; if ( op->misc ) return -EINVAL; cr3 = efi_rs_enter(); spin_lock_irqsave(&rtc_lock, flags); status = efi_rs->GetWakeupTime(&enabled, &pending, cast_time(&op->u.get_wakeup_time)); spin_unlock_irqrestore(&rtc_lock, flags); efi_rs_leave(cr3); if ( !EFI_ERROR(status) ) { if ( enabled ) op->misc |= XEN_EFI_GET_WAKEUP_TIME_ENABLED; if ( pending ) op->misc |= XEN_EFI_GET_WAKEUP_TIME_PENDING; } } break; case XEN_EFI_set_wakeup_time: if ( op->misc & ~(XEN_EFI_SET_WAKEUP_TIME_ENABLE | XEN_EFI_SET_WAKEUP_TIME_ENABLE_ONLY) ) return -EINVAL; cr3 = efi_rs_enter(); spin_lock_irqsave(&rtc_lock, flags); status = efi_rs->SetWakeupTime(!!(op->misc & XEN_EFI_SET_WAKEUP_TIME_ENABLE), (op->misc & XEN_EFI_SET_WAKEUP_TIME_ENABLE_ONLY) ? 
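/*
 * A minimal standalone sketch of the layout-checked cast used by cast_time()
 * and cast_guid() above: the public structure is merely reinterpreted as the
 * firmware structure, and compile-time assertions prove that every field has
 * the same size and offset in both. C11 _Static_assert stands in for Xen's
 * BUILD_BUG_ON, and the fw_time/pub_time types are illustrative only.
 */
#include <stddef.h>
#include <stdint.h>

struct fw_time  { uint16_t Year; uint8_t Month; uint8_t Day; };
struct pub_time { uint16_t year; uint8_t month; uint8_t day; };

#define chk_fld(F, f)                                                   \
    _Static_assert(sizeof(((struct fw_time *)0)->F) ==                  \
                   sizeof(((struct pub_time *)0)->f) &&                 \
                   offsetof(struct fw_time, F) ==                       \
                   offsetof(struct pub_time, f),                        \
                   "layout mismatch: " #F)

chk_fld(Year, year);
chk_fld(Month, month);
chk_fld(Day, day);

static inline struct fw_time *cast_pub(struct pub_time *t)
{
    return (struct fw_time *)t;   /* safe only because of the checks above */
}

int main(void)
{
    struct pub_time t = { .year = 2014, .month = 3, .day = 10 };

    return cast_pub(&t)->Year == 2014 ? 0 : 1;
}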
NULL : cast_time(&op->u.set_wakeup_time)); spin_unlock_irqrestore(&rtc_lock, flags); efi_rs_leave(cr3); op->misc = 0; break; case XEN_EFI_get_next_high_monotonic_count: if ( op->misc ) return -EINVAL; cr3 = efi_rs_enter(); status = efi_rs->GetNextHighMonotonicCount(&op->misc); efi_rs_leave(cr3); break; case XEN_EFI_get_variable: { CHAR16 *name; long len; unsigned char *data; UINTN size; if ( op->misc ) return -EINVAL; len = gwstrlen(guest_handle_cast(op->u.get_variable.name, CHAR16)); if ( len < 0 ) return len; name = xmalloc_array(CHAR16, ++len); if ( !name ) return -ENOMEM; __copy_from_guest(name, op->u.get_variable.name, len); size = op->u.get_variable.size; if ( size ) { data = xmalloc_bytes(size); if ( !data ) { xfree(name); return -ENOMEM; } } else data = NULL; cr3 = efi_rs_enter(); status = efi_rs->GetVariable( name, cast_guid(&op->u.get_variable.vendor_guid), &op->misc, &size, data); efi_rs_leave(cr3); if ( !EFI_ERROR(status) && copy_to_guest(op->u.get_variable.data, data, size) ) rc = -EFAULT; op->u.get_variable.size = size; xfree(data); xfree(name); } break; case XEN_EFI_set_variable: { CHAR16 *name; long len; unsigned char *data; len = gwstrlen(guest_handle_cast(op->u.set_variable.name, CHAR16)); if ( len < 0 ) return len; name = xmalloc_array(CHAR16, ++len); if ( !name ) return -ENOMEM; __copy_from_guest(name, op->u.set_variable.name, len); data = xmalloc_bytes(op->u.set_variable.size); if ( !data ) rc = -ENOMEM; else if ( copy_from_guest(data, op->u.set_variable.data, op->u.set_variable.size) ) rc = -EFAULT; else { cr3 = efi_rs_enter(); status = efi_rs->SetVariable( name, cast_guid(&op->u.set_variable.vendor_guid), op->misc, op->u.set_variable.size, data); efi_rs_leave(cr3); } xfree(data); xfree(name); } break; case XEN_EFI_get_next_variable_name: { union { CHAR16 *str; unsigned char *raw; } name; UINTN size; if ( op->misc ) return -EINVAL; size = op->u.get_next_variable_name.size; name.raw = xmalloc_bytes(size); if ( !name.raw ) return -ENOMEM; if ( copy_from_guest(name.raw, op->u.get_next_variable_name.name, size) ) { xfree(name.raw); return -EFAULT; } cr3 = efi_rs_enter(); status = efi_rs->GetNextVariableName( &size, name.str, cast_guid(&op->u.get_next_variable_name.vendor_guid)); efi_rs_leave(cr3); if ( !EFI_ERROR(status) && copy_to_guest(op->u.get_next_variable_name.name, name.raw, size) ) rc = -EFAULT; op->u.get_next_variable_name.size = size; xfree(name.raw); } break; case XEN_EFI_query_variable_info: if ( op->misc & ~XEN_EFI_VARINFO_BOOT_SNAPSHOT ) return -EINVAL; if ( op->misc & XEN_EFI_VARINFO_BOOT_SNAPSHOT ) { if ( (op->u.query_variable_info.attr & ~EFI_VARIABLE_APPEND_WRITE) != (EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS) ) return -EINVAL; op->u.query_variable_info.max_store_size = efi_boot_max_var_store_size; op->u.query_variable_info.remain_store_size = efi_boot_remain_var_store_size; if ( efi_boot_max_var_store_size ) { op->u.query_variable_info.max_size = efi_boot_max_var_size; status = EFI_SUCCESS; } else { op->u.query_variable_info.max_size = 0; status = efi_boot_max_var_size; } break; } cr3 = efi_rs_enter(); if ( (efi_rs->Hdr.Revision >> 16) < 2 ) { efi_rs_leave(cr3); return -EOPNOTSUPP; } status = efi_rs->QueryVariableInfo( op->u.query_variable_info.attr, &op->u.query_variable_info.max_store_size, &op->u.query_variable_info.remain_store_size, &op->u.query_variable_info.max_size); efi_rs_leave(cr3); break; case XEN_EFI_query_capsule_capabilities: case XEN_EFI_update_capsule: if ( op->misc ) return -EINVAL; cr3 = 
efi_rs_enter(); if ( (efi_rs->Hdr.Revision >> 16) < 2 ) { efi_rs_leave(cr3); return -EOPNOTSUPP; } efi_rs_leave(cr3); /* XXX fall through for now */ default: return -ENOSYS; } #ifndef COMPAT op->status = status; #else op->status = (status & 0x3fffffff) | ((status >> 32) & 0xc0000000); #endif return rc; } xen-4.4.0/xen/arch/x86/efi/efi.h0000664000175000017500000000165712307313555014267 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include struct efi_pci_rom { const struct efi_pci_rom *next; u16 vendor, devid, segment; u8 bus, devfn; unsigned long size; unsigned char data[]; }; extern unsigned int efi_num_ct; extern EFI_CONFIGURATION_TABLE *efi_ct; extern unsigned int efi_version, efi_fw_revision; extern const CHAR16 *efi_fw_vendor; extern EFI_RUNTIME_SERVICES *efi_rs; extern UINTN efi_memmap_size, efi_mdesc_size; extern void *efi_memmap; extern l4_pgentry_t *efi_l4_pgtable; extern const struct efi_pci_rom *efi_pci_roms; extern UINT64 efi_boot_max_var_store_size, efi_boot_remain_var_store_size, efi_boot_max_var_size; unsigned long efi_rs_enter(void); void efi_rs_leave(unsigned long); xen-4.4.0/xen/arch/x86/hpet.c0000664000175000017500000005302112307313555013704 0ustar smbsmb/****************************************************************************** * arch/x86/hpet.c * * HPET management. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MAX_DELTA_NS MILLISECS(10*1000) #define MIN_DELTA_NS MICROSECS(20) #define HPET_EVT_USED_BIT 0 #define HPET_EVT_USED (1 << HPET_EVT_USED_BIT) #define HPET_EVT_DISABLE_BIT 1 #define HPET_EVT_DISABLE (1 << HPET_EVT_DISABLE_BIT) #define HPET_EVT_LEGACY_BIT 2 #define HPET_EVT_LEGACY (1 << HPET_EVT_LEGACY_BIT) struct hpet_event_channel { unsigned long mult; int shift; s_time_t next_event; cpumask_var_t cpumask; spinlock_t lock; void (*event_handler)(struct hpet_event_channel *); unsigned int idx; /* physical channel idx */ unsigned int cpu; /* msi target */ struct msi_desc msi;/* msi state */ unsigned int flags; /* HPET_EVT_x */ } __cacheline_aligned; static struct hpet_event_channel *__read_mostly hpet_events; /* msi hpet channels used for broadcast */ static unsigned int __read_mostly num_hpets_used; DEFINE_PER_CPU(struct hpet_event_channel *, cpu_bc_channel); unsigned long __initdata hpet_address; u8 __initdata hpet_blockid; /* * force_hpet_broadcast: by default legacy hpet broadcast will be stopped * if RTC interrupts are enabled. Enable this option if want to always enable * legacy hpet broadcast for deep C state */ static bool_t __initdata force_hpet_broadcast; boolean_param("hpetbroadcast", force_hpet_broadcast); /* * Calculate a multiplication factor for scaled math, which is used to convert * nanoseconds based values to clock ticks: * * clock_ticks = (nanoseconds * factor) >> shift. * * div_sc is the rearranged equation to calculate a factor from a given clock * ticks / nanoseconds ratio: * * factor = (clock_ticks << shift) / nanoseconds */ static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec, int shift) { uint64_t tmp = ((uint64_t)ticks) << shift; do_div(tmp, nsec); return (unsigned long) tmp; } /* * Convert nanoseconds based values to clock ticks: * * clock_ticks = (nanoseconds * factor) >> shift. 
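 *
 * Illustrative example (the numbers are assumed for illustration, not taken
 * from real hardware): a clock running at 14.318 MHz provides roughly
 * 0.0143 ticks per nanosecond, so with shift = 32 a call such as
 * div_sc(14318180, 1000000000, 32) yields a factor of roughly 61496115,
 * and converting a 1000ns delta gives (1000 * 61496115) >> 32, i.e. about
 * 14 ticks.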
*/ static inline unsigned long ns2ticks(unsigned long nsec, int shift, unsigned long factor) { uint64_t tmp = ((uint64_t)nsec * factor) >> shift; return (unsigned long) tmp; } static int hpet_next_event(unsigned long delta, int timer) { uint32_t cnt, cmp; unsigned long flags; local_irq_save(flags); cnt = hpet_read32(HPET_COUNTER); cmp = cnt + delta; hpet_write32(cmp, HPET_Tn_CMP(timer)); cmp = hpet_read32(HPET_COUNTER); local_irq_restore(flags); /* Are we within two ticks of the deadline passing? Then we may miss. */ return ((cmp + 2 - cnt) > delta) ? -ETIME : 0; } static int reprogram_hpet_evt_channel( struct hpet_event_channel *ch, s_time_t expire, s_time_t now, int force) { int64_t delta; int ret; if ( (ch->flags & HPET_EVT_DISABLE) || (expire == 0) ) return 0; if ( unlikely(expire < 0) ) { printk(KERN_DEBUG "reprogram: expire <= 0\n"); return -ETIME; } delta = expire - now; if ( (delta <= 0) && !force ) return -ETIME; ch->next_event = expire; if ( expire == STIME_MAX ) { /* We assume it will take a long time for the timer to wrap. */ hpet_write32(0, HPET_Tn_CMP(ch->idx)); return 0; } delta = min_t(int64_t, delta, MAX_DELTA_NS); delta = max_t(int64_t, delta, MIN_DELTA_NS); delta = ns2ticks(delta, ch->shift, ch->mult); ret = hpet_next_event(delta, ch->idx); while ( ret && force ) { delta += delta; ret = hpet_next_event(delta, ch->idx); } return ret; } static void evt_do_broadcast(cpumask_t *mask) { unsigned int cpu = smp_processor_id(); if ( cpumask_test_and_clear_cpu(cpu, mask) ) raise_softirq(TIMER_SOFTIRQ); cpuidle_wakeup_mwait(mask); if ( !cpumask_empty(mask) ) cpumask_raise_softirq(mask, TIMER_SOFTIRQ); } static void handle_hpet_broadcast(struct hpet_event_channel *ch) { cpumask_t mask; s_time_t now, next_event; unsigned int cpu; unsigned long flags; spin_lock_irqsave(&ch->lock, flags); again: ch->next_event = STIME_MAX; spin_unlock_irqrestore(&ch->lock, flags); next_event = STIME_MAX; cpumask_clear(&mask); now = NOW(); /* find all expired events */ for_each_cpu(cpu, ch->cpumask) { s_time_t deadline; rmb(); deadline = per_cpu(timer_deadline, cpu); rmb(); if ( !cpumask_test_cpu(cpu, ch->cpumask) ) continue; if ( deadline <= now ) cpumask_set_cpu(cpu, &mask); else if ( deadline < next_event ) next_event = deadline; } /* wakeup the cpus which have an expired event. 
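 *
 * Illustrative walk-through of the loop above (deadlines assumed): if now
 * corresponds to 100us of system time and the per-cpu deadlines are 90us,
 * 150us and 120us, only the first CPU is added to 'mask', while next_event
 * becomes the 120us deadline and is reprogrammed into the channel below.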
*/ evt_do_broadcast(&mask); if ( next_event != STIME_MAX ) { spin_lock_irqsave(&ch->lock, flags); if ( next_event < ch->next_event && reprogram_hpet_evt_channel(ch, next_event, now, 0) ) goto again; spin_unlock_irqrestore(&ch->lock, flags); } } static void hpet_interrupt_handler(int irq, void *data, struct cpu_user_regs *regs) { struct hpet_event_channel *ch = (struct hpet_event_channel *)data; this_cpu(irq_count)--; if ( !ch->event_handler ) { printk(XENLOG_WARNING "Spurious HPET timer interrupt on HPET timer %d\n", ch->idx); return; } ch->event_handler(ch); } static void hpet_msi_unmask(struct irq_desc *desc) { u32 cfg; struct hpet_event_channel *ch = desc->action->dev_id; cfg = hpet_read32(HPET_Tn_CFG(ch->idx)); cfg |= HPET_TN_ENABLE; hpet_write32(cfg, HPET_Tn_CFG(ch->idx)); ch->msi.msi_attrib.masked = 0; } static void hpet_msi_mask(struct irq_desc *desc) { u32 cfg; struct hpet_event_channel *ch = desc->action->dev_id; cfg = hpet_read32(HPET_Tn_CFG(ch->idx)); cfg &= ~HPET_TN_ENABLE; hpet_write32(cfg, HPET_Tn_CFG(ch->idx)); ch->msi.msi_attrib.masked = 1; } static int hpet_msi_write(struct hpet_event_channel *ch, struct msi_msg *msg) { ch->msi.msg = *msg; if ( iommu_intremap ) { int rc = iommu_update_ire_from_msi(&ch->msi, msg); if ( rc ) return rc; } hpet_write32(msg->data, HPET_Tn_ROUTE(ch->idx)); hpet_write32(msg->address_lo, HPET_Tn_ROUTE(ch->idx) + 4); return 0; } static void __maybe_unused hpet_msi_read(struct hpet_event_channel *ch, struct msi_msg *msg) { msg->data = hpet_read32(HPET_Tn_ROUTE(ch->idx)); msg->address_lo = hpet_read32(HPET_Tn_ROUTE(ch->idx) + 4); msg->address_hi = MSI_ADDR_BASE_HI; if ( iommu_intremap ) iommu_read_msi_from_ire(&ch->msi, msg); } static unsigned int hpet_msi_startup(struct irq_desc *desc) { hpet_msi_unmask(desc); return 0; } #define hpet_msi_shutdown hpet_msi_mask static void hpet_msi_ack(struct irq_desc *desc) { irq_complete_move(desc); move_native_irq(desc); ack_APIC_irq(); } static void hpet_msi_set_affinity(struct irq_desc *desc, const cpumask_t *mask) { struct hpet_event_channel *ch = desc->action->dev_id; struct msi_msg msg = ch->msi.msg; msg.dest32 = set_desc_affinity(desc, mask); if ( msg.dest32 == BAD_APICID ) return; msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(desc->arch.vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(msg.dest32); if ( msg.data != ch->msi.msg.data || msg.dest32 != ch->msi.msg.dest32 ) hpet_msi_write(ch, &msg); } /* * IRQ Chip for MSI HPET Devices, */ static hw_irq_controller hpet_msi_type = { .typename = "HPET-MSI", .startup = hpet_msi_startup, .shutdown = hpet_msi_shutdown, .enable = hpet_msi_unmask, .disable = hpet_msi_mask, .ack = hpet_msi_ack, .set_affinity = hpet_msi_set_affinity, }; static int __hpet_setup_msi_irq(struct irq_desc *desc) { struct msi_msg msg; msi_compose_msg(desc->arch.vector, desc->arch.cpu_mask, &msg); return hpet_msi_write(desc->action->dev_id, &msg); } static int __init hpet_setup_msi_irq(struct hpet_event_channel *ch) { int ret; u32 cfg = hpet_read32(HPET_Tn_CFG(ch->idx)); irq_desc_t *desc = irq_to_desc(ch->msi.irq); if ( iommu_intremap ) { ch->msi.hpet_id = hpet_blockid; ret = iommu_setup_hpet_msi(&ch->msi); if ( ret ) return ret; } /* set HPET Tn as oneshot */ cfg &= ~(HPET_TN_LEVEL | HPET_TN_PERIODIC); cfg |= HPET_TN_FSB | HPET_TN_32BIT; hpet_write32(cfg, HPET_Tn_CFG(ch->idx)); desc->handler = &hpet_msi_type; ret = request_irq(ch->msi.irq, hpet_interrupt_handler, "HPET", ch); if ( ret >= 0 ) ret = __hpet_setup_msi_irq(desc); if ( ret < 0 ) 
{ if ( iommu_intremap ) iommu_update_ire_from_msi(&ch->msi, NULL); return ret; } desc->msi_desc = &ch->msi; return 0; } static int __init hpet_assign_irq(struct hpet_event_channel *ch) { int irq; if ( (irq = create_irq(NUMA_NO_NODE)) < 0 ) return irq; ch->msi.irq = irq; if ( hpet_setup_msi_irq(ch) ) { destroy_irq(irq); return -EINVAL; } return 0; } static void __init hpet_fsb_cap_lookup(void) { u32 id; unsigned int i, num_chs; if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) ) return; id = hpet_read32(HPET_ID); num_chs = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); num_chs++; /* Value read out starts from 0 */ hpet_events = xzalloc_array(struct hpet_event_channel, num_chs); if ( !hpet_events ) return; for ( i = 0; i < num_chs && num_hpets_used < nr_cpu_ids; i++ ) { struct hpet_event_channel *ch = &hpet_events[num_hpets_used]; u32 cfg = hpet_read32(HPET_Tn_CFG(i)); /* Only consider HPET timer with MSI support */ if ( !(cfg & HPET_TN_FSB_CAP) ) continue; if ( !zalloc_cpumask_var(&ch->cpumask) ) { if ( !num_hpets_used ) { xfree(hpet_events); hpet_events = NULL; } break; } ch->flags = 0; ch->idx = i; if ( hpet_assign_irq(ch) == 0 ) num_hpets_used++; } printk(XENLOG_INFO "HPET: %u timers usable for broadcast (%u total)\n", num_hpets_used, num_chs); } static struct hpet_event_channel *hpet_get_channel(unsigned int cpu) { static unsigned int next_channel; unsigned int i, next; struct hpet_event_channel *ch; if ( num_hpets_used == 0 ) return hpet_events; if ( num_hpets_used >= nr_cpu_ids ) return &hpet_events[cpu]; do { next = next_channel; if ( (i = next + 1) == num_hpets_used ) i = 0; } while ( cmpxchg(&next_channel, next, i) != next ); /* try unused channel first */ for ( i = next; i < next + num_hpets_used; i++ ) { ch = &hpet_events[i % num_hpets_used]; if ( !test_and_set_bit(HPET_EVT_USED_BIT, &ch->flags) ) { ch->cpu = cpu; return ch; } } /* share a in-use channel */ ch = &hpet_events[next]; if ( !test_and_set_bit(HPET_EVT_USED_BIT, &ch->flags) ) ch->cpu = cpu; return ch; } static void set_channel_irq_affinity(struct hpet_event_channel *ch) { struct irq_desc *desc = irq_to_desc(ch->msi.irq); ASSERT(!local_irq_is_enabled()); spin_lock(&desc->lock); hpet_msi_mask(desc); hpet_msi_set_affinity(desc, cpumask_of(ch->cpu)); hpet_msi_unmask(desc); spin_unlock(&desc->lock); spin_unlock(&ch->lock); /* We may have missed an interrupt due to the temporary masking. 
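 * If so, the check below sees that the channel's deadline is already in the
 * past and runs the event handler directly rather than waiting for the next
 * (now unmasked) MSI to arrive.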
*/ if ( ch->event_handler && ch->next_event < NOW() ) ch->event_handler(ch); } static void hpet_attach_channel(unsigned int cpu, struct hpet_event_channel *ch) { ASSERT(!local_irq_is_enabled()); spin_lock(&ch->lock); per_cpu(cpu_bc_channel, cpu) = ch; /* try to be the channel owner again while holding the lock */ if ( !test_and_set_bit(HPET_EVT_USED_BIT, &ch->flags) ) ch->cpu = cpu; if ( ch->cpu != cpu ) spin_unlock(&ch->lock); else set_channel_irq_affinity(ch); } static void hpet_detach_channel(unsigned int cpu, struct hpet_event_channel *ch) { spin_lock_irq(&ch->lock); ASSERT(ch == per_cpu(cpu_bc_channel, cpu)); per_cpu(cpu_bc_channel, cpu) = NULL; if ( cpu != ch->cpu ) spin_unlock_irq(&ch->lock); else if ( cpumask_empty(ch->cpumask) ) { ch->cpu = -1; clear_bit(HPET_EVT_USED_BIT, &ch->flags); spin_unlock_irq(&ch->lock); } else { ch->cpu = cpumask_first(ch->cpumask); set_channel_irq_affinity(ch); local_irq_enable(); } } #include void (*__read_mostly pv_rtc_handler)(uint8_t index, uint8_t value); static void handle_rtc_once(uint8_t index, uint8_t value) { if ( index != RTC_REG_B ) return; /* RTC Reg B, contain PIE/AIE/UIE */ if ( value & (RTC_PIE | RTC_AIE | RTC_UIE ) ) { cpuidle_disable_deep_cstate(); pv_rtc_handler = NULL; } } void __init hpet_broadcast_init(void) { u64 hpet_rate = hpet_setup(); u32 hpet_id, cfg; unsigned int i, n; if ( hpet_rate == 0 || hpet_broadcast_is_available() ) return; cfg = hpet_read32(HPET_CFG); hpet_fsb_cap_lookup(); if ( num_hpets_used > 0 ) { /* Stop HPET legacy interrupts */ cfg &= ~HPET_CFG_LEGACY; n = num_hpets_used; } else { hpet_id = hpet_read32(HPET_ID); if ( !(hpet_id & HPET_ID_LEGSUP) ) return; if ( !hpet_events ) hpet_events = xzalloc(struct hpet_event_channel); if ( !hpet_events || !zalloc_cpumask_var(&hpet_events->cpumask) ) return; hpet_events->msi.irq = -1; /* Start HPET legacy interrupts */ cfg |= HPET_CFG_LEGACY; n = 1; if ( !force_hpet_broadcast ) pv_rtc_handler = handle_rtc_once; } hpet_write32(cfg, HPET_CFG); for ( i = 0; i < n; i++ ) { if ( i == 0 && (cfg & HPET_CFG_LEGACY) ) { /* set HPET T0 as oneshot */ cfg = hpet_read32(HPET_Tn_CFG(0)); cfg &= ~(HPET_TN_LEVEL | HPET_TN_PERIODIC); cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; hpet_write32(cfg, HPET_Tn_CFG(0)); } /* * The period is a femto seconds value. We need to calculate the scaled * math multiplication factor for nanosecond to hpet tick conversion. 
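 *
 * Illustrative example (assumed period value): a HPET_PERIOD of 69841279 fs
 * corresponds to hpet_rate = 10^15 / 69841279, i.e. about 14318180 ticks/s
 * (see hpet_setup() further down), so the div_sc() call below produces a
 * mult of roughly 6.1e7 for shift = 32.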
*/ hpet_events[i].mult = div_sc((unsigned long)hpet_rate, 1000000000ul, 32); hpet_events[i].shift = 32; hpet_events[i].next_event = STIME_MAX; spin_lock_init(&hpet_events[i].lock); wmb(); hpet_events[i].event_handler = handle_hpet_broadcast; hpet_events[i].msi.msi_attrib.maskbit = 1; hpet_events[i].msi.msi_attrib.pos = MSI_TYPE_HPET; } if ( !num_hpets_used ) hpet_events->flags = HPET_EVT_LEGACY; } void hpet_broadcast_resume(void) { u32 cfg; unsigned int i, n; if ( !hpet_events ) return; hpet_resume(NULL); cfg = hpet_read32(HPET_CFG); if ( num_hpets_used > 0 ) { /* Stop HPET legacy interrupts */ cfg &= ~HPET_CFG_LEGACY; n = num_hpets_used; } else if ( hpet_events->flags & HPET_EVT_DISABLE ) return; else { /* Start HPET legacy interrupts */ cfg |= HPET_CFG_LEGACY; n = 1; } hpet_write32(cfg, HPET_CFG); for ( i = 0; i < n; i++ ) { if ( hpet_events[i].msi.irq >= 0 ) __hpet_setup_msi_irq(irq_to_desc(hpet_events[i].msi.irq)); /* set HPET Tn as oneshot */ cfg = hpet_read32(HPET_Tn_CFG(hpet_events[i].idx)); cfg &= ~(HPET_TN_LEVEL | HPET_TN_PERIODIC); cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; if ( !(hpet_events[i].flags & HPET_EVT_LEGACY) ) cfg |= HPET_TN_FSB; hpet_write32(cfg, HPET_Tn_CFG(hpet_events[i].idx)); hpet_events[i].next_event = STIME_MAX; } } void hpet_disable_legacy_broadcast(void) { u32 cfg; unsigned long flags; if ( !hpet_events || !(hpet_events->flags & HPET_EVT_LEGACY) ) return; spin_lock_irqsave(&hpet_events->lock, flags); hpet_events->flags |= HPET_EVT_DISABLE; /* disable HPET T0 */ cfg = hpet_read32(HPET_Tn_CFG(0)); cfg &= ~HPET_TN_ENABLE; hpet_write32(cfg, HPET_Tn_CFG(0)); /* Stop HPET legacy interrupts */ cfg = hpet_read32(HPET_CFG); cfg &= ~HPET_CFG_LEGACY; hpet_write32(cfg, HPET_CFG); spin_unlock_irqrestore(&hpet_events->lock, flags); smp_send_event_check_mask(&cpu_online_map); } void hpet_broadcast_enter(void) { unsigned int cpu = smp_processor_id(); struct hpet_event_channel *ch = per_cpu(cpu_bc_channel, cpu); if ( per_cpu(timer_deadline, cpu) == 0 ) return; if ( !ch ) ch = hpet_get_channel(cpu); ASSERT(!local_irq_is_enabled()); if ( !(ch->flags & HPET_EVT_LEGACY) ) hpet_attach_channel(cpu, ch); /* Disable LAPIC timer interrupts. */ disable_APIC_timer(); cpumask_set_cpu(cpu, ch->cpumask); spin_lock(&ch->lock); /* reprogram if current cpu expire time is nearer */ if ( per_cpu(timer_deadline, cpu) < ch->next_event ) reprogram_hpet_evt_channel(ch, per_cpu(timer_deadline, cpu), NOW(), 1); spin_unlock(&ch->lock); } void hpet_broadcast_exit(void) { unsigned int cpu = smp_processor_id(); struct hpet_event_channel *ch = per_cpu(cpu_bc_channel, cpu); if ( per_cpu(timer_deadline, cpu) == 0 ) return; if ( !ch ) ch = hpet_get_channel(cpu); /* Reprogram the deadline; trigger timer work now if it has passed. 
*/ enable_APIC_timer(); if ( !reprogram_timer(per_cpu(timer_deadline, cpu)) ) raise_softirq(TIMER_SOFTIRQ); cpumask_clear_cpu(cpu, ch->cpumask); if ( !(ch->flags & HPET_EVT_LEGACY) ) hpet_detach_channel(cpu, ch); } int hpet_broadcast_is_available(void) { return ((hpet_events && (hpet_events->flags & HPET_EVT_LEGACY)) || num_hpets_used > 0); } int hpet_legacy_irq_tick(void) { this_cpu(irq_count)--; if ( !hpet_events || (hpet_events->flags & (HPET_EVT_DISABLE|HPET_EVT_LEGACY)) != HPET_EVT_LEGACY ) return 0; hpet_events->event_handler(hpet_events); return 1; } static u32 *hpet_boot_cfg; u64 __init hpet_setup(void) { static u64 __initdata hpet_rate; u32 hpet_id, hpet_period; unsigned int last; if ( hpet_rate ) return hpet_rate; if ( hpet_address == 0 ) return 0; set_fixmap_nocache(FIX_HPET_BASE, hpet_address); hpet_id = hpet_read32(HPET_ID); if ( (hpet_id & HPET_ID_REV) == 0 ) { printk("BAD HPET revision id.\n"); return 0; } /* Check for sane period (100ps <= period <= 100ns). */ hpet_period = hpet_read32(HPET_PERIOD); if ( (hpet_period > 100000000) || (hpet_period < 100000) ) { printk("BAD HPET period %u.\n", hpet_period); return 0; } last = (hpet_id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; hpet_boot_cfg = xmalloc_array(u32, 2 + last); hpet_resume(hpet_boot_cfg); hpet_rate = 1000000000000000ULL; /* 10^15 */ (void)do_div(hpet_rate, hpet_period); return hpet_rate; } void hpet_resume(u32 *boot_cfg) { static u32 system_reset_latch; u32 hpet_id, cfg; unsigned int i, last; if ( system_reset_latch == system_reset_counter ) return; system_reset_latch = system_reset_counter; cfg = hpet_read32(HPET_CFG); if ( boot_cfg ) *boot_cfg = cfg; cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); if ( cfg ) { printk(XENLOG_WARNING "HPET: reserved bits %#x set in global config register\n", cfg); cfg = 0; } hpet_write32(cfg, HPET_CFG); hpet_id = hpet_read32(HPET_ID); last = (hpet_id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; for ( i = 0; i <= last; ++i ) { cfg = hpet_read32(HPET_Tn_CFG(i)); if ( boot_cfg ) boot_cfg[i + 1] = cfg; cfg &= ~HPET_TN_ENABLE; if ( cfg & HPET_TN_RESERVED ) { printk(XENLOG_WARNING "HPET: reserved bits %#x set in channel %u config register\n", cfg & HPET_TN_RESERVED, i); cfg &= ~HPET_TN_RESERVED; } hpet_write32(cfg, HPET_Tn_CFG(i)); } cfg = hpet_read32(HPET_CFG); cfg |= HPET_CFG_ENABLE; hpet_write32(cfg, HPET_CFG); } void hpet_disable(void) { unsigned int i; u32 id; if ( !hpet_boot_cfg ) { if ( hpet_broadcast_is_available() ) hpet_disable_legacy_broadcast(); return; } hpet_write32(*hpet_boot_cfg & ~HPET_CFG_ENABLE, HPET_CFG); id = hpet_read32(HPET_ID); for ( i = 0; i <= ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); ++i ) hpet_write32(hpet_boot_cfg[i + 1], HPET_Tn_CFG(i)); if ( *hpet_boot_cfg & HPET_CFG_ENABLE ) hpet_write32(*hpet_boot_cfg, HPET_CFG); } xen-4.4.0/xen/arch/x86/usercopy.c0000664000175000017500000001160712307313555014621 0ustar smbsmb/* * User address space access functions. * * Copyright 1997 Andi Kleen * Copyright 1997 Linus Torvalds * Copyright 2002 Andi Kleen */ #include #include #include unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned n) { unsigned long __d0, __d1, __d2, __n = n; asm volatile ( " cmp $"STR(2*BYTES_PER_LONG-1)",%0\n" " jbe 1f\n" " mov %1,%0\n" " neg %0\n" " and $"STR(BYTES_PER_LONG-1)",%0\n" " sub %0,%3\n" "4: rep movsb\n" /* make 'to' address aligned */ " mov %3,%0\n" " shr $"STR(LONG_BYTEORDER)",%0\n" " and $"STR(BYTES_PER_LONG-1)",%3\n" " .align 2,0x90\n" "0: rep movs"__OS"\n" /* as many words as possible... 
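 * (illustration with assumed sizes: copying 23 bytes to a destination that
 * is 3 bytes short of an 8-byte boundary first copies 3 single bytes to
 * align, then two 8-byte words here, then the 4 remaining bytes below)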
*/ " mov %3,%0\n" "1: rep movsb\n" /* ...remainder copied as bytes */ "2:\n" ".section .fixup,\"ax\"\n" "5: add %3,%0\n" " jmp 2b\n" "3: lea 0(%3,%0,"STR(BYTES_PER_LONG)"),%0\n" " jmp 2b\n" ".previous\n" _ASM_EXTABLE(4b, 5b) _ASM_EXTABLE(0b, 3b) _ASM_EXTABLE(1b, 2b) : "=&c" (__n), "=&D" (__d0), "=&S" (__d1), "=&r" (__d2) : "0" (__n), "1" (to), "2" (from), "3" (__n) : "memory" ); return __n; } unsigned long __copy_from_user_ll(void *to, const void __user *from, unsigned n) { unsigned long __d0, __d1, __d2, __n = n; asm volatile ( " cmp $"STR(2*BYTES_PER_LONG-1)",%0\n" " jbe 1f\n" " mov %1,%0\n" " neg %0\n" " and $"STR(BYTES_PER_LONG-1)",%0\n" " sub %0,%3\n" "4: rep; movsb\n" /* make 'to' address aligned */ " mov %3,%0\n" " shr $"STR(LONG_BYTEORDER)",%0\n" " and $"STR(BYTES_PER_LONG-1)",%3\n" " .align 2,0x90\n" "0: rep; movs"__OS"\n" /* as many words as possible... */ " mov %3,%0\n" "1: rep; movsb\n" /* ...remainder copied as bytes */ "2:\n" ".section .fixup,\"ax\"\n" "5: add %3,%0\n" " jmp 6f\n" "3: lea 0(%3,%0,"STR(BYTES_PER_LONG)"),%0\n" "6: push %0\n" " push %%"__OP"ax\n" " xor %%eax,%%eax\n" " rep; stosb\n" " pop %%"__OP"ax\n" " pop %0\n" " jmp 2b\n" ".previous\n" _ASM_EXTABLE(4b, 5b) _ASM_EXTABLE(0b, 3b) _ASM_EXTABLE(1b, 6b) : "=&c" (__n), "=&D" (__d0), "=&S" (__d1), "=&r" (__d2) : "0" (__n), "1" (to), "2" (from), "3" (__n) : "memory" ); return __n; } /** * copy_to_user: - Copy a block of data into user space. * @to: Destination address, in user space. * @from: Source address, in kernel space. * @n: Number of bytes to copy. * * Context: User context only. This function may sleep. * * Copy data from kernel space to user space. * * Returns number of bytes that could not be copied. * On success, this will be zero. */ unsigned long copy_to_user(void __user *to, const void *from, unsigned n) { if ( access_ok(to, n) ) n = __copy_to_user(to, from, n); return n; } #define __do_clear_user(addr,size) \ do { \ long __d0; \ __asm__ __volatile__( \ "0: rep; stosl\n" \ " movl %2,%0\n" \ "1: rep; stosb\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: lea 0(%2,%0,4),%0\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE(0b,3b) \ _ASM_EXTABLE(1b,2b) \ : "=&c"(size), "=&D" (__d0) \ : "r"(size & 3), "0"(size / 4), "1"((long)addr), "a"(0)); \ } while (0) /** * clear_user: - Zero a block of memory in user space. * @to: Destination address, in user space. * @n: Number of bytes to zero. * * Zero a block of memory in user space. * * Returns number of bytes that could not be cleared. * On success, this will be zero. */ unsigned long clear_user(void __user *to, unsigned n) { if ( access_ok(to, n) ) __do_clear_user(to, n); return n; } /** * copy_from_user: - Copy a block of data from user space. * @to: Destination address, in kernel space. * @from: Source address, in user space. * @n: Number of bytes to copy. * * Context: User context only. This function may sleep. * * Copy data from user space to kernel space. * * Returns number of bytes that could not be copied. * On success, this will be zero. * * If some data could not be copied, this function will pad the copied * data to the requested size using zero bytes. 
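 *
 * Minimal usage sketch (the names 'buf' and 'uaddr' are illustrative only
 * and do not appear in this file):
 *
 *     char buf[16];
 *     if ( copy_from_user(buf, uaddr, sizeof(buf)) != 0 )
 *         return -EFAULT;    // short copy; the tail of buf was zero-filled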
*/ unsigned long copy_from_user(void *to, const void __user *from, unsigned n) { if ( access_ok(from, n) ) n = __copy_from_user(to, from, n); else memset(to, 0, n); return n; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/pci.c0000664000175000017500000000247312307313555013524 0ustar smbsmb/****************************************************************************** * pci.c * * Architecture-dependent PCI access functions. */ #include #include #include static DEFINE_SPINLOCK(pci_config_lock); uint32_t pci_conf_read(uint32_t cf8, uint8_t offset, uint8_t bytes) { unsigned long flags; uint32_t value; BUG_ON((offset + bytes) > 4); spin_lock_irqsave(&pci_config_lock, flags); outl(cf8, 0xcf8); switch ( bytes ) { case 1: value = inb(0xcfc + offset); break; case 2: value = inw(0xcfc + offset); break; case 4: value = inl(0xcfc + offset); break; default: value = 0; BUG(); } spin_unlock_irqrestore(&pci_config_lock, flags); return value; } void pci_conf_write(uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data) { unsigned long flags; BUG_ON((offset + bytes) > 4); spin_lock_irqsave(&pci_config_lock, flags); outl(cf8, 0xcf8); switch ( bytes ) { case 1: outb((uint8_t)data, 0xcfc + offset); break; case 2: outw((uint16_t)data, 0xcfc + offset); break; case 4: outl(data, 0xcfc + offset); break; } spin_unlock_irqrestore(&pci_config_lock, flags); } xen-4.4.0/xen/arch/x86/physdev.c0000664000175000017500000005204712307313555014435 0ustar smbsmb #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int physdev_map_pirq(domid_t, int type, int *index, int *pirq_p, struct msi_info *); int physdev_unmap_pirq(domid_t, int pirq); #include "x86_64/mmconfig.h" #ifndef COMPAT typedef long ret_t; static int physdev_hvm_map_pirq( struct domain *d, int type, int *index, int *pirq) { int ret = 0; spin_lock(&d->event_lock); switch ( type ) { case MAP_PIRQ_TYPE_GSI: { struct hvm_irq_dpci *hvm_irq_dpci; struct hvm_girq_dpci_mapping *girq; uint32_t machine_gsi = 0; if ( *index < 0 || *index >= NR_HVM_IRQS ) { ret = -EINVAL; break; } /* find the machine gsi corresponding to the * emulated gsi */ hvm_irq_dpci = domain_get_irq_dpci(d); if ( hvm_irq_dpci ) { BUILD_BUG_ON(ARRAY_SIZE(hvm_irq_dpci->girq) < NR_HVM_IRQS); list_for_each_entry ( girq, &hvm_irq_dpci->girq[*index], list ) machine_gsi = girq->machine_gsi; } /* found one, this mean we are dealing with a pt device */ if ( machine_gsi ) { *index = domain_pirq_to_irq(d, machine_gsi); *pirq = machine_gsi; ret = (*pirq > 0) ? 0 : *pirq; } /* we didn't find any, this means we are dealing * with an emulated device */ else { if ( *pirq < 0 ) *pirq = get_free_pirq(d, type); ret = map_domain_emuirq_pirq(d, *pirq, *index); } break; } default: ret = -EINVAL; dprintk(XENLOG_G_WARNING, "map type %d not supported yet\n", type); break; } spin_unlock(&d->event_lock); return ret; } int physdev_map_pirq(domid_t domid, int type, int *index, int *pirq_p, struct msi_info *msi) { struct domain *d = current->domain; int pirq, irq, ret = 0; void *map_data = NULL; if ( domid == DOMID_SELF && is_hvm_domain(d) ) { /* * Only makes sense for vector-based callback, else HVM-IRQ logic * calls back into itself and deadlocks on hvm_domain.irq_lock. 
*/ if ( !is_hvm_pv_evtchn_domain(d) ) return -EINVAL; return physdev_hvm_map_pirq(d, type, index, pirq_p); } d = rcu_lock_domain_by_any_id(domid); if ( d == NULL ) return -ESRCH; ret = xsm_map_domain_pirq(XSM_TARGET, d); if ( ret ) goto free_domain; /* Verify or get irq. */ switch ( type ) { case MAP_PIRQ_TYPE_GSI: if ( *index < 0 || *index >= nr_irqs_gsi ) { dprintk(XENLOG_G_ERR, "dom%d: map invalid irq %d\n", d->domain_id, *index); ret = -EINVAL; goto free_domain; } irq = domain_pirq_to_irq(current->domain, *index); if ( irq <= 0 ) { if ( is_hardware_domain(current->domain) ) irq = *index; else { dprintk(XENLOG_G_ERR, "dom%d: map pirq with incorrect irq!\n", d->domain_id); ret = -EINVAL; goto free_domain; } } break; case MAP_PIRQ_TYPE_MSI: if ( !msi->table_base ) msi->entry_nr = 1; irq = *index; if ( irq == -1 ) case MAP_PIRQ_TYPE_MULTI_MSI: irq = create_irq(NUMA_NO_NODE); if ( irq < nr_irqs_gsi || irq >= nr_irqs ) { dprintk(XENLOG_G_ERR, "dom%d: can't create irq for msi!\n", d->domain_id); ret = -EINVAL; goto free_domain; } msi->irq = irq; map_data = msi; break; default: dprintk(XENLOG_G_ERR, "dom%d: wrong map_pirq type %x\n", d->domain_id, type); ret = -EINVAL; goto free_domain; } spin_lock(&pcidevs_lock); /* Verify or get pirq. */ spin_lock(&d->event_lock); pirq = domain_irq_to_pirq(d, irq); if ( *pirq_p < 0 ) { if ( pirq ) { dprintk(XENLOG_G_ERR, "dom%d: %d:%d already mapped to %d\n", d->domain_id, *index, *pirq_p, pirq); if ( pirq < 0 ) { ret = -EBUSY; goto done; } } else if ( type == MAP_PIRQ_TYPE_MULTI_MSI ) { if ( msi->entry_nr <= 0 || msi->entry_nr > 32 ) ret = -EDOM; else if ( msi->entry_nr != 1 && !iommu_intremap ) ret = -EOPNOTSUPP; else { while ( msi->entry_nr & (msi->entry_nr - 1) ) msi->entry_nr += msi->entry_nr & -msi->entry_nr; pirq = get_free_pirqs(d, msi->entry_nr); if ( pirq < 0 ) { while ( (msi->entry_nr >>= 1) > 1 ) if ( get_free_pirqs(d, msi->entry_nr) > 0 ) break; dprintk(XENLOG_G_ERR, "dom%d: no block of %d free pirqs\n", d->domain_id, msi->entry_nr << 1); ret = pirq; } } if ( ret < 0 ) goto done; } else { pirq = get_free_pirq(d, type); if ( pirq < 0 ) { dprintk(XENLOG_G_ERR, "dom%d: no free pirq\n", d->domain_id); ret = pirq; goto done; } } } else { if ( pirq && pirq != *pirq_p ) { dprintk(XENLOG_G_ERR, "dom%d: pirq %d conflicts with irq %d\n", d->domain_id, *index, *pirq_p); ret = -EEXIST; goto done; } else pirq = *pirq_p; } ret = map_domain_pirq(d, pirq, irq, type, map_data); if ( ret == 0 ) *pirq_p = pirq; done: spin_unlock(&d->event_lock); spin_unlock(&pcidevs_lock); if ( ret != 0 ) switch ( type ) { case MAP_PIRQ_TYPE_MSI: if ( *index == -1 ) case MAP_PIRQ_TYPE_MULTI_MSI: destroy_irq(irq); break; } free_domain: rcu_unlock_domain(d); return ret; } int physdev_unmap_pirq(domid_t domid, int pirq) { struct domain *d; int ret; d = rcu_lock_domain_by_any_id(domid); if ( d == NULL ) return -ESRCH; ret = xsm_unmap_domain_pirq(XSM_TARGET, d); if ( ret ) goto free_domain; if ( is_hvm_domain(d) ) { spin_lock(&d->event_lock); if ( domain_pirq_to_emuirq(d, pirq) != IRQ_UNBOUND ) ret = unmap_domain_pirq_emuirq(d, pirq); spin_unlock(&d->event_lock); if ( domid == DOMID_SELF || ret ) goto free_domain; } spin_lock(&pcidevs_lock); spin_lock(&d->event_lock); ret = unmap_domain_pirq(d, pirq); spin_unlock(&d->event_lock); spin_unlock(&pcidevs_lock); free_domain: rcu_unlock_domain(d); return ret; } #endif /* COMPAT */ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { int irq; ret_t ret; struct vcpu *v = current; switch ( cmd ) { case PHYSDEVOP_eoi: { struct 
physdev_eoi eoi; struct pirq *pirq; ret = -EFAULT; if ( copy_from_guest(&eoi, arg, 1) != 0 ) break; ret = -EINVAL; if ( eoi.irq >= v->domain->nr_pirqs ) break; spin_lock(&v->domain->event_lock); pirq = pirq_info(v->domain, eoi.irq); if ( !pirq ) { spin_unlock(&v->domain->event_lock); break; } if ( is_pv_domain(v->domain) && v->domain->arch.pv_domain.auto_unmask ) evtchn_unmask(pirq->evtchn); if ( is_pv_domain(v->domain) || domain_pirq_to_irq(v->domain, eoi.irq) > 0 ) pirq_guest_eoi(pirq); if ( is_hvm_domain(v->domain) && domain_pirq_to_emuirq(v->domain, eoi.irq) > 0 ) { struct hvm_irq *hvm_irq = &v->domain->arch.hvm_domain.irq; int gsi = domain_pirq_to_emuirq(v->domain, eoi.irq); /* if this is a level irq and count > 0, send another * notification */ if ( gsi >= NR_ISAIRQS /* ISA irqs are edge triggered */ && hvm_irq->gsi_assert_count[gsi] ) send_guest_pirq(v->domain, pirq); } spin_unlock(&v->domain->event_lock); ret = 0; break; } case PHYSDEVOP_pirq_eoi_gmfn_v2: case PHYSDEVOP_pirq_eoi_gmfn_v1: { struct physdev_pirq_eoi_gmfn info; unsigned long mfn; struct page_info *page; ret = -EFAULT; if ( copy_from_guest(&info, arg, 1) != 0 ) break; ret = -EINVAL; page = get_page_from_gfn(current->domain, info.gmfn, NULL, P2M_ALLOC); if ( !page ) break; if ( !get_page_type(page, PGT_writable_page) ) { put_page(page); break; } mfn = page_to_mfn(page); if ( cmpxchg(&v->domain->arch.pv_domain.pirq_eoi_map_mfn, 0, mfn) != 0 ) { put_page_and_type(mfn_to_page(mfn)); ret = -EBUSY; break; } v->domain->arch.pv_domain.pirq_eoi_map = map_domain_page_global(mfn); if ( v->domain->arch.pv_domain.pirq_eoi_map == NULL ) { v->domain->arch.pv_domain.pirq_eoi_map_mfn = 0; put_page_and_type(mfn_to_page(mfn)); ret = -ENOSPC; break; } if ( cmd == PHYSDEVOP_pirq_eoi_gmfn_v1 ) v->domain->arch.pv_domain.auto_unmask = 1; ret = 0; break; } /* Legacy since 0x00030202. */ case PHYSDEVOP_IRQ_UNMASK_NOTIFY: { ret = pirq_guest_unmask(v->domain); break; } case PHYSDEVOP_irq_status_query: { struct physdev_irq_status_query irq_status_query; ret = -EFAULT; if ( copy_from_guest(&irq_status_query, arg, 1) != 0 ) break; irq = irq_status_query.irq; ret = -EINVAL; if ( (irq < 0) || (irq >= v->domain->nr_pirqs) ) break; irq_status_query.flags = 0; if ( is_hvm_domain(v->domain) && domain_pirq_to_irq(v->domain, irq) <= 0 && domain_pirq_to_emuirq(v->domain, irq) == IRQ_UNBOUND ) { ret = -EINVAL; break; } /* * Even edge-triggered or message-based IRQs can need masking from * time to time. If the guest is not dynamically checking for this * via the new pirq_eoi_map mechanism, it must conservatively always * execute the EOI hypercall. In practice, this only really makes a * difference for maskable MSI sources, and if those are supported * then dom0 is probably modern anyway. */ irq_status_query.flags |= XENIRQSTAT_needs_eoi; if ( pirq_shared(v->domain, irq) ) irq_status_query.flags |= XENIRQSTAT_shared; ret = __copy_to_guest(arg, &irq_status_query, 1) ?
-EFAULT : 0; break; } case PHYSDEVOP_map_pirq: { physdev_map_pirq_t map; struct msi_info msi; ret = -EFAULT; if ( copy_from_guest(&map, arg, 1) != 0 ) break; switch ( map.type ) { case MAP_PIRQ_TYPE_MSI_SEG: map.type = MAP_PIRQ_TYPE_MSI; msi.seg = map.bus >> 16; break; case MAP_PIRQ_TYPE_MULTI_MSI: if ( map.table_base ) return -EINVAL; msi.seg = map.bus >> 16; break; default: msi.seg = 0; break; } msi.bus = map.bus; msi.devfn = map.devfn; msi.entry_nr = map.entry_nr; msi.table_base = map.table_base; ret = physdev_map_pirq(map.domid, map.type, &map.index, &map.pirq, &msi); if ( map.type == MAP_PIRQ_TYPE_MULTI_MSI ) map.entry_nr = msi.entry_nr; if ( __copy_to_guest(arg, &map, 1) ) ret = -EFAULT; break; } case PHYSDEVOP_unmap_pirq: { struct physdev_unmap_pirq unmap; ret = -EFAULT; if ( copy_from_guest(&unmap, arg, 1) != 0 ) break; ret = physdev_unmap_pirq(unmap.domid, unmap.pirq); break; } case PHYSDEVOP_apic_read: { struct physdev_apic apic; ret = -EFAULT; if ( copy_from_guest(&apic, arg, 1) != 0 ) break; ret = xsm_apic(XSM_PRIV, v->domain, cmd); if ( ret ) break; ret = ioapic_guest_read(apic.apic_physbase, apic.reg, &apic.value); if ( __copy_to_guest(arg, &apic, 1) ) ret = -EFAULT; break; } case PHYSDEVOP_apic_write: { struct physdev_apic apic; ret = -EFAULT; if ( copy_from_guest(&apic, arg, 1) != 0 ) break; ret = xsm_apic(XSM_PRIV, v->domain, cmd); if ( ret ) break; ret = ioapic_guest_write(apic.apic_physbase, apic.reg, apic.value); break; } case PHYSDEVOP_alloc_irq_vector: { struct physdev_irq irq_op; ret = -EFAULT; if ( copy_from_guest(&irq_op, arg, 1) != 0 ) break; /* Use the APIC check since this dummy hypercall should still only * be called by the domain with access to program the ioapic */ ret = xsm_apic(XSM_PRIV, v->domain, cmd); if ( ret ) break; /* The vector is only used by the hypervisor, and dom0 shouldn't touch it in its world; return irq_op.irq as the vector, make this hypercall a dummy, and defer the actual vector allocation until dom0 tries to program the ioapic entry. 
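 * (For illustration: a dom0 request naming irq 9 simply gets
 * irq_op.vector = 9 echoed back below; the real vector is only assigned
 * later, when the IO-APIC entry is actually programmed.)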
*/ irq_op.vector = irq_op.irq; ret = 0; if ( __copy_to_guest(arg, &irq_op, 1) ) ret = -EFAULT; break; } case PHYSDEVOP_set_iopl: { struct physdev_set_iopl set_iopl; ret = -ENOSYS; if ( is_pvh_vcpu(current) ) break; ret = -EFAULT; if ( copy_from_guest(&set_iopl, arg, 1) != 0 ) break; ret = -EINVAL; if ( set_iopl.iopl > 3 ) break; ret = 0; v->arch.pv_vcpu.iopl = set_iopl.iopl; break; } case PHYSDEVOP_set_iobitmap: { struct physdev_set_iobitmap set_iobitmap; ret = -ENOSYS; if ( is_pvh_vcpu(current) ) break; ret = -EFAULT; if ( copy_from_guest(&set_iobitmap, arg, 1) != 0 ) break; ret = -EINVAL; if ( !guest_handle_okay(set_iobitmap.bitmap, IOBMP_BYTES) || (set_iobitmap.nr_ports > 65536) ) break; ret = 0; #ifndef COMPAT v->arch.pv_vcpu.iobmp = set_iobitmap.bitmap; #else guest_from_compat_handle(v->arch.pv_vcpu.iobmp, set_iobitmap.bitmap); #endif v->arch.pv_vcpu.iobmp_limit = set_iobitmap.nr_ports; break; } case PHYSDEVOP_manage_pci_add: { struct physdev_manage_pci manage_pci; ret = -EFAULT; if ( copy_from_guest(&manage_pci, arg, 1) != 0 ) break; ret = pci_add_device(0, manage_pci.bus, manage_pci.devfn, NULL); break; } case PHYSDEVOP_manage_pci_remove: { struct physdev_manage_pci manage_pci; ret = -EFAULT; if ( copy_from_guest(&manage_pci, arg, 1) != 0 ) break; ret = pci_remove_device(0, manage_pci.bus, manage_pci.devfn); break; } case PHYSDEVOP_manage_pci_add_ext: { struct physdev_manage_pci_ext manage_pci_ext; struct pci_dev_info pdev_info; ret = -EFAULT; if ( copy_from_guest(&manage_pci_ext, arg, 1) != 0 ) break; ret = -EINVAL; if ( (manage_pci_ext.is_extfn > 1) || (manage_pci_ext.is_virtfn > 1) ) break; pdev_info.is_extfn = manage_pci_ext.is_extfn; pdev_info.is_virtfn = manage_pci_ext.is_virtfn; pdev_info.physfn.bus = manage_pci_ext.physfn.bus; pdev_info.physfn.devfn = manage_pci_ext.physfn.devfn; ret = pci_add_device(0, manage_pci_ext.bus, manage_pci_ext.devfn, &pdev_info); break; } case PHYSDEVOP_pci_device_add: { struct physdev_pci_device_add add; struct pci_dev_info pdev_info; ret = -EFAULT; if ( copy_from_guest(&add, arg, 1) != 0 ) break; pdev_info.is_extfn = !!(add.flags & XEN_PCI_DEV_EXTFN); if ( add.flags & XEN_PCI_DEV_VIRTFN ) { pdev_info.is_virtfn = 1; pdev_info.physfn.bus = add.physfn.bus; pdev_info.physfn.devfn = add.physfn.devfn; } else pdev_info.is_virtfn = 0; ret = pci_add_device(add.seg, add.bus, add.devfn, &pdev_info); break; } case PHYSDEVOP_pci_device_remove: { struct physdev_pci_device dev; ret = -EFAULT; if ( copy_from_guest(&dev, arg, 1) != 0 ) break; ret = pci_remove_device(dev.seg, dev.bus, dev.devfn); break; } case PHYSDEVOP_prepare_msix: case PHYSDEVOP_release_msix: { struct physdev_pci_device dev; if ( copy_from_guest(&dev, arg, 1) ) ret = -EFAULT; else ret = xsm_resource_setup_pci(XSM_PRIV, (dev.seg << 16) | (dev.bus << 8) | dev.devfn) ?: pci_prepare_msix(dev.seg, dev.bus, dev.devfn, cmd != PHYSDEVOP_prepare_msix); break; } case PHYSDEVOP_pci_mmcfg_reserved: { struct physdev_pci_mmcfg_reserved info; ret = xsm_resource_setup_misc(XSM_PRIV); if ( ret ) break; ret = -EFAULT; if ( copy_from_guest(&info, arg, 1) ) break; ret = pci_mmcfg_reserved(info.address, info.segment, info.start_bus, info.end_bus, info.flags); break; } case PHYSDEVOP_restore_msi: { struct physdev_restore_msi restore_msi; struct pci_dev *pdev; ret = -EFAULT; if ( copy_from_guest(&restore_msi, arg, 1) != 0 ) break; spin_lock(&pcidevs_lock); pdev = pci_get_pdev(0, restore_msi.bus, restore_msi.devfn); ret = pdev ? 
pci_restore_msi_state(pdev) : -ENODEV; spin_unlock(&pcidevs_lock); break; } case PHYSDEVOP_restore_msi_ext: { struct physdev_pci_device dev; struct pci_dev *pdev; ret = -EFAULT; if ( copy_from_guest(&dev, arg, 1) != 0 ) break; spin_lock(&pcidevs_lock); pdev = pci_get_pdev(dev.seg, dev.bus, dev.devfn); ret = pdev ? pci_restore_msi_state(pdev) : -ENODEV; spin_unlock(&pcidevs_lock); break; } case PHYSDEVOP_setup_gsi: { struct physdev_setup_gsi setup_gsi; ret = -EFAULT; if ( copy_from_guest(&setup_gsi, arg, 1) != 0 ) break; ret = -EINVAL; if ( setup_gsi.gsi < 0 || setup_gsi.gsi >= nr_irqs_gsi ) break; ret = xsm_resource_setup_gsi(XSM_PRIV, setup_gsi.gsi); if ( ret ) break; ret = mp_register_gsi(setup_gsi.gsi, setup_gsi.triggering, setup_gsi.polarity); break; } case PHYSDEVOP_get_free_pirq: { struct physdev_get_free_pirq out; struct domain *d = v->domain; ret = -EFAULT; if ( copy_from_guest(&out, arg, 1) != 0 ) break; spin_lock(&d->event_lock); ret = get_free_pirq(d, out.type); if ( ret >= 0 ) { struct pirq *info = pirq_get_info(d, ret); if ( info ) info->arch.irq = PIRQ_ALLOCATED; else ret = -ENOMEM; } spin_unlock(&d->event_lock); if ( ret >= 0 ) { out.pirq = ret; ret = __copy_to_guest(arg, &out, 1) ? -EFAULT : 0; } break; } case PHYSDEVOP_dbgp_op: { struct physdev_dbgp_op op; if ( !is_hardware_domain(v->domain) ) ret = -EPERM; else if ( copy_from_guest(&op, arg, 1) ) ret = -EFAULT; else ret = dbgp_op(&op); break; } default: ret = -ENOSYS; break; } return ret; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/copy_page.S0000664000175000017500000000231312307313555014670 0ustar smbsmb#include #include #define src_reg %rsi #define dst_reg %rdi #define WORD_SIZE 8 #define tmp1_reg %r8 #define tmp2_reg %r9 #define tmp3_reg %r10 #define tmp4_reg %r11 ENTRY(copy_page_sse2) mov $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx prefetchnta 2*4*WORD_SIZE(src_reg) mov (src_reg), tmp1_reg mov WORD_SIZE(src_reg), tmp2_reg mov 2*WORD_SIZE(src_reg), tmp3_reg mov 3*WORD_SIZE(src_reg), tmp4_reg 0: prefetchnta 3*4*WORD_SIZE(src_reg) 1: add $4*WORD_SIZE, src_reg movnti tmp1_reg, (dst_reg) mov (src_reg), tmp1_reg dec %ecx movnti tmp2_reg, WORD_SIZE(dst_reg) mov WORD_SIZE(src_reg), tmp2_reg movnti tmp3_reg, 2*WORD_SIZE(dst_reg) mov 2*WORD_SIZE(src_reg), tmp3_reg movnti tmp4_reg, 3*WORD_SIZE(dst_reg) lea 4*WORD_SIZE(dst_reg), dst_reg mov 3*WORD_SIZE(src_reg), tmp4_reg jg 0b jpe 1b movnti tmp1_reg, (dst_reg) movnti tmp2_reg, WORD_SIZE(dst_reg) movnti tmp3_reg, 2*WORD_SIZE(dst_reg) movnti tmp4_reg, 3*WORD_SIZE(dst_reg) sfence ret xen-4.4.0/xen/arch/x86/debug.c0000664000175000017500000001602012307313555014030 0ustar smbsmb/* * Copyright (C) 2009, Mukesh Rathor, Oracle Corp. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License v2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this program; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 021110-1307, USA. 
*/ #include #include #include #include #include #include #include #include /* * This file is for general routines common to more than one debugger, like kdb, * gdbsx, etc. */ #ifdef XEN_KDB_CONFIG #include "../kdb/include/kdbdefs.h" #include "../kdb/include/kdbproto.h" #define DBGP(...) {(kdbdbg) ? kdbp(__VA_ARGS__):0;} #define DBGP1(...) {(kdbdbg>1) ? kdbp(__VA_ARGS__):0;} #define DBGP2(...) {(kdbdbg>2) ? kdbp(__VA_ARGS__):0;} #else #define DBGP1(...) ((void)0) #define DBGP2(...) ((void)0) #endif /* Returns: mfn for the given (hvm guest) vaddr */ static unsigned long dbg_hvm_va2mfn(dbgva_t vaddr, struct domain *dp, int toaddr, unsigned long *gfn) { unsigned long mfn; uint32_t pfec = PFEC_page_present; p2m_type_t gfntype; DBGP2("vaddr:%lx domid:%d\n", vaddr, dp->domain_id); *gfn = paging_gva_to_gfn(dp->vcpu[0], vaddr, &pfec); if ( *gfn == INVALID_GFN ) { DBGP2("kdb:bad gfn from gva_to_gfn\n"); return INVALID_MFN; } mfn = mfn_x(get_gfn(dp, *gfn, &gfntype)); if ( p2m_is_readonly(gfntype) && toaddr ) { DBGP2("kdb:p2m_is_readonly: gfntype:%x\n", gfntype); mfn = INVALID_MFN; } else DBGP2("X: vaddr:%lx domid:%d mfn:%lx\n", vaddr, dp->domain_id, mfn); if ( mfn == INVALID_MFN ) { put_gfn(dp, *gfn); *gfn = INVALID_GFN; } return mfn; } /* * pgd3val: this is the value of init_mm.pgd[3] in a PV guest. It is optional. * This is to assist debugging of modules in the guest. The kernel address * space seems to always be mapped, but modules are not necessarily * mapped in any arbitrary guest cr3 that we pick if pgd3val is 0. * Modules should always be addressable if we use cr3 from init_mm. * Since pgd3val is already a pgd value, cr3->pgd[3], we just need to * do 2 level lookups. * * NOTE: 4 level paging works for 32-bit PAE guests also because the cpu runs in IA32-e * mode. * Returns: mfn for the given (pv guest) vaddr */ static unsigned long dbg_pv_va2mfn(dbgva_t vaddr, struct domain *dp, uint64_t pgd3val) { l4_pgentry_t l4e, *l4t; l3_pgentry_t l3e, *l3t; l2_pgentry_t l2e, *l2t; l1_pgentry_t l1e, *l1t; unsigned long cr3 = (pgd3val ? pgd3val : dp->vcpu[0]->arch.cr3); unsigned long mfn = cr3 >> PAGE_SHIFT; DBGP2("vaddr:%lx domid:%d cr3:%lx pgd3:%lx\n", vaddr, dp->domain_id, cr3, pgd3val); if ( pgd3val == 0 ) { l4t = map_domain_page(mfn); l4e = l4t[l4_table_offset(vaddr)]; unmap_domain_page(l4t); mfn = l4e_get_pfn(l4e); DBGP2("l4t:%p l4to:%lx l4e:%lx mfn:%lx\n", l4t, l4_table_offset(vaddr), l4e, mfn); if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) { DBGP1("l4 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3); return INVALID_MFN; } l3t = map_domain_page(mfn); l3e = l3t[l3_table_offset(vaddr)]; unmap_domain_page(l3t); mfn = l3e_get_pfn(l3e); DBGP2("l3t:%p l3to:%lx l3e:%lx mfn:%lx\n", l3t, l3_table_offset(vaddr), l3e, mfn); if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_flags(l3e) & _PAGE_PSE) ) { DBGP1("l3 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3); return INVALID_MFN; } } l2t = map_domain_page(mfn); l2e = l2t[l2_table_offset(vaddr)]; unmap_domain_page(l2t); mfn = l2e_get_pfn(l2e); DBGP2("l2t:%p l2to:%lx l2e:%lx mfn:%lx\n", l2t, l2_table_offset(vaddr), l2e, mfn); if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_flags(l2e) & _PAGE_PSE) ) { DBGP1("l2 PAGE not present. vaddr:%lx cr3:%lx\n", vaddr, cr3); return INVALID_MFN; } l1t = map_domain_page(mfn); l1e = l1t[l1_table_offset(vaddr)]; unmap_domain_page(l1t); mfn = l1e_get_pfn(l1e); DBGP2("l1t:%p l1to:%lx l1e:%lx mfn:%lx\n", l1t, l1_table_offset(vaddr), l1e, mfn); return mfn_valid(mfn) ? 
mfn : INVALID_MFN; } /* Returns: number of bytes remaining to be copied */ static int dbg_rw_guest_mem(dbgva_t addr, dbgbyte_t *buf, int len, struct domain *dp, int toaddr, uint64_t pgd3) { while ( len > 0 ) { char *va; unsigned long mfn, gfn = INVALID_GFN, pagecnt; pagecnt = min_t(long, PAGE_SIZE - (addr & ~PAGE_MASK), len); mfn = (has_hvm_container_domain(dp) ? dbg_hvm_va2mfn(addr, dp, toaddr, &gfn) : dbg_pv_va2mfn(addr, dp, pgd3)); if ( mfn == INVALID_MFN ) break; va = map_domain_page(mfn); va = va + (addr & (PAGE_SIZE-1)); if ( toaddr ) { memcpy(va, buf, pagecnt); /* va = buf */ paging_mark_dirty(dp, mfn); } else { memcpy(buf, va, pagecnt); /* buf = va */ } unmap_domain_page(va); if ( gfn != INVALID_GFN ) put_gfn(dp, gfn); addr += pagecnt; buf += pagecnt; len -= pagecnt; } return len; } /* * addr is hypervisor addr if domid == DOMID_IDLE, else it's guest addr * buf is debugger buffer. * if toaddr, then addr = buf (write to addr), else buf = addr (rd from guest) * pgd3: value of init_mm.pgd[3] in guest. see above. * Returns: number of bytes remaining to be copied. */ int dbg_rw_mem(dbgva_t addr, dbgbyte_t *buf, int len, domid_t domid, int toaddr, uint64_t pgd3) { struct domain *dp = get_domain_by_id(domid); int hyp = (domid == DOMID_IDLE); DBGP2("gmem:addr:%lx buf:%p len:$%d domid:%x toaddr:%x dp:%p\n", addr, buf, len, domid, toaddr, dp); if ( hyp ) { if ( toaddr ) len = __copy_to_user((void *)addr, buf, len); else len = __copy_from_user(buf, (void *)addr, len); } else if ( dp ) { if ( !dp->is_dying ) /* make sure guest is still there */ len= dbg_rw_guest_mem(addr, buf, len, dp, toaddr, pgd3); put_domain(dp); } DBGP2("gmem:exit:len:$%d\n", len); return len; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/platform_hypercall.c0000664000175000017500000004166412307313555016645 0ustar smbsmb/****************************************************************************** * platform_hypercall.c * * Hardware platform operations. Intended for use by domain-0 kernel. * * Copyright (c) 2002-2006, K Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cpu/mtrr/mtrr.h" #include #ifndef COMPAT typedef long ret_t; DEFINE_SPINLOCK(xenpf_lock); # undef copy_from_compat # define copy_from_compat copy_from_guest # undef copy_to_compat # define copy_to_compat copy_to_guest # undef guest_from_compat_handle # define guest_from_compat_handle(x,y) ((x)=(y)) #else extern spinlock_t xenpf_lock; #endif static DEFINE_PER_CPU(uint64_t, freq); static long cpu_frequency_change_helper(void *data) { return cpu_frequency_change(this_cpu(freq)); } /* from sysctl.c */ long cpu_up_helper(void *data); long cpu_down_helper(void *data); /* from core_parking.c */ long core_parking_helper(void *data); uint32_t get_cur_idle_nums(void); ret_t do_platform_op(XEN_GUEST_HANDLE_PARAM(xen_platform_op_t) u_xenpf_op) { ret_t ret = 0; struct xen_platform_op curop, *op = &curop; if ( copy_from_guest(op, u_xenpf_op, 1) ) return -EFAULT; if ( op->interface_version != XENPF_INTERFACE_VERSION ) return -EACCES; ret = xsm_platform_op(XSM_PRIV, op->cmd); if ( ret ) return ret; /* * Trylock here avoids deadlock with an existing platform critical section * which might (for some current or future reason) want to synchronise * with this vcpu. 
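 *
 * Sketch of the resulting behaviour (a restatement, not new logic): when
 * the lock is contended and preemption is pending, the hypercall is
 * restarted later via hypercall_create_continuation(), re-entering
 * do_platform_op() with the same guest handle once the vcpu runs again.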
*/ while ( !spin_trylock(&xenpf_lock) ) if ( hypercall_preempt_check() ) return hypercall_create_continuation( __HYPERVISOR_platform_op, "h", u_xenpf_op); switch ( op->cmd ) { case XENPF_settime: { do_settime(op->u.settime.secs, op->u.settime.nsecs, op->u.settime.system_time); ret = 0; } break; case XENPF_add_memtype: { ret = mtrr_add_page( op->u.add_memtype.mfn, op->u.add_memtype.nr_mfns, op->u.add_memtype.type, 1); if ( ret >= 0 ) { op->u.add_memtype.handle = 0; op->u.add_memtype.reg = ret; ret = __copy_field_to_guest(u_xenpf_op, op, u.add_memtype) ? -EFAULT : 0; if ( ret != 0 ) mtrr_del_page(ret, 0, 0); } } break; case XENPF_del_memtype: { if (op->u.del_memtype.handle == 0 /* mtrr/main.c otherwise does a lookup */ && (int)op->u.del_memtype.reg >= 0) { ret = mtrr_del_page(op->u.del_memtype.reg, 0, 0); if ( ret > 0 ) ret = 0; } else ret = -EINVAL; } break; case XENPF_read_memtype: { unsigned long mfn, nr_mfns; mtrr_type type; ret = -EINVAL; if ( op->u.read_memtype.reg < num_var_ranges ) { mtrr_if->get(op->u.read_memtype.reg, &mfn, &nr_mfns, &type); op->u.read_memtype.mfn = mfn; op->u.read_memtype.nr_mfns = nr_mfns; op->u.read_memtype.type = type; ret = __copy_field_to_guest(u_xenpf_op, op, u.read_memtype) ? -EFAULT : 0; } } break; case XENPF_microcode_update: { XEN_GUEST_HANDLE(const_void) data; guest_from_compat_handle(data, op->u.microcode.data); /* * alloc_vcpu() will access data which is modified during * microcode update */ while ( !spin_trylock(&vcpu_alloc_lock) ) { if ( hypercall_preempt_check() ) { ret = hypercall_create_continuation( __HYPERVISOR_platform_op, "h", u_xenpf_op); goto out; } } ret = microcode_update( guest_handle_to_param(data, const_void), op->u.microcode.length); spin_unlock(&vcpu_alloc_lock); } break; case XENPF_platform_quirk: { int quirk_id = op->u.platform_quirk.quirk_id; switch ( quirk_id ) { case QUIRK_NOIRQBALANCING: printk("Platform quirk -- Disabling IRQ balancing/affinity.\n"); opt_noirqbalance = 1; setup_ioapic_dest(); break; case QUIRK_IOAPIC_BAD_REGSEL: dprintk(XENLOG_WARNING, "Domain 0 thinks that IO-APIC REGSEL is bad\n"); break; case QUIRK_IOAPIC_GOOD_REGSEL: break; default: ret = -EINVAL; break; } } break; case XENPF_firmware_info: switch ( op->u.firmware_info.type ) { case XEN_FW_DISK_INFO: { const struct edd_info *info; u16 length; ret = -ESRCH; if ( op->u.firmware_info.index >= bootsym(boot_edd_info_nr) ) break; info = bootsym(boot_edd_info) + op->u.firmware_info.index; /* Transfer the EDD info block. */ ret = -EFAULT; if ( copy_from_compat(&length, op->u.firmware_info.u. disk_info.edd_params, 1) ) break; if ( length > info->edd_device_params.length ) length = info->edd_device_params.length; if ( copy_to_compat(op->u.firmware_info.u.disk_info.edd_params, (u8 *)&info->edd_device_params, length) ) break; if ( copy_to_compat(op->u.firmware_info.u.disk_info.edd_params, &length, 1) ) break; /* Transfer miscellaneous other information values. */ #define C(x) op->u.firmware_info.u.disk_info.x = info->x C(device); C(version); C(interface_support); C(legacy_max_cylinder); C(legacy_max_head); C(legacy_sectors_per_track); #undef C ret = (__copy_field_to_guest(u_xenpf_op, op, u.firmware_info.u.disk_info) ? 
-EFAULT : 0); break; } case XEN_FW_DISK_MBR_SIGNATURE: { const struct mbr_signature *sig; ret = -ESRCH; if ( op->u.firmware_info.index >= bootsym(boot_mbr_signature_nr) ) break; sig = bootsym(boot_mbr_signature) + op->u.firmware_info.index; op->u.firmware_info.u.disk_mbr_signature.device = sig->device; op->u.firmware_info.u.disk_mbr_signature.mbr_signature = sig->signature; ret = (__copy_field_to_guest(u_xenpf_op, op, u.firmware_info.u.disk_mbr_signature) ? -EFAULT : 0); break; } case XEN_FW_VBEDDC_INFO: ret = -ESRCH; if ( op->u.firmware_info.index != 0 ) break; if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 ) break; op->u.firmware_info.u.vbeddc_info.capabilities = bootsym(boot_edid_caps); op->u.firmware_info.u.vbeddc_info.edid_transfer_time = bootsym(boot_edid_caps) >> 8; ret = 0; if ( __copy_field_to_guest(u_xenpf_op, op, u.firmware_info. u.vbeddc_info.capabilities) || __copy_field_to_guest(u_xenpf_op, op, u.firmware_info. u.vbeddc_info.edid_transfer_time) || copy_to_compat(op->u.firmware_info.u.vbeddc_info.edid, bootsym(boot_edid_info), 128) ) ret = -EFAULT; break; case XEN_FW_EFI_INFO: ret = efi_get_info(op->u.firmware_info.index, &op->u.firmware_info.u.efi_info); if ( ret == 0 && __copy_field_to_guest(u_xenpf_op, op, u.firmware_info.u.efi_info) ) ret = -EFAULT; break; case XEN_FW_KBD_SHIFT_FLAGS: ret = -ESRCH; if ( op->u.firmware_info.index != 0 ) break; op->u.firmware_info.u.kbd_shift_flags = bootsym(kbd_shift_flags); ret = 0; if ( __copy_field_to_guest(u_xenpf_op, op, u.firmware_info.u.kbd_shift_flags) ) ret = -EFAULT; break; default: ret = -EINVAL; break; } break; case XENPF_efi_runtime_call: ret = efi_runtime_call(&op->u.efi_runtime_call); if ( ret == 0 && __copy_field_to_guest(u_xenpf_op, op, u.efi_runtime_call) ) ret = -EFAULT; break; case XENPF_enter_acpi_sleep: ret = acpi_enter_sleep(&op->u.enter_acpi_sleep); break; case XENPF_change_freq: ret = -ENOSYS; if ( cpufreq_controller != FREQCTL_dom0_kernel ) break; ret = -EINVAL; if ( op->u.change_freq.flags || !cpu_online(op->u.change_freq.cpu) ) break; per_cpu(freq, op->u.change_freq.cpu) = op->u.change_freq.freq; ret = continue_hypercall_on_cpu(op->u.change_freq.cpu, cpu_frequency_change_helper, NULL); break; case XENPF_getidletime: { uint32_t cpu; uint64_t idletime, now = NOW(); struct xenctl_bitmap ctlmap; cpumask_var_t cpumap; XEN_GUEST_HANDLE(uint8) cpumap_bitmap; XEN_GUEST_HANDLE(uint64) idletimes; ret = -ENOSYS; if ( cpufreq_controller != FREQCTL_dom0_kernel ) break; ctlmap.nr_bits = op->u.getidletime.cpumap_nr_cpus; guest_from_compat_handle(cpumap_bitmap, op->u.getidletime.cpumap_bitmap); ctlmap.bitmap.p = cpumap_bitmap.p; /* handle -> handle_64 conversion */ if ( (ret = xenctl_bitmap_to_cpumask(&cpumap, &ctlmap)) != 0 ) goto out; guest_from_compat_handle(idletimes, op->u.getidletime.idletime); for_each_cpu ( cpu, cpumap ) { idletime = get_cpu_idle_time(cpu); if ( !idletime ) { cpumask_clear_cpu(cpu, cpumap); continue; } if ( copy_to_guest_offset(idletimes, cpu, &idletime, 1) ) { ret = -EFAULT; break; } } op->u.getidletime.now = now; if ( ret == 0 ) ret = cpumask_to_xenctl_bitmap(&ctlmap, cpumap); free_cpumask_var(cpumap); if ( ret == 0 && __copy_field_to_guest(u_xenpf_op, op, u.getidletime) ) ret = -EFAULT; } break; case XENPF_set_processor_pminfo: switch ( op->u.set_pminfo.type ) { case XEN_PM_PX: if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_PX) ) { ret = -ENOSYS; break; } ret = set_px_pminfo(op->u.set_pminfo.id, &op->u.set_pminfo.u.perf); break; case XEN_PM_CX: if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_CX) ) 
{ ret = -ENOSYS; break; } ret = set_cx_pminfo(op->u.set_pminfo.id, &op->u.set_pminfo.u.power); break; case XEN_PM_TX: if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_TX) ) { ret = -ENOSYS; break; } ret = -EINVAL; break; case XEN_PM_PDC: { XEN_GUEST_HANDLE(uint32) pdc; guest_from_compat_handle(pdc, op->u.set_pminfo.u.pdc); ret = acpi_set_pdc_bits( op->u.set_pminfo.id, guest_handle_to_param(pdc, uint32)); } break; default: ret = -EINVAL; break; } break; case XENPF_get_cpuinfo: { struct xenpf_pcpuinfo *g_info; g_info = &op->u.pcpu_info; if ( !get_cpu_maps() ) { ret = -EBUSY; break; } if ( (g_info->xen_cpuid >= nr_cpu_ids) || !cpu_present(g_info->xen_cpuid) ) { g_info->flags = XEN_PCPU_FLAGS_INVALID; } else { g_info->apic_id = x86_cpu_to_apicid[g_info->xen_cpuid]; g_info->acpi_id = acpi_get_processor_id(g_info->xen_cpuid); ASSERT(g_info->apic_id != BAD_APICID); g_info->flags = 0; if (cpu_online(g_info->xen_cpuid)) g_info->flags |= XEN_PCPU_FLAGS_ONLINE; } g_info->max_present = cpumask_last(&cpu_present_map); put_cpu_maps(); ret = __copy_field_to_guest(u_xenpf_op, op, u.pcpu_info) ? -EFAULT : 0; } break; case XENPF_get_cpu_version: { struct xenpf_pcpu_version *ver = &op->u.pcpu_version; if ( !get_cpu_maps() ) { ret = -EBUSY; break; } if ( (ver->xen_cpuid >= nr_cpu_ids) || !cpu_online(ver->xen_cpuid) ) { memset(ver->vendor_id, 0, sizeof(ver->vendor_id)); ver->family = 0; ver->model = 0; ver->stepping = 0; } else { const struct cpuinfo_x86 *c = &cpu_data[ver->xen_cpuid]; memcpy(ver->vendor_id, c->x86_vendor_id, sizeof(ver->vendor_id)); ver->family = c->x86; ver->model = c->x86_model; ver->stepping = c->x86_mask; } ver->max_present = cpumask_last(&cpu_present_map); put_cpu_maps(); if ( __copy_field_to_guest(u_xenpf_op, op, u.pcpu_version) ) ret = -EFAULT; } break; case XENPF_cpu_online: { int cpu = op->u.cpu_ol.cpuid; ret = xsm_resource_plug_core(XSM_HOOK); if ( ret ) break; if ( cpu >= nr_cpu_ids || !cpu_present(cpu) ) { ret = -EINVAL; break; } if ( cpu_online(cpu) ) { ret = 0; break; } ret = continue_hypercall_on_cpu( 0, cpu_up_helper, (void *)(unsigned long)cpu); break; } case XENPF_cpu_offline: { int cpu = op->u.cpu_ol.cpuid; ret = xsm_resource_unplug_core(XSM_HOOK); if ( ret ) break; if ( cpu == 0 ) { ret = -EOPNOTSUPP; break; } if ( cpu >= nr_cpu_ids || !cpu_present(cpu) ) { ret = -EINVAL; break; } if ( !cpu_online(cpu) ) { ret = 0; break; } ret = continue_hypercall_on_cpu( 0, cpu_down_helper, (void *)(unsigned long)cpu); break; } break; case XENPF_cpu_hotadd: ret = xsm_resource_plug_core(XSM_HOOK); if ( ret ) break; ret = cpu_add(op->u.cpu_add.apic_id, op->u.cpu_add.acpi_id, op->u.cpu_add.pxm); break; case XENPF_mem_hotadd: ret = xsm_resource_plug_core(XSM_HOOK); if ( ret ) break; ret = memory_add(op->u.mem_add.spfn, op->u.mem_add.epfn, op->u.mem_add.pxm); break; case XENPF_core_parking: { uint32_t idle_nums; switch(op->u.core_parking.type) { case XEN_CORE_PARKING_SET: idle_nums = min_t(uint32_t, op->u.core_parking.idle_nums, num_present_cpus() - 1); ret = continue_hypercall_on_cpu( 0, core_parking_helper, (void *)(unsigned long)idle_nums); break; case XEN_CORE_PARKING_GET: op->u.core_parking.idle_nums = get_cur_idle_nums(); ret = __copy_field_to_guest(u_xenpf_op, op, u.core_parking) ? 
-EFAULT : 0; break; default: ret = -EINVAL; break; } } break; default: ret = -ENOSYS; break; } out: spin_unlock(&xenpf_lock); return ret; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/Rules.mk0000664000175000017500000000240512307313555014223 0ustar smbsmb######################################## # x86-specific definitions HAS_IOPORTS := y HAS_ACPI := y HAS_VGA := y HAS_VIDEO := y HAS_CPUFREQ := y HAS_PCI := y HAS_PASSTHROUGH := y HAS_NS16550 := y HAS_EHCI := y HAS_KEXEC := y HAS_GDBSX := y xenoprof := y # # If you change any of these configuration options then you must # 'make clean' before rebuilding. # supervisor_mode_kernel ?= n CFLAGS += -I$(BASEDIR)/include CFLAGS += -I$(BASEDIR)/include/asm-x86/mach-generic CFLAGS += -I$(BASEDIR)/include/asm-x86/mach-default # Prevent floating-point variables from creeping into Xen. CFLAGS += -msoft-float $(call cc-options-add,CFLAGS,CC,$(EMBEDDED_EXTRA_CFLAGS)) $(call cc-option-add,CFLAGS,CC,-Wnested-externs) $(call as-insn-check,CFLAGS,CC,"vmcall",-DHAVE_GAS_VMX) $(call as-insn-check,CFLAGS,CC,"invept (%rax)$$(comma)%rax",-DHAVE_GAS_EPT) $(call as-insn-check,CFLAGS,CC,"rdfsbase %rax",-DHAVE_GAS_FSGSBASE) ifeq ($(supervisor_mode_kernel),y) CFLAGS += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1 endif x86 := y x86_32 := n x86_64 := y CFLAGS += -mno-red-zone -mno-sse -fpic CFLAGS += -fno-asynchronous-unwind-tables # -fvisibility=hidden reduces -fpic cost, if it's available ifneq ($(call cc-option,$(CC),-fvisibility=hidden,n),n) CFLAGS += -DGCC_HAS_VISIBILITY_ATTRIBUTE endif xen-4.4.0/xen/arch/x86/mm/0000775000175000017500000000000012307313555013210 5ustar smbsmbxen-4.4.0/xen/arch/x86/mm/Makefile0000664000175000017500000000060712307313555014653 0ustar smbsmbsubdir-y += shadow subdir-y += hap obj-y += paging.o obj-y += p2m.o p2m-pt.o p2m-ept.o p2m-pod.o obj-y += guest_walk_2.o obj-y += guest_walk_3.o obj-$(x86_64) += guest_walk_4.o obj-$(x86_64) += mem_event.o obj-$(x86_64) += mem_paging.o obj-$(x86_64) += mem_sharing.o obj-$(x86_64) += mem_access.o guest_walk_%.o: guest_walk.c Makefile $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@ xen-4.4.0/xen/arch/x86/mm/mem_access.c0000664000175000017500000000321612307313555015455 0ustar smbsmb/****************************************************************************** * arch/x86/mm/mem_access.c * * Memory access support. * * Copyright (c) 2011 Virtuata, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include int mem_access_memop(struct domain *d, xen_mem_event_op_t *meo) { int rc; if ( unlikely(!d->mem_event->access.ring_page) ) return -ENODEV; switch( meo->op ) { case XENMEM_access_op_resume: { p2m_mem_access_resume(d); rc = 0; } break; default: rc = -ENOSYS; break; } return rc; } int mem_access_send_req(struct domain *d, mem_event_request_t *req) { int rc = mem_event_claim_slot(d, &d->mem_event->access); if ( rc < 0 ) return rc; mem_event_put_request(d, &d->mem_event->access, req); return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/p2m-ept.c0000664000175000017500000006013112307313555014641 0ustar smbsmb/* * ept-p2m.c: use the EPT page table as p2m * Copyright (c) 2007, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mm-locks.h" #define atomic_read_ept_entry(__pepte) \ ( (ept_entry_t) { .epte = read_atomic(&(__pepte)->epte) } ) #define atomic_write_ept_entry(__pepte, __epte) \ write_atomic(&(__pepte)->epte, (__epte).epte) #define is_epte_present(ept_entry) ((ept_entry)->epte & 0x7) #define is_epte_superpage(ept_entry) ((ept_entry)->sp) static inline bool_t is_epte_valid(ept_entry_t *e) { return (e->epte != 0 && e->sa_p2mt != p2m_invalid); } static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access) { /* First apply type permissions */ switch(type) { case p2m_invalid: case p2m_mmio_dm: case p2m_populate_on_demand: case p2m_ram_paging_out: case p2m_ram_paged: case p2m_ram_paging_in: default: entry->r = entry->w = entry->x = 0; break; case p2m_ram_rw: entry->r = entry->w = entry->x = 1; break; case p2m_mmio_direct: entry->r = entry->x = 1; entry->w = !rangeset_contains_singleton(mmio_ro_ranges, entry->mfn); break; case p2m_ram_logdirty: case p2m_ram_ro: case p2m_ram_shared: entry->r = entry->x = 1; entry->w = 0; break; case p2m_grant_map_rw: entry->r = entry->w = 1; entry->x = 0; break; case p2m_grant_map_ro: entry->r = 1; entry->w = entry->x = 0; break; } /* Then restrict with access permissions */ switch (access) { case p2m_access_n: case p2m_access_n2rwx: entry->r = entry->w = entry->x = 0; break; case p2m_access_r: entry->w = entry->x = 0; break; case p2m_access_w: entry->r = entry->x = 0; break; case p2m_access_x: entry->r = entry->w = 0; break; case p2m_access_rx: case p2m_access_rx2rw: entry->w = 0; break; case p2m_access_wx: entry->r = 0; break; case p2m_access_rw: entry->x = 0; break; case p2m_access_rwx: break; } } #define GUEST_TABLE_MAP_FAILED 0 #define GUEST_TABLE_NORMAL_PAGE 1 
#define GUEST_TABLE_SUPER_PAGE 2 #define GUEST_TABLE_POD_PAGE 3 /* Fill in middle levels of ept table */ static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry) { struct page_info *pg; pg = p2m_alloc_ptp(p2m, 0); if ( pg == NULL ) return 0; ept_entry->epte = 0; ept_entry->mfn = page_to_mfn(pg); ept_entry->access = p2m->default_access; ept_entry->r = ept_entry->w = ept_entry->x = 1; return 1; } /* free ept sub tree behind an entry */ static void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level) { /* End if the entry is a leaf entry. */ if ( level == 0 || !is_epte_present(ept_entry) || is_epte_superpage(ept_entry) ) return; if ( level > 1 ) { ept_entry_t *epte = map_domain_page(ept_entry->mfn); for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ ) ept_free_entry(p2m, epte + i, level - 1); unmap_domain_page(epte); } p2m_free_ptp(p2m, mfn_to_page(ept_entry->mfn)); } static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level, int target) { ept_entry_t new_ept, *table; uint64_t trunk; int rv = 1; /* End if the entry is a leaf entry or reaches the target level. */ if ( level == 0 || level == target ) return rv; ASSERT(is_epte_superpage(ept_entry)); if ( !ept_set_middle_entry(p2m, &new_ept) ) return 0; table = map_domain_page(new_ept.mfn); trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER); for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ ) { ept_entry_t *epte = table + i; epte->epte = 0; epte->emt = ept_entry->emt; epte->ipat = ept_entry->ipat; epte->sp = (level > 1) ? 1 : 0; epte->access = ept_entry->access; epte->sa_p2mt = ept_entry->sa_p2mt; epte->mfn = ept_entry->mfn + i * trunk; epte->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0; ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access); if ( (level - 1) == target ) continue; ASSERT(is_epte_superpage(epte)); if ( !(rv = ept_split_super_page(p2m, epte, level - 1, target)) ) break; } unmap_domain_page(table); /* Even on failure we should install the newly allocated ept page. */ *ept_entry = new_ept; return rv; } /* Take the currently mapped table, find the corresponding gfn entry, * and map the next table, if available. If the entry is empty * and read_only is set, no new table is allocated and the walk fails. * Return values: * 0: Failed to map. Either read_only was set and the entry was * empty, or allocating a new page failed. * GUEST_TABLE_NORMAL_PAGE: next level mapped normally * GUEST_TABLE_SUPER_PAGE: * The next entry points to a superpage, and caller indicates * that they are going to the superpage level, or are only doing * a read. * GUEST_TABLE_POD_PAGE: * The next entry is marked populate-on-demand. */ static int ept_next_level(struct p2m_domain *p2m, bool_t read_only, ept_entry_t **table, unsigned long *gfn_remainder, int next_level) { unsigned long mfn; ept_entry_t *ept_entry, e; u32 shift, index; shift = next_level * EPT_TABLE_ORDER; index = *gfn_remainder >> shift; /* index must fall within the page */ ASSERT(index < EPT_PAGETABLE_ENTRIES); ept_entry = (*table) + index; /* ept_next_level() is called (sometimes) without a lock. Read * the entry once, and act on the "cached" entry after that to * avoid races. 
*/ e = atomic_read_ept_entry(ept_entry); if ( !is_epte_present(&e) ) { if ( e.sa_p2mt == p2m_populate_on_demand ) return GUEST_TABLE_POD_PAGE; if ( read_only ) return GUEST_TABLE_MAP_FAILED; if ( !ept_set_middle_entry(p2m, ept_entry) ) return GUEST_TABLE_MAP_FAILED; else e = atomic_read_ept_entry(ept_entry); /* Refresh */ } /* The only time sp would be set here is if we had hit a superpage */ if ( is_epte_superpage(&e) ) return GUEST_TABLE_SUPER_PAGE; mfn = e.mfn; unmap_domain_page(*table); *table = map_domain_page(mfn); *gfn_remainder &= (1UL << shift) - 1; return GUEST_TABLE_NORMAL_PAGE; } /* * ept_set_entry() computes 'need_modify_vtd_table' for itself, * by observing whether any gfn->mfn translations are modified. */ static int ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma) { ept_entry_t *table, *ept_entry = NULL; unsigned long gfn_remainder = gfn; unsigned long offset = 0; u32 index; int i, target = order / EPT_TABLE_ORDER; int rv = 0; int ret = 0; bool_t direct_mmio = (p2mt == p2m_mmio_direct); uint8_t ipat = 0; int need_modify_vtd_table = 1; int vtd_pte_present = 0; int needs_sync = 1; ept_entry_t old_entry = { .epte = 0 }; struct ept_data *ept = &p2m->ept; struct domain *d = p2m->domain; ASSERT(ept); /* * the caller must make sure: * 1. passing valid gfn and mfn at order boundary. * 2. gfn not exceeding guest physical address width. * 3. passing a valid order. */ if ( ((gfn | mfn_x(mfn)) & ((1UL << order) - 1)) || ((u64)gfn >> ((ept_get_wl(ept) + 1) * EPT_TABLE_ORDER)) || (order % EPT_TABLE_ORDER) ) return 0; ASSERT((target == 2 && hvm_hap_has_1gb()) || (target == 1 && hvm_hap_has_2mb()) || (target == 0)); table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m))); for ( i = ept_get_wl(ept); i > target; i-- ) { ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i); if ( !ret ) goto out; else if ( ret != GUEST_TABLE_NORMAL_PAGE ) break; } ASSERT(ret != GUEST_TABLE_POD_PAGE || i != target); index = gfn_remainder >> (i * EPT_TABLE_ORDER); offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1); ept_entry = table + index; /* In case VT-d uses same page table, this flag is needed by VT-d */ vtd_pte_present = is_epte_present(ept_entry) ? 1 : 0; /* * If we're here with i > target, we must be at a leaf node, and * we need to break up the superpage. * * If we're here with i == target and i > 0, we need to check to see * if we're replacing a non-leaf entry (i.e., pointing to an N-1 table) * with a leaf entry (a 1GiB or 2MiB page), and handle things appropriately. */ if ( i == target ) { /* We reached the target level. */ ept_entry_t new_entry = { .epte = 0 }; /* No need to flush if the old entry wasn't valid */ if ( !is_epte_present(ept_entry) ) needs_sync = 0; /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB), * the intermediate tables will be freed below after the ept flush * * Read-then-write is OK because we hold the p2m lock. */ old_entry = *ept_entry; if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) || (p2mt == p2m_ram_paging_in) ) { /* Construct the new entry, and then write it once */ new_entry.emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat, direct_mmio); new_entry.ipat = ipat; new_entry.sp = order ? 
1 : 0; new_entry.sa_p2mt = p2mt; new_entry.access = p2ma; new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop); new_entry.mfn = mfn_x(mfn); if ( old_entry.mfn == new_entry.mfn ) need_modify_vtd_table = 0; ept_p2m_type_to_flags(&new_entry, p2mt, p2ma); } atomic_write_ept_entry(ept_entry, new_entry); } else { /* We need to split the original page. */ ept_entry_t split_ept_entry; ept_entry_t new_entry = { .epte = 0 }; ASSERT(is_epte_superpage(ept_entry)); split_ept_entry = atomic_read_ept_entry(ept_entry); if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) ) { ept_free_entry(p2m, &split_ept_entry, i); goto out; } /* now install the newly split ept sub-tree */ /* NB: please make sure the domain is paused and there is no in-flight VT-d DMA. */ atomic_write_ept_entry(ept_entry, split_ept_entry); /* then move to the level we want to make real changes */ for ( ; i > target; i-- ) if ( !ept_next_level(p2m, 0, &table, &gfn_remainder, i) ) break; /* We just installed the pages we need. */ ASSERT(i == target); index = gfn_remainder >> (i * EPT_TABLE_ORDER); offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1); ept_entry = table + index; new_entry.emt = epte_get_entry_emt(d, gfn, mfn, &ipat, direct_mmio); new_entry.ipat = ipat; new_entry.sp = i ? 1 : 0; new_entry.sa_p2mt = p2mt; new_entry.access = p2ma; new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop); /* the caller should take care of the previous page */ new_entry.mfn = mfn_x(mfn); /* Safe to read-then-write because we hold the p2m lock */ if ( ept_entry->mfn == new_entry.mfn ) need_modify_vtd_table = 0; ept_p2m_type_to_flags(&new_entry, p2mt, p2ma); atomic_write_ept_entry(ept_entry, new_entry); } /* Track the highest gfn for which we have ever had a valid mapping */ if ( p2mt != p2m_invalid && (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) ) p2m->max_mapped_pfn = gfn + (1UL << order) - 1; /* Success */ rv = 1; out: unmap_domain_page(table); if ( needs_sync ) ept_sync_domain(p2m); /* For non-nested p2m, may need to change VT-d page table.*/ if ( rv && !p2m_is_nestedp2m(p2m) && iommu_enabled && need_iommu(p2m->domain) && need_modify_vtd_table ) { if ( iommu_hap_pt_share ) iommu_pte_flush(d, gfn, (u64*)ept_entry, order, vtd_pte_present); else { if ( p2mt == p2m_ram_rw ) { if ( order > 0 ) { for ( i = 0; i < (1 << order); i++ ) iommu_map_page( p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i, IOMMUF_readable | IOMMUF_writable); } else if ( !order ) iommu_map_page( p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable); } else { if ( order > 0 ) { for ( i = 0; i < (1 << order); i++ ) iommu_unmap_page(p2m->domain, gfn - offset + i); } else if ( !order ) iommu_unmap_page(p2m->domain, gfn); } } } /* Release the old intermediate tables, if any. This has to be the last thing we do, after the ept_sync_domain() and removal from the iommu tables, so as to avoid a potential use-after-free. 
*/ if ( is_epte_present(&old_entry) ) ept_free_entry(p2m, &old_entry, target); return rv; } /* Read ept p2m entries */ static mfn_t ept_get_entry(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t* a, p2m_query_t q, unsigned int *page_order) { ept_entry_t *table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m))); unsigned long gfn_remainder = gfn; ept_entry_t *ept_entry; u32 index; int i; int ret = 0; mfn_t mfn = _mfn(INVALID_MFN); struct ept_data *ept = &p2m->ept; *t = p2m_mmio_dm; *a = p2m_access_n; /* This pfn is higher than the highest the p2m map currently holds */ if ( gfn > p2m->max_mapped_pfn ) goto out; /* Should check if gfn obeys GAW here. */ for ( i = ept_get_wl(ept); i > 0; i-- ) { retry: ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i); if ( !ret ) goto out; else if ( ret == GUEST_TABLE_POD_PAGE ) { if ( !(q & P2M_ALLOC) ) { *t = p2m_populate_on_demand; goto out; } /* Populate this superpage */ ASSERT(i <= 2); index = gfn_remainder >> ( i * EPT_TABLE_ORDER); ept_entry = table + index; if ( !p2m_pod_demand_populate(p2m, gfn, i * EPT_TABLE_ORDER, q) ) goto retry; else goto out; } else if ( ret == GUEST_TABLE_SUPER_PAGE ) break; } index = gfn_remainder >> (i * EPT_TABLE_ORDER); ept_entry = table + index; if ( ept_entry->sa_p2mt == p2m_populate_on_demand ) { if ( !(q & P2M_ALLOC) ) { *t = p2m_populate_on_demand; goto out; } ASSERT(i == 0); if ( p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_4K, q) ) goto out; } /* Need to check for all-zeroes because typecode 0 is p2m_ram and an * entirely empty entry shouldn't have RAM type. */ if ( ept_entry->epte != 0 && ept_entry->sa_p2mt != p2m_invalid ) { *t = ept_entry->sa_p2mt; *a = ept_entry->access; mfn = _mfn(ept_entry->mfn); if ( i ) { /* * We may meet super pages, and to split into 4k pages * to emulate p2m table */ unsigned long split_mfn = mfn_x(mfn) + (gfn_remainder & ((1 << (i * EPT_TABLE_ORDER)) - 1)); mfn = _mfn(split_mfn); } if ( page_order ) *page_order = i * EPT_TABLE_ORDER; } out: unmap_domain_page(table); return mfn; } void ept_walk_table(struct domain *d, unsigned long gfn) { struct p2m_domain *p2m = p2m_get_hostp2m(d); struct ept_data *ept = &p2m->ept; ept_entry_t *table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m))); unsigned long gfn_remainder = gfn; int i; gdprintk(XENLOG_ERR, "Walking EPT tables for domain %d gfn %lx\n", d->domain_id, gfn); /* This pfn is higher than the highest the p2m map currently holds */ if ( gfn > p2m->max_mapped_pfn ) { gdprintk(XENLOG_ERR, " gfn exceeds max_mapped_pfn %lx\n", p2m->max_mapped_pfn); goto out; } for ( i = ept_get_wl(ept); i >= 0; i-- ) { ept_entry_t *ept_entry, *next; u32 index; /* Stolen from ept_next_level */ index = gfn_remainder >> (i*EPT_TABLE_ORDER); ept_entry = table + index; gdprintk(XENLOG_ERR, " epte %"PRIx64"\n", ept_entry->epte); if ( (i == 0) || !is_epte_present(ept_entry) || is_epte_superpage(ept_entry) ) goto out; else { gfn_remainder &= (1UL << (i*EPT_TABLE_ORDER)) - 1; next = map_domain_page(ept_entry->mfn); unmap_domain_page(table); table = next; } } out: unmap_domain_page(table); return; } /* * Walk the whole p2m table, changing any entries of the old type * to the new type. 
This is used in hardware-assisted paging to * quickly enable or disable log-dirty tracking */ static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level, p2m_type_t ot, p2m_type_t nt) { ept_entry_t e, *epte = map_domain_page(mfn_x(ept_page_mfn)); for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ ) { if ( !is_epte_valid(epte + i) ) continue; if ( (ept_page_level > 0) && !is_epte_superpage(epte + i) ) ept_change_entry_type_page(_mfn(epte[i].mfn), ept_page_level - 1, ot, nt); else { e = atomic_read_ept_entry(&epte[i]); if ( e.sa_p2mt != ot ) continue; e.sa_p2mt = nt; ept_p2m_type_to_flags(&e, nt, e.access); atomic_write_ept_entry(&epte[i], e); } } unmap_domain_page(epte); } static void ept_change_entry_type_global(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt) { struct ept_data *ept = &p2m->ept; if ( ept_get_asr(ept) == 0 ) return; BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt)); BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct)); ept_change_entry_type_page(_mfn(ept_get_asr(ept)), ept_get_wl(ept), ot, nt); ept_sync_domain(p2m); } static void __ept_sync_domain(void *info) { struct ept_data *ept = &((struct p2m_domain *)info)->ept; __invept(INVEPT_SINGLE_CONTEXT, ept_get_eptp(ept), 0); } void ept_sync_domain(struct p2m_domain *p2m) { struct domain *d = p2m->domain; struct ept_data *ept = &p2m->ept; /* Only if using EPT and this domain has some VCPUs to dirty. */ if ( !paging_mode_hap(d) || !d->vcpu || !d->vcpu[0] ) return; ASSERT(local_irq_is_enabled()); /* * Flush active cpus synchronously. Flush others the next time this domain * is scheduled onto them. We accept the race of other CPUs adding to * the ept_synced mask before on_selected_cpus() reads it, resulting in * unnecessary extra flushes, to avoid allocating a cpumask_t on the stack. */ cpumask_and(ept_get_synced_mask(ept), d->domain_dirty_cpumask, &cpu_online_map); on_selected_cpus(ept_get_synced_mask(ept), __ept_sync_domain, p2m, 1); } int ept_p2m_init(struct p2m_domain *p2m) { struct ept_data *ept = &p2m->ept; p2m->set_entry = ept_set_entry; p2m->get_entry = ept_get_entry; p2m->change_entry_type_global = ept_change_entry_type_global; p2m->audit_p2m = NULL; /* Set the memory type used when accessing EPT paging structures. */ ept->ept_mt = EPT_DEFAULT_MT; /* set EPT page-walk length; this is the actual walk length - 1, i.e. 
3 */ ept->ept_wl = 3; if ( !zalloc_cpumask_var(&ept->synced_mask) ) return -ENOMEM; on_each_cpu(__ept_sync_domain, p2m, 1); return 0; } void ept_p2m_uninit(struct p2m_domain *p2m) { struct ept_data *ept = &p2m->ept; free_cpumask_var(ept->synced_mask); } static void ept_dump_p2m_table(unsigned char key) { struct domain *d; ept_entry_t *table, *ept_entry; mfn_t mfn; int order; int i; int is_pod; int ret = 0; unsigned long index; unsigned long gfn, gfn_remainder; unsigned long record_counter = 0; struct p2m_domain *p2m; struct ept_data *ept; for_each_domain(d) { if ( !hap_enabled(d) ) continue; p2m = p2m_get_hostp2m(d); ept = &p2m->ept; printk("\ndomain%d EPT p2m table: \n", d->domain_id); for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += (1 << order) ) { gfn_remainder = gfn; mfn = _mfn(INVALID_MFN); table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m))); for ( i = ept_get_wl(ept); i > 0; i-- ) { ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i); if ( ret != GUEST_TABLE_NORMAL_PAGE ) break; } order = i * EPT_TABLE_ORDER; if ( ret == GUEST_TABLE_MAP_FAILED ) goto out; index = gfn_remainder >> order; ept_entry = table + index; if ( ept_entry->sa_p2mt != p2m_invalid ) { ( ept_entry->sa_p2mt == p2m_populate_on_demand ) ? ( mfn = _mfn(INVALID_MFN), is_pod = 1 ) : ( mfn = _mfn(ept_entry->mfn), is_pod = 0 ); printk("gfn: %-16lx mfn: %-16lx order: %2d is_pod: %d\n", gfn, mfn_x(mfn), order, is_pod); if ( !(record_counter++ % 100) ) process_pending_softirqs(); } out: unmap_domain_page(table); } } } static struct keyhandler ept_p2m_table = { .diagnostic = 0, .u.fn = ept_dump_p2m_table, .desc = "dump ept p2m table" }; void setup_ept_dump(void) { register_keyhandler('D', &ept_p2m_table); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/guest_walk.c0000664000175000017500000003331012307313555015521 0ustar smbsmb/****************************************************************************** * arch/x86/mm/guest_walk.c * * Pagetable walker for guest memory accesses. * * Parts of this code are Copyright (c) 2006 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include /* Flags that are needed in a pagetable entry, with the sense of NX inverted */ static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) { static const uint32_t flags[] = { /* I/F - Usr Wr */ /* 0 0 0 0 */ _PAGE_PRESENT, /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW, /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER, /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER, /* 0 1 0 0 */ _PAGE_PRESENT, /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW, /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER, /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER, /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT, /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT, /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT, /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT, /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT, /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT, /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT, /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT, }; /* Don't demand not-NX if the CPU wouldn't enforce it. */ if ( !guest_supports_nx(v) ) pfec &= ~PFEC_insn_fetch; /* Don't demand R/W if the CPU wouldn't enforce it. */ if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) && !(pfec & PFEC_user_mode) ) pfec &= ~PFEC_write_access; return flags[(pfec & 0x1f) >> 1] | _PAGE_INVALID_BITS; } /* Modify a guest pagetable entry to set the Accessed and Dirty bits. * Returns non-zero if it actually writes to guest memory. */ static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty) { guest_intpte_t old, new; old = *(guest_intpte_t *)walk_p; new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0); if ( old != new ) { /* Write the new entry into the walk, and try to write it back * into the guest table as well. If the guest table has changed * under our feet then leave it alone. */ *(guest_intpte_t *)walk_p = new; if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) return 1; } return 0; } /* If the map is non-NULL, we leave this function having * acquired an extra ref on mfn_to_page(*mfn) */ void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn, p2m_type_t *p2mt, p2m_query_t q, uint32_t *rc) { struct page_info *page; void *map; /* Translate the gfn, unsharing if shared */ page = get_page_from_gfn_p2m(p2m->domain, p2m, gfn_x(gfn), p2mt, NULL, q); if ( p2m_is_paging(*p2mt) ) { ASSERT(!p2m_is_nestedp2m(p2m)); if ( page ) put_page(page); p2m_mem_paging_populate(p2m->domain, gfn_x(gfn)); *rc = _PAGE_PAGED; return NULL; } if ( p2m_is_shared(*p2mt) ) { if ( page ) put_page(page); *rc = _PAGE_SHARED; return NULL; } if ( !page ) { *rc |= _PAGE_PRESENT; return NULL; } *mfn = _mfn(page_to_mfn(page)); ASSERT(mfn_valid(mfn_x(*mfn))); map = map_domain_page(mfn_x(*mfn)); return map; } /* Walk the guest pagetables, after the manner of a hardware walker. */ /* Because the walk is essentially random, it can cause a deadlock * warning in the p2m locking code. Highly unlikely this is an actual * deadlock, because who would walk the page tables in the opposite order? */ uint32_t guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, unsigned long va, walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map) { struct domain *d = v->domain; p2m_type_t p2mt; guest_l1e_t *l1p = NULL; guest_l2e_t *l2p = NULL; #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... 
*/ guest_l3e_t *l3p = NULL; guest_l4e_t *l4p; #endif uint32_t gflags, mflags, iflags, rc = 0; int smep; bool_t pse1G = 0, pse2M = 0; p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE; perfc_incr(guest_walk); memset(gw, 0, sizeof(*gw)); gw->va = va; /* Mandatory bits that must be set in every entry. We invert NX and * the invalid bits, to calculate as if there were an "X" bit that * allowed access. We will accumulate, in rc, the set of flags that * are missing/unwanted. */ mflags = mandatory_flags(v, pfec); iflags = (_PAGE_NX_BIT | _PAGE_INVALID_BITS); /* SMEP: kernel-mode instruction fetches from user-mode mappings * should fault. Unlike NX or invalid bits, we're looking for _all_ * entries in the walk to have _PAGE_USER set, so we need to do the * whole walk as if it were a user-mode one and then invert the answer. */ smep = (is_hvm_vcpu(v) && hvm_smep_enabled(v) && (pfec & PFEC_insn_fetch) && !(pfec & PFEC_user_mode) ); if ( smep ) mflags |= _PAGE_USER; #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ /* Get the l4e from the top level table and check its flags*/ gw->l4mfn = top_mfn; l4p = (guest_l4e_t *) top_map; gw->l4e = l4p[guest_l4_table_offset(va)]; gflags = guest_l4e_get_flags(gw->l4e) ^ iflags; if ( !(gflags & _PAGE_PRESENT) ) { rc |= _PAGE_PRESENT; goto out; } rc |= ((gflags & mflags) ^ mflags); /* Map the l3 table */ l3p = map_domain_gfn(p2m, guest_l4e_get_gfn(gw->l4e), &gw->l3mfn, &p2mt, qt, &rc); if(l3p == NULL) goto out; /* Get the l3e and check its flags*/ gw->l3e = l3p[guest_l3_table_offset(va)]; gflags = guest_l3e_get_flags(gw->l3e) ^ iflags; if ( !(gflags & _PAGE_PRESENT) ) { rc |= _PAGE_PRESENT; goto out; } rc |= ((gflags & mflags) ^ mflags); pse1G = (gflags & _PAGE_PSE) && guest_supports_1G_superpages(v); if ( pse1G ) { /* Generate a fake l1 table entry so callers don't all * have to understand superpages. */ gfn_t start = guest_l3e_get_gfn(gw->l3e); /* Grant full access in the l1e, since all the guest entry's * access controls are enforced in the l3e. */ int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| _PAGE_ACCESSED|_PAGE_DIRTY); /* Import cache-control bits. Note that _PAGE_PAT is actually * _PAGE_PSE, and it is always set. We will clear it in case * _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear. */ flags |= (guest_l3e_get_flags(gw->l3e) & (_PAGE_PAT|_PAGE_PWT|_PAGE_PCD)); if ( !(gfn_x(start) & 1) ) /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */ flags &= ~_PAGE_PAT; if ( gfn_x(start) & GUEST_L3_GFN_MASK & ~0x1 ) rc |= _PAGE_INVALID_BITS; /* Increment the pfn by the right number of 4k pages. */ start = _gfn((gfn_x(start) & ~GUEST_L3_GFN_MASK) + ((va >> PAGE_SHIFT) & GUEST_L3_GFN_MASK)); gw->l1e = guest_l1e_from_gfn(start, flags); gw->l2mfn = gw->l1mfn = _mfn(INVALID_MFN); goto set_ad; } #else /* PAE only... */ /* Get the l3e and check its flag */ gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)]; if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) { rc |= _PAGE_PRESENT; goto out; } #endif /* PAE or 64... */ /* Map the l2 table */ l2p = map_domain_gfn(p2m, guest_l3e_get_gfn(gw->l3e), &gw->l2mfn, &p2mt, qt, &rc); if(l2p == NULL) goto out; /* Get the l2e */ gw->l2e = l2p[guest_l2_table_offset(va)]; #else /* 32-bit only... */ /* Get l2e from the top level table */ gw->l2mfn = top_mfn; l2p = (guest_l2e_t *) top_map; gw->l2e = l2p[guest_l2_table_offset(va)]; #endif /* All levels... 
*/ gflags = guest_l2e_get_flags(gw->l2e) ^ iflags; if ( !(gflags & _PAGE_PRESENT) ) { rc |= _PAGE_PRESENT; goto out; } rc |= ((gflags & mflags) ^ mflags); pse2M = (gflags & _PAGE_PSE) && guest_supports_superpages(v); if ( pse2M ) { /* Special case: this guest VA is in a PSE superpage, so there's * no guest l1e. We make one up so that the propagation code * can generate a shadow l1 table. Start with the gfn of the * first 4k-page of the superpage. */ gfn_t start = guest_l2e_get_gfn(gw->l2e); /* Grant full access in the l1e, since all the guest entry's * access controls are enforced in the shadow l2e. */ int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| _PAGE_ACCESSED|_PAGE_DIRTY); /* Import cache-control bits. Note that _PAGE_PAT is actually * _PAGE_PSE, and it is always set. We will clear it in case * _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear. */ flags |= (guest_l2e_get_flags(gw->l2e) & (_PAGE_PAT|_PAGE_PWT|_PAGE_PCD)); if ( !(gfn_x(start) & 1) ) /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */ flags &= ~_PAGE_PAT; if ( gfn_x(start) & GUEST_L2_GFN_MASK & ~0x1 ) { #if GUEST_PAGING_LEVELS == 2 /* * Note that _PAGE_INVALID_BITS is zero in this case, yielding a * no-op here. * * Architecturally, the walk should fail if bit 21 is set (others * aren't being checked at least in PSE36 mode), but we'll ignore * this here in order to avoid specifying a non-natural, non-zero * _PAGE_INVALID_BITS value just for that case. */ #endif rc |= _PAGE_INVALID_BITS; } /* Increment the pfn by the right number of 4k pages. * Mask out PAT and invalid bits. */ start = _gfn((gfn_x(start) & ~GUEST_L2_GFN_MASK) + guest_l1_table_offset(va)); gw->l1e = guest_l1e_from_gfn(start, flags); gw->l1mfn = _mfn(INVALID_MFN); } else { /* Not a superpage: carry on and find the l1e. */ l1p = map_domain_gfn(p2m, guest_l2e_get_gfn(gw->l2e), &gw->l1mfn, &p2mt, qt, &rc); if(l1p == NULL) goto out; gw->l1e = l1p[guest_l1_table_offset(va)]; gflags = guest_l1e_get_flags(gw->l1e) ^ iflags; if ( !(gflags & _PAGE_PRESENT) ) { rc |= _PAGE_PRESENT; goto out; } rc |= ((gflags & mflags) ^ mflags); } #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ set_ad: #endif /* Now re-invert the user-mode requirement for SMEP. */ if ( smep ) rc ^= _PAGE_USER; /* Go back and set accessed and dirty bits only if the walk was a * success. Although the PRMs say higher-level _PAGE_ACCESSED bits * get set whenever a lower-level PT is used, at least some hardware * walkers behave this way. */ if ( rc == 0 ) { #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... 
*/ if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) ) paging_mark_dirty(d, mfn_x(gw->l4mfn)); if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, (pse1G && (pfec & PFEC_write_access))) ) paging_mark_dirty(d, mfn_x(gw->l3mfn)); #endif if ( !pse1G ) { if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e, (pse2M && (pfec & PFEC_write_access))) ) paging_mark_dirty(d, mfn_x(gw->l2mfn)); if ( !pse2M ) { if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, (pfec & PFEC_write_access)) ) paging_mark_dirty(d, mfn_x(gw->l1mfn)); } } } out: #if GUEST_PAGING_LEVELS == 4 if ( l3p ) { unmap_domain_page(l3p); put_page(mfn_to_page(mfn_x(gw->l3mfn))); } #endif #if GUEST_PAGING_LEVELS >= 3 if ( l2p ) { unmap_domain_page(l2p); put_page(mfn_to_page(mfn_x(gw->l2mfn))); } #endif if ( l1p ) { unmap_domain_page(l1p); put_page(mfn_to_page(mfn_x(gw->l1mfn))); } return rc; } xen-4.4.0/xen/arch/x86/mm/p2m-pt.c0000664000175000017500000007532312307313555014505 0ustar smbsmb/****************************************************************************** * arch/x86/mm/p2m-pt.c * * Implementation of p2m datastructures as pagetables, for use by * NPT and shadow-pagetable code * * Parts of this code are Copyright (c) 2009-2011 by Citrix Systems, Inc. * Parts of this code are Copyright (c) 2007 by Advanced Micro Devices. * Parts of this code are Copyright (c) 2006-2007 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include "mm-locks.h" /* Override macros from asm/page.h to make them work with mfn_t */ #undef mfn_to_page #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m)) #undef mfn_valid #define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) #undef page_to_mfn #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) /* We may store INVALID_MFN in l1 PTEs. We need to clip this * to avoid trampling over higher-order bits (NX, p2m type, IOMMU flags). We * seem to not need to unclip on the return path, as callers are concerned only * with p2m type in such cases. */ #define p2m_l1e_from_pfn(pfn, flags) \ l1e_from_pfn((pfn) & (PADDR_MASK >> PAGE_SHIFT), (flags)) /* PTE flags for the various types of p2m entry */ #define P2M_BASE_FLAGS \ (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED) static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) { unsigned long flags; /* * AMD IOMMU: When we share p2m table with iommu, bit 9 - bit 11 will be * used for iommu hardware to encode next io page level. Bit 59 - bit 62 * are used for iommu flags, We could not use these bits to store p2m types. 
*/ flags = (unsigned long)(t & 0x7f) << 12; switch(t) { case p2m_invalid: case p2m_mmio_dm: case p2m_populate_on_demand: case p2m_ram_paging_out: case p2m_ram_paged: case p2m_ram_paging_in: default: return flags | _PAGE_NX_BIT; case p2m_grant_map_ro: return flags | P2M_BASE_FLAGS | _PAGE_NX_BIT; case p2m_ram_ro: case p2m_ram_logdirty: case p2m_ram_shared: return flags | P2M_BASE_FLAGS; case p2m_ram_rw: return flags | P2M_BASE_FLAGS | _PAGE_RW; case p2m_grant_map_rw: return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_NX_BIT; case p2m_mmio_direct: if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) ) flags |= _PAGE_RW; return flags | P2M_BASE_FLAGS | _PAGE_PCD; } } // Find the next level's P2M entry, checking for out-of-range gfn's... // Returns NULL on error. // static l1_pgentry_t * p2m_find_entry(void *table, unsigned long *gfn_remainder, unsigned long gfn, uint32_t shift, uint32_t max) { u32 index; index = *gfn_remainder >> shift; if ( index >= max ) { P2M_DEBUG("gfn=%#lx out of range " "(gfn_remainder=%#lx shift=%d index=%#x max=%#x)\n", gfn, *gfn_remainder, shift, index, max); return NULL; } *gfn_remainder &= (1 << shift) - 1; return (l1_pgentry_t *)table + index; } /* Free intermediate tables from a p2m sub-tree */ static void p2m_free_entry(struct p2m_domain *p2m, l1_pgentry_t *p2m_entry, int page_order) { /* End if the entry is a leaf entry. */ if ( page_order == PAGE_ORDER_4K || !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) || (l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) return; if ( page_order > PAGE_ORDER_2M ) { l1_pgentry_t *l3_table = map_domain_page(l1e_get_pfn(*p2m_entry)); for ( int i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) p2m_free_entry(p2m, l3_table + i, page_order - 9); unmap_domain_page(l3_table); } p2m_free_ptp(p2m, mfn_to_page(_mfn(l1e_get_pfn(*p2m_entry)))); } // Walk one level of the P2M table, allocating a new table if required. // Returns 0 on error. // /* AMD IOMMU: Convert next level bits and r/w bits into 24 bits p2m flags */ #define iommu_nlevel_to_flags(nl, f) ((((nl) & 0x7) << 9 )|(((f) & 0x3) << 21)) static void p2m_add_iommu_flags(l1_pgentry_t *p2m_entry, unsigned int nlevel, unsigned int flags) { if ( iommu_hap_pt_share ) l1e_add_flags(*p2m_entry, iommu_nlevel_to_flags(nlevel, flags)); } static int p2m_next_level(struct p2m_domain *p2m, mfn_t *table_mfn, void **table, unsigned long *gfn_remainder, unsigned long gfn, u32 shift, u32 max, unsigned long type) { l1_pgentry_t *l1_entry; l1_pgentry_t *p2m_entry; l1_pgentry_t new_entry; void *next; int i; if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, shift, max)) ) return 0; /* PoD/paging: Not present doesn't imply empty. 
*/ if ( !l1e_get_flags(*p2m_entry) ) { struct page_info *pg; pg = p2m_alloc_ptp(p2m, type); if ( pg == NULL ) return 0; new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), __PAGE_HYPERVISOR | _PAGE_USER); switch ( type ) { case PGT_l3_page_table: p2m_add_iommu_flags(&new_entry, 3, IOMMUF_readable|IOMMUF_writable); p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4); break; case PGT_l2_page_table: p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable); p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3); break; case PGT_l1_page_table: p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable); p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2); break; default: BUG(); break; } } ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE)); /* split 1GB pages into 2MB pages */ if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) { unsigned long flags, pfn; struct page_info *pg; pg = p2m_alloc_ptp(p2m, PGT_l2_page_table); if ( pg == NULL ) return 0; flags = l1e_get_flags(*p2m_entry); pfn = l1e_get_pfn(*p2m_entry); l1_entry = map_domain_page(mfn_x(page_to_mfn(pg))); for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) { new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags); p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable); p2m->write_p2m_entry(p2m, gfn, l1_entry+i, *table_mfn, new_entry, 2); } unmap_domain_page(l1_entry); new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable); p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3); } /* split single 2MB large page into 4KB page in P2M table */ if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) { unsigned long flags, pfn; struct page_info *pg; pg = p2m_alloc_ptp(p2m, PGT_l1_page_table); if ( pg == NULL ) return 0; /* New splintered mappings inherit the flags of the old superpage, * with a little reorganisation for the _PAGE_PSE_PAT bit. */ flags = l1e_get_flags(*p2m_entry); pfn = l1e_get_pfn(*p2m_entry); if ( pfn & 1 ) /* ==> _PAGE_PSE_PAT was set */ pfn -= 1; /* Clear it; _PAGE_PSE becomes _PAGE_PAT */ else flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */ l1_entry = __map_domain_page(pg); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { new_entry = l1e_from_pfn(pfn + i, flags); p2m_add_iommu_flags(&new_entry, 0, 0); p2m->write_p2m_entry(p2m, gfn, l1_entry+i, *table_mfn, new_entry, 1); } unmap_domain_page(l1_entry); new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), __PAGE_HYPERVISOR|_PAGE_USER); p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable); p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2); } *table_mfn = _mfn(l1e_get_pfn(*p2m_entry)); next = map_domain_page(mfn_x(*table_mfn)); unmap_domain_page(*table); *table = next; return 1; } // Returns 0 on error (out of memory) static int p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma) { // XXX -- this might be able to be faster iff current->domain == d mfn_t table_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m)); void *table =map_domain_page(mfn_x(table_mfn)); unsigned long i, gfn_remainder = gfn; l1_pgentry_t *p2m_entry; l1_pgentry_t entry_content; l2_pgentry_t l2e_content; l3_pgentry_t l3e_content; int rv=0; unsigned int iommu_pte_flags = (p2mt == p2m_ram_rw) ? 
IOMMUF_readable|IOMMUF_writable: 0; unsigned long old_mfn = 0; if ( tb_init_done ) { struct { u64 gfn, mfn; int p2mt; int d:16,order:16; } t; t.gfn = gfn; t.mfn = mfn_x(mfn); t.p2mt = p2mt; t.d = p2m->domain->domain_id; t.order = page_order; __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t); } if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn, L4_PAGETABLE_SHIFT - PAGE_SHIFT, L4_PAGETABLE_ENTRIES, PGT_l3_page_table) ) goto out; /* * Try to allocate 1GB page table if this feature is supported. */ if ( page_order == PAGE_ORDER_1G ) { l1_pgentry_t old_entry = l1e_empty(); p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, L3_PAGETABLE_SHIFT - PAGE_SHIFT, L3_PAGETABLE_ENTRIES); ASSERT(p2m_entry); if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) && !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) { /* We're replacing a non-SP page with a superpage. Make sure to * handle freeing the table properly. */ old_entry = *p2m_entry; } ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct); l3e_content = mfn_valid(mfn) ? l3e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE) : l3e_empty(); entry_content.l1 = l3e_content.l3; if ( entry_content.l1 != 0 ) { p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags); old_mfn = l1e_get_pfn(*p2m_entry); } p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3); /* NB: paging_write_p2m_entry() handles tlb flushes properly */ /* Free old intermediate tables if necessary */ if ( l1e_get_flags(old_entry) & _PAGE_PRESENT ) p2m_free_entry(p2m, &old_entry, page_order); } else if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn, L3_PAGETABLE_SHIFT - PAGE_SHIFT, L3_PAGETABLE_ENTRIES, PGT_l2_page_table) ) goto out; if ( page_order == PAGE_ORDER_4K ) { if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn, L2_PAGETABLE_SHIFT - PAGE_SHIFT, L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) goto out; p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, 0, L1_PAGETABLE_ENTRIES); ASSERT(p2m_entry); if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) || p2m_is_paging(p2mt) ) entry_content = p2m_l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt, mfn)); else entry_content = l1e_empty(); if ( entry_content.l1 != 0 ) { p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags); old_mfn = l1e_get_pfn(*p2m_entry); } /* level 1 entry */ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1); /* NB: paging_write_p2m_entry() handles tlb flushes properly */ } else if ( page_order == PAGE_ORDER_2M ) { l1_pgentry_t old_entry = l1e_empty(); p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, L2_PAGETABLE_SHIFT - PAGE_SHIFT, L2_PAGETABLE_ENTRIES); ASSERT(p2m_entry); /* FIXME: Deal with 4k replaced by 2meg pages */ if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) && !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) { /* We're replacing a non-SP page with a superpage. Make sure to * handle freeing the table properly. 
*/ old_entry = *p2m_entry; } ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct); if ( mfn_valid(mfn) || p2m_is_pod(p2mt) ) l2e_content = l2e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE); else l2e_content = l2e_empty(); entry_content.l1 = l2e_content.l2; if ( entry_content.l1 != 0 ) { p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags); old_mfn = l1e_get_pfn(*p2m_entry); } p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2); /* NB: paging_write_p2m_entry() handles tlb flushes properly */ /* Free old intermediate tables if necessary */ if ( l1e_get_flags(old_entry) & _PAGE_PRESENT ) p2m_free_entry(p2m, &old_entry, page_order); } /* Track the highest gfn for which we have ever had a valid mapping */ if ( p2mt != p2m_invalid && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) ) p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1; if ( iommu_enabled && need_iommu(p2m->domain) ) { if ( iommu_hap_pt_share ) { if ( old_mfn && (old_mfn != mfn_x(mfn)) ) amd_iommu_flush_pages(p2m->domain, gfn, page_order); } else { if ( p2mt == p2m_ram_rw ) for ( i = 0; i < (1UL << page_order); i++ ) iommu_map_page(p2m->domain, gfn+i, mfn_x(mfn)+i, IOMMUF_readable|IOMMUF_writable); else for ( int i = 0; i < (1UL << page_order); i++ ) iommu_unmap_page(p2m->domain, gfn+i); } } /* Success */ rv = 1; out: unmap_domain_page(table); return rv; } static mfn_t p2m_gfn_to_mfn(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, p2m_query_t q, unsigned int *page_order) { mfn_t mfn; paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; l2_pgentry_t *l2e; l1_pgentry_t *l1e; unsigned long l1e_flags; p2m_type_t l1t; ASSERT(paging_mode_translate(p2m->domain)); /* XXX This is for compatibility with the old model, where anything not * XXX marked as RAM was considered to be emulated MMIO space. * XXX Once we start explicitly registering MMIO regions in the p2m * XXX we will return p2m_invalid for unmapped gfns */ *t = p2m_mmio_dm; /* Not implemented except with EPT */ *a = p2m_access_rwx; if ( gfn > p2m->max_mapped_pfn ) /* This pfn is higher than the highest the p2m map currently holds */ return _mfn(INVALID_MFN); mfn = pagetable_get_mfn(p2m_get_pagetable(p2m)); { l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn)); l4e += l4_table_offset(addr); if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) { unmap_domain_page(l4e); return _mfn(INVALID_MFN); } mfn = _mfn(l4e_get_pfn(*l4e)); unmap_domain_page(l4e); } { l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn)); l3e += l3_table_offset(addr); pod_retry_l3: if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) { if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand ) { if ( q & P2M_ALLOC ) { if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_1G, q) ) goto pod_retry_l3; gdprintk(XENLOG_ERR, "%s: Allocate 1GB failed!\n", __func__); } else *t = p2m_populate_on_demand; } unmap_domain_page(l3e); return _mfn(INVALID_MFN); } else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) ) { mfn = _mfn(l3e_get_pfn(*l3e) + l2_table_offset(addr) * L1_PAGETABLE_ENTRIES + l1_table_offset(addr)); *t = p2m_flags_to_type(l3e_get_flags(*l3e)); unmap_domain_page(l3e); ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); if ( page_order ) *page_order = PAGE_ORDER_1G; return (p2m_is_valid(*t)) ? 
mfn : _mfn(INVALID_MFN); } mfn = _mfn(l3e_get_pfn(*l3e)); unmap_domain_page(l3e); } l2e = map_domain_page(mfn_x(mfn)); l2e += l2_table_offset(addr); pod_retry_l2: if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) { /* PoD: Try to populate a 2-meg chunk */ if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand ) { if ( q & P2M_ALLOC ) { if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_2M, q) ) goto pod_retry_l2; } else *t = p2m_populate_on_demand; } unmap_domain_page(l2e); return _mfn(INVALID_MFN); } else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) ) { mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr)); *t = p2m_flags_to_type(l2e_get_flags(*l2e)); unmap_domain_page(l2e); ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); if ( page_order ) *page_order = PAGE_ORDER_2M; return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN); } mfn = _mfn(l2e_get_pfn(*l2e)); unmap_domain_page(l2e); l1e = map_domain_page(mfn_x(mfn)); l1e += l1_table_offset(addr); pod_retry_l1: l1e_flags = l1e_get_flags(*l1e); l1t = p2m_flags_to_type(l1e_flags); if ( ((l1e_flags & _PAGE_PRESENT) == 0) && (!p2m_is_paging(l1t)) ) { /* PoD: Try to populate */ if ( l1t == p2m_populate_on_demand ) { if ( q & P2M_ALLOC ) { if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_4K, q) ) goto pod_retry_l1; } else *t = p2m_populate_on_demand; } unmap_domain_page(l1e); return _mfn(INVALID_MFN); } mfn = _mfn(l1e_get_pfn(*l1e)); *t = l1t; unmap_domain_page(l1e); ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t) || p2m_is_paging(*t)); if ( page_order ) *page_order = PAGE_ORDER_4K; return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN); } /* Walk the whole p2m table, changing any entries of the old type * to the new type. This is used in hardware-assisted paging to * quickly enable or disable log-dirty tracking */ static void p2m_change_type_global(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt) { unsigned long mfn, gfn, flags; l1_pgentry_t l1e_content; l1_pgentry_t *l1e; l2_pgentry_t *l2e; mfn_t l1mfn, l2mfn, l3mfn; unsigned long i1, i2, i3; l3_pgentry_t *l3e; l4_pgentry_t *l4e; unsigned long i4; BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt)); BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct)); if ( !paging_mode_translate(p2m->domain) ) return; if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) == 0 ) return; ASSERT(p2m_locked_by_me(p2m)); l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m)))); for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ ) { if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) ) { continue; } l3mfn = _mfn(l4e_get_pfn(l4e[i4])); l3e = map_domain_page(l4e_get_pfn(l4e[i4])); for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ ) { if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) ) { continue; } if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) ) { flags = l3e_get_flags(l3e[i3]); if ( p2m_flags_to_type(flags) != ot ) continue; mfn = l3e_get_pfn(l3e[i3]); gfn = get_gpfn_from_mfn(mfn); flags = p2m_type_to_flags(nt, _mfn(mfn)); l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE); p2m->write_p2m_entry(p2m, gfn, (l1_pgentry_t *)&l3e[i3], l3mfn, l1e_content, 3); continue; } l2mfn = _mfn(l3e_get_pfn(l3e[i3])); l2e = map_domain_page(l3e_get_pfn(l3e[i3])); for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) { if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) ) { continue; } if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) ) { flags = l2e_get_flags(l2e[i2]); if ( p2m_flags_to_type(flags) != ot ) continue; mfn = l2e_get_pfn(l2e[i2]); /* Do not use get_gpfn_from_mfn because it may return SHARED_M2P_ENTRY */ gfn = (i2 + (i3 + (i4 * 
L3_PAGETABLE_ENTRIES)) * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; flags = p2m_type_to_flags(nt, _mfn(mfn)); l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE); p2m->write_p2m_entry(p2m, gfn, (l1_pgentry_t *)&l2e[i2], l2mfn, l1e_content, 2); continue; } l1mfn = _mfn(l2e_get_pfn(l2e[i2])); l1e = map_domain_page(mfn_x(l1mfn)); for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++ ) { flags = l1e_get_flags(l1e[i1]); if ( p2m_flags_to_type(flags) != ot ) continue; mfn = l1e_get_pfn(l1e[i1]); gfn = i1 + (i2 + (i3 + (i4 * L3_PAGETABLE_ENTRIES)) * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; /* create a new 1le entry with the new type */ flags = p2m_type_to_flags(nt, _mfn(mfn)); l1e_content = p2m_l1e_from_pfn(mfn, flags); p2m->write_p2m_entry(p2m, gfn, &l1e[i1], l1mfn, l1e_content, 1); } unmap_domain_page(l1e); } unmap_domain_page(l2e); } unmap_domain_page(l3e); } unmap_domain_page(l4e); } #if P2M_AUDIT long p2m_pt_audit_p2m(struct p2m_domain *p2m) { unsigned long entry_count = 0, pmbad = 0; unsigned long mfn, gfn, m2pfn; ASSERT(p2m_locked_by_me(p2m)); ASSERT(pod_locked_by_me(p2m)); /* Audit part one: walk the domain's p2m table, checking the entries. */ if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) != 0 ) { l2_pgentry_t *l2e; l1_pgentry_t *l1e; int i1, i2; l4_pgentry_t *l4e; l3_pgentry_t *l3e; int i4, i3; l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m)))); gfn = 0; for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ ) { if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) ) { gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT); continue; } l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4])))); for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ ) { if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) ) { gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); continue; } /* check for 1GB super page */ if ( l3e_get_flags(l3e[i3]) & _PAGE_PSE ) { mfn = l3e_get_pfn(l3e[i3]); ASSERT(mfn_valid(_mfn(mfn))); /* we have to cover 512x512 4K pages */ for ( i2 = 0; i2 < (L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES); i2++) { m2pfn = get_gpfn_from_mfn(mfn+i2); if ( m2pfn != (gfn + i2) ) { pmbad++; P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx" " -> gfn %#lx\n", gfn+i2, mfn+i2, m2pfn); BUG(); } gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); continue; } } l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3])))); for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) { if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) ) { if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) && ( p2m_flags_to_type(l2e_get_flags(l2e[i2])) == p2m_populate_on_demand ) ) entry_count+=SUPERPAGE_PAGES; gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); continue; } /* check for super page */ if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE ) { mfn = l2e_get_pfn(l2e[i2]); ASSERT(mfn_valid(_mfn(mfn))); for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++) { m2pfn = get_gpfn_from_mfn(mfn+i1); /* Allow shared M2Ps */ if ( (m2pfn != (gfn + i1)) && (m2pfn != SHARED_M2P_ENTRY) ) { pmbad++; P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx" " -> gfn %#lx\n", gfn+i1, mfn+i1, m2pfn); BUG(); } } gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); continue; } l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2])))); for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ ) { p2m_type_t type; type = p2m_flags_to_type(l1e_get_flags(l1e[i1])); if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) ) { if ( type == p2m_populate_on_demand ) entry_count++; continue; } mfn = l1e_get_pfn(l1e[i1]); ASSERT(mfn_valid(_mfn(mfn))); m2pfn = get_gpfn_from_mfn(mfn); if ( m2pfn != gfn && type != p2m_mmio_direct && !p2m_is_grant(type) && 
!p2m_is_shared(type) ) { pmbad++; printk("mismatch: gfn %#lx -> mfn %#lx" " -> gfn %#lx\n", gfn, mfn, m2pfn); P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx" " -> gfn %#lx\n", gfn, mfn, m2pfn); BUG(); } } unmap_domain_page(l1e); } unmap_domain_page(l2e); } unmap_domain_page(l3e); } unmap_domain_page(l4e); } if ( entry_count != p2m->pod.entry_count ) { printk("%s: refcounted entry count %ld, audit count %lu!\n", __func__, p2m->pod.entry_count, entry_count); BUG(); } return pmbad; } #endif /* P2M_AUDIT */ /* Set up the p2m function pointers for pagetable format */ void p2m_pt_init(struct p2m_domain *p2m) { p2m->set_entry = p2m_set_entry; p2m->get_entry = p2m_gfn_to_mfn; p2m->change_entry_type_global = p2m_change_type_global; p2m->write_p2m_entry = paging_write_p2m_entry; #if P2M_AUDIT p2m->audit_p2m = p2m_pt_audit_p2m; #else p2m->audit_p2m = NULL; #endif } xen-4.4.0/xen/arch/x86/mm/p2m-pod.c0000664000175000017500000010167412307313555014643 0ustar smbsmb/****************************************************************************** * arch/x86/mm/p2m-pod.c * * Populate-on-demand p2m entries. * * Copyright (c) 2009-2011 Citrix Systems, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include /* ept_p2m_init() */ #include #include #include #include #include #include #include #include "mm-locks.h" /* Override macros from asm/page.h to make them work with mfn_t */ #undef mfn_to_page #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m)) #undef mfn_valid #define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) #undef page_to_mfn #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0) /* Enforce lock ordering when grabbing the "external" page_alloc lock */ static inline void lock_page_alloc(struct p2m_domain *p2m) { page_alloc_mm_pre_lock(); spin_lock(&(p2m->domain->page_alloc_lock)); page_alloc_mm_post_lock(p2m->domain->arch.page_alloc_unlock_level); } static inline void unlock_page_alloc(struct p2m_domain *p2m) { page_alloc_mm_unlock(p2m->domain->arch.page_alloc_unlock_level); spin_unlock(&(p2m->domain->page_alloc_lock)); } /* * Populate-on-demand functionality */ static int p2m_pod_cache_add(struct p2m_domain *p2m, struct page_info *page, unsigned int order) { int i; struct page_info *p; struct domain *d = p2m->domain; #ifndef NDEBUG mfn_t mfn; mfn = page_to_mfn(page); /* Check to make sure this is a contiguous region */ if( mfn_x(mfn) & ((1 << order) - 1) ) { printk("%s: mfn %lx not aligned order %u! 
(mask %lx)\n", __func__, mfn_x(mfn), order, ((1UL << order) - 1)); return -1; } for(i=0; i < 1 << order ; i++) { struct domain * od; p = mfn_to_page(_mfn(mfn_x(mfn) + i)); od = page_get_owner(p); if(od != d) { printk("%s: mfn %lx expected owner d%d, got owner d%d!\n", __func__, mfn_x(mfn), d->domain_id, od?od->domain_id:-1); return -1; } } #endif ASSERT(pod_locked_by_me(p2m)); /* * Pages from domain_alloc and returned by the balloon driver aren't * guaranteed to be zero; but by reclaiming zero pages, we implicitly * promise to provide zero pages. So we scrub pages before using. */ for ( i = 0; i < (1 << order); i++ ) { char *b = map_domain_page(mfn_x(page_to_mfn(page)) + i); clear_page(b); unmap_domain_page(b); } /* First, take all pages off the domain list */ lock_page_alloc(p2m); for(i=0; i < 1 << order ; i++) { p = page + i; page_list_del(p, &d->page_list); } unlock_page_alloc(p2m); /* Then add the first one to the appropriate populate-on-demand list */ switch(order) { case PAGE_ORDER_2M: page_list_add_tail(page, &p2m->pod.super); /* lock: page_alloc */ p2m->pod.count += 1 << order; break; case PAGE_ORDER_4K: page_list_add_tail(page, &p2m->pod.single); /* lock: page_alloc */ p2m->pod.count += 1; break; default: BUG(); } return 0; } /* Get a page of size order from the populate-on-demand cache. Will break * down 2-meg pages into singleton pages automatically. Returns null if * a superpage is requested and no superpages are available. */ static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m, unsigned int order) { struct page_info *p = NULL; int i; ASSERT(pod_locked_by_me(p2m)); if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) ) { return NULL; } else if ( order == PAGE_ORDER_4K && page_list_empty(&p2m->pod.single) ) { unsigned long mfn; struct page_info *q; BUG_ON( page_list_empty(&p2m->pod.super) ); /* Break up a superpage to make single pages. NB count doesn't * need to be adjusted. */ p = page_list_remove_head(&p2m->pod.super); mfn = mfn_x(page_to_mfn(p)); for ( i=0; ipod.single); } } switch ( order ) { case PAGE_ORDER_2M: BUG_ON( page_list_empty(&p2m->pod.super) ); p = page_list_remove_head(&p2m->pod.super); p2m->pod.count -= 1 << order; break; case PAGE_ORDER_4K: BUG_ON( page_list_empty(&p2m->pod.single) ); p = page_list_remove_head(&p2m->pod.single); p2m->pod.count -= 1; break; default: BUG(); } /* Put the pages back on the domain page_list */ lock_page_alloc(p2m); for ( i = 0 ; i < (1 << order); i++ ) { BUG_ON(page_get_owner(p + i) != p2m->domain); page_list_add_tail(p + i, &p2m->domain->page_list); } unlock_page_alloc(p2m); return p; } /* Set the size of the cache, allocating or freeing as necessary. 
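 *
 * As an illustration only (a standalone sketch, not part of this file):
 * the growth path prefers 2MiB superpage allocations while the shortfall
 * is at least one superpage, and falls back to 4KiB pages when a
 * superpage cannot be allocated.  Assuming the usual x86 values
 * (SUPERPAGE_PAGES 512, PAGE_ORDER_2M 9, PAGE_ORDER_4K 0) and a
 * hypothetical refill_order() helper, the decision reduces to:
 *
 *   #include <stdbool.h>
 *
 *   #define SUPERPAGE_PAGES 512
 *   #define PAGE_ORDER_2M     9
 *   #define PAGE_ORDER_4K     0
 *
 *   // Order of the next allocation while growing the cache: prefer a
 *   // 2MiB superpage while the shortfall covers one, otherwise (or
 *   // when no superpage is currently available) use a 4KiB page.
 *   static inline unsigned int
 *   refill_order(long target, long count, bool superpage_available)
 *   {
 *       if ( (target - count) >= SUPERPAGE_PAGES && superpage_available )
 *           return PAGE_ORDER_2M;
 *       return PAGE_ORDER_4K;
 *   }
 *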
*/ static int p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible) { struct domain *d = p2m->domain; int ret = 0; ASSERT(pod_locked_by_me(p2m)); /* Increasing the target */ while ( pod_target > p2m->pod.count ) { struct page_info * page; int order; if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES ) order = PAGE_ORDER_2M; else order = PAGE_ORDER_4K; retry: page = alloc_domheap_pages(d, order, PAGE_ORDER_4K); if ( unlikely(page == NULL) ) { if ( order == PAGE_ORDER_2M ) { /* If we can't allocate a superpage, try singleton pages */ order = PAGE_ORDER_4K; goto retry; } printk("%s: Unable to allocate page for PoD cache (target=%lu cache=%ld)\n", __func__, pod_target, p2m->pod.count); ret = -ENOMEM; goto out; } p2m_pod_cache_add(p2m, page, order); if ( hypercall_preempt_check() && preemptible ) { ret = -EAGAIN; goto out; } } /* Decreasing the target */ /* We hold the pod lock here, so we don't need to worry about * cache disappearing under our feet. */ while ( pod_target < p2m->pod.count ) { struct page_info * page; int order, i; if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES && !page_list_empty(&p2m->pod.super) ) order = PAGE_ORDER_2M; else order = PAGE_ORDER_4K; page = p2m_pod_cache_get(p2m, order); ASSERT(page != NULL); /* Then free them */ for ( i = 0 ; i < (1 << order) ; i++ ) { /* Copied from common/memory.c:guest_remove_page() */ if ( unlikely(!get_page(page+i, d)) ) { gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id); ret = -EINVAL; goto out; } if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) ) put_page_and_type(page+i); if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) ) put_page(page+i); put_page(page+i); if ( hypercall_preempt_check() && preemptible ) { ret = -EAGAIN; goto out; } } } out: return ret; } /* * The "right behavior" here requires some careful thought. First, some * definitions: * + M: static_max * + B: number of pages the balloon driver has ballooned down to. * + P: Number of populated pages. * + T: Old target * + T': New target * * The following equations should hold: * 0 <= P <= T <= B <= M * d->arch.p2m->pod.entry_count == B - P * d->tot_pages == P + d->arch.p2m->pod.count * * Now we have the following potential cases to cover: * B tot_pages - p2m->pod.count; if ( populated > 0 && p2m->pod.entry_count == 0 ) goto out; /* Don't do anything if the domain is being torn down */ if ( d->is_dying ) goto out; /* T' < B: Don't reduce the cache size; let the balloon driver * take care of it. */ if ( target < d->tot_pages ) goto out; pod_target = target - populated; /* B < T': Set the cache size equal to # of outstanding entries, * let the balloon driver fill in the rest. */ if ( populated > 0 && pod_target > p2m->pod.entry_count ) pod_target = p2m->pod.entry_count; ASSERT( pod_target >= p2m->pod.count ); ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/); out: pod_unlock(p2m); return ret; } void p2m_pod_empty_cache(struct domain *d) { struct p2m_domain *p2m = p2m_get_hostp2m(d); struct page_info *page; /* After this barrier no new PoD activities can happen. 
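 *
 * The barrier idiom used here can be read as follows: d->is_dying is
 * already set, so no new caller will start PoD work for this domain,
 * and taking then dropping the lock once waits out whoever got in
 * before the flag was set.  A simplified pthread model of that drain
 * pattern (all names below are illustrative, not Xen interfaces):
 *
 *   #include <pthread.h>
 *   #include <stdatomic.h>
 *   #include <stdbool.h>
 *
 *   static pthread_mutex_t pod_lock = PTHREAD_MUTEX_INITIALIZER;
 *   static atomic_bool dying;
 *
 *   // Producers must test `dying` after taking pod_lock and back out
 *   // if it is set; the sequence below then guarantees that, once it
 *   // returns, nobody is inside (or will enter) the critical region.
 *   static void drain_barrier(void)
 *   {
 *       atomic_store(&dying, true);     // forbid new activity
 *       pthread_mutex_lock(&pod_lock);  // wait out the current holder
 *       pthread_mutex_unlock(&pod_lock);
 *   }
 *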
*/ BUG_ON(!d->is_dying); spin_barrier(&p2m->pod.lock.lock); lock_page_alloc(p2m); while ( (page = page_list_remove_head(&p2m->pod.super)) ) { int i; for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ ) { BUG_ON(page_get_owner(page + i) != d); page_list_add_tail(page + i, &d->page_list); } p2m->pod.count -= SUPERPAGE_PAGES; } while ( (page = page_list_remove_head(&p2m->pod.single)) ) { BUG_ON(page_get_owner(page) != d); page_list_add_tail(page, &d->page_list); p2m->pod.count -= 1; } BUG_ON(p2m->pod.count != 0); unlock_page_alloc(p2m); } int p2m_pod_offline_or_broken_hit(struct page_info *p) { struct domain *d; struct p2m_domain *p2m; struct page_info *q, *tmp; unsigned long mfn, bmfn; if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) ) return 0; pod_lock(p2m); bmfn = mfn_x(page_to_mfn(p)); page_list_for_each_safe(q, tmp, &p2m->pod.super) { mfn = mfn_x(page_to_mfn(q)); if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) ) { unsigned long i; page_list_del(q, &p2m->pod.super); for ( i = 0; i < SUPERPAGE_PAGES; i++) { q = mfn_to_page(_mfn(mfn + i)); page_list_add_tail(q, &p2m->pod.single); } page_list_del(p, &p2m->pod.single); p2m->pod.count--; goto pod_hit; } } page_list_for_each_safe(q, tmp, &p2m->pod.single) { mfn = mfn_x(page_to_mfn(q)); if ( mfn == bmfn ) { page_list_del(p, &p2m->pod.single); p2m->pod.count--; goto pod_hit; } } pod_unlock(p2m); return 0; pod_hit: lock_page_alloc(p2m); /* Insertion must be at list head (see iommu_populate_page_table()). */ page_list_add(p, &d->arch.relmem_list); unlock_page_alloc(p2m); pod_unlock(p2m); return 1; } void p2m_pod_offline_or_broken_replace(struct page_info *p) { struct domain *d; struct p2m_domain *p2m; if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) ) return; free_domheap_page(p); p = alloc_domheap_page(d, PAGE_ORDER_4K); if ( unlikely(!p) ) return; pod_lock(p2m); p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K); pod_unlock(p2m); return; } static int p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn); /* This function is needed for two reasons: * + To properly handle clearing of PoD entries * + To "steal back" memory being freed for the PoD cache, rather than * releasing it. * * Once both of these functions have been completed, we can return and * allow decrease_reservation() to handle everything else. */ int p2m_pod_decrease_reservation(struct domain *d, xen_pfn_t gpfn, unsigned int order) { int ret=0; int i; struct p2m_domain *p2m = p2m_get_hostp2m(d); int steal_for_cache; int pod, nonpod, ram; gfn_lock(p2m, gpfn, order); pod_lock(p2m); /* If we don't have any outstanding PoD entries, let things take their * course */ if ( p2m->pod.entry_count == 0 ) goto out_unlock; if ( unlikely(d->is_dying) ) goto out_unlock; recount: pod = nonpod = ram = 0; /* Figure out if we need to steal some freed memory for our cache */ steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count ); /* FIXME: Add contiguous; query for PSE entries? */ for ( i=0; i<(1<get_entry(p2m, gpfn + i, &t, &a, 0, NULL); if ( t == p2m_populate_on_demand ) pod++; else { nonpod++; if ( p2m_is_ram(t) ) ram++; } } /* No populate-on-demand? Don't need to steal anything? Then we're done!*/ if(!pod && !steal_for_cache) goto out_unlock; if ( !nonpod ) { /* All PoD: Mark the whole region invalid and tell caller * we're done. */ set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access); p2m->pod.entry_count-=(1<pod.entry_count < 0); ret = 1; goto out_entry_check; } /* Try to grab entire superpages if possible. 
Since the common case is for drivers * to pass back singleton pages, see if we can take the whole page back and mark the * rest PoD. */ if ( steal_for_cache && p2m_pod_zero_check_superpage(p2m, gpfn & ~(SUPERPAGE_PAGES-1))) { /* Since order may be arbitrary, we may have taken more or less * than we were actually asked to; so just re-count from scratch */ goto recount; } /* Process as long as: * + There are PoD entries to handle, or * + There is ram left, and we want to steal it */ for ( i=0; i<(1<0 || (steal_for_cache && ram > 0)); i++) { mfn_t mfn; p2m_type_t t; p2m_access_t a; mfn = p2m->get_entry(p2m, gpfn + i, &t, &a, 0, NULL); if ( t == p2m_populate_on_demand ) { set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access); p2m->pod.entry_count--; BUG_ON(p2m->pod.entry_count < 0); pod--; } else if ( steal_for_cache && p2m_is_ram(t) ) { struct page_info *page; ASSERT(mfn_valid(mfn)); page = mfn_to_page(mfn); set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access); set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY); p2m_pod_cache_add(p2m, page, 0); steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count ); nonpod--; ram--; } } /* If there are no more non-PoD entries, tell decrease_reservation() that * there's nothing left to do. */ if ( nonpod == 0 ) ret = 1; out_entry_check: /* If we've reduced our "liabilities" beyond our "assets", free some */ if ( p2m->pod.entry_count < p2m->pod.count ) { p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/); } out_unlock: pod_unlock(p2m); gfn_unlock(p2m, gpfn, order); return ret; } void p2m_pod_dump_data(struct domain *d) { struct p2m_domain *p2m = p2m_get_hostp2m(d); printk(" PoD entries=%ld cachesize=%ld\n", p2m->pod.entry_count, p2m->pod.count); } /* Search for all-zero superpages to be reclaimed as superpages for the * PoD cache. Must be called w/ pod lock held, must lock the superpage * in the p2m */ static int p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn) { mfn_t mfn, mfn0 = _mfn(INVALID_MFN); p2m_type_t type, type0 = 0; unsigned long * map = NULL; int ret=0, reset = 0; int i, j; int max_ref = 1; struct domain *d = p2m->domain; ASSERT(pod_locked_by_me(p2m)); if ( !superpage_aligned(gfn) ) goto out; /* Allow an extra refcount for one shadow pt mapping in shadowed domains */ if ( paging_mode_shadow(d) ) max_ref++; /* NOTE: this is why we don't enforce deadlock constraints between p2m * and pod locks */ gfn_lock(p2m, gfn, SUPERPAGE_ORDER); /* Look up the mfns, checking to make sure they're the same mfn * and aligned, and mapping them. 
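 *
 * Of the conditions listed for the loop that follows, the purely
 * arithmetic ones are that the first frame is superpage aligned and
 * that frame i equals frame 0 plus i.  A standalone checker for just
 * that part (512 being SUPERPAGE_PAGES for 2MiB superpages made of
 * 4KiB pages; the helper name is illustrative):
 *
 *   #include <stdbool.h>
 *   #include <stddef.h>
 *
 *   #define SUPERPAGE_PAGES 512UL
 *
 *   // True iff mfns[0..n-1] form one aligned, contiguous 2MiB run.
 *   static bool is_contiguous_superpage(const unsigned long *mfns, size_t n)
 *   {
 *       size_t i;
 *
 *       if ( n != SUPERPAGE_PAGES || (mfns[0] & (SUPERPAGE_PAGES - 1)) )
 *           return false;
 *       for ( i = 1; i < n; i++ )
 *           if ( mfns[i] != mfns[0] + i )
 *               return false;
 *       return true;
 *   }
 *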
*/ for ( i=0; iget_entry(p2m, gfn + i, &type, &a, 0, NULL); if ( i == 0 ) { mfn0 = mfn; type0 = type; } /* Conditions that must be met for superpage-superpage: * + All gfns are ram types * + All gfns have the same type * + All of the mfns are allocated to a domain * + None of the mfns are used as pagetables, or allocated via xenheap * + The first mfn is 2-meg aligned * + All the other mfns are in sequence * Adding for good measure: * + None of the mfns are likely to be mapped elsewhere (refcount * 2 or less for shadow, 1 for hap) */ if ( !p2m_is_ram(type) || type != type0 || ( (mfn_to_page(mfn)->count_info & PGC_allocated) == 0 ) || ( (mfn_to_page(mfn)->count_info & (PGC_page_table|PGC_xen_heap)) != 0 ) || ( (mfn_to_page(mfn)->count_info & PGC_xen_heap ) != 0 ) || ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > max_ref ) || !( ( i == 0 && superpage_aligned(mfn_x(mfn0)) ) || ( i != 0 && mfn_x(mfn) == (mfn_x(mfn0) + i) ) ) ) goto out; } /* Now, do a quick check to see if it may be zero before unmapping. */ for ( i=0; idefault_access); /* Make none of the MFNs are used elsewhere... for example, mapped * via the grant table interface, or by qemu. Allow one refcount for * being allocated to the domain. */ for ( i=0; i < SUPERPAGE_PAGES; i++ ) { mfn = _mfn(mfn_x(mfn0) + i); if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 ) { reset = 1; goto out_reset; } } /* Finally, do a full zero-check */ for ( i=0; i < SUPERPAGE_PAGES; i++ ) { map = map_domain_page(mfn_x(mfn0) + i); for ( j=0; jdomain_id; t.order = 9; __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t); } /* Finally! We've passed all the checks, and can add the mfn superpage * back on the PoD cache, and account for the new p2m PoD entries */ p2m_pod_cache_add(p2m, mfn_to_page(mfn0), PAGE_ORDER_2M); p2m->pod.entry_count += SUPERPAGE_PAGES; ret = SUPERPAGE_PAGES; out_reset: if ( reset ) set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access); out: gfn_unlock(p2m, gfn, SUPERPAGE_ORDER); return ret; } static void p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count) { mfn_t mfns[count]; p2m_type_t types[count]; unsigned long * map[count]; struct domain *d = p2m->domain; int i, j; int max_ref = 1; /* Allow an extra refcount for one shadow pt mapping in shadowed domains */ if ( paging_mode_shadow(d) ) max_ref++; /* First, get the gfn list, translate to mfns, and map the pages. */ for ( i=0; iget_entry(p2m, gfns[i], types + i, &a, 0, NULL); /* If this is ram, and not a pagetable or from the xen heap, and probably not mapped elsewhere, map it; otherwise, skip. */ if ( p2m_is_ram(types[i]) && ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 ) && ( (mfn_to_page(mfns[i])->count_info & (PGC_page_table|PGC_xen_heap)) == 0 ) && ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) ) map[i] = map_domain_page(mfn_x(mfns[i])); else map[i] = NULL; } /* Then, go through and check for zeroed pages, removing write permission * for those with zeroes. */ for ( i=0; idefault_access); /* See if the page was successfully unmapped. (Allow one refcount * for being allocated to a domain.) 
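 *
 * Before a candidate page is reclaimed its guest mapping is taken
 * away, so the contents can no longer change behind the scan; the
 * full check further down then walks every word of the page.  As a
 * standalone sketch (PAGE_SIZE of 4096 assumed, helper name
 * illustrative):
 *
 *   #include <stdbool.h>
 *   #include <stddef.h>
 *
 *   #define PAGE_SIZE 4096UL
 *
 *   // True iff the mapped page contains nothing but zero bytes.
 *   static bool page_is_zero(const unsigned long *map)
 *   {
 *       size_t i, words = PAGE_SIZE / sizeof(unsigned long);
 *
 *       for ( i = 0; i < words; i++ )
 *           if ( map[i] != 0 )
 *               return false;
 *       return true;
 *   }
 *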
*/ if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 ) { unmap_domain_page(map[i]); map[i] = NULL; set_p2m_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K, types[i], p2m->default_access); continue; } } /* Now check each page for real */ for ( i=0; i < count; i++ ) { if(!map[i]) continue; for ( j=0; jdefault_access); } else { if ( tb_init_done ) { struct { u64 gfn, mfn; int d:16,order:16; } t; t.gfn = gfns[i]; t.mfn = mfn_x(mfns[i]); t.d = d->domain_id; t.order = 0; __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t); } /* Add to cache, and account for the new p2m PoD entry */ p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), PAGE_ORDER_4K); p2m->pod.entry_count++; } } } #define POD_SWEEP_LIMIT 1024 /* When populating a new superpage, look at recently populated superpages * hoping that they've been zeroed. This will snap up zeroed pages as soon as * the guest OS is done with them. */ static void p2m_pod_check_last_super(struct p2m_domain *p2m, unsigned long gfn_aligned) { unsigned long check_gfn; ASSERT(p2m->pod.last_populated_index < POD_HISTORY_MAX); check_gfn = p2m->pod.last_populated[p2m->pod.last_populated_index]; p2m->pod.last_populated[p2m->pod.last_populated_index] = gfn_aligned; p2m->pod.last_populated_index = ( p2m->pod.last_populated_index + 1 ) % POD_HISTORY_MAX; p2m_pod_zero_check_superpage(p2m, check_gfn); } #define POD_SWEEP_STRIDE 16 static void p2m_pod_emergency_sweep(struct p2m_domain *p2m) { unsigned long gfns[POD_SWEEP_STRIDE]; unsigned long i, j=0, start, limit; p2m_type_t t; if ( p2m->pod.reclaim_single == 0 ) p2m->pod.reclaim_single = p2m->pod.max_guest; start = p2m->pod.reclaim_single; limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0; /* FIXME: Figure out how to avoid superpages */ /* NOTE: Promote to globally locking the p2m. This will get complicated * in a fine-grained scenario. If we lock each gfn individually we must be * careful about spinlock recursion limits and POD_SWEEP_STRIDE. */ p2m_lock(p2m); for ( i=p2m->pod.reclaim_single; i > 0 ; i-- ) { p2m_access_t a; (void)p2m->get_entry(p2m, i, &t, &a, 0, NULL); if ( p2m_is_ram(t) ) { gfns[j] = i; j++; BUG_ON(j > POD_SWEEP_STRIDE); if ( j == POD_SWEEP_STRIDE ) { p2m_pod_zero_check(p2m, gfns, j); j = 0; } } /* Stop if we're past our limit and we have found *something*. * * NB that this is a zero-sum game; we're increasing our cache size * by re-increasing our 'debt'. Since we hold the pod lock, * (entry_count - count) must remain the same. */ if ( p2m->pod.count > 0 && i < limit ) break; } if ( j ) p2m_pod_zero_check(p2m, gfns, j); p2m_unlock(p2m); p2m->pod.reclaim_single = i ? i - 1 : i; } int p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn, unsigned int order, p2m_query_t q) { struct domain *d = p2m->domain; struct page_info *p = NULL; /* Compiler warnings */ unsigned long gfn_aligned; mfn_t mfn; int i; ASSERT(gfn_locked_by_me(p2m, gfn)); pod_lock(p2m); /* This check is done with the pod lock held. This will make sure that * even if d->is_dying changes under our feet, p2m_pod_empty_cache() * won't start until we're done. */ if ( unlikely(d->is_dying) ) goto out_fail; /* Because PoD does not have cache list for 1GB pages, it has to remap * 1GB region to 2MB chunks for a retry. */ if ( order == PAGE_ORDER_1G ) { pod_unlock(p2m); gfn_aligned = (gfn >> order) << order; /* Note that we are supposed to call set_p2m_entry() 512 times to * split 1GB into 512 2MB pages here. 
But We only do once here because * set_p2m_entry() should automatically shatter the 1GB page into * 512 2MB pages. The rest of 511 calls are unnecessary. * * NOTE: In a fine-grained p2m locking scenario this operation * may need to promote its locking from gfn->1g superpage */ set_p2m_entry(p2m, gfn_aligned, _mfn(0), PAGE_ORDER_2M, p2m_populate_on_demand, p2m->default_access); return 0; } /* Only sweep if we're actually out of memory. Doing anything else * causes unnecessary time and fragmentation of superpages in the p2m. */ if ( p2m->pod.count == 0 ) p2m_pod_emergency_sweep(p2m); /* If the sweep failed, give up. */ if ( p2m->pod.count == 0 ) goto out_of_memory; /* Keep track of the highest gfn demand-populated by a guest fault */ if ( gfn > p2m->pod.max_guest ) p2m->pod.max_guest = gfn; /* Get a page f/ the cache. A NULL return value indicates that the * 2-meg range should be marked singleton PoD, and retried */ if ( (p = p2m_pod_cache_get(p2m, order)) == NULL ) goto remap_and_retry; mfn = page_to_mfn(p); BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0); gfn_aligned = (gfn >> order) << order; set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, p2m->default_access); for( i = 0; i < (1UL << order); i++ ) { set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i); paging_mark_dirty(d, mfn_x(mfn) + i); } p2m->pod.entry_count -= (1 << order); BUG_ON(p2m->pod.entry_count < 0); if ( tb_init_done ) { struct { u64 gfn, mfn; int d:16,order:16; } t; t.gfn = gfn; t.mfn = mfn_x(mfn); t.d = d->domain_id; t.order = order; __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t); } /* Check the last guest demand-populate */ if ( p2m->pod.entry_count > p2m->pod.count && (order == PAGE_ORDER_2M) && (q & P2M_ALLOC) ) p2m_pod_check_last_super(p2m, gfn_aligned); pod_unlock(p2m); return 0; out_of_memory: pod_unlock(p2m); printk("%s: Dom%d out of PoD memory! 
(tot=%"PRIu32" ents=%ld dom%d)\n", __func__, d->domain_id, d->tot_pages, p2m->pod.entry_count, current->domain->domain_id); domain_crash(d); return -1; out_fail: pod_unlock(p2m); return -1; remap_and_retry: BUG_ON(order != PAGE_ORDER_2M); pod_unlock(p2m); /* Remap this 2-meg region in singleton chunks */ /* NOTE: In a p2m fine-grained lock scenario this might * need promoting the gfn lock from gfn->2M superpage */ gfn_aligned = (gfn>>order)<default_access); if ( tb_init_done ) { struct { u64 gfn; int d:16; } t; t.gfn = gfn; t.d = d->domain_id; __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t); } return 0; } int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn, unsigned int order) { struct p2m_domain *p2m = p2m_get_hostp2m(d); unsigned long i, pod_count = 0; p2m_type_t ot; mfn_t omfn; int rc = 0; if ( !paging_mode_translate(d) ) return -EINVAL; gfn_lock(p2m, gfn, order); P2M_DEBUG("mark pod gfn=%#lx\n", gfn); /* Make sure all gpfns are unused */ for ( i = 0; i < (1UL << order); i++ ) { p2m_access_t a; omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL); if ( p2m_is_ram(ot) ) { P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot); rc = -EBUSY; goto out; } else if ( ot == p2m_populate_on_demand ) { /* Count how man PoD entries we'll be replacing if successful */ pod_count++; } } /* Now, actually do the two-way mapping */ if ( !set_p2m_entry(p2m, gfn, _mfn(0), order, p2m_populate_on_demand, p2m->default_access) ) rc = -EINVAL; else { pod_lock(p2m); p2m->pod.entry_count += 1 << order; p2m->pod.entry_count -= pod_count; BUG_ON(p2m->pod.entry_count < 0); pod_unlock(p2m); } out: gfn_unlock(p2m, gfn, order); return rc; } xen-4.4.0/xen/arch/x86/mm/mem_sharing.c0000664000175000017500000012641312307313555015654 0ustar smbsmb/****************************************************************************** * arch/x86/mm/mem_sharing.c * * Memory sharing support. * * Copyright (c) 2011 GridCentric, Inc. (Adin Scannell & Andres Lagar-Cavilla) * Copyright (c) 2009 Citrix Systems, Inc. (Grzegorz Milos) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mm-locks.h" static shr_handle_t next_handle = 1; typedef struct pg_lock_data { int mm_unlock_level; unsigned short recurse_count; } pg_lock_data_t; DEFINE_PER_CPU(pg_lock_data_t, __pld); #define MEM_SHARING_DEBUG(_f, _a...) \ debugtrace_printk("mem_sharing_debug: %s(): " _f, __func__, ##_a) /* Reverse map defines */ #define RMAP_HASHTAB_ORDER 0 #define RMAP_HASHTAB_SIZE \ ((PAGE_SIZE << RMAP_HASHTAB_ORDER) / sizeof(struct list_head)) #define RMAP_USES_HASHTAB(page) \ ((page)->sharing->hash_table.flag == NULL) #define RMAP_HEAVY_SHARED_PAGE RMAP_HASHTAB_SIZE /* A bit of hysteresis. We don't want to be mutating between list and hash * table constantly. 
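 *
 * The two thresholds are deliberately far apart: the list is promoted
 * to a hash table only once the sharing count reaches
 * RMAP_HEAVY_SHARED_PAGE, and demoted again only after it falls to a
 * quarter of that (RMAP_LIGHT_SHARED_PAGE, defined below).  A minimal
 * standalone model of the hysteresis (HEAVY is a placeholder standing
 * in for RMAP_HASHTAB_SIZE):
 *
 *   #include <stdbool.h>
 *
 *   #define HEAVY 256UL
 *   #define LIGHT (HEAVY >> 2)
 *
 *   // Representation to use after `count` changes; it only flips
 *   // when a threshold is actually crossed, never in between.
 *   static bool use_hash_table(bool currently_hash, unsigned long count)
 *   {
 *       if ( !currently_hash && count >= HEAVY )
 *           return true;           // promote: list to hash table
 *       if ( currently_hash && count <= LIGHT )
 *           return false;          // demote: hash table to list
 *       return currently_hash;     // inside the hysteresis band
 *   }
 *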
*/ #define RMAP_LIGHT_SHARED_PAGE (RMAP_HEAVY_SHARED_PAGE >> 2) #if MEM_SHARING_AUDIT static struct list_head shr_audit_list; static spinlock_t shr_audit_lock; DEFINE_RCU_READ_LOCK(shr_audit_read_lock); /* RCU delayed free of audit list entry */ static void _free_pg_shared_info(struct rcu_head *head) { xfree(container_of(head, struct page_sharing_info, rcu_head)); } static inline void audit_add_list(struct page_info *page) { INIT_LIST_HEAD(&page->sharing->entry); spin_lock(&shr_audit_lock); list_add_rcu(&page->sharing->entry, &shr_audit_list); spin_unlock(&shr_audit_lock); } /* Removes from the audit list and cleans up the page sharing metadata. */ static inline void page_sharing_dispose(struct page_info *page) { /* Unlikely given our thresholds, but we should be careful. */ if ( unlikely(RMAP_USES_HASHTAB(page)) ) free_xenheap_pages(page->sharing->hash_table.bucket, RMAP_HASHTAB_ORDER); spin_lock(&shr_audit_lock); list_del_rcu(&page->sharing->entry); spin_unlock(&shr_audit_lock); INIT_RCU_HEAD(&page->sharing->rcu_head); call_rcu(&page->sharing->rcu_head, _free_pg_shared_info); } #else int mem_sharing_audit(void) { return -ENOSYS; } #define audit_add_list(p) ((void)0) static inline void page_sharing_dispose(struct page_info *page) { /* Unlikely given our thresholds, but we should be careful. */ if ( unlikely(RMAP_USES_HASHTAB(page)) ) free_xenheap_pages(page->sharing->hash_table.bucket, RMAP_HASHTAB_ORDER); xfree(page->sharing); } #endif /* MEM_SHARING_AUDIT */ static inline int mem_sharing_page_lock(struct page_info *pg) { int rc; pg_lock_data_t *pld = &(this_cpu(__pld)); page_sharing_mm_pre_lock(); rc = page_lock(pg); if ( rc ) { preempt_disable(); page_sharing_mm_post_lock(&pld->mm_unlock_level, &pld->recurse_count); } return rc; } static inline void mem_sharing_page_unlock(struct page_info *pg) { pg_lock_data_t *pld = &(this_cpu(__pld)); page_sharing_mm_unlock(pld->mm_unlock_level, &pld->recurse_count); preempt_enable(); page_unlock(pg); } static inline shr_handle_t get_next_handle(void) { /* Get the next handle get_page style */ uint64_t x, y = next_handle; do { x = y; } while ( (y = cmpxchg(&next_handle, x, x + 1)) != x ); return x + 1; } #define mem_sharing_enabled(d) \ (is_hvm_domain(d) && (d)->arch.hvm_domain.mem_sharing_enabled) #undef mfn_to_page #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m)) #undef mfn_valid #define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) #undef page_to_mfn #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) static atomic_t nr_saved_mfns = ATOMIC_INIT(0); static atomic_t nr_shared_mfns = ATOMIC_INIT(0); /** Reverse map **/ /* Every shared frame keeps a reverse map (rmap) of tuples that * this shared frame backs. For pages with a low degree of sharing, a O(n) * search linked list is good enough. For pages with higher degree of sharing, * we use a hash table instead. */ typedef struct gfn_info { unsigned long gfn; domid_t domain; struct list_head list; } gfn_info_t; static inline void rmap_init(struct page_info *page) { /* We always start off as a doubly linked list. */ INIT_LIST_HEAD(&page->sharing->gfns); } /* Exceedingly simple "hash function" */ #define HASH(domain, gfn) \ (((gfn) + (domain)) % RMAP_HASHTAB_SIZE) /* Conversions. Tuned by the thresholds. 
Should only happen twice * (once each) during the lifetime of a shared page */ static inline int rmap_list_to_hash_table(struct page_info *page) { unsigned int i; struct list_head *pos, *tmp, *b = alloc_xenheap_pages(RMAP_HASHTAB_ORDER, 0); if ( b == NULL ) return -ENOMEM; for ( i = 0; i < RMAP_HASHTAB_SIZE; i++ ) INIT_LIST_HEAD(b + i); list_for_each_safe(pos, tmp, &page->sharing->gfns) { gfn_info_t *gfn_info = list_entry(pos, gfn_info_t, list); struct list_head *bucket = b + HASH(gfn_info->domain, gfn_info->gfn); list_del(pos); list_add(pos, bucket); } page->sharing->hash_table.bucket = b; page->sharing->hash_table.flag = NULL; return 0; } static inline void rmap_hash_table_to_list(struct page_info *page) { unsigned int i; struct list_head *bucket = page->sharing->hash_table.bucket; INIT_LIST_HEAD(&page->sharing->gfns); for ( i = 0; i < RMAP_HASHTAB_SIZE; i++ ) { struct list_head *pos, *tmp, *head = bucket + i; list_for_each_safe(pos, tmp, head) { list_del(pos); list_add(pos, &page->sharing->gfns); } } free_xenheap_pages(bucket, RMAP_HASHTAB_ORDER); } /* Generic accessors to the rmap */ static inline unsigned long rmap_count(struct page_info *pg) { unsigned long count; unsigned long t = read_atomic(&pg->u.inuse.type_info); count = t & PGT_count_mask; if ( t & PGT_locked ) count--; return count; } /* The page type count is always decreased after removing from the rmap. * Use a convert flag to avoid mutating the rmap if in the middle of an * iterator, or if the page will be soon destroyed anyways. */ static inline void rmap_del(gfn_info_t *gfn_info, struct page_info *page, int convert) { if ( RMAP_USES_HASHTAB(page) && convert && (rmap_count(page) <= RMAP_LIGHT_SHARED_PAGE) ) rmap_hash_table_to_list(page); /* Regardless of rmap type, same removal operation */ list_del(&gfn_info->list); } /* The page type count is always increased before adding to the rmap. */ static inline void rmap_add(gfn_info_t *gfn_info, struct page_info *page) { struct list_head *head; if ( !RMAP_USES_HASHTAB(page) && (rmap_count(page) >= RMAP_HEAVY_SHARED_PAGE) ) /* The conversion may fail with ENOMEM. We'll be less efficient, * but no reason to panic. */ (void)rmap_list_to_hash_table(page); head = (RMAP_USES_HASHTAB(page)) ? page->sharing->hash_table.bucket + HASH(gfn_info->domain, gfn_info->gfn) : &page->sharing->gfns; INIT_LIST_HEAD(&gfn_info->list); list_add(&gfn_info->list, head); } static inline gfn_info_t * rmap_retrieve(uint16_t domain_id, unsigned long gfn, struct page_info *page) { gfn_info_t *gfn_info; struct list_head *le, *head; head = (RMAP_USES_HASHTAB(page)) ? page->sharing->hash_table.bucket + HASH(domain_id, gfn) : &page->sharing->gfns; list_for_each(le, head) { gfn_info = list_entry(le, gfn_info_t, list); if ( (gfn_info->gfn == gfn) && (gfn_info->domain == domain_id) ) return gfn_info; } /* Nothing was found */ return NULL; } /* Returns true if the rmap has only one entry. O(1) complexity. */ static inline int rmap_has_one_entry(struct page_info *page) { return (rmap_count(page) == 1); } /* Returns true if the rmap has any entries. O(1) complexity. */ static inline int rmap_has_entries(struct page_info *page) { return (rmap_count(page) != 0); } /* The iterator hides the details of how the rmap is implemented. This * involves splitting the list_for_each_safe macro into two steps. 
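 *
 * Caching the successor before handing out the current entry is what
 * makes the iterator safe against rmap_del() of the entry being
 * visited, and carrying a bucket index lets one iterator cover both
 * the plain list and the hash table.  The same pattern in a
 * self-contained form, over an array of singly linked buckets (all
 * types and names below are illustrative):
 *
 *   #include <stddef.h>
 *
 *   struct node { struct node *next; };
 *
 *   struct iter {
 *       struct node **buckets;
 *       size_t nr_buckets, bucket;
 *       struct node *next;
 *   };
 *
 *   static void iter_seed(struct iter *it, struct node **b, size_t n)
 *   {
 *       it->buckets = b;
 *       it->nr_buckets = n;
 *       it->bucket = 0;
 *       it->next = n ? b[0] : NULL;
 *   }
 *
 *   // Returns the next entry, or NULL once every bucket is exhausted.
 *   // The caller may unlink or free the returned node before the next
 *   // call, because its successor has already been captured.
 *   static struct node *iter_next(struct iter *it)
 *   {
 *       struct node *curr;
 *
 *       while ( it->next == NULL )
 *       {
 *           if ( ++it->bucket >= it->nr_buckets )
 *               return NULL;
 *           it->next = it->buckets[it->bucket];
 *       }
 *       curr = it->next;
 *       it->next = curr->next;
 *       return curr;
 *   }
 *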
*/ struct rmap_iterator { struct list_head *curr; struct list_head *next; unsigned int bucket; }; static inline void rmap_seed_iterator(struct page_info *page, struct rmap_iterator *ri) { ri->curr = (RMAP_USES_HASHTAB(page)) ? page->sharing->hash_table.bucket : &page->sharing->gfns; ri->next = ri->curr->next; ri->bucket = 0; } static inline gfn_info_t * rmap_iterate(struct page_info *page, struct rmap_iterator *ri) { struct list_head *head = (RMAP_USES_HASHTAB(page)) ? page->sharing->hash_table.bucket + ri->bucket : &page->sharing->gfns; retry: if ( ri->next == head) { if ( RMAP_USES_HASHTAB(page) ) { ri->bucket++; if ( ri->bucket >= RMAP_HASHTAB_SIZE ) /* No more hash table buckets */ return NULL; head = page->sharing->hash_table.bucket + ri->bucket; ri->curr = head; ri->next = ri->curr->next; goto retry; } else /* List exhausted */ return NULL; } ri->curr = ri->next; ri->next = ri->curr->next; return list_entry(ri->curr, gfn_info_t, list); } static inline gfn_info_t *mem_sharing_gfn_alloc(struct page_info *page, struct domain *d, unsigned long gfn) { gfn_info_t *gfn_info = xmalloc(gfn_info_t); if ( gfn_info == NULL ) return NULL; gfn_info->gfn = gfn; gfn_info->domain = d->domain_id; rmap_add(gfn_info, page); /* Increment our number of shared pges. */ atomic_inc(&d->shr_pages); return gfn_info; } static inline void mem_sharing_gfn_destroy(struct page_info *page, struct domain *d, gfn_info_t *gfn_info) { /* Decrement the number of pages. */ atomic_dec(&d->shr_pages); /* Free the gfn_info structure. */ rmap_del(gfn_info, page, 1); xfree(gfn_info); } static struct page_info* mem_sharing_lookup(unsigned long mfn) { if ( mfn_valid(_mfn(mfn)) ) { struct page_info* page = mfn_to_page(_mfn(mfn)); if ( page_get_owner(page) == dom_cow ) { /* Count has to be at least two, because we're called * with the mfn locked (1) and this is supposed to be * a shared page (1). */ unsigned long t = read_atomic(&page->u.inuse.type_info); ASSERT((t & PGT_type_mask) == PGT_shared_page); ASSERT((t & PGT_count_mask) >= 2); ASSERT(get_gpfn_from_mfn(mfn) == SHARED_M2P_ENTRY); return page; } } return NULL; } #if MEM_SHARING_AUDIT int mem_sharing_audit(void) { int errors = 0; unsigned long count_expected; unsigned long count_found = 0; struct list_head *ae; count_expected = atomic_read(&nr_shared_mfns); rcu_read_lock(&shr_audit_read_lock); list_for_each_rcu(ae, &shr_audit_list) { struct page_sharing_info *pg_shared_info; unsigned long nr_gfns = 0; struct page_info *pg; mfn_t mfn; gfn_info_t *g; struct rmap_iterator ri; pg_shared_info = list_entry(ae, struct page_sharing_info, entry); pg = pg_shared_info->pg; mfn = page_to_mfn(pg); /* If we can't lock it, it's definitely not a shared page */ if ( !mem_sharing_page_lock(pg) ) { MEM_SHARING_DEBUG("mfn %lx in audit list, but cannot be locked (%lx)!\n", mfn_x(mfn), pg->u.inuse.type_info); errors++; continue; } /* Check if the MFN has correct type, owner and handle. */ if ( !(pg->u.inuse.type_info & PGT_shared_page) ) { MEM_SHARING_DEBUG("mfn %lx in audit list, but not PGT_shared_page (%lx)!\n", mfn_x(mfn), pg->u.inuse.type_info & PGT_type_mask); errors++; continue; } /* Check the page owner. 
*/ if ( page_get_owner(pg) != dom_cow ) { MEM_SHARING_DEBUG("mfn %lx shared, but wrong owner (%hu)!\n", mfn_x(mfn), page_get_owner(pg)->domain_id); errors++; } /* Check the m2p entry */ if ( get_gpfn_from_mfn(mfn_x(mfn)) != SHARED_M2P_ENTRY ) { MEM_SHARING_DEBUG("mfn %lx shared, but wrong m2p entry (%lx)!\n", mfn_x(mfn), get_gpfn_from_mfn(mfn_x(mfn))); errors++; } /* Check we have a list */ if ( (!pg->sharing) || !rmap_has_entries(pg) ) { MEM_SHARING_DEBUG("mfn %lx shared, but empty gfn list!\n", mfn_x(mfn)); errors++; continue; } /* We've found a page that is shared */ count_found++; /* Check if all GFNs map to the MFN, and the p2m types */ rmap_seed_iterator(pg, &ri); while ( (g = rmap_iterate(pg, &ri)) != NULL ) { struct domain *d; p2m_type_t t; mfn_t o_mfn; d = get_domain_by_id(g->domain); if ( d == NULL ) { MEM_SHARING_DEBUG("Unknown dom: %hu, for PFN=%lx, MFN=%lx\n", g->domain, g->gfn, mfn_x(mfn)); errors++; continue; } o_mfn = get_gfn_query_unlocked(d, g->gfn, &t); if ( mfn_x(o_mfn) != mfn_x(mfn) ) { MEM_SHARING_DEBUG("Incorrect P2M for d=%hu, PFN=%lx." "Expecting MFN=%lx, got %lx\n", g->domain, g->gfn, mfn_x(mfn), mfn_x(o_mfn)); errors++; } if ( t != p2m_ram_shared ) { MEM_SHARING_DEBUG("Incorrect P2M type for d=%hu, PFN=%lx MFN=%lx." "Expecting t=%d, got %d\n", g->domain, g->gfn, mfn_x(mfn), p2m_ram_shared, t); errors++; } put_domain(d); nr_gfns++; } /* The type count has an extra ref because we have locked the page */ if ( (nr_gfns + 1) != (pg->u.inuse.type_info & PGT_count_mask) ) { MEM_SHARING_DEBUG("Mismatched counts for MFN=%lx." "nr_gfns in list %lu, in type_info %lx\n", mfn_x(mfn), nr_gfns, (pg->u.inuse.type_info & PGT_count_mask)); errors++; } mem_sharing_page_unlock(pg); } rcu_read_unlock(&shr_audit_read_lock); if ( count_found != count_expected ) { MEM_SHARING_DEBUG("Expected %ld shared mfns, found %ld.", count_expected, count_found); errors++; } return errors; } #endif int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn, bool_t allow_sleep) { struct vcpu *v = current; int rc; mem_event_request_t req = { .gfn = gfn }; if ( (rc = __mem_event_claim_slot(d, &d->mem_event->share, allow_sleep)) < 0 ) return rc; if ( v->domain == d ) { req.flags = MEM_EVENT_FLAG_VCPU_PAUSED; vcpu_pause_nosync(v); } req.p2mt = p2m_ram_shared; req.vcpu_id = v->vcpu_id; mem_event_put_request(d, &d->mem_event->share, &req); return 0; } unsigned int mem_sharing_get_nr_saved_mfns(void) { return ((unsigned int)atomic_read(&nr_saved_mfns)); } unsigned int mem_sharing_get_nr_shared_mfns(void) { return (unsigned int)atomic_read(&nr_shared_mfns); } int mem_sharing_sharing_resume(struct domain *d) { mem_event_response_t rsp; /* Get all requests off the ring */ while ( mem_event_get_response(d, &d->mem_event->share, &rsp) ) { if ( rsp.flags & MEM_EVENT_FLAG_DUMMY ) continue; /* Unpause domain/vcpu */ if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) vcpu_unpause(d->vcpu[rsp.vcpu_id]); } return 0; } /* Functions that change a page's type and ownership */ static int page_make_sharable(struct domain *d, struct page_info *page, int expected_refcnt) { bool_t drop_dom_ref; spin_lock(&d->page_alloc_lock); if ( d->is_dying ) { spin_unlock(&d->page_alloc_lock); return -EBUSY; } /* Change page type and count atomically */ if ( !get_page_and_type(page, d, PGT_shared_page) ) { spin_unlock(&d->page_alloc_lock); return -EINVAL; } /* Check it wasn't already sharable and undo if it was */ if ( (page->u.inuse.type_info & PGT_count_mask) != 1 ) { spin_unlock(&d->page_alloc_lock); put_page_and_type(page); return 
-EEXIST; } /* Check if the ref count is 2. The first from PGC_allocated, and * the second from get_page_and_type at the top of this function */ if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) ) { spin_unlock(&d->page_alloc_lock); /* Return type count back to zero */ put_page_and_type(page); return -E2BIG; } page_set_owner(page, dom_cow); drop_dom_ref = !domain_adjust_tot_pages(d, -1); page_list_del(page, &d->page_list); spin_unlock(&d->page_alloc_lock); if ( drop_dom_ref ) put_domain(d); return 0; } static int page_make_private(struct domain *d, struct page_info *page) { unsigned long expected_type; if ( !get_page(page, dom_cow) ) return -EINVAL; spin_lock(&d->page_alloc_lock); if ( d->is_dying ) { spin_unlock(&d->page_alloc_lock); put_page(page); return -EBUSY; } /* We can only change the type if count is one */ /* Because we are locking pages individually, we need to drop * the lock here, while the page is typed. We cannot risk the * race of page_unlock and then put_page_type. */ expected_type = (PGT_shared_page | PGT_validated | PGT_locked | 2); if ( page->u.inuse.type_info != expected_type ) { spin_unlock(&d->page_alloc_lock); put_page(page); return -EEXIST; } /* Drop the final typecount */ put_page_and_type(page); /* Now that we've dropped the type, we can unlock */ mem_sharing_page_unlock(page); /* Change the owner */ ASSERT(page_get_owner(page) == dom_cow); page_set_owner(page, d); if ( domain_adjust_tot_pages(d, 1) == 1 ) get_knownalive_domain(d); page_list_add_tail(page, &d->page_list); spin_unlock(&d->page_alloc_lock); put_page(page); return 0; } static inline struct page_info *__grab_shared_page(mfn_t mfn) { struct page_info *pg = NULL; if ( !mfn_valid(mfn) ) return NULL; pg = mfn_to_page(mfn); /* If the page is not validated we can't lock it, and if it's * not validated it's obviously not shared. 
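 *
 * The shape of __grab_shared_page() is the usual lock-then-revalidate
 * idiom: take the per-page lock, repeat the check that motivated
 * taking it, and back out if the answer changed in the meantime.
 * Schematically (placeholder types and helpers, not the hypervisor's):
 *
 *   #include <pthread.h>
 *   #include <stdbool.h>
 *
 *   struct record {
 *       pthread_mutex_t lock;
 *       bool shared;              // may change while the lock is free
 *   };
 *
 *   // Returns the record locked and still shared, or NULL.
 *   static struct record *grab_if_shared(struct record *r)
 *   {
 *       if ( pthread_mutex_trylock(&r->lock) != 0 )
 *           return NULL;          // cannot lock it: not usable as shared
 *       if ( !r->shared )
 *       {
 *           pthread_mutex_unlock(&r->lock);
 *           return NULL;          // state changed before we got the lock
 *       }
 *       return r;                 // caller unlocks when done
 *   }
 *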
*/ if ( !mem_sharing_page_lock(pg) ) return NULL; if ( mem_sharing_lookup(mfn_x(mfn)) == NULL ) { mem_sharing_page_unlock(pg); return NULL; } return pg; } int mem_sharing_debug_mfn(mfn_t mfn) { struct page_info *page; int num_refs; if ( (page = __grab_shared_page(mfn)) == NULL) { gdprintk(XENLOG_ERR, "Invalid MFN=%lx\n", mfn_x(mfn)); return -EINVAL; } MEM_SHARING_DEBUG( "Debug page: MFN=%lx is ci=%lx, ti=%lx, owner_id=%d\n", mfn_x(page_to_mfn(page)), page->count_info, page->u.inuse.type_info, page_get_owner(page)->domain_id); /* -1 because the page is locked and that's an additional type ref */ num_refs = ((int) (page->u.inuse.type_info & PGT_count_mask)) - 1; mem_sharing_page_unlock(page); return num_refs; } int mem_sharing_debug_gfn(struct domain *d, unsigned long gfn) { p2m_type_t p2mt; mfn_t mfn; int num_refs; mfn = get_gfn_query(d, gfn, &p2mt); MEM_SHARING_DEBUG("Debug for domain=%d, gfn=%lx, ", d->domain_id, gfn); num_refs = mem_sharing_debug_mfn(mfn); put_gfn(d, gfn); return num_refs; } #define SHGNT_PER_PAGE_V1 (PAGE_SIZE / sizeof(grant_entry_v1_t)) #define shared_entry_v1(t, e) \ ((t)->shared_v1[(e)/SHGNT_PER_PAGE_V1][(e)%SHGNT_PER_PAGE_V1]) #define SHGNT_PER_PAGE_V2 (PAGE_SIZE / sizeof(grant_entry_v2_t)) #define shared_entry_v2(t, e) \ ((t)->shared_v2[(e)/SHGNT_PER_PAGE_V2][(e)%SHGNT_PER_PAGE_V2]) #define STGNT_PER_PAGE (PAGE_SIZE / sizeof(grant_status_t)) #define status_entry(t, e) \ ((t)->status[(e)/STGNT_PER_PAGE][(e)%STGNT_PER_PAGE]) static grant_entry_header_t * shared_entry_header(struct grant_table *t, grant_ref_t ref) { ASSERT (t->gt_version != 0); if ( t->gt_version == 1 ) return (grant_entry_header_t*)&shared_entry_v1(t, ref); else return &shared_entry_v2(t, ref).hdr; } static int mem_sharing_gref_to_gfn(struct domain *d, grant_ref_t ref, unsigned long *gfn) { if ( d->grant_table->gt_version < 1 ) return -1; if ( d->grant_table->gt_version == 1 ) { grant_entry_v1_t *sha1; sha1 = &shared_entry_v1(d->grant_table, ref); *gfn = sha1->frame; } else { grant_entry_v2_t *sha2; sha2 = &shared_entry_v2(d->grant_table, ref); *gfn = sha2->full_page.frame; } return 0; } int mem_sharing_debug_gref(struct domain *d, grant_ref_t ref) { grant_entry_header_t *shah; uint16_t status; unsigned long gfn; if ( d->grant_table->gt_version < 1 ) { MEM_SHARING_DEBUG( "Asked to debug [dom=%d,gref=%d], but not yet inited.\n", d->domain_id, ref); return -EINVAL; } (void)mem_sharing_gref_to_gfn(d, ref, &gfn); shah = shared_entry_header(d->grant_table, ref); if ( d->grant_table->gt_version == 1 ) status = shah->flags; else status = status_entry(d->grant_table, ref); MEM_SHARING_DEBUG( "==> Grant [dom=%d,ref=%d], status=%x. ", d->domain_id, ref, status); return mem_sharing_debug_gfn(d, gfn); } int mem_sharing_nominate_page(struct domain *d, unsigned long gfn, int expected_refcnt, shr_handle_t *phandle) { p2m_type_t p2mt; mfn_t mfn; struct page_info *page = NULL; /* gcc... 
*/ int ret; struct gfn_info *gfn_info; *phandle = 0UL; mfn = get_gfn(d, gfn, &p2mt); /* Check if mfn is valid */ ret = -EINVAL; if ( !mfn_valid(mfn) ) goto out; /* Return the handle if the page is already shared */ if ( p2m_is_shared(p2mt) ) { struct page_info *pg = __grab_shared_page(mfn); if ( !pg ) { gdprintk(XENLOG_ERR, "Shared p2m entry gfn %lx, but could not " "grab page %lx dom %d\n", gfn, mfn_x(mfn), d->domain_id); BUG(); } *phandle = pg->sharing->handle; ret = 0; mem_sharing_page_unlock(pg); goto out; } /* Check p2m type */ if ( !p2m_is_sharable(p2mt) ) goto out; /* Try to convert the mfn to the sharable type */ page = mfn_to_page(mfn); ret = page_make_sharable(d, page, expected_refcnt); if ( ret ) goto out; /* Now that the page is validated, we can lock it. There is no * race because we're holding the p2m entry, so no one else * could be nominating this gfn */ ret = -ENOENT; if ( !mem_sharing_page_lock(page) ) goto out; /* Initialize the shared state */ ret = -ENOMEM; if ( (page->sharing = xmalloc(struct page_sharing_info)) == NULL ) { /* Making a page private atomically unlocks it */ BUG_ON(page_make_private(d, page) != 0); goto out; } page->sharing->pg = page; rmap_init(page); /* Create the handle */ page->sharing->handle = get_next_handle(); /* Create the local gfn info */ if ( (gfn_info = mem_sharing_gfn_alloc(page, d, gfn)) == NULL ) { xfree(page->sharing); page->sharing = NULL; BUG_ON(page_make_private(d, page) != 0); goto out; } /* Change the p2m type, should never fail with p2m locked. */ BUG_ON(p2m_change_type(d, gfn, p2mt, p2m_ram_shared) != p2mt); /* Account for this page. */ atomic_inc(&nr_shared_mfns); /* Update m2p entry to SHARED_M2P_ENTRY */ set_gpfn_from_mfn(mfn_x(mfn), SHARED_M2P_ENTRY); *phandle = page->sharing->handle; audit_add_list(page); mem_sharing_page_unlock(page); ret = 0; out: put_gfn(d, gfn); return ret; } int mem_sharing_share_pages(struct domain *sd, unsigned long sgfn, shr_handle_t sh, struct domain *cd, unsigned long cgfn, shr_handle_t ch) { struct page_info *spage, *cpage, *firstpg, *secondpg; gfn_info_t *gfn; struct domain *d; int ret = -EINVAL; mfn_t smfn, cmfn; p2m_type_t smfn_type, cmfn_type; struct two_gfns tg; struct rmap_iterator ri; get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn, cd, cgfn, &cmfn_type, NULL, &cmfn, 0, &tg); /* This tricky business is to avoid two callers deadlocking if * grabbing pages in opposite client/source order */ if( mfn_x(smfn) == mfn_x(cmfn) ) { /* The pages are already the same. We could return some * kind of error here, but no matter how you look at it, * the pages are already 'shared'. It possibly represents * a big problem somewhere else, but as far as sharing is * concerned: great success! 
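 *
 * The ordered locking that follows (always grab the lower numbered
 * mfn first, whichever of source and client it happens to be) is the
 * standard way to let two concurrent callers share the same pair of
 * pages in opposite directions without deadlocking.  A minimal
 * standalone rendering of the rule (pthread based, names illustrative):
 *
 *   #include <pthread.h>
 *
 *   struct frame {
 *       unsigned long mfn;
 *       pthread_mutex_t lock;
 *   };
 *
 *   // Lock two frames in a global order (ascending mfn) so callers
 *   // passing the same pair in opposite roles cannot deadlock.
 *   static void lock_pair(struct frame *a, struct frame *b)
 *   {
 *       struct frame *first  = (a->mfn < b->mfn) ? a : b;
 *       struct frame *second = (a->mfn < b->mfn) ? b : a;
 *
 *       pthread_mutex_lock(&first->lock);
 *       if ( second != first )    // caller may pass the same frame twice
 *           pthread_mutex_lock(&second->lock);
 *   }
 *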
*/ ret = 0; goto err_out; } else if ( mfn_x(smfn) < mfn_x(cmfn) ) { ret = XENMEM_SHARING_OP_S_HANDLE_INVALID; spage = firstpg = __grab_shared_page(smfn); if ( spage == NULL ) goto err_out; ret = XENMEM_SHARING_OP_C_HANDLE_INVALID; cpage = secondpg = __grab_shared_page(cmfn); if ( cpage == NULL ) { mem_sharing_page_unlock(spage); goto err_out; } } else { ret = XENMEM_SHARING_OP_C_HANDLE_INVALID; cpage = firstpg = __grab_shared_page(cmfn); if ( cpage == NULL ) goto err_out; ret = XENMEM_SHARING_OP_S_HANDLE_INVALID; spage = secondpg = __grab_shared_page(smfn); if ( spage == NULL ) { mem_sharing_page_unlock(cpage); goto err_out; } } ASSERT(smfn_type == p2m_ram_shared); ASSERT(cmfn_type == p2m_ram_shared); /* Check that the handles match */ if ( spage->sharing->handle != sh ) { ret = XENMEM_SHARING_OP_S_HANDLE_INVALID; mem_sharing_page_unlock(secondpg); mem_sharing_page_unlock(firstpg); goto err_out; } if ( cpage->sharing->handle != ch ) { ret = XENMEM_SHARING_OP_C_HANDLE_INVALID; mem_sharing_page_unlock(secondpg); mem_sharing_page_unlock(firstpg); goto err_out; } /* Merge the lists together */ rmap_seed_iterator(cpage, &ri); while ( (gfn = rmap_iterate(cpage, &ri)) != NULL) { /* Get the source page and type, this should never fail: * we are under shr lock, and got a successful lookup */ BUG_ON(!get_page_and_type(spage, dom_cow, PGT_shared_page)); /* Move the gfn_info from client list to source list. * Don't change the type of rmap for the client page. */ rmap_del(gfn, cpage, 0); rmap_add(gfn, spage); put_page_and_type(cpage); d = get_domain_by_id(gfn->domain); BUG_ON(!d); BUG_ON(set_shared_p2m_entry(d, gfn->gfn, smfn) == 0); put_domain(d); } ASSERT(list_empty(&cpage->sharing->gfns)); /* Clear the rest of the shared state */ page_sharing_dispose(cpage); cpage->sharing = NULL; mem_sharing_page_unlock(secondpg); mem_sharing_page_unlock(firstpg); /* Free the client page */ if(test_and_clear_bit(_PGC_allocated, &cpage->count_info)) put_page(cpage); /* We managed to free a domain page. */ atomic_dec(&nr_shared_mfns); atomic_inc(&nr_saved_mfns); ret = 0; err_out: put_two_gfns(&tg); return ret; } int mem_sharing_add_to_physmap(struct domain *sd, unsigned long sgfn, shr_handle_t sh, struct domain *cd, unsigned long cgfn) { struct page_info *spage; int ret = -EINVAL; mfn_t smfn, cmfn; p2m_type_t smfn_type, cmfn_type; struct gfn_info *gfn_info; struct p2m_domain *p2m = p2m_get_hostp2m(cd); p2m_access_t a; struct two_gfns tg; get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn, cd, cgfn, &cmfn_type, &a, &cmfn, 0, &tg); /* Get the source shared page, check and lock */ ret = XENMEM_SHARING_OP_S_HANDLE_INVALID; spage = __grab_shared_page(smfn); if ( spage == NULL ) goto err_out; ASSERT(smfn_type == p2m_ram_shared); /* Check that the handles match */ if ( spage->sharing->handle != sh ) goto err_unlock; /* Make sure the target page is a hole in the physmap. These are typically * p2m_mmio_dm, but also accept p2m_invalid and paged out pages. See the * definition of p2m_is_hole in p2m.h. 
*/ if ( !p2m_is_hole(cmfn_type) ) { ret = XENMEM_SHARING_OP_C_HANDLE_INVALID; goto err_unlock; } /* This is simpler than regular sharing */ BUG_ON(!get_page_and_type(spage, dom_cow, PGT_shared_page)); if ( (gfn_info = mem_sharing_gfn_alloc(spage, cd, cgfn)) == NULL ) { put_page_and_type(spage); ret = -ENOMEM; goto err_unlock; } ret = set_p2m_entry(p2m, cgfn, smfn, PAGE_ORDER_4K, p2m_ram_shared, a); /* Tempted to turn this into an assert */ if ( !ret ) { ret = -ENOENT; mem_sharing_gfn_destroy(spage, cd, gfn_info); put_page_and_type(spage); } else { ret = 0; /* There is a chance we're plugging a hole where a paged out page was */ if ( p2m_is_paging(cmfn_type) && (cmfn_type != p2m_ram_paging_out) ) { atomic_dec(&cd->paged_pages); /* Further, there is a chance this was a valid page. Don't leak it. */ if ( mfn_valid(cmfn) ) { struct page_info *cpage = mfn_to_page(cmfn); ASSERT(cpage != NULL); if ( test_and_clear_bit(_PGC_allocated, &cpage->count_info) ) put_page(cpage); } } } atomic_inc(&nr_saved_mfns); err_unlock: mem_sharing_page_unlock(spage); err_out: put_two_gfns(&tg); return ret; } /* A note on the rationale for unshare error handling: * 1. Unshare can only fail with ENOMEM. Any other error conditions BUG_ON()'s * 2. We notify a potential dom0 helper through a mem_event ring. But we * allow the notification to not go to sleep. If the event ring is full * of ENOMEM warnings, then it's on the ball. * 3. We cannot go to sleep until the unshare is resolved, because we might * be buried deep into locks (e.g. something -> copy_to_user -> __hvm_copy) * 4. So, we make sure we: * 4.1. return an error * 4.2. do not corrupt shared memory * 4.3. do not corrupt guest memory * 4.4. let the guest deal with it if the error propagation will reach it */ int __mem_sharing_unshare_page(struct domain *d, unsigned long gfn, uint16_t flags) { p2m_type_t p2mt; mfn_t mfn; struct page_info *page, *old_page; void *s, *t; int last_gfn; gfn_info_t *gfn_info = NULL; mfn = get_gfn(d, gfn, &p2mt); /* Has someone already unshared it? */ if ( !p2m_is_shared(p2mt) ) { put_gfn(d, gfn); return 0; } page = __grab_shared_page(mfn); if ( page == NULL ) { gdprintk(XENLOG_ERR, "Domain p2m is shared, but page is not: " "%lx\n", gfn); BUG(); } gfn_info = rmap_retrieve(d->domain_id, gfn, page); if ( unlikely(gfn_info == NULL) ) { gdprintk(XENLOG_ERR, "Could not find gfn_info for shared gfn: " "%lx\n", gfn); BUG(); } /* Do the accounting first. If anything fails below, we have bigger * bigger fish to fry. First, remove the gfn from the list. */ last_gfn = rmap_has_one_entry(page); if ( last_gfn ) { /* Clean up shared state. Get rid of the tuple * before destroying the rmap. */ mem_sharing_gfn_destroy(page, d, gfn_info); page_sharing_dispose(page); page->sharing = NULL; atomic_dec(&nr_shared_mfns); } else atomic_dec(&nr_saved_mfns); /* If the GFN is getting destroyed drop the references to MFN * (possibly freeing the page), and exit early */ if ( flags & MEM_SHARING_DESTROY_GFN ) { if ( !last_gfn ) mem_sharing_gfn_destroy(page, d, gfn_info); put_page_and_type(page); mem_sharing_page_unlock(page); if ( last_gfn && test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); put_gfn(d, gfn); return 0; } if ( last_gfn ) { /* Making a page private atomically unlocks it */ BUG_ON(page_make_private(d, page) != 0); goto private_page_found; } old_page = page; page = alloc_domheap_page(d, 0); if ( !page ) { /* Undo dec of nr_saved_mfns, as the retry will decrease again. 
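 *
 * When the allocation does succeed, the path just below is a plain
 * copy-on-write step: map the shared source and the new private page,
 * copy PAGE_SIZE bytes, then point the guest frame at the copy.
 * Reduced to portable C (placeholder names, PAGE_SIZE of 4096 assumed):
 *
 *   #include <stdlib.h>
 *   #include <string.h>
 *
 *   #define PAGE_SIZE 4096UL
 *
 *   // Returns a freshly allocated private copy of a shared page, or
 *   // NULL, in which case the caller backs out and retries later
 *   // (compare the -ENOMEM branch here).
 *   static void *unshare_copy(const void *shared_page)
 *   {
 *       void *copy = malloc(PAGE_SIZE);
 *
 *       if ( copy != NULL )
 *           memcpy(copy, shared_page, PAGE_SIZE);
 *       return copy;
 *   }
 *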
*/ atomic_inc(&nr_saved_mfns); mem_sharing_page_unlock(old_page); put_gfn(d, gfn); /* Caller is responsible for placing an event * in the ring */ return -ENOMEM; } s = map_domain_page(__page_to_mfn(old_page)); t = map_domain_page(__page_to_mfn(page)); memcpy(t, s, PAGE_SIZE); unmap_domain_page(s); unmap_domain_page(t); BUG_ON(set_shared_p2m_entry(d, gfn, page_to_mfn(page)) == 0); mem_sharing_gfn_destroy(old_page, d, gfn_info); mem_sharing_page_unlock(old_page); put_page_and_type(old_page); private_page_found: if ( p2m_change_type(d, gfn, p2m_ram_shared, p2m_ram_rw) != p2m_ram_shared ) { gdprintk(XENLOG_ERR, "Could not change p2m type d %hu gfn %lx.\n", d->domain_id, gfn); BUG(); } /* Update m2p entry */ set_gpfn_from_mfn(mfn_x(page_to_mfn(page)), gfn); /* Now that the gfn<->mfn map is properly established, * marking dirty is feasible */ paging_mark_dirty(d, mfn_x(page_to_mfn(page))); /* We do not need to unlock a private page */ put_gfn(d, gfn); return 0; } int relinquish_shared_pages(struct domain *d) { int rc = 0; struct p2m_domain *p2m = p2m_get_hostp2m(d); unsigned long gfn, count = 0; if ( p2m == NULL ) return 0; p2m_lock(p2m); for (gfn = p2m->next_shared_gfn_to_relinquish; gfn < p2m->max_mapped_pfn; gfn++ ) { p2m_access_t a; p2m_type_t t; mfn_t mfn; int set_rc; if ( atomic_read(&d->shr_pages) == 0 ) break; mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL); if ( mfn_valid(mfn) && (t == p2m_ram_shared) ) { /* Does not fail with ENOMEM given the DESTROY flag */ BUG_ON(__mem_sharing_unshare_page(d, gfn, MEM_SHARING_DESTROY_GFN)); /* Clear out the p2m entry so no one else may try to * unshare. Must succeed: we just read the old entry and * we hold the p2m lock. */ set_rc = p2m->set_entry(p2m, gfn, _mfn(0), PAGE_ORDER_4K, p2m_invalid, p2m_access_rwx); ASSERT(set_rc != 0); count += 0x10; } else ++count; /* Preempt every 2MiB (shared) or 32MiB (unshared) - arbitrary. 
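 *
 * The weights here make a shared gfn cost 0x10 units and any other
 * gfn cost 1, with a preemption check every 0x2000 units, i.e. after
 * 512 shared pages (2MiB) or 8192 others (32MiB).  The bookkeeping,
 * lifted into a simplified standalone helper (illustrative name):
 *
 *   #include <stdbool.h>
 *
 *   #define SHARED_COST       0x10UL
 *   #define OTHER_COST           1UL
 *   #define PREEMPT_BUDGET  0x2000UL
 *
 *   // Accumulate work units; returns true when enough work has been
 *   // done that the loop should consider preempting, and resets the
 *   // counter for the next batch.
 *   static bool budget_spent(unsigned long *count, bool was_shared)
 *   {
 *       *count += was_shared ? SHARED_COST : OTHER_COST;
 *       if ( *count < PREEMPT_BUDGET )
 *           return false;
 *       *count = 0;
 *       return true;
 *   }
 *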
*/ if ( count >= 0x2000 ) { if ( hypercall_preempt_check() ) { p2m->next_shared_gfn_to_relinquish = gfn + 1; rc = -EAGAIN; break; } count = 0; } } p2m_unlock(p2m); return rc; } int mem_sharing_memop(struct domain *d, xen_mem_sharing_op_t *mec) { int rc = 0; /* Only HAP is supported */ if ( !hap_enabled(d) || !d->arch.hvm_domain.mem_sharing_enabled ) return -ENODEV; switch(mec->op) { case XENMEM_sharing_op_nominate_gfn: { unsigned long gfn = mec->u.nominate.u.gfn; shr_handle_t handle; if ( !mem_sharing_enabled(d) ) return -EINVAL; rc = mem_sharing_nominate_page(d, gfn, 0, &handle); mec->u.nominate.handle = handle; } break; case XENMEM_sharing_op_nominate_gref: { grant_ref_t gref = mec->u.nominate.u.grant_ref; unsigned long gfn; shr_handle_t handle; if ( !mem_sharing_enabled(d) ) return -EINVAL; if ( mem_sharing_gref_to_gfn(d, gref, &gfn) < 0 ) return -EINVAL; rc = mem_sharing_nominate_page(d, gfn, 3, &handle); mec->u.nominate.handle = handle; } break; case XENMEM_sharing_op_share: { unsigned long sgfn, cgfn; struct domain *cd; shr_handle_t sh, ch; if ( !mem_sharing_enabled(d) ) return -EINVAL; rc = rcu_lock_live_remote_domain_by_id(mec->u.share.client_domain, &cd); if ( rc ) return rc; rc = xsm_mem_sharing_op(XSM_TARGET, d, cd, mec->op); if ( rc ) { rcu_unlock_domain(cd); return rc; } if ( !mem_sharing_enabled(cd) ) { rcu_unlock_domain(cd); return -EINVAL; } if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mec->u.share.source_gfn) ) { grant_ref_t gref = (grant_ref_t) (XENMEM_SHARING_OP_FIELD_GET_GREF( mec->u.share.source_gfn)); if ( mem_sharing_gref_to_gfn(d, gref, &sgfn) < 0 ) { rcu_unlock_domain(cd); return -EINVAL; } } else { sgfn = mec->u.share.source_gfn; } if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mec->u.share.client_gfn) ) { grant_ref_t gref = (grant_ref_t) (XENMEM_SHARING_OP_FIELD_GET_GREF( mec->u.share.client_gfn)); if ( mem_sharing_gref_to_gfn(cd, gref, &cgfn) < 0 ) { rcu_unlock_domain(cd); return -EINVAL; } } else { cgfn = mec->u.share.client_gfn; } sh = mec->u.share.source_handle; ch = mec->u.share.client_handle; rc = mem_sharing_share_pages(d, sgfn, sh, cd, cgfn, ch); rcu_unlock_domain(cd); } break; case XENMEM_sharing_op_add_physmap: { unsigned long sgfn, cgfn; struct domain *cd; shr_handle_t sh; if ( !mem_sharing_enabled(d) ) return -EINVAL; rc = rcu_lock_live_remote_domain_by_id(mec->u.share.client_domain, &cd); if ( rc ) return rc; rc = xsm_mem_sharing_op(XSM_TARGET, d, cd, mec->op); if ( rc ) { rcu_unlock_domain(cd); return rc; } if ( !mem_sharing_enabled(cd) ) { rcu_unlock_domain(cd); return -EINVAL; } if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mec->u.share.source_gfn) ) { /* Cannot add a gref to the physmap */ rcu_unlock_domain(cd); return -EINVAL; } sgfn = mec->u.share.source_gfn; sh = mec->u.share.source_handle; cgfn = mec->u.share.client_gfn; rc = mem_sharing_add_to_physmap(d, sgfn, sh, cd, cgfn); rcu_unlock_domain(cd); } break; case XENMEM_sharing_op_resume: { if ( !mem_sharing_enabled(d) ) return -EINVAL; rc = mem_sharing_sharing_resume(d); } break; case XENMEM_sharing_op_debug_gfn: { unsigned long gfn = mec->u.debug.u.gfn; rc = mem_sharing_debug_gfn(d, gfn); } break; case XENMEM_sharing_op_debug_gref: { grant_ref_t gref = mec->u.debug.u.gref; rc = mem_sharing_debug_gref(d, gref); } break; default: rc = -ENOSYS; break; } return rc; } int mem_sharing_domctl(struct domain *d, xen_domctl_mem_sharing_op_t *mec) { int rc; /* Only HAP is supported */ if ( !hap_enabled(d) ) return -ENODEV; switch(mec->op) { case XEN_DOMCTL_MEM_SHARING_CONTROL: { rc = 0; if ( unlikely(need_iommu(d) && 
mec->u.enable) ) rc = -EXDEV; else d->arch.hvm_domain.mem_sharing_enabled = mec->u.enable; } break; default: rc = -ENOSYS; } return rc; } void __init mem_sharing_init(void) { printk("Initing memory sharing.\n"); #if MEM_SHARING_AUDIT spin_lock_init(&shr_audit_lock); INIT_LIST_HEAD(&shr_audit_list); #endif } xen-4.4.0/xen/arch/x86/mm/mem_event.c0000664000175000017500000004461212307313555015342 0ustar smbsmb/****************************************************************************** * arch/x86/mm/mem_event.c * * Memory event support. * * Copyright (c) 2009 Citrix Systems, Inc. (Patrick Colp) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include /* for public/io/ring.h macros */ #define xen_mb() mb() #define xen_rmb() rmb() #define xen_wmb() wmb() #define mem_event_ring_lock_init(_med) spin_lock_init(&(_med)->ring_lock) #define mem_event_ring_lock(_med) spin_lock(&(_med)->ring_lock) #define mem_event_ring_unlock(_med) spin_unlock(&(_med)->ring_lock) static int mem_event_enable( struct domain *d, xen_domctl_mem_event_op_t *mec, struct mem_event_domain *med, int pause_flag, int param, xen_event_channel_notification_t notification_fn) { int rc; unsigned long ring_gfn = d->arch.hvm_domain.params[param]; /* Only one helper at a time. If the helper crashed, * the ring is in an undefined state and so is the guest. */ if ( med->ring_page ) return -EBUSY; /* The parameter defaults to zero, and it should be * set to something */ if ( ring_gfn == 0 ) return -ENOSYS; mem_event_ring_lock_init(med); mem_event_ring_lock(med); rc = prepare_ring_for_helper(d, ring_gfn, &med->ring_pg_struct, &med->ring_page); if ( rc < 0 ) goto err; /* Set the number of currently blocked vCPUs to 0. */ med->blocked = 0; /* Allocate event channel */ rc = alloc_unbound_xen_event_channel(d->vcpu[0], current->domain->domain_id, notification_fn); if ( rc < 0 ) goto err; med->xen_port = mec->port = rc; /* Prepare ring buffer */ FRONT_RING_INIT(&med->front_ring, (mem_event_sring_t *)med->ring_page, PAGE_SIZE); /* Save the pause flag for this particular ring. */ med->pause_flag = pause_flag; /* Initialize the last-chance wait queue. */ init_waitqueue_head(&med->wq); mem_event_ring_unlock(med); return 0; err: destroy_ring_for_helper(&med->ring_page, med->ring_pg_struct); mem_event_ring_unlock(med); return rc; } static unsigned int mem_event_ring_available(struct mem_event_domain *med) { int avail_req = RING_FREE_REQUESTS(&med->front_ring); avail_req -= med->target_producers; avail_req -= med->foreign_producers; BUG_ON(avail_req < 0); return avail_req; } /* * mem_event_wake_blocked() will wakeup vcpus waiting for room in the * ring. These vCPUs were paused on their way out after placing an event, * but need to be resumed where the ring is capable of processing at least * one event from them. 
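 *
 * "Room" here is what mem_event_ring_available() reports: the free request
 * slots in the front ring minus the slots already reserved by outstanding
 * target_producers and foreign_producers.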
*/ static void mem_event_wake_blocked(struct domain *d, struct mem_event_domain *med) { struct vcpu *v; int online = d->max_vcpus; unsigned int avail_req = mem_event_ring_available(med); if ( avail_req == 0 || med->blocked == 0 ) return; /* * We ensure that we only have vCPUs online if there are enough free slots * for their memory events to be processed. This will ensure that no * memory events are lost (due to the fact that certain types of events * cannot be replayed, we need to ensure that there is space in the ring * for when they are hit). * See comment below in mem_event_put_request(). */ for_each_vcpu ( d, v ) if ( test_bit(med->pause_flag, &v->pause_flags) ) online--; ASSERT(online == (d->max_vcpus - med->blocked)); /* We remember which vcpu last woke up to avoid scanning always linearly * from zero and starving higher-numbered vcpus under high load */ if ( d->vcpu ) { int i, j, k; for (i = med->last_vcpu_wake_up + 1, j = 0; j < d->max_vcpus; i++, j++) { k = i % d->max_vcpus; v = d->vcpu[k]; if ( !v ) continue; if ( !(med->blocked) || online >= avail_req ) break; if ( test_and_clear_bit(med->pause_flag, &v->pause_flags) ) { vcpu_unpause(v); online++; med->blocked--; med->last_vcpu_wake_up = k; } } } } /* * In the event that a vCPU attempted to place an event in the ring and * was unable to do so, it is queued on a wait queue. These are woken as * needed, and take precedence over the blocked vCPUs. */ static void mem_event_wake_queued(struct domain *d, struct mem_event_domain *med) { unsigned int avail_req = mem_event_ring_available(med); if ( avail_req > 0 ) wake_up_nr(&med->wq, avail_req); } /* * mem_event_wake() will wakeup all vcpus waiting for the ring to * become available. If we have queued vCPUs, they get top priority. We * are guaranteed that they will go through code paths that will eventually * call mem_event_wake() again, ensuring that any blocked vCPUs will get * unpaused once all the queued vCPUs have made it through. */ void mem_event_wake(struct domain *d, struct mem_event_domain *med) { if (!list_empty(&med->wq.list)) mem_event_wake_queued(d, med); else mem_event_wake_blocked(d, med); } static int mem_event_disable(struct domain *d, struct mem_event_domain *med) { if ( med->ring_page ) { struct vcpu *v; mem_event_ring_lock(med); if ( !list_empty(&med->wq.list) ) { mem_event_ring_unlock(med); return -EBUSY; } /* Free domU's event channel and leave the other one unbound */ free_xen_event_channel(d->vcpu[0], med->xen_port); /* Unblock all vCPUs */ for_each_vcpu ( d, v ) { if ( test_and_clear_bit(med->pause_flag, &v->pause_flags) ) { vcpu_unpause(v); med->blocked--; } } destroy_ring_for_helper(&med->ring_page, med->ring_pg_struct); mem_event_ring_unlock(med); } return 0; } static inline void mem_event_release_slot(struct domain *d, struct mem_event_domain *med) { /* Update the accounting */ if ( current->domain == d ) med->target_producers--; else med->foreign_producers--; /* Kick any waiters */ mem_event_wake(d, med); } /* * mem_event_mark_and_pause() tags vcpu and put it to sleep. * The vcpu will resume execution in mem_event_wake_waiters(). */ void mem_event_mark_and_pause(struct vcpu *v, struct mem_event_domain *med) { if ( !test_and_set_bit(med->pause_flag, &v->pause_flags) ) { vcpu_pause_nosync(v); med->blocked++; } } /* * This must be preceded by a call to claim_slot(), and is guaranteed to * succeed. As a side-effect however, the vCPU may be paused if the ring is * overly full and its continued execution would cause stalling and excessive * waiting. 
The vCPU will be automatically unpaused when the ring clears. */ void mem_event_put_request(struct domain *d, struct mem_event_domain *med, mem_event_request_t *req) { mem_event_front_ring_t *front_ring; int free_req; unsigned int avail_req; RING_IDX req_prod; if ( current->domain != d ) { req->flags |= MEM_EVENT_FLAG_FOREIGN; ASSERT( !(req->flags & MEM_EVENT_FLAG_VCPU_PAUSED) ); } mem_event_ring_lock(med); /* Due to the reservations, this step must succeed. */ front_ring = &med->front_ring; free_req = RING_FREE_REQUESTS(front_ring); ASSERT(free_req > 0); /* Copy request */ req_prod = front_ring->req_prod_pvt; memcpy(RING_GET_REQUEST(front_ring, req_prod), req, sizeof(*req)); req_prod++; /* Update ring */ front_ring->req_prod_pvt = req_prod; RING_PUSH_REQUESTS(front_ring); /* We've actually *used* our reservation, so release the slot. */ mem_event_release_slot(d, med); /* Give this vCPU a black eye if necessary, on the way out. * See the comments above wake_blocked() for more information * on how this mechanism works to avoid waiting. */ avail_req = mem_event_ring_available(med); if( current->domain == d && avail_req < d->max_vcpus ) mem_event_mark_and_pause(current, med); mem_event_ring_unlock(med); notify_via_xen_event_channel(d, med->xen_port); } int mem_event_get_response(struct domain *d, struct mem_event_domain *med, mem_event_response_t *rsp) { mem_event_front_ring_t *front_ring; RING_IDX rsp_cons; mem_event_ring_lock(med); front_ring = &med->front_ring; rsp_cons = front_ring->rsp_cons; if ( !RING_HAS_UNCONSUMED_RESPONSES(front_ring) ) { mem_event_ring_unlock(med); return 0; } /* Copy response */ memcpy(rsp, RING_GET_RESPONSE(front_ring, rsp_cons), sizeof(*rsp)); rsp_cons++; /* Update ring */ front_ring->rsp_cons = rsp_cons; front_ring->sring->rsp_event = rsp_cons + 1; /* Kick any waiters -- since we've just consumed an event, * there may be additional space available in the ring. */ mem_event_wake(d, med); mem_event_ring_unlock(med); return 1; } void mem_event_cancel_slot(struct domain *d, struct mem_event_domain *med) { mem_event_ring_lock(med); mem_event_release_slot(d, med); mem_event_ring_unlock(med); } static int mem_event_grab_slot(struct mem_event_domain *med, int foreign) { unsigned int avail_req; if ( !med->ring_page ) return -ENOSYS; mem_event_ring_lock(med); avail_req = mem_event_ring_available(med); if ( avail_req == 0 ) { mem_event_ring_unlock(med); return -EBUSY; } if ( !foreign ) med->target_producers++; else med->foreign_producers++; mem_event_ring_unlock(med); return 0; } /* Simple try_grab wrapper for use in the wait_event() macro. */ static int mem_event_wait_try_grab(struct mem_event_domain *med, int *rc) { *rc = mem_event_grab_slot(med, 0); return *rc; } /* Call mem_event_grab_slot() until the ring doesn't exist, or is available. */ static int mem_event_wait_slot(struct mem_event_domain *med) { int rc = -EBUSY; wait_event(med->wq, mem_event_wait_try_grab(med, &rc) != -EBUSY); return rc; } bool_t mem_event_check_ring(struct mem_event_domain *med) { return (med->ring_page != NULL); } /* * Determines whether or not the current vCPU belongs to the target domain, * and calls the appropriate wait function. If it is a guest vCPU, then we * use mem_event_wait_slot() to reserve a slot. As long as there is a ring, * this function will always return 0 for a guest. For a non-guest, we check * for space and return -EBUSY if the ring is not available. 
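 *
 * A condensed sketch of the claim/put protocol, modelled on the callers in
 * p2m.c (illustrative only; error handling trimmed):
 *
 *     mem_event_request_t req = { .gfn = gfn };
 *     if ( mem_event_claim_slot(d, &d->mem_event->paging) < 0 )
 *         return;            <- no ring, or no slot could be reserved
 *     mem_event_put_request(d, &d->mem_event->paging, &req);
 *
 * A caller that claims a slot but then decides not to send an event must
 * give the reservation back with mem_event_cancel_slot() instead.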
* * Return codes: -ENOSYS: the ring is not yet configured * -EBUSY: the ring is busy * 0: a spot has been reserved * */ int __mem_event_claim_slot(struct domain *d, struct mem_event_domain *med, bool_t allow_sleep) { if ( (current->domain == d) && allow_sleep ) return mem_event_wait_slot(med); else return mem_event_grab_slot(med, (current->domain != d)); } /* Registered with Xen-bound event channel for incoming notifications. */ static void mem_paging_notification(struct vcpu *v, unsigned int port) { if ( likely(v->domain->mem_event->paging.ring_page != NULL) ) p2m_mem_paging_resume(v->domain); } /* Registered with Xen-bound event channel for incoming notifications. */ static void mem_access_notification(struct vcpu *v, unsigned int port) { if ( likely(v->domain->mem_event->access.ring_page != NULL) ) p2m_mem_access_resume(v->domain); } /* Registered with Xen-bound event channel for incoming notifications. */ static void mem_sharing_notification(struct vcpu *v, unsigned int port) { if ( likely(v->domain->mem_event->share.ring_page != NULL) ) mem_sharing_sharing_resume(v->domain); } int do_mem_event_op(int op, uint32_t domain, void *arg) { int ret; struct domain *d; ret = rcu_lock_live_remote_domain_by_id(domain, &d); if ( ret ) return ret; ret = xsm_mem_event_op(XSM_TARGET, d, op); if ( ret ) goto out; switch (op) { case XENMEM_paging_op: ret = mem_paging_memop(d, (xen_mem_event_op_t *) arg); break; case XENMEM_access_op: ret = mem_access_memop(d, (xen_mem_event_op_t *) arg); break; case XENMEM_sharing_op: ret = mem_sharing_memop(d, (xen_mem_sharing_op_t *) arg); break; default: ret = -ENOSYS; } out: rcu_unlock_domain(d); return ret; } /* Clean up on domain destruction */ void mem_event_cleanup(struct domain *d) { if ( d->mem_event->paging.ring_page ) { /* Destroying the wait queue head means waking up all * queued vcpus. This will drain the list, allowing * the disable routine to complete. It will also drop * all domain refs the wait-queued vcpus are holding. * Finally, because this code path involves previously * pausing the domain (domain_kill), unpausing the * vcpus causes no harm. 
*/ destroy_waitqueue_head(&d->mem_event->paging.wq); (void)mem_event_disable(d, &d->mem_event->paging); } if ( d->mem_event->access.ring_page ) { destroy_waitqueue_head(&d->mem_event->access.wq); (void)mem_event_disable(d, &d->mem_event->access); } if ( d->mem_event->share.ring_page ) { destroy_waitqueue_head(&d->mem_event->share.wq); (void)mem_event_disable(d, &d->mem_event->share); } } int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec, XEN_GUEST_HANDLE_PARAM(void) u_domctl) { int rc; rc = xsm_mem_event_control(XSM_PRIV, d, mec->mode, mec->op); if ( rc ) return rc; if ( unlikely(d == current->domain) ) { gdprintk(XENLOG_INFO, "Tried to do a memory event op on itself.\n"); return -EINVAL; } if ( unlikely(d->is_dying) ) { gdprintk(XENLOG_INFO, "Ignoring memory event op on dying domain %u\n", d->domain_id); return 0; } if ( unlikely(d->vcpu == NULL) || unlikely(d->vcpu[0] == NULL) ) { gdprintk(XENLOG_INFO, "Memory event op on a domain (%u) with no vcpus\n", d->domain_id); return -EINVAL; } rc = -ENOSYS; switch ( mec->mode ) { case XEN_DOMCTL_MEM_EVENT_OP_PAGING: { struct mem_event_domain *med = &d->mem_event->paging; rc = -EINVAL; switch( mec->op ) { case XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE: { struct p2m_domain *p2m = p2m_get_hostp2m(d); rc = -ENODEV; /* Only HAP is supported */ if ( !hap_enabled(d) ) break; /* No paging if iommu is used */ rc = -EMLINK; if ( unlikely(need_iommu(d)) ) break; rc = -EXDEV; /* Disallow paging in a PoD guest */ if ( p2m->pod.entry_count ) break; rc = mem_event_enable(d, mec, med, _VPF_mem_paging, HVM_PARAM_PAGING_RING_PFN, mem_paging_notification); } break; case XEN_DOMCTL_MEM_EVENT_OP_PAGING_DISABLE: { if ( med->ring_page ) rc = mem_event_disable(d, med); } break; default: rc = -ENOSYS; break; } } break; case XEN_DOMCTL_MEM_EVENT_OP_ACCESS: { struct mem_event_domain *med = &d->mem_event->access; rc = -EINVAL; switch( mec->op ) { case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE: { rc = -ENODEV; /* Only HAP is supported */ if ( !hap_enabled(d) ) break; /* Currently only EPT is supported */ if ( !cpu_has_vmx ) break; rc = mem_event_enable(d, mec, med, _VPF_mem_access, HVM_PARAM_ACCESS_RING_PFN, mem_access_notification); } break; case XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE: { if ( med->ring_page ) rc = mem_event_disable(d, med); } break; default: rc = -ENOSYS; break; } } break; case XEN_DOMCTL_MEM_EVENT_OP_SHARING: { struct mem_event_domain *med = &d->mem_event->share; rc = -EINVAL; switch( mec->op ) { case XEN_DOMCTL_MEM_EVENT_OP_SHARING_ENABLE: { rc = -ENODEV; /* Only HAP is supported */ if ( !hap_enabled(d) ) break; rc = mem_event_enable(d, mec, med, _VPF_mem_sharing, HVM_PARAM_SHARING_RING_PFN, mem_sharing_notification); } break; case XEN_DOMCTL_MEM_EVENT_OP_SHARING_DISABLE: { if ( med->ring_page ) rc = mem_event_disable(d, med); } break; default: rc = -ENOSYS; break; } } break; default: rc = -ENOSYS; } return rc; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/p2m.c0000664000175000017500000014376012307313555014065 0ustar smbsmb/****************************************************************************** * arch/x86/mm/p2m.c * * physical-to-machine mappings for automatically-translated domains. * * Parts of this code are Copyright (c) 2009 by Citrix Systems, Inc. (Patrick Colp) * Parts of this code are Copyright (c) 2007 by Advanced Micro Devices. * Parts of this code are Copyright (c) 2006-2007 by XenSource Inc. 
* Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include /* ept_p2m_init() */ #include #include #include #include #include #include #include #include "mm-locks.h" /* turn on/off 1GB host page table support for hap, default on */ bool_t __read_mostly opt_hap_1gb = 1; boolean_param("hap_1gb", opt_hap_1gb); bool_t __read_mostly opt_hap_2mb = 1; boolean_param("hap_2mb", opt_hap_2mb); /* Override macros from asm/page.h to make them work with mfn_t */ #undef mfn_to_page #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m)) #undef mfn_valid #define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) #undef page_to_mfn #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) /* Init the datastructures for later use by the p2m code */ static int p2m_initialise(struct domain *d, struct p2m_domain *p2m) { int ret = 0; mm_rwlock_init(&p2m->lock); mm_lock_init(&p2m->pod.lock); INIT_LIST_HEAD(&p2m->np2m_list); INIT_PAGE_LIST_HEAD(&p2m->pages); INIT_PAGE_LIST_HEAD(&p2m->pod.super); INIT_PAGE_LIST_HEAD(&p2m->pod.single); p2m->domain = d; p2m->default_access = p2m_access_rwx; p2m->np2m_base = P2M_BASE_EADDR; if ( hap_enabled(d) && cpu_has_vmx ) ret = ept_p2m_init(p2m); else p2m_pt_init(p2m); return ret; } static struct p2m_domain *p2m_init_one(struct domain *d) { struct p2m_domain *p2m = xzalloc(struct p2m_domain); if ( !p2m ) return NULL; if ( !zalloc_cpumask_var(&p2m->dirty_cpumask) ) goto free_p2m; if ( p2m_initialise(d, p2m) ) goto free_cpumask; return p2m; free_cpumask: free_cpumask_var(p2m->dirty_cpumask); free_p2m: xfree(p2m); return NULL; } static void p2m_free_one(struct p2m_domain *p2m) { if ( hap_enabled(p2m->domain) && cpu_has_vmx ) ept_p2m_uninit(p2m); free_cpumask_var(p2m->dirty_cpumask); xfree(p2m); } static int p2m_init_hostp2m(struct domain *d) { struct p2m_domain *p2m = p2m_init_one(d); if ( p2m ) { d->arch.p2m = p2m; return 0; } return -ENOMEM; } static void p2m_teardown_hostp2m(struct domain *d) { /* Iterate over all p2m tables per domain */ struct p2m_domain *p2m = p2m_get_hostp2m(d); if ( p2m ) { p2m_free_one(p2m); d->arch.p2m = NULL; } } static void p2m_teardown_nestedp2m(struct domain *d); static int p2m_init_nestedp2m(struct domain *d) { uint8_t i; struct p2m_domain *p2m; mm_lock_init(&d->arch.nested_p2m_lock); for (i = 0; i < MAX_NESTEDP2M; i++) { d->arch.nested_p2m[i] = p2m = p2m_init_one(d); if ( p2m == NULL ) { p2m_teardown_nestedp2m(d); return -ENOMEM; } p2m->write_p2m_entry = nestedp2m_write_p2m_entry; list_add(&p2m->np2m_list, &p2m_get_hostp2m(d)->np2m_list); } return 0; } static void p2m_teardown_nestedp2m(struct domain *d) { uint8_t i; struct p2m_domain *p2m; for (i = 0; i < MAX_NESTEDP2M; i++) { if ( !d->arch.nested_p2m[i] ) continue; p2m = d->arch.nested_p2m[i]; 
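/* Unlink this nested p2m from the host p2m's np2m_list (where
 * p2m_init_nestedp2m() added it) before freeing it and clearing the slot. */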
list_del(&p2m->np2m_list); p2m_free_one(p2m); d->arch.nested_p2m[i] = NULL; } } int p2m_init(struct domain *d) { int rc; rc = p2m_init_hostp2m(d); if ( rc ) return rc; /* Must initialise nestedp2m unconditionally * since nestedhvm_enabled(d) returns false here. * (p2m_init runs too early for HVM_PARAM_* options) */ rc = p2m_init_nestedp2m(d); if ( rc ) p2m_teardown_hostp2m(d); return rc; } void p2m_change_entry_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt) { struct p2m_domain *p2m = p2m_get_hostp2m(d); p2m_lock(p2m); p2m->change_entry_type_global(p2m, ot, nt); p2m_unlock(p2m); } mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, p2m_query_t q, unsigned int *page_order, bool_t locked) { mfn_t mfn; /* Unshare makes no sense withuot populate. */ if ( q & P2M_UNSHARE ) q |= P2M_ALLOC; if ( !p2m || !paging_mode_translate(p2m->domain) ) { /* Not necessarily true, but for non-translated guests, we claim * it's the most generic kind of memory */ *t = p2m_ram_rw; return _mfn(gfn); } if ( locked ) /* Grab the lock here, don't release until put_gfn */ gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order); if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) ) { ASSERT(!p2m_is_nestedp2m(p2m)); /* Try to unshare. If we fail, communicate ENOMEM without * sleeping. */ if ( mem_sharing_unshare_page(p2m->domain, gfn, 0) < 0 ) (void)mem_sharing_notify_enomem(p2m->domain, gfn, 0); mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order); } if (unlikely((p2m_is_broken(*t)))) { /* Return invalid_mfn to avoid caller's access */ mfn = _mfn(INVALID_MFN); if ( q & P2M_ALLOC ) domain_crash(p2m->domain); } return mfn; } void __put_gfn(struct p2m_domain *p2m, unsigned long gfn) { if ( !p2m || !paging_mode_translate(p2m->domain) ) /* Nothing to do in this case */ return; ASSERT(gfn_locked_by_me(p2m, gfn)); gfn_unlock(p2m, gfn, 0); } /* Atomically look up a GFN and take a reference count on the backing page. */ struct page_info *get_page_from_gfn_p2m( struct domain *d, struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, p2m_query_t q) { struct page_info *page = NULL; p2m_access_t _a; p2m_type_t _t; mfn_t mfn; /* Allow t or a to be NULL */ t = t ?: &_t; a = a ?: &_a; if ( likely(!p2m_locked_by_me(p2m)) ) { /* Fast path: look up and get out */ p2m_read_lock(p2m); mfn = __get_gfn_type_access(p2m, gfn, t, a, 0, NULL, 0); if ( (p2m_is_ram(*t) || p2m_is_grant(*t)) && mfn_valid(mfn) && !((q & P2M_UNSHARE) && p2m_is_shared(*t)) ) { page = mfn_to_page(mfn); if ( !get_page(page, d) /* Page could be shared */ && !get_page(page, dom_cow) ) page = NULL; } p2m_read_unlock(p2m); if ( page ) return page; /* Error path: not a suitable GFN at all */ if ( !p2m_is_ram(*t) && !p2m_is_paging(*t) && !p2m_is_pod(*t) ) return NULL; } /* Slow path: take the write lock and do fixups */ mfn = get_gfn_type_access(p2m, gfn, t, a, q, NULL); if ( p2m_is_ram(*t) && mfn_valid(mfn) ) { page = mfn_to_page(mfn); if ( !get_page(page, d) ) page = NULL; } put_gfn(d, gfn); return page; } int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma) { struct domain *d = p2m->domain; unsigned long todo = 1ul << page_order; unsigned int order; int rc = 1; ASSERT(gfn_locked_by_me(p2m, gfn)); while ( todo ) { if ( hap_enabled(d) ) order = ( (((gfn | mfn_x(mfn) | todo) & ((1ul << PAGE_ORDER_1G) - 1)) == 0) && hvm_hap_has_1gb(d) && opt_hap_1gb ) ? 
PAGE_ORDER_1G : ((((gfn | mfn_x(mfn) | todo) & ((1ul << PAGE_ORDER_2M) - 1)) == 0) && hvm_hap_has_2mb(d) && opt_hap_2mb) ? PAGE_ORDER_2M : PAGE_ORDER_4K; else order = 0; if ( !p2m->set_entry(p2m, gfn, mfn, order, p2mt, p2ma) ) rc = 0; gfn += 1ul << order; if ( mfn_x(mfn) != INVALID_MFN ) mfn = _mfn(mfn_x(mfn) + (1ul << order)); todo -= 1ul << order; } return rc; } struct page_info *p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type) { struct page_info *pg; ASSERT(p2m); ASSERT(p2m->domain); ASSERT(p2m->domain->arch.paging.alloc_page); pg = p2m->domain->arch.paging.alloc_page(p2m->domain); if (pg == NULL) return NULL; page_list_add_tail(pg, &p2m->pages); pg->u.inuse.type_info = type | 1 | PGT_validated; return pg; } void p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg) { ASSERT(pg); ASSERT(p2m); ASSERT(p2m->domain); ASSERT(p2m->domain->arch.paging.free_page); page_list_del(pg, &p2m->pages); p2m->domain->arch.paging.free_page(p2m->domain, pg); return; } // Allocate a new p2m table for a domain. // // The structure of the p2m table is that of a pagetable for xen (i.e. it is // controlled by CONFIG_PAGING_LEVELS). // // Returns 0 for success or -errno. // int p2m_alloc_table(struct p2m_domain *p2m) { mfn_t mfn = _mfn(INVALID_MFN); struct page_info *page, *p2m_top; unsigned int page_count = 0; unsigned long gfn = -1UL; struct domain *d = p2m->domain; p2m_lock(p2m); if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) != 0 ) { P2M_ERROR("p2m already allocated for this domain\n"); p2m_unlock(p2m); return -EINVAL; } P2M_PRINTK("allocating p2m table\n"); p2m_top = p2m_alloc_ptp(p2m, PGT_l4_page_table); if ( p2m_top == NULL ) { p2m_unlock(p2m); return -ENOMEM; } p2m->phys_table = pagetable_from_mfn(page_to_mfn(p2m_top)); if ( hap_enabled(d) ) iommu_share_p2m_table(d); P2M_PRINTK("populating p2m table\n"); /* Initialise physmap tables for slot zero. Other code assumes this. */ p2m->defer_nested_flush = 1; if ( !set_p2m_entry(p2m, 0, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid, p2m->default_access) ) goto error; if ( !p2m_is_nestedp2m(p2m) ) { /* Copy all existing mappings from the page list and m2p */ spin_lock(&p2m->domain->page_alloc_lock); page_list_for_each(page, &p2m->domain->page_list) { mfn = page_to_mfn(page); gfn = get_gpfn_from_mfn(mfn_x(mfn)); /* Pages should not be shared that early */ ASSERT(gfn != SHARED_M2P_ENTRY); page_count++; if ( gfn != INVALID_M2P_ENTRY && !set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2m_ram_rw, p2m->default_access) ) goto error_unlock; } spin_unlock(&p2m->domain->page_alloc_lock); } p2m->defer_nested_flush = 0; P2M_PRINTK("p2m table initialised (%u pages)\n", page_count); p2m_unlock(p2m); return 0; error_unlock: spin_unlock(&p2m->domain->page_alloc_lock); error: P2M_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" PRI_mfn "\n", gfn, mfn_x(mfn)); p2m_unlock(p2m); return -ENOMEM; } void p2m_teardown(struct p2m_domain *p2m) /* Return all the p2m pages to Xen. * We know we don't have any extra mappings to these pages */ { struct page_info *pg; struct domain *d; unsigned long gfn; p2m_type_t t; mfn_t mfn; if (p2m == NULL) return; d = p2m->domain; p2m_lock(p2m); /* Try to unshare any remaining shared p2m entries. Safeguard * Since relinquish_shared_pages should have done the work. 
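 * Unlike relinquish_shared_pages() this sweep performs no preemption
 * checks; it is only a last-resort safety net for entries that the earlier
 * pass missed.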
*/ for ( gfn=0; gfn < p2m->max_mapped_pfn; gfn++ ) { p2m_access_t a; if ( atomic_read(&d->shr_pages) == 0 ) break; mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL); if ( mfn_valid(mfn) && (t == p2m_ram_shared) ) { ASSERT(!p2m_is_nestedp2m(p2m)); /* Does not fail with ENOMEM given the DESTROY flag */ BUG_ON(mem_sharing_unshare_page(d, gfn, MEM_SHARING_DESTROY_GFN)); } } p2m->phys_table = pagetable_null(); while ( (pg = page_list_remove_head(&p2m->pages)) ) d->arch.paging.free_page(d, pg); p2m_unlock(p2m); } void p2m_final_teardown(struct domain *d) { /* We must teardown unconditionally because * we initialise them unconditionally. */ p2m_teardown_nestedp2m(d); /* Iterate over all p2m tables per domain */ p2m_teardown_hostp2m(d); } static void p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn, unsigned long mfn, unsigned int page_order) { unsigned long i; mfn_t mfn_return; p2m_type_t t; p2m_access_t a; if ( !paging_mode_translate(p2m->domain) ) { if ( need_iommu(p2m->domain) ) for ( i = 0; i < (1 << page_order); i++ ) iommu_unmap_page(p2m->domain, mfn + i); return; } ASSERT(gfn_locked_by_me(p2m, gfn)); P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn); if ( mfn_valid(_mfn(mfn)) ) { for ( i = 0; i < (1UL << page_order); i++ ) { mfn_return = p2m->get_entry(p2m, gfn + i, &t, &a, 0, NULL); if ( !p2m_is_grant(t) && !p2m_is_shared(t) ) set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY); ASSERT( !p2m_is_valid(t) || mfn + i == mfn_x(mfn_return) ); } } set_p2m_entry(p2m, gfn, _mfn(INVALID_MFN), page_order, p2m_invalid, p2m->default_access); } void guest_physmap_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int page_order) { struct p2m_domain *p2m = p2m_get_hostp2m(d); gfn_lock(p2m, gfn, page_order); p2m_remove_page(p2m, gfn, mfn, page_order); gfn_unlock(p2m, gfn, page_order); } int guest_physmap_add_entry(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int page_order, p2m_type_t t) { struct p2m_domain *p2m = p2m_get_hostp2m(d); unsigned long i, ogfn; p2m_type_t ot; p2m_access_t a; mfn_t omfn; int pod_count = 0; int rc = 0; if ( !paging_mode_translate(d) ) { if ( need_iommu(d) && t == p2m_ram_rw ) { for ( i = 0; i < (1 << page_order); i++ ) { rc = iommu_map_page( d, mfn + i, mfn + i, IOMMUF_readable|IOMMUF_writable); if ( rc != 0 ) { while ( i-- > 0 ) iommu_unmap_page(d, mfn + i); return rc; } } } return 0; } p2m_lock(p2m); P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn); /* First, remove m->p mappings for existing p->m mappings */ for ( i = 0; i < (1UL << page_order); i++ ) { omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL); if ( p2m_is_shared(ot) ) { /* Do an unshare to cleanly take care of all corner * cases. */ int rc; rc = mem_sharing_unshare_page(p2m->domain, gfn + i, 0); if ( rc ) { p2m_unlock(p2m); /* NOTE: Should a guest domain bring this upon itself, * there is not a whole lot we can do. We are buried * deep in locks from most code paths by now. So, fail * the call and don't try to sleep on a wait queue * while placing the mem event. * * However, all current (changeset 3432abcf9380) code * paths avoid this unsavoury situation. For now. * * Foreign domains are okay to place an event as they * won't go to sleep. 
*/ (void)mem_sharing_notify_enomem(p2m->domain, gfn + i, 0); return rc; } omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL); ASSERT(!p2m_is_shared(ot)); } if ( p2m_is_grant(ot) ) { /* Really shouldn't be unmapping grant maps this way */ domain_crash(d); p2m_unlock(p2m); return -EINVAL; } else if ( p2m_is_ram(ot) && !p2m_is_paged(ot) ) { ASSERT(mfn_valid(omfn)); set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); } else if ( ot == p2m_populate_on_demand ) { /* Count how man PoD entries we'll be replacing if successful */ pod_count++; } else if ( p2m_is_paging(ot) && (ot != p2m_ram_paging_out) ) { /* We're plugging a hole in the physmap where a paged out page was */ atomic_dec(&d->paged_pages); } } /* Then, look for m->p mappings for this range and deal with them */ for ( i = 0; i < (1UL << page_order); i++ ) { if ( page_get_owner(mfn_to_page(_mfn(mfn + i))) == dom_cow ) { /* This is no way to add a shared page to your physmap! */ gdprintk(XENLOG_ERR, "Adding shared mfn %lx directly to dom %hu " "physmap not allowed.\n", mfn+i, d->domain_id); p2m_unlock(p2m); return -EINVAL; } if ( page_get_owner(mfn_to_page(_mfn(mfn + i))) != d ) continue; ogfn = mfn_to_gfn(d, _mfn(mfn+i)); if ( (ogfn != INVALID_M2P_ENTRY) && (ogfn != gfn + i) ) { /* This machine frame is already mapped at another physical * address */ P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n", mfn + i, ogfn, gfn + i); omfn = p2m->get_entry(p2m, ogfn, &ot, &a, 0, NULL); if ( p2m_is_ram(ot) && !p2m_is_paged(ot) ) { ASSERT(mfn_valid(omfn)); P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n", ogfn , mfn_x(omfn)); if ( mfn_x(omfn) == (mfn + i) ) p2m_remove_page(p2m, ogfn, mfn + i, 0); } } } /* Now, actually do the two-way mapping */ if ( mfn_valid(_mfn(mfn)) ) { if ( !set_p2m_entry(p2m, gfn, _mfn(mfn), page_order, t, p2m->default_access) ) { rc = -EINVAL; goto out; /* Failed to update p2m, bail without updating m2p. */ } if ( !p2m_is_grant(t) ) { for ( i = 0; i < (1UL << page_order); i++ ) set_gpfn_from_mfn(mfn+i, gfn+i); } } else { gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n", gfn, mfn); if ( !set_p2m_entry(p2m, gfn, _mfn(INVALID_MFN), page_order, p2m_invalid, p2m->default_access) ) rc = -EINVAL; else { pod_lock(p2m); p2m->pod.entry_count -= pod_count; BUG_ON(p2m->pod.entry_count < 0); pod_unlock(p2m); } } out: p2m_unlock(p2m); return rc; } /* Modify the p2m type of a single gfn from ot to nt, returning the * entry's previous type. Resets the access permissions. */ p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn, p2m_type_t ot, p2m_type_t nt) { p2m_access_t a; p2m_type_t pt; mfn_t mfn; struct p2m_domain *p2m = p2m_get_hostp2m(d); BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt)); gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, NULL); if ( pt == ot ) set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, nt, p2m->default_access); gfn_unlock(p2m, gfn, 0); return pt; } /* Modify the p2m type of a range of gfns from ot to nt. * Resets the access permissions. 
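 *
 * Unlike p2m_change_type() above, which locks and updates a single gfn,
 * this takes the p2m lock once for the whole range and defers nested-p2m
 * flushing until the loop has finished.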
*/ void p2m_change_type_range(struct domain *d, unsigned long start, unsigned long end, p2m_type_t ot, p2m_type_t nt) { p2m_access_t a; p2m_type_t pt; unsigned long gfn; mfn_t mfn; struct p2m_domain *p2m = p2m_get_hostp2m(d); BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt)); p2m_lock(p2m); p2m->defer_nested_flush = 1; for ( gfn = start; gfn < end; gfn++ ) { mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, NULL); if ( pt == ot ) set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, nt, p2m->default_access); } p2m->defer_nested_flush = 0; if ( nestedhvm_enabled(d) ) p2m_flush_nestedp2m(d); p2m_unlock(p2m); } int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) { int rc = 0; p2m_access_t a; p2m_type_t ot; mfn_t omfn; struct p2m_domain *p2m = p2m_get_hostp2m(d); if ( !paging_mode_translate(d) ) return 0; gfn_lock(p2m, gfn, 0); omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL); if ( p2m_is_grant(ot) ) { p2m_unlock(p2m); domain_crash(d); return 0; } else if ( p2m_is_ram(ot) ) { ASSERT(mfn_valid(omfn)); set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); } P2M_DEBUG("set mmio %lx %lx\n", gfn, mfn_x(mfn)); rc = set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2m_mmio_direct, p2m->default_access); gfn_unlock(p2m, gfn, 0); if ( 0 == rc ) gdprintk(XENLOG_ERR, "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n", mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot))); return rc; } int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn) { int rc = 0; mfn_t mfn; p2m_access_t a; p2m_type_t t; struct p2m_domain *p2m = p2m_get_hostp2m(d); if ( !paging_mode_translate(d) ) return 0; gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &t, &a, 0, NULL); /* Do not use mfn_valid() here as it will usually fail for MMIO pages. */ if ( (INVALID_MFN == mfn_x(mfn)) || (t != p2m_mmio_direct) ) { gdprintk(XENLOG_ERR, "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn); goto out; } rc = set_p2m_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_invalid, p2m->default_access); out: gfn_unlock(p2m, gfn, 0); return rc; } int set_shared_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) { struct p2m_domain *p2m = p2m_get_hostp2m(d); int rc = 0; p2m_access_t a; p2m_type_t ot; mfn_t omfn; unsigned long pg_type; if ( !paging_mode_translate(p2m->domain) ) return 0; gfn_lock(p2m, gfn, 0); omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL); /* At the moment we only allow p2m change if gfn has already been made * sharable first */ ASSERT(p2m_is_shared(ot)); ASSERT(mfn_valid(omfn)); /* Set the m2p entry to invalid only if there are no further type * refs to this page as shared */ pg_type = read_atomic(&(mfn_to_page(omfn)->u.inuse.type_info)); if ( (pg_type & PGT_count_mask) == 0 || (pg_type & PGT_type_mask) != PGT_shared_page ) set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); P2M_DEBUG("set shared %lx %lx\n", gfn, mfn_x(mfn)); rc = set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2m_ram_shared, p2m->default_access); gfn_unlock(p2m, gfn, 0); if ( 0 == rc ) gdprintk(XENLOG_ERR, "set_shared_p2m_entry: set_p2m_entry failed! mfn=%08lx\n", mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot))); return rc; } /** * p2m_mem_paging_nominate - Mark a guest page as to-be-paged-out * @d: guest domain * @gfn: guest page to nominate * * Returns 0 for success or negative errno values if gfn is not pageable. * * p2m_mem_paging_nominate() is called by the pager and checks if a guest page * can be paged out. 
If the following conditions are met the p2mt will be * changed: * - the gfn is backed by a mfn * - the p2mt of the gfn is pageable * - the mfn is not used for IO * - the mfn has exactly one user and has no special meaning * * Once the p2mt is changed the page is readonly for the guest. On success the * pager can write the page contents to disk and later evict the page. */ int p2m_mem_paging_nominate(struct domain *d, unsigned long gfn) { struct page_info *page; struct p2m_domain *p2m = p2m_get_hostp2m(d); p2m_type_t p2mt; p2m_access_t a; mfn_t mfn; int ret = -EBUSY; gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL); /* Check if mfn is valid */ if ( !mfn_valid(mfn) ) goto out; /* Check p2m type */ if ( !p2m_is_pageable(p2mt) ) goto out; /* Check for io memory page */ if ( is_iomem_page(mfn_x(mfn)) ) goto out; /* Check page count and type */ page = mfn_to_page(mfn); if ( (page->count_info & (PGC_count_mask | PGC_allocated)) != (1 | PGC_allocated) ) goto out; if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) goto out; /* Fix p2m entry */ set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2m_ram_paging_out, a); ret = 0; out: gfn_unlock(p2m, gfn, 0); return ret; } /** * p2m_mem_paging_evict - Mark a guest page as paged-out * @d: guest domain * @gfn: guest page to evict * * Returns 0 for success or negative errno values if eviction is not possible. * * p2m_mem_paging_evict() is called by the pager and will free a guest page and * release it back to Xen. If the following conditions are met the page can be * freed: * - the gfn is backed by a mfn * - the gfn was nominated * - the mfn has still exactly one user and has no special meaning * * After successful nomination some other process could have mapped the page. In * this case eviction can not be done. If the gfn was populated before the pager * could evict it, eviction can not be done either. In this case the gfn is * still backed by a mfn. */ int p2m_mem_paging_evict(struct domain *d, unsigned long gfn) { struct page_info *page; p2m_type_t p2mt; p2m_access_t a; mfn_t mfn; struct p2m_domain *p2m = p2m_get_hostp2m(d); int ret = -EBUSY; gfn_lock(p2m, gfn, 0); /* Get mfn */ mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL); if ( unlikely(!mfn_valid(mfn)) ) goto out; /* Allow only nominated pages */ if ( p2mt != p2m_ram_paging_out ) goto out; /* Get the page so it doesn't get modified under Xen's feet */ page = mfn_to_page(mfn); if ( unlikely(!get_page(page, d)) ) goto out; /* Check page count and type once more */ if ( (page->count_info & (PGC_count_mask | PGC_allocated)) != (2 | PGC_allocated) ) goto out_put; if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) goto out_put; /* Decrement guest domain's ref count of the page */ if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); /* Remove mapping from p2m table */ set_p2m_entry(p2m, gfn, _mfn(INVALID_MFN), PAGE_ORDER_4K, p2m_ram_paged, a); /* Clear content before returning the page to Xen */ scrub_one_page(page); /* Track number of paged gfns */ atomic_inc(&d->paged_pages); ret = 0; out_put: /* Put the page back so it gets freed */ put_page(page); out: gfn_unlock(p2m, gfn, 0); return ret; } /** * p2m_mem_paging_drop_page - Tell pager to drop its reference to a paged page * @d: guest domain * @gfn: guest page to drop * * p2m_mem_paging_drop_page() will notify the pager that a paged-out gfn was * released by the guest. The pager is supposed to drop its reference of the * gfn. 
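 *
 * For orientation, the paging life cycle implemented in this file is
 * roughly: p2m_mem_paging_nominate() marks a gfn for paging out,
 * p2m_mem_paging_evict() frees its backing page, p2m_mem_paging_populate()
 * asks the pager to bring it back, p2m_mem_paging_prep() reinstates a page,
 * and p2m_mem_paging_resume() unpauses any waiting vcpus; drop_page covers
 * the case where the guest releases a gfn that is already paged out.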
*/ void p2m_mem_paging_drop_page(struct domain *d, unsigned long gfn, p2m_type_t p2mt) { mem_event_request_t req = { .gfn = gfn }; /* We allow no ring in this unique case, because it won't affect * correctness of the guest execution at this point. If this is the only * page that happens to be paged-out, we'll be okay.. but it's likely the * guest will crash shortly anyways. */ int rc = mem_event_claim_slot(d, &d->mem_event->paging); if ( rc < 0 ) return; /* Send release notification to pager */ req.flags = MEM_EVENT_FLAG_DROP_PAGE; /* Update stats unless the page hasn't yet been evicted */ if ( p2mt != p2m_ram_paging_out ) atomic_dec(&d->paged_pages); else /* Evict will fail now, tag this request for pager */ req.flags |= MEM_EVENT_FLAG_EVICT_FAIL; mem_event_put_request(d, &d->mem_event->paging, &req); } /** * p2m_mem_paging_populate - Tell pager to populete a paged page * @d: guest domain * @gfn: guest page in paging state * * p2m_mem_paging_populate() will notify the pager that a page in any of the * paging states needs to be written back into the guest. * This function needs to be called whenever gfn_to_mfn() returns any of the p2m * paging types because the gfn may not be backed by a mfn. * * The gfn can be in any of the paging states, but the pager needs only be * notified when the gfn is in the paging-out path (paging_out or paged). This * function may be called more than once from several vcpus. If the vcpu belongs * to the guest, the vcpu must be stopped and the pager notified that the vcpu * was stopped. The pager needs to handle several requests for the same gfn. * * If the gfn is not in the paging-out path and the vcpu does not belong to the * guest, nothing needs to be done and the function assumes that a request was * already sent to the pager. In this case the caller has to try again until the * gfn is fully paged in again. */ void p2m_mem_paging_populate(struct domain *d, unsigned long gfn) { struct vcpu *v = current; mem_event_request_t req = { .gfn = gfn }; p2m_type_t p2mt; p2m_access_t a; mfn_t mfn; struct p2m_domain *p2m = p2m_get_hostp2m(d); /* We're paging. 
There should be a ring */ int rc = mem_event_claim_slot(d, &d->mem_event->paging); if ( rc == -ENOSYS ) { gdprintk(XENLOG_ERR, "Domain %hu paging gfn %lx yet no ring " "in place\n", d->domain_id, gfn); /* Prevent the vcpu from faulting repeatedly on the same gfn */ if ( v->domain == d ) vcpu_pause_nosync(v); domain_crash(d); return; } else if ( rc < 0 ) return; /* Fix p2m mapping */ gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL); /* Allow only nominated or evicted pages to enter page-in path */ if ( p2mt == p2m_ram_paging_out || p2mt == p2m_ram_paged ) { /* Evict will fail now, tag this request for pager */ if ( p2mt == p2m_ram_paging_out ) req.flags |= MEM_EVENT_FLAG_EVICT_FAIL; set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2m_ram_paging_in, a); } gfn_unlock(p2m, gfn, 0); /* Pause domain if request came from guest and gfn has paging type */ if ( p2m_is_paging(p2mt) && v->domain == d ) { vcpu_pause_nosync(v); req.flags |= MEM_EVENT_FLAG_VCPU_PAUSED; } /* No need to inform pager if the gfn is not in the page-out path */ else if ( p2mt != p2m_ram_paging_out && p2mt != p2m_ram_paged ) { /* gfn is already on its way back and vcpu is not paused */ mem_event_cancel_slot(d, &d->mem_event->paging); return; } /* Send request to pager */ req.p2mt = p2mt; req.vcpu_id = v->vcpu_id; mem_event_put_request(d, &d->mem_event->paging, &req); } /** * p2m_mem_paging_prep - Allocate a new page for the guest * @d: guest domain * @gfn: guest page in paging state * * p2m_mem_paging_prep() will allocate a new page for the guest if the gfn is * not backed by a mfn. It is called by the pager. * It is required that the gfn was already populated. The gfn may already have a * mfn if populate was called for gfn which was nominated but not evicted. In * this case only the p2mt needs to be forwarded. */ int p2m_mem_paging_prep(struct domain *d, unsigned long gfn, uint64_t buffer) { struct page_info *page; p2m_type_t p2mt; p2m_access_t a; mfn_t mfn; struct p2m_domain *p2m = p2m_get_hostp2m(d); int ret, page_extant = 1; const void *user_ptr = (const void *) buffer; if ( user_ptr ) /* Sanity check the buffer and bail out early if trouble */ if ( (buffer & (PAGE_SIZE - 1)) || (!access_ok(user_ptr, PAGE_SIZE)) ) return -EINVAL; gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &p2mt, &a, 0, NULL); ret = -ENOENT; /* Allow missing pages */ if ( (p2mt != p2m_ram_paging_in) && (p2mt != p2m_ram_paged) ) goto out; /* Allocate a page if the gfn does not have one yet */ if ( !mfn_valid(mfn) ) { /* If the user did not provide a buffer, we disallow */ ret = -EINVAL; if ( unlikely(user_ptr == NULL) ) goto out; /* Get a free page */ ret = -ENOMEM; page = alloc_domheap_page(p2m->domain, 0); if ( unlikely(page == NULL) ) goto out; mfn = page_to_mfn(page); page_extant = 0; } /* If we were given a buffer, now is the time to use it */ if ( !page_extant && user_ptr ) { void *guest_map; int rc; ASSERT( mfn_valid(mfn) ); guest_map = map_domain_page(mfn_x(mfn)); rc = copy_from_user(guest_map, user_ptr, PAGE_SIZE); unmap_domain_page(guest_map); if ( rc ) { gdprintk(XENLOG_ERR, "Failed to load paging-in gfn %lx domain %u " "bytes left %d\n", gfn, d->domain_id, rc); ret = -EFAULT; put_page(page); /* Don't leak pages */ goto out; } } /* Make the page already guest-accessible. If the pager still has a * pending resume operation, it will be idempotent p2m entry-wise, * but will unpause the vcpu */ set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, paging_mode_log_dirty(d) ? 
p2m_ram_logdirty : p2m_ram_rw, a); set_gpfn_from_mfn(mfn_x(mfn), gfn); if ( !page_extant ) atomic_dec(&d->paged_pages); ret = 0; out: gfn_unlock(p2m, gfn, 0); return ret; } /** * p2m_mem_paging_resume - Resume guest gfn and vcpus * @d: guest domain * @gfn: guest page in paging state * * p2m_mem_paging_resume() will forward the p2mt of a gfn to ram_rw and all * waiting vcpus will be unpaused again. It is called by the pager. * * The gfn was previously either evicted and populated, or nominated and * populated. If the page was evicted the p2mt will be p2m_ram_paging_in. If * the page was just nominated the p2mt will be p2m_ram_paging_in_start because * the pager did not call p2m_mem_paging_prep(). * * If the gfn was dropped the vcpu needs to be unpaused. */ void p2m_mem_paging_resume(struct domain *d) { struct p2m_domain *p2m = p2m_get_hostp2m(d); mem_event_response_t rsp; p2m_type_t p2mt; p2m_access_t a; mfn_t mfn; /* Pull all responses off the ring */ while( mem_event_get_response(d, &d->mem_event->paging, &rsp) ) { if ( rsp.flags & MEM_EVENT_FLAG_DUMMY ) continue; /* Fix p2m entry if the page was not dropped */ if ( !(rsp.flags & MEM_EVENT_FLAG_DROP_PAGE) ) { gfn_lock(p2m, rsp.gfn, 0); mfn = p2m->get_entry(p2m, rsp.gfn, &p2mt, &a, 0, NULL); /* Allow only pages which were prepared properly, or pages which * were nominated but not evicted */ if ( mfn_valid(mfn) && (p2mt == p2m_ram_paging_in) ) { set_p2m_entry(p2m, rsp.gfn, mfn, PAGE_ORDER_4K, paging_mode_log_dirty(d) ? p2m_ram_logdirty : p2m_ram_rw, a); set_gpfn_from_mfn(mfn_x(mfn), rsp.gfn); } gfn_unlock(p2m, rsp.gfn, 0); } /* Unpause domain */ if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) vcpu_unpause(d->vcpu[rsp.vcpu_id]); } } bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, bool_t access_r, bool_t access_w, bool_t access_x, mem_event_request_t **req_ptr) { struct vcpu *v = current; unsigned long gfn = gpa >> PAGE_SHIFT; struct domain *d = v->domain; struct p2m_domain* p2m = p2m_get_hostp2m(d); mfn_t mfn; p2m_type_t p2mt; p2m_access_t p2ma; mem_event_request_t *req; int rc; /* First, handle rx2rw conversion automatically. * These calls to p2m->set_entry() must succeed: we have the gfn * locked and just did a successful get_entry(). */ gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL); if ( access_w && p2ma == p2m_access_rx2rw ) { rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt, p2m_access_rw); ASSERT(rc); gfn_unlock(p2m, gfn, 0); return 1; } else if ( p2ma == p2m_access_n2rwx ) { ASSERT(access_w || access_r || access_x); rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt, p2m_access_rwx); ASSERT(rc); } gfn_unlock(p2m, gfn, 0); /* Otherwise, check if there is a memory event listener, and send the message along */ if ( !mem_event_check_ring(&d->mem_event->access) || !req_ptr ) { /* No listener */ if ( p2m->access_required ) { gdprintk(XENLOG_INFO, "Memory access permissions failure, " "no mem_event listener VCPU %d, dom %d\n", v->vcpu_id, d->domain_id); domain_crash(v->domain); return 0; } else { gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL); if ( p2ma != p2m_access_n2rwx ) { /* A listener is not required, so clear the access * restrictions. This set must succeed: we have the * gfn locked and just did a successful get_entry(). 
*/ rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt, p2m_access_rwx); ASSERT(rc); } gfn_unlock(p2m, gfn, 0); return 1; } } *req_ptr = NULL; req = xzalloc(mem_event_request_t); if ( req ) { *req_ptr = req; req->reason = MEM_EVENT_REASON_VIOLATION; /* Pause the current VCPU */ if ( p2ma != p2m_access_n2rwx ) req->flags |= MEM_EVENT_FLAG_VCPU_PAUSED; /* Send request to mem event */ req->gfn = gfn; req->offset = gpa & ((1 << PAGE_SHIFT) - 1); req->gla_valid = gla_valid; req->gla = gla; req->access_r = access_r; req->access_w = access_w; req->access_x = access_x; req->vcpu_id = v->vcpu_id; } /* Pause the current VCPU */ if ( p2ma != p2m_access_n2rwx ) vcpu_pause_nosync(v); /* VCPU may be paused, return whether we promoted automatically */ return (p2ma == p2m_access_n2rwx); } void p2m_mem_access_resume(struct domain *d) { mem_event_response_t rsp; /* Pull all responses off the ring */ while( mem_event_get_response(d, &d->mem_event->access, &rsp) ) { if ( rsp.flags & MEM_EVENT_FLAG_DUMMY ) continue; /* Unpause domain */ if ( rsp.flags & MEM_EVENT_FLAG_VCPU_PAUSED ) vcpu_unpause(d->vcpu[rsp.vcpu_id]); } } /* Set access type for a region of pfns. * If start_pfn == -1ul, sets the default access type */ int p2m_set_mem_access(struct domain *d, unsigned long start_pfn, uint32_t nr, hvmmem_access_t access) { struct p2m_domain *p2m = p2m_get_hostp2m(d); unsigned long pfn; p2m_access_t a, _a; p2m_type_t t; mfn_t mfn; int rc = 0; /* N.B. _not_ static: initializer depends on p2m->default_access */ p2m_access_t memaccess[] = { p2m_access_n, p2m_access_r, p2m_access_w, p2m_access_rw, p2m_access_x, p2m_access_rx, p2m_access_wx, p2m_access_rwx, p2m_access_rx2rw, p2m_access_n2rwx, p2m->default_access, }; if ( (unsigned) access >= HVMMEM_access_default ) return -EINVAL; a = memaccess[access]; /* If request to set default access */ if ( start_pfn == ~0ull ) { p2m->default_access = a; return 0; } p2m_lock(p2m); for ( pfn = start_pfn; pfn < start_pfn + nr; pfn++ ) { mfn = p2m->get_entry(p2m, pfn, &t, &_a, 0, NULL); if ( p2m->set_entry(p2m, pfn, mfn, PAGE_ORDER_4K, t, a) == 0 ) { rc = -ENOMEM; break; } } p2m_unlock(p2m); return rc; } /* Get access type for a pfn * If pfn == -1ul, gets the default access type */ int p2m_get_mem_access(struct domain *d, unsigned long pfn, hvmmem_access_t *access) { struct p2m_domain *p2m = p2m_get_hostp2m(d); p2m_type_t t; p2m_access_t a; mfn_t mfn; static const hvmmem_access_t memaccess[] = { HVMMEM_access_n, HVMMEM_access_r, HVMMEM_access_w, HVMMEM_access_rw, HVMMEM_access_x, HVMMEM_access_rx, HVMMEM_access_wx, HVMMEM_access_rwx, HVMMEM_access_rx2rw }; /* If request to get default access */ if ( pfn == ~0ull ) { *access = memaccess[p2m->default_access]; return 0; } gfn_lock(p2m, gfn, 0); mfn = p2m->get_entry(p2m, pfn, &t, &a, 0, NULL); gfn_unlock(p2m, gfn, 0); if ( mfn_x(mfn) == INVALID_MFN ) return -ESRCH; if ( (unsigned) a >= ARRAY_SIZE(memaccess) ) return -ERANGE; *access = memaccess[a]; return 0; } static struct p2m_domain * p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m) { struct list_head *lru_list = &p2m_get_hostp2m(d)->np2m_list; ASSERT(!list_empty(lru_list)); if ( p2m == NULL ) p2m = list_entry(lru_list->prev, struct p2m_domain, np2m_list); list_move(&p2m->np2m_list, lru_list); return p2m; } /* Reset this p2m table to be empty */ static void p2m_flush_table(struct p2m_domain *p2m) { struct page_info *top, *pg; struct domain *d = p2m->domain; void *p; p2m_lock(p2m); /* "Host" p2m tables can have shared entries &c that need a bit more * care when 
discarding them */ ASSERT(p2m_is_nestedp2m(p2m)); /* Nested p2m's do not do pod, hence the asserts (and no pod lock)*/ ASSERT(page_list_empty(&p2m->pod.super)); ASSERT(page_list_empty(&p2m->pod.single)); /* This is no longer a valid nested p2m for any address space */ p2m->np2m_base = P2M_BASE_EADDR; /* Zap the top level of the trie */ top = mfn_to_page(pagetable_get_mfn(p2m_get_pagetable(p2m))); p = __map_domain_page(top); clear_page(p); unmap_domain_page(p); /* Make sure nobody else is using this p2m table */ nestedhvm_vmcx_flushtlb(p2m); /* Free the rest of the trie pages back to the paging pool */ while ( (pg = page_list_remove_head(&p2m->pages)) ) if ( pg != top ) d->arch.paging.free_page(d, pg); page_list_add(top, &p2m->pages); p2m_unlock(p2m); } void p2m_flush(struct vcpu *v, struct p2m_domain *p2m) { ASSERT(v->domain == p2m->domain); vcpu_nestedhvm(v).nv_p2m = NULL; p2m_flush_table(p2m); hvm_asid_flush_vcpu(v); } void p2m_flush_nestedp2m(struct domain *d) { int i; for ( i = 0; i < MAX_NESTEDP2M; i++ ) p2m_flush_table(d->arch.nested_p2m[i]); } struct p2m_domain * p2m_get_nestedp2m(struct vcpu *v, uint64_t np2m_base) { /* Use volatile to prevent gcc to cache nv->nv_p2m in a cpu register as * this may change within the loop by an other (v)cpu. */ volatile struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct domain *d; struct p2m_domain *p2m; /* Mask out low bits; this avoids collisions with P2M_BASE_EADDR */ np2m_base &= ~(0xfffull); if (nv->nv_flushp2m && nv->nv_p2m) { nv->nv_p2m = NULL; } d = v->domain; nestedp2m_lock(d); p2m = nv->nv_p2m; if ( p2m ) { p2m_lock(p2m); if ( p2m->np2m_base == np2m_base || p2m->np2m_base == P2M_BASE_EADDR ) { nv->nv_flushp2m = 0; p2m_getlru_nestedp2m(d, p2m); nv->nv_p2m = p2m; if ( p2m->np2m_base == P2M_BASE_EADDR ) hvm_asid_flush_vcpu(v); p2m->np2m_base = np2m_base; cpumask_set_cpu(v->processor, p2m->dirty_cpumask); p2m_unlock(p2m); nestedp2m_unlock(d); return p2m; } p2m_unlock(p2m); } /* All p2m's are or were in use. Take the least recent used one, * flush it and reuse. 
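 *
 * (p2m_getlru_nestedp2m() keeps np2m_list in most-recently-used order by
 *  moving each table it hands out to the head of the list, so passing NULL
 *  below returns the entry at the tail, i.e. the least recently used one.)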
*/ p2m = p2m_getlru_nestedp2m(d, NULL); p2m_flush_table(p2m); p2m_lock(p2m); nv->nv_p2m = p2m; p2m->np2m_base = np2m_base; nv->nv_flushp2m = 0; hvm_asid_flush_vcpu(v); cpumask_set_cpu(v->processor, p2m->dirty_cpumask); p2m_unlock(p2m); nestedp2m_unlock(d); return p2m; } struct p2m_domain * p2m_get_p2m(struct vcpu *v) { if (!nestedhvm_is_n2(v)) return p2m_get_hostp2m(v->domain); return p2m_get_nestedp2m(v, nhvm_vcpu_p2m_base(v)); } unsigned long paging_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec) { struct p2m_domain *hostp2m = p2m_get_hostp2m(v->domain); const struct paging_mode *hostmode = paging_get_hostmode(v); if ( is_hvm_domain(v->domain) && paging_mode_hap(v->domain) && nestedhvm_is_n2(v) ) { unsigned long gfn; struct p2m_domain *p2m; const struct paging_mode *mode; uint32_t pfec_21 = *pfec; uint64_t np2m_base = nhvm_vcpu_p2m_base(v); /* translate l2 guest va into l2 guest gfn */ p2m = p2m_get_nestedp2m(v, np2m_base); mode = paging_get_nestedmode(v); gfn = mode->gva_to_gfn(v, p2m, va, pfec); /* translate l2 guest gfn into l1 guest gfn */ return hostmode->p2m_ga_to_gfn(v, hostp2m, np2m_base, gfn << PAGE_SHIFT, &pfec_21, NULL); } return hostmode->gva_to_gfn(v, hostp2m, va, pfec); } /*** Audit ***/ #if P2M_AUDIT void audit_p2m(struct domain *d, uint64_t *orphans, uint64_t *m2p_bad, uint64_t *p2m_bad) { struct page_info *page; struct domain *od; unsigned long mfn, gfn; mfn_t p2mfn; unsigned long orphans_count = 0, mpbad = 0, pmbad = 0; p2m_access_t p2ma; p2m_type_t type; struct p2m_domain *p2m = p2m_get_hostp2m(d); if ( !paging_mode_translate(d) ) goto out_p2m_audit; P2M_PRINTK("p2m audit starts\n"); p2m_lock(p2m); pod_lock(p2m); if (p2m->audit_p2m) pmbad = p2m->audit_p2m(p2m); /* Audit part two: walk the domain's page allocation list, checking * the m2p entries. */ spin_lock(&d->page_alloc_lock); page_list_for_each ( page, &d->page_list ) { mfn = mfn_x(page_to_mfn(page)); P2M_PRINTK("auditing guest page, mfn=%#lx\n", mfn); od = page_get_owner(page); if ( od != d ) { P2M_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n", mfn, od, (od?od->domain_id:-1), d, d->domain_id); continue; } gfn = get_gpfn_from_mfn(mfn); if ( gfn == INVALID_M2P_ENTRY ) { orphans_count++; P2M_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n", mfn); continue; } if ( gfn == SHARED_M2P_ENTRY ) { P2M_PRINTK("shared mfn (%lx) on domain page list!\n", mfn); continue; } p2mfn = get_gfn_type_access(p2m, gfn, &type, &p2ma, 0, NULL); if ( mfn_x(p2mfn) != mfn ) { mpbad++; P2M_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx" " (-> gfn %#lx)\n", mfn, gfn, mfn_x(p2mfn), (mfn_valid(p2mfn) ? get_gpfn_from_mfn(mfn_x(p2mfn)) : -1u)); /* This m2p entry is stale: the domain has another frame in * this physical slot. No great disaster, but for neatness, * blow away the m2p entry. 
*/ set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); } __put_gfn(p2m, gfn); P2M_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx\n", mfn, gfn, mfn_x(p2mfn)); } spin_unlock(&d->page_alloc_lock); pod_unlock(p2m); p2m_unlock(p2m); P2M_PRINTK("p2m audit complete\n"); if ( orphans_count | mpbad | pmbad ) P2M_PRINTK("p2m audit found %lu orphans\n", orphans); if ( mpbad | pmbad ) { P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n", pmbad, mpbad); WARN(); } out_p2m_audit: *orphans = (uint64_t) orphans_count; *m2p_bad = (uint64_t) mpbad; *p2m_bad = (uint64_t) pmbad; } #endif /* P2M_AUDIT */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/mem_paging.c0000664000175000017500000000333412307313555015462 0ustar smbsmb/****************************************************************************** * arch/x86/mm/mem_paging.c * * Memory paging support. * * Copyright (c) 2009 Citrix Systems, Inc. (Patrick Colp) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include int mem_paging_memop(struct domain *d, xen_mem_event_op_t *mec) { if ( unlikely(!d->mem_event->paging.ring_page) ) return -ENODEV; switch( mec->op ) { case XENMEM_paging_op_nominate: { unsigned long gfn = mec->gfn; return p2m_mem_paging_nominate(d, gfn); } break; case XENMEM_paging_op_evict: { unsigned long gfn = mec->gfn; return p2m_mem_paging_evict(d, gfn); } break; case XENMEM_paging_op_prep: { unsigned long gfn = mec->gfn; return p2m_mem_paging_prep(d, gfn, mec->buffer); } break; default: return -ENOSYS; break; } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/mm-locks.h0000664000175000017500000002466412307313555015117 0ustar smbsmb/****************************************************************************** * arch/x86/mm/mm-locks.h * * Spinlocks used by the code in arch/x86/mm. * * Copyright (c) 2011 Citrix Systems, inc. * Copyright (c) 2007 Advanced Micro Devices (Wei Huang) * Copyright (c) 2006-2007 XenSource Inc. * Copyright (c) 2006 Michael A Fetterman * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _MM_LOCKS_H #define _MM_LOCKS_H #include /* Per-CPU variable for enforcing the lock ordering */ DECLARE_PER_CPU(int, mm_lock_level); #define __get_lock_level() (this_cpu(mm_lock_level)) static inline void mm_lock_init(mm_lock_t *l) { spin_lock_init(&l->lock); l->locker = -1; l->locker_function = "nobody"; l->unlock_level = 0; } static inline int mm_locked_by_me(mm_lock_t *l) { return (l->lock.recurse_cpu == current->processor); } /* If you see this crash, the numbers printed are lines in this file * where the offending locks are declared. */ #define __check_lock_level(l) \ do { \ if ( unlikely(__get_lock_level() > (l)) ) \ { \ printk("mm locking order violation: %i > %i\n", \ __get_lock_level(), (l)); \ BUG(); \ } \ } while(0) #define __set_lock_level(l) \ do { \ __get_lock_level() = (l); \ } while(0) static inline void _mm_lock(mm_lock_t *l, const char *func, int level, int rec) { if ( !((mm_locked_by_me(l)) && rec) ) __check_lock_level(level); spin_lock_recursive(&l->lock); if ( l->lock.recurse_cnt == 1 ) { l->locker_function = func; l->unlock_level = __get_lock_level(); } else if ( (unlikely(!rec)) ) panic("mm lock already held by %s", l->locker_function); __set_lock_level(level); } static inline void _mm_enforce_order_lock_pre(int level) { __check_lock_level(level); } static inline void _mm_enforce_order_lock_post(int level, int *unlock_level, unsigned short *recurse_count) { if ( recurse_count ) { if ( (*recurse_count)++ == 0 ) { *unlock_level = __get_lock_level(); } } else { *unlock_level = __get_lock_level(); } __set_lock_level(level); } static inline void mm_rwlock_init(mm_rwlock_t *l) { rwlock_init(&l->lock); l->locker = -1; l->locker_function = "nobody"; l->unlock_level = 0; } static inline int mm_write_locked_by_me(mm_rwlock_t *l) { return (l->locker == get_processor_id()); } static inline void _mm_write_lock(mm_rwlock_t *l, const char *func, int level) { if ( !mm_write_locked_by_me(l) ) { __check_lock_level(level); write_lock(&l->lock); l->locker = get_processor_id(); l->locker_function = func; l->unlock_level = __get_lock_level(); __set_lock_level(level); } l->recurse_count++; } static inline void mm_write_unlock(mm_rwlock_t *l) { if ( --(l->recurse_count) != 0 ) return; l->locker = -1; l->locker_function = "nobody"; __set_lock_level(l->unlock_level); write_unlock(&l->lock); } static inline void _mm_read_lock(mm_rwlock_t *l, int level) { __check_lock_level(level); read_lock(&l->lock); /* There's nowhere to store the per-CPU unlock level so we can't * set the lock level. 
*/ } static inline void mm_read_unlock(mm_rwlock_t *l) { read_unlock(&l->lock); } /* This wrapper uses the line number to express the locking order below */ #define declare_mm_lock(name) \ static inline void mm_lock_##name(mm_lock_t *l, const char *func, int rec)\ { _mm_lock(l, func, __LINE__, rec); } #define declare_mm_rwlock(name) \ static inline void mm_write_lock_##name(mm_rwlock_t *l, const char *func) \ { _mm_write_lock(l, func, __LINE__); } \ static inline void mm_read_lock_##name(mm_rwlock_t *l) \ { _mm_read_lock(l, __LINE__); } /* These capture the name of the calling function */ #define mm_lock(name, l) mm_lock_##name(l, __func__, 0) #define mm_lock_recursive(name, l) mm_lock_##name(l, __func__, 1) #define mm_write_lock(name, l) mm_write_lock_##name(l, __func__) #define mm_read_lock(name, l) mm_read_lock_##name(l) /* This wrapper is intended for "external" locks which do not use * the mm_lock_t types. Such locks inside the mm code are also subject * to ordering constraints. */ #define declare_mm_order_constraint(name) \ static inline void mm_enforce_order_lock_pre_##name(void) \ { _mm_enforce_order_lock_pre(__LINE__); } \ static inline void mm_enforce_order_lock_post_##name( \ int *unlock_level, unsigned short *recurse_count) \ { _mm_enforce_order_lock_post(__LINE__, unlock_level, recurse_count); } \ static inline void mm_unlock(mm_lock_t *l) { if ( l->lock.recurse_cnt == 1 ) { l->locker_function = "nobody"; __set_lock_level(l->unlock_level); } spin_unlock_recursive(&l->lock); } static inline void mm_enforce_order_unlock(int unlock_level, unsigned short *recurse_count) { if ( recurse_count ) { BUG_ON(*recurse_count == 0); if ( (*recurse_count)-- == 1 ) { __set_lock_level(unlock_level); } } else { __set_lock_level(unlock_level); } } /************************************************************************ * * * To avoid deadlocks, these locks _MUST_ be taken in the order they're * * declared in this file. The locking functions will enforce this. * * * ************************************************************************/ declare_mm_lock(nestedp2m) #define nestedp2m_lock(d) mm_lock(nestedp2m, &(d)->arch.nested_p2m_lock) #define nestedp2m_unlock(d) mm_unlock(&(d)->arch.nested_p2m_lock) /* P2M lock (per-p2m-table) * * This protects all queries and updates to the p2m table. * Queries may be made under the read lock but all modifications * need the main (write) lock. * * The write lock is recursive as it is common for a code path to look * up a gfn and later mutate it. */ declare_mm_rwlock(p2m); #define p2m_lock(p) mm_write_lock(p2m, &(p)->lock); #define p2m_unlock(p) mm_write_unlock(&(p)->lock); #define gfn_lock(p,g,o) p2m_lock(p) #define gfn_unlock(p,g,o) p2m_unlock(p) #define p2m_read_lock(p) mm_read_lock(p2m, &(p)->lock) #define p2m_read_unlock(p) mm_read_unlock(&(p)->lock) #define p2m_locked_by_me(p) mm_write_locked_by_me(&(p)->lock) #define gfn_locked_by_me(p,g) p2m_locked_by_me(p) /* Sharing per page lock * * This is an external lock, not represented by an mm_lock_t. The memory * sharing lock uses it to protect addition and removal of (gfn,domain) * tuples to a shared page. We enforce order here against the p2m lock, * which is taken after the page_lock to change the gfn's p2m entry. * * The lock is recursive because during share we lock two pages. 
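 */

/*
 * Illustrative sketch, not part of the original file: the ordering discipline
 * used throughout this header boils down to "never acquire a lock whose level
 * is lower than the highest level already held on this CPU".  A minimal
 * stand-alone model of that check, using a thread-local int in place of the
 * per-CPU mm_lock_level variable (all names below are hypothetical):
 */
#if 0 /* reference sketch, not built */
#include <assert.h>

static __thread int cur_lock_level;         /* analogue of mm_lock_level */

static int order_check_and_raise(int level)
{
    int saved = cur_lock_level;

    assert(cur_lock_level <= level);        /* would be the BUG() above */
    cur_lock_level = level;                 /* remember the deepest level held */
    return saved;                           /* caller restores this on unlock */
}

static void order_restore(int saved_level)
{
    cur_lock_level = saved_level;           /* cf. __set_lock_level(unlock_level) */
}
#endif

/*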
*/ declare_mm_order_constraint(per_page_sharing) #define page_sharing_mm_pre_lock() mm_enforce_order_lock_pre_per_page_sharing() #define page_sharing_mm_post_lock(l, r) \ mm_enforce_order_lock_post_per_page_sharing((l), (r)) #define page_sharing_mm_unlock(l, r) mm_enforce_order_unlock((l), (r)) /* Nested P2M lock (per-domain) * * A per-domain lock that protects the mapping from nested-CR3 to * nested-p2m. In particular it covers: * - the array of nested-p2m tables, and all LRU activity therein; and * - setting the "cr3" field of any p2m table to a non-P2M_BASE_EAADR value. * (i.e. assigning a p2m table to be the shadow of that cr3 */ /* PoD lock (per-p2m-table) * * Protects private PoD data structs: entry and cache * counts, page lists, sweep parameters. */ declare_mm_lock(pod) #define pod_lock(p) mm_lock(pod, &(p)->pod.lock) #define pod_unlock(p) mm_unlock(&(p)->pod.lock) #define pod_locked_by_me(p) mm_locked_by_me(&(p)->pod.lock) /* Page alloc lock (per-domain) * * This is an external lock, not represented by an mm_lock_t. However, * pod code uses it in conjunction with the p2m lock, and expecting * the ordering which we enforce here. * The lock is not recursive. */ declare_mm_order_constraint(page_alloc) #define page_alloc_mm_pre_lock() mm_enforce_order_lock_pre_page_alloc() #define page_alloc_mm_post_lock(l) mm_enforce_order_lock_post_page_alloc(&(l), NULL) #define page_alloc_mm_unlock(l) mm_enforce_order_unlock((l), NULL) /* Paging lock (per-domain) * * For shadow pagetables, this lock protects * - all changes to shadow page table pages * - the shadow hash table * - the shadow page allocator * - all changes to guest page table pages * - all changes to the page_info->tlbflush_timestamp * - the page_info->count fields on shadow pages * * For HAP, it protects the NPT/EPT tables and mode changes. * * It also protects the log-dirty bitmap from concurrent accesses (and * teardowns, etc). */ declare_mm_lock(paging) #define paging_lock(d) mm_lock(paging, &(d)->arch.paging.lock) #define paging_lock_recursive(d) \ mm_lock_recursive(paging, &(d)->arch.paging.lock) #define paging_unlock(d) mm_unlock(&(d)->arch.paging.lock) #define paging_locked_by_me(d) mm_locked_by_me(&(d)->arch.paging.lock) #endif /* _MM_LOCKS_H */ xen-4.4.0/xen/arch/x86/mm/hap/0000775000175000017500000000000012307313555013760 5ustar smbsmbxen-4.4.0/xen/arch/x86/mm/hap/Makefile0000664000175000017500000000037412307313555015424 0ustar smbsmbobj-y += hap.o obj-y += guest_walk_2level.o obj-y += guest_walk_3level.o obj-$(x86_64) += guest_walk_4level.o obj-y += nested_hap.o obj-y += nested_ept.o guest_walk_%level.o: guest_walk.c Makefile $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@ xen-4.4.0/xen/arch/x86/mm/hap/guest_walk.c0000664000175000017500000001046312307313555016275 0ustar smbsmb/* * arch/x86/mm/hap/guest_walk.c * * Guest page table walker * Copyright (c) 2007, AMD Corporation (Wei Huang) * Copyright (c) 2007, XenSource Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
* * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include "private.h" /* for hap_gva_to_gfn_* */ #define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels #define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels) #define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels #define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels) #if GUEST_PAGING_LEVELS > CONFIG_PAGING_LEVELS #error GUEST_PAGING_LEVELS must not exceed CONFIG_PAGING_LEVELS #endif #include #include unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)( struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec) { unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3]; return hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(v, p2m, cr3, gva, pfec, NULL); } unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)( struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3, paddr_t ga, uint32_t *pfec, unsigned int *page_order) { uint32_t missing; mfn_t top_mfn; void *top_map; p2m_type_t p2mt; walk_t gw; unsigned long top_gfn; struct page_info *top_page; /* Get the top-level table's MFN */ top_gfn = cr3 >> PAGE_SHIFT; top_page = get_page_from_gfn_p2m(p2m->domain, p2m, top_gfn, &p2mt, NULL, P2M_ALLOC | P2M_UNSHARE); if ( p2m_is_paging(p2mt) ) { ASSERT(!p2m_is_nestedp2m(p2m)); pfec[0] = PFEC_page_paged; if ( top_page ) put_page(top_page); p2m_mem_paging_populate(p2m->domain, cr3 >> PAGE_SHIFT); return INVALID_GFN; } if ( p2m_is_shared(p2mt) ) { pfec[0] = PFEC_page_shared; if ( top_page ) put_page(top_page); return INVALID_GFN; } if ( !top_page ) { pfec[0] &= ~PFEC_page_present; return INVALID_GFN; } top_mfn = _mfn(page_to_mfn(top_page)); /* Map the top-level table and call the tree-walker */ ASSERT(mfn_valid(mfn_x(top_mfn))); top_map = map_domain_page(mfn_x(top_mfn)); #if GUEST_PAGING_LEVELS == 3 top_map += (cr3 & ~(PAGE_MASK | 31)); #endif missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map); unmap_domain_page(top_map); put_page(top_page); /* Interpret the answer */ if ( missing == 0 ) { gfn_t gfn = guest_l1e_get_gfn(gw.l1e); struct page_info *page; page = get_page_from_gfn_p2m(p2m->domain, p2m, gfn_x(gfn), &p2mt, NULL, P2M_ALLOC | P2M_UNSHARE); if ( page ) put_page(page); if ( p2m_is_paging(p2mt) ) { ASSERT(!p2m_is_nestedp2m(p2m)); pfec[0] = PFEC_page_paged; p2m_mem_paging_populate(p2m->domain, gfn_x(gfn)); return INVALID_GFN; } if ( p2m_is_shared(p2mt) ) { pfec[0] = PFEC_page_shared; return INVALID_GFN; } if ( page_order ) *page_order = guest_walk_to_page_order(&gw); return gfn_x(gfn); } if ( missing & _PAGE_PRESENT ) pfec[0] &= ~PFEC_page_present; if ( missing & _PAGE_INVALID_BITS ) pfec[0] |= PFEC_reserved_bit; if ( missing & _PAGE_PAGED ) pfec[0] = PFEC_page_paged; if ( missing & _PAGE_SHARED ) pfec[0] = PFEC_page_shared; return INVALID_GFN; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/hap/nested_ept.c0000664000175000017500000002112112307313555016253 0ustar smbsmb/* * nested_ept.c: Handling virtulized EPT for guest in nested case. * * Copyright (c) 2012, Intel Corporation * Xiantao Zhang * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. 
* * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include "private.h" #include #include /* EPT always use 4-level paging structure */ #define GUEST_PAGING_LEVELS 4 #include /* Must reserved bits in all level entries */ #define EPT_MUST_RSV_BITS (((1ull << PADDR_BITS) - 1) & \ ~((1ull << paddr_bits) - 1)) #define NEPT_CAP_BITS \ (VMX_EPT_INVEPT_ALL_CONTEXT | VMX_EPT_INVEPT_SINGLE_CONTEXT | \ VMX_EPT_INVEPT_INSTRUCTION | VMX_EPT_SUPERPAGE_1GB | \ VMX_EPT_SUPERPAGE_2MB | VMX_EPT_MEMORY_TYPE_WB | \ VMX_EPT_MEMORY_TYPE_UC | VMX_EPT_WALK_LENGTH_4_SUPPORTED | \ VMX_EPT_EXEC_ONLY_SUPPORTED) #define NVPID_CAP_BITS \ (VMX_VPID_INVVPID_INSTRUCTION | VMX_VPID_INVVPID_INDIVIDUAL_ADDR | \ VMX_VPID_INVVPID_SINGLE_CONTEXT | VMX_VPID_INVVPID_ALL_CONTEXT | \ VMX_VPID_INVVPID_SINGLE_CONTEXT_RETAINING_GLOBAL) #define NEPT_1G_ENTRY_FLAG (1 << 11) #define NEPT_2M_ENTRY_FLAG (1 << 10) #define NEPT_4K_ENTRY_FLAG (1 << 9) bool_t nept_sp_entry(ept_entry_t e) { return !!(e.sp); } static bool_t nept_rsv_bits_check(ept_entry_t e, uint32_t level) { uint64_t rsv_bits = EPT_MUST_RSV_BITS; switch ( level ) { case 1: break; case 2 ... 3: if ( nept_sp_entry(e) ) rsv_bits |= ((1ull << (9 * (level - 1))) - 1) << PAGE_SHIFT; else rsv_bits |= EPTE_EMT_MASK | EPTE_IGMT_MASK; break; case 4: rsv_bits |= EPTE_EMT_MASK | EPTE_IGMT_MASK | EPTE_SUPER_PAGE_MASK; break; default: gdprintk(XENLOG_ERR,"Unsupported EPT paging level: %d\n", level); BUG(); break; } return !!(e.epte & rsv_bits); } /* EMT checking*/ static bool_t nept_emt_bits_check(ept_entry_t e, uint32_t level) { if ( e.sp || level == 1 ) { if ( e.emt == EPT_EMT_RSV0 || e.emt == EPT_EMT_RSV1 || e.emt == EPT_EMT_RSV2 ) return 1; } return 0; } static bool_t nept_permission_check(uint32_t rwx_acc, uint32_t rwx_bits) { return !(EPTE_RWX_MASK & rwx_acc & ~rwx_bits); } /* nept's non-present check */ static bool_t nept_non_present_check(ept_entry_t e) { if ( e.epte & EPTE_RWX_MASK ) return 0; return 1; } uint64_t nept_get_ept_vpid_cap(void) { uint64_t caps = 0; if ( cpu_has_vmx_ept ) caps |= NEPT_CAP_BITS; if ( !cpu_has_vmx_ept_exec_only_supported ) caps &= ~VMX_EPT_EXEC_ONLY_SUPPORTED; if ( cpu_has_vmx_vpid ) caps |= NVPID_CAP_BITS; return caps; } static bool_t nept_rwx_bits_check(ept_entry_t e) { /*write only or write/execute only*/ uint8_t rwx_bits = e.epte & EPTE_RWX_MASK; if ( rwx_bits == ept_access_w || rwx_bits == ept_access_wx ) return 1; if ( rwx_bits == ept_access_x && !(nept_get_ept_vpid_cap() & VMX_EPT_EXEC_ONLY_SUPPORTED) ) return 1; return 0; } /* nept's misconfiguration check */ static bool_t nept_misconfiguration_check(ept_entry_t e, uint32_t level) { return nept_rsv_bits_check(e, level) || nept_emt_bits_check(e, level) || nept_rwx_bits_check(e); } static int ept_lvl_table_offset(unsigned long gpa, int lvl) { return (gpa >> (EPT_L4_PAGETABLE_SHIFT -(4 - lvl) * 9)) & (EPT_PAGETABLE_ENTRIES - 1); } static uint32_t nept_walk_tables(struct vcpu *v, unsigned long l2ga, ept_walk_t *gw) { int lvl; p2m_type_t p2mt; uint32_t rc = 0, ret = 0, gflags; struct domain *d = v->domain; struct 
p2m_domain *p2m = d->arch.p2m; gfn_t base_gfn = _gfn(nhvm_vcpu_p2m_base(v) >> PAGE_SHIFT); mfn_t lxmfn; ept_entry_t *lxp = NULL; memset(gw, 0, sizeof(*gw)); for (lvl = 4; lvl > 0; lvl--) { lxp = map_domain_gfn(p2m, base_gfn, &lxmfn, &p2mt, P2M_ALLOC, &rc); if ( !lxp ) goto map_err; gw->lxe[lvl] = lxp[ept_lvl_table_offset(l2ga, lvl)]; unmap_domain_page(lxp); put_page(mfn_to_page(mfn_x(lxmfn))); if ( nept_non_present_check(gw->lxe[lvl]) ) goto non_present; if ( nept_misconfiguration_check(gw->lxe[lvl], lvl) ) goto misconfig_err; if ( (lvl == 2 || lvl == 3) && nept_sp_entry(gw->lxe[lvl]) ) { /* Generate a fake l1 table entry so callers don't all * have to understand superpages. */ unsigned long gfn_lvl_mask = (1ull << ((lvl - 1) * 9)) - 1; gfn_t start = _gfn(gw->lxe[lvl].mfn); /* Increment the pfn by the right number of 4k pages. */ start = _gfn((gfn_x(start) & ~gfn_lvl_mask) + ((l2ga >> PAGE_SHIFT) & gfn_lvl_mask)); gflags = (gw->lxe[lvl].epte & EPTE_FLAG_MASK) | (lvl == 3 ? NEPT_1G_ENTRY_FLAG: NEPT_2M_ENTRY_FLAG); gw->lxe[0].epte = (gfn_x(start) << PAGE_SHIFT) | gflags; goto done; } if ( lvl > 1 ) base_gfn = _gfn(gw->lxe[lvl].mfn); } /* If this is not a super entry, we can reach here. */ gflags = (gw->lxe[1].epte & EPTE_FLAG_MASK) | NEPT_4K_ENTRY_FLAG; gw->lxe[0].epte = (gw->lxe[1].epte & PAGE_MASK) | gflags; done: ret = EPT_TRANSLATE_SUCCEED; goto out; map_err: if ( rc == _PAGE_PAGED ) { ret = EPT_TRANSLATE_RETRY; goto out; } /* fall through to misconfig error */ misconfig_err: ret = EPT_TRANSLATE_MISCONFIG; goto out; non_present: ret = EPT_TRANSLATE_VIOLATION; /* fall through. */ out: return ret; } /* Translate a L2 guest address to L1 gpa via L1 EPT paging structure */ int nept_translate_l2ga(struct vcpu *v, paddr_t l2ga, unsigned int *page_order, uint32_t rwx_acc, unsigned long *l1gfn, uint8_t *p2m_acc, uint64_t *exit_qual, uint32_t *exit_reason) { uint32_t rc, rwx_bits = 0; ept_walk_t gw; rwx_acc &= EPTE_RWX_MASK; *l1gfn = INVALID_GFN; rc = nept_walk_tables(v, l2ga, &gw); switch ( rc ) { case EPT_TRANSLATE_SUCCEED: if ( likely(gw.lxe[0].epte & NEPT_2M_ENTRY_FLAG) ) { rwx_bits = gw.lxe[4].epte & gw.lxe[3].epte & gw.lxe[2].epte & EPTE_RWX_MASK; *page_order = 9; } else if ( gw.lxe[0].epte & NEPT_4K_ENTRY_FLAG ) { rwx_bits = gw.lxe[4].epte & gw.lxe[3].epte & gw.lxe[2].epte & gw.lxe[1].epte & EPTE_RWX_MASK; *page_order = 0; } else if ( gw.lxe[0].epte & NEPT_1G_ENTRY_FLAG ) { rwx_bits = gw.lxe[4].epte & gw.lxe[3].epte & EPTE_RWX_MASK; *page_order = 18; } else { gdprintk(XENLOG_ERR, "Uncorrect l1 entry!\n"); BUG(); } if ( nept_permission_check(rwx_acc, rwx_bits) ) { *l1gfn = gw.lxe[0].mfn; *p2m_acc = (uint8_t)rwx_bits; break; } rc = EPT_TRANSLATE_VIOLATION; /* Fall through to EPT violation if permission check fails. 
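 *
 * (Illustrative note, not in the original source: the violation case below
 * reports the fault through the VM-exit qualification, keeping the upper
 * bits and packing the attempted access type into bits 0-2 and the
 * permissions found during the EPT walk into bits 3-5:
 *
 *     qual = (qual & 0xffffffc0) | (rwx_bits << 3) | rwx_acc;
 *
 * so L1 can tell what was attempted and what its own tables allowed.)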
*/ case EPT_TRANSLATE_VIOLATION: *exit_qual = (*exit_qual & 0xffffffc0) | (rwx_bits << 3) | rwx_acc; *exit_reason = EXIT_REASON_EPT_VIOLATION; break; case EPT_TRANSLATE_MISCONFIG: rc = EPT_TRANSLATE_MISCONFIG; *exit_qual = 0; *exit_reason = EXIT_REASON_EPT_MISCONFIG; break; case EPT_TRANSLATE_RETRY: break; default: gdprintk(XENLOG_ERR, "Unsupported ept translation type!:%d\n", rc); BUG(); break; } return rc; } xen-4.4.0/xen/arch/x86/mm/hap/nested_hap.c0000664000175000017500000002161612307313555016244 0ustar smbsmb/****************************************************************************** * arch/x86/mm/hap/nested_hap.c * * Code for Nested Virtualization * Copyright (c) 2011 Advanced Micro Devices * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include "private.h" /* AlGORITHM for NESTED PAGE FAULT * * NOTATION * Levels: L0, L1, L2 * Guests: L1 guest, L2 guest * Hypervisor: L0 hypervisor * Addresses: L2-GVA, L2-GPA, L1-GVA, L1-GPA, MPA * * On L0, when #NPF happens, the handler function should do: * hap_page_fault(GPA) * { * 1. If #NPF is from L1 guest, then we crash the guest VM (same as old * code) * 2. If #NPF is from L2 guest, then we continue from (3) * 3. Get np2m base from L1 guest. Map np2m base into L0 hypervisor address * space. * 4. Walk the np2m's page table * 5. - if not present or permission check failure, then we inject #NPF * back to L1 guest and * re-launch L1 guest (L1 guest will either treat this #NPF as MMIO, * or fix its p2m table for L2 guest) * 6. - if present, then we will get the a new translated value L1-GPA * (points to L1 machine memory) * 7. * Use L1-GPA to walk L0 P2M table * 8. - if not present, then crash the guest (should not happen) * 9. - if present, then we get a new translated value MPA * (points to real machine memory) * 10. * Finally, use GPA and MPA to walk nested_p2m * and fix the bits. 
* } * */ /********************************************/ /* NESTED VIRT P2M FUNCTIONS */ /********************************************/ /* Override macros from asm/page.h to make them work with mfn_t */ #undef mfn_valid #define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) #undef page_to_mfn #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level) { struct domain *d = p2m->domain; uint32_t old_flags; paging_lock(d); old_flags = l1e_get_flags(*p); safe_write_pte(p, new); if (old_flags & _PAGE_PRESENT) flush_tlb_mask(p2m->dirty_cpumask); paging_unlock(d); } /********************************************/ /* NESTED VIRT FUNCTIONS */ /********************************************/ static void nestedhap_fix_p2m(struct vcpu *v, struct p2m_domain *p2m, paddr_t L2_gpa, paddr_t L0_gpa, unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma) { int rv = 1; ASSERT(p2m); ASSERT(p2m->set_entry); p2m_lock(p2m); /* If this p2m table has been flushed or recycled under our feet, * leave it alone. We'll pick up the right one as we try to * vmenter the guest. */ if ( p2m->np2m_base == nhvm_vcpu_p2m_base(v) ) { unsigned long gfn, mask; mfn_t mfn; /* If this is a superpage mapping, round down both addresses * to the start of the superpage. */ mask = ~((1UL << page_order) - 1); gfn = (L2_gpa >> PAGE_SHIFT) & mask; mfn = _mfn((L0_gpa >> PAGE_SHIFT) & mask); rv = set_p2m_entry(p2m, gfn, mfn, page_order, p2mt, p2ma); } p2m_unlock(p2m); if (rv == 0) { gdprintk(XENLOG_ERR, "failed to set entry for %#"PRIx64" -> %#"PRIx64"\n", L2_gpa, L0_gpa); BUG(); } } /* This function uses L2_gpa to walk the P2M page table in L1. If the * walk is successful, the translated value is returned in * L1_gpa. The result value tells what to do next. */ static int nestedhap_walk_L1_p2m(struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa, unsigned int *page_order, uint8_t *p2m_acc, bool_t access_r, bool_t access_w, bool_t access_x) { ASSERT(hvm_funcs.nhvm_hap_walk_L1_p2m); return hvm_funcs.nhvm_hap_walk_L1_p2m(v, L2_gpa, L1_gpa, page_order, p2m_acc, access_r, access_w, access_x); } /* This function uses L1_gpa to walk the P2M table in L0 hypervisor. If the * walk is successful, the translated value is returned in L0_gpa. The return * value tells the upper level what to do. */ static int nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa, p2m_type_t *p2mt, p2m_access_t *p2ma, unsigned int *page_order, bool_t access_r, bool_t access_w, bool_t access_x) { mfn_t mfn; int rc; /* walk L0 P2M table */ mfn = get_gfn_type_access(p2m, L1_gpa >> PAGE_SHIFT, p2mt, p2ma, 0, page_order); rc = NESTEDHVM_PAGEFAULT_DIRECT_MMIO; if ( *p2mt == p2m_mmio_direct ) goto direct_mmio_out; rc = NESTEDHVM_PAGEFAULT_MMIO; if ( *p2mt == p2m_mmio_dm ) goto out; rc = NESTEDHVM_PAGEFAULT_L0_ERROR; if ( access_w && p2m_is_readonly(*p2mt) ) goto out; if ( p2m_is_paging(*p2mt) || p2m_is_shared(*p2mt) || !p2m_is_ram(*p2mt) ) goto out; if ( !mfn_valid(mfn) ) goto out; rc = NESTEDHVM_PAGEFAULT_DONE; direct_mmio_out: *L0_gpa = (mfn_x(mfn) << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK); out: __put_gfn(p2m, L1_gpa >> PAGE_SHIFT); return rc; } /* * The following function, nestedhap_page_fault(), is for steps (3)--(10). 
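 */

/*
 * Illustrative sketch, not part of the original file: the handler below
 * composes two independent translations -- L2 GPA to L1 GPA through the
 * guest's np2m, then L1 GPA to a machine address through the host p2m --
 * and only installs a combined mapping when both walks succeed.  The types
 * and helpers here are hypothetical stand-ins for the real walkers.
 */
#if 0 /* reference sketch, not built */
typedef int (*translate_fn_t)(unsigned long in, unsigned long *out,
                              unsigned int *order);

static int compose_translation(translate_fn_t l1_walk, translate_fn_t l0_walk,
                               unsigned long l2_gpa, unsigned long *mpa,
                               unsigned int *order)
{
    unsigned long l1_gpa;
    unsigned int order_21, order_10;
    int rc;

    rc = l1_walk(l2_gpa, &l1_gpa, &order_21);   /* steps (3)-(5): walk the np2m */
    if ( rc )
        return rc;                              /* inject/retry back to L1 */

    rc = l0_walk(l1_gpa, mpa, &order_10);       /* steps (7)-(9): walk the host p2m */
    if ( rc )
        return rc;

    /* Step (10): the combined mapping is no larger than either input mapping. */
    *order = order_21 < order_10 ? order_21 : order_10;
    return 0;
}
#endif

/*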
* * Returns: */ int nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t *L2_gpa, bool_t access_r, bool_t access_w, bool_t access_x) { int rv; paddr_t L1_gpa, L0_gpa; struct domain *d = v->domain; struct p2m_domain *p2m, *nested_p2m; unsigned int page_order_21, page_order_10, page_order_20; p2m_type_t p2mt_10; p2m_access_t p2ma_10 = p2m_access_rwx; uint8_t p2ma_21 = p2m_access_rwx; p2m = p2m_get_hostp2m(d); /* L0 p2m */ nested_p2m = p2m_get_nestedp2m(v, nhvm_vcpu_p2m_base(v)); /* walk the L1 P2M table */ rv = nestedhap_walk_L1_p2m(v, *L2_gpa, &L1_gpa, &page_order_21, &p2ma_21, access_r, access_w, access_x); /* let caller to handle these two cases */ switch (rv) { case NESTEDHVM_PAGEFAULT_INJECT: case NESTEDHVM_PAGEFAULT_RETRY: case NESTEDHVM_PAGEFAULT_L1_ERROR: return rv; case NESTEDHVM_PAGEFAULT_DONE: break; default: BUG(); break; } /* ==> we have to walk L0 P2M */ rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa, &p2mt_10, &p2ma_10, &page_order_10, access_r, access_w, access_x); /* let upper level caller to handle these two cases */ switch (rv) { case NESTEDHVM_PAGEFAULT_INJECT: return rv; case NESTEDHVM_PAGEFAULT_L0_ERROR: *L2_gpa = L1_gpa; return rv; case NESTEDHVM_PAGEFAULT_DONE: break; case NESTEDHVM_PAGEFAULT_MMIO: return rv; case NESTEDHVM_PAGEFAULT_DIRECT_MMIO: break; default: BUG(); break; } page_order_20 = min(page_order_21, page_order_10); ASSERT(p2ma_10 <= p2m_access_n2rwx); /*NOTE: if assert fails, needs to handle new access type here */ switch ( p2ma_10 ) { case p2m_access_n ... p2m_access_rwx: break; case p2m_access_rx2rw: p2ma_10 = p2m_access_rx; break; case p2m_access_n2rwx: p2ma_10 = p2m_access_n; break; default: p2ma_10 = p2m_access_n; /* For safety, remove all permissions. */ gdprintk(XENLOG_ERR, "Unhandled p2m access type:%d\n", p2ma_10); } /* Use minimal permission for nested p2m. */ p2ma_10 &= (p2m_access_t)p2ma_21; /* fix p2m_get_pagetable(nested_p2m) */ nestedhap_fix_p2m(v, nested_p2m, *L2_gpa, L0_gpa, page_order_20, p2mt_10, p2ma_10); return NESTEDHVM_PAGEFAULT_DONE; } /********************************************/ /* NESTED VIRT INITIALIZATION FUNCS */ /********************************************/ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/hap/private.h0000664000175000017500000000420012307313555015577 0ustar smbsmb/* * arch/x86/mm/hap/private.h * * Copyright (c) 2007, AMD Corporation (Wei Huang) * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
* */ #ifndef __HAP_PRIVATE_H__ #define __HAP_PRIVATE_H__ #include "../mm-locks.h" /********************************************/ /* GUEST TRANSLATION FUNCS */ /********************************************/ unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec); unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec); unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec); unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3, paddr_t ga, uint32_t *pfec, unsigned int *page_order); unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3, paddr_t ga, uint32_t *pfec, unsigned int *page_order); unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3, paddr_t ga, uint32_t *pfec, unsigned int *page_order); #endif /* __HAP_PRIVATE_H__ */ xen-4.4.0/xen/arch/x86/mm/hap/hap.c0000664000175000017500000005557612307313555014716 0ustar smbsmb/****************************************************************************** * arch/x86/mm/hap/hap.c * * hardware assisted paging * Copyright (c) 2007 Advanced Micro Devices (Wei Huang) * Parts of this code are Copyright (c) 2007 by XenSource Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "private.h" /* Override macros from asm/page.h to make them work with mfn_t */ #undef mfn_to_page #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m)) #undef mfn_valid #define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) #undef page_to_mfn #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) /************************************************/ /* HAP VRAM TRACKING SUPPORT */ /************************************************/ /* * hap_track_dirty_vram() * Create the domain's dv_dirty_vram struct on demand. * Create a dirty vram range on demand when some [begin_pfn:begin_pfn+nr] is * first encountered. * Collect the guest_dirty bitmask, a bit mask of the dirty vram pages, by * calling paging_log_dirty_range(), which interrogates each vram * page's p2m type looking for pages that have been made writable. 
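 */

/*
 * Illustrative sketch, not part of the original file: the guest-visible result
 * of the function below is a byte-granular bitmap with one bit per page in
 * [begin_pfn, begin_pfn + nr).  The bitmap arithmetic in isolation, with
 * hypothetical names and no Xen types:
 */
#if 0 /* reference sketch, not built */
#include <stdlib.h>

static unsigned char *make_dirty_bitmap(unsigned long nr,
                                        int (*page_was_written)(unsigned long))
{
    size_t size = (nr + 7) / 8;                 /* one bit per page, rounded up */
    unsigned char *bm = calloc(size, 1);
    unsigned long i;

    if ( !bm )
        return NULL;
    for ( i = 0; i < nr; i++ )
        if ( page_was_written(i) )
            bm[i / 8] |= 1u << (i % 8);         /* mark page i as dirty */
    return bm;                                  /* caller frees */
}
#endif

/*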
*/ int hap_track_dirty_vram(struct domain *d, unsigned long begin_pfn, unsigned long nr, XEN_GUEST_HANDLE_64(uint8) guest_dirty_bitmap) { long rc = 0; struct sh_dirty_vram *dirty_vram; uint8_t *dirty_bitmap = NULL; if ( nr ) { int size = (nr + BITS_PER_BYTE - 1) / BITS_PER_BYTE; if ( !paging_mode_log_dirty(d) ) { hap_logdirty_init(d); rc = paging_log_dirty_enable(d, 0); if ( rc ) goto out; } rc = -ENOMEM; dirty_bitmap = xzalloc_bytes(size); if ( !dirty_bitmap ) goto out; paging_lock(d); dirty_vram = d->arch.hvm_domain.dirty_vram; if ( !dirty_vram ) { rc = -ENOMEM; if ( (dirty_vram = xzalloc(struct sh_dirty_vram)) == NULL ) { paging_unlock(d); goto out; } d->arch.hvm_domain.dirty_vram = dirty_vram; } if ( begin_pfn != dirty_vram->begin_pfn || begin_pfn + nr != dirty_vram->end_pfn ) { dirty_vram->begin_pfn = begin_pfn; dirty_vram->end_pfn = begin_pfn + nr; paging_unlock(d); /* set l1e entries of range within P2M table to be read-only. */ p2m_change_type_range(d, begin_pfn, begin_pfn + nr, p2m_ram_rw, p2m_ram_logdirty); flush_tlb_mask(d->domain_dirty_cpumask); memset(dirty_bitmap, 0xff, size); /* consider all pages dirty */ } else { paging_unlock(d); domain_pause(d); /* get the bitmap */ paging_log_dirty_range(d, begin_pfn, nr, dirty_bitmap); domain_unpause(d); } rc = -EFAULT; if ( copy_to_guest(guest_dirty_bitmap, dirty_bitmap, size) == 0 ) rc = 0; } else { paging_lock(d); dirty_vram = d->arch.hvm_domain.dirty_vram; if ( dirty_vram ) { /* * If zero pages specified while tracking dirty vram * then stop tracking */ xfree(dirty_vram); d->arch.hvm_domain.dirty_vram = NULL; } paging_unlock(d); } out: if ( dirty_bitmap ) xfree(dirty_bitmap); return rc; } /************************************************/ /* HAP LOG DIRTY SUPPORT */ /************************************************/ /* * hap code to call when log_dirty is enable. return 0 if no problem found. * * NB: Domain that having device assigned should not set log_global. Because * there is no way to track the memory updating from device. */ static int hap_enable_log_dirty(struct domain *d, bool_t log_global) { /* turn on PG_log_dirty bit in paging mode */ paging_lock(d); d->arch.paging.mode |= PG_log_dirty; paging_unlock(d); if ( log_global ) { /* set l1e entries of P2M table to be read-only. */ p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty); flush_tlb_mask(d->domain_dirty_cpumask); } return 0; } static int hap_disable_log_dirty(struct domain *d) { paging_lock(d); d->arch.paging.mode &= ~PG_log_dirty; paging_unlock(d); /* set l1e entries of P2M table with normal mode */ p2m_change_entry_type_global(d, p2m_ram_logdirty, p2m_ram_rw); return 0; } static void hap_clean_dirty_bitmap(struct domain *d) { /* set l1e entries of P2M table to be read-only. 
*/ p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty); flush_tlb_mask(d->domain_dirty_cpumask); } void hap_logdirty_init(struct domain *d) { /* Reinitialize logdirty mechanism */ paging_log_dirty_init(d, hap_enable_log_dirty, hap_disable_log_dirty, hap_clean_dirty_bitmap); } /************************************************/ /* HAP SUPPORT FUNCTIONS */ /************************************************/ static struct page_info *hap_alloc(struct domain *d) { struct page_info *pg = NULL; void *p; ASSERT(paging_locked_by_me(d)); pg = page_list_remove_head(&d->arch.paging.hap.freelist); if ( unlikely(!pg) ) return NULL; d->arch.paging.hap.free_pages--; p = __map_domain_page(pg); ASSERT(p != NULL); clear_page(p); hap_unmap_domain_page(p); return pg; } static void hap_free(struct domain *d, mfn_t mfn) { struct page_info *pg = mfn_to_page(mfn); ASSERT(paging_locked_by_me(d)); d->arch.paging.hap.free_pages++; page_list_add_tail(pg, &d->arch.paging.hap.freelist); } static struct page_info *hap_alloc_p2m_page(struct domain *d) { struct page_info *pg; /* This is called both from the p2m code (which never holds the * paging lock) and the log-dirty code (which always does). */ paging_lock_recursive(d); pg = hap_alloc(d); if ( likely(pg != NULL) ) { d->arch.paging.hap.total_pages--; d->arch.paging.hap.p2m_pages++; page_set_owner(pg, d); pg->count_info |= 1; } else if ( !d->arch.paging.p2m_alloc_failed ) { d->arch.paging.p2m_alloc_failed = 1; dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool", d->domain_id); } paging_unlock(d); return pg; } static void hap_free_p2m_page(struct domain *d, struct page_info *pg) { /* This is called both from the p2m code (which never holds the * paging lock) and the log-dirty code (which always does). */ paging_lock_recursive(d); ASSERT(page_get_owner(pg) == d); /* Should have just the one ref we gave it in alloc_p2m_page() */ if ( (pg->count_info & PGC_count_mask) != 1 ) { HAP_ERROR("Odd p2m page %p count c=%#lx t=%"PRtype_info"\n", pg, pg->count_info, pg->u.inuse.type_info); WARN(); } pg->count_info &= ~PGC_count_mask; /* Free should not decrement domain's total allocation, since * these pages were allocated without an owner. */ page_set_owner(pg, NULL); d->arch.paging.hap.p2m_pages--; d->arch.paging.hap.total_pages++; hap_free(d, page_to_mfn(pg)); paging_unlock(d); } /* Return the size of the pool, rounded up to the nearest MB */ static unsigned int hap_get_allocation(struct domain *d) { unsigned int pg = d->arch.paging.hap.total_pages + d->arch.paging.hap.p2m_pages; return ((pg >> (20 - PAGE_SHIFT)) + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0)); } /* Set the pool of pages to the required number of pages. * Returns 0 for success, non-zero for failure. 
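 */

/*
 * Illustrative sketch, not part of the original file: the pool size is
 * reported to the toolstack in MB (hap_get_allocation() above rounds up so a
 * non-empty pool never reads as 0 MB) and set in pages here.  With 4 KiB
 * pages that is 256 pages per MB:
 */
#if 0 /* reference sketch, not built */
#define SKETCH_PAGE_SHIFT 12                            /* 4 KiB pages assumed */
#define SKETCH_PAGES_PER_MB (1u << (20 - SKETCH_PAGE_SHIFT))

static unsigned int pool_pages_to_mb(unsigned int pages)
{
    return (pages + SKETCH_PAGES_PER_MB - 1) / SKETCH_PAGES_PER_MB; /* round up */
}

static unsigned int pool_mb_to_pages(unsigned int mb)
{
    return mb * SKETCH_PAGES_PER_MB;         /* cf. sc->mb << (20 - PAGE_SHIFT) */
}
#endif

/*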
*/ static unsigned int hap_set_allocation(struct domain *d, unsigned int pages, int *preempted) { struct page_info *pg; ASSERT(paging_locked_by_me(d)); if ( pages < d->arch.paging.hap.p2m_pages ) pages = 0; else pages -= d->arch.paging.hap.p2m_pages; while ( d->arch.paging.hap.total_pages != pages ) { if ( d->arch.paging.hap.total_pages < pages ) { /* Need to allocate more memory from domheap */ pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); if ( pg == NULL ) { HAP_PRINTK("failed to allocate hap pages.\n"); return -ENOMEM; } d->arch.paging.hap.free_pages++; d->arch.paging.hap.total_pages++; page_list_add_tail(pg, &d->arch.paging.hap.freelist); } else if ( d->arch.paging.hap.total_pages > pages ) { /* Need to return memory to domheap */ if ( page_list_empty(&d->arch.paging.hap.freelist) ) { HAP_PRINTK("failed to free enough hap pages.\n"); return -ENOMEM; } pg = page_list_remove_head(&d->arch.paging.hap.freelist); ASSERT(pg); d->arch.paging.hap.free_pages--; d->arch.paging.hap.total_pages--; free_domheap_page(pg); } /* Check to see if we need to yield and try again */ if ( preempted && hypercall_preempt_check() ) { *preempted = 1; return 0; } } return 0; } static void hap_install_xen_entries_in_l4(struct vcpu *v, mfn_t l4mfn) { struct domain *d = v->domain; l4_pgentry_t *l4e; l4e = hap_map_domain_page(l4mfn); ASSERT(l4e != NULL); /* Copy the common Xen mappings from the idle domain */ memcpy(&l4e[ROOT_PAGETABLE_FIRST_XEN_SLOT], &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT], ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t)); /* Install the per-domain mappings for this domain */ l4e[l4_table_offset(PERDOMAIN_VIRT_START)] = l4e_from_pfn(mfn_x(page_to_mfn(d->arch.perdomain_l3_pg)), __PAGE_HYPERVISOR); /* Install a linear mapping */ l4e[l4_table_offset(LINEAR_PT_VIRT_START)] = l4e_from_pfn(mfn_x(l4mfn), __PAGE_HYPERVISOR); hap_unmap_domain_page(l4e); } static mfn_t hap_make_monitor_table(struct vcpu *v) { struct domain *d = v->domain; struct page_info *pg; mfn_t m4mfn; ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0); if ( (pg = hap_alloc(d)) == NULL ) goto oom; m4mfn = page_to_mfn(pg); hap_install_xen_entries_in_l4(v, m4mfn); return m4mfn; oom: HAP_ERROR("out of memory building monitor pagetable\n"); domain_crash(d); return _mfn(INVALID_MFN); } static void hap_destroy_monitor_table(struct vcpu* v, mfn_t mmfn) { struct domain *d = v->domain; /* Put the memory back in the pool */ hap_free(d, mmfn); } /************************************************/ /* HAP DOMAIN LEVEL FUNCTIONS */ /************************************************/ void hap_domain_init(struct domain *d) { INIT_PAGE_LIST_HEAD(&d->arch.paging.hap.freelist); } /* return 0 for success, -errno for failure */ int hap_enable(struct domain *d, u32 mode) { unsigned int old_pages; uint8_t i; int rv = 0; domain_pause(d); /* error check */ if ( (d == current->domain) ) { rv = -EINVAL; goto out; } old_pages = d->arch.paging.hap.total_pages; if ( old_pages == 0 ) { unsigned int r; paging_lock(d); r = hap_set_allocation(d, 256, NULL); if ( r != 0 ) { hap_set_allocation(d, 0, NULL); paging_unlock(d); rv = -ENOMEM; goto out; } paging_unlock(d); } /* Allow p2m and log-dirty code to borrow our memory */ d->arch.paging.alloc_page = hap_alloc_p2m_page; d->arch.paging.free_page = hap_free_p2m_page; /* allocate P2m table */ if ( mode & PG_translate ) { rv = p2m_alloc_table(p2m_get_hostp2m(d)); if ( rv != 0 ) goto out; } for (i = 0; i < MAX_NESTEDP2M; i++) { rv = p2m_alloc_table(d->arch.nested_p2m[i]); if ( rv != 0 ) goto out; } /* 
Now let other users see the new mode */ d->arch.paging.mode = mode | PG_HAP_enable; out: domain_unpause(d); return rv; } void hap_final_teardown(struct domain *d) { uint8_t i; /* Destroy nestedp2m's first */ for (i = 0; i < MAX_NESTEDP2M; i++) { p2m_teardown(d->arch.nested_p2m[i]); } if ( d->arch.paging.hap.total_pages != 0 ) hap_teardown(d); p2m_teardown(p2m_get_hostp2m(d)); /* Free any memory that the p2m teardown released */ paging_lock(d); hap_set_allocation(d, 0, NULL); ASSERT(d->arch.paging.hap.p2m_pages == 0); paging_unlock(d); } void hap_teardown(struct domain *d) { struct vcpu *v; mfn_t mfn; ASSERT(d->is_dying); ASSERT(d != current->domain); if ( !paging_locked_by_me(d) ) paging_lock(d); /* Keep various asserts happy */ if ( paging_mode_enabled(d) ) { /* release the monitor table held by each vcpu */ for_each_vcpu ( d, v ) { if ( paging_get_hostmode(v) && paging_mode_external(d) ) { mfn = pagetable_get_mfn(v->arch.monitor_table); if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) ) hap_destroy_monitor_table(v, mfn); v->arch.monitor_table = pagetable_null(); } } } if ( d->arch.paging.hap.total_pages != 0 ) { HAP_PRINTK("teardown of domain %u starts." " pages total = %u, free = %u, p2m=%u\n", d->domain_id, d->arch.paging.hap.total_pages, d->arch.paging.hap.free_pages, d->arch.paging.hap.p2m_pages); hap_set_allocation(d, 0, NULL); HAP_PRINTK("teardown done." " pages total = %u, free = %u, p2m=%u\n", d->arch.paging.hap.total_pages, d->arch.paging.hap.free_pages, d->arch.paging.hap.p2m_pages); ASSERT(d->arch.paging.hap.total_pages == 0); } d->arch.paging.mode &= ~PG_log_dirty; xfree(d->arch.hvm_domain.dirty_vram); d->arch.hvm_domain.dirty_vram = NULL; paging_unlock(d); } int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc, XEN_GUEST_HANDLE_PARAM(void) u_domctl) { int rc, preempted = 0; switch ( sc->op ) { case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: paging_lock(d); rc = hap_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted); paging_unlock(d); if ( preempted ) /* Not finished. Set up to re-run the call. */ rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h", u_domctl); else /* Finished. Return the new allocation */ sc->mb = hap_get_allocation(d); return rc; case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: sc->mb = hap_get_allocation(d); /* Fall through... */ case XEN_DOMCTL_SHADOW_OP_OFF: return 0; default: HAP_PRINTK("Bad hap domctl op %u\n", sc->op); return -EINVAL; } } static const struct paging_mode hap_paging_real_mode; static const struct paging_mode hap_paging_protected_mode; static const struct paging_mode hap_paging_pae_mode; static const struct paging_mode hap_paging_long_mode; void hap_vcpu_init(struct vcpu *v) { v->arch.paging.mode = &hap_paging_real_mode; v->arch.paging.nestedmode = &hap_paging_real_mode; } /************************************************/ /* HAP PAGING MODE FUNCTIONS */ /************************************************/ /* * HAP guests can handle page faults (in the guest page tables) without * needing any action from Xen, so we should not be intercepting them. */ static int hap_page_fault(struct vcpu *v, unsigned long va, struct cpu_user_regs *regs) { struct domain *d = v->domain; HAP_ERROR("Intercepted a guest #PF (%u:%u) with HAP enabled.\n", d->domain_id, v->vcpu_id); domain_crash(d); return 0; } /* * HAP guests can handle invlpg without needing any action from Xen, so * should not be intercepting it. 
*/ static int hap_invlpg(struct vcpu *v, unsigned long va) { if (nestedhvm_enabled(v->domain)) { /* Emulate INVLPGA: * Must perform the flush right now or an other vcpu may * use it when we use the next VMRUN emulation, otherwise. */ p2m_flush(v, vcpu_nestedhvm(v).nv_p2m); return 1; } HAP_ERROR("Intercepted a guest INVLPG (%u:%u) with HAP enabled.\n", v->domain->domain_id, v->vcpu_id); domain_crash(v->domain); return 0; } static void hap_update_cr3(struct vcpu *v, int do_locking) { v->arch.hvm_vcpu.hw_cr[3] = v->arch.hvm_vcpu.guest_cr[3]; hvm_update_guest_cr(v, 3); } const struct paging_mode * hap_paging_get_mode(struct vcpu *v) { return !hvm_paging_enabled(v) ? &hap_paging_real_mode : hvm_long_mode_enabled(v) ? &hap_paging_long_mode : hvm_pae_enabled(v) ? &hap_paging_pae_mode : &hap_paging_protected_mode; } static void hap_update_paging_modes(struct vcpu *v) { struct domain *d = v->domain; unsigned long cr3_gfn = v->arch.hvm_vcpu.guest_cr[3] >> PAGE_SHIFT; p2m_type_t t; /* We hold onto the cr3 as it may be modified later, and * we need to respect lock ordering. No need for * checks here as they are performed by vmx_load_pdptrs * (the potential user of the cr3) */ (void)get_gfn(d, cr3_gfn, &t); paging_lock(d); v->arch.paging.mode = hap_paging_get_mode(v); if ( pagetable_is_null(v->arch.monitor_table) ) { mfn_t mmfn = hap_make_monitor_table(v); v->arch.monitor_table = pagetable_from_mfn(mmfn); make_cr3(v, mfn_x(mmfn)); hvm_update_host_cr3(v); } /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */ hap_update_cr3(v, 0); paging_unlock(d); put_gfn(d, cr3_gfn); } static void hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level) { struct domain *d = v->domain; uint32_t old_flags; bool_t flush_nestedp2m = 0; /* We know always use the host p2m here, regardless if the vcpu * is in host or guest mode. The vcpu can be in guest mode by * a hypercall which passes a domain and chooses mostly the first * vcpu. */ paging_lock(d); old_flags = l1e_get_flags(*p); if ( nestedhvm_enabled(d) && (old_flags & _PAGE_PRESENT) && !p2m_get_hostp2m(d)->defer_nested_flush ) { /* We are replacing a valid entry so we need to flush nested p2ms, * unless the only change is an increase in access rights. */ mfn_t omfn = _mfn(l1e_get_pfn(*p)); mfn_t nmfn = _mfn(l1e_get_pfn(new)); flush_nestedp2m = !( mfn_x(omfn) == mfn_x(nmfn) && perms_strictly_increased(old_flags, l1e_get_flags(new)) ); } safe_write_pte(p, new); if ( (old_flags & _PAGE_PRESENT) && (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) ) flush_tlb_mask(d->domain_dirty_cpumask); paging_unlock(d); if ( flush_nestedp2m ) p2m_flush_nestedp2m(d); } static unsigned long hap_gva_to_gfn_real_mode( struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec) { return ((paddr_t)gva >> PAGE_SHIFT); } static unsigned long hap_p2m_ga_to_gfn_real_mode( struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3, paddr_t ga, uint32_t *pfec, unsigned int *page_order) { if ( page_order ) *page_order = PAGE_ORDER_4K; return (ga >> PAGE_SHIFT); } /* Entry points into this mode of the hap code. 
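 */

/*
 * Illustrative sketch, not part of the original file: each mode below is a
 * constant table of function pointers, and hap_paging_get_mode() above picks
 * one from the vCPU's paging state.  The same pattern in miniature (PAE
 * omitted, walkers replaced by a placeholder; all names hypothetical):
 */
#if 0 /* reference sketch, not built */
struct mode_ops {
    unsigned int guest_levels;
    unsigned long (*gva_to_gfn)(unsigned long gva);
};

static unsigned long gva_shift(unsigned long gva) { return gva >> 12; }

static const struct mode_ops real_ops      = { 1, gva_shift };
static const struct mode_ops protected_ops = { 2, gva_shift };
static const struct mode_ops long_ops      = { 4, gva_shift };

static const struct mode_ops *pick_mode(int paging_enabled, int long_mode)
{
    if ( !paging_enabled )
        return &real_ops;           /* no guest paging: GVA >> 12 is the GFN */
    return long_mode ? &long_ops : &protected_ops;
}
#endif

/*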
*/ static const struct paging_mode hap_paging_real_mode = { .page_fault = hap_page_fault, .invlpg = hap_invlpg, .gva_to_gfn = hap_gva_to_gfn_real_mode, .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_real_mode, .update_cr3 = hap_update_cr3, .update_paging_modes = hap_update_paging_modes, .write_p2m_entry = hap_write_p2m_entry, .guest_levels = 1 }; static const struct paging_mode hap_paging_protected_mode = { .page_fault = hap_page_fault, .invlpg = hap_invlpg, .gva_to_gfn = hap_gva_to_gfn_2_levels, .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_2_levels, .update_cr3 = hap_update_cr3, .update_paging_modes = hap_update_paging_modes, .write_p2m_entry = hap_write_p2m_entry, .guest_levels = 2 }; static const struct paging_mode hap_paging_pae_mode = { .page_fault = hap_page_fault, .invlpg = hap_invlpg, .gva_to_gfn = hap_gva_to_gfn_3_levels, .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_3_levels, .update_cr3 = hap_update_cr3, .update_paging_modes = hap_update_paging_modes, .write_p2m_entry = hap_write_p2m_entry, .guest_levels = 3 }; static const struct paging_mode hap_paging_long_mode = { .page_fault = hap_page_fault, .invlpg = hap_invlpg, .gva_to_gfn = hap_gva_to_gfn_4_levels, .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_4_levels, .update_cr3 = hap_update_cr3, .update_paging_modes = hap_update_paging_modes, .write_p2m_entry = hap_write_p2m_entry, .guest_levels = 4 }; /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/paging.c0000664000175000017500000005237112307313555014631 0ustar smbsmb/****************************************************************************** * arch/x86/paging.c * * x86 specific paging support * Copyright (c) 2007 Advanced Micro Devices (Wei Huang) * Copyright (c) 2007 XenSource Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include "mm-locks.h" /* Printouts */ #define PAGING_PRINTK(_f, _a...) \ debugtrace_printk("pg: %s(): " _f, __func__, ##_a) #define PAGING_ERROR(_f, _a...) \ printk("pg error: %s(): " _f, __func__, ##_a) #define PAGING_DEBUG(flag, _f, _a...) 
\ do { \ if (PAGING_DEBUG_ ## flag) \ debugtrace_printk("pgdebug: %s(): " _f, __func__, ##_a); \ } while (0) /* Per-CPU variable for enforcing the lock ordering */ DEFINE_PER_CPU(int, mm_lock_level); /* Override macros from asm/page.h to make them work with mfn_t */ #undef mfn_to_page #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m)) #undef mfn_valid #define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) #undef page_to_mfn #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) /************************************************/ /* LOG DIRTY SUPPORT */ /************************************************/ static mfn_t paging_new_log_dirty_page(struct domain *d) { struct page_info *page; page = d->arch.paging.alloc_page(d); if ( unlikely(page == NULL) ) { d->arch.paging.log_dirty.failed_allocs++; return _mfn(INVALID_MFN); } d->arch.paging.log_dirty.allocs++; return page_to_mfn(page); } /* Alloc and init a new leaf node */ static mfn_t paging_new_log_dirty_leaf(struct domain *d) { mfn_t mfn = paging_new_log_dirty_page(d); if ( mfn_valid(mfn) ) { void *leaf = map_domain_page(mfn_x(mfn)); clear_page(leaf); unmap_domain_page(leaf); } return mfn; } /* Alloc and init a new non-leaf node */ static mfn_t paging_new_log_dirty_node(struct domain *d) { mfn_t mfn = paging_new_log_dirty_page(d); if ( mfn_valid(mfn) ) { int i; mfn_t *node = map_domain_page(mfn_x(mfn)); for ( i = 0; i < LOGDIRTY_NODE_ENTRIES; i++ ) node[i] = _mfn(INVALID_MFN); unmap_domain_page(node); } return mfn; } /* get the top of the log-dirty bitmap trie */ static mfn_t *paging_map_log_dirty_bitmap(struct domain *d) { if ( likely(mfn_valid(d->arch.paging.log_dirty.top)) ) return map_domain_page(mfn_x(d->arch.paging.log_dirty.top)); return NULL; } static void paging_free_log_dirty_page(struct domain *d, mfn_t mfn) { d->arch.paging.log_dirty.allocs--; d->arch.paging.free_page(d, mfn_to_page(mfn)); } void paging_free_log_dirty_bitmap(struct domain *d) { mfn_t *l4, *l3, *l2; int i4, i3, i2; if ( !mfn_valid(d->arch.paging.log_dirty.top) ) return; paging_lock(d); l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top)); for ( i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++ ) { if ( !mfn_valid(l4[i4]) ) continue; l3 = map_domain_page(mfn_x(l4[i4])); for ( i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++ ) { if ( !mfn_valid(l3[i3]) ) continue; l2 = map_domain_page(mfn_x(l3[i3])); for ( i2 = 0; i2 < LOGDIRTY_NODE_ENTRIES; i2++ ) if ( mfn_valid(l2[i2]) ) paging_free_log_dirty_page(d, l2[i2]); unmap_domain_page(l2); paging_free_log_dirty_page(d, l3[i3]); } unmap_domain_page(l3); paging_free_log_dirty_page(d, l4[i4]); } unmap_domain_page(l4); paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top); d->arch.paging.log_dirty.top = _mfn(INVALID_MFN); ASSERT(d->arch.paging.log_dirty.allocs == 0); d->arch.paging.log_dirty.failed_allocs = 0; paging_unlock(d); } int paging_log_dirty_enable(struct domain *d, bool_t log_global) { int ret; if ( paging_mode_log_dirty(d) ) return -EINVAL; domain_pause(d); ret = d->arch.paging.log_dirty.enable_log_dirty(d, log_global); domain_unpause(d); return ret; } int paging_log_dirty_disable(struct domain *d) { int ret; domain_pause(d); /* Safe because the domain is paused. 
*/ ret = d->arch.paging.log_dirty.disable_log_dirty(d); if ( !paging_mode_log_dirty(d) ) paging_free_log_dirty_bitmap(d); domain_unpause(d); return ret; } /* Mark a page as dirty */ void paging_mark_dirty(struct domain *d, unsigned long guest_mfn) { unsigned long pfn; mfn_t gmfn; int changed; mfn_t mfn, *l4, *l3, *l2; unsigned long *l1; int i1, i2, i3, i4; gmfn = _mfn(guest_mfn); if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) || page_get_owner(mfn_to_page(gmfn)) != d ) return; /* We /really/ mean PFN here, even for non-translated guests. */ pfn = get_gpfn_from_mfn(mfn_x(gmfn)); /* Shared MFNs should NEVER be marked dirty */ BUG_ON(SHARED_M2P(pfn)); /* * Values with the MSB set denote MFNs that aren't really part of the * domain's pseudo-physical memory map (e.g., the shared info frame). * Nothing to do here... */ if ( unlikely(!VALID_M2P(pfn)) ) return; i1 = L1_LOGDIRTY_IDX(pfn); i2 = L2_LOGDIRTY_IDX(pfn); i3 = L3_LOGDIRTY_IDX(pfn); i4 = L4_LOGDIRTY_IDX(pfn); /* Recursive: this is called from inside the shadow code */ paging_lock_recursive(d); if ( unlikely(!mfn_valid(d->arch.paging.log_dirty.top)) ) { d->arch.paging.log_dirty.top = paging_new_log_dirty_node(d); if ( unlikely(!mfn_valid(d->arch.paging.log_dirty.top)) ) goto out; } l4 = paging_map_log_dirty_bitmap(d); mfn = l4[i4]; if ( !mfn_valid(mfn) ) l4[i4] = mfn = paging_new_log_dirty_node(d); unmap_domain_page(l4); if ( !mfn_valid(mfn) ) goto out; l3 = map_domain_page(mfn_x(mfn)); mfn = l3[i3]; if ( !mfn_valid(mfn) ) l3[i3] = mfn = paging_new_log_dirty_node(d); unmap_domain_page(l3); if ( !mfn_valid(mfn) ) goto out; l2 = map_domain_page(mfn_x(mfn)); mfn = l2[i2]; if ( !mfn_valid(mfn) ) l2[i2] = mfn = paging_new_log_dirty_leaf(d); unmap_domain_page(l2); if ( !mfn_valid(mfn) ) goto out; l1 = map_domain_page(mfn_x(mfn)); changed = !__test_and_set_bit(i1, l1); unmap_domain_page(l1); if ( changed ) { PAGING_DEBUG(LOGDIRTY, "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n", mfn_x(gmfn), pfn, d->domain_id); d->arch.paging.log_dirty.dirty_count++; } out: /* We've already recorded any failed allocations */ paging_unlock(d); return; } /* Is this guest page dirty? */ int paging_mfn_is_dirty(struct domain *d, mfn_t gmfn) { unsigned long pfn; mfn_t mfn, *l4, *l3, *l2; unsigned long *l1; int rv; ASSERT(paging_locked_by_me(d)); ASSERT(paging_mode_log_dirty(d)); /* We /really/ mean PFN here, even for non-translated guests. */ pfn = get_gpfn_from_mfn(mfn_x(gmfn)); /* Shared pages are always read-only; invalid pages can't be dirty. */ if ( unlikely(SHARED_M2P(pfn) || !VALID_M2P(pfn)) ) return 0; mfn = d->arch.paging.log_dirty.top; if ( !mfn_valid(mfn) ) return 0; l4 = map_domain_page(mfn_x(mfn)); mfn = l4[L4_LOGDIRTY_IDX(pfn)]; unmap_domain_page(l4); if ( !mfn_valid(mfn) ) return 0; l3 = map_domain_page(mfn_x(mfn)); mfn = l3[L3_LOGDIRTY_IDX(pfn)]; unmap_domain_page(l3); if ( !mfn_valid(mfn) ) return 0; l2 = map_domain_page(mfn_x(mfn)); mfn = l2[L2_LOGDIRTY_IDX(pfn)]; unmap_domain_page(l2); if ( !mfn_valid(mfn) ) return 0; l1 = map_domain_page(mfn_x(mfn)); rv = test_bit(L1_LOGDIRTY_IDX(pfn), l1); unmap_domain_page(l1); return rv; } /* Read a domain's log-dirty bitmap and stats. If the operation is a CLEAN, * clear the bitmap and stats as well. 
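 */

/*
 * Illustrative sketch, not part of the original file: the copy loop in the
 * function below hands the bitmap back one leaf page at a time, clamping the
 * final chunk so no more than the requested number of page-bits is
 * transferred.  The chunking arithmetic in isolation (hypothetical helper):
 */
#if 0 /* reference sketch, not built */
#define SKETCH_LEAF_BYTES 4096ul                 /* one leaf page of the trie */

static unsigned long next_chunk_bytes(unsigned long requested_pages,
                                      unsigned long pages_done)
{
    unsigned long remaining_bytes = (requested_pages - pages_done + 7) >> 3;

    /* The caller then advances pages_done by (bytes << 3). */
    return remaining_bytes < SKETCH_LEAF_BYTES ? remaining_bytes
                                               : SKETCH_LEAF_BYTES;
}
#endif

/*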
*/ int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc) { int rv = 0, clean = 0, peek = 1; unsigned long pages = 0; mfn_t *l4 = NULL, *l3 = NULL, *l2 = NULL; unsigned long *l1 = NULL; int i4, i3, i2; domain_pause(d); paging_lock(d); clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN); PAGING_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", (clean) ? "clean" : "peek", d->domain_id, d->arch.paging.log_dirty.fault_count, d->arch.paging.log_dirty.dirty_count); sc->stats.fault_count = d->arch.paging.log_dirty.fault_count; sc->stats.dirty_count = d->arch.paging.log_dirty.dirty_count; if ( clean ) { d->arch.paging.log_dirty.fault_count = 0; d->arch.paging.log_dirty.dirty_count = 0; } if ( guest_handle_is_null(sc->dirty_bitmap) ) /* caller may have wanted just to clean the state or access stats. */ peek = 0; if ( unlikely(d->arch.paging.log_dirty.failed_allocs) ) { printk("%s: %d failed page allocs while logging dirty pages\n", __FUNCTION__, d->arch.paging.log_dirty.failed_allocs); rv = -ENOMEM; goto out; } pages = 0; l4 = paging_map_log_dirty_bitmap(d); for ( i4 = 0; (pages < sc->pages) && (i4 < LOGDIRTY_NODE_ENTRIES); i4++ ) { l3 = (l4 && mfn_valid(l4[i4])) ? map_domain_page(mfn_x(l4[i4])) : NULL; for ( i3 = 0; (pages < sc->pages) && (i3 < LOGDIRTY_NODE_ENTRIES); i3++ ) { l2 = ((l3 && mfn_valid(l3[i3])) ? map_domain_page(mfn_x(l3[i3])) : NULL); for ( i2 = 0; (pages < sc->pages) && (i2 < LOGDIRTY_NODE_ENTRIES); i2++ ) { unsigned int bytes = PAGE_SIZE; l1 = ((l2 && mfn_valid(l2[i2])) ? map_domain_page(mfn_x(l2[i2])) : NULL); if ( unlikely(((sc->pages - pages + 7) >> 3) < bytes) ) bytes = (unsigned int)((sc->pages - pages + 7) >> 3); if ( likely(peek) ) { if ( (l1 ? copy_to_guest_offset(sc->dirty_bitmap, pages >> 3, (uint8_t *)l1, bytes) : clear_guest_offset(sc->dirty_bitmap, pages >> 3, bytes)) != 0 ) { rv = -EFAULT; goto out; } } pages += bytes << 3; if ( l1 ) { if ( clean ) clear_page(l1); unmap_domain_page(l1); } } if ( l2 ) unmap_domain_page(l2); } if ( l3 ) unmap_domain_page(l3); } if ( l4 ) unmap_domain_page(l4); if ( pages < sc->pages ) sc->pages = pages; paging_unlock(d); if ( clean ) { /* We need to further call clean_dirty_bitmap() functions of specific * paging modes (shadow or hap). Safe because the domain is paused. */ d->arch.paging.log_dirty.clean_dirty_bitmap(d); } domain_unpause(d); return rv; out: paging_unlock(d); domain_unpause(d); if ( l1 ) unmap_domain_page(l1); if ( l2 ) unmap_domain_page(l2); if ( l3 ) unmap_domain_page(l3); if ( l4 ) unmap_domain_page(l4); return rv; } void paging_log_dirty_range(struct domain *d, unsigned long begin_pfn, unsigned long nr, uint8_t *dirty_bitmap) { struct p2m_domain *p2m = p2m_get_hostp2m(d); int i; unsigned long pfn; /* * Set l1e entries of P2M table to be read-only. * * On first write, it page faults, its entry is changed to read-write, * and on retry the write succeeds. * * We populate dirty_bitmap by looking for entries that have been * switched to read-write. */ p2m_lock(p2m); for ( i = 0, pfn = begin_pfn; pfn < begin_pfn + nr; i++, pfn++ ) { p2m_type_t pt; pt = p2m_change_type(d, pfn, p2m_ram_rw, p2m_ram_logdirty); if ( pt == p2m_ram_rw ) dirty_bitmap[i >> 3] |= (1 << (i & 7)); } p2m_unlock(p2m); flush_tlb_mask(d->domain_dirty_cpumask); } /* Note that this function takes three function pointers. Callers must supply * these functions for log dirty code to call. This function usually is * invoked when paging is enabled. Check shadow_enable() and hap_enable() for * reference. 
 *
 * These function pointers must not be followed with the log-dirty lock held.
 */
void paging_log_dirty_init(struct domain *d,
                           int  (*enable_log_dirty)(struct domain *d,
                                                    bool_t log_global),
                           int  (*disable_log_dirty)(struct domain *d),
                           void (*clean_dirty_bitmap)(struct domain *d))
{
    d->arch.paging.log_dirty.enable_log_dirty = enable_log_dirty;
    d->arch.paging.log_dirty.disable_log_dirty = disable_log_dirty;
    d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
}

/* This function frees log dirty bitmap resources. */
static void paging_log_dirty_teardown(struct domain *d)
{
    paging_free_log_dirty_bitmap(d);
}

/************************************************/
/*         CODE FOR PAGING SUPPORT              */
/************************************************/
/* Domain paging struct initialization. */
int paging_domain_init(struct domain *d, unsigned int domcr_flags)
{
    int rc;

    if ( (rc = p2m_init(d)) != 0 )
        return rc;

    mm_lock_init(&d->arch.paging.lock);

    /* This must be initialized separately from the rest of the
     * log-dirty init code as that can be called more than once and we
     * don't want to leak any active log-dirty bitmaps */
    d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);

    /* The order of the *_init calls below is important, as the later
     * ones may rewrite some common fields.  Shadow pagetables are the
     * default... */
    shadow_domain_init(d, domcr_flags);

    /* ... but we will use hardware assistance if it's available. */
    if ( hap_enabled(d) )
        hap_domain_init(d);

    return 0;
}

/* vcpu paging struct initialization goes here */
void paging_vcpu_init(struct vcpu *v)
{
    if ( hap_enabled(v->domain) )
        hap_vcpu_init(v);
    else
        shadow_vcpu_init(v);
}

int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
                  XEN_GUEST_HANDLE_PARAM(void) u_domctl)
{
    int rc;

    if ( unlikely(d == current->domain) )
    {
        gdprintk(XENLOG_INFO, "Tried to do a paging op on itself.\n");
        return -EINVAL;
    }

    if ( unlikely(d->is_dying) )
    {
        gdprintk(XENLOG_INFO, "Ignoring paging op on dying domain %u\n",
                 d->domain_id);
        return 0;
    }

    if ( unlikely(d->vcpu == NULL) || unlikely(d->vcpu[0] == NULL) )
    {
        gdprintk(XENLOG_DEBUG, "Paging op on a domain (%u) with no vcpus\n",
                 d->domain_id);
        return -EINVAL;
    }

    rc = xsm_shadow_control(XSM_HOOK, d, sc->op);
    if ( rc )
        return rc;

    /* Code to handle log-dirty. Note that some log dirty operations
     * piggy-back on shadow operations. For example, when
     * XEN_DOMCTL_SHADOW_OP_OFF is called, it first checks whether log dirty
     * mode is enabled. If it is, we disable log dirty and continue with the
     * shadow code. For this reason, we need to further dispatch the domctl
     * to the next-level paging code (shadow or hap).
     */
    switch ( sc->op )
    {

    case XEN_DOMCTL_SHADOW_OP_ENABLE:
        if ( !(sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY) )
            break;
        /* Else fall through... */
    case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
        if ( hap_enabled(d) )
            hap_logdirty_init(d);
        return paging_log_dirty_enable(d, 1);

    case XEN_DOMCTL_SHADOW_OP_OFF:
        if ( paging_mode_log_dirty(d) )
            if ( (rc = paging_log_dirty_disable(d)) != 0 )
                return rc;
        break;

    case XEN_DOMCTL_SHADOW_OP_CLEAN:
    case XEN_DOMCTL_SHADOW_OP_PEEK:
        return paging_log_dirty_op(d, sc);
    }

    /* Here, dispatch domctl to the appropriate paging code */
    if ( hap_enabled(d) )
        return hap_domctl(d, sc, u_domctl);
    else
        return shadow_domctl(d, sc, u_domctl);
}

/* Call when destroying a domain */
void paging_teardown(struct domain *d)
{
    if ( hap_enabled(d) )
        hap_teardown(d);
    else
        shadow_teardown(d);

    /* clean up log dirty resources. 
*/ paging_log_dirty_teardown(d); /* Move populate-on-demand cache back to domain_list for destruction */ p2m_pod_empty_cache(d); } /* Call once all of the references to the domain have gone away */ void paging_final_teardown(struct domain *d) { if ( hap_enabled(d) ) hap_final_teardown(d); else shadow_final_teardown(d); p2m_final_teardown(d); } /* Enable an arbitrary paging-assistance mode. Call once at domain * creation. */ int paging_enable(struct domain *d, u32 mode) { if ( hap_enabled(d) ) return hap_enable(d, mode | PG_HAP_enable); else return shadow_enable(d, mode | PG_SH_enable); } /* Called from the guest to indicate that a process is being torn down * and therefore its pagetables will soon be discarded */ void pagetable_dying(struct domain *d, paddr_t gpa) { struct vcpu *v; ASSERT(paging_mode_shadow(d)); v = d->vcpu[0]; v->arch.paging.mode->shadow.pagetable_dying(v, gpa); } /* Print paging-assistance info to the console */ void paging_dump_domain_info(struct domain *d) { if ( paging_mode_enabled(d) ) { printk(" paging assistance: "); if ( paging_mode_shadow(d) ) printk("shadow "); if ( paging_mode_hap(d) ) printk("hap "); if ( paging_mode_refcounts(d) ) printk("refcounts "); if ( paging_mode_log_dirty(d) ) printk("log_dirty "); if ( paging_mode_translate(d) ) printk("translate "); if ( paging_mode_external(d) ) printk("external "); printk("\n"); } } void paging_dump_vcpu_info(struct vcpu *v) { if ( paging_mode_enabled(v->domain) ) { printk(" paging assistance: "); if ( paging_mode_shadow(v->domain) ) { if ( paging_get_hostmode(v) ) printk("shadowed %u-on-%u\n", paging_get_hostmode(v)->guest_levels, paging_get_hostmode(v)->shadow.shadow_levels); else printk("not shadowed\n"); } else if ( paging_mode_hap(v->domain) && paging_get_hostmode(v) ) printk("hap, %u levels\n", paging_get_hostmode(v)->guest_levels); else printk("none\n"); } } const struct paging_mode *paging_get_mode(struct vcpu *v) { if (!nestedhvm_is_n2(v)) return paging_get_hostmode(v); return paging_get_nestedmode(v); } void paging_update_nestedmode(struct vcpu *v) { ASSERT(nestedhvm_enabled(v->domain)); if (nestedhvm_paging_mode_hap(v)) /* nested-on-nested */ v->arch.paging.nestedmode = hap_paging_get_mode(v); else /* TODO: shadow-on-shadow */ v->arch.paging.nestedmode = NULL; hvm_asid_flush_vcpu(v); } void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level) { struct domain *d = p2m->domain; struct vcpu *v = current; if ( v->domain != d ) v = d->vcpu ? d->vcpu[0] : NULL; if ( likely(v && paging_mode_enabled(d) && paging_get_hostmode(v) != NULL) ) { return paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn, new, level); } else safe_write_pte(p, new); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/shadow/0000775000175000017500000000000012307313555014475 5ustar smbsmbxen-4.4.0/xen/arch/x86/mm/shadow/Makefile0000664000175000017500000000021312307313555016131 0ustar smbsmbobj-$(x86_64) += common.o guest_2.o guest_3.o guest_4.o guest_%.o: multi.c Makefile $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@ xen-4.4.0/xen/arch/x86/mm/shadow/common.c0000664000175000017500000037456712307313555016157 0ustar smbsmb/****************************************************************************** * arch/x86/mm/shadow/common.c * * Shadow code that does not need to be multiply compiled. * Parts of this code are Copyright (c) 2006 by XenSource Inc. 
* Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "private.h" DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags); /* Set up the shadow-specific parts of a domain struct at start of day. * Called for every domain from arch_domain_create() */ void shadow_domain_init(struct domain *d, unsigned int domcr_flags) { INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.freelist); INIT_PAGE_LIST_HEAD(&d->arch.paging.shadow.pinned_shadows); /* Use shadow pagetables for log-dirty support */ paging_log_dirty_init(d, shadow_enable_log_dirty, shadow_disable_log_dirty, shadow_clean_dirty_bitmap); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) d->arch.paging.shadow.oos_active = 0; d->arch.paging.shadow.oos_off = (domcr_flags & DOMCRF_oos_off) ? 1 : 0; #endif d->arch.paging.shadow.pagetable_dying_op = 0; } /* Setup the shadow-specfic parts of a vcpu struct. Note: The most important * job is to initialize the update_paging_modes() function pointer, which is * used to initialized the rest of resources. Therefore, it really does not * matter to have v->arch.paging.mode pointing to any mode, as long as it can * be compiled. 
*/ void shadow_vcpu_init(struct vcpu *v) { #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) int i, j; for ( i = 0; i < SHADOW_OOS_PAGES; i++ ) { v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN); v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN); for ( j = 0; j < SHADOW_OOS_FIXUPS; j++ ) v->arch.paging.shadow.oos_fixup[i].smfn[j] = _mfn(INVALID_MFN); } #endif v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3); } #if SHADOW_AUDIT int shadow_audit_enable = 0; static void shadow_audit_key(unsigned char key) { shadow_audit_enable = !shadow_audit_enable; printk("%s shadow_audit_enable=%d\n", __func__, shadow_audit_enable); } static struct keyhandler shadow_audit_keyhandler = { .u.fn = shadow_audit_key, .desc = "toggle shadow audits" }; static int __init shadow_audit_key_init(void) { register_keyhandler('O', &shadow_audit_keyhandler); return 0; } __initcall(shadow_audit_key_init); #endif /* SHADOW_AUDIT */ /**************************************************************************/ /* x86 emulator support for the shadow code */ struct segment_register *hvm_get_seg_reg( enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt) { struct segment_register *seg_reg = &sh_ctxt->seg_reg[seg]; if ( !__test_and_set_bit(seg, &sh_ctxt->valid_seg_regs) ) hvm_get_segment_register(current, seg, seg_reg); return seg_reg; } static int hvm_translate_linear_addr( enum x86_segment seg, unsigned long offset, unsigned int bytes, enum hvm_access_type access_type, struct sh_emulate_ctxt *sh_ctxt, unsigned long *paddr) { struct segment_register *reg = hvm_get_seg_reg(seg, sh_ctxt); int okay; okay = hvm_virtual_to_linear_addr( seg, reg, offset, bytes, access_type, sh_ctxt->ctxt.addr_size, paddr); if ( !okay ) { hvm_inject_hw_exception(TRAP_gp_fault, 0); return X86EMUL_EXCEPTION; } return 0; } static int hvm_read(enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, enum hvm_access_type access_type, struct sh_emulate_ctxt *sh_ctxt) { unsigned long addr; int rc; rc = hvm_translate_linear_addr( seg, offset, bytes, access_type, sh_ctxt, &addr); if ( rc ) return rc; if ( access_type == hvm_access_insn_fetch ) rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0); else rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0); switch ( rc ) { case HVMCOPY_okay: return X86EMUL_OKAY; case HVMCOPY_bad_gva_to_gfn: return X86EMUL_EXCEPTION; case HVMCOPY_bad_gfn_to_mfn: case HVMCOPY_unhandleable: return X86EMUL_UNHANDLEABLE; case HVMCOPY_gfn_paged_out: case HVMCOPY_gfn_shared: return X86EMUL_RETRY; } BUG(); return X86EMUL_UNHANDLEABLE; } static int hvm_emulate_read(enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; return hvm_read(seg, offset, p_data, bytes, hvm_access_read, container_of(ctxt, struct sh_emulate_ctxt, ctxt)); } static int hvm_emulate_insn_fetch(enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { struct sh_emulate_ctxt *sh_ctxt = container_of(ctxt, struct sh_emulate_ctxt, ctxt); unsigned int insn_off = offset - sh_ctxt->insn_buf_eip; ASSERT(seg == x86_seg_cs); /* Fall back if requested bytes are not in the prefetch cache. */ if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) ) return hvm_read(seg, offset, p_data, bytes, hvm_access_insn_fetch, sh_ctxt); /* Hit the cache. Simple memcpy. 
*/ memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes); return X86EMUL_OKAY; } static int hvm_emulate_write(enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { struct sh_emulate_ctxt *sh_ctxt = container_of(ctxt, struct sh_emulate_ctxt, ctxt); struct vcpu *v = current; unsigned long addr; int rc; if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; /* How many emulations could we save if we unshadowed on stack writes? */ if ( seg == x86_seg_ss ) perfc_incr(shadow_fault_emulate_stack); rc = hvm_translate_linear_addr( seg, offset, bytes, hvm_access_write, sh_ctxt, &addr); if ( rc ) return rc; return v->arch.paging.mode->shadow.x86_emulate_write( v, addr, p_data, bytes, sh_ctxt); } static int hvm_emulate_cmpxchg(enum x86_segment seg, unsigned long offset, void *p_old, void *p_new, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { struct sh_emulate_ctxt *sh_ctxt = container_of(ctxt, struct sh_emulate_ctxt, ctxt); struct vcpu *v = current; unsigned long addr, old[2], new[2]; int rc; if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; rc = hvm_translate_linear_addr( seg, offset, bytes, hvm_access_write, sh_ctxt, &addr); if ( rc ) return rc; old[0] = new[0] = 0; memcpy(old, p_old, bytes); memcpy(new, p_new, bytes); if ( bytes <= sizeof(long) ) return v->arch.paging.mode->shadow.x86_emulate_cmpxchg( v, addr, old[0], new[0], bytes, sh_ctxt); return X86EMUL_UNHANDLEABLE; } static const struct x86_emulate_ops hvm_shadow_emulator_ops = { .read = hvm_emulate_read, .insn_fetch = hvm_emulate_insn_fetch, .write = hvm_emulate_write, .cmpxchg = hvm_emulate_cmpxchg, }; static int pv_emulate_read(enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { unsigned int rc; if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 ) { propagate_page_fault(offset + bytes - rc, 0); /* read fault */ return X86EMUL_EXCEPTION; } return X86EMUL_OKAY; } static int pv_emulate_write(enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { struct sh_emulate_ctxt *sh_ctxt = container_of(ctxt, struct sh_emulate_ctxt, ctxt); struct vcpu *v = current; if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; return v->arch.paging.mode->shadow.x86_emulate_write( v, offset, p_data, bytes, sh_ctxt); } static int pv_emulate_cmpxchg(enum x86_segment seg, unsigned long offset, void *p_old, void *p_new, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { struct sh_emulate_ctxt *sh_ctxt = container_of(ctxt, struct sh_emulate_ctxt, ctxt); unsigned long old[2], new[2]; struct vcpu *v = current; if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; old[0] = new[0] = 0; memcpy(old, p_old, bytes); memcpy(new, p_new, bytes); if ( bytes <= sizeof(long) ) return v->arch.paging.mode->shadow.x86_emulate_cmpxchg( v, offset, old[0], new[0], bytes, sh_ctxt); return X86EMUL_UNHANDLEABLE; } static const struct x86_emulate_ops pv_shadow_emulator_ops = { .read = pv_emulate_read, .insn_fetch = pv_emulate_read, .write = pv_emulate_write, .cmpxchg = pv_emulate_cmpxchg, }; const struct x86_emulate_ops *shadow_init_emulation( struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs) { struct segment_register *creg, *sreg; struct vcpu *v = current; unsigned long addr; sh_ctxt->ctxt.regs = regs; sh_ctxt->ctxt.force_writeback = 0; if ( is_pv_vcpu(v) ) { sh_ctxt->ctxt.addr_size = 
sh_ctxt->ctxt.sp_size = BITS_PER_LONG; return &pv_shadow_emulator_ops; } /* Segment cache initialisation. Primed with CS. */ sh_ctxt->valid_seg_regs = 0; creg = hvm_get_seg_reg(x86_seg_cs, sh_ctxt); /* Work out the emulation mode. */ if ( hvm_long_mode_enabled(v) && creg->attr.fields.l ) { sh_ctxt->ctxt.addr_size = sh_ctxt->ctxt.sp_size = 64; } else { sreg = hvm_get_seg_reg(x86_seg_ss, sh_ctxt); sh_ctxt->ctxt.addr_size = creg->attr.fields.db ? 32 : 16; sh_ctxt->ctxt.sp_size = sreg->attr.fields.db ? 32 : 16; } /* Attempt to prefetch whole instruction. */ sh_ctxt->insn_buf_eip = regs->eip; sh_ctxt->insn_buf_bytes = (!hvm_translate_linear_addr( x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf), hvm_access_insn_fetch, sh_ctxt, &addr) && !hvm_fetch_from_guest_virt_nofault( sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0)) ? sizeof(sh_ctxt->insn_buf) : 0; return &hvm_shadow_emulator_ops; } /* Update an initialized emulation context to prepare for the next * instruction */ void shadow_continue_emulation(struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs) { struct vcpu *v = current; unsigned long addr, diff; /* We don't refetch the segment bases, because we don't emulate * writes to segment registers */ if ( is_hvm_vcpu(v) ) { diff = regs->eip - sh_ctxt->insn_buf_eip; if ( diff > sh_ctxt->insn_buf_bytes ) { /* Prefetch more bytes. */ sh_ctxt->insn_buf_bytes = (!hvm_translate_linear_addr( x86_seg_cs, regs->eip, sizeof(sh_ctxt->insn_buf), hvm_access_insn_fetch, sh_ctxt, &addr) && !hvm_fetch_from_guest_virt_nofault( sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), 0)) ? sizeof(sh_ctxt->insn_buf) : 0; sh_ctxt->insn_buf_eip = regs->eip; } } } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /**************************************************************************/ /* Out-of-sync shadows. */ /* From time to time, we let a shadowed pagetable page go out of sync * with its shadow: the guest is allowed to write directly to the page, * and those writes are not synchronously reflected in the shadow. * This lets us avoid many emulations if the guest is writing a lot to a * pagetable, but it relaxes a pretty important invariant in the shadow * pagetable design. Therefore, some rules: * * 1. Only L1 pagetables may go out of sync: any page that is shadowed * at at higher level must be synchronously updated. This makes * using linear shadow pagetables much less dangerous. * That means that: (a) unsyncing code needs to check for higher-level * shadows, and (b) promotion code needs to resync. * * 2. All shadow operations on a guest page require the page to be brought * back into sync before proceeding. This must be done under the * paging lock so that the page is guaranteed to remain synced until * the operation completes. * * Exceptions to this rule: the pagefault and invlpg handlers may * update only one entry on an out-of-sync page without resyncing it. * * 3. Operations on shadows that do not start from a guest page need to * be aware that they may be handling an out-of-sync shadow. * * 4. Operations that do not normally take the paging lock (fast-path * #PF handler, INVLPG) must fall back to a locking, syncing version * if they see an out-of-sync table. * * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG) * must explicitly resync all relevant pages or update their * shadows. * * Currently out-of-sync pages are listed in a simple open-addressed * hash table with a second chance (must resist temptation to radically * over-engineer hash tables...) 
The virtual address of the access * which caused us to unsync the page is also kept in the hash table, as * a hint for finding the writable mappings later. * * We keep a hash per vcpu, because we want as much as possible to do * the re-sync on the save vcpu we did the unsync on, so the VA hint * will be valid. */ #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL static void sh_oos_audit(struct domain *d) { int idx, expected_idx, expected_idx_alt; struct page_info *pg; struct vcpu *v; for_each_vcpu(d, v) { for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) { mfn_t *oos = v->arch.paging.shadow.oos; if ( !mfn_valid(oos[idx]) ) continue; expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES; expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES); if ( idx != expected_idx && idx != expected_idx_alt ) { printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n", __func__, idx, mfn_x(oos[idx]), expected_idx, expected_idx_alt); BUG(); } pg = mfn_to_page(oos[idx]); if ( !(pg->count_info & PGC_page_table) ) { printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n", __func__, idx, mfn_x(oos[idx]), pg->count_info); BUG(); } if ( !(pg->shadow_flags & SHF_out_of_sync) ) { printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n", __func__, idx, mfn_x(oos[idx]), pg->shadow_flags); BUG(); } if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) ) { printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n", __func__, idx, mfn_x(oos[idx]), pg->shadow_flags); BUG(); } } } } #endif #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn) { int idx; struct vcpu *v; mfn_t *oos; ASSERT(mfn_is_out_of_sync(gmfn)); for_each_vcpu(d, v) { oos = v->arch.paging.shadow.oos; idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) idx = (idx + 1) % SHADOW_OOS_PAGES; if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) return; } SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn)); BUG(); } #endif /* Update the shadow, but keep the page out of sync. */ static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn) { struct page_info *pg = mfn_to_page(gmfn); ASSERT(mfn_valid(gmfn)); ASSERT(page_is_out_of_sync(pg)); /* Call out to the appropriate per-mode resyncing function */ if ( pg->shadow_flags & SHF_L1_32 ) SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn); else if ( pg->shadow_flags & SHF_L1_PAE ) SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn); else if ( pg->shadow_flags & SHF_L1_64 ) SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn); } /* * Fixup arrays: We limit the maximum number of writable mappings to * SHADOW_OOS_FIXUPS and store enough information to remove them * quickly on resync. */ static inline int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn, struct oos_fixup *fixup) { int i; for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ ) { if ( mfn_x(fixup->smfn[i]) != INVALID_MFN ) { sh_remove_write_access_from_sl1p(v, gmfn, fixup->smfn[i], fixup->off[i]); fixup->smfn[i] = _mfn(INVALID_MFN); } } /* Always flush the TLBs. See comment on oos_fixup_add(). 
*/ return 1; } void oos_fixup_add(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off) { int idx, next; mfn_t *oos; struct oos_fixup *oos_fixup; struct domain *d = v->domain; perfc_incr(shadow_oos_fixup_add); for_each_vcpu(d, v) { oos = v->arch.paging.shadow.oos; oos_fixup = v->arch.paging.shadow.oos_fixup; idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) idx = (idx + 1) % SHADOW_OOS_PAGES; if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) { int i; for ( i = 0; i < SHADOW_OOS_FIXUPS; i++ ) { if ( mfn_valid(oos_fixup[idx].smfn[i]) && (mfn_x(oos_fixup[idx].smfn[i]) == mfn_x(smfn)) && (oos_fixup[idx].off[i] == off) ) return; } next = oos_fixup[idx].next; if ( mfn_x(oos_fixup[idx].smfn[next]) != INVALID_MFN ) { TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT); /* Reuse this slot and remove current writable mapping. */ sh_remove_write_access_from_sl1p(v, gmfn, oos_fixup[idx].smfn[next], oos_fixup[idx].off[next]); perfc_incr(shadow_oos_fixup_evict); /* We should flush the TLBs now, because we removed a writable mapping, but since the shadow is already OOS we have no problem if another vcpu write to this page table. We just have to be very careful to *always* flush the tlbs on resync. */ } oos_fixup[idx].smfn[next] = smfn; oos_fixup[idx].off[next] = off; oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS; TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD); return; } } SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); BUG(); } static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn, struct oos_fixup *fixup) { int ftlb = 0; ftlb |= oos_fixup_flush_gmfn(v, gmfn, fixup); switch ( sh_remove_write_access(v, gmfn, 0, 0) ) { default: case 0: break; case 1: ftlb |= 1; break; case -1: /* An unfindable writeable typecount has appeared, probably via a * grant table entry: can't shoot the mapping, so try to unshadow * the page. If that doesn't work either, the guest is granting * his pagetables and must be killed after all. * This will flush the tlb, so we can return with no worries. */ sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */); return 1; } if ( ftlb ) flush_tlb_mask(v->domain->domain_dirty_cpumask); return 0; } static inline void trace_resync(int event, mfn_t gmfn) { if ( tb_init_done ) { /* Convert gmfn to gfn */ unsigned long gfn = mfn_to_gfn(current->domain, gmfn); __trace_var(event, 0/*!tsc*/, sizeof(gfn), &gfn); } } /* Pull all the entries on an out-of-sync page back into sync. */ static void _sh_resync(struct vcpu *v, mfn_t gmfn, struct oos_fixup *fixup, mfn_t snp) { struct page_info *pg = mfn_to_page(gmfn); ASSERT(paging_locked_by_me(v->domain)); ASSERT(mfn_is_out_of_sync(gmfn)); /* Guest page must be shadowed *only* as L1 when out of sync. */ ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY)); ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn))); SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n", v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); /* Need to pull write access so the page *stays* in sync. */ if ( oos_remove_write_access(v, gmfn, fixup) ) { /* Page has been unshadowed. */ return; } /* No more writable mappings of this page, please */ pg->shadow_flags &= ~SHF_oos_may_write; /* Update the shadows with current guest entries. 
*/ _sh_resync_l1(v, gmfn, snp); /* Now we know all the entries are synced, and will stay that way */ pg->shadow_flags &= ~SHF_out_of_sync; perfc_incr(shadow_resync); trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn); } /* Add an MFN to the list of out-of-sync guest pagetables */ static void oos_hash_add(struct vcpu *v, mfn_t gmfn) { int i, idx, oidx, swap = 0; void *gptr, *gsnpptr; mfn_t *oos = v->arch.paging.shadow.oos; mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup; struct oos_fixup fixup = { .next = 0 }; for (i = 0; i < SHADOW_OOS_FIXUPS; i++ ) fixup.smfn[i] = _mfn(INVALID_MFN); idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; oidx = idx; if ( mfn_valid(oos[idx]) && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx ) { /* Punt the current occupant into the next slot */ SWAP(oos[idx], gmfn); SWAP(oos_fixup[idx], fixup); swap = 1; idx = (idx + 1) % SHADOW_OOS_PAGES; } if ( mfn_valid(oos[idx]) ) { /* Crush the current occupant. */ _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]); perfc_incr(shadow_unsync_evict); } oos[idx] = gmfn; oos_fixup[idx] = fixup; if ( swap ) SWAP(oos_snapshot[idx], oos_snapshot[oidx]); gptr = sh_map_domain_page(oos[oidx]); gsnpptr = sh_map_domain_page(oos_snapshot[oidx]); memcpy(gsnpptr, gptr, PAGE_SIZE); sh_unmap_domain_page(gptr); sh_unmap_domain_page(gsnpptr); } /* Remove an MFN from the list of out-of-sync guest pagetables */ static void oos_hash_remove(struct vcpu *v, mfn_t gmfn) { int idx; mfn_t *oos; struct domain *d = v->domain; SHADOW_PRINTK("D%dV%d gmfn %lx\n", v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); for_each_vcpu(d, v) { oos = v->arch.paging.shadow.oos; idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) idx = (idx + 1) % SHADOW_OOS_PAGES; if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) { oos[idx] = _mfn(INVALID_MFN); return; } } SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); BUG(); } mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn) { int idx; mfn_t *oos; mfn_t *oos_snapshot; struct domain *d = v->domain; for_each_vcpu(d, v) { oos = v->arch.paging.shadow.oos; oos_snapshot = v->arch.paging.shadow.oos_snapshot; idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) idx = (idx + 1) % SHADOW_OOS_PAGES; if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) { return oos_snapshot[idx]; } } SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); BUG(); return _mfn(INVALID_MFN); } /* Pull a single guest page back into sync */ void sh_resync(struct vcpu *v, mfn_t gmfn) { int idx; mfn_t *oos; mfn_t *oos_snapshot; struct oos_fixup *oos_fixup; struct domain *d = v->domain; for_each_vcpu(d, v) { oos = v->arch.paging.shadow.oos; oos_fixup = v->arch.paging.shadow.oos_fixup; oos_snapshot = v->arch.paging.shadow.oos_snapshot; idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) idx = (idx + 1) % SHADOW_OOS_PAGES; if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) { _sh_resync(v, gmfn, &oos_fixup[idx], oos_snapshot[idx]); oos[idx] = _mfn(INVALID_MFN); return; } } SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); BUG(); } /* Figure out whether it's definitely safe not to sync this l1 table, * by making a call out to the mode in which that shadow was made. 
*/ static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn) { struct page_info *pg = mfn_to_page(gl1mfn); if ( pg->shadow_flags & SHF_L1_32 ) return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn); else if ( pg->shadow_flags & SHF_L1_PAE ) return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn); else if ( pg->shadow_flags & SHF_L1_64 ) return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn); SHADOW_ERROR("gmfn %#lx was OOS but not shadowed as an l1.\n", mfn_x(gl1mfn)); BUG(); return 0; /* BUG() is no longer __attribute__((noreturn)). */ } /* Pull all out-of-sync pages back into sync. Pages brought out of sync * on other vcpus are allowed to remain out of sync, but their contents * will be made safe (TLB flush semantics); pages unsynced by this vcpu * are brought back into sync and write-protected. If skip != 0, we try * to avoid resyncing at all if we think we can get away with it. */ void sh_resync_all(struct vcpu *v, int skip, int this, int others) { int idx; struct vcpu *other; mfn_t *oos = v->arch.paging.shadow.oos; mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; struct oos_fixup *oos_fixup = v->arch.paging.shadow.oos_fixup; SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id); ASSERT(paging_locked_by_me(v->domain)); if ( !this ) goto resync_others; /* First: resync all of this vcpu's oos pages */ for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) if ( mfn_valid(oos[idx]) ) { /* Write-protect and sync contents */ _sh_resync(v, oos[idx], &oos_fixup[idx], oos_snapshot[idx]); oos[idx] = _mfn(INVALID_MFN); } resync_others: if ( !others ) return; /* Second: make all *other* vcpus' oos pages safe. */ for_each_vcpu(v->domain, other) { if ( v == other ) continue; oos = other->arch.paging.shadow.oos; oos_fixup = other->arch.paging.shadow.oos_fixup; oos_snapshot = other->arch.paging.shadow.oos_snapshot; for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) { if ( !mfn_valid(oos[idx]) ) continue; if ( skip ) { /* Update the shadows and leave the page OOS. */ if ( sh_skip_sync(v, oos[idx]) ) continue; trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]); _sh_resync_l1(other, oos[idx], oos_snapshot[idx]); } else { /* Write-protect and sync contents */ _sh_resync(other, oos[idx], &oos_fixup[idx], oos_snapshot[idx]); oos[idx] = _mfn(INVALID_MFN); } } } } /* Allow a shadowed page to go out of sync. Unsyncs are traced in * multi.c:sh_page_fault() */ int sh_unsync(struct vcpu *v, mfn_t gmfn) { struct page_info *pg; ASSERT(paging_locked_by_me(v->domain)); SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n", v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); pg = mfn_to_page(gmfn); /* Guest page must be shadowed *only* as L1 and *only* once when out * of sync. Also, get out now if it's already out of sync. * Also, can't safely unsync if some vcpus have paging disabled.*/ if ( pg->shadow_flags & ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync) || sh_page_has_multiple_shadows(pg) || is_pv_domain(v->domain) || !v->domain->arch.paging.shadow.oos_active ) return 0; pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write; oos_hash_add(v, gmfn); perfc_incr(shadow_unsync); TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC); return 1; } #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ /**************************************************************************/ /* Code for "promoting" a guest page to the point where the shadow code is * willing to let it be treated as a guest page table. This generally * involves making sure there are no writable mappings available to the guest * for this page. 
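 *
 * As a rough sketch of the lifecycle implemented below (the constant used
 * here is only an example): the first shadow_promote() of a gmfn sets
 * _PGC_page_table in its count_info and records the shadow type as a bit
 * in page->shadow_flags; shadow_demote() clears that bit and drops
 * _PGC_page_table again once the last shadow type is gone.
 *
 *   shadow_promote(v, gmfn, SH_type_l1_32_shadow);   (first shadow made)
 *   ... the guest page is now treated as a pagetable ...
 *   shadow_demote(v, gmfn, SH_type_l1_32_shadow);    (last shadow freed)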
*/ void shadow_promote(struct vcpu *v, mfn_t gmfn, unsigned int type) { struct page_info *page = mfn_to_page(gmfn); ASSERT(mfn_valid(gmfn)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Is the page already shadowed and out of sync? */ if ( page_is_out_of_sync(page) ) sh_resync(v, gmfn); #endif /* We should never try to promote a gmfn that has writeable mappings */ ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page || (page->u.inuse.type_info & PGT_count_mask) == 0 || v->domain->is_shutting_down); /* Is the page already shadowed? */ if ( !test_and_set_bit(_PGC_page_table, &page->count_info) ) page->shadow_flags = 0; ASSERT(!test_bit(type, &page->shadow_flags)); set_bit(type, &page->shadow_flags); TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE); } void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type) { struct page_info *page = mfn_to_page(gmfn); ASSERT(test_bit(_PGC_page_table, &page->count_info)); ASSERT(test_bit(type, &page->shadow_flags)); clear_bit(type, &page->shadow_flags); if ( (page->shadow_flags & SHF_page_type_mask) == 0 ) { #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Was the page out of sync? */ if ( page_is_out_of_sync(page) ) { oos_hash_remove(v, gmfn); } #endif clear_bit(_PGC_page_table, &page->count_info); } TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE); } /**************************************************************************/ /* Validate a pagetable change from the guest and update the shadows. * Returns a bitmask of SHADOW_SET_* flags. */ int sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size) { int result = 0; struct page_info *page = mfn_to_page(gmfn); paging_mark_dirty(v->domain, mfn_x(gmfn)); // Determine which types of shadows are affected, and update each. // // Always validate L1s before L2s to prevent another cpu with a linear // mapping of this gmfn from seeing a walk that results from // using the new L2 value and the old L1 value. (It is OK for such a // guest to see a walk that uses the old L2 value with the new L1 value, // as hardware could behave this way if one level of the pagewalk occurs // before the store, and the next level of the pagewalk occurs after the // store. // // Ditto for L2s before L3s, etc. 
// if ( !(page->count_info & PGC_page_table) ) return 0; /* Not shadowed at all */ if ( page->shadow_flags & SHF_L1_32 ) result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2) (v, gmfn, entry, size); if ( page->shadow_flags & SHF_L2_32 ) result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2) (v, gmfn, entry, size); if ( page->shadow_flags & SHF_L1_PAE ) result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3) (v, gmfn, entry, size); if ( page->shadow_flags & SHF_L2_PAE ) result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3) (v, gmfn, entry, size); if ( page->shadow_flags & SHF_L2H_PAE ) result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3) (v, gmfn, entry, size); if ( page->shadow_flags & SHF_L1_64 ) result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4) (v, gmfn, entry, size); if ( page->shadow_flags & SHF_L2_64 ) result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4) (v, gmfn, entry, size); if ( page->shadow_flags & SHF_L2H_64 ) result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 4) (v, gmfn, entry, size); if ( page->shadow_flags & SHF_L3_64 ) result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4) (v, gmfn, entry, size); if ( page->shadow_flags & SHF_L4_64 ) result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4) (v, gmfn, entry, size); this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED)); return result; } void sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, void *entry, u32 size) /* This is the entry point for emulated writes to pagetables in HVM guests and * PV translated guests. */ { struct domain *d = v->domain; int rc; ASSERT(paging_locked_by_me(v->domain)); rc = sh_validate_guest_entry(v, gmfn, entry, size); if ( rc & SHADOW_SET_FLUSH ) /* Need to flush TLBs to pick up shadow PT changes */ flush_tlb_mask(d->domain_dirty_cpumask); if ( rc & SHADOW_SET_ERROR ) { /* This page is probably not a pagetable any more: tear it out of the * shadows, along with any tables that reference it. * Since the validate call above will have made a "safe" (i.e. zero) * shadow entry, we can let the domain live even if we can't fully * unshadow the page. */ sh_remove_shadows(v, gmfn, 0, 0); } } int shadow_write_guest_entry(struct vcpu *v, intpte_t *p, intpte_t new, mfn_t gmfn) /* Write a new value into the guest pagetable, and update the shadows * appropriately. Returns 0 if we page-faulted, 1 for success. */ { int failed; paging_lock(v->domain); failed = __copy_to_user(p, &new, sizeof(new)); if ( failed != sizeof(new) ) sh_validate_guest_entry(v, gmfn, p, sizeof(new)); paging_unlock(v->domain); return (failed == 0); } int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p, intpte_t *old, intpte_t new, mfn_t gmfn) /* Cmpxchg a new value into the guest pagetable, and update the shadows * appropriately. Returns 0 if we page-faulted, 1 if not. * N.B. caller should check the value of "old" to see if the * cmpxchg itself was successful. */ { int failed; intpte_t t = *old; paging_lock(v->domain); failed = cmpxchg_user(p, t, new); if ( t == *old ) sh_validate_guest_entry(v, gmfn, p, sizeof(new)); *old = t; paging_unlock(v->domain); return (failed == 0); } /**************************************************************************/ /* Memory management for shadow pages. */ /* Allocating shadow pages * ----------------------- * * Most shadow pages are allocated singly, but there is one case where * we need to allocate multiple pages together: shadowing 32-bit guest * tables on PAE or 64-bit shadows. 
A 32-bit guest l1 table covers 4MB * of virtual address space, and needs to be shadowed by two PAE/64-bit * l1 tables (covering 2MB of virtual address space each). Similarly, a * 32-bit guest l2 table (4GB va) needs to be shadowed by four * PAE/64-bit l2 tables (1GB va each). These multi-page shadows are * not contiguous in memory; functions for handling offsets into them are * defined in shadow/multi.c (shadow_l1_index() etc.) * * This table shows the allocation behaviour of the different modes: * * Xen paging 64b 64b 64b * Guest paging 32b pae 64b * PV or HVM HVM HVM * * Shadow paging pae pae 64b * * sl1 size 8k 4k 4k * sl2 size 16k 4k 4k * sl3 size - - 4k * sl4 size - - 4k * * In HVM guests, the p2m table is built out of shadow pages, and we provide * a function for the p2m management to steal pages, in max-order chunks, from * the free pool. */ /* Figure out the least acceptable quantity of shadow memory. * The minimum memory requirement for always being able to free up a * chunk of memory is very small -- only three max-order chunks per * vcpu to hold the top level shadows and pages with Xen mappings in them. * * But for a guest to be guaranteed to successfully execute a single * instruction, we must be able to map a large number (about thirty) VAs * at the same time, which means that to guarantee progress, we must * allow for more than ninety allocated pages per vcpu. We round that * up to 128 pages, or half a megabyte per vcpu, and add 1 more vcpu's * worth to make sure we never return zero. */ static unsigned int shadow_min_acceptable_pages(struct domain *d) { u32 vcpu_count = 1; struct vcpu *v; for_each_vcpu(d, v) vcpu_count++; return (vcpu_count * 128); } /* Figure out the size (in pages) of a given shadow type */ static inline u32 shadow_size(unsigned int shadow_type) { static const u32 type_to_size[SH_type_unused] = { 1, /* SH_type_none */ 2, /* SH_type_l1_32_shadow */ 2, /* SH_type_fl1_32_shadow */ 4, /* SH_type_l2_32_shadow */ 1, /* SH_type_l1_pae_shadow */ 1, /* SH_type_fl1_pae_shadow */ 1, /* SH_type_l2_pae_shadow */ 1, /* SH_type_l2h_pae_shadow */ 1, /* SH_type_l1_64_shadow */ 1, /* SH_type_fl1_64_shadow */ 1, /* SH_type_l2_64_shadow */ 1, /* SH_type_l2h_64_shadow */ 1, /* SH_type_l3_64_shadow */ 1, /* SH_type_l4_64_shadow */ 1, /* SH_type_p2m_table */ 1, /* SH_type_monitor_table */ 1 /* SH_type_oos_snapshot */ }; ASSERT(shadow_type < SH_type_unused); return type_to_size[shadow_type]; } /* Dispatcher function: call the per-mode function that will unhook the * non-Xen mappings in this top-level shadow mfn. With user_only == 1, * unhooks only the user-mode mappings. 
*/ void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn, int user_only) { struct page_info *sp = mfn_to_page(smfn); switch ( sp->u.sh.type ) { case SH_type_l2_32_shadow: SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, 2)(v, smfn, user_only); break; case SH_type_l2_pae_shadow: case SH_type_l2h_pae_shadow: SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, 3)(v, smfn, user_only); break; case SH_type_l4_64_shadow: SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, 4)(v, smfn, user_only); break; default: SHADOW_ERROR("top-level shadow has bad type %08x\n", sp->u.sh.type); BUG(); } } static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn) { if ( tb_init_done ) { /* Convert smfn to gfn */ unsigned long gfn; ASSERT(mfn_valid(smfn)); gfn = mfn_to_gfn(d, backpointer(mfn_to_page(smfn))); __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/, sizeof(gfn), &gfn); } } /* Make sure there are at least count order-sized pages * available in the shadow page pool. */ static void _shadow_prealloc( struct domain *d, unsigned int pages) { /* Need a vpcu for calling unpins; for now, since we don't have * per-vcpu shadows, any will do */ struct vcpu *v, *v2; struct page_info *sp, *t; mfn_t smfn; int i; if ( d->arch.paging.shadow.free_pages >= pages ) return; v = current; if ( v->domain != d ) v = d->vcpu[0]; ASSERT(v != NULL); /* Shouldn't have enabled shadows if we've no vcpus */ /* Stage one: walk the list of pinned pages, unpinning them */ perfc_incr(shadow_prealloc_1); foreach_pinned_shadow(d, sp, t) { smfn = page_to_mfn(sp); /* Unpin this top-level shadow */ trace_shadow_prealloc_unpin(d, smfn); sh_unpin(v, smfn); /* See if that freed up enough space */ if ( d->arch.paging.shadow.free_pages >= pages ) return; } /* Stage two: all shadow pages are in use in hierarchies that are * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen * mappings. */ perfc_incr(shadow_prealloc_2); for_each_vcpu(d, v2) for ( i = 0 ; i < 4 ; i++ ) { if ( !pagetable_is_null(v2->arch.shadow_table[i]) ) { TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK); shadow_unhook_mappings(v, pagetable_get_mfn(v2->arch.shadow_table[i]), 0); /* See if that freed up enough space */ if ( d->arch.paging.shadow.free_pages >= pages ) { flush_tlb_mask(d->domain_dirty_cpumask); return; } } } /* Nothing more we can do: all remaining shadows are of pages that * hold Xen mappings for some vcpu. This can never happen. */ SHADOW_ERROR("Can't pre-allocate %u shadow pages!\n" " shadow pages total = %u, free = %u, p2m=%u\n", pages, d->arch.paging.shadow.total_pages, d->arch.paging.shadow.free_pages, d->arch.paging.shadow.p2m_pages); BUG(); } /* Make sure there are at least count pages of the order according to * type available in the shadow page pool. * This must be called before any calls to shadow_alloc(). Since this * will free existing shadows to make room, it must be called early enough * to avoid freeing shadows that the caller is currently working on. 
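 *
 * An illustrative pairing with shadow_alloc() -- a sketch only, with a
 * hypothetical gl1mfn, not code from this file:
 *
 *   shadow_prealloc(d, SH_type_l1_pae_shadow, 1);
 *   smfn = shadow_alloc(d, SH_type_l1_pae_shadow, mfn_x(gl1mfn));
 *
 * shadow_prealloc() frees other shadows as needed so that the following
 * shadow_alloc() of the same type cannot fail (shadow_alloc() asserts the
 * paging lock and BUGs if the pool is still too small).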
*/ void shadow_prealloc(struct domain *d, u32 type, unsigned int count) { return _shadow_prealloc(d, shadow_size(type) * count); } /* Deliberately free all the memory we can: this will tear down all of * this domain's shadows */ static void shadow_blow_tables(struct domain *d) { struct page_info *sp, *t; struct vcpu *v = d->vcpu[0]; mfn_t smfn; int i; ASSERT(v != NULL); /* Pass one: unpin all pinned pages */ foreach_pinned_shadow(d, sp, t) { smfn = page_to_mfn(sp); sh_unpin(v, smfn); } /* Second pass: unhook entries of in-use shadows */ for_each_vcpu(d, v) for ( i = 0 ; i < 4 ; i++ ) if ( !pagetable_is_null(v->arch.shadow_table[i]) ) shadow_unhook_mappings(v, pagetable_get_mfn(v->arch.shadow_table[i]), 0); /* Make sure everyone sees the unshadowings */ flush_tlb_mask(d->domain_dirty_cpumask); } void shadow_blow_tables_per_domain(struct domain *d) { if ( shadow_mode_enabled(d) && d->vcpu != NULL && d->vcpu[0] != NULL ) { paging_lock(d); shadow_blow_tables(d); paging_unlock(d); } } #ifndef NDEBUG /* Blow all shadows of all shadowed domains: this can be used to cause the * guest's pagetables to be re-shadowed if we suspect that the shadows * have somehow got out of sync */ static void shadow_blow_all_tables(unsigned char c) { struct domain *d; printk("'%c' pressed -> blowing all shadow tables\n", c); rcu_read_lock(&domlist_read_lock); for_each_domain(d) { if ( shadow_mode_enabled(d) && d->vcpu != NULL && d->vcpu[0] != NULL ) { paging_lock(d); shadow_blow_tables(d); paging_unlock(d); } } rcu_read_unlock(&domlist_read_lock); } static struct keyhandler shadow_blow_all_tables_keyhandler = { .u.fn = shadow_blow_all_tables, .desc = "reset shadow pagetables" }; /* Register this function in the Xen console keypress table */ static __init int shadow_blow_tables_keyhandler_init(void) { register_keyhandler('S', &shadow_blow_all_tables_keyhandler); return 0; } __initcall(shadow_blow_tables_keyhandler_init); #endif /* !NDEBUG */ /* Accessors for the singly-linked list that's used for hash chains */ static inline struct page_info * next_shadow(const struct page_info *sp) { return sp->next_shadow ? pdx_to_page(sp->next_shadow) : NULL; } static inline void set_next_shadow(struct page_info *sp, struct page_info *next) { sp->next_shadow = next ? page_to_pdx(next) : 0; } /* Allocate another shadow's worth of (contiguous, aligned) pages, * and fill in the type and backpointer fields of their page_infos. * Never fails to allocate. */ mfn_t shadow_alloc(struct domain *d, u32 shadow_type, unsigned long backpointer) { struct page_info *sp = NULL; unsigned int pages = shadow_size(shadow_type); struct page_list_head tmp_list; cpumask_t mask; void *p; int i; ASSERT(paging_locked_by_me(d)); ASSERT(shadow_type != SH_type_none); perfc_incr(shadow_alloc); if ( d->arch.paging.shadow.free_pages < pages ) { /* If we get here, we failed to allocate. This should never * happen. It means that we didn't call shadow_prealloc() * correctly before we allocated. We can't recover by calling * prealloc here, because we might free up higher-level pages * that the caller is working on. 
*/ SHADOW_ERROR("Can't allocate %i shadow pages!\n", pages); BUG(); } d->arch.paging.shadow.free_pages -= pages; /* Backpointers that are MFNs need to be packed into PDXs (PFNs don't) */ switch (shadow_type) { case SH_type_fl1_32_shadow: case SH_type_fl1_pae_shadow: case SH_type_fl1_64_shadow: break; default: backpointer = pfn_to_pdx(backpointer); break; } /* Page lists don't have pointers back to the head structure, so * it's safe to use a head structure on the stack to link the pages * together. */ INIT_PAGE_LIST_HEAD(&tmp_list); /* Init page info fields and clear the pages */ for ( i = 0; i < pages ; i++ ) { sp = page_list_remove_head(&d->arch.paging.shadow.freelist); /* Before we overwrite the old contents of this page, * we need to be sure that no TLB holds a pointer to it. */ cpumask_copy(&mask, d->domain_dirty_cpumask); tlbflush_filter(mask, sp->tlbflush_timestamp); if ( unlikely(!cpumask_empty(&mask)) ) { perfc_incr(shadow_alloc_tlbflush); flush_tlb_mask(&mask); } /* Now safe to clear the page for reuse */ p = __map_domain_page(sp); ASSERT(p != NULL); clear_page(p); sh_unmap_domain_page(p); INIT_PAGE_LIST_ENTRY(&sp->list); page_list_add(sp, &tmp_list); sp->u.sh.type = shadow_type; sp->u.sh.pinned = 0; sp->u.sh.count = 0; sp->u.sh.head = 0; sp->v.sh.back = backpointer; set_next_shadow(sp, NULL); perfc_incr(shadow_alloc_count); } if ( shadow_type >= SH_type_min_shadow && shadow_type <= SH_type_max_shadow ) sp->u.sh.head = 1; return page_to_mfn(sp); } /* Return some shadow pages to the pool. */ void shadow_free(struct domain *d, mfn_t smfn) { struct page_info *next = NULL, *sp = mfn_to_page(smfn); unsigned int pages; u32 shadow_type; int i; ASSERT(paging_locked_by_me(d)); perfc_incr(shadow_free); shadow_type = sp->u.sh.type; ASSERT(shadow_type != SH_type_none); ASSERT(sp->u.sh.head || (shadow_type > SH_type_max_shadow)); pages = shadow_size(shadow_type); for ( i = 0; i < pages; i++ ) { #if SHADOW_OPTIMIZATIONS & (SHOPT_WRITABLE_HEURISTIC | SHOPT_FAST_EMULATION) struct vcpu *v; for_each_vcpu(d, v) { #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC /* No longer safe to look for a writeable mapping in this shadow */ if ( v->arch.paging.shadow.last_writeable_pte_smfn == mfn_x(page_to_mfn(sp)) ) v->arch.paging.shadow.last_writeable_pte_smfn = 0; #endif #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION v->arch.paging.last_write_emul_ok = 0; #endif } #endif /* Get the next page before we overwrite the list header */ if ( i < pages - 1 ) next = pdx_to_page(sp->list.next); /* Strip out the type: this is now a free shadow page */ sp->u.sh.type = sp->u.sh.head = 0; /* Remember the TLB timestamp so we will know whether to flush * TLBs when we reuse the page. Because the destructors leave the * contents of the pages in place, we can delay TLB flushes until * just before the allocator hands the page out again. */ sp->tlbflush_timestamp = tlbflush_current_time(); perfc_decr(shadow_alloc_count); page_list_add_tail(sp, &d->arch.paging.shadow.freelist); sp = next; } d->arch.paging.shadow.free_pages += pages; } /* Divert a page from the pool to be used by the p2m mapping. * This action is irreversible: the p2m mapping only ever grows. * That's OK because the p2m table only exists for translated domains, * and those domains can't ever turn off shadow mode. */ static struct page_info * shadow_alloc_p2m_page(struct domain *d) { struct page_info *pg; /* This is called both from the p2m code (which never holds the * paging lock) and the log-dirty code (which always does). 
*/ paging_lock_recursive(d); if ( d->arch.paging.shadow.total_pages < shadow_min_acceptable_pages(d) + 1 ) { if ( !d->arch.paging.p2m_alloc_failed ) { d->arch.paging.p2m_alloc_failed = 1; dprintk(XENLOG_ERR, "d%i failed to allocate from shadow pool", d->domain_id); } paging_unlock(d); return NULL; } shadow_prealloc(d, SH_type_p2m_table, 1); pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0)); d->arch.paging.shadow.p2m_pages++; d->arch.paging.shadow.total_pages--; paging_unlock(d); /* Unlike shadow pages, mark p2m pages as owned by the domain. * Marking the domain as the owner would normally allow the guest to * create mappings of these pages, but these p2m pages will never be * in the domain's guest-physical address space, and so that is not * believed to be a concern. */ page_set_owner(pg, d); pg->count_info |= 1; return pg; } static void shadow_free_p2m_page(struct domain *d, struct page_info *pg) { ASSERT(page_get_owner(pg) == d); /* Should have just the one ref we gave it in alloc_p2m_page() */ if ( (pg->count_info & PGC_count_mask) != 1 ) { SHADOW_ERROR("Odd p2m page count c=%#lx t=%"PRtype_info"\n", pg->count_info, pg->u.inuse.type_info); } pg->count_info &= ~PGC_count_mask; pg->u.sh.type = SH_type_p2m_table; /* p2m code reuses type-info */ page_set_owner(pg, NULL); /* This is called both from the p2m code (which never holds the * paging lock) and the log-dirty code (which always does). */ paging_lock_recursive(d); shadow_free(d, page_to_mfn(pg)); d->arch.paging.shadow.p2m_pages--; d->arch.paging.shadow.total_pages++; paging_unlock(d); } /* Set the pool of shadow pages to the required number of pages. * Input will be rounded up to at least shadow_min_acceptable_pages(), * plus space for the p2m table. * Returns 0 for success, non-zero for failure. */ static unsigned int sh_set_allocation(struct domain *d, unsigned int pages, int *preempted) { struct page_info *sp; unsigned int lower_bound; ASSERT(paging_locked_by_me(d)); if ( pages > 0 ) { /* Check for minimum value. */ if ( pages < d->arch.paging.shadow.p2m_pages ) pages = 0; else pages -= d->arch.paging.shadow.p2m_pages; /* Don't allocate less than the minimum acceptable, plus one page per * megabyte of RAM (for the p2m table) */ lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256); if ( pages < lower_bound ) pages = lower_bound; } SHADOW_PRINTK("current %i target %i\n", d->arch.paging.shadow.total_pages, pages); while ( d->arch.paging.shadow.total_pages != pages ) { if ( d->arch.paging.shadow.total_pages < pages ) { /* Need to allocate more memory from domheap */ sp = (struct page_info *) alloc_domheap_page(NULL, MEMF_node(domain_to_node(d))); if ( sp == NULL ) { SHADOW_PRINTK("failed to allocate shadow pages.\n"); return -ENOMEM; } d->arch.paging.shadow.free_pages++; d->arch.paging.shadow.total_pages++; sp->u.sh.type = 0; sp->u.sh.pinned = 0; sp->u.sh.count = 0; sp->tlbflush_timestamp = 0; /* Not in any TLB */ page_list_add_tail(sp, &d->arch.paging.shadow.freelist); } else if ( d->arch.paging.shadow.total_pages > pages ) { /* Need to return memory to domheap */ _shadow_prealloc(d, 1); sp = page_list_remove_head(&d->arch.paging.shadow.freelist); ASSERT(sp); /* * The pages were allocated anonymously, but the owner field * gets overwritten normally, so need to clear it here. 
 */
            page_set_owner(sp, NULL);
            d->arch.paging.shadow.free_pages--;
            d->arch.paging.shadow.total_pages--;
            free_domheap_page(sp);
        }

        /* Check to see if we need to yield and try again */
        if ( preempted && hypercall_preempt_check() )
        {
            *preempted = 1;
            return 0;
        }
    }

    return 0;
}

/* Return the size of the shadow pool, rounded up to the nearest MB */
static unsigned int shadow_get_allocation(struct domain *d)
{
    unsigned int pg = d->arch.paging.shadow.total_pages
        + d->arch.paging.shadow.p2m_pages;
    return ((pg >> (20 - PAGE_SHIFT))
            + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
}

/**************************************************************************/
/* Hash table for storing the guest->shadow mappings.
 * The table itself is an array of pointers to shadows; the shadows are then
 * threaded on a singly-linked list of shadows with the same hash value */

#define SHADOW_HASH_BUCKETS 251
/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */

/* Hash function that takes a gfn or mfn, plus another byte of type info */
typedef u32 key_t;
static inline key_t sh_hash(unsigned long n, unsigned int t)
{
    unsigned char *p = (unsigned char *)&n;
    key_t k = t;
    int i;
    for ( i = 0; i < sizeof(n) ; i++ )
        k = (u32)p[i] + (k<<6) + (k<<16) - k;
    return k % SHADOW_HASH_BUCKETS;
}

#if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL)

/* Before we get to the mechanism, define a pair of audit functions
 * that sanity-check the contents of the hash table. */
static void sh_hash_audit_bucket(struct domain *d, int bucket)
/* Audit one bucket of the hash table */
{
    struct page_info *sp, *x;

    if ( !(SHADOW_AUDIT_ENABLE) )
        return;

    sp = d->arch.paging.shadow.hash_table[bucket];
    while ( sp )
    {
        /* Not a shadow? */
        BUG_ON( (sp->count_info & PGC_count_mask) != 0 );
        /* Bogus type? */
        BUG_ON( sp->u.sh.type == 0 );
        BUG_ON( sp->u.sh.type > SH_type_max_shadow );
        /* Wrong page of a multi-page shadow? */
        BUG_ON( !sp->u.sh.head );
        /* Wrong bucket? */
        BUG_ON( sh_hash(__backpointer(sp), sp->u.sh.type) != bucket );
        /* Duplicate entry? */
        for ( x = next_shadow(sp); x; x = next_shadow(x) )
            BUG_ON( x->v.sh.back == sp->v.sh.back &&
                    x->u.sh.type == sp->u.sh.type );
        /* Follow the backpointer to the guest pagetable */
        if ( sp->u.sh.type != SH_type_fl1_32_shadow
             && sp->u.sh.type != SH_type_fl1_pae_shadow
             && sp->u.sh.type != SH_type_fl1_64_shadow )
        {
            struct page_info *gpg = mfn_to_page(backpointer(sp));
            /* Bad shadow flags on guest page? */
            BUG_ON( !(gpg->shadow_flags & (1<<sp->u.sh.type)) );
            /* Bad type count on guest page?
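 * A shadowed page should normally hold no PGT_writable_page type
 * references at all; the checks below tolerate that only for L1-shadowed
 * pages that have been allowed to go out of sync.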
*/ #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( sp->u.sh.type == SH_type_l1_32_shadow || sp->u.sh.type == SH_type_l1_pae_shadow || sp->u.sh.type == SH_type_l1_64_shadow ) { if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && (gpg->u.inuse.type_info & PGT_count_mask) != 0 ) { if ( !page_is_out_of_sync(gpg) ) { SHADOW_ERROR("MFN %#"PRI_mfn" shadowed (by %#"PRI_mfn")" " and not OOS but has typecount %#lx\n", __backpointer(sp), mfn_x(page_to_mfn(sp)), gpg->u.inuse.type_info); BUG(); } } } else /* Not an l1 */ #endif if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && (gpg->u.inuse.type_info & PGT_count_mask) != 0 ) { SHADOW_ERROR("MFN %#"PRI_mfn" shadowed (by %#"PRI_mfn")" " but has typecount %#lx\n", __backpointer(sp), mfn_x(page_to_mfn(sp)), gpg->u.inuse.type_info); BUG(); } } /* That entry was OK; on we go */ sp = next_shadow(sp); } } #else #define sh_hash_audit_bucket(_d, _b) do {} while(0) #endif /* Hashtable bucket audit */ #if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL static void sh_hash_audit(struct domain *d) /* Full audit: audit every bucket in the table */ { int i; if ( !(SHADOW_AUDIT_ENABLE) ) return; for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) { sh_hash_audit_bucket(d, i); } } #else #define sh_hash_audit(_d) do {} while(0) #endif /* Hashtable bucket audit */ /* Allocate and initialise the table itself. * Returns 0 for success, 1 for error. */ static int shadow_hash_alloc(struct domain *d) { struct page_info **table; ASSERT(paging_locked_by_me(d)); ASSERT(!d->arch.paging.shadow.hash_table); table = xzalloc_array(struct page_info *, SHADOW_HASH_BUCKETS); if ( !table ) return 1; d->arch.paging.shadow.hash_table = table; return 0; } /* Tear down the hash table and return all memory to Xen. * This function does not care whether the table is populated. */ static void shadow_hash_teardown(struct domain *d) { ASSERT(paging_locked_by_me(d)); ASSERT(d->arch.paging.shadow.hash_table); xfree(d->arch.paging.shadow.hash_table); d->arch.paging.shadow.hash_table = NULL; } mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t) /* Find an entry in the hash table. 
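 * Keys are (backpointer, shadow type) pairs, hashed by sh_hash() above;
 * entries sharing a bucket are chained through their next_shadow
 * pointers, and a hit is pulled to the front of its chain (unless a
 * hash walk is in progress) so hot shadows stay cheap to find.  Callers
 * look up a shadow of one known type, e.g.
 *     smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l1_32_shadow);
 * and must check mfn_valid() on the result.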
Returns the MFN of the shadow, * or INVALID_MFN if it doesn't exist */ { struct domain *d = v->domain; struct page_info *sp, *prev; key_t key; ASSERT(paging_locked_by_me(d)); ASSERT(d->arch.paging.shadow.hash_table); ASSERT(t); sh_hash_audit(d); perfc_incr(shadow_hash_lookups); key = sh_hash(n, t); sh_hash_audit_bucket(d, key); sp = d->arch.paging.shadow.hash_table[key]; prev = NULL; while(sp) { if ( __backpointer(sp) == n && sp->u.sh.type == t ) { /* Pull-to-front if 'sp' isn't already the head item */ if ( unlikely(sp != d->arch.paging.shadow.hash_table[key]) ) { if ( unlikely(d->arch.paging.shadow.hash_walking != 0) ) /* Can't reorder: someone is walking the hash chains */ return page_to_mfn(sp); else { ASSERT(prev); /* Delete sp from the list */ prev->next_shadow = sp->next_shadow; /* Re-insert it at the head of the list */ set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]); d->arch.paging.shadow.hash_table[key] = sp; } } else { perfc_incr(shadow_hash_lookup_head); } return page_to_mfn(sp); } prev = sp; sp = next_shadow(sp); } perfc_incr(shadow_hash_lookup_miss); return _mfn(INVALID_MFN); } void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t, mfn_t smfn) /* Put a mapping (n,t)->smfn into the hash table */ { struct domain *d = v->domain; struct page_info *sp; key_t key; ASSERT(paging_locked_by_me(d)); ASSERT(d->arch.paging.shadow.hash_table); ASSERT(t); sh_hash_audit(d); perfc_incr(shadow_hash_inserts); key = sh_hash(n, t); sh_hash_audit_bucket(d, key); /* Insert this shadow at the top of the bucket */ sp = mfn_to_page(smfn); set_next_shadow(sp, d->arch.paging.shadow.hash_table[key]); d->arch.paging.shadow.hash_table[key] = sp; sh_hash_audit_bucket(d, key); } void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t, mfn_t smfn) /* Excise the mapping (n,t)->smfn from the hash table */ { struct domain *d = v->domain; struct page_info *sp, *x; key_t key; ASSERT(paging_locked_by_me(d)); ASSERT(d->arch.paging.shadow.hash_table); ASSERT(t); sh_hash_audit(d); perfc_incr(shadow_hash_deletes); key = sh_hash(n, t); sh_hash_audit_bucket(d, key); sp = mfn_to_page(smfn); if ( d->arch.paging.shadow.hash_table[key] == sp ) /* Easy case: we're deleting the head item. */ d->arch.paging.shadow.hash_table[key] = next_shadow(sp); else { /* Need to search for the one we want */ x = d->arch.paging.shadow.hash_table[key]; while ( 1 ) { ASSERT(x); /* We can't have hit the end, since our target is * still in the chain somehwere... */ if ( next_shadow(x) == sp ) { x->next_shadow = sp->next_shadow; break; } x = next_shadow(x); } } set_next_shadow(sp, NULL); sh_hash_audit_bucket(d, key); } typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn); static void hash_foreach(struct vcpu *v, unsigned int callback_mask, const hash_callback_t callbacks[], mfn_t callback_mfn) /* Walk the hash table looking at the types of the entries and * calling the appropriate callback function for each entry. * The mask determines which shadow types we call back for, and the array * of callbacks tells us which function to call. * Any callback may return non-zero to let us skip the rest of the scan. * * WARNING: Callbacks MUST NOT add or remove hash entries unless they * then return non-zero to terminate the scan. */ { int i, done = 0; struct domain *d = v->domain; struct page_info *x; ASSERT(paging_locked_by_me(d)); /* Can be called via p2m code &c after shadow teardown. 
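 * If the hash table has already been freed, the early return below just
 * turns the walk into a no-op rather than an error.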
*/ if ( unlikely(!d->arch.paging.shadow.hash_table) ) return; /* Say we're here, to stop hash-lookups reordering the chains */ ASSERT(d->arch.paging.shadow.hash_walking == 0); d->arch.paging.shadow.hash_walking = 1; for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) { /* WARNING: This is not safe against changes to the hash table. * The callback *must* return non-zero if it has inserted or * deleted anything from the hash (lookups are OK, though). */ for ( x = d->arch.paging.shadow.hash_table[i]; x; x = next_shadow(x) ) { if ( callback_mask & (1 << x->u.sh.type) ) { ASSERT(x->u.sh.type <= 15); ASSERT(callbacks[x->u.sh.type] != NULL); done = callbacks[x->u.sh.type](v, page_to_mfn(x), callback_mfn); if ( done ) break; } } if ( done ) break; } d->arch.paging.shadow.hash_walking = 0; } /**************************************************************************/ /* Destroy a shadow page: simple dispatcher to call the per-type destructor * which will decrement refcounts appropriately and return memory to the * free pool. */ void sh_destroy_shadow(struct vcpu *v, mfn_t smfn) { struct page_info *sp = mfn_to_page(smfn); unsigned int t = sp->u.sh.type; SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn)); /* Double-check, if we can, that the shadowed page belongs to this * domain, (by following the back-pointer). */ ASSERT(t == SH_type_fl1_32_shadow || t == SH_type_fl1_pae_shadow || t == SH_type_fl1_64_shadow || t == SH_type_monitor_table || (is_pv_32on64_vcpu(v) && t == SH_type_l4_64_shadow) || (page_get_owner(mfn_to_page(backpointer(sp))) == v->domain)); /* The down-shifts here are so that the switch statement is on nice * small numbers that the compiler will enjoy */ switch ( t ) { case SH_type_l1_32_shadow: case SH_type_fl1_32_shadow: SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2)(v, smfn); break; case SH_type_l2_32_shadow: SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2)(v, smfn); break; case SH_type_l1_pae_shadow: case SH_type_fl1_pae_shadow: SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3)(v, smfn); break; case SH_type_l2_pae_shadow: case SH_type_l2h_pae_shadow: SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3)(v, smfn); break; case SH_type_l1_64_shadow: case SH_type_fl1_64_shadow: SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4)(v, smfn); break; case SH_type_l2h_64_shadow: ASSERT(is_pv_32on64_vcpu(v)); /* Fall through... */ case SH_type_l2_64_shadow: SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4)(v, smfn); break; case SH_type_l3_64_shadow: SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4)(v, smfn); break; case SH_type_l4_64_shadow: SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4)(v, smfn); break; default: SHADOW_ERROR("tried to destroy shadow of bad type %08lx\n", (unsigned long)t); BUG(); } } static inline void trace_shadow_wrmap_bf(mfn_t gmfn) { if ( tb_init_done ) { /* Convert gmfn to gfn */ unsigned long gfn = mfn_to_gfn(current->domain, gmfn); __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), &gfn); } } /**************************************************************************/ /* Remove all writeable mappings of a guest frame from the shadow tables * Returns non-zero if we need to flush TLBs. * level and fault_addr desribe how we found this to be a pagetable; * level==0 means we have some other reason for revoking write access. * If level==0 we are allowed to fail, returning -1. 
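 * In practice that means: 0 if there was nothing to revoke, 1 if at
 * least one writeable mapping was removed (so TLBs must be flushed),
 * and -1 only in that level==0 case.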
*/ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn, unsigned int level, unsigned long fault_addr) { /* Dispatch table for getting per-type functions */ static const hash_callback_t callbacks[SH_type_unused] = { NULL, /* none */ SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* l1_32 */ SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 2), /* fl1_32 */ NULL, /* l2_32 */ SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* l1_pae */ SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 3), /* fl1_pae */ NULL, /* l2_pae */ NULL, /* l2h_pae */ SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* l1_64 */ SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, 4), /* fl1_64 */ NULL, /* l2_64 */ NULL, /* l2h_64 */ NULL, /* l3_64 */ NULL, /* l4_64 */ NULL, /* p2m */ NULL /* unused */ }; static unsigned int callback_mask = 1 << SH_type_l1_32_shadow | 1 << SH_type_fl1_32_shadow | 1 << SH_type_l1_pae_shadow | 1 << SH_type_fl1_pae_shadow | 1 << SH_type_l1_64_shadow | 1 << SH_type_fl1_64_shadow ; struct page_info *pg = mfn_to_page(gmfn); ASSERT(paging_locked_by_me(v->domain)); /* Only remove writable mappings if we are doing shadow refcounts. * In guest refcounting, we trust Xen to already be restricting * all the writes to the guest page tables, so we do not need to * do more. */ if ( !shadow_mode_refcounts(v->domain) ) return 0; /* Early exit if it's already a pagetable, or otherwise not writeable */ if ( (sh_mfn_is_a_page_table(gmfn) #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Unless they've been allowed to go out of sync with their shadows */ && !mfn_oos_may_write(gmfn) #endif ) || (pg->u.inuse.type_info & PGT_count_mask) == 0 ) return 0; TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP); perfc_incr(shadow_writeable); /* If this isn't a "normal" writeable page, the domain is trying to * put pagetables in special memory of some kind. We can't allow that. 
*/ if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page ) { SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %" PRtype_info "\n", mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info); domain_crash(v->domain); } #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC if ( v == current ) { unsigned long gfn; /* Heuristic: there is likely to be only one writeable mapping, * and that mapping is likely to be in the current pagetable, * in the guest's linear map (on non-HIGHPTE linux and windows)*/ #define GUESS(_a, _h) do { \ if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \ perfc_incr(shadow_writeable_h_ ## _h); \ if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \ { \ TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); \ return 1; \ } \ } while (0) if ( v->arch.paging.mode->guest_levels == 2 ) { if ( level == 1 ) /* 32bit non-PAE w2k3: linear map at 0xC0000000 */ GUESS(0xC0000000UL + (fault_addr >> 10), 1); /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */ if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 ) GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4); /* FreeBSD: Linear map at 0xBFC00000 */ if ( level == 1 ) GUESS(0xBFC00000UL + ((fault_addr & VADDR_MASK) >> 10), 6); } else if ( v->arch.paging.mode->guest_levels == 3 ) { /* 32bit PAE w2k3: linear map at 0xC0000000 */ switch ( level ) { case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break; case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break; } /* Linux lowmem: first 896MB is mapped 1-to-1 above 0xC0000000 */ if ((gfn = mfn_to_gfn(v->domain, gmfn)) < 0x38000 ) GUESS(0xC0000000UL + (gfn << PAGE_SHIFT), 4); /* FreeBSD PAE: Linear map at 0xBF800000 */ switch ( level ) { case 1: GUESS(0xBF800000UL + ((fault_addr & VADDR_MASK) >> 9), 6); break; case 2: GUESS(0xBFDFC000UL + ((fault_addr & VADDR_MASK) >> 18), 6); break; } } else if ( v->arch.paging.mode->guest_levels == 4 ) { /* 64bit w2k3: linear map at 0xfffff68000000000 */ switch ( level ) { case 1: GUESS(0xfffff68000000000UL + ((fault_addr & VADDR_MASK) >> 9), 3); break; case 2: GUESS(0xfffff6fb40000000UL + ((fault_addr & VADDR_MASK) >> 18), 3); break; case 3: GUESS(0xfffff6fb7da00000UL + ((fault_addr & VADDR_MASK) >> 27), 3); break; } /* 64bit Linux direct map at 0xffff880000000000; older kernels * had it at 0xffff810000000000, and older kernels yet had it * at 0x0000010000000000UL */ gfn = mfn_to_gfn(v->domain, gmfn); GUESS(0xffff880000000000UL + (gfn << PAGE_SHIFT), 4); GUESS(0xffff810000000000UL + (gfn << PAGE_SHIFT), 4); GUESS(0x0000010000000000UL + (gfn << PAGE_SHIFT), 4); /* * 64bit Solaris kernel page map at * kpm_vbase; 0xfffffe0000000000UL */ GUESS(0xfffffe0000000000UL + (gfn << PAGE_SHIFT), 4); /* FreeBSD 64bit: linear map 0xffff800000000000 */ switch ( level ) { case 1: GUESS(0xffff800000000000 + ((fault_addr & VADDR_MASK) >> 9), 6); break; case 2: GUESS(0xffff804000000000UL + ((fault_addr & VADDR_MASK) >> 18), 6); break; case 3: GUESS(0xffff804020000000UL + ((fault_addr & VADDR_MASK) >> 27), 6); break; } /* FreeBSD 64bit: direct map at 0xffffff0000000000 */ GUESS(0xffffff0000000000 + (gfn << PAGE_SHIFT), 6); } #undef GUESS } if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) return 1; /* Second heuristic: on HIGHPTE linux, there are two particular PTEs * (entries in the fixmap) where linux maps its pagetables. Since * we expect to hit them most of the time, we start the search for * the writeable mapping by looking at the same MFN where the last * brute-force search succeeded. 
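 * If running that shadow's callback drops the last writeable reference,
 * the full brute-force hash walk below is skipped entirely.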
*/ if ( v->arch.paging.shadow.last_writeable_pte_smfn != 0 ) { unsigned long old_count = (pg->u.inuse.type_info & PGT_count_mask); mfn_t last_smfn = _mfn(v->arch.paging.shadow.last_writeable_pte_smfn); int shtype = mfn_to_page(last_smfn)->u.sh.type; if ( callbacks[shtype] ) callbacks[shtype](v, last_smfn, gmfn); if ( (pg->u.inuse.type_info & PGT_count_mask) != old_count ) perfc_incr(shadow_writeable_h_5); } if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) return 1; #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */ /* Brute-force search of all the shadows, by walking the hash */ trace_shadow_wrmap_bf(gmfn); if ( level == 0 ) perfc_incr(shadow_writeable_bf_1); else perfc_incr(shadow_writeable_bf); hash_foreach(v, callback_mask, callbacks, gmfn); /* If that didn't catch the mapping, then there's some non-pagetable * mapping -- ioreq page, grant mapping, &c. */ if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 ) { if ( level == 0 ) return -1; SHADOW_ERROR("can't remove write access to mfn %lx: guest has " "%lu special-use mappings of it\n", mfn_x(gmfn), (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask)); domain_crash(v->domain); } /* We killed at least one writeable mapping, so must flush TLBs. */ return 1; } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off) { struct page_info *sp = mfn_to_page(smfn); ASSERT(mfn_valid(smfn)); ASSERT(mfn_valid(gmfn)); if ( sp->u.sh.type == SH_type_l1_32_shadow || sp->u.sh.type == SH_type_fl1_32_shadow ) { return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2) (v, gmfn, smfn, off); } else if ( sp->u.sh.type == SH_type_l1_pae_shadow || sp->u.sh.type == SH_type_fl1_pae_shadow ) return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3) (v, gmfn, smfn, off); else if ( sp->u.sh.type == SH_type_l1_64_shadow || sp->u.sh.type == SH_type_fl1_64_shadow ) return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4) (v, gmfn, smfn, off); return 0; } #endif /**************************************************************************/ /* Remove all mappings of a guest frame from the shadow tables. * Returns non-zero if we need to flush TLBs. 
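 * A typical caller therefore does something like
 *     if ( sh_remove_all_mappings(v, mfn) )
 *         flush_tlb_mask(d->domain_dirty_cpumask);
 * as the p2m update path later in this file does when a gfn->mfn
 * mapping disappears.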
*/ int sh_remove_all_mappings(struct vcpu *v, mfn_t gmfn) { struct page_info *page = mfn_to_page(gmfn); /* Dispatch table for getting per-type functions */ static const hash_callback_t callbacks[SH_type_unused] = { NULL, /* none */ SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* l1_32 */ SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 2), /* fl1_32 */ NULL, /* l2_32 */ SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* l1_pae */ SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 3), /* fl1_pae */ NULL, /* l2_pae */ NULL, /* l2h_pae */ SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* l1_64 */ SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, 4), /* fl1_64 */ NULL, /* l2_64 */ NULL, /* l2h_64 */ NULL, /* l3_64 */ NULL, /* l4_64 */ NULL, /* p2m */ NULL /* unused */ }; static unsigned int callback_mask = 1 << SH_type_l1_32_shadow | 1 << SH_type_fl1_32_shadow | 1 << SH_type_l1_pae_shadow | 1 << SH_type_fl1_pae_shadow | 1 << SH_type_l1_64_shadow | 1 << SH_type_fl1_64_shadow ; perfc_incr(shadow_mappings); if ( sh_check_page_has_no_refs(page) ) return 0; /* Although this is an externally visible function, we do not know * whether the paging lock will be held when it is called (since it * can be called via put_page_type when we clear a shadow l1e).*/ paging_lock_recursive(v->domain); /* XXX TODO: * Heuristics for finding the (probably) single mapping of this gmfn */ /* Brute-force search of all the shadows, by walking the hash */ perfc_incr(shadow_mappings_bf); hash_foreach(v, callback_mask, callbacks, gmfn); /* If that didn't catch the mapping, something is very wrong */ if ( !sh_check_page_has_no_refs(page) ) { /* Don't complain if we're in HVM and there are some extra mappings: * The qemu helper process has an untyped mapping of this dom's RAM * and the HVM restore program takes another. * Also allow one typed refcount for xenheap pages, to match * share_xen_page_with_guest(). */ if ( !(shadow_mode_external(v->domain) && (page->count_info & PGC_count_mask) <= 3 && ((page->u.inuse.type_info & PGT_count_mask) == !!is_xen_heap_page(page))) ) { SHADOW_ERROR("can't find all mappings of mfn %lx: " "c=%08lx t=%08lx\n", mfn_x(gmfn), page->count_info, page->u.inuse.type_info); } } paging_unlock(v->domain); /* We killed at least one mapping, so must flush TLBs. */ return 1; } /**************************************************************************/ /* Remove all shadows of a guest frame from the shadow tables */ static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn) /* Follow this shadow's up-pointer, if it has one, and remove the reference * found there. Returns 1 if that was the only reference to this shadow */ { struct page_info *sp = mfn_to_page(smfn); mfn_t pmfn; void *vaddr; int rc; ASSERT(sp->u.sh.type > 0); ASSERT(sp->u.sh.type < SH_type_max_shadow); ASSERT(sh_type_has_up_pointer(v, sp->u.sh.type)); if (sp->up == 0) return 0; pmfn = _mfn(sp->up >> PAGE_SHIFT); ASSERT(mfn_valid(pmfn)); vaddr = sh_map_domain_page(pmfn); ASSERT(vaddr); vaddr += sp->up & (PAGE_SIZE-1); ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn)); /* Is this the only reference to this shadow? */ rc = (sp->u.sh.count == 1) ? 
1 : 0; /* Blank the offending entry */ switch (sp->u.sh.type) { case SH_type_l1_32_shadow: case SH_type_l2_32_shadow: SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 2)(v, vaddr, pmfn); break; case SH_type_l1_pae_shadow: case SH_type_l2_pae_shadow: case SH_type_l2h_pae_shadow: SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 3)(v, vaddr, pmfn); break; case SH_type_l1_64_shadow: case SH_type_l2_64_shadow: case SH_type_l2h_64_shadow: case SH_type_l3_64_shadow: case SH_type_l4_64_shadow: SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, 4)(v, vaddr, pmfn); break; default: BUG(); /* Some wierd unknown shadow type */ } sh_unmap_domain_page(vaddr); if ( rc ) perfc_incr(shadow_up_pointer); else perfc_incr(shadow_unshadow_bf); return rc; } void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all) /* Remove the shadows of this guest page. * If fast != 0, just try the quick heuristic, which will remove * at most one reference to each shadow of the page. Otherwise, walk * all the shadow tables looking for refs to shadows of this gmfn. * If all != 0, kill the domain if we can't find all the shadows. * (all != 0 implies fast == 0) */ { struct page_info *pg = mfn_to_page(gmfn); mfn_t smfn; unsigned char t; /* Dispatch table for getting per-type functions: each level must * be called with the function to remove a lower-level shadow. */ static const hash_callback_t callbacks[SH_type_unused] = { NULL, /* none */ NULL, /* l1_32 */ NULL, /* fl1_32 */ SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 2), /* l2_32 */ NULL, /* l1_pae */ NULL, /* fl1_pae */ SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2_pae */ SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 3), /* l2h_pae */ NULL, /* l1_64 */ NULL, /* fl1_64 */ SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2_64 */ SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, 4), /* l2h_64 */ SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, 4), /* l3_64 */ SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, 4), /* l4_64 */ NULL, /* p2m */ NULL /* unused */ }; /* Another lookup table, for choosing which mask to use */ static unsigned int masks[SH_type_unused] = { 0, /* none */ 1 << SH_type_l2_32_shadow, /* l1_32 */ 0, /* fl1_32 */ 0, /* l2_32 */ ((1 << SH_type_l2h_pae_shadow) | (1 << SH_type_l2_pae_shadow)), /* l1_pae */ 0, /* fl1_pae */ 0, /* l2_pae */ 0, /* l2h_pae */ ((1 << SH_type_l2h_64_shadow) | (1 << SH_type_l2_64_shadow)), /* l1_64 */ 0, /* fl1_64 */ 1 << SH_type_l3_64_shadow, /* l2_64 */ 1 << SH_type_l3_64_shadow, /* l2h_64 */ 1 << SH_type_l4_64_shadow, /* l3_64 */ 0, /* l4_64 */ 0, /* p2m */ 0 /* unused */ }; ASSERT(!(all && fast)); ASSERT(mfn_valid(gmfn)); /* Although this is an externally visible function, we do not know * whether the paging lock will be held when it is called (since it * can be called via put_page_type when we clear a shadow l1e).*/ paging_lock_recursive(v->domain); SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n", v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); /* Bail out now if the page is not shadowed */ if ( (pg->count_info & PGC_page_table) == 0 ) { paging_unlock(v->domain); return; } /* Search for this shadow in all appropriate shadows */ perfc_incr(shadow_unshadow); /* Lower-level shadows need to be excised from upper-level shadows. * This call to hash_foreach() looks dangerous but is in fact OK: each * call will remove at most one shadow, and terminate immediately when * it does remove it, so we never walk the hash after doing a deletion. 
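 * The DO_UNSHADOW() macro below packages that up per shadow type: look
 * the shadow up in the hash, drop its pin or up-pointer reference, and
 * only fall back to the hash walk if the page still appears shadowed.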
 */
#define DO_UNSHADOW(_type) do {                                 \
    t = (_type);                                                \
    if( !(pg->count_info & PGC_page_table)                      \
        || !(pg->shadow_flags & (1 << t)) )                     \
        break;                                                  \
    smfn = shadow_hash_lookup(v, mfn_x(gmfn), t);               \
    if ( unlikely(!mfn_valid(smfn)) )                           \
    {                                                           \
        SHADOW_ERROR(": gmfn %#lx has flags %#"PRIx32           \
                     " but no type-%#"PRIx32" shadow\n",        \
                     mfn_x(gmfn), (uint32_t)pg->shadow_flags, t); \
        break;                                                  \
    }                                                           \
    if ( sh_type_is_pinnable(v, t) )                            \
        sh_unpin(v, smfn);                                      \
    else if ( sh_type_has_up_pointer(v, t) )                    \
        sh_remove_shadow_via_pointer(v, smfn);                  \
    if( !fast                                                   \
        && (pg->count_info & PGC_page_table)                    \
        && (pg->shadow_flags & (1 << t)) )                      \
        hash_foreach(v, masks[t], callbacks, smfn);             \
} while (0)

    DO_UNSHADOW(SH_type_l2_32_shadow);
    DO_UNSHADOW(SH_type_l1_32_shadow);
    DO_UNSHADOW(SH_type_l2h_pae_shadow);
    DO_UNSHADOW(SH_type_l2_pae_shadow);
    DO_UNSHADOW(SH_type_l1_pae_shadow);
    DO_UNSHADOW(SH_type_l4_64_shadow);
    DO_UNSHADOW(SH_type_l3_64_shadow);
    DO_UNSHADOW(SH_type_l2h_64_shadow);
    DO_UNSHADOW(SH_type_l2_64_shadow);
    DO_UNSHADOW(SH_type_l1_64_shadow);

#undef DO_UNSHADOW

    /* If that didn't catch the shadows, something is wrong */
    if ( !fast && all && (pg->count_info & PGC_page_table) )
    {
        SHADOW_ERROR("can't find all shadows of mfn %05lx "
                     "(shadow_flags=%08x)\n",
                     mfn_x(gmfn), pg->shadow_flags);
        domain_crash(v->domain);
    }

    /* Need to flush TLBs now, so that linear maps are safe next time we
     * take a fault. */
    flush_tlb_mask(v->domain->domain_dirty_cpumask);

    paging_unlock(v->domain);
}

static void
sh_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
/* Even harsher: this is an HVM page that we think is no longer a pagetable.
 * Unshadow it, and recursively unshadow pages that reference it. */
{
    sh_remove_shadows(v, gmfn, 0, 1);
    /* XXX TODO:
     * Rework this hashtable walker to return a linked-list of all
     * the shadows it modified, then do breadth-first recursion
     * to find the way up to higher-level tables and unshadow them too.
     *
     * The current code (just tearing down each page's shadows as we
     * detect that it is not a pagetable) is correct, but very slow.
     * It means extra emulated writes and slows down removal of mappings. */
}

/**************************************************************************/

/* Reset the up-pointers of every L3 shadow to 0.
 * This is called when l3 shadows stop being pinnable, to clear out all
 * the list-head bits so the up-pointer field is properly initialised.
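 * This reuses the generic hash_foreach() walker: the callback table in
 * sh_reset_l3_up_pointers() is empty except for the l3_64 slot, and the
 * callback_mask selects only SH_type_l3_64_shadow entries.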
*/ static int sh_clear_up_pointer(struct vcpu *v, mfn_t smfn, mfn_t unused) { mfn_to_page(smfn)->up = 0; return 0; } void sh_reset_l3_up_pointers(struct vcpu *v) { static hash_callback_t callbacks[SH_type_unused] = { NULL, /* none */ NULL, /* l1_32 */ NULL, /* fl1_32 */ NULL, /* l2_32 */ NULL, /* l1_pae */ NULL, /* fl1_pae */ NULL, /* l2_pae */ NULL, /* l2h_pae */ NULL, /* l1_64 */ NULL, /* fl1_64 */ NULL, /* l2_64 */ NULL, /* l2h_64 */ sh_clear_up_pointer, /* l3_64 */ NULL, /* l4_64 */ NULL, /* p2m */ NULL /* unused */ }; static unsigned int callback_mask = 1 << SH_type_l3_64_shadow; hash_foreach(v, callback_mask, callbacks, _mfn(INVALID_MFN)); } /**************************************************************************/ static void sh_update_paging_modes(struct vcpu *v) { struct domain *d = v->domain; const struct paging_mode *old_mode = v->arch.paging.mode; ASSERT(paging_locked_by_me(d)); #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* Make sure this vcpu has a virtual TLB array allocated */ if ( unlikely(!v->arch.paging.vtlb) ) { v->arch.paging.vtlb = xzalloc_array(struct shadow_vtlb, VTLB_ENTRIES); if ( unlikely(!v->arch.paging.vtlb) ) { SHADOW_ERROR("Could not allocate vTLB space for dom %u vcpu %u\n", d->domain_id, v->vcpu_id); domain_crash(v->domain); return; } spin_lock_init(&v->arch.paging.vtlb_lock); } #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN ) { int i; for(i = 0; i < SHADOW_OOS_PAGES; i++) { shadow_prealloc(d, SH_type_oos_snapshot, 1); v->arch.paging.shadow.oos_snapshot[i] = shadow_alloc(d, SH_type_oos_snapshot, 0); } } #endif /* OOS */ // Valid transitions handled by this function: // - For PV guests: // - after a shadow mode has been changed // - For HVM guests: // - after a shadow mode has been changed // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE // // First, tear down any old shadow tables held by this vcpu. // if ( v->arch.paging.mode ) v->arch.paging.mode->shadow.detach_old_tables(v); if ( is_pv_domain(d) ) { /// /// PV guest /// v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 4); } else { /// /// HVM guest /// ASSERT(shadow_mode_translate(d)); ASSERT(shadow_mode_external(d)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Need to resync all our pages now, because if a page goes out * of sync with paging enabled and is resynced with paging * disabled, the resync will go wrong. */ shadow_resync_all(v); #endif /* OOS */ if ( !hvm_paging_enabled(v) ) { /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE * pagetable for it, mapping 4 GB one-to-one using a single l2 * page of 1024 superpage mappings */ v->arch.guest_table = d->arch.paging.shadow.unpaged_pagetable; v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2); } else if ( hvm_long_mode_enabled(v) ) { // long mode guest... v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 4); } else if ( hvm_pae_enabled(v) ) { // 32-bit PAE mode guest... v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3); } else { // 32-bit 2 level guest... 
v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 2); } if ( pagetable_is_null(v->arch.monitor_table) ) { mfn_t mmfn = v->arch.paging.mode->shadow.make_monitor_table(v); v->arch.monitor_table = pagetable_from_mfn(mmfn); make_cr3(v, mfn_x(mmfn)); hvm_update_host_cr3(v); } if ( v->arch.paging.mode != old_mode ) { SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d gl=%u " "sl=%u (was g=%u s=%u)\n", d->domain_id, v->vcpu_id, is_hvm_domain(d) ? hvm_paging_enabled(v) : 1, v->arch.paging.mode->guest_levels, v->arch.paging.mode->shadow.shadow_levels, old_mode ? old_mode->guest_levels : 0, old_mode ? old_mode->shadow.shadow_levels : 0); if ( old_mode && (v->arch.paging.mode->shadow.shadow_levels != old_mode->shadow.shadow_levels) ) { /* Need to make a new monitor table for the new mode */ mfn_t new_mfn, old_mfn; if ( v != current && vcpu_runnable(v) ) { SHADOW_ERROR("Some third party (d=%u v=%u) is changing " "this HVM vcpu's (d=%u v=%u) paging mode " "while it is running.\n", current->domain->domain_id, current->vcpu_id, v->domain->domain_id, v->vcpu_id); /* It's not safe to do that because we can't change * the host CR3 for a running domain */ domain_crash(v->domain); return; } old_mfn = pagetable_get_mfn(v->arch.monitor_table); v->arch.monitor_table = pagetable_null(); new_mfn = v->arch.paging.mode->shadow.make_monitor_table(v); v->arch.monitor_table = pagetable_from_mfn(new_mfn); SHADOW_PRINTK("new monitor table %"PRI_mfn "\n", mfn_x(new_mfn)); /* Don't be running on the old monitor table when we * pull it down! Switch CR3, and warn the HVM code that * its host cr3 has changed. */ make_cr3(v, mfn_x(new_mfn)); if ( v == current ) write_ptbase(v); hvm_update_host_cr3(v); old_mode->shadow.destroy_monitor_table(v, old_mfn); } } // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE. // These are HARD: think about the case where two CPU's have // different values for CR4.PSE and CR4.PGE at the same time. // This *does* happen, at least for CR4.PGE... } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* We need to check that all the vcpus have paging enabled to * unsync PTs. */ if ( is_hvm_domain(d) && !d->arch.paging.shadow.oos_off ) { int pe = 1; struct vcpu *vptr; for_each_vcpu(d, vptr) { if ( !hvm_paging_enabled(vptr) ) { pe = 0; break; } } d->arch.paging.shadow.oos_active = pe; } #endif /* OOS */ v->arch.paging.mode->update_cr3(v, 0); } void shadow_update_paging_modes(struct vcpu *v) { paging_lock(v->domain); sh_update_paging_modes(v); paging_unlock(v->domain); } /**************************************************************************/ /* Turning on and off shadow features */ static void sh_new_mode(struct domain *d, u32 new_mode) /* Inform all the vcpus that the shadow mode has been changed */ { struct vcpu *v; ASSERT(paging_locked_by_me(d)); ASSERT(d != current->domain); d->arch.paging.mode = new_mode; for_each_vcpu(d, v) sh_update_paging_modes(v); } int shadow_enable(struct domain *d, u32 mode) /* Turn on "permanent" shadow features: external, translate, refcount. * Can only be called once on a domain, and these features cannot be * disabled. * Returns 0 for success, -errno for failure. 
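 * The mode bits are not independent: PG_translate requires PG_refcounts
 * and PG_external requires PG_translate, as the sanity check at the top
 * of the function enforces; PG_SH_enable is OR'd in unconditionally.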
*/ { unsigned int old_pages; struct page_info *pg = NULL; uint32_t *e; int i, rv = 0; struct p2m_domain *p2m = p2m_get_hostp2m(d); mode |= PG_SH_enable; domain_pause(d); /* Sanity check the arguments */ if ( (d == current->domain) || shadow_mode_enabled(d) || ((mode & PG_translate) && !(mode & PG_refcounts)) || ((mode & PG_external) && !(mode & PG_translate)) ) { rv = -EINVAL; goto out_unlocked; } /* Init the shadow memory allocation if the user hasn't done so */ old_pages = d->arch.paging.shadow.total_pages; if ( old_pages == 0 ) { unsigned int r; paging_lock(d); r = sh_set_allocation(d, 1024, NULL); /* Use at least 4MB */ if ( r != 0 ) { sh_set_allocation(d, 0, NULL); rv = -ENOMEM; goto out_locked; } paging_unlock(d); } /* Allow p2m and log-dirty code to borrow shadow memory */ d->arch.paging.alloc_page = shadow_alloc_p2m_page; d->arch.paging.free_page = shadow_free_p2m_page; /* Init the P2M table. Must be done before we take the paging lock * to avoid possible deadlock. */ if ( mode & PG_translate ) { rv = p2m_alloc_table(p2m); if (rv != 0) goto out_unlocked; } /* HVM domains need an extra pagetable for vcpus that think they * have paging disabled */ if ( is_hvm_domain(d) ) { /* Get a single page from the shadow pool. Take it via the * P2M interface to make freeing it simpler afterwards. */ pg = shadow_alloc_p2m_page(d); if ( pg == NULL ) { rv = -ENOMEM; goto out_unlocked; } /* Fill it with 32-bit, non-PAE superpage entries, each mapping 4MB * of virtual address space onto the same physical address range */ e = __map_domain_page(pg); for ( i = 0; i < PAGE_SIZE / sizeof(*e); i++ ) e[i] = ((0x400000U * i) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); sh_unmap_domain_page(e); pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated; } paging_lock(d); /* Sanity check again with the lock held */ if ( shadow_mode_enabled(d) ) { rv = -EINVAL; goto out_locked; } /* Init the hash table */ if ( shadow_hash_alloc(d) != 0 ) { rv = -ENOMEM; goto out_locked; } #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) /* We assume we're dealing with an older 64bit linux guest until we * see the guest use more than one l4 per vcpu. */ d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL; #endif /* Record the 1-to-1 pagetable we just made */ if ( is_hvm_domain(d) ) d->arch.paging.shadow.unpaged_pagetable = pagetable_from_page(pg); /* Update the bits */ sh_new_mode(d, mode); out_locked: paging_unlock(d); out_unlocked: if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) p2m_teardown(p2m); if ( rv != 0 && pg != NULL ) shadow_free_p2m_page(d, pg); domain_unpause(d); return rv; } void shadow_teardown(struct domain *d) /* Destroy the shadow pagetables of this domain and free its shadow memory. * Should only be called for dying domains. 
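 * It may run more than once for the same domain: shadow_final_teardown()
 * calls it again if the domain still holds shadow memory when it is
 * finally destroyed.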
*/ { struct vcpu *v; mfn_t mfn; struct page_info *unpaged_pagetable = NULL; ASSERT(d->is_dying); ASSERT(d != current->domain); paging_lock(d); if ( shadow_mode_enabled(d) ) { /* Release the shadow and monitor tables held by each vcpu */ for_each_vcpu(d, v) { if ( v->arch.paging.mode ) { v->arch.paging.mode->shadow.detach_old_tables(v); if ( shadow_mode_external(d) ) { mfn = pagetable_get_mfn(v->arch.monitor_table); if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) ) v->arch.paging.mode->shadow.destroy_monitor_table(v, mfn); v->arch.monitor_table = pagetable_null(); } } } } #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) /* Free the virtual-TLB array attached to each vcpu */ for_each_vcpu(d, v) { #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) if ( v->arch.paging.vtlb ) { xfree(v->arch.paging.vtlb); v->arch.paging.vtlb = NULL; } #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) { int i; mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; for ( i = 0; i < SHADOW_OOS_PAGES; i++ ) if ( mfn_valid(oos_snapshot[i]) ) { shadow_free(d, oos_snapshot[i]); oos_snapshot[i] = _mfn(INVALID_MFN); } } #endif /* OOS */ } #endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */ if ( d->arch.paging.shadow.total_pages != 0 ) { SHADOW_PRINTK("teardown of domain %u starts." " Shadow pages total = %u, free = %u, p2m=%u\n", d->domain_id, d->arch.paging.shadow.total_pages, d->arch.paging.shadow.free_pages, d->arch.paging.shadow.p2m_pages); /* Destroy all the shadows and release memory to domheap */ sh_set_allocation(d, 0, NULL); /* Release the hash table back to xenheap */ if (d->arch.paging.shadow.hash_table) shadow_hash_teardown(d); /* Should not have any more memory held */ SHADOW_PRINTK("teardown done." " Shadow pages total = %u, free = %u, p2m=%u\n", d->arch.paging.shadow.total_pages, d->arch.paging.shadow.free_pages, d->arch.paging.shadow.p2m_pages); ASSERT(d->arch.paging.shadow.total_pages == 0); } /* Free the non-paged-vcpus pagetable; must happen after we've * destroyed any shadows of it or sh_destroy_shadow will get confused. */ if ( !pagetable_is_null(d->arch.paging.shadow.unpaged_pagetable) ) { for_each_vcpu(d, v) { ASSERT(is_hvm_vcpu(v)); if ( !hvm_paging_enabled(v) ) v->arch.guest_table = pagetable_null(); } unpaged_pagetable = pagetable_get_page(d->arch.paging.shadow.unpaged_pagetable); d->arch.paging.shadow.unpaged_pagetable = pagetable_null(); } /* We leave the "permanent" shadow modes enabled, but clear the * log-dirty mode bit. We don't want any more mark_dirty() * calls now that we've torn down the bitmap */ d->arch.paging.mode &= ~PG_log_dirty; if (d->arch.hvm_domain.dirty_vram) { xfree(d->arch.hvm_domain.dirty_vram->sl1ma); xfree(d->arch.hvm_domain.dirty_vram->dirty_bitmap); xfree(d->arch.hvm_domain.dirty_vram); d->arch.hvm_domain.dirty_vram = NULL; } paging_unlock(d); /* Must be called outside the lock */ if ( unpaged_pagetable ) shadow_free_p2m_page(d, unpaged_pagetable); } void shadow_final_teardown(struct domain *d) /* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */ { SHADOW_PRINTK("dom %u final teardown starts." " Shadow pages total = %u, free = %u, p2m=%u\n", d->domain_id, d->arch.paging.shadow.total_pages, d->arch.paging.shadow.free_pages, d->arch.paging.shadow.p2m_pages); /* Double-check that the domain didn't have any shadow memory. * It is possible for a domain that never got domain_kill()ed * to get here with its shadow allocation intact. 
*/ if ( d->arch.paging.shadow.total_pages != 0 ) shadow_teardown(d); /* It is now safe to pull down the p2m map. */ p2m_teardown(p2m_get_hostp2m(d)); /* Free any shadow memory that the p2m teardown released */ paging_lock(d); sh_set_allocation(d, 0, NULL); SHADOW_PRINTK("dom %u final teardown done." " Shadow pages total = %u, free = %u, p2m=%u\n", d->domain_id, d->arch.paging.shadow.total_pages, d->arch.paging.shadow.free_pages, d->arch.paging.shadow.p2m_pages); paging_unlock(d); } static int shadow_one_bit_enable(struct domain *d, u32 mode) /* Turn on a single shadow mode feature */ { ASSERT(paging_locked_by_me(d)); /* Sanity check the call */ if ( d == current->domain || (d->arch.paging.mode & mode) == mode ) { return -EINVAL; } mode |= PG_SH_enable; if ( d->arch.paging.shadow.total_pages == 0 ) { /* Init the shadow memory allocation if the user hasn't done so */ if ( sh_set_allocation(d, 1, NULL) != 0 ) { sh_set_allocation(d, 0, NULL); return -ENOMEM; } } /* Allow p2m and log-dirty code to borrow shadow memory */ d->arch.paging.alloc_page = shadow_alloc_p2m_page; d->arch.paging.free_page = shadow_free_p2m_page; if ( d->arch.paging.mode == 0 ) { /* Init the shadow hash table */ if ( shadow_hash_alloc(d) != 0 ) return -ENOMEM; } /* Update the bits */ sh_new_mode(d, d->arch.paging.mode | mode); return 0; } static int shadow_one_bit_disable(struct domain *d, u32 mode) /* Turn off a single shadow mode feature */ { struct vcpu *v; ASSERT(paging_locked_by_me(d)); /* Sanity check the call */ if ( d == current->domain || !((d->arch.paging.mode & mode) == mode) ) { return -EINVAL; } /* Update the bits */ sh_new_mode(d, d->arch.paging.mode & ~mode); if ( d->arch.paging.mode == 0 ) { /* Get this domain off shadows */ SHADOW_PRINTK("un-shadowing of domain %u starts." " Shadow pages total = %u, free = %u, p2m=%u\n", d->domain_id, d->arch.paging.shadow.total_pages, d->arch.paging.shadow.free_pages, d->arch.paging.shadow.p2m_pages); for_each_vcpu(d, v) { if ( v->arch.paging.mode ) v->arch.paging.mode->shadow.detach_old_tables(v); if ( !(v->arch.flags & TF_kernel_mode) ) make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user)); else make_cr3(v, pagetable_get_pfn(v->arch.guest_table)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) { int i; mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; for ( i = 0; i < SHADOW_OOS_PAGES; i++ ) if ( mfn_valid(oos_snapshot[i]) ) { shadow_free(d, oos_snapshot[i]); oos_snapshot[i] = _mfn(INVALID_MFN); } } #endif /* OOS */ } /* Pull down the memory allocation */ if ( sh_set_allocation(d, 0, NULL) != 0 ) BUG(); /* In fact, we will have BUG()ed already */ shadow_hash_teardown(d); SHADOW_PRINTK("un-shadowing of domain %u done." " Shadow pages total = %u, free = %u, p2m=%u\n", d->domain_id, d->arch.paging.shadow.total_pages, d->arch.paging.shadow.free_pages, d->arch.paging.shadow.p2m_pages); } return 0; } /* Enable/disable ops for the "test" and "log-dirty" modes */ static int shadow_test_enable(struct domain *d) { int ret; domain_pause(d); paging_lock(d); ret = shadow_one_bit_enable(d, PG_SH_enable); paging_unlock(d); domain_unpause(d); return ret; } static int shadow_test_disable(struct domain *d) { int ret; domain_pause(d); paging_lock(d); ret = shadow_one_bit_disable(d, PG_SH_enable); paging_unlock(d); domain_unpause(d); return ret; } /**************************************************************************/ /* P2M map manipulations */ /* shadow specific code which should be called when P2M table entry is updated * with new content. 
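 * (See the level == 2 case below for how a replaced superpage mapping is
 * handled: each frame it covered is unshadowed unless the new entry
 * still maps it at the same place.)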
It is responsible for update the entry, as well as other * shadow processing jobs. */ static void sh_unshadow_for_p2m_change(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level) { struct domain *d = v->domain; /* The following assertion is to make sure we don't step on 1GB host * page support of HVM guest. */ ASSERT(!(level > 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) && (l1e_get_flags(*p) & _PAGE_PSE))); /* If we're removing an MFN from the p2m, remove it from the shadows too */ if ( level == 1 ) { mfn_t mfn = _mfn(l1e_get_pfn(*p)); p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p)); if ( (p2m_is_valid(p2mt) || p2m_is_grant(p2mt)) && mfn_valid(mfn) ) { sh_remove_all_shadows_and_parents(v, mfn); if ( sh_remove_all_mappings(v, mfn) ) flush_tlb_mask(d->domain_dirty_cpumask); } } /* If we're removing a superpage mapping from the p2m, we need to check * all the pages covered by it. If they're still there in the new * scheme, that's OK, but otherwise they must be unshadowed. */ if ( level == 2 && (l1e_get_flags(*p) & _PAGE_PRESENT) && (l1e_get_flags(*p) & _PAGE_PSE) ) { unsigned int i; cpumask_t flushmask; mfn_t omfn = _mfn(l1e_get_pfn(*p)); mfn_t nmfn = _mfn(l1e_get_pfn(new)); l1_pgentry_t *npte = NULL; p2m_type_t p2mt = p2m_flags_to_type(l1e_get_flags(*p)); if ( p2m_is_valid(p2mt) && mfn_valid(omfn) ) { cpumask_clear(&flushmask); /* If we're replacing a superpage with a normal L1 page, map it */ if ( (l1e_get_flags(new) & _PAGE_PRESENT) && !(l1e_get_flags(new) & _PAGE_PSE) && mfn_valid(nmfn) ) npte = map_domain_page(mfn_x(nmfn)); for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { if ( !npte || !p2m_is_ram(p2m_flags_to_type(l1e_get_flags(npte[i]))) || l1e_get_pfn(npte[i]) != mfn_x(omfn) ) { /* This GFN->MFN mapping has gone away */ sh_remove_all_shadows_and_parents(v, omfn); if ( sh_remove_all_mappings(v, omfn) ) cpumask_or(&flushmask, &flushmask, d->domain_dirty_cpumask); } omfn = _mfn(mfn_x(omfn) + 1); } flush_tlb_mask(&flushmask); if ( npte ) unmap_domain_page(npte); } } } void shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level) { struct domain *d = v->domain; paging_lock(d); /* If there are any shadows, update them. But if shadow_teardown() * has already been called then it's not safe to try. */ if ( likely(d->arch.paging.shadow.total_pages != 0) ) sh_unshadow_for_p2m_change(v, gfn, p, table_mfn, new, level); /* Update the entry with new content */ safe_write_pte(p, new); #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) /* If we're doing FAST_FAULT_PATH, then shadow mode may have cached the fact that this is an mmio region in the shadow page tables. Blow the tables away to remove the cache. This is pretty heavy handed, but this is a rare operation (it might happen a dozen times during boot and then never again), so it doesn't matter too much. */ if ( d->arch.paging.shadow.has_fast_mmio_entries ) { shadow_blow_tables(d); d->arch.paging.shadow.has_fast_mmio_entries = 0; } #endif paging_unlock(d); } /**************************************************************************/ /* Log-dirty mode support */ /* Shadow specific code which is called in paging_log_dirty_enable(). * Return 0 if no problem found. 
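 * If the domain already has shadows they are all blown away first, so
 * that every guest mapping is re-created write-protected and subsequent
 * writes can be caught for dirty tracking.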
*/ int shadow_enable_log_dirty(struct domain *d, bool_t log_global) { int ret; paging_lock(d); if ( shadow_mode_enabled(d) ) { /* This domain already has some shadows: need to clear them out * of the way to make sure that all references to guest memory are * properly write-protected */ shadow_blow_tables(d); } #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) /* 32bit PV guests on 64bit xen behave like older 64bit linux: they * change an l4e instead of cr3 to switch tables. Give them the * same optimization */ if ( is_pv_32on64_domain(d) ) d->arch.paging.shadow.opt_flags = SHOPT_LINUX_L3_TOPLEVEL; #endif ret = shadow_one_bit_enable(d, PG_log_dirty); paging_unlock(d); return ret; } /* shadow specfic code which is called in paging_log_dirty_disable() */ int shadow_disable_log_dirty(struct domain *d) { int ret; paging_lock(d); ret = shadow_one_bit_disable(d, PG_log_dirty); paging_unlock(d); return ret; } /* This function is called when we CLEAN log dirty bitmap. See * paging_log_dirty_op() for details. */ void shadow_clean_dirty_bitmap(struct domain *d) { paging_lock(d); /* Need to revoke write access to the domain's pages again. * In future, we'll have a less heavy-handed approach to this, * but for now, we just unshadow everything except Xen. */ shadow_blow_tables(d); paging_unlock(d); } /**************************************************************************/ /* VRAM dirty tracking support */ int shadow_track_dirty_vram(struct domain *d, unsigned long begin_pfn, unsigned long nr, XEN_GUEST_HANDLE_64(uint8) dirty_bitmap) { int rc; unsigned long end_pfn = begin_pfn + nr; unsigned long dirty_size = (nr + 7) / 8; int flush_tlb = 0; unsigned long i; p2m_type_t t; struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram; struct p2m_domain *p2m = p2m_get_hostp2m(d); if (end_pfn < begin_pfn || begin_pfn > p2m->max_mapped_pfn || end_pfn >= p2m->max_mapped_pfn) return -EINVAL; /* We perform p2m lookups, so lock the p2m upfront to avoid deadlock */ p2m_lock(p2m_get_hostp2m(d)); paging_lock(d); if ( dirty_vram && (!nr || ( begin_pfn != dirty_vram->begin_pfn || end_pfn != dirty_vram->end_pfn )) ) { /* Different tracking, tear the previous down. */ gdprintk(XENLOG_INFO, "stopping tracking VRAM %lx - %lx\n", dirty_vram->begin_pfn, dirty_vram->end_pfn); xfree(dirty_vram->sl1ma); xfree(dirty_vram->dirty_bitmap); xfree(dirty_vram); dirty_vram = d->arch.hvm_domain.dirty_vram = NULL; } if ( !nr ) { rc = 0; goto out; } /* This should happen seldomly (Video mode change), * no need to be careful. */ if ( !dirty_vram ) { /* Throw away all the shadows rather than walking through them * up to nr times getting rid of mappings of each pfn */ shadow_blow_tables(d); gdprintk(XENLOG_INFO, "tracking VRAM %lx - %lx\n", begin_pfn, end_pfn); rc = -ENOMEM; if ( (dirty_vram = xmalloc(struct sh_dirty_vram)) == NULL ) goto out; dirty_vram->begin_pfn = begin_pfn; dirty_vram->end_pfn = end_pfn; d->arch.hvm_domain.dirty_vram = dirty_vram; if ( (dirty_vram->sl1ma = xmalloc_array(paddr_t, nr)) == NULL ) goto out_dirty_vram; memset(dirty_vram->sl1ma, ~0, sizeof(paddr_t) * nr); if ( (dirty_vram->dirty_bitmap = xzalloc_array(uint8_t, dirty_size)) == NULL ) goto out_sl1ma; dirty_vram->last_dirty = NOW(); /* Tell the caller that this time we could not track dirty bits. 
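 * (Hence -ENODATA rather than success: the tracking structures were only
 * set up by this call, so no dirty information exists yet for the range.)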
*/ rc = -ENODATA; } else if (dirty_vram->last_dirty == -1) { /* still completely clean, just copy our empty bitmap */ rc = -EFAULT; if ( copy_to_guest(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size) == 0 ) rc = 0; } else { unsigned long map_mfn = INVALID_MFN; void *map_sl1p = NULL; /* Iterate over VRAM to track dirty bits. */ for ( i = 0; i < nr; i++ ) { mfn_t mfn = get_gfn_query_unlocked(d, begin_pfn + i, &t); struct page_info *page; int dirty = 0; paddr_t sl1ma = dirty_vram->sl1ma[i]; if (mfn_x(mfn) == INVALID_MFN) { dirty = 1; } else { page = mfn_to_page(mfn); switch (page->u.inuse.type_info & PGT_count_mask) { case 0: /* No guest reference, nothing to track. */ break; case 1: /* One guest reference. */ if ( sl1ma == INVALID_PADDR ) { /* We don't know which sl1e points to this, too bad. */ dirty = 1; /* TODO: Heuristics for finding the single mapping of * this gmfn */ flush_tlb |= sh_remove_all_mappings(d->vcpu[0], mfn); } else { /* Hopefully the most common case: only one mapping, * whose dirty bit we can use. */ l1_pgentry_t *sl1e; unsigned long sl1mfn = paddr_to_pfn(sl1ma); if ( sl1mfn != map_mfn ) { if ( map_sl1p ) sh_unmap_domain_page(map_sl1p); map_sl1p = sh_map_domain_page(_mfn(sl1mfn)); map_mfn = sl1mfn; } sl1e = map_sl1p + (sl1ma & ~PAGE_MASK); if ( l1e_get_flags(*sl1e) & _PAGE_DIRTY ) { dirty = 1; /* Note: this is atomic, so we may clear a * _PAGE_ACCESSED set by another processor. */ l1e_remove_flags(*sl1e, _PAGE_DIRTY); flush_tlb = 1; } } break; default: /* More than one guest reference, * we don't afford tracking that. */ dirty = 1; break; } } if ( dirty ) { dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8); dirty_vram->last_dirty = NOW(); } } if ( map_sl1p ) sh_unmap_domain_page(map_sl1p); rc = -EFAULT; if ( copy_to_guest(dirty_bitmap, dirty_vram->dirty_bitmap, dirty_size) == 0 ) { memset(dirty_vram->dirty_bitmap, 0, dirty_size); if (dirty_vram->last_dirty + SECONDS(2) < NOW()) { /* was clean for more than two seconds, try to disable guest * write access */ for ( i = begin_pfn; i < end_pfn; i++ ) { mfn_t mfn = get_gfn_query_unlocked(d, i, &t); if (mfn_x(mfn) != INVALID_MFN) flush_tlb |= sh_remove_write_access(d->vcpu[0], mfn, 1, 0); } dirty_vram->last_dirty = -1; } rc = 0; } } if ( flush_tlb ) flush_tlb_mask(d->domain_dirty_cpumask); goto out; out_sl1ma: xfree(dirty_vram->sl1ma); out_dirty_vram: xfree(dirty_vram); dirty_vram = d->arch.hvm_domain.dirty_vram = NULL; out: paging_unlock(d); p2m_unlock(p2m_get_hostp2m(d)); return rc; } /**************************************************************************/ /* Shadow-control XEN_DOMCTL dispatcher */ int shadow_domctl(struct domain *d, xen_domctl_shadow_op_t *sc, XEN_GUEST_HANDLE_PARAM(void) u_domctl) { int rc, preempted = 0; switch ( sc->op ) { case XEN_DOMCTL_SHADOW_OP_OFF: if ( d->arch.paging.mode == PG_SH_enable ) if ( (rc = shadow_test_disable(d)) != 0 ) return rc; return 0; case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST: return shadow_test_enable(d); case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE: return shadow_enable(d, PG_refcounts|PG_translate); case XEN_DOMCTL_SHADOW_OP_ENABLE: return shadow_enable(d, sc->mode << PG_mode_shift); case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: sc->mb = shadow_get_allocation(d); return 0; case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: paging_lock(d); if ( sc->mb == 0 && shadow_mode_enabled(d) ) { /* Can't set the allocation to zero unless the domain stops using * shadow pagetables first */ SHADOW_ERROR("Can't set shadow allocation to zero, domain %u" " is still using shadows.\n", d->domain_id); 
paging_unlock(d); return -EINVAL; } rc = sh_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted); paging_unlock(d); if ( preempted ) /* Not finished. Set up to re-run the call. */ rc = hypercall_create_continuation( __HYPERVISOR_domctl, "h", u_domctl); else /* Finished. Return the new allocation */ sc->mb = shadow_get_allocation(d); return rc; default: SHADOW_ERROR("Bad shadow op %u\n", sc->op); return -EINVAL; } } /**************************************************************************/ /* Auditing shadow tables */ #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL void shadow_audit_tables(struct vcpu *v) { /* Dispatch table for getting per-type functions */ static const hash_callback_t callbacks[SH_type_unused] = { NULL, /* none */ SHADOW_INTERNAL_NAME(sh_audit_l1_table, 2), /* l1_32 */ SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 2), /* fl1_32 */ SHADOW_INTERNAL_NAME(sh_audit_l2_table, 2), /* l2_32 */ SHADOW_INTERNAL_NAME(sh_audit_l1_table, 3), /* l1_pae */ SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 3), /* fl1_pae */ SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2_pae */ SHADOW_INTERNAL_NAME(sh_audit_l2_table, 3), /* l2h_pae */ SHADOW_INTERNAL_NAME(sh_audit_l1_table, 4), /* l1_64 */ SHADOW_INTERNAL_NAME(sh_audit_fl1_table, 4), /* fl1_64 */ SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2_64 */ SHADOW_INTERNAL_NAME(sh_audit_l2_table, 4), /* l2h_64 */ SHADOW_INTERNAL_NAME(sh_audit_l3_table, 4), /* l3_64 */ SHADOW_INTERNAL_NAME(sh_audit_l4_table, 4), /* l4_64 */ NULL /* All the rest */ }; unsigned int mask; if ( !(SHADOW_AUDIT_ENABLE) ) return; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) sh_oos_audit(v->domain); #endif if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL ) mask = ~1; /* Audit every table in the system */ else { /* Audit only the current mode's tables */ switch ( v->arch.paging.mode->guest_levels ) { case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break; case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE |SHF_L2H_PAE); break; case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64 |SHF_L3_64|SHF_L4_64); break; default: BUG(); } } hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN)); } #endif /* Shadow audit */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/shadow/private.h0000664000175000017500000007075012307313555016331 0ustar smbsmb/****************************************************************************** * arch/x86/mm/shadow/private.h * * Shadow code that is private, and does not need to be multiply compiled. * Parts of this code are Copyright (c) 2006 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _XEN_SHADOW_PRIVATE_H #define _XEN_SHADOW_PRIVATE_H // In order to override the definition of mfn_to_page, we make sure page.h has // been included... #include #include #include #include #include #include "../mm-locks.h" /****************************************************************************** * Levels of self-test and paranoia */ #define SHADOW_AUDIT_HASH 0x01 /* Check current hash bucket */ #define SHADOW_AUDIT_HASH_FULL 0x02 /* Check every hash bucket */ #define SHADOW_AUDIT_ENTRIES 0x04 /* Check this walk's shadows */ #define SHADOW_AUDIT_ENTRIES_FULL 0x08 /* Check every shadow */ #define SHADOW_AUDIT_ENTRIES_MFNS 0x10 /* Check gfn-mfn map in shadows */ #ifdef NDEBUG #define SHADOW_AUDIT 0 #define SHADOW_AUDIT_ENABLE 0 #else #define SHADOW_AUDIT 0x15 /* Basic audit of all */ #define SHADOW_AUDIT_ENABLE shadow_audit_enable extern int shadow_audit_enable; #endif /****************************************************************************** * Levels of optimization */ #define SHOPT_WRITABLE_HEURISTIC 0x01 /* Guess at RW PTEs via linear maps */ #define SHOPT_EARLY_UNSHADOW 0x02 /* Unshadow l1s on fork or exit */ #define SHOPT_FAST_FAULT_PATH 0x04 /* Fast-path MMIO and not-present */ #define SHOPT_PREFETCH 0x08 /* Shadow multiple entries per fault */ #define SHOPT_LINUX_L3_TOPLEVEL 0x10 /* Pin l3es on early 64bit linux */ #define SHOPT_SKIP_VERIFY 0x20 /* Skip PTE v'fy when safe to do so */ #define SHOPT_VIRTUAL_TLB 0x40 /* Cache guest v->p translations */ #define SHOPT_FAST_EMULATION 0x80 /* Fast write emulation */ #define SHOPT_OUT_OF_SYNC 0x100 /* Allow guest writes to L1 PTs */ #define SHADOW_OPTIMIZATIONS 0x1ff /****************************************************************************** * Debug and error-message output */ #define SHADOW_PRINTK(_f, _a...) \ debugtrace_printk("sh: %s(): " _f, __func__, ##_a) #define SHADOW_ERROR(_f, _a...) \ printk("sh error: %s(): " _f, __func__, ##_a) #define SHADOW_DEBUG(flag, _f, _a...) 
\ do { \ if (SHADOW_DEBUG_ ## flag) \ debugtrace_printk("shdebug: %s(): " _f, __func__, ##_a); \ } while (0) // The flags for use with SHADOW_DEBUG: #define SHADOW_DEBUG_PROPAGATE 1 #define SHADOW_DEBUG_MAKE_SHADOW 1 #define SHADOW_DEBUG_DESTROY_SHADOW 1 #define SHADOW_DEBUG_A_AND_D 1 #define SHADOW_DEBUG_EMULATE 1 #define SHADOW_DEBUG_P2M 1 #define SHADOW_DEBUG_LOGDIRTY 0 /****************************************************************************** * Tracing */ DECLARE_PER_CPU(uint32_t,trace_shadow_path_flags); #define TRACE_SHADOW_PATH_FLAG(_x) \ do { \ this_cpu(trace_shadow_path_flags) |= (1<<(_x)); \ } while(0) #define TRACE_CLEAR_PATH_FLAGS \ this_cpu(trace_shadow_path_flags) = 0 enum { TRCE_SFLAG_SET_AD, TRCE_SFLAG_SET_A, TRCE_SFLAG_SHADOW_L1_GET_REF, TRCE_SFLAG_SHADOW_L1_PUT_REF, TRCE_SFLAG_L2_PROPAGATE, TRCE_SFLAG_SET_CHANGED, TRCE_SFLAG_SET_FLUSH, TRCE_SFLAG_SET_ERROR, TRCE_SFLAG_DEMOTE, TRCE_SFLAG_PROMOTE, TRCE_SFLAG_WRMAP, TRCE_SFLAG_WRMAP_GUESS_FOUND, TRCE_SFLAG_WRMAP_BRUTE_FORCE, TRCE_SFLAG_EARLY_UNSHADOW, TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN, TRCE_SFLAG_EMULATION_LAST_FAILED, TRCE_SFLAG_EMULATE_FULL_PT, TRCE_SFLAG_PREALLOC_UNHOOK, TRCE_SFLAG_UNSYNC, TRCE_SFLAG_OOS_FIXUP_ADD, TRCE_SFLAG_OOS_FIXUP_EVICT, }; /* Size (in bytes) of a guest PTE */ #if GUEST_PAGING_LEVELS >= 3 # define GUEST_PTE_SIZE 8 #else # define GUEST_PTE_SIZE 4 #endif /****************************************************************************** * Auditing routines */ #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL extern void shadow_audit_tables(struct vcpu *v); #else #define shadow_audit_tables(_v) do {} while(0) #endif /****************************************************************************** * Macro for dealing with the naming of the internal names of the * shadow code's external entry points. */ #define SHADOW_INTERNAL_NAME_HIDDEN(name, guest_levels) \ name ## __guest_ ## guest_levels #define SHADOW_INTERNAL_NAME(name, guest_levels) \ SHADOW_INTERNAL_NAME_HIDDEN(name, guest_levels) #define GUEST_LEVELS 2 #include "multi.h" #undef GUEST_LEVELS #define GUEST_LEVELS 3 #include "multi.h" #undef GUEST_LEVELS #define GUEST_LEVELS 4 #include "multi.h" #undef GUEST_LEVELS /* Shadow type codes */ #define SH_type_none (0U) /* on the shadow free list */ #define SH_type_min_shadow (1U) #define SH_type_l1_32_shadow (1U) /* shadowing a 32-bit L1 guest page */ #define SH_type_fl1_32_shadow (2U) /* L1 shadow for a 32b 4M superpage */ #define SH_type_l2_32_shadow (3U) /* shadowing a 32-bit L2 guest page */ #define SH_type_l1_pae_shadow (4U) /* shadowing a pae L1 page */ #define SH_type_fl1_pae_shadow (5U) /* L1 shadow for pae 2M superpg */ #define SH_type_l2_pae_shadow (6U) /* shadowing a pae L2-low page */ #define SH_type_l2h_pae_shadow (7U) /* shadowing a pae L2-high page */ #define SH_type_l1_64_shadow (8U) /* shadowing a 64-bit L1 page */ #define SH_type_fl1_64_shadow (9U) /* L1 shadow for 64-bit 2M superpg */ #define SH_type_l2_64_shadow (10U) /* shadowing a 64-bit L2 page */ #define SH_type_l2h_64_shadow (11U) /* shadowing a compat PAE L2 high page */ #define SH_type_l3_64_shadow (12U) /* shadowing a 64-bit L3 page */ #define SH_type_l4_64_shadow (13U) /* shadowing a 64-bit L4 page */ #define SH_type_max_shadow (13U) #define SH_type_p2m_table (14U) /* in use as the p2m table */ #define SH_type_monitor_table (15U) /* in use as a monitor table */ #define SH_type_oos_snapshot (16U) /* in use as OOS snapshot */ #define SH_type_unused (17U) /* * What counts as a pinnable shadow? 
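 * Roughly: a pinnable shadow is a top-level shadow that we keep alive with an
 * extra reference and a place on the per-domain pinned-shadows list, so it
 * survives even while no vcpu has it loaded in CR3 (see sh_pin()/sh_unpin()
 * later in this file).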
*/ static inline int sh_type_is_pinnable(struct vcpu *v, unsigned int t) { /* Top-level shadow types in each mode can be pinned, so that they * persist even when not currently in use in a guest CR3 */ if ( t == SH_type_l2_32_shadow || t == SH_type_l2_pae_shadow || t == SH_type_l2h_pae_shadow || t == SH_type_l4_64_shadow ) return 1; #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) /* Early 64-bit linux used three levels of pagetables for the guest * and context switched by changing one l4 entry in a per-cpu l4 * page. When we're shadowing those kernels, we have to pin l3 * shadows so they don't just evaporate on every context switch. * For all other guests, we'd rather use the up-pointer field in l3s. */ if ( unlikely((v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) && t == SH_type_l3_64_shadow) ) return 1; #endif /* Everything else is not pinnable, and can use the "up" pointer */ return 0; } static inline int sh_type_has_up_pointer(struct vcpu *v, unsigned int t) { /* Multi-page shadows don't have up-pointers */ if ( t == SH_type_l1_32_shadow || t == SH_type_fl1_32_shadow || t == SH_type_l2_32_shadow ) return 0; /* Pinnable shadows don't have up-pointers either */ return !sh_type_is_pinnable(v, t); } /* * Definitions for the shadow_flags field in page_info. * These flags are stored on *guest* pages... * Bits 1-13 are encodings for the shadow types. */ #define SHF_page_type_mask \ (((1u << (SH_type_max_shadow + 1u)) - 1u) - \ ((1u << SH_type_min_shadow) - 1u)) #define SHF_L1_32 (1u << SH_type_l1_32_shadow) #define SHF_FL1_32 (1u << SH_type_fl1_32_shadow) #define SHF_L2_32 (1u << SH_type_l2_32_shadow) #define SHF_L1_PAE (1u << SH_type_l1_pae_shadow) #define SHF_FL1_PAE (1u << SH_type_fl1_pae_shadow) #define SHF_L2_PAE (1u << SH_type_l2_pae_shadow) #define SHF_L2H_PAE (1u << SH_type_l2h_pae_shadow) #define SHF_L1_64 (1u << SH_type_l1_64_shadow) #define SHF_FL1_64 (1u << SH_type_fl1_64_shadow) #define SHF_L2_64 (1u << SH_type_l2_64_shadow) #define SHF_L2H_64 (1u << SH_type_l2h_64_shadow) #define SHF_L3_64 (1u << SH_type_l3_64_shadow) #define SHF_L4_64 (1u << SH_type_l4_64_shadow) #define SHF_32 (SHF_L1_32|SHF_FL1_32|SHF_L2_32) #define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE) #define SHF_64 (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64) #define SHF_L1_ANY (SHF_L1_32|SHF_L1_PAE|SHF_L1_64) #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Marks a guest L1 page table which is shadowed but not write-protected. * If set, then *only* L1 shadows (SHF_L1_*) are allowed. * * out_of_sync indicates that the shadow tables may not reflect the * guest tables. If it is clear, then the shadow tables *must* reflect * the guest tables. * * oos_may_write indicates that a page may have writable mappings. * * Most of the time the flags are synonymous. There is a short period of time * during resync that oos_may_write is clear but out_of_sync is not. If a * codepath is called during that time and is sensitive to oos issues, it may * need to use the second flag. */ #define SHF_out_of_sync (1u<<30) #define SHF_oos_may_write (1u<<29) #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ #define SHF_pagetable_dying (1u<<31) static inline int sh_page_has_multiple_shadows(struct page_info *pg) { u32 shadows; if ( !(pg->count_info & PGC_page_table) ) return 0; shadows = pg->shadow_flags & SHF_page_type_mask; /* More than one type bit set in shadow-flags? 
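 * (i.e. clearing the lowest set type bit still leaves something behind).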
*/ return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 ); } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* The caller must verify this is reasonable to call; i.e., valid mfn, * domain is translated, &c */ static inline int page_is_out_of_sync(struct page_info *p) { return (p->count_info & PGC_page_table) && (p->shadow_flags & SHF_out_of_sync); } static inline int mfn_is_out_of_sync(mfn_t gmfn) { return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn))); } static inline int page_oos_may_write(struct page_info *p) { return (p->count_info & PGC_page_table) && (p->shadow_flags & SHF_oos_may_write); } static inline int mfn_oos_may_write(mfn_t gmfn) { return page_oos_may_write(mfn_to_page(mfn_x(gmfn))); } #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ /****************************************************************************** * Various function declarations */ /* Hash table functions */ mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, unsigned int t); void shadow_hash_insert(struct vcpu *v, unsigned long n, unsigned int t, mfn_t smfn); void shadow_hash_delete(struct vcpu *v, unsigned long n, unsigned int t, mfn_t smfn); /* shadow promotion */ void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type); void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type); /* Shadow page allocation functions */ void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count); mfn_t shadow_alloc(struct domain *d, u32 shadow_type, unsigned long backpointer); void shadow_free(struct domain *d, mfn_t smfn); /* Install the xen mappings in various flavours of shadow */ void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn); /* Update the shadows in response to a pagetable write from Xen */ int sh_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size); /* Update the shadows in response to a pagetable write from a HVM guest */ void sh_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, void *entry, u32 size); /* Remove all writeable mappings of a guest frame from the shadows. * Returns non-zero if we need to flush TLBs. * level and fault_addr desribe how we found this to be a pagetable; * level==0 means we have some other reason for revoking write access. */ extern int sh_remove_write_access(struct vcpu *v, mfn_t readonly_mfn, unsigned int level, unsigned long fault_addr); /* Functions that atomically write PT/P2M entries and update state */ void shadow_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level); int shadow_write_guest_entry(struct vcpu *v, intpte_t *p, intpte_t new, mfn_t gmfn); int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p, intpte_t *old, intpte_t new, mfn_t gmfn); /* Unhook the non-Xen mappings in this top-level shadow mfn. * With user_only == 1, unhooks only the user-mode mappings. */ void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn, int user_only); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Allow a shadowed page to go out of sync */ int sh_unsync(struct vcpu *v, mfn_t gmfn); /* Pull an out-of-sync page back into sync. */ void sh_resync(struct vcpu *v, mfn_t gmfn); void oos_fixup_add(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off); int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long offset); /* Pull all out-of-sync shadows back into sync. If skip != 0, we try * to avoid resyncing where we think we can get away with it. 
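 * The 'this' and 'others' arguments select whether the current vcpu's and/or
 * the other vcpus' out-of-sync pages are resynced; the shadow_resync_*
 * wrappers below cover the common combinations.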
*/ void sh_resync_all(struct vcpu *v, int skip, int this, int others); static inline void shadow_resync_all(struct vcpu *v) { sh_resync_all(v, 0 /* skip */, 1 /* this */, 1 /* others */); } static inline void shadow_resync_current_vcpu(struct vcpu *v) { sh_resync_all(v, 0 /* skip */, 1 /* this */, 0 /* others */); } static inline void shadow_sync_other_vcpus(struct vcpu *v) { sh_resync_all(v, 1 /* skip */, 0 /* this */, 1 /* others */); } void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn); mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn); #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ /* Reset the up-pointers of every L3 shadow to 0. * This is called when l3 shadows stop being pinnable, to clear out all * the list-head bits so the up-pointer field is properly inititalised. */ void sh_reset_l3_up_pointers(struct vcpu *v); /****************************************************************************** * Flags used in the return value of the shadow_set_lXe() functions... */ /* We actually wrote something new to the shadow */ #define SHADOW_SET_CHANGED 0x1 /* Caller should flush TLBs to clear the old entry */ #define SHADOW_SET_FLUSH 0x2 /* Something went wrong: the shadow entry was invalid or refcount failed */ #define SHADOW_SET_ERROR 0x4 /****************************************************************************** * MFN/page-info handling */ /* Override macros from asm/page.h to make them work with mfn_t */ #undef mfn_to_page #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m)) #undef mfn_valid #define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) #undef page_to_mfn #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) /* Override pagetable_t <-> struct page_info conversions to work with mfn_t */ #undef pagetable_get_page #define pagetable_get_page(x) mfn_to_page(pagetable_get_mfn(x)) #undef pagetable_from_page #define pagetable_from_page(pg) pagetable_from_mfn(page_to_mfn(pg)) #define backpointer(sp) _mfn(pdx_to_pfn((unsigned long)(sp)->v.sh.back)) static inline unsigned long __backpointer(const struct page_info *sp) { switch (sp->u.sh.type) { case SH_type_fl1_32_shadow: case SH_type_fl1_pae_shadow: case SH_type_fl1_64_shadow: return sp->v.sh.back; } return pdx_to_pfn(sp->v.sh.back); } static inline int sh_mfn_is_a_page_table(mfn_t gmfn) { struct page_info *page = mfn_to_page(gmfn); struct domain *owner; unsigned long type_info; if ( !mfn_valid(gmfn) ) return 0; owner = page_get_owner(page); if ( owner && shadow_mode_refcounts(owner) && (page->count_info & PGC_page_table) ) return 1; type_info = page->u.inuse.type_info & PGT_type_mask; return type_info && (type_info <= PGT_l4_page_table); } // Provide mfn_t-aware versions of common xen functions static inline void * sh_map_domain_page(mfn_t mfn) { return map_domain_page(mfn_x(mfn)); } static inline void sh_unmap_domain_page(void *p) { unmap_domain_page(p); } static inline void * sh_map_domain_page_global(mfn_t mfn) { return map_domain_page_global(mfn_x(mfn)); } static inline void sh_unmap_domain_page_global(void *p) { unmap_domain_page_global(p); } /**************************************************************************/ /* Shadow-page refcounting. */ void sh_destroy_shadow(struct vcpu *v, mfn_t smfn); /* Increase the refcount of a shadow page. Arguments are the mfn to refcount, * and the physical address of the shadow entry that holds the ref (or zero * if the ref is held by something else). * Returns 0 for failure, 1 for success. 
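 * Failure means the refcount would overflow (it is limited to 26 bits);
 * callers generally have to treat that as fatal for the domain (see the
 * shadow_set_l*e() callers in multi.c).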
*/ static inline int sh_get_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa) { u32 x, nx; struct page_info *sp = mfn_to_page(smfn); ASSERT(mfn_valid(smfn)); ASSERT(sp->u.sh.head); x = sp->u.sh.count; nx = x + 1; if ( unlikely(nx >= 1U<<26) ) { SHADOW_PRINTK("shadow ref overflow, gmfn=%lx smfn=%lx\n", __backpointer(sp), mfn_x(smfn)); return 0; } /* Guarded by the paging lock, so no need for atomic update */ sp->u.sh.count = nx; /* We remember the first shadow entry that points to each shadow. */ if ( entry_pa != 0 && sh_type_has_up_pointer(v, sp->u.sh.type) && sp->up == 0 ) sp->up = entry_pa; return 1; } /* Decrease the refcount of a shadow page. As for get_ref, takes the * physical address of the shadow entry that held this reference. */ static inline void sh_put_ref(struct vcpu *v, mfn_t smfn, paddr_t entry_pa) { u32 x, nx; struct page_info *sp = mfn_to_page(smfn); ASSERT(mfn_valid(smfn)); ASSERT(sp->u.sh.head); ASSERT(!(sp->count_info & PGC_count_mask)); /* If this is the entry in the up-pointer, remove it */ if ( entry_pa != 0 && sh_type_has_up_pointer(v, sp->u.sh.type) && sp->up == entry_pa ) sp->up = 0; x = sp->u.sh.count; nx = x - 1; if ( unlikely(x == 0) ) { SHADOW_ERROR("shadow ref underflow, smfn=%lx oc=%08x t=%#x\n", mfn_x(smfn), sp->u.sh.count, sp->u.sh.type); BUG(); } /* Guarded by the paging lock, so no need for atomic update */ sp->u.sh.count = nx; if ( unlikely(nx == 0) ) sh_destroy_shadow(v, smfn); } /* Walk the list of pinned shadows, from the tail forwards, * skipping the non-head-page entries */ static inline struct page_info * prev_pinned_shadow(const struct page_info *page, const struct domain *d) { struct page_info *p; if ( page == d->arch.paging.shadow.pinned_shadows.next ) return NULL; if ( page == NULL ) /* If no current place, start at the tail */ p = d->arch.paging.shadow.pinned_shadows.tail; else p = pdx_to_page(page->list.prev); /* Skip over the non-tail parts of multi-page shadows */ if ( p && p->u.sh.type == SH_type_l2_32_shadow ) { p = pdx_to_page(p->list.prev); ASSERT(p && p->u.sh.type == SH_type_l2_32_shadow); p = pdx_to_page(p->list.prev); ASSERT(p && p->u.sh.type == SH_type_l2_32_shadow); p = pdx_to_page(p->list.prev); ASSERT(p && p->u.sh.type == SH_type_l2_32_shadow); } ASSERT(!p || p->u.sh.head); return p; } #define foreach_pinned_shadow(dom, pos, tmp) \ for ( pos = prev_pinned_shadow(NULL, (dom)); \ pos ? (tmp = prev_pinned_shadow(pos, (dom)), 1) : 0; \ pos = tmp ) /* Pin a shadow page: take an extra refcount, set the pin bit, * and put the shadow at the head of the list of pinned shadows. * Returns 0 for failure, 1 for success. 
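 * The only failure case is sh_get_ref() refusing the extra reference because
 * the shadow's refcount would overflow.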
*/ static inline int sh_pin(struct vcpu *v, mfn_t smfn) { struct page_info *sp; struct page_list_head h, *pin_list; ASSERT(mfn_valid(smfn)); sp = mfn_to_page(smfn); ASSERT(sh_type_is_pinnable(v, sp->u.sh.type)); ASSERT(sp->u.sh.head); /* Treat the up-to-four pages of the shadow as a unit in the list ops */ h.next = h.tail = sp; if ( sp->u.sh.type == SH_type_l2_32_shadow ) { h.tail = pdx_to_page(h.tail->list.next); h.tail = pdx_to_page(h.tail->list.next); h.tail = pdx_to_page(h.tail->list.next); ASSERT(h.tail->u.sh.type == SH_type_l2_32_shadow); } pin_list = &v->domain->arch.paging.shadow.pinned_shadows; if ( sp->u.sh.pinned ) { /* Already pinned: take it out of the pinned-list so it can go * at the front */ if ( pin_list->next == h.next ) return 1; page_list_prev(h.next, pin_list)->list.next = h.tail->list.next; if ( pin_list->tail == h.tail ) pin_list->tail = page_list_prev(h.next, pin_list); else page_list_next(h.tail, pin_list)->list.prev = h.next->list.prev; h.tail->list.next = h.next->list.prev = PAGE_LIST_NULL; } else { /* Not pinned: pin it! */ if ( !sh_get_ref(v, smfn, 0) ) return 0; sp->u.sh.pinned = 1; ASSERT(h.next->list.prev == PAGE_LIST_NULL); ASSERT(h.tail->list.next == PAGE_LIST_NULL); } /* Put it at the head of the list of pinned shadows */ page_list_splice(&h, pin_list); return 1; } /* Unpin a shadow page: unset the pin bit, take the shadow off the list * of pinned shadows, and release the extra ref. */ static inline void sh_unpin(struct vcpu *v, mfn_t smfn) { struct page_list_head h, *pin_list; struct page_info *sp; ASSERT(mfn_valid(smfn)); sp = mfn_to_page(smfn); ASSERT(sh_type_is_pinnable(v, sp->u.sh.type)); ASSERT(sp->u.sh.head); /* Treat the up-to-four pages of the shadow as a unit in the list ops */ h.next = h.tail = sp; if ( sp->u.sh.type == SH_type_l2_32_shadow ) { h.tail = pdx_to_page(h.tail->list.next); h.tail = pdx_to_page(h.tail->list.next); h.tail = pdx_to_page(h.tail->list.next); ASSERT(h.tail->u.sh.type == SH_type_l2_32_shadow); } pin_list = &v->domain->arch.paging.shadow.pinned_shadows; if ( !sp->u.sh.pinned ) return; sp->u.sh.pinned = 0; /* Cut the sub-list out of the list of pinned shadows */ if ( pin_list->next == h.next && pin_list->tail == h.tail ) pin_list->next = pin_list->tail = NULL; else { if ( pin_list->next == h.next ) pin_list->next = page_list_next(h.tail, pin_list); else page_list_prev(h.next, pin_list)->list.next = h.tail->list.next; if ( pin_list->tail == h.tail ) pin_list->tail = page_list_prev(h.next, pin_list); else page_list_next(h.tail, pin_list)->list.prev = h.next->list.prev; } h.tail->list.next = h.next->list.prev = PAGE_LIST_NULL; sh_put_ref(v, smfn, 0); } /**************************************************************************/ /* PTE-write emulation. */ struct sh_emulate_ctxt { struct x86_emulate_ctxt ctxt; /* Cache of up to 31 bytes of instruction. */ uint8_t insn_buf[31]; uint8_t insn_buf_bytes; unsigned long insn_buf_eip; /* Cache of segment registers already gathered for this emulation. */ unsigned int valid_seg_regs; struct segment_register seg_reg[6]; /* MFNs being written to in write/cmpxchg callbacks */ mfn_t mfn1, mfn2; #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY) /* Special case for avoiding having to verify writes: remember * whether the old value had its low bit (_PAGE_PRESENT) clear. 
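 * (Roughly: a write over a not-present entry cannot invalidate an existing
 * shadow mapping, so the usual re-verification can safely be skipped.)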
*/ int low_bit_was_clear:1; #endif }; const struct x86_emulate_ops *shadow_init_emulation( struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs); void shadow_continue_emulation( struct sh_emulate_ctxt *sh_ctxt, struct cpu_user_regs *regs); struct segment_register *hvm_get_seg_reg( enum x86_segment seg, struct sh_emulate_ctxt *sh_ctxt); #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /**************************************************************************/ /* Virtual TLB entries * * We keep a cache of virtual-to-physical translations that we have seen * since the last TLB flush. This is safe to use for frame translations, * but callers need to re-check the actual guest tables if the lookup fails. * * Lookups and updates are protected by a per-vTLB (and hence per-vcpu) * lock. This lock is held *only* while reading or writing the table, * so it is safe to take in any non-interrupt context. Most lookups * happen with v==current, so we expect contention to be low. */ #define VTLB_ENTRIES 13 struct shadow_vtlb { unsigned long page_number; /* Guest virtual address >> PAGE_SHIFT */ unsigned long frame_number; /* Guest physical address >> PAGE_SHIFT */ uint32_t pfec; /* PF error code of the lookup that filled this * entry. A pfec of zero means the slot is empty * (since that would require us to re-try anyway) */ }; /* Call whenever the guest flushes hit actual TLB */ static inline void vtlb_flush(struct vcpu *v) { spin_lock(&v->arch.paging.vtlb_lock); memset(v->arch.paging.vtlb, 0, VTLB_ENTRIES * sizeof (struct shadow_vtlb)); spin_unlock(&v->arch.paging.vtlb_lock); } static inline int vtlb_hash(unsigned long page_number) { return page_number % VTLB_ENTRIES; } /* Put a translation into the vTLB, potentially clobbering an old one */ static inline void vtlb_insert(struct vcpu *v, unsigned long page, unsigned long frame, uint32_t pfec) { struct shadow_vtlb entry = { .page_number = page, .frame_number = frame, .pfec = pfec }; spin_lock(&v->arch.paging.vtlb_lock); v->arch.paging.vtlb[vtlb_hash(page)] = entry; spin_unlock(&v->arch.paging.vtlb_lock); } /* Look a translation up in the vTLB. Returns INVALID_GFN if not found. */ static inline unsigned long vtlb_lookup(struct vcpu *v, unsigned long va, uint32_t pfec) { unsigned long page_number = va >> PAGE_SHIFT; unsigned long frame_number = INVALID_GFN; int i = vtlb_hash(page_number); spin_lock(&v->arch.paging.vtlb_lock); if ( v->arch.paging.vtlb[i].pfec != 0 && v->arch.paging.vtlb[i].page_number == page_number /* Any successful walk that had at least these pfec bits is OK */ && (v->arch.paging.vtlb[i].pfec & pfec) == pfec ) { frame_number = v->arch.paging.vtlb[i].frame_number; } spin_unlock(&v->arch.paging.vtlb_lock); return frame_number; } #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ static inline int sh_check_page_has_no_refs(struct page_info *page) { unsigned long count = read_atomic(&page->count_info); return ( (count & PGC_count_mask) == ((count & PGC_allocated) ? 1 : 0) ); } #endif /* _XEN_SHADOW_PRIVATE_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/shadow/multi.c0000664000175000017500000054604612307313555016012 0ustar smbsmb/****************************************************************************** * arch/x86/mm/shadow/multi.c * * Simple, mostly-synchronous shadow page tables. * Parts of this code are Copyright (c) 2006 by XenSource Inc. 
* Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "private.h" #include "types.h" /* THINGS TO DO LATER: * * TEARDOWN HEURISTICS * Also: have a heuristic for when to destroy a previous paging-mode's * shadows. When a guest is done with its start-of-day 32-bit tables * and reuses the memory we want to drop those shadows. Start with * shadows in a page in two modes as a hint, but beware of clever tricks * like reusing a pagetable for both PAE and 64-bit during boot... * * PAE LINEAR MAPS * Rework shadow_get_l*e() to have the option of using map_domain_page() * instead of linear maps. Add appropriate unmap_l*e calls in the users. * Then we can test the speed difference made by linear maps. If the * map_domain_page() version is OK on PAE, we could maybe allow a lightweight * l3-and-l2h-only shadow mode for PAE PV guests that would allow them * to share l2h pages again. * * PSE disabled / PSE36 * We don't support any modes other than PSE enabled, PSE36 disabled. * Neither of those would be hard to change, but we'd need to be able to * deal with shadows made in one mode and used in another. */ #define FETCH_TYPE_PREFETCH 1 #define FETCH_TYPE_DEMAND 2 #define FETCH_TYPE_WRITE 4 typedef enum { ft_prefetch = FETCH_TYPE_PREFETCH, ft_demand_read = FETCH_TYPE_DEMAND, ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE, } fetch_type_t; #ifdef DEBUG_TRACE_DUMP static char *fetch_type_names[] = { [ft_prefetch] "prefetch", [ft_demand_read] "demand read", [ft_demand_write] "demand write", }; #endif /**************************************************************************/ /* Hash table mapping from guest pagetables to shadows * * Normal case: maps the mfn of a guest page to the mfn of its shadow page. * FL1's: maps the *gfn* of the start of a superpage to the mfn of a * shadow L1 which maps its "splinters". 
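 * ("Splinters" here means the 4k entries the superpage mapping is broken
 * into, since shadows always map at 4k granularity.)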
*/ static inline mfn_t get_fl1_shadow_status(struct vcpu *v, gfn_t gfn) /* Look for FL1 shadows in the hash table */ { mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), SH_type_fl1_shadow); ASSERT(!mfn_valid(smfn) || mfn_to_page(smfn)->u.sh.head); return smfn; } static inline mfn_t get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type) /* Look for shadows in the hash table */ { mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), shadow_type); ASSERT(!mfn_valid(smfn) || mfn_to_page(smfn)->u.sh.head); perfc_incr(shadow_get_shadow_status); return smfn; } static inline void set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn) /* Put an FL1 shadow into the hash table */ { SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n", gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn)); ASSERT(mfn_to_page(smfn)->u.sh.head); shadow_hash_insert(v, gfn_x(gfn), SH_type_fl1_shadow, smfn); } static inline void set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) /* Put a shadow into the hash table */ { struct domain *d = v->domain; int res; SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n", d->domain_id, v->vcpu_id, mfn_x(gmfn), shadow_type, mfn_x(smfn)); ASSERT(mfn_to_page(smfn)->u.sh.head); /* 32-on-64 PV guests don't own their l4 pages so can't get_page them */ if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow ) { res = get_page(mfn_to_page(gmfn), d); ASSERT(res == 1); } shadow_hash_insert(v, mfn_x(gmfn), shadow_type, smfn); } static inline void delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn) /* Remove a shadow from the hash table */ { SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n", gfn_x(gfn), SH_type_fl1_shadow, mfn_x(smfn)); ASSERT(mfn_to_page(smfn)->u.sh.head); shadow_hash_delete(v, gfn_x(gfn), SH_type_fl1_shadow, smfn); } static inline void delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) /* Remove a shadow from the hash table */ { SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n", v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), shadow_type, mfn_x(smfn)); ASSERT(mfn_to_page(smfn)->u.sh.head); shadow_hash_delete(v, mfn_x(gmfn), shadow_type, smfn); /* 32-on-64 PV guests don't own their l4 pages; see set_shadow_status */ if ( !is_pv_32on64_vcpu(v) || shadow_type != SH_type_l4_64_shadow ) put_page(mfn_to_page(gmfn)); } /**************************************************************************/ /* Functions for walking the guest page tables */ static inline uint32_t sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec) { return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec, #if GUEST_PAGING_LEVELS == 3 /* PAE */ _mfn(INVALID_MFN), v->arch.paging.shadow.gl3e #else /* 32 or 64 */ pagetable_get_mfn(v->arch.guest_table), v->arch.paging.shadow.guest_vtable #endif ); } /* This validation is called with lock held, and after write permission * removal. 
Then check is atomic and no more inconsistent content can * be observed before lock is released * * Return 1 to indicate success and 0 for inconsistency */ static inline uint32_t shadow_check_gwalk(struct vcpu *v, unsigned long va, walk_t *gw, int version) { struct domain *d = v->domain; guest_l1e_t *l1p; guest_l2e_t *l2p; #if GUEST_PAGING_LEVELS >= 4 guest_l3e_t *l3p; guest_l4e_t *l4p; #endif int mismatch = 0; ASSERT(paging_locked_by_me(d)); if ( version == atomic_read(&d->arch.paging.shadow.gtable_dirty_version) ) return 1; /* We may consider caching guest page mapping from last * guest table walk. However considering this check happens * relatively less-frequent, and a bit burden here to * remap guest page is better than caching mapping in each * guest table walk. * * Also when inconsistency occurs, simply return to trigger * another fault instead of re-validate new path to make * logic simple. */ perfc_incr(shadow_check_gwalk); #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ l4p = (guest_l4e_t *)v->arch.paging.shadow.guest_vtable; mismatch |= (gw->l4e.l4 != l4p[guest_l4_table_offset(va)].l4); l3p = sh_map_domain_page(gw->l3mfn); mismatch |= (gw->l3e.l3 != l3p[guest_l3_table_offset(va)].l3); sh_unmap_domain_page(l3p); #else mismatch |= (gw->l3e.l3 != v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)].l3); #endif l2p = sh_map_domain_page(gw->l2mfn); mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2); sh_unmap_domain_page(l2p); #else l2p = (guest_l2e_t *)v->arch.paging.shadow.guest_vtable; mismatch |= (gw->l2e.l2 != l2p[guest_l2_table_offset(va)].l2); #endif if ( !(guest_supports_superpages(v) && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) ) { l1p = sh_map_domain_page(gw->l1mfn); mismatch |= (gw->l1e.l1 != l1p[guest_l1_table_offset(va)].l1); sh_unmap_domain_page(l1p); } return !mismatch; } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) static int shadow_check_gl1e(struct vcpu *v, walk_t *gw) { guest_l1e_t *l1p, nl1e; if ( !mfn_valid(gw->l1mfn) ) return 0; /* Can't just pull-through because mfn may have changed */ l1p = map_domain_page(mfn_x(gw->l1mfn)); nl1e.l1 = l1p[guest_l1_table_offset(gw->va)].l1; unmap_domain_page(l1p); return gw->l1e.l1 != nl1e.l1; } #endif /* Remove write access permissions from a gwalk_t in a batch, and * return OR-ed result for TLB flush hint and need to rewalk the guest * pages. * * Syncing pages will remove write access to that page; but it may * also give write access to other pages in the path. If we resync any * pages, re-walk from the beginning. */ #define GW_RMWR_FLUSHTLB 1 #define GW_RMWR_REWALK 2 static inline uint32_t gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw) { uint32_t rc = 0; #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... 
*/ #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( mfn_is_out_of_sync(gw->l3mfn) ) { sh_resync(v, gw->l3mfn); rc = GW_RMWR_REWALK; } else #endif /* OOS */ if ( sh_remove_write_access(v, gw->l3mfn, 3, va) ) rc = GW_RMWR_FLUSHTLB; #endif /* GUEST_PAGING_LEVELS >= 4 */ #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( mfn_is_out_of_sync(gw->l2mfn) ) { sh_resync(v, gw->l2mfn); rc |= GW_RMWR_REWALK; } else #endif /* OOS */ if ( sh_remove_write_access(v, gw->l2mfn, 2, va) ) rc |= GW_RMWR_FLUSHTLB; #endif /* GUEST_PAGING_LEVELS >= 3 */ if ( !(guest_supports_superpages(v) && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) && !mfn_is_out_of_sync(gw->l1mfn) #endif /* OOS */ && sh_remove_write_access(v, gw->l1mfn, 1, va) ) rc |= GW_RMWR_FLUSHTLB; return rc; } #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES /* Lightweight audit: pass all the shadows associated with this guest walk * through the audit mechanisms */ static void sh_audit_gw(struct vcpu *v, walk_t *gw) { mfn_t smfn; if ( !(SHADOW_AUDIT_ENABLE) ) return; #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ if ( mfn_valid(gw->l4mfn) && mfn_valid((smfn = get_shadow_status(v, gw->l4mfn, SH_type_l4_shadow))) ) (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN)); if ( mfn_valid(gw->l3mfn) && mfn_valid((smfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow))) ) (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN)); #endif /* PAE or 64... */ if ( mfn_valid(gw->l2mfn) ) { if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn, SH_type_l2_shadow))) ) (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN)); #if GUEST_PAGING_LEVELS == 3 if ( mfn_valid((smfn = get_shadow_status(v, gw->l2mfn, SH_type_l2h_shadow))) ) (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN)); #endif } if ( mfn_valid(gw->l1mfn) && mfn_valid((smfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow))) ) (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN)); else if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PRESENT) && (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE) && mfn_valid( (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(gw->l2e)))) ) (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN)); } #else #define sh_audit_gw(_v, _gw) do {} while(0) #endif /* audit code */ #if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) void * sh_guest_map_l1e(struct vcpu *v, unsigned long addr, unsigned long *gl1mfn) { void *pl1e = NULL; walk_t gw; ASSERT(shadow_mode_translate(v->domain)); // XXX -- this is expensive, but it's easy to cobble together... // FIXME! if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0 && mfn_valid(gw.l1mfn) ) { if ( gl1mfn ) *gl1mfn = mfn_x(gw.l1mfn); pl1e = map_domain_page(mfn_x(gw.l1mfn)) + (guest_l1_table_offset(addr) * sizeof(guest_l1e_t)); } return pl1e; } void sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e) { walk_t gw; ASSERT(shadow_mode_translate(v->domain)); // XXX -- this is expensive, but it's easy to cobble together... // FIXME! (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present); *(guest_l1e_t *)eff_l1e = gw.l1e; } #endif /* CONFIG == GUEST (== SHADOW) */ /**************************************************************************/ /* Functions to compute the correct index into a shadow page, given an * index into the guest page (as returned by guest_get_index()). * This is trivial when the shadow and guest use the same sized PTEs, but * gets more interesting when those sizes are mismatched (e.g. 32-bit guest, * PAE- or 64-bit shadows). 
* * These functions also increment the shadow mfn, when necessary. When PTE * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1 * page. In this case, we allocate 2 contiguous pages for the shadow L1, and * use simple pointer arithmetic on a pointer to the guest L1e to figure out * which shadow page we really want. Similarly, when PTE sizes are * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address * space.) */ /* From one page of a multi-page shadow, find the next one */ static inline mfn_t sh_next_page(mfn_t smfn) { mfn_t next; struct page_info *pg = mfn_to_page(smfn); ASSERT(pg->u.sh.type == SH_type_l1_32_shadow || pg->u.sh.type == SH_type_fl1_32_shadow || pg->u.sh.type == SH_type_l2_32_shadow); ASSERT(pg->u.sh.type == SH_type_l2_32_shadow || pg->u.sh.head); ASSERT(pg->list.next != PAGE_LIST_NULL); next = _mfn(pdx_to_pfn(pg->list.next)); ASSERT(mfn_to_page(next)->u.sh.type == pg->u.sh.type); ASSERT(!mfn_to_page(next)->u.sh.head); return next; } static inline u32 guest_index(void *ptr) { return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t); } static u32 shadow_l1_index(mfn_t *smfn, u32 guest_index) { #if (GUEST_PAGING_LEVELS == 2) ASSERT(mfn_to_page(*smfn)->u.sh.head); if ( guest_index >= SHADOW_L1_PAGETABLE_ENTRIES ) *smfn = sh_next_page(*smfn); return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES); #else return guest_index; #endif } static u32 shadow_l2_index(mfn_t *smfn, u32 guest_index) { #if (GUEST_PAGING_LEVELS == 2) int i; ASSERT(mfn_to_page(*smfn)->u.sh.head); // Because we use 2 shadow l2 entries for each guest entry, the number of // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2 for ( i = 0; i < guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2); i++ ) *smfn = sh_next_page(*smfn); // We multiply by two to get the index of the first of the two entries // used to shadow the specified guest entry. return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2; #else return guest_index; #endif } #if GUEST_PAGING_LEVELS >= 4 static u32 shadow_l3_index(mfn_t *smfn, u32 guest_index) { return guest_index; } static u32 shadow_l4_index(mfn_t *smfn, u32 guest_index) { return guest_index; } #endif // GUEST_PAGING_LEVELS >= 4 /**************************************************************************/ /* Function which computes shadow entries from their corresponding guest * entries. This is the "heart" of the shadow code. It operates using * level-1 shadow types, but handles all levels of entry. * Don't call it directly, but use the four wrappers below. 
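 * (l1e/l2e/l3e/l4e_propagate_from_guest(), defined after this function.)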
*/ static always_inline void _sh_propagate(struct vcpu *v, guest_intpte_t guest_intpte, mfn_t target_mfn, void *shadow_entry_ptr, int level, fetch_type_t ft, p2m_type_t p2mt) { guest_l1e_t guest_entry = { guest_intpte }; shadow_l1e_t *sp = shadow_entry_ptr; struct domain *d = v->domain; struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram; gfn_t target_gfn = guest_l1e_get_gfn(guest_entry); u32 pass_thru_flags; u32 gflags, sflags; /* We don't shadow PAE l3s */ ASSERT(GUEST_PAGING_LEVELS > 3 || level != 3); /* Check there's something for the shadows to map to */ if ( !p2m_is_valid(p2mt) && !p2m_is_grant(p2mt) ) { *sp = shadow_l1e_empty(); goto done; } gflags = guest_l1e_get_flags(guest_entry); if ( unlikely(!(gflags & _PAGE_PRESENT)) ) { #if !(SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* If a guest l1 entry is not present, shadow with the magic * guest-not-present entry. */ if ( level == 1 ) *sp = sh_l1e_gnp(); else #endif /* !OOS */ *sp = shadow_l1e_empty(); goto done; } if ( level == 1 && p2mt == p2m_mmio_dm ) { /* Guest l1e maps emulated MMIO space */ *sp = sh_l1e_mmio(target_gfn, gflags); if ( !d->arch.paging.shadow.has_fast_mmio_entries ) d->arch.paging.shadow.has_fast_mmio_entries = 1; goto done; } // Must have a valid target_mfn unless this is a prefetch or an l1 // pointing at MMIO space. In the case of a prefetch, an invalid // mfn means that we can not usefully shadow anything, and so we // return early. // if ( !mfn_valid(target_mfn) && !(level == 1 && (!shadow_mode_refcounts(d) || p2mt == p2m_mmio_direct)) ) { ASSERT((ft == ft_prefetch)); *sp = shadow_l1e_empty(); goto done; } // Propagate bits from the guest to the shadow. // Some of these may be overwritten, below. // Since we know the guest's PRESENT bit is set, we also set the shadow's // SHADOW_PRESENT bit. // pass_thru_flags = (_PAGE_ACCESSED | _PAGE_USER | _PAGE_RW | _PAGE_PRESENT); if ( guest_supports_nx(v) ) pass_thru_flags |= _PAGE_NX_BIT; if ( !shadow_mode_refcounts(d) && !mfn_valid(target_mfn) ) pass_thru_flags |= _PAGE_PAT | _PAGE_PCD | _PAGE_PWT; sflags = gflags & pass_thru_flags; /* * For HVM domains with direct access to MMIO areas, set the correct * caching attributes in the shadows to match what was asked for. */ if ( (level == 1) && is_hvm_domain(d) && !is_xen_heap_mfn(mfn_x(target_mfn)) ) { unsigned int type; /* compute the PAT index for shadow page entry when VT-d is enabled * and device assigned. * 1) direct MMIO: compute the PAT index with gMTRR=UC and gPAT. * 2) if enables snoop control, compute the PAT index as WB. * 3) if disables snoop control, compute the PAT index with * gMTRR and gPAT. */ if ( hvm_get_mem_pinned_cacheattr(d, gfn_x(target_gfn), &type) ) sflags |= pat_type_2_pte_flags(type); else if ( d->arch.hvm_domain.is_in_uc_mode ) sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE); else if ( iomem_access_permitted(d, mfn_x(target_mfn), mfn_x(target_mfn)) ) { if ( p2mt == p2m_mmio_direct ) sflags |= get_pat_flags(v, gflags, gfn_to_paddr(target_gfn), pfn_to_paddr(mfn_x(target_mfn)), MTRR_TYPE_UNCACHABLE); else if ( iommu_snoop ) sflags |= pat_type_2_pte_flags(PAT_TYPE_WRBACK); else sflags |= get_pat_flags(v, gflags, gfn_to_paddr(target_gfn), pfn_to_paddr(mfn_x(target_mfn)), NO_HARDCODE_MEM_TYPE); } } // Set the A&D bits for higher level shadows. // Higher level entries do not, strictly speaking, have dirty bits, but // since we use shadow linear tables, each of these entries may, at some // point in time, also serve as a shadow L1 entry. 
// By setting both the A&D bits in each of these, we eliminate the burden // on the hardware to update these bits on initial accesses. // if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) ) sflags |= _PAGE_ACCESSED | _PAGE_DIRTY; // If the A or D bit has not yet been set in the guest, then we must // prevent the corresponding kind of access. // if ( unlikely(!(gflags & _PAGE_ACCESSED)) ) sflags &= ~_PAGE_PRESENT; /* D bits exist in L1es and PSE L2es */ if ( unlikely(((level == 1) || ((level == 2) && (gflags & _PAGE_PSE) && guest_supports_superpages(v))) && !(gflags & _PAGE_DIRTY)) ) sflags &= ~_PAGE_RW; // shadow_mode_log_dirty support // // Only allow the guest write access to a page a) on a demand fault, // or b) if the page is already marked as dirty. // // (We handle log-dirty entirely inside the shadow code, without using the // p2m_ram_logdirty p2m type: only HAP uses that.) if ( unlikely((level == 1) && shadow_mode_log_dirty(d)) ) { if ( mfn_valid(target_mfn) ) { if ( ft & FETCH_TYPE_WRITE ) paging_mark_dirty(d, mfn_x(target_mfn)); else if ( !paging_mfn_is_dirty(d, target_mfn) ) sflags &= ~_PAGE_RW; } } if ( unlikely((level == 1) && dirty_vram && dirty_vram->last_dirty == -1 && gfn_x(target_gfn) >= dirty_vram->begin_pfn && gfn_x(target_gfn) < dirty_vram->end_pfn) ) { if ( ft & FETCH_TYPE_WRITE ) dirty_vram->last_dirty = NOW(); else sflags &= ~_PAGE_RW; } /* Read-only memory */ if ( p2m_is_readonly(p2mt) || (p2mt == p2m_mmio_direct && rangeset_contains_singleton(mmio_ro_ranges, mfn_x(target_mfn))) ) sflags &= ~_PAGE_RW; // protect guest page tables // if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn) #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) /* Unless the page is out of sync and the guest is writing to it. */ && !(mfn_oos_may_write(target_mfn) && (ft == ft_demand_write)) #endif /* OOS */ ) ) { if ( shadow_mode_trap_reads(d) ) { // if we are trapping both reads & writes, then mark this page // as not present... // sflags &= ~_PAGE_PRESENT; } else { // otherwise, just prevent any writes... // sflags &= ~_PAGE_RW; } } // PV guests in 64-bit mode use two different page tables for user vs // supervisor permissions, making the guest's _PAGE_USER bit irrelevant. // It is always shadowed as present... if ( (GUEST_PAGING_LEVELS == 4) && !is_pv_32on64_domain(d) && is_pv_domain(d) ) { sflags |= _PAGE_USER; } *sp = shadow_l1e_from_mfn(target_mfn, sflags); done: SHADOW_DEBUG(PROPAGATE, "%s level %u guest %" SH_PRI_gpte " shadow %" SH_PRI_pte "\n", fetch_type_names[ft], level, guest_entry.l1, sp->l1); } /* These four wrappers give us a little bit of type-safety back around * the use of void-* pointers and intpte types in _sh_propagate(), and * allow the compiler to optimize out some level checks. 
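 * The l2e..l4e wrappers always pass p2m_ram_rw, since intermediate guest
 * pagetables are ordinary RAM; only the l1e wrapper takes a p2m type.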
*/ #if GUEST_PAGING_LEVELS >= 4 static void l4e_propagate_from_guest(struct vcpu *v, guest_l4e_t gl4e, mfn_t sl3mfn, shadow_l4e_t *sl4e, fetch_type_t ft) { _sh_propagate(v, gl4e.l4, sl3mfn, sl4e, 4, ft, p2m_ram_rw); } static void l3e_propagate_from_guest(struct vcpu *v, guest_l3e_t gl3e, mfn_t sl2mfn, shadow_l3e_t *sl3e, fetch_type_t ft) { _sh_propagate(v, gl3e.l3, sl2mfn, sl3e, 3, ft, p2m_ram_rw); } #endif // GUEST_PAGING_LEVELS >= 4 static void l2e_propagate_from_guest(struct vcpu *v, guest_l2e_t gl2e, mfn_t sl1mfn, shadow_l2e_t *sl2e, fetch_type_t ft) { _sh_propagate(v, gl2e.l2, sl1mfn, sl2e, 2, ft, p2m_ram_rw); } static void l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, mfn_t gmfn, shadow_l1e_t *sl1e, fetch_type_t ft, p2m_type_t p2mt) { _sh_propagate(v, gl1e.l1, gmfn, sl1e, 1, ft, p2mt); } /**************************************************************************/ /* These functions update shadow entries (and do bookkeeping on the shadow * tables they are in). It is intended that they are the only * functions which ever write (non-zero) data onto a shadow page. */ static inline void safe_write_entry(void *dst, void *src) /* Copy one PTE safely when processors might be running on the * destination pagetable. This does *not* give safety against * concurrent writes (that's what the paging lock is for), just * stops the hardware picking up partially written entries. */ { volatile unsigned long *d = dst; unsigned long *s = src; ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1))); /* In 64-bit, sizeof(pte) == sizeof(ulong) == 1 word, * which will be an atomic write, since the entry is aligned. */ BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long)); *d = *s; } static inline void shadow_write_entries(void *d, void *s, int entries, mfn_t mfn) /* This function does the actual writes to shadow pages. * It must not be called directly, since it doesn't do the bookkeeping * that shadow_set_l*e() functions do. */ { shadow_l1e_t *dst = d; shadow_l1e_t *src = s; void *map = NULL; int i; /* Because we mirror access rights at all levels in the shadow, an * l2 (or higher) entry with the RW bit cleared will leave us with * no write access through the linear map. * We detect that by writing to the shadow with copy_to_user() and * using map_domain_page() to get a writeable mapping if we need to. */ if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 ) { perfc_incr(shadow_linear_map_failed); map = sh_map_domain_page(mfn); ASSERT(map != NULL); dst = map + ((unsigned long)dst & (PAGE_SIZE - 1)); } for ( i = 0; i < entries; i++ ) safe_write_entry(dst++, src++); if ( map != NULL ) sh_unmap_domain_page(map); } /* type is only used to distinguish grant map pages from ordinary RAM * i.e. non-p2m_is_grant() pages are treated as p2m_ram_rw. */ static int inline shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d, p2m_type_t type) { int res; mfn_t mfn; struct domain *owner; ASSERT(!sh_l1e_is_magic(sl1e)); if ( !shadow_mode_refcounts(d) ) return 1; res = get_page_from_l1e(sl1e, d, d); // If a privileged domain is attempting to install a map of a page it does // not own, we let it succeed anyway. 
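// (Subject to an XSM check; see the xsm_priv_mapping() call below.)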
// if ( unlikely(res < 0) && !shadow_mode_translate(d) && mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) && (owner = page_get_owner(mfn_to_page(mfn))) && (d != owner) ) { res = xsm_priv_mapping(XSM_TARGET, d, owner); if ( !res ) { res = get_page_from_l1e(sl1e, d, owner); SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx " "which is owned by domain %d: %s\n", d->domain_id, mfn_x(mfn), owner->domain_id, res >= 0 ? "success" : "failed"); } } /* Okay, it might still be a grant mapping PTE. Try it. */ if ( unlikely(res < 0) && (type == p2m_grant_map_rw || (type == p2m_grant_map_ro && !(shadow_l1e_get_flags(sl1e) & _PAGE_RW))) ) { /* It's a grant mapping. The grant table implementation will already have checked that we're supposed to have access, so we can just grab a reference directly. */ mfn = shadow_l1e_get_mfn(sl1e); if ( mfn_valid(mfn) ) res = get_page_from_l1e(sl1e, d, page_get_owner(mfn_to_page(mfn))); } if ( unlikely(res < 0) ) { perfc_incr(shadow_get_page_fail); SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n"); } return res; } static void inline shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) { if ( !shadow_mode_refcounts(d) ) return; put_page_from_l1e(sl1e, d); } #if GUEST_PAGING_LEVELS >= 4 static int shadow_set_l4e(struct vcpu *v, shadow_l4e_t *sl4e, shadow_l4e_t new_sl4e, mfn_t sl4mfn) { int flags = 0, ok; shadow_l4e_t old_sl4e; paddr_t paddr; ASSERT(sl4e != NULL); old_sl4e = *sl4e; if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */ paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) | (((unsigned long)sl4e) & ~PAGE_MASK)); if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) { /* About to install a new reference */ mfn_t sl3mfn = shadow_l4e_get_mfn(new_sl4e); ok = sh_get_ref(v, sl3mfn, paddr); /* Are we pinning l3 shadows to handle wierd linux behaviour? */ if ( sh_type_is_pinnable(v, SH_type_l3_64_shadow) ) ok |= sh_pin(v, sl3mfn); if ( !ok ) { domain_crash(v->domain); return SHADOW_SET_ERROR; } } /* Write the new entry */ shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn); flags |= SHADOW_SET_CHANGED; if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) { /* We lost a reference to an old mfn. */ mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e); if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e))) || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e), shadow_l4e_get_flags(new_sl4e)) ) { flags |= SHADOW_SET_FLUSH; } sh_put_ref(v, osl3mfn, paddr); } return flags; } static int shadow_set_l3e(struct vcpu *v, shadow_l3e_t *sl3e, shadow_l3e_t new_sl3e, mfn_t sl3mfn) { int flags = 0; shadow_l3e_t old_sl3e; paddr_t paddr; ASSERT(sl3e != NULL); old_sl3e = *sl3e; if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */ paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) | (((unsigned long)sl3e) & ~PAGE_MASK)); if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT ) { /* About to install a new reference */ if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) ) { domain_crash(v->domain); return SHADOW_SET_ERROR; } } /* Write the new entry */ shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn); flags |= SHADOW_SET_CHANGED; if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT ) { /* We lost a reference to an old mfn. 
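 * If the new entry doesn't point at the same shadow with strictly increased
 * permissions, stale TLB entries may remain, so ask the caller to flush.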
*/ mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e); if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) || !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e), shadow_l3e_get_flags(new_sl3e)) ) { flags |= SHADOW_SET_FLUSH; } sh_put_ref(v, osl2mfn, paddr); } return flags; } #endif /* GUEST_PAGING_LEVELS >= 4 */ static int shadow_set_l2e(struct vcpu *v, shadow_l2e_t *sl2e, shadow_l2e_t new_sl2e, mfn_t sl2mfn) { int flags = 0; shadow_l2e_t old_sl2e; paddr_t paddr; #if GUEST_PAGING_LEVELS == 2 /* In 2-on-3 we work with pairs of l2es pointing at two-page * shadows. Reference counting and up-pointers track from the first * page of the shadow to the first l2e, so make sure that we're * working with those: * Start with a pair of identical entries */ shadow_l2e_t pair[2] = { new_sl2e, new_sl2e }; /* Align the pointer down so it's pointing at the first of the pair */ sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t))); #endif ASSERT(sl2e != NULL); old_sl2e = *sl2e; if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */ paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT) | (((unsigned long)sl2e) & ~PAGE_MASK)); if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) { mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e); ASSERT(mfn_to_page(sl1mfn)->u.sh.head); /* About to install a new reference */ if ( !sh_get_ref(v, sl1mfn, paddr) ) { domain_crash(v->domain); return SHADOW_SET_ERROR; } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) { struct page_info *sp = mfn_to_page(sl1mfn); mfn_t gl1mfn; ASSERT(sp->u.sh.head); gl1mfn = backpointer(sp); /* If the shadow is a fl1 then the backpointer contains the GFN instead of the GMFN, and it's definitely not OOS. */ if ( (sp->u.sh.type != SH_type_fl1_shadow) && mfn_valid(gl1mfn) && mfn_is_out_of_sync(gl1mfn) ) sh_resync(v, gl1mfn); } #endif #if GUEST_PAGING_LEVELS == 2 /* Update the second entry to point tio the second half of the l1 */ sl1mfn = sh_next_page(sl1mfn); pair[1] = shadow_l2e_from_mfn(sl1mfn, shadow_l2e_get_flags(new_sl2e)); #endif } /* Write the new entry */ #if GUEST_PAGING_LEVELS == 2 shadow_write_entries(sl2e, &pair, 2, sl2mfn); #else /* normal case */ shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn); #endif flags |= SHADOW_SET_CHANGED; if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT ) { /* We lost a reference to an old mfn. */ mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e); if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) || !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e), shadow_l2e_get_flags(new_sl2e)) ) { flags |= SHADOW_SET_FLUSH; } sh_put_ref(v, osl1mfn, paddr); } return flags; } static inline void shadow_vram_get_l1e(shadow_l1e_t new_sl1e, shadow_l1e_t *sl1e, mfn_t sl1mfn, struct domain *d) { mfn_t mfn = shadow_l1e_get_mfn(new_sl1e); int flags = shadow_l1e_get_flags(new_sl1e); unsigned long gfn; struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram; if ( !dirty_vram /* tracking disabled? */ || !(flags & _PAGE_RW) /* read-only mapping? 
*/ || !mfn_valid(mfn) ) /* mfn can be invalid in mmio_direct */ return; gfn = mfn_to_gfn(d, mfn); /* Page sharing not supported on shadow PTs */ BUG_ON(SHARED_M2P(gfn)); if ( (gfn >= dirty_vram->begin_pfn) && (gfn < dirty_vram->end_pfn) ) { unsigned long i = gfn - dirty_vram->begin_pfn; struct page_info *page = mfn_to_page(mfn); if ( (page->u.inuse.type_info & PGT_count_mask) == 1 ) /* Initial guest reference, record it */ dirty_vram->sl1ma[i] = pfn_to_paddr(mfn_x(sl1mfn)) | ((unsigned long)sl1e & ~PAGE_MASK); } } static inline void shadow_vram_put_l1e(shadow_l1e_t old_sl1e, shadow_l1e_t *sl1e, mfn_t sl1mfn, struct domain *d) { mfn_t mfn = shadow_l1e_get_mfn(old_sl1e); int flags = shadow_l1e_get_flags(old_sl1e); unsigned long gfn; struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram; if ( !dirty_vram /* tracking disabled? */ || !(flags & _PAGE_RW) /* read-only mapping? */ || !mfn_valid(mfn) ) /* mfn can be invalid in mmio_direct */ return; gfn = mfn_to_gfn(d, mfn); /* Page sharing not supported on shadow PTs */ BUG_ON(SHARED_M2P(gfn)); if ( (gfn >= dirty_vram->begin_pfn) && (gfn < dirty_vram->end_pfn) ) { unsigned long i = gfn - dirty_vram->begin_pfn; struct page_info *page = mfn_to_page(mfn); int dirty = 0; paddr_t sl1ma = pfn_to_paddr(mfn_x(sl1mfn)) | ((unsigned long)sl1e & ~PAGE_MASK); if ( (page->u.inuse.type_info & PGT_count_mask) == 1 ) { /* Last reference */ if ( dirty_vram->sl1ma[i] == INVALID_PADDR ) { /* We didn't know it was that one, let's say it is dirty */ dirty = 1; } else { ASSERT(dirty_vram->sl1ma[i] == sl1ma); dirty_vram->sl1ma[i] = INVALID_PADDR; if ( flags & _PAGE_DIRTY ) dirty = 1; } } else { /* We had more than one reference, just consider the page dirty. */ dirty = 1; /* Check that it's not the one we recorded. */ if ( dirty_vram->sl1ma[i] == sl1ma ) { /* Too bad, we remembered the wrong one... */ dirty_vram->sl1ma[i] = INVALID_PADDR; } else { /* Ok, our recorded sl1e is still pointing to this page, let's * just hope it will remain. */ } } if ( dirty ) { dirty_vram->dirty_bitmap[i / 8] |= 1 << (i % 8); dirty_vram->last_dirty = NOW(); } } } static int shadow_set_l1e(struct vcpu *v, shadow_l1e_t *sl1e, shadow_l1e_t new_sl1e, p2m_type_t new_type, mfn_t sl1mfn) { int flags = 0; struct domain *d = v->domain; shadow_l1e_t old_sl1e; #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e); #endif ASSERT(sl1e != NULL); #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn) && ((shadow_l1e_get_flags(new_sl1e) & (_PAGE_RW|_PAGE_PRESENT)) == (_PAGE_RW|_PAGE_PRESENT)) ) oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e)); #endif old_sl1e = *sl1e; if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */ if ( (shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT) && !sh_l1e_is_magic(new_sl1e) ) { /* About to install a new reference */ if ( shadow_mode_refcounts(d) ) { TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF); switch ( shadow_get_page_from_l1e(new_sl1e, d, new_type) ) { default: /* Doesn't look like a pagetable. */ flags |= SHADOW_SET_ERROR; new_sl1e = shadow_l1e_empty(); break; case 1: shadow_l1e_remove_flags(new_sl1e, _PAGE_RW); /* fall through */ case 0: shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d); break; } } } /* Write the new entry */ shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn); flags |= SHADOW_SET_CHANGED; if ( (shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT) && !sh_l1e_is_magic(old_sl1e) ) { /* We lost a reference to an old mfn. */ /* N.B. 
Unlike higher-level sets, never need an extra flush * when writing an l1e. Because it points to the same guest frame * as the guest l1e did, it's the guest's responsibility to * trigger a flush later. */ if ( shadow_mode_refcounts(d) ) { shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d); shadow_put_page_from_l1e(old_sl1e, d); TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF); } } return flags; } /**************************************************************************/ /* Macros to walk pagetables. These take the shadow of a pagetable and * walk every "interesting" entry. That is, they don't touch Xen mappings, * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every * second entry (since pairs of entries are managed together). For multi-page * shadows they walk all pages. * * Arguments are an MFN, the variable to point to each entry, a variable * to indicate that we are done (we will shortcut to the end of the scan * when _done != 0), a variable to indicate that we should avoid Xen mappings, * and the code. * * WARNING: These macros have side-effects. They change the values of both * the pointer and the MFN. */ static inline void increment_ptr_to_guest_entry(void *ptr) { if ( ptr ) { guest_l1e_t **entry = ptr; (*entry)++; } } /* All kinds of l1: touch all entries */ #define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ do { \ int _i; \ shadow_l1e_t *_sp = sh_map_domain_page((_sl1mfn)); \ ASSERT(mfn_to_page(_sl1mfn)->u.sh.type == SH_type_l1_shadow \ || mfn_to_page(_sl1mfn)->u.sh.type == SH_type_fl1_shadow);\ for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \ { \ (_sl1e) = _sp + _i; \ if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \ {_code} \ if ( _done ) break; \ increment_ptr_to_guest_entry(_gl1p); \ } \ sh_unmap_domain_page(_sp); \ } while (0) /* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */ #if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ do { \ int __done = 0; \ _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \ ({ (__done = _done); }), _code); \ _sl1mfn = sh_next_page(_sl1mfn); \ if ( !__done ) \ _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \ ({ (__done = _done); }), _code); \ } while (0) #else /* Everything else; l1 shadows are only one page */ #define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) #endif #if GUEST_PAGING_LEVELS == 2 /* 32-bit l2 on PAE/64: four pages, touch every second entry */ #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \ do { \ int _i, _j, __done = 0; \ ASSERT(shadow_mode_external(_dom)); \ ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_32_shadow); \ for ( _j = 0; _j < 4 && !__done; _j++ ) \ { \ shadow_l2e_t *_sp = sh_map_domain_page(_sl2mfn); \ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \ { \ (_sl2e) = _sp + _i; \ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ {_code} \ if ( (__done = (_done)) ) break; \ increment_ptr_to_guest_entry(_gl2p); \ } \ sh_unmap_domain_page(_sp); \ if ( _j < 3 ) _sl2mfn = sh_next_page(_sl2mfn); \ } \ } while (0) #elif GUEST_PAGING_LEVELS == 3 /* PAE: touch all entries */ #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \ do { \ int _i; \ shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \ ASSERT(shadow_mode_external(_dom)); \ ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_pae_shadow \ || mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_pae_shadow); \ for 
( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ { \ (_sl2e) = _sp + _i; \ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ {_code} \ if ( _done ) break; \ increment_ptr_to_guest_entry(_gl2p); \ } \ sh_unmap_domain_page(_sp); \ } while (0) #else /* 64-bit l2: touch all entries except for PAE compat guests. */ #define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _dom, _code) \ do { \ int _i; \ int _xen = !shadow_mode_external(_dom); \ shadow_l2e_t *_sp = sh_map_domain_page((_sl2mfn)); \ ASSERT(mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2_64_shadow ||\ mfn_to_page(_sl2mfn)->u.sh.type == SH_type_l2h_64_shadow);\ for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ { \ if ( (!(_xen)) \ || !is_pv_32on64_domain(_dom) \ || mfn_to_page(_sl2mfn)->u.sh.type != SH_type_l2h_64_shadow\ || (_i < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_dom)) ) \ { \ (_sl2e) = _sp + _i; \ if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ {_code} \ if ( _done ) break; \ increment_ptr_to_guest_entry(_gl2p); \ } \ } \ sh_unmap_domain_page(_sp); \ } while (0) #endif /* different kinds of l2 */ #if GUEST_PAGING_LEVELS == 4 /* 64-bit l3: touch all entries */ #define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \ do { \ int _i; \ shadow_l3e_t *_sp = sh_map_domain_page((_sl3mfn)); \ ASSERT(mfn_to_page(_sl3mfn)->u.sh.type == SH_type_l3_64_shadow);\ for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \ { \ (_sl3e) = _sp + _i; \ if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \ {_code} \ if ( _done ) break; \ increment_ptr_to_guest_entry(_gl3p); \ } \ sh_unmap_domain_page(_sp); \ } while (0) /* 64-bit l4: avoid Xen mappings */ #define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _dom, _code) \ do { \ shadow_l4e_t *_sp = sh_map_domain_page((_sl4mfn)); \ int _xen = !shadow_mode_external(_dom); \ int _i; \ ASSERT(mfn_to_page(_sl4mfn)->u.sh.type == SH_type_l4_64_shadow);\ for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \ { \ if ( (!(_xen)) || is_guest_l4_slot(_dom, _i) ) \ { \ (_sl4e) = _sp + _i; \ if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \ {_code} \ if ( _done ) break; \ } \ increment_ptr_to_guest_entry(_gl4p); \ } \ sh_unmap_domain_page(_sp); \ } while (0) #endif /**************************************************************************/ /* Functions to install Xen mappings and linear mappings in shadow pages */ // XXX -- this function should probably be moved to shadow-common.c, but that // probably wants to wait until the shadow types have been moved from // shadow-types.h to shadow-private.h // #if GUEST_PAGING_LEVELS == 4 void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn) { struct domain *d = v->domain; shadow_l4e_t *sl4e; unsigned int slots; sl4e = sh_map_domain_page(sl4mfn); ASSERT(sl4e != NULL); ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t)); /* Copy the common Xen mappings from the idle domain */ slots = (shadow_mode_external(d) ? ROOT_PAGETABLE_XEN_SLOTS : ROOT_PAGETABLE_PV_XEN_SLOTS); memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT], &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT], slots * sizeof(l4_pgentry_t)); /* Install the per-domain mappings for this domain */ sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] = shadow_l4e_from_mfn(page_to_mfn(d->arch.perdomain_l3_pg), __PAGE_HYPERVISOR); /* Shadow linear mapping for 4-level shadows. N.B. 
for 3-level * shadows on 64-bit xen, this linear mapping is later replaced by the * monitor pagetable structure, which is built in make_monitor_table * and maintained by sh_update_linear_entries. */ sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] = shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR); /* Self linear mapping. */ if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) ) { // linear tables may not be used with translated PV guests sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = shadow_l4e_empty(); } else { sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR); } sh_unmap_domain_page(sl4e); } #endif #if GUEST_PAGING_LEVELS >= 3 // For 3-on-3 PV guests, we need to make sure the xen mappings are in // place, which means that we need to populate the l2h entry in the l3 // table. static void sh_install_xen_entries_in_l2h(struct vcpu *v, mfn_t sl2hmfn) { struct domain *d = v->domain; shadow_l2e_t *sl2e; if ( !is_pv_32on64_vcpu(v) ) return; sl2e = sh_map_domain_page(sl2hmfn); ASSERT(sl2e != NULL); ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t)); /* Copy the common Xen mappings from the idle domain */ memcpy( &sl2e[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)], &compat_idle_pg_table_l2[l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)], COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*sl2e)); sh_unmap_domain_page(sl2e); } #endif /**************************************************************************/ /* Create a shadow of a given guest page. */ static mfn_t sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type) { mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn)); SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n", mfn_x(gmfn), shadow_type, mfn_x(smfn)); if ( sh_type_has_up_pointer(v, shadow_type) ) /* Lower-level shadow, not yet linked from a higher level */ mfn_to_page(smfn)->up = 0; #if GUEST_PAGING_LEVELS == 4 #if (SHADOW_OPTIMIZATIONS & SHOPT_LINUX_L3_TOPLEVEL) if ( shadow_type == SH_type_l4_64_shadow && unlikely(v->domain->arch.paging.shadow.opt_flags & SHOPT_LINUX_L3_TOPLEVEL) ) { /* We're shadowing a new l4, but we've been assuming the guest uses * only one l4 per vcpu and context switches using an l4 entry. * Count the number of active l4 shadows. If there are enough * of them, decide that this isn't an old linux guest, and stop * pinning l3es. This is not very quick but it doesn't happen * very often. */ struct page_info *sp, *t; struct vcpu *v2; int l4count = 0, vcpus = 0; page_list_for_each(sp, &v->domain->arch.paging.shadow.pinned_shadows) { if ( sp->u.sh.type == SH_type_l4_64_shadow ) l4count++; } for_each_vcpu ( v->domain, v2 ) vcpus++; if ( l4count > 2 * vcpus ) { /* Unpin all the pinned l3 tables, and don't pin any more. */ page_list_for_each_safe(sp, t, &v->domain->arch.paging.shadow.pinned_shadows) { if ( sp->u.sh.type == SH_type_l3_64_shadow ) sh_unpin(v, page_to_mfn(sp)); } v->domain->arch.paging.shadow.opt_flags &= ~SHOPT_LINUX_L3_TOPLEVEL; sh_reset_l3_up_pointers(v); } } #endif #endif // Create the Xen mappings... 
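// (Only needed when the guest's top-level tables are used by hardware
// directly, i.e. !shadow_mode_external(); for external-mode guests the
// equivalent Xen mappings live in the monitor table built by
// sh_make_monitor_table() below.)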
if ( !shadow_mode_external(v->domain) ) { switch (shadow_type) { #if GUEST_PAGING_LEVELS == 4 case SH_type_l4_shadow: sh_install_xen_entries_in_l4(v, gmfn, smfn); break; #endif #if GUEST_PAGING_LEVELS >= 3 case SH_type_l2h_shadow: sh_install_xen_entries_in_l2h(v, smfn); break; #endif default: /* Do nothing */ break; } } shadow_promote(v, gmfn, shadow_type); set_shadow_status(v, gmfn, shadow_type, smfn); return smfn; } /* Make a splintered superpage shadow */ static mfn_t make_fl1_shadow(struct vcpu *v, gfn_t gfn) { mfn_t smfn = shadow_alloc(v->domain, SH_type_fl1_shadow, (unsigned long) gfn_x(gfn)); SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" PRI_mfn "\n", gfn_x(gfn), mfn_x(smfn)); set_fl1_shadow_status(v, gfn, smfn); return smfn; } #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS mfn_t sh_make_monitor_table(struct vcpu *v) { struct domain *d = v->domain; ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0); /* Guarantee we can get the memory we need */ shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS); { mfn_t m4mfn; m4mfn = shadow_alloc(d, SH_type_monitor_table, 0); sh_install_xen_entries_in_l4(v, m4mfn, m4mfn); /* Remember the level of this table */ mfn_to_page(m4mfn)->shadow_flags = 4; #if SHADOW_PAGING_LEVELS < 4 { mfn_t m3mfn, m2mfn; l4_pgentry_t *l4e; l3_pgentry_t *l3e; /* Install an l3 table and an l2 table that will hold the shadow * linear map entries. This overrides the linear map entry that * was installed by sh_install_xen_entries_in_l4. */ l4e = sh_map_domain_page(m4mfn); m3mfn = shadow_alloc(d, SH_type_monitor_table, 0); mfn_to_page(m3mfn)->shadow_flags = 3; l4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR); m2mfn = shadow_alloc(d, SH_type_monitor_table, 0); mfn_to_page(m2mfn)->shadow_flags = 2; l3e = sh_map_domain_page(m3mfn); l3e[0] = l3e_from_pfn(mfn_x(m2mfn), __PAGE_HYPERVISOR); sh_unmap_domain_page(l3e); if ( is_pv_32on64_vcpu(v) ) { /* For 32-on-64 PV guests, we need to map the 32-bit Xen * area into its usual VAs in the monitor tables */ m3mfn = shadow_alloc(d, SH_type_monitor_table, 0); mfn_to_page(m3mfn)->shadow_flags = 3; l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR); m2mfn = shadow_alloc(d, SH_type_monitor_table, 0); mfn_to_page(m2mfn)->shadow_flags = 2; l3e = sh_map_domain_page(m3mfn); l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT); sh_install_xen_entries_in_l2h(v, m2mfn); sh_unmap_domain_page(l3e); } sh_unmap_domain_page(l4e); } #endif /* SHADOW_PAGING_LEVELS < 4 */ return m4mfn; } } #endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */ /**************************************************************************/ /* These functions also take a virtual address and return the level-N * shadow table mfn and entry, but they create the shadow pagetables if * they are needed. The "demand" argument is non-zero when handling * a demand fault (so we know what to do about accessed bits &c). * If the necessary tables are not present in the guest, they return NULL. */ /* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has * more levels than the guest, the upper levels are always fixed and do not * reflect any information from the guest, so we do not use these functions * to access them. */ #if GUEST_PAGING_LEVELS >= 4 static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v, walk_t *gw, mfn_t *sl4mfn) { /* There is always a shadow of the top level table. Get it. */ *sl4mfn = pagetable_get_mfn(v->arch.shadow_table[0]); /* Reading the top level table is always valid. 
*/ return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va); } static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, walk_t *gw, mfn_t *sl3mfn, fetch_type_t ft, int *resync) { mfn_t sl4mfn; shadow_l4e_t *sl4e; if ( !mfn_valid(gw->l3mfn) ) return NULL; /* No guest page. */ /* Get the l4e */ sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn); ASSERT(sl4e != NULL); if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) { *sl3mfn = shadow_l4e_get_mfn(*sl4e); ASSERT(mfn_valid(*sl3mfn)); } else { int r; shadow_l4e_t new_sl4e; /* No l3 shadow installed: find and install it. */ *sl3mfn = get_shadow_status(v, gw->l3mfn, SH_type_l3_shadow); if ( !mfn_valid(*sl3mfn) ) { /* No l3 shadow of this page exists at all: make one. */ *sl3mfn = sh_make_shadow(v, gw->l3mfn, SH_type_l3_shadow); } /* Install the new sl3 table in the sl4e */ l4e_propagate_from_guest(v, gw->l4e, *sl3mfn, &new_sl4e, ft); r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn); ASSERT((r & SHADOW_SET_FLUSH) == 0); if ( r & SHADOW_SET_ERROR ) return NULL; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) *resync |= 1; #endif } /* Now follow it down a level. Guaranteed to succeed. */ return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va); } #endif /* GUEST_PAGING_LEVELS >= 4 */ static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, walk_t *gw, mfn_t *sl2mfn, fetch_type_t ft, int *resync) { #if GUEST_PAGING_LEVELS >= 4 /* 64bit... */ mfn_t sl3mfn = _mfn(INVALID_MFN); shadow_l3e_t *sl3e; if ( !mfn_valid(gw->l2mfn) ) return NULL; /* No guest page. */ /* Get the l3e */ sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft, resync); if ( sl3e == NULL ) return NULL; if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) { *sl2mfn = shadow_l3e_get_mfn(*sl3e); ASSERT(mfn_valid(*sl2mfn)); } else { int r; shadow_l3e_t new_sl3e; unsigned int t = SH_type_l2_shadow; /* Tag compat L2 containing hypervisor (m2p) mappings */ if ( is_pv_32on64_domain(v->domain) && guest_l4_table_offset(gw->va) == 0 && guest_l3_table_offset(gw->va) == 3 ) t = SH_type_l2h_shadow; /* No l2 shadow installed: find and install it. */ *sl2mfn = get_shadow_status(v, gw->l2mfn, t); if ( !mfn_valid(*sl2mfn) ) { /* No l2 shadow of this page exists at all: make one. */ *sl2mfn = sh_make_shadow(v, gw->l2mfn, t); } /* Install the new sl2 table in the sl3e */ l3e_propagate_from_guest(v, gw->l3e, *sl2mfn, &new_sl3e, ft); r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn); ASSERT((r & SHADOW_SET_FLUSH) == 0); if ( r & SHADOW_SET_ERROR ) return NULL; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) *resync |= 1; #endif } /* Now follow it down a level. Guaranteed to succeed. */ return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); #elif GUEST_PAGING_LEVELS == 3 /* PAE... */ /* We never demand-shadow PAE l3es: they are only created in * sh_update_cr3(). Check if the relevant sl3e is present. */ shadow_l3e_t *sl3e = ((shadow_l3e_t *)&v->arch.paging.shadow.l3table) + shadow_l3_linear_offset(gw->va); if ( !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) return NULL; *sl2mfn = shadow_l3e_get_mfn(*sl3e); ASSERT(mfn_valid(*sl2mfn)); return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); #else /* 32bit... */ /* There is always a shadow of the top level table. Get it. */ *sl2mfn = pagetable_get_mfn(v->arch.shadow_table[0]); /* This next line is important: the guest l2 has a 16k * shadow, we need to return the right mfn of the four. This * call will set it for us as a side-effect. 
*/ (void) shadow_l2_index(sl2mfn, guest_l2_table_offset(gw->va)); /* Reading the top level table is always valid. */ return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); #endif } static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, walk_t *gw, mfn_t *sl1mfn, fetch_type_t ft) { mfn_t sl2mfn; int resync = 0; shadow_l2e_t *sl2e; /* Get the l2e */ sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft, &resync); if ( sl2e == NULL ) return NULL; /* Install the sl1 in the l2e if it wasn't there or if we need to * re-do it to fix a PSE dirty bit. */ if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT && likely(ft != ft_demand_write || (shadow_l2e_get_flags(*sl2e) & _PAGE_RW) || !(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) ) { *sl1mfn = shadow_l2e_get_mfn(*sl2e); ASSERT(mfn_valid(*sl1mfn)); } else { shadow_l2e_t new_sl2e; int r, flags = guest_l2e_get_flags(gw->l2e); /* No l1 shadow installed: find and install it. */ if ( !(flags & _PAGE_PRESENT) ) return NULL; /* No guest page. */ if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) ) { /* Splintering a superpage */ gfn_t l2gfn = guest_l2e_get_gfn(gw->l2e); *sl1mfn = get_fl1_shadow_status(v, l2gfn); if ( !mfn_valid(*sl1mfn) ) { /* No fl1 shadow of this superpage exists at all: make one. */ *sl1mfn = make_fl1_shadow(v, l2gfn); } } else { /* Shadowing an actual guest l1 table */ if ( !mfn_valid(gw->l1mfn) ) return NULL; /* No guest page. */ *sl1mfn = get_shadow_status(v, gw->l1mfn, SH_type_l1_shadow); if ( !mfn_valid(*sl1mfn) ) { /* No l1 shadow of this page exists at all: make one. */ *sl1mfn = sh_make_shadow(v, gw->l1mfn, SH_type_l1_shadow); } } /* Install the new sl1 table in the sl2e */ l2e_propagate_from_guest(v, gw->l2e, *sl1mfn, &new_sl2e, ft); r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn); ASSERT((r & SHADOW_SET_FLUSH) == 0); if ( r & SHADOW_SET_ERROR ) return NULL; /* This next line is important: in 32-on-PAE and 32-on-64 modes, * the guest l1 table has an 8k shadow, and we need to return * the right mfn of the pair. This call will set it for us as a * side-effect. (In all other cases, it's a no-op and will be * compiled out.) */ (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va)); } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) /* All pages walked are now pagetables. Safe to resync pages in case level 4 or 3 shadows were set. */ if ( resync ) shadow_resync_all(v); #endif /* Now follow it down a level. Guaranteed to succeed. */ return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va); } /**************************************************************************/ /* Destructors for shadow tables: * Unregister the shadow, decrement refcounts of any entries present in it, * and release the memory. * * N.B. These destructors do not clear the contents of the shadows. * This allows us to delay TLB shootdowns until the page is being reused. * See shadow_alloc() and shadow_free() for how this is handled. 
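 * (In practice this means stale TLB entries may still point at a freed
 * shadow page; that is safe because its contents are left in place until
 * the allocator hands the page out again.)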
*/ #if GUEST_PAGING_LEVELS >= 4 void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn) { shadow_l4e_t *sl4e; struct page_info *sp = mfn_to_page(smfn); u32 t = sp->u.sh.type; mfn_t gmfn, sl4mfn; SHADOW_DEBUG(DESTROY_SHADOW, "%s(%05lx)\n", __func__, mfn_x(smfn)); ASSERT(t == SH_type_l4_shadow); ASSERT(sp->u.sh.head); /* Record that the guest page isn't shadowed any more (in this type) */ gmfn = backpointer(sp); delete_shadow_status(v, gmfn, t, smfn); shadow_demote(v, gmfn, t); /* Decrement refcounts of all the old entries */ sl4mfn = smfn; SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, { if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) { sh_put_ref(v, shadow_l4e_get_mfn(*sl4e), (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) | ((unsigned long)sl4e & ~PAGE_MASK)); } }); /* Put the memory back in the pool */ shadow_free(v->domain, smfn); } void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn) { shadow_l3e_t *sl3e; struct page_info *sp = mfn_to_page(smfn); u32 t = sp->u.sh.type; mfn_t gmfn, sl3mfn; SHADOW_DEBUG(DESTROY_SHADOW, "%s(%05lx)\n", __func__, mfn_x(smfn)); ASSERT(t == SH_type_l3_shadow); ASSERT(sp->u.sh.head); /* Record that the guest page isn't shadowed any more (in this type) */ gmfn = backpointer(sp); delete_shadow_status(v, gmfn, t, smfn); shadow_demote(v, gmfn, t); /* Decrement refcounts of all the old entries */ sl3mfn = smfn; SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, { if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) sh_put_ref(v, shadow_l3e_get_mfn(*sl3e), (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) | ((unsigned long)sl3e & ~PAGE_MASK)); }); /* Put the memory back in the pool */ shadow_free(v->domain, smfn); } #endif /* GUEST_PAGING_LEVELS >= 4 */ void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn) { shadow_l2e_t *sl2e; struct page_info *sp = mfn_to_page(smfn); u32 t = sp->u.sh.type; mfn_t gmfn, sl2mfn; SHADOW_DEBUG(DESTROY_SHADOW, "%s(%05lx)\n", __func__, mfn_x(smfn)); #if GUEST_PAGING_LEVELS >= 3 ASSERT(t == SH_type_l2_shadow || t == SH_type_l2h_shadow); #else ASSERT(t == SH_type_l2_shadow); #endif ASSERT(sp->u.sh.head); /* Record that the guest page isn't shadowed any more (in this type) */ gmfn = backpointer(sp); delete_shadow_status(v, gmfn, t, smfn); shadow_demote(v, gmfn, t); /* Decrement refcounts of all the old entries */ sl2mfn = smfn; SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, { if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) sh_put_ref(v, shadow_l2e_get_mfn(*sl2e), (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT) | ((unsigned long)sl2e & ~PAGE_MASK)); }); /* Put the memory back in the pool */ shadow_free(v->domain, smfn); } void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn) { struct domain *d = v->domain; shadow_l1e_t *sl1e; struct page_info *sp = mfn_to_page(smfn); u32 t = sp->u.sh.type; SHADOW_DEBUG(DESTROY_SHADOW, "%s(%05lx)\n", __func__, mfn_x(smfn)); ASSERT(t == SH_type_l1_shadow || t == SH_type_fl1_shadow); ASSERT(sp->u.sh.head); /* Record that the guest page isn't shadowed any more (in this type) */ if ( t == SH_type_fl1_shadow ) { gfn_t gfn = _gfn(sp->v.sh.back); delete_fl1_shadow_status(v, gfn, smfn); } else { mfn_t gmfn = backpointer(sp); delete_shadow_status(v, gmfn, t, smfn); shadow_demote(v, gmfn, t); } if ( shadow_mode_refcounts(d) ) { /* Decrement refcounts of all the old entries */ mfn_t sl1mfn = smfn; SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, { if ( (shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT) && !sh_l1e_is_magic(*sl1e) ) { shadow_vram_put_l1e(*sl1e, sl1e, sl1mfn, d); shadow_put_page_from_l1e(*sl1e, d); } }); } /* Put the memory back in the pool */ 
shadow_free(v->domain, smfn); } #if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn) { struct domain *d = v->domain; ASSERT(mfn_to_page(mmfn)->u.sh.type == SH_type_monitor_table); #if SHADOW_PAGING_LEVELS != 4 { mfn_t m3mfn; l4_pgentry_t *l4e = sh_map_domain_page(mmfn); l3_pgentry_t *l3e; int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START); /* Need to destroy the l3 and l2 monitor pages used * for the linear map */ ASSERT(l4e_get_flags(l4e[linear_slot]) & _PAGE_PRESENT); m3mfn = _mfn(l4e_get_pfn(l4e[linear_slot])); l3e = sh_map_domain_page(m3mfn); ASSERT(l3e_get_flags(l3e[0]) & _PAGE_PRESENT); shadow_free(d, _mfn(l3e_get_pfn(l3e[0]))); sh_unmap_domain_page(l3e); shadow_free(d, m3mfn); if ( is_pv_32on64_vcpu(v) ) { /* Need to destroy the l3 and l2 monitor pages that map the * Xen VAs at 3GB-4GB */ ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT); m3mfn = _mfn(l4e_get_pfn(l4e[0])); l3e = sh_map_domain_page(m3mfn); ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); shadow_free(d, _mfn(l3e_get_pfn(l3e[3]))); sh_unmap_domain_page(l3e); shadow_free(d, m3mfn); } sh_unmap_domain_page(l4e); } #endif /* Put the memory back in the pool */ shadow_free(d, mmfn); } #endif /**************************************************************************/ /* Functions to destroy non-Xen mappings in a pagetable hierarchy. * These are called from common code when we are running out of shadow * memory, and unpinning all the top-level shadows hasn't worked. * * With user_only == 1, we leave guest kernel-mode mappings in place too, * unhooking only the user-mode mappings * * This implementation is pretty crude and slow, but we hope that it won't * be called very often. */ #if GUEST_PAGING_LEVELS == 2 void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn, int user_only) { shadow_l2e_t *sl2e; SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, { if ( !user_only || (sl2e->l2 & _PAGE_USER) ) (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); }); } #elif GUEST_PAGING_LEVELS == 3 void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl2mfn, int user_only) /* Walk a PAE l2 shadow, unhooking entries from all the subshadows */ { shadow_l2e_t *sl2e; SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, v->domain, { if ( !user_only || (sl2e->l2 & _PAGE_USER) ) (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); }); } #elif GUEST_PAGING_LEVELS == 4 void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn, int user_only) { shadow_l4e_t *sl4e; SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, v->domain, { if ( !user_only || (sl4e->l4 & _PAGE_USER) ) (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn); }); } #endif /**************************************************************************/ /* Internal translation functions. * These functions require a pointer to the shadow entry that will be updated. */ /* These functions take a new guest entry, translate it to shadow and write * the shadow entry. * * They return the same bitmaps as the shadow_set_lXe() functions. 
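 * (That is, some combination of the SHADOW_SET_CHANGED, SHADOW_SET_FLUSH
 * and SHADOW_SET_ERROR bits.)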
*/ #if GUEST_PAGING_LEVELS >= 4 static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se) { shadow_l4e_t new_sl4e; guest_l4e_t new_gl4e = *(guest_l4e_t *)new_ge; shadow_l4e_t *sl4p = se; mfn_t sl3mfn = _mfn(INVALID_MFN); struct domain *d = v->domain; p2m_type_t p2mt; int result = 0; perfc_incr(shadow_validate_gl4e_calls); if ( guest_l4e_get_flags(new_gl4e) & _PAGE_PRESENT ) { gfn_t gl3gfn = guest_l4e_get_gfn(new_gl4e); mfn_t gl3mfn = get_gfn_query_unlocked(d, gfn_x(gl3gfn), &p2mt); if ( p2m_is_ram(p2mt) ) sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow); else if ( p2mt != p2m_populate_on_demand ) result |= SHADOW_SET_ERROR; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) if ( mfn_valid(sl3mfn) ) shadow_resync_all(v); #endif } l4e_propagate_from_guest(v, new_gl4e, sl3mfn, &new_sl4e, ft_prefetch); // check for updates to xen reserved slots if ( !shadow_mode_external(d) ) { int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) / sizeof(shadow_l4e_t)); int reserved_xen_slot = !is_guest_l4_slot(d, shadow_index); if ( unlikely(reserved_xen_slot) ) { // attempt by the guest to write to a xen reserved slot // SHADOW_PRINTK("%s out-of-range update " "sl4mfn=%05lx index=%#x val=%" SH_PRI_pte "\n", __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4); if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) { SHADOW_ERROR("out-of-range l4e update\n"); result |= SHADOW_SET_ERROR; } // do not call shadow_set_l4e... return result; } } result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn); return result; } static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se) { shadow_l3e_t new_sl3e; guest_l3e_t new_gl3e = *(guest_l3e_t *)new_ge; shadow_l3e_t *sl3p = se; mfn_t sl2mfn = _mfn(INVALID_MFN); p2m_type_t p2mt; int result = 0; perfc_incr(shadow_validate_gl3e_calls); if ( guest_l3e_get_flags(new_gl3e) & _PAGE_PRESENT ) { gfn_t gl2gfn = guest_l3e_get_gfn(new_gl3e); mfn_t gl2mfn = get_gfn_query_unlocked(v->domain, gfn_x(gl2gfn), &p2mt); if ( p2m_is_ram(p2mt) ) sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow); else if ( p2mt != p2m_populate_on_demand ) result |= SHADOW_SET_ERROR; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) if ( mfn_valid(sl2mfn) ) shadow_resync_all(v); #endif } l3e_propagate_from_guest(v, new_gl3e, sl2mfn, &new_sl3e, ft_prefetch); result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn); return result; } #endif // GUEST_PAGING_LEVELS >= 4 static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se) { shadow_l2e_t new_sl2e; guest_l2e_t new_gl2e = *(guest_l2e_t *)new_ge; shadow_l2e_t *sl2p = se; mfn_t sl1mfn = _mfn(INVALID_MFN); p2m_type_t p2mt; int result = 0; perfc_incr(shadow_validate_gl2e_calls); if ( guest_l2e_get_flags(new_gl2e) & _PAGE_PRESENT ) { gfn_t gl1gfn = guest_l2e_get_gfn(new_gl2e); if ( guest_supports_superpages(v) && (guest_l2e_get_flags(new_gl2e) & _PAGE_PSE) ) { // superpage -- need to look up the shadow L1 which holds the // splitters... sl1mfn = get_fl1_shadow_status(v, gl1gfn); #if 0 // XXX - it's possible that we want to do some kind of prefetch // for superpage fl1's here, but this is *not* on the demand path, // so we'll hold off trying that for now... 
// if ( !mfn_valid(sl1mfn) ) sl1mfn = make_fl1_shadow(v, gl1gfn); #endif } else { mfn_t gl1mfn = get_gfn_query_unlocked(v->domain, gfn_x(gl1gfn), &p2mt); if ( p2m_is_ram(p2mt) ) sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow); else if ( p2mt != p2m_populate_on_demand ) result |= SHADOW_SET_ERROR; } } l2e_propagate_from_guest(v, new_gl2e, sl1mfn, &new_sl2e, ft_prefetch); result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn); return result; } static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se) { shadow_l1e_t new_sl1e; guest_l1e_t new_gl1e = *(guest_l1e_t *)new_ge; shadow_l1e_t *sl1p = se; gfn_t gfn; mfn_t gmfn; p2m_type_t p2mt; int result = 0; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) mfn_t gl1mfn; #endif /* OOS */ perfc_incr(shadow_validate_gl1e_calls); gfn = guest_l1e_get_gfn(new_gl1e); gmfn = get_gfn_query_unlocked(v->domain, gfn_x(gfn), &p2mt); l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt); result |= shadow_set_l1e(v, sl1p, new_sl1e, p2mt, sl1mfn); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) gl1mfn = backpointer(mfn_to_page(sl1mfn)); if ( mfn_valid(gl1mfn) && mfn_is_out_of_sync(gl1mfn) ) { /* Update the OOS snapshot. */ mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn); guest_l1e_t *snp; ASSERT(mfn_valid(snpmfn)); snp = sh_map_domain_page(snpmfn); snp[guest_index(new_ge)] = new_gl1e; sh_unmap_domain_page(snp); } #endif /* OOS */ return result; } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /**************************************************************************/ /* Special validation function for re-syncing out-of-sync shadows. * Walks the *shadow* page, and for every entry that it finds, * revalidates the guest entry that corresponds to it. * N.B. This function is called with the vcpu that unsynced the page, * *not* the one that is causing it to be resynced. */ void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn) { mfn_t sl1mfn; shadow_l1e_t *sl1p; guest_l1e_t *gl1p, *gp, *snp; int rc = 0; ASSERT(mfn_valid(snpmfn)); sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow); ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */ snp = sh_map_domain_page(snpmfn); gp = sh_map_domain_page(gl1mfn); gl1p = gp; SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, { guest_l1e_t gl1e = *gl1p; guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p); if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) ) { gfn_t gfn; mfn_t gmfn; p2m_type_t p2mt; shadow_l1e_t nsl1e; gfn = guest_l1e_get_gfn(gl1e); gmfn = get_gfn_query_unlocked(v->domain, gfn_x(gfn), &p2mt); l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt); rc |= shadow_set_l1e(v, sl1p, nsl1e, p2mt, sl1mfn); *snpl1p = gl1e; } }); sh_unmap_domain_page(gp); sh_unmap_domain_page(snp); /* Setting shadow L1 entries should never need us to flush the TLB */ ASSERT(!(rc & SHADOW_SET_FLUSH)); } /* Figure out whether it's definitely safe not to sync this l1 table. * That is: if we can tell that it's only used once, and that the * toplevel shadow responsible is not one of ours. * N.B. This function is called with the vcpu that required the resync, * *not* the one that originally unsynced the page, but it is * called in the *mode* of the vcpu that unsynced it. Clear? Good. 
*/ int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn) { struct page_info *sp; mfn_t smfn; if ( !sh_type_has_up_pointer(v, SH_type_l1_shadow) ) return 0; smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow); ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */ /* Up to l2 */ sp = mfn_to_page(smfn); if ( sp->u.sh.count != 1 || !sp->up ) return 0; smfn = _mfn(sp->up >> PAGE_SHIFT); ASSERT(mfn_valid(smfn)); #if (SHADOW_PAGING_LEVELS == 4) /* up to l3 */ sp = mfn_to_page(smfn); ASSERT(sh_type_has_up_pointer(v, SH_type_l2_shadow)); if ( sp->u.sh.count != 1 || !sp->up ) return 0; smfn = _mfn(sp->up >> PAGE_SHIFT); ASSERT(mfn_valid(smfn)); /* up to l4 */ sp = mfn_to_page(smfn); if ( sp->u.sh.count != 1 || !sh_type_has_up_pointer(v, SH_type_l3_64_shadow) || !sp->up ) return 0; smfn = _mfn(sp->up >> PAGE_SHIFT); ASSERT(mfn_valid(smfn)); #endif if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn) #if (SHADOW_PAGING_LEVELS == 3) || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn) || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn) || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn) #endif ) return 0; /* Only in use in one toplevel shadow, and it's not the one we're * running on */ return 1; } #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ /**************************************************************************/ /* Functions which translate and install the shadows of arbitrary guest * entries that we have just seen the guest write. */ static inline int sh_map_and_validate(struct vcpu *v, mfn_t gmfn, void *new_gp, u32 size, u32 sh_type, u32 (*shadow_index)(mfn_t *smfn, u32 idx), int (*validate_ge)(struct vcpu *v, void *ge, mfn_t smfn, void *se)) /* Generic function for mapping and validating. */ { mfn_t smfn, smfn2, map_mfn; shadow_l1e_t *sl1p; u32 shadow_idx, guest_idx; int result = 0; /* Align address and size to guest entry boundaries */ size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1); new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1)); size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1); ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE); /* Map the shadow page */ smfn = get_shadow_status(v, gmfn, sh_type); ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */ guest_idx = guest_index(new_gp); map_mfn = smfn; shadow_idx = shadow_index(&map_mfn, guest_idx); sl1p = sh_map_domain_page(map_mfn); /* Validate one entry at a time */ while ( size ) { smfn2 = smfn; guest_idx = guest_index(new_gp); shadow_idx = shadow_index(&smfn2, guest_idx); if ( mfn_x(smfn2) != mfn_x(map_mfn) ) { /* We have moved to another page of the shadow */ map_mfn = smfn2; sh_unmap_domain_page(sl1p); sl1p = sh_map_domain_page(map_mfn); } result |= validate_ge(v, new_gp, map_mfn, &sl1p[shadow_idx]); size -= sizeof(guest_l1e_t); new_gp += sizeof(guest_l1e_t); } sh_unmap_domain_page(sl1p); return result; } int sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size) { #if GUEST_PAGING_LEVELS >= 4 return sh_map_and_validate(v, gl4mfn, new_gl4p, size, SH_type_l4_shadow, shadow_l4_index, validate_gl4e); #else // ! GUEST_PAGING_LEVELS >= 4 SHADOW_ERROR("called in wrong paging mode!\n"); BUG(); return 0; #endif } int sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size) { #if GUEST_PAGING_LEVELS >= 4 return sh_map_and_validate(v, gl3mfn, new_gl3p, size, SH_type_l3_shadow, shadow_l3_index, validate_gl3e); #else // ! 
GUEST_PAGING_LEVELS >= 4 SHADOW_ERROR("called in wrong paging mode!\n"); BUG(); return 0; #endif } int sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size) { return sh_map_and_validate(v, gl2mfn, new_gl2p, size, SH_type_l2_shadow, shadow_l2_index, validate_gl2e); } int sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size) { #if GUEST_PAGING_LEVELS >= 3 return sh_map_and_validate(v, gl2mfn, new_gl2p, size, SH_type_l2h_shadow, shadow_l2_index, validate_gl2e); #else /* Non-PAE guests don't have different kinds of l2 table */ SHADOW_ERROR("called in wrong paging mode!\n"); BUG(); return 0; #endif } int sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size) { return sh_map_and_validate(v, gl1mfn, new_gl1p, size, SH_type_l1_shadow, shadow_l1_index, validate_gl1e); } /**************************************************************************/ /* Optimization: If we see two emulated writes of zeros to the same * page-table without another kind of page fault in between, we guess * that this is a batch of changes (for process destruction) and * unshadow the page so we don't take a pagefault on every entry. This * should also make finding writeable mappings of pagetables much * easier. */ /* Look to see if this is the second emulated write in a row to this * page, and unshadow if it is */ static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn) { #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW /* If the domain has never made a "dying" op, use the two-writes * heuristic; otherwise, unshadow as soon as we write a zero for a dying * process. * * Don't bother trying to unshadow if it's not a PT, or if it's > l1. */ if ( ( v->arch.paging.shadow.pagetable_dying || ( !v->domain->arch.paging.shadow.pagetable_dying_op && v->arch.paging.shadow.last_emulated_mfn_for_unshadow == mfn_x(gmfn) ) ) && sh_mfn_is_a_page_table(gmfn) && (!v->domain->arch.paging.shadow.pagetable_dying_op || !(mfn_to_page(gmfn)->shadow_flags & (SHF_L2_32|SHF_L2_PAE|SHF_L2H_PAE|SHF_L4_64))) ) { perfc_incr(shadow_early_unshadow); sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ ); TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW); } v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn); #endif } /* Stop counting towards early unshadows, as we've seen a real page fault */ static inline void reset_early_unshadow(struct vcpu *v) { #if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW v->arch.paging.shadow.last_emulated_mfn_for_unshadow = INVALID_MFN; #endif } /**************************************************************************/ /* Optimization: Prefetch multiple L1 entries. This is called after we have * demand-faulted a shadow l1e in the fault handler, to see if it's * worth fetching some more. 
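 * (The prefetch never crosses the end of the shadow l1 page, and is
 * capped at PREFETCH_DISTANCE entries per fault.)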
*/ #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH /* XXX magic number */ #define PREFETCH_DISTANCE 32 static void sh_prefetch(struct vcpu *v, walk_t *gw, shadow_l1e_t *ptr_sl1e, mfn_t sl1mfn) { int i, dist; gfn_t gfn; mfn_t gmfn; guest_l1e_t *gl1p = NULL, gl1e; shadow_l1e_t sl1e; u32 gflags; p2m_type_t p2mt; #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) guest_l1e_t *snpl1p = NULL; #endif /* OOS */ /* Prefetch no further than the end of the _shadow_ l1 MFN */ dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e; /* And no more than a maximum fetches-per-fault */ if ( dist > PREFETCH_DISTANCE ) dist = PREFETCH_DISTANCE; if ( mfn_valid(gw->l1mfn) ) { /* Normal guest page; grab the next guest entry */ gl1p = sh_map_domain_page(gw->l1mfn); gl1p += guest_l1_table_offset(gw->va); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( mfn_is_out_of_sync(gw->l1mfn) ) { mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn); ASSERT(mfn_valid(snpmfn)); snpl1p = sh_map_domain_page(snpmfn); snpl1p += guest_l1_table_offset(gw->va); } #endif /* OOS */ } for ( i = 1; i < dist ; i++ ) { /* No point in prefetching if there's already a shadow */ if ( ptr_sl1e[i].l1 != 0 ) break; if ( mfn_valid(gw->l1mfn) ) { /* Normal guest page; grab the next guest entry */ gl1e = gl1p[i]; /* Not worth continuing if we hit an entry that will need another * fault for A/D-bit propagation anyway */ gflags = guest_l1e_get_flags(gl1e); if ( (gflags & _PAGE_PRESENT) && (!(gflags & _PAGE_ACCESSED) || ((gflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY))) ) break; } else { /* Fragmented superpage, unless we've been called wrongly */ ASSERT(guest_l2e_get_flags(gw->l2e) & _PAGE_PSE); /* Increment the l1e's GFN by the right number of guest pages */ gl1e = guest_l1e_from_gfn( _gfn(gfn_x(guest_l1e_get_gfn(gw->l1e)) + i), guest_l1e_get_flags(gw->l1e)); } /* Look at the gfn that the l1e is pointing at */ gfn = guest_l1e_get_gfn(gl1e); gmfn = get_gfn_query_unlocked(v->domain, gfn_x(gfn), &p2mt); /* Propagate the entry. */ l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt); (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, p2mt, sl1mfn); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( snpl1p != NULL ) snpl1p[i] = gl1e; #endif /* OOS */ } if ( gl1p != NULL ) sh_unmap_domain_page(gl1p); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( snpl1p != NULL ) sh_unmap_domain_page(snpl1p); #endif /* OOS */ } #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */ #if GUEST_PAGING_LEVELS == 4 typedef u64 guest_va_t; typedef u64 guest_pa_t; #elif GUEST_PAGING_LEVELS == 3 typedef u32 guest_va_t; typedef u64 guest_pa_t; #else typedef u32 guest_va_t; typedef u32 guest_pa_t; #endif static inline void trace_shadow_gen(u32 event, guest_va_t va) { if ( tb_init_done ) { event |= (GUEST_PAGING_LEVELS-2)<<8; __trace_var(event, 0/*!tsc*/, sizeof(va), &va); } } static inline void trace_shadow_fixup(guest_l1e_t gl1e, guest_va_t va) { if ( tb_init_done ) { struct { /* for PAE, guest_l1e may be 64 while guest_va may be 32; so put it first for alignment sake. */ guest_l1e_t gl1e; guest_va_t va; u32 flags; } __attribute__((packed)) d; u32 event; event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8); d.gl1e = gl1e; d.va = va; d.flags = this_cpu(trace_shadow_path_flags); __trace_var(event, 0/*!tsc*/, sizeof(d), &d); } } static inline void trace_not_shadow_fault(guest_l1e_t gl1e, guest_va_t va) { if ( tb_init_done ) { struct { /* for PAE, guest_l1e may be 64 while guest_va may be 32; so put it first for alignment sake. 
*/ guest_l1e_t gl1e; guest_va_t va; u32 flags; } __attribute__((packed)) d; u32 event; event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8); d.gl1e = gl1e; d.va = va; d.flags = this_cpu(trace_shadow_path_flags); __trace_var(event, 0/*!tsc*/, sizeof(d), &d); } } static inline void trace_shadow_emulate_other(u32 event, guest_va_t va, gfn_t gfn) { if ( tb_init_done ) { struct { /* for PAE, guest_l1e may be 64 while guest_va may be 32; so put it first for alignment sake. */ #if GUEST_PAGING_LEVELS == 2 u32 gfn; #else u64 gfn; #endif guest_va_t va; } __attribute__((packed)) d; event |= ((GUEST_PAGING_LEVELS-2)<<8); d.gfn=gfn_x(gfn); d.va = va; __trace_var(event, 0/*!tsc*/, sizeof(d), &d); } } #if GUEST_PAGING_LEVELS == 3 static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va); static DEFINE_PER_CPU(int,trace_extra_emulation_count); #endif static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val); static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va) { if ( tb_init_done ) { struct { /* for PAE, guest_l1e may be 64 while guest_va may be 32; so put it first for alignment sake. */ guest_l1e_t gl1e, write_val; guest_va_t va; unsigned flags:29, emulation_count:3; } __attribute__((packed)) d; u32 event; event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8); d.gl1e = gl1e; d.write_val.l1 = this_cpu(trace_emulate_write_val); d.va = va; #if GUEST_PAGING_LEVELS == 3 d.emulation_count = this_cpu(trace_extra_emulation_count); #endif d.flags = this_cpu(trace_shadow_path_flags); __trace_var(event, 0/*!tsc*/, sizeof(d), &d); } } /**************************************************************************/ /* Entry points into the shadow code */ /* Called from pagefault handler in Xen, and from the HVM trap handlers * for pagefaults. Returns 1 if this fault was an artefact of the * shadow code (and the guest should retry) or 0 if it is not (and the * fault should be handled elsewhere or passed to the guest). */ static int sh_page_fault(struct vcpu *v, unsigned long va, struct cpu_user_regs *regs) { struct domain *d = v->domain; walk_t gw; gfn_t gfn = _gfn(0); mfn_t gmfn, sl1mfn = _mfn(0); shadow_l1e_t sl1e, *ptr_sl1e; paddr_t gpa; struct sh_emulate_ctxt emul_ctxt; const struct x86_emulate_ops *emul_ops; int r; fetch_type_t ft = 0; p2m_type_t p2mt; uint32_t rc; int version; #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION int fast_emul = 0; #endif SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n", v->domain->domain_id, v->vcpu_id, va, regs->error_code, regs->eip); perfc_incr(shadow_fault); #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION /* If faulting frame is successfully emulated in last shadow fault * it's highly likely to reach same emulation action for this frame. * Then try to emulate early to avoid lock aquisition. */ if ( v->arch.paging.last_write_emul_ok && v->arch.paging.shadow.last_emulated_frame == (va >> PAGE_SHIFT) ) { /* check whether error code is 3, or else fall back to normal path * in case of some validation is required */ if ( regs->error_code == (PFEC_write_access | PFEC_page_present) ) { fast_emul = 1; gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Fall back to the slow path if we're trying to emulate writes to an out of sync page. 
*/ if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) ) { fast_emul = 0; v->arch.paging.last_write_emul_ok = 0; goto page_fault_slow_path; } #endif /* OOS */ perfc_incr(shadow_fault_fast_emulate); goto early_emulation; } else v->arch.paging.last_write_emul_ok = 0; } #endif // // XXX: Need to think about eventually mapping superpages directly in the // shadow (when possible), as opposed to splintering them into a // bunch of 4K maps. // #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) if ( (regs->error_code & PFEC_reserved_bit) ) { #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* First, need to check that this isn't an out-of-sync * shadow l1e. If it is, we fall back to the slow path, which * will sync it up again. */ { shadow_l2e_t sl2e; mfn_t gl1mfn; if ( (__copy_from_user(&sl2e, (sh_linear_l2_table(v) + shadow_l2_linear_offset(va)), sizeof(sl2e)) != 0) || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) || !mfn_valid(gl1mfn = backpointer(mfn_to_page( shadow_l2e_get_mfn(sl2e)))) || unlikely(mfn_is_out_of_sync(gl1mfn)) ) { /* Hit the slow path as if there had been no * shadow entry at all, and let it tidy up */ ASSERT(regs->error_code & PFEC_page_present); regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present); goto page_fault_slow_path; } } #endif /* SHOPT_OUT_OF_SYNC */ /* The only reasons for reserved bits to be set in shadow entries * are the two "magic" shadow_l1e entries. */ if ( likely((__copy_from_user(&sl1e, (sh_linear_l1_table(v) + shadow_l1_linear_offset(va)), sizeof(sl1e)) == 0) && sh_l1e_is_magic(sl1e)) ) { if ( sh_l1e_is_gnp(sl1e) ) { /* Not-present in a guest PT: pass to the guest as * a not-present fault (by flipping two bits). */ ASSERT(regs->error_code & PFEC_page_present); regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present); reset_early_unshadow(v); perfc_incr(shadow_fault_fast_gnp); SHADOW_PRINTK("fast path not-present\n"); trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va); return 0; } else { /* Magic MMIO marker: extract gfn for MMIO address */ ASSERT(sh_l1e_is_mmio(sl1e)); gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e)))) << PAGE_SHIFT) | (va & ~PAGE_MASK); } perfc_incr(shadow_fault_fast_mmio); SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa); reset_early_unshadow(v); trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va); return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT) ? EXCRET_fault_fixed : 0); } else { /* This should be exceptionally rare: another vcpu has fixed * the tables between the fault and our reading the l1e. * Retry and let the hardware give us the right fault next time. */ perfc_incr(shadow_fault_fast_fail); SHADOW_PRINTK("fast path false alarm!\n"); trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va); return EXCRET_fault_fixed; } } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) page_fault_slow_path: #endif #endif /* SHOPT_FAST_FAULT_PATH */ /* Detect if this page fault happened while we were already in Xen * doing a shadow operation. If that happens, the only thing we can * do is let Xen's normal fault handlers try to fix it. In any case, * a diagnostic trace of the fault will be more useful than * a BUG() when we try to take the lock again. */ if ( unlikely(paging_locked_by_me(d)) ) { SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n", d->arch.paging.lock.locker_function); return 0; } rewalk: /* The walk is done in a lock-free style, with some sanity check * postponed after grabbing paging lock later. Those delayed checks * will make sure no inconsistent mapping being translated into * shadow page table. 
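 * In particular, shadow_check_gwalk() revalidates the walk against
 * gtable_dirty_version once the paging lock is held, and we go back to
 * the rewalk label if it fails.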
*/ version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version); rmb(); rc = sh_walk_guest_tables(v, va, &gw, regs->error_code); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) regs->error_code &= ~PFEC_page_present; if ( !(rc & _PAGE_PRESENT) ) regs->error_code |= PFEC_page_present; #endif if ( rc != 0 ) { perfc_incr(shadow_fault_bail_real_fault); SHADOW_PRINTK("not a shadow fault\n"); reset_early_unshadow(v); if ( (rc & _PAGE_INVALID_BITS) ) regs->error_code |= PFEC_reserved_bit; goto propagate; } /* It's possible that the guest has put pagetables in memory that it has * already used for some special purpose (ioreq pages, or granted pages). * If that happens we'll have killed the guest already but it's still not * safe to propagate entries out of the guest PT so get out now. */ if ( unlikely(d->is_shutting_down && d->shutdown_code == SHUTDOWN_crash) ) { SHADOW_PRINTK("guest is shutting down\n"); goto propagate; } /* What kind of access are we dealing with? */ ft = ((regs->error_code & PFEC_write_access) ? ft_demand_write : ft_demand_read); /* What mfn is the guest trying to access? */ gfn = guest_l1e_get_gfn(gw.l1e); gmfn = get_gfn(d, gfn, &p2mt); if ( shadow_mode_refcounts(d) && ((!p2m_is_valid(p2mt) && !p2m_is_grant(p2mt)) || (!p2m_is_mmio(p2mt) && !mfn_valid(gmfn))) ) { perfc_incr(shadow_fault_bail_bad_gfn); SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n", gfn_x(gfn), mfn_x(gmfn)); reset_early_unshadow(v); put_gfn(d, gfn_x(gfn)); goto propagate; } #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* Remember this successful VA->GFN translation for later. */ vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), regs->error_code | PFEC_page_present); #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ paging_lock(d); TRACE_CLEAR_PATH_FLAGS; /* Make sure there is enough free shadow memory to build a chain of * shadow tables. (We never allocate a top-level shadow on this path, * only a 32b l1, pae l1, or 64b l3+2+1. Note that while * SH_type_l1_shadow isn't correct in the latter case, all page * tables are the same size there.) * * Preallocate shadow pages *before* removing writable accesses * otherwhise an OOS L1 might be demoted and promoted again with * writable mappings. */ shadow_prealloc(d, SH_type_l1_shadow, GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1); rc = gw_remove_write_accesses(v, va, &gw); /* First bit set: Removed write access to a page. */ if ( rc & GW_RMWR_FLUSHTLB ) { /* Write permission removal is also a hint that other gwalks * overlapping with this one may be inconsistent */ perfc_incr(shadow_rm_write_flush_tlb); atomic_inc(&d->arch.paging.shadow.gtable_dirty_version); flush_tlb_mask(d->domain_dirty_cpumask); } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Second bit set: Resynced a page. Re-walk needed. */ if ( rc & GW_RMWR_REWALK ) { paging_unlock(d); put_gfn(d, gfn_x(gfn)); goto rewalk; } #endif /* OOS */ if ( !shadow_check_gwalk(v, va, &gw, version) ) { perfc_incr(shadow_inconsistent_gwalk); paging_unlock(d); put_gfn(d, gfn_x(gfn)); goto rewalk; } shadow_audit_tables(v); sh_audit_gw(v, &gw); /* Acquire the shadow. This must happen before we figure out the rights * for the shadow entry, since we might promote a page here. */ ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft); if ( unlikely(ptr_sl1e == NULL) ) { /* Couldn't get the sl1e! Since we know the guest entries * are OK, this can only have been caused by a failed * shadow_set_l*e(), which will have crashed the guest. * Get out of the fault handler immediately. 
*/ /* Windows 7 apparently relies on the hardware to do something * it explicitly hasn't promised to do: load l3 values after * the cr3 is loaded. * In any case, in the PAE case, the ASSERT is not true; it can * happen because of actions the guest is taking. */ #if GUEST_PAGING_LEVELS == 3 v->arch.paging.mode->update_cr3(v, 0); #else ASSERT(d->is_shutting_down); #endif paging_unlock(d); put_gfn(d, gfn_x(gfn)); trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va); return 0; } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Always unsync when writing to L1 page tables. */ if ( sh_mfn_is_a_page_table(gmfn) && ft == ft_demand_write ) sh_unsync(v, gmfn); if ( unlikely(d->is_shutting_down && d->shutdown_code == SHUTDOWN_crash) ) { /* We might end up with a crashed domain here if * sh_remove_shadows() in a previous sh_resync() call has * failed. We cannot safely continue since some page is still * OOS but not in the hash table anymore. */ paging_unlock(d); put_gfn(d, gfn_x(gfn)); return 0; } /* Final check: if someone has synced a page, it's possible that * our l1e is stale. Compare the entries, and rewalk if necessary. */ if ( shadow_check_gl1e(v, &gw) ) { perfc_incr(shadow_inconsistent_gwalk); paging_unlock(d); put_gfn(d, gfn_x(gfn)); goto rewalk; } #endif /* OOS */ /* Calculate the shadow entry and write it */ l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt); r = shadow_set_l1e(v, ptr_sl1e, sl1e, p2mt, sl1mfn); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) if ( mfn_valid(gw.l1mfn) && mfn_is_out_of_sync(gw.l1mfn) ) { /* Update the OOS snapshot. */ mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn); guest_l1e_t *snp; ASSERT(mfn_valid(snpmfn)); snp = sh_map_domain_page(snpmfn); snp[guest_l1_table_offset(va)] = gw.l1e; sh_unmap_domain_page(snp); } #endif /* OOS */ #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH /* Prefetch some more shadow entries */ sh_prefetch(v, &gw, ptr_sl1e, sl1mfn); #endif /* Need to emulate accesses to page tables */ if ( sh_mfn_is_a_page_table(gmfn) #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Unless they've been allowed to go out of sync with their shadows and we don't need to unshadow it. */ && !(mfn_is_out_of_sync(gmfn) && !(regs->error_code & PFEC_user_mode)) #endif ) { if ( ft == ft_demand_write ) { perfc_incr(shadow_fault_emulate_write); goto emulate; } else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read ) { perfc_incr(shadow_fault_emulate_read); goto emulate; } } /* Need to hand off device-model MMIO to the device model */ if ( p2mt == p2m_mmio_dm ) { gpa = guest_walk_to_gpa(&gw); goto mmio; } /* Ignore attempts to write to read-only memory. */ if ( p2m_is_readonly(p2mt) && (ft == ft_demand_write) ) { static unsigned long lastpage; if ( xchg(&lastpage, va & PAGE_MASK) != (va & PAGE_MASK) ) gdprintk(XENLOG_DEBUG, "guest attempted write to read-only memory" " page. va page=%#lx, mfn=%#lx\n", va & PAGE_MASK, mfn_x(gmfn)); goto emulate_readonly; /* skip over the instruction */ } /* In HVM guests, we force CR0.WP always to be set, so that the * pagetables are always write-protected. If the guest thinks * CR0.WP is clear, we must emulate faulting supervisor writes to * allow the guest to write through read-only PTEs. Emulate if the * fault was a non-user write to a present page. 
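 * (This matches the error-code test just below: PFEC_write_access and
 * PFEC_page_present set, with the user-mode bit clear.)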
*/ if ( is_hvm_domain(d) && unlikely(!hvm_wp_enabled(v)) && regs->error_code == (PFEC_write_access|PFEC_page_present) && mfn_valid(gmfn) ) { perfc_incr(shadow_fault_emulate_wp); goto emulate; } perfc_incr(shadow_fault_fixed); d->arch.paging.log_dirty.fault_count++; reset_early_unshadow(v); trace_shadow_fixup(gw.l1e, va); done: sh_audit_gw(v, &gw); SHADOW_PRINTK("fixed\n"); shadow_audit_tables(v); paging_unlock(d); put_gfn(d, gfn_x(gfn)); return EXCRET_fault_fixed; emulate: if ( !shadow_mode_refcounts(d) || !guest_mode(regs) ) goto not_a_shadow_fault; /* * We do not emulate user writes. Instead we use them as a hint that the * page is no longer a page table. This behaviour differs from native, but * it seems very unlikely that any OS grants user access to page tables. */ if ( (regs->error_code & PFEC_user_mode) ) { SHADOW_PRINTK("user-mode fault to PT, unshadowing mfn %#lx\n", mfn_x(gmfn)); perfc_incr(shadow_fault_emulate_failed); sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */); trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER, va, gfn); goto done; } /* * Write from userspace to ro-mem needs to jump here to avoid getting * caught by user-mode page-table check above. */ emulate_readonly: /* Unshadow if we are writing to a toplevel pagetable that is * flagged as a dying process, and that is not currently used. */ if ( sh_mfn_is_a_page_table(gmfn) && (mfn_to_page(gmfn)->shadow_flags & SHF_pagetable_dying) ) { int used = 0; struct vcpu *tmp; for_each_vcpu(d, tmp) { #if GUEST_PAGING_LEVELS == 3 int i; for ( i = 0; i < 4; i++ ) { mfn_t smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[i])); if ( mfn_valid(smfn) && (mfn_x(smfn) != 0) ) { used |= (mfn_to_page(smfn)->v.sh.back == mfn_x(gmfn)); if ( used ) break; } } #else /* 32 or 64 */ used = (mfn_x(pagetable_get_mfn(tmp->arch.guest_table)) == mfn_x(gmfn)); #endif if ( used ) break; } if ( !used ) sh_remove_shadows(v, gmfn, 1 /* fast */, 0 /* can fail */); } /* * We don't need to hold the lock for the whole emulation; we will * take it again when we write to the pagetables. */ sh_audit_gw(v, &gw); shadow_audit_tables(v); paging_unlock(d); put_gfn(d, gfn_x(gfn)); this_cpu(trace_emulate_write_val) = 0; #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION early_emulation: #endif if ( is_hvm_domain(d) ) { /* * If we are in the middle of injecting an exception or interrupt then * we should not emulate: it is not the instruction at %eip that caused * the fault. Furthermore it is almost certainly the case the handler * stack is currently considered to be a page table, so we should * unshadow the faulting page before exiting. */ if ( unlikely(hvm_event_pending(v)) ) { #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION if ( fast_emul ) { perfc_incr(shadow_fault_fast_emulate_fail); v->arch.paging.last_write_emul_ok = 0; } #endif gdprintk(XENLOG_DEBUG, "write to pagetable during event " "injection: cr2=%#lx, mfn=%#lx\n", va, mfn_x(gmfn)); sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */); trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ, va, gfn); return EXCRET_fault_fixed; } } SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n", (unsigned long)regs->eip, (unsigned long)regs->esp); emul_ops = shadow_init_emulation(&emul_ctxt, regs); r = x86_emulate(&emul_ctxt.ctxt, emul_ops); /* * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it * would be a good unshadow hint. If we *do* decide to unshadow-on-fault * then it must be 'failable': we cannot require the unshadow to succeed. 
*/ if ( r == X86EMUL_UNHANDLEABLE ) { perfc_incr(shadow_fault_emulate_failed); #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION if ( fast_emul ) { perfc_incr(shadow_fault_fast_emulate_fail); v->arch.paging.last_write_emul_ok = 0; } #endif SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n", mfn_x(gmfn)); /* If this is actually a page table, then we have a bug, and need * to support more operations in the emulator. More likely, * though, this is a hint that this page should not be shadowed. */ shadow_remove_all_shadows(v, gmfn); trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED, va, gfn); goto emulate_done; } #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION /* Record successfully emulated information as heuristics to next * fault on same frame for acceleration. But be careful to verify * its attribute still as page table, or else unshadow triggered * in write emulation normally requires a re-sync with guest page * table to recover r/w permission. Incorrect record for such case * will cause unexpected more shadow faults due to propagation is * skipped. */ if ( (r == X86EMUL_OKAY) && sh_mfn_is_a_page_table(gmfn) ) { if ( !fast_emul ) { v->arch.paging.shadow.last_emulated_frame = va >> PAGE_SHIFT; v->arch.paging.shadow.last_emulated_mfn = mfn_x(gmfn); v->arch.paging.last_write_emul_ok = 1; } } else if ( fast_emul ) v->arch.paging.last_write_emul_ok = 0; #endif #if GUEST_PAGING_LEVELS == 3 /* PAE guest */ if ( r == X86EMUL_OKAY ) { int i, emulation_count=0; this_cpu(trace_emulate_initial_va) = va; /* Emulate up to four extra instructions in the hope of catching * the "second half" of a 64-bit pagetable write. */ for ( i = 0 ; i < 4 ; i++ ) { shadow_continue_emulation(&emul_ctxt, regs); v->arch.paging.last_write_was_pt = 0; r = x86_emulate(&emul_ctxt.ctxt, emul_ops); if ( r == X86EMUL_OKAY ) { emulation_count++; if ( v->arch.paging.last_write_was_pt ) { perfc_incr(shadow_em_ex_pt); TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN); break; /* Don't emulate past the other half of the write */ } else perfc_incr(shadow_em_ex_non_pt); } else { perfc_incr(shadow_em_ex_fail); TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED); break; /* Don't emulate again if we failed! */ } } this_cpu(trace_extra_emulation_count)=emulation_count; } #endif /* PAE guest */ trace_shadow_emulate(gw.l1e, va); emulate_done: SHADOW_PRINTK("emulated\n"); return EXCRET_fault_fixed; mmio: if ( !guest_mode(regs) ) goto not_a_shadow_fault; perfc_incr(shadow_fault_mmio); sh_audit_gw(v, &gw); SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa); shadow_audit_tables(v); reset_early_unshadow(v); paging_unlock(d); put_gfn(d, gfn_x(gfn)); trace_shadow_gen(TRC_SHADOW_MMIO, va); return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT) ? EXCRET_fault_fixed : 0); not_a_shadow_fault: sh_audit_gw(v, &gw); SHADOW_PRINTK("not a shadow fault\n"); shadow_audit_tables(v); reset_early_unshadow(v); paging_unlock(d); put_gfn(d, gfn_x(gfn)); propagate: trace_not_shadow_fault(gw.l1e, va); return 0; } static int sh_invlpg(struct vcpu *v, unsigned long va) /* Called when the guest requests an invlpg. Returns 1 if the invlpg * instruction should be issued on the hardware, or 0 if it's safe not * to do so. 
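 *
 * For illustration only (the real dispatch goes through the
 * paging_mode hooks installed at the bottom of this file), a caller
 * uses the return value along the lines of:
 *
 *     if ( paging_invlpg(v, va) )
 *         flush_tlb_one_local(va);    (or a VPID/ASID-based flush)
 *
 * i.e. the hardware invalidation is only issued when some shadow l1e
 * actually maps this address.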
*/ { mfn_t sl1mfn; shadow_l2e_t sl2e; perfc_incr(shadow_invlpg); #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* No longer safe to use cached gva->gfn translations */ vtlb_flush(v); #endif #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION v->arch.paging.last_write_emul_ok = 0; #endif /* First check that we can safely read the shadow l2e. SMP/PAE linux can * run as high as 6% of invlpg calls where we haven't shadowed the l2 * yet. */ #if SHADOW_PAGING_LEVELS == 4 { shadow_l3e_t sl3e; if ( !(shadow_l4e_get_flags( sh_linear_l4_table(v)[shadow_l4_linear_offset(va)]) & _PAGE_PRESENT) ) return 0; /* This must still be a copy-from-user because we don't have the * paging lock, and the higher-level shadows might disappear * under our feet. */ if ( __copy_from_user(&sl3e, (sh_linear_l3_table(v) + shadow_l3_linear_offset(va)), sizeof (sl3e)) != 0 ) { perfc_incr(shadow_invlpg_fault); return 0; } if ( !(shadow_l3e_get_flags(sl3e) & _PAGE_PRESENT) ) return 0; } #else /* SHADOW_PAGING_LEVELS == 3 */ if ( !(l3e_get_flags(v->arch.paging.shadow.l3table[shadow_l3_linear_offset(va)]) & _PAGE_PRESENT) ) // no need to flush anything if there's no SL2... return 0; #endif /* This must still be a copy-from-user because we don't have the shadow * lock, and the higher-level shadows might disappear under our feet. */ if ( __copy_from_user(&sl2e, sh_linear_l2_table(v) + shadow_l2_linear_offset(va), sizeof (sl2e)) != 0 ) { perfc_incr(shadow_invlpg_fault); return 0; } // If there's nothing shadowed for this particular sl2e, then // there is no need to do an invlpg, either... // if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) ) return 0; // Check to see if the SL2 is a splintered superpage... // If so, then we'll need to flush the entire TLB (because that's // easier than invalidating all of the individual 4K pages). // sl1mfn = shadow_l2e_get_mfn(sl2e); if ( mfn_to_page(sl1mfn)->u.sh.type == SH_type_fl1_shadow ) { flush_tlb_local(); return 0; } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Check to see if the SL1 is out of sync. */ { mfn_t gl1mfn = backpointer(mfn_to_page(sl1mfn)); struct page_info *pg = mfn_to_page(gl1mfn); if ( mfn_valid(gl1mfn) && page_is_out_of_sync(pg) ) { /* The test above may give false positives, since we don't * hold the paging lock yet. Check again with the lock held. */ paging_lock(v->domain); /* This must still be a copy-from-user because we didn't * have the paging lock last time we checked, and the * higher-level shadows might have disappeared under our * feet. */ if ( __copy_from_user(&sl2e, sh_linear_l2_table(v) + shadow_l2_linear_offset(va), sizeof (sl2e)) != 0 ) { perfc_incr(shadow_invlpg_fault); paging_unlock(v->domain); return 0; } if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) ) { paging_unlock(v->domain); return 0; } sl1mfn = shadow_l2e_get_mfn(sl2e); gl1mfn = backpointer(mfn_to_page(sl1mfn)); pg = mfn_to_page(gl1mfn); if ( likely(sh_mfn_is_a_page_table(gl1mfn) && page_is_out_of_sync(pg) ) ) { shadow_l1e_t *sl1; sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va); /* Remove the shadow entry that maps this VA */ (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), p2m_invalid, sl1mfn); } paging_unlock(v->domain); /* Need the invlpg, to pick up the disappeareance of the sl1e */ return 1; } } #endif return 1; } static unsigned long sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m, unsigned long va, uint32_t *pfec) /* Called to translate a guest virtual address to what the *guest* * pagetables would map it to. 
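 *
 * pfec[0] is used in both directions: on entry it describes the access
 * being checked (e.g. PFEC_write_access for a write probe), and on
 * failure it is rewritten into the fault code the caller should inject
 * (the present bit is cleared if the walk found a missing entry, and
 * PFEC_reserved_bit is set if the walk hit reserved bits).  INVALID_GFN
 * is returned when the guest pagetables do not map the address.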
*/ { walk_t gw; gfn_t gfn; uint32_t missing; #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* Check the vTLB cache first */ unsigned long vtlb_gfn = vtlb_lookup(v, va, pfec[0]); if ( VALID_GFN(vtlb_gfn) ) return vtlb_gfn; #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ if ( (missing = sh_walk_guest_tables(v, va, &gw, pfec[0])) != 0 ) { if ( (missing & _PAGE_PRESENT) ) pfec[0] &= ~PFEC_page_present; if ( missing & _PAGE_INVALID_BITS ) pfec[0] |= PFEC_reserved_bit; return INVALID_GFN; } gfn = guest_walk_to_gfn(&gw); #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* Remember this successful VA->GFN translation for later. */ vtlb_insert(v, va >> PAGE_SHIFT, gfn_x(gfn), pfec[0]); #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ return gfn_x(gfn); } static inline void sh_update_linear_entries(struct vcpu *v) /* Sync up all the linear mappings for this vcpu's pagetables */ { struct domain *d = v->domain; /* Linear pagetables in PV guests * ------------------------------ * * Guest linear pagetables, which map the guest pages, are at * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the * shadows, are at SH_LINEAR_PT_VIRT_START. Most of the time these * are set up at shadow creation time, but (of course!) the PAE case * is subtler. Normal linear mappings are made by having an entry * in the top-level table that points to itself (shadow linear) or * to the guest top-level table (guest linear). For PAE, to set up * a linear map requires us to copy the four top-level entries into * level-2 entries. That means that every time we change a PAE l3e, * we need to reflect the change into the copy. * * Linear pagetables in HVM guests * ------------------------------- * * For HVM guests, the linear pagetables are installed in the monitor * tables (since we can't put them in the shadow). Shadow linear * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START, * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for * a linear pagetable of the monitor tables themselves. We have * the same issue of having to re-copy PAE l3 entries whevever we use * PAE shadows. * * Because HVM guests run on the same monitor tables regardless of the * shadow tables in use, the linear mapping of the shadow tables has to * be updated every time v->arch.shadow_table changes. */ /* Don't try to update the monitor table if it doesn't exist */ if ( shadow_mode_external(d) && pagetable_get_pfn(v->arch.monitor_table) == 0 ) return; #if SHADOW_PAGING_LEVELS == 4 /* For PV, one l4e points at the guest l4, one points at the shadow * l4. No maintenance required. * For HVM, just need to update the l4e that points to the shadow l4. */ if ( shadow_mode_external(d) ) { /* Use the linear map if we can; otherwise make a new mapping */ if ( v == current ) { __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] = l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]), __PAGE_HYPERVISOR); } else { l4_pgentry_t *ml4e; ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table[0]), __PAGE_HYPERVISOR); sh_unmap_domain_page(ml4e); } } #elif SHADOW_PAGING_LEVELS == 3 /* PV: XXX * * HVM: To give ourselves a linear map of the shadows, we need to * extend a PAE shadow to 4 levels. We do this by having a monitor * l3 in slot 0 of the monitor l4 table, and copying the PAE l3 * entries into it. 
Then, by having the monitor l4e for shadow * pagetables also point to the monitor l4, we can use it to access * the shadows. */ if ( shadow_mode_external(d) ) { /* Install copies of the shadow l3es into the monitor l2 table * that maps SH_LINEAR_PT_VIRT_START. */ shadow_l3e_t *sl3e; l2_pgentry_t *ml2e; int i; /* Use linear mappings if we can; otherwise make new mappings */ if ( v == current ) ml2e = __linear_l2_table + l2_linear_offset(SH_LINEAR_PT_VIRT_START); else { mfn_t l3mfn, l2mfn; l4_pgentry_t *ml4e; l3_pgentry_t *ml3e; int linear_slot = shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START); ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); ASSERT(l4e_get_flags(ml4e[linear_slot]) & _PAGE_PRESENT); l3mfn = _mfn(l4e_get_pfn(ml4e[linear_slot])); ml3e = sh_map_domain_page(l3mfn); sh_unmap_domain_page(ml4e); ASSERT(l3e_get_flags(ml3e[0]) & _PAGE_PRESENT); l2mfn = _mfn(l3e_get_pfn(ml3e[0])); ml2e = sh_map_domain_page(l2mfn); sh_unmap_domain_page(ml3e); } /* Shadow l3 tables are made up by sh_update_cr3 */ sl3e = v->arch.paging.shadow.l3table; for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) { ml2e[i] = (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT) ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])), __PAGE_HYPERVISOR) : l2e_empty(); } if ( v != current ) sh_unmap_domain_page(ml2e); } else domain_crash(d); /* XXX */ #else #error this should not happen #endif if ( shadow_mode_external(d) ) { /* * Having modified the linear pagetable mapping, flush local host TLBs. * This was not needed when vmenter/vmexit always had the side effect * of flushing host TLBs but, with ASIDs, it is possible to finish * this CR3 update, vmenter the guest, vmexit due to a page fault, * without an intervening host TLB flush. Then the page fault code * could use the linear pagetable to read a top-level shadow page * table entry. But, without this change, it would fetch the wrong * value due to a stale TLB. */ flush_tlb_local(); } } /* Removes vcpu->arch.paging.shadow.guest_vtable and vcpu->arch.shadow_table[]. * Does all appropriate management/bookkeeping/refcounting/etc... */ static void sh_detach_old_tables(struct vcpu *v) { mfn_t smfn; int i = 0; //// //// vcpu->arch.paging.shadow.guest_vtable //// #if GUEST_PAGING_LEVELS == 3 /* PAE guests don't have a mapping of the guest top-level table */ ASSERT(v->arch.paging.shadow.guest_vtable == NULL); #else if ( v->arch.paging.shadow.guest_vtable ) { struct domain *d = v->domain; if ( shadow_mode_external(d) || shadow_mode_translate(d) ) sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable); v->arch.paging.shadow.guest_vtable = NULL; } #endif // !NDEBUG //// //// vcpu->arch.shadow_table[] //// #if GUEST_PAGING_LEVELS == 3 /* PAE guests have four shadow_table entries */ for ( i = 0 ; i < 4 ; i++ ) #endif { smfn = pagetable_get_mfn(v->arch.shadow_table[i]); if ( mfn_x(smfn) ) sh_put_ref(v, smfn, 0); v->arch.shadow_table[i] = pagetable_null(); } } /* Set up the top-level shadow and install it in slot 'slot' of shadow_table */ static void sh_set_toplevel_shadow(struct vcpu *v, int slot, mfn_t gmfn, unsigned int root_type) { mfn_t smfn; pagetable_t old_entry, new_entry; struct domain *d = v->domain; /* Remember the old contents of this slot */ old_entry = v->arch.shadow_table[slot]; /* Now figure out the new contents: is this a valid guest MFN? 
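 *
 * The flow below is: an invalid gmfn simply installs a null entry;
 * otherwise we look up (or create) the shadow of gmfn, pin it so it
 * survives memory pressure, take an extra ref for the shadow_table[]
 * slot, install it, and finally drop the ref held by the old slot
 * contents, re-pinning the old shadow first if shadow_prealloc()
 * unpinned it, since a PV vcpu may still be running on it.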
*/ if ( !mfn_valid(gmfn) ) { new_entry = pagetable_null(); goto install_new_entry; } /* Guest mfn is valid: shadow it and install the shadow */ smfn = get_shadow_status(v, gmfn, root_type); if ( !mfn_valid(smfn) ) { /* Make sure there's enough free shadow memory. */ shadow_prealloc(d, root_type, 1); /* Shadow the page. */ smfn = sh_make_shadow(v, gmfn, root_type); } ASSERT(mfn_valid(smfn)); /* Pin the shadow and put it (back) on the list of pinned shadows */ if ( sh_pin(v, smfn) == 0 ) { SHADOW_ERROR("can't pin %#lx as toplevel shadow\n", mfn_x(smfn)); domain_crash(v->domain); } /* Take a ref to this page: it will be released in sh_detach_old_tables() * or the next call to set_toplevel_shadow() */ if ( !sh_get_ref(v, smfn, 0) ) { SHADOW_ERROR("can't install %#lx as toplevel shadow\n", mfn_x(smfn)); domain_crash(v->domain); } new_entry = pagetable_from_mfn(smfn); install_new_entry: /* Done. Install it */ SHADOW_PRINTK("%u/%u [%u] gmfn %#"PRI_mfn" smfn %#"PRI_mfn"\n", GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, slot, mfn_x(gmfn), mfn_x(pagetable_get_mfn(new_entry))); v->arch.shadow_table[slot] = new_entry; /* Decrement the refcount of the old contents of this slot */ if ( !pagetable_is_null(old_entry) ) { mfn_t old_smfn = pagetable_get_mfn(old_entry); /* Need to repin the old toplevel shadow if it's been unpinned * by shadow_prealloc(): in PV mode we're still running on this * shadow and it's not safe to free it yet. */ if ( !mfn_to_page(old_smfn)->u.sh.pinned && !sh_pin(v, old_smfn) ) { SHADOW_ERROR("can't re-pin %#lx\n", mfn_x(old_smfn)); domain_crash(v->domain); } sh_put_ref(v, old_smfn, 0); } } static void sh_update_cr3(struct vcpu *v, int do_locking) /* Updates vcpu->arch.cr3 after the guest has changed CR3. * Paravirtual guests should set v->arch.guest_table (and guest_table_user, * if appropriate). * HVM guests should also make sure hvm_get_guest_cntl_reg(v, 3) works; * this function will call hvm_update_guest_cr(v, 3) to tell them where the * shadow tables are. * If do_locking != 0, assume we are being called from outside the * shadow code, and must take and release the paging lock; otherwise * that is the caller's responsibility. */ { struct domain *d = v->domain; mfn_t gmfn; #if GUEST_PAGING_LEVELS == 3 guest_l3e_t *gl3e; u32 guest_idx=0; int i; #endif /* Don't do anything on an uninitialised vcpu */ if ( is_pv_domain(d) && !v->is_initialised ) { ASSERT(v->arch.cr3 == 0); return; } if ( do_locking ) paging_lock(v->domain); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Need to resync all the shadow entries on a TLB flush. Resync * current vcpus OOS pages before switching to the new shadow * tables so that the VA hint is still valid. 
*/ shadow_resync_current_vcpu(v); #endif ASSERT(paging_locked_by_me(v->domain)); ASSERT(v->arch.paging.mode); //// //// vcpu->arch.guest_table is already set //// #ifndef NDEBUG /* Double-check that the HVM code has sent us a sane guest_table */ if ( is_hvm_domain(d) ) { ASSERT(shadow_mode_external(d)); if ( hvm_paging_enabled(v) ) ASSERT(pagetable_get_pfn(v->arch.guest_table)); else ASSERT(v->arch.guest_table.pfn == d->arch.paging.shadow.unpaged_pagetable.pfn); } #endif SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n", d->domain_id, v->vcpu_id, (unsigned long)pagetable_get_pfn(v->arch.guest_table)); #if GUEST_PAGING_LEVELS == 4 if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32on64_vcpu(v) ) gmfn = pagetable_get_mfn(v->arch.guest_table_user); else #endif gmfn = pagetable_get_mfn(v->arch.guest_table); //// //// vcpu->arch.paging.shadow.guest_vtable //// #if GUEST_PAGING_LEVELS == 4 if ( shadow_mode_external(d) || shadow_mode_translate(d) ) { if ( v->arch.paging.shadow.guest_vtable ) sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable); v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn); /* PAGING_LEVELS==4 implies 64-bit, which means that * map_domain_page_global can't fail */ BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); } else v->arch.paging.shadow.guest_vtable = __linear_l4_table; #elif GUEST_PAGING_LEVELS == 3 /* On PAE guests we don't use a mapping of the guest's own top-level * table. We cache the current state of that table and shadow that, * until the next CR3 write makes us refresh our cache. */ ASSERT(v->arch.paging.shadow.guest_vtable == NULL); if ( shadow_mode_external(d) ) /* Find where in the page the l3 table is */ guest_idx = guest_index((void *)v->arch.hvm_vcpu.guest_cr[3]); else /* PV guest: l3 is at the start of a page */ guest_idx = 0; // Ignore the low 2 bits of guest_idx -- they are really just // cache control. guest_idx &= ~3; gl3e = ((guest_l3e_t *)sh_map_domain_page(gmfn)) + guest_idx; for ( i = 0; i < 4 ; i++ ) v->arch.paging.shadow.gl3e[i] = gl3e[i]; sh_unmap_domain_page(gl3e); #elif GUEST_PAGING_LEVELS == 2 if ( shadow_mode_external(d) || shadow_mode_translate(d) ) { if ( v->arch.paging.shadow.guest_vtable ) sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable); v->arch.paging.shadow.guest_vtable = sh_map_domain_page_global(gmfn); /* Does this really need map_domain_page_global? Handle the * error properly if so. */ BUG_ON(v->arch.paging.shadow.guest_vtable == NULL); /* XXX */ } else v->arch.paging.shadow.guest_vtable = __linear_l2_table; #else #error this should never happen #endif //// //// vcpu->arch.shadow_table[] //// /* We revoke write access to the new guest toplevel page(s) before we * replace the old shadow pagetable(s), so that we can safely use the * (old) shadow linear maps in the writeable mapping heuristics. */ #if GUEST_PAGING_LEVELS == 2 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 ) flush_tlb_mask(d->domain_dirty_cpumask); sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow); #elif GUEST_PAGING_LEVELS == 3 /* PAE guests have four shadow_table entries, based on the * current values of the guest's four l3es. */ { int flush = 0; gfn_t gl2gfn; mfn_t gl2mfn; p2m_type_t p2mt; guest_l3e_t *gl3e = (guest_l3e_t*)&v->arch.paging.shadow.gl3e; /* First, make all four entries read-only. 
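 *
 * (Write access to each present l3e's l2 frame is revoked with
 * sh_remove_write_access() before any new shadow is installed, and the
 * TLB flush below makes that downgrade visible to the other vcpus;
 * only then are the four new top-level shadows put in place.)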
*/ for ( i = 0; i < 4; i++ ) { if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT ) { gl2gfn = guest_l3e_get_gfn(gl3e[i]); gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt); if ( p2m_is_ram(p2mt) ) flush |= sh_remove_write_access(v, gl2mfn, 2, 0); } } if ( flush ) flush_tlb_mask(d->domain_dirty_cpumask); /* Now install the new shadows. */ for ( i = 0; i < 4; i++ ) { if ( guest_l3e_get_flags(gl3e[i]) & _PAGE_PRESENT ) { gl2gfn = guest_l3e_get_gfn(gl3e[i]); gl2mfn = get_gfn_query_unlocked(d, gfn_x(gl2gfn), &p2mt); if ( p2m_is_ram(p2mt) ) sh_set_toplevel_shadow(v, i, gl2mfn, (i == 3) ? SH_type_l2h_shadow : SH_type_l2_shadow); else sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0); } else sh_set_toplevel_shadow(v, i, _mfn(INVALID_MFN), 0); } } #elif GUEST_PAGING_LEVELS == 4 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 ) flush_tlb_mask(d->domain_dirty_cpumask); sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow); #else #error This should never happen #endif /// /// v->arch.paging.shadow.l3table /// #if SHADOW_PAGING_LEVELS == 3 { mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table[0]); int i; for ( i = 0; i < 4; i++ ) { #if GUEST_PAGING_LEVELS == 2 /* 2-on-3: make a PAE l3 that points at the four-page l2 */ if ( i != 0 ) smfn = sh_next_page(smfn); #else /* 3-on-3: make a PAE l3 that points at the four l2 pages */ smfn = pagetable_get_mfn(v->arch.shadow_table[i]); #endif v->arch.paging.shadow.l3table[i] = (mfn_x(smfn) == 0) ? shadow_l3e_empty() : shadow_l3e_from_mfn(smfn, _PAGE_PRESENT); } } #endif /* SHADOW_PAGING_LEVELS == 3 */ /// /// v->arch.cr3 /// if ( shadow_mode_external(d) ) { make_cr3(v, pagetable_get_pfn(v->arch.monitor_table)); } else // not shadow_mode_external... { /* We don't support PV except guest == shadow == config levels */ BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS); #if SHADOW_PAGING_LEVELS == 3 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated. * Don't use make_cr3 because (a) we know it's below 4GB, and * (b) it's not necessarily page-aligned, and make_cr3 takes a pfn */ ASSERT(virt_to_maddr(&v->arch.paging.shadow.l3table) <= 0xffffffe0ULL); v->arch.cr3 = virt_to_maddr(&v->arch.paging.shadow.l3table); #else /* 4-on-4: Just use the shadow top-level directly */ make_cr3(v, pagetable_get_pfn(v->arch.shadow_table[0])); #endif } /// /// v->arch.hvm_vcpu.hw_cr[3] /// if ( shadow_mode_external(d) ) { ASSERT(is_hvm_domain(d)); #if SHADOW_PAGING_LEVELS == 3 /* 2-on-3 or 3-on-3: Use the PAE shadow l3 table we just fabricated */ v->arch.hvm_vcpu.hw_cr[3] = virt_to_maddr(&v->arch.paging.shadow.l3table); #else /* 4-on-4: Just use the shadow top-level directly */ v->arch.hvm_vcpu.hw_cr[3] = pagetable_get_paddr(v->arch.shadow_table[0]); #endif hvm_update_guest_cr(v, 3); } /* Fix up the linear pagetable mappings */ sh_update_linear_entries(v); #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) /* No longer safe to use cached gva->gfn translations */ vtlb_flush(v); #endif #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION v->arch.paging.last_write_emul_ok = 0; #endif #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Need to resync all the shadow entries on a TLB flush. We only * update the shadows, leaving the pages out of sync. Also, we try * to skip synchronization of shadows not mapped in the new * tables. 
*/ shadow_sync_other_vcpus(v); #endif /* Release the lock, if we took it (otherwise it's the caller's problem) */ if ( do_locking ) paging_unlock(v->domain); } /**************************************************************************/ /* Functions to revoke guest rights */ #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off) { int r; shadow_l1e_t *sl1p, sl1e; struct page_info *sp; ASSERT(mfn_valid(gmfn)); ASSERT(mfn_valid(smfn)); /* Remember if we've been told that this process is being torn down */ v->arch.paging.shadow.pagetable_dying = !!(mfn_to_page(gmfn)->shadow_flags & SHF_pagetable_dying); sp = mfn_to_page(smfn); if ( ((sp->count_info & PGC_count_mask) != 0) || (sp->u.sh.type != SH_type_l1_shadow && sp->u.sh.type != SH_type_fl1_shadow) ) goto fail; sl1p = sh_map_domain_page(smfn); sl1p += off; sl1e = *sl1p; if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW)) != (_PAGE_PRESENT|_PAGE_RW)) || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) ) { sh_unmap_domain_page(sl1p); goto fail; } /* Found it! Need to remove its write permissions. */ sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW); r = shadow_set_l1e(v, sl1p, sl1e, p2m_ram_rw, smfn); ASSERT( !(r & SHADOW_SET_ERROR) ); sh_unmap_domain_page(sl1p); perfc_incr(shadow_writeable_h_7); return 1; fail: perfc_incr(shadow_writeable_h_8); return 0; } #endif /* OOS */ #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn) /* Look up this vaddr in the current shadow and see if it's a writeable * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */ { shadow_l1e_t sl1e, *sl1p; shadow_l2e_t *sl2p; shadow_l3e_t *sl3p; #if SHADOW_PAGING_LEVELS >= 4 shadow_l4e_t *sl4p; #endif mfn_t sl1mfn; int r; /* Carefully look in the shadow linear map for the l1e we expect */ #if SHADOW_PAGING_LEVELS >= 4 sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr); if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) ) return 0; sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr); if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) return 0; #else /* SHADOW_PAGING_LEVELS == 3 */ sl3p = ((shadow_l3e_t *) v->arch.paging.shadow.l3table) + shadow_l3_linear_offset(vaddr); if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) return 0; #endif sl2p = sh_linear_l2_table(v) + shadow_l2_linear_offset(vaddr); if ( !(shadow_l2e_get_flags(*sl2p) & _PAGE_PRESENT) ) return 0; sl1p = sh_linear_l1_table(v) + shadow_l1_linear_offset(vaddr); sl1e = *sl1p; if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW)) != (_PAGE_PRESENT|_PAGE_RW)) || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) ) return 0; /* Found it! Need to remove its write permissions. */ sl1mfn = shadow_l2e_get_mfn(*sl2p); sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW); r = shadow_set_l1e(v, sl1p, sl1e, p2m_ram_rw, sl1mfn); if ( r & SHADOW_SET_ERROR ) { /* Can only currently happen if we found a grant-mapped * page. Just make the guess fail. 
*/ return 0; } TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); return 1; } #endif int sh_rm_write_access_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn) /* Excises all writeable mappings to readonly_mfn from this l1 shadow table */ { shadow_l1e_t *sl1e; int done = 0; int flags; #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC mfn_t base_sl1mfn = sl1mfn; /* Because sl1mfn changes in the foreach */ #endif SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, { flags = shadow_l1e_get_flags(*sl1e); if ( (flags & _PAGE_PRESENT) && (flags & _PAGE_RW) && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(readonly_mfn)) ) { shadow_l1e_t ro_sl1e = shadow_l1e_remove_flags(*sl1e, _PAGE_RW); (void) shadow_set_l1e(v, sl1e, ro_sl1e, p2m_ram_rw, sl1mfn); #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC /* Remember the last shadow that we shot a writeable mapping in */ v->arch.paging.shadow.last_writeable_pte_smfn = mfn_x(base_sl1mfn); #endif if ( (mfn_to_page(readonly_mfn)->u.inuse.type_info & PGT_count_mask) == 0 ) /* This breaks us cleanly out of the FOREACH macro */ done = 1; } }); return done; } int sh_rm_mappings_from_l1(struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn) /* Excises all mappings to guest frame from this shadow l1 table */ { shadow_l1e_t *sl1e; int done = 0; int flags; SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, { flags = shadow_l1e_get_flags(*sl1e); if ( (flags & _PAGE_PRESENT) && (mfn_x(shadow_l1e_get_mfn(*sl1e)) == mfn_x(target_mfn)) ) { (void) shadow_set_l1e(v, sl1e, shadow_l1e_empty(), p2m_invalid, sl1mfn); if ( sh_check_page_has_no_refs(mfn_to_page(target_mfn)) ) /* This breaks us cleanly out of the FOREACH macro */ done = 1; } }); return done; } /**************************************************************************/ /* Functions to excise all pointers to shadows from higher-level shadows. */ void sh_clear_shadow_entry(struct vcpu *v, void *ep, mfn_t smfn) /* Blank out a single shadow entry */ { switch ( mfn_to_page(smfn)->u.sh.type ) { case SH_type_l1_shadow: (void) shadow_set_l1e(v, ep, shadow_l1e_empty(), p2m_invalid, smfn); break; case SH_type_l2_shadow: #if GUEST_PAGING_LEVELS >= 3 case SH_type_l2h_shadow: #endif (void) shadow_set_l2e(v, ep, shadow_l2e_empty(), smfn); break; #if GUEST_PAGING_LEVELS >= 4 case SH_type_l3_shadow: (void) shadow_set_l3e(v, ep, shadow_l3e_empty(), smfn); break; case SH_type_l4_shadow: (void) shadow_set_l4e(v, ep, shadow_l4e_empty(), smfn); break; #endif default: BUG(); /* Called with the wrong kind of shadow. 
*/ } } int sh_remove_l1_shadow(struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn) /* Remove all mappings of this l1 shadow from this l2 shadow */ { shadow_l2e_t *sl2e; int done = 0; int flags; SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, done, v->domain, { flags = shadow_l2e_get_flags(*sl2e); if ( (flags & _PAGE_PRESENT) && (mfn_x(shadow_l2e_get_mfn(*sl2e)) == mfn_x(sl1mfn)) ) { (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); if ( mfn_to_page(sl1mfn)->u.sh.type == 0 ) /* This breaks us cleanly out of the FOREACH macro */ done = 1; } }); return done; } #if GUEST_PAGING_LEVELS >= 4 int sh_remove_l2_shadow(struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn) /* Remove all mappings of this l2 shadow from this l3 shadow */ { shadow_l3e_t *sl3e; int done = 0; int flags; SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, done, { flags = shadow_l3e_get_flags(*sl3e); if ( (flags & _PAGE_PRESENT) && (mfn_x(shadow_l3e_get_mfn(*sl3e)) == mfn_x(sl2mfn)) ) { (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn); if ( mfn_to_page(sl2mfn)->u.sh.type == 0 ) /* This breaks us cleanly out of the FOREACH macro */ done = 1; } }); return done; } int sh_remove_l3_shadow(struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn) /* Remove all mappings of this l3 shadow from this l4 shadow */ { shadow_l4e_t *sl4e; int done = 0; int flags; SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, done, v->domain, { flags = shadow_l4e_get_flags(*sl4e); if ( (flags & _PAGE_PRESENT) && (mfn_x(shadow_l4e_get_mfn(*sl4e)) == mfn_x(sl3mfn)) ) { (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn); if ( mfn_to_page(sl3mfn)->u.sh.type == 0 ) /* This breaks us cleanly out of the FOREACH macro */ done = 1; } }); return done; } #endif /* 64bit guest */ /**************************************************************************/ /* Function for the guest to inform us that a process is being torn * down. We remember that as a hint to unshadow its pagetables soon, * and in the meantime we unhook its top-level user-mode entries. 
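 *
 * The hint arrives via the HVMOP_pagetable_dying hypercall, whose
 * argument carries the guest physical address of the dying top-level
 * pagetable; it reaches this file through the .shadow.pagetable_dying
 * hook below.  A rough sketch of the guest side (struct layout as in
 * xen/include/public/hvm/hvm_op.h; "dying_cr3" is only an illustrative
 * placeholder for the CR3 value of the process being torn down):
 *
 *     struct xen_hvm_pagetable_dying a = {
 *         .domid = DOMID_SELF,
 *         .gpa   = dying_cr3,
 *     };
 *     HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);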
*/ #if GUEST_PAGING_LEVELS == 3 static void sh_pagetable_dying(struct vcpu *v, paddr_t gpa) { int i = 0; int flush = 0; int fast_path = 0; paddr_t gcr3 = 0; p2m_type_t p2mt; char *gl3pa = NULL; guest_l3e_t *gl3e = NULL; paddr_t gl2a = 0; unsigned long l3gfn; mfn_t l3mfn; gcr3 = (v->arch.hvm_vcpu.guest_cr[3]); /* fast path: the pagetable belongs to the current context */ if ( gcr3 == gpa ) fast_path = 1; l3gfn = gpa >> PAGE_SHIFT; l3mfn = get_gfn_query(v->domain, _gfn(l3gfn), &p2mt); if ( !mfn_valid(l3mfn) || !p2m_is_ram(p2mt) ) { printk(XENLOG_DEBUG "sh_pagetable_dying: gpa not valid %"PRIpaddr"\n", gpa); goto out_put_gfn; } paging_lock(v->domain); if ( !fast_path ) { gl3pa = sh_map_domain_page(l3mfn); gl3e = (guest_l3e_t *)(gl3pa + ((unsigned long)gpa & ~PAGE_MASK)); } for ( i = 0; i < 4; i++ ) { unsigned long gfn; mfn_t smfn, gmfn; if ( fast_path ) { if ( pagetable_is_null(v->arch.shadow_table[i]) ) smfn = _mfn(INVALID_MFN); else smfn = _mfn(pagetable_get_pfn(v->arch.shadow_table[i])); } else { /* retrieving the l2s */ gl2a = guest_l3e_get_paddr(gl3e[i]); gfn = gl2a >> PAGE_SHIFT; gmfn = get_gfn_query_unlocked(v->domain, gfn, &p2mt); smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l2_pae_shadow); } if ( mfn_valid(smfn) ) { gmfn = _mfn(mfn_to_page(smfn)->v.sh.back); mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying; shadow_unhook_mappings(v, smfn, 1/* user pages only */); flush = 1; } } if ( flush ) flush_tlb_mask(v->domain->domain_dirty_cpumask); /* Remember that we've seen the guest use this interface, so we * can rely on it using it in future, instead of guessing at * when processes are being torn down. */ v->domain->arch.paging.shadow.pagetable_dying_op = 1; v->arch.paging.shadow.pagetable_dying = 1; if ( !fast_path ) unmap_domain_page(gl3pa); paging_unlock(v->domain); out_put_gfn: put_gfn(v->domain, l3gfn); } #else static void sh_pagetable_dying(struct vcpu *v, paddr_t gpa) { mfn_t smfn, gmfn; p2m_type_t p2mt; gmfn = get_gfn_query(v->domain, _gfn(gpa >> PAGE_SHIFT), &p2mt); paging_lock(v->domain); #if GUEST_PAGING_LEVELS == 2 smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l2_32_shadow); #else smfn = shadow_hash_lookup(v, mfn_x(gmfn), SH_type_l4_64_shadow); #endif if ( mfn_valid(smfn) ) { mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying; shadow_unhook_mappings(v, smfn, 1/* user pages only */); /* Now flush the TLB: we removed toplevel mappings. */ flush_tlb_mask(v->domain->domain_dirty_cpumask); } /* Remember that we've seen the guest use this interface, so we * can rely on it using it in future, instead of guessing at * when processes are being torn down. 
*/ v->domain->arch.paging.shadow.pagetable_dying_op = 1; v->arch.paging.shadow.pagetable_dying = 1; paging_unlock(v->domain); put_gfn(v->domain, gpa >> PAGE_SHIFT); } #endif /**************************************************************************/ /* Handling HVM guest writes to pagetables */ /* Translate a VA to an MFN, injecting a page-fault if we fail */ #define BAD_GVA_TO_GFN (~0UL) #define BAD_GFN_TO_MFN (~1UL) #define READONLY_GFN (~2UL) static mfn_t emulate_gva_to_mfn(struct vcpu *v, unsigned long vaddr, struct sh_emulate_ctxt *sh_ctxt) { unsigned long gfn; struct page_info *page; mfn_t mfn; p2m_type_t p2mt; uint32_t pfec = PFEC_page_present | PFEC_write_access; /* Translate the VA to a GFN */ gfn = sh_gva_to_gfn(v, NULL, vaddr, &pfec); if ( gfn == INVALID_GFN ) { if ( is_hvm_vcpu(v) ) hvm_inject_page_fault(pfec, vaddr); else propagate_page_fault(vaddr, pfec); return _mfn(BAD_GVA_TO_GFN); } /* Translate the GFN to an MFN */ ASSERT(!paging_locked_by_me(v->domain)); page = get_page_from_gfn(v->domain, gfn, &p2mt, P2M_ALLOC); /* Sanity checking */ if ( page == NULL ) { return _mfn(BAD_GFN_TO_MFN); } if ( p2m_is_readonly(p2mt) ) { put_page(page); return _mfn(READONLY_GFN); } if ( !p2m_is_ram(p2mt) ) { put_page(page); return _mfn(BAD_GFN_TO_MFN); } mfn = page_to_mfn(page); ASSERT(mfn_valid(mfn)); v->arch.paging.last_write_was_pt = !!sh_mfn_is_a_page_table(mfn); /* Note shadow cannot page out or unshare this mfn, so the map won't * disappear. Otherwise, caller must hold onto page until done. */ put_page(page); return mfn; } /* Check that the user is allowed to perform this write. * Returns a mapped pointer to write to, or NULL for error. */ #define MAPPING_UNHANDLEABLE ((void *)(unsigned long)X86EMUL_UNHANDLEABLE) #define MAPPING_EXCEPTION ((void *)(unsigned long)X86EMUL_EXCEPTION) #define MAPPING_SILENT_FAIL ((void *)(unsigned long)X86EMUL_OKAY) #define emulate_map_dest_failed(rc) ((unsigned long)(rc) <= 3) static void *emulate_map_dest(struct vcpu *v, unsigned long vaddr, u32 bytes, struct sh_emulate_ctxt *sh_ctxt) { void *map = NULL; sh_ctxt->mfn1 = emulate_gva_to_mfn(v, vaddr, sh_ctxt); if ( !mfn_valid(sh_ctxt->mfn1) ) return ((mfn_x(sh_ctxt->mfn1) == BAD_GVA_TO_GFN) ? MAPPING_EXCEPTION : (mfn_x(sh_ctxt->mfn1) == READONLY_GFN) ? MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE); #ifndef NDEBUG /* We don't emulate user-mode writes to page tables */ if ( hvm_get_seg_reg(x86_seg_ss, sh_ctxt)->attr.fields.dpl == 3 ) { gdprintk(XENLOG_DEBUG, "User-mode write to pagetable reached " "emulate_map_dest(). This should never happen!\n"); return MAPPING_UNHANDLEABLE; } #endif /* Unaligned writes mean probably this isn't a pagetable */ if ( vaddr & (bytes - 1) ) sh_remove_shadows(v, sh_ctxt->mfn1, 0, 0 /* Slow, can fail */ ); if ( likely(((vaddr + bytes - 1) & PAGE_MASK) == (vaddr & PAGE_MASK)) ) { /* Whole write fits on a single page */ sh_ctxt->mfn2 = _mfn(INVALID_MFN); map = sh_map_domain_page(sh_ctxt->mfn1) + (vaddr & ~PAGE_MASK); } else { unsigned long mfns[2]; /* Cross-page emulated writes are only supported for HVM guests; * PV guests ought to know better */ if ( !is_hvm_vcpu(v) ) return MAPPING_UNHANDLEABLE; /* This write crosses a page boundary. Translate the second page */ sh_ctxt->mfn2 = emulate_gva_to_mfn(v, (vaddr + bytes - 1) & PAGE_MASK, sh_ctxt); if ( !mfn_valid(sh_ctxt->mfn2) ) return ((mfn_x(sh_ctxt->mfn2) == BAD_GVA_TO_GFN) ? MAPPING_EXCEPTION : (mfn_x(sh_ctxt->mfn2) == READONLY_GFN) ? 
MAPPING_SILENT_FAIL : MAPPING_UNHANDLEABLE); /* Cross-page writes mean probably not a pagetable */ sh_remove_shadows(v, sh_ctxt->mfn2, 0, 0 /* Slow, can fail */ ); mfns[0] = mfn_x(sh_ctxt->mfn1); mfns[1] = mfn_x(sh_ctxt->mfn2); map = vmap(mfns, 2); if ( !map ) return MAPPING_UNHANDLEABLE; map += (vaddr & ~PAGE_MASK); } #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY) /* Remember if the bottom bit was clear, so we can choose not to run * the change through the verify code if it's still clear afterwards */ sh_ctxt->low_bit_was_clear = map != NULL && !(*(u8 *)map & _PAGE_PRESENT); #endif return map; } /* Tidy up after the emulated write: mark pages dirty, verify the new * contents, and undo the mapping */ static void emulate_unmap_dest(struct vcpu *v, void *addr, u32 bytes, struct sh_emulate_ctxt *sh_ctxt) { u32 b1 = bytes, b2 = 0, shflags; ASSERT(mfn_valid(sh_ctxt->mfn1)); /* If we are writing lots of PTE-aligned zeros, might want to unshadow */ if ( likely(bytes >= 4) && (*(u32 *)addr == 0) ) { if ( ((unsigned long) addr & ((sizeof (guest_intpte_t)) - 1)) == 0 ) check_for_early_unshadow(v, sh_ctxt->mfn1); /* Don't reset the heuristic if we're writing zeros at non-aligned * addresses, otherwise it doesn't catch REP MOVSD on PAE guests */ } else reset_early_unshadow(v); /* We can avoid re-verifying the page contents after the write if: * - it was no larger than the PTE type of this pagetable; * - it was aligned to the PTE boundaries; and * - _PAGE_PRESENT was clear before and after the write. */ shflags = mfn_to_page(sh_ctxt->mfn1)->shadow_flags; #if (SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY) if ( sh_ctxt->low_bit_was_clear && !(*(u8 *)addr & _PAGE_PRESENT) && ((!(shflags & SHF_32) /* Not shadowed 32-bit: aligned 64-bit writes that leave * the present bit unset are safe to ignore. */ && ((unsigned long)addr & 7) == 0 && bytes <= 8) || (!(shflags & (SHF_PAE|SHF_64)) /* Not shadowed PAE/64-bit: aligned 32-bit writes that * leave the present bit unset are safe to ignore. 
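 *
 * ("Safe to ignore" in both branches because the alignment and size
 * checks guarantee the write stayed within a single PTE, and a guest
 * entry whose present bit was clear both before and after the write
 * can never have had a shadow l1e propagated from it; skipping the
 * sh_validate_guest_pt_write() call below therefore cannot leave a
 * stale shadow entry behind.)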
*/ && ((unsigned long)addr & 3) == 0 && bytes <= 4)) ) { /* Writes with this alignment constraint can't possibly cross pages */ ASSERT(!mfn_valid(sh_ctxt->mfn2)); } else #endif /* SHADOW_OPTIMIZATIONS & SHOPT_SKIP_VERIFY */ { if ( unlikely(mfn_valid(sh_ctxt->mfn2)) ) { /* Validate as two writes, one to each page */ b1 = PAGE_SIZE - (((unsigned long)addr) & ~PAGE_MASK); b2 = bytes - b1; ASSERT(b2 < bytes); } if ( likely(b1 > 0) ) sh_validate_guest_pt_write(v, sh_ctxt->mfn1, addr, b1); if ( unlikely(b2 > 0) ) sh_validate_guest_pt_write(v, sh_ctxt->mfn2, addr + b1, b2); } paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn1)); if ( unlikely(mfn_valid(sh_ctxt->mfn2)) ) { paging_mark_dirty(v->domain, mfn_x(sh_ctxt->mfn2)); vunmap((void *)((unsigned long)addr & PAGE_MASK)); } else sh_unmap_domain_page(addr); atomic_inc(&v->domain->arch.paging.shadow.gtable_dirty_version); } static int sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src, u32 bytes, struct sh_emulate_ctxt *sh_ctxt) { void *addr; /* Unaligned writes are only acceptable on HVM */ if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) ) return X86EMUL_UNHANDLEABLE; addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt); if ( emulate_map_dest_failed(addr) ) return (long)addr; paging_lock(v->domain); memcpy(addr, src, bytes); if ( tb_init_done ) { #if GUEST_PAGING_LEVELS == 3 if ( vaddr == this_cpu(trace_emulate_initial_va) ) memcpy(&this_cpu(trace_emulate_write_val), src, bytes); else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) ) { TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT); memcpy(&this_cpu(trace_emulate_write_val), (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE); } #else memcpy(&this_cpu(trace_emulate_write_val), src, bytes); #endif } emulate_unmap_dest(v, addr, bytes, sh_ctxt); shadow_audit_tables(v); paging_unlock(v->domain); return X86EMUL_OKAY; } static int sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr, unsigned long old, unsigned long new, unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt) { void *addr; unsigned long prev; int rv = X86EMUL_OKAY; /* Unaligned writes are only acceptable on HVM */ if ( (vaddr & (bytes - 1)) && !is_hvm_vcpu(v) ) return X86EMUL_UNHANDLEABLE; addr = emulate_map_dest(v, vaddr, bytes, sh_ctxt); if ( emulate_map_dest_failed(addr) ) return (long)addr; paging_lock(v->domain); switch ( bytes ) { case 1: prev = cmpxchg(((u8 *)addr), old, new); break; case 2: prev = cmpxchg(((u16 *)addr), old, new); break; case 4: prev = cmpxchg(((u32 *)addr), old, new); break; case 8: prev = cmpxchg(((u64 *)addr), old, new); break; default: SHADOW_PRINTK("cmpxchg of size %i is not supported\n", bytes); prev = ~old; } if ( prev != old ) rv = X86EMUL_CMPXCHG_FAILED; SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx" " wanted %#lx now %#lx bytes %u\n", vaddr, prev, old, new, *(unsigned long *)addr, bytes); emulate_unmap_dest(v, addr, bytes, sh_ctxt); shadow_audit_tables(v); paging_unlock(v->domain); return rv; } /**************************************************************************/ /* Audit tools */ #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES #define AUDIT_FAIL(_level, _fmt, _a...) 
do { \ printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \ "gl" #_level "mfn = %" PRI_mfn \ " sl" #_level "mfn = %" PRI_mfn \ " &gl" #_level "e = %p &sl" #_level "e = %p" \ " gl" #_level "e = %" SH_PRI_gpte \ " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \ GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ _level, guest_index(gl ## _level ## e), \ mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ gl ## _level ## e, sl ## _level ## e, \ gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \ ##_a); \ BUG(); \ done = 1; \ } while (0) #define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \ printk("Shadow %u-on-%u audit failed at level %i\n" \ "gl" #_level "mfn = %" PRI_mfn \ " sl" #_level "mfn = %" PRI_mfn \ " Error: " _fmt "\n", \ GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ _level, \ mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ ##_a); \ BUG(); \ done = 1; \ } while (0) static char * sh_audit_flags(struct vcpu *v, int level, int gflags, int sflags) /* Common code for auditing flag bits */ { if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_PRESENT) ) return "shadow is present but guest is not present"; if ( (sflags & _PAGE_GLOBAL) && !is_hvm_vcpu(v) ) return "global bit set in PV shadow"; if ( level == 2 && (sflags & _PAGE_PSE) ) return "PS bit set in shadow"; #if SHADOW_PAGING_LEVELS == 3 if ( level == 3 ) return NULL; /* All the other bits are blank in PAEl3 */ #endif if ( (sflags & _PAGE_PRESENT) && !(gflags & _PAGE_ACCESSED) ) return "accessed bit not propagated"; if ( (level == 1 || (level == 2 && (gflags & _PAGE_PSE))) && ((sflags & _PAGE_RW) && !(gflags & _PAGE_DIRTY)) ) return "dirty bit not propagated"; if ( (sflags & _PAGE_USER) != (gflags & _PAGE_USER) ) return "user/supervisor bit does not match"; if ( (sflags & _PAGE_NX_BIT) != (gflags & _PAGE_NX_BIT) ) return "NX bit does not match"; if ( (sflags & _PAGE_RW) && !(gflags & _PAGE_RW) ) return "shadow grants write access but guest does not"; return NULL; } int sh_audit_l1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x) { guest_l1e_t *gl1e, *gp; shadow_l1e_t *sl1e; mfn_t mfn, gmfn, gl1mfn; gfn_t gfn; p2m_type_t p2mt; char *s; int done = 0; /* Follow the backpointer */ ASSERT(mfn_to_page(sl1mfn)->u.sh.head); gl1mfn = backpointer(mfn_to_page(sl1mfn)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */ if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) ) { oos_audit_hash_is_present(v->domain, gl1mfn); return 0; } #endif gl1e = gp = sh_map_domain_page(gl1mfn); SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, { if ( sh_l1e_is_magic(*sl1e) ) { #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) if ( sh_l1e_is_gnp(*sl1e) ) { if ( guest_l1e_get_flags(*gl1e) & _PAGE_PRESENT ) AUDIT_FAIL(1, "shadow is GNP magic but guest is present"); } else { ASSERT(sh_l1e_is_mmio(*sl1e)); gfn = sh_l1e_mmio_get_gfn(*sl1e); if ( gfn_x(gfn) != gfn_x(guest_l1e_get_gfn(*gl1e)) ) AUDIT_FAIL(1, "shadow MMIO gfn is %" SH_PRI_gfn " but guest gfn is %" SH_PRI_gfn, gfn_x(gfn), gfn_x(guest_l1e_get_gfn(*gl1e))); } #endif } else { s = sh_audit_flags(v, 1, guest_l1e_get_flags(*gl1e), shadow_l1e_get_flags(*sl1e)); if ( s ) AUDIT_FAIL(1, "%s", s); if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) { gfn = guest_l1e_get_gfn(*gl1e); mfn = shadow_l1e_get_mfn(*sl1e); gmfn = get_gfn_query_unlocked(v->domain, gfn_x(gfn), &p2mt); if ( !p2m_is_grant(p2mt) && mfn_x(gmfn) != mfn_x(mfn) ) AUDIT_FAIL(1, "bad translation: gfn %" SH_PRI_gfn " --> %" PRI_mfn " != mfn %" PRI_mfn, gfn_x(gfn), 
mfn_x(gmfn), mfn_x(mfn)); } } }); sh_unmap_domain_page(gp); return done; } int sh_audit_fl1_table(struct vcpu *v, mfn_t sl1mfn, mfn_t x) { guest_l1e_t *gl1e, e; shadow_l1e_t *sl1e; mfn_t gl1mfn = _mfn(INVALID_MFN); int f; int done = 0; /* fl1 has no useful backpointer: all we can check are flags */ e = guest_l1e_from_gfn(_gfn(0), 0); gl1e = &e; /* Needed for macro */ SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, done, { f = shadow_l1e_get_flags(*sl1e); f &= ~(_PAGE_AVAIL0|_PAGE_AVAIL1|_PAGE_AVAIL2); if ( !(f == 0 || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| _PAGE_ACCESSED) || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED) || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| _PAGE_ACCESSED|_PAGE_DIRTY) || f == (_PAGE_PRESENT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY) || sh_l1e_is_magic(*sl1e)) ) AUDIT_FAIL(1, "fl1e has bad flags"); }); return 0; } int sh_audit_l2_table(struct vcpu *v, mfn_t sl2mfn, mfn_t x) { guest_l2e_t *gl2e, *gp; shadow_l2e_t *sl2e; mfn_t mfn, gmfn, gl2mfn; gfn_t gfn; p2m_type_t p2mt; char *s; int done = 0; /* Follow the backpointer */ ASSERT(mfn_to_page(sl2mfn)->u.sh.head); gl2mfn = backpointer(mfn_to_page(sl2mfn)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Only L1's may be out of sync. */ if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) ) AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn)); #endif gl2e = gp = sh_map_domain_page(gl2mfn); SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, { s = sh_audit_flags(v, 2, guest_l2e_get_flags(*gl2e), shadow_l2e_get_flags(*sl2e)); if ( s ) AUDIT_FAIL(2, "%s", s); if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) { gfn = guest_l2e_get_gfn(*gl2e); mfn = shadow_l2e_get_mfn(*sl2e); gmfn = (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? get_fl1_shadow_status(v, gfn) : get_shadow_status(v, get_gfn_query_unlocked(v->domain, gfn_x(gfn), &p2mt), SH_type_l1_shadow); if ( mfn_x(gmfn) != mfn_x(mfn) ) AUDIT_FAIL(2, "bad translation: gfn %" SH_PRI_gfn " (--> %" PRI_mfn ")" " --> %" PRI_mfn " != mfn %" PRI_mfn, gfn_x(gfn), (guest_l2e_get_flags(*gl2e) & _PAGE_PSE) ? 0 : mfn_x(get_gfn_query_unlocked(v->domain, gfn_x(gfn), &p2mt)), mfn_x(gmfn), mfn_x(mfn)); } }); sh_unmap_domain_page(gp); return 0; } #if GUEST_PAGING_LEVELS >= 4 int sh_audit_l3_table(struct vcpu *v, mfn_t sl3mfn, mfn_t x) { guest_l3e_t *gl3e, *gp; shadow_l3e_t *sl3e; mfn_t mfn, gmfn, gl3mfn; gfn_t gfn; p2m_type_t p2mt; char *s; int done = 0; /* Follow the backpointer */ ASSERT(mfn_to_page(sl3mfn)->u.sh.head); gl3mfn = backpointer(mfn_to_page(sl3mfn)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Only L1's may be out of sync. */ if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) ) AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn)); #endif gl3e = gp = sh_map_domain_page(gl3mfn); SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, { s = sh_audit_flags(v, 3, guest_l3e_get_flags(*gl3e), shadow_l3e_get_flags(*sl3e)); if ( s ) AUDIT_FAIL(3, "%s", s); if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) { gfn = guest_l3e_get_gfn(*gl3e); mfn = shadow_l3e_get_mfn(*sl3e); gmfn = get_shadow_status(v, get_gfn_query_unlocked( v->domain, gfn_x(gfn), &p2mt), ((GUEST_PAGING_LEVELS == 3 || is_pv_32on64_vcpu(v)) && !shadow_mode_external(v->domain) && (guest_index(gl3e) % 4) == 3) ? 
SH_type_l2h_shadow : SH_type_l2_shadow); if ( mfn_x(gmfn) != mfn_x(mfn) ) AUDIT_FAIL(3, "bad translation: gfn %" SH_PRI_gfn " --> %" PRI_mfn " != mfn %" PRI_mfn, gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); } }); sh_unmap_domain_page(gp); return 0; } int sh_audit_l4_table(struct vcpu *v, mfn_t sl4mfn, mfn_t x) { guest_l4e_t *gl4e, *gp; shadow_l4e_t *sl4e; mfn_t mfn, gmfn, gl4mfn; gfn_t gfn; p2m_type_t p2mt; char *s; int done = 0; /* Follow the backpointer */ ASSERT(mfn_to_page(sl4mfn)->u.sh.head); gl4mfn = backpointer(mfn_to_page(sl4mfn)); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) /* Only L1's may be out of sync. */ if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) ) AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn)); #endif gl4e = gp = sh_map_domain_page(gl4mfn); SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain, { s = sh_audit_flags(v, 4, guest_l4e_get_flags(*gl4e), shadow_l4e_get_flags(*sl4e)); if ( s ) AUDIT_FAIL(4, "%s", s); if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_MFNS ) { gfn = guest_l4e_get_gfn(*gl4e); mfn = shadow_l4e_get_mfn(*sl4e); gmfn = get_shadow_status(v, get_gfn_query_unlocked( v->domain, gfn_x(gfn), &p2mt), SH_type_l3_shadow); if ( mfn_x(gmfn) != mfn_x(mfn) ) AUDIT_FAIL(4, "bad translation: gfn %" SH_PRI_gfn " --> %" PRI_mfn " != mfn %" PRI_mfn, gfn_x(gfn), mfn_x(gmfn), mfn_x(mfn)); } }); sh_unmap_domain_page(gp); return 0; } #endif /* GUEST_PAGING_LEVELS >= 4 */ #undef AUDIT_FAIL #endif /* Audit code */ /**************************************************************************/ /* Entry points into this mode of the shadow code. * This will all be mangled by the preprocessor to uniquify everything. */ const struct paging_mode sh_paging_mode = { .page_fault = sh_page_fault, .invlpg = sh_invlpg, .gva_to_gfn = sh_gva_to_gfn, .update_cr3 = sh_update_cr3, .update_paging_modes = shadow_update_paging_modes, .write_p2m_entry = shadow_write_p2m_entry, .write_guest_entry = shadow_write_guest_entry, .cmpxchg_guest_entry = shadow_cmpxchg_guest_entry, .guest_map_l1e = sh_guest_map_l1e, .guest_get_eff_l1e = sh_guest_get_eff_l1e, .guest_levels = GUEST_PAGING_LEVELS, .shadow.detach_old_tables = sh_detach_old_tables, .shadow.x86_emulate_write = sh_x86_emulate_write, .shadow.x86_emulate_cmpxchg = sh_x86_emulate_cmpxchg, .shadow.make_monitor_table = sh_make_monitor_table, .shadow.destroy_monitor_table = sh_destroy_monitor_table, #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC .shadow.guess_wrmap = sh_guess_wrmap, #endif .shadow.pagetable_dying = sh_pagetable_dying, .shadow.shadow_levels = SHADOW_PAGING_LEVELS, }; /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/shadow/types.h0000664000175000017500000003445012307313555016020 0ustar smbsmb/****************************************************************************** * arch/x86/mm/shadow/types.h * * Parts of this code are Copyright (c) 2006 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _XEN_SHADOW_TYPES_H #define _XEN_SHADOW_TYPES_H /* The number of levels in the shadow pagetable is entirely determined * by the number of levels in the guest pagetable */ #if GUEST_PAGING_LEVELS == 4 #define SHADOW_PAGING_LEVELS 4 #else #define SHADOW_PAGING_LEVELS 3 #endif /* * Define various types for handling pagetabels, based on these options: * SHADOW_PAGING_LEVELS : Number of levels of shadow pagetables * GUEST_PAGING_LEVELS : Number of levels of guest pagetables */ #if SHADOW_PAGING_LEVELS == 3 #define SHADOW_L1_PAGETABLE_ENTRIES 512 #define SHADOW_L2_PAGETABLE_ENTRIES 512 #define SHADOW_L3_PAGETABLE_ENTRIES 4 #define SHADOW_L1_PAGETABLE_SHIFT 12 #define SHADOW_L2_PAGETABLE_SHIFT 21 #define SHADOW_L3_PAGETABLE_SHIFT 30 #else /* SHADOW_PAGING_LEVELS == 4 */ #define SHADOW_L1_PAGETABLE_ENTRIES 512 #define SHADOW_L2_PAGETABLE_ENTRIES 512 #define SHADOW_L3_PAGETABLE_ENTRIES 512 #define SHADOW_L4_PAGETABLE_ENTRIES 512 #define SHADOW_L1_PAGETABLE_SHIFT 12 #define SHADOW_L2_PAGETABLE_SHIFT 21 #define SHADOW_L3_PAGETABLE_SHIFT 30 #define SHADOW_L4_PAGETABLE_SHIFT 39 #endif /* Types of the shadow page tables */ typedef l1_pgentry_t shadow_l1e_t; typedef l2_pgentry_t shadow_l2e_t; typedef l3_pgentry_t shadow_l3e_t; #if SHADOW_PAGING_LEVELS >= 4 typedef l4_pgentry_t shadow_l4e_t; #endif /* Access functions for them */ static inline paddr_t shadow_l1e_get_paddr(shadow_l1e_t sl1e) { return l1e_get_paddr(sl1e); } static inline paddr_t shadow_l2e_get_paddr(shadow_l2e_t sl2e) { return l2e_get_paddr(sl2e); } static inline paddr_t shadow_l3e_get_paddr(shadow_l3e_t sl3e) { return l3e_get_paddr(sl3e); } #if SHADOW_PAGING_LEVELS >= 4 static inline paddr_t shadow_l4e_get_paddr(shadow_l4e_t sl4e) { return l4e_get_paddr(sl4e); } #endif static inline mfn_t shadow_l1e_get_mfn(shadow_l1e_t sl1e) { return _mfn(l1e_get_pfn(sl1e)); } static inline mfn_t shadow_l2e_get_mfn(shadow_l2e_t sl2e) { return _mfn(l2e_get_pfn(sl2e)); } static inline mfn_t shadow_l3e_get_mfn(shadow_l3e_t sl3e) { return _mfn(l3e_get_pfn(sl3e)); } #if SHADOW_PAGING_LEVELS >= 4 static inline mfn_t shadow_l4e_get_mfn(shadow_l4e_t sl4e) { return _mfn(l4e_get_pfn(sl4e)); } #endif static inline u32 shadow_l1e_get_flags(shadow_l1e_t sl1e) { return l1e_get_flags(sl1e); } static inline u32 shadow_l2e_get_flags(shadow_l2e_t sl2e) { return l2e_get_flags(sl2e); } static inline u32 shadow_l3e_get_flags(shadow_l3e_t sl3e) { return l3e_get_flags(sl3e); } #if SHADOW_PAGING_LEVELS >= 4 static inline u32 shadow_l4e_get_flags(shadow_l4e_t sl4e) { return l4e_get_flags(sl4e); } #endif static inline shadow_l1e_t shadow_l1e_remove_flags(shadow_l1e_t sl1e, u32 flags) { l1e_remove_flags(sl1e, flags); return sl1e; } static inline shadow_l1e_t shadow_l1e_empty(void) { return l1e_empty(); } static inline shadow_l2e_t shadow_l2e_empty(void) { return l2e_empty(); } static inline shadow_l3e_t shadow_l3e_empty(void) { return l3e_empty(); } #if SHADOW_PAGING_LEVELS >= 4 static inline shadow_l4e_t shadow_l4e_empty(void) { return l4e_empty(); } #endif static inline shadow_l1e_t shadow_l1e_from_mfn(mfn_t mfn, u32 flags) { return l1e_from_pfn(mfn_x(mfn), flags); } static inline shadow_l2e_t shadow_l2e_from_mfn(mfn_t mfn, u32 flags) { return l2e_from_pfn(mfn_x(mfn), flags); } static inline shadow_l3e_t 
shadow_l3e_from_mfn(mfn_t mfn, u32 flags) { return l3e_from_pfn(mfn_x(mfn), flags); } #if SHADOW_PAGING_LEVELS >= 4 static inline shadow_l4e_t shadow_l4e_from_mfn(mfn_t mfn, u32 flags) { return l4e_from_pfn(mfn_x(mfn), flags); } #endif #define shadow_l1_table_offset(a) l1_table_offset(a) #define shadow_l2_table_offset(a) l2_table_offset(a) #define shadow_l3_table_offset(a) l3_table_offset(a) #define shadow_l4_table_offset(a) l4_table_offset(a) /**************************************************************************/ /* Access to the linear mapping of shadow page tables. */ /* Offsets into each level of the linear mapping for a virtual address. */ #define shadow_l1_linear_offset(_a) \ (((_a) & VADDR_MASK) >> SHADOW_L1_PAGETABLE_SHIFT) #define shadow_l2_linear_offset(_a) \ (((_a) & VADDR_MASK) >> SHADOW_L2_PAGETABLE_SHIFT) #define shadow_l3_linear_offset(_a) \ (((_a) & VADDR_MASK) >> SHADOW_L3_PAGETABLE_SHIFT) #define shadow_l4_linear_offset(_a) \ (((_a) & VADDR_MASK) >> SHADOW_L4_PAGETABLE_SHIFT) /* Where to find each level of the linear mapping. For PV guests, we use * the shadow linear-map self-entry as many times as we need. For HVM * guests, the shadow doesn't have a linear-map self-entry so we must use * the monitor-table's linear-map entry N-1 times and then the shadow-map * entry once. */ #define __sh_linear_l1_table ((shadow_l1e_t *)(SH_LINEAR_PT_VIRT_START)) #define __sh_linear_l2_table ((shadow_l2e_t *) \ (__sh_linear_l1_table + shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START))) // shadow linear L3 and L4 tables only exist in 4 level paging... #if SHADOW_PAGING_LEVELS == 4 #define __sh_linear_l3_table ((shadow_l3e_t *) \ (__sh_linear_l2_table + shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START))) #define __sh_linear_l4_table ((shadow_l4e_t *) \ (__sh_linear_l3_table + shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START))) #endif #define sh_linear_l1_table(v) ({ \ ASSERT(current == (v)); \ __sh_linear_l1_table; \ }) // XXX -- these should not be conditional on is_hvm_vcpu(v), but rather on // shadow_mode_external(d)... // #define sh_linear_l2_table(v) ({ \ ASSERT(current == (v)); \ ((shadow_l2e_t *) \ (is_hvm_vcpu(v) ? __linear_l1_table : __sh_linear_l1_table) + \ shadow_l1_linear_offset(SH_LINEAR_PT_VIRT_START)); \ }) #if SHADOW_PAGING_LEVELS >= 4 #define sh_linear_l3_table(v) ({ \ ASSERT(current == (v)); \ ((shadow_l3e_t *) \ (is_hvm_vcpu(v) ? __linear_l2_table : __sh_linear_l2_table) + \ shadow_l2_linear_offset(SH_LINEAR_PT_VIRT_START)); \ }) // we use l4_pgentry_t instead of shadow_l4e_t below because shadow_l4e_t is // not defined for when xen_levels==4 & shadow_levels==3... #define sh_linear_l4_table(v) ({ \ ASSERT(current == (v)); \ ((l4_pgentry_t *) \ (is_hvm_vcpu(v) ? __linear_l3_table : __sh_linear_l3_table) + \ shadow_l3_linear_offset(SH_LINEAR_PT_VIRT_START)); \ }) #endif /* Override get_gfn to work with gfn_t */ #undef get_gfn_query #define get_gfn_query(d, g, t) get_gfn_type((d), gfn_x(g), (t), 0) /* The shadow types needed for the various levels. 
*/ #if GUEST_PAGING_LEVELS == 2 #define SH_type_l1_shadow SH_type_l1_32_shadow #define SH_type_l2_shadow SH_type_l2_32_shadow #define SH_type_fl1_shadow SH_type_fl1_32_shadow #elif GUEST_PAGING_LEVELS == 3 #define SH_type_l1_shadow SH_type_l1_pae_shadow #define SH_type_fl1_shadow SH_type_fl1_pae_shadow #define SH_type_l2_shadow SH_type_l2_pae_shadow #define SH_type_l2h_shadow SH_type_l2h_pae_shadow #else #define SH_type_l1_shadow SH_type_l1_64_shadow #define SH_type_fl1_shadow SH_type_fl1_64_shadow #define SH_type_l2_shadow SH_type_l2_64_shadow #define SH_type_l2h_shadow SH_type_l2h_64_shadow #define SH_type_l3_shadow SH_type_l3_64_shadow #define SH_type_l4_shadow SH_type_l4_64_shadow #endif /* macros for dealing with the naming of the internal function names of the * shadow code's external entry points. */ #define INTERNAL_NAME(name) SHADOW_INTERNAL_NAME(name, GUEST_PAGING_LEVELS) /* macros for renaming the primary entry points, so that they are more * easily distinguished from a debugger */ #define sh_page_fault INTERNAL_NAME(sh_page_fault) #define sh_invlpg INTERNAL_NAME(sh_invlpg) #define sh_gva_to_gfn INTERNAL_NAME(sh_gva_to_gfn) #define sh_update_cr3 INTERNAL_NAME(sh_update_cr3) #define sh_rm_write_access_from_l1 INTERNAL_NAME(sh_rm_write_access_from_l1) #define sh_rm_mappings_from_l1 INTERNAL_NAME(sh_rm_mappings_from_l1) #define sh_remove_l1_shadow INTERNAL_NAME(sh_remove_l1_shadow) #define sh_remove_l2_shadow INTERNAL_NAME(sh_remove_l2_shadow) #define sh_remove_l3_shadow INTERNAL_NAME(sh_remove_l3_shadow) #define sh_map_and_validate_gl4e INTERNAL_NAME(sh_map_and_validate_gl4e) #define sh_map_and_validate_gl3e INTERNAL_NAME(sh_map_and_validate_gl3e) #define sh_map_and_validate_gl2e INTERNAL_NAME(sh_map_and_validate_gl2e) #define sh_map_and_validate_gl2he INTERNAL_NAME(sh_map_and_validate_gl2he) #define sh_map_and_validate_gl1e INTERNAL_NAME(sh_map_and_validate_gl1e) #define sh_destroy_l4_shadow INTERNAL_NAME(sh_destroy_l4_shadow) #define sh_destroy_l3_shadow INTERNAL_NAME(sh_destroy_l3_shadow) #define sh_destroy_l2_shadow INTERNAL_NAME(sh_destroy_l2_shadow) #define sh_destroy_l1_shadow INTERNAL_NAME(sh_destroy_l1_shadow) #define sh_unhook_32b_mappings INTERNAL_NAME(sh_unhook_32b_mappings) #define sh_unhook_pae_mappings INTERNAL_NAME(sh_unhook_pae_mappings) #define sh_unhook_64b_mappings INTERNAL_NAME(sh_unhook_64b_mappings) #define sh_paging_mode INTERNAL_NAME(sh_paging_mode) #define sh_detach_old_tables INTERNAL_NAME(sh_detach_old_tables) #define sh_x86_emulate_write INTERNAL_NAME(sh_x86_emulate_write) #define sh_x86_emulate_cmpxchg INTERNAL_NAME(sh_x86_emulate_cmpxchg) #define sh_x86_emulate_cmpxchg8b INTERNAL_NAME(sh_x86_emulate_cmpxchg8b) #define sh_audit_l1_table INTERNAL_NAME(sh_audit_l1_table) #define sh_audit_fl1_table INTERNAL_NAME(sh_audit_fl1_table) #define sh_audit_l2_table INTERNAL_NAME(sh_audit_l2_table) #define sh_audit_l3_table INTERNAL_NAME(sh_audit_l3_table) #define sh_audit_l4_table INTERNAL_NAME(sh_audit_l4_table) #define sh_guess_wrmap INTERNAL_NAME(sh_guess_wrmap) #define sh_clear_shadow_entry INTERNAL_NAME(sh_clear_shadow_entry) #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC #define sh_resync_l1 INTERNAL_NAME(sh_resync_l1) #define sh_safe_not_to_sync INTERNAL_NAME(sh_safe_not_to_sync) #define sh_rm_write_access_from_sl1p INTERNAL_NAME(sh_rm_write_access_from_sl1p) #endif /* The sh_guest_(map|get)_* functions depends on Xen's paging levels */ #define sh_guest_map_l1e \ SHADOW_INTERNAL_NAME(sh_guest_map_l1e, CONFIG_PAGING_LEVELS) #define sh_guest_get_eff_l1e \ 
SHADOW_INTERNAL_NAME(sh_guest_get_eff_l1e, CONFIG_PAGING_LEVELS) /* sh_make_monitor_table depends only on the number of shadow levels */ #define sh_make_monitor_table \ SHADOW_INTERNAL_NAME(sh_make_monitor_table, SHADOW_PAGING_LEVELS) #define sh_destroy_monitor_table \ SHADOW_INTERNAL_NAME(sh_destroy_monitor_table, SHADOW_PAGING_LEVELS) #if SHADOW_PAGING_LEVELS == 3 #define MFN_FITS_IN_HVM_CR3(_MFN) !(mfn_x(_MFN) >> 20) #endif #define SH_PRI_pte PRIpte #define SH_PRI_gpte PRI_gpte #define SH_PRI_gfn PRI_gfn #if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) /****************************************************************************** * We implement a "fast path" for two special cases: faults that require * MMIO emulation, and faults where the guest PTE is not present. We * record these as shadow l1 entries that have reserved bits set in * them, so we can spot them immediately in the fault handler and handle * them without needing to hold the paging lock or walk the guest * pagetables. * * This is only feasible for PAE and 64bit Xen: 32-bit non-PAE PTEs don't * have reserved bits that we can use for this. */ #define SH_L1E_MAGIC 0xffffffff00000001ULL static inline int sh_l1e_is_magic(shadow_l1e_t sl1e) { return ((sl1e.l1 & SH_L1E_MAGIC) == SH_L1E_MAGIC); } /* Guest not present: a single magic value */ static inline shadow_l1e_t sh_l1e_gnp(void) { return (shadow_l1e_t){ -1ULL }; } static inline int sh_l1e_is_gnp(shadow_l1e_t sl1e) { return (sl1e.l1 == sh_l1e_gnp().l1); } /* MMIO: an invalid PTE that contains the GFN of the equivalent guest l1e. * We store 28 bits of GFN in bits 4:32 of the entry. * The present bit is set, and the U/S and R/W bits are taken from the guest. * Bit 3 is always 0, to differentiate from gnp above. */ #define SH_L1E_MMIO_MAGIC 0xffffffff00000001ULL #define SH_L1E_MMIO_MAGIC_MASK 0xffffffff00000009ULL #define SH_L1E_MMIO_GFN_MASK 0x00000000fffffff0ULL #define SH_L1E_MMIO_GFN_SHIFT 4 static inline shadow_l1e_t sh_l1e_mmio(gfn_t gfn, u32 gflags) { return (shadow_l1e_t) { (SH_L1E_MMIO_MAGIC | (gfn_x(gfn) << SH_L1E_MMIO_GFN_SHIFT) | (gflags & (_PAGE_USER|_PAGE_RW))) }; } static inline int sh_l1e_is_mmio(shadow_l1e_t sl1e) { return ((sl1e.l1 & SH_L1E_MMIO_MAGIC_MASK) == SH_L1E_MMIO_MAGIC); } static inline gfn_t sh_l1e_mmio_get_gfn(shadow_l1e_t sl1e) { return _gfn((sl1e.l1 & SH_L1E_MMIO_GFN_MASK) >> SH_L1E_MMIO_GFN_SHIFT); } static inline u32 sh_l1e_mmio_get_flags(shadow_l1e_t sl1e) { return (u32)((sl1e.l1 & (_PAGE_USER|_PAGE_RW))); } #else #define sh_l1e_gnp() shadow_l1e_empty() #define sh_l1e_mmio(_gfn, _flags) shadow_l1e_empty() #define sh_l1e_is_magic(_e) (0) #endif /* SHOPT_FAST_FAULT_PATH */ #endif /* _XEN_SHADOW_TYPES_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/arch/x86/mm/shadow/multi.h0000664000175000017500000001127512307313555016006 0ustar smbsmb/****************************************************************************** * arch/x86/mm/shadow/multi.h * * Shadow declarations which will be multiply compiled. * Parts of this code are Copyright (c) 2006 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ extern int SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, GUEST_LEVELS)( struct vcpu *v, mfn_t gl1mfn, void *new_gl1p, u32 size); extern int SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, GUEST_LEVELS)( struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size); extern int SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, GUEST_LEVELS)( struct vcpu *v, mfn_t gl2mfn, void *new_gl2p, u32 size); extern int SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, GUEST_LEVELS)( struct vcpu *v, mfn_t gl3mfn, void *new_gl3p, u32 size); extern int SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, GUEST_LEVELS)( struct vcpu *v, mfn_t gl4mfn, void *new_gl4p, u32 size); extern void SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, GUEST_LEVELS)( struct vcpu *v, mfn_t smfn); extern void SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, GUEST_LEVELS)( struct vcpu *v, mfn_t smfn); extern void SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, GUEST_LEVELS)( struct vcpu *v, mfn_t smfn); extern void SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, GUEST_LEVELS)( struct vcpu *v, mfn_t smfn); extern void SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings, GUEST_LEVELS) (struct vcpu *v, mfn_t sl2mfn, int user_only); extern void SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings, GUEST_LEVELS) (struct vcpu *v, mfn_t sl3mfn, int user_only); extern void SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings, GUEST_LEVELS) (struct vcpu *v, mfn_t sl4mfn, int user_only); extern int SHADOW_INTERNAL_NAME(sh_rm_write_access_from_l1, GUEST_LEVELS) (struct vcpu *v, mfn_t sl1mfn, mfn_t readonly_mfn); extern int SHADOW_INTERNAL_NAME(sh_rm_mappings_from_l1, GUEST_LEVELS) (struct vcpu *v, mfn_t sl1mfn, mfn_t target_mfn); extern void SHADOW_INTERNAL_NAME(sh_clear_shadow_entry, GUEST_LEVELS) (struct vcpu *v, void *ep, mfn_t smfn); extern int SHADOW_INTERNAL_NAME(sh_remove_l1_shadow, GUEST_LEVELS) (struct vcpu *v, mfn_t sl2mfn, mfn_t sl1mfn); extern int SHADOW_INTERNAL_NAME(sh_remove_l2_shadow, GUEST_LEVELS) (struct vcpu *v, mfn_t sl3mfn, mfn_t sl2mfn); extern int SHADOW_INTERNAL_NAME(sh_remove_l3_shadow, GUEST_LEVELS) (struct vcpu *v, mfn_t sl4mfn, mfn_t sl3mfn); #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES int SHADOW_INTERNAL_NAME(sh_audit_l1_table, GUEST_LEVELS) (struct vcpu *v, mfn_t sl1mfn, mfn_t x); int SHADOW_INTERNAL_NAME(sh_audit_fl1_table, GUEST_LEVELS) (struct vcpu *v, mfn_t sl1mfn, mfn_t x); int SHADOW_INTERNAL_NAME(sh_audit_l2_table, GUEST_LEVELS) (struct vcpu *v, mfn_t sl2mfn, mfn_t x); int SHADOW_INTERNAL_NAME(sh_audit_l3_table, GUEST_LEVELS) (struct vcpu *v, mfn_t sl3mfn, mfn_t x); int SHADOW_INTERNAL_NAME(sh_audit_l4_table, GUEST_LEVELS) (struct vcpu *v, mfn_t sl4mfn, mfn_t x); #endif extern void * SHADOW_INTERNAL_NAME(sh_guest_map_l1e, GUEST_LEVELS) (struct vcpu *v, unsigned long va, unsigned long *gl1mfn); extern void SHADOW_INTERNAL_NAME(sh_guest_get_eff_l1e, GUEST_LEVELS) (struct vcpu *v, unsigned long va, void *eff_l1e); extern mfn_t SHADOW_INTERNAL_NAME(sh_make_monitor_table, GUEST_LEVELS) (struct vcpu *v); extern void SHADOW_INTERNAL_NAME(sh_destroy_monitor_table, GUEST_LEVELS) (struct vcpu *v, mfn_t mmfn); extern const struct 
paging_mode SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS); #if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC extern void SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS) (struct vcpu *v, mfn_t gmfn, mfn_t snpmfn); extern int SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS) (struct vcpu*v, mfn_t gmfn); extern int SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p, GUEST_LEVELS) (struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off); #endif xen-4.4.0/xen/include/0000775000175000017500000000000012307313555012660 5ustar smbsmbxen-4.4.0/xen/include/xen/0000775000175000017500000000000012307313555013452 5ustar smbsmbxen-4.4.0/xen/include/xen/tasklet.h0000664000175000017500000000353212307313555015275 0ustar smbsmb/****************************************************************************** * tasklet.h * * Tasklets are dynamically-allocatable tasks run in either VCPU context * (specifically, the idle VCPU's context) or in softirq context, on at most * one CPU at a time. Softirq versus VCPU context execution is specified * during per-tasklet initialisation. */ #ifndef __XEN_TASKLET_H__ #define __XEN_TASKLET_H__ #include #include #include struct tasklet { struct list_head list; int scheduled_on; bool_t is_softirq; bool_t is_running; bool_t is_dead; void (*func)(unsigned long); unsigned long data; }; #define _DECLARE_TASKLET(name, func, data, softirq) \ struct tasklet name = { \ LIST_HEAD_INIT(name.list), -1, softirq, 0, 0, func, data } #define DECLARE_TASKLET(name, func, data) \ _DECLARE_TASKLET(name, func, data, 0) #define DECLARE_SOFTIRQ_TASKLET(name, func, data) \ _DECLARE_TASKLET(name, func, data, 1) /* Indicates status of tasklet work on each CPU. */ DECLARE_PER_CPU(unsigned long, tasklet_work_to_do); #define _TASKLET_enqueued 0 /* Tasklet work is enqueued for this CPU. */ #define _TASKLET_scheduled 1 /* Scheduler has scheduled do_tasklet(). 
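 *
 * Illustrative use of the tasklet API declared around here (a sketch, not
 * part of the original header; my_work and my_tasklet are made-up names):
 *   static void my_work(unsigned long data);
 *   static DECLARE_TASKLET(my_tasklet, my_work, 0);
 *   ...
 *   tasklet_schedule(&my_tasklet);   later runs my_work(0) in the idle
 *                                    VCPU's context, per the header comment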
*/ #define TASKLET_enqueued (1ul << _TASKLET_enqueued) #define TASKLET_scheduled (1ul << _TASKLET_scheduled) void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu); void tasklet_schedule(struct tasklet *t); void do_tasklet(void); void tasklet_kill(struct tasklet *t); void tasklet_init( struct tasklet *t, void (*func)(unsigned long), unsigned long data); void softirq_tasklet_init( struct tasklet *t, void (*func)(unsigned long), unsigned long data); void tasklet_subsys_init(void); #endif /* __XEN_TASKLET_H__ */ xen-4.4.0/xen/include/xen/byteorder/0000775000175000017500000000000012307313555015451 5ustar smbsmbxen-4.4.0/xen/include/xen/byteorder/little_endian.h0000664000175000017500000000661412307313555020444 0ustar smbsmb#ifndef __XEN_BYTEORDER_LITTLE_ENDIAN_H__ #define __XEN_BYTEORDER_LITTLE_ENDIAN_H__ #ifndef __LITTLE_ENDIAN #define __LITTLE_ENDIAN 1234 #endif #ifndef __LITTLE_ENDIAN_BITFIELD #define __LITTLE_ENDIAN_BITFIELD #endif #include #include #define __constant_cpu_to_le64(x) ((__force __le64)(__u64)(x)) #define __constant_le64_to_cpu(x) ((__force __u64)(__le64)(x)) #define __constant_cpu_to_le32(x) ((__force __le32)(__u32)(x)) #define __constant_le32_to_cpu(x) ((__force __u32)(__le32)(x)) #define __constant_cpu_to_le16(x) ((__force __le16)(__u16)(x)) #define __constant_le16_to_cpu(x) ((__force __u16)(__le16)(x)) #define __constant_cpu_to_be64(x) ((__force __be64)___constant_swab64((x))) #define __constant_be64_to_cpu(x) ___constant_swab64((__force __u64)(__be64)(x)) #define __constant_cpu_to_be32(x) ((__force __be32)___constant_swab32((x))) #define __constant_be32_to_cpu(x) ___constant_swab32((__force __u32)(__be32)(x)) #define __constant_cpu_to_be16(x) ((__force __be16)___constant_swab16((x))) #define __constant_be16_to_cpu(x) ___constant_swab16((__force __u16)(__be16)(x)) #define __cpu_to_le64(x) ((__force __le64)(__u64)(x)) #define __le64_to_cpu(x) ((__force __u64)(__le64)(x)) #define __cpu_to_le32(x) ((__force __le32)(__u32)(x)) #define __le32_to_cpu(x) ((__force __u32)(__le32)(x)) #define __cpu_to_le16(x) ((__force __le16)(__u16)(x)) #define __le16_to_cpu(x) ((__force __u16)(__le16)(x)) #define __cpu_to_be64(x) ((__force __be64)__swab64((x))) #define __be64_to_cpu(x) __swab64((__force __u64)(__be64)(x)) #define __cpu_to_be32(x) ((__force __be32)__swab32((x))) #define __be32_to_cpu(x) __swab32((__force __u32)(__be32)(x)) #define __cpu_to_be16(x) ((__force __be16)__swab16((x))) #define __be16_to_cpu(x) __swab16((__force __u16)(__be16)(x)) static inline __le64 __cpu_to_le64p(const __u64 *p) { return (__force __le64)*p; } static inline __u64 __le64_to_cpup(const __le64 *p) { return (__force __u64)*p; } static inline __le32 __cpu_to_le32p(const __u32 *p) { return (__force __le32)*p; } static inline __u32 __le32_to_cpup(const __le32 *p) { return (__force __u32)*p; } static inline __le16 __cpu_to_le16p(const __u16 *p) { return (__force __le16)*p; } static inline __u16 __le16_to_cpup(const __le16 *p) { return (__force __u16)*p; } static inline __be64 __cpu_to_be64p(const __u64 *p) { return (__force __be64)__swab64p(p); } static inline __u64 __be64_to_cpup(const __be64 *p) { return __swab64p((__u64 *)p); } static inline __be32 __cpu_to_be32p(const __u32 *p) { return (__force __be32)__swab32p(p); } static inline __u32 __be32_to_cpup(const __be32 *p) { return __swab32p((__u32 *)p); } static inline __be16 __cpu_to_be16p(const __u16 *p) { return (__force __be16)__swab16p(p); } static inline __u16 __be16_to_cpup(const __be16 *p) { return __swab16p((__u16 *)p); } #define 
__cpu_to_le64s(x) do {} while (0) #define __le64_to_cpus(x) do {} while (0) #define __cpu_to_le32s(x) do {} while (0) #define __le32_to_cpus(x) do {} while (0) #define __cpu_to_le16s(x) do {} while (0) #define __le16_to_cpus(x) do {} while (0) #define __cpu_to_be64s(x) __swab64s((x)) #define __be64_to_cpus(x) __swab64s((x)) #define __cpu_to_be32s(x) __swab32s((x)) #define __be32_to_cpus(x) __swab32s((x)) #define __cpu_to_be16s(x) __swab16s((x)) #define __be16_to_cpus(x) __swab16s((x)) #include #endif /* __XEN_BYTEORDER_LITTLE_ENDIAN_H__ */ xen-4.4.0/xen/include/xen/byteorder/generic.h0000664000175000017500000000430212307313555017235 0ustar smbsmb#ifndef __XEN_BYTEORDER_GENERIC_H__ #define __XEN_BYTEORDER_GENERIC_H__ /* * Generic Byte-reordering support * * The "... p" macros, like le64_to_cpup, can be used with pointers * to unaligned data, but there will be a performance penalty on * some architectures. Use get_unaligned for unaligned data. * * The following macros are to be defined by : * * Conversion of XX-bit integers (16- 32- or 64-) * between native CPU format and little/big endian format * 64-bit stuff only defined for proper architectures * cpu_to_[bl]eXX(__uXX x) * [bl]eXX_to_cpu(__uXX x) * * The same, but takes a pointer to the value to convert * cpu_to_[bl]eXXp(__uXX x) * [bl]eXX_to_cpup(__uXX x) * * The same, but change in situ * cpu_to_[bl]eXXs(__uXX x) * [bl]eXX_to_cpus(__uXX x) * * See asm-foo/byteorder.h for examples of how to provide * architecture-optimized versions */ #define cpu_to_le64 __cpu_to_le64 #define le64_to_cpu __le64_to_cpu #define cpu_to_le32 __cpu_to_le32 #define le32_to_cpu __le32_to_cpu #define cpu_to_le16 __cpu_to_le16 #define le16_to_cpu __le16_to_cpu #define cpu_to_be64 __cpu_to_be64 #define be64_to_cpu __be64_to_cpu #define cpu_to_be32 __cpu_to_be32 #define be32_to_cpu __be32_to_cpu #define cpu_to_be16 __cpu_to_be16 #define be16_to_cpu __be16_to_cpu #define cpu_to_le64p __cpu_to_le64p #define le64_to_cpup __le64_to_cpup #define cpu_to_le32p __cpu_to_le32p #define le32_to_cpup __le32_to_cpup #define cpu_to_le16p __cpu_to_le16p #define le16_to_cpup __le16_to_cpup #define cpu_to_be64p __cpu_to_be64p #define be64_to_cpup __be64_to_cpup #define cpu_to_be32p __cpu_to_be32p #define be32_to_cpup __be32_to_cpup #define cpu_to_be16p __cpu_to_be16p #define be16_to_cpup __be16_to_cpup #define cpu_to_le64s __cpu_to_le64s #define le64_to_cpus __le64_to_cpus #define cpu_to_le32s __cpu_to_le32s #define le32_to_cpus __le32_to_cpus #define cpu_to_le16s __cpu_to_le16s #define le16_to_cpus __le16_to_cpus #define cpu_to_be64s __cpu_to_be64s #define be64_to_cpus __be64_to_cpus #define cpu_to_be32s __cpu_to_be32s #define be32_to_cpus __be32_to_cpus #define cpu_to_be16s __cpu_to_be16s #define be16_to_cpus __be16_to_cpus #endif /* __XEN_BYTEORDER_GENERIC_H__ */ xen-4.4.0/xen/include/xen/byteorder/swab.h0000664000175000017500000001421112307313555016555 0ustar smbsmb#ifndef __XEN_BYTEORDER_SWAB_H__ #define __XEN_BYTEORDER_SWAB_H__ /* * Byte-swapping, independently from CPU endianness * swabXX[ps]?(foo) * * Francois-Rene Rideau 19971205 * separated swab functions from cpu_to_XX, * to clean up support for bizarre-endian architectures. */ /* casts are necessary for constants, because we never know how for sure * how U/UL/ULL map to __u16, __u32, __u64. At least not in a portable way. 
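 *
 * Illustrative examples (not part of the original header) of what the
 * macros below evaluate to, independent of host endianness:
 *   ___swab16(0x1234)                          -> 0x3412
 *   ___constant_swab32(0x12345678)             -> 0x78563412
 *   ___constant_swab64(0x0123456789abcdefULL)  -> 0xefcdab8967452301ULL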
*/ #define ___swab16(x) \ ({ \ __u16 __x = (x); \ ((__u16)( \ (((__u16)(__x) & (__u16)0x00ffU) << 8) | \ (((__u16)(__x) & (__u16)0xff00U) >> 8) )); \ }) #define ___swab32(x) \ ({ \ __u32 __x = (x); \ ((__u32)( \ (((__u32)(__x) & (__u32)0x000000ffUL) << 24) | \ (((__u32)(__x) & (__u32)0x0000ff00UL) << 8) | \ (((__u32)(__x) & (__u32)0x00ff0000UL) >> 8) | \ (((__u32)(__x) & (__u32)0xff000000UL) >> 24) )); \ }) #define ___swab64(x) \ ({ \ __u64 __x = (x); \ ((__u64)( \ (__u64)(((__u64)(__x) & (__u64)0x00000000000000ffULL) << 56) | \ (__u64)(((__u64)(__x) & (__u64)0x000000000000ff00ULL) << 40) | \ (__u64)(((__u64)(__x) & (__u64)0x0000000000ff0000ULL) << 24) | \ (__u64)(((__u64)(__x) & (__u64)0x00000000ff000000ULL) << 8) | \ (__u64)(((__u64)(__x) & (__u64)0x000000ff00000000ULL) >> 8) | \ (__u64)(((__u64)(__x) & (__u64)0x0000ff0000000000ULL) >> 24) | \ (__u64)(((__u64)(__x) & (__u64)0x00ff000000000000ULL) >> 40) | \ (__u64)(((__u64)(__x) & (__u64)0xff00000000000000ULL) >> 56) )); \ }) #define ___constant_swab16(x) \ ((__u16)( \ (((__u16)(x) & (__u16)0x00ffU) << 8) | \ (((__u16)(x) & (__u16)0xff00U) >> 8) )) #define ___constant_swab32(x) \ ((__u32)( \ (((__u32)(x) & (__u32)0x000000ffUL) << 24) | \ (((__u32)(x) & (__u32)0x0000ff00UL) << 8) | \ (((__u32)(x) & (__u32)0x00ff0000UL) >> 8) | \ (((__u32)(x) & (__u32)0xff000000UL) >> 24) )) #define ___constant_swab64(x) \ ((__u64)( \ (__u64)(((__u64)(x) & (__u64)0x00000000000000ffULL) << 56) | \ (__u64)(((__u64)(x) & (__u64)0x000000000000ff00ULL) << 40) | \ (__u64)(((__u64)(x) & (__u64)0x0000000000ff0000ULL) << 24) | \ (__u64)(((__u64)(x) & (__u64)0x00000000ff000000ULL) << 8) | \ (__u64)(((__u64)(x) & (__u64)0x000000ff00000000ULL) >> 8) | \ (__u64)(((__u64)(x) & (__u64)0x0000ff0000000000ULL) >> 24) | \ (__u64)(((__u64)(x) & (__u64)0x00ff000000000000ULL) >> 40) | \ (__u64)(((__u64)(x) & (__u64)0xff00000000000000ULL) >> 56) )) /* * provide defaults when no architecture-specific optimization is detected */ #ifndef __arch__swab16 # define __arch__swab16(x) ({ __u16 __tmp = (x) ; ___swab16(__tmp); }) #endif #ifndef __arch__swab32 # define __arch__swab32(x) ({ __u32 __tmp = (x) ; ___swab32(__tmp); }) #endif #ifndef __arch__swab64 # define __arch__swab64(x) ({ __u64 __tmp = (x) ; ___swab64(__tmp); }) #endif #ifndef __arch__swab16p # define __arch__swab16p(x) __arch__swab16(*(x)) #endif #ifndef __arch__swab32p # define __arch__swab32p(x) __arch__swab32(*(x)) #endif #ifndef __arch__swab64p # define __arch__swab64p(x) __arch__swab64(*(x)) #endif #ifndef __arch__swab16s # define __arch__swab16s(x) do { *(x) = __arch__swab16p((x)); } while (0) #endif #ifndef __arch__swab32s # define __arch__swab32s(x) do { *(x) = __arch__swab32p((x)); } while (0) #endif #ifndef __arch__swab64s # define __arch__swab64s(x) do { *(x) = __arch__swab64p((x)); } while (0) #endif /* * Allow constant folding */ #if defined(__GNUC__) && defined(__OPTIMIZE__) # define __swab16(x) \ (__builtin_constant_p((__u16)(x)) ? \ ___swab16((x)) : \ __fswab16((x))) # define __swab32(x) \ (__builtin_constant_p((__u32)(x)) ? \ ___swab32((x)) : \ __fswab32((x))) # define __swab64(x) \ (__builtin_constant_p((__u64)(x)) ? 
\ ___swab64((x)) : \ __fswab64((x))) #else # define __swab16(x) __fswab16(x) # define __swab32(x) __fswab32(x) # define __swab64(x) __fswab64(x) #endif /* OPTIMIZE */ static inline __attribute_const__ __u16 __fswab16(__u16 x) { return __arch__swab16(x); } static inline __u16 __swab16p(const __u16 *x) { return __arch__swab16p(x); } static inline void __swab16s(__u16 *addr) { __arch__swab16s(addr); } static inline __attribute_const__ __u32 __fswab32(__u32 x) { return __arch__swab32(x); } static inline __u32 __swab32p(const __u32 *x) { return __arch__swab32p(x); } static inline void __swab32s(__u32 *addr) { __arch__swab32s(addr); } #ifdef __BYTEORDER_HAS_U64__ static inline __attribute_const__ __u64 __fswab64(__u64 x) { # ifdef __SWAB_64_THRU_32__ __u32 h = x >> 32; __u32 l = x & ((1ULL<<32)-1); return (((__u64)__swab32(l)) << 32) | ((__u64)(__swab32(h))); # else return __arch__swab64(x); # endif } static inline __u64 __swab64p(const __u64 *x) { return __arch__swab64p(x); } static inline void __swab64s(__u64 *addr) { __arch__swab64s(addr); } #endif /* __BYTEORDER_HAS_U64__ */ #define swab16 __swab16 #define swab32 __swab32 #define swab64 __swab64 #define swab16p __swab16p #define swab32p __swab32p #define swab64p __swab64p #define swab16s __swab16s #define swab32s __swab32s #define swab64s __swab64s #endif /* __XEN_BYTEORDER_SWAB_H__ */ xen-4.4.0/xen/include/xen/byteorder/big_endian.h0000664000175000017500000000656712307313555017717 0ustar smbsmb#ifndef __XEN_BYTEORDER_BIG_ENDIAN_H__ #define __XEN_BYTEORDER_BIG_ENDIAN_H__ #ifndef __BIG_ENDIAN #define __BIG_ENDIAN 4321 #endif #ifndef __BIG_ENDIAN_BITFIELD #define __BIG_ENDIAN_BITFIELD #endif #include #include #define __constant_cpu_to_le64(x) ((__force __le64)___constant_swab64((x))) #define __constant_le64_to_cpu(x) ___constant_swab64((__force __u64)(__le64)(x)) #define __constant_cpu_to_le32(x) ((__force __le32)___constant_swab32((x))) #define __constant_le32_to_cpu(x) ___constant_swab32((__force __u32)(__le32)(x)) #define __constant_cpu_to_le16(x) ((__force __le16)___constant_swab16((x))) #define __constant_le16_to_cpu(x) ___constant_swab16((__force __u16)(__le16)(x)) #define __constant_cpu_to_be64(x) ((__force __be64)(__u64)(x)) #define __constant_be64_to_cpu(x) ((__force __u64)(__be64)(x)) #define __constant_cpu_to_be32(x) ((__force __be32)(__u32)(x)) #define __constant_be32_to_cpu(x) ((__force __u32)(__be32)(x)) #define __constant_cpu_to_be16(x) ((__force __be16)(__u16)(x)) #define __constant_be16_to_cpu(x) ((__force __u16)(__be16)(x)) #define __cpu_to_le64(x) ((__force __le64)__swab64((x))) #define __le64_to_cpu(x) __swab64((__force __u64)(__le64)(x)) #define __cpu_to_le32(x) ((__force __le32)__swab32((x))) #define __le32_to_cpu(x) __swab32((__force __u32)(__le32)(x)) #define __cpu_to_le16(x) ((__force __le16)__swab16((x))) #define __le16_to_cpu(x) __swab16((__force __u16)(__le16)(x)) #define __cpu_to_be64(x) ((__force __be64)(__u64)(x)) #define __be64_to_cpu(x) ((__force __u64)(__be64)(x)) #define __cpu_to_be32(x) ((__force __be32)(__u32)(x)) #define __be32_to_cpu(x) ((__force __u32)(__be32)(x)) #define __cpu_to_be16(x) ((__force __be16)(__u16)(x)) #define __be16_to_cpu(x) ((__force __u16)(__be16)(x)) static inline __le64 __cpu_to_le64p(const __u64 *p) { return (__force __le64)__swab64p(p); } static inline __u64 __le64_to_cpup(const __le64 *p) { return __swab64p((__u64 *)p); } static inline __le32 __cpu_to_le32p(const __u32 *p) { return (__force __le32)__swab32p(p); } static inline __u32 __le32_to_cpup(const __le32 *p) { return 
__swab32p((__u32 *)p); } static inline __le16 __cpu_to_le16p(const __u16 *p) { return (__force __le16)__swab16p(p); } static inline __u16 __le16_to_cpup(const __le16 *p) { return __swab16p((__u16 *)p); } static inline __be64 __cpu_to_be64p(const __u64 *p) { return (__force __be64)*p; } static inline __u64 __be64_to_cpup(const __be64 *p) { return (__force __u64)*p; } static inline __be32 __cpu_to_be32p(const __u32 *p) { return (__force __be32)*p; } static inline __u32 __be32_to_cpup(const __be32 *p) { return (__force __u32)*p; } static inline __be16 __cpu_to_be16p(const __u16 *p) { return (__force __be16)*p; } static inline __u16 __be16_to_cpup(const __be16 *p) { return (__force __u16)*p; } #define __cpu_to_le64s(x) __swab64s((x)) #define __le64_to_cpus(x) __swab64s((x)) #define __cpu_to_le32s(x) __swab32s((x)) #define __le32_to_cpus(x) __swab32s((x)) #define __cpu_to_le16s(x) __swab16s((x)) #define __le16_to_cpus(x) __swab16s((x)) #define __cpu_to_be64s(x) do {} while (0) #define __be64_to_cpus(x) do {} while (0) #define __cpu_to_be32s(x) do {} while (0) #define __be32_to_cpus(x) do {} while (0) #define __cpu_to_be16s(x) do {} while (0) #define __be16_to_cpus(x) do {} while (0) #include #endif /* __XEN_BYTEORDER_BIG_ENDIAN_H__ */ xen-4.4.0/xen/include/xen/softirq.h0000664000175000017500000000214412307313555015313 0ustar smbsmb#if !defined(__XEN_SOFTIRQ_H__) && !defined(__ASSEMBLY__) #define __XEN_SOFTIRQ_H__ /* Low-latency softirqs come first in the following list. */ enum { TIMER_SOFTIRQ = 0, SCHEDULE_SOFTIRQ, NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, RCU_SOFTIRQ, TASKLET_SOFTIRQ, NR_COMMON_SOFTIRQS }; #include #include #include #include #include #include #define NR_SOFTIRQS (NR_COMMON_SOFTIRQS + NR_ARCH_SOFTIRQS) typedef void (*softirq_handler)(void); asmlinkage void do_softirq(void); void open_softirq(int nr, softirq_handler handler); void softirq_init(void); void cpumask_raise_softirq(const cpumask_t *, unsigned int nr); void cpu_raise_softirq(unsigned int cpu, unsigned int nr); void raise_softirq(unsigned int nr); /* * Process pending softirqs on this CPU. This should be called periodically * when performing work that prevents softirqs from running in a timely manner. * Use this instead of do_softirq() when you do not want to be preempted. */ void process_pending_softirqs(void); #endif /* __XEN_SOFTIRQ_H__ */ xen-4.4.0/xen/include/xen/hypercall.h0000664000175000017500000000676512307313555015624 0ustar smbsmb/****************************************************************************** * hypercall.h */ #ifndef __XEN_HYPERCALL_H__ #define __XEN_HYPERCALL_H__ #include #include #include #include #include #include #include #include #include #include #include extern long do_ni_hypercall( void); extern long do_sched_op_compat( int cmd, unsigned long arg); extern long do_sched_op( int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); extern long do_domctl( XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl); extern long arch_do_domctl( struct xen_domctl *domctl, struct domain *d, XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl); extern long do_sysctl( XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl); extern long arch_do_sysctl( struct xen_sysctl *sysctl, XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl); extern long do_platform_op( XEN_GUEST_HANDLE_PARAM(xen_platform_op_t) u_xenpf_op); /* * To allow safe resume of do_memory_op() after preemption, we need to know * at what point in the page list to resume. For this purpose I steal the * high-order bits of the @cmd parameter, which are otherwise unused and zero. 
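 *
 * Sketch of the encoding implied by the MEMOP_* macros just below
 * (illustrative, not part of the original header):
 *   op           = cmd & MEMOP_CMD_MASK;        op code in the low 6 bits
 *   start_extent = cmd >> MEMOP_EXTENT_SHIFT;   point at which to resume
 * and a preempted handler can arrange its own restart via
 *   cmd = op | (start_extent << MEMOP_EXTENT_SHIFT);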
*/ #define MEMOP_EXTENT_SHIFT 6 /* cmd[:6] == start_extent */ #define MEMOP_CMD_MASK ((1 << MEMOP_EXTENT_SHIFT) - 1) extern long do_memory_op( unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg); extern long do_multicall( XEN_GUEST_HANDLE_PARAM(multicall_entry_t) call_list, unsigned int nr_calls); extern long do_set_timer_op( s_time_t timeout); extern long do_event_channel_op( int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); extern long do_xen_version( int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); extern long do_console_io( int cmd, int count, XEN_GUEST_HANDLE_PARAM(char) buffer); extern long do_grant_table_op( unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) uop, unsigned int count); extern long do_vm_assist( unsigned int cmd, unsigned int type); extern long do_vcpu_op( int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg); struct vcpu; extern long arch_do_vcpu_op(int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg); extern long do_nmi_op( unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); extern long do_hvm_op( unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg); extern long do_kexec_op( unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg); extern long do_xsm_op( XEN_GUEST_HANDLE_PARAM(xsm_op_t) u_xsm_op); extern long do_tmem_op( XEN_GUEST_HANDLE_PARAM(tmem_op_t) uops); extern long do_xenoprof_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg); #ifdef CONFIG_COMPAT extern int compat_memory_op( unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); extern int compat_grant_table_op( unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) uop, unsigned int count); extern int compat_vcpu_op( int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg); extern int compat_xenoprof_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg); extern int compat_xen_version( int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); extern int compat_sched_op( int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); extern int compat_set_timer_op( u32 lo, s32 hi); #endif void arch_get_xen_caps(xen_capabilities_info_t *info); #endif /* __XEN_HYPERCALL_H__ */ xen-4.4.0/xen/include/xen/xenoprof.h0000664000175000017500000000405312307313555015465 0ustar smbsmb/****************************************************************************** * xenoprof.h * * Xenoprof: Xenoprof enables performance profiling in Xen * * Copyright (C) 2005 Hewlett-Packard Co. * written by Aravind Menon & Jose Renato Santos */ #ifndef __XEN_XENOPROF_H__ #define __XEN_XENOPROF_H__ #include #include #include #define XENOPROF_DOMAIN_IGNORED 0 #define XENOPROF_DOMAIN_ACTIVE 1 #define XENOPROF_DOMAIN_PASSIVE 2 #define XENOPROF_IDLE 0 #define XENOPROF_INITIALIZED 1 #define XENOPROF_COUNTERS_RESERVED 2 #define XENOPROF_READY 3 #define XENOPROF_PROFILING 4 #ifndef CONFIG_COMPAT typedef struct xenoprof_buf xenoprof_buf_t; #else #include typedef union { struct xenoprof_buf native; struct compat_oprof_buf compat; } xenoprof_buf_t; #endif struct xenoprof_vcpu { int event_size; xenoprof_buf_t *buffer; }; struct xenoprof { char *rawbuf; int npages; int nbuf; int bufsize; int domain_type; int domain_ready; int is_primary; #ifdef CONFIG_COMPAT int is_compat; #endif struct xenoprof_vcpu *vcpu; }; #ifndef CONFIG_COMPAT #define XENOPROF_COMPAT(x) 0 #define xenoprof_buf(d, b, field) ((b)->field) #else #define XENOPROF_COMPAT(x) ((x)->is_compat) #define xenoprof_buf(d, b, field) (*(!(d)->xenoprof->is_compat ? 
\ &(b)->native.field : \ &(b)->compat.field)) #endif struct domain; int is_active(struct domain *d); int is_passive(struct domain *d); void free_xenoprof_pages(struct domain *d); int xenoprof_add_trace(struct vcpu *, uint64_t pc, int mode); #define PMU_OWNER_NONE 0 #define PMU_OWNER_XENOPROF 1 #define PMU_OWNER_HVM 2 int acquire_pmu_ownship(int pmu_ownership); void release_pmu_ownship(int pmu_ownership); void xenoprof_log_event(struct vcpu *, const struct cpu_user_regs *, uint64_t pc, int mode, int event); #endif /* __XEN__XENOPROF_H__ */ xen-4.4.0/xen/include/xen/mm.h0000664000175000017500000002754312307313555014247 0ustar smbsmb/****************************************************************************** * include/xen/mm.h * * Definitions for memory pages, frame numbers, addresses, allocations, etc. * * Note that Xen must handle several different physical 'address spaces' and * there is a consistent terminology for these: * * 1. gpfn/gpaddr: A guest-specific pseudo-physical frame number or address. * 2. gmfn/gmaddr: A machine address from the p.o.v. of a particular guest. * 3. mfn/maddr: A real machine frame number or address. * 4. pfn/paddr: Used in 'polymorphic' functions that work across all * address spaces, depending on context. See the pagetable * conversion macros in asm-x86/page.h for examples. * Also 'paddr_t' is big enough to store any physical address. * * This scheme provides consistent function and variable names even when * different guests are running in different memory-management modes. * 1. A guest running in auto-translated mode (e.g., shadow_mode_translate()) * will have gpfn == gmfn and gmfn != mfn. * 2. A paravirtualised x86 guest will have gpfn != gmfn and gmfn == mfn. * 3. A paravirtualised guest with no pseudophysical overlay will have * gpfn == gpmfn == mfn. * * Copyright (c) 2002-2006, K A Fraser */ #ifndef __XEN_MM_H__ #define __XEN_MM_H__ #include #include #include struct domain; struct page_info; /* Boot-time allocator. Turns into generic allocator after bootstrap. */ void init_boot_pages(paddr_t ps, paddr_t pe); unsigned long alloc_boot_pages( unsigned long nr_pfns, unsigned long pfn_align); void end_boot_allocator(void); /* Xen suballocator. These functions are interrupt-safe. */ void init_xenheap_pages(paddr_t ps, paddr_t pe); void xenheap_max_mfn(unsigned long mfn); void *alloc_xenheap_pages(unsigned int order, unsigned int memflags); void free_xenheap_pages(void *v, unsigned int order); #define alloc_xenheap_page() (alloc_xenheap_pages(0,0)) #define free_xenheap_page(v) (free_xenheap_pages(v,0)) /* Map machine page range in Xen virtual address space. */ int map_pages_to_xen( unsigned long virt, unsigned long mfn, unsigned long nr_mfns, unsigned int flags); void destroy_xen_mappings(unsigned long v, unsigned long e); /* Claim handling */ unsigned long domain_adjust_tot_pages(struct domain *d, long pages); int domain_set_outstanding_pages(struct domain *d, unsigned long pages); void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages); /* Domain suballocator. 
These functions are *not* interrupt-safe.*/ void init_domheap_pages(paddr_t ps, paddr_t pe); struct page_info *alloc_domheap_pages( struct domain *d, unsigned int order, unsigned int memflags); void free_domheap_pages(struct page_info *pg, unsigned int order); unsigned long avail_domheap_pages_region( unsigned int node, unsigned int min_width, unsigned int max_width); unsigned long avail_domheap_pages(void); unsigned long avail_node_heap_pages(unsigned int); #define alloc_domheap_page(d,f) (alloc_domheap_pages(d,0,f)) #define free_domheap_page(p) (free_domheap_pages(p,0)) unsigned int online_page(unsigned long mfn, uint32_t *status); int offline_page(unsigned long mfn, int broken, uint32_t *status); int query_page_offline(unsigned long mfn, uint32_t *status); unsigned long total_free_pages(void); void scrub_heap_pages(void); int assign_pages( struct domain *d, struct page_info *pg, unsigned int order, unsigned int memflags); /* Dump info to serial console */ void arch_dump_shared_mem_info(void); /* memflags: */ #define _MEMF_no_refcount 0 #define MEMF_no_refcount (1U<<_MEMF_no_refcount) #define _MEMF_populate_on_demand 1 #define MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand) #define _MEMF_tmem 2 #define MEMF_tmem (1U<<_MEMF_tmem) #define _MEMF_no_dma 3 #define MEMF_no_dma (1U<<_MEMF_no_dma) #define _MEMF_exact_node 4 #define MEMF_exact_node (1U<<_MEMF_exact_node) #define _MEMF_node 8 #define MEMF_node(n) ((((n)+1)&0xff)<<_MEMF_node) #define _MEMF_bits 24 #define MEMF_bits(n) ((n)<<_MEMF_bits) #ifdef CONFIG_PAGEALLOC_MAX_ORDER #define MAX_ORDER CONFIG_PAGEALLOC_MAX_ORDER #else #define MAX_ORDER 20 /* 2^20 contiguous pages */ #endif #define page_list_entry list_head #include #ifndef page_list_entry struct page_list_head { struct page_info *next, *tail; }; /* These must only have instances in struct page_info. */ # define page_list_entry # define PAGE_LIST_NULL ((typeof(((struct page_info){}).list.next))~0) # if !defined(pdx_to_page) && !defined(page_to_pdx) # if defined(__page_to_mfn) || defined(__mfn_to_page) # define page_to_pdx __page_to_mfn # define pdx_to_page __mfn_to_page # else # define page_to_pdx page_to_mfn # define pdx_to_page mfn_to_page # endif # endif # define PAGE_LIST_HEAD_INIT(name) { NULL, NULL } # define PAGE_LIST_HEAD(name) \ struct page_list_head name = PAGE_LIST_HEAD_INIT(name) # define INIT_PAGE_LIST_HEAD(head) ((head)->tail = (head)->next = NULL) # define INIT_PAGE_LIST_ENTRY(ent) ((ent)->prev = (ent)->next = PAGE_LIST_NULL) static inline int page_list_empty(const struct page_list_head *head) { return !head->next; } static inline struct page_info * page_list_first(const struct page_list_head *head) { return head->next; } static inline struct page_info * page_list_next(const struct page_info *page, const struct page_list_head *head) { return page != head->tail ? pdx_to_page(page->list.next) : NULL; } static inline struct page_info * page_list_prev(const struct page_info *page, const struct page_list_head *head) { return page != head->next ? 
pdx_to_page(page->list.prev) : NULL; } static inline void page_list_add(struct page_info *page, struct page_list_head *head) { if ( head->next ) { page->list.next = page_to_pdx(head->next); head->next->list.prev = page_to_pdx(page); } else { head->tail = page; page->list.next = PAGE_LIST_NULL; } page->list.prev = PAGE_LIST_NULL; head->next = page; } static inline void page_list_add_tail(struct page_info *page, struct page_list_head *head) { page->list.next = PAGE_LIST_NULL; if ( head->next ) { page->list.prev = page_to_pdx(head->tail); head->tail->list.next = page_to_pdx(page); } else { page->list.prev = PAGE_LIST_NULL; head->next = page; } head->tail = page; } static inline bool_t __page_list_del_head(struct page_info *page, struct page_list_head *head, struct page_info *next, struct page_info *prev) { if ( head->next == page ) { if ( head->tail != page ) { next->list.prev = PAGE_LIST_NULL; head->next = next; } else head->tail = head->next = NULL; return 1; } if ( head->tail == page ) { prev->list.next = PAGE_LIST_NULL; head->tail = prev; return 1; } return 0; } static inline void page_list_del(struct page_info *page, struct page_list_head *head) { struct page_info *next = pdx_to_page(page->list.next); struct page_info *prev = pdx_to_page(page->list.prev); if ( !__page_list_del_head(page, head, next, prev) ) { next->list.prev = page->list.prev; prev->list.next = page->list.next; } } static inline void page_list_del2(struct page_info *page, struct page_list_head *head1, struct page_list_head *head2) { struct page_info *next = pdx_to_page(page->list.next); struct page_info *prev = pdx_to_page(page->list.prev); if ( !__page_list_del_head(page, head1, next, prev) && !__page_list_del_head(page, head2, next, prev) ) { next->list.prev = page->list.prev; prev->list.next = page->list.next; } } static inline struct page_info * page_list_remove_head(struct page_list_head *head) { struct page_info *page = head->next; if ( page ) page_list_del(page, head); return page; } static inline void page_list_move(struct page_list_head *dst, struct page_list_head *src) { if ( !page_list_empty(src) ) { *dst = *src; INIT_PAGE_LIST_HEAD(src); } } static inline void page_list_splice(struct page_list_head *list, struct page_list_head *head) { struct page_info *first, *last, *at; if ( page_list_empty(list) ) return; if ( page_list_empty(head) ) { head->next = list->next; head->tail = list->tail; return; } first = list->next; last = list->tail; at = head->next; ASSERT(first->list.prev == PAGE_LIST_NULL); ASSERT(first->list.prev == at->list.prev); head->next = first; last->list.next = page_to_pdx(at); at->list.prev = page_to_pdx(last); } #define page_list_for_each(pos, head) \ for ( pos = (head)->next; pos; pos = page_list_next(pos, head) ) #define page_list_for_each_safe(pos, tmp, head) \ for ( pos = (head)->next; \ pos ? (tmp = page_list_next(pos, head), 1) : 0; \ pos = tmp ) #define page_list_for_each_safe_reverse(pos, tmp, head) \ for ( pos = (head)->tail; \ pos ? 
(tmp = page_list_prev(pos, head), 1) : 0; \ pos = tmp ) #else # define page_list_head list_head # define PAGE_LIST_HEAD_INIT LIST_HEAD_INIT # define PAGE_LIST_HEAD LIST_HEAD # define INIT_PAGE_LIST_HEAD INIT_LIST_HEAD # define INIT_PAGE_LIST_ENTRY INIT_LIST_HEAD # define page_list_empty list_empty # define page_list_first(hd) list_entry((hd)->next, \ struct page_info, list) # define page_list_next(pg, hd) list_entry((pg)->list.next, \ struct page_info, list) # define page_list_add(pg, hd) list_add(&(pg)->list, hd) # define page_list_add_tail(pg, hd) list_add_tail(&(pg)->list, hd) # define page_list_del(pg, hd) list_del(&(pg)->list) # define page_list_del2(pg, hd1, hd2) list_del(&(pg)->list) # define page_list_remove_head(hd) (!page_list_empty(hd) ? \ ({ \ struct page_info *__pg = page_list_first(hd); \ list_del(&__pg->list); \ __pg; \ }) : NULL) # define page_list_move(dst, src) (!list_empty(src) ? \ list_replace_init(src, dst) : (void)0) # define page_list_for_each(pos, head) list_for_each_entry(pos, head, list) # define page_list_for_each_safe(pos, tmp, head) \ list_for_each_entry_safe(pos, tmp, head, list) # define page_list_for_each_safe_reverse(pos, tmp, head) \ list_for_each_entry_safe_reverse(pos, tmp, head, list) # define page_list_splice(list, hd) list_splice(list, hd) #endif static inline unsigned int get_order_from_bytes(paddr_t size) { unsigned int order; size = (size - 1) >> PAGE_SHIFT; for ( order = 0; size; order++ ) size >>= 1; return order; } static inline unsigned int get_order_from_pages(unsigned long nr_pages) { unsigned int order; nr_pages--; for ( order = 0; nr_pages; order++ ) nr_pages >>= 1; return order; } void scrub_one_page(struct page_info *); int xenmem_add_to_physmap_one(struct domain *d, unsigned int space, domid_t foreign_domid, unsigned long idx, xen_pfn_t gpfn); /* Returns 1 on success, 0 on error, negative if the ring * for event propagation is full in the presence of paging */ int guest_remove_page(struct domain *d, unsigned long gmfn); #define RAM_TYPE_CONVENTIONAL 0x00000001 #define RAM_TYPE_RESERVED 0x00000002 #define RAM_TYPE_UNUSABLE 0x00000004 #define RAM_TYPE_ACPI 0x00000008 /* TRUE if the whole page at @mfn is of the requested RAM type(s) above. */ int page_is_ram_type(unsigned long mfn, unsigned long mem_type); #endif /* __XEN_MM_H__ */ xen-4.4.0/xen/include/xen/stringify.h0000664000175000017500000000052112307313555015637 0ustar smbsmb#ifndef __XEN_STRINGIFY_H #define __XEN_STRINGIFY_H /* Indirect stringification. Doing two levels allows the parameter to be a * macro itself. For example, compile with -DFOO=bar, __stringify(FOO) * converts to "bar". */ #define __stringify_1(x...) #x #define __stringify(x...) __stringify_1(x) #endif /* !__XEN_STRINGIFY_H */ xen-4.4.0/xen/include/xen/init.h0000664000175000017500000000763112307313555014575 0ustar smbsmb#ifndef _LINUX_INIT_H #define _LINUX_INIT_H #include /* * Mark functions and data as being only used at initialization * or exit time. */ #define __init __text_section(".init.text") #define __exit __text_section(".exit.text") #define __initdata __section(".init.data") #define __initconst __section(".init.rodata") #define __exitdata __used_section(".exit.data") #define __initsetup __used_section(".init.setup") #define __init_call(lvl) __used_section(".initcall" lvl ".init") #define __exit_call __used_section(".exitcall.exit") /* These macros are used to mark some functions or * initialized data (doesn't apply to uninitialized data) * as `initialization' functions. 
The kernel can take this * as hint that the function is used only during the initialization * phase and free up used memory resources after * * Usage: * For functions: * * You should add __init immediately before the function name, like: * * static void __init initme(int x, int y) * { * extern int z; z = x * y; * } * * If the function has a prototype somewhere, you can also add * __init between closing brace of the prototype and semicolon: * * extern int initialize_foobar_device(int, int, int) __init; * * For initialized data: * You should insert __initdata between the variable name and equal * sign followed by value, e.g.: * * static int init_variable __initdata = 0; * static char linux_logo[] __initdata = { 0x32, 0x36, ... }; * * Don't forget to initialize data not at file scope, i.e. within a function, * as gcc otherwise puts the data into the bss section and not into the init * section. * * Also note, that this data cannot be "const". */ #ifndef __ASSEMBLY__ /* * Used for initialization calls.. */ typedef int (*initcall_t)(void); typedef void (*exitcall_t)(void); #define presmp_initcall(fn) \ static initcall_t __initcall_##fn __init_call("presmp") = fn #define __initcall(fn) \ static initcall_t __initcall_##fn __init_call("1") = fn #define __exitcall(fn) \ static exitcall_t __exitcall_##fn __exit_call = fn void do_presmp_initcalls(void); void do_initcalls(void); /* * Used for kernel command line parameter setup */ struct kernel_param { const char *name; enum { OPT_STR, OPT_UINT, OPT_BOOL, OPT_INVBOOL, OPT_SIZE, OPT_CUSTOM } type; void *var; unsigned int len; }; extern struct kernel_param __setup_start, __setup_end; #define __setup_str static __initdata __attribute__((__aligned__(1))) char #define __kparam static __initsetup struct kernel_param #define custom_param(_name, _var) \ __setup_str __setup_str_##_var[] = _name; \ __kparam __setup_##_var = { __setup_str_##_var, OPT_CUSTOM, _var, 0 } #define boolean_param(_name, _var) \ __setup_str __setup_str_##_var[] = _name; \ __kparam __setup_##_var = \ { __setup_str_##_var, OPT_BOOL, &_var, sizeof(_var) } #define invbool_param(_name, _var) \ __setup_str __setup_str_##_var[] = _name; \ __kparam __setup_##_var = \ { __setup_str_##_var, OPT_INVBOOL, &_var, sizeof(_var) } #define integer_param(_name, _var) \ __setup_str __setup_str_##_var[] = _name; \ __kparam __setup_##_var = \ { __setup_str_##_var, OPT_UINT, &_var, sizeof(_var) } #define size_param(_name, _var) \ __setup_str __setup_str_##_var[] = _name; \ __kparam __setup_##_var = \ { __setup_str_##_var, OPT_SIZE, &_var, sizeof(_var) } #define string_param(_name, _var) \ __setup_str __setup_str_##_var[] = _name; \ __kparam __setup_##_var = \ { __setup_str_##_var, OPT_STR, &_var, sizeof(_var) } #endif /* __ASSEMBLY__ */ #ifdef CONFIG_HOTPLUG #define __devinit #define __devinitdata #define __devexit #define __devexitdata #else #define __devinit __init #define __devinitdata __initdata #define __devexit __exit #define __devexitdata __exitdata #endif #endif /* _LINUX_INIT_H */ xen-4.4.0/xen/include/xen/gdbstub.h0000664000175000017500000000655212307313555015265 0ustar smbsmb/* * Copyright (C) 2005 Hollis Blanchard , IBM Corporation * Copyright (C) 2006 Isaku Yamahata * VA Linux Systems Japan. K.K. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef __XEN_GDBSTUB_H__ #define __XEN_GDBSTUB_H__ #include #include #ifdef CRASH_DEBUG struct gdb_context { int serhnd; /* handle on our serial line */ int console_steal_id; /* handle on stolen console */ bool_t currently_attached; atomic_t running; unsigned long connected; u8 signum; char in_buf[PAGE_SIZE]; unsigned long in_bytes; char out_buf[PAGE_SIZE]; unsigned long out_offset; u8 out_csum; }; /* interface to arch specific routines */ void gdb_write_to_packet( const char *buf, int count, struct gdb_context *ctx); void gdb_write_to_packet_hex( unsigned long x, int int_size, struct gdb_context *ctx); /* ... writes in target native byte order as required by gdb spec. */ void gdb_send_packet(struct gdb_context *ctx); void gdb_send_reply(const char *buf, struct gdb_context *ctx); /* gdb stub trap handler: entry point */ int __trap_to_gdb(struct cpu_user_regs *regs, unsigned long cookie); /* arch specific routines */ u16 gdb_arch_signal_num( struct cpu_user_regs *regs, unsigned long cookie); void gdb_arch_read_reg_array( struct cpu_user_regs *regs, struct gdb_context *ctx); void gdb_arch_write_reg_array( struct cpu_user_regs *regs, const char* buf, struct gdb_context *ctx); void gdb_arch_read_reg( unsigned long regnum, struct cpu_user_regs *regs, struct gdb_context *ctx); void gdb_arch_write_reg( unsigned long regnum, unsigned long val, struct cpu_user_regs *regs, struct gdb_context *ctx); unsigned int gdb_arch_copy_from_user( void *dest, const void *src, unsigned len); unsigned int gdb_arch_copy_to_user( void *dest, const void *src, unsigned len); void gdb_arch_resume( struct cpu_user_regs *regs, unsigned long addr, unsigned long type, struct gdb_context *ctx); void gdb_arch_print_state(struct cpu_user_regs *regs); void gdb_arch_enter(struct cpu_user_regs *regs); void gdb_arch_exit(struct cpu_user_regs *regs); #define GDB_CONTINUE 0 #define GDB_STEP 1 #define SIGILL 4 #define SIGTRAP 5 #define SIGBUS 7 #define SIGFPE 8 #define SIGSEGV 11 #define SIGALRM 14 #define SIGTERM 15 #endif #endif /* __XEN_GDBSTUB_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * End: */ xen-4.4.0/xen/include/xen/cper.h0000664000175000017500000001266512307313555014566 0ustar smbsmb/* * UEFI Common Platform Error Record * * Copyright (C) 2010, Intel Corp. * Author: Huang Ying * Ported by: Liu, Jinsong * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef LINUX_CPER_H #define LINUX_CPER_H #include #include extern unsigned long get_sec(void); typedef struct { __u8 b[16]; } uuid_le; static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2) { return memcmp(&u1, &u2, sizeof(uuid_le)); } static inline u64 cper_next_record_id(void) { static u64 record_id; if (!record_id) record_id = (u64)get_sec() << 32; return ++record_id; } #define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ ((uuid_le) \ {{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ (b) & 0xff, ((b) >> 8) & 0xff, \ (c) & 0xff, ((c) >> 8) & 0xff, \ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) /* CPER record signature and the size */ #define CPER_SIG_RECORD "CPER" #define CPER_SIG_SIZE 4 /* Used in signature_end field in struct cper_record_header */ #define CPER_SIG_END 0xffffffff /* * CPER record header revision, used in revision field in struct * cper_record_header */ #define CPER_RECORD_REV 0x0100 /* * Severity difinition for error_severity in struct cper_record_header * and section_severity in struct cper_section_descriptor */ #define CPER_SER_RECOVERABLE 0x0 #define CPER_SER_FATAL 0x1 #define CPER_SER_CORRECTED 0x2 #define CPER_SER_INFORMATIONAL 0x3 /* * Notification type used to generate error record, used in * notification_type in struct cper_record_header * * Corrected Machine Check */ #define CPER_NOTIFY_CMC \ UUID_LE(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \ 0xEB, 0xD4, 0xF8, 0x90) /* Corrected Platform Error */ #define CPER_NOTIFY_CPE \ UUID_LE(0x4E292F96, 0xD843, 0x4a55, 0xA8, 0xC2, 0xD4, 0x81, \ 0xF2, 0x7E, 0xBE, 0xEE) /* Machine Check Exception */ #define CPER_NOTIFY_MCE \ UUID_LE(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \ 0xE1, 0x49, 0x13, 0xBB) /* PCI Express Error */ #define CPER_NOTIFY_PCIE \ UUID_LE(0xCF93C01F, 0x1A16, 0x4dfc, 0xB8, 0xBC, 0x9C, 0x4D, \ 0xAF, 0x67, 0xC1, 0x04) /* INIT Record (for IPF) */ #define CPER_NOTIFY_INIT \ UUID_LE(0xCC5263E8, 0x9308, 0x454a, 0x89, 0xD0, 0x34, 0x0B, \ 0xD3, 0x9B, 0xC9, 0x8E) /* Non-Maskable Interrupt */ #define CPER_NOTIFY_NMI \ UUID_LE(0x5BAD89FF, 0xB7E6, 0x42c9, 0x81, 0x4A, 0xCF, 0x24, \ 0x85, 0xD6, 0xE9, 0x8A) /* BOOT Error Record */ #define CPER_NOTIFY_BOOT \ UUID_LE(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \ 0xD4, 0x64, 0xB3, 0x8F) /* DMA Remapping Error */ #define CPER_NOTIFY_DMAR \ UUID_LE(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ 0x72, 0x2D, 0xEB, 0x41) /* * Flags bits definitions for flags in struct cper_record_header * If set, the error has been recovered */ #define CPER_HW_ERROR_FLAGS_RECOVERED 0x1 /* If set, the error is for previous boot */ #define CPER_HW_ERROR_FLAGS_PREVERR 0x2 /* If set, the error is injected for testing */ #define CPER_HW_ERROR_FLAGS_SIMULATED 0x4 /* * CPER section header revision, used in revision field in struct * cper_section_descriptor */ #define CPER_SEC_REV 0x0100 /* * Validation bits difinition for validation_bits in struct * cper_section_descriptor. If set, corresponding fields in struct * cper_section_descriptor contain valid information. 
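 *
 * Illustrative check (a sketch, not part of the original header; sec is
 * assumed to point at a struct cper_section_descriptor):
 *   if ( sec->validation_bits & CPER_SEC_VALID_FRU_ID )
 *       ... sec->fru_id holds a meaningful value ...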
* * corresponds fru_id */ #define CPER_SEC_VALID_FRU_ID 0x1 /* corresponds fru_text */ #define CPER_SEC_VALID_FRU_TEXT 0x2 /* * Flags bits definitions for flags in struct cper_section_descriptor * * If set, the section is associated with the error condition * directly, and should be focused on */ #define CPER_SEC_PRIMARY 0x0001 /* * All tables and structs must be byte-packed to match CPER * specification, since the tables are provided by the system BIOS */ #pragma pack(1) struct cper_record_header { char signature[CPER_SIG_SIZE]; /* must be CPER_SIG_RECORD */ __u16 revision; /* must be CPER_RECORD_REV */ __u32 signature_end; /* must be CPER_SIG_END */ __u16 section_count; __u32 error_severity; __u32 validation_bits; __u32 record_length; __u64 timestamp; uuid_le platform_id; uuid_le partition_id; uuid_le creator_id; uuid_le notification_type; __u64 record_id; __u32 flags; __u64 persistence_information; __u8 reserved[12]; /* must be zero */ }; struct cper_section_descriptor { __u32 section_offset; /* Offset in bytes of the * section body from the base * of the record header */ __u32 section_length; __u16 revision; /* must be CPER_RECORD_REV */ __u8 validation_bits; __u8 reserved; /* must be zero */ __u32 flags; uuid_le section_type; uuid_le fru_id; __u32 section_severity; __u8 fru_text[20]; }; /* Reset to default packing */ #pragma pack() #endif xen-4.4.0/xen/include/xen/lz4.h0000664000175000017500000000620612307313555014340 0ustar smbsmb#ifndef __LZ4_H__ #define __LZ4_H__ /* * LZ4 Kernel Interface * * Copyright (C) 2013, LG Electronics, Kyungsik Lee * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #define LZ4_MEM_COMPRESS (4096 * sizeof(unsigned char *)) #define LZ4HC_MEM_COMPRESS (65538 * sizeof(unsigned char *)) /* * lz4_compressbound() * Provides the maximum size that LZ4 may output in a "worst case" scenario * (input data not compressible) */ static inline size_t lz4_compressbound(size_t isize) { return isize + (isize / 255) + 16; } /* * lz4_compress() * src : source address of the original data * src_len : size of the original data * dst : output buffer address of the compressed data * This requires 'dst' of size LZ4_COMPRESSBOUND. * dst_len : is the output size, which is returned after compress done * workmem : address of the working memory. * This requires 'workmem' of size LZ4_MEM_COMPRESS. * return : Success if return 0 * Error if return (< 0) * note : Destination buffer and workmem must be already allocated with * the defined size. */ int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, size_t *dst_len, void *wrkmem); /* * lz4hc_compress() * src : source address of the original data * src_len : size of the original data * dst : output buffer address of the compressed data * This requires 'dst' of size LZ4_COMPRESSBOUND. * dst_len : is the output size, which is returned after compress done * workmem : address of the working memory. * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. * return : Success if return 0 * Error if return (< 0) * note : Destination buffer and workmem must be already allocated with * the defined size. 
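 *
 * Illustrative call sequence (a sketch, not part of the original header;
 * dst is assumed to provide lz4_compressbound(src_len) bytes and wrkmem
 * LZ4HC_MEM_COMPRESS bytes, both allocated by the caller):
 *   size_t dst_len;
 *   if ( lz4hc_compress(src, src_len, dst, &dst_len, wrkmem) == 0 )
 *       ... dst now holds dst_len bytes of compressed data ...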
*/ int lz4hc_compress(const unsigned char *src, size_t src_len, unsigned char *dst, size_t *dst_len, void *wrkmem); /* * lz4_decompress() * src : source address of the compressed data * src_len : is the input size, whcih is returned after decompress done * dest : output buffer address of the decompressed data * actual_dest_len: is the size of uncompressed data, supposing it's known * return : Success if return 0 * Error if return (< 0) * note : Destination buffer must be already allocated. * slightly faster than lz4_decompress_unknownoutputsize() */ int lz4_decompress(const unsigned char *src, size_t *src_len, unsigned char *dest, size_t actual_dest_len); /* * lz4_decompress_unknownoutputsize() * src : source address of the compressed data * src_len : is the input size, therefore the compressed size * dest : output buffer address of the decompressed data * dest_len: is the max size of the destination buffer, which is * returned with actual size of decompressed data after * decompress done * return : Success if return 0 * Error if return (< 0) * note : Destination buffer must be already allocated. */ int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, unsigned char *dest, size_t *dest_len); #endif xen-4.4.0/xen/include/xen/vmap.h0000664000175000017500000000127512307313555014573 0ustar smbsmb#if !defined(__XEN_VMAP_H__) && defined(VMAP_VIRT_START) #define __XEN_VMAP_H__ #include #include void *vm_alloc(unsigned int nr, unsigned int align); void vm_free(const void *); void *__vmap(const unsigned long *mfn, unsigned int granularity, unsigned int nr, unsigned int align, unsigned int flags); void *vmap(const unsigned long *mfn, unsigned int nr); void vunmap(const void *); void __iomem *ioremap(paddr_t, size_t); static inline void iounmap(void __iomem *va) { unsigned long addr = (unsigned long)(void __force *)va; vunmap((void *)(addr & PAGE_MASK)); } void vm_init(void); void *arch_vmap_virt_end(void); #endif /* __XEN_VMAP_H__ */ xen-4.4.0/xen/include/xen/err.h0000664000175000017500000000252012307313555014412 0ustar smbsmb#if !defined(__XEN_ERR_H__) && !defined(__ASSEMBLY__) #define __XEN_ERR_H__ #include #include /* * Kernel pointers have redundant information, so we can use a * scheme where we can return either an error code or a dentry * pointer with the same return value. * * This could be a per-architecture thing, to allow different * error and pointer decisions. */ #define MAX_ERRNO 4095 #define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) static inline void *__must_check ERR_PTR(long error) { return (void *)error; } static inline long __must_check PTR_ERR(const void *ptr) { return (long)ptr; } static inline long __must_check IS_ERR(const void *ptr) { return IS_ERR_VALUE((unsigned long)ptr); } static inline long __must_check IS_ERR_OR_NULL(const void *ptr) { return !ptr || IS_ERR_VALUE((unsigned long)ptr); } /** * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type * @ptr: The pointer to cast. * * Explicitly cast an error-valued pointer to another pointer type in such a * way as to make it clear that's what's going on. */ static inline void * __must_check ERR_CAST(const void *ptr) { /* cast away the const */ return (void *)ptr; } static inline int __must_check PTR_RET(const void *ptr) { return IS_ERR(ptr) ? PTR_ERR(ptr) : 0; } #endif /* __XEN_ERR_H__ */ xen-4.4.0/xen/include/xen/device_tree.h0000664000175000017500000004537212307313555016114 0ustar smbsmb/* * Device Tree * * Copyright (C) 2012 Citrix Systems, Inc. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #ifndef __XEN_DEVICE_TREE_H__ #define __XEN_DEVICE_TREE_H__ #include #include #include #include #include #define DEVICE_TREE_MAX_DEPTH 16 #define NR_MEM_BANKS 8 #define MOD_XEN 0 #define MOD_FDT 1 #define MOD_KERNEL 2 #define MOD_INITRD 3 #define NR_MODULES 4 #define MOD_DISCARD_FIRST MOD_FDT struct membank { paddr_t start; paddr_t size; }; struct dt_mem_info { int nr_banks; struct membank bank[NR_MEM_BANKS]; }; struct dt_mb_module { paddr_t start; paddr_t size; char cmdline[1024]; }; struct dt_module_info { int nr_mods; /* Module 0 is Xen itself, followed by the provided modules-proper */ struct dt_mb_module module[NR_MODULES]; }; struct dt_early_info { struct dt_mem_info mem; struct dt_module_info modules; }; /* * Struct used for matching a device */ struct dt_device_match { const char *path; const char *type; const char *compatible; }; #define DT_MATCH_PATH(p) { .path = p } #define DT_MATCH_TYPE(typ) { .type = typ } #define DT_MATCH_COMPATIBLE(compat) { .compatible = compat } typedef u32 dt_phandle; /** * dt_property - describe a property for a device * @name: name of the property * @length: size of the value * @value: pointer to data contained in the property * @next: pointer to the next property of a specific node */ struct dt_property { const char *name; u32 length; void *value; struct dt_property *next; }; /** * dt_device_node - describe a node in the device tree * @name: name of the node * @type: type of the node (ie: memory, cpu, ...) * @full_name: full name, it's composed of all the ascendant name separate by / * @used_by: who owns the node? (ie: xen, dom0...) * @properties: list of properties for the node * @child: pointer to the first child * @sibling: pointer to the next sibling * @allnext: pointer to the next in list of all nodes */ struct dt_device_node { const char *name; const char *type; dt_phandle phandle; char *full_name; domid_t used_by; /* By default it's used by dom0 */ struct dt_property *properties; struct dt_device_node *parent; struct dt_device_node *child; struct dt_device_node *sibling; struct dt_device_node *next; /* TODO: Remove it. Only use to know the last children */ struct dt_device_node *allnext; }; /** * IRQ line type. * * DT_IRQ_TYPE_NONE - default, unspecified type * DT_IRQ_TYPE_EDGE_RISING - rising edge triggered * DT_IRQ_TYPE_EDGE_FALLING - falling edge triggered * DT_IRQ_TYPE_EDGE_BOTH - rising and falling edge triggered * DT_IRQ_TYPE_LEVEL_HIGH - high level triggered * DT_IRQ_TYPE_LEVEL_LOW - low level triggered * DT_IRQ_TYPE_LEVEL_MASK - Mask to filter out the level bits * DT_IRQ_TYPE_SENSE_MASK - Mask for all the above bits */ #define DT_IRQ_TYPE_NONE 0x00000000 #define DT_IRQ_TYPE_EDGE_RISING 0x00000001 #define DT_IRQ_TYPE_EDGE_FALLING 0x00000002 #define DT_IRQ_TYPE_EDGE_BOTH \ (DT_IRQ_TYPE_EDGE_FALLING | DT_IRQ_TYPE_EDGE_RISING) #define DT_IRQ_TYPE_LEVEL_HIGH 0x00000004 #define DT_IRQ_TYPE_LEVEL_LOW 0x00000008 #define DT_IRQ_TYPE_LEVEL_MASK \ (DT_IRQ_TYPE_LEVEL_LOW | DT_IRQ_TYPE_LEVEL_HIGH) #define DT_IRQ_TYPE_SENSE_MASK 0x0000000f /** * dt_irq - describe an IRQ in the device tree * @irq: IRQ number * @type: IRQ type (see DT_IRQ_TYPE_*) * * This structure is returned when an interrupt is mapped. 
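 *
 * Typical use (an illustrative sketch; `dev' is assumed to be a valid
 * struct dt_device_node pointer already looked up by the caller):
 *   struct dt_irq irq;
 *   if ( dt_device_get_irq(dev, 0, &irq) == 0 )
 *       printk("IRQ %u, type %#x\n", irq.irq, irq.type);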
*/ struct dt_irq { unsigned int irq; unsigned int type; }; /* If type == DT_IRQ_TYPE_NONE, assume we use level triggered */ static inline bool_t dt_irq_is_level_triggered(const struct dt_irq *irq) { unsigned int type = irq->type; return (type & DT_IRQ_TYPE_LEVEL_MASK) || (type == DT_IRQ_TYPE_NONE); } /** * dt_raw_irq - container for device_node/irq_specifier for an irq controller * @controller: pointer to interrupt controller deivce tree node * @size: size of interrupt specifier * @specifier: array of cells @size long specifying the specific interrupt * * This structure is returned when an interrupt is mapped but not translated. */ #define DT_MAX_IRQ_SPEC 4 /* We handle specifiers of at most 4 cells */ struct dt_raw_irq { const struct dt_device_node *controller; u32 size; u32 specifier[DT_MAX_IRQ_SPEC]; }; #define dt_irq(irq) ((irq)->irq) #define dt_irq_flags(irq) ((irq)->flags) typedef int (*device_tree_node_func)(const void *fdt, int node, const char *name, int depth, u32 address_cells, u32 size_cells, void *data); extern struct dt_early_info early_info; extern const void *device_tree_flattened; size_t __init device_tree_early_init(const void *fdt, paddr_t paddr); const char __init *device_tree_bootargs(const void *fdt); void __init device_tree_dump(const void *fdt); /** * dt_unflatten_host_device_tree - Unflatten the host device tree * * Create a hierarchical device tree for the host DTB to be able * to retrieve parents. */ void __init dt_unflatten_host_device_tree(void); /** * IRQ translation callback * TODO: For the moment we assume that we only have ONE * interrupt-controller. */ typedef int (*dt_irq_xlate_func)(const u32 *intspec, unsigned int intsize, unsigned int *out_hwirq, unsigned int *out_type); extern dt_irq_xlate_func dt_irq_xlate; /** * Host device tree * DO NOT modify it! */ extern struct dt_device_node *dt_host; /** * Primary interrupt controller * Exynos SOC has an interrupt combiner, interrupt has no physical * meaning when it's not connected to the primary controller. * We will only map interrupt whose parent controller is * dt_interrupt_controller. It should always be a GIC. * TODO: Handle multiple GIC */ extern const struct dt_device_node *dt_interrupt_controller; /** * Find the interrupt controller * For the moment we handle only one interrupt controller: the first * one without parent which is compatible with the string "compat". * * If found, return the interrupt controller device node. 
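 *
 * Illustrative call (a sketch; the compatible string and the empty
 * terminating entry are assumptions of the example):
 *   static const struct dt_device_match gic_matches[] =
 *   {
 *       DT_MATCH_COMPATIBLE("arm,cortex-a15-gic"),
 *       { },
 *   };
 *   const struct dt_device_node *ctrl =
 *       dt_find_interrupt_controller(gic_matches);
 *   if ( ctrl == NULL )
 *       panic("no interrupt controller found");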
*/ struct dt_device_node * __init dt_find_interrupt_controller(const struct dt_device_match *matches); #define dt_prop_cmp(s1, s2) strcmp((s1), (s2)) #define dt_node_cmp(s1, s2) strcasecmp((s1), (s2)) #define dt_compat_cmp(s1, s2) strcasecmp((s1), (s2)) /* Default #address and #size cells */ #define DT_ROOT_NODE_ADDR_CELLS_DEFAULT 2 #define DT_ROOT_NODE_SIZE_CELLS_DEFAULT 1 #define dt_for_each_property_node(dn, pp) \ for ( pp = dn->properties; pp != NULL; pp = pp->next ) #define dt_for_each_device_node(dt, dn) \ for ( dn = dt; dn != NULL; dn = dn->allnext ) #define dt_for_each_child_node(dt, dn) \ for ( dn = dt->child; dn != NULL; dn = dn->sibling ) /* Helper to read a big number; size is in cells (not bytes) */ static inline u64 dt_read_number(const __be32 *cell, int size) { u64 r = 0; while ( size-- ) r = (r << 32) | be32_to_cpu(*(cell++)); return r; } /* Helper to convert a number of cells to bytes */ static inline int dt_cells_to_size(int size) { return (size * sizeof (u32)); } /* Helper to convert a number of bytes to cells, rounds down */ static inline int dt_size_to_cells(int bytes) { return (bytes / sizeof(u32)); } static inline u64 dt_next_cell(int s, const __be32 **cellp) { const __be32 *p = *cellp; *cellp = p + s; return dt_read_number(p, s); } static inline const char *dt_node_full_name(const struct dt_device_node *np) { return (np && np->full_name) ? np->full_name : ""; } static inline const char *dt_node_name(const struct dt_device_node *np) { return (np && np->name) ? np->name : ""; } static inline bool_t dt_node_name_is_equal(const struct dt_device_node *np, const char *name) { return !dt_node_cmp(np->name, name); } static inline bool_t dt_node_path_is_equal(const struct dt_device_node *np, const char *path) { return !dt_node_cmp(np->full_name, path); } static inline bool_t dt_device_type_is_equal(const struct dt_device_node *device, const char *type) { return !dt_node_cmp(device->type, type); } static inline void dt_device_set_used_by(struct dt_device_node *device, domid_t used_by) { /* TODO: children must inherit to the used_by thing */ device->used_by = used_by; } static inline domid_t dt_device_used_by(const struct dt_device_node *device) { return device->used_by; } static inline bool_t dt_property_name_is_equal(const struct dt_property *pp, const char *name) { return !dt_prop_cmp(pp->name, name); } /** * dt_find_compatible_node - Find a node based on type and one of the * tokens in its "compatible" property * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. * @type: The type string to match "device_type" or NULL to ignore * @compatible: The string to match to one of the tokens in the device * "compatible" list. * * Returns a node pointer. */ struct dt_device_node *dt_find_compatible_node(struct dt_device_node *from, const char *type, const char *compatible); /** * Find a property with a given name for a given node * and return the value. */ const void *dt_get_property(const struct dt_device_node *np, const char *name, u32 *lenp); /** * dt_property_read_u32 - Helper to read a u32 property. * @np: node to get the value * @name: name of the property * @out_value: pointer to return value * * Return true if get the desired value. */ bool_t dt_property_read_u32(const struct dt_device_node *np, const char *name, u32 *out_value); /** * dt_property_read_u64 - Helper to read a u64 property. 
* @np: node to get the value * @name: name of the property * @out_value: pointer to return value * * Return true if get the desired value. */ bool_t dt_property_read_u64(const struct dt_device_node *np, const char *name, u64 *out_value); /** * dt_property_read_string - Find and read a string from a property * @np: Device node from which the property value is to be read * @propname: Name of the property to be searched * @out_string: Pointer to null terminated return string, modified only * if return value if 0. * * Search for a property in a device tree node and retrieve a null * terminated string value (pointer to data, not a copy). Returns 0 on * success, -EINVAL if the property does not exist, -ENODATA if property * doest not have value, and -EILSEQ if the string is not * null-terminated with the length of the property data. * * The out_string pointer is modified only if a valid string can be decoded. */ int dt_property_read_string(const struct dt_device_node *np, const char *propname, const char **out_string); /** * Checks if the given "compat" string matches one of the strings in * the device's "compatible" property */ bool_t dt_device_is_compatible(const struct dt_device_node *device, const char *compat); /** * dt_machine_is_compatible - Test root of device tree for a given compatible value * @compat: compatible string to look for in root node's compatible property. * * Returns true if the root node has the given value in its * compatible property. */ bool_t dt_machine_is_compatible(const char *compat); /** * dt_find_node_by_name - Find a node by its "name" property * @from: The node to start searching from or NULL, the node * you pass will not be searched, only the next one * will; typically, you pass what the previous call * returned. of_node_put() will be called on it * @name: The name string to match against * * Returns a node pointer with refcount incremented, use * of_node_put() on it when done. */ struct dt_device_node *dt_find_node_by_name(struct dt_device_node *node, const char *name); /** * dt_find_node_by_type - Find a node by its "type" property */ struct dt_device_node *dt_find_node_by_type(struct dt_device_node *from, const char *type); /** * df_find_node_by_alias - Find a node matching an alias * @alias: The alias to match * * Returns a node pointer. */ struct dt_device_node *dt_find_node_by_alias(const char *alias); /** * dt_find_node_by_path - Find a node matching a full DT path * @path: The full path to match * * Returns a node pointer. */ struct dt_device_node *dt_find_node_by_path(const char *path); /** * dt_get_parent - Get a node's parent if any * @node: Node to get parent * * Returns a node pointer. */ const struct dt_device_node *dt_get_parent(const struct dt_device_node *node); /** * dt_device_get_address - Resolve an address for a device * @device: the device whose address is to be resolved * @index: index of the address to resolve * @addr: address filled by this function * @size: size filled by this function * * This function resolves an address, walking the tree, for a give * device-tree node. It returns 0 on success. */ int dt_device_get_address(const struct dt_device_node *dev, int index, u64 *addr, u64 *size); /** * dt_number_of_irq - Get the number of IRQ for a device * @device: the device whose number of interrupt is to be retrieved * * Return the number of irq for this device or 0 if there is no * interrupt or an error occurred. 
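 *
 * Example loop (an illustrative sketch; setup_device_irq() is a
 * hypothetical consumer and error handling is trimmed):
 *   unsigned int i, nr = dt_number_of_irq(dev);
 *   struct dt_irq irq;
 *   for ( i = 0; i < nr; i++ )
 *       if ( dt_device_get_irq(dev, i, &irq) == 0 )
 *           setup_device_irq(dev, &irq);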
*/ unsigned int dt_number_of_irq(const struct dt_device_node *device); /** * dt_number_of_address - Get the number of addresses for a device * @device: the device whose number of address is to be retrieved * * Return the number of address for this device or 0 if there is no * address or an error occurred. */ unsigned int dt_number_of_address(const struct dt_device_node *device); /** * dt_device_get_irq - Resolve an interrupt for a device * @device: the device whose interrupt is to be resolved * @index: index of the interrupt to resolve * @out_irq: structure dt_irq filled by this function * * This function resolves an interrupt, walking the tree, for a given * device-tree node. It's the high level pendant to dt_device_get_raw_irq(). */ int dt_device_get_irq(const struct dt_device_node *device, int index, struct dt_irq *irq); /** * dt_device_get_raw_irq - Resolve an interrupt for a device without translation * @device: the device whose interrupt is to be resolved * @index: index of the interrupt to resolve * @out_irq: structure dt_raw_irq filled by this function * * This function resolves an interrupt for a device, no translation is * made. dt_irq_translate can be called after. */ int dt_device_get_raw_irq(const struct dt_device_node *device, int index, struct dt_raw_irq *irq); /** * dt_irq_translate - Translate an irq * @raw: IRQ to translate (raw format) * @out_irq: structure dt_irq filled by this function */ int dt_irq_translate(const struct dt_raw_irq *raw, struct dt_irq *out_irq); /** * dt_n_size_cells - Helper to retrieve the number of cell for the size * @np: node to get the value * * This function retrieves for a give device-tree node the number of * cell for the size field. */ int dt_n_size_cells(const struct dt_device_node *np); /** * dt_n_addr_cells - Helper to retrieve the number of cell for the address * @np: node to get the value * * This function retrieves for a give device-tree node the number of * cell for the address field. */ int dt_n_addr_cells(const struct dt_device_node *np); /** * dt_device_is_available - Check if a device is available for use * * @device: Node to check for availability * * Returns true if the status property is absent or set to "okay" or "ok", * false otherwise. */ bool_t dt_device_is_available(const struct dt_device_node *device); /** * dt_match_node - Tell if a device_node has a matching of dt_device_match * @matches: array of dt_device_match structures to search in * @node: the dt_device_node structure to match against * * Returns true if the device node match one of dt_device_match. */ bool_t dt_match_node(const struct dt_device_match *matches, const struct dt_device_node *node); /** * dt_find_matching_node - Find a node based on an dt_device_match match table * @from: The node to start searching from or NULL, the node you pass * will not be searched, only the next one will; typically, you pass * what the returned call returned * @matches: array of dt_device_match structures to search in * * Returns a node pointer. */ struct dt_device_node * dt_find_matching_node(struct dt_device_node *from, const struct dt_device_match *matches); /** * dt_set_cell - Write a value into a series of cells * * @cellp: Pointer to cells * @size: number of cells to write the value * @value: number to write * * Write a value into a series of cells and update cellp to point to the * cell just after. 
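 *
 * Illustrative use when encoding a "reg"-style entry (a sketch; `cells'
 * is an assumed caller-provided __be32 buffer and `np', `addr', `size'
 * are assumed to be in scope):
 *   __be32 *p = cells;
 *   dt_set_cell(&p, dt_n_addr_cells(np), addr);
 *   dt_set_cell(&p, dt_n_size_cells(np), size);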
*/ void dt_set_cell(__be32 **cellp, int size, u64 val); /** * dt_set_range - Write range into a series of cells * * @cellp: Pointer to cells * @np: Node which contains the encoding for the address and the size * @address: Start of range * @size: Size of the range * * Write a range into a series of cells and update cellp to point to the * cell just after. */ void dt_set_range(__be32 **cellp, const struct dt_device_node *np, u64 address, u64 size); /** * dt_get_range - Read a range (address/size) from a series of cells * * @cellp: Pointer to cells * @np Node which contains the encoding for the addresss and the size * @address: Address filled by this function * @size: Size filled by this function * * WARNING: This function should not be used to decode an address * This function reads a range (address/size) from a series of cells and * update cellp to point to the cell just after. */ void dt_get_range(const __be32 **cellp, const struct dt_device_node *np, u64 *address, u64 *size); #endif /* __XEN_DEVICE_TREE_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/xen/8250-uart.h0000664000175000017500000001355612307313555015204 0ustar smbsmb/* * xen/include/xen/8250-uart.h * * This header is extracted from driver/char/ns16550.c * * Common constant definition between early printk and the UART driver * for the 16550-series UART * * Copyright (c) 2003-2005, K A Fraser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #ifndef __XEN_8250_UART_H__ #define __XEN_8250_UART_H__ /* Register offsets */ #define UART_RBR 0x00 /* receive buffer */ #define UART_THR 0x00 /* transmit holding */ #define UART_IER 0x01 /* interrupt enable */ #define UART_IIR 0x02 /* interrupt identity */ #define UART_FCR 0x02 /* FIFO control */ #define UART_LCR 0x03 /* line control */ #define UART_MCR 0x04 /* Modem control */ #define UART_LSR 0x05 /* line status */ #define UART_MSR 0x06 /* Modem status */ #define UART_USR 0x1f /* Status register (DW) */ #define UART_DLL 0x00 /* divisor latch (ls) (DLAB=1) */ #define UART_DLM 0x01 /* divisor latch (ms) (DLAB=1) */ /* Interrupt Enable Register */ #define UART_IER_ERDAI 0x01 /* rx data recv'd */ #define UART_IER_ETHREI 0x02 /* tx reg. empty */ #define UART_IER_ELSI 0x04 /* rx line status */ #define UART_IER_EMSI 0x08 /* MODEM status */ /* Interrupt Identificatiegister */ #define UART_IIR_NOINT 0x01 /* no interrupt pending */ #define UART_IIR_IMA 0x06 /* interrupt identity: */ #define UART_IIR_LSI 0x06 /* - rx line status */ #define UART_IIR_RDA 0x04 /* - rx data recv'd */ #define UART_IIR_THR 0x02 /* - tx reg. 
empty */ #define UART_IIR_MSI 0x00 /* - MODEM status */ #define UART_IIR_BSY 0x07 /* - busy detect (DW) */ /* FIFO Control Register */ #define UART_FCR_ENABLE 0x01 /* enable FIFO */ #define UART_FCR_CLRX 0x02 /* clear Rx FIFO */ #define UART_FCR_CLTX 0x04 /* clear Tx FIFO */ #define UART_FCR_DMA 0x10 /* enter DMA mode */ #define UART_FCR_TRG1 0x00 /* Rx FIFO trig lev 1 */ #define UART_FCR_TRG4 0x40 /* Rx FIFO trig lev 4 */ #define UART_FCR_TRG8 0x80 /* Rx FIFO trig lev 8 */ #define UART_FCR_TRG14 0xc0 /* Rx FIFO trig lev 14 */ /* * Note: The FIFO trigger levels are chip specific: * RX:76 = 00 01 10 11 TX:54 = 00 01 10 11 * PC16550D: 1 4 8 14 xx xx xx xx * TI16C550A: 1 4 8 14 xx xx xx xx * TI16C550C: 1 4 8 14 xx xx xx xx * ST16C550: 1 4 8 14 xx xx xx xx * ST16C650: 8 16 24 28 16 8 24 30 PORT_16650V2 * NS16C552: 1 4 8 14 xx xx xx xx * ST16C654: 8 16 56 60 8 16 32 56 PORT_16654 * TI16C750: 1 16 32 56 xx xx xx xx PORT_16750 * TI16C752: 8 16 56 60 8 16 32 56 * Tegra: 1 4 8 14 16 8 4 1 PORT_TEGRA */ #define UART_FCR_R_TRIG_00 0x00 #define UART_FCR_R_TRIG_01 0x40 #define UART_FCR_R_TRIG_10 0x80 #define UART_FCR_R_TRIG_11 0xc0 #define UART_FCR_T_TRIG_00 0x00 #define UART_FCR_T_TRIG_01 0x10 #define UART_FCR_T_TRIG_10 0x20 #define UART_FCR_T_TRIG_11 0x30 /* Line Control Register */ #define UART_LCR_DLAB 0x80 /* Divisor Latch Access */ /* * Access to some registers depends on register access / configuration * mode. */ #define UART_LCR_CONF_MODE_A UART_LCR_DLAB /* Configuration mode A */ #define UART_LCR_CONF_MODE_B 0xBF /* Configuration mode B */ /* Modem Control Register */ #define UART_MCR_DTR 0x01 /* Data Terminal Ready */ #define UART_MCR_RTS 0x02 /* Request to Send */ #define UART_MCR_OUT2 0x08 /* OUT2: interrupt mask */ #define UART_MCR_LOOP 0x10 /* Enable loopback test mode */ #define UART_MCR_TCRTLR 0x40 /* Access TCR/TLR (TI16C752, EFR[4]=1) */ /* Line Status Register */ #define UART_LSR_DR 0x01 /* Data ready */ #define UART_LSR_OE 0x02 /* Overrun */ #define UART_LSR_PE 0x04 /* Parity error */ #define UART_LSR_FE 0x08 /* Framing error */ #define UART_LSR_BI 0x10 /* Break */ #define UART_LSR_THRE 0x20 /* Xmit hold reg empty */ #define UART_LSR_TEMT 0x40 /* Xmitter empty */ #define UART_LSR_ERR 0x80 /* Error */ /* These parity settings can be ORed directly into the LCR. */ #define UART_PARITY_NONE (0<<3) #define UART_PARITY_ODD (1<<3) #define UART_PARITY_EVEN (3<<3) #define UART_PARITY_MARK (5<<3) #define UART_PARITY_SPACE (7<<3) /* Frequency of external clock source. This definition assumes PC platform. 
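 *
 * With the standard 16550 baud-rate generator, the divisor programmed
 * into UART_DLL/UART_DLM (with UART_LCR_DLAB set) is:
 *
 *   divisor = UART_CLOCK_HZ / (baud * 16)
 *
 * e.g. 1843200 / (115200 * 16) = 1 for a 115200-baud console, and
 *      1843200 / (9600 * 16) = 12 for 9600 baud.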
*/ #define UART_CLOCK_HZ 1843200 /* Resume retry settings */ #define RESUME_DELAY MILLISECS(10) #define RESUME_RETRIES 100 /* Enhanced feature register */ #define UART_OMAP_EFR 0x02 #define UART_OMAP_EFR_ECB 0x10 /* Enhanced control bit */ /* Mode definition register 1 */ #define UART_OMAP_MDR1 0x08 /* * These are the definitions for the MDR1 register */ #define UART_OMAP_MDR1_16X_MODE 0x00 /* UART 16x mode */ #define UART_OMAP_MDR1_DISABLE 0x07 /* Disable (default state) */ /* Supplementary control register */ #define UART_OMAP_SCR 0x10 /* SCR register bitmasks */ #define OMAP_UART_SCR_RX_TRIG_GRANU1_MASK (1 << 7) #endif /* __XEN_8250_UART_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/xen/hvm/0000775000175000017500000000000012307313555014244 5ustar smbsmbxen-4.4.0/xen/include/xen/hvm/save.h0000664000175000017500000001421512307313555015356 0ustar smbsmb/* * save.h: HVM support routines for save/restore * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef __XEN_HVM_SAVE_H__ #define __XEN_HVM_SAVE_H__ #include #include #include #include /* Marshalling and unmarshalling uses a buffer with size and cursor. */ typedef struct hvm_domain_context { uint32_t cur; uint32_t size; uint8_t *data; } hvm_domain_context_t; /* Marshalling an entry: check space and fill in the header */ int _hvm_init_entry(struct hvm_domain_context *h, uint16_t tc, uint16_t inst, uint32_t len); /* Marshalling: copy the contents in a type-safe way */ void _hvm_write_entry(struct hvm_domain_context *h, void *src, uint32_t src_len); /* Marshalling: init and copy; evaluates to zero on success */ #define hvm_save_entry(_x, _inst, _h, _src) ({ \ int r; \ r = _hvm_init_entry((_h), HVM_SAVE_CODE(_x), \ (_inst), HVM_SAVE_LENGTH(_x)); \ if ( r == 0 ) \ _hvm_write_entry((_h), (_src), HVM_SAVE_LENGTH(_x)); \ r; }) /* Unmarshalling: test an entry's size and typecode and record the instance */ int _hvm_check_entry(struct hvm_domain_context *h, uint16_t type, uint32_t len, bool_t strict_length); /* Unmarshalling: copy the contents in a type-safe way */ void _hvm_read_entry(struct hvm_domain_context *h, void *dest, uint32_t dest_len); /* * Unmarshalling: check, then copy. Evaluates to zero on success. This load * function requires the save entry to be the same size as the dest structure. 
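 *
 * Typical pairing of the save and load sides (an illustrative sketch;
 * the CPU record type and the `v', `h' and `ctxt' variables are
 * placeholders assumed to be in scope):
 *
 *   struct hvm_hw_cpu ctxt;
 *   ...fill in ctxt...
 *   if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 )
 *       return 1;
 *
 * and on the restore path:
 *
 *   if ( hvm_load_entry(CPU, h, &ctxt) != 0 )
 *       return -EINVAL;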
*/ #define _hvm_load_entry(_x, _h, _dst, _strict) ({ \ int r; \ if ( (r = _hvm_check_entry((_h), HVM_SAVE_CODE(_x), \ HVM_SAVE_LENGTH(_x), (_strict))) == 0 ) \ _hvm_read_entry((_h), (_dst), HVM_SAVE_LENGTH(_x)); \ else if (HVM_SAVE_HAS_COMPAT(_x) \ && (r = _hvm_check_entry((_h), HVM_SAVE_CODE(_x), \ HVM_SAVE_LENGTH_COMPAT(_x), (_strict))) == 0 ) { \ _hvm_read_entry((_h), (_dst), HVM_SAVE_LENGTH_COMPAT(_x)); \ r=HVM_SAVE_FIX_COMPAT(_x, (_dst)); \ } \ r; }) #define hvm_load_entry(_x, _h, _dst) \ _hvm_load_entry(_x, _h, _dst, 1) #define hvm_load_entry_zeroextend(_x, _h, _dst) \ _hvm_load_entry(_x, _h, _dst, 0) /* Unmarshalling: what is the instance ID of the next entry? */ static inline uint16_t hvm_load_instance(struct hvm_domain_context *h) { struct hvm_save_descriptor *d = (struct hvm_save_descriptor *)&h->data[h->cur]; return d->instance; } /* Handler types for different types of save-file entry. * The save handler may save multiple instances of a type into the buffer; * the load handler will be called once for each instance found when * restoring. Both return non-zero on error. */ typedef int (*hvm_save_handler) (struct domain *d, hvm_domain_context_t *h); typedef int (*hvm_load_handler) (struct domain *d, hvm_domain_context_t *h); /* Init-time function to declare a pair of handlers for a type, * and the maximum buffer space needed to save this type of state */ void hvm_register_savevm(uint16_t typecode, const char *name, hvm_save_handler save_state, hvm_load_handler load_state, size_t size, int kind); /* The space needed for saving can be per-domain or per-vcpu: */ #define HVMSR_PER_DOM 0 #define HVMSR_PER_VCPU 1 /* Syntactic sugar around that function: specify the max number of * saves, and this calculates the size of buffer needed */ #define HVM_REGISTER_SAVE_RESTORE(_x, _save, _load, _num, _k) \ static int __init __hvm_register_##_x##_save_and_restore(void) \ { \ hvm_register_savevm(HVM_SAVE_CODE(_x), \ #_x, \ &_save, \ &_load, \ (_num) * (HVM_SAVE_LENGTH(_x) \ + sizeof (struct hvm_save_descriptor)), \ _k); \ return 0; \ } \ __initcall(__hvm_register_##_x##_save_and_restore); /* Entry points for saving and restoring HVM domain state */ size_t hvm_save_size(struct domain *d); int hvm_save(struct domain *d, hvm_domain_context_t *h); int hvm_save_one(struct domain *d, uint16_t typecode, uint16_t instance, XEN_GUEST_HANDLE_64(uint8) handle); int hvm_load(struct domain *d, hvm_domain_context_t *h); /* Arch-specific definitions. */ struct hvm_save_header; void arch_hvm_save(struct domain *d, struct hvm_save_header *hdr); int arch_hvm_load(struct domain *d, struct hvm_save_header *hdr); #endif /* __XEN_HVM_SAVE_H__ */ xen-4.4.0/xen/include/xen/hvm/irq.h0000664000175000017500000001031112307313555015204 0ustar smbsmb/****************************************************************************** * irq.h * * Interrupt distribution and delivery logic. * * Copyright (c) 2006, K A Fraser, XenSource Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
* * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef __XEN_HVM_IRQ_H__ #define __XEN_HVM_IRQ_H__ #include #include #include #include #include struct dev_intx_gsi_link { struct list_head list; uint8_t device; uint8_t intx; uint8_t gsi; uint8_t link; }; #define _HVM_IRQ_DPCI_MACH_PCI_SHIFT 0 #define _HVM_IRQ_DPCI_MACH_MSI_SHIFT 1 #define _HVM_IRQ_DPCI_MAPPED_SHIFT 2 #define _HVM_IRQ_DPCI_EOI_LATCH_SHIFT 3 #define _HVM_IRQ_DPCI_GUEST_PCI_SHIFT 4 #define _HVM_IRQ_DPCI_GUEST_MSI_SHIFT 5 #define _HVM_IRQ_DPCI_TRANSLATE_SHIFT 15 #define HVM_IRQ_DPCI_MACH_PCI (1 << _HVM_IRQ_DPCI_MACH_PCI_SHIFT) #define HVM_IRQ_DPCI_MACH_MSI (1 << _HVM_IRQ_DPCI_MACH_MSI_SHIFT) #define HVM_IRQ_DPCI_MAPPED (1 << _HVM_IRQ_DPCI_MAPPED_SHIFT) #define HVM_IRQ_DPCI_EOI_LATCH (1 << _HVM_IRQ_DPCI_EOI_LATCH_SHIFT) #define HVM_IRQ_DPCI_GUEST_PCI (1 << _HVM_IRQ_DPCI_GUEST_PCI_SHIFT) #define HVM_IRQ_DPCI_GUEST_MSI (1 << _HVM_IRQ_DPCI_GUEST_MSI_SHIFT) #define HVM_IRQ_DPCI_TRANSLATE (1 << _HVM_IRQ_DPCI_TRANSLATE_SHIFT) #define VMSI_DEST_ID_MASK 0xff #define VMSI_RH_MASK 0x100 #define VMSI_DM_MASK 0x200 #define VMSI_DELIV_MASK 0x7000 #define VMSI_TRIG_MODE 0x8000 #define GFLAGS_SHIFT_RH 8 #define GFLAGS_SHIFT_DELIV_MODE 12 #define GFLAGS_SHIFT_TRG_MODE 15 struct hvm_gmsi_info { uint32_t gvec; uint32_t gflags; int dest_vcpu_id; /* -1 :multi-dest, non-negative: dest_vcpu_id */ }; struct hvm_girq_dpci_mapping { struct list_head list; uint8_t device; uint8_t intx; uint8_t machine_gsi; }; #define NR_ISAIRQS 16 #define NR_LINK 4 #if defined(CONFIG_X86) # define NR_HVM_IRQS VIOAPIC_NUM_PINS #endif /* Protected by domain's event_lock */ struct hvm_irq_dpci { /* Guest IRQ to guest device/intx mapping. */ struct list_head girq[NR_HVM_IRQS]; /* Record of mapped ISA IRQs */ DECLARE_BITMAP(isairq_map, NR_ISAIRQS); /* Record of mapped Links */ uint8_t link_cnt[NR_LINK]; struct tasklet dirq_tasklet; }; /* Machine IRQ to guest device/intx mapping. */ struct hvm_pirq_dpci { uint32_t flags; bool_t masked; uint16_t pending; struct list_head digl_list; struct domain *dom; struct hvm_gmsi_info gmsi; struct timer timer; }; void pt_pirq_init(struct domain *, struct hvm_pirq_dpci *); bool_t pt_pirq_cleanup_check(struct hvm_pirq_dpci *); int pt_pirq_iterate(struct domain *d, int (*cb)(struct domain *, struct hvm_pirq_dpci *, void *arg), void *arg); /* Modify state of a PCI INTx wire. */ void hvm_pci_intx_assert( struct domain *d, unsigned int device, unsigned int intx); void hvm_pci_intx_deassert( struct domain *d, unsigned int device, unsigned int intx); /* Modify state of an ISA device's IRQ wire. */ void hvm_isa_irq_assert( struct domain *d, unsigned int isa_irq); void hvm_isa_irq_deassert( struct domain *d, unsigned int isa_irq); void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq); void hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data); void hvm_maybe_deassert_evtchn_irq(void); void hvm_assert_evtchn_irq(struct vcpu *v); void hvm_set_callback_via(struct domain *d, uint64_t via); #endif /* __XEN_HVM_IRQ_H__ */ xen-4.4.0/xen/include/xen/hvm/iommu.h0000664000175000017500000000325012307313555015543 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. 
* * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Copyright (C) Allen Kay */ #ifndef __XEN_HVM_IOMMU_H__ #define __XEN_HVM_IOMMU_H__ #include struct g2m_ioport { struct list_head list; unsigned int gport; unsigned int mport; unsigned int np; }; struct mapped_rmrr { struct list_head list; u64 base; u64 end; }; struct hvm_iommu { u64 pgd_maddr; /* io page directory machine address */ spinlock_t mapping_lock; /* io page table lock */ int agaw; /* adjusted guest address width, 0 is level 2 30-bit */ struct list_head g2m_ioport_list; /* guest to machine ioport mapping */ u64 iommu_bitmap; /* bitmap of iommu(s) that the domain uses */ struct list_head mapped_rmrrs; /* amd iommu support */ int domain_id; int paging_mode; struct page_info *root_table; struct guest_iommu *g_iommu; /* iommu_ops */ const struct iommu_ops *platform_ops; }; #endif /* __XEN_HVM_IOMMU_H__ */ xen-4.4.0/xen/include/xen/kexec.h0000664000175000017500000000566512307313555014736 0ustar smbsmb#ifndef __XEN_KEXEC_H__ #define __XEN_KEXEC_H__ #ifdef CONFIG_KEXEC #include #include #include #include typedef struct xen_kexec_reserve { unsigned long size; unsigned long start; } xen_kexec_reserve_t; extern xen_kexec_reserve_t kexec_crash_area; extern bool_t kexecing; void set_kexec_crash_area_size(u64 system_ram); /* We have space for 4 images to support atomic update * of images. This is important for CRASH images since * a panic can happen at any time... */ #define KEXEC_IMAGE_DEFAULT_BASE 0 #define KEXEC_IMAGE_CRASH_BASE 2 #define KEXEC_IMAGE_NR 4 enum low_crashinfo { LOW_CRASHINFO_INVALID = 0, LOW_CRASHINFO_NONE = 1, LOW_CRASHINFO_MIN = 2, LOW_CRASHINFO_ALL = 3 }; /* Low crashinfo mode. Start as INVALID so serveral codepaths can set up * defaults without needing to know the state of the others. */ extern enum low_crashinfo low_crashinfo_mode; extern paddr_t crashinfo_maxaddr_bits; void kexec_early_calculations(void); int machine_kexec_add_page(struct kexec_image *image, unsigned long vaddr, unsigned long maddr); int machine_kexec_load(struct kexec_image *image); void machine_kexec_unload(struct kexec_image *image); void machine_kexec_reserved(xen_kexec_reserve_t *reservation); void machine_reboot_kexec(struct kexec_image *image); void machine_kexec(struct kexec_image *image); void kexec_crash(void); void kexec_crash_save_cpu(void); crash_xen_info_t *kexec_crash_save_info(void); void machine_crash_shutdown(void); int machine_kexec_get(xen_kexec_range_t *range); int machine_kexec_get_xen(xen_kexec_range_t *range); /* vmcoreinfo stuff */ #define VMCOREINFO_BYTES (4096) #define VMCOREINFO_NOTE_NAME "VMCOREINFO_XEN" void arch_crash_save_vmcoreinfo(void); void vmcoreinfo_append_str(const char *fmt, ...) 
__attribute__ ((format (printf, 1, 2))); #define VMCOREINFO_PAGESIZE(value) \ vmcoreinfo_append_str("PAGESIZE=%ld\n", value) #define VMCOREINFO_SYMBOL(name) \ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) #define VMCOREINFO_SYMBOL_ALIAS(alias, name) \ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #alias, (unsigned long)&name) #define VMCOREINFO_STRUCT_SIZE(name) \ vmcoreinfo_append_str("SIZE(%s)=%zu\n", #name, sizeof(struct name)) #define VMCOREINFO_OFFSET(name, field) \ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ (unsigned long)offsetof(struct name, field)) #define VMCOREINFO_OFFSET_SUB(name, sub, field) \ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ (unsigned long)offsetof(struct name, sub.field)) #else /* !CONFIG_KEXEC */ #define crashinfo_maxaddr_bits 0 #endif #endif /* __XEN_KEXEC_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/xen/time.h0000664000175000017500000000437612307313555014573 0ustar smbsmb/****************************************************************************** * time.h * * Copyright (c) 2002-2003 Rolf Neugebauer * Copyright (c) 2002-2005 K A Fraser */ #ifndef __XEN_TIME_H__ #define __XEN_TIME_H__ #include #include extern int init_xen_time(void); extern void cstate_restore_tsc(void); extern unsigned long cpu_khz; extern unsigned long pit0_ticks; struct domain; struct vcpu; /* * System Time * 64 bit value containing the nanoseconds elapsed since boot time. * This value is adjusted by frequency drift. * NOW() returns the current time. * The other macros are for convenience to approximate short intervals * of real time into system time */ typedef s64 s_time_t; #define PRI_stime PRId64 s_time_t get_s_time(void); unsigned long get_localtime(struct domain *d); uint64_t get_localtime_us(struct domain *d); struct tm { int tm_sec; /* seconds */ int tm_min; /* minutes */ int tm_hour; /* hours */ int tm_mday; /* day of the month */ int tm_mon; /* month */ int tm_year; /* year */ int tm_wday; /* day of the week */ int tm_yday; /* day in the year */ int tm_isdst; /* daylight saving time */ }; struct tm gmtime(unsigned long t); #define SYSTEM_TIME_HZ 1000000000ULL #define NOW() ((s_time_t)get_s_time()) #define SECONDS(_s) ((s_time_t)((_s) * 1000000000ULL)) #define MILLISECS(_ms) ((s_time_t)((_ms) * 1000000ULL)) #define MICROSECS(_us) ((s_time_t)((_us) * 1000ULL)) #define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1)) /* Chosen so (NOW() + delta) wont overflow without an uptime of 200 years */ #define STIME_DELTA_MAX ((s_time_t)((uint64_t)~0ull>>2)) extern void update_vcpu_system_time(struct vcpu *v); extern void update_domain_wallclock_time(struct domain *d); extern void do_settime( unsigned long secs, unsigned long nsecs, u64 system_time_base); extern void send_timer_event(struct vcpu *v); void domain_set_time_offset(struct domain *d, int32_t time_offset_seconds); #include #endif /* __XEN_TIME_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/xen/sched-if.h0000664000175000017500000001700712307313555015312 0ustar smbsmb/****************************************************************************** * Additional declarations for the generic scheduler interface. This should * only be included by files that implement conforming schedulers. 
* * Portions by Mark Williamson are (C) 2004 Intel Research Cambridge */ #ifndef __XEN_SCHED_IF_H__ #define __XEN_SCHED_IF_H__ #include /* A global pointer to the initial cpupool (POOL0). */ extern struct cpupool *cpupool0; /* cpus currently in no cpupool */ extern cpumask_t cpupool_free_cpus; /* Scheduler generic parameters * */ #define SCHED_DEFAULT_RATELIMIT_US 1000 extern int sched_ratelimit_us; /* * In order to allow a scheduler to remap the lock->cpu mapping, * we have a per-cpu pointer, along with a pre-allocated set of * locks. The generic schedule init code will point each schedule lock * pointer to the schedule lock; if the scheduler wants to remap them, * it can simply modify the schedule locks. * * For cache betterness, keep the actual lock in the same cache area * as the rest of the struct. Just have the scheduler point to the * one it wants (This may be the one right in front of it).*/ struct schedule_data { spinlock_t *schedule_lock, _lock; struct vcpu *curr; /* current task */ void *sched_priv; struct timer s_timer; /* scheduling timer */ atomic_t urgent_count; /* how many urgent vcpus */ }; #define curr_on_cpu(c) (per_cpu(schedule_data, c).curr) DECLARE_PER_CPU(struct schedule_data, schedule_data); DECLARE_PER_CPU(struct scheduler *, scheduler); DECLARE_PER_CPU(struct cpupool *, cpupool); #define sched_lock(kind, param, cpu, irq, arg...) \ static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ { \ for ( ; ; ) \ { \ spinlock_t *lock = per_cpu(schedule_data, cpu).schedule_lock; \ /* \ * v->processor may change when grabbing the lock; but \ * per_cpu(v->processor) may also change, if changing cpu pool \ * also changes the scheduler lock. Retry until they match. \ * \ * It may also be the case that v->processor may change but the \ * lock may be the same; this will succeed in that case. \ */ \ spin_lock##irq(lock, ## arg); \ if ( likely(lock == per_cpu(schedule_data, cpu).schedule_lock) ) \ return lock; \ spin_unlock##irq(lock, ## arg); \ } \ } #define sched_unlock(kind, param, cpu, irq, arg...) 
\ static inline void kind##_schedule_unlock##irq(spinlock_t *lock \ EXTRA_TYPE(arg), param) \ { \ ASSERT(lock == per_cpu(schedule_data, cpu).schedule_lock); \ spin_unlock##irq(lock, ## arg); \ } #define EXTRA_TYPE(arg) sched_lock(pcpu, unsigned int cpu, cpu, ) sched_lock(vcpu, const struct vcpu *v, v->processor, ) sched_lock(pcpu, unsigned int cpu, cpu, _irq) sched_lock(vcpu, const struct vcpu *v, v->processor, _irq) sched_unlock(pcpu, unsigned int cpu, cpu, ) sched_unlock(vcpu, const struct vcpu *v, v->processor, ) sched_unlock(pcpu, unsigned int cpu, cpu, _irq) sched_unlock(vcpu, const struct vcpu *v, v->processor, _irq) #undef EXTRA_TYPE #define EXTRA_TYPE(arg) , unsigned long arg #define spin_unlock_irqsave spin_unlock_irqrestore sched_lock(pcpu, unsigned int cpu, cpu, _irqsave, *flags) sched_lock(vcpu, const struct vcpu *v, v->processor, _irqsave, *flags) #undef spin_unlock_irqsave sched_unlock(pcpu, unsigned int cpu, cpu, _irqrestore, flags) sched_unlock(vcpu, const struct vcpu *v, v->processor, _irqrestore, flags) #undef EXTRA_TYPE #undef sched_unlock #undef sched_lock static inline spinlock_t *pcpu_schedule_trylock(unsigned int cpu) { spinlock_t *lock = per_cpu(schedule_data, cpu).schedule_lock; if ( !spin_trylock(lock) ) return NULL; if ( lock == per_cpu(schedule_data, cpu).schedule_lock ) return lock; spin_unlock(lock); return NULL; } struct task_slice { struct vcpu *task; s_time_t time; bool_t migrated; }; struct scheduler { char *name; /* full name for this scheduler */ char *opt_name; /* option name for this scheduler */ unsigned int sched_id; /* ID for this scheduler */ void *sched_data; /* global data pointer */ int (*global_init) (void); int (*init) (struct scheduler *); void (*deinit) (const struct scheduler *); void (*free_vdata) (const struct scheduler *, void *); void * (*alloc_vdata) (const struct scheduler *, struct vcpu *, void *); void (*free_pdata) (const struct scheduler *, void *, int); void * (*alloc_pdata) (const struct scheduler *, int); void (*free_domdata) (const struct scheduler *, void *); void * (*alloc_domdata) (const struct scheduler *, struct domain *); int (*init_domain) (const struct scheduler *, struct domain *); void (*destroy_domain) (const struct scheduler *, struct domain *); /* Activate / deactivate vcpus in a cpu pool */ void (*insert_vcpu) (const struct scheduler *, struct vcpu *); void (*remove_vcpu) (const struct scheduler *, struct vcpu *); void (*sleep) (const struct scheduler *, struct vcpu *); void (*wake) (const struct scheduler *, struct vcpu *); void (*yield) (const struct scheduler *, struct vcpu *); void (*context_saved) (const struct scheduler *, struct vcpu *); struct task_slice (*do_schedule) (const struct scheduler *, s_time_t, bool_t tasklet_work_scheduled); int (*pick_cpu) (const struct scheduler *, struct vcpu *); void (*migrate) (const struct scheduler *, struct vcpu *, unsigned int); int (*adjust) (const struct scheduler *, struct domain *, struct xen_domctl_scheduler_op *); int (*adjust_global) (const struct scheduler *, struct xen_sysctl_scheduler_op *); void (*set_node_affinity) (const struct scheduler *, struct domain *, nodemask_t *); void (*dump_settings) (const struct scheduler *); void (*dump_cpu_state) (const struct scheduler *, int); void (*tick_suspend) (const struct scheduler *, unsigned int); void (*tick_resume) (const struct scheduler *, unsigned int); }; extern const struct scheduler sched_sedf_def; extern const struct scheduler sched_credit_def; extern const struct scheduler sched_credit2_def; extern const 
struct scheduler sched_arinc653_def; struct cpupool { int cpupool_id; cpumask_var_t cpu_valid; /* all cpus assigned to pool */ cpumask_var_t cpu_suspended; /* cpus in S3 that should be in this pool */ struct cpupool *next; unsigned int n_dom; struct scheduler *sched; atomic_t refcnt; }; #define cpupool_scheduler_cpumask(_pool) \ (((_pool) == NULL) ? &cpupool_free_cpus : (_pool)->cpu_valid) #define cpupool_online_cpumask(_pool) \ (((_pool) == NULL) ? &cpu_online_map : (_pool)->cpu_valid) #endif /* __XEN_SCHED_IF_H__ */ xen-4.4.0/xen/include/xen/event_fifo.h0000664000175000017500000000234412307313555015752 0ustar smbsmb/* * FIFO-based event channel ABI. * * Copyright (C) 2013 Citrix Systems R&D Ltd. * * This source code is licensed under the GNU General Public License, * Version 2 or later. See the file COPYING for more details. */ #ifndef __XEN_EVENT_FIFO_H__ #define __XEN_EVENT_FIFO_H__ struct evtchn_fifo_queue { uint32_t *head; /* points into control block */ uint32_t tail; uint8_t priority; spinlock_t lock; }; struct evtchn_fifo_vcpu { struct evtchn_fifo_control_block *control_block; struct evtchn_fifo_queue queue[EVTCHN_FIFO_MAX_QUEUES]; }; #define EVTCHN_FIFO_EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) #define EVTCHN_FIFO_MAX_EVENT_ARRAY_PAGES \ (EVTCHN_FIFO_NR_CHANNELS / EVTCHN_FIFO_EVENT_WORDS_PER_PAGE) struct evtchn_fifo_domain { event_word_t *event_array[EVTCHN_FIFO_MAX_EVENT_ARRAY_PAGES]; unsigned int num_evtchns; }; int evtchn_fifo_init_control(struct evtchn_init_control *init_control); int evtchn_fifo_expand_array(const struct evtchn_expand_array *expand_array); void evtchn_fifo_destroy(struct domain *domain); #endif /* __XEN_EVENT_FIFO_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/xen/pci_regs.h0000664000175000017500000007013012307313555015417 0ustar smbsmb/* * pci_regs.h * * PCI standard defines * Copyright 1994, Drew Eckhardt * Copyright 1997--1999 Martin Mares * * For more information, please consult the following manuals (look at * http://www.pcisig.com/ for how to get them): * * PCI BIOS Specification * PCI Local Bus Specification * PCI to PCI Bridge Specification * PCI System Design Guide * * For hypertransport information, please consult the following manuals * from http://www.hypertransport.org * * The Hypertransport I/O Link Specification */ #ifndef LINUX_PCI_REGS_H #define LINUX_PCI_REGS_H /* * Under PCI, each device has 256 bytes of configuration address space, * of which the first 64 bytes are standardized as follows: */ #define PCI_VENDOR_ID 0x00 /* 16 bits */ #define PCI_DEVICE_ID 0x02 /* 16 bits */ #define PCI_COMMAND 0x04 /* 16 bits */ #define PCI_COMMAND_IO 0x1 /* Enable response in I/O space */ #define PCI_COMMAND_MEMORY 0x2 /* Enable response in Memory space */ #define PCI_COMMAND_MASTER 0x4 /* Enable bus mastering */ #define PCI_COMMAND_SPECIAL 0x8 /* Enable response to special cycles */ #define PCI_COMMAND_INVALIDATE 0x10 /* Use memory write and invalidate */ #define PCI_COMMAND_VGA_PALETTE 0x20 /* Enable palette snooping */ #define PCI_COMMAND_PARITY 0x40 /* Enable parity checking */ #define PCI_COMMAND_WAIT 0x80 /* Enable address/data stepping */ #define PCI_COMMAND_SERR 0x100 /* Enable SERR */ #define PCI_COMMAND_FAST_BACK 0x200 /* Enable back-to-back writes */ #define PCI_COMMAND_INTX_DISABLE 0x400 /* INTx Emulation Disable */ #define PCI_STATUS 0x06 /* 16 bits */ #define PCI_STATUS_CAP_LIST 0x10 /* Support Capability List */ #define 
PCI_STATUS_66MHZ 0x20 /* Support 66 Mhz PCI 2.1 bus */ #define PCI_STATUS_UDF 0x40 /* Support User Definable Features [obsolete] */ #define PCI_STATUS_FAST_BACK 0x80 /* Accept fast-back to back */ #define PCI_STATUS_PARITY 0x100 /* Detected parity error */ #define PCI_STATUS_DEVSEL_MASK 0x600 /* DEVSEL timing */ #define PCI_STATUS_DEVSEL_FAST 0x000 #define PCI_STATUS_DEVSEL_MEDIUM 0x200 #define PCI_STATUS_DEVSEL_SLOW 0x400 #define PCI_STATUS_SIG_TARGET_ABORT 0x800 /* Set on target abort */ #define PCI_STATUS_REC_TARGET_ABORT 0x1000 /* Master ack of " */ #define PCI_STATUS_REC_MASTER_ABORT 0x2000 /* Set on master abort */ #define PCI_STATUS_SIG_SYSTEM_ERROR 0x4000 /* Set when we drive SERR */ #define PCI_STATUS_DETECTED_PARITY 0x8000 /* Set on parity error */ #define PCI_CLASS_REVISION 0x08 /* High 24 bits are class, low 8 revision */ #define PCI_REVISION_ID 0x08 /* Revision ID */ #define PCI_CLASS_PROG 0x09 /* Reg. Level Programming Interface */ #define PCI_CLASS_DEVICE 0x0a /* Device class */ #define PCI_CACHE_LINE_SIZE 0x0c /* 8 bits */ #define PCI_LATENCY_TIMER 0x0d /* 8 bits */ #define PCI_HEADER_TYPE 0x0e /* 8 bits */ #define PCI_HEADER_TYPE_NORMAL 0 #define PCI_HEADER_TYPE_BRIDGE 1 #define PCI_HEADER_TYPE_CARDBUS 2 #define PCI_BIST 0x0f /* 8 bits */ #define PCI_BIST_CODE_MASK 0x0f /* Return result */ #define PCI_BIST_START 0x40 /* 1 to start BIST, 2 secs or less */ #define PCI_BIST_CAPABLE 0x80 /* 1 if BIST capable */ /* * Base addresses specify locations in memory or I/O space. * Decoded size can be determined by writing a value of * 0xffffffff to the register, and reading it back. Only * 1 bits are decoded. */ #define PCI_BASE_ADDRESS_0 0x10 /* 32 bits */ #define PCI_BASE_ADDRESS_1 0x14 /* 32 bits [htype 0,1 only] */ #define PCI_BASE_ADDRESS_2 0x18 /* 32 bits [htype 0 only] */ #define PCI_BASE_ADDRESS_3 0x1c /* 32 bits */ #define PCI_BASE_ADDRESS_4 0x20 /* 32 bits */ #define PCI_BASE_ADDRESS_5 0x24 /* 32 bits */ #define PCI_BASE_ADDRESS_SPACE 0x01 /* 0 = memory, 1 = I/O */ #define PCI_BASE_ADDRESS_SPACE_IO 0x01 #define PCI_BASE_ADDRESS_SPACE_MEMORY 0x00 #define PCI_BASE_ADDRESS_MEM_TYPE_MASK 0x06 #define PCI_BASE_ADDRESS_MEM_TYPE_32 0x00 /* 32 bit address */ #define PCI_BASE_ADDRESS_MEM_TYPE_1M 0x02 /* Below 1M [obsolete] */ #define PCI_BASE_ADDRESS_MEM_TYPE_64 0x04 /* 64 bit address */ #define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08 /* prefetchable? 
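 *
 * Decoding a raw BAR value (an illustrative sketch; `bar' is an assumed
 * uint32_t read from config space):
 *   if ( bar & PCI_BASE_ADDRESS_SPACE_IO )
 *       port = bar & PCI_BASE_ADDRESS_IO_MASK;
 *   else
 *       addr = bar & PCI_BASE_ADDRESS_MEM_MASK;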
*/ #define PCI_BASE_ADDRESS_MEM_MASK (~0x0fUL) #define PCI_BASE_ADDRESS_IO_MASK (~0x03UL) /* bit 1 is reserved if address_space = 1 */ /* Header type 0 (normal devices) */ #define PCI_CARDBUS_CIS 0x28 #define PCI_SUBSYSTEM_VENDOR_ID 0x2c #define PCI_SUBSYSTEM_ID 0x2e #define PCI_ROM_ADDRESS 0x30 /* Bits 31..11 are address, 10..1 reserved */ #define PCI_ROM_ADDRESS_ENABLE 0x01 #define PCI_ROM_ADDRESS_MASK (~0x7ffUL) #define PCI_CAPABILITY_LIST 0x34 /* Offset of first capability list entry */ /* 0x35-0x3b are reserved */ #define PCI_INTERRUPT_LINE 0x3c /* 8 bits */ #define PCI_INTERRUPT_PIN 0x3d /* 8 bits */ #define PCI_MIN_GNT 0x3e /* 8 bits */ #define PCI_MAX_LAT 0x3f /* 8 bits */ /* Header type 1 (PCI-to-PCI bridges) */ #define PCI_PRIMARY_BUS 0x18 /* Primary bus number */ #define PCI_SECONDARY_BUS 0x19 /* Secondary bus number */ #define PCI_SUBORDINATE_BUS 0x1a /* Highest bus number behind the bridge */ #define PCI_SEC_LATENCY_TIMER 0x1b /* Latency timer for secondary interface */ #define PCI_IO_BASE 0x1c /* I/O range behind the bridge */ #define PCI_IO_LIMIT 0x1d #define PCI_IO_RANGE_TYPE_MASK 0x0fUL /* I/O bridging type */ #define PCI_IO_RANGE_TYPE_16 0x00 #define PCI_IO_RANGE_TYPE_32 0x01 #define PCI_IO_RANGE_MASK (~0x0fUL) #define PCI_SEC_STATUS 0x1e /* Secondary status register, only bit 14 used */ #define PCI_MEMORY_BASE 0x20 /* Memory range behind */ #define PCI_MEMORY_LIMIT 0x22 #define PCI_MEMORY_RANGE_TYPE_MASK 0x0fUL #define PCI_MEMORY_RANGE_MASK (~0x0fUL) #define PCI_PREF_MEMORY_BASE 0x24 /* Prefetchable memory range behind */ #define PCI_PREF_MEMORY_LIMIT 0x26 #define PCI_PREF_RANGE_TYPE_MASK 0x0fUL #define PCI_PREF_RANGE_TYPE_32 0x00 #define PCI_PREF_RANGE_TYPE_64 0x01 #define PCI_PREF_RANGE_MASK (~0x0fUL) #define PCI_PREF_BASE_UPPER32 0x28 /* Upper half of prefetchable memory range */ #define PCI_PREF_LIMIT_UPPER32 0x2c #define PCI_IO_BASE_UPPER16 0x30 /* Upper half of I/O addresses */ #define PCI_IO_LIMIT_UPPER16 0x32 /* 0x34 same as for htype 0 */ /* 0x35-0x3b is reserved */ #define PCI_ROM_ADDRESS1 0x38 /* Same as PCI_ROM_ADDRESS, but for htype 1 */ /* 0x3c-0x3d are same as for htype 0 */ #define PCI_BRIDGE_CONTROL 0x3e #define PCI_BRIDGE_CTL_PARITY 0x01 /* Enable parity detection on secondary interface */ #define PCI_BRIDGE_CTL_SERR 0x02 /* The same for SERR forwarding */ #define PCI_BRIDGE_CTL_ISA 0x04 /* Enable ISA mode */ #define PCI_BRIDGE_CTL_VGA 0x08 /* Forward VGA addresses */ #define PCI_BRIDGE_CTL_MASTER_ABORT 0x20 /* Report master aborts */ #define PCI_BRIDGE_CTL_BUS_RESET 0x40 /* Secondary bus reset */ #define PCI_BRIDGE_CTL_FAST_BACK 0x80 /* Fast Back2Back enabled on secondary interface */ /* Header type 2 (CardBus bridges) */ #define PCI_CB_CAPABILITY_LIST 0x14 /* 0x15 reserved */ #define PCI_CB_SEC_STATUS 0x16 /* Secondary status */ #define PCI_CB_PRIMARY_BUS 0x18 /* PCI bus number */ #define PCI_CB_CARD_BUS 0x19 /* CardBus bus number */ #define PCI_CB_SUBORDINATE_BUS 0x1a /* Subordinate bus number */ #define PCI_CB_LATENCY_TIMER 0x1b /* CardBus latency timer */ #define PCI_CB_MEMORY_BASE_0 0x1c #define PCI_CB_MEMORY_LIMIT_0 0x20 #define PCI_CB_MEMORY_BASE_1 0x24 #define PCI_CB_MEMORY_LIMIT_1 0x28 #define PCI_CB_IO_BASE_0 0x2c #define PCI_CB_IO_BASE_0_HI 0x2e #define PCI_CB_IO_LIMIT_0 0x30 #define PCI_CB_IO_LIMIT_0_HI 0x32 #define PCI_CB_IO_BASE_1 0x34 #define PCI_CB_IO_BASE_1_HI 0x36 #define PCI_CB_IO_LIMIT_1 0x38 #define PCI_CB_IO_LIMIT_1_HI 0x3a #define PCI_CB_IO_RANGE_MASK (~0x03UL) /* 0x3c-0x3d are same as for htype 0 */ #define PCI_CB_BRIDGE_CONTROL 
0x3e #define PCI_CB_BRIDGE_CTL_PARITY 0x01 /* Similar to standard bridge control register */ #define PCI_CB_BRIDGE_CTL_SERR 0x02 #define PCI_CB_BRIDGE_CTL_ISA 0x04 #define PCI_CB_BRIDGE_CTL_VGA 0x08 #define PCI_CB_BRIDGE_CTL_MASTER_ABORT 0x20 #define PCI_CB_BRIDGE_CTL_CB_RESET 0x40 /* CardBus reset */ #define PCI_CB_BRIDGE_CTL_16BIT_INT 0x80 /* Enable interrupt for 16-bit cards */ #define PCI_CB_BRIDGE_CTL_PREFETCH_MEM0 0x100 /* Prefetch enable for both memory regions */ #define PCI_CB_BRIDGE_CTL_PREFETCH_MEM1 0x200 #define PCI_CB_BRIDGE_CTL_POST_WRITES 0x400 #define PCI_CB_SUBSYSTEM_VENDOR_ID 0x40 #define PCI_CB_SUBSYSTEM_ID 0x42 #define PCI_CB_LEGACY_MODE_BASE 0x44 /* 16-bit PC Card legacy mode base address (ExCa) */ /* 0x48-0x7f reserved */ /* Capability lists */ #define PCI_CAP_LIST_ID 0 /* Capability ID */ #define PCI_CAP_ID_PM 0x01 /* Power Management */ #define PCI_CAP_ID_AGP 0x02 /* Accelerated Graphics Port */ #define PCI_CAP_ID_VPD 0x03 /* Vital Product Data */ #define PCI_CAP_ID_SLOTID 0x04 /* Slot Identification */ #define PCI_CAP_ID_MSI 0x05 /* Message Signalled Interrupts */ #define PCI_CAP_ID_CHSWP 0x06 /* CompactPCI HotSwap */ #define PCI_CAP_ID_PCIX 0x07 /* PCI-X */ #define PCI_CAP_ID_HT 0x08 /* HyperTransport */ #define PCI_CAP_ID_VNDR 0x09 /* Vendor specific */ #define PCI_CAP_ID_DBG 0x0A /* Debug port */ #define PCI_CAP_ID_CCRC 0x0B /* CompactPCI Central Resource Control */ #define PCI_CAP_ID_SHPC 0x0C /* PCI Standard Hot-Plug Controller */ #define PCI_CAP_ID_SSVID 0x0D /* Bridge subsystem vendor/device ID */ #define PCI_CAP_ID_AGP3 0x0E /* AGP Target PCI-PCI bridge */ #define PCI_CAP_ID_EXP 0x10 /* PCI Express */ #define PCI_CAP_ID_MSIX 0x11 /* MSI-X */ #define PCI_CAP_LIST_NEXT 1 /* Next capability in the list */ #define PCI_CAP_FLAGS 2 /* Capability defined flags (16 bits) */ #define PCI_CAP_SIZEOF 4 /* Power Management Registers */ #define PCI_PM_PMC 2 /* PM Capabilities Register */ #define PCI_PM_CAP_VER_MASK 0x0007 /* Version */ #define PCI_PM_CAP_PME_CLOCK 0x0008 /* PME clock required */ #define PCI_PM_CAP_RESERVED 0x0010 /* Reserved field */ #define PCI_PM_CAP_DSI 0x0020 /* Device specific initialization */ #define PCI_PM_CAP_AUX_POWER 0x01C0 /* Auxilliary power support mask */ #define PCI_PM_CAP_D1 0x0200 /* D1 power state support */ #define PCI_PM_CAP_D2 0x0400 /* D2 power state support */ #define PCI_PM_CAP_PME 0x0800 /* PME pin supported */ #define PCI_PM_CAP_PME_MASK 0xF800 /* PME Mask of all supported states */ #define PCI_PM_CAP_PME_D0 0x0800 /* PME# from D0 */ #define PCI_PM_CAP_PME_D1 0x1000 /* PME# from D1 */ #define PCI_PM_CAP_PME_D2 0x2000 /* PME# from D2 */ #define PCI_PM_CAP_PME_D3 0x4000 /* PME# from D3 (hot) */ #define PCI_PM_CAP_PME_D3cold 0x8000 /* PME# from D3 (cold) */ #define PCI_PM_CTRL 4 /* PM control and status register */ #define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */ #define PCI_PM_CTRL_NO_SOFT_RESET 0x0008 /* No reset for D3hot->D0 */ #define PCI_PM_CTRL_PME_ENABLE 0x0100 /* PME pin enable */ #define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* Data select (??) */ #define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* Data scale (??) */ #define PCI_PM_CTRL_PME_STATUS 0x8000 /* PME pin status */ #define PCI_PM_PPB_EXTENSIONS 6 /* PPB support extensions (??) */ #define PCI_PM_PPB_B2_B3 0x40 /* Stop clock when in D3hot (??) */ #define PCI_PM_BPCC_ENABLE 0x80 /* Bus power/clock control enable (??) */ #define PCI_PM_DATA_REGISTER 7 /* (??) 
*/ #define PCI_PM_SIZEOF 8 /* AGP registers */ #define PCI_AGP_VERSION 2 /* BCD version number */ #define PCI_AGP_RFU 3 /* Rest of capability flags */ #define PCI_AGP_STATUS 4 /* Status register */ #define PCI_AGP_STATUS_RQ_MASK 0xff000000 /* Maximum number of requests - 1 */ #define PCI_AGP_STATUS_SBA 0x0200 /* Sideband addressing supported */ #define PCI_AGP_STATUS_64BIT 0x0020 /* 64-bit addressing supported */ #define PCI_AGP_STATUS_FW 0x0010 /* FW transfers supported */ #define PCI_AGP_STATUS_RATE4 0x0004 /* 4x transfer rate supported */ #define PCI_AGP_STATUS_RATE2 0x0002 /* 2x transfer rate supported */ #define PCI_AGP_STATUS_RATE1 0x0001 /* 1x transfer rate supported */ #define PCI_AGP_COMMAND 8 /* Control register */ #define PCI_AGP_COMMAND_RQ_MASK 0xff000000 /* Master: Maximum number of requests */ #define PCI_AGP_COMMAND_SBA 0x0200 /* Sideband addressing enabled */ #define PCI_AGP_COMMAND_AGP 0x0100 /* Allow processing of AGP transactions */ #define PCI_AGP_COMMAND_64BIT 0x0020 /* Allow processing of 64-bit addresses */ #define PCI_AGP_COMMAND_FW 0x0010 /* Force FW transfers */ #define PCI_AGP_COMMAND_RATE4 0x0004 /* Use 4x rate */ #define PCI_AGP_COMMAND_RATE2 0x0002 /* Use 2x rate */ #define PCI_AGP_COMMAND_RATE1 0x0001 /* Use 1x rate */ #define PCI_AGP_SIZEOF 12 /* Vital Product Data */ #define PCI_VPD_ADDR 2 /* Address to access (15 bits!) */ #define PCI_VPD_ADDR_MASK 0x7fff /* Address mask */ #define PCI_VPD_ADDR_F 0x8000 /* Write 0, 1 indicates completion */ #define PCI_VPD_DATA 4 /* 32-bits of data returned here */ /* Slot Identification */ #define PCI_SID_ESR 2 /* Expansion Slot Register */ #define PCI_SID_ESR_NSLOTS 0x1f /* Number of expansion slots available */ #define PCI_SID_ESR_FIC 0x20 /* First In Chassis Flag */ #define PCI_SID_CHASSIS_NR 3 /* Chassis Number */ /* Message Signalled Interrupts registers */ #define PCI_MSI_FLAGS 2 /* Various flags */ #define PCI_MSI_FLAGS_64BIT 0x80 /* 64-bit addresses allowed */ #define PCI_MSI_FLAGS_QSIZE 0x70 /* Message queue size configured */ #define PCI_MSI_FLAGS_QMASK 0x0e /* Maximum queue size available */ #define PCI_MSI_FLAGS_ENABLE 0x01 /* MSI feature enabled */ #define PCI_MSI_FLAGS_MASKBIT 0x100 /* 64-bit mask bits allowed */ #define PCI_MSI_RFU 3 /* Rest of capability flags */ #define PCI_MSI_ADDRESS_LO 4 /* Lower 32 bits */ #define PCI_MSI_ADDRESS_HI 8 /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */ #define PCI_MSI_DATA_32 8 /* 16 bits of data for 32-bit devices */ #define PCI_MSI_DATA_64 12 /* 16 bits of data for 64-bit devices */ #define PCI_MSI_MASK_BIT 16 /* Mask bits register */ /* MSI-X registers (these are at offset PCI_MSIX_FLAGS) */ #define PCI_MSIX_FLAGS 2 #define PCI_MSIX_FLAGS_QSIZE 0x7FF #define PCI_MSIX_FLAGS_ENABLE (1 << 15) #define PCI_MSIX_FLAGS_MASKALL (1 << 14) #define PCI_MSIX_TABLE 4 #define PCI_MSIX_PBA 8 #define PCI_MSIX_BIRMASK (7 << 0) #define PCI_MSIX_ENTRY_SIZE 16 #define PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET 0 #define PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET 4 #define PCI_MSIX_ENTRY_DATA_OFFSET 8 #define PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET 12 #define PCI_MSIX_VECTOR_BITMASK (1 << 0) /* CompactPCI Hotswap Register */ #define PCI_CHSWP_CSR 2 /* Control and Status Register */ #define PCI_CHSWP_DHA 0x01 /* Device Hiding Arm */ #define PCI_CHSWP_EIM 0x02 /* ENUM# Signal Mask */ #define PCI_CHSWP_PIE 0x04 /* Pending Insert or Extract */ #define PCI_CHSWP_LOO 0x08 /* LED On / Off */ #define PCI_CHSWP_PI 0x30 /* Programming Interface */ #define PCI_CHSWP_EXT 0x40 /* ENUM# status - extraction */ #define 
PCI_CHSWP_INS 0x80 /* ENUM# status - insertion */ /* PCI-X registers */ #define PCI_X_CMD 2 /* Modes & Features */ #define PCI_X_CMD_DPERR_E 0x0001 /* Data Parity Error Recovery Enable */ #define PCI_X_CMD_ERO 0x0002 /* Enable Relaxed Ordering */ #define PCI_X_CMD_READ_512 0x0000 /* 512 byte maximum read byte count */ #define PCI_X_CMD_READ_1K 0x0004 /* 1Kbyte maximum read byte count */ #define PCI_X_CMD_READ_2K 0x0008 /* 2Kbyte maximum read byte count */ #define PCI_X_CMD_READ_4K 0x000c /* 4Kbyte maximum read byte count */ #define PCI_X_CMD_MAX_READ 0x000c /* Max Memory Read Byte Count */ /* Max # of outstanding split transactions */ #define PCI_X_CMD_SPLIT_1 0x0000 /* Max 1 */ #define PCI_X_CMD_SPLIT_2 0x0010 /* Max 2 */ #define PCI_X_CMD_SPLIT_3 0x0020 /* Max 3 */ #define PCI_X_CMD_SPLIT_4 0x0030 /* Max 4 */ #define PCI_X_CMD_SPLIT_8 0x0040 /* Max 8 */ #define PCI_X_CMD_SPLIT_12 0x0050 /* Max 12 */ #define PCI_X_CMD_SPLIT_16 0x0060 /* Max 16 */ #define PCI_X_CMD_SPLIT_32 0x0070 /* Max 32 */ #define PCI_X_CMD_MAX_SPLIT 0x0070 /* Max Outstanding Split Transactions */ #define PCI_X_CMD_VERSION(x) (((x) >> 12) & 3) /* Version */ #define PCI_X_STATUS 4 /* PCI-X capabilities */ #define PCI_X_STATUS_DEVFN 0x000000ff /* A copy of devfn */ #define PCI_X_STATUS_BUS 0x0000ff00 /* A copy of bus nr */ #define PCI_X_STATUS_64BIT 0x00010000 /* 64-bit device */ #define PCI_X_STATUS_133MHZ 0x00020000 /* 133 MHz capable */ #define PCI_X_STATUS_SPL_DISC 0x00040000 /* Split Completion Discarded */ #define PCI_X_STATUS_UNX_SPL 0x00080000 /* Unexpected Split Completion */ #define PCI_X_STATUS_COMPLEX 0x00100000 /* Device Complexity */ #define PCI_X_STATUS_MAX_READ 0x00600000 /* Designed Max Memory Read Count */ #define PCI_X_STATUS_MAX_SPLIT 0x03800000 /* Designed Max Outstanding Split Transactions */ #define PCI_X_STATUS_MAX_CUM 0x1c000000 /* Designed Max Cumulative Read Size */ #define PCI_X_STATUS_SPL_ERR 0x20000000 /* Rcvd Split Completion Error Msg */ #define PCI_X_STATUS_266MHZ 0x40000000 /* 266 MHz capable */ #define PCI_X_STATUS_533MHZ 0x80000000 /* 533 MHz capable */ /* PCI Express capability registers */ #define PCI_EXP_FLAGS 2 /* Capabilities register */ #define PCI_EXP_FLAGS_VERS 0x000f /* Capability version */ #define PCI_EXP_FLAGS_TYPE 0x00f0 /* Device/Port type */ #define PCI_EXP_TYPE_ENDPOINT 0x0 /* Express Endpoint */ #define PCI_EXP_TYPE_LEG_END 0x1 /* Legacy Endpoint */ #define PCI_EXP_TYPE_ROOT_PORT 0x4 /* Root Port */ #define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */ #define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */ #define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */ #define PCI_EXP_TYPE_PCIE_BRIDGE 0x8 /* PCI/PCI-X to PCIE Bridge */ #define PCI_EXP_TYPE_RC_END 0x9 /* Root Complex Integrated Endpoint */ #define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */ #define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ #define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */ #define PCI_EXP_DEVCAP 4 /* Device capabilities */ #define PCI_EXP_DEVCAP_PAYLOAD 0x07 /* Max_Payload_Size */ #define PCI_EXP_DEVCAP_PHANTOM 0x18 /* Phantom functions */ #define PCI_EXP_DEVCAP_EXT_TAG 0x20 /* Extended tags */ #define PCI_EXP_DEVCAP_L0S 0x1c0 /* L0s Acceptable Latency */ #define PCI_EXP_DEVCAP_L1 0xe00 /* L1 Acceptable Latency */ #define PCI_EXP_DEVCAP_ATN_BUT 0x1000 /* Attention Button Present */ #define PCI_EXP_DEVCAP_ATN_IND 0x2000 /* Attention Indicator Present */ #define PCI_EXP_DEVCAP_PWR_IND 0x4000 /* Power Indicator Present */ #define PCI_EXP_DEVCAP_PWR_VAL 
0x3fc0000 /* Slot Power Limit Value */ #define PCI_EXP_DEVCAP_PWR_SCL 0xc000000 /* Slot Power Limit Scale */ #define PCI_EXP_DEVCTL 8 /* Device Control */ #define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */ #define PCI_EXP_DEVCTL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */ #define PCI_EXP_DEVCTL_FERE 0x0004 /* Fatal Error Reporting Enable */ #define PCI_EXP_DEVCTL_URRE 0x0008 /* Unsupported Request Reporting En. */ #define PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */ #define PCI_EXP_DEVCTL_PAYLOAD 0x00e0 /* Max_Payload_Size */ #define PCI_EXP_DEVCTL_EXT_TAG 0x0100 /* Extended Tag Field Enable */ #define PCI_EXP_DEVCTL_PHANTOM 0x0200 /* Phantom Functions Enable */ #define PCI_EXP_DEVCTL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */ #define PCI_EXP_DEVCTL_NOSNOOP_EN 0x0800 /* Enable No Snoop */ #define PCI_EXP_DEVCTL_READRQ 0x7000 /* Max_Read_Request_Size */ #define PCI_EXP_DEVSTA 10 /* Device Status */ #define PCI_EXP_DEVSTA_CED 0x01 /* Correctable Error Detected */ #define PCI_EXP_DEVSTA_NFED 0x02 /* Non-Fatal Error Detected */ #define PCI_EXP_DEVSTA_FED 0x04 /* Fatal Error Detected */ #define PCI_EXP_DEVSTA_URD 0x08 /* Unsupported Request Detected */ #define PCI_EXP_DEVSTA_AUXPD 0x10 /* AUX Power Detected */ #define PCI_EXP_DEVSTA_TRPND 0x20 /* Transactions Pending */ #define PCI_EXP_LNKCAP 12 /* Link Capabilities */ #define PCI_EXP_LNKCTL 16 /* Link Control */ #define PCI_EXP_LNKCTL_CLKREQ_EN 0x100 /* Enable clkreq */ #define PCI_EXP_LNKSTA 18 /* Link Status */ #define PCI_EXP_SLTCAP 20 /* Slot Capabilities */ #define PCI_EXP_SLTCTL 24 /* Slot Control */ #define PCI_EXP_SLTSTA 26 /* Slot Status */ #define PCI_EXP_RTCTL 28 /* Root Control */ #define PCI_EXP_RTCTL_SECEE 0x01 /* System Error on Correctable Error */ #define PCI_EXP_RTCTL_SENFEE 0x02 /* System Error on Non-Fatal Error */ #define PCI_EXP_RTCTL_SEFEE 0x04 /* System Error on Fatal Error */ #define PCI_EXP_RTCTL_PMEIE 0x08 /* PME Interrupt Enable */ #define PCI_EXP_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */ #define PCI_EXP_RTCAP 30 /* Root Capabilities */ #define PCI_EXP_RTSTA 32 /* Root Status */ /* Extended Capabilities (PCI-X 2.0 and Express) */ #define PCI_EXT_CAP_ID(header) (header & 0x0000ffff) #define PCI_EXT_CAP_VER(header) ((header >> 16) & 0xf) #define PCI_EXT_CAP_NEXT(header) ((header >> 20) & 0xffc) #define PCI_EXT_CAP_ID_ERR 1 #define PCI_EXT_CAP_ID_VC 2 #define PCI_EXT_CAP_ID_DSN 3 #define PCI_EXT_CAP_ID_PWR 4 #define PCI_EXT_CAP_ID_ACS 13 #define PCI_EXT_CAP_ID_ARI 14 #define PCI_EXT_CAP_ID_ATS 15 #define PCI_EXT_CAP_ID_SRIOV 16 /* Advanced Error Reporting */ #define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ #define PCI_ERR_UNC_TRAIN 0x00000001 /* Training */ #define PCI_ERR_UNC_DLP 0x00000010 /* Data Link Protocol */ #define PCI_ERR_UNC_POISON_TLP 0x00001000 /* Poisoned TLP */ #define PCI_ERR_UNC_FCP 0x00002000 /* Flow Control Protocol */ #define PCI_ERR_UNC_COMP_TIME 0x00004000 /* Completion Timeout */ #define PCI_ERR_UNC_COMP_ABORT 0x00008000 /* Completer Abort */ #define PCI_ERR_UNC_UNX_COMP 0x00010000 /* Unexpected Completion */ #define PCI_ERR_UNC_RX_OVER 0x00020000 /* Receiver Overflow */ #define PCI_ERR_UNC_MALF_TLP 0x00040000 /* Malformed TLP */ #define PCI_ERR_UNC_ECRC 0x00080000 /* ECRC Error Status */ #define PCI_ERR_UNC_UNSUP 0x00100000 /* Unsupported Request */ #define PCI_ERR_UNCOR_MASK 8 /* Uncorrectable Error Mask */ /* Same bits as above */ #define PCI_ERR_UNCOR_SEVER 12 /* Uncorrectable Error Severity */ /* Same bits as above 
*/ #define PCI_ERR_COR_STATUS 16 /* Correctable Error Status */ #define PCI_ERR_COR_RCVR 0x00000001 /* Receiver Error Status */ #define PCI_ERR_COR_BAD_TLP 0x00000040 /* Bad TLP Status */ #define PCI_ERR_COR_BAD_DLLP 0x00000080 /* Bad DLLP Status */ #define PCI_ERR_COR_REP_ROLL 0x00000100 /* REPLAY_NUM Rollover */ #define PCI_ERR_COR_REP_TIMER 0x00001000 /* Replay Timer Timeout */ #define PCI_ERR_COR_MASK 20 /* Correctable Error Mask */ /* Same bits as above */ #define PCI_ERR_CAP 24 /* Advanced Error Capabilities */ #define PCI_ERR_CAP_FEP(x) ((x) & 31) /* First Error Pointer */ #define PCI_ERR_CAP_ECRC_GENC 0x00000020 /* ECRC Generation Capable */ #define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */ #define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */ #define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ #define PCI_ERR_HEADER_LOG 28 /* Header Log Register (16 bytes) */ #define PCI_ERR_ROOT_COMMAND 44 /* Root Error Command */ /* Correctable Err Reporting Enable */ #define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 /* Non-fatal Err Reporting Enable */ #define PCI_ERR_ROOT_CMD_NONFATAL_EN 0x00000002 /* Fatal Err Reporting Enable */ #define PCI_ERR_ROOT_CMD_FATAL_EN 0x00000004 #define PCI_ERR_ROOT_STATUS 48 #define PCI_ERR_ROOT_COR_RCV 0x00000001 /* ERR_COR Received */ /* Multi ERR_COR Received */ #define PCI_ERR_ROOT_MULTI_COR_RCV 0x00000002 /* ERR_FATAL/NONFATAL Recevied */ #define PCI_ERR_ROOT_UNCOR_RCV 0x00000004 /* Multi ERR_FATAL/NONFATAL Recevied */ #define PCI_ERR_ROOT_MULTI_UNCOR_RCV 0x00000008 #define PCI_ERR_ROOT_FIRST_FATAL 0x00000010 /* First Fatal */ #define PCI_ERR_ROOT_NONFATAL_RCV 0x00000020 /* Non-Fatal Received */ #define PCI_ERR_ROOT_FATAL_RCV 0x00000040 /* Fatal Received */ #define PCI_ERR_ROOT_COR_SRC 52 #define PCI_ERR_ROOT_SRC 54 /* Virtual Channel */ #define PCI_VC_PORT_REG1 4 #define PCI_VC_PORT_REG2 8 #define PCI_VC_PORT_CTRL 12 #define PCI_VC_PORT_STATUS 14 #define PCI_VC_RES_CAP 16 #define PCI_VC_RES_CTRL 20 #define PCI_VC_RES_STATUS 26 /* Power Budgeting */ #define PCI_PWR_DSR 4 /* Data Select Register */ #define PCI_PWR_DATA 8 /* Data Register */ #define PCI_PWR_DATA_BASE(x) ((x) & 0xff) /* Base Power */ #define PCI_PWR_DATA_SCALE(x) (((x) >> 8) & 3) /* Data Scale */ #define PCI_PWR_DATA_PM_SUB(x) (((x) >> 10) & 7) /* PM Sub State */ #define PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */ #define PCI_PWR_DATA_TYPE(x) (((x) >> 15) & 7) /* Type */ #define PCI_PWR_DATA_RAIL(x) (((x) >> 18) & 7) /* Power Rail */ #define PCI_PWR_CAP 12 /* Capability */ #define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */ /* * Hypertransport sub capability types * * Unfortunately there are both 3 bit and 5 bit capability types defined * in the HT spec, catering for that is a little messy. You probably don't * want to use these directly, just use pci_find_ht_capability() and it * will do the right thing for you. 
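 *
 * Illustrative sketch (not part of the original header): an open-coded
 * walk compares the upper byte of each HT capability's command register
 * (config offset pos + 3, called cap_type here; both names are
 * hypothetical) against the wanted HT_CAPTYPE_* value under the
 * appropriate mask:
 *
 *     uint8_t mask = (want == HT_CAPTYPE_SLAVE || want == HT_CAPTYPE_HOST)
 *                    ? HT_3BIT_CAP_MASK : HT_5BIT_CAP_MASK;
 *     if ( (cap_type & mask) == want )
 *         return pos;    where pos is the capability's config-space offset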
*/ #define HT_3BIT_CAP_MASK 0xE0 #define HT_CAPTYPE_SLAVE 0x00 /* Slave/Primary link configuration */ #define HT_CAPTYPE_HOST 0x20 /* Host/Secondary link configuration */ #define HT_5BIT_CAP_MASK 0xF8 #define HT_CAPTYPE_IRQ 0x80 /* IRQ Configuration */ #define HT_CAPTYPE_REMAPPING_40 0xA0 /* 40 bit address remapping */ #define HT_CAPTYPE_REMAPPING_64 0xA2 /* 64 bit address remapping */ #define HT_CAPTYPE_UNITID_CLUMP 0x90 /* Unit ID clumping */ #define HT_CAPTYPE_EXTCONF 0x98 /* Extended Configuration Space Access */ #define HT_CAPTYPE_MSI_MAPPING 0xA8 /* MSI Mapping Capability */ #define HT_MSI_FLAGS 0x02 /* Offset to flags */ #define HT_MSI_FLAGS_ENABLE 0x1 /* Mapping enable */ #define HT_MSI_FLAGS_FIXED 0x2 /* Fixed mapping only */ #define HT_MSI_FIXED_ADDR 0x00000000FEE00000ULL /* Fixed addr */ #define HT_MSI_ADDR_LO 0x04 /* Offset to low addr bits */ #define HT_MSI_ADDR_LO_MASK 0xFFF00000 /* Low address bit mask */ #define HT_MSI_ADDR_HI 0x08 /* Offset to high addr bits */ #define HT_CAPTYPE_DIRECT_ROUTE 0xB0 /* Direct routing configuration */ #define HT_CAPTYPE_VCSET 0xB8 /* Virtual Channel configuration */ #define HT_CAPTYPE_ERROR_RETRY 0xC0 /* Retry on error configuration */ #define HT_CAPTYPE_GEN3 0xD0 /* Generation 3 hypertransport configuration */ #define HT_CAPTYPE_PM 0xE0 /* Hypertransport powermanagement configuration */ /* Access Control Service */ #define PCI_ACS_CAP 0x04 /* ACS Capability Register */ #define PCI_ACS_SV 0x01 /* Source Validation */ #define PCI_ACS_TB 0x02 /* Translation Blocking */ #define PCI_ACS_RR 0x04 /* P2P Request Redirect */ #define PCI_ACS_CR 0x08 /* P2P Completion Redirect */ #define PCI_ACS_UF 0x10 /* Upstream Forwarding */ #define PCI_ACS_EC 0x20 /* P2P Egress Control */ #define PCI_ACS_DT 0x40 /* Direct Translated P2P */ #define PCI_ACS_CTRL 0x06 /* ACS Control Register */ #define PCI_ACS_EGRESS_CTL_V 0x08 /* ACS Egress Control Vector */ /* Single Root I/O Virtualization */ #define PCI_SRIOV_CAP 0x04 /* SR-IOV Capabilities */ #define PCI_SRIOV_CAP_VFM 0x01 /* VF Migration Capable */ #define PCI_SRIOV_CAP_INTR(x) ((x) >> 21) /* Interrupt Message Number */ #define PCI_SRIOV_CTRL 0x08 /* SR-IOV Control */ #define PCI_SRIOV_CTRL_VFE 0x01 /* VF Enable */ #define PCI_SRIOV_CTRL_VFM 0x02 /* VF Migration Enable */ #define PCI_SRIOV_CTRL_INTR 0x04 /* VF Migration Interrupt Enable */ #define PCI_SRIOV_CTRL_MSE 0x08 /* VF Memory Space Enable */ #define PCI_SRIOV_CTRL_ARI 0x10 /* ARI Capable Hierarchy */ #define PCI_SRIOV_STATUS 0x0a /* SR-IOV Status */ #define PCI_SRIOV_STATUS_VFM 0x01 /* VF Migration Status */ #define PCI_SRIOV_INITIAL_VF 0x0c /* Initial VFs */ #define PCI_SRIOV_TOTAL_VF 0x0e /* Total VFs */ #define PCI_SRIOV_NUM_VF 0x10 /* Number of VFs */ #define PCI_SRIOV_FUNC_LINK 0x12 /* Function Dependency Link */ #define PCI_SRIOV_VF_OFFSET 0x14 /* First VF Offset */ #define PCI_SRIOV_VF_STRIDE 0x16 /* Following VF Stride */ #define PCI_SRIOV_VF_DID 0x1a /* VF Device ID */ #define PCI_SRIOV_SUP_PGSIZE 0x1c /* Supported Page Sizes */ #define PCI_SRIOV_SYS_PGSIZE 0x20 /* System Page Size */ #define PCI_SRIOV_BAR 0x24 /* VF BAR0 */ #define PCI_SRIOV_NUM_BARS 6 /* Number of VF BARs */ #define PCI_SRIOV_VFM 0x3c /* VF Migration State Array Offset*/ #define PCI_SRIOV_VFM_BIR(x) ((x) & 7) /* State BIR */ #define PCI_SRIOV_VFM_OFFSET(x) ((x) & ~7) /* State Offset */ #define PCI_SRIOV_VFM_UA 0x0 /* Inactive.Unavailable */ #define PCI_SRIOV_VFM_MI 0x1 /* Dormant.MigrateIn */ #define PCI_SRIOV_VFM_MO 0x2 /* Active.MigrateOut */ #define PCI_SRIOV_VFM_AV 0x3 /* 
Active.Available */ #endif /* LINUX_PCI_REGS_H */ xen-4.4.0/xen/include/xen/cache.h0000664000175000017500000000060312307313555014665 0ustar smbsmb#ifndef __LINUX_CACHE_H #define __LINUX_CACHE_H #include #ifndef L1_CACHE_ALIGN #define L1_CACHE_ALIGN(x) (((x)+(L1_CACHE_BYTES-1))&~(L1_CACHE_BYTES-1)) #endif #ifndef SMP_CACHE_BYTES #define SMP_CACHE_BYTES L1_CACHE_BYTES #endif #ifndef __cacheline_aligned #define __cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) #endif #endif /* __LINUX_CACHE_H */ xen-4.4.0/xen/include/xen/console.h0000664000175000017500000000213612307313555015267 0ustar smbsmb/****************************************************************************** * xen/console.h * * Xen header file concerning console access. */ #ifndef __CONSOLE_H__ #define __CONSOLE_H__ #include #include struct xen_sysctl_readconsole; long read_console_ring(struct xen_sysctl_readconsole *op); void console_init_preirq(void); void console_init_postirq(void); void console_endboot(void); int console_has(const char *device); int fill_console_start_info(struct dom0_vga_console_info *); void console_force_unlock(void); void console_start_sync(void); void console_end_sync(void); void console_start_log_everything(void); void console_end_log_everything(void); /* * Steal output from the console. Returns +ve identifier, else -ve error. * Takes the handle of the serial line to steal, and steal callback function. */ int console_steal(int handle, void (*fn)(const char *)); /* Give back stolen console. Takes the identifier returned by console_steal. */ void console_giveback(int id); int console_suspend(void); int console_resume(void); #endif /* __CONSOLE_H__ */ xen-4.4.0/xen/include/xen/event.h0000664000175000017500000001552212307313555014751 0ustar smbsmb/****************************************************************************** * event.h * * A nice interface for passing asynchronous events to guest OSes. * * Copyright (c) 2002-2006, K A Fraser */ #ifndef __XEN_EVENT_H__ #define __XEN_EVENT_H__ #include #include #include #include #include /* * send_guest_vcpu_virq: Notify guest via a per-VCPU VIRQ. * @v: VCPU to which virtual IRQ should be sent * @virq: Virtual IRQ number (VIRQ_*) */ void send_guest_vcpu_virq(struct vcpu *v, uint32_t virq); /* * send_global_virq: Notify the domain handling a global VIRQ. * @virq: Virtual IRQ number (VIRQ_*) */ void send_global_virq(uint32_t virq); /* * sent_global_virq_handler: Set a global VIRQ handler. * @d: New target domain for this VIRQ * @virq: Virtual IRQ number (VIRQ_*), must be global */ int set_global_virq_handler(struct domain *d, uint32_t virq); /* * send_guest_pirq: * @d: Domain to which physical IRQ should be sent * @pirq: Physical IRQ number */ void send_guest_pirq(struct domain *, const struct pirq *); /* Send a notification from a given domain's event-channel port. */ int evtchn_send(struct domain *d, unsigned int lport); /* Bind a local event-channel port to the specified VCPU. */ long evtchn_bind_vcpu(unsigned int port, unsigned int vcpu_id); /* Unmask a local event-channel port. */ int evtchn_unmask(unsigned int port); /* Move all PIRQs after a vCPU was moved to another pCPU. */ void evtchn_move_pirqs(struct vcpu *v); /* Allocate/free a Xen-attached event channel port. 
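 *
 * Illustrative sketch (not part of the original header), using the
 * declarations that follow; the callback, domain pointer and remote
 * domain id are hypothetical:
 *
 *     static void my_notification(struct vcpu *v, unsigned int port)
 *     {
 *         ... runs when the remote end signals the port ...
 *     }
 *
 *     port = alloc_unbound_xen_event_channel(d->vcpu[0], remote_domid,
 *                                            my_notification);
 *     if ( port < 0 )
 *         ... handle the error ...
 *     notify_via_xen_event_channel(d, port);
 *     ...
 *     free_xen_event_channel(d->vcpu[0], port);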
*/ typedef void (*xen_event_channel_notification_t)( struct vcpu *v, unsigned int port); int alloc_unbound_xen_event_channel( struct vcpu *local_vcpu, domid_t remote_domid, xen_event_channel_notification_t notification_fn); void free_xen_event_channel( struct vcpu *local_vcpu, int port); /* Query if event channel is in use by the guest */ int guest_enabled_event(struct vcpu *v, uint32_t virq); /* Notify remote end of a Xen-attached event channel.*/ void notify_via_xen_event_channel(struct domain *ld, int lport); /* * Internal event channel object storage. * * The objects (struct evtchn) are indexed using a two level scheme of * groups and buckets. Each group is a page of bucket pointers. Each * bucket is a page-sized array of struct evtchn's. * * The first bucket is directly accessed via d->evtchn. */ #define group_from_port(d, p) \ ((d)->evtchn_group[(p) / EVTCHNS_PER_GROUP]) #define bucket_from_port(d, p) \ ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET]) static inline bool_t port_is_valid(struct domain *d, unsigned int p) { if ( p >= d->max_evtchns ) return 0; if ( !d->evtchn ) return 0; if ( p < EVTCHNS_PER_BUCKET ) return 1; return group_from_port(d, p) != NULL && bucket_from_port(d, p) != NULL; } static inline struct evtchn *evtchn_from_port(struct domain *d, unsigned int p) { if ( p < EVTCHNS_PER_BUCKET ) return &d->evtchn[p]; return bucket_from_port(d, p) + (p % EVTCHNS_PER_BUCKET); } /* Wait on a Xen-attached event channel. */ #define wait_on_xen_event_channel(port, condition) \ do { \ if ( condition ) \ break; \ set_bit(_VPF_blocked_in_xen, ¤t->pause_flags); \ smp_mb(); /* set blocked status /then/ re-evaluate condition */ \ if ( condition ) \ { \ clear_bit(_VPF_blocked_in_xen, ¤t->pause_flags); \ break; \ } \ raise_softirq(SCHEDULE_SOFTIRQ); \ do_softirq(); \ } while ( 0 ) #define prepare_wait_on_xen_event_channel(port) \ do { \ set_bit(_VPF_blocked_in_xen, ¤t->pause_flags); \ raise_softirq(SCHEDULE_SOFTIRQ); \ smp_mb(); /* set blocked status /then/ caller does his work */ \ } while ( 0 ) void evtchn_check_pollers(struct domain *d, unsigned int port); void evtchn_2l_init(struct domain *d); /* * Low-level event channel port ops. 
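 *
 * Illustrative sketch (not part of the original header): each event
 * channel ABI provides one instance of the structure below and installs
 * it in the domain, much as the 2-level ABI does from evtchn_2l_init();
 * the function names here are hypothetical stand-ins:
 *
 *     static const struct evtchn_port_ops my_evtchn_port_ops = {
 *         .set_pending   = my_set_pending,
 *         .clear_pending = my_clear_pending,
 *         .unmask        = my_unmask,
 *         .is_pending    = my_is_pending,
 *         .is_masked     = my_is_masked,
 *         .print_state   = my_print_state,
 *     };
 *
 *     d->evtchn_port_ops = &my_evtchn_port_ops;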
*/ struct evtchn_port_ops { void (*init)(struct domain *d, struct evtchn *evtchn); void (*set_pending)(struct vcpu *v, struct evtchn *evtchn); void (*clear_pending)(struct domain *d, struct evtchn *evtchn); void (*unmask)(struct domain *d, struct evtchn *evtchn); bool_t (*is_pending)(struct domain *d, const struct evtchn *evtchn); bool_t (*is_masked)(struct domain *d, const struct evtchn *evtchn); int (*set_priority)(struct domain *d, struct evtchn *evtchn, unsigned int priority); void (*print_state)(struct domain *d, const struct evtchn *evtchn); }; static inline void evtchn_port_init(struct domain *d, struct evtchn *evtchn) { if ( d->evtchn_port_ops->init ) d->evtchn_port_ops->init(d, evtchn); } static inline void evtchn_port_set_pending(struct vcpu *v, struct evtchn *evtchn) { v->domain->evtchn_port_ops->set_pending(v, evtchn); } static inline void evtchn_port_clear_pending(struct domain *d, struct evtchn *evtchn) { d->evtchn_port_ops->clear_pending(d, evtchn); } static inline void evtchn_port_unmask(struct domain *d, struct evtchn *evtchn) { d->evtchn_port_ops->unmask(d, evtchn); } static inline bool_t evtchn_port_is_pending(struct domain *d, const struct evtchn *evtchn) { return d->evtchn_port_ops->is_pending(d, evtchn); } static inline bool_t evtchn_port_is_masked(struct domain *d, const struct evtchn *evtchn) { return d->evtchn_port_ops->is_masked(d, evtchn); } static inline int evtchn_port_set_priority(struct domain *d, struct evtchn *evtchn, unsigned int priority) { if ( !d->evtchn_port_ops->set_priority ) return -ENOSYS; return d->evtchn_port_ops->set_priority(d, evtchn, priority); } static inline void evtchn_port_print_state(struct domain *d, const struct evtchn *evtchn) { d->evtchn_port_ops->print_state(d, evtchn); } #endif /* __XEN_EVENT_H__ */ xen-4.4.0/xen/include/xen/libfdt/0000775000175000017500000000000012307313555014716 5ustar smbsmbxen-4.4.0/xen/include/xen/libfdt/libfdt.h0000664000175000017500000013266712307313555016352 0ustar smbsmb#ifndef _LIBFDT_H #define _LIBFDT_H /* * libfdt - Flat Device Tree manipulation * Copyright (C) 2006 David Gibson, IBM Corporation. * * libfdt is dual licensed: you can use it either under the terms of * the GPL, or the BSD license, at your option. * * a) This library is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, * MA 02110-1301 USA * * Alternatively, * * b) Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * 1. Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #define FDT_FIRST_SUPPORTED_VERSION 0x10 #define FDT_LAST_SUPPORTED_VERSION 0x11 /* Error codes: informative error codes */ #define FDT_ERR_NOTFOUND 1 /* FDT_ERR_NOTFOUND: The requested node or property does not exist */ #define FDT_ERR_EXISTS 2 /* FDT_ERR_EXISTS: Attemped to create a node or property which * already exists */ #define FDT_ERR_NOSPACE 3 /* FDT_ERR_NOSPACE: Operation needed to expand the device * tree, but its buffer did not have sufficient space to * contain the expanded tree. Use fdt_open_into() to move the * device tree to a buffer with more space. */ /* Error codes: codes for bad parameters */ #define FDT_ERR_BADOFFSET 4 /* FDT_ERR_BADOFFSET: Function was passed a structure block * offset which is out-of-bounds, or which points to an * unsuitable part of the structure for the operation. */ #define FDT_ERR_BADPATH 5 /* FDT_ERR_BADPATH: Function was passed a badly formatted path * (e.g. missing a leading / for a function which requires an * absolute path) */ #define FDT_ERR_BADPHANDLE 6 /* FDT_ERR_BADPHANDLE: Function was passed an invalid phandle * value. phandle values of 0 and -1 are not permitted. */ #define FDT_ERR_BADSTATE 7 /* FDT_ERR_BADSTATE: Function was passed an incomplete device * tree created by the sequential-write functions, which is * not sufficiently complete for the requested operation. */ /* Error codes: codes for bad device tree blobs */ #define FDT_ERR_TRUNCATED 8 /* FDT_ERR_TRUNCATED: Structure block of the given device tree * ends without an FDT_END tag. */ #define FDT_ERR_BADMAGIC 9 /* FDT_ERR_BADMAGIC: Given "device tree" appears not to be a * device tree at all - it is missing the flattened device * tree magic number. */ #define FDT_ERR_BADVERSION 10 /* FDT_ERR_BADVERSION: Given device tree has a version which * can't be handled by the requested operation. For * read-write functions, this may mean that fdt_open_into() is * required to convert the tree to the expected version. */ #define FDT_ERR_BADSTRUCTURE 11 /* FDT_ERR_BADSTRUCTURE: Given device tree has a corrupt * structure block or other serious error (e.g. misnested * nodes, or subnodes preceding properties). */ #define FDT_ERR_BADLAYOUT 12 /* FDT_ERR_BADLAYOUT: For read-write functions, the given * device tree has it's sub-blocks in an order that the * function can't handle (memory reserve map, then structure, * then strings). Use fdt_open_into() to reorganize the tree * into a form suitable for the read-write operations. */ /* "Can't happen" error indicating a bug in libfdt */ #define FDT_ERR_INTERNAL 13 /* FDT_ERR_INTERNAL: libfdt has failed an internal assertion. * Should never be returned, if it is, it indicates a bug in * libfdt itself. 
*/ #define FDT_ERR_MAX 13 /**********************************************************************/ /* Low-level functions (you probably don't need these) */ /**********************************************************************/ const void *fdt_offset_ptr(const void *fdt, int offset, unsigned int checklen); static inline void *fdt_offset_ptr_w(void *fdt, int offset, int checklen) { return (void *)(uintptr_t)fdt_offset_ptr(fdt, offset, checklen); } uint32_t fdt_next_tag(const void *fdt, int offset, int *nextoffset); /**********************************************************************/ /* Traversal functions */ /**********************************************************************/ int fdt_next_node(const void *fdt, int offset, int *depth); /**********************************************************************/ /* General functions */ /**********************************************************************/ #define fdt_get_header(fdt, field) \ (fdt32_to_cpu(((const struct fdt_header *)(fdt))->field)) #define fdt_magic(fdt) (fdt_get_header(fdt, magic)) #define fdt_totalsize(fdt) (fdt_get_header(fdt, totalsize)) #define fdt_off_dt_struct(fdt) (fdt_get_header(fdt, off_dt_struct)) #define fdt_off_dt_strings(fdt) (fdt_get_header(fdt, off_dt_strings)) #define fdt_off_mem_rsvmap(fdt) (fdt_get_header(fdt, off_mem_rsvmap)) #define fdt_version(fdt) (fdt_get_header(fdt, version)) #define fdt_last_comp_version(fdt) (fdt_get_header(fdt, last_comp_version)) #define fdt_boot_cpuid_phys(fdt) (fdt_get_header(fdt, boot_cpuid_phys)) #define fdt_size_dt_strings(fdt) (fdt_get_header(fdt, size_dt_strings)) #define fdt_size_dt_struct(fdt) (fdt_get_header(fdt, size_dt_struct)) #define __fdt_set_hdr(name) \ static inline void fdt_set_##name(void *fdt, uint32_t val) \ { \ struct fdt_header *fdth = (struct fdt_header*)fdt; \ fdth->name = cpu_to_fdt32(val); \ } __fdt_set_hdr(magic); __fdt_set_hdr(totalsize); __fdt_set_hdr(off_dt_struct); __fdt_set_hdr(off_dt_strings); __fdt_set_hdr(off_mem_rsvmap); __fdt_set_hdr(version); __fdt_set_hdr(last_comp_version); __fdt_set_hdr(boot_cpuid_phys); __fdt_set_hdr(size_dt_strings); __fdt_set_hdr(size_dt_struct); #undef __fdt_set_hdr /** * fdt_check_header - sanity check a device tree or possible device tree * @fdt: pointer to data which might be a flattened device tree * * fdt_check_header() checks that the given buffer contains what * appears to be a flattened device tree with sane information in its * header. * * returns: * 0, if the buffer appears to contain a valid device tree * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, standard meanings, as above */ int fdt_check_header(const void *fdt); /** * fdt_move - move a device tree around in memory * @fdt: pointer to the device tree to move * @buf: pointer to memory where the device is to be moved * @bufsize: size of the memory space at buf * * fdt_move() relocates, if possible, the device tree blob located at * fdt to the buffer at buf of size bufsize. The buffer may overlap * with the existing device tree blob at fdt. Therefore, * fdt_move(fdt, fdt, fdt_totalsize(fdt)) * should always succeed. 
* * returns: * 0, on success * -FDT_ERR_NOSPACE, bufsize is insufficient to contain the device tree * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, standard meanings */ int fdt_move(const void *fdt, void *buf, int bufsize); /**********************************************************************/ /* Read-only functions */ /**********************************************************************/ /** * fdt_string - retrieve a string from the strings block of a device tree * @fdt: pointer to the device tree blob * @stroffset: offset of the string within the strings block (native endian) * * fdt_string() retrieves a pointer to a single string from the * strings block of the device tree blob at fdt. * * returns: * a pointer to the string, on success * NULL, if stroffset is out of bounds */ const char *fdt_string(const void *fdt, int stroffset); /** * fdt_num_mem_rsv - retrieve the number of memory reserve map entries * @fdt: pointer to the device tree blob * * Returns the number of entries in the device tree blob's memory * reservation map. This does not include the terminating 0,0 entry * or any other (0,0) entries reserved for expansion. * * returns: * the number of entries */ int fdt_num_mem_rsv(const void *fdt); /** * fdt_get_mem_rsv - retrieve one memory reserve map entry * @fdt: pointer to the device tree blob * @address, @size: pointers to 64-bit variables * * On success, *address and *size will contain the address and size of * the n-th reserve map entry from the device tree blob, in * native-endian format. * * returns: * 0, on success * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, standard meanings */ int fdt_get_mem_rsv(const void *fdt, int n, uint64_t *address, uint64_t *size); /** * fdt_subnode_offset_namelen - find a subnode based on substring * @fdt: pointer to the device tree blob * @parentoffset: structure block offset of a node * @name: name of the subnode to locate * @namelen: number of characters of name to consider * * Identical to fdt_subnode_offset(), but only examine the first * namelen characters of name for matching the subnode name. This is * useful for finding subnodes based on a portion of a larger string, * such as a full path. */ int fdt_subnode_offset_namelen(const void *fdt, int parentoffset, const char *name, int namelen); /** * fdt_subnode_offset - find a subnode of a given node * @fdt: pointer to the device tree blob * @parentoffset: structure block offset of a node * @name: name of the subnode to locate * * fdt_subnode_offset() finds a subnode of the node at structure block * offset parentoffset with the given name. name may include a unit * address, in which case fdt_subnode_offset() will find the subnode * with that unit address, or the unit address may be omitted, in * which case fdt_subnode_offset() will find an arbitrary subnode * whose name excluding unit address matches the given name. * * returns: * structure block offset of the requested subnode (>=0), on success * -FDT_ERR_NOTFOUND, if the requested subnode does not exist * -FDT_ERR_BADOFFSET, if parentoffset did not point to an FDT_BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings. */ int fdt_subnode_offset(const void *fdt, int parentoffset, const char *name); /** * fdt_path_offset - find a tree node by its full path * @fdt: pointer to the device tree blob * @path: full path of the node to locate * * fdt_path_offset() finds a node of a given path in the device tree. 
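 * For example (illustrative only, the path is made up):
 *     int node = fdt_path_offset(fdt, "/cpus/cpu@0");
 *     if (node < 0)
 *         ... node holds a -FDT_ERR_* value such as -FDT_ERR_NOTFOUND ...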
* Each path component may omit the unit address portion, but the * results of this are undefined if any such path component is * ambiguous (that is if there are multiple nodes at the relevant * level matching the given component, differentiated only by unit * address). * * returns: * structure block offset of the node with the requested path (>=0), on success * -FDT_ERR_BADPATH, given path does not begin with '/' or is invalid * -FDT_ERR_NOTFOUND, if the requested node does not exist * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings. */ int fdt_path_offset(const void *fdt, const char *path); /** * fdt_get_name - retrieve the name of a given node * @fdt: pointer to the device tree blob * @nodeoffset: structure block offset of the starting node * @lenp: pointer to an integer variable (will be overwritten) or NULL * * fdt_get_name() retrieves the name (including unit address) of the * device tree node at structure block offset nodeoffset. If lenp is * non-NULL, the length of this name is also returned, in the integer * pointed to by lenp. * * returns: * pointer to the node's name, on success * If lenp is non-NULL, *lenp contains the length of that name (>=0) * NULL, on error * if lenp is non-NULL *lenp contains an error code (<0): * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, standard meanings */ const char *fdt_get_name(const void *fdt, int nodeoffset, int *lenp); /** * fdt_first_property_offset - find the offset of a node's first property * @fdt: pointer to the device tree blob * @nodeoffset: structure block offset of a node * * fdt_first_property_offset() finds the first property of the node at * the given structure block offset. * * returns: * structure block offset of the property (>=0), on success * -FDT_ERR_NOTFOUND, if the requested node has no properties * -FDT_ERR_BADOFFSET, if nodeoffset did not point to an FDT_BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings. */ int fdt_first_property_offset(const void *fdt, int nodeoffset); /** * fdt_next_property_offset - step through a node's properties * @fdt: pointer to the device tree blob * @offset: structure block offset of a property * * fdt_next_property_offset() finds the property immediately after the * one at the given structure block offset. This will be a property * of the same node as the given property. * * returns: * structure block offset of the next property (>=0), on success * -FDT_ERR_NOTFOUND, if the given property is the last in its node * -FDT_ERR_BADOFFSET, if nodeoffset did not point to an FDT_PROP tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings. */ int fdt_next_property_offset(const void *fdt, int offset); /** * fdt_get_property_by_offset - retrieve the property at a given offset * @fdt: pointer to the device tree blob * @offset: offset of the property to retrieve * @lenp: pointer to an integer variable (will be overwritten) or NULL * * fdt_get_property_by_offset() retrieves a pointer to the * fdt_property structure within the device tree blob at the given * offset. If lenp is non-NULL, the length of the property value is * also returned, in the integer pointed to by lenp. 
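 *
 * A typical use (illustrative, not from the original header) is to walk
 * all properties of a node together with fdt_first_property_offset()
 * and fdt_next_property_offset() declared above:
 *
 *     for (offset = fdt_first_property_offset(fdt, nodeoffset);
 *          offset >= 0;
 *          offset = fdt_next_property_offset(fdt, offset)) {
 *         const struct fdt_property *prop =
 *             fdt_get_property_by_offset(fdt, offset, &len);
 *         ... use prop ...
 *     }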
* * returns: * pointer to the structure representing the property * if lenp is non-NULL, *lenp contains the length of the property * value (>=0) * NULL, on error * if lenp is non-NULL, *lenp contains an error code (<0): * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_PROP tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings */ const struct fdt_property *fdt_get_property_by_offset(const void *fdt, int offset, int *lenp); /** * fdt_get_property_namelen - find a property based on substring * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose property to find * @name: name of the property to find * @namelen: number of characters of name to consider * @lenp: pointer to an integer variable (will be overwritten) or NULL * * Identical to fdt_get_property(), but only examine the first * namelen characters of name for matching the property name. */ const struct fdt_property *fdt_get_property_namelen(const void *fdt, int nodeoffset, const char *name, int namelen, int *lenp); /** * fdt_get_property - find a given property in a given node * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose property to find * @name: name of the property to find * @lenp: pointer to an integer variable (will be overwritten) or NULL * * fdt_get_property() retrieves a pointer to the fdt_property * structure within the device tree blob corresponding to the property * named 'name' of the node at offset nodeoffset. If lenp is * non-NULL, the length of the property value is also returned, in the * integer pointed to by lenp. * * returns: * pointer to the structure representing the property * if lenp is non-NULL, *lenp contains the length of the property * value (>=0) * NULL, on error * if lenp is non-NULL, *lenp contains an error code (<0): * -FDT_ERR_NOTFOUND, node does not have named property * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings */ const struct fdt_property *fdt_get_property(const void *fdt, int nodeoffset, const char *name, int *lenp); static inline struct fdt_property *fdt_get_property_w(void *fdt, int nodeoffset, const char *name, int *lenp) { return (struct fdt_property *)(uintptr_t) fdt_get_property(fdt, nodeoffset, name, lenp); } /** * fdt_getprop_by_offset - retrieve the value of a property at a given offset * @fdt: pointer to the device tree blob * @offset: offset of the property to read * @namep: pointer to a string variable (will be overwritten) or NULL * @lenp: pointer to an integer variable (will be overwritten) or NULL * * fdt_getprop_by_offset() retrieves a pointer to the value of the * property at structure block offset 'offset' (this will be a pointer * to within the device blob itself, not a copy of the value). If * lenp is non-NULL, the length of the property value is also * returned, in the integer pointed to by lenp. If namep is non-NULL, * the property's name will also be returned in the char * pointed to * by namep (this will be a pointer to within the device tree's string * block, not a new copy of the name). * * returns: * pointer to the property's value * if lenp is non-NULL, *lenp contains the length of the property * value (>=0) * if namep is non-NULL *namep contains a pointer to the property * name.
* NULL, on error * if lenp is non-NULL, *lenp contains an error code (<0): * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_PROP tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings */ const void *fdt_getprop_by_offset(const void *fdt, int offset, const char **namep, int *lenp); /** * fdt_getprop_namelen - get property value based on substring * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose property to find * @name: name of the property to find * @namelen: number of characters of name to consider * @lenp: pointer to an integer variable (will be overwritten) or NULL * * Identical to fdt_getprop(), but only examine the first namelen * characters of name for matching the property name. */ const void *fdt_getprop_namelen(const void *fdt, int nodeoffset, const char *name, int namelen, int *lenp); /** * fdt_getprop - retrieve the value of a given property * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose property to find * @name: name of the property to find * @lenp: pointer to an integer variable (will be overwritten) or NULL * * fdt_getprop() retrieves a pointer to the value of the property * named 'name' of the node at offset nodeoffset (this will be a * pointer to within the device blob itself, not a copy of the value). * If lenp is non-NULL, the length of the property value is also * returned, in the integer pointed to by lenp. * * returns: * pointer to the property's value * if lenp is non-NULL, *lenp contains the length of the property * value (>=0) * NULL, on error * if lenp is non-NULL, *lenp contains an error code (<0): * -FDT_ERR_NOTFOUND, node does not have named property * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings */ const void *fdt_getprop(const void *fdt, int nodeoffset, const char *name, int *lenp); static inline void *fdt_getprop_w(void *fdt, int nodeoffset, const char *name, int *lenp) { return (void *)(uintptr_t)fdt_getprop(fdt, nodeoffset, name, lenp); } /** * fdt_get_phandle - retrieve the phandle of a given node * @fdt: pointer to the device tree blob * @nodeoffset: structure block offset of the node * * fdt_get_phandle() retrieves the phandle of the device tree node at * structure block offset nodeoffset. * * returns: * the phandle of the node at nodeoffset, on success (!= 0, != -1) * 0, if the node has no phandle, or another error occurs */ uint32_t fdt_get_phandle(const void *fdt, int nodeoffset); /** * fdt_get_alias_namelen - get alias based on substring * @fdt: pointer to the device tree blob * @name: name of the alias to look up * @namelen: number of characters of name to consider * * Identical to fdt_get_alias(), but only examine the first namelen * characters of name for matching the alias name. */ const char *fdt_get_alias_namelen(const void *fdt, const char *name, int namelen); /** * fdt_get_alias - retrieve the path referenced by a given alias * @fdt: pointer to the device tree blob * @name: name of the alias to look up * * fdt_get_alias() retrieves the value of a given alias. That is, the * value of the property named 'name' in the node /aliases.
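 *
 * A common pattern (illustrative, the alias name is made up) is to
 * expand an alias and then look the resulting path up with
 * fdt_path_offset():
 *
 *     const char *path = fdt_get_alias(fdt, "serial0");
 *     if (path)
 *         node = fdt_path_offset(fdt, path);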
* * returns: * a pointer to the expansion of the alias named 'name', of it exists * NULL, if the given alias or the /aliases node does not exist */ const char *fdt_get_alias(const void *fdt, const char *name); /** * fdt_get_path - determine the full path of a node * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose path to find * @buf: character buffer to contain the returned path (will be overwritten) * @buflen: size of the character buffer at buf * * fdt_get_path() computes the full path of the node at offset * nodeoffset, and records that path in the buffer at buf. * * NOTE: This function is expensive, as it must scan the device tree * structure from the start to nodeoffset. * * returns: * 0, on success * buf contains the absolute path of the node at * nodeoffset, as a NUL-terminated string. * -FDT_ERR_BADOFFSET, nodeoffset does not refer to a BEGIN_NODE tag * -FDT_ERR_NOSPACE, the path of the given node is longer than (bufsize-1) * characters and will not fit in the given buffer. * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, standard meanings */ int fdt_get_path(const void *fdt, int nodeoffset, char *buf, int buflen); /** * fdt_supernode_atdepth_offset - find a specific ancestor of a node * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose parent to find * @supernodedepth: depth of the ancestor to find * @nodedepth: pointer to an integer variable (will be overwritten) or NULL * * fdt_supernode_atdepth_offset() finds an ancestor of the given node * at a specific depth from the root (where the root itself has depth * 0, its immediate subnodes depth 1 and so forth). So * fdt_supernode_atdepth_offset(fdt, nodeoffset, 0, NULL); * will always return 0, the offset of the root node. If the node at * nodeoffset has depth D, then: * fdt_supernode_atdepth_offset(fdt, nodeoffset, D, NULL); * will return nodeoffset itself. * * NOTE: This function is expensive, as it must scan the device tree * structure from the start to nodeoffset. * * returns: * structure block offset of the node at node offset's ancestor * of depth supernodedepth (>=0), on success * -FDT_ERR_BADOFFSET, nodeoffset does not refer to a BEGIN_NODE tag * -FDT_ERR_NOTFOUND, supernodedepth was greater than the depth of nodeoffset * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, standard meanings */ int fdt_supernode_atdepth_offset(const void *fdt, int nodeoffset, int supernodedepth, int *nodedepth); /** * fdt_node_depth - find the depth of a given node * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose parent to find * * fdt_node_depth() finds the depth of a given node. The root node * has depth 0, its immediate subnodes depth 1 and so forth. * * NOTE: This function is expensive, as it must scan the device tree * structure from the start to nodeoffset. * * returns: * depth of the node at nodeoffset (>=0), on success * -FDT_ERR_BADOFFSET, nodeoffset does not refer to a BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, standard meanings */ int fdt_node_depth(const void *fdt, int nodeoffset); /** * fdt_parent_offset - find the parent of a given node * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose parent to find * * fdt_parent_offset() locates the parent node of a given node (that * is, it finds the offset of the node which contains the node at * nodeoffset as a subnode). 
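 *
 * Illustrative use (not from the original header), assuming nodeoffset
 * was obtained earlier, e.g. from fdt_path_offset():
 *
 *     int parent = fdt_parent_offset(fdt, nodeoffset);
 *     if (parent >= 0)
 *         ... parent is the offset of the enclosing node ...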
* * NOTE: This function is expensive, as it must scan the device tree * structure from the start to nodeoffset, *twice*. * * returns: * structure block offset of the parent of the node at nodeoffset * (>=0), on success * -FDT_ERR_BADOFFSET, nodeoffset does not refer to a BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, standard meanings */ int fdt_parent_offset(const void *fdt, int nodeoffset); /** * fdt_node_offset_by_prop_value - find nodes with a given property value * @fdt: pointer to the device tree blob * @startoffset: only find nodes after this offset * @propname: property name to check * @propval: property value to search for * @proplen: length of the value in propval * * fdt_node_offset_by_prop_value() returns the offset of the first * node after startoffset, which has a property named propname whose * value is of length proplen and has value equal to propval; or if * startoffset is -1, the very first such node in the tree. * * To iterate through all nodes matching the criterion, the following * idiom can be used: * offset = fdt_node_offset_by_prop_value(fdt, -1, propname, * propval, proplen); * while (offset != -FDT_ERR_NOTFOUND) { * // other code here * offset = fdt_node_offset_by_prop_value(fdt, offset, propname, * propval, proplen); * } * * Note the -1 in the first call to the function, if 0 is used here * instead, the function will never locate the root node, even if it * matches the criterion. * * returns: * structure block offset of the located node (>= 0, >startoffset), * on success * -FDT_ERR_NOTFOUND, no node matching the criterion exists in the * tree after startoffset * -FDT_ERR_BADOFFSET, nodeoffset does not refer to a BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, standard meanings */ int fdt_node_offset_by_prop_value(const void *fdt, int startoffset, const char *propname, const void *propval, int proplen); /** * fdt_node_offset_by_phandle - find the node with a given phandle * @fdt: pointer to the device tree blob * @phandle: phandle value * * fdt_node_offset_by_phandle() returns the offset of the node * which has the given phandle value. If there is more than one node * in the tree with the given phandle (an invalid tree), results are * undefined. * * returns: * structure block offset of the located node (>= 0), on success * -FDT_ERR_NOTFOUND, no node with that phandle exists * -FDT_ERR_BADPHANDLE, given phandle value was invalid (0 or -1) * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, standard meanings */ int fdt_node_offset_by_phandle(const void *fdt, uint32_t phandle); /** * fdt_node_check_compatible: check a node's compatible property * @fdt: pointer to the device tree blob * @nodeoffset: offset of a tree node * @compatible: string to match against * * * fdt_node_check_compatible() returns 0 if the given node contains a * 'compatible' property with the given string as one of its elements, * it returns non-zero otherwise, or on error. 
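 *
 * Illustrative use (the compatible string is only an example):
 *
 *     if (fdt_node_check_compatible(fdt, nodeoffset, "ns16550a") == 0)
 *         ... the node lists "ns16550a" in its 'compatible' property ...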
* * returns: * 0, if the node has a 'compatible' property listing the given string * 1, if the node has a 'compatible' property, but it does not list * the given string * -FDT_ERR_NOTFOUND, if the given node has no 'compatible' property * -FDT_ERR_BADOFFSET, if nodeoffset does not refer to a BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, standard meanings */ int fdt_node_check_compatible(const void *fdt, int nodeoffset, const char *compatible); /** * fdt_node_offset_by_compatible - find nodes with a given 'compatible' value * @fdt: pointer to the device tree blob * @startoffset: only find nodes after this offset * @compatible: 'compatible' string to match against * * fdt_node_offset_by_compatible() returns the offset of the first * node after startoffset, which has a 'compatible' property which * lists the given compatible string; or if startoffset is -1, the * very first such node in the tree. * * To iterate through all nodes matching the criterion, the following * idiom can be used: * offset = fdt_node_offset_by_compatible(fdt, -1, compatible); * while (offset != -FDT_ERR_NOTFOUND) { * // other code here * offset = fdt_node_offset_by_compatible(fdt, offset, compatible); * } * * Note the -1 in the first call to the function, if 0 is used here * instead, the function will never locate the root node, even if it * matches the criterion. * * returns: * structure block offset of the located node (>= 0, >startoffset), * on success * -FDT_ERR_NOTFOUND, no node matching the criterion exists in the * tree after startoffset * -FDT_ERR_BADOFFSET, nodeoffset does not refer to a BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, standard meanings */ int fdt_node_offset_by_compatible(const void *fdt, int startoffset, const char *compatible); /**********************************************************************/ /* Write-in-place functions */ /**********************************************************************/ /** * fdt_setprop_inplace - change a property's value, but not its size * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose property to change * @name: name of the property to change * @val: pointer to data to replace the property value with * @len: length of the property value * * fdt_setprop_inplace() replaces the value of a given property with * the data in val, of length len. This function cannot change the * size of a property, and so will only work if len is equal to the * current length of the property. * * This function will alter only the bytes in the blob which contain * the given property value, and will not alter or move any other part * of the tree. 
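 *
 * Illustrative use (property name and value are made up): overwriting an
 * existing 4-byte value with another 4-byte value, so the length stays
 * the same (the fdt_setprop_inplace_cell() wrapper below does exactly
 * this for single cells):
 *
 *     uint32_t v = cpu_to_fdt32(50000000);
 *     err = fdt_setprop_inplace(fdt, nodeoffset, "clock-frequency",
 *                               &v, sizeof(v));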
* * returns: * 0, on success * -FDT_ERR_NOSPACE, if len is not equal to the property's current length * -FDT_ERR_NOTFOUND, node does not have the named property * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings */ int fdt_setprop_inplace(void *fdt, int nodeoffset, const char *name, const void *val, int len); /** * fdt_setprop_inplace_cell - change the value of a single-cell property * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose property to change * @name: name of the property to change * @val: cell (32-bit integer) value to replace the property with * * fdt_setprop_inplace_cell() replaces the value of a given property * with the 32-bit integer cell value in val, converting val to * big-endian if necessary. This function cannot change the size of a * property, and so will only work if the property already exists and * has length 4. * * This function will alter only the bytes in the blob which contain * the given property value, and will not alter or move any other part * of the tree. * * returns: * 0, on success * -FDT_ERR_NOSPACE, if the property's length is not equal to 4 * -FDT_ERR_NOTFOUND, node does not have the named property * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings */ static inline int fdt_setprop_inplace_cell(void *fdt, int nodeoffset, const char *name, uint32_t val) { val = cpu_to_fdt32(val); return fdt_setprop_inplace(fdt, nodeoffset, name, &val, sizeof(val)); } /** * fdt_nop_property - replace a property with nop tags * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose property to nop * @name: name of the property to nop * * fdt_nop_property() will replace a given property's representation * in the blob with FDT_NOP tags, effectively removing it from the * tree. * * This function will alter only the bytes in the blob which contain * the property, and will not alter or move any other part of the * tree. * * returns: * 0, on success * -FDT_ERR_NOTFOUND, node does not have the named property * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings */ int fdt_nop_property(void *fdt, int nodeoffset, const char *name); /** * fdt_nop_node - replace a node (subtree) with nop tags * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node to nop * * fdt_nop_node() will replace a given node's representation in the * blob, including all its subnodes, if any, with FDT_NOP tags, * effectively removing it from the tree. * * This function will alter only the bytes in the blob which contain * the node and its properties and subnodes, and will not alter or * move any other part of the tree. 
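 *
 * Usage sketch (the "/chosen" path is merely an example; fdt_path_offset()
 * is assumed to be the usual libfdt path lookup declared elsewhere in this
 * header):
 *
 *	int node = fdt_path_offset(fdt, "/chosen");
 *	if (node >= 0)
 *		err = fdt_nop_node(fdt, node);	// blanks the node and its subtree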
* * returns: * 0, on success * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings */ int fdt_nop_node(void *fdt, int nodeoffset); /**********************************************************************/ /* Sequential write functions */ /**********************************************************************/ int fdt_create(void *buf, int bufsize); int fdt_add_reservemap_entry(void *fdt, uint64_t addr, uint64_t size); int fdt_finish_reservemap(void *fdt); int fdt_begin_node(void *fdt, const char *name); int fdt_property(void *fdt, const char *name, const void *val, int len); static inline int fdt_property_cell(void *fdt, const char *name, uint32_t val) { val = cpu_to_fdt32(val); return fdt_property(fdt, name, &val, sizeof(val)); } #define fdt_property_string(fdt, name, str) \ fdt_property(fdt, name, str, strlen(str)+1) int fdt_end_node(void *fdt); int fdt_finish(void *fdt); /**********************************************************************/ /* Read-write functions */ /**********************************************************************/ int fdt_open_into(const void *fdt, void *buf, int bufsize); int fdt_pack(void *fdt); /** * fdt_add_mem_rsv - add one memory reserve map entry * @fdt: pointer to the device tree blob * @address, @size: 64-bit values (native endian) * * Adds a reserve map entry to the given blob reserving a region at * address address of length size. * * This function will insert data into the reserve map and will * therefore change the indexes of some entries in the table. * * returns: * 0, on success * -FDT_ERR_NOSPACE, there is insufficient free space in the blob to * contain the new reservation entry * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_BADLAYOUT, * -FDT_ERR_TRUNCATED, standard meanings */ int fdt_add_mem_rsv(void *fdt, uint64_t address, uint64_t size); /** * fdt_del_mem_rsv - remove a memory reserve map entry * @fdt: pointer to the device tree blob * @n: entry to remove * * fdt_del_mem_rsv() removes the n-th memory reserve map entry from * the blob. * * This function will delete data from the reservation table and will * therefore change the indexes of some entries in the table. * * returns: * 0, on success * -FDT_ERR_NOTFOUND, there is no entry of the given index (i.e. there * are less than n+1 reserve map entries) * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_BADLAYOUT, * -FDT_ERR_TRUNCATED, standard meanings */ int fdt_del_mem_rsv(void *fdt, int n); /** * fdt_set_name - change the name of a given node * @fdt: pointer to the device tree blob * @nodeoffset: structure block offset of a node * @name: name to give the node * * fdt_set_name() replaces the name (including unit address, if any) * of the given node with the given string. NOTE: this function can't * efficiently check if the new name is unique amongst the given * node's siblings; results are undefined if this function is invoked * with a name equal to one of the given node's siblings. * * This function may insert or delete data from the blob, and will * therefore change the offsets of some existing nodes. 
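 *
 * Usage sketch (the new name is an invented example):
 *
 *	err = fdt_set_name(fdt, nodeoffset, "serial@fe001000");
 *	// on success, offsets previously computed for later nodes may be stale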
* * returns: * 0, on success * -FDT_ERR_NOSPACE, there is insufficient free space in the blob * to contain the new name * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, standard meanings */ int fdt_set_name(void *fdt, int nodeoffset, const char *name); /** * fdt_setprop - create or change a property * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose property to change * @name: name of the property to change * @val: pointer to data to set the property value to * @len: length of the property value * * fdt_setprop() sets the value of the named property in the given * node to the given value and length, creating the property if it * does not already exist. * * This function may insert or delete data from the blob, and will * therefore change the offsets of some existing nodes. * * returns: * 0, on success * -FDT_ERR_NOSPACE, there is insufficient free space in the blob to * contain the new property value * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADLAYOUT, * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_BADLAYOUT, * -FDT_ERR_TRUNCATED, standard meanings */ int fdt_setprop(void *fdt, int nodeoffset, const char *name, const void *val, int len); /** * fdt_setprop_cell - set a property to a single cell value * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose property to change * @name: name of the property to change * @val: 32-bit integer value for the property (native endian) * * fdt_setprop_cell() sets the value of the named property in the * given node to the given cell value (converting to big-endian if * necessary), or creates a new property with that value if it does * not already exist. * * This function may insert or delete data from the blob, and will * therefore change the offsets of some existing nodes. * * returns: * 0, on success * -FDT_ERR_NOSPACE, there is insufficient free space in the blob to * contain the new property value * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADLAYOUT, * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_BADLAYOUT, * -FDT_ERR_TRUNCATED, standard meanings */ static inline int fdt_setprop_cell(void *fdt, int nodeoffset, const char *name, uint32_t val) { val = cpu_to_fdt32(val); return fdt_setprop(fdt, nodeoffset, name, &val, sizeof(val)); } /** * fdt_setprop_string - set a property to a string value * @fdt: pointer to the device tree blob * @nodeoffset: offset of the node whose property to change * @name: name of the property to change * @str: string value for the property * * fdt_setprop_string() sets the value of the named property in the * given node to the given string value (using the length of the * string to determine the new length of the property), or creates a * new property with that value if it does not already exist. * * This function may insert or delete data from the blob, and will * therefore change the offsets of some existing nodes. 
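 *
 * Usage sketch (the property name and value are illustrative assumptions):
 *
 *	err = fdt_setprop_string(fdt, nodeoffset, "status", "disabled");
 *	if (err == -FDT_ERR_NOSPACE) {
 *		// caller may fdt_open_into() a larger buffer and retry
 *	}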
 *
 * returns:
 *	0, on success
 *	-FDT_ERR_NOSPACE, there is insufficient free space in the blob to
 *		contain the new property value
 *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
 *	-FDT_ERR_BADLAYOUT,
 *	-FDT_ERR_BADMAGIC,
 *	-FDT_ERR_BADVERSION,
 *	-FDT_ERR_BADSTATE,
 *	-FDT_ERR_BADSTRUCTURE,
 *	-FDT_ERR_TRUNCATED, standard meanings
 */
#define fdt_setprop_string(fdt, nodeoffset, name, str) \
	fdt_setprop((fdt), (nodeoffset), (name), (str), strlen(str)+1)

/**
 * fdt_delprop - delete a property
 * @fdt: pointer to the device tree blob
 * @nodeoffset: offset of the node whose property to delete
 * @name: name of the property to delete
 *
 * fdt_delprop() will delete the given property.
 *
 * This function will delete data from the blob, and will therefore
 * change the offsets of some existing nodes.
 *
 * returns:
 *	0, on success
 *	-FDT_ERR_NOTFOUND, node does not have the named property
 *	-FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag
 *	-FDT_ERR_BADLAYOUT,
 *	-FDT_ERR_BADMAGIC,
 *	-FDT_ERR_BADVERSION,
 *	-FDT_ERR_BADSTATE,
 *	-FDT_ERR_BADSTRUCTURE,
 *	-FDT_ERR_TRUNCATED, standard meanings
 */
int fdt_delprop(void *fdt, int nodeoffset, const char *name);

/**
 * fdt_add_subnode_namelen - creates a new node based on substring
 * @fdt: pointer to the device tree blob
 * @parentoffset: structure block offset of a node
 * @name: name of the subnode to create
 * @namelen: number of characters of name to consider
 *
 * Identical to fdt_add_subnode(), but uses only the first namelen
 * characters of name as the name of the new node.  This is useful for
 * creating subnodes based on a portion of a larger string, such as a
 * full path.
 */
int fdt_add_subnode_namelen(void *fdt, int parentoffset,
			    const char *name, int namelen);

/**
 * fdt_add_subnode - creates a new node
 * @fdt: pointer to the device tree blob
 * @parentoffset: structure block offset of a node
 * @name: name of the subnode to create
 *
 * fdt_add_subnode() creates a new node as a subnode of the node at
 * structure block offset parentoffset, with the given name (which
 * should include the unit address, if any).
 *
 * This function will insert data into the blob, and will therefore
 * change the offsets of some existing nodes.
 *
 * returns:
 *	structure block offset of the created subnode (>=0), on success
 *	-FDT_ERR_NOTFOUND, if the requested subnode does not exist
 *	-FDT_ERR_BADOFFSET, if parentoffset did not point to an FDT_BEGIN_NODE tag
 *	-FDT_ERR_EXISTS, if the node at parentoffset already has a subnode of
 *		the given name
 *	-FDT_ERR_NOSPACE, if there is insufficient free space in the
 *		blob to contain the new node
 *	-FDT_ERR_BADLAYOUT,
 *	-FDT_ERR_BADMAGIC,
 *	-FDT_ERR_BADVERSION,
 *	-FDT_ERR_BADSTATE,
 *	-FDT_ERR_BADSTRUCTURE,
 *	-FDT_ERR_TRUNCATED, standard meanings.
 */
int fdt_add_subnode(void *fdt, int parentoffset, const char *name);

/**
 * fdt_del_node - delete a node (subtree)
 * @fdt: pointer to the device tree blob
 * @nodeoffset: offset of the node to delete
 *
 * fdt_del_node() will remove the given node, including all its
 * subnodes if any, from the blob.
 *
 * This function will delete data from the blob, and will therefore
 * change the offsets of some existing nodes.
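 *
 * Usage sketch (the path is an invented example):
 *
 *	int node = fdt_path_offset(fdt, "/soc/timer@1000");
 *	if (node >= 0)
 *		err = fdt_del_node(fdt, node);
 *	// unlike fdt_nop_node(), the node's data is removed rather than
 *	// overwritten with FDT_NOP tags, so the structure block shrinks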
* * returns: * 0, on success * -FDT_ERR_BADOFFSET, nodeoffset did not point to FDT_BEGIN_NODE tag * -FDT_ERR_BADLAYOUT, * -FDT_ERR_BADMAGIC, * -FDT_ERR_BADVERSION, * -FDT_ERR_BADSTATE, * -FDT_ERR_BADSTRUCTURE, * -FDT_ERR_TRUNCATED, standard meanings */ int fdt_del_node(void *fdt, int nodeoffset); /**********************************************************************/ /* Debugging / informational functions */ /**********************************************************************/ const char *fdt_strerror(int errval); #endif /* _LIBFDT_H */ xen-4.4.0/xen/include/xen/libfdt/fdt.h0000664000175000017500000000302512307313555015644 0ustar smbsmb#ifndef _FDT_H #define _FDT_H #ifndef __ASSEMBLY__ struct fdt_header { uint32_t magic; /* magic word FDT_MAGIC */ uint32_t totalsize; /* total size of DT block */ uint32_t off_dt_struct; /* offset to structure */ uint32_t off_dt_strings; /* offset to strings */ uint32_t off_mem_rsvmap; /* offset to memory reserve map */ uint32_t version; /* format version */ uint32_t last_comp_version; /* last compatible version */ /* version 2 fields below */ uint32_t boot_cpuid_phys; /* Which physical CPU id we're booting on */ /* version 3 fields below */ uint32_t size_dt_strings; /* size of the strings block */ /* version 17 fields below */ uint32_t size_dt_struct; /* size of the structure block */ }; struct fdt_reserve_entry { uint64_t address; uint64_t size; }; struct fdt_node_header { uint32_t tag; char name[0]; }; struct fdt_property { uint32_t tag; uint32_t len; uint32_t nameoff; char data[0]; }; #endif /* !__ASSEMBLY */ #define FDT_MAGIC 0xd00dfeed /* 4: version, 4: total size */ #define FDT_TAGSIZE sizeof(uint32_t) #define FDT_BEGIN_NODE 0x1 /* Start node: full name */ #define FDT_END_NODE 0x2 /* End node */ #define FDT_PROP 0x3 /* Property: name off, size, content */ #define FDT_NOP 0x4 /* nop */ #define FDT_END 0x9 #define FDT_V1_SIZE (7*sizeof(uint32_t)) #define FDT_V2_SIZE (FDT_V1_SIZE + sizeof(uint32_t)) #define FDT_V3_SIZE (FDT_V2_SIZE + sizeof(uint32_t)) #define FDT_V16_SIZE FDT_V3_SIZE #define FDT_V17_SIZE (FDT_V16_SIZE + sizeof(uint32_t)) #endif /* _FDT_H */ xen-4.4.0/xen/include/xen/libfdt/libfdt_env.h0000664000175000017500000000075412307313555017211 0ustar smbsmb#ifndef _LIBFDT_ENV_H #define _LIBFDT_ENV_H #include #include #include #include #define fdt16_to_cpu(x) be16_to_cpu(x) #define cpu_to_fdt16(x) cpu_to_be16(x) #define fdt32_to_cpu(x) be32_to_cpu(x) #define cpu_to_fdt32(x) cpu_to_be32(x) #define fdt64_to_cpu(x) be64_to_cpu(x) #define cpu_to_fdt64(x) cpu_to_be64(x) /* Xen-specific libfdt error code. */ #define FDT_ERR_XEN(err) (FDT_ERR_MAX + (err)) #endif /* _LIBFDT_ENV_H */ xen-4.4.0/xen/include/xen/cpu.h0000664000175000017500000000462612307313555014422 0ustar smbsmb#ifndef __XEN_CPU_H__ #define __XEN_CPU_H__ #include #include #include /* Safely access cpu_online_map, cpu_present_map, etc. */ bool_t get_cpu_maps(void); void put_cpu_maps(void); /* Safely perform CPU hotplug and update cpu_online_map, etc. */ bool_t cpu_hotplug_begin(void); void cpu_hotplug_done(void); /* Receive notification of CPU hotplug events. */ void register_cpu_notifier(struct notifier_block *nb); /* * Possible event sequences for a given CPU: * CPU_UP_PREPARE -> CPU_UP_CANCELLED -- failed CPU up * CPU_UP_PREPARE -> CPU_STARTING -> CPU_ONLINE -- successful CPU up * CPU_DOWN_PREPARE -> CPU_DOWN_FAILED -- failed CPU down * CPU_DOWN_PREPARE -> CPU_DYING -> CPU_DEAD -- successful CPU down * * Hence note that only CPU_*_PREPARE handlers are allowed to fail. 
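 *
 * For illustration, a notifier whose only failure point is CPU_UP_PREPARE
 * might be shaped like the sketch below; my_cpu_callback and my_prepare()
 * are hypothetical names, and the NOTIFY_* return values are assumed to be
 * the Linux-style ones from xen/notifier.h:
 *
 *	static int my_cpu_callback(struct notifier_block *nfb,
 *	                           unsigned long action, void *hcpu)
 *	{
 *		unsigned int cpu = (unsigned long)hcpu;
 *
 *		if ( action == CPU_UP_PREPARE && my_prepare(cpu) != 0 )
 *			return NOTIFY_BAD;	// only *_PREPARE may fail
 *		return NOTIFY_DONE;
 *	}
 *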
Also note * that once CPU_DYING is delivered, an offline action can no longer fail. * * Notifiers are called highest-priority-first when: * (a) A CPU is coming up; or (b) CPU_DOWN_FAILED * Notifiers are called lowest-priority-first when: * (a) A CPU is going down; or (b) CPU_UP_CANCELED */ /* CPU_UP_PREPARE: Preparing to bring CPU online. */ #define CPU_UP_PREPARE (0x0001 | NOTIFY_FORWARD) /* CPU_UP_CANCELED: CPU is no longer being brought online. */ #define CPU_UP_CANCELED (0x0002 | NOTIFY_REVERSE) /* CPU_STARTING: CPU nearly online. Runs on new CPU, irqs still disabled. */ #define CPU_STARTING (0x0003 | NOTIFY_FORWARD) /* CPU_ONLINE: CPU is up. */ #define CPU_ONLINE (0x0004 | NOTIFY_FORWARD) /* CPU_DOWN_PREPARE: CPU is going down. */ #define CPU_DOWN_PREPARE (0x0005 | NOTIFY_REVERSE) /* CPU_DOWN_FAILED: CPU is no longer going down. */ #define CPU_DOWN_FAILED (0x0006 | NOTIFY_FORWARD) /* CPU_DYING: CPU is nearly dead (in stop_machine context). */ #define CPU_DYING (0x0007 | NOTIFY_REVERSE) /* CPU_DEAD: CPU is dead. */ #define CPU_DEAD (0x0008 | NOTIFY_REVERSE) /* Perform CPU hotplug. May return -EAGAIN. */ int cpu_down(unsigned int cpu); int cpu_up(unsigned int cpu); /* From arch code, send CPU_STARTING notification. */ void notify_cpu_starting(unsigned int cpu); /* Power management. */ int disable_nonboot_cpus(void); void enable_nonboot_cpus(void); /* Private arch-dependent helpers for CPU hotplug. */ int __cpu_up(unsigned int cpunum); void __cpu_disable(void); void __cpu_die(unsigned int cpu); #endif /* __XEN_CPU_H__ */ xen-4.4.0/xen/include/xen/domain.h0000664000175000017500000000524412307313555015077 0ustar smbsmb #ifndef __XEN_DOMAIN_H__ #define __XEN_DOMAIN_H__ #include #include typedef union { struct vcpu_guest_context *nat; struct compat_vcpu_guest_context *cmp; } vcpu_guest_context_u __attribute__((__transparent_union__)); struct vcpu *alloc_vcpu( struct domain *d, unsigned int vcpu_id, unsigned int cpu_id); struct vcpu *alloc_dom0_vcpu0(void); int vcpu_reset(struct vcpu *); struct xen_domctl_getdomaininfo; void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info); /* * Arch-specifics. */ /* Allocate/free a domain structure. */ struct domain *alloc_domain_struct(void); void free_domain_struct(struct domain *d); /* Allocate/free a VCPU structure. */ struct vcpu *alloc_vcpu_struct(void); void free_vcpu_struct(struct vcpu *v); /* Allocate/free a vcpu_guest_context structure. */ #ifndef alloc_vcpu_guest_context struct vcpu_guest_context *alloc_vcpu_guest_context(void); void free_vcpu_guest_context(struct vcpu_guest_context *); #endif /* Allocate/free a PIRQ structure. */ #ifndef alloc_pirq_struct struct pirq *alloc_pirq_struct(struct domain *); #endif void free_pirq_struct(void *); /* * Initialise/destroy arch-specific details of a VCPU. * - vcpu_initialise() is called after the basic generic fields of the * VCPU structure are initialised. Many operations can be applied to the * VCPU at this point (e.g., vcpu_pause()). * - vcpu_destroy() is called only if vcpu_initialise() previously succeeded. 
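 *
 * An illustrative (hypothetical) caller honouring this contract:
 *
 *	if ( (rc = vcpu_initialise(v)) != 0 )
 *	{
 *		// do NOT call vcpu_destroy() here; only undo the generic setup
 *		free_vcpu_struct(v);
 *		return rc;
 *	}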
*/ int vcpu_initialise(struct vcpu *v); void vcpu_destroy(struct vcpu *v); int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset); void unmap_vcpu_info(struct vcpu *v); int arch_domain_create(struct domain *d, unsigned int domcr_flags); void arch_domain_destroy(struct domain *d); int arch_set_info_guest(struct vcpu *, vcpu_guest_context_u); void arch_get_info_guest(struct vcpu *, vcpu_guest_context_u); int domain_relinquish_resources(struct domain *d); void dump_pageframe_info(struct domain *d); void arch_dump_vcpu_info(struct vcpu *v); void arch_dump_domain_info(struct domain *d); int arch_vcpu_reset(struct vcpu *); extern spinlock_t vcpu_alloc_lock; bool_t domctl_lock_acquire(void); void domctl_lock_release(void); /* * Continue the current hypercall via func(data) on specified cpu. * If this function returns 0 then the function is guaranteed to run at some * point in the future. If this function returns an error code then the * function has not been and will not be executed. */ int continue_hypercall_on_cpu( unsigned int cpu, long (*func)(void *data), void *data); extern unsigned int xen_processor_pmbits; extern bool_t opt_dom0_vcpus_pin; #endif /* __XEN_DOMAIN_H__ */ xen-4.4.0/xen/include/xen/prefetch.h0000664000175000017500000000274212307313555015430 0ustar smbsmb/* * Generic cache management functions. Everything is arch-specific, * but this header exists to make sure the defines/functions can be * used in a generic way. * * 2000-11-13 Arjan van de Ven * */ #ifndef _LINUX_PREFETCH_H #define _LINUX_PREFETCH_H #include #include /* prefetch(x) attempts to pre-emptively get the memory pointed to by address "x" into the CPU L1 cache. prefetch(x) should not cause any kind of exception, prefetch(0) is specifically ok. prefetch() should be defined by the architecture, if not, the #define below provides a no-op define. There are 3 prefetch() macros: prefetch(x) - prefetches the cacheline at "x" for read prefetchw(x) - prefetches the cacheline at "x" for write spin_lock_prefetch(x) - prefectches the spinlock *x for taking there is also PREFETCH_STRIDE which is the architecure-prefered "lookahead" size for prefetching streamed operations. */ /* * These cannot be do{}while(0) macros. See the mental gymnastics in * the loop macro. */ #ifndef ARCH_HAS_PREFETCH #define ARCH_HAS_PREFETCH static inline void prefetch(const void *x) {;} #endif #ifndef ARCH_HAS_PREFETCHW #define ARCH_HAS_PREFETCHW static inline void prefetchw(const void *x) {;} #endif #ifndef ARCH_HAS_SPINLOCK_PREFETCH #define ARCH_HAS_SPINLOCK_PREFETCH #define spin_lock_prefetch(x) prefetchw(x) #endif #ifndef PREFETCH_STRIDE #define PREFETCH_STRIDE (4*L1_CACHE_BYTES) #endif #endif xen-4.4.0/xen/include/xen/iocap.h0000664000175000017500000000361712307313555014725 0ustar smbsmb/****************************************************************************** * iocap.h * * Per-domain I/O capabilities. 
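 *
 * Illustrative use (the variables are assumptions, not from real callers):
 * granting a domain access to a single MMIO frame, using the helpers
 * defined below:
 *
 *	if ( !iomem_access_permitted(d, mfn, mfn) )
 *		rc = iomem_permit_access(d, mfn, mfn);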
*/ #ifndef __XEN_IOCAP_H__ #define __XEN_IOCAP_H__ #include #include #define iomem_permit_access(d, s, e) \ rangeset_add_range((d)->iomem_caps, s, e) #define iomem_deny_access(d, s, e) \ rangeset_remove_range((d)->iomem_caps, s, e) #define iomem_access_permitted(d, s, e) \ rangeset_contains_range((d)->iomem_caps, s, e) #define irq_permit_access(d, i) \ rangeset_add_singleton((d)->irq_caps, i) #define irq_deny_access(d, i) \ rangeset_remove_singleton((d)->irq_caps, i) #define irqs_permit_access(d, s, e) \ rangeset_add_range((d)->irq_caps, s, e) #define irqs_deny_access(d, s, e) \ rangeset_remove_range((d)->irq_caps, s, e) #define irq_access_permitted(d, i) \ rangeset_contains_singleton((d)->irq_caps, i) #define pirq_permit_access(d, i) ({ \ struct domain *d__ = (d); \ int i__ = domain_pirq_to_irq(d__, i); \ i__ > 0 ? rangeset_add_singleton(d__->irq_caps, i__)\ : -EINVAL; \ }) #define pirq_deny_access(d, i) ({ \ struct domain *d__ = (d); \ int i__ = domain_pirq_to_irq(d__, i); \ i__ > 0 ? rangeset_remove_singleton(d__->irq_caps, i__)\ : -EINVAL; \ }) #define pirq_access_permitted(d, i) ({ \ struct domain *d__ = (d); \ rangeset_contains_singleton(d__->irq_caps, \ domain_pirq_to_irq(d__, i));\ }) #endif /* __XEN_IOCAP_H__ */ xen-4.4.0/xen/include/xen/dmi.h0000664000175000017500000000146312307313555014400 0ustar smbsmb#ifndef __DMI_H__ #define __DMI_H__ enum dmi_field { DMI_NONE, DMI_BIOS_VENDOR, DMI_BIOS_VERSION, DMI_BIOS_DATE, DMI_SYS_VENDOR, DMI_PRODUCT_NAME, DMI_PRODUCT_VERSION, DMI_BOARD_VENDOR, DMI_BOARD_NAME, DMI_BOARD_VERSION, DMI_STRING_MAX, }; /* * DMI callbacks for problem boards */ struct dmi_strmatch { u8 slot; char *substr; }; struct dmi_system_id { int (*callback)(struct dmi_system_id *); char *ident; struct dmi_strmatch matches[4]; void *driver_data; }; #define DMI_MATCH(a,b) { a, b } extern int dmi_check_system(struct dmi_system_id *list); extern void dmi_scan_machine(void); extern int dmi_get_table(u32 *base, u32 *len); extern void dmi_efi_get_table(void *); bool_t dmi_get_date(int field, int *yearp, int *monthp, int *dayp); extern void dmi_end_boot(void); #endif /* __DMI_H__ */ xen-4.4.0/xen/include/xen/cpuidle.h0000664000175000017500000000552512307313555015257 0ustar smbsmb/* * cpuidle.h - xen idle state module derived from Linux * * (C) 2007 Venkatesh Pallipadi * Shaohua Li * Adam Belay * Copyright (C) 2008 Intel Corporation * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
* * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #ifndef _XEN_CPUIDLE_H #define _XEN_CPUIDLE_H #include #include #define ACPI_PROCESSOR_MAX_POWER 8 #define CPUIDLE_NAME_LEN 16 #define ACPI_CSTATE_EM_NONE 0 #define ACPI_CSTATE_EM_SYSIO 1 #define ACPI_CSTATE_EM_FFH 2 #define ACPI_CSTATE_EM_HALT 3 struct acpi_processor_cx { u8 idx; u8 type; /* ACPI_STATE_Cn */ u8 entry_method; /* ACPI_CSTATE_EM_xxx */ u32 address; u32 latency; u32 target_residency; u32 usage; u64 time; }; struct acpi_processor_flags { u8 bm_control:1; u8 bm_check:1; u8 has_cst:1; u8 power_setup_done:1; u8 bm_rld_set:1; }; struct acpi_processor_power { unsigned int cpu; struct acpi_processor_flags flags; struct acpi_processor_cx *last_state; struct acpi_processor_cx *safe_state; void *gdata; /* governor specific data */ u32 last_residency; u32 count; spinlock_t stat_lock; struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER]; }; struct cpuidle_governor { char name[CPUIDLE_NAME_LEN]; unsigned int rating; int (*enable) (struct acpi_processor_power *dev); void (*disable) (struct acpi_processor_power *dev); int (*select) (struct acpi_processor_power *dev); void (*reflect) (struct acpi_processor_power *dev); }; extern s8 xen_cpuidle; extern struct cpuidle_governor *cpuidle_current_governor; bool_t cpuidle_using_deep_cstate(void); void cpuidle_disable_deep_cstate(void); extern void cpuidle_wakeup_mwait(cpumask_t *mask); #define CPUIDLE_DRIVER_STATE_START 1 extern void menu_get_trace_data(u32 *expected, u32 *pred); #endif /* _XEN_CPUIDLE_H */ xen-4.4.0/xen/include/xen/stdarg.h0000664000175000017500000000042612307313555015111 0ustar smbsmb#ifndef __XEN_STDARG_H__ #define __XEN_STDARG_H__ typedef __builtin_va_list va_list; #define va_start(ap, last) __builtin_va_start((ap), (last)) #define va_end(ap) __builtin_va_end(ap) #define va_arg __builtin_va_arg #endif /* __XEN_STDARG_H__ */ xen-4.4.0/xen/include/xen/irq.h0000664000175000017500000001125212307313555014417 0ustar smbsmb#ifndef __XEN_IRQ_H__ #define __XEN_IRQ_H__ #include #include #include #include #include #include #include struct irqaction { void (*handler)(int, void *, struct cpu_user_regs *); const char *name; void *dev_id; bool_t free_on_release; }; /* * IRQ line status. */ #define IRQ_INPROGRESS (1u<<0) /* IRQ handler active - do not enter! */ #define IRQ_DISABLED (1u<<1) /* IRQ disabled - do not enter! */ #define IRQ_PENDING (1u<<2) /* IRQ pending - replay on enable */ #define IRQ_REPLAY (1u<<3) /* IRQ has been replayed but not acked yet */ #define IRQ_GUEST (1u<<4) /* IRQ is handled by guest OS(es) */ #define IRQ_MOVE_PENDING (1u<<5) /* IRQ is migrating to another CPUs */ #define IRQ_PER_CPU (1u<<6) /* IRQ is per CPU */ #define IRQ_GUEST_EOI_PENDING (1u<<7) /* IRQ was disabled, pending a guest EOI */ /* Special IRQ numbers. */ #define AUTO_ASSIGN_IRQ (-1) #define NEVER_ASSIGN_IRQ (-2) #define FREE_TO_ASSIGN_IRQ (-3) struct irq_desc; /* * Interrupt controller descriptor. This is all we need * to describe about the low-level hardware. 
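 *
 * A hypothetical controller could be described roughly as below (the
 * my_pic_* handlers are invented names; the .end hook is omitted for
 * brevity):
 *
 *	static hw_irq_controller my_pic = {
 *		.typename     = "my-pic",
 *		.startup      = my_pic_startup,
 *		.shutdown     = my_pic_shutdown,
 *		.enable       = my_pic_enable,
 *		.disable      = my_pic_disable,
 *		.ack          = my_pic_ack,
 *		.set_affinity = my_pic_set_affinity,
 *	};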
*/ struct hw_interrupt_type { const char *typename; unsigned int (*startup)(struct irq_desc *); void (*shutdown)(struct irq_desc *); void (*enable)(struct irq_desc *); void (*disable)(struct irq_desc *); void (*ack)(struct irq_desc *); #ifdef CONFIG_X86 void (*end)(struct irq_desc *, u8 vector); #else void (*end)(struct irq_desc *); #endif void (*set_affinity)(struct irq_desc *, const cpumask_t *); }; typedef const struct hw_interrupt_type hw_irq_controller; #include struct msi_desc; /* * This is the "IRQ descriptor", which contains various information * about the irq, including what kind of hardware handling it has, * whether it is disabled etc etc. */ typedef struct irq_desc { unsigned int status; /* IRQ status */ hw_irq_controller *handler; struct msi_desc *msi_desc; struct irqaction *action; /* IRQ action list */ int irq; spinlock_t lock; struct arch_irq_desc arch; cpumask_var_t affinity; /* irq ratelimit */ s_time_t rl_quantum_start; unsigned int rl_cnt; struct list_head rl_link; } __cacheline_aligned irq_desc_t; #ifndef irq_to_desc #define irq_to_desc(irq) (&irq_desc[irq]) #endif int init_one_irq_desc(struct irq_desc *); int arch_init_one_irq_desc(struct irq_desc *); #define irq_desc_initialized(desc) ((desc)->handler != NULL) extern int setup_irq(unsigned int irq, struct irqaction *); extern void release_irq(unsigned int irq); extern int request_irq(unsigned int irq, void (*handler)(int, void *, struct cpu_user_regs *), const char * devname, void *dev_id); extern hw_irq_controller no_irq_type; extern void no_action(int cpl, void *dev_id, struct cpu_user_regs *regs); extern unsigned int irq_startup_none(struct irq_desc *); extern void irq_actor_none(struct irq_desc *); #define irq_shutdown_none irq_actor_none #define irq_disable_none irq_actor_none #define irq_enable_none irq_actor_none struct domain; struct vcpu; struct pirq { int pirq; u16 evtchn; bool_t masked; struct rcu_head rcu_head; struct arch_pirq arch; }; #define pirq_info(d, p) ((struct pirq *)radix_tree_lookup(&(d)->pirq_tree, p)) /* Use this instead of pirq_info() if the structure may need allocating. */ extern struct pirq *pirq_get_info(struct domain *, int pirq); #define pirq_field(d, p, f, def) ({ \ const struct pirq *__pi = pirq_info(d, p); \ __pi ? __pi->f : def; \ }) #define pirq_to_evtchn(d, pirq) pirq_field(d, pirq, evtchn, 0) #define pirq_masked(d, pirq) pirq_field(d, pirq, masked, 0) void pirq_cleanup_check(struct pirq *, struct domain *); #define pirq_cleanup_check(pirq, d) \ ((pirq)->evtchn ? 
pirq_cleanup_check(pirq, d) : (void)0) extern void pirq_guest_eoi(struct pirq *); extern void desc_guest_eoi(struct irq_desc *, struct pirq *); extern int pirq_guest_unmask(struct domain *d); extern int pirq_guest_bind(struct vcpu *, struct pirq *, int will_share); extern void pirq_guest_unbind(struct domain *d, struct pirq *); extern void pirq_set_affinity(struct domain *d, int irq, const cpumask_t *); extern irq_desc_t *domain_spin_lock_irq_desc( struct domain *d, int irq, unsigned long *pflags); extern irq_desc_t *pirq_spin_lock_irq_desc( const struct pirq *, unsigned long *pflags); static inline void set_native_irq_info(unsigned int irq, const cpumask_t *mask) { cpumask_copy(irq_to_desc(irq)->affinity, mask); } unsigned int set_desc_affinity(struct irq_desc *, const cpumask_t *); #endif /* __XEN_IRQ_H__ */ xen-4.4.0/xen/include/xen/trace.h0000664000175000017500000001355412307313555014731 0ustar smbsmb/****************************************************************************** * include/xen/trace.h * * Xen Trace Buffer * * Copyright (C) 2003 by Intel Research Cambridge * * Author: Mark Williamson, mark.a.williamson@intel.com * Date: January 2004 * * Copyright (C) 2005 Bin Ren * * The trace buffer code is designed to allow debugging traces of Xen to be * generated on UP / SMP machines. Each trace entry is timestamped so that * it's possible to reconstruct a chronological record of trace events. * * Access to the trace buffers is via a dom0 hypervisor op and analysis of * trace buffer contents can then be performed using a userland tool. */ #ifndef __XEN_TRACE_H__ #define __XEN_TRACE_H__ extern int tb_init_done; #include #include #include /* Used to initialise trace buffer functionality */ void init_trace_bufs(void); /* used to retrieve the physical address of the trace buffers */ int tb_control(struct xen_sysctl_tbuf_op *tbc); int trace_will_trace_event(u32 event); void __trace_var(u32 event, bool_t cycles, unsigned int extra, const void *); static inline void trace_var(u32 event, int cycles, int extra, const void *extra_data) { if ( unlikely(tb_init_done) ) __trace_var(event, cycles, extra, extra_data); } void __trace_hypercall(uint32_t event, unsigned long op, const unsigned long *args); /* Convenience macros for calling the trace function. 
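 *
 * Example (TRC_EXAMPLE_EVENT is a placeholder, not a real trace code):
 *
 *	TRACE_2D(TRC_EXAMPLE_EVENT, old_cpu, new_cpu);
 *
 * which records two 32-bit payload words plus a cycle stamp when tracing
 * is enabled, and reduces to a cheap tb_init_done test otherwise.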
*/ #define TRACE_0D(_e) \ do { \ trace_var(_e, 1, 0, NULL); \ } while ( 0 ) #define TRACE_1D(_e,d1) \ do { \ if ( unlikely(tb_init_done) ) \ { \ u32 _d[1]; \ _d[0] = d1; \ __trace_var(_e, 1, sizeof(_d), _d); \ } \ } while ( 0 ) #define TRACE_2D(_e,d1,d2) \ do { \ if ( unlikely(tb_init_done) ) \ { \ u32 _d[2]; \ _d[0] = d1; \ _d[1] = d2; \ __trace_var(_e, 1, sizeof(_d), _d); \ } \ } while ( 0 ) #define TRACE_3D(_e,d1,d2,d3) \ do { \ if ( unlikely(tb_init_done) ) \ { \ u32 _d[3]; \ _d[0] = d1; \ _d[1] = d2; \ _d[2] = d3; \ __trace_var(_e, 1, sizeof(_d), _d); \ } \ } while ( 0 ) #define TRACE_4D(_e,d1,d2,d3,d4) \ do { \ if ( unlikely(tb_init_done) ) \ { \ u32 _d[4]; \ _d[0] = d1; \ _d[1] = d2; \ _d[2] = d3; \ _d[3] = d4; \ __trace_var(_e, 1, sizeof(_d), _d); \ } \ } while ( 0 ) #define TRACE_5D(_e,d1,d2,d3,d4,d5) \ do { \ if ( unlikely(tb_init_done) ) \ { \ u32 _d[5]; \ _d[0] = d1; \ _d[1] = d2; \ _d[2] = d3; \ _d[3] = d4; \ _d[4] = d5; \ __trace_var(_e, 1, sizeof(_d), _d); \ } \ } while ( 0 ) #define TRACE_6D(_e,d1,d2,d3,d4,d5,d6) \ do { \ if ( unlikely(tb_init_done) ) \ { \ u32 _d[6]; \ _d[0] = d1; \ _d[1] = d2; \ _d[2] = d3; \ _d[3] = d4; \ _d[4] = d5; \ _d[5] = d6; \ __trace_var(_e, 1, sizeof(_d), _d); \ } \ } while ( 0 ) #endif /* __XEN_TRACE_H__ */ xen-4.4.0/xen/include/xen/hash.h0000664000175000017500000000315612307313555014553 0ustar smbsmb#ifndef _XEN_HASH_H #define _XEN_HASH_H /* Fast hashing routine for a long. (C) 2002 William Lee Irwin III, IBM */ /* * Knuth recommends primes in approximately golden ratio to the maximum * integer representable by a machine word for multiplicative hashing. * Chuck Lever verified the effectiveness of this technique: * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf * * These primes are chosen to be bit-sparse, that is operations on * them can use shifts and additions instead of multiplications for * machines where multiplications are slow. */ #if BITS_PER_LONG == 32 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ #define GOLDEN_RATIO_PRIME 0x9e370001UL #elif BITS_PER_LONG == 64 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL #else #error Define GOLDEN_RATIO_PRIME for your wordsize. #endif static inline unsigned long hash_long(unsigned long val, unsigned int bits) { unsigned long hash = val; #if BITS_PER_LONG == 64 /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ unsigned long n = hash; n <<= 18; hash -= n; n <<= 33; hash -= n; n <<= 3; hash += n; n <<= 3; hash -= n; n <<= 4; hash += n; n <<= 2; hash += n; #else /* On some cpus multiply is faster, on others gcc will do shifts */ hash *= GOLDEN_RATIO_PRIME; #endif /* High bits are more random, so use them. */ return hash >> (BITS_PER_LONG - bits); } static inline unsigned long hash_ptr(void *ptr, unsigned int bits) { return hash_long((unsigned long)ptr, bits); } #endif /* _XEN_HASH_H */ xen-4.4.0/xen/include/xen/pfn.h0000664000175000017500000000030412307313555014403 0ustar smbsmb#ifndef __XEN_PFN_H__ #define __XEN_PFN_H__ #include #define PFN_DOWN(x) ((x) >> PAGE_SHIFT) #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) #endif /* __XEN_PFN_H__ */ xen-4.4.0/xen/include/xen/config.h0000664000175000017500000000543012307313555015072 0ustar smbsmb/****************************************************************************** * config.h * * A Linux-style configuration list. 
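 *
 * Usage illustration (the message text is made up); the XENLOG_* level
 * prefixes and the dprintk()/gdprintk() helpers are defined below:
 *
 *	printk(XENLOG_WARNING "example: falling back to slow path\n");
 *	gdprintk(XENLOG_DEBUG, "unexpected guest state %u\n", state);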
*/ #ifndef __XEN_CONFIG_H__ #define __XEN_CONFIG_H__ #ifndef __ASSEMBLY__ #include #endif #include #define EXPORT_SYMBOL(var) #define EXPORT_SYMBOL_GPL(var) /* * The following log levels are as follows: * * XENLOG_ERR: Fatal errors, either Xen, Guest or Dom0 * is about to crash. * * XENLOG_WARNING: Something bad happened, but we can recover. * * XENLOG_INFO: Interesting stuff, but not too noisy. * * XENLOG_DEBUG: Use where ever you like. Lots of noise. * * * Since we don't trust the guest operating system, we don't want * it to allow for DoS by causing the HV to print out a lot of * info, so where ever the guest has control of what is printed * we use the XENLOG_GUEST to distinguish that the output is * controlled by the guest. * * To make it easier on the typing, the above log levels all * have a corresponding _G_ equivalent that appends the * XENLOG_GUEST. (see the defines below). * */ #define XENLOG_ERR "<0>" #define XENLOG_WARNING "<1>" #define XENLOG_INFO "<2>" #define XENLOG_DEBUG "<3>" #define XENLOG_GUEST "" #define XENLOG_G_ERR XENLOG_GUEST XENLOG_ERR #define XENLOG_G_WARNING XENLOG_GUEST XENLOG_WARNING #define XENLOG_G_INFO XENLOG_GUEST XENLOG_INFO #define XENLOG_G_DEBUG XENLOG_GUEST XENLOG_DEBUG /* * Some code is copied directly from Linux. * Match some of the Linux log levels to Xen. */ #define KERN_ERR XENLOG_ERR #define KERN_CRIT XENLOG_ERR #define KERN_EMERG XENLOG_ERR #define KERN_WARNING XENLOG_WARNING #define KERN_NOTICE XENLOG_INFO #define KERN_INFO XENLOG_INFO #define KERN_DEBUG XENLOG_DEBUG /* Linux 'checker' project. */ #define __iomem #define __user #define __force #define __bitwise #define MB(_mb) (_AC(_mb, UL) << 20) #define GB(_gb) (_AC(_gb, UL) << 30) #ifndef __ASSEMBLY__ int current_domain_id(void); #define dprintk(_l, _f, _a...) \ printk(_l "%s:%d: " _f, __FILE__ , __LINE__ , ## _a ) #define gdprintk(_l, _f, _a...) \ printk(XENLOG_GUEST _l "%s:%d:d%d " _f, __FILE__, \ __LINE__, current_domain_id() , ## _a ) #endif /* !__ASSEMBLY__ */ #define __STR(...) #__VA_ARGS__ #define STR(...) __STR(__VA_ARGS__) #ifndef __ASSEMBLY__ /* Turn a plain number into a C unsigned long constant. */ #define __mk_unsigned_long(x) x ## UL #define mk_unsigned_long(x) __mk_unsigned_long(x) #else /* __ASSEMBLY__ */ /* In assembly code we cannot use C numeric constant suffixes. */ #define mk_unsigned_long(x) x #endif /* !__ASSEMBLY__ */ #define fastcall #define __cpuinitdata #define __cpuinit #endif /* __XEN_CONFIG_H__ */ xen-4.4.0/xen/include/xen/shutdown.h0000664000175000017500000000050612307313555015477 0ustar smbsmb#ifndef __XEN_SHUTDOWN_H__ #define __XEN_SHUTDOWN_H__ /* opt_noreboot: If true, machine will need manual reset on error. 
*/ extern bool_t opt_noreboot; void dom0_shutdown(u8 reason); void machine_restart(unsigned int delay_millisecs); void machine_halt(void); void machine_power_off(void); #endif /* __XEN_SHUTDOWN_H__ */ xen-4.4.0/xen/include/xen/paging.h0000664000175000017500000000104212307313555015065 0ustar smbsmb #ifndef __XEN_PAGING_H__ #define __XEN_PAGING_H__ #if defined CONFIG_PAGING_ASSISTANCE #include #include #elif defined CONFIG_SHADOW #include #define paging_mode_translate(d) shadow_mode_translate(d) #define paging_mode_external(d) (0) #else #define paging_mode_translate(d) (0) #define paging_mode_external(d) (0) #define guest_physmap_add_page(d, p, m, o) ((void)0) #define guest_physmap_remove_page(d, p, m, o) ((void)0) #endif #endif /* __XEN_PAGING_H__ */ xen-4.4.0/xen/include/xen/symbols.h0000664000175000017500000000053012307313555015311 0ustar smbsmb#ifndef _XEN_SYMBOLS_H #define _XEN_SYMBOLS_H #include #define KSYM_NAME_LEN 127 /* Lookup an address. */ const char *symbols_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char *namebuf); #endif /*_XEN_SYMBOLS_H*/ xen-4.4.0/xen/include/xen/stdbool.h0000664000175000017500000000026112307313555015270 0ustar smbsmb#ifndef __XEN_STDBOOL_H__ #define __XEN_STDBOOL_H__ #define bool _Bool #define true 1 #define false 0 #define __bool_true_false_are_defined 1 #endif /* __XEN_STDBOOL_H__ */ xen-4.4.0/xen/include/xen/perfc.h0000664000175000017500000001120512307313555014721 0ustar smbsmb#ifndef __XEN_PERFC_H__ #define __XEN_PERFC_H__ #ifdef PERF_COUNTERS #include #include #include /* * NOTE: new counters must be defined in perfc_defn.h * * Counter declarations: * PERFCOUNTER (counter, string) define a new performance counter * PERFCOUNTER_ARRAY (counter, string, size) define an array of counters * * Unlike counters, status variables do not reset: * PERFSTATUS (counter, string) define a new performance stauts * PERFSTATUS_ARRAY (counter, string, size) define an array of status vars * * unsigned long perfc_value (counter) get value of a counter * unsigned long perfc_valuea (counter, index) get value of an array counter * unsigned long perfc_set (counter, val) set value of a counter * unsigned long perfc_seta (counter, index, val) set value of an array counter * void perfc_incr (counter) increment a counter * void perfc_decr (counter) decrement a status * void perfc_incra (counter, index) increment an array counter * void perfc_add (counter, value) add a value to a counter * void perfc_adda (counter, index, value) add a value to array counter * void perfc_print (counter) print out the counter */ #define PERFCOUNTER( name, descr ) \ PERFC_##name, #define PERFCOUNTER_ARRAY( name, descr, size ) \ PERFC_##name, \ PERFC_LAST_##name = PERFC_ ## name + (size) - sizeof(char[2 * !!(size) - 1]), #define PERFSTATUS PERFCOUNTER #define PERFSTATUS_ARRAY PERFCOUNTER_ARRAY enum perfcounter { #include NUM_PERFCOUNTERS }; #undef PERFCOUNTER #undef PERFCOUNTER_ARRAY #undef PERFSTATUS #undef PERFSTATUS_ARRAY typedef unsigned perfc_t; #define PRIperfc "" DECLARE_PER_CPU(perfc_t[NUM_PERFCOUNTERS], perfcounters); #define perfc_value(x) this_cpu(perfcounters)[PERFC_ ## x] #define perfc_valuea(x,y) \ ( (y) <= PERFC_LAST_ ## x - PERFC_ ## x ? \ this_cpu(perfcounters)[PERFC_ ## x + (y)] : 0 ) #define perfc_set(x,v) (this_cpu(perfcounters)[PERFC_ ## x] = (v)) #define perfc_seta(x,y,v) \ ( (y) <= PERFC_LAST_ ## x - PERFC_ ## x ? 
\ this_cpu(perfcounters)[PERFC_ ## x + (y)] = (v) : (v) ) #define perfc_incr(x) (++this_cpu(perfcounters)[PERFC_ ## x]) #define perfc_decr(x) (--this_cpu(perfcounters)[PERFC_ ## x]) #define perfc_incra(x,y) \ ( (y) <= PERFC_LAST_ ## x - PERFC_ ## x ? \ ++this_cpu(perfcounters)[PERFC_ ## x + (y)] : 0 ) #define perfc_add(x,v) (this_cpu(perfcounters)[PERFC_ ## x] += (v)) #define perfc_adda(x,y,v) \ ( (y) <= PERFC_LAST_ ## x - PERFC_ ## x ? \ this_cpu(perfcounters)[PERFC_ ## x + (y)] = (v) : (v) ) /* * Histogram: special treatment for 0 and 1 count. After that equally spaced * with last bucket taking the rest. */ #ifdef PERF_ARRAYS #define perfc_incr_histo(x,v) \ do { \ if ( (v) == 0 ) \ perfc_incra(x, 0); \ else if ( (v) == 1 ) \ perfc_incra(x, 1); \ else if ( (((v) - 2) / PERFC_ ## x ## _BUCKET_SIZE) < \ (PERFC_LAST_ ## x - PERFC_ ## x - 2) ) \ perfc_incra(x, (((v) - 2) / PERFC_ ## x ## _BUCKET_SIZE) + 2); \ else \ perfc_incra(x, PERFC_LAST_ ## x - PERFC_ ## x); \ } while ( 0 ) #else #define perfc_incr_histo(x,v) ((void)0) #endif struct xen_sysctl_perfc_op; int perfc_control(struct xen_sysctl_perfc_op *); extern void perfc_printall(unsigned char key); extern void perfc_reset(unsigned char key); #else /* PERF_COUNTERS */ #define perfc_value(x) (0) #define perfc_valuea(x,y) (0) #define perfc_set(x,v) ((void)0) #define perfc_seta(x,y,v) ((void)0) #define perfc_incr(x) ((void)0) #define perfc_decr(x) ((void)0) #define perfc_incra(x,y) ((void)0) #define perfc_decra(x,y) ((void)0) #define perfc_add(x,y) ((void)0) #define perfc_adda(x,y,z) ((void)0) #define perfc_incr_histo(x,y,z) ((void)0) #endif /* PERF_COUNTERS */ #endif /* __XEN_PERFC_H__ */ xen-4.4.0/xen/include/xen/iommu.h0000664000175000017500000001413012307313555014750 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Copyright (C) Allen Kay */ #ifndef _IOMMU_H_ #define _IOMMU_H_ #include #include #include #include #include extern bool_t iommu_enable, iommu_enabled; extern bool_t force_iommu, iommu_verbose; extern bool_t iommu_workaround_bios_bug, iommu_passthrough; extern bool_t iommu_snoop, iommu_qinval, iommu_intremap; extern bool_t iommu_hap_pt_share; extern bool_t iommu_debug; extern bool_t amd_iommu_perdev_intremap; /* Does this domain have a P2M table we can use as its IOMMU pagetable? 
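 *
 * (Illustrative only: a caller exposing one guest frame to a passed-through
 *  device might do
 *	rc = iommu_map_page(d, gfn, mfn, IOMMUF_readable | IOMMUF_writable);
 *  and later tear the mapping down with iommu_unmap_page(d, gfn); both
 *  functions and the IOMMUF_* flag bits are declared below.)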
*/ #define iommu_use_hap_pt(d) (hap_enabled(d) && iommu_hap_pt_share) #define domain_hvm_iommu(d) (&d->arch.hvm_domain.hvm_iommu) #define MAX_IOMMUS 32 #define PAGE_SHIFT_4K (12) #define PAGE_SIZE_4K (1UL << PAGE_SHIFT_4K) #define PAGE_MASK_4K (((u64)-1) << PAGE_SHIFT_4K) #define PAGE_ALIGN_4K(addr) (((addr) + PAGE_SIZE_4K - 1) & PAGE_MASK_4K) int iommu_setup(void); int iommu_supports_eim(void); int iommu_enable_x2apic_IR(void); void iommu_disable_x2apic_IR(void); int iommu_add_device(struct pci_dev *pdev); int iommu_enable_device(struct pci_dev *pdev); int iommu_remove_device(struct pci_dev *pdev); int iommu_domain_init(struct domain *d); void iommu_dom0_init(struct domain *d); void iommu_domain_destroy(struct domain *d); int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn); /* iommu_map_page() takes flags to direct the mapping operation. */ #define _IOMMUF_readable 0 #define IOMMUF_readable (1u<<_IOMMUF_readable) #define _IOMMUF_writable 1 #define IOMMUF_writable (1u<<_IOMMUF_writable) int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int flags); int iommu_unmap_page(struct domain *d, unsigned long gfn); void iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, int order, int present); void iommu_set_pgd(struct domain *d); void iommu_domain_teardown(struct domain *d); void pt_pci_init(void); struct pirq; int hvm_do_IRQ_dpci(struct domain *, struct pirq *); int dpci_ioport_intercept(ioreq_t *p); int pt_irq_create_bind(struct domain *, xen_domctl_bind_pt_irq_t *); int pt_irq_destroy_bind(struct domain *, xen_domctl_bind_pt_irq_t *); void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq); struct hvm_irq_dpci *domain_get_irq_dpci(const struct domain *); void free_hvm_irq_dpci(struct hvm_irq_dpci *dpci); bool_t pt_irq_need_timer(uint32_t flags); #define PT_IRQ_TIME_OUT MILLISECS(8) struct msi_desc; struct msi_msg; struct page_info; struct iommu_ops { int (*init)(struct domain *d); void (*dom0_init)(struct domain *d); int (*add_device)(u8 devfn, struct pci_dev *); int (*enable_device)(struct pci_dev *pdev); int (*remove_device)(u8 devfn, struct pci_dev *); int (*assign_device)(struct domain *, u8 devfn, struct pci_dev *); void (*teardown)(struct domain *d); int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int flags); int (*unmap_page)(struct domain *d, unsigned long gfn); void (*free_page_table)(struct page_info *); int (*reassign_device)(struct domain *s, struct domain *t, u8 devfn, struct pci_dev *); int (*get_device_group_id)(u16 seg, u8 bus, u8 devfn); void (*update_ire_from_apic)(unsigned int apic, unsigned int reg, unsigned int value); int (*update_ire_from_msi)(struct msi_desc *msi_desc, struct msi_msg *msg); void (*read_msi_from_ire)(struct msi_desc *msi_desc, struct msi_msg *msg); unsigned int (*read_apic_from_ire)(unsigned int apic, unsigned int reg); int (*setup_hpet_msi)(struct msi_desc *); void (*suspend)(void); void (*resume)(void); void (*share_p2m)(struct domain *d); void (*crash_shutdown)(void); void (*iotlb_flush)(struct domain *d, unsigned long gfn, unsigned int page_count); void (*iotlb_flush_all)(struct domain *d); void (*dump_p2m_table)(struct domain *d); }; void iommu_update_ire_from_apic(unsigned int apic, unsigned int reg, unsigned int value); int iommu_update_ire_from_msi(struct msi_desc *msi_desc, struct msi_msg *msg); void iommu_read_msi_from_ire(struct msi_desc *msi_desc, struct msi_msg *msg); unsigned int iommu_read_apic_from_ire(unsigned int apic, unsigned int reg); int 
iommu_setup_hpet_msi(struct msi_desc *); void iommu_suspend(void); void iommu_resume(void); void iommu_crash_shutdown(void); void iommu_set_dom0_mapping(struct domain *d); void iommu_share_p2m_table(struct domain *d); int iommu_do_domctl(struct xen_domctl *, struct domain *d, XEN_GUEST_HANDLE_PARAM(xen_domctl_t)); void iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int page_count); void iommu_iotlb_flush_all(struct domain *d); /* While VT-d specific, this must get declared in a generic header. */ int adjust_vtd_irq_affinities(void); /* * The purpose of the iommu_dont_flush_iotlb optional cpu flag is to * avoid unecessary iotlb_flush in the low level IOMMU code. * * iommu_map_page/iommu_unmap_page must flush the iotlb but somethimes * this operation can be really expensive. This flag will be set by the * caller to notify the low level IOMMU code to avoid the iotlb flushes. * iommu_iotlb_flush/iommu_iotlb_flush_all will be explicitly called by * the caller. */ DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb); extern struct spinlock iommu_pt_cleanup_lock; extern struct page_list_head iommu_pt_cleanup_list; #endif /* _IOMMU_H_ */ xen-4.4.0/xen/include/xen/elf.h0000664000175000017500000000510312307313555014370 0ustar smbsmb/* * Copyright (c) 1995, 1996 Erik Theisen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef __XEN_ELF_H__ #define __XEN_ELF_H__ #include #define ELFNOTE_ALIGN(_n_) (((_n_)+3)&~3) #define ELFNOTE_NAME(_n_) ((char*)(_n_) + sizeof(*(_n_))) #define ELFNOTE_DESC(_n_) (ELFNOTE_NAME(_n_) + ELFNOTE_ALIGN((_n_)->namesz)) #define ELFNOTE_NEXT(_n_) ((Elf_Note *)(ELFNOTE_DESC(_n_) + ELFNOTE_ALIGN((_n_)->descsz))) struct domain_setup_info; extern int loadelfimage(struct domain_setup_info *); extern int parseelfimage(struct domain_setup_info *); extern unsigned long long xen_elfnote_numeric(struct domain_setup_info *dsi, int type, int *defined); extern const char *xen_elfnote_string(struct domain_setup_info *dsi, int type); #ifdef CONFIG_COMPAT extern int elf32_sanity_check(const Elf32_Ehdr *ehdr); extern int loadelf32image(struct domain_setup_info *); extern int parseelf32image(struct domain_setup_info *); extern unsigned long long xen_elf32note_numeric(struct domain_setup_info *, int type, int *defined); extern const char *xen_elf32note_string(struct domain_setup_info *, int type); #endif #ifdef Elf_Ehdr extern int elf_sanity_check(const Elf_Ehdr *ehdr); #endif #endif /* __XEN_ELF_H__ */ xen-4.4.0/xen/include/xen/kernel.h0000664000175000017500000000561212307313555015107 0ustar smbsmb#ifndef _LINUX_KERNEL_H #define _LINUX_KERNEL_H /* * 'kernel.h' contains some often-used function prototypes etc */ #include /* * min()/max() macros that also do * strict type-checking.. See the * "unnecessary" pointer comparison. */ #define min(x,y) ({ \ const typeof(x) _x = (x); \ const typeof(y) _y = (y); \ (void) (&_x == &_y); \ _x < _y ? _x : _y; }) #define max(x,y) ({ \ const typeof(x) _x = (x); \ const typeof(y) _y = (y); \ (void) (&_x == &_y); \ _x > _y ? _x : _y; }) /* * ..and if you can't take the strict * types, you can specify one yourself. * * Or not use min/max at all, of course. */ #define min_t(type,x,y) \ ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) #define max_t(type,x,y) \ ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; }) /* * pre-processor, array size, and bit field width suitable variants; * please don't use in "normal" expressions. */ #define MIN(x,y) ((x) < (y) ? (x) : (y)) #define MAX(x,y) ((x) > (y) ? (x) : (y)) /** * container_of - cast a member of a structure out to the containing structure * * @ptr: the pointer to the member. * @type: the type of the container struct this is embedded in. * @member: the name of the member within the struct. * */ #define container_of(ptr, type, member) ({ \ typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) /* * Check at compile time that something is of a particular type. * Always evaluates to 1 so you may use it easily in comparisons. 
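 *
 * Usage sketch (EXAMPLE_SET_MASK and example_mask are invented names):
 *
 *	#define EXAMPLE_SET_MASK(m) do {	\
 *		typecheck(unsigned long, m);	\
 *		example_mask = (m);		\
 *	} while ( 0 )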
*/ #define typecheck(type,x) \ ({ type __dummy; \ typeof(x) __dummy2; \ (void)(&__dummy == &__dummy2); \ 1; \ }) extern char _start[], _end[]; #define is_kernel(p) ({ \ char *__p = (char *)(unsigned long)(p); \ (__p >= _start) && (__p < _end); \ }) extern char _stext[], _etext[]; #define is_kernel_text(p) ({ \ char *__p = (char *)(unsigned long)(p); \ (__p >= _stext) && (__p < _etext); \ }) extern const char _srodata[], _erodata[]; #define is_kernel_rodata(p) ({ \ const char *__p = (const char *)(unsigned long)(p); \ (__p >= _srodata) && (__p < _erodata); \ }) extern char _sinittext[], _einittext[]; #define is_kernel_inittext(p) ({ \ char *__p = (char *)(unsigned long)(p); \ (__p >= _sinittext) && (__p < _einittext); \ }) extern enum system_state { SYS_STATE_early_boot, SYS_STATE_boot, SYS_STATE_active, SYS_STATE_suspend, SYS_STATE_resume } system_state; bool_t is_active_kernel_text(unsigned long addr); #endif /* _LINUX_KERNEL_H */ xen-4.4.0/xen/include/xen/const.h0000664000175000017500000000112412307313555014747 0ustar smbsmb/* const.h: Macros for dealing with constants. */ #ifndef __XEN_CONST_H__ #define __XEN_CONST_H__ /* Some constant macros are used in both assembler and * C code. Therefore we cannot annotate them always with * 'UL' and other type specifiers unilaterally. We * use the following macros to deal with this. * * Similarly, _AT() will cast an expression with a type in C, but * leave it unchanged in asm. */ #ifdef __ASSEMBLY__ #define _AC(X,Y) X #define _AT(T,X) X #else #define __AC(X,Y) (X##Y) #define _AC(X,Y) __AC(X,Y) #define _AT(T,X) ((T)(X)) #endif #endif /* __XEN_CONST_H__ */ xen-4.4.0/xen/include/xen/delay.h0000664000175000017500000000034312307313555014721 0ustar smbsmb#ifndef _LINUX_DELAY_H #define _LINUX_DELAY_H /* Copyright (C) 1993 Linus Torvalds */ #include #define mdelay(n) (\ {unsigned long msec=(n); while (msec--) udelay(1000);}) #endif /* defined(_LINUX_DELAY_H) */ xen-4.4.0/xen/include/xen/errno.h0000664000175000017500000001420012307313555014745 0ustar smbsmb#ifndef _I386_ERRNO_H #define _I386_ERRNO_H /* ` enum neg_errnoval { [ -Efoo for each Efoo in the list below ] } */ /* ` enum errnoval { */ #define EPERM 1 /* Operation not permitted */ #define ENOENT 2 /* No such file or directory */ #define ESRCH 3 /* No such process */ #define EINTR 4 /* Interrupted system call */ #define EIO 5 /* I/O error */ #define ENXIO 6 /* No such device or address */ #define E2BIG 7 /* Arg list too long */ #define ENOEXEC 8 /* Exec format error */ #define EBADF 9 /* Bad file number */ #define ECHILD 10 /* No child processes */ #define EAGAIN 11 /* Try again */ #define ENOMEM 12 /* Out of memory */ #define EACCES 13 /* Permission denied */ #define EFAULT 14 /* Bad address */ #define ENOTBLK 15 /* Block device required */ #define EBUSY 16 /* Device or resource busy */ #define EEXIST 17 /* File exists */ #define EXDEV 18 /* Cross-device link */ #define ENODEV 19 /* No such device */ #define ENOTDIR 20 /* Not a directory */ #define EISDIR 21 /* Is a directory */ #define EINVAL 22 /* Invalid argument */ #define ENFILE 23 /* File table overflow */ #define EMFILE 24 /* Too many open files */ #define ENOTTY 25 /* Not a typewriter */ #define ETXTBSY 26 /* Text file busy */ #define EFBIG 27 /* File too large */ #define ENOSPC 28 /* No space left on device */ #define ESPIPE 29 /* Illegal seek */ #define EROFS 30 /* Read-only file system */ #define EMLINK 31 /* Too many links */ #define EPIPE 32 /* Broken pipe */ #define EDOM 33 /* Math argument out of domain of func */ #define ERANGE 34 
/* Math result not representable */ #define EDEADLK 35 /* Resource deadlock would occur */ #define ENAMETOOLONG 36 /* File name too long */ #define ENOLCK 37 /* No record locks available */ #define ENOSYS 38 /* Function not implemented */ #define ENOTEMPTY 39 /* Directory not empty */ #define ELOOP 40 /* Too many symbolic links encountered */ #define EWOULDBLOCK EAGAIN /* Operation would block */ #define ENOMSG 42 /* No message of desired type */ #define EIDRM 43 /* Identifier removed */ #define ECHRNG 44 /* Channel number out of range */ #define EL2NSYNC 45 /* Level 2 not synchronized */ #define EL3HLT 46 /* Level 3 halted */ #define EL3RST 47 /* Level 3 reset */ #define ELNRNG 48 /* Link number out of range */ #define EUNATCH 49 /* Protocol driver not attached */ #define ENOCSI 50 /* No CSI structure available */ #define EL2HLT 51 /* Level 2 halted */ #define EBADE 52 /* Invalid exchange */ #define EBADR 53 /* Invalid request descriptor */ #define EXFULL 54 /* Exchange full */ #define ENOANO 55 /* No anode */ #define EBADRQC 56 /* Invalid request code */ #define EBADSLT 57 /* Invalid slot */ #define EDEADLOCK EDEADLK #define EBFONT 59 /* Bad font file format */ #define ENOSTR 60 /* Device not a stream */ #define ENODATA 61 /* No data available */ #define ETIME 62 /* Timer expired */ #define ENOSR 63 /* Out of streams resources */ #define ENONET 64 /* Machine is not on the network */ #define ENOPKG 65 /* Package not installed */ #define EREMOTE 66 /* Object is remote */ #define ENOLINK 67 /* Link has been severed */ #define EADV 68 /* Advertise error */ #define ESRMNT 69 /* Srmount error */ #define ECOMM 70 /* Communication error on send */ #define EPROTO 71 /* Protocol error */ #define EMULTIHOP 72 /* Multihop attempted */ #define EDOTDOT 73 /* RFS specific error */ #define EBADMSG 74 /* Not a data message */ #define EOVERFLOW 75 /* Value too large for defined data type */ #define ENOTUNIQ 76 /* Name not unique on network */ #define EBADFD 77 /* File descriptor in bad state */ #define EREMCHG 78 /* Remote address changed */ #define ELIBACC 79 /* Can not access a needed shared library */ #define ELIBBAD 80 /* Accessing a corrupted shared library */ #define ELIBSCN 81 /* .lib section in a.out corrupted */ #define ELIBMAX 82 /* Attempting to link in too many shared libraries */ #define ELIBEXEC 83 /* Cannot exec a shared library directly */ #define EILSEQ 84 /* Illegal byte sequence */ #define ERESTART 85 /* Interrupted system call should be restarted */ #define ESTRPIPE 86 /* Streams pipe error */ #define EUSERS 87 /* Too many users */ #define ENOTSOCK 88 /* Socket operation on non-socket */ #define EDESTADDRREQ 89 /* Destination address required */ #define EMSGSIZE 90 /* Message too long */ #define EPROTOTYPE 91 /* Protocol wrong type for socket */ #define ENOPROTOOPT 92 /* Protocol not available */ #define EPROTONOSUPPORT 93 /* Protocol not supported */ #define ESOCKTNOSUPPORT 94 /* Socket type not supported */ #define EOPNOTSUPP 95 /* Operation not supported on transport endpoint */ #define EPFNOSUPPORT 96 /* Protocol family not supported */ #define EAFNOSUPPORT 97 /* Address family not supported by protocol */ #define EADDRINUSE 98 /* Address already in use */ #define EADDRNOTAVAIL 99 /* Cannot assign requested address */ #define ENETDOWN 100 /* Network is down */ #define ENETUNREACH 101 /* Network is unreachable */ #define ENETRESET 102 /* Network dropped connection because of reset */ #define ECONNABORTED 103 /* Software caused connection abort */ #define ECONNRESET 104 /* Connection 
reset by peer */ #define ENOBUFS 105 /* No buffer space available */ #define EISCONN 106 /* Transport endpoint is already connected */ #define ENOTCONN 107 /* Transport endpoint is not connected */ #define ESHUTDOWN 108 /* Cannot send after transport endpoint shutdown */ #define ETOOMANYREFS 109 /* Too many references: cannot splice */ #define ETIMEDOUT 110 /* Connection timed out */ #define ECONNREFUSED 111 /* Connection refused */ #define EHOSTDOWN 112 /* Host is down */ #define EHOSTUNREACH 113 /* No route to host */ #define EALREADY 114 /* Operation already in progress */ #define EINPROGRESS 115 /* Operation now in progress */ #define ESTALE 116 /* Stale NFS file handle */ #define EUCLEAN 117 /* Structure needs cleaning */ #define ENOTNAM 118 /* Not a XENIX named type file */ #define ENAVAIL 119 /* No XENIX semaphores available */ #define EISNAM 120 /* Is a named type file */ #define EREMOTEIO 121 /* Remote I/O error */ #define EDQUOT 122 /* Quota exceeded */ #define ENOMEDIUM 123 /* No medium found */ #define EMEDIUMTYPE 124 /* Wrong medium type */ /* ` } */ #endif xen-4.4.0/xen/include/xen/nmi.h0000664000175000017500000000045512307313555014412 0ustar smbsmb/****************************************************************************** * nmi.h * * Register and unregister NMI callbacks. * * Copyright (c) 2006, Ian Campbell */ #ifndef __XEN_NMI_H__ #define __XEN_NMI_H__ #include #endif /* __XEN_NMI_H__ */ xen-4.4.0/xen/include/xen/notifier.h0000664000175000017500000000406212307313555015444 0ustar smbsmb/****************************************************************************** * include/xen/notifier.h * * Routines to manage notifier chains for passing status changes to any * interested routines. * * Original code from Linux kernel 2.6.27 (Alan Cox ) */ #ifndef __XEN_NOTIFIER_H__ #define __XEN_NOTIFIER_H__ #include #include #include #include /* * Xen includes only one type of notifier chains inherited from Linux: * Raw notifier chains: There are no restrictions on callbacks, * registration, or unregistration. All locking and protection * must be provided by the caller. */ struct notifier_block { int (*notifier_call)(struct notifier_block *, unsigned long, void *); struct list_head chain; int priority; }; struct notifier_head { struct notifier_block head; }; #define NOTIFIER_INIT(name) { .head.chain = LIST_HEAD_INIT(name.head.chain) } #define NOTIFIER_HEAD(name) \ struct notifier_head name = NOTIFIER_INIT(name) void notifier_chain_register( struct notifier_head *nh, struct notifier_block *nb); void notifier_chain_unregister( struct notifier_head *nh, struct notifier_block *nb); int notifier_call_chain( struct notifier_head *nh, unsigned long val, void *v, struct notifier_block **pcursor); /* Notifier flag values: OR into @val passed to notifier_call_chain(). */ #define NOTIFY_FORWARD 0x0000 /* Call chain highest-priority-first */ #define NOTIFY_REVERSE 0x8000 /* Call chain lowest-priority-first */ /* Handler completion values */ #define NOTIFY_DONE 0x0000 #define NOTIFY_STOP_MASK 0x8000 #define NOTIFY_STOP (NOTIFY_STOP_MASK|NOTIFY_DONE) #define NOTIFY_BAD (NOTIFY_STOP_MASK|EINVAL) /* Encapsulate (negative) errno value. */ static inline int notifier_from_errno(int err) { return NOTIFY_STOP_MASK | -err; } /* Restore (negative) errno value from notify return value. 
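 *
 * Illustrative round trip (hypothetical values, not part of the original
 * header):
 *
 *   int ret = notifier_from_errno(-EBUSY);   ret == (NOTIFY_STOP_MASK | EBUSY)
 *   int err = notifier_to_errno(ret);        err == -EBUSY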
*/ static inline int notifier_to_errno(int ret) { return -(ret & ~NOTIFY_STOP_MASK); } #endif /* __XEN_NOTIFIER_H__ */ xen-4.4.0/xen/include/xen/gcov.h0000664000175000017500000000513612307313555014566 0ustar smbsmb/* * Profiling infrastructure declarations. * * This file is based on gcc-internal definitions. Data structures are * defined to be compatible with gcc counterparts. For a better * understanding, refer to gcc source: gcc/gcov-io.h. * * Copyright IBM Corp. 2009 * Author(s): Peter Oberparleiter * * Uses gcc-internal data definitions. */ #ifndef __XEN_GCOV_H__ #define __XEN_GCOV_H__ __XEN_GCOV_H__ #include /* * Profiling data types used for gcc 3.4 and above - these are defined by * gcc and need to be kept as close to the original definition as possible to * remain compatible. */ typedef uint64_t gcov_type; /** * struct gcov_fn_info - profiling meta data per function * @ident: object file-unique function identifier * @checksum: function checksum * @n_ctrs: number of values per counter type belonging to this function * * This data is generated by gcc during compilation and doesn't change * at run-time. */ struct gcov_fn_info { unsigned int ident; unsigned int checksum; unsigned int n_ctrs[0]; }; /** * struct gcov_ctr_info - profiling data per counter type * @num: number of counter values for this type * @values: array of counter values for this type * @merge: merge function for counter values of this type (unused) * * This data is generated by gcc during compilation and doesn't change * at run-time with the exception of the values array. */ struct gcov_ctr_info { unsigned int num; gcov_type *values; void (*merge)(gcov_type *, unsigned int); }; /** * struct gcov_info - profiling data per object file * @version: gcov version magic indicating the gcc version used for compilation * @next: list head for a singly-linked list * @stamp: time stamp * @filename: name of the associated gcov data file * @n_functions: number of instrumented functions * @functions: function data * @ctr_mask: mask specifying which counter types are active * @counts: counter data per counter type * * This data is generated by gcc during compilation and doesn't change * at run-time with the exception of the next pointer. */ struct gcov_info { unsigned int version; struct gcov_info *next; unsigned int stamp; const char *filename; unsigned int n_functions; const struct gcov_fn_info *functions; unsigned int ctr_mask; struct gcov_ctr_info counts[0]; }; /** * Sysctl operations for coverage */ #ifdef TEST_COVERAGE int sysctl_coverage_op(xen_sysctl_coverage_op_t *op); #endif #endif /* __XEN_GCOV_H__ */ xen-4.4.0/xen/include/xen/list.h0000664000175000017500000007443412307313555014612 0ustar smbsmb/****************************************************************************** * list.h * * Useful linked-list definitions taken from the Linux kernel (2.6.18). */ #ifndef __XEN_LIST_H__ #define __XEN_LIST_H__ #include #include /* These are non-NULL pointers that will result in page faults * under normal circumstances, used to verify that nobody uses * non-initialized list entries. */ #define LIST_POISON1 ((void *) 0x00100100) #define LIST_POISON2 ((void *) 0x00200200) /* * Simple doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. 
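 *
 * Minimal usage sketch (hypothetical structure and list names, not part
 * of the original header):
 *
 *   struct foo { int id; struct list_head link; };
 *   static LIST_HEAD(foo_list);
 *   struct foo *f;
 *
 *   list_add_tail(&new_foo->link, &foo_list);    append at the tail
 *   list_for_each_entry(f, &foo_list, link)      walk every entry
 *       process(f);
 *   list_del(&doomed_foo->link);                 unlink a single entry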
*/ struct list_head { struct list_head *next, *prev; }; #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) #define LIST_HEAD_READ_MOSTLY(name) \ struct list_head __read_mostly name = LIST_HEAD_INIT(name) /* Do not move this ahead of the struct list_head definition! */ #include static inline void INIT_LIST_HEAD(struct list_head *list) { list->next = list; list->prev = list; } /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { next->prev = new; new->next = next; new->prev = prev; prev->next = new; } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { new->next = next; new->prev = prev; smp_wmb(); next->prev = new; prev->next = new; } /** * list_add_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head, head->next); } /** * list_add_tail_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_tail_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_tail_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head *prev, struct list_head *next) { next->prev = prev; prev->next = next; } /** * list_del - deletes entry from list. 
* @entry: the element to delete from the list. * Note: list_empty on entry does not return true after this, the entry is * in an undefined state. */ static inline void list_del(struct list_head *entry) { ASSERT(entry->next->prev == entry); ASSERT(entry->prev->next == entry); __list_del(entry->prev, entry->next); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_del_rcu - deletes entry from list without re-initialization * @entry: the element to delete from the list. * * Note: list_empty on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_del_rcu() * or list_add_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). * * Note that the caller is not permitted to immediately free * the newly deleted entry. Instead, either synchronize_rcu() * or call_rcu() must be used to defer freeing until an RCU * grace period has elapsed. */ static inline void list_del_rcu(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * Note: if 'old' was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /* * list_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The old entry will be replaced with the new entry atomically. * Note: 'old' should not be empty. */ static inline void list_replace_rcu(struct list_head *old, struct list_head *new) { new->next = old->next; new->prev = old->prev; smp_wmb(); new->next->prev = new; new->prev->next = new; old->prev = LIST_POISON2; } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del(entry->prev, entry->next); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del(list->prev, list->next); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del(list->prev, list->next); list_add_tail(list, head); } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. 
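 *
 * Example (hypothetical list name):
 *
 *   if ( list_empty(&foo_list) )
 *       return -ENOENT;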
*/ static inline int list_empty(const struct list_head *head) { return head->next == head; } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = head->next; return (next == head) && (next == head->prev); } static inline void __list_splice(struct list_head *list, struct list_head *head) { struct list_head *first = list->next; struct list_head *last = list->prev; struct list_head *at = head->next; first->prev = head; head->next = first; last->next = at; at->prev = last; } /** * list_splice - join two lists * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_struct within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; prefetch(pos->next), pos != (head); \ pos = pos->next) /** * __list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * This variant differs from list_for_each() in that it's the * simplest possible list iteration code, no prefetching is done. * Use this for code that knows the list to be very short (empty * or 1 entry) most of the time. */ #define __list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \ pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) /** * list_for_each_backwards_safe - iterate backwards over a list safe * against removal of list entry * @pos: the &struct list_head to use as a loop counter. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. 
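 *
 * Illustrative sketch (hypothetical list name), deleting every entry
 * while walking backwards:
 *
 *   struct list_head *pos, *n;
 *   list_for_each_backwards_safe(pos, n, &foo_list)
 *       list_del(pos);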
*/ #define list_for_each_backwards_safe(pos, n, head) \ for ( pos = (head)->prev, n = pos->prev; pos != (head); \ pos = n, n = pos->prev ) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member); \ prefetch(pos->member.next), &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_entry((head)->prev, typeof(*pos), member); \ prefetch(pos->member.prev), &pos->member != (head); \ pos = list_entry(pos->member.prev, typeof(*pos), member)) /** * list_prepare_entry - prepare a pos entry for use in * list_for_each_entry_continue * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_struct within the struct. * * Prepares a pos entry for use as a start point in * list_for_each_entry_continue. */ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_entry(pos->member.next, typeof(*pos), member); \ prefetch(pos->member.next), &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_from - iterate over list of given type from the * current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; prefetch(pos->member.next), &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_safe - iterate over list of given type safe * against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_struct within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member), \ n = list_entry(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = n, n = list_entry(n->member.next, typeof(*n), member)) /** * list_for_each_entry_safe_continue * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. 
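 *
 * Illustrative sketch (hypothetical names; pos must already reference a
 * valid entry, e.g. one located by an earlier list_for_each_entry()):
 *
 *   list_for_each_entry_safe_continue(pos, n, &foo_list, link)
 *   {
 *       list_del(&pos->link);
 *       free_foo(pos);
 *   }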
*/ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_entry(pos->member.next, typeof(*pos), member), \ n = list_entry(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = n, n = list_entry(n->member.next, typeof(*n), member)) /** * list_for_each_entry_safe_from * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_entry(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = n, n = list_entry(n->member.next, typeof(*n), member)) /** * list_for_each_entry_safe_reverse * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_entry((head)->prev, typeof(*pos), member), \ n = list_entry(pos->member.prev, typeof(*pos), member); \ &pos->member != (head); \ pos = n, n = list_entry(n->member.prev, typeof(*n), member)) /** * list_for_each_rcu - iterate over an rcu-protected list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_rcu(pos, head) \ for (pos = (head)->next; \ prefetch(rcu_dereference(pos)->next), pos != (head); \ pos = pos->next) #define __list_for_each_rcu(pos, head) \ for (pos = (head)->next; \ rcu_dereference(pos) != (head); \ pos = pos->next) /** * list_for_each_safe_rcu * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. * * Iterate over an rcu-protected list, safe against removal of list entry. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_safe_rcu(pos, n, head) \ for (pos = (head)->next; \ n = rcu_dereference(pos)->next, pos != (head); \ pos = n) /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_entry_rcu(pos, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member); \ prefetch(rcu_dereference(pos)->member.next), \ &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) /** * list_for_each_continue_rcu * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * Iterate over an rcu-protected list, continuing after current point. 
* * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_continue_rcu(pos, head) \ for ((pos) = (pos)->next; \ prefetch(rcu_dereference((pos))->next), (pos) != (head); \ (pos) = (pos)->next) /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is * too wasteful. * You lose the ability to access the tail in O(1). */ struct hlist_head { struct hlist_node *first; }; struct hlist_node { struct hlist_node *next, **pprev; }; #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } static inline int hlist_empty(const struct hlist_head *h) { return !h->first; } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; *pprev = next; if (next) next->pprev = pprev; } static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } /** * hlist_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry(). */ static inline void hlist_del_rcu(struct hlist_node *n) { __hlist_del(n); n->pprev = LIST_POISON2; } static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); INIT_HLIST_NODE(n); } } /* * hlist_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The old entry will be replaced with the new entry atomically. */ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) { struct hlist_node *next = old->next; new->next = next; new->pprev = old->pprev; smp_wmb(); if (next) new->next->pprev = &new->next; *new->pprev = new; old->pprev = LIST_POISON2; } static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; if (first) first->pprev = &n->next; h->first = n; n->pprev = &h->first; } /** * hlist_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. 
* However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; n->pprev = &h->first; smp_wmb(); if (first) first->pprev = &n->next; h->first = n; } /* next must be != NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) { n->pprev = next->pprev; n->next = next; next->pprev = &n->next; *(n->pprev) = n; } static inline void hlist_add_after(struct hlist_node *n, struct hlist_node *next) { next->next = n->next; n->next = next; next->pprev = &n->next; if(next->next) next->next->pprev = &next->next; } /** * hlist_add_before_rcu * @n: the new element to add to the hash list. * @next: the existing element to add the new element before. * * Description: * Adds the specified element to the specified hlist * before the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { n->pprev = next->pprev; n->next = next; smp_wmb(); next->pprev = &n->next; *(n->pprev) = n; } /** * hlist_add_after_rcu * @prev: the existing element to add the new element after. * @n: the new element to add to the hash list. * * Description: * Adds the specified element to the specified hlist * after the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_after_rcu(struct hlist_node *prev, struct hlist_node *n) { n->next = prev->next; n->pprev = &prev->next; smp_wmb(); prev->next = n; if (n->next) n->next->pprev = &n->next; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \ pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ pos = n) /** * hlist_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. 
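 *
 * Illustrative sketch (hypothetical hash table; note this variant takes
 * both a typed cursor @tpos and a raw &struct hlist_node cursor @pos):
 *
 *   struct bar { int key; struct hlist_node node; };
 *   struct bar *b;
 *   struct hlist_node *p;
 *
 *   hlist_for_each_entry(b, p, &hash_table[bucket], node)
 *       if ( b->key == key )
 *           return b;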
*/ #define hlist_for_each_entry(tpos, pos, head, member) \ for (pos = (head)->first; \ pos && ({ prefetch(pos->next); 1;}) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_for_each_entry_continue - iterate over a hlist continuing * after current point * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue(tpos, pos, member) \ for (pos = (pos)->next; \ pos && ({ prefetch(pos->next); 1;}) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_for_each_entry_from - iterate over a hlist continuing from * current point * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from(tpos, pos, member) \ for (; pos && ({ prefetch(pos->next); 1;}) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_for_each_entry_safe - iterate over list of given type safe * against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @n: another &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(tpos, pos, n, head, member) \ for (pos = (head)->first; \ pos && ({ n = pos->next; 1; }) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ pos = n) /** * hlist_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu(tpos, pos, head, member) \ for (pos = (head)->first; \ rcu_dereference(pos) && ({ prefetch(pos->next); 1;}) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) #endif /* __XEN_LIST_H__ */ xen-4.4.0/xen/include/xen/bitops.h0000664000175000017500000001121012307313555015116 0ustar smbsmb#ifndef _LINUX_BITOPS_H #define _LINUX_BITOPS_H #include /* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ static inline int generic_ffs(int x) { int r = 1; if (!x) return 0; if (!(x & 0xffff)) { x >>= 16; r += 16; } if (!(x & 0xff)) { x >>= 8; r += 8; } if (!(x & 0xf)) { x >>= 4; r += 4; } if (!(x & 3)) { x >>= 2; r += 2; } if (!(x & 1)) { x >>= 1; r += 1; } return r; } /* * fls: find last bit set. 
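 *
 * Examples: fls(0) == 0, fls(1) == 1, fls(0x18) == 5 (contrast with
 * ffs(0x18) == 4), and fls(0x80000000) == 32.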
*/ static __inline__ int generic_fls(int x) { int r = 32; if (!x) return 0; if (!(x & 0xffff0000u)) { x <<= 16; r -= 16; } if (!(x & 0xff000000u)) { x <<= 8; r -= 8; } if (!(x & 0xf0000000u)) { x <<= 4; r -= 4; } if (!(x & 0xc0000000u)) { x <<= 2; r -= 2; } if (!(x & 0x80000000u)) { x <<= 1; r -= 1; } return r; } /* * Include this here because some architectures need generic_ffs/fls in * scope */ #include static inline int generic_fls64(__u64 x) { __u32 h = x >> 32; if (h) return fls(x) + 32; return fls(x); } static __inline__ int get_bitmask_order(unsigned int count) { int order; order = fls(count); return order; /* We could be slightly more clever with -1 here... */ } static __inline__ int get_count_order(unsigned int count) { int order; order = fls(count) - 1; if (count & (count - 1)) order++; return order; } /* * hweightN: returns the hamming weight (i.e. the number * of bits set) of a N-bit word */ static inline unsigned int generic_hweight32(unsigned int w) { unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555); res = (res & 0x33333333) + ((res >> 2) & 0x33333333); res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F); res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF); return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF); } static inline unsigned int generic_hweight16(unsigned int w) { unsigned int res = (w & 0x5555) + ((w >> 1) & 0x5555); res = (res & 0x3333) + ((res >> 2) & 0x3333); res = (res & 0x0F0F) + ((res >> 4) & 0x0F0F); return (res & 0x00FF) + ((res >> 8) & 0x00FF); } static inline unsigned int generic_hweight8(unsigned int w) { unsigned int res = (w & 0x55) + ((w >> 1) & 0x55); res = (res & 0x33) + ((res >> 2) & 0x33); return (res & 0x0F) + ((res >> 4) & 0x0F); } static inline unsigned long generic_hweight64(__u64 w) { #if BITS_PER_LONG < 64 return generic_hweight32((unsigned int)(w >> 32)) + generic_hweight32((unsigned int)w); #else u64 res; res = (w & 0x5555555555555555ul) + ((w >> 1) & 0x5555555555555555ul); res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); res = (res & 0x0F0F0F0F0F0F0F0Ful) + ((res >> 4) & 0x0F0F0F0F0F0F0F0Ful); res = (res & 0x00FF00FF00FF00FFul) + ((res >> 8) & 0x00FF00FF00FF00FFul); res = (res & 0x0000FFFF0000FFFFul) + ((res >> 16) & 0x0000FFFF0000FFFFul); return (res & 0x00000000FFFFFFFFul) + ((res >> 32) & 0x00000000FFFFFFFFul); #endif } static inline unsigned long hweight_long(unsigned long w) { return sizeof(w) == 4 ? generic_hweight32(w) : generic_hweight64(w); } /* * rol32 - rotate a 32-bit value left * * @word: value to rotate * @shift: bits to roll */ static inline __u32 rol32(__u32 word, unsigned int shift) { return (word << shift) | (word >> (32 - shift)); } /* * ror32 - rotate a 32-bit value right * * @word: value to rotate * @shift: bits to roll */ static inline __u32 ror32(__u32 word, unsigned int shift) { return (word >> shift) | (word << (32 - shift)); } /* base-2 logarithm */ #define __L2(_x) (((_x) & 0x00000002) ? 1 : 0) #define __L4(_x) (((_x) & 0x0000000c) ? ( 2 + __L2( (_x)>> 2)) : __L2( _x)) #define __L8(_x) (((_x) & 0x000000f0) ? ( 4 + __L4( (_x)>> 4)) : __L4( _x)) #define __L16(_x) (((_x) & 0x0000ff00) ? ( 8 + __L8( (_x)>> 8)) : __L8( _x)) #define LOG_2(_x) (((_x) & 0xffff0000) ? 
(16 + __L16((_x)>>16)) : __L16(_x)) /** * for_each_set_bit - iterate over every set bit in a memory region * @bit: The integer iterator * @addr: The address to base the search on * @size: The maximum size to search */ #define for_each_set_bit(bit, addr, size) \ for ( (bit) = find_first_bit(addr, size); \ (bit) < (size); \ (bit) = find_next_bit(addr, size, (bit) + 1) ) #endif xen-4.4.0/xen/include/xen/compiler.h0000664000175000017500000000615512307313555015444 0ustar smbsmb#ifndef __LINUX_COMPILER_H #define __LINUX_COMPILER_H #if !defined(__GNUC__) || (__GNUC__ < 4) #error Sorry, your compiler is too old/not recognized. #endif #define barrier() __asm__ __volatile__("": : :"memory") #define likely(x) __builtin_expect((x),1) #define unlikely(x) __builtin_expect((x),0) #define inline __inline__ #define always_inline __inline__ __attribute__ ((always_inline)) #define noinline __attribute__((noinline)) #if (!defined(__clang__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 5)) #define unreachable() do {} while (1) #else #define unreachable() __builtin_unreachable() #endif #ifdef __clang__ /* Clang can replace some vars with new automatic ones that go in .data; * mark all explicit-segment vars 'used' to prevent that. */ #define __section(s) __used __attribute__((__section__(s))) #else #define __section(s) __attribute__((__section__(s))) #endif #define __used_section(s) __used __attribute__((__section__(s))) #define __text_section(s) __attribute__((__section__(s))) #ifdef INIT_SECTIONS_ONLY /* * For sources indicated to have only init code, make sure even * inline functions not expanded inline get placed in .init.text. */ #include #define __inline__ __inline__ __init #endif #define __attribute_pure__ __attribute__((pure)) #define __attribute_const__ __attribute__((__const__)) /* * The difference between the following two attributes is that __used is * intended to be used in cases where a reference to an identifier may be * invisible to the compiler (e.g. an inline assembly operand not listed * in the asm()'s operands), preventing the compiler from eliminating the * variable or function. * __maybe_unused otoh is to be used to merely prevent warnings (e.g. when * an identifier is used only inside a preprocessor conditional, yet putting * its declaration/definition inside another conditional would harm code * readability). */ #define __used __attribute__((__used__)) #define __maybe_unused __attribute__((__unused__)) #define __must_check __attribute__((warn_unused_result)) #define offsetof(a,b) __builtin_offsetof(a,b) #if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L #define alignof __alignof__ #endif /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) \ BUILD_BUG_ON_ZERO(__builtin_types_compatible_p(typeof(a), typeof(&a[0]))) #ifdef GCC_HAS_VISIBILITY_ATTRIBUTE /* Results in more efficient PIC code (no indirections through GOT or PLT). */ #pragma GCC visibility push(hidden) #endif /* This macro obfuscates arithmetic on a variable address so that gcc shouldn't recognize the original var, and make assumptions about it */ /* * Versions of the ppc64 compiler before 4.1 had a bug where use of * RELOC_HIDE could trash r30. The bug can be worked around by changing * the inline assembly constraint from =g to =r, in this particular * case either is valid. 
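 *
 * Illustrative sketch (hypothetical symbol and offset): hide the
 * provenance of a pointer so the compiler cannot fold the offset
 * arithmetic back onto the original object:
 *
 *   struct foo *f = RELOC_HIDE(&foo_array[0], byte_offset);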
*/ #define RELOC_HIDE(ptr, off) \ ({ unsigned long __ptr; \ __asm__ ("" : "=r"(__ptr) : "0"(ptr)); \ (typeof(ptr)) (__ptr + (off)); }) #endif /* __LINUX_COMPILER_H */ xen-4.4.0/xen/include/xen/tmem_xen.h0000664000175000017500000002137712307313555015451 0ustar smbsmb/****************************************************************************** * tmem_xen.h * * Xen-specific Transcendent memory * * Copyright (c) 2009, Dan Magenheimer, Oracle Corp. */ #ifndef __XEN_TMEM_XEN_H__ #define __XEN_TMEM_XEN_H__ #include /* heap alloc/free */ #include #include /* xmalloc/xfree */ #include /* struct domain */ #include /* copy_from_guest */ #include /* hash_long */ #include /* __map_domain_page */ #include /* xsm_tmem_control */ #include #ifdef CONFIG_COMPAT #include #endif typedef uint32_t pagesize_t; /* like size_t, must handle largest PAGE_SIZE */ #define IS_PAGE_ALIGNED(addr) \ ((void *)((((unsigned long)addr + (PAGE_SIZE - 1)) & PAGE_MASK)) == addr) #define IS_VALID_PAGE(_pi) ( mfn_valid(page_to_mfn(_pi)) ) extern struct page_list_head tmem_page_list; extern spinlock_t tmem_page_list_lock; extern unsigned long tmem_page_list_pages; extern atomic_t freeable_page_count; extern int tmem_init(void); #define tmem_hash hash_long extern bool_t opt_tmem_compress; static inline bool_t tmem_compression_enabled(void) { return opt_tmem_compress; } extern bool_t opt_tmem_dedup; static inline bool_t tmem_dedup_enabled(void) { return opt_tmem_dedup; } extern bool_t opt_tmem_tze; static inline bool_t tmem_tze_enabled(void) { return opt_tmem_tze; } static inline void tmem_tze_disable(void) { opt_tmem_tze = 0; } extern bool_t opt_tmem_shared_auth; static inline bool_t tmem_shared_auth(void) { return opt_tmem_shared_auth; } extern bool_t opt_tmem; static inline bool_t tmem_enabled(void) { return opt_tmem; } /* * Memory free page list management */ static inline struct page_info *tmem_page_list_get(void) { struct page_info *pi; spin_lock(&tmem_page_list_lock); if ( (pi = page_list_remove_head(&tmem_page_list)) != NULL ) tmem_page_list_pages--; spin_unlock(&tmem_page_list_lock); ASSERT((pi == NULL) || IS_VALID_PAGE(pi)); return pi; } static inline void tmem_page_list_put(struct page_info *pi) { ASSERT(IS_VALID_PAGE(pi)); spin_lock(&tmem_page_list_lock); page_list_add(pi, &tmem_page_list); tmem_page_list_pages++; spin_unlock(&tmem_page_list_lock); } /* * Memory allocation for persistent data */ static inline struct page_info *__tmem_alloc_page_thispool(struct domain *d) { struct page_info *pi; /* note that this tot_pages check is not protected by d->page_alloc_lock, * so may race and periodically fail in donate_page or alloc_domheap_pages * That's OK... 
neither is a problem, though chatty if log_lvl is set */ if ( d->tot_pages >= d->max_pages ) return NULL; if ( tmem_page_list_pages ) { if ( (pi = tmem_page_list_get()) != NULL ) { if ( donate_page(d,pi,0) == 0 ) goto out; else tmem_page_list_put(pi); } } pi = alloc_domheap_pages(d,0,MEMF_tmem); out: ASSERT((pi == NULL) || IS_VALID_PAGE(pi)); return pi; } static inline void __tmem_free_page_thispool(struct page_info *pi) { struct domain *d = page_get_owner(pi); ASSERT(IS_VALID_PAGE(pi)); if ( (d == NULL) || steal_page(d,pi,0) == 0 ) tmem_page_list_put(pi); else { scrub_one_page(pi); ASSERT((pi->count_info & ~(PGC_allocated | 1)) == 0); free_domheap_pages(pi,0); } } /* * Memory allocation for ephemeral (non-persistent) data */ static inline struct page_info *__tmem_alloc_page(void) { struct page_info *pi = tmem_page_list_get(); if ( pi == NULL) pi = alloc_domheap_pages(0,0,MEMF_tmem); if ( pi ) atomic_inc(&freeable_page_count); ASSERT((pi == NULL) || IS_VALID_PAGE(pi)); return pi; } static inline void __tmem_free_page(struct page_info *pi) { ASSERT(IS_VALID_PAGE(pi)); tmem_page_list_put(pi); atomic_dec(&freeable_page_count); } /* "Client" (==domain) abstraction */ static inline struct client *tmem_client_from_cli_id(domid_t cli_id) { struct client *c; struct domain *d = rcu_lock_domain_by_id(cli_id); if (d == NULL) return NULL; c = d->tmem_client; rcu_unlock_domain(d); return c; } static inline uint8_t tmem_get_first_byte(struct page_info *pfp) { const uint8_t *p = __map_domain_page(pfp); uint8_t byte = p[0]; unmap_domain_page(p); return byte; } static inline int tmem_page_cmp(struct page_info *pfp1, struct page_info *pfp2) { const uint64_t *p1 = __map_domain_page(pfp1); const uint64_t *p2 = __map_domain_page(pfp2); int rc = memcmp(p1, p2, PAGE_SIZE); unmap_domain_page(p2); unmap_domain_page(p1); return rc; } static inline int tmem_pcd_cmp(void *va1, pagesize_t len1, void *va2, pagesize_t len2) { const char *p1 = (char *)va1; const char *p2 = (char *)va2; pagesize_t i; ASSERT(len1 <= PAGE_SIZE); ASSERT(len2 <= PAGE_SIZE); if ( len1 < len2 ) return -1; if ( len1 > len2 ) return 1; ASSERT(len1 == len2); for ( i = len2; i && *p1 == *p2; i--, p1++, p2++ ); if ( !i ) return 0; if ( *p1 < *p2 ) return -1; return 1; } static inline int tmem_tze_pfp_cmp(struct page_info *pfp1, pagesize_t pfp_len, void *tva, const pagesize_t tze_len) { const uint64_t *p1 = __map_domain_page(pfp1); const uint64_t *p2 = tze_len == PAGE_SIZE ? 
__map_domain_page((struct page_info *)tva) : tva; int rc; ASSERT(pfp_len <= PAGE_SIZE); ASSERT(!(pfp_len & (sizeof(uint64_t)-1))); ASSERT(tze_len <= PAGE_SIZE); ASSERT(!(tze_len & (sizeof(uint64_t)-1))); if ( pfp_len < tze_len ) rc = -1; else if ( pfp_len > tze_len ) rc = 1; else rc = memcmp(p1, p2, tze_len); if ( tze_len == PAGE_SIZE ) unmap_domain_page(p2); unmap_domain_page(p1); return rc; } /* return the size of the data in the pfp, ignoring trailing zeroes and * rounded up to the nearest multiple of 8 */ static inline pagesize_t tmem_tze_pfp_scan(struct page_info *pfp) { const uint64_t *const page = __map_domain_page(pfp); const uint64_t *p = page; pagesize_t bytecount = PAGE_SIZE; pagesize_t len = PAGE_SIZE/sizeof(uint64_t); p += len; while ( len-- && !*--p ) bytecount -= sizeof(uint64_t); unmap_domain_page(page); return bytecount; } static inline void tmem_tze_copy_from_pfp(void *tva, struct page_info *pfp, pagesize_t len) { const uint64_t *p = __map_domain_page(pfp); ASSERT(!(len & (sizeof(uint64_t)-1))); memcpy(tva, p, len); unmap_domain_page(p); } /* these typedefs are in the public/tmem.h interface typedef XEN_GUEST_HANDLE(void) cli_mfn_t; typedef XEN_GUEST_HANDLE(char) cli_va_t; */ typedef XEN_GUEST_HANDLE_PARAM(tmem_op_t) tmem_cli_op_t; typedef XEN_GUEST_HANDLE_PARAM(char) tmem_cli_va_param_t; static inline int tmem_get_tmemop_from_client(tmem_op_t *op, tmem_cli_op_t uops) { #ifdef CONFIG_COMPAT if ( has_hvm_container_vcpu(current) ? hvm_guest_x86_mode(current) != 8 : is_pv_32on64_vcpu(current) ) { int rc; enum XLAT_tmem_op_u u; tmem_op_compat_t cop; rc = copy_from_guest(&cop, guest_handle_cast(uops, void), 1); if ( rc ) return rc; switch ( cop.cmd ) { case TMEM_NEW_POOL: u = XLAT_tmem_op_u_creat; break; case TMEM_CONTROL: u = XLAT_tmem_op_u_ctrl; break; case TMEM_AUTH: u = XLAT_tmem_op_u_creat; break; case TMEM_RESTORE_NEW:u = XLAT_tmem_op_u_creat; break; default: u = XLAT_tmem_op_u_gen ; break; } #define XLAT_tmem_op_HNDL_u_ctrl_buf(_d_, _s_) \ guest_from_compat_handle((_d_)->u.ctrl.buf, (_s_)->u.ctrl.buf) XLAT_tmem_op(op, &cop); #undef XLAT_tmem_op_HNDL_u_ctrl_buf return 0; } #endif return copy_from_guest(op, uops, 1); } #define tmem_cli_buf_null guest_handle_from_ptr(NULL, char) #define TMEM_CLI_ID_NULL ((domid_t)((domid_t)-1L)) #define tmem_cli_id_str "domid" #define tmem_client_str "domain" int tmem_decompress_to_client(xen_pfn_t, void *, size_t, tmem_cli_va_param_t); int tmem_compress_from_client(xen_pfn_t, void **, size_t *, tmem_cli_va_param_t); int tmem_copy_from_client(struct page_info *, xen_pfn_t, tmem_cli_va_param_t); int tmem_copy_to_client(xen_pfn_t, struct page_info *, tmem_cli_va_param_t); extern int tmem_copy_tze_to_client(xen_pfn_t cmfn, void *tmem_va, pagesize_t len); #define tmem_client_err(fmt, args...) printk(XENLOG_G_ERR fmt, ##args) #define tmem_client_warn(fmt, args...) printk(XENLOG_G_WARNING fmt, ##args) #define tmem_client_info(fmt, args...) printk(XENLOG_G_INFO fmt, ##args) #endif /* __XEN_TMEM_XEN_H__ */ xen-4.4.0/xen/include/xen/smp.h0000664000175000017500000000271412307313555014426 0ustar smbsmb#ifndef __XEN_SMP_H__ #define __XEN_SMP_H__ #include /* * stops all CPUs but the current one: */ extern void smp_send_stop(void); extern void smp_send_event_check_mask(const cpumask_t *mask); #define smp_send_event_check_cpu(cpu) \ smp_send_event_check_mask(cpumask_of(cpu)) extern void smp_send_state_dump(unsigned int cpu); /* * Prepare machine for booting other CPUs. 
*/ extern void smp_prepare_cpus(unsigned int max_cpus); /* * Final polishing of CPUs */ extern void smp_cpus_done(void); /* * Call a function on all other processors */ extern void smp_call_function( void (*func) (void *info), void *info, int wait); /* * Call a function on a selection of processors */ extern void on_selected_cpus( const cpumask_t *selected, void (*func) (void *info), void *info, int wait); /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. */ void smp_prepare_boot_cpu(void); /* * Call a function on all processors */ static inline void on_each_cpu( void (*func) (void *info), void *info, int wait) { on_selected_cpus(&cpu_online_map, func, info, wait); } /* * Call a function on the current CPU */ void smp_call_function_interrupt(void); void smp_send_call_function_mask(const cpumask_t *mask); #define smp_processor_id() raw_smp_processor_id() int alloc_cpu_id(void); extern void *stack_base[NR_CPUS]; #endif /* __XEN_SMP_H__ */ xen-4.4.0/xen/include/xen/sched.h0000664000175000017500000006636712307313555014733 0ustar smbsmb #ifndef __SCHED_H__ #define __SCHED_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CONFIG_COMPAT #include DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t); #endif /* * Stats * * Enable and ease the use of scheduling related performance counters. * */ #ifdef PERF_COUNTERS #define SCHED_STATS #endif #define SCHED_STAT_CRANK(_X) (perfc_incr(_X)) /* A global pointer to the initial domain (DOM0). */ extern struct domain *dom0; #ifndef CONFIG_COMPAT #define BITS_PER_EVTCHN_WORD(d) BITS_PER_XEN_ULONG #else #define BITS_PER_EVTCHN_WORD(d) (has_32bit_shinfo(d) ? 32 : BITS_PER_XEN_ULONG) #endif #define BUCKETS_PER_GROUP (PAGE_SIZE/sizeof(struct evtchn *)) /* Round size of struct evtchn up to power of 2 size */ #define __RDU2(x) ( (x) | ( (x) >> 1)) #define __RDU4(x) ( __RDU2(x) | ( __RDU2(x) >> 2)) #define __RDU8(x) ( __RDU4(x) | ( __RDU4(x) >> 4)) #define __RDU16(x) ( __RDU8(x) | ( __RDU8(x) >> 8)) #define __RDU32(x) (__RDU16(x) | (__RDU16(x) >>16)) #define next_power_of_2(x) (__RDU32((x)-1) + 1) /* Maximum number of event channels for any ABI. */ #define MAX_NR_EVTCHNS MAX(EVTCHN_2L_NR_CHANNELS, EVTCHN_FIFO_NR_CHANNELS) #define EVTCHNS_PER_BUCKET (PAGE_SIZE / next_power_of_2(sizeof(struct evtchn))) #define EVTCHNS_PER_GROUP (BUCKETS_PER_GROUP * EVTCHNS_PER_BUCKET) #define NR_EVTCHN_GROUPS DIV_ROUND_UP(MAX_NR_EVTCHNS, EVTCHNS_PER_GROUP) struct evtchn { #define ECS_FREE 0 /* Channel is available for use. */ #define ECS_RESERVED 1 /* Channel is reserved. */ #define ECS_UNBOUND 2 /* Channel is waiting to bind to a remote domain. */ #define ECS_INTERDOMAIN 3 /* Channel is bound to another domain. */ #define ECS_PIRQ 4 /* Channel is bound to a physical IRQ line. */ #define ECS_VIRQ 5 /* Channel is bound to a virtual IRQ line. */ #define ECS_IPI 6 /* Channel is bound to a virtual IPI line. */ u8 state; /* ECS_* */ u8 xen_consumer; /* Consumer in Xen, if any? 
(0 = send to guest) */ u16 notify_vcpu_id; /* VCPU for local delivery notification */ u32 port; union { struct { domid_t remote_domid; } unbound; /* state == ECS_UNBOUND */ struct { u16 remote_port; struct domain *remote_dom; } interdomain; /* state == ECS_INTERDOMAIN */ struct { u16 irq; u16 next_port; u16 prev_port; } pirq; /* state == ECS_PIRQ */ u16 virq; /* state == ECS_VIRQ */ } u; u8 priority; u8 pending:1; u16 last_vcpu_id; u8 last_priority; #ifdef FLASK_ENABLE void *ssid; #endif }; int evtchn_init(struct domain *d); /* from domain_create */ void evtchn_destroy(struct domain *d); /* from domain_kill */ void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */ struct waitqueue_vcpu; struct vcpu { int vcpu_id; int processor; vcpu_info_t *vcpu_info; struct domain *domain; struct vcpu *next_in_list; s_time_t periodic_period; s_time_t periodic_last_event; struct timer periodic_timer; struct timer singleshot_timer; struct timer poll_timer; /* timeout for SCHEDOP_poll */ void *sched_priv; /* scheduler-specific data */ struct vcpu_runstate_info runstate; #ifndef CONFIG_COMPAT # define runstate_guest(v) ((v)->runstate_guest) XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */ #else # define runstate_guest(v) ((v)->runstate_guest.native) union { XEN_GUEST_HANDLE(vcpu_runstate_info_t) native; XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat; } runstate_guest; /* guest address */ #endif /* last time when vCPU is scheduled out */ uint64_t last_run_time; /* Has the FPU been initialised? */ bool_t fpu_initialised; /* Has the FPU been used since it was last saved? */ bool_t fpu_dirtied; /* Initialization completed for this VCPU? */ bool_t is_initialised; /* Currently running on a CPU? */ bool_t is_running; /* VCPU should wake fast (do not deep sleep the CPU). */ bool_t is_urgent; #ifdef VCPU_TRAP_LAST #define VCPU_TRAP_NONE 0 struct { bool_t pending; uint8_t old_mask; } async_exception_state[VCPU_TRAP_LAST]; #define async_exception_state(t) async_exception_state[(t)-1] uint8_t async_exception_mask; #endif /* Require shutdown to be deferred for some asynchronous operation? */ bool_t defer_shutdown; /* VCPU is paused following shutdown request (d->is_shutting_down)? */ bool_t paused_for_shutdown; /* VCPU need affinity restored */ bool_t affinity_broken; /* * > 0: a single port is being polled; * = 0: nothing is being polled (vcpu should be clear in d->poll_mask); * < 0: multiple ports may be being polled. */ int poll_evtchn; /* (over-)protected by ->domain->event_lock */ int pirq_evtchn_head; unsigned long pause_flags; atomic_t pause_count; /* IRQ-safe virq_lock protects against delivering VIRQ to stale evtchn. */ u16 virq_to_evtchn[NR_VIRQS]; spinlock_t virq_lock; /* Bitmask of CPUs on which this VCPU may run. */ cpumask_var_t cpu_affinity; /* Used to change affinity temporarily. */ cpumask_var_t cpu_affinity_tmp; /* Used to restore affinity across S3. */ cpumask_var_t cpu_affinity_saved; /* Bitmask of CPUs which are holding onto this VCPU's state. */ cpumask_var_t vcpu_dirty_cpumask; /* Tasklet for continue_hypercall_on_cpu(). */ struct tasklet continue_hypercall_tasklet; /* Multicall information. */ struct mc_state mc_state; struct waitqueue_vcpu *waitqueue_vcpu; /* Guest-specified relocation of vcpu_info. */ unsigned long vcpu_info_mfn; struct evtchn_fifo_vcpu *evtchn_fifo; struct arch_vcpu arch; }; /* Per-domain lock can be recursively acquired in fault handlers. 
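 *
 * Illustrative sketch (hypothetical critical section):
 *
 *   domain_lock(d);
 *   ... update state that must stay consistent for the whole domain ...
 *   domain_unlock(d);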
*/ #define domain_lock(d) spin_lock_recursive(&(d)->domain_lock) #define domain_unlock(d) spin_unlock_recursive(&(d)->domain_lock) #define domain_is_locked(d) spin_is_locked(&(d)->domain_lock) /* Memory event */ struct mem_event_domain { /* ring lock */ spinlock_t ring_lock; /* The ring has 64 entries */ unsigned char foreign_producers; unsigned char target_producers; /* shared ring page */ void *ring_page; struct page_info *ring_pg_struct; /* front-end ring */ mem_event_front_ring_t front_ring; /* event channel port (vcpu0 only) */ int xen_port; /* mem_event bit for vcpu->pause_flags */ int pause_flag; /* list of vcpus waiting for room in the ring */ struct waitqueue_head wq; /* the number of vCPUs blocked */ unsigned int blocked; /* The last vcpu woken up */ unsigned int last_vcpu_wake_up; }; struct mem_event_per_domain { /* Memory sharing support */ struct mem_event_domain share; /* Memory paging support */ struct mem_event_domain paging; /* Memory access support */ struct mem_event_domain access; }; struct evtchn_port_ops; /* * PVH is a PV guest running in an HVM container. is_hvm_* checks * will be false, but has_hvm_container_* checks will be true. */ enum guest_type { guest_type_pv, guest_type_pvh, guest_type_hvm }; struct domain { domid_t domain_id; shared_info_t *shared_info; /* shared data area */ spinlock_t domain_lock; spinlock_t page_alloc_lock; /* protects all the following fields */ struct page_list_head page_list; /* linked list */ struct page_list_head xenpage_list; /* linked list (size xenheap_pages) */ unsigned int tot_pages; /* number of pages currently possesed */ unsigned int outstanding_pages; /* pages claimed but not possessed */ unsigned int max_pages; /* maximum value for tot_pages */ atomic_t shr_pages; /* number of shared pages */ atomic_t paged_pages; /* number of paged-out pages */ unsigned int xenheap_pages; /* # pages allocated from Xen heap */ unsigned int max_vcpus; /* Scheduling. */ void *sched_priv; /* scheduler-specific data */ struct cpupool *cpupool; struct domain *next_in_list; struct domain *next_in_hashbucket; struct list_head rangesets; spinlock_t rangesets_lock; /* Event channel information. */ struct evtchn *evtchn; /* first bucket only */ struct evtchn **evtchn_group[NR_EVTCHN_GROUPS]; /* all other buckets */ unsigned int max_evtchns; unsigned int max_evtchn_port; spinlock_t event_lock; const struct evtchn_port_ops *evtchn_port_ops; struct evtchn_fifo_domain *evtchn_fifo; struct grant_table *grant_table; /* * Interrupt to event-channel mappings and other per-guest-pirq data. * Protected by the domain's event-channel spinlock. */ unsigned int nr_pirqs; struct radix_tree_root pirq_tree; /* I/O capabilities (access to IRQs and memory-mapped I/O). */ struct rangeset *iomem_caps; struct rangeset *irq_caps; enum guest_type guest_type; #ifdef HAS_PASSTHROUGH /* Does this guest need iommu mappings (-1 meaning "being set up")? */ s8 need_iommu; #endif /* is node-affinity automatically computed? */ bool_t auto_node_affinity; /* Is this guest fully privileged (aka dom0)? */ bool_t is_privileged; /* Which guest this guest has privileges on */ struct domain *target; /* Is this guest being debugged by dom0? */ bool_t debugger_attached; /* Is this guest dying (i.e., a zombie)? */ enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying; /* Domain is paused by controller software? */ bool_t is_paused_by_controller; /* Domain's VCPUs are pinned 1:1 to physical CPUs? */ bool_t is_pinned; /* Are any VCPUs polling event channels (SCHEDOP_poll)? 
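 *
 * (Stored inline as a bitmap when MAX_VIRT_CPUS fits in one unsigned
 * long, otherwise as a pointer to a separately allocated bitmap; see
 * the preprocessor test just below. One bit per VCPU.)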
*/ #if MAX_VIRT_CPUS <= BITS_PER_LONG DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS); #else unsigned long *poll_mask; #endif /* Guest has shut down (inc. reason code)? */ spinlock_t shutdown_lock; bool_t is_shutting_down; /* in process of shutting down? */ bool_t is_shut_down; /* fully shut down? */ int shutdown_code; /* If this is not 0, send suspend notification here instead of * raising DOM_EXC */ int suspend_evtchn; atomic_t pause_count; unsigned long vm_assist; atomic_t refcnt; struct vcpu **vcpu; /* Bitmask of CPUs which are holding onto this domain's state. */ cpumask_var_t domain_dirty_cpumask; struct arch_domain arch; void *ssid; /* sHype security subject identifier */ /* Control-plane tools handle for this domain. */ xen_domain_handle_t handle; /* hvm_print_line() and guest_console_write() logging. */ #define DOMAIN_PBUF_SIZE 80 char *pbuf; unsigned pbuf_idx; spinlock_t pbuf_lock; /* OProfile support. */ struct xenoprof *xenoprof; int32_t time_offset_seconds; /* Domain watchdog. */ #define NR_DOMAIN_WATCHDOG_TIMERS 2 spinlock_t watchdog_lock; uint32_t watchdog_inuse_map; struct timer watchdog_timer[NR_DOMAIN_WATCHDOG_TIMERS]; struct rcu_head rcu; /* * Hypercall deadlock avoidance lock. Used if a hypercall might * cause a deadlock. Acquirers don't spin waiting; they preempt. */ spinlock_t hypercall_deadlock_mutex; /* transcendent memory, auto-allocated on first tmem op by each domain */ struct client *tmem_client; struct lock_profile_qhead profile_head; /* Non-migratable and non-restoreable? */ bool_t disable_migrate; /* Various mem_events */ struct mem_event_per_domain *mem_event; /* * Can be specified by the user. If that is not the case, it is * computed from the union of all the vcpu cpu-affinity masks. */ nodemask_t node_affinity; unsigned int last_alloc_node; spinlock_t node_affinity_lock; }; struct domain_setup_info { /* Initialised by caller. */ unsigned long image_addr; unsigned long image_len; /* Initialised by loader: Public. */ unsigned long v_start; unsigned long v_end; unsigned long v_kernstart; unsigned long v_kernend; unsigned long v_kernentry; #define PAEKERN_no 0 #define PAEKERN_yes 1 #define PAEKERN_extended_cr3 2 #define PAEKERN_bimodal 3 unsigned int pae_kernel; /* Initialised by loader: Private. */ unsigned long elf_paddr_offset; unsigned int load_symtab; unsigned long symtab_addr; unsigned long symtab_len; }; /* Protect updates/reads (resp.) of domain_list and domain_hash. */ extern spinlock_t domlist_update_lock; extern rcu_read_lock_t domlist_read_lock; extern struct vcpu *idle_vcpu[NR_CPUS]; #define is_idle_domain(d) ((d)->domain_id == DOMID_IDLE) #define is_idle_vcpu(v) (is_idle_domain((v)->domain)) #define DOMAIN_DESTROYED (1<<31) /* assumes atomic_t is >= 32 bits */ #define put_domain(_d) \ if ( atomic_dec_and_test(&(_d)->refcnt) ) domain_destroy(_d) /* * Use this when you don't have an existing reference to @d. It returns * FALSE if @d is being destroyed. */ static always_inline int get_domain(struct domain *d) { atomic_t old, new, seen = d->refcnt; do { old = seen; if ( unlikely(_atomic_read(old) & DOMAIN_DESTROYED) ) return 0; _atomic_set(new, _atomic_read(old) + 1); seen = atomic_compareandswap(old, new, &d->refcnt); } while ( unlikely(_atomic_read(seen) != _atomic_read(old)) ); return 1; } /* * Use this when you already have, or are borrowing, a reference to @d. * In this case we know that @d cannot be destroyed under our feet. 
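 *
 * Illustrative contrast of the two reference-taking paths (the caller
 * context is assumed, not taken from the surrounding code):
 *
 *     if ( get_domain(d) )              - may fail: d could be dying
 *     {
 *         ... use d ...
 *         put_domain(d);
 *     }
 *
 *     get_knownalive_domain(d);         - never fails; only valid while
 *     ... use d ...                       another reference keeps d alive
 *     put_domain(d);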
*/ static inline void get_knownalive_domain(struct domain *d) { atomic_inc(&d->refcnt); ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED)); } int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity); void domain_update_node_affinity(struct domain *d); struct domain *domain_create( domid_t domid, unsigned int domcr_flags, uint32_t ssidref); /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */ #define _DOMCRF_hvm 0 #define DOMCRF_hvm (1U<<_DOMCRF_hvm) /* DOMCRF_hap: Create a domain with hardware-assisted paging. */ #define _DOMCRF_hap 1 #define DOMCRF_hap (1U<<_DOMCRF_hap) /* DOMCRF_s3_integrity: Create a domain with tboot memory integrity protection by tboot */ #define _DOMCRF_s3_integrity 2 #define DOMCRF_s3_integrity (1U<<_DOMCRF_s3_integrity) /* DOMCRF_dummy: Create a dummy domain (not scheduled; not on domain list) */ #define _DOMCRF_dummy 3 #define DOMCRF_dummy (1U<<_DOMCRF_dummy) /* DOMCRF_oos_off: dont use out-of-sync optimization for shadow page tables */ #define _DOMCRF_oos_off 4 #define DOMCRF_oos_off (1U<<_DOMCRF_oos_off) /* DOMCRF_pvh: Create PV domain in HVM container. */ #define _DOMCRF_pvh 5 #define DOMCRF_pvh (1U<<_DOMCRF_pvh) /* * rcu_lock_domain_by_id() is more efficient than get_domain_by_id(). * This is the preferred function if the returned domain reference * is short lived, but it cannot be used if the domain reference needs * to be kept beyond the current scope (e.g., across a softirq). * The returned domain reference must be discarded using rcu_unlock_domain(). */ struct domain *rcu_lock_domain_by_id(domid_t dom); /* * As above function, but resolves DOMID_SELF to current domain */ struct domain *rcu_lock_domain_by_any_id(domid_t dom); /* * As rcu_lock_domain_by_id(), but will fail EPERM or ESRCH rather than resolve * to local domain. */ int rcu_lock_remote_domain_by_id(domid_t dom, struct domain **d); /* * As rcu_lock_remote_domain_by_id() but will fail EINVAL if the domain is * dying. */ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d); /* Finish a RCU critical region started by rcu_lock_domain_by_id(). */ static inline void rcu_unlock_domain(struct domain *d) { if ( d != current->domain ) rcu_read_unlock(&domlist_read_lock); } static inline struct domain *rcu_lock_domain(struct domain *d) { if ( d != current->domain ) rcu_read_lock(d); return d; } static inline struct domain *rcu_lock_current_domain(void) { return /*rcu_lock_domain*/(current->domain); } struct domain *get_domain_by_id(domid_t dom); void domain_destroy(struct domain *d); int domain_kill(struct domain *d); void domain_shutdown(struct domain *d, u8 reason); void domain_resume(struct domain *d); void domain_pause_for_debugger(void); int vcpu_start_shutdown_deferral(struct vcpu *v); void vcpu_end_shutdown_deferral(struct vcpu *v); /* * Mark specified domain as crashed. This function always returns, even if the * caller is the specified domain. The domain is not synchronously descheduled * from any processor. */ void __domain_crash(struct domain *d); #define domain_crash(d) do { \ printk("domain_crash called from %s:%d\n", __FILE__, __LINE__); \ __domain_crash(d); \ } while (0) /* * Mark current domain as crashed and synchronously deschedule from the local * processor. This function never returns. 
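 *
 * An illustrative contrast with domain_crash() above (the error-path
 * context is assumed, not taken from the surrounding code):
 *
 *     domain_crash(d);               - any domain; returns, so the caller
 *                                      must still unwind and return
 *     domain_crash_synchronous();    - current domain only; never returns,
 *                                      so no code may follow it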
*/ void __domain_crash_synchronous(void) __attribute__((noreturn)); #define domain_crash_synchronous() do { \ printk("domain_crash_sync called from %s:%d\n", __FILE__, __LINE__); \ __domain_crash_synchronous(); \ } while (0) /* * Called from assembly code, with an optional address to help indicate why * the crash occured. If addr is 0, look up address from last extable * redirection. */ void asm_domain_crash_synchronous(unsigned long addr) __attribute__((noreturn)); #define set_current_state(_s) do { current->state = (_s); } while (0) void scheduler_init(void); int sched_init_vcpu(struct vcpu *v, unsigned int processor); void sched_destroy_vcpu(struct vcpu *v); int sched_init_domain(struct domain *d); void sched_destroy_domain(struct domain *d); int sched_move_domain(struct domain *d, struct cpupool *c); long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *); long sched_adjust_global(struct xen_sysctl_scheduler_op *); void sched_set_node_affinity(struct domain *, nodemask_t *); int sched_id(void); void sched_tick_suspend(void); void sched_tick_resume(void); void vcpu_wake(struct vcpu *v); void vcpu_sleep_nosync(struct vcpu *v); void vcpu_sleep_sync(struct vcpu *v); /* * Force synchronisation of given VCPU's state. If it is currently descheduled, * this call will ensure that all its state is committed to memory and that * no CPU is using critical state (e.g., page tables) belonging to the VCPU. */ void sync_vcpu_execstate(struct vcpu *v); /* As above, for any lazy state being held on the local CPU. */ void sync_local_execstate(void); /* * Called by the scheduler to switch to another VCPU. This function must * call context_saved(@prev) when the local CPU is no longer running in * @prev's context, and that context is saved to memory. Alternatively, if * implementing lazy context switching, it suffices to ensure that invoking * sync_vcpu_execstate() will switch and commit @prev's state. */ void context_switch( struct vcpu *prev, struct vcpu *next); /* * As described above, context_switch() must call this function when the * local CPU is no longer running in @prev's context, and @prev's context is * saved to memory. Alternatively, if implementing lazy context switching, * ensure that invoking sync_vcpu_execstate() will switch and commit @prev. */ void context_saved(struct vcpu *prev); /* Called by the scheduler to continue running the current VCPU. */ void continue_running( struct vcpu *same); void startup_cpu_idle_loop(void); extern void (*pm_idle) (void); extern void (*dead_idle) (void); /* * Creates a continuation to resume the current hypercall. The caller should * return immediately, propagating the value returned from this invocation. * The format string specifies the types and number of hypercall arguments. * It contains one character per argument as follows: * 'i' [unsigned] {char, int} * 'l' [unsigned] long * 'h' guest handle (XEN_GUEST_HANDLE(foo)) */ unsigned long hypercall_create_continuation( unsigned int op, const char *format, ...); void hypercall_cancel_continuation(void); #define hypercall_preempt_check() (unlikely( \ softirq_pending(smp_processor_id()) | \ local_events_need_delivery() \ )) extern struct domain *domain_list; /* Caller must hold the domlist_read_lock or domlist_update_lock. 
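 *
 * A minimal read-side sketch, assuming no domain pointer is kept after
 * the loop (not taken from the surrounding code):
 *
 *     struct domain *d;
 *
 *     rcu_read_lock(&domlist_read_lock);
 *     for_each_domain ( d )
 *         ... inspect d; do not retain the pointer ...
 *     rcu_read_unlock(&domlist_read_lock);
 *
 * Keeping a reference beyond the locked region requires get_domain()
 * (or one of the rcu_lock_domain_by_id() helpers) instead.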
*/ static inline struct domain *first_domain_in_cpupool( struct cpupool *c) { struct domain *d; for (d = rcu_dereference(domain_list); d && d->cpupool != c; d = rcu_dereference(d->next_in_list)); return d; } static inline struct domain *next_domain_in_cpupool( struct domain *d, struct cpupool *c) { for (d = rcu_dereference(d->next_in_list); d && d->cpupool != c; d = rcu_dereference(d->next_in_list)); return d; } #define for_each_domain(_d) \ for ( (_d) = rcu_dereference(domain_list); \ (_d) != NULL; \ (_d) = rcu_dereference((_d)->next_in_list )) \ #define for_each_domain_in_cpupool(_d,_c) \ for ( (_d) = first_domain_in_cpupool(_c); \ (_d) != NULL; \ (_d) = next_domain_in_cpupool((_d), (_c))) #define for_each_vcpu(_d,_v) \ for ( (_v) = (_d)->vcpu ? (_d)->vcpu[0] : NULL; \ (_v) != NULL; \ (_v) = (_v)->next_in_list ) /* * Per-VCPU pause flags. */ /* Domain is blocked waiting for an event. */ #define _VPF_blocked 0 #define VPF_blocked (1UL<<_VPF_blocked) /* VCPU is offline. */ #define _VPF_down 1 #define VPF_down (1UL<<_VPF_down) /* VCPU is blocked awaiting an event to be consumed by Xen. */ #define _VPF_blocked_in_xen 2 #define VPF_blocked_in_xen (1UL<<_VPF_blocked_in_xen) /* VCPU affinity has changed: migrating to a new CPU. */ #define _VPF_migrating 3 #define VPF_migrating (1UL<<_VPF_migrating) /* VCPU is blocked due to missing mem_paging ring. */ #define _VPF_mem_paging 4 #define VPF_mem_paging (1UL<<_VPF_mem_paging) /* VCPU is blocked due to missing mem_access ring. */ #define _VPF_mem_access 5 #define VPF_mem_access (1UL<<_VPF_mem_access) /* VCPU is blocked due to missing mem_sharing ring. */ #define _VPF_mem_sharing 6 #define VPF_mem_sharing (1UL<<_VPF_mem_sharing) /* VCPU is being reset. */ #define _VPF_in_reset 7 #define VPF_in_reset (1UL<<_VPF_in_reset) static inline int vcpu_runnable(struct vcpu *v) { return !(v->pause_flags | atomic_read(&v->pause_count) | atomic_read(&v->domain->pause_count)); } void vcpu_block(void); void vcpu_unblock(struct vcpu *v); void vcpu_pause(struct vcpu *v); void vcpu_pause_nosync(struct vcpu *v); void domain_pause(struct domain *d); void domain_pause_nosync(struct domain *d); void vcpu_unpause(struct vcpu *v); void domain_unpause(struct domain *d); void domain_pause_by_systemcontroller(struct domain *d); void domain_unpause_by_systemcontroller(struct domain *d); void cpu_init(void); struct scheduler; struct scheduler *scheduler_get_default(void); struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr); void scheduler_free(struct scheduler *sched); int schedule_cpu_switch(unsigned int cpu, struct cpupool *c); void vcpu_force_reschedule(struct vcpu *v); int cpu_disable_scheduler(unsigned int cpu); int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity); void restore_vcpu_affinity(struct domain *d); void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate); uint64_t get_cpu_idle_time(unsigned int cpu); /* * Used by idle loop to decide whether there is work to do: * (1) Run softirqs; or (2) Play dead; or (3) Run tasklets. 
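 *
 * An illustrative idle-loop shape, assuming the usual softirq/tasklet
 * processing around it (this is not the actual implementation):
 *
 *     for ( ; ; )
 *     {
 *         if ( cpu_is_haltable(smp_processor_id()) )
 *             (*pm_idle)();             - halt until the next interrupt
 *         ... run pending softirqs and tasklets ...
 *     }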
*/ #define cpu_is_haltable(cpu) \ (!softirq_pending(cpu) && \ cpu_online(cpu) && \ !per_cpu(tasklet_work_to_do, cpu)) void watchdog_domain_init(struct domain *d); void watchdog_domain_destroy(struct domain *d); /* * Use this check when the following are both true: * - Using this feature or interface requires full access to the hardware * (that is, this is would not be suitable for a driver domain) * - There is never a reason to deny dom0 access to this */ #define is_hardware_domain(_d) ((_d)->is_privileged) /* This check is for functionality specific to a control domain */ #define is_control_domain(_d) ((_d)->is_privileged) #define VM_ASSIST(_d,_t) (test_bit((_t), &(_d)->vm_assist)) #define is_pv_domain(d) ((d)->guest_type == guest_type_pv) #define is_pv_vcpu(v) (is_pv_domain((v)->domain)) #define is_pvh_domain(d) ((d)->guest_type == guest_type_pvh) #define is_pvh_vcpu(v) (is_pvh_domain((v)->domain)) #define is_hvm_domain(d) ((d)->guest_type == guest_type_hvm) #define is_hvm_vcpu(v) (is_hvm_domain(v->domain)) #define has_hvm_container_domain(d) ((d)->guest_type != guest_type_pv) #define has_hvm_container_vcpu(v) (has_hvm_container_domain((v)->domain)) #define is_pinned_vcpu(v) ((v)->domain->is_pinned || \ cpumask_weight((v)->cpu_affinity) == 1) #ifdef HAS_PASSTHROUGH #define need_iommu(d) ((d)->need_iommu) #else #define need_iommu(d) (0) #endif void set_vcpu_migration_delay(unsigned int delay); unsigned int get_vcpu_migration_delay(void); extern bool_t sched_smt_power_savings; extern enum cpufreq_controller { FREQCTL_none, FREQCTL_dom0_kernel, FREQCTL_xen } cpufreq_controller; #define CPUPOOLID_NONE -1 struct cpupool *cpupool_get_by_id(int poolid); void cpupool_put(struct cpupool *pool); int cpupool_add_domain(struct domain *d, int poolid); void cpupool_rm_domain(struct domain *d); int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op); void schedule_dump(struct cpupool *c); extern void dump_runq(unsigned char key); #define num_cpupool_cpus(c) cpumask_weight((c)->cpu_valid) void arch_do_physinfo(xen_sysctl_physinfo_t *pi); #endif /* __SCHED_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/xen/rcupdate.h0000664000175000017500000001255412307313555015441 0ustar smbsmb/* * Read-Copy Update mechanism for mutual exclusion * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Copyright (C) IBM Corporation, 2001 * * Author: Dipankar Sarma * * Based on the original work by Paul McKenney * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
* Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - * http://lse.sourceforge.net/locking/rcupdate.html */ #ifndef __XEN_RCUPDATE_H #define __XEN_RCUPDATE_H #include #include #include #include #include #define __rcu /** * struct rcu_head - callback structure for use with RCU * @next: next update requests in a list * @func: actual update function to call after the grace period. */ struct rcu_head { struct rcu_head *next; void (*func)(struct rcu_head *head); }; #define RCU_HEAD_INIT { .next = NULL, .func = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT #define INIT_RCU_HEAD(ptr) do { \ (ptr)->next = NULL; (ptr)->func = NULL; \ } while (0) int rcu_pending(int cpu); int rcu_needs_cpu(int cpu); /* * Dummy lock type for passing to rcu_read_{lock,unlock}. Currently exists * only to document the reason for rcu_read_lock() critical sections. */ struct _rcu_read_lock {}; typedef struct _rcu_read_lock rcu_read_lock_t; #define DEFINE_RCU_READ_LOCK(x) rcu_read_lock_t x /** * rcu_read_lock - mark the beginning of an RCU read-side critical section. * * When call_rcu() is invoked * on one CPU while other CPUs are within RCU read-side critical * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * * Note, however, that RCU callbacks are permitted to run concurrently * with RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU * callback is invoked. This is legal, because the RCU read-side critical * section that was running concurrently with the call_rcu() (and which * therefore might be referencing something that the corresponding RCU * callback would free up) has completed before the corresponding * RCU callback is invoked. * * RCU read-side critical sections may be nested. Any deferred actions * will be deferred until the outermost RCU read-side critical section * completes. * * It is illegal to block while in an RCU read-side critical section. */ #define rcu_read_lock(x) ({ ((void)(x)); preempt_disable(); }) /** * rcu_read_unlock - marks the end of an RCU read-side critical section. * * See rcu_read_lock() for more information. */ #define rcu_read_unlock(x) ({ ((void)(x)); preempt_enable(); }) /* * So where is rcu_write_lock()? It does not exist, as there is no * way for writers to lock out RCU readers. This is a feature, not * a bug -- this property is what provides RCU's performance benefits. * Of course, writers must coordinate with each other. The normal * spinlock primitives work well for this, but any other technique may be * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ /** * rcu_dereference - fetch an RCU-protected pointer in an * RCU read-side critical section. This pointer may later * be safely dereferenced. * * Inserts memory barriers on architectures that require them * (currently only the Alpha), and, more importantly, documents * exactly which pointers are protected by RCU. 
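 *
 * A minimal reader/updater sketch; "gp", "gp_lock", "old_p" and
 * "free_old" are illustrative names, and the pointed-to structure is
 * assumed to embed a struct rcu_head named "rcu":
 *
 *     Reader:
 *         rcu_read_lock(&gp_lock);          - DEFINE_RCU_READ_LOCK(gp_lock)
 *         p = rcu_dereference(gp);
 *         if ( p )
 *             ... read fields of p; do not block ...
 *         rcu_read_unlock(&gp_lock);
 *
 *     Updater:
 *         rcu_assign_pointer(gp, new_p);
 *         call_rcu(&old_p->rcu, free_old);  - reclaimed after a grace period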
*/ #define rcu_dereference(p) (p) /** * rcu_assign_pointer - assign (publicize) a pointer to a newly * initialized structure that will be dereferenced by RCU read-side * critical sections. Returns the value assigned. * * Inserts memory barriers on architectures that require them * (pretty much all of them other than x86), and also prevents * the compiler from reordering the code that initializes the * structure after the pointer assignment. More importantly, this * call documents which pointers will be dereferenced by RCU read-side * code. */ #define rcu_assign_pointer(p, v) ({ smp_wmb(); (p) = (v); }) void rcu_init(void); void rcu_check_callbacks(int cpu); /* Exported interfaces */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head)); int rcu_barrier(void); #endif /* __XEN_RCUPDATE_H */ xen-4.4.0/xen/include/xen/string.h0000664000175000017500000000525612307313555015141 0ustar smbsmb#ifndef _LINUX_STRING_H_ #define _LINUX_STRING_H_ #include /* for size_t */ #ifdef __cplusplus extern "C" { #endif #define __kernel_size_t size_t extern char * strpbrk(const char *,const char *); extern char * strsep(char **,const char *); extern __kernel_size_t strspn(const char *,const char *); /* * Include machine specific inline routines */ #include /* * These string functions are considered too dangerous for normal use. * Use safe_strcpy(), safe_strcat(), strlcpy(), strlcat() as appropriate. */ #define strcpy __xen_has_no_strcpy__ #define strcat __xen_has_no_strcat__ #define strncpy __xen_has_no_strncpy__ #define strncat __xen_has_no_strncat__ #ifndef __HAVE_ARCH_STRLCPY extern size_t strlcpy(char *,const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRLCAT extern size_t strlcat(char *,const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRCMP extern int strcmp(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRNCMP extern int strncmp(const char *,const char *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_STRNICMP extern int strnicmp(const char *, const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRCASECMP extern int strcasecmp(const char *, const char *); #endif #ifndef __HAVE_ARCH_STRCHR extern char * strchr(const char *,int); #endif #ifndef __HAVE_ARCH_STRRCHR extern char * strrchr(const char *,int); #endif #ifndef __HAVE_ARCH_STRSTR extern char * strstr(const char *,const char *); #endif #ifndef __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); #endif #ifndef __HAVE_ARCH_STRNLEN extern __kernel_size_t strnlen(const char *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSET extern void * memset(void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCPY extern void * memcpy(void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMMOVE extern void * memmove(void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMSCAN extern void * memscan(void *,int,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCMP extern int memcmp(const void *,const void *,__kernel_size_t); #endif #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif #ifdef __cplusplus } #endif #define is_char_array(x) __builtin_types_compatible_p(typeof(x), char[]) /* safe_xxx always NUL-terminates and returns !=0 if result is truncated. 
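 *
 * A short usage sketch ("name" and "src" are illustrative):
 *
 *     char name[16];
 *
 *     if ( safe_strcpy(name, src) )
 *         ... the copy was truncated, but name is still NUL-terminated ...
 *
 * Both macros require a true char array as the destination; passing a
 * pointer trips the BUILD_BUG_ON() at compile time.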
*/ #define safe_strcpy(d, s) ({ \ BUILD_BUG_ON(!is_char_array(d)); \ (strlcpy(d, s, sizeof(d)) >= sizeof(d)); \ }) #define safe_strcat(d, s) ({ \ BUILD_BUG_ON(!is_char_array(d)); \ (strlcat(d, s, sizeof(d)) >= sizeof(d)); \ }) #endif /* _LINUX_STRING_H_ */ xen-4.4.0/xen/include/xen/ctype.h0000664000175000017500000000257512307313555014760 0ustar smbsmb#ifndef _LINUX_CTYPE_H #define _LINUX_CTYPE_H /* * NOTE! This ctype does not handle EOF like the standard C * library is required to. */ #define _U 0x01 /* upper */ #define _L 0x02 /* lower */ #define _D 0x04 /* digit */ #define _C 0x08 /* cntrl */ #define _P 0x10 /* punct */ #define _S 0x20 /* white space (space/lf/tab) */ #define _X 0x40 /* hex digit */ #define _SP 0x80 /* hard space (0x20) */ extern const unsigned char _ctype[]; #define __ismask(x) (_ctype[(int)(unsigned char)(x)]) #define isalnum(c) ((__ismask(c)&(_U|_L|_D)) != 0) #define isalpha(c) ((__ismask(c)&(_U|_L)) != 0) #define iscntrl(c) ((__ismask(c)&(_C)) != 0) #define isdigit(c) ((__ismask(c)&(_D)) != 0) #define isgraph(c) ((__ismask(c)&(_P|_U|_L|_D)) != 0) #define islower(c) ((__ismask(c)&(_L)) != 0) #define isprint(c) ((__ismask(c)&(_P|_U|_L|_D|_SP)) != 0) #define ispunct(c) ((__ismask(c)&(_P)) != 0) #define isspace(c) ((__ismask(c)&(_S)) != 0) #define isupper(c) ((__ismask(c)&(_U)) != 0) #define isxdigit(c) ((__ismask(c)&(_D|_X)) != 0) #define isascii(c) (((unsigned char)(c))<=0x7f) #define toascii(c) (((unsigned char)(c))&0x7f) static inline unsigned char __tolower(unsigned char c) { if (isupper(c)) c -= 'A'-'a'; return c; } static inline unsigned char __toupper(unsigned char c) { if (islower(c)) c -= 'a'-'A'; return c; } #define tolower(c) __tolower(c) #define toupper(c) __toupper(c) #endif xen-4.4.0/xen/include/xen/inttypes.h0000664000175000017500000001467412307313555015516 0ustar smbsmb/* Copyright (C) 1997-2001, 2004 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ /* * ISO C99: 7.8 Format conversion of integer types */ #ifndef _XEN_INTTYPES_H #define _XEN_INTTYPES_H 1 #include # if BITS_PER_LONG == 64 # define __PRI64_PREFIX "l" # define __PRIPTR_PREFIX "l" # else # define __PRI64_PREFIX "ll" # define __PRIPTR_PREFIX # endif /* Macros for printing format specifiers. */ /* Decimal notation. 
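 *
 * These expand to the length modifier needed by printk()-style format
 * strings, e.g. (variable and message are illustrative):
 *
 *     uint64_t val = 42;
 *     printk("val = %" PRIu64 " (hex %" PRIx64 ")\n", val, val);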
*/ # define PRId8 "d" # define PRId16 "d" # define PRId32 "d" # define PRId64 __PRI64_PREFIX "d" # define PRIdLEAST8 "d" # define PRIdLEAST16 "d" # define PRIdLEAST32 "d" # define PRIdLEAST64 __PRI64_PREFIX "d" # define PRIdFAST8 "d" # define PRIdFAST16 __PRIPTR_PREFIX "d" # define PRIdFAST32 __PRIPTR_PREFIX "d" # define PRIdFAST64 __PRI64_PREFIX "d" # define PRIi8 "i" # define PRIi16 "i" # define PRIi32 "i" # define PRIi64 __PRI64_PREFIX "i" # define PRIiLEAST8 "i" # define PRIiLEAST16 "i" # define PRIiLEAST32 "i" # define PRIiLEAST64 __PRI64_PREFIX "i" # define PRIiFAST8 "i" # define PRIiFAST16 __PRIPTR_PREFIX "i" # define PRIiFAST32 __PRIPTR_PREFIX "i" # define PRIiFAST64 __PRI64_PREFIX "i" /* Octal notation. */ # define PRIo8 "o" # define PRIo16 "o" # define PRIo32 "o" # define PRIo64 __PRI64_PREFIX "o" # define PRIoLEAST8 "o" # define PRIoLEAST16 "o" # define PRIoLEAST32 "o" # define PRIoLEAST64 __PRI64_PREFIX "o" # define PRIoFAST8 "o" # define PRIoFAST16 __PRIPTR_PREFIX "o" # define PRIoFAST32 __PRIPTR_PREFIX "o" # define PRIoFAST64 __PRI64_PREFIX "o" /* Unsigned integers. */ # define PRIu8 "u" # define PRIu16 "u" # define PRIu32 "u" # define PRIu64 __PRI64_PREFIX "u" # define PRIuLEAST8 "u" # define PRIuLEAST16 "u" # define PRIuLEAST32 "u" # define PRIuLEAST64 __PRI64_PREFIX "u" # define PRIuFAST8 "u" # define PRIuFAST16 __PRIPTR_PREFIX "u" # define PRIuFAST32 __PRIPTR_PREFIX "u" # define PRIuFAST64 __PRI64_PREFIX "u" /* lowercase hexadecimal notation. */ # define PRIx8 "x" # define PRIx16 "x" # define PRIx32 "x" # define PRIx64 __PRI64_PREFIX "x" # define PRIxLEAST8 "x" # define PRIxLEAST16 "x" # define PRIxLEAST32 "x" # define PRIxLEAST64 __PRI64_PREFIX "x" # define PRIxFAST8 "x" # define PRIxFAST16 __PRIPTR_PREFIX "x" # define PRIxFAST32 __PRIPTR_PREFIX "x" # define PRIxFAST64 __PRI64_PREFIX "x" /* UPPERCASE hexadecimal notation. */ # define PRIX8 "X" # define PRIX16 "X" # define PRIX32 "X" # define PRIX64 __PRI64_PREFIX "X" # define PRIXLEAST8 "X" # define PRIXLEAST16 "X" # define PRIXLEAST32 "X" # define PRIXLEAST64 __PRI64_PREFIX "X" # define PRIXFAST8 "X" # define PRIXFAST16 __PRIPTR_PREFIX "X" # define PRIXFAST32 __PRIPTR_PREFIX "X" # define PRIXFAST64 __PRI64_PREFIX "X" /* Macros for printing `intmax_t' and `uintmax_t'. */ # define PRIdMAX __PRI64_PREFIX "d" # define PRIiMAX __PRI64_PREFIX "i" # define PRIoMAX __PRI64_PREFIX "o" # define PRIuMAX __PRI64_PREFIX "u" # define PRIxMAX __PRI64_PREFIX "x" # define PRIXMAX __PRI64_PREFIX "X" /* Macros for printing `intptr_t' and `uintptr_t'. */ # define PRIdPTR __PRIPTR_PREFIX "d" # define PRIiPTR __PRIPTR_PREFIX "i" # define PRIoPTR __PRIPTR_PREFIX "o" # define PRIuPTR __PRIPTR_PREFIX "u" # define PRIxPTR __PRIPTR_PREFIX "x" # define PRIXPTR __PRIPTR_PREFIX "X" /* Macros for scanning format specifiers. */ /* Signed decimal notation. */ # define SCNd8 "hhd" # define SCNd16 "hd" # define SCNd32 "d" # define SCNd64 __PRI64_PREFIX "d" # define SCNdLEAST8 "hhd" # define SCNdLEAST16 "hd" # define SCNdLEAST32 "d" # define SCNdLEAST64 __PRI64_PREFIX "d" # define SCNdFAST8 "hhd" # define SCNdFAST16 __PRIPTR_PREFIX "d" # define SCNdFAST32 __PRIPTR_PREFIX "d" # define SCNdFAST64 __PRI64_PREFIX "d" /* Signed decimal notation. 
*/ # define SCNi8 "hhi" # define SCNi16 "hi" # define SCNi32 "i" # define SCNi64 __PRI64_PREFIX "i" # define SCNiLEAST8 "hhi" # define SCNiLEAST16 "hi" # define SCNiLEAST32 "i" # define SCNiLEAST64 __PRI64_PREFIX "i" # define SCNiFAST8 "hhi" # define SCNiFAST16 __PRIPTR_PREFIX "i" # define SCNiFAST32 __PRIPTR_PREFIX "i" # define SCNiFAST64 __PRI64_PREFIX "i" /* Unsigned decimal notation. */ # define SCNu8 "hhu" # define SCNu16 "hu" # define SCNu32 "u" # define SCNu64 __PRI64_PREFIX "u" # define SCNuLEAST8 "hhu" # define SCNuLEAST16 "hu" # define SCNuLEAST32 "u" # define SCNuLEAST64 __PRI64_PREFIX "u" # define SCNuFAST8 "hhu" # define SCNuFAST16 __PRIPTR_PREFIX "u" # define SCNuFAST32 __PRIPTR_PREFIX "u" # define SCNuFAST64 __PRI64_PREFIX "u" /* Octal notation. */ # define SCNo8 "hho" # define SCNo16 "ho" # define SCNo32 "o" # define SCNo64 __PRI64_PREFIX "o" # define SCNoLEAST8 "hho" # define SCNoLEAST16 "ho" # define SCNoLEAST32 "o" # define SCNoLEAST64 __PRI64_PREFIX "o" # define SCNoFAST8 "hho" # define SCNoFAST16 __PRIPTR_PREFIX "o" # define SCNoFAST32 __PRIPTR_PREFIX "o" # define SCNoFAST64 __PRI64_PREFIX "o" /* Hexadecimal notation. */ # define SCNx8 "hhx" # define SCNx16 "hx" # define SCNx32 "x" # define SCNx64 __PRI64_PREFIX "x" # define SCNxLEAST8 "hhx" # define SCNxLEAST16 "hx" # define SCNxLEAST32 "x" # define SCNxLEAST64 __PRI64_PREFIX "x" # define SCNxFAST8 "hhx" # define SCNxFAST16 __PRIPTR_PREFIX "x" # define SCNxFAST32 __PRIPTR_PREFIX "x" # define SCNxFAST64 __PRI64_PREFIX "x" /* Macros for scanning `intmax_t' and `uintmax_t'. */ # define SCNdMAX __PRI64_PREFIX "d" # define SCNiMAX __PRI64_PREFIX "i" # define SCNoMAX __PRI64_PREFIX "o" # define SCNuMAX __PRI64_PREFIX "u" # define SCNxMAX __PRI64_PREFIX "x" /* Macros for scaning `intptr_t' and `uintptr_t'. */ # define SCNdPTR __PRIPTR_PREFIX "d" # define SCNiPTR __PRIPTR_PREFIX "i" # define SCNoPTR __PRIPTR_PREFIX "o" # define SCNuPTR __PRIPTR_PREFIX "u" # define SCNxPTR __PRIPTR_PREFIX "x" #endif /* _XEN_INTTYPES_H */ xen-4.4.0/xen/include/xen/rbtree.h0000664000175000017500000000532712307313555015115 0ustar smbsmb/* Red Black Trees (C) 1999 Andrea Arcangeli This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef __RBTREE_H__ #define __RBTREE_H__ struct rb_node { unsigned long rb_parent_color; #define RB_RED 0 #define RB_BLACK 1 struct rb_node *rb_right; struct rb_node *rb_left; }; struct rb_root { struct rb_node *rb_node; }; #define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) #define rb_color(r) ((r)->rb_parent_color & 1) #define rb_is_red(r) (!rb_color(r)) #define rb_is_black(r) rb_color(r) #define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) #define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; } static inline void rb_set_color(struct rb_node *rb, int color) { rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; } #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) #define RB_EMPTY_NODE(node) (rb_parent(node) == node) #define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(struct rb_node *); extern struct rb_node *rb_prev(struct rb_node *); extern struct rb_node *rb_first(struct rb_root *); extern struct rb_node *rb_last(struct rb_root *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, struct rb_node ** rb_link) { node->rb_parent_color = (unsigned long )parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } #endif /* __RBTREE_H__ */ xen-4.4.0/xen/include/xen/preempt.h0000664000175000017500000000161712307313555015304 0ustar smbsmb/****************************************************************************** * preempt.h * * Track atomic regions in the hypervisor which disallow sleeping. 
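 *
 * A minimal usage sketch (the critical section is assumed, not taken
 * from this file):
 *
 *     preempt_disable();
 *     ... per-CPU work that must not sleep ...
 *     preempt_enable();
 *
 * Code that may sleep can call ASSERT_NOT_IN_ATOMIC() (a no-op in
 * non-debug builds) to catch callers arriving from such a region.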
* * Copyright (c) 2010, Keir Fraser */ #ifndef __XEN_PREEMPT_H__ #define __XEN_PREEMPT_H__ #include #include DECLARE_PER_CPU(unsigned int, __preempt_count); #define preempt_count() (this_cpu(__preempt_count)) #define preempt_disable() do { \ preempt_count()++; \ barrier(); \ } while (0) #define preempt_enable() do { \ barrier(); \ preempt_count()--; \ } while (0) bool_t in_atomic(void); #ifndef NDEBUG void ASSERT_NOT_IN_ATOMIC(void); #else #define ASSERT_NOT_IN_ATOMIC() ((void)0) #endif #endif /* __XEN_PREEMPT_H__ */ xen-4.4.0/xen/include/xen/types.h0000664000175000017500000000302312307313555014765 0ustar smbsmb#ifndef __TYPES_H__ #define __TYPES_H__ #include #define BITS_TO_LONGS(bits) \ (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] #ifndef NULL #define NULL ((void*)0) #endif #define INT_MAX ((int)(~0U>>1)) #define INT_MIN (-INT_MAX - 1) #define UINT_MAX (~0U) #define LONG_MAX ((long)(~0UL>>1)) #define LONG_MIN (-LONG_MAX - 1) #define ULONG_MAX (~0UL) /* bsd */ typedef unsigned char u_char; typedef unsigned short u_short; typedef unsigned int u_int; typedef unsigned long u_long; /* sysv */ typedef unsigned char unchar; typedef unsigned short ushort; typedef unsigned int uint; typedef unsigned long ulong; typedef __u8 uint8_t; typedef __u8 u_int8_t; typedef __s8 int8_t; typedef __u16 uint16_t; typedef __u16 u_int16_t; typedef __s16 int16_t; typedef __u32 uint32_t; typedef __u32 u_int32_t; typedef __s32 int32_t; typedef __u64 uint64_t; typedef __u64 u_int64_t; typedef __s64 int64_t; struct domain; struct vcpu; typedef __u16 __le16; typedef __u16 __be16; typedef __u32 __le32; typedef __u32 __be32; typedef __u64 __le64; typedef __u64 __be64; typedef unsigned long uintptr_t; #endif /* __TYPES_H__ */ xen-4.4.0/xen/include/xen/acpi.h0000664000175000017500000001151512307313555014542 0ustar smbsmb/* * acpi.h - ACPI Interface * * Copyright (C) 2001 Paul Diefenbaugh * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H #ifndef _LINUX #define _LINUX #endif #include #include #include #define ACPI_MADT_GET_(fld, x) (((x) & ACPI_MADT_##fld##_MASK) / \ (ACPI_MADT_##fld##_MASK & -ACPI_MADT_##fld##_MASK)) #define ACPI_MADT_GET_POLARITY(inti) ACPI_MADT_GET_(POLARITY, inti) #define ACPI_MADT_GET_TRIGGER(inti) ACPI_MADT_GET_(TRIGGER, inti) #ifdef CONFIG_ACPI_BOOT enum acpi_interrupt_id { ACPI_INTERRUPT_PMI = 1, ACPI_INTERRUPT_INIT, ACPI_INTERRUPT_CPEI, ACPI_INTERRUPT_COUNT }; typedef int (*acpi_madt_entry_handler) (struct acpi_subtable_header *header, const unsigned long end); typedef int (*acpi_table_handler) (struct acpi_table_header *table); typedef int (*acpi_table_entry_handler) (struct acpi_subtable_header *header, const unsigned long end); unsigned int acpi_get_processor_id (unsigned int cpu); char * __acpi_map_table (paddr_t phys_addr, unsigned long size); int acpi_boot_init (void); int acpi_boot_table_init (void); int acpi_numa_init (void); int erst_init(void); int acpi_table_init (void); int acpi_table_parse(char *id, acpi_table_handler handler); int acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, acpi_table_entry_handler handler, unsigned int max_entries); int acpi_table_parse_madt(enum acpi_madt_type id, acpi_table_entry_handler handler, unsigned int max_entries); int acpi_table_parse_srat(int id, acpi_madt_entry_handler handler, unsigned int max_entries); int acpi_parse_srat(struct acpi_table_header *); void acpi_table_print (struct acpi_table_header *header, unsigned long phys_addr); void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); void acpi_table_print_srat_entry (struct acpi_subtable_header *srat); /* the following four functions are architecture-dependent */ void acpi_numa_slit_init (struct acpi_table_slit *slit); void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); void acpi_numa_arch_fixup(void); #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ int acpi_map_lsapic(acpi_handle handle, int *pcpu); int acpi_unmap_lsapic(int cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ extern int acpi_mp_config; extern u32 pci_mmcfg_base_addr; #else /*!CONFIG_ACPI_BOOT*/ #define acpi_mp_config 0 static inline int acpi_boot_init(void) { return 0; } static inline int acpi_boot_table_init(void) { return 0; } #endif /*!CONFIG_ACPI_BOOT*/ int get_cpu_id(u32 acpi_id); unsigned int acpi_register_gsi (u32 gsi, int edge_level, int active_high_low); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); /* * This function undoes the effect of one call to acpi_register_gsi(). * If this matches the last registration, any IRQ resources for gsi * are freed. */ #ifdef CONFIG_ACPI_DEALLOCATE_IRQ void acpi_unregister_gsi (u32 gsi); #endif #ifdef CONFIG_ACPI_CSTATE /* * Set highest legal C-state * 0: C0 okay, but not C1 * 1: C1 okay, but not C2 * 2: C2 okay, but not C3 etc. 
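 *
 * For example, restricting the platform to C1 at most:
 *
 *     acpi_set_cstate_limit(1);      - C1 allowed, C2 and deeper are not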
*/ extern unsigned int max_cstate; static inline unsigned int acpi_get_cstate_limit(void) { return max_cstate; } static inline void acpi_set_cstate_limit(unsigned int new_limit) { max_cstate = new_limit; return; } #else static inline unsigned int acpi_get_cstate_limit(void) { return 0; } static inline void acpi_set_cstate_limit(unsigned int new_limit) { return; } #endif #ifdef XEN_GUEST_HANDLE_PARAM int acpi_set_pdc_bits(u32 acpi_id, XEN_GUEST_HANDLE_PARAM(uint32)); #endif int arch_acpi_set_pdc_bits(u32 acpi_id, u32 *, u32 mask); #ifdef CONFIG_ACPI_NUMA int acpi_get_pxm(acpi_handle handle); #else static inline int acpi_get_pxm(acpi_handle handle) { return 0; } #endif void acpi_reboot(void); void acpi_dmar_zap(void); void acpi_dmar_reinstate(void); #endif /*_LINUX_ACPI_H*/ xen-4.4.0/xen/include/xen/sort.h0000664000175000017500000000036312307313555014614 0ustar smbsmb#ifndef __XEN_SORT_H__ #define __XEN_SORT_H__ #include void sort(void *base, size_t num, size_t size, int (*cmp)(const void *, const void *), void (*swap)(void *, void *, int)); #endif /* __XEN_SORT_H__ */ xen-4.4.0/xen/include/xen/compile.h.in0000664000175000017500000000062512307313555015663 0ustar smbsmb#define XEN_COMPILE_DATE "@@date@@" #define XEN_COMPILE_TIME "@@time@@" #define XEN_COMPILE_BY "@@whoami@@" #define XEN_COMPILE_DOMAIN "@@domain@@" #define XEN_COMPILE_HOST "@@hostname@@" #define XEN_COMPILER "@@compiler@@" #define XEN_VERSION @@version@@ #define XEN_SUBVERSION @@subversion@@ #define XEN_EXTRAVERSION "@@extraversion@@" #define XEN_CHANGESET "@@changeset@@" #define XEN_BANNER \ xen-4.4.0/xen/include/xen/multiboot.h0000664000175000017500000000647212307313555015652 0ustar smbsmb/* multiboot.h - the header for Multiboot */ /* Copyright (C) 1999, 2001 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef __MULTIBOOT_H__ #define __MULTIBOOT_H__ #include "const.h" /* * Multiboot header structure. */ #define MULTIBOOT_HEADER_MAGIC 0x1BADB002 #define MULTIBOOT_HEADER_MODS_ALIGNED 0x00000001 #define MULTIBOOT_HEADER_WANT_MEMORY 0x00000002 #define MULTIBOOT_HEADER_HAS_VBE 0x00000004 #define MULTIBOOT_HEADER_HAS_ADDR 0x00010000 /* The magic number passed by a Multiboot-compliant boot loader. */ #define MULTIBOOT_BOOTLOADER_MAGIC 0x2BADB002 #define MBI_MEMLIMITS (_AC(1,u) << 0) #define MBI_BOOTDEV (_AC(1,u) << 1) #define MBI_CMDLINE (_AC(1,u) << 2) #define MBI_MODULES (_AC(1,u) << 3) #define MBI_AOUT_SYMS (_AC(1,u) << 4) #define MBI_ELF_SYMS (_AC(1,u) << 5) #define MBI_MEMMAP (_AC(1,u) << 6) #define MBI_DRIVES (_AC(1,u) << 7) #define MBI_BIOSCONFIG (_AC(1,u) << 8) #define MBI_LOADERNAME (_AC(1,u) << 9) #define MBI_APM (_AC(1,u) << 10) #ifndef __ASSEMBLY__ /* The symbol table for a.out. */ typedef struct { u32 tabsize; u32 strsize; u32 addr; u32 reserved; } aout_symbol_table_t; /* The section header table for ELF. 
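 *
 * Together with aout_symbol_table_t above, this fills the symbol-table
 * union in multiboot_info_t further below; which member (if any) is
 * valid is indicated by the flags word, e.g. ("mbi" is illustrative):
 *
 *     if ( mbi->flags & MBI_ELF_SYMS )
 *         ... use mbi->u.elf_sec ...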
*/ typedef struct { u32 num; u32 size; u32 addr; u32 shndx; } elf_section_header_table_t; /* The Multiboot information. */ typedef struct { u32 flags; /* Valid if flags sets MBI_MEMLIMITS */ u32 mem_lower; u32 mem_upper; /* Valid if flags sets MBI_BOOTDEV */ u32 boot_device; /* Valid if flags sets MBI_CMDLINE */ u32 cmdline; /* Valid if flags sets MBI_MODULES */ u32 mods_count; u32 mods_addr; /* Valid if flags sets ... */ union { aout_symbol_table_t aout_sym; /* ... MBI_AOUT_SYMS */ elf_section_header_table_t elf_sec; /* ... MBI_ELF_SYMS */ } u; /* Valid if flags sets MBI_MEMMAP */ u32 mmap_length; u32 mmap_addr; /* Valid if flags sets MBI_DRIVES */ u32 drives_length; u32 drives_addr; /* Valid if flags sets MBI_BIOSCONFIG */ u32 config_table; /* Valid if flags sets MBI_LOADERNAME */ u32 boot_loader_name; /* Valid if flags sets MBI_APM */ u32 apm_table; } multiboot_info_t; /* The module structure. */ typedef struct { u32 mod_start; u32 mod_end; u32 string; u32 reserved; } module_t; /* The memory map. Be careful that the offset 0 is base_addr_low but no size. */ typedef struct { u32 size; u32 base_addr_low; u32 base_addr_high; u32 length_low; u32 length_high; u32 type; } memory_map_t; #endif /* __ASSEMBLY__ */ #endif /* __MULTIBOOT_H__ */ xen-4.4.0/xen/include/xen/lzo.h0000664000175000017500000000257712307313555014442 0ustar smbsmb#ifndef __LZO_H__ #define __LZO_H__ /* * LZO Public Kernel Interface * A mini subset of the LZO real-time data compression library * * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer * * The full LZO package can be found at: * http://www.oberhumer.com/opensource/lzo/ * * Changed for kernel use by: * Nitin Gupta * Richard Purdie */ #define LZO1X_MEM_COMPRESS (16384 * sizeof(unsigned char *)) #define LZO1X_1_MEM_COMPRESS LZO1X_MEM_COMPRESS #define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3) /* This requires 'workmem' of size LZO1X_1_MEM_COMPRESS */ int lzo1x_1_compress(const unsigned char *src, size_t src_len, unsigned char *dst, size_t *dst_len, void *wrkmem); /* safe decompression with overrun testing */ int lzo1x_decompress_safe(const unsigned char *src, size_t src_len, unsigned char *dst, size_t *dst_len); /* * Return values (< 0 = Error) */ #define LZO_E_OK 0 #define LZO_E_ERROR (-1) #define LZO_E_OUT_OF_MEMORY (-2) #define LZO_E_NOT_COMPRESSIBLE (-3) #define LZO_E_INPUT_OVERRUN (-4) #define LZO_E_OUTPUT_OVERRUN (-5) #define LZO_E_LOOKBEHIND_OVERRUN (-6) #define LZO_E_EOF_NOT_FOUND (-7) #define LZO_E_INPUT_NOT_CONSUMED (-8) #define LZO_E_NOT_YET_IMPLEMENTED (-9) #endif xen-4.4.0/xen/include/xen/elfstructs.h0000664000175000017500000004612012307313555016024 0ustar smbsmb#ifndef __XEN_ELFSTRUCTS_H__ #define __XEN_ELFSTRUCTS_H__ /* * Copyright (c) 1995, 1996 Erik Theisen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ typedef uint32_t Elf32_Addr; /* Unsigned program address */ typedef uint32_t Elf32_Off; /* Unsigned file offset */ typedef uint16_t Elf32_Half; /* Unsigned medium integer */ typedef int32_t Elf32_Sword; /* Signed large integer */ typedef uint32_t Elf32_Word; /* Unsigned large integer */ typedef uint64_t Elf64_Addr; typedef uint64_t Elf64_Off; typedef uint16_t Elf64_Half; typedef int32_t Elf64_Sword; typedef uint32_t Elf64_Word; typedef int64_t Elf64_Sxword; typedef uint64_t Elf64_Xword; /* * e_ident[] identification indexes * See http://www.caldera.com/developers/gabi/2000-07-17/ch4.eheader.html */ #define EI_MAG0 0 /* file ID */ #define EI_MAG1 1 /* file ID */ #define EI_MAG2 2 /* file ID */ #define EI_MAG3 3 /* file ID */ #define EI_CLASS 4 /* file class */ #define EI_DATA 5 /* data encoding */ #define EI_VERSION 6 /* ELF header version */ #define EI_OSABI 7 /* OS/ABI ID */ #define EI_ABIVERSION 8 /* ABI version */ #define EI_PAD 9 /* start of pad bytes */ #define EI_NIDENT 16 /* Size of e_ident[] */ /* e_ident[] magic number */ #define ELFMAG0 0x7f /* e_ident[EI_MAG0] */ #define ELFMAG1 'E' /* e_ident[EI_MAG1] */ #define ELFMAG2 'L' /* e_ident[EI_MAG2] */ #define ELFMAG3 'F' /* e_ident[EI_MAG3] */ #define ELFMAG "\177ELF" /* magic */ #define SELFMAG 4 /* size of magic */ /* e_ident[] file class */ #define ELFCLASSNONE 0 /* invalid */ #define ELFCLASS32 1 /* 32-bit objs */ #define ELFCLASS64 2 /* 64-bit objs */ #define ELFCLASSNUM 3 /* number of classes */ /* e_ident[] data encoding */ #define ELFDATANONE 0 /* invalid */ #define ELFDATA2LSB 1 /* Little-Endian */ #define ELFDATA2MSB 2 /* Big-Endian */ #define ELFDATANUM 3 /* number of data encode defines */ /* e_ident[] Operating System/ABI */ #define ELFOSABI_SYSV 0 /* UNIX System V ABI */ #define ELFOSABI_HPUX 1 /* HP-UX operating system */ #define ELFOSABI_NETBSD 2 /* NetBSD */ #define ELFOSABI_LINUX 3 /* GNU/Linux */ #define ELFOSABI_HURD 4 /* GNU/Hurd */ #define ELFOSABI_86OPEN 5 /* 86Open common IA32 ABI */ #define ELFOSABI_SOLARIS 6 /* Solaris */ #define ELFOSABI_MONTEREY 7 /* Monterey */ #define ELFOSABI_IRIX 8 /* IRIX */ #define ELFOSABI_FREEBSD 9 /* FreeBSD */ #define ELFOSABI_TRU64 10 /* TRU64 UNIX */ #define ELFOSABI_MODESTO 11 /* Novell Modesto */ #define ELFOSABI_OPENBSD 12 /* OpenBSD */ #define ELFOSABI_ARM 97 /* ARM */ #define ELFOSABI_STANDALONE 255 /* Standalone (embedded) application */ /* e_ident */ #define IS_ELF(ehdr) ((ehdr).e_ident[EI_MAG0] == ELFMAG0 && \ (ehdr).e_ident[EI_MAG1] == ELFMAG1 && \ (ehdr).e_ident[EI_MAG2] == ELFMAG2 && \ (ehdr).e_ident[EI_MAG3] == ELFMAG3) /* ELF Header */ typedef struct elfhdr { unsigned char e_ident[EI_NIDENT]; /* ELF 
Identification */ Elf32_Half e_type; /* object file type */ Elf32_Half e_machine; /* machine */ Elf32_Word e_version; /* object file version */ Elf32_Addr e_entry; /* virtual entry point */ Elf32_Off e_phoff; /* program header table offset */ Elf32_Off e_shoff; /* section header table offset */ Elf32_Word e_flags; /* processor-specific flags */ Elf32_Half e_ehsize; /* ELF header size */ Elf32_Half e_phentsize; /* program header entry size */ Elf32_Half e_phnum; /* number of program header entries */ Elf32_Half e_shentsize; /* section header entry size */ Elf32_Half e_shnum; /* number of section header entries */ Elf32_Half e_shstrndx; /* section header table's "section header string table" entry offset */ } Elf32_Ehdr; typedef struct { unsigned char e_ident[EI_NIDENT]; /* Id bytes */ Elf64_Half e_type; /* file type */ Elf64_Half e_machine; /* machine type */ Elf64_Word e_version; /* version number */ Elf64_Addr e_entry; /* entry point */ Elf64_Off e_phoff; /* Program hdr offset */ Elf64_Off e_shoff; /* Section hdr offset */ Elf64_Word e_flags; /* Processor flags */ Elf64_Half e_ehsize; /* sizeof ehdr */ Elf64_Half e_phentsize; /* Program header entry size */ Elf64_Half e_phnum; /* Number of program headers */ Elf64_Half e_shentsize; /* Section header entry size */ Elf64_Half e_shnum; /* Number of section headers */ Elf64_Half e_shstrndx; /* String table index */ } Elf64_Ehdr; /* e_type */ #define ET_NONE 0 /* No file type */ #define ET_REL 1 /* relocatable file */ #define ET_EXEC 2 /* executable file */ #define ET_DYN 3 /* shared object file */ #define ET_CORE 4 /* core file */ #define ET_NUM 5 /* number of types */ #define ET_LOPROC 0xff00 /* reserved range for processor */ #define ET_HIPROC 0xffff /* specific e_type */ /* e_machine */ #define EM_NONE 0 /* No Machine */ #define EM_M32 1 /* AT&T WE 32100 */ #define EM_SPARC 2 /* SPARC */ #define EM_386 3 /* Intel 80386 */ #define EM_68K 4 /* Motorola 68000 */ #define EM_88K 5 /* Motorola 88000 */ #define EM_486 6 /* Intel 80486 - unused? 
*/ #define EM_860 7 /* Intel 80860 */ #define EM_MIPS 8 /* MIPS R3000 Big-Endian only */ /* * Don't know if EM_MIPS_RS4_BE, * EM_SPARC64, EM_PARISC, * or EM_PPC are ABI compliant */ #define EM_MIPS_RS4_BE 10 /* MIPS R4000 Big-Endian */ #define EM_SPARC64 11 /* SPARC v9 64-bit unoffical */ #define EM_PARISC 15 /* HPPA */ #define EM_SPARC32PLUS 18 /* Enhanced instruction set SPARC */ #define EM_PPC 20 /* PowerPC */ #define EM_PPC64 21 /* PowerPC 64-bit */ #define EM_ARM 40 /* Advanced RISC Machines ARM */ #define EM_ALPHA 41 /* DEC ALPHA */ #define EM_SPARCV9 43 /* SPARC version 9 */ #define EM_ALPHA_EXP 0x9026 /* DEC ALPHA */ #define EM_IA_64 50 /* Intel Merced */ #define EM_X86_64 62 /* AMD x86-64 architecture */ #define EM_VAX 75 /* DEC VAX */ /* Version */ #define EV_NONE 0 /* Invalid */ #define EV_CURRENT 1 /* Current */ #define EV_NUM 2 /* number of versions */ /* Section Header */ typedef struct { Elf32_Word sh_name; /* name - index into section header string table section */ Elf32_Word sh_type; /* type */ Elf32_Word sh_flags; /* flags */ Elf32_Addr sh_addr; /* address */ Elf32_Off sh_offset; /* file offset */ Elf32_Word sh_size; /* section size */ Elf32_Word sh_link; /* section header table index link */ Elf32_Word sh_info; /* extra information */ Elf32_Word sh_addralign; /* address alignment */ Elf32_Word sh_entsize; /* section entry size */ } Elf32_Shdr; typedef struct { Elf64_Word sh_name; /* section name */ Elf64_Word sh_type; /* section type */ Elf64_Xword sh_flags; /* section flags */ Elf64_Addr sh_addr; /* virtual address */ Elf64_Off sh_offset; /* file offset */ Elf64_Xword sh_size; /* section size */ Elf64_Word sh_link; /* link to another */ Elf64_Word sh_info; /* misc info */ Elf64_Xword sh_addralign; /* memory alignment */ Elf64_Xword sh_entsize; /* table entry size */ } Elf64_Shdr; /* Special Section Indexes */ #define SHN_UNDEF 0 /* undefined */ #define SHN_LORESERVE 0xff00 /* lower bounds of reserved indexes */ #define SHN_LOPROC 0xff00 /* reserved range for processor */ #define SHN_HIPROC 0xff1f /* specific section indexes */ #define SHN_ABS 0xfff1 /* absolute value */ #define SHN_COMMON 0xfff2 /* common symbol */ #define SHN_HIRESERVE 0xffff /* upper bounds of reserved indexes */ /* sh_type */ #define SHT_NULL 0 /* inactive */ #define SHT_PROGBITS 1 /* program defined information */ #define SHT_SYMTAB 2 /* symbol table section */ #define SHT_STRTAB 3 /* string table section */ #define SHT_RELA 4 /* relocation section with addends*/ #define SHT_HASH 5 /* symbol hash table section */ #define SHT_DYNAMIC 6 /* dynamic section */ #define SHT_NOTE 7 /* note section */ #define SHT_NOBITS 8 /* no space section */ #define SHT_REL 9 /* relation section without addends */ #define SHT_SHLIB 10 /* reserved - purpose unknown */ #define SHT_DYNSYM 11 /* dynamic symbol table section */ #define SHT_NUM 12 /* number of section types */ #define SHT_LOPROC 0x70000000 /* reserved range for processor */ #define SHT_HIPROC 0x7fffffff /* specific section header types */ #define SHT_LOUSER 0x80000000 /* reserved range for application */ #define SHT_HIUSER 0xffffffff /* specific indexes */ /* Section names */ #define ELF_BSS ".bss" /* uninitialized data */ #define ELF_DATA ".data" /* initialized data */ #define ELF_DEBUG ".debug" /* debug */ #define ELF_DYNAMIC ".dynamic" /* dynamic linking information */ #define ELF_DYNSTR ".dynstr" /* dynamic string table */ #define ELF_DYNSYM ".dynsym" /* dynamic symbol table */ #define ELF_FINI ".fini" /* termination code */ #define ELF_GOT ".got" /* 
global offset table */ #define ELF_HASH ".hash" /* symbol hash table */ #define ELF_INIT ".init" /* initialization code */ #define ELF_REL_DATA ".rel.data" /* relocation data */ #define ELF_REL_FINI ".rel.fini" /* relocation termination code */ #define ELF_REL_INIT ".rel.init" /* relocation initialization code */ #define ELF_REL_DYN ".rel.dyn" /* relocaltion dynamic link info */ #define ELF_REL_RODATA ".rel.rodata" /* relocation read-only data */ #define ELF_REL_TEXT ".rel.text" /* relocation code */ #define ELF_RODATA ".rodata" /* read-only data */ #define ELF_SHSTRTAB ".shstrtab" /* section header string table */ #define ELF_STRTAB ".strtab" /* string table */ #define ELF_SYMTAB ".symtab" /* symbol table */ #define ELF_TEXT ".text" /* code */ /* Section Attribute Flags - sh_flags */ #define SHF_WRITE 0x1 /* Writable */ #define SHF_ALLOC 0x2 /* occupies memory */ #define SHF_EXECINSTR 0x4 /* executable */ #define SHF_MASKPROC 0xf0000000 /* reserved bits for processor */ /* specific section attributes */ /* Symbol Table Entry */ typedef struct elf32_sym { Elf32_Word st_name; /* name - index into string table */ Elf32_Addr st_value; /* symbol value */ Elf32_Word st_size; /* symbol size */ unsigned char st_info; /* type and binding */ unsigned char st_other; /* 0 - no defined meaning */ Elf32_Half st_shndx; /* section header index */ } Elf32_Sym; typedef struct { Elf64_Word st_name; /* Symbol name index in str table */ unsigned char st_info; /* type / binding attrs */ unsigned char st_other; /* unused */ Elf64_Half st_shndx; /* section index of symbol */ Elf64_Addr st_value; /* value of symbol */ Elf64_Xword st_size; /* size of symbol */ } Elf64_Sym; /* Symbol table index */ #define STN_UNDEF 0 /* undefined */ /* Extract symbol info - st_info */ #define ELF32_ST_BIND(x) ((x) >> 4) #define ELF32_ST_TYPE(x) (((unsigned int) x) & 0xf) #define ELF32_ST_INFO(b,t) (((b) << 4) + ((t) & 0xf)) #define ELF64_ST_BIND(x) ((x) >> 4) #define ELF64_ST_TYPE(x) (((unsigned int) x) & 0xf) #define ELF64_ST_INFO(b,t) (((b) << 4) + ((t) & 0xf)) /* Symbol Binding - ELF32_ST_BIND - st_info */ #define STB_LOCAL 0 /* Local symbol */ #define STB_GLOBAL 1 /* Global symbol */ #define STB_WEAK 2 /* like global - lower precedence */ #define STB_NUM 3 /* number of symbol bindings */ #define STB_LOPROC 13 /* reserved range for processor */ #define STB_HIPROC 15 /* specific symbol bindings */ /* Symbol type - ELF32_ST_TYPE - st_info */ #define STT_NOTYPE 0 /* not specified */ #define STT_OBJECT 1 /* data object */ #define STT_FUNC 2 /* function */ #define STT_SECTION 3 /* section */ #define STT_FILE 4 /* file */ #define STT_NUM 5 /* number of symbol types */ #define STT_LOPROC 13 /* reserved range for processor */ #define STT_HIPROC 15 /* specific symbol types */ /* Relocation entry with implicit addend */ typedef struct { Elf32_Addr r_offset; /* offset of relocation */ Elf32_Word r_info; /* symbol table index and type */ } Elf32_Rel; /* Relocation entry with explicit addend */ typedef struct { Elf32_Addr r_offset; /* offset of relocation */ Elf32_Word r_info; /* symbol table index and type */ Elf32_Sword r_addend; } Elf32_Rela; /* Extract relocation info - r_info */ #define ELF32_R_SYM(i) ((i) >> 8) #define ELF32_R_TYPE(i) ((unsigned char) (i)) #define ELF32_R_INFO(s,t) (((s) << 8) + (unsigned char)(t)) typedef struct { Elf64_Addr r_offset; /* where to do it */ Elf64_Xword r_info; /* index & type of relocation */ } Elf64_Rel; typedef struct { Elf64_Addr r_offset; /* where to do it */ Elf64_Xword r_info; /* index & type of 
relocation */ Elf64_Sxword r_addend; /* adjustment value */ } Elf64_Rela; #define ELF64_R_SYM(info) ((info) >> 32) #define ELF64_R_TYPE(info) ((info) & 0xFFFFFFFF) #define ELF64_R_INFO(s,t) (((s) << 32) + (u_int32_t)(t)) /* Program Header */ typedef struct { Elf32_Word p_type; /* segment type */ Elf32_Off p_offset; /* segment offset */ Elf32_Addr p_vaddr; /* virtual address of segment */ Elf32_Addr p_paddr; /* physical address - ignored? */ Elf32_Word p_filesz; /* number of bytes in file for seg. */ Elf32_Word p_memsz; /* number of bytes in mem. for seg. */ Elf32_Word p_flags; /* flags */ Elf32_Word p_align; /* memory alignment */ } Elf32_Phdr; typedef struct { Elf64_Word p_type; /* entry type */ Elf64_Word p_flags; /* flags */ Elf64_Off p_offset; /* offset */ Elf64_Addr p_vaddr; /* virtual address */ Elf64_Addr p_paddr; /* physical address */ Elf64_Xword p_filesz; /* file size */ Elf64_Xword p_memsz; /* memory size */ Elf64_Xword p_align; /* memory & file alignment */ } Elf64_Phdr; /* Segment types - p_type */ #define PT_NULL 0 /* unused */ #define PT_LOAD 1 /* loadable segment */ #define PT_DYNAMIC 2 /* dynamic linking section */ #define PT_INTERP 3 /* the RTLD */ #define PT_NOTE 4 /* auxiliary information */ #define PT_SHLIB 5 /* reserved - purpose undefined */ #define PT_PHDR 6 /* program header */ #define PT_NUM 7 /* Number of segment types */ #define PT_LOPROC 0x70000000 /* reserved range for processor */ #define PT_HIPROC 0x7fffffff /* specific segment types */ /* Segment flags - p_flags */ #define PF_X 0x1 /* Executable */ #define PF_W 0x2 /* Writable */ #define PF_R 0x4 /* Readable */ #define PF_MASKPROC 0xf0000000 /* reserved bits for processor */ /* specific segment flags */ /* Dynamic structure */ typedef struct { Elf32_Sword d_tag; /* controls meaning of d_val */ union { Elf32_Word d_val; /* Multiple meanings - see d_tag */ Elf32_Addr d_ptr; /* program virtual address */ } d_un; } Elf32_Dyn; typedef struct { Elf64_Sxword d_tag; /* controls meaning of d_val */ union { Elf64_Xword d_val; Elf64_Addr d_ptr; } d_un; } Elf64_Dyn; /* Dynamic Array Tags - d_tag */ #define DT_NULL 0 /* marks end of _DYNAMIC array */ #define DT_NEEDED 1 /* string table offset of needed lib */ #define DT_PLTRELSZ 2 /* size of relocation entries in PLT */ #define DT_PLTGOT 3 /* address PLT/GOT */ #define DT_HASH 4 /* address of symbol hash table */ #define DT_STRTAB 5 /* address of string table */ #define DT_SYMTAB 6 /* address of symbol table */ #define DT_RELA 7 /* address of relocation table */ #define DT_RELASZ 8 /* size of relocation table */ #define DT_RELAENT 9 /* size of relocation entry */ #define DT_STRSZ 10 /* size of string table */ #define DT_SYMENT 11 /* size of symbol table entry */ #define DT_INIT 12 /* address of initialization func. */ #define DT_FINI 13 /* address of termination function */ #define DT_SONAME 14 /* string table offset of shared obj */ #define DT_RPATH 15 /* string table offset of library search path */ #define DT_SYMBOLIC 16 /* start sym search in shared obj. */ #define DT_REL 17 /* address of rel. tbl. w addends */ #define DT_RELSZ 18 /* size of DT_REL relocation table */ #define DT_RELENT 19 /* size of DT_REL relocation entry */ #define DT_PLTREL 20 /* PLT referenced relocation entry */ #define DT_DEBUG 21 /* bugger */ #define DT_TEXTREL 22 /* Allow rel. mod. to unwritable seg */ #define DT_JMPREL 23 /* add. of PLT's relocation entries */ #define DT_BIND_NOW 24 /* Bind now regardless of env setting */ #define DT_NUM 25 /* Number used. 
*/ #define DT_LOPROC 0x70000000 /* reserved range for processor */ #define DT_HIPROC 0x7fffffff /* specific dynamic array tags */ /* Standard ELF hashing function */ unsigned int elf_hash(const unsigned char *name); /* * Note Definitions */ typedef struct { Elf32_Word namesz; Elf32_Word descsz; Elf32_Word type; } Elf32_Note; typedef struct { Elf64_Word namesz; Elf64_Word descsz; Elf64_Word type; } Elf64_Note; #if defined(ELFSIZE) #define CONCAT(x,y) __CONCAT(x,y) #define ELFNAME(x) CONCAT(elf,CONCAT(ELFSIZE,CONCAT(_,x))) #define ELFNAME2(x,y) CONCAT(x,CONCAT(_elf,CONCAT(ELFSIZE,CONCAT(_,y)))) #define ELFNAMEEND(x) CONCAT(x,CONCAT(_elf,ELFSIZE)) #define ELFDEFNNAME(x) CONCAT(ELF,CONCAT(ELFSIZE,CONCAT(_,x))) #endif #if defined(ELFSIZE) && (ELFSIZE == 32) #define Elf_Ehdr Elf32_Ehdr #define Elf_Phdr Elf32_Phdr #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Rel Elf32_Rel #define Elf_RelA Elf32_Rela #define Elf_Dyn Elf32_Dyn #define Elf_Word Elf32_Word #define Elf_Sword Elf32_Sword #define Elf_Addr Elf32_Addr #define Elf_Off Elf32_Off #define Elf_Nhdr Elf32_Nhdr #define Elf_Note Elf32_Note #define ELF_R_SYM ELF32_R_SYM #define ELF_R_TYPE ELF32_R_TYPE #define ELF_R_INFO ELF32_R_INFO #define ELFCLASS ELFCLASS32 #define ELF_ST_BIND ELF32_ST_BIND #define ELF_ST_TYPE ELF32_ST_TYPE #define ELF_ST_INFO ELF32_ST_INFO #define AuxInfo Aux32Info #elif defined(ELFSIZE) && (ELFSIZE == 64) #define Elf_Ehdr Elf64_Ehdr #define Elf_Phdr Elf64_Phdr #define Elf_Shdr Elf64_Shdr #define Elf_Sym Elf64_Sym #define Elf_Rel Elf64_Rel #define Elf_RelA Elf64_Rela #define Elf_Dyn Elf64_Dyn #define Elf_Word Elf64_Word #define Elf_Sword Elf64_Sword #define Elf_Addr Elf64_Addr #define Elf_Off Elf64_Off #define Elf_Nhdr Elf64_Nhdr #define Elf_Note Elf64_Note #define ELF_R_SYM ELF64_R_SYM #define ELF_R_TYPE ELF64_R_TYPE #define ELF_R_INFO ELF64_R_INFO #define ELFCLASS ELFCLASS64 #define ELF_ST_BIND ELF64_ST_BIND #define ELF_ST_TYPE ELF64_ST_TYPE #define ELF_ST_INFO ELF64_ST_INFO #define AuxInfo Aux64Info #endif #endif /* __XEN_ELFSTRUCTS_H__ */ xen-4.4.0/xen/include/xen/domain_page.h0000664000175000017500000000741412307313555016074 0ustar smbsmb/****************************************************************************** * domain_page.h * * Allow temporary mapping of domain page frames into Xen space. * * Copyright (c) 2003-2006, Keir Fraser */ #ifndef __XEN_DOMAIN_PAGE_H__ #define __XEN_DOMAIN_PAGE_H__ #include #ifdef CONFIG_DOMAIN_PAGE /* * Map a given page frame, returning the mapped virtual address. The page is * then accessible within the current VCPU until a corresponding unmap call. */ void *map_domain_page(unsigned long mfn); /* * Pass a VA within a page previously mapped in the context of the * currently-executing VCPU via a call to map_domain_page(). */ void unmap_domain_page(const void *va); /* * Clear a given page frame, or copy between two of them. */ void clear_domain_page(unsigned long mfn); void copy_domain_page(unsigned long dmfn, unsigned long smfn); /* * Given a VA from map_domain_page(), return its underlying MFN. */ unsigned long domain_page_map_to_mfn(const void *va); /* * Similar to the above calls, except the mapping is accessible in all * address spaces (not just within the VCPU that created the mapping). Global * mappings can also be unmapped from any context. 
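 *
 * Illustrative usage, as a hedged sketch only (the mfn value and the code
 * around the calls are assumed for the example; they are not part of this
 * header):
 *
 *     void *va = map_domain_page_global(mfn);
 *     ... the frame is now accessible from any context via va ...
 *     unmap_domain_page_global(va);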
*/ void *map_domain_page_global(unsigned long mfn); void unmap_domain_page_global(const void *va); #define __map_domain_page(pg) map_domain_page(__page_to_mfn(pg)) #define __map_domain_page_global(pg) map_domain_page_global(__page_to_mfn(pg)) #define DMCACHE_ENTRY_VALID 1U #define DMCACHE_ENTRY_HELD 2U struct domain_mmap_cache { unsigned long mfn; void *va; unsigned int flags; }; static inline void domain_mmap_cache_init(struct domain_mmap_cache *cache) { ASSERT(cache != NULL); cache->flags = 0; cache->mfn = 0; cache->va = NULL; } static inline void * map_domain_page_with_cache(unsigned long mfn, struct domain_mmap_cache *cache) { ASSERT(cache != NULL); BUG_ON(cache->flags & DMCACHE_ENTRY_HELD); if ( likely(cache->flags & DMCACHE_ENTRY_VALID) ) { cache->flags |= DMCACHE_ENTRY_HELD; if ( likely(mfn == cache->mfn) ) goto done; unmap_domain_page(cache->va); } cache->mfn = mfn; cache->va = map_domain_page(mfn); cache->flags = DMCACHE_ENTRY_HELD | DMCACHE_ENTRY_VALID; done: return cache->va; } static inline void unmap_domain_page_with_cache(const void *va, struct domain_mmap_cache *cache) { ASSERT(cache != NULL); cache->flags &= ~DMCACHE_ENTRY_HELD; } static inline void domain_mmap_cache_destroy(struct domain_mmap_cache *cache) { ASSERT(cache != NULL); BUG_ON(cache->flags & DMCACHE_ENTRY_HELD); if ( likely(cache->flags & DMCACHE_ENTRY_VALID) ) { unmap_domain_page(cache->va); cache->flags = 0; } } #else /* !CONFIG_DOMAIN_PAGE */ #define map_domain_page(mfn) mfn_to_virt(mfn) #define __map_domain_page(pg) page_to_virt(pg) #define unmap_domain_page(va) ((void)(va)) #define clear_domain_page(mfn) clear_page(mfn_to_virt(mfn)) #define copy_domain_page(dmfn, smfn) copy_page(mfn_to_virt(dmfn), \ mfn_to_virt(smfn)) #define domain_page_map_to_mfn(va) virt_to_mfn((unsigned long)(va)) #define map_domain_page_global(mfn) mfn_to_virt(mfn) #define __map_domain_page_global(pg) page_to_virt(pg) #define unmap_domain_page_global(va) ((void)(va)) struct domain_mmap_cache { }; #define domain_mmap_cache_init(c) ((void)(c)) #define map_domain_page_with_cache(mfn,c) (map_domain_page(mfn)) #define unmap_domain_page_with_cache(va,c) ((void)(va)) #define domain_mmap_cache_destroy(c) ((void)(c)) #endif /* !CONFIG_DOMAIN_PAGE */ #endif /* __XEN_DOMAIN_PAGE_H__ */ xen-4.4.0/xen/include/xen/xmalloc.h0000664000175000017500000000677112307313555015275 0ustar smbsmb #ifndef __XMALLOC_H__ #define __XMALLOC_H__ #include #include /* * Xen malloc/free-style interface. */ /* Allocate space for typed object. */ #define xmalloc(_type) ((_type *)_xmalloc(sizeof(_type), __alignof__(_type))) #define xzalloc(_type) ((_type *)_xzalloc(sizeof(_type), __alignof__(_type))) /* Allocate space for array of typed objects. */ #define xmalloc_array(_type, _num) \ ((_type *)_xmalloc_array(sizeof(_type), __alignof__(_type), _num)) #define xzalloc_array(_type, _num) \ ((_type *)_xzalloc_array(sizeof(_type), __alignof__(_type), _num)) /* Allocate untyped storage. */ #define xmalloc_bytes(_bytes) _xmalloc(_bytes, SMP_CACHE_BYTES) #define xzalloc_bytes(_bytes) _xzalloc(_bytes, SMP_CACHE_BYTES) /* Free any of the above. */ extern void xfree(void *); /* Underlying functions */ extern void *_xmalloc(unsigned long size, unsigned long align); extern void *_xzalloc(unsigned long size, unsigned long align); static inline void *_xmalloc_array( unsigned long size, unsigned long align, unsigned long num) { /* Check for overflow. 
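      size * num is computed below; if num exceeded UINT_MAX / size the
      multiplication would wrap around, so such a request is rejected by
      returning NULL instead.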
*/ if (size && num > UINT_MAX / size) return NULL; return _xmalloc(size * num, align); } static inline void *_xzalloc_array( unsigned long size, unsigned long align, unsigned long num) { /* Check for overflow. */ if (size && num > UINT_MAX / size) return NULL; return _xzalloc(size * num, align); } /* * Pooled allocator interface. */ struct xmem_pool; typedef void *(xmem_pool_get_memory)(unsigned long bytes); typedef void (xmem_pool_put_memory)(void *ptr); /** * xmem_pool_create - create dynamic memory pool * @name: name of the pool * @get_mem: callback function used to expand pool * @put_mem: callback function used to shrink pool * @init_size: inital pool size (in bytes) * @max_size: maximum pool size (in bytes) - set this as 0 for no limit * @grow_size: amount of memory (in bytes) added to pool whenever required * * All size values are rounded up to next page boundary. */ struct xmem_pool *xmem_pool_create( const char *name, xmem_pool_get_memory get_mem, xmem_pool_put_memory put_mem, unsigned long init_size, unsigned long max_size, unsigned long grow_size); /** * xmem_pool_destroy - cleanup given pool * @mem_pool: Pool to be destroyed * * Data structures associated with pool are freed. * All memory allocated from pool must be freed before * destorying it. */ void xmem_pool_destroy(struct xmem_pool *pool); /** * xmem_pool_alloc - allocate memory from given pool * @size: no. of bytes * @mem_pool: pool to allocate from */ void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool); /** * xmem_pool_maxalloc - xmem_pool_alloc's greater than this size will fail * @mem_pool: pool */ int xmem_pool_maxalloc(struct xmem_pool *pool); /** * xmem_pool_maxsize - * @ptr: address of memory to be freed * @mem_pool: pool to free from */ void xmem_pool_free(void *ptr, struct xmem_pool *pool); /** * xmem_pool_get_used_size - get memory currently used by given pool * * Used memory includes stored data + metadata + internal fragmentation */ unsigned long xmem_pool_get_used_size(struct xmem_pool *pool); /** * xmem_pool_get_total_size - get total memory currently allocated for pool * * This is the total memory currently allocated for this pool which includes * used size + free size. * * (Total - Used) is good indicator of memory efficiency of allocator. */ unsigned long xmem_pool_get_total_size(struct xmem_pool *pool); #endif /* __XMALLOC_H__ */ xen-4.4.0/xen/include/xen/spinlock.h0000664000175000017500000002256512307313555015457 0ustar smbsmb#ifndef __SPINLOCK_H__ #define __SPINLOCK_H__ #include #include #ifndef NDEBUG struct lock_debug { int irq_safe; /* +1: IRQ-safe; 0: not IRQ-safe; -1: don't know yet */ }; #define _LOCK_DEBUG { -1 } void spin_debug_enable(void); void spin_debug_disable(void); #else struct lock_debug { }; #define _LOCK_DEBUG { } #define spin_debug_enable() ((void)0) #define spin_debug_disable() ((void)0) #endif #ifdef LOCK_PROFILE #include /* lock profiling on: Global locks which should be subject to profiling must be declared via DEFINE_SPINLOCK. 
For locks in structures further measures are necessary: - the structure definition must include a profile_head with exactly this name: struct lock_profile_qhead profile_head; - the single locks which are subject to profiling have to be initialized via spin_lock_init_prof(ptr, lock); with ptr being the main structure pointer and lock the spinlock field - each structure has to be added to profiling with lock_profile_register_struct(type, ptr, idx, print); with: type: something like LOCKPROF_TYPE_PERDOM ptr: pointer to the structure idx: index of that structure, e.g. domid print: descriptive string like "domain" - removing of a structure is done via lock_profile_deregister_struct(type, ptr); */ struct spinlock; struct lock_profile { struct lock_profile *next; /* forward link */ char *name; /* lock name */ struct spinlock *lock; /* the lock itself */ u64 lock_cnt; /* # of complete locking ops */ u64 block_cnt; /* # of complete wait for lock */ s64 time_hold; /* cumulated lock time */ s64 time_block; /* cumulated wait time */ s64 time_locked; /* system time of last locking */ }; struct lock_profile_qhead { struct lock_profile_qhead *head_q; /* next head of this type */ struct lock_profile *elem_q; /* first element in q */ int32_t idx; /* index for printout */ }; #define _LOCK_PROFILE(name) { 0, #name, &name, 0, 0, 0, 0, 0 } #define _LOCK_PROFILE_PTR(name) \ static struct lock_profile *__lock_profile_##name \ __used_section(".lockprofile.data") = \ &__lock_profile_data_##name #define _SPIN_LOCK_UNLOCKED(x) { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, \ _LOCK_DEBUG, x } #define SPIN_LOCK_UNLOCKED _SPIN_LOCK_UNLOCKED(NULL) #define DEFINE_SPINLOCK(l) \ spinlock_t l = _SPIN_LOCK_UNLOCKED(NULL); \ static struct lock_profile __lock_profile_data_##l = _LOCK_PROFILE(l); \ _LOCK_PROFILE_PTR(l) #define spin_lock_init_prof(s, l) \ do { \ struct lock_profile *prof; \ prof = xzalloc(struct lock_profile); \ if (!prof) break; \ prof->name = #l; \ prof->lock = &(s)->l; \ (s)->l = (spinlock_t)_SPIN_LOCK_UNLOCKED(prof); \ prof->next = (s)->profile_head.elem_q; \ (s)->profile_head.elem_q = prof; \ } while(0) void _lock_profile_register_struct( int32_t, struct lock_profile_qhead *, int32_t, char *); void _lock_profile_deregister_struct(int32_t, struct lock_profile_qhead *); #define lock_profile_register_struct(type, ptr, idx, print) \ _lock_profile_register_struct(type, &((ptr)->profile_head), idx, print) #define lock_profile_deregister_struct(type, ptr) \ _lock_profile_deregister_struct(type, &((ptr)->profile_head)) extern int spinlock_profile_control(xen_sysctl_lockprof_op_t *pc); extern void spinlock_profile_printall(unsigned char key); extern void spinlock_profile_reset(unsigned char key); #else struct lock_profile_qhead { }; #define SPIN_LOCK_UNLOCKED \ { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG } #define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED #define spin_lock_init_prof(s, l) spin_lock_init(&((s)->l)) #define lock_profile_register_struct(type, ptr, idx, print) #define lock_profile_deregister_struct(type, ptr) #endif typedef struct spinlock { raw_spinlock_t raw; u16 recurse_cpu:12; u16 recurse_cnt:4; struct lock_debug debug; #ifdef LOCK_PROFILE struct lock_profile *profile; #endif } spinlock_t; #define spin_lock_init(l) (*(l) = (spinlock_t)SPIN_LOCK_UNLOCKED) typedef struct { raw_rwlock_t raw; struct lock_debug debug; } rwlock_t; #define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED, _LOCK_DEBUG } #define DEFINE_RWLOCK(l) rwlock_t l = RW_LOCK_UNLOCKED #define rwlock_init(l) (*(l) = 
(rwlock_t)RW_LOCK_UNLOCKED) void _spin_lock(spinlock_t *lock); void _spin_lock_irq(spinlock_t *lock); unsigned long _spin_lock_irqsave(spinlock_t *lock); void _spin_unlock(spinlock_t *lock); void _spin_unlock_irq(spinlock_t *lock); void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags); int _spin_is_locked(spinlock_t *lock); int _spin_trylock(spinlock_t *lock); void _spin_barrier(spinlock_t *lock); int _spin_trylock_recursive(spinlock_t *lock); void _spin_lock_recursive(spinlock_t *lock); void _spin_unlock_recursive(spinlock_t *lock); void _read_lock(rwlock_t *lock); void _read_lock_irq(rwlock_t *lock); unsigned long _read_lock_irqsave(rwlock_t *lock); void _read_unlock(rwlock_t *lock); void _read_unlock_irq(rwlock_t *lock); void _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags); int _read_trylock(rwlock_t *lock); void _write_lock(rwlock_t *lock); void _write_lock_irq(rwlock_t *lock); unsigned long _write_lock_irqsave(rwlock_t *lock); int _write_trylock(rwlock_t *lock); void _write_unlock(rwlock_t *lock); void _write_unlock_irq(rwlock_t *lock); void _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags); int _rw_is_locked(rwlock_t *lock); int _rw_is_write_locked(rwlock_t *lock); #define spin_lock(l) _spin_lock(l) #define spin_lock_irq(l) _spin_lock_irq(l) #define spin_lock_irqsave(l, f) \ ({ \ BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ ((f) = _spin_lock_irqsave(l)); \ }) #define spin_unlock(l) _spin_unlock(l) #define spin_unlock_irq(l) _spin_unlock_irq(l) #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f) #define spin_is_locked(l) _spin_is_locked(l) #define spin_trylock(l) _spin_trylock(l) #define spin_trylock_irqsave(lock, flags) \ ({ \ local_irq_save(flags); \ spin_trylock(lock) ? \ 1 : ({ local_irq_restore(flags); 0; }); \ }) /* Ensure a lock is quiescent between two critical operations. */ #define spin_barrier(l) _spin_barrier(l) /* * spin_[un]lock_recursive(): Use these forms when the lock can (safely!) be * reentered recursively on the same CPU. All critical regions that may form * part of a recursively-nested set must be protected by these forms. If there * are any critical regions that cannot form part of such a set, they can use * standard spin_[un]lock(). 
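 *
 * A minimal usage sketch (illustrative only; 'lock' stands for any
 * spinlock_t that may legitimately be re-taken on the same CPU further
 * down the call chain):
 *
 *     spin_lock_recursive(&lock);
 *     ... code that may call spin_lock_recursive(&lock) again ...
 *     spin_unlock_recursive(&lock);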
*/ #define spin_trylock_recursive(l) _spin_trylock_recursive(l) #define spin_lock_recursive(l) _spin_lock_recursive(l) #define spin_unlock_recursive(l) _spin_unlock_recursive(l) #define read_lock(l) _read_lock(l) #define read_lock_irq(l) _read_lock_irq(l) #define read_lock_irqsave(l, f) \ ({ \ BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ ((f) = _read_lock_irqsave(l)); \ }) #define read_unlock(l) _read_unlock(l) #define read_unlock_irq(l) _read_unlock_irq(l) #define read_unlock_irqrestore(l, f) _read_unlock_irqrestore(l, f) #define read_trylock(l) _read_trylock(l) #define write_lock(l) _write_lock(l) #define write_lock_irq(l) _write_lock_irq(l) #define write_lock_irqsave(l, f) \ ({ \ BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ ((f) = _write_lock_irqsave(l)); \ }) #define write_trylock(l) _write_trylock(l) #define write_unlock(l) _write_unlock(l) #define write_unlock_irq(l) _write_unlock_irq(l) #define write_unlock_irqrestore(l, f) _write_unlock_irqrestore(l, f) #define rw_is_locked(l) _rw_is_locked(l) #define rw_is_write_locked(l) _rw_is_write_locked(l) #endif /* __SPINLOCK_H__ */ xen-4.4.0/xen/include/xen/kimage.h0000664000175000017500000000342212307313555015061 0ustar smbsmb#ifndef __XEN_KIMAGE_H__ #define __XEN_KIMAGE_H__ #define IND_DESTINATION 0x1 #define IND_INDIRECTION 0x2 #define IND_DONE 0x4 #define IND_SOURCE 0x8 #define IND_ZERO 0x10 #ifndef __ASSEMBLY__ #include #include #include #define KEXEC_SEGMENT_MAX 16 typedef paddr_t kimage_entry_t; struct kexec_image { uint8_t type; uint16_t arch; uint64_t entry_maddr; uint32_t nr_segments; xen_kexec_segment_t *segments; kimage_entry_t head; struct page_info *entry_page; unsigned next_entry; struct page_info *control_code_page; struct page_info *aux_page; struct page_list_head control_pages; struct page_list_head dest_pages; struct page_list_head unusable_pages; /* Address of next control page to allocate for crash kernels. */ paddr_t next_crash_page; }; int kimage_alloc(struct kexec_image **rimage, uint8_t type, uint16_t arch, uint64_t entry_maddr, uint32_t nr_segments, xen_kexec_segment_t *segment); void kimage_free(struct kexec_image *image); int kimage_load_segments(struct kexec_image *image); struct page_info *kimage_alloc_control_page(struct kexec_image *image, unsigned memflags); kimage_entry_t *kimage_entry_next(kimage_entry_t *entry, bool_t compat); unsigned long kimage_entry_mfn(kimage_entry_t *entry, bool_t compat); unsigned long kimage_entry_ind(kimage_entry_t *entry, bool_t compat); int kimage_build_ind(struct kexec_image *image, unsigned long ind_mfn, bool_t compat); #endif /* __ASSEMBLY__ */ #endif /* __XEN_KIMAGE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/xen/irq_cpustat.h0000664000175000017500000000164112307313555016163 0ustar smbsmb#ifndef __irq_cpustat_h #define __irq_cpustat_h /* * Contains default mappings for irq_cpustat_t, used by almost every * architecture. Some arch (like s390) have per cpu hardware pages and * they define their own mappings for irq_stat. * * Keith Owens July 2000. */ #include /* * Simple wrappers reducing source bloat. Define all irq_stat fields * here, even ones that are arch dependent. That way we get common * definitions instead of differing sets for each arch. 
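 *
 * Example, as a sketch only (not part of the original header): checking a
 * CPU's pending-softirq word through the generic wrapper instead of
 * indexing irq_stat[] by hand:
 *
 *     if ( softirq_pending(cpu) )
 *         ... service the pending softirqs ...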
*/ extern irq_cpustat_t irq_stat[]; #define __IRQ_STAT(cpu, member) (irq_stat[cpu].member) /* arch independent irq_stat fields */ #define softirq_pending(cpu) __IRQ_STAT((cpu), __softirq_pending) #define local_irq_count(cpu) __IRQ_STAT((cpu), __local_irq_count) #define nmi_count(cpu) __IRQ_STAT((cpu), __nmi_count) #define mwait_wakeup(cpu) __IRQ_STAT((cpu), __mwait_wakeup) #endif /* __irq_cpustat_h */ xen-4.4.0/xen/include/xen/version.h0000664000175000017500000000073112307313555015311 0ustar smbsmb#ifndef __XEN_VERSION_H__ #define __XEN_VERSION_H__ const char *xen_compile_date(void); const char *xen_compile_time(void); const char *xen_compile_by(void); const char *xen_compile_domain(void); const char *xen_compile_host(void); const char *xen_compiler(void); unsigned int xen_major_version(void); unsigned int xen_minor_version(void); const char *xen_extra_version(void); const char *xen_changeset(void); const char *xen_banner(void); #endif /* __XEN_VERSION_H__ */ xen-4.4.0/xen/include/xen/video.h0000664000175000017500000000100512307313555014725 0ustar smbsmb/* * video.h * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. */ #ifndef _XEN_VIDEO_H #define _XEN_VIDEO_H #include #ifdef CONFIG_VIDEO void video_init(void); extern void (*video_puts)(const char *); void video_endboot(void); #else #define video_init() ((void)0) #define video_puts(s) ((void)0) #define video_endboot() ((void)0) #endif #endif /* _XEN_VIDEO_H */ xen-4.4.0/xen/include/xen/pci.h0000664000175000017500000001204312307313555014376 0ustar smbsmb/****************************************************************************** * pci.h * * PCI access functions. */ #ifndef __XEN_PCI_H__ #define __XEN_PCI_H__ #include #include #include #include #include #include #include /* * The PCI interface treats multi-function devices as independent * devices. The slot/function address of each device is encoded * in a single byte as follows: * * 15:8 = bus * 7:3 = slot * 2:0 = function */ #define PCI_BUS(bdf) (((bdf) >> 8) & 0xff) #define PCI_SLOT(bdf) (((bdf) >> 3) & 0x1f) #define PCI_FUNC(bdf) ((bdf) & 0x07) #define PCI_DEVFN(d,f) ((((d) & 0x1f) << 3) | ((f) & 0x07)) #define PCI_DEVFN2(bdf) ((bdf) & 0xff) #define PCI_BDF(b,d,f) ((((b) & 0xff) << 8) | PCI_DEVFN(d,f)) #define PCI_BDF2(b,df) ((((b) & 0xff) << 8) | ((df) & 0xff)) struct pci_dev_info { bool_t is_extfn; bool_t is_virtfn; struct { u8 bus; u8 devfn; } physfn; }; struct pci_dev { struct list_head alldevs_list; struct list_head domain_list; struct list_head msi_list; struct arch_msix *msix; struct domain *domain; const u16 seg; const u8 bus; const u8 devfn; u8 phantom_stride; enum pdev_type { DEV_TYPE_PCI_UNKNOWN, DEV_TYPE_PCIe_ENDPOINT, DEV_TYPE_PCIe_BRIDGE, // PCIe root port, switch DEV_TYPE_PCIe2PCI_BRIDGE, // PCIe-to-PCI/PCIx bridge DEV_TYPE_PCI2PCIe_BRIDGE, // PCI/PCIx-to-PCIe bridge DEV_TYPE_LEGACY_PCI_BRIDGE, // Legacy PCI bridge DEV_TYPE_PCI_HOST_BRIDGE, // PCI Host bridge DEV_TYPE_PCI, } type; struct pci_dev_info info; struct arch_pci_dev arch; struct { s_time_t time; unsigned int count; #define PT_FAULT_THRESHOLD 10 } fault; u64 vf_rlen[6]; }; #define for_each_pdev(domain, pdev) \ list_for_each_entry(pdev, &(domain->arch.pdev_list), domain_list) /* * The pcidevs_lock protect alldevs_list, and the assignment for the * devices, it also sync the access to the msi capability that is not * interrupt handling related (the mask bit register). 
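 *
 * Typical pattern, shown as an illustrative sketch only (error handling
 * omitted; seg/bus/devfn are assumed inputs for the example):
 *
 *     spin_lock(&pcidevs_lock);
 *     pdev = pci_get_pdev(seg, bus, devfn);
 *     ... operate on pdev while the lock is held ...
 *     spin_unlock(&pcidevs_lock);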
*/ extern spinlock_t pcidevs_lock; bool_t pci_known_segment(u16 seg); int pci_device_detect(u16 seg, u8 bus, u8 dev, u8 func); int scan_pci_devices(void); enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn); int find_upstream_bridge(u16 seg, u8 *bus, u8 *devfn, u8 *secbus); struct pci_dev *pci_lock_pdev(int seg, int bus, int devfn); struct pci_dev *pci_lock_domain_pdev( struct domain *, int seg, int bus, int devfn); void setup_dom0_pci_devices(struct domain *, int (*)(u8 devfn, struct pci_dev *)); void pci_release_devices(struct domain *d); int pci_add_segment(u16 seg); const unsigned long *pci_get_ro_map(u16 seg); int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *); int pci_remove_device(u16 seg, u8 bus, u8 devfn); int pci_ro_device(int seg, int bus, int devfn); void arch_pci_ro_device(int seg, int bdf); int pci_hide_device(int bus, int devfn); struct pci_dev *pci_get_pdev(int seg, int bus, int devfn); struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn); struct pci_dev *pci_get_pdev_by_domain( struct domain *, int seg, int bus, int devfn); void pci_check_disable_device(u16 seg, u8 bus, u8 devfn); uint8_t pci_conf_read8( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg); uint16_t pci_conf_read16( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg); uint32_t pci_conf_read32( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg); void pci_conf_write8( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, uint8_t data); void pci_conf_write16( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, uint16_t data); void pci_conf_write32( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func, unsigned int reg, uint32_t data); uint32_t pci_conf_read(uint32_t cf8, uint8_t offset, uint8_t bytes); void pci_conf_write(uint32_t cf8, uint8_t offset, uint8_t bytes, uint32_t data); int pci_mmcfg_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value); int pci_mmcfg_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value); int pci_find_cap_offset(u16 seg, u8 bus, u8 dev, u8 func, u8 cap); int pci_find_next_cap(u16 seg, u8 bus, unsigned int devfn, u8 pos, int cap); int pci_find_ext_capability(int seg, int bus, int devfn, int cap); const char *parse_pci(const char *, unsigned int *seg, unsigned int *bus, unsigned int *dev, unsigned int *func); struct pirq; int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable); void msixtbl_pt_unregister(struct domain *, struct pirq *); void msixtbl_pt_cleanup(struct domain *d); #endif /* __XEN_PCI_H__ */ xen-4.4.0/xen/include/xen/perfc_defn.h0000664000175000017500000000536712307313555015731 0ustar smbsmb/* This file is legitimately included multiple times. 
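   (Each inclusion is expected to happen with different definitions of the
   PERFCOUNTER/PERFCOUNTER_ARRAY macros in effect, so the single list below
   can expand to declarations, definitions or name strings as required;
   the usual X-macro pattern.)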
*/ /*#ifndef __XEN_PERFC_DEFN_H__*/ /*#define __XEN_PERFC_DEFN_H__*/ #include PERFCOUNTER_ARRAY(hypercalls, "hypercalls", NR_hypercalls) PERFCOUNTER(calls_to_multicall, "calls to multicall") PERFCOUNTER(calls_from_multicall, "calls from multicall") PERFCOUNTER(irqs, "#interrupts") PERFCOUNTER(ipis, "#IPIs") /* Generic scheduler counters (applicable to all schedulers) */ PERFCOUNTER(sched_irq, "sched: timer") PERFCOUNTER(sched_run, "sched: runs through scheduler") PERFCOUNTER(sched_ctx, "sched: context switches") PERFCOUNTER(schedule, "sched: specific scheduler") PERFCOUNTER(dom_init, "sched: dom_init") PERFCOUNTER(dom_destroy, "sched: dom_destroy") PERFCOUNTER(vcpu_init, "sched: vcpu_init") PERFCOUNTER(vcpu_destroy, "sched: vcpu_destroy") /* credit specific counters */ PERFCOUNTER(delay_ms, "csched: delay") PERFCOUNTER(vcpu_check, "csched: vcpu_check") PERFCOUNTER(acct_run, "csched: acct_run") PERFCOUNTER(acct_no_work, "csched: acct_no_work") PERFCOUNTER(acct_balance, "csched: acct_balance") PERFCOUNTER(acct_reorder, "csched: acct_reorder") PERFCOUNTER(acct_min_credit, "csched: acct_min_credit") PERFCOUNTER(acct_vcpu_active, "csched: acct_vcpu_active") PERFCOUNTER(acct_vcpu_idle, "csched: acct_vcpu_idle") PERFCOUNTER(vcpu_sleep, "csched: vcpu_sleep") PERFCOUNTER(vcpu_wake_running, "csched: vcpu_wake_running") PERFCOUNTER(vcpu_wake_onrunq, "csched: vcpu_wake_onrunq") PERFCOUNTER(vcpu_wake_runnable, "csched: vcpu_wake_runnable") PERFCOUNTER(vcpu_wake_not_runnable, "csched: vcpu_wake_not_runnable") PERFCOUNTER(vcpu_park, "csched: vcpu_park") PERFCOUNTER(vcpu_unpark, "csched: vcpu_unpark") PERFCOUNTER(tickle_idlers_none, "csched: tickle_idlers_none") PERFCOUNTER(tickle_idlers_some, "csched: tickle_idlers_some") PERFCOUNTER(load_balance_idle, "csched: load_balance_idle") PERFCOUNTER(load_balance_over, "csched: load_balance_over") PERFCOUNTER(load_balance_other, "csched: load_balance_other") PERFCOUNTER(steal_trylock_failed, "csched: steal_trylock_failed") PERFCOUNTER(steal_peer_idle, "csched: steal_peer_idle") PERFCOUNTER(migrate_queued, "csched: migrate_queued") PERFCOUNTER(migrate_running, "csched: migrate_running") PERFCOUNTER(migrate_kicked_away, "csched: migrate_kicked_away") PERFCOUNTER(vcpu_hot, "csched: vcpu_hot") PERFCOUNTER(need_flush_tlb_flush, "PG_need_flush tlb flushes") /*#endif*/ /* __XEN_PERFC_DEFN_H__ */ xen-4.4.0/xen/include/xen/multicall.h0000664000175000017500000000120612307313555015610 0ustar smbsmb/****************************************************************************** * multicall.h */ #ifndef __XEN_MULTICALL_H__ #define __XEN_MULTICALL_H__ #include #include #ifdef CONFIG_COMPAT #include #endif #define _MCSF_in_multicall 0 #define _MCSF_call_preempted 1 #define MCSF_in_multicall (1<<_MCSF_in_multicall) #define MCSF_call_preempted (1<<_MCSF_call_preempted) struct mc_state { unsigned long flags; union { struct multicall_entry call; #ifdef CONFIG_COMPAT struct compat_multicall_entry compat_call; #endif }; }; #endif /* __XEN_MULTICALL_H__ */ xen-4.4.0/xen/include/xen/grant_table.h0000664000175000017500000001131712307313555016110 0ustar smbsmb/****************************************************************************** * include/xen/grant_table.h * * Mechanism for granting foreign access to page frames, and receiving * page-ownership transfers. 
* * Copyright (c) 2004-2005 K A Fraser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef __XEN_GRANT_TABLE_H__ #define __XEN_GRANT_TABLE_H__ #include #include #include /* Count of writable host-CPU mappings. */ #define GNTPIN_hstw_shift (0) #define GNTPIN_hstw_inc (1 << GNTPIN_hstw_shift) #define GNTPIN_hstw_mask (0xFFU << GNTPIN_hstw_shift) /* Count of read-only host-CPU mappings. */ #define GNTPIN_hstr_shift (8) #define GNTPIN_hstr_inc (1 << GNTPIN_hstr_shift) #define GNTPIN_hstr_mask (0xFFU << GNTPIN_hstr_shift) /* Count of writable device-bus mappings. */ #define GNTPIN_devw_shift (16) #define GNTPIN_devw_inc (1 << GNTPIN_devw_shift) #define GNTPIN_devw_mask (0xFFU << GNTPIN_devw_shift) /* Count of read-only device-bus mappings. */ #define GNTPIN_devr_shift (24) #define GNTPIN_devr_inc (1 << GNTPIN_devr_shift) #define GNTPIN_devr_mask (0xFFU << GNTPIN_devr_shift) #ifndef DEFAULT_MAX_NR_GRANT_FRAMES /* to allow arch to override */ /* Default maximum size of a grant table. [POLICY] */ #define DEFAULT_MAX_NR_GRANT_FRAMES 32 #endif #ifndef max_nr_grant_frames /* to allow arch to override */ /* The maximum size of a grant table. */ extern unsigned int max_nr_grant_frames; #endif /* * Tracks a mapping of another domain's grant reference. Each domain has a * table of these, indexes into which are returned as a 'mapping handle'. */ struct grant_mapping { u32 ref; /* grant ref */ u16 flags; /* 0-4: GNTMAP_* ; 5-15: unused */ domid_t domid; /* granting domain */ }; /* Per-domain grant information. */ struct grant_table { /* Table size. Number of frames shared with guest */ unsigned int nr_grant_frames; /* Shared grant table (see include/public/grant_table.h). */ union { void **shared_raw; struct grant_entry_v1 **shared_v1; union grant_entry_v2 **shared_v2; }; /* Number of grant status frames shared with guest (for version 2) */ unsigned int nr_status_frames; /* State grant table (see include/public/grant_table.h). */ grant_status_t **status; /* Active grant table. */ struct active_grant_entry **active; /* Mapping tracking table. */ struct grant_mapping **maptrack; unsigned int maptrack_head; unsigned int maptrack_limit; /* Lock protecting updates to active and shared grant tables. */ spinlock_t lock; /* The defined versions are 1 and 2. Set to 0 if we don't know what version to use yet. */ unsigned gt_version; }; /* Create/destroy per-domain grant table context. */ int grant_table_create( struct domain *d); void grant_table_destroy( struct domain *d); /* Domain death release of granted mappings of other domains' memory. */ void gnttab_release_mappings( struct domain *d); /* Increase the size of a domain's grant table. * Caller must hold d's grant table lock. */ int gnttab_grow_table(struct domain *d, unsigned int req_nr_frames); /* Number of grant table frames. Caller must hold d's grant table lock. 
*/ static inline unsigned int nr_grant_frames(struct grant_table *gt) { return gt->nr_grant_frames; } /* Number of status grant table frames. Caller must hold d's gr. table lock.*/ static inline unsigned int nr_status_frames(struct grant_table *gt) { return gt->nr_status_frames; } #define GRANT_STATUS_PER_PAGE (PAGE_SIZE / sizeof(grant_status_t)) #define GRANT_PER_PAGE (PAGE_SIZE / sizeof(grant_entry_v2_t)) /* Number of grant table status entries. Caller must hold d's gr. table lock.*/ static inline unsigned int grant_to_status_frames(int grant_frames) { return (grant_frames * GRANT_PER_PAGE + GRANT_STATUS_PER_PAGE - 1) / GRANT_STATUS_PER_PAGE; } #endif /* __XEN_GRANT_TABLE_H__ */ xen-4.4.0/xen/include/xen/decompress.h0000664000175000017500000000304412307313555015770 0ustar smbsmb#ifndef __XEN_GENERIC_H #define __XEN_GENERIC_H typedef int decompress_fn(unsigned char *inbuf, unsigned int len, int (*fill)(void*, unsigned int), int (*flush)(void*, unsigned int), unsigned char *outbuf, unsigned int *posp, void (*error)(const char *x)); /* inbuf - input buffer * len - len of pre-read data in inbuf * fill - function to fill inbuf when empty * flush - function to write out outbuf * outbuf - output buffer * posp - if non-null, input position (number of bytes read) will be * returned here * error - error reporting function * * If len != 0, inbuf should contain all the necessary input data, and fill * should be NULL * If len = 0, inbuf can be NULL, in which case the decompressor will allocate * the input buffer. If inbuf != NULL it must be at least XXX_IOBUF_SIZE bytes. * fill will be called (repeatedly...) to read data, at most XXX_IOBUF_SIZE * bytes should be read per call. Replace XXX with the appropriate decompressor * name, i.e. LZMA_IOBUF_SIZE. * * If flush = NULL, outbuf must be large enough to buffer all the expected * output. If flush != NULL, the output buffer will be allocated by the * decompressor (outbuf = NULL), and the flush function will be called to * flush the output buffer at the appropriate time (decompressor and stream * dependent). */ decompress_fn bunzip2, unxz, unlzma, unlzo, unlz4; int decompress(void *inbuf, unsigned int len, void *outbuf); #endif xen-4.4.0/xen/include/xen/rangeset.h0000664000175000017500000000462412307313555015441 0ustar smbsmb/****************************************************************************** * rangeset.h * * Creation, maintenance and automatic destruction of per-domain sets of * numeric ranges. * * Copyright (c) 2005, K A Fraser */ #ifndef __XEN_RANGESET_H__ #define __XEN_RANGESET_H__ struct domain; struct rangeset; /* * Initialise/destroy per-domain rangeset information. * * It is invalid to create or destroy a rangeset belonging to a domain @d * before rangeset_domain_initialise(d) returns or after calling * rangeset_domain_destroy(d). */ void rangeset_domain_initialise( struct domain *d); void rangeset_domain_destroy( struct domain *d); /* * Create/destroy a rangeset. Optionally attach to specified domain @d for * auto-destruction when the domain dies. A name may be specified, for use * in debug pretty-printing, and various RANGESETF flags (defined below). * * It is invalid to perform any operation on a rangeset @r after calling * rangeset_destroy(r). */ struct rangeset *rangeset_new( struct domain *d, char *name, unsigned int flags); void rangeset_destroy( struct rangeset *r); /* Flags for passing to rangeset_new(). */ /* Pretty-print range limits in hexadecimal. 
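   For example (a sketch only; the name "I/O ports" and the domain pointer d
   are chosen purely for illustration), a rangeset whose ranges should be
   printed in hex can be created with:

       struct rangeset *r = rangeset_new(d, "I/O ports", RANGESETF_prettyprint_hex);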
*/ #define _RANGESETF_prettyprint_hex 0 #define RANGESETF_prettyprint_hex (1U << _RANGESETF_prettyprint_hex) int __must_check rangeset_is_empty( struct rangeset *r); /* Add/remove/query a numeric range. */ int __must_check rangeset_add_range( struct rangeset *r, unsigned long s, unsigned long e); int __must_check rangeset_remove_range( struct rangeset *r, unsigned long s, unsigned long e); int __must_check rangeset_contains_range( struct rangeset *r, unsigned long s, unsigned long e); int __must_check rangeset_overlaps_range( struct rangeset *r, unsigned long s, unsigned long e); int rangeset_report_ranges( struct rangeset *r, unsigned long s, unsigned long e, int (*cb)(unsigned long s, unsigned long e, void *), void *ctxt); /* Add/remove/query a single number. */ int __must_check rangeset_add_singleton( struct rangeset *r, unsigned long s); int __must_check rangeset_remove_singleton( struct rangeset *r, unsigned long s); int __must_check rangeset_contains_singleton( struct rangeset *r, unsigned long s); /* Rangeset pretty printing. */ void rangeset_printk( struct rangeset *r); void rangeset_domain_printk( struct domain *d); #endif /* __XEN_RANGESET_H__ */ xen-4.4.0/xen/include/xen/compat.h0000664000175000017500000002304412307313555015111 0ustar smbsmb/****************************************************************************** * compat.h */ #ifndef __XEN_COMPAT_H__ #define __XEN_COMPAT_H__ #ifdef CONFIG_COMPAT #include #include #include #define __DEFINE_COMPAT_HANDLE(name, type) \ typedef struct { \ compat_ptr_t c; \ type *_[0] __attribute__((__packed__)); \ } __compat_handle_ ## name #define DEFINE_COMPAT_HANDLE(name) \ __DEFINE_COMPAT_HANDLE(name, name); \ __DEFINE_COMPAT_HANDLE(const_ ## name, const name) #define COMPAT_HANDLE(name) __compat_handle_ ## name /* NB: it is assumed that if an arch uses the compat layer it does not * distinguish handles from parameter handles. */ #define COMPAT_HANDLE_PARAM(name) __compat_handle_ ## name /* Is the compat handle a NULL reference? */ #define compat_handle_is_null(hnd) ((hnd).c == 0) /* Offset the given compat handle into the array it refers to. */ #define compat_handle_add_offset(hnd, nr) \ ((hnd).c += (nr) * sizeof(**(hnd)._)) /* Cast a compat handle to the specified type of handle. */ #define compat_handle_cast(chnd, type) ({ \ type *_x = (__typeof__(**(chnd)._) *)(full_ptr_t)(chnd).c; \ (COMPAT_HANDLE(type)) { (full_ptr_t)_x }; \ }) #define guest_from_compat_handle(ghnd, chnd) \ set_xen_guest_handle(ghnd, \ (__typeof__(**(chnd)._) *)(full_ptr_t)(chnd).c) /* * Copy an array of objects to guest context via a compat handle, * specifying an offset into the guest array. */ #define copy_to_compat_offset(hnd, off, ptr, nr) ({ \ const typeof(*(ptr)) *_s = (ptr); \ char (*_d)[sizeof(*_s)] = (void *)(full_ptr_t)(hnd).c; \ ((void)((typeof(**(hnd)._) *)(full_ptr_t)(hnd).c == (ptr))); \ raw_copy_to_guest(_d + (off), _s, sizeof(*_s) * (nr)); \ }) /* * Copy an array of objects from guest context via a compat handle, * specifying an offset into the guest array. */ #define copy_from_compat_offset(ptr, hnd, off, nr) ({ \ const typeof(*(ptr)) *_s = (typeof(**(hnd)._) *)(full_ptr_t)(hnd).c; \ typeof(*(ptr)) *_d = (ptr); \ raw_copy_from_guest(_d, _s + (off), sizeof(*_d) * (nr)); \ }) #define copy_to_compat(hnd, ptr, nr) \ copy_to_compat_offset(hnd, 0, ptr, nr) #define copy_from_compat(ptr, hnd, nr) \ copy_from_compat_offset(ptr, hnd, 0, nr) /* Copy sub-field of a structure to guest context via a compat handle. 
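   For instance (an illustrative sketch; 'chnd' and the native object 'nat'
   with its 'status' field are assumed purely for the example):

       copy_field_to_compat(chnd, &nat, status);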
*/ #define copy_field_to_compat(hnd, ptr, field) ({ \ const typeof(&(ptr)->field) _s = &(ptr)->field; \ void *_d = &((typeof(**(hnd)._) *)(full_ptr_t)(hnd).c)->field; \ ((void)(&((typeof(**(hnd)._) *)(full_ptr_t)(hnd).c)->field == \ &(ptr)->field)); \ raw_copy_to_guest(_d, _s, sizeof(*_s)); \ }) /* Copy sub-field of a structure from guest context via a compat handle. */ #define copy_field_from_compat(ptr, hnd, field) ({ \ const typeof(&(ptr)->field) _s = \ &((typeof(**(hnd)._) *)(full_ptr_t)(hnd).c)->field; \ typeof(&(ptr)->field) _d = &(ptr)->field; \ raw_copy_from_guest(_d, _s, sizeof(*_d)); \ }) /* * Pre-validate a guest handle. * Allows use of faster __copy_* functions. */ #define compat_handle_okay(hnd, nr) \ (paging_mode_external(current->domain) || \ compat_array_access_ok((void *)(full_ptr_t)(hnd).c, (nr), \ sizeof(**(hnd)._))) #define __copy_to_compat_offset(hnd, off, ptr, nr) ({ \ const typeof(*(ptr)) *_s = (ptr); \ char (*_d)[sizeof(*_s)] = (void *)(full_ptr_t)(hnd).c; \ ((void)((typeof(**(hnd)._) *)(full_ptr_t)(hnd).c == (ptr))); \ __raw_copy_to_guest(_d + (off), _s, sizeof(*_s) * (nr)); \ }) #define __copy_from_compat_offset(ptr, hnd, off, nr) ({ \ const typeof(*(ptr)) *_s = (typeof(**(hnd)._) *)(full_ptr_t)(hnd).c; \ typeof(*(ptr)) *_d = (ptr); \ __raw_copy_from_guest(_d, _s + (off), sizeof(*_d) * (nr)); \ }) #define __copy_to_compat(hnd, ptr, nr) \ __copy_to_compat_offset(hnd, 0, ptr, nr) #define __copy_from_compat(ptr, hnd, nr) \ __copy_from_compat_offset(ptr, hnd, 0, nr) #define __copy_field_to_compat(hnd, ptr, field) ({ \ const typeof(&(ptr)->field) _s = &(ptr)->field; \ void *_d = &((typeof(**(hnd)._) *)(full_ptr_t)(hnd).c)->field; \ ((void)(&((typeof(**(hnd)._) *)(full_ptr_t)(hnd).c)->field == \ &(ptr)->field)); \ __raw_copy_to_guest(_d, _s, sizeof(*_s)); \ }) #define __copy_field_from_compat(ptr, hnd, field) ({ \ const typeof(&(ptr)->field) _s = \ &((typeof(**(hnd)._) *)(full_ptr_t)(hnd).c)->field; \ typeof(&(ptr)->field) _d = &(ptr)->field; \ __raw_copy_from_guest(_d, _s, sizeof(*_d)); \ }) #define CHECK_NAME(name, tag) __check ## tag ## name #define CHECK_NAME_(k, n, tag) __check ## tag ## k ## _ ## n #define CHECK_TYPE(name) \ static inline int CHECK_NAME(name, T)(xen_ ## name ## _t *x, \ compat_ ## name ## _t *c) \ { \ return x == c; \ } #define CHECK_TYPE_(k, n) \ static inline int CHECK_NAME_(k, n, T)(k xen_ ## n *x, \ k compat_ ## n *c) \ { \ return x == c; \ } #define CHECK_SIZE(name) \ typedef int CHECK_NAME(name, S)[1 - (sizeof(xen_ ## name ## _t) != \ sizeof(compat_ ## name ## _t)) * 2] #define CHECK_SIZE_(k, n) \ typedef int CHECK_NAME_(k, n, S)[1 - (sizeof(k xen_ ## n) != \ sizeof(k compat_ ## n)) * 2] #define CHECK_FIELD_COMMON(name, t, f) \ static inline int name(xen_ ## t ## _t *x, compat_ ## t ## _t *c) \ { \ BUILD_BUG_ON(offsetof(xen_ ## t ## _t, f) != \ offsetof(compat_ ## t ## _t, f)); \ return &x->f == &c->f; \ } #define CHECK_FIELD_COMMON_(k, name, n, f) \ static inline int name(k xen_ ## n *x, k compat_ ## n *c) \ { \ BUILD_BUG_ON(offsetof(k xen_ ## n, f) != \ offsetof(k compat_ ## n, f)); \ return &x->f == &c->f; \ } #define CHECK_FIELD(t, f) \ CHECK_FIELD_COMMON(CHECK_NAME(t ## __ ## f, F), t, f) #define CHECK_FIELD_(k, n, f) \ CHECK_FIELD_COMMON_(k, CHECK_NAME_(k, n ## __ ## f, F), n, f) #define CHECK_SUBFIELD_1(t, f1, f2) \ CHECK_FIELD_COMMON(CHECK_NAME(t ## __ ## f1 ## __ ## f2, F1), t, f1.f2) #define CHECK_SUBFIELD_1_(k, n, f1, f2) \ CHECK_FIELD_COMMON_(k, CHECK_NAME_(k, n ## __ ## f1 ## __ ## f2, F1), \ n, f1.f2) #define 
CHECK_SUBFIELD_2(t, f1, f2, f3) \ CHECK_FIELD_COMMON(CHECK_NAME(t ## __ ## f1 ## __ ## f2 ## __ ## f3, F2), \ t, f1.f2.f3) #define CHECK_SUBFIELD_2_(k, n, f1, f2, f3) \ CHECK_FIELD_COMMON_(k, CHECK_NAME_(k, n ## __ ## f1 ## __ ## f2 ## __ ## \ f3, F2), n, f1.f2.f3) /* * Translate a native continuation into a compat guest continuation. * * id: If non-NULL then points to an integer N between 0-5. Will be updated * with the value of the N'th argument to the hypercall. The N'th argument must * not be subject to translation (i.e. cannot be referenced by @mask below). * This option is useful for extracting the "op" argument or similar from the * hypercall to enable further xlat processing. * * mask: Specifies which of the hypercall arguments require compat translation. * bit 0 indicates that the 0'th argument requires translation, bit 1 indicates * that the first argument requires translation and so on. Native and compat * values for each translated argument are provided as @varargs (see below). * * varargs: For each bit which is set in @mask the varargs contain a native * value (unsigned long) and a compat value (unsigned int). If the native value * and compat value differ and the N'th argument is equal to the native value * then that argument is replaced by the compat value. If the native and compat * values are equal then no translation takes place. If the N'th argument does * not equal the native value then no translation takes place. * * Any untranslated argument (whether due to not being requested in @mask, * native and compat values being equal or N'th argument not equalling native * value) must be equal in both native and compat representations (i.e. the * native version cannot have any bits > 32 set) * * Return: Number of arguments which were actually translated. */ int hypercall_xlat_continuation(unsigned int *id, unsigned int mask, ...); /* In-place translation functons: */ struct start_info; void xlat_start_info(struct start_info *, enum XLAT_start_info_console); struct vcpu_runstate_info; void xlat_vcpu_runstate_info(struct vcpu_runstate_info *); int switch_compat(struct domain *); int switch_native(struct domain *); #else #define compat_handle_is_null(hnd) 0 #endif #endif /* __XEN_COMPAT_H__ */ xen-4.4.0/xen/include/xen/shared.h0000664000175000017500000000262412307313555015075 0ustar smbsmb#ifndef __XEN_SHARED_H__ #define __XEN_SHARED_H__ #ifdef CONFIG_COMPAT #include typedef union { struct shared_info native; struct compat_shared_info compat; } shared_info_t; /* * Compat field is never larger than native field, so cast to that as it * is the largest memory range it is safe for the caller to modify without * further discrimination between compat and native cases. */ #define __shared_info(d, s, field) \ (*(!has_32bit_shinfo(d) ? \ (typeof(&(s)->compat.field))&(s)->native.field : \ (typeof(&(s)->compat.field))&(s)->compat.field)) typedef union { struct vcpu_info native; struct compat_vcpu_info compat; } vcpu_info_t; /* As above, cast to compat field type. */ #define __vcpu_info(v, i, field) \ (*(!has_32bit_shinfo((v)->domain) ? 
\ (typeof(&(i)->compat.field))&(i)->native.field : \ (typeof(&(i)->compat.field))&(i)->compat.field)) #else typedef struct shared_info shared_info_t; #define __shared_info(d, s, field) ((s)->field) typedef struct vcpu_info vcpu_info_t; #define __vcpu_info(v, i, field) ((i)->field) #endif extern vcpu_info_t dummy_vcpu_info; #define shared_info(d, field) __shared_info(d, (d)->shared_info, field) #define vcpu_info(v, field) __vcpu_info(v, (v)->vcpu_info, field) #endif /* __XEN_SHARED_H__ */ xen-4.4.0/xen/include/xen/stop_machine.h0000664000175000017500000000141012307313555016270 0ustar smbsmb#ifndef __XEN_STOP_MACHINE_H__ #define __XEN_STOP_MACHINE_H__ /** * stop_machine_run: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr for the @fn() * @cpu: the cpu to run @fn() on (or all, if @cpu == NR_CPUS). * * Description: This causes every other cpu to enter a safe point, with * each of which disables interrupts, and finally interrupts are disabled * on the current CPU. The result is that none is holding a spinlock * or inside any other preempt-disabled region when @fn() runs. * * This can be thought of as a very heavy write lock, equivalent to * grabbing every spinlock in the kernel. */ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); #endif /* __XEN_STOP_MACHINE_H__ */ xen-4.4.0/xen/include/xen/vga.h0000664000175000017500000000055112307313555014401 0ustar smbsmb/* * vga.h * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. */ #ifndef _XEN_VGA_H #define _XEN_VGA_H #include #ifdef CONFIG_VGA extern struct xen_vga_console_info vga_console_info; #endif #endif /* _XEN_VGA_H */ xen-4.4.0/xen/include/xen/cpumask.h0000664000175000017500000003377412307313555015304 0ustar smbsmb#ifndef __XEN_CPUMASK_H #define __XEN_CPUMASK_H /* * Cpumasks provide a bitmap suitable for representing the * set of CPU's in a system, one bit position per CPU number. * * See detailed comments in the file xen/bitmap.h describing the * data type on which these cpumasks are based. * * For details of cpumask_scnprintf() and cpulist_scnprintf(), * see bitmap_scnprintf() and bitmap_scnlistprintf() in lib/bitmap.c. * * The available cpumask operations are: * * void cpumask_set_cpu(cpu, mask) turn on bit 'cpu' in mask * void cpumask_clear_cpu(cpu, mask) turn off bit 'cpu' in mask * void cpumask_setall(mask) set all bits * void cpumask_clear(mask) clear all bits * int cpumask_test_cpu(cpu, mask) true iff bit 'cpu' set in mask * int cpumask_test_and_set_cpu(cpu, mask) test and set bit 'cpu' in mask * int cpumask_test_and_clear_cpu(cpu, mask) test and clear bit 'cpu' in mask * * void cpumask_and(dst, src1, src2) dst = src1 & src2 [intersection] * void cpumask_or(dst, src1, src2) dst = src1 | src2 [union] * void cpumask_xor(dst, src1, src2) dst = src1 ^ src2 * void cpumask_andnot(dst, src1, src2) dst = src1 & ~src2 * void cpumask_complement(dst, src) dst = ~src * * int cpumask_equal(mask1, mask2) Does mask1 == mask2? * int cpumask_intersects(mask1, mask2) Do mask1 and mask2 intersect? * int cpumask_subset(mask1, mask2) Is mask1 a subset of mask2? * int cpumask_empty(mask) Is mask empty (no bits sets)? * int cpumask_full(mask) Is mask full (all bits sets)? 
* int cpumask_weight(mask) Hamming weigh - number of set bits * * void cpumask_shift_right(dst, src, n) Shift right * void cpumask_shift_left(dst, src, n) Shift left * * int cpumask_first(mask) Number lowest set bit, or NR_CPUS * int cpumask_next(cpu, mask) Next cpu past 'cpu', or NR_CPUS * int cpumask_last(mask) Number highest set bit, or NR_CPUS * int cpumask_any(mask) Any cpu in mask, or NR_CPUS * int cpumask_cycle(cpu, mask) Next cpu cycling from 'cpu', or NR_CPUS * * const cpumask_t *cpumask_of(cpu) Return cpumask with bit 'cpu' set * unsigned long *cpumask_bits(mask) Array of unsigned long's in mask * * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing * * for_each_cpu(cpu, mask) for-loop cpu over mask * * int num_online_cpus() Number of online CPUs * int num_possible_cpus() Number of all possible CPUs * int num_present_cpus() Number of present CPUs * * int cpu_online(cpu) Is some cpu online? * int cpu_possible(cpu) Is some cpu possible? * int cpu_present(cpu) Is some cpu present (can schedule)? * * int any_online_cpu(mask) First online cpu in mask, or NR_CPUS * * for_each_possible_cpu(cpu) for-loop cpu over cpu_possible_map * for_each_online_cpu(cpu) for-loop cpu over cpu_online_map * for_each_present_cpu(cpu) for-loop cpu over cpu_present_map * * Subtlety: * 1) The 'type-checked' form of cpumask_test_cpu() causes gcc (3.3.2, anyway) * to generate slightly worse code. Note for example the additional * 40 lines of assembly code compiling the "for each possible cpu" * loops buried in the disk_stat_read() macros calls when compiling * drivers/block/genhd.c (arch i386, CONFIG_SMP=y). So use a simple * one-line #define for cpumask_test_cpu(), instead of wrapping an inline * inside a macro, the way we do the other calls. */ #include #include #include typedef struct cpumask{ DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; extern unsigned int nr_cpu_ids; #if NR_CPUS > 4 * BITS_PER_LONG /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. */ extern unsigned int nr_cpumask_bits; #else # define nr_cpumask_bits (BITS_TO_LONGS(NR_CPUS) * BITS_PER_LONG) #endif /* verify cpu argument to cpumask_* operators */ static inline unsigned int cpumask_check(unsigned int cpu) { ASSERT(cpu < nr_cpu_ids); return cpu; } static inline void cpumask_set_cpu(int cpu, volatile cpumask_t *dstp) { set_bit(cpumask_check(cpu), dstp->bits); } static inline void cpumask_clear_cpu(int cpu, volatile cpumask_t *dstp) { clear_bit(cpumask_check(cpu), dstp->bits); } static inline void cpumask_setall(cpumask_t *dstp) { bitmap_fill(dstp->bits, nr_cpumask_bits); } static inline void cpumask_clear(cpumask_t *dstp) { bitmap_zero(dstp->bits, nr_cpumask_bits); } /* No static inline type checking - see Subtlety (1) above. 
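   Usage is the same as for the inline helpers, e.g. (sketch only; the
   system-wide cpu_online_map is assumed to be in scope):

       if ( cpumask_test_cpu(cpu, &cpu_online_map) )
           ... cpu is currently online ...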
*/ #define cpumask_test_cpu(cpu, cpumask) \ test_bit(cpumask_check(cpu), (cpumask)->bits) static inline int cpumask_test_and_set_cpu(int cpu, cpumask_t *addr) { return test_and_set_bit(cpumask_check(cpu), addr->bits); } static inline int cpumask_test_and_clear_cpu(int cpu, cpumask_t *addr) { return test_and_clear_bit(cpumask_check(cpu), addr->bits); } static inline void cpumask_and(cpumask_t *dstp, const cpumask_t *src1p, const cpumask_t *src2p) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nr_cpumask_bits); } static inline void cpumask_or(cpumask_t *dstp, const cpumask_t *src1p, const cpumask_t *src2p) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nr_cpumask_bits); } static inline void cpumask_xor(cpumask_t *dstp, const cpumask_t *src1p, const cpumask_t *src2p) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nr_cpumask_bits); } static inline void cpumask_andnot(cpumask_t *dstp, const cpumask_t *src1p, const cpumask_t *src2p) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nr_cpumask_bits); } static inline void cpumask_complement(cpumask_t *dstp, const cpumask_t *srcp) { bitmap_complement(dstp->bits, srcp->bits, nr_cpumask_bits); } static inline int cpumask_equal(const cpumask_t *src1p, const cpumask_t *src2p) { return bitmap_equal(src1p->bits, src2p->bits, nr_cpu_ids); } static inline int cpumask_intersects(const cpumask_t *src1p, const cpumask_t *src2p) { return bitmap_intersects(src1p->bits, src2p->bits, nr_cpu_ids); } static inline int cpumask_subset(const cpumask_t *src1p, const cpumask_t *src2p) { return bitmap_subset(src1p->bits, src2p->bits, nr_cpu_ids); } static inline int cpumask_empty(const cpumask_t *srcp) { return bitmap_empty(srcp->bits, nr_cpu_ids); } static inline int cpumask_full(const cpumask_t *srcp) { return bitmap_full(srcp->bits, nr_cpu_ids); } static inline int cpumask_weight(const cpumask_t *srcp) { return bitmap_weight(srcp->bits, nr_cpu_ids); } static inline void cpumask_copy(cpumask_t *dstp, const cpumask_t *srcp) { bitmap_copy(dstp->bits, srcp->bits, nr_cpumask_bits); } static inline void cpumask_shift_right(cpumask_t *dstp, const cpumask_t *srcp, int n) { bitmap_shift_right(dstp->bits, srcp->bits, n, nr_cpumask_bits); } static inline void cpumask_shift_left(cpumask_t *dstp, const cpumask_t *srcp, int n) { bitmap_shift_left(dstp->bits, srcp->bits, n, nr_cpumask_bits); } static inline int cpumask_first(const cpumask_t *srcp) { return min_t(int, nr_cpu_ids, find_first_bit(srcp->bits, nr_cpu_ids)); } static inline int cpumask_next(int n, const cpumask_t *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return min_t(int, nr_cpu_ids, find_next_bit(srcp->bits, nr_cpu_ids, n + 1)); } static inline int cpumask_last(const cpumask_t *srcp) { int cpu, pcpu = nr_cpu_ids; for (cpu = cpumask_first(srcp); cpu < nr_cpu_ids; cpu = cpumask_next(cpu, srcp)) pcpu = cpu; return pcpu; } static inline int cpumask_cycle(int n, const cpumask_t *srcp) { int nxt = cpumask_next(n, srcp); if (nxt == nr_cpu_ids) nxt = cpumask_first(srcp); return nxt; } static inline unsigned int cpumask_any(const cpumask_t *srcp) { unsigned int cpu = cpumask_first(srcp); unsigned int w = cpumask_weight(srcp); if ( w > 1 && cpu < nr_cpu_ids ) for ( w = get_random() % w; w--; ) { unsigned int next = cpumask_next(cpu, srcp); if ( next >= nr_cpu_ids ) break; cpu = next; } return cpu; } /* * Special-case data structure for "single bit set only" constant CPU masks. 
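 *
 * For example (illustrative), cpumask_of(3) yields a read-only mask with
 * only bit 3 set, so cpumask_test_cpu(3, cpumask_of(3)) is true and
 * cpumask_weight(cpumask_of(3)) == 1.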
* * We pre-generate all the 64 (or 32) possible bit positions, with enough * padding to the left and the right, and return the constant pointer * appropriately offset. */ extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; static inline const cpumask_t *cpumask_of(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; return (const cpumask_t *)(p - cpu / BITS_PER_LONG); } #define cpumask_bits(maskp) ((maskp)->bits) static inline int cpumask_scnprintf(char *buf, int len, const cpumask_t *srcp) { return bitmap_scnprintf(buf, len, srcp->bits, nr_cpu_ids); } static inline int cpulist_scnprintf(char *buf, int len, const cpumask_t *srcp) { return bitmap_scnlistprintf(buf, len, srcp->bits, nr_cpu_ids); } /* * cpumask_var_t: struct cpumask for stack usage. * * Oh, the wicked games we play! In order to make kernel coding a * little more difficult, we typedef cpumask_var_t to an array or a * pointer: doing &mask on an array is a noop, so it still works. * * ie. * cpumask_var_t tmpmask; * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) * return -ENOMEM; * * ... use 'tmpmask' like a normal struct cpumask * ... * * free_cpumask_var(tmpmask); */ #if NR_CPUS > 2 * BITS_PER_LONG #include typedef cpumask_t *cpumask_var_t; static inline bool_t alloc_cpumask_var(cpumask_var_t *mask) { *(void **)mask = _xmalloc(nr_cpumask_bits / 8, sizeof(long)); return *mask != NULL; } static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask) { *(void **)mask = _xzalloc(nr_cpumask_bits / 8, sizeof(long)); return *mask != NULL; } static inline void free_cpumask_var(cpumask_var_t mask) { xfree(mask); } #else typedef cpumask_t cpumask_var_t[1]; static inline bool_t alloc_cpumask_var(cpumask_var_t *mask) { return 1; } static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask) { cpumask_clear(*mask); return 1; } static inline void free_cpumask_var(cpumask_var_t mask) { } #endif #if NR_CPUS > 1 #define for_each_cpu(cpu, mask) \ for ((cpu) = cpumask_first(mask); \ (cpu) < nr_cpu_ids; \ (cpu) = cpumask_next(cpu, mask)) #else /* NR_CPUS == 1 */ #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)(mask)) #endif /* NR_CPUS */ /* * The following particular system cpumasks and operations manage * possible, present and online cpus. Each of them is a fixed size * bitmap of size NR_CPUS. * * #ifdef CONFIG_HOTPLUG_CPU * cpu_possible_map - has bit 'cpu' set iff cpu is populatable * cpu_present_map - has bit 'cpu' set iff cpu is populated * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler * #else * cpu_possible_map - has bit 'cpu' set iff cpu is populated * cpu_present_map - copy of cpu_possible_map * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler * #endif * * In either case, NR_CPUS is fixed at compile time, as the static * size of these bitmaps. The cpu_possible_map is fixed at boot * time, as the set of CPU id's that it is possible might ever * be plugged in at anytime during the life of that system boot. * The cpu_present_map is dynamic(*), representing which CPUs * are currently plugged in. And cpu_online_map is the dynamic * subset of cpu_present_map, indicating those CPUs available * for scheduling. * * If HOTPLUG is enabled, then cpu_possible_map is forced to have * all NR_CPUS bits set, otherwise it is just the set of CPUs that * ACPI reports present at boot. 
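 *
 * A typical consumer only needs the iterators and predicates defined
 * further down, e.g. (illustrative; 'cpu' and 'n' are hypothetical):
 *
 *     unsigned int cpu, n = 0;
 *
 *     for_each_online_cpu ( cpu )
 *         n++;
 *     ASSERT(n == num_online_cpus());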
* * If HOTPLUG is enabled, then cpu_present_map varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_map is just a copy of cpu_possible_map. * * (*) Well, cpu_present_map is dynamic in the hotplug case. If not * hotplug, it's a copy of cpu_possible_map, hence fixed at boot. * * Subtleties: * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP * cpu_{online,possible,present}_maps are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. * 2) Most SMP arch's #define some of these maps to be some * other map specific to that arch. Therefore, the following * must be #define macros, not inlines. To see why, examine * the assembly code produced by the following. Note that * set1() writes phys_x_map, but set2() writes x_map: * int x_map, phys_x_map; * #define set1(a) x_map = a * inline void set2(int a) { x_map = a; } * #define x_map phys_x_map * main(){ set1(3); set2(5); } */ extern cpumask_t cpu_possible_map; extern cpumask_t cpu_online_map; extern cpumask_t cpu_present_map; #if NR_CPUS > 1 #define num_online_cpus() cpumask_weight(&cpu_online_map) #define num_possible_cpus() cpumask_weight(&cpu_possible_map) #define num_present_cpus() cpumask_weight(&cpu_present_map) #define cpu_online(cpu) cpumask_test_cpu(cpu, &cpu_online_map) #define cpu_possible(cpu) cpumask_test_cpu(cpu, &cpu_possible_map) #define cpu_present(cpu) cpumask_test_cpu(cpu, &cpu_present_map) #else #define num_online_cpus() 1 #define num_possible_cpus() 1 #define num_present_cpus() 1 #define cpu_online(cpu) ((cpu) == 0) #define cpu_possible(cpu) ((cpu) == 0) #define cpu_present(cpu) ((cpu) == 0) #endif #define for_each_possible_cpu(cpu) for_each_cpu(cpu, &cpu_possible_map) #define for_each_online_cpu(cpu) for_each_cpu(cpu, &cpu_online_map) #define for_each_present_cpu(cpu) for_each_cpu(cpu, &cpu_present_map) /* Copy to/from cpumap provided by control tools. 
*/ struct xenctl_bitmap; int cpumask_to_xenctl_bitmap(struct xenctl_bitmap *, const cpumask_t *); int xenctl_bitmap_to_cpumask(cpumask_var_t *, const struct xenctl_bitmap *); #endif /* __XEN_CPUMASK_H */ xen-4.4.0/xen/include/xen/pmstat.h0000664000175000017500000000115012307313555015130 0ustar smbsmb#ifndef __XEN_PMSTAT_H_ #define __XEN_PMSTAT_H_ #include #include /* for struct xen_processor_power */ #include /* for struct pm_cx_stat */ int set_px_pminfo(uint32_t cpu, struct xen_processor_performance *perf); long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power); uint32_t pmstat_get_cx_nr(uint32_t cpuid); int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat); int pmstat_reset_cx_stat(uint32_t cpuid); int do_get_pm_info(struct xen_sysctl_get_pmstat *op); int do_pm_op(struct xen_sysctl_pm_op *op); #endif /* __XEN_PMSTAT_H_ */ xen-4.4.0/xen/include/xen/random.h0000664000175000017500000000016112307313555015101 0ustar smbsmb#ifndef __XEN_RANDOM_H__ #define __XEN_RANDOM_H__ unsigned int get_random(void); #endif /* __XEN_RANDOM_H__ */ xen-4.4.0/xen/include/xen/earlycpio.h0000664000175000017500000000036412307313555015615 0ustar smbsmb#ifndef _EARLYCPIO_H #define _EARLYCPIO_H #define MAX_CPIO_FILE_NAME 18 struct cpio_data { void *data; size_t size; }; struct cpio_data find_cpio_data(const char *path, void *data, size_t len, long *offset); #endif /* _EARLYCPIO_H */ xen-4.4.0/xen/include/xen/xencomm.h0000664000175000017500000001532512307313555015277 0ustar smbsmb/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright (C) IBM Corp. 2006 * * Authors: Hollis Blanchard */ #ifndef __XENCOMM_H__ #define __XENCOMM_H__ #include unsigned long xencomm_copy_to_guest( void *to, const void *from, unsigned int len, unsigned int skip); unsigned long xencomm_copy_from_guest( void *to, const void *from, unsigned int len, unsigned int skip); unsigned long xencomm_clear_guest( void *to, unsigned int n, unsigned int skip); int xencomm_add_offset(void **handle, unsigned int bytes); int xencomm_handle_is_null(void *ptr); static inline int xencomm_is_inline(const void *handle) { unsigned long addr = (unsigned long)handle; return (addr & XENCOMM_INLINE_FLAG) == XENCOMM_INLINE_FLAG; } static inline unsigned long xencomm_inline_addr(const void *handle) { return (unsigned long)handle & ~XENCOMM_INLINE_FLAG; } #define raw_copy_to_guest(dst, src, len) \ xencomm_copy_to_guest(dst, src, len, 0) #define raw_copy_from_guest(dst, src, len) \ xencomm_copy_from_guest(dst, src, nr, 0) #define raw_clear_guest(dst, len) \ xencomm_clear_guest(dst, len, 0) #define __raw_copy_to_guest raw_copy_to_guest #define __raw_copy_from_guest raw_copy_from_guest #define __raw_clear_guest raw_clear_guest /* Is the guest handle a NULL reference? 
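 *
 * Hypercall handlers typically use this for an early argument check,
 * e.g. (illustrative, not taken from this file):
 *
 *     if ( guest_handle_is_null(arg) )
 *         return -EINVAL;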
*/ #define guest_handle_is_null(hnd) \ ((hnd).p == NULL || xencomm_handle_is_null((hnd).p)) /* Offset the given guest handle into the array it refers to. */ #define guest_handle_add_offset(hnd, nr) ({ \ const typeof((hnd).p) _ptr; \ xencomm_add_offset((void **)&((hnd).p), nr * sizeof(*_ptr)); \ }) /* Cast a guest handle to the specified type of handle. */ #define guest_handle_cast(hnd, type) ({ \ type *_x = (hnd).p; \ XEN_GUEST_HANDLE_PARAM(type) _y; \ set_xen_guest_handle(_y, _x); \ _y; \ }) /* Cast a XEN_GUEST_HANDLE to XEN_GUEST_HANDLE_PARAM */ #define guest_handle_to_param(hnd, type) ({ \ /* type checking: make sure that the pointers inside \ * XEN_GUEST_HANDLE and XEN_GUEST_HANDLE_PARAM are of \ * the same type, then return hnd */ \ (void)((typeof(&(hnd).p)) 0 == \ (typeof(&((XEN_GUEST_HANDLE_PARAM(type)) {}).p)) 0); \ (hnd); \ }) /* Cast a XEN_GUEST_HANDLE_PARAM to XEN_GUEST_HANDLE */ #define guest_handle_from_param(hnd, type) ({ \ /* type checking: make sure that the pointers inside \ * XEN_GUEST_HANDLE and XEN_GUEST_HANDLE_PARAM are of \ * the same type, then return hnd */ \ (void)((typeof(&(hnd).p)) 0 == \ (typeof(&((XEN_GUEST_HANDLE_PARAM(type)) {}).p)) 0); \ (hnd); \ }) /* Since we run in real mode, we can safely access all addresses. That also * means our __routines are identical to our "normal" routines. */ #define guest_handle_okay(hnd, nr) 1 #define guest_handle_subrange_okay(hnd, first, last) 1 /* * Copy an array of objects to guest context via a guest handle. * Optionally specify an offset into the guest array. */ #define copy_to_guest_offset(hnd, idx, ptr, nr) \ __copy_to_guest_offset(hnd, idx, ptr, nr) /* Copy sub-field of a structure to guest context via a guest handle. */ #define copy_field_to_guest(hnd, ptr, field) \ __copy_field_to_guest(hnd, ptr, field) /* * Copy an array of objects from guest context via a guest handle. * Optionally specify an offset into the guest array. */ #define copy_from_guest_offset(ptr, hnd, idx, nr) \ __copy_from_guest_offset(ptr, hnd, idx, nr) /* * Clear an array of objects in guest context via a guest handle. * Optionally specify an offset into the guest array. */ #define clear_guest_offset(hnd, idx, nr) \ __clear_guest_offset(hnd, idx, nr) /* Copy sub-field of a structure from guest context via a guest handle. 
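 *
 * For example (illustrative; the structure and handle names are
 * hypothetical): given a local 'struct xen_foo nat' and a guest handle
 * 'u_foo' of the same type, copy_field_from_guest(&nat, u_foo, flags)
 * fetches only the 'flags' member instead of the whole structure.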
*/ #define copy_field_from_guest(ptr, hnd, field) \ __copy_field_from_guest(ptr, hnd, field) #define __copy_to_guest_offset(hnd, idx, ptr, nr) ({ \ const typeof(*(ptr)) *_s = (ptr); \ void *_d = (hnd).p; \ ((void)((hnd).p == (ptr))); \ xencomm_copy_to_guest(_d, _s, sizeof(*_s)*(nr), sizeof(*_s)*(idx)); \ }) #define __copy_field_to_guest(hnd, ptr, field) ({ \ unsigned int _off = offsetof(typeof(*(hnd).p), field); \ const typeof(&(ptr)->field) _s = &(ptr)->field; \ void *_d = (hnd).p; \ ((void)(&(hnd).p->field == &(ptr)->field)); \ xencomm_copy_to_guest(_d, _s, sizeof(*_s), _off); \ }) #define __copy_from_guest_offset(ptr, hnd, idx, nr) ({ \ const typeof(*(ptr)) *_s = (hnd).p; \ typeof(*(ptr)) *_d = (ptr); \ xencomm_copy_from_guest(_d, _s, sizeof(*_d)*(nr), sizeof(*_d)*(idx)); \ }) #define __copy_field_from_guest(ptr, hnd, field) ({ \ unsigned int _off = offsetof(typeof(*(hnd).p), field); \ const void *_s = (hnd).p; \ typeof(&(ptr)->field) _d = &(ptr)->field; \ ((void)(&(hnd).p->field == &(ptr)->field)); \ xencomm_copy_from_guest(_d, _s, sizeof(*_d), _off); \ }) #define __clear_guest_offset(hnd, idx, nr) ({ \ void *_d = (hnd).p; \ xencomm_clear_guest(_d, nr, idx); \ }) #ifdef CONFIG_XENCOMM_MARK_DIRTY extern void xencomm_mark_dirty(unsigned long addr, unsigned int len); #else static inline void xencomm_mark_dirty(unsigned long addr, unsigned int len) { } #endif #endif /* __XENCOMM_H__ */ xen-4.4.0/xen/include/xen/efi.h0000664000175000017500000000211212307313555014362 0ustar smbsmb#ifndef __XEN_EFI_H__ #define __XEN_EFI_H__ #ifndef __ASSEMBLY__ #include #endif extern const bool_t efi_enabled; #define EFI_INVALID_TABLE_ADDR (~0UL) /* Add fields here only if they need to be referenced from non-EFI code. */ struct efi { unsigned long mps; /* MPS table */ unsigned long acpi; /* ACPI table (IA64 ext 0.71) */ unsigned long acpi20; /* ACPI table (ACPI 2.0) */ unsigned long smbios; /* SM BIOS table */ }; extern struct efi efi; #ifndef __ASSEMBLY__ union xenpf_efi_info; union compat_pf_efi_info; struct xenpf_efi_runtime_call; struct compat_pf_efi_runtime_call; void efi_init_memory(void); unsigned long efi_get_time(void); void efi_halt_system(void); void efi_reset_system(bool_t warm); #ifndef COMPAT int efi_get_info(uint32_t idx, union xenpf_efi_info *); int efi_runtime_call(struct xenpf_efi_runtime_call *); #endif int efi_compat_get_info(uint32_t idx, union compat_pf_efi_info *); int efi_compat_runtime_call(struct compat_pf_efi_runtime_call *); #endif /* !__ASSEMBLY__ */ #endif /* __XEN_EFI_H__ */ xen-4.4.0/xen/include/xen/lib.h0000664000175000017500000001116412307313555014374 0ustar smbsmb#ifndef __LIB_H__ #define __LIB_H__ #include #include #include #include #include #include void __bug(char *file, int line) __attribute__((noreturn)); void __warn(char *file, int line); #define BUG_ON(p) do { if (unlikely(p)) BUG(); } while (0) #define WARN_ON(p) do { if (unlikely(p)) WARN(); } while (0) #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) /* Force a compilation error if condition is true */ #define BUILD_BUG_ON(cond) ({ _Static_assert(!(cond), "!(" #cond ")"); }) /* Force a compilation error if condition is true, but also produce a result (of value 0 and type size_t), so the expression can be used e.g. in a structure initializer (or where-ever else comma expressions aren't permitted). 
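 *
 * For instance (an illustrative use, not taken from this tree), a macro
 * can piggy-back a compile-time check onto a value it already computes:
 * sizeof(x) + BUILD_BUG_ON_ZERO(sizeof(x) > 8) still evaluates to
 * sizeof(x), but refuses to compile should x ever grow beyond 8 bytes.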
*/ #define BUILD_BUG_ON_ZERO(cond) \ sizeof(struct { _Static_assert(!(cond), "!(" #cond ")"); }) #else #define BUILD_BUG_ON_ZERO(cond) sizeof(struct { int:-!!(cond); }) #define BUILD_BUG_ON(cond) ((void)BUILD_BUG_ON_ZERO(cond)) #endif #ifndef assert_failed #define assert_failed(p) \ do { \ printk("Assertion '%s' failed, line %d, file %s\n", p , \ __LINE__, __FILE__); \ BUG(); \ } while (0) #endif #ifndef NDEBUG #define ASSERT(p) \ do { if ( unlikely(!(p)) ) assert_failed(#p); } while (0) #define debug_build() 1 #else #define ASSERT(p) do { if ( 0 && (p) ); } while (0) #define debug_build() 0 #endif #define ABS(_x) ({ \ typeof(_x) __x = (_x); \ (__x < 0) ? -__x : __x; \ }) #define SWAP(_a, _b) \ do { typeof(_a) _t = (_a); (_a) = (_b); (_b) = _t; } while ( 0 ) #define DIV_ROUND(n, d) (((n) + (d) / 2) / (d)) #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]) + __must_be_array(x)) #define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m))) #define MASK_INSR(v, m) (((v) * ((m) & -(m))) & (m)) #define ROUNDUP(x, a) (((x) + (a) - 1) & ~((a) - 1)) #define reserve_bootmem(_p,_l) ((void)0) struct domain; void cmdline_parse(const char *cmdline); int parse_bool(const char *s); /*#define DEBUG_TRACE_DUMP*/ #ifdef DEBUG_TRACE_DUMP extern void debugtrace_dump(void); extern void debugtrace_printk(const char *fmt, ...); #else #define debugtrace_dump() ((void)0) #define debugtrace_printk(_f, ...) ((void)0) #endif /* Allows us to use '%p' as general-purpose machine-word format char. */ #define _p(_x) ((void *)(unsigned long)(_x)) extern void printk(const char *format, ...) __attribute__ ((format (printf, 1, 2))); extern void guest_printk(const struct domain *d, const char *format, ...) __attribute__ ((format (printf, 2, 3))); extern void panic(const char *format, ...) __attribute__ ((format (printf, 1, 2))); extern long vm_assist(struct domain *, unsigned int, unsigned int); extern int __printk_ratelimit(int ratelimit_ms, int ratelimit_burst); extern int printk_ratelimit(void); /* vsprintf.c */ #define sprintf __xen_has_no_sprintf__ #define vsprintf __xen_has_no_vsprintf__ extern int snprintf(char * buf, size_t size, const char * fmt, ...) __attribute__ ((format (printf, 3, 4))); extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) __attribute__ ((format (printf, 3, 0))); extern int scnprintf(char * buf, size_t size, const char * fmt, ...) 
__attribute__ ((format (printf, 3, 4))); extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) __attribute__ ((format (printf, 3, 0))); long simple_strtol( const char *cp,const char **endp, unsigned int base); unsigned long simple_strtoul( const char *cp,const char **endp, unsigned int base); long long simple_strtoll( const char *cp,const char **endp, unsigned int base); unsigned long long simple_strtoull( const char *cp,const char **endp, unsigned int base); unsigned long long parse_size_and_unit(const char *s, const char **ps); uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c); #define TAINT_UNSAFE_SMP (1<<0) #define TAINT_MACHINE_CHECK (1<<1) #define TAINT_BAD_PAGE (1<<2) #define TAINT_SYNC_CONSOLE (1<<3) #define TAINT_ERROR_INJECT (1<<4) extern int tainted; #define TAINT_STRING_MAX_LEN 20 extern char *print_tainted(char *str); extern void add_taint(unsigned); struct cpu_user_regs; void dump_execstate(struct cpu_user_regs *); void init_constructors(void); #endif /* __LIB_H__ */ xen-4.4.0/xen/include/xen/libelf.h0000664000175000017500000004135212307313555015065 0ustar smbsmb/****************************************************************************** * libelf.h * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_LIBELF_H__ #define __XEN_LIBELF_H__ #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) #define XEN_ELF_LITTLE_ENDIAN #else #error define architectural endianness #endif typedef int elf_errorstatus; /* 0: ok; -ve (normally -1): error */ typedef int elf_negerrnoval; /* 0: ok; -EFOO: error */ #undef ELFSIZE #include "elfstructs.h" #ifdef __XEN__ #include #include #include #include #else #include #include #include #include #include struct elf_binary; typedef void elf_log_callback(struct elf_binary*, void *caller_data, bool iserr, const char *fmt, va_list al); #endif #define ELF_MAX_STRING_LENGTH 4096 #define ELF_MAX_TOTAL_NOTE_COUNT 65536 /* ------------------------------------------------------------------------ */ /* Macros for accessing the input image and output area. */ /* * We abstract away the pointerness of these pointers, replacing * various void*, char* and struct* with the following: * elf_ptrval A pointer to a byte; one can do pointer arithmetic * on this. * HANDLE A pointer to a struct. There is one of these types * for each pointer type - that is, for each "structname". * In the arguments to the various HANDLE macros, structname * must be a single identifier which is a typedef. 
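 *               (Illustrative example: a caller obtains an
 *               ELF_HANDLE_DECL(elf_shdr) value from elf_shdr_by_index()
 *               and reads its fields via elf_uval(elf, shdr, sh_type)
 *               rather than dereferencing anything directly.)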
* It is not permitted to do arithmetic on these * pointers. In the current code attempts to do so will * compile, but in the next patch this will become a * compile error. */ typedef uintptr_t elf_ptrval; #define ELF_REALPTR2PTRVAL(realpointer) ((elf_ptrval)(realpointer)) /* Converts an actual C pointer into a PTRVAL */ #define ELF_HANDLE_DECL(structname) structname##_handle /* Provides a type declaration for a HANDLE. */ #ifdef __XEN__ # define ELF_PRPTRVAL "lx" /* * PRIxPTR is misdefined in xen/include/xen/inttypes.h, on 32-bit, * to "x", when in fact uintptr_t is an unsigned long. */ #else # define ELF_PRPTRVAL PRIxPTR #endif /* printf format a la PRId... for a PTRVAL */ #define ELF_DEFINE_HANDLE(structname) \ typedef union { \ elf_ptrval ptrval; \ const structname *typeonly; /* for sizeof, offsetof, &c only */ \ } structname##_handle; /* * This must be invoked for each HANDLE type to define * the actual C type used for that kind of HANDLE. */ #define ELF_MAKE_HANDLE(structname, ptrval) ((structname##_handle){ ptrval }) /* Converts a PTRVAL to a HANDLE */ #define ELF_IMAGE_BASE(elf) ((elf_ptrval)(elf)->image_base) /* Returns the base of the image as a PTRVAL. */ #define ELF_HANDLE_PTRVAL(handleval) ((handleval).ptrval) /* Converts a HANDLE to a PTRVAL. */ #define ELF_UNSAFE_PTR(ptrval) ((void*)(elf_ptrval)(ptrval)) /* * Turns a PTRVAL into an actual C pointer. Before this is done * the caller must have ensured that the PTRVAL does in fact point * to a permissible location. */ /* PTRVALs can be INVALID (ie, NULL). */ #define ELF_INVALID_PTRVAL ((elf_ptrval)0) /* returns NULL PTRVAL */ #define ELF_INVALID_HANDLE(structname) /* returns NULL handle */ \ ELF_MAKE_HANDLE(structname, ELF_INVALID_PTRVAL) #define ELF_PTRVAL_VALID(ptrval) (!!(ptrval)) /* } */ #define ELF_HANDLE_VALID(handleval) (!!(handleval).ptrval) /* } predicates */ #define ELF_PTRVAL_INVALID(ptrval) (!ELF_PTRVAL_VALID((ptrval))) /* } */ #define ELF_MAX_PTRVAL (~(elf_ptrval)0) /* PTRVAL value guaranteed to compare > to any valid PTRVAL */ /* For internal use by other macros here */ #define ELF__HANDLE_FIELD_TYPE(handleval, elm) \ typeof((handleval).typeonly->elm) #define ELF__HANDLE_FIELD_OFFSET(handleval, elm) \ offsetof(typeof(*(handleval).typeonly),elm) /* ------------------------------------------------------------------------ */ typedef union { Elf32_Ehdr e32; Elf64_Ehdr e64; } elf_ehdr; typedef union { Elf32_Phdr e32; Elf64_Phdr e64; } elf_phdr; typedef union { Elf32_Shdr e32; Elf64_Shdr e64; } elf_shdr; typedef union { Elf32_Sym e32; Elf64_Sym e64; } elf_sym; typedef union { Elf32_Rel e32; Elf64_Rel e64; } elf_rel; typedef union { Elf32_Rela e32; Elf64_Rela e64; } elf_rela; typedef union { Elf32_Note e32; Elf64_Note e64; } elf_note; ELF_DEFINE_HANDLE(elf_ehdr) ELF_DEFINE_HANDLE(elf_shdr) ELF_DEFINE_HANDLE(elf_phdr) ELF_DEFINE_HANDLE(elf_sym) ELF_DEFINE_HANDLE(elf_note) struct elf_binary { /* elf binary */ const void *image_base; size_t size; char class; char data; ELF_HANDLE_DECL(elf_ehdr) ehdr; elf_ptrval sec_strtab; ELF_HANDLE_DECL(elf_shdr) sym_tab; uint64_t sym_strtab; /* loaded to */ /* * dest_base and dest_size are trusted and must be correct; * whenever dest_size is not 0, both of these must be valid * so long as the struct elf_binary is in use. 
*/ char *dest_base; size_t dest_size; uint64_t pstart; uint64_t pend; uint64_t reloc_offset; uint64_t bsd_symtab_pstart; uint64_t bsd_symtab_pend; /* * caller's other acceptable destination * * Again, these are trusted and must be valid (or 0) so long * as the struct elf_binary is in use. */ void *caller_xdest_base; uint64_t caller_xdest_size; #ifndef __XEN__ /* misc */ elf_log_callback *log_callback; void *log_caller_data; #endif bool verbose; const char *broken; }; /* ------------------------------------------------------------------------ */ /* accessing elf header fields */ #ifdef XEN_ELF_BIG_ENDIAN # define NATIVE_ELFDATA ELFDATA2MSB #else # define NATIVE_ELFDATA ELFDATA2LSB #endif #define elf_32bit(elf) (ELFCLASS32 == (elf)->class) #define elf_64bit(elf) (ELFCLASS64 == (elf)->class) #define elf_msb(elf) (ELFDATA2MSB == (elf)->data) #define elf_lsb(elf) (ELFDATA2LSB == (elf)->data) #define elf_swap(elf) (NATIVE_ELFDATA != (elf)->data) #define elf_uval_3264(elf, handle, elem) \ elf_access_unsigned((elf), (handle).ptrval, \ offsetof(typeof(*(handle).typeonly),elem), \ sizeof((handle).typeonly->elem)) #define elf_uval(elf, handle, elem) \ ((ELFCLASS64 == (elf)->class) \ ? elf_uval_3264(elf, handle, e64.elem) \ : elf_uval_3264(elf, handle, e32.elem)) /* * Reads an unsigned field in a header structure in the ELF. * str is a HANDLE, and elem is the field name in it. */ #define elf_size(elf, handle_or_handletype) ({ \ typeof(handle_or_handletype) elf_size__dummy; \ ((ELFCLASS64 == (elf)->class) \ ? sizeof(elf_size__dummy.typeonly->e64) \ : sizeof(elf_size__dummy.typeonly->e32)); \ }) /* * Returns the size of the substructure for the appropriate 32/64-bitness. * str should be a HANDLE. */ uint64_t elf_access_unsigned(struct elf_binary *elf, elf_ptrval ptr, uint64_t offset, size_t size); /* Reads a field at arbitrary offset and alignemnt */ uint64_t elf_round_up(struct elf_binary *elf, uint64_t addr); const char *elf_strval(struct elf_binary *elf, elf_ptrval start); /* may return NULL if the string is out of range etc. */ const char *elf_strfmt(struct elf_binary *elf, elf_ptrval start); /* like elf_strval but returns "(invalid)" instead of NULL */ void elf_memcpy_safe(struct elf_binary*, elf_ptrval dst, elf_ptrval src, size_t); void elf_memset_safe(struct elf_binary*, elf_ptrval dst, int c, size_t); /* * Versions of memcpy and memset which arrange never to write * outside permitted areas. */ bool elf_access_ok(struct elf_binary * elf, uint64_t ptrval, size_t size); #define elf_store_val(elf, type, ptr, val) \ ({ \ typeof(type) elf_store__val = (val); \ elf_ptrval elf_store__targ = ptr; \ if (elf_access_ok((elf), elf_store__targ, \ sizeof(elf_store__val))) { \ elf_memcpy_unchecked((void*)elf_store__targ, &elf_store__val, \ sizeof(elf_store__val)); \ } \ }) \ /* Stores a value at a particular PTRVAL. */ #define elf_store_field(elf, hdr, elm, val) \ (elf_store_val((elf), ELF__HANDLE_FIELD_TYPE(hdr, elm), \ ELF_HANDLE_PTRVAL(hdr) + ELF__HANDLE_FIELD_OFFSET(hdr, elm), \ (val))) /* Stores a 32/64-bit field. hdr is a HANDLE and elm is the field name. 
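 * (Illustrative: a call such as elf_store_field(elf, shdr, e64.sh_offset,
 * new_offset) patches one section-header field; note that, unlike
 * elf_uval(), the field name here keeps its e32./e64. prefix, and the
 * store is range-checked via elf_access_ok() before anything is written.)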
*/ /* ------------------------------------------------------------------------ */ /* xc_libelf_tools.c */ unsigned elf_shdr_count(struct elf_binary *elf); unsigned elf_phdr_count(struct elf_binary *elf); ELF_HANDLE_DECL(elf_shdr) elf_shdr_by_name(struct elf_binary *elf, const char *name); ELF_HANDLE_DECL(elf_shdr) elf_shdr_by_index(struct elf_binary *elf, unsigned index); ELF_HANDLE_DECL(elf_phdr) elf_phdr_by_index(struct elf_binary *elf, unsigned index); const char *elf_section_name(struct elf_binary *elf, ELF_HANDLE_DECL(elf_shdr) shdr); /* might return NULL if inputs are invalid */ elf_ptrval elf_section_start(struct elf_binary *elf, ELF_HANDLE_DECL(elf_shdr) shdr); elf_ptrval elf_section_end(struct elf_binary *elf, ELF_HANDLE_DECL(elf_shdr) shdr); elf_ptrval elf_segment_start(struct elf_binary *elf, ELF_HANDLE_DECL(elf_phdr) phdr); elf_ptrval elf_segment_end(struct elf_binary *elf, ELF_HANDLE_DECL(elf_phdr) phdr); ELF_HANDLE_DECL(elf_sym) elf_sym_by_name(struct elf_binary *elf, const char *symbol); ELF_HANDLE_DECL(elf_sym) elf_sym_by_index(struct elf_binary *elf, unsigned index); const char *elf_note_name(struct elf_binary *elf, ELF_HANDLE_DECL(elf_note) note); /* may return NULL */ elf_ptrval elf_note_desc(struct elf_binary *elf, ELF_HANDLE_DECL(elf_note) note); uint64_t elf_note_numeric(struct elf_binary *elf, ELF_HANDLE_DECL(elf_note) note); uint64_t elf_note_numeric_array(struct elf_binary *, ELF_HANDLE_DECL(elf_note), unsigned int unitsz, unsigned int idx); /* * If you use elf_note_next in a loop, you must put a nontrivial upper * bound on the returned value as part of your loop condition. In * some cases elf_note_next will substitute ELF_PTRVAL_MAX as return * value to indicate that the iteration isn't going well (for example, * the putative "next" value would be earlier in memory). In this * case the caller's loop must terminate. Checking against the * end of the notes segment with a strict inequality is sufficient. */ ELF_HANDLE_DECL(elf_note) elf_note_next(struct elf_binary *elf, ELF_HANDLE_DECL(elf_note) note); /* (Only) checks that the image has the right magic number. */ bool elf_is_elfbinary(const void *image_start, size_t image_size); bool elf_phdr_is_loadable(struct elf_binary *elf, ELF_HANDLE_DECL(elf_phdr) phdr); /* ------------------------------------------------------------------------ */ /* xc_libelf_loader.c */ elf_errorstatus elf_init(struct elf_binary *elf, const char *image, size_t size); /* * image and size must be correct. They will be recorded in * *elf, and must remain valid while the elf is in use. 
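 *
 * A rough sketch of how a loader drives this API (illustrative; error
 * handling elided, 'dest'/'dest_size' are a hypothetical pre-validated
 * destination buffer):
 *
 *     struct elf_binary elf;
 *
 *     elf_init(&elf, image, size);
 *     elf_parse_binary(&elf);
 *     elf.dest_base = dest;
 *     elf.dest_size = dest_size;
 *     elf_load_binary(&elf);
 *     if ( elf_check_broken(&elf) )
 *         ... bail out ...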
*/ #ifdef __XEN__ void elf_set_verbose(struct elf_binary *elf); #else void elf_set_log(struct elf_binary *elf, elf_log_callback*, void *log_caller_pointer, bool verbose); #endif void elf_parse_binary(struct elf_binary *elf); elf_errorstatus elf_load_binary(struct elf_binary *elf); elf_ptrval elf_get_ptr(struct elf_binary *elf, unsigned long addr); uint64_t elf_lookup_addr(struct elf_binary *elf, const char *symbol); void elf_parse_bsdsyms(struct elf_binary *elf, uint64_t pstart); /* private */ void elf_mark_broken(struct elf_binary *elf, const char *msg); const char *elf_check_broken(const struct elf_binary *elf); /* NULL means OK */ /* ------------------------------------------------------------------------ */ /* xc_libelf_relocate.c */ elf_errorstatus elf_reloc(struct elf_binary *elf); /* ------------------------------------------------------------------------ */ /* xc_libelf_dominfo.c */ #define UNSET_ADDR ((uint64_t)-1) enum xen_elfnote_type { XEN_ENT_NONE = 0, XEN_ENT_LONG = 1, XEN_ENT_STR = 2 }; struct xen_elfnote { enum xen_elfnote_type type; const char *name; union { const char *str; uint64_t num; } data; }; struct elf_dom_parms { /* raw */ elf_ptrval guest_info; elf_ptrval elf_note_start; elf_ptrval elf_note_end; struct xen_elfnote elf_notes[XEN_ELFNOTE_MAX + 1]; /* parsed */ char guest_os[16]; char guest_ver[16]; char xen_ver[16]; char loader[16]; int pae; /* some kind of enum apparently */ bool bsd_symtab; uint64_t virt_base; uint64_t virt_entry; uint64_t virt_hypercall; uint64_t virt_hv_start_low; uint64_t p2m_base; uint64_t elf_paddr_offset; uint32_t f_supported[XENFEAT_NR_SUBMAPS]; uint32_t f_required[XENFEAT_NR_SUBMAPS]; /* calculated */ uint64_t virt_offset; uint64_t virt_kstart; uint64_t virt_kend; }; static inline void elf_xen_feature_set(int nr, uint32_t * addr) { addr[nr >> 5] |= 1 << (nr & 31); } static inline int elf_xen_feature_get(int nr, uint32_t * addr) { return !!(addr[nr >> 5] & (1 << (nr & 31))); } int elf_xen_parse_features(const char *features, uint32_t *supported, uint32_t *required); int elf_xen_parse_note(struct elf_binary *elf, struct elf_dom_parms *parms, ELF_HANDLE_DECL(elf_note) note); int elf_xen_parse_guest_info(struct elf_binary *elf, struct elf_dom_parms *parms); int elf_xen_parse(struct elf_binary *elf, struct elf_dom_parms *parms); static inline void *elf_memcpy_unchecked(void *dest, const void *src, size_t n) { return memcpy(dest, src, n); } static inline void *elf_memmove_unchecked(void *dest, const void *src, size_t n) { return memmove(dest, src, n); } static inline void *elf_memset_unchecked(void *s, int c, size_t n) { return memset(s, c, n); } /* * Unsafe versions of memcpy, memmove memset which take actual C * pointers. These are just like the real functions. * We provide these so that in libelf-private.h we can #define * memcpy, memset and memmove to undefined MISTAKE things. */ /* Advances past amount bytes of the current destination area. 
*/ static inline void ELF_ADVANCE_DEST(struct elf_binary *elf, uint64_t amount) { if ( elf->dest_base == NULL ) { elf_mark_broken(elf, "advancing in null image"); } else if ( elf->dest_size >= amount ) { elf->dest_base += amount; elf->dest_size -= amount; } else { elf->dest_size = 0; elf_mark_broken(elf, "advancing past end (image very short?)"); } } #endif /* __XEN_LIBELF_H__ */ xen-4.4.0/xen/include/xen/elfcore.h0000664000175000017500000000463112307313555015246 0ustar smbsmb/****************************************************************************** * elfcore.h * * Based heavily on include/linux/elfcore.h from Linux 2.6.16 * Naming scheeme based on include/xen/elf.h (not include/linux/elfcore.h) * */ #ifndef __ELFCOREC_H__ #define __ELFCOREC_H__ #include #include #include #include #define NT_PRSTATUS 1 typedef struct { int signo; /* signal number */ int code; /* extra code */ int errno; /* errno */ } ELF_Signifo; /* These seem to be the same length on all architectures on Linux */ typedef int ELF_Pid; typedef struct { long tv_sec; long tv_usec; } ELF_Timeval; /* * Definitions to generate Intel SVR4-like core files. * These mostly have the same names as the SVR4 types with "elf_" * tacked on the front to prevent clashes with linux definitions, * and the typedef forms have been avoided. This is mostly like * the SVR4 structure, but more Linuxy, with things that Linux does * not support and which gdb doesn't really use excluded. */ typedef struct { ELF_Signifo pr_info; /* Info associated with signal */ short pr_cursig; /* Current signal */ unsigned long pr_sigpend; /* Set of pending signals */ unsigned long pr_sighold; /* Set of held signals */ ELF_Pid pr_pid; ELF_Pid pr_ppid; ELF_Pid pr_pgrp; ELF_Pid pr_sid; ELF_Timeval pr_utime; /* User time */ ELF_Timeval pr_stime; /* System time */ ELF_Timeval pr_cutime; /* Cumulative user time */ ELF_Timeval pr_cstime; /* Cumulative system time */ ELF_Gregset pr_reg; /* GP registers - from asm header file */ int pr_fpvalid; /* True if math co-processor being used. */ } ELF_Prstatus; typedef struct { unsigned long xen_major_version; unsigned long xen_minor_version; unsigned long xen_extra_version; unsigned long xen_changeset; unsigned long xen_compiler; unsigned long xen_compile_date; unsigned long xen_compile_time; unsigned long tainted; #if defined(CONFIG_X86) unsigned long xen_phys_start; unsigned long dom0_pfn_to_mfn_frame_list_list; #endif } crash_xen_info_t; #endif /* __ELFCOREC_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/xen/radix-tree.h0000664000175000017500000001623312307313555015674 0ustar smbsmb/* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2006 Nick Piggin * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ #ifndef _XEN_RADIX_TREE_H #define _XEN_RADIX_TREE_H #include #include #include /* * An indirect pointer (root->rnode pointing to a radix_tree_node, rather * than a data item) is signalled by the low bit set in the root->rnode * pointer. * * In this case root->height is > 0, but the indirect pointer tests are * needed for RCU lookups (because root->height is unreliable). The only * time callers need worry about this is when doing a lookup_slot under * RCU. * * Indirect pointer in fact is also used to tag the last pointer of a node * when it is shrunk, before we rcu free the node. See shrink code for * details. */ #define RADIX_TREE_INDIRECT_PTR 1 static inline int radix_tree_is_indirect_ptr(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); } /* *** Radix tree structure definitions. *** These are public to allow users to allocate instances of them. *** However all fields are absolutely private. */ #define RADIX_TREE_MAP_SHIFT 6 #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) struct radix_tree_node { unsigned int height; /* Height from the bottom */ unsigned int count; void __rcu *slots[RADIX_TREE_MAP_SIZE]; }; typedef struct radix_tree_node *radix_tree_alloc_fn_t(void *); typedef void radix_tree_free_fn_t(struct radix_tree_node *, void *); struct radix_tree_root { unsigned int height; struct radix_tree_node __rcu *rnode; /* Allow to specify custom node alloc/dealloc routines. */ radix_tree_alloc_fn_t *node_alloc; radix_tree_free_fn_t *node_free; void *node_alloc_free_arg; }; /* *** radix-tree API starts here ** */ void radix_tree_init(struct radix_tree_root *root); void radix_tree_set_alloc_callbacks( struct radix_tree_root *root, radix_tree_alloc_fn_t *node_alloc, radix_tree_free_fn_t *node_free, void *node_alloc_free_arg); void radix_tree_destroy( struct radix_tree_root *root, void (*slot_free)(void *)); /** * Radix-tree synchronization * * The radix-tree API requires that users provide all synchronisation (with * specific exceptions, noted below). * * Synchronization of access to the data items being stored in the tree, and * management of their lifetimes must be completely managed by API users. * * For API usage, in general, * - any function _modifying_ the tree (inserting or deleting items) must * exclude other modifications, and exclude any functions reading the tree. * - any function _reading_ the tree (looking up items) must exclude * modifications to the tree, but may occur concurrently with other readers. * * The notable exceptions to this rule are the following functions: * radix_tree_lookup * radix_tree_lookup_slot * radix_tree_gang_lookup * radix_tree_gang_lookup_slot * * The first 7 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be * running concurrently. * * It is still required that the caller manage the synchronization and lifetimes * of the items. So if RCU lock-free lookups are used, typically this would mean * that the items have their own locks, or are amenable to lock-free access; and * that the items are freed by RCU (or only freed after having been deleted from * the radix tree *and* a synchronize_rcu() grace period). 
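 *
 * A minimal write-side sketch (illustrative; 'idx' and 'item' are
 * hypothetical, and the caller is assumed to hold whatever lock
 * serialises updates, as described above):
 *
 *     struct radix_tree_root tree;
 *
 *     radix_tree_init(&tree);
 *     if ( radix_tree_insert(&tree, idx, item) != 0 )
 *         ... handle the failure ...
 *     item = radix_tree_lookup(&tree, idx);
 *     radix_tree_delete(&tree, idx);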
* * (Note, rcu_assign_pointer and rcu_dereference are not needed to control * access to data items when inserting into or looking up from the radix tree) */ /** * radix_tree_deref_slot - dereference a slot * @pslot: pointer to slot, returned by radix_tree_lookup_slot * Returns: item that was stored in that slot with any direct pointer flag * removed. * * For use with radix_tree_lookup_slot(). Caller must hold tree at least read * locked across slot lookup and dereference. Not required if write lock is * held (ie. items cannot be concurrently inserted). * * radix_tree_deref_retry must be used to confirm validity of the pointer if * only the read lock is held. */ static inline void *radix_tree_deref_slot(void **pslot) { return rcu_dereference(*pslot); } /** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot * Returns: 0 if retry is not required, otherwise retry is required * * radix_tree_deref_retry must be used with radix_tree_deref_slot. */ static inline int radix_tree_deref_retry(void *arg) { return unlikely((unsigned long)arg & RADIX_TREE_INDIRECT_PTR); } /** * radix_tree_replace_slot - replace item in a slot * @pslot: pointer to slot, returned by radix_tree_lookup_slot * @item: new item to store in the slot. * * For use with radix_tree_lookup_slot(). Caller must hold tree write locked * across slot lookup and replacement. */ static inline void radix_tree_replace_slot(void **pslot, void *item) { BUG_ON(radix_tree_is_indirect_ptr(item)); rcu_assign_pointer(*pslot, item); } /** * radix_tree_{int_to_ptr,ptr_to_int}: * * Allow storage of signed integers in radix-tree slots. We use an encoding * in which the bottom two bits of the slot pointer are reserved (bit 0 for * the indirect-pointer tag; bit 1 always set to prevent an in-use * integer-valued slot from being NULL and thus mistakenly being reaped). */ static inline void *radix_tree_int_to_ptr(int val) { long _ptr = ((long)val << 2) | 0x2l; ASSERT((_ptr >> 2) == val); return (void *)_ptr; } static inline int radix_tree_ptr_to_int(void *ptr) { ASSERT(((long)ptr & 0x3) == 0x2); return (int)((long)ptr >> 2); } int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long first_index, unsigned int max_items); unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); unsigned long radix_tree_prev_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan); #endif /* _XEN_RADIX_TREE_H */ xen-4.4.0/xen/include/xen/bitmap.h0000664000175000017500000002050412307313555015100 0ustar smbsmb#ifndef __XEN_BITMAP_H #define __XEN_BITMAP_H #ifndef __ASSEMBLY__ #include #include #include /* * bitmaps provide bit arrays that consume one or more unsigned * longs. The bitmap interface and available operations are listed * here, in bitmap.h * * Function implementations generic to all architectures are in * lib/bitmap.c. Functions implementations that are architecture * specific are in various include/asm-/bitops.h headers * and other arch/ specific files. * * See lib/bitmap.c for more details. 
*/ /* * The available bitmap operations and their rough meaning in the * case that the bitmap is a single unsigned long are thus: * * bitmap_zero(dst, nbits) *dst = 0UL * bitmap_fill(dst, nbits) *dst = ~0UL * bitmap_copy(dst, src, nbits) *dst = *src * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? * bitmap_empty(src, nbits) Are all bits zero in *src? * bitmap_full(src, nbits) Are all bits set in *src? * bitmap_weight(src, nbits) Hamming Weight: number set bits * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf * bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf */ /* * Also the following operations in asm/bitops.h apply to bitmaps. * * set_bit(bit, addr) *addr |= bit * clear_bit(bit, addr) *addr &= ~bit * change_bit(bit, addr) *addr ^= bit * test_bit(bit, addr) Is bit set in *addr? * test_and_set_bit(bit, addr) Set bit and return old value * test_and_clear_bit(bit, addr) Clear bit and return old value * test_and_change_bit(bit, addr) Change bit and return old value * find_first_zero_bit(addr, nbits) Position first zero bit in *addr * find_first_bit(addr, nbits) Position first set bit in *addr * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit */ /* * The DECLARE_BITMAP(name,bits) macro, in xen/types.h, can be used * to declare an array named 'name' of just enough unsigned longs to * contain all bit positions from 0 to 'bits' - 1. 
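 *
 * For example (illustrative):
 *
 *     DECLARE_BITMAP(pending, 128);
 *
 *     bitmap_zero(pending, 128);
 *     set_bit(5, pending);
 *
 * after which bitmap_weight(pending, 128) returns 1 and
 * test_bit(5, pending) is true.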
*/ /* * lib/bitmap.c provides these functions: */ extern int __bitmap_empty(const unsigned long *bitmap, int bits); extern int __bitmap_full(const unsigned long *bitmap, int bits); extern int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, int bits); extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, int shift, int bits); extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, int shift, int bits); extern void __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern void __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern int __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern int __bitmap_weight(const unsigned long *bitmap, int bits); extern int bitmap_scnprintf(char *buf, unsigned int len, const unsigned long *src, int nbits); extern int bitmap_scnlistprintf(char *buf, unsigned int len, const unsigned long *src, int nbits); extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order); #define BITMAP_LAST_WORD_MASK(nbits) \ ( \ ((nbits) % BITS_PER_LONG) ? \ (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \ ) #define bitmap_bytes(nbits) (BITS_TO_LONGS(nbits) * sizeof(unsigned long)) #define bitmap_switch(nbits, zero_ret, small, large) \ switch (-!__builtin_constant_p(nbits) | (nbits)) { \ case 0: return zero_ret; \ case 1 ... 
BITS_PER_LONG: \ small; break; \ default: \ large; break; \ } static inline void bitmap_zero(unsigned long *dst, int nbits) { bitmap_switch(nbits,, *dst = 0UL, memset(dst, 0, bitmap_bytes(nbits))); } static inline void bitmap_fill(unsigned long *dst, int nbits) { size_t nlongs = BITS_TO_LONGS(nbits); switch (nlongs) { default: memset(dst, -1, (nlongs - 1) * sizeof(unsigned long)); /* fall through */ case 1: dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); break; } } static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, int nbits) { bitmap_switch(nbits,, *dst = *src, memcpy(dst, src, bitmap_bytes(nbits))); } static inline void bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, int nbits) { bitmap_switch(nbits,, *dst = *src1 & *src2, __bitmap_and(dst, src1, src2, nbits)); } static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, int nbits) { bitmap_switch(nbits,, *dst = *src1 | *src2, __bitmap_or(dst, src1, src2, nbits)); } static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, int nbits) { bitmap_switch(nbits,, *dst = *src1 ^ *src2, __bitmap_xor(dst, src1, src2, nbits)); } static inline void bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, int nbits) { bitmap_switch(nbits,, *dst = *src1 & ~*src2, __bitmap_andnot(dst, src1, src2, nbits)); } static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, int nbits) { bitmap_switch(nbits,, *dst = ~*src & BITMAP_LAST_WORD_MASK(nbits), __bitmap_complement(dst, src, nbits)); } static inline int bitmap_equal(const unsigned long *src1, const unsigned long *src2, int nbits) { bitmap_switch(nbits, -1, return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)), return __bitmap_equal(src1, src2, nbits)); } static inline int bitmap_intersects(const unsigned long *src1, const unsigned long *src2, int nbits) { bitmap_switch(nbits, -1, return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0, return __bitmap_intersects(src1, src2, nbits)); } static inline int bitmap_subset(const unsigned long *src1, const unsigned long *src2, int nbits) { bitmap_switch(nbits, -1, return !((*src1 & ~*src2) & BITMAP_LAST_WORD_MASK(nbits)), return __bitmap_subset(src1, src2, nbits)); } static inline int bitmap_empty(const unsigned long *src, int nbits) { bitmap_switch(nbits, -1, return !(*src & BITMAP_LAST_WORD_MASK(nbits)), return __bitmap_empty(src, nbits)); } static inline int bitmap_full(const unsigned long *src, int nbits) { bitmap_switch(nbits, -1, return !(~*src & BITMAP_LAST_WORD_MASK(nbits)), return __bitmap_full(src, nbits)); } static inline int bitmap_weight(const unsigned long *src, int nbits) { return __bitmap_weight(src, nbits); } static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, int n, int nbits) { bitmap_switch(nbits,, *dst = *src >> n, __bitmap_shift_right(dst, src, n, nbits)); } static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, int n, int nbits) { bitmap_switch(nbits,, *dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits), __bitmap_shift_left(dst, src, n, nbits)); } #undef bitmap_switch #undef bitmap_bytes void bitmap_long_to_byte(uint8_t *bp, const unsigned long *lp, int nbits); void bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits); #endif /* __ASSEMBLY__ */ #endif /* __XEN_BITMAP_H */ xen-4.4.0/xen/include/xen/wait.h0000664000175000017500000000361712307313555014576 
0ustar smbsmb/****************************************************************************** * wait.h * * Sleep in hypervisor context for some event to occur. * * Copyright (c) 2010, Keir Fraser */ #ifndef __XEN_WAIT_H__ #define __XEN_WAIT_H__ #include #include #include struct waitqueue_head { spinlock_t lock; struct list_head list; }; /* Statically define and initialise a waitqueue. */ #define DEFINE_WAITQUEUE_HEAD(name) \ struct waitqueue_head name = { \ .lock = SPIN_LOCK_UNLOCKED, \ .list = LIST_HEAD_INIT((name).list) \ } /* Dynamically initialise/destroy a waitqueue. */ void init_waitqueue_head(struct waitqueue_head *wq); void destroy_waitqueue_head(struct waitqueue_head *wq); /* Wake VCPU(s) waiting on specified waitqueue. */ void wake_up_nr(struct waitqueue_head *wq, unsigned int nr); void wake_up_one(struct waitqueue_head *wq); void wake_up_all(struct waitqueue_head *wq); /* Wait on specified waitqueue until @condition is true. */ #define wait_event(wq, condition) \ do { \ if ( condition ) \ break; \ for ( ; ; ) { \ prepare_to_wait(&wq); \ if ( condition ) \ break; \ wait(); \ } \ finish_wait(&wq); \ } while (0) /* Private functions. */ int init_waitqueue_vcpu(struct vcpu *v); void destroy_waitqueue_vcpu(struct vcpu *v); void prepare_to_wait(struct waitqueue_head *wq); void wait(void); void finish_wait(struct waitqueue_head *wq); void check_wakeup_from_wait(void); #endif /* __XEN_WAIT_H__ */ xen-4.4.0/xen/include/xen/numa.h0000664000175000017500000000061612307313555014566 0ustar smbsmb#ifndef _XEN_NUMA_H #define _XEN_NUMA_H #include #ifndef NODES_SHIFT #define NODES_SHIFT 0 #endif #define NUMA_NO_NODE 0xFF #define MAX_NUMNODES (1 << NODES_SHIFT) #define vcpu_to_node(v) (cpu_to_node((v)->processor)) #define domain_to_node(d) \ (((d)->vcpu != NULL && (d)->vcpu[0] != NULL) \ ? vcpu_to_node((d)->vcpu[0]) : NUMA_NO_NODE) #endif /* _XEN_NUMA_H */ xen-4.4.0/xen/include/xen/guest_access.h0000664000175000017500000000155112307313555016275 0ustar smbsmb/****************************************************************************** * guest_access.h * * Copyright (x) 2006, K A Fraser */ #ifndef __XEN_GUEST_ACCESS_H__ #define __XEN_GUEST_ACCESS_H__ #include #define copy_to_guest(hnd, ptr, nr) \ copy_to_guest_offset(hnd, 0, ptr, nr) #define copy_from_guest(ptr, hnd, nr) \ copy_from_guest_offset(ptr, hnd, 0, nr) #define clear_guest(hnd, nr) \ clear_guest_offset(hnd, 0, nr) #define __copy_to_guest(hnd, ptr, nr) \ __copy_to_guest_offset(hnd, 0, ptr, nr) #define __copy_from_guest(ptr, hnd, nr) \ __copy_from_guest_offset(ptr, hnd, 0, nr) #define __clear_guest(hnd, nr) \ __clear_guest_offset(hnd, 0, nr) #endif /* __XEN_GUEST_ACCESS_H__ */ xen-4.4.0/xen/include/xen/keyhandler.h0000664000175000017500000000344212307313555015754 0ustar smbsmb/****************************************************************************** * keyhandler.h * * We keep an array of 'handlers' for each key code between 0 and 255; * this is intended to allow very simple debugging routines (toggle * debug flag, dump registers, reboot, etc) to be hooked in in a slightly * nicer fashion than just editing the serial/keyboard drivers. */ #ifndef __XEN_KEYHANDLER_H__ #define __XEN_KEYHANDLER_H__ typedef void keyhandler_fn_t( unsigned char key); typedef void irq_keyhandler_fn_t( unsigned char key, struct cpu_user_regs *regs); struct keyhandler { /* * If TRUE then u.irq_fn is called in hardirq context with interrupts * disabled. The @regs callback parameter points at the interrupted * register context. 
* If FALSE then u.fn is called in softirq context with no locks held and * interrupts enabled. */ bool_t irq_callback; /* * If TRUE then the keyhandler will be included in the "dump everything" * keyhandler, so must not have any side-effects. */ bool_t diagnostic; union { keyhandler_fn_t *fn; irq_keyhandler_fn_t *irq_fn; } u; /* The string is not copied by register_keyhandler(), so must persist. */ char *desc; }; /* Initialize keytable with default handlers */ extern void initialize_keytable(void); /* * Register a callback handler for key @key. The keyhandler structure is not * copied, so must persist. */ extern void register_keyhandler(unsigned char key, struct keyhandler *handler); /* Inject a keypress into the key-handling subsystem. */ extern void handle_keypress(unsigned char key, struct cpu_user_regs *regs); /* Scratch space is available for use of any keyhandler. */ extern char keyhandler_scratch[1024]; #endif /* __XEN_KEYHANDLER_H__ */ xen-4.4.0/xen/include/xen/watchdog.h0000664000175000017500000000123412307313555015423 0ustar smbsmb/****************************************************************************** * watchdog.h * * Common watchdog code */ #ifndef __XEN_WATCHDOG_H__ #define __XEN_WATCHDOG_H__ #include #ifdef CONFIG_WATCHDOG /* Try to set up a watchdog. */ int watchdog_setup(void); /* Enable the watchdog. */ void watchdog_enable(void); /* Disable the watchdog. */ void watchdog_disable(void); /* Is the watchdog currently enabled. */ bool_t watchdog_enabled(void); #else #define watchdog_setup() ((void)0) #define watchdog_enable() ((void)0) #define watchdog_disable() ((void)0) #define watchdog_enabled() ((void)0) #endif #endif /* __XEN_WATCHDOG_H__ */ xen-4.4.0/xen/include/xen/percpu.h0000664000175000017500000000125412307313555015123 0ustar smbsmb#ifndef __XEN_PERCPU_H__ #define __XEN_PERCPU_H__ #include /* * Separate out the type, so (int[3], foo) works. * * The _##name concatenation is being used here to prevent 'name' from getting * macro expanded, while still allowing a per-architecture symbol name prefix. */ #define DEFINE_PER_CPU(type, name) __DEFINE_PER_CPU(type, _##name, ) #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ __DEFINE_PER_CPU(type, _##name, .read_mostly) /* Preferred on Xen. Also see arch-defined per_cpu(). */ #define this_cpu(var) __get_cpu_var(var) /* Linux compatibility. */ #define get_cpu_var(var) this_cpu(var) #define put_cpu_var(var) #endif /* __XEN_PERCPU_H__ */ xen-4.4.0/xen/include/xen/serial.h0000664000175000017500000001517312307313555015111 0ustar smbsmb/****************************************************************************** * serial.h * * Framework for serial device drivers. * * Copyright (c) 2003-2008, K A Fraser */ #ifndef __XEN_SERIAL_H__ #define __XEN_SERIAL_H__ #include #include struct cpu_user_regs; /* Register a character-receive hook on the specified COM port. */ typedef void (*serial_rx_fn)(char, struct cpu_user_regs *); void serial_set_rx_handler(int handle, serial_rx_fn fn); /* Number of characters we buffer for a polling receiver. */ #define serial_rxbufsz 32 /* Number of characters we buffer for an interrupt-driven transmitter. 
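 *
 * Illustrative sketch, not taken from this header: transmit data queued via
 * serial_putc()/serial_puts() is buffered here, while the receive side
 * registers a hook with serial_set_rx_handler() declared above, e.g.
 *
 *     static void my_rx(char c, struct cpu_user_regs *regs)
 *     {
 *         ...consume the received character...
 *     }
 *     serial_set_rx_handler(handle, my_rx);
 *
 * where 'handle' and 'my_rx' are hypothetical caller-side names and the
 * handle comes from serial_parse_handle() further below.
 *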
*/ extern unsigned int serial_txbufsz; struct uart_driver; enum serial_port_state { serial_unused, serial_parsed, serial_initialized }; struct vuart_info { paddr_t base_addr; /* Base address of the UART */ unsigned long size; /* Size of the memory region */ unsigned long data_off; /* Data register offset */ unsigned long status_off; /* Status register offset */ unsigned long status; /* Ready status value */ }; struct serial_port { /* Uart-driver parameters. */ struct uart_driver *driver; void *uart; enum serial_port_state state; /* Transmit data buffer (interrupt-driven uart). */ char *txbuf; unsigned int txbufp, txbufc; bool_t tx_quench; int tx_log_everything; /* Force synchronous transmit. */ int sync; /* Receiver callback functions (asynchronous receivers). */ serial_rx_fn rx_lo, rx_hi, rx; /* Receive data buffer (polling receivers). */ char rxbuf[serial_rxbufsz]; unsigned int rxbufp, rxbufc; /* Serial I/O is concurrency-safe. */ spinlock_t rx_lock, tx_lock; }; struct uart_driver { /* Driver initialisation (pre- and post-IRQ subsystem setup). */ void (*init_preirq)(struct serial_port *); void (*init_postirq)(struct serial_port *); /* Hook to clean up after Xen bootstrap (before domain 0 runs). */ void (*endboot)(struct serial_port *); /* Driver suspend/resume. */ void (*suspend)(struct serial_port *); void (*resume)(struct serial_port *); /* Return number of characters the port can hold for transmit, * or -EIO if port is inaccesible */ int (*tx_ready)(struct serial_port *); /* Put a character onto the serial line. */ void (*putc)(struct serial_port *, char); /* Flush accumulated characters. */ void (*flush)(struct serial_port *); /* Get a character from the serial line: returns 0 if none available. */ int (*getc)(struct serial_port *, char *); /* Get IRQ number for this port's serial line: returns -1 if none. */ int (*irq)(struct serial_port *); /* Get IRQ device node for this port's serial line: returns NULL if none. */ const struct dt_irq *(*dt_irq_get)(struct serial_port *); /* Get serial information */ const struct vuart_info *(*vuart_info)(struct serial_port *); }; /* 'Serial handles' are composed from the following fields. */ #define SERHND_IDX (3<<0) /* COM1, COM2, DBGP, DTUART? */ # define SERHND_COM1 (0<<0) # define SERHND_COM2 (1<<0) # define SERHND_DBGP (2<<0) # define SERHND_DTUART (0<<0) /* Steal SERHND_COM1 value */ #define SERHND_HI (1<<2) /* Mux/demux each transferred char by MSB. */ #define SERHND_LO (1<<3) /* Ditto, except that the MSB is cleared. */ #define SERHND_COOKED (1<<4) /* Newline/carriage-return translation? */ /* Two-stage initialisation (before/after IRQ-subsystem initialisation). */ void serial_init_preirq(void); void serial_init_postirq(void); /* Clean-up hook before domain 0 runs. */ void serial_endboot(void); /* Takes a config string and creates a numeric handle on the COM port. */ int serial_parse_handle(char *conf); /* Transmit a single character via the specified COM port. */ void serial_putc(int handle, char c); /* Transmit a NULL-terminated string via the specified COM port. */ void serial_puts(int handle, const char *s); /* * An alternative to registering a character-receive hook. This function * will not return until a character is available. It can safely be * called with interrupts disabled. */ char serial_getc(int handle); /* Forcibly prevent serial lockup when the system is in a bad way. */ /* (NB. This also forces an implicit serial_start_sync()). 
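 *
 * Illustrative sketch, not taken from this header: code that must not lose
 * output, e.g. a crash path, can bracket its writes with the sync helpers
 * declared just below:
 *
 *     serial_start_sync(handle);
 *     serial_puts(handle, "critical state follows\n");
 *     serial_end_sync(handle);
 *
 * 'handle' is a value previously obtained from serial_parse_handle().
 *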
*/ void serial_force_unlock(int handle); /* Start/end a synchronous region (temporarily disable interrupt-driven tx). */ void serial_start_sync(int handle); void serial_end_sync(int handle); /* Start/end a region where we will wait rather than drop characters. */ void serial_start_log_everything(int handle); void serial_end_log_everything(int handle); /* Return irq number for specified serial port (identified by index). */ int serial_irq(int idx); /* Return irq device node for specified serial port (identified by index). */ const struct dt_irq *serial_dt_irq(int idx); /* Retrieve basic UART information to emulate it (base address, size...) */ const struct vuart_info* serial_vuart_info(int idx); /* Serial suspend/resume. */ void serial_suspend(void); void serial_resume(void); /* * Initialisation and helper functions for uart drivers. */ /* Register a uart on serial port @idx (e.g., @idx==0 is COM1). */ void serial_register_uart(int idx, struct uart_driver *driver, void *uart); /* Place the serial port into asynchronous transmit mode. */ void serial_async_transmit(struct serial_port *port); /* Process work in interrupt context. */ void serial_rx_interrupt(struct serial_port *port, struct cpu_user_regs *regs); void serial_tx_interrupt(struct serial_port *port, struct cpu_user_regs *regs); /* * Initialisers for individual uart drivers. */ /* NB. Any default value can be 0 if it is unknown and must be specified. */ struct ns16550_defaults { int baud; /* default baud rate; BAUD_AUTO == pre-configured */ int data_bits; /* default data bits (5, 6, 7 or 8) */ int parity; /* default parity (n, o, e, m or s) */ int stop_bits; /* default stop bits (1 or 2) */ int irq; /* default irq */ unsigned long io_base; /* default io_base address */ }; void ns16550_init(int index, struct ns16550_defaults *defaults); void ehci_dbgp_init(void); void __init dt_uart_init(void); struct physdev_dbgp_op; int dbgp_op(const struct physdev_dbgp_op *); /* Baud rate was pre-configured before invoking the UART driver. */ #define BAUD_AUTO (-1) #endif /* __XEN_SERIAL_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/xen/nodemask.h0000664000175000017500000002627612307313555015441 0ustar smbsmb#ifndef __LINUX_NODEMASK_H #define __LINUX_NODEMASK_H /* * Nodemasks provide a bitmap suitable for representing the * set of Node's in a system, one bit position per Node number. * * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * * For details of nodemask_scnprintf(), nodelist_scnpintf() and * nodemask_parse(), see bitmap_scnprintf() and bitmap_parse() * in lib/bitmap.c. * * The available nodemask operations are: * * void node_set(node, mask) turn on bit 'node' in mask * void node_clear(node, mask) turn off bit 'node' in mask * void nodes_setall(mask) set all bits * void nodes_clear(mask) clear all bits * int node_isset(node, mask) true iff bit 'node' set in mask * int node_test_and_set(node, mask) test and set bit 'node' in mask * * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 * void nodes_complement(dst, src) dst = ~src * * int nodes_equal(mask1, mask2) Does mask1 == mask2? * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? 
* int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? * int nodes_empty(mask) Is mask empty (no bits sets)? * int nodes_full(mask) Is mask full (all bits sets)? * int nodes_weight(mask) Hamming weight - number of set bits * * void nodes_shift_right(dst, src, n) Shift right * void nodes_shift_left(dst, src, n) Shift left * * int first_node(mask) Number lowest set bit, or MAX_NUMNODES * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * int last_node(mask) Number highest set bit, or MAX_NUMNODES * int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES. * int cycle_node(node, mask) Next node cycling from 'node', or * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set * NODE_MASK_NONE Initializer - no bits set * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing * int nodelist_scnprintf(buf, len, mask) Format nodemask as a list for printing * int nodemask_parse(ubuf, ulen, mask) Parse ascii string as nodemask * * for_each_node_mask(node, mask) for-loop node over mask * * int num_online_nodes() Number of online Nodes * * int node_online(node) Is some node online? * * int any_online_node(mask) First online node in mask * * node_set_online(node) set bit 'node' in node_online_map * node_set_offline(node) clear bit 'node' in node_online_map * * for_each_online_node(node) for-loop node over node_online_map * * Subtlety: * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway) * to generate slightly worse code. So use a simple one-line #define * for node_isset(), instead of wrapping an inline inside a macro, the * way we do the other calls. */ #include #include #include typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t _unused_nodemask_arg_; #define node_set(node, dst) __node_set((node), &(dst)) static inline void __node_set(int node, volatile nodemask_t *dstp) { set_bit(node, dstp->bits); } #define node_clear(node, dst) __node_clear((node), &(dst)) static inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) static inline void __nodes_setall(nodemask_t *dstp, int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) static inline void __nodes_clear(nodemask_t *dstp, int nbits) { bitmap_zero(dstp->bits, nbits); } /* No static inline type checking - see Subtlety (1) above. 
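 *
 * Illustrative sketch, not taken from this header, of the accessors defined
 * in this file:
 *
 *     nodemask_t mask = NODE_MASK_NONE;
 *     node_set(1, mask);
 *     if ( node_isset(1, mask) && !nodes_empty(mask) )
 *         ...use the mask...
 *
 * NODE_MASK_NONE, node_set(), node_isset() and nodes_empty() all appear
 * further down in this header.
 *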
*/ #define node_isset(node, nodemask) test_bit((node), (nodemask).bits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) static inline int __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) static inline int __nodes_empty(const nodemask_t *srcp, int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) static inline int __nodes_full(const nodemask_t *srcp, int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) static inline int __nodes_weight(const nodemask_t *srcp, int nbits) { return bitmap_weight(srcp->bits, nbits); } #define nodes_shift_right(dst, src, n) \ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) static inline void __nodes_shift_right(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); } #define nodes_shift_left(dst, src, n) \ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) static inline void __nodes_shift_left(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } /* FIXME: better would be to fix all architectures to never return > MAX_NUMNODES, then the silly min_ts could be dropped. 
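 *
 * Illustrative note, not taken from this header: because of the clamping
 * below, first_node()/next_node() return MAX_NUMNODES once a mask is
 * exhausted, so an empty mask can be detected with
 *
 *     if ( first_node(mask) == MAX_NUMNODES )
 *         ...mask has no bits set...
 *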
*/ #define first_node(src) __first_node(&(src), MAX_NUMNODES) static inline int __first_node(const nodemask_t *srcp, int nbits) { return min_t(int, nbits, find_first_bit(srcp->bits, nbits)); } #define next_node(n, src) __next_node((n), &(src), MAX_NUMNODES) static inline int __next_node(int n, const nodemask_t *srcp, int nbits) { return min_t(int, nbits, find_next_bit(srcp->bits, nbits, n+1)); } #define last_node(src) __last_node(&(src), MAX_NUMNODES) static inline int __last_node(const nodemask_t *srcp, int nbits) { int node, pnode = nbits; for (node = __first_node(srcp, nbits); node < nbits; node = __next_node(node, srcp, nbits)) pnode = node; return pnode; } #define nodemask_of_node(node) \ ({ \ typeof(_unused_nodemask_arg_) m; \ if (sizeof(m) == sizeof(unsigned long)) { \ m.bits[0] = 1UL<<(node); \ } else { \ nodes_clear(m); \ node_set((node), m); \ } \ m; \ }) #define first_unset_node(mask) __first_unset_node(&(mask)) static inline int __first_unset_node(const nodemask_t *maskp) { return min_t(int,MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } #define cycle_node(n, src) __cycle_node((n), &(src), MAX_NUMNODES) static inline int __cycle_node(int n, const nodemask_t *maskp, int nbits) { int nxt = __next_node(n, maskp, nbits); if (nxt == nbits) nxt = __first_node(maskp, nbits); return nxt; } #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) #if MAX_NUMNODES <= BITS_PER_LONG #define NODE_MASK_ALL \ ((nodemask_t) { { \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #else #define NODE_MASK_ALL \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #endif #define NODE_MASK_NONE \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \ } }) #define nodes_addr(src) ((src).bits) #define nodelist_scnprintf(buf, len, src) \ __nodelist_scnprintf((buf), (len), (src), MAX_NUMNODES) static inline int __nodelist_scnprintf(char *buf, int len, const nodemask_t *srcp, int nbits) { return bitmap_scnlistprintf(buf, len, srcp->bits, nbits); } #if 0 #define nodemask_scnprintf(buf, len, src) \ __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES) static inline int __nodemask_scnprintf(char *buf, int len, const nodemask_t *srcp, int nbits) { return bitmap_scnprintf(buf, len, srcp->bits, nbits); } #define nodemask_parse(ubuf, ulen, dst) \ __nodemask_parse((ubuf), (ulen), &(dst), MAX_NUMNODES) static inline int __nodemask_parse(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse(buf, len, dstp->bits, nbits); } #endif #if MAX_NUMNODES > 1 #define for_each_node_mask(node, mask) \ for ((node) = first_node(mask); \ (node) < MAX_NUMNODES; \ (node) = next_node((node), (mask))) #else /* MAX_NUMNODES == 1 */ #define for_each_node_mask(node, mask) \ if (!nodes_empty(mask)) \ for ((node) = 0; (node) < 1; (node)++) #endif /* MAX_NUMNODES */ /* * The following particular system nodemasks and operations * on them manage online nodes. 
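 *
 * Illustrative sketch, not taken from this header, using the helpers
 * declared below (printk assumed from elsewhere in Xen):
 *
 *     int node;
 *     for_each_online_node(node)
 *         printk("node %d is online\n", node);
 *     printk("%d node(s) online\n", num_online_nodes());
 *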
*/ extern nodemask_t node_online_map; #if MAX_NUMNODES > 1 #define num_online_nodes() nodes_weight(node_online_map) #define node_online(node) node_isset((node), node_online_map) #else #define num_online_nodes() 1 #define node_online(node) ((node) == 0) #endif #define any_online_node(mask) \ ({ \ int node; \ for_each_node_mask(node, (mask)) \ if (node_online(node)) \ break; \ node; \ }) #define node_set_online(node) set_bit((node), node_online_map.bits) #define node_set_offline(node) clear_bit((node), node_online_map.bits) #define for_each_online_node(node) for_each_node_mask((node), node_online_map) #endif /* __LINUX_NODEMASK_H */ xen-4.4.0/xen/include/xen/timer.h0000664000175000017500000000617312307313555014752 0ustar smbsmb/****************************************************************************** * timer.h * * Copyright (c) 2002-2003 Rolf Neugebauer * Copyright (c) 2002-2005 K A Fraser */ #ifndef _TIMER_H_ #define _TIMER_H_ #include #include #include #include #include struct timer { /* System time expiry value (nanoseconds since boot). */ s_time_t expires; /* Position in active-timer data structure. */ union { /* Timer-heap offset (TIMER_STATUS_in_heap). */ unsigned int heap_offset; /* Linked list (TIMER_STATUS_in_list). */ struct timer *list_next; /* Linked list of inactive timers (TIMER_STATUS_inactive). */ struct list_head inactive; }; /* On expiry, '(*function)(data)' will be executed in softirq context. */ void (*function)(void *); void *data; /* CPU on which this timer will be installed and executed. */ #define TIMER_CPU_status_killed 0xffffu /* Timer is TIMER_STATUS_killed */ uint16_t cpu; /* Timer status. */ #define TIMER_STATUS_invalid 0 /* Should never see this. */ #define TIMER_STATUS_inactive 1 /* Not in use; can be activated. */ #define TIMER_STATUS_killed 2 /* Not in use; cannot be activated. */ #define TIMER_STATUS_in_heap 3 /* In use; on timer heap. */ #define TIMER_STATUS_in_list 4 /* In use; on overflow linked list. */ uint8_t status; }; /* * All functions below can be called for any CPU from any CPU in any context. */ /* * Initialise a timer structure with an initial callback CPU, callback * function and callback data pointer. This function must only be called on * a brand new timer, or a killed timer. It must *never* execute concurrently * with any other operation on the same timer. */ void init_timer( struct timer *timer, void (*function)(void *), void *data, unsigned int cpu); /* Set the expiry time and activate a timer. */ void set_timer(struct timer *timer, s_time_t expires); /* * Deactivate a timer This function has no effect if the timer is not currently * active. */ void stop_timer(struct timer *timer); /* Migrate a timer to a different CPU. The timer may be currently active. */ void migrate_timer(struct timer *timer, unsigned int new_cpu); /* * Deactivate a timer and prevent it from being re-set (future calls to * set_timer will silently fail). When this function returns it is guaranteed * that the timer callback handler is not running on any CPU. */ void kill_timer(struct timer *timer); /* Bootstrap initialisation. Must be called before any other timer function. */ void timer_init(void); /* Next timer deadline for each CPU. */ DECLARE_PER_CPU(s_time_t, timer_deadline); /* Arch-defined function to reprogram timer hardware for new deadline. */ int reprogram_timer(s_time_t timeout); /* Calculate the aligned first tick time for a given periodic timer. 
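 *
 * Illustrative sketch, not taken from this header: a self-rearming periodic
 * timer built from the primitives declared above.  NOW(), MILLISECS() and
 * smp_processor_id() are assumed from other Xen headers.
 *
 *     static struct timer t;
 *
 *     static void tick_fn(void *unused)
 *     {
 *         ...do the periodic work...
 *         set_timer(&t, NOW() + MILLISECS(10));
 *     }
 *
 *     init_timer(&t, tick_fn, NULL, smp_processor_id());
 *     set_timer(&t, align_timer(NOW() + MILLISECS(10), MILLISECS(10)));
 *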
*/ s_time_t align_timer(s_time_t firsttick, uint64_t period); #endif /* _TIMER_H_ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/xen/tmem.h0000664000175000017500000000061712307313555014571 0ustar smbsmb/****************************************************************************** * tmem.h * * Transcendent memory * * Copyright (c) 2008, Dan Magenheimer, Oracle Corp. */ #ifndef __XEN_TMEM_H__ #define __XEN_TMEM_H__ extern void tmem_destroy(void *); extern void *tmem_relinquish_pages(unsigned int, unsigned int); extern unsigned long tmem_freeable_pages(void); #endif /* __XEN_TMEM_H__ */ xen-4.4.0/xen/include/Makefile0000664000175000017500000000553612307313555014331 0ustar smbsmbinclude $(XEN_ROOT)/Config.mk ifneq ($(CONFIG_COMPAT),) compat-arch-$(CONFIG_X86) := x86_32 headers-y := \ compat/callback.h \ compat/elfnote.h \ compat/event_channel.h \ compat/features.h \ compat/grant_table.h \ compat/kexec.h \ compat/memory.h \ compat/nmi.h \ compat/physdev.h \ compat/platform.h \ compat/sched.h \ compat/tmem.h \ compat/trace.h \ compat/vcpu.h \ compat/version.h \ compat/xen.h \ compat/xencomm.h \ compat/xenoprof.h headers-$(CONFIG_X86) += compat/arch-x86/xen-mca.h headers-$(CONFIG_X86) += compat/arch-x86/xen.h headers-$(CONFIG_X86) += compat/arch-x86/xen-$(compat-arch-y).h headers-y += compat/arch-$(compat-arch-y).h compat/xlat.h cppflags-y := -include public/xen-compat.h cppflags-$(CONFIG_X86) += -m32 # 8-byte types are 4-byte aligned on x86_32 ... prefix-$(CONFIG_X86) := \#pragma pack(4) suffix-$(CONFIG_X86) := \#pragma pack() endif public-$(CONFIG_X86) := $(wildcard public/arch-x86/*.h public/arch-x86/*/*.h) public-$(CONFIG_ARM) := $(wildcard public/arch-arm/*.h public/arch-arm/*/*.h) .PHONY: all all: $(headers-y) compat/%.h: compat/%.i Makefile $(BASEDIR)/tools/compat-build-header.py set -e; id=_$$(echo $@ | tr '[:lower:]-/.' 
'[:upper:]___'); \ echo "#ifndef $$id" >$@.new; \ echo "#define $$id" >>$@.new; \ echo "#include " >>$@.new; \ $(if $(filter-out compat/arch-%.h,$@),echo "#include <$(patsubst compat/%,public/%,$@)>" >>$@.new;) \ $(if $(prefix-y),echo "$(prefix-y)" >>$@.new;) \ grep -v '^# [0-9]' $< | \ $(PYTHON) $(BASEDIR)/tools/compat-build-header.py | uniq >>$@.new; \ $(if $(suffix-y),echo "$(suffix-y)" >>$@.new;) \ echo "#endif /* $$id */" >>$@.new mv -f $@.new $@ compat/%.i: compat/%.c Makefile $(CPP) $(filter-out -M% .%.d -include %/include/xen/config.h,$(CFLAGS)) $(cppflags-y) -o $@ $< compat/%.c: public/%.h xlat.lst Makefile $(BASEDIR)/tools/compat-build-source.py mkdir -p $(@D) grep -v 'DEFINE_XEN_GUEST_HANDLE(long)' $< | \ $(PYTHON) $(BASEDIR)/tools/compat-build-source.py >$@.new mv -f $@.new $@ compat/xlat.h: xlat.lst $(filter-out compat/xlat.h,$(headers-y)) $(BASEDIR)/tools/get-fields.sh Makefile export PYTHON=$(PYTHON); \ grep -v '^[ ]*#' xlat.lst | \ while read what name hdr; do \ $(SHELL) $(BASEDIR)/tools/get-fields.sh "$$what" compat_$$name $$(echo compat/$$hdr | sed 's,@arch@,$(compat-arch-y),g') || exit $$?; \ done >$@.new mv -f $@.new $@ ifeq ($(XEN_TARGET_ARCH),$(XEN_COMPILE_ARCH)) all: headers.chk headers.chk: $(filter-out public/arch-% public/%ctl.h public/xsm/% public/%hvm/save.h, $(wildcard public/*.h public/*/*.h) $(public-y)) Makefile for i in $(filter %.h,$^); do $(CC) -ansi -include stdint.h -Wall -W -Werror -S -o /dev/null -xc $$i || exit 1; echo $$i; done >$@.new mv $@.new $@ endif clean:: rm -rf compat headers.chk xen-4.4.0/xen/include/crypto/0000775000175000017500000000000012307313555014200 5ustar smbsmbxen-4.4.0/xen/include/crypto/rijndael.h0000664000175000017500000000437312307313555016150 0ustar smbsmb/* $OpenBSD: rijndael.h,v 1.13 2008/06/09 07:49:45 djm Exp $ */ /** * rijndael-alg-fst.h * * @version 3.0 (December 2000) * * Optimised ANSI C code for the Rijndael cipher (now AES) * * @author Vincent Rijmen * @author Antoon Bosselaers * @author Paulo Barreto * * This code is hereby placed in the public domain. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef __RIJNDAEL_H #define __RIJNDAEL_H #define AES_MAXKEYBITS (256) #define AES_MAXKEYBYTES (AES_MAXKEYBITS/8) /* for 256-bit keys, fewer for less */ #define AES_MAXROUNDS 14 //typedef unsigned char u8; //typedef unsigned short u16; //typedef unsigned int u32; /* The structure for key information */ typedef struct { int enc_only; /* context contains only encrypt schedule */ int Nr; /* key-length-dependent number of rounds */ u32 ek[4*(AES_MAXROUNDS + 1)]; /* encrypt key schedule */ u32 dk[4*(AES_MAXROUNDS + 1)]; /* decrypt key schedule */ } rijndael_ctx; int rijndael_set_key(rijndael_ctx *, const u_char *, int); int rijndael_set_key_enc_only(rijndael_ctx *, const u_char *, int); void rijndael_decrypt(rijndael_ctx *, const u_char *, u_char *); void rijndael_encrypt(rijndael_ctx *, const u_char *, u_char *); int rijndaelKeySetupEnc(unsigned int [], const unsigned char [], int); int rijndaelKeySetupDec(unsigned int [], const unsigned char [], int); void rijndaelEncrypt(const unsigned int [], int, const unsigned char [], unsigned char []); #endif /* __RIJNDAEL_H */ xen-4.4.0/xen/include/crypto/vmac.h0000664000175000017500000001600012307313555015274 0ustar smbsmb#ifndef HEADER_VMAC_H #define HEADER_VMAC_H /* -------------------------------------------------------------------------- * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai. * This implementation is herby placed in the public domain. * The authors offers no warranty. Use at your own risk. * Please send bug reports to the authors. * Last modified: 17 APR 08, 1700 PDT * ----------------------------------------------------------------------- */ /* -------------------------------------------------------------------------- * User definable settings. * ----------------------------------------------------------------------- */ #define VMAC_TAG_LEN 64 /* Must be 64 or 128 - 64 sufficient for most */ #define VMAC_KEY_LEN 128 /* Must be 128, 192 or 256 */ #define VMAC_NHBYTES 128 /* Must 2^i for any 3 < i < 13. Standard = 128 */ #define VMAC_PREFER_BIG_ENDIAN 0 /* Prefer non-x86 */ #define VMAC_USE_OPENSSL 0 /* Set to non-zero to use OpenSSL's AES */ #define VMAC_CACHE_NONCES 1 /* Set to non-zero to cause caching */ /* of consecutive nonces on 64-bit tags */ #define VMAC_RUN_TESTS 0 /* Set to non-zero to check vectors and speed */ #define VMAC_HZ (448e6) /* Set to hz of host machine to get speed */ #define VMAC_HASH_ONLY 0 /* Set to non-zero to time hash only (not-mac) */ /* Speeds of cpus I have access to #define hz (2400e6) glyme Core 2 "Conroe" #define hz (2000e6) jupiter G5 #define hz (1592e6) titan #define hz (2793e6) athena/gaia #define hz (1250e6) isis G4 #define hz (2160e6) imac Core 2 "Merom" #define hz (266e6) ppc/arm #define hz (400e6) mips */ /* -------------------------------------------------------------------------- * This implementation uses uint32_t and uint64_t as names for unsigned 32- * and 64-bit integer types. These are defined in C99 stdint.h. The * following may need adaptation if you are not running a C99 or * Microsoft C environment. 
* ----------------------------------------------------------------------- */ #define VMAC_USE_STDINT 1 /* Set to zero if system has no stdint.h */ #if VMAC_USE_STDINT && !_MSC_VER /* Try stdint.h if non-Microsoft */ #ifdef __cplusplus #define __STDC_CONSTANT_MACROS #endif //#include #elif (_MSC_VER) /* Microsoft C does not have stdint.h */ typedef unsigned __int32 uint32_t; typedef unsigned __int64 uint64_t; #define UINT64_C(v) v ## UI64 #else /* Guess sensibly - may need adaptation */ typedef unsigned int uint32_t; typedef unsigned long long uint64_t; #define UINT64_C(v) v ## ULL #endif /* -------------------------------------------------------------------------- * This implementation supports two free AES implementations: OpenSSL's and * Paulo Barreto's. To use OpenSSL's, you will need to include the OpenSSL * crypto library (eg, gcc -lcrypto foo.c). For Barreto's, you will need * to compile rijndael-alg-fst.c, last seen at http://www.iaik.tu-graz.ac.at/ * research/krypto/AES/old/~rijmen/rijndael/rijndael-fst-3.0.zip and * http://homes.esat.kuleuven.be/~rijmen/rijndael/rijndael-fst-3.0.zip. * To use a different implementation, use these definitions as a model. * ----------------------------------------------------------------------- */ #if VMAC_USE_OPENSSL #include typedef AES_KEY aes_int_key; #define aes_encryption(in,out,int_key) \ AES_encrypt((unsigned char *)(in),(unsigned char *)(out),(int_key)) #define aes_key_setup(key,int_key) \ AES_set_encrypt_key((key),VMAC_KEY_LEN,(int_key)) #else //#include "rijndael-alg-fst.h" typedef uint64_t vmac_t; #include "rijndael.h" typedef u32 aes_int_key[4*(VMAC_KEY_LEN/32+7)]; #define aes_encryption(in,out,int_key) \ rijndaelEncrypt((u32 *)(int_key), \ ((VMAC_KEY_LEN/32)+6), \ (u8 *)(in), (u8 *)(out)) #define aes_key_setup(user_key,int_key) \ rijndaelKeySetupEnc((u32 *)(int_key), \ (u8 *)(user_key), \ VMAC_KEY_LEN) #endif /* --------------------------------------------------------------------- */ typedef struct { uint64_t nhkey [(VMAC_NHBYTES/8)+2*(VMAC_TAG_LEN/64-1)]; uint64_t polykey[2*VMAC_TAG_LEN/64]; uint64_t l3key [2*VMAC_TAG_LEN/64]; uint64_t polytmp[2*VMAC_TAG_LEN/64]; aes_int_key cipher_key; #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES) uint64_t cached_nonce[2]; uint64_t cached_aes[2]; #endif int first_block_processed; } vmac_ctx_t; /* --------------------------------------------------------------------- */ #ifdef __cplusplus extern "C" { #endif /* -------------------------------------------------------------------------- * <<<<< USAGE NOTES >>>>> * * Given msg m (mbytes in length) and nonce buffer n * this function returns a tag as its output. The tag is returned as * a number. When VMAC_TAG_LEN == 64, the 'return'ed integer is the tag, * and *tagl is meaningless. When VMAC_TAG_LEN == 128 the tag is the * number y * 2^64 + *tagl where y is the function's return value. * If you want to consider tags to be strings, then you must do so with * an agreed upon endian orientation for interoperability, and convert * the results appropriately. VHASH hashes m without creating any tag. * Consecutive substrings forming a prefix of a message may be passed * to vhash_update, with vhash or vmac being called with the remainder * to produce the output. * * Requirements: * - On 32-bit architectures with SSE2 instructions, ctx and m MUST be * begin on 16-byte memory boundaries. * - m MUST be your message followed by zeroes to the nearest 16-byte * boundary. 
If m is a length multiple of 16 bytes, then it is already * at a 16-byte boundary and needs no padding. mbytes should be your * message length without any padding. * - The first bit of the nonce buffer n must be 0. An i byte nonce, is made * as the first 16-i bytes of n being zero, and the final i the nonce. * - vhash_update MUST have mbytes be a positive multiple of VMAC_NHBYTES * ----------------------------------------------------------------------- */ #define vmac_update vhash_update void vhash_update(unsigned char m[], unsigned int mbytes, vmac_ctx_t *ctx); uint64_t vmac(unsigned char m[], unsigned int mbytes, unsigned char n[16], uint64_t *tagl, vmac_ctx_t *ctx); uint64_t vhash(unsigned char m[], unsigned int mbytes, uint64_t *tagl, vmac_ctx_t *ctx); /* -------------------------------------------------------------------------- * When passed a VMAC_KEY_LEN bit user_key, this function initialazies ctx. * ----------------------------------------------------------------------- */ void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx); /* --------------------------------------------------------------------- */ #ifdef __cplusplus } #endif #endif /* HEADER_AES_H */ xen-4.4.0/xen/include/xlat.lst0000664000175000017500000000625012307313555014357 0ustar smbsmb# First column indicator: # ! - needs translation # ? - needs checking ? dom0_vga_console_info xen.h ? xenctl_bitmap xen.h ? mmu_update xen.h ! mmuext_op xen.h ! start_info xen.h ? vcpu_info xen.h ? vcpu_time_info xen.h ! cpu_user_regs arch-x86/xen-@arch@.h ! trap_info arch-x86/xen.h ? cpu_offline_action arch-x86/xen-mca.h ? mc arch-x86/xen-mca.h ? mcinfo_bank arch-x86/xen-mca.h ? mcinfo_common arch-x86/xen-mca.h ? mcinfo_extended arch-x86/xen-mca.h ? mcinfo_global arch-x86/xen-mca.h ? mcinfo_logical_cpu arch-x86/xen-mca.h ? mcinfo_msr arch-x86/xen-mca.h ? mcinfo_recovery arch-x86/xen-mca.h ! mc_fetch arch-x86/xen-mca.h ? mc_info arch-x86/xen-mca.h ? mc_mceinject arch-x86/xen-mca.h ? mc_msrinject arch-x86/xen-mca.h ? mc_notifydomain arch-x86/xen-mca.h ! mc_physcpuinfo arch-x86/xen-mca.h ? page_offline_action arch-x86/xen-mca.h ? evtchn_alloc_unbound event_channel.h ? evtchn_bind_interdomain event_channel.h ? evtchn_bind_ipi event_channel.h ? evtchn_bind_pirq event_channel.h ? evtchn_bind_vcpu event_channel.h ? evtchn_bind_virq event_channel.h ? evtchn_close event_channel.h ? evtchn_op event_channel.h ? evtchn_send event_channel.h ? evtchn_status event_channel.h ? evtchn_unmask event_channel.h ! gnttab_copy grant_table.h ? gnttab_dump_table grant_table.h ? gnttab_map_grant_ref grant_table.h ! gnttab_setup_table grant_table.h ! gnttab_transfer grant_table.h ? gnttab_unmap_grant_ref grant_table.h ? gnttab_unmap_and_replace grant_table.h ? gnttab_set_version grant_table.h ? gnttab_get_version grant_table.h ! gnttab_get_status_frames grant_table.h ? grant_entry_v1 grant_table.h ? grant_entry_header grant_table.h ? grant_entry_v2 grant_table.h ? gnttab_swap_grant_ref grant_table.h ? kexec_exec kexec.h ! kexec_image kexec.h ! kexec_range kexec.h ! add_to_physmap memory.h ! add_to_physmap_batch memory.h ! foreign_memory_map memory.h ! memory_exchange memory.h ! memory_map memory.h ! memory_reservation memory.h ! pod_target memory.h ! remove_from_physmap memory.h ? physdev_eoi physdev.h ? physdev_get_free_pirq physdev.h ? physdev_irq physdev.h ? physdev_irq_status_query physdev.h ? physdev_manage_pci physdev.h ? physdev_manage_pci_ext physdev.h ? physdev_pci_device physdev.h ? physdev_pci_device_add physdev.h ? 
physdev_pci_mmcfg_reserved physdev.h ? physdev_unmap_pirq physdev.h ? physdev_restore_msi physdev.h ? physdev_set_iopl physdev.h ? physdev_setup_gsi physdev.h ! pct_register platform.h ! power_register platform.h ? processor_csd platform.h ! processor_cx platform.h ! processor_flags platform.h ! processor_performance platform.h ! processor_power platform.h ? processor_px platform.h ! psd_package platform.h ? xenpf_enter_acpi_sleep platform.h ? xenpf_pcpuinfo platform.h ? xenpf_pcpu_version platform.h ! sched_poll sched.h ? sched_remote_shutdown sched.h ? sched_shutdown sched.h ! tmem_op tmem.h ? t_buf trace.h ? vcpu_get_physid vcpu.h ? vcpu_register_vcpu_info vcpu.h ! vcpu_runstate_info vcpu.h ? vcpu_set_periodic_timer vcpu.h ! vcpu_set_singleshot_timer vcpu.h ? xenoprof_init xenoprof.h ? xenoprof_passive xenoprof.h xen-4.4.0/xen/include/public/0000775000175000017500000000000012307313555014136 5ustar smbsmbxen-4.4.0/xen/include/public/xenoprof.h0000664000175000017500000001065512307313555016156 0ustar smbsmb/****************************************************************************** * xenoprof.h * * Interface for enabling system wide profiling based on hardware performance * counters * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (C) 2005 Hewlett-Packard Co. * Written by Aravind Menon & Jose Renato Santos */ #ifndef __XEN_PUBLIC_XENOPROF_H__ #define __XEN_PUBLIC_XENOPROF_H__ #include "xen.h" /* * Commands to HYPERVISOR_xenoprof_op(). 
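 *
 * Illustrative sketch, not taken from this header: a guest-side profiling
 * driver would typically begin with
 *
 *     struct xenoprof_init init;
 *     int rc = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
 *
 * and then obtain sample buffers via XENOPROF_get_buffer using the
 * structures defined further below.  HYPERVISOR_xenoprof_op() is the guest's
 * hypercall wrapper and is not declared in this header.
 *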
*/ #define XENOPROF_init 0 #define XENOPROF_reset_active_list 1 #define XENOPROF_reset_passive_list 2 #define XENOPROF_set_active 3 #define XENOPROF_set_passive 4 #define XENOPROF_reserve_counters 5 #define XENOPROF_counter 6 #define XENOPROF_setup_events 7 #define XENOPROF_enable_virq 8 #define XENOPROF_start 9 #define XENOPROF_stop 10 #define XENOPROF_disable_virq 11 #define XENOPROF_release_counters 12 #define XENOPROF_shutdown 13 #define XENOPROF_get_buffer 14 #define XENOPROF_set_backtrace 15 /* AMD IBS support */ #define XENOPROF_get_ibs_caps 16 #define XENOPROF_ibs_counter 17 #define XENOPROF_last_op 17 #define MAX_OPROF_EVENTS 32 #define MAX_OPROF_DOMAINS 25 #define XENOPROF_CPU_TYPE_SIZE 64 /* Xenoprof performance events (not Xen events) */ struct event_log { uint64_t eip; uint8_t mode; uint8_t event; }; /* PC value that indicates a special code */ #define XENOPROF_ESCAPE_CODE (~0ULL) /* Transient events for the xenoprof->oprofile cpu buf */ #define XENOPROF_TRACE_BEGIN 1 /* Xenoprof buffer shared between Xen and domain - 1 per VCPU */ struct xenoprof_buf { uint32_t event_head; uint32_t event_tail; uint32_t event_size; uint32_t vcpu_id; uint64_t xen_samples; uint64_t kernel_samples; uint64_t user_samples; uint64_t lost_samples; struct event_log event_log[1]; }; #ifndef __XEN__ typedef struct xenoprof_buf xenoprof_buf_t; DEFINE_XEN_GUEST_HANDLE(xenoprof_buf_t); #endif struct xenoprof_init { int32_t num_events; int32_t is_primary; char cpu_type[XENOPROF_CPU_TYPE_SIZE]; }; typedef struct xenoprof_init xenoprof_init_t; DEFINE_XEN_GUEST_HANDLE(xenoprof_init_t); struct xenoprof_get_buffer { int32_t max_samples; int32_t nbuf; int32_t bufsize; uint64_t buf_gmaddr; }; typedef struct xenoprof_get_buffer xenoprof_get_buffer_t; DEFINE_XEN_GUEST_HANDLE(xenoprof_get_buffer_t); struct xenoprof_counter { uint32_t ind; uint64_t count; uint32_t enabled; uint32_t event; uint32_t hypervisor; uint32_t kernel; uint32_t user; uint64_t unit_mask; }; typedef struct xenoprof_counter xenoprof_counter_t; DEFINE_XEN_GUEST_HANDLE(xenoprof_counter_t); typedef struct xenoprof_passive { uint16_t domain_id; int32_t max_samples; int32_t nbuf; int32_t bufsize; uint64_t buf_gmaddr; } xenoprof_passive_t; DEFINE_XEN_GUEST_HANDLE(xenoprof_passive_t); struct xenoprof_ibs_counter { uint64_t op_enabled; uint64_t fetch_enabled; uint64_t max_cnt_fetch; uint64_t max_cnt_op; uint64_t rand_en; uint64_t dispatched_ops; }; typedef struct xenoprof_ibs_counter xenoprof_ibs_counter_t; DEFINE_XEN_GUEST_HANDLE(xenoprof_ibs_counter_t); #endif /* __XEN_PUBLIC_XENOPROF_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/hvm/0000775000175000017500000000000012307313555014730 5ustar smbsmbxen-4.4.0/xen/include/public/hvm/save.h0000664000175000017500000001063512307313555016044 0ustar smbsmb/* * hvm/save.h * * Structure definitions for HVM state that is held by Xen and must * be saved along with the domain's memory and device-model state. * * Copyright (c) 2007 XenSource Ltd. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_PUBLIC_HVM_SAVE_H__ #define __XEN_PUBLIC_HVM_SAVE_H__ /* * Structures in this header *must* have the same layout in 32bit * and 64bit environments: this means that all fields must be explicitly * sized types and aligned to their sizes, and the structs must be * a multiple of eight bytes long. * * Only the state necessary for saving and restoring (i.e. fields * that are analogous to actual hardware state) should go in this file. * Internal mechanisms should be kept in Xen-private headers. */ #if !defined(__GNUC__) || defined(__STRICT_ANSI__) #error "Anonymous structs/unions are a GNU extension." #endif /* * Each entry is preceded by a descriptor giving its type and length */ struct hvm_save_descriptor { uint16_t typecode; /* Used to demux the various types below */ uint16_t instance; /* Further demux within a type */ uint32_t length; /* In bytes, *not* including this descriptor */ }; /* * Each entry has a datatype associated with it: for example, the CPU state * is saved as a HVM_SAVE_TYPE(CPU), which has HVM_SAVE_LENGTH(CPU), * and is identified by a descriptor with typecode HVM_SAVE_CODE(CPU). * DECLARE_HVM_SAVE_TYPE binds these things together with some type-system * ugliness. 
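 *
 * Illustrative sketch, not taken from this header, using a hypothetical
 * record type and code number:
 *
 *     struct hvm_hw_foo { uint64_t bar; };
 *     DECLARE_HVM_SAVE_TYPE(FOO, 42, struct hvm_hw_foo);
 *
 * after which HVM_SAVE_CODE(FOO) evaluates to 42, HVM_SAVE_LENGTH(FOO) to
 * sizeof(struct hvm_hw_foo), and a record of this type is preceded in the
 * save stream by a struct hvm_save_descriptor with typecode 42.
 *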
*/ #ifdef __XEN__ # define DECLARE_HVM_SAVE_TYPE_COMPAT(_x, _code, _type, _ctype, _fix) \ static inline int __HVM_SAVE_FIX_COMPAT_##_x(void *h) { return _fix(h); } \ struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[2];}; \ struct __HVM_SAVE_TYPE_COMPAT_##_x { _ctype t; } # include /* BUG() */ # define DECLARE_HVM_SAVE_TYPE(_x, _code, _type) \ static inline int __HVM_SAVE_FIX_COMPAT_##_x(void *h) { BUG(); return -1; } \ struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[1];}; \ struct __HVM_SAVE_TYPE_COMPAT_##_x { _type t; } #else # define DECLARE_HVM_SAVE_TYPE_COMPAT(_x, _code, _type, _ctype, _fix) \ struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[2];} # define DECLARE_HVM_SAVE_TYPE(_x, _code, _type) \ struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[1];} #endif #define HVM_SAVE_TYPE(_x) typeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->t) #define HVM_SAVE_LENGTH(_x) (sizeof (HVM_SAVE_TYPE(_x))) #define HVM_SAVE_CODE(_x) (sizeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->c)) #ifdef __XEN__ # define HVM_SAVE_TYPE_COMPAT(_x) typeof (((struct __HVM_SAVE_TYPE_COMPAT_##_x *)(0))->t) # define HVM_SAVE_LENGTH_COMPAT(_x) (sizeof (HVM_SAVE_TYPE_COMPAT(_x))) # define HVM_SAVE_HAS_COMPAT(_x) (sizeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->cpt)-1) # define HVM_SAVE_FIX_COMPAT(_x, _dst) __HVM_SAVE_FIX_COMPAT_##_x(_dst) #endif /* * The series of save records is teminated by a zero-type, zero-length * descriptor. */ struct hvm_save_end {}; DECLARE_HVM_SAVE_TYPE(END, 0, struct hvm_save_end); #if defined(__i386__) || defined(__x86_64__) #include "../arch-x86/hvm/save.h" #elif defined(__arm__) || defined(__aarch64__) #include "../arch-arm/hvm/save.h" #else #error "unsupported architecture" #endif #endif /* __XEN_PUBLIC_HVM_SAVE_H__ */ xen-4.4.0/xen/include/public/hvm/pvdrivers.h0000664000175000017500000000434112307313555017127 0ustar smbsmb/* * pvdrivers.h: Register of PV drivers product numbers. * Copyright (c) 2012, Citrix Systems Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef _XEN_PUBLIC_PVDRIVERS_H_ #define _XEN_PUBLIC_PVDRIVERS_H_ /* * This is the master registry of product numbers for * PV drivers. * If you need a new product number allocating, please * post to xen-devel@lists.xensource.com. You should NOT use * a product number without allocating one. * If you maintain a separate versioning and distribution path * for PV drivers you should have a separate product number so * that your drivers can be separated from others. 
* * During development, you may use the product ID to * indicate a driver which is yet to be released. */ #define PVDRIVERS_PRODUCT_LIST(EACH) \ EACH("xensource-windows", 0x0001) /* Citrix */ \ EACH("gplpv-windows", 0x0002) /* James Harper */ \ EACH("linux", 0x0003) \ EACH("xenserver-windows-v7.0+", 0x0004) /* Citrix */ \ EACH("xenserver-windows-v7.2+", 0x0005) /* Citrix */ \ EACH("experimental", 0xffff) #endif /* _XEN_PUBLIC_PVDRIVERS_H_ */ xen-4.4.0/xen/include/public/hvm/params.h0000664000175000017500000001276412307313555016376 0ustar smbsmb/* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_PUBLIC_HVM_PARAMS_H__ #define __XEN_PUBLIC_HVM_PARAMS_H__ #include "hvm_op.h" /* * Parameter space for HVMOP_{set,get}_param. */ /* * How should CPU0 event-channel notifications be delivered? * val[63:56] == 0: val[55:0] is a delivery GSI (Global System Interrupt). * val[63:56] == 1: val[55:0] is a delivery PCI INTx line, as follows: * Domain = val[47:32], Bus = val[31:16], * DevFn = val[15: 8], IntX = val[ 1: 0] * val[63:56] == 2: val[7:0] is a vector number, check for * XENFEAT_hvm_callback_vector to know if this delivery * method is available. * If val == 0 then CPU0 event-channel notifications are not delivered. */ #define HVM_PARAM_CALLBACK_IRQ 0 /* * These are not used by Xen. They are here for convenience of HVM-guest * xenbus implementations. */ #define HVM_PARAM_STORE_PFN 1 #define HVM_PARAM_STORE_EVTCHN 2 #define HVM_PARAM_PAE_ENABLED 4 #define HVM_PARAM_IOREQ_PFN 5 #define HVM_PARAM_BUFIOREQ_PFN 6 #define HVM_PARAM_BUFIOREQ_EVTCHN 26 #if defined(__i386__) || defined(__x86_64__) /* Expose Viridian interfaces to this HVM guest? */ #define HVM_PARAM_VIRIDIAN 9 #endif /* * Set mode for virtual timers (currently x86 only): * delay_for_missed_ticks (default): * Do not advance a vcpu's time beyond the correct delivery time for * interrupts that have been missed due to preemption. Deliver missed * interrupts when the vcpu is rescheduled and advance the vcpu's virtual * time stepwise for each one. * no_delay_for_missed_ticks: * As above, missed interrupts are delivered, but guest time always tracks * wallclock (i.e., real) time while doing so. * no_missed_ticks_pending: * No missed interrupts are held pending. Instead, to ensure ticks are * delivered at some non-zero rate, if we detect missed ticks then the * internal tick alarm is not disabled if the VCPU is preempted during the * next tick period. 
* one_missed_tick_pending: * Missed interrupts are collapsed together and delivered as one 'late tick'. * Guest time always tracks wallclock (i.e., real) time. */ #define HVM_PARAM_TIMER_MODE 10 #define HVMPTM_delay_for_missed_ticks 0 #define HVMPTM_no_delay_for_missed_ticks 1 #define HVMPTM_no_missed_ticks_pending 2 #define HVMPTM_one_missed_tick_pending 3 /* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */ #define HVM_PARAM_HPET_ENABLED 11 /* Identity-map page directory used by Intel EPT when CR0.PG=0. */ #define HVM_PARAM_IDENT_PT 12 /* Device Model domain, defaults to 0. */ #define HVM_PARAM_DM_DOMAIN 13 /* ACPI S state: currently support S0 and S3 on x86. */ #define HVM_PARAM_ACPI_S_STATE 14 /* TSS used on Intel when CR0.PE=0. */ #define HVM_PARAM_VM86_TSS 15 /* Boolean: Enable aligning all periodic vpts to reduce interrupts */ #define HVM_PARAM_VPT_ALIGN 16 /* Console debug shared memory ring and event channel */ #define HVM_PARAM_CONSOLE_PFN 17 #define HVM_PARAM_CONSOLE_EVTCHN 18 /* * Select location of ACPI PM1a and TMR control blocks. Currently two locations * are supported, specified by version 0 or 1 in this parameter: * - 0: default, use the old addresses * PM1A_EVT == 0x1f40; PM1A_CNT == 0x1f44; PM_TMR == 0x1f48 * - 1: use the new default qemu addresses * PM1A_EVT == 0xb000; PM1A_CNT == 0xb004; PM_TMR == 0xb008 * You can find these address definitions in */ #define HVM_PARAM_ACPI_IOPORTS_LOCATION 19 /* Enable blocking memory events, async or sync (pause vcpu until response) * onchangeonly indicates messages only on a change of value */ #define HVM_PARAM_MEMORY_EVENT_CR0 20 #define HVM_PARAM_MEMORY_EVENT_CR3 21 #define HVM_PARAM_MEMORY_EVENT_CR4 22 #define HVM_PARAM_MEMORY_EVENT_INT3 23 #define HVM_PARAM_MEMORY_EVENT_SINGLE_STEP 25 #define HVM_PARAM_MEMORY_EVENT_MSR 30 #define HVMPME_MODE_MASK (3 << 0) #define HVMPME_mode_disabled 0 #define HVMPME_mode_async 1 #define HVMPME_mode_sync 2 #define HVMPME_onchangeonly (1 << 2) /* Boolean: Enable nestedhvm (hvm only) */ #define HVM_PARAM_NESTEDHVM 24 /* Params for the mem event rings */ #define HVM_PARAM_PAGING_RING_PFN 27 #define HVM_PARAM_ACCESS_RING_PFN 28 #define HVM_PARAM_SHARING_RING_PFN 29 /* SHUTDOWN_* action in case of a triple fault */ #define HVM_PARAM_TRIPLE_FAULT_REASON 31 #define HVM_NR_PARAMS 32 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */ xen-4.4.0/xen/include/public/hvm/hvm_op.h0000664000175000017500000002300312307313555016367 0ustar smbsmb/* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
*/ #ifndef __XEN_PUBLIC_HVM_HVM_OP_H__ #define __XEN_PUBLIC_HVM_HVM_OP_H__ #include "../xen.h" #include "../trace.h" /* Get/set subcommands: extra argument == pointer to xen_hvm_param struct. */ #define HVMOP_set_param 0 #define HVMOP_get_param 1 struct xen_hvm_param { domid_t domid; /* IN */ uint32_t index; /* IN */ uint64_t value; /* IN/OUT */ }; typedef struct xen_hvm_param xen_hvm_param_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_param_t); /* Set the logical level of one of a domain's PCI INTx wires. */ #define HVMOP_set_pci_intx_level 2 struct xen_hvm_set_pci_intx_level { /* Domain to be updated. */ domid_t domid; /* PCI INTx identification in PCI topology (domain:bus:device:intx). */ uint8_t domain, bus, device, intx; /* Assertion level (0 = unasserted, 1 = asserted). */ uint8_t level; }; typedef struct xen_hvm_set_pci_intx_level xen_hvm_set_pci_intx_level_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t); /* Set the logical level of one of a domain's ISA IRQ wires. */ #define HVMOP_set_isa_irq_level 3 struct xen_hvm_set_isa_irq_level { /* Domain to be updated. */ domid_t domid; /* ISA device identification, by ISA IRQ (0-15). */ uint8_t isa_irq; /* Assertion level (0 = unasserted, 1 = asserted). */ uint8_t level; }; typedef struct xen_hvm_set_isa_irq_level xen_hvm_set_isa_irq_level_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t); #define HVMOP_set_pci_link_route 4 struct xen_hvm_set_pci_link_route { /* Domain to be updated. */ domid_t domid; /* PCI link identifier (0-3). */ uint8_t link; /* ISA IRQ (1-15), or 0 (disable link). */ uint8_t isa_irq; }; typedef struct xen_hvm_set_pci_link_route xen_hvm_set_pci_link_route_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t); /* Flushes all VCPU TLBs: @arg must be NULL. */ #define HVMOP_flush_tlbs 5 typedef enum { HVMMEM_ram_rw, /* Normal read/write guest RAM */ HVMMEM_ram_ro, /* Read-only; writes are discarded */ HVMMEM_mmio_dm, /* Reads and write go to the device model */ } hvmmem_type_t; /* Following tools-only interfaces may change in future. */ #if defined(__XEN__) || defined(__XEN_TOOLS__) /* Track dirty VRAM. */ #define HVMOP_track_dirty_vram 6 struct xen_hvm_track_dirty_vram { /* Domain to be tracked. */ domid_t domid; /* First pfn to track. */ uint64_aligned_t first_pfn; /* Number of pages to track. */ uint64_aligned_t nr; /* OUT variable. */ /* Dirty bitmap buffer. */ XEN_GUEST_HANDLE_64(uint8) dirty_bitmap; }; typedef struct xen_hvm_track_dirty_vram xen_hvm_track_dirty_vram_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_track_dirty_vram_t); /* Notify that some pages got modified by the Device Model. */ #define HVMOP_modified_memory 7 struct xen_hvm_modified_memory { /* Domain to be updated. */ domid_t domid; /* First pfn. */ uint64_aligned_t first_pfn; /* Number of pages. */ uint64_aligned_t nr; }; typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t); #define HVMOP_set_mem_type 8 /* Notify that a region of memory is to be treated in a specific way. */ struct xen_hvm_set_mem_type { /* Domain to be updated. */ domid_t domid; /* Memory type */ uint16_t hvmmem_type; /* Number of pages. */ uint32_t nr; /* First pfn. */ uint64_aligned_t first_pfn; }; typedef struct xen_hvm_set_mem_type xen_hvm_set_mem_type_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_type_t); #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ /* Hint from PV drivers for pagetable destruction. 
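 *
 * Illustrative sketch, not taken from this header: a PV driver inside an HVM
 * guest issuing this hint might do
 *
 *     struct xen_hvm_pagetable_dying a = {
 *         .domid = DOMID_SELF,
 *         .gpa   = gpa_of_top_level_pagetable,
 *     };
 *     HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
 *
 * where HYPERVISOR_hvm_op() is the guest's hypercall wrapper (not declared
 * here) and the gpa value is supplied by the caller.
 *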
*/ #define HVMOP_pagetable_dying 9 struct xen_hvm_pagetable_dying { /* Domain with a pagetable about to be destroyed. */ domid_t domid; uint16_t pad[3]; /* align next field on 8-byte boundary */ /* guest physical address of the toplevel pagetable dying */ uint64_t gpa; }; typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_pagetable_dying_t); /* Get the current Xen time, in nanoseconds since system boot. */ #define HVMOP_get_time 10 struct xen_hvm_get_time { uint64_t now; /* OUT */ }; typedef struct xen_hvm_get_time xen_hvm_get_time_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_time_t); #define HVMOP_xentrace 11 struct xen_hvm_xentrace { uint16_t event, extra_bytes; uint8_t extra[TRACE_EXTRA_MAX * sizeof(uint32_t)]; }; typedef struct xen_hvm_xentrace xen_hvm_xentrace_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_xentrace_t); /* Following tools-only interfaces may change in future. */ #if defined(__XEN__) || defined(__XEN_TOOLS__) #define HVMOP_set_mem_access 12 typedef enum { HVMMEM_access_n, HVMMEM_access_r, HVMMEM_access_w, HVMMEM_access_rw, HVMMEM_access_x, HVMMEM_access_rx, HVMMEM_access_wx, HVMMEM_access_rwx, HVMMEM_access_rx2rw, /* Page starts off as r-x, but automatically * change to r-w on a write */ HVMMEM_access_n2rwx, /* Log access: starts off as n, automatically * goes to rwx, generating an event without * pausing the vcpu */ HVMMEM_access_default /* Take the domain default */ } hvmmem_access_t; /* Notify that a region of memory is to have specific access types */ struct xen_hvm_set_mem_access { /* Domain to be updated. */ domid_t domid; /* Memory type */ uint16_t hvmmem_access; /* hvm_access_t */ /* Number of pages, ignored on setting default access */ uint32_t nr; /* First pfn, or ~0ull to set the default access for new pages */ uint64_aligned_t first_pfn; }; typedef struct xen_hvm_set_mem_access xen_hvm_set_mem_access_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_access_t); #define HVMOP_get_mem_access 13 /* Get the specific access type for that region of memory */ struct xen_hvm_get_mem_access { /* Domain to be queried. */ domid_t domid; /* Memory type: OUT */ uint16_t hvmmem_access; /* hvm_access_t */ /* pfn, or ~0ull for default access for new pages. IN */ uint64_aligned_t pfn; }; typedef struct xen_hvm_get_mem_access xen_hvm_get_mem_access_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_mem_access_t); #define HVMOP_inject_trap 14 /* Inject a trap into a VCPU, which will get taken up on the next * scheduling of it. Note that the caller should know enough of the * state of the CPU before injecting, to know what the effect of * injecting the trap will be. */ struct xen_hvm_inject_trap { /* Domain to be queried. */ domid_t domid; /* VCPU */ uint32_t vcpuid; /* Vector number */ uint32_t vector; /* Trap type (HVMOP_TRAP_*) */ uint32_t type; /* NB. 
This enumeration precisely matches hvm.h:X86_EVENTTYPE_* */ # define HVMOP_TRAP_ext_int 0 /* external interrupt */ # define HVMOP_TRAP_nmi 2 /* nmi */ # define HVMOP_TRAP_hw_exc 3 /* hardware exception */ # define HVMOP_TRAP_sw_int 4 /* software interrupt (CD nn) */ # define HVMOP_TRAP_pri_sw_exc 5 /* ICEBP (F1) */ # define HVMOP_TRAP_sw_exc 6 /* INT3 (CC), INTO (CE) */ /* Error code, or ~0u to skip */ uint32_t error_code; /* Intruction length */ uint32_t insn_len; /* CR2 for page faults */ uint64_aligned_t cr2; }; typedef struct xen_hvm_inject_trap xen_hvm_inject_trap_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_inject_trap_t); #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ #define HVMOP_get_mem_type 15 /* Return hvmmem_type_t for the specified pfn. */ struct xen_hvm_get_mem_type { /* Domain to be queried. */ domid_t domid; /* OUT variable. */ uint16_t mem_type; uint16_t pad[2]; /* align next field on 8-byte boundary */ /* IN variable. */ uint64_t pfn; }; typedef struct xen_hvm_get_mem_type xen_hvm_get_mem_type_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_mem_type_t); /* Following tools-only interfaces may change in future. */ #if defined(__XEN__) || defined(__XEN_TOOLS__) /* MSI injection for emulated devices */ #define HVMOP_inject_msi 16 struct xen_hvm_inject_msi { /* Domain to be injected */ domid_t domid; /* Data -- lower 32 bits */ uint32_t data; /* Address (0xfeexxxxx) */ uint64_t addr; }; typedef struct xen_hvm_inject_msi xen_hvm_inject_msi_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_inject_msi_t); #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */ xen-4.4.0/xen/include/public/hvm/e820.h0000664000175000017500000000300612307313555015556 0ustar smbsmb /* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_PUBLIC_HVM_E820_H__ #define __XEN_PUBLIC_HVM_E820_H__ /* E820 location in HVM virtual address space. */ #define HVM_E820_PAGE 0x00090000 #define HVM_E820_NR_OFFSET 0x000001E8 #define HVM_E820_OFFSET 0x000002D0 #define HVM_BELOW_4G_RAM_END 0xF0000000 #define HVM_BELOW_4G_MMIO_START HVM_BELOW_4G_RAM_END #define HVM_BELOW_4G_MMIO_LENGTH ((1ULL << 32) - HVM_BELOW_4G_MMIO_START) #endif /* __XEN_PUBLIC_HVM_E820_H__ */ xen-4.4.0/xen/include/public/hvm/hvm_xs_strings.h0000664000175000017500000000777512307313555020176 0ustar smbsmb/****************************************************************************** * hvm/hvm_xs_strings.h * * HVM xenstore strings used in HVMLOADER. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__ #define __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__ #define HVM_XS_HVMLOADER "hvmloader" #define HVM_XS_BIOS "hvmloader/bios" #define HVM_XS_GENERATION_ID_ADDRESS "hvmloader/generation-id-address" #define HVM_XS_ALLOW_MEMORY_RELOCATE "hvmloader/allow-memory-relocate" /* The following values allow additional ACPI tables to be added to the * virtual ACPI BIOS that hvmloader constructs. The values specify the guest * physical address and length of a block of ACPI tables to add. The format of * the block is simply concatenated raw tables (which specify their own length * in the ACPI header). */ #define HVM_XS_ACPI_PT_ADDRESS "hvmloader/acpi/address" #define HVM_XS_ACPI_PT_LENGTH "hvmloader/acpi/length" /* Any number of SMBIOS types can be passed through to an HVM guest using * the following xenstore values. The values specify the guest physical * address and length of a block of SMBIOS structures for hvmloader to use. * The block is formatted in the following way: * * ... * * Each length separator is a 32b integer indicating the length of the next * SMBIOS structure. For DMTF defined types (0 - 121), the passed in struct * will replace the default structure in hvmloader. In addition, any * OEM/vendortypes (128 - 255) will all be added. */ #define HVM_XS_SMBIOS_PT_ADDRESS "hvmloader/smbios/address" #define HVM_XS_SMBIOS_PT_LENGTH "hvmloader/smbios/length" /* Set to 1 to enable SMBIOS default portable battery (type 22) values. */ #define HVM_XS_SMBIOS_DEFAULT_BATTERY "hvmloader/smbios/default_battery" /* The following xenstore values are used to override some of the default * string values in the SMBIOS table constructed in hvmloader. 
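 *
 * Hedged illustration of the pass-through block described earlier in this
 * header: it is a plain concatenation of 32-bit length separators and raw
 * tables, so a toolstack might assemble it roughly as below ('buf',
 * 'tables', 'table_len' and 'nr_tables' are hypothetical, and error
 * handling is omitted):
 *
 *   uint8_t *p = buf;
 *   for ( i = 0; i < nr_tables; i++ )
 *   {
 *       uint32_t len = table_len[i];
 *       memcpy(p, &len, sizeof(len));
 *       memcpy(p + sizeof(len), tables[i], len);
 *       p += sizeof(len) + len;
 *   }
 *
 * The guest physical address and total size of 'buf' would then be written
 * to HVM_XS_SMBIOS_PT_ADDRESS and HVM_XS_SMBIOS_PT_LENGTH.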
*/ #define HVM_XS_BIOS_STRINGS "bios-strings" #define HVM_XS_BIOS_VENDOR "bios-strings/bios-vendor" #define HVM_XS_BIOS_VERSION "bios-strings/bios-version" #define HVM_XS_SYSTEM_MANUFACTURER "bios-strings/system-manufacturer" #define HVM_XS_SYSTEM_PRODUCT_NAME "bios-strings/system-product-name" #define HVM_XS_SYSTEM_VERSION "bios-strings/system-version" #define HVM_XS_SYSTEM_SERIAL_NUMBER "bios-strings/system-serial-number" #define HVM_XS_ENCLOSURE_MANUFACTURER "bios-strings/enclosure-manufacturer" #define HVM_XS_ENCLOSURE_SERIAL_NUMBER "bios-strings/enclosure-serial-number" #define HVM_XS_BATTERY_MANUFACTURER "bios-strings/battery-manufacturer" #define HVM_XS_BATTERY_DEVICE_NAME "bios-strings/battery-device-name" /* 1 to 99 OEM strings can be set in xenstore using values of the form * below. These strings will be loaded into the SMBIOS type 11 structure. */ #define HVM_XS_OEM_STRINGS "bios-strings/oem-%d" #endif /* __XEN_PUBLIC_HVM_HVM_XS_STRINGS_H__ */ xen-4.4.0/xen/include/public/hvm/hvm_info_table.h0000664000175000017500000000515312307313555020061 0ustar smbsmb/****************************************************************************** * hvm/hvm_info_table.h * * HVM parameter and information table, written into guest memory map. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ #define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ #define HVM_INFO_PFN 0x09F #define HVM_INFO_OFFSET 0x800 #define HVM_INFO_PADDR ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET) /* Maximum we can support with current vLAPIC ID mapping. */ #define HVM_MAX_VCPUS 128 struct hvm_info_table { char signature[8]; /* "HVM INFO" */ uint32_t length; uint8_t checksum; /* Should firmware build APIC descriptors (APIC MADT / MP BIOS)? */ uint8_t apic_mode; /* How many CPUs does this domain have? */ uint32_t nr_vcpus; /* * MEMORY MAP provided by HVM domain builder. * Notes: * 1. page_to_phys(x) = x << 12 * 2. If a field is zero, the corresponding range does not exist. */ /* * 0x0 to page_to_phys(low_mem_pgend)-1: * RAM below 4GB (except for VGA hole 0xA0000-0xBFFFF) */ uint32_t low_mem_pgend; /* * page_to_phys(reserved_mem_pgstart) to 0xFFFFFFFF: * Reserved for special memory mappings */ uint32_t reserved_mem_pgstart; /* * 0x100000000 to page_to_phys(high_mem_pgend)-1: * RAM above 4GB */ uint32_t high_mem_pgend; /* Bitmap of which CPUs are online at boot time. 
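 *
 * For example (illustration only), firmware code could test whether vCPU n
 * is marked online with:
 *
 *   online = (vcpu_online[n / 8] >> (n % 8)) & 1;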
*/ uint8_t vcpu_online[(HVM_MAX_VCPUS + 7)/8]; }; #endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */ xen-4.4.0/xen/include/public/hvm/ioreq.h0000664000175000017500000001072312307313555016223 0ustar smbsmb/* * ioreq.h: I/O request definitions for device models * Copyright (c) 2004, Intel Corporation. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef _IOREQ_H_ #define _IOREQ_H_ #define IOREQ_READ 1 #define IOREQ_WRITE 0 #define STATE_IOREQ_NONE 0 #define STATE_IOREQ_READY 1 #define STATE_IOREQ_INPROCESS 2 #define STATE_IORESP_READY 3 #define IOREQ_TYPE_PIO 0 /* pio */ #define IOREQ_TYPE_COPY 1 /* mmio ops */ #define IOREQ_TYPE_TIMEOFFSET 7 #define IOREQ_TYPE_INVALIDATE 8 /* mapcache */ /* * VMExit dispatcher should cooperate with instruction decoder to * prepare this structure and notify service OS and DM by sending * virq */ struct ioreq { uint64_t addr; /* physical address */ uint64_t data; /* data (or paddr of data) */ uint32_t count; /* for rep prefixes */ uint32_t size; /* size in bytes */ uint32_t vp_eport; /* evtchn for notifications to/from device model */ uint16_t _pad0; uint8_t state:4; uint8_t data_is_ptr:1; /* if 1, data above is the guest paddr * of the real data to use. */ uint8_t dir:1; /* 1=read, 0=write */ uint8_t df:1; uint8_t _pad1:1; uint8_t type; /* I/O type */ }; typedef struct ioreq ioreq_t; struct shared_iopage { struct ioreq vcpu_ioreq[1]; }; typedef struct shared_iopage shared_iopage_t; struct buf_ioreq { uint8_t type; /* I/O type */ uint8_t pad:1; uint8_t dir:1; /* 1=read, 0=write */ uint8_t size:2; /* 0=>1, 1=>2, 2=>4, 3=>8. If 8, use two buf_ioreqs */ uint32_t addr:20;/* physical address */ uint32_t data; /* data */ }; typedef struct buf_ioreq buf_ioreq_t; #define IOREQ_BUFFER_SLOT_NUM 511 /* 8 bytes each, plus 2 4-byte indexes */ struct buffered_iopage { unsigned int read_pointer; unsigned int write_pointer; buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM]; }; /* NB. Size of this structure must be no greater than one page. */ typedef struct buffered_iopage buffered_iopage_t; /* * ACPI Control/Event register locations. Location is controlled by a * version number in HVM_PARAM_ACPI_IOPORTS_LOCATION. */ /* Version 0 (default): Traditional Xen locations. 
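 *
 * Hedged illustration, not part of this header: a guest or firmware
 * component that knows which layout is in force (see
 * HVM_PARAM_ACPI_IOPORTS_LOCATION) could read the free-running 32-bit PM
 * timer with a single port read, e.g.
 *
 *   uint32_t tmr = inl(v1 ? ACPI_PM_TMR_BLK_ADDRESS_V1
 *                         : ACPI_PM_TMR_BLK_ADDRESS_V0);
 *
 * where inl() stands for whatever 32-bit port-input helper the environment
 * provides and 'v1' is a hypothetical flag meaning the version-1 layout is
 * selected.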
*/ #define ACPI_PM1A_EVT_BLK_ADDRESS_V0 0x1f40 #define ACPI_PM1A_CNT_BLK_ADDRESS_V0 (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 0x04) #define ACPI_PM_TMR_BLK_ADDRESS_V0 (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 0x08) #define ACPI_GPE0_BLK_ADDRESS_V0 (ACPI_PM_TMR_BLK_ADDRESS_V0 + 0x20) #define ACPI_GPE0_BLK_LEN_V0 0x08 /* Version 1: Locations preferred by modern Qemu. */ #define ACPI_PM1A_EVT_BLK_ADDRESS_V1 0xb000 #define ACPI_PM1A_CNT_BLK_ADDRESS_V1 (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 0x04) #define ACPI_PM_TMR_BLK_ADDRESS_V1 (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 0x08) #define ACPI_GPE0_BLK_ADDRESS_V1 0xafe0 #define ACPI_GPE0_BLK_LEN_V1 0x04 /* Compatibility definitions for the default location (version 0). */ #define ACPI_PM1A_EVT_BLK_ADDRESS ACPI_PM1A_EVT_BLK_ADDRESS_V0 #define ACPI_PM1A_CNT_BLK_ADDRESS ACPI_PM1A_CNT_BLK_ADDRESS_V0 #define ACPI_PM_TMR_BLK_ADDRESS ACPI_PM_TMR_BLK_ADDRESS_V0 #define ACPI_GPE0_BLK_ADDRESS ACPI_GPE0_BLK_ADDRESS_V0 #define ACPI_GPE0_BLK_LEN ACPI_GPE0_BLK_LEN_V0 #endif /* _IOREQ_H_ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/arch-x86/0000775000175000017500000000000012307313555015476 5ustar smbsmbxen-4.4.0/xen/include/public/arch-x86/hvm/0000775000175000017500000000000012307313555016270 5ustar smbsmbxen-4.4.0/xen/include/public/arch-x86/hvm/save.h0000664000175000017500000003413712307313555017407 0ustar smbsmb/* * Structure definitions for HVM state that is held by Xen and must * be saved along with the domain's memory and device-model state. * * Copyright (c) 2007 XenSource Ltd. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_PUBLIC_HVM_SAVE_X86_H__ #define __XEN_PUBLIC_HVM_SAVE_X86_H__ /* * Save/restore header: general info about the save file. 
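 *
 * Illustrative check only: code parsing a save image would normally verify
 * this header before trusting anything else ('hdr' is a hypothetical
 * pointer to the header record's payload):
 *
 *   if ( hdr->magic != HVM_FILE_MAGIC || hdr->version != HVM_FILE_VERSION )
 *       return -1;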
*/ #define HVM_FILE_MAGIC 0x54381286 #define HVM_FILE_VERSION 0x00000001 struct hvm_save_header { uint32_t magic; /* Must be HVM_FILE_MAGIC */ uint32_t version; /* File format version */ uint64_t changeset; /* Version of Xen that saved this file */ uint32_t cpuid; /* CPUID[0x01][%eax] on the saving machine */ uint32_t gtsc_khz; /* Guest's TSC frequency in kHz */ }; DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header); /* * Processor * * Compat: Pre-3.4 didn't have msr_tsc_aux */ struct hvm_hw_cpu { uint8_t fpu_regs[512]; uint64_t rax; uint64_t rbx; uint64_t rcx; uint64_t rdx; uint64_t rbp; uint64_t rsi; uint64_t rdi; uint64_t rsp; uint64_t r8; uint64_t r9; uint64_t r10; uint64_t r11; uint64_t r12; uint64_t r13; uint64_t r14; uint64_t r15; uint64_t rip; uint64_t rflags; uint64_t cr0; uint64_t cr2; uint64_t cr3; uint64_t cr4; uint64_t dr0; uint64_t dr1; uint64_t dr2; uint64_t dr3; uint64_t dr6; uint64_t dr7; uint32_t cs_sel; uint32_t ds_sel; uint32_t es_sel; uint32_t fs_sel; uint32_t gs_sel; uint32_t ss_sel; uint32_t tr_sel; uint32_t ldtr_sel; uint32_t cs_limit; uint32_t ds_limit; uint32_t es_limit; uint32_t fs_limit; uint32_t gs_limit; uint32_t ss_limit; uint32_t tr_limit; uint32_t ldtr_limit; uint32_t idtr_limit; uint32_t gdtr_limit; uint64_t cs_base; uint64_t ds_base; uint64_t es_base; uint64_t fs_base; uint64_t gs_base; uint64_t ss_base; uint64_t tr_base; uint64_t ldtr_base; uint64_t idtr_base; uint64_t gdtr_base; uint32_t cs_arbytes; uint32_t ds_arbytes; uint32_t es_arbytes; uint32_t fs_arbytes; uint32_t gs_arbytes; uint32_t ss_arbytes; uint32_t tr_arbytes; uint32_t ldtr_arbytes; uint64_t sysenter_cs; uint64_t sysenter_esp; uint64_t sysenter_eip; /* msr for em64t */ uint64_t shadow_gs; /* msr content saved/restored. */ uint64_t msr_flags; uint64_t msr_lstar; uint64_t msr_star; uint64_t msr_cstar; uint64_t msr_syscall_mask; uint64_t msr_efer; uint64_t msr_tsc_aux; /* guest's idea of what rdtsc() would return */ uint64_t tsc; /* pending event, if any */ union { uint32_t pending_event; struct { uint8_t pending_vector:8; uint8_t pending_type:3; uint8_t pending_error_valid:1; uint32_t pending_reserved:19; uint8_t pending_valid:1; }; }; /* error code for pending event */ uint32_t error_code; }; struct hvm_hw_cpu_compat { uint8_t fpu_regs[512]; uint64_t rax; uint64_t rbx; uint64_t rcx; uint64_t rdx; uint64_t rbp; uint64_t rsi; uint64_t rdi; uint64_t rsp; uint64_t r8; uint64_t r9; uint64_t r10; uint64_t r11; uint64_t r12; uint64_t r13; uint64_t r14; uint64_t r15; uint64_t rip; uint64_t rflags; uint64_t cr0; uint64_t cr2; uint64_t cr3; uint64_t cr4; uint64_t dr0; uint64_t dr1; uint64_t dr2; uint64_t dr3; uint64_t dr6; uint64_t dr7; uint32_t cs_sel; uint32_t ds_sel; uint32_t es_sel; uint32_t fs_sel; uint32_t gs_sel; uint32_t ss_sel; uint32_t tr_sel; uint32_t ldtr_sel; uint32_t cs_limit; uint32_t ds_limit; uint32_t es_limit; uint32_t fs_limit; uint32_t gs_limit; uint32_t ss_limit; uint32_t tr_limit; uint32_t ldtr_limit; uint32_t idtr_limit; uint32_t gdtr_limit; uint64_t cs_base; uint64_t ds_base; uint64_t es_base; uint64_t fs_base; uint64_t gs_base; uint64_t ss_base; uint64_t tr_base; uint64_t ldtr_base; uint64_t idtr_base; uint64_t gdtr_base; uint32_t cs_arbytes; uint32_t ds_arbytes; uint32_t es_arbytes; uint32_t fs_arbytes; uint32_t gs_arbytes; uint32_t ss_arbytes; uint32_t tr_arbytes; uint32_t ldtr_arbytes; uint64_t sysenter_cs; uint64_t sysenter_esp; uint64_t sysenter_eip; /* msr for em64t */ uint64_t shadow_gs; /* msr content saved/restored. 
*/ uint64_t msr_flags; uint64_t msr_lstar; uint64_t msr_star; uint64_t msr_cstar; uint64_t msr_syscall_mask; uint64_t msr_efer; /*uint64_t msr_tsc_aux; COMPAT */ /* guest's idea of what rdtsc() would return */ uint64_t tsc; /* pending event, if any */ union { uint32_t pending_event; struct { uint8_t pending_vector:8; uint8_t pending_type:3; uint8_t pending_error_valid:1; uint32_t pending_reserved:19; uint8_t pending_valid:1; }; }; /* error code for pending event */ uint32_t error_code; }; static inline int _hvm_hw_fix_cpu(void *h) { union hvm_hw_cpu_union { struct hvm_hw_cpu nat; struct hvm_hw_cpu_compat cmp; } *ucpu = (union hvm_hw_cpu_union *)h; /* If we copy from the end backwards, we should * be able to do the modification in-place */ ucpu->nat.error_code = ucpu->cmp.error_code; ucpu->nat.pending_event = ucpu->cmp.pending_event; ucpu->nat.tsc = ucpu->cmp.tsc; ucpu->nat.msr_tsc_aux = 0; return 0; } DECLARE_HVM_SAVE_TYPE_COMPAT(CPU, 2, struct hvm_hw_cpu, \ struct hvm_hw_cpu_compat, _hvm_hw_fix_cpu); /* * PIC */ struct hvm_hw_vpic { /* IR line bitmasks. */ uint8_t irr; uint8_t imr; uint8_t isr; /* Line IRx maps to IRQ irq_base+x */ uint8_t irq_base; /* * Where are we in ICW2-4 initialisation (0 means no init in progress)? * Bits 0-1 (=x): Next write at A=1 sets ICW(x+1). * Bit 2: ICW1.IC4 (1 == ICW4 included in init sequence) * Bit 3: ICW1.SNGL (0 == ICW3 included in init sequence) */ uint8_t init_state:4; /* IR line with highest priority. */ uint8_t priority_add:4; /* Reads from A=0 obtain ISR or IRR? */ uint8_t readsel_isr:1; /* Reads perform a polling read? */ uint8_t poll:1; /* Automatically clear IRQs from the ISR during INTA? */ uint8_t auto_eoi:1; /* Automatically rotate IRQ priorities during AEOI? */ uint8_t rotate_on_auto_eoi:1; /* Exclude slave inputs when considering in-service IRQs? */ uint8_t special_fully_nested_mode:1; /* Special mask mode excludes masked IRs from AEOI and priority checks. */ uint8_t special_mask_mode:1; /* Is this a master PIC or slave PIC? (NB. This is not programmable.) */ uint8_t is_master:1; /* Edge/trigger selection. */ uint8_t elcr; /* Virtual INT output. */ uint8_t int_output; }; DECLARE_HVM_SAVE_TYPE(PIC, 3, struct hvm_hw_vpic); /* * IO-APIC */ #define VIOAPIC_NUM_PINS 48 /* 16 ISA IRQs, 32 non-legacy PCI IRQS. */ struct hvm_hw_vioapic { uint64_t base_address; uint32_t ioregsel; uint32_t id; union vioapic_redir_entry { uint64_t bits; struct { uint8_t vector; uint8_t delivery_mode:3; uint8_t dest_mode:1; uint8_t delivery_status:1; uint8_t polarity:1; uint8_t remote_irr:1; uint8_t trig_mode:1; uint8_t mask:1; uint8_t reserve:7; uint8_t reserved[4]; uint8_t dest_id; } fields; } redirtbl[VIOAPIC_NUM_PINS]; }; DECLARE_HVM_SAVE_TYPE(IOAPIC, 4, struct hvm_hw_vioapic); /* * LAPIC */ struct hvm_hw_lapic { uint64_t apic_base_msr; uint32_t disabled; /* VLAPIC_xx_DISABLED */ uint32_t timer_divisor; uint64_t tdt_msr; }; DECLARE_HVM_SAVE_TYPE(LAPIC, 5, struct hvm_hw_lapic); struct hvm_hw_lapic_regs { uint8_t data[1024]; }; DECLARE_HVM_SAVE_TYPE(LAPIC_REGS, 6, struct hvm_hw_lapic_regs); /* * IRQs */ struct hvm_hw_pci_irqs { /* * Virtual interrupt wires for a single PCI bus. * Indexed by: device*4 + INTx#. */ union { unsigned long i[16 / sizeof (unsigned long)]; /* DECLARE_BITMAP(i, 32*4); */ uint64_t pad[2]; }; }; DECLARE_HVM_SAVE_TYPE(PCI_IRQ, 7, struct hvm_hw_pci_irqs); struct hvm_hw_isa_irqs { /* * Virtual interrupt wires for ISA devices. * Indexed by ISA IRQ (assumes no ISA-device IRQ sharing). 
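 * (Illustration: the wire state for ISA IRQ n is bit n of this bitmap, so a
 * consumer of the save record could test it with ((i[0] >> n) & 1) for
 * n < 16.)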
*/ union { unsigned long i[1]; /* DECLARE_BITMAP(i, 16); */ uint64_t pad[1]; }; }; DECLARE_HVM_SAVE_TYPE(ISA_IRQ, 8, struct hvm_hw_isa_irqs); struct hvm_hw_pci_link { /* * PCI-ISA interrupt router. * Each PCI is 'wire-ORed' into one of four links using * the traditional 'barber's pole' mapping ((device + INTx#) & 3). * The router provides a programmable mapping from each link to a GSI. */ uint8_t route[4]; uint8_t pad0[4]; }; DECLARE_HVM_SAVE_TYPE(PCI_LINK, 9, struct hvm_hw_pci_link); /* * PIT */ struct hvm_hw_pit { struct hvm_hw_pit_channel { uint32_t count; /* can be 65536 */ uint16_t latched_count; uint8_t count_latched; uint8_t status_latched; uint8_t status; uint8_t read_state; uint8_t write_state; uint8_t write_latch; uint8_t rw_mode; uint8_t mode; uint8_t bcd; /* not supported */ uint8_t gate; /* timer start */ } channels[3]; /* 3 x 16 bytes */ uint32_t speaker_data_on; uint32_t pad0; }; DECLARE_HVM_SAVE_TYPE(PIT, 10, struct hvm_hw_pit); /* * RTC */ #define RTC_CMOS_SIZE 14 struct hvm_hw_rtc { /* CMOS bytes */ uint8_t cmos_data[RTC_CMOS_SIZE]; /* Index register for 2-part operations */ uint8_t cmos_index; uint8_t pad0; }; DECLARE_HVM_SAVE_TYPE(RTC, 11, struct hvm_hw_rtc); /* * HPET */ #define HPET_TIMER_NUM 3 /* 3 timers supported now */ struct hvm_hw_hpet { /* Memory-mapped, software visible registers */ uint64_t capability; /* capabilities */ uint64_t res0; /* reserved */ uint64_t config; /* configuration */ uint64_t res1; /* reserved */ uint64_t isr; /* interrupt status reg */ uint64_t res2[25]; /* reserved */ uint64_t mc64; /* main counter */ uint64_t res3; /* reserved */ struct { /* timers */ uint64_t config; /* configuration/cap */ uint64_t cmp; /* comparator */ uint64_t fsb; /* FSB route, not supported now */ uint64_t res4; /* reserved */ } timers[HPET_TIMER_NUM]; uint64_t res5[4*(24-HPET_TIMER_NUM)]; /* reserved, up to 0x3ff */ /* Hidden register state */ uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */ }; DECLARE_HVM_SAVE_TYPE(HPET, 12, struct hvm_hw_hpet); /* * PM timer */ struct hvm_hw_pmtimer { uint32_t tmr_val; /* PM_TMR_BLK.TMR_VAL: 32bit free-running counter */ uint16_t pm1a_sts; /* PM1a_EVT_BLK.PM1a_STS: status register */ uint16_t pm1a_en; /* PM1a_EVT_BLK.PM1a_EN: enable register */ }; DECLARE_HVM_SAVE_TYPE(PMTIMER, 13, struct hvm_hw_pmtimer); /* * MTRR MSRs */ struct hvm_hw_mtrr { #define MTRR_VCNT 8 #define NUM_FIXED_MSR 11 uint64_t msr_pat_cr; /* mtrr physbase & physmask msr pair*/ uint64_t msr_mtrr_var[MTRR_VCNT*2]; uint64_t msr_mtrr_fixed[NUM_FIXED_MSR]; uint64_t msr_mtrr_cap; uint64_t msr_mtrr_def_type; }; DECLARE_HVM_SAVE_TYPE(MTRR, 14, struct hvm_hw_mtrr); /* * The save area of XSAVE/XRSTOR. */ struct hvm_hw_cpu_xsave { uint64_t xfeature_mask; uint64_t xcr0; /* Updated by XSETBV */ uint64_t xcr0_accum; /* Updated by XSETBV */ struct { struct { char x[512]; } fpu_sse; struct { uint64_t xstate_bv; /* Updated by XRSTOR */ uint64_t reserved[7]; } xsave_hdr; /* The 64-byte header */ struct { char x[0]; } ymm; /* YMM */ } save_area; }; #define CPU_XSAVE_CODE 16 /* * Viridian hypervisor context. 
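 * (Descriptive note, hedged: these fields are understood to shadow the two
 * Viridian enlightenment MSRs a guest writes, the guest OS identity MSR and
 * the hypercall page MSR, so that their values survive save/restore.)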
*/ struct hvm_viridian_domain_context { uint64_t hypercall_gpa; uint64_t guest_os_id; }; DECLARE_HVM_SAVE_TYPE(VIRIDIAN_DOMAIN, 15, struct hvm_viridian_domain_context); struct hvm_viridian_vcpu_context { uint64_t apic_assist; }; DECLARE_HVM_SAVE_TYPE(VIRIDIAN_VCPU, 17, struct hvm_viridian_vcpu_context); struct hvm_vmce_vcpu { uint64_t caps; uint64_t mci_ctl2_bank0; uint64_t mci_ctl2_bank1; }; DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu); struct hvm_tsc_adjust { uint64_t tsc_adjust; }; DECLARE_HVM_SAVE_TYPE(TSC_ADJUST, 19, struct hvm_tsc_adjust); /* * Largest type-code in use */ #define HVM_SAVE_CODE_MAX 19 #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */ xen-4.4.0/xen/include/public/arch-x86/cpuid.h0000664000175000017500000000506012307313555016754 0ustar smbsmb/****************************************************************************** * arch-x86/cpuid.h * * CPUID interface to Xen. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2007 Citrix Systems, Inc. * * Authors: * Keir Fraser */ #ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__ #define __XEN_PUBLIC_ARCH_X86_CPUID_H__ /* Xen identification leaves start at 0x40000000. */ #define XEN_CPUID_FIRST_LEAF 0x40000000 #define XEN_CPUID_LEAF(i) (XEN_CPUID_FIRST_LEAF + (i)) /* * Leaf 1 (0x40000000) * EAX: Largest Xen-information leaf. All leaves up to an including @EAX * are supported by the Xen host. * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification * of a Xen host. */ #define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */ #define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */ #define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */ /* * Leaf 2 (0x40000001) * EAX[31:16]: Xen major version. * EAX[15: 0]: Xen minor version. * EBX-EDX: Reserved (currently all zeroes). */ /* * Leaf 3 (0x40000002) * EAX: Number of hypercall transfer pages. This register is always guaranteed * to specify one hypercall page. * EBX: Base address of Xen-specific MSRs. * ECX: Features 1. Unused bits are set to zero. * EDX: Features 2. Unused bits are set to zero. */ /* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */ #define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0 #define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0) #endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */ xen-4.4.0/xen/include/public/arch-x86/xen.h0000664000175000017500000002335412307313555016450 0ustar smbsmb/****************************************************************************** * arch-x86/xen.h * * Guest OS interface to x86 Xen. 
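 *
 * (Hedged illustration relating to the CPUID signature leaves defined in
 * cpuid.h above, not part of this header: assuming a cpuid() helper that
 * fills eax/ebx/ecx/edx for a given leaf, Xen detection looks roughly like
 *
 *   cpuid(XEN_CPUID_FIRST_LEAF, &eax, &ebx, &ecx, &edx);
 *   running_on_xen = ebx == XEN_CPUID_SIGNATURE_EBX &&
 *                    ecx == XEN_CPUID_SIGNATURE_ECX &&
 *                    edx == XEN_CPUID_SIGNATURE_EDX;
 *
 * In practice detection code usually scans bases 0x40000000..0x40010000 in
 * steps of 0x100, since the Xen leaves may be offset on some HVM guests.)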
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2004-2006, K A Fraser */ #include "../xen.h" #ifndef __XEN_PUBLIC_ARCH_X86_XEN_H__ #define __XEN_PUBLIC_ARCH_X86_XEN_H__ /* Structural guest handles introduced in 0x00030201. */ #if __XEN_INTERFACE_VERSION__ >= 0x00030201 #define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #else #define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef type * __guest_handle_ ## name #endif /* * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field * in a struct in memory. * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an * hypercall argument. * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but * they might not be on other architectures. */ #define __DEFINE_XEN_GUEST_HANDLE(name, type) \ ___DEFINE_XEN_GUEST_HANDLE(name, type); \ ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type) #define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) #define __XEN_GUEST_HANDLE(name) __guest_handle_ ## name #define XEN_GUEST_HANDLE(name) __XEN_GUEST_HANDLE(name) #define XEN_GUEST_HANDLE_PARAM(name) XEN_GUEST_HANDLE(name) #define set_xen_guest_handle_raw(hnd, val) do { (hnd).p = val; } while (0) #ifdef __XEN_TOOLS__ #define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) #endif #define set_xen_guest_handle(hnd, val) set_xen_guest_handle_raw(hnd, val) #if defined(__i386__) #include "xen-x86_32.h" #elif defined(__x86_64__) #include "xen-x86_64.h" #endif #ifndef __ASSEMBLY__ typedef unsigned long xen_pfn_t; #define PRI_xen_pfn "lx" #endif #define XEN_HAVE_PV_GUEST_ENTRY 1 #define XEN_HAVE_PV_UPCALL_MASK 1 /* * `incontents 200 segdesc Segment Descriptor Tables */ /* * ` enum neg_errnoval * ` HYPERVISOR_set_gdt(const xen_pfn_t frames[], unsigned int entries); * ` */ /* * A number of GDT entries are reserved by Xen. These are not situated at the * start of the GDT because some stupid OSes export hard-coded selector values * in their ABI. These hard-coded values are always near the start of the GDT, * so Xen places itself out of the way, at the far end of the GDT. 
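 * (Worked out from the definitions below: the reserved area starts at byte
 * FIRST_RESERVED_GDT_PAGE * 4096 = 14 * 4096 = 57344, i.e. at GDT entry
 * 57344 / 8 = 7168.)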
* * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op */ #define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) /* * ` enum neg_errnoval * ` HYPERVISOR_update_descriptor(u64 pa, u64 desc); * ` * ` @pa The machine physical address of the descriptor to * ` update. Must be either a descriptor page or writable. * ` @desc The descriptor value to update, in the same format as a * ` native descriptor table entry. */ /* Maximum number of virtual CPUs in legacy multi-processor guests. */ #define XEN_LEGACY_MAX_VCPUS 32 #ifndef __ASSEMBLY__ typedef unsigned long xen_ulong_t; #define PRI_xen_ulong "lx" /* * ` enum neg_errnoval * ` HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp); * ` * Sets the stack segment and pointer for the current vcpu. */ /* * ` enum neg_errnoval * ` HYPERVISOR_set_trap_table(const struct trap_info traps[]); * ` */ /* * Send an array of these to HYPERVISOR_set_trap_table(). * Terminate the array with a sentinel entry, with traps[].address==0. * The privilege level specifies which modes may enter a trap via a software * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: * Level == 0: Noone may enter * Level == 1: Kernel may enter * Level == 2: Kernel may enter * Level == 3: Everyone may enter */ #define TI_GET_DPL(_ti) ((_ti)->flags & 3) #define TI_GET_IF(_ti) ((_ti)->flags & 4) #define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl)) #define TI_SET_IF(_ti,_if) ((_ti)->flags |= ((!!(_if))<<2)) struct trap_info { uint8_t vector; /* exception vector */ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ uint16_t cs; /* code selector */ unsigned long address; /* code offset */ }; typedef struct trap_info trap_info_t; DEFINE_XEN_GUEST_HANDLE(trap_info_t); typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. * * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise * for HVM and PVH guests, not all information in this structure is updated: * * - For HVM guests, the structures read include: fpu_ctxt (if * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] * * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to * set cr3. All other fields not used should be set to 0. */ struct vcpu_guest_context { /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ #define VGCF_I387_VALID (1<<0) #define VGCF_IN_KERNEL (1<<2) #define _VGCF_i387_valid 0 #define VGCF_i387_valid (1<<_VGCF_i387_valid) #define _VGCF_in_kernel 2 #define VGCF_in_kernel (1<<_VGCF_in_kernel) #define _VGCF_failsafe_disables_events 3 #define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events) #define _VGCF_syscall_disables_events 4 #define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events) #define _VGCF_online 5 #define VGCF_online (1<<_VGCF_online) unsigned long flags; /* VGCF_* flags */ struct cpu_user_regs user_regs; /* User-level CPU registers */ struct trap_info trap_ctxt[256]; /* Virtual IDT */ unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */ unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */ unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */ /* NB. 
User pagetable on x86/64 is placed in ctrlreg[1]. */ unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */ unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */ #ifdef __i386__ unsigned long event_callback_cs; /* CS:EIP of event callback */ unsigned long event_callback_eip; unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */ unsigned long failsafe_callback_eip; #else unsigned long event_callback_eip; unsigned long failsafe_callback_eip; #ifdef __XEN__ union { unsigned long syscall_callback_eip; struct { unsigned int event_callback_cs; /* compat CS of event cb */ unsigned int failsafe_callback_cs; /* compat CS of failsafe cb */ }; }; #else unsigned long syscall_callback_eip; #endif #endif unsigned long vm_assist; /* VMASST_TYPE_* bitmap */ #ifdef __x86_64__ /* Segment base addresses. */ uint64_t fs_base; uint64_t gs_base_kernel; uint64_t gs_base_user; #endif }; typedef struct vcpu_guest_context vcpu_guest_context_t; DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t); struct arch_shared_info { unsigned long max_pfn; /* max pfn that appears in table */ /* Frame containing list of mfns containing list of mfns containing p2m. */ xen_pfn_t pfn_to_mfn_frame_list_list; unsigned long nmi_reason; uint64_t pad[32]; }; typedef struct arch_shared_info arch_shared_info_t; #endif /* !__ASSEMBLY__ */ /* * ` enum neg_errnoval * ` HYPERVISOR_fpu_taskswitch(int set); * ` * Sets (if set!=0) or clears (if set==0) CR0.TS. */ /* * ` enum neg_errnoval * ` HYPERVISOR_set_debugreg(int regno, unsigned long value); * * ` unsigned long * ` HYPERVISOR_get_debugreg(int regno); * For 0<=reg<=7, returns the debug register value. * For other values of reg, returns ((unsigned long)-EINVAL). * (Unfortunately, this interface is defective.) */ /* * Prefix forces emulation of some non-trapping instructions. * Currently only CPUID. */ #ifdef __ASSEMBLY__ #define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ; #define XEN_CPUID XEN_EMULATE_PREFIX cpuid #else #define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; " #define XEN_CPUID XEN_EMULATE_PREFIX "cpuid" #endif #endif /* __XEN_PUBLIC_ARCH_X86_XEN_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/arch-x86/xen-x86_32.h0000664000175000017500000001421012307313555017366 0ustar smbsmb/****************************************************************************** * xen-x86_32.h * * Guest OS interface to x86 32-bit Xen. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2004-2007, K A Fraser */ #ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ #define __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ /* * Hypercall interface: * Input: %ebx, %ecx, %edx, %esi, %edi, %ebp (arguments 1-6) * Output: %eax * Access is via hypercall page (set up by guest loader or via a Xen MSR): * call hypercall_page + hypercall-number * 32 * Clobbered: Argument registers (e.g., 2-arg hypercall clobbers %ebx,%ecx) */ /* * These flat segments are in the Xen-private section of every GDT. Since these * are also present in the initial GDT, many OSes will be able to avoid * installing their own GDT. */ #define FLAT_RING1_CS 0xe019 /* GDT index 259 */ #define FLAT_RING1_DS 0xe021 /* GDT index 260 */ #define FLAT_RING1_SS 0xe021 /* GDT index 260 */ #define FLAT_RING3_CS 0xe02b /* GDT index 261 */ #define FLAT_RING3_DS 0xe033 /* GDT index 262 */ #define FLAT_RING3_SS 0xe033 /* GDT index 262 */ #define FLAT_KERNEL_CS FLAT_RING1_CS #define FLAT_KERNEL_DS FLAT_RING1_DS #define FLAT_KERNEL_SS FLAT_RING1_SS #define FLAT_USER_CS FLAT_RING3_CS #define FLAT_USER_DS FLAT_RING3_DS #define FLAT_USER_SS FLAT_RING3_SS #define __HYPERVISOR_VIRT_START_PAE 0xF5800000 #define __MACH2PHYS_VIRT_START_PAE 0xF5800000 #define __MACH2PHYS_VIRT_END_PAE 0xF6800000 #define HYPERVISOR_VIRT_START_PAE \ mk_unsigned_long(__HYPERVISOR_VIRT_START_PAE) #define MACH2PHYS_VIRT_START_PAE \ mk_unsigned_long(__MACH2PHYS_VIRT_START_PAE) #define MACH2PHYS_VIRT_END_PAE \ mk_unsigned_long(__MACH2PHYS_VIRT_END_PAE) /* Non-PAE bounds are obsolete. */ #define __HYPERVISOR_VIRT_START_NONPAE 0xFC000000 #define __MACH2PHYS_VIRT_START_NONPAE 0xFC000000 #define __MACH2PHYS_VIRT_END_NONPAE 0xFC400000 #define HYPERVISOR_VIRT_START_NONPAE \ mk_unsigned_long(__HYPERVISOR_VIRT_START_NONPAE) #define MACH2PHYS_VIRT_START_NONPAE \ mk_unsigned_long(__MACH2PHYS_VIRT_START_NONPAE) #define MACH2PHYS_VIRT_END_NONPAE \ mk_unsigned_long(__MACH2PHYS_VIRT_END_NONPAE) #define __HYPERVISOR_VIRT_START __HYPERVISOR_VIRT_START_PAE #define __MACH2PHYS_VIRT_START __MACH2PHYS_VIRT_START_PAE #define __MACH2PHYS_VIRT_END __MACH2PHYS_VIRT_END_PAE #ifndef HYPERVISOR_VIRT_START #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) #endif #define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) #define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) #define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>2) #ifndef machine_to_phys_mapping #define machine_to_phys_mapping ((unsigned long *)MACH2PHYS_VIRT_START) #endif /* 32-/64-bit invariability for control interfaces (domctl/sysctl). 
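 *
 * Hedged illustration, not part of the interface: the 64-bit handle layout
 * defined below lets 32-bit tool code and a 64-bit hypervisor agree on
 * structure sizes. Tool code would typically just use the accessor, e.g.
 *
 *   XEN_GUEST_HANDLE_64(uint8) h;
 *   set_xen_guest_handle_raw(h, my_buffer);
 *
 * where 'my_buffer' is a hypothetical uint8_t pointer; on 32-bit builds the
 * macro zeroes the full 64-bit slot before storing the pointer.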
*/ #if defined(__XEN__) || defined(__XEN_TOOLS__) #undef ___DEFINE_XEN_GUEST_HANDLE #define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } \ __guest_handle_ ## name; \ typedef struct { union { type *p; uint64_aligned_t q; }; } \ __guest_handle_64_ ## name #undef set_xen_guest_handle_raw #define set_xen_guest_handle_raw(hnd, val) \ do { if ( sizeof(hnd) == 8 ) *(uint64_t *)&(hnd) = 0; \ (hnd).p = val; \ } while ( 0 ) #define uint64_aligned_t uint64_t __attribute__((aligned(8))) #define __XEN_GUEST_HANDLE_64(name) __guest_handle_64_ ## name #define XEN_GUEST_HANDLE_64(name) __XEN_GUEST_HANDLE_64(name) #endif #ifndef __ASSEMBLY__ struct cpu_user_regs { uint32_t ebx; uint32_t ecx; uint32_t edx; uint32_t esi; uint32_t edi; uint32_t ebp; uint32_t eax; uint16_t error_code; /* private */ uint16_t entry_vector; /* private */ uint32_t eip; uint16_t cs; uint8_t saved_upcall_mask; uint8_t _pad0; uint32_t eflags; /* eflags.IF == !saved_upcall_mask */ uint32_t esp; uint16_t ss, _pad1; uint16_t es, _pad2; uint16_t ds, _pad3; uint16_t fs, _pad4; uint16_t gs, _pad5; }; typedef struct cpu_user_regs cpu_user_regs_t; DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t); /* * Page-directory addresses above 4GB do not fit into architectural %cr3. * When accessing %cr3, or equivalent field in vcpu_guest_context, guests * must use the following accessor macros to pack/unpack valid MFNs. */ #define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) #define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) struct arch_vcpu_info { unsigned long cr2; unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */ }; typedef struct arch_vcpu_info arch_vcpu_info_t; struct xen_callback { unsigned long cs; unsigned long eip; }; typedef struct xen_callback xen_callback_t; #endif /* !__ASSEMBLY__ */ #endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/arch-x86/xen-mca.h0000664000175000017500000003513312307313555017204 0ustar smbsmb/****************************************************************************** * arch-x86/mca.h * * Contributed by Advanced Micro Devices, Inc. * Author: Christoph Egger * * Guest OS machine check interface to x86 Xen. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ /* Full MCA functionality has the following Usecases from the guest side: * * Must have's: * 1. 
Dom0 and DomU register machine check trap callback handlers * (already done via "set_trap_table" hypercall) * 2. Dom0 registers machine check event callback handler * (doable via EVTCHNOP_bind_virq) * 3. Dom0 and DomU fetch machine check data * 4. Dom0 wants Xen to notify a DomU * 5. Dom0 gets DomU ID from physical address * 6. Dom0 wants Xen to kill DomU (already done for "xm destroy") * * Nice to have's: * 7. Dom0 wants Xen to deactivate a physical CPU * This is better done as separate task, physical CPU hotplugging, * and hypercall(s) should be sysctl's * 8. Page migration proposed from Xen NUMA work, where Dom0 can tell Xen to * move a DomU (or Dom0 itself) away from a malicious page * producing correctable errors. * 9. offlining physical page: * Xen frees and never re-uses a certain physical page. * 10. Test facility: Allow Dom0 to write values into machine check MSR's * and tell Xen to trigger a machine check */ #ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__ #define __XEN_PUBLIC_ARCH_X86_MCA_H__ /* Hypercall */ #define __HYPERVISOR_mca __HYPERVISOR_arch_0 /* * The xen-unstable repo has interface version 0x03000001; our interface * is incompatible with that and any future minor revisions, so we * choose a different version number range that is numerically less * than that used in xen-unstable. */ #define XEN_MCA_INTERFACE_VERSION 0x01ecc003 /* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */ #define XEN_MC_NONURGENT 0x0001 /* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */ #define XEN_MC_URGENT 0x0002 /* IN: Dom0 acknowledges previously-fetched telemetry */ #define XEN_MC_ACK 0x0004 /* OUT: All is ok */ #define XEN_MC_OK 0x0 /* OUT: Domain could not fetch data. */ #define XEN_MC_FETCHFAILED 0x1 /* OUT: There was no machine check data to fetch. */ #define XEN_MC_NODATA 0x2 /* OUT: Between notification time and this hypercall another * (most likely) correctable error happened. The fetched data * does not match the original machine check data. */ #define XEN_MC_NOMATCH 0x4 /* OUT: DomU did not register MC NMI handler. Try something else. */ #define XEN_MC_CANNOTHANDLE 0x8 /* OUT: Notifying DomU failed. Retry later or try something else. */ #define XEN_MC_NOTDELIVERED 0x10 /* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */ #ifndef __ASSEMBLY__ #define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */ /* * Machine Check Architecture: * structs are read-only and used to report all kinds of * correctable and uncorrectable errors detected by the HW. * Dom0 and DomU: register a handler to get notified.
* Dom0 only: Correctable errors are reported via VIRQ_MCA * Dom0 and DomU: Uncorrectable errors are reported via nmi handlers */ #define MC_TYPE_GLOBAL 0 #define MC_TYPE_BANK 1 #define MC_TYPE_EXTENDED 2 #define MC_TYPE_RECOVERY 3 struct mcinfo_common { uint16_t type; /* structure type */ uint16_t size; /* size of this struct in bytes */ }; #define MC_FLAG_CORRECTABLE (1 << 0) #define MC_FLAG_UNCORRECTABLE (1 << 1) #define MC_FLAG_RECOVERABLE (1 << 2) #define MC_FLAG_POLLED (1 << 3) #define MC_FLAG_RESET (1 << 4) #define MC_FLAG_CMCI (1 << 5) #define MC_FLAG_MCE (1 << 6) /* contains global x86 mc information */ struct mcinfo_global { struct mcinfo_common common; /* running domain at the time in error (most likely the impacted one) */ uint16_t mc_domid; uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ uint32_t mc_socketid; /* physical socket of the physical core */ uint16_t mc_coreid; /* physical impacted core */ uint16_t mc_core_threadid; /* core thread of physical core */ uint32_t mc_apicid; uint32_t mc_flags; uint64_t mc_gstatus; /* global status */ }; /* contains bank local x86 mc information */ struct mcinfo_bank { struct mcinfo_common common; uint16_t mc_bank; /* bank nr */ uint16_t mc_domid; /* Usecase 5: domain referenced by mc_addr on dom0 * and if mc_addr is valid. Never valid on DomU. */ uint64_t mc_status; /* bank status */ uint64_t mc_addr; /* bank address, only valid * if addr bit is set in mc_status */ uint64_t mc_misc; uint64_t mc_ctrl2; uint64_t mc_tsc; }; struct mcinfo_msr { uint64_t reg; /* MSR */ uint64_t value; /* MSR value */ }; /* contains mc information from other * or additional mc MSRs */ struct mcinfo_extended { struct mcinfo_common common; /* You can fill up to five registers. * If you need more, then use this structure * multiple times. */ uint32_t mc_msrs; /* Number of msr with valid values. */ /* * Currently Intel extended MSR (32/64) include all gp registers * and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be * useful at present. So expand this array to 16/32 to leave room. */ struct mcinfo_msr mc_msr[sizeof(void *) * 4]; }; /* Recovery Action flags. Giving recovery result information to DOM0 */ /* Xen takes successful recovery action, the error is recovered */ #define REC_ACTION_RECOVERED (0x1 << 0) /* No action is performed by XEN */ #define REC_ACTION_NONE (0x1 << 1) /* It's possible DOM0 might take action ownership in some case */ #define REC_ACTION_NEED_RESET (0x1 << 2) /* Different Recovery Action types, if the action is performed successfully, * REC_ACTION_RECOVERED flag will be returned. */ /* Page Offline Action */ #define MC_ACTION_PAGE_OFFLINE (0x1 << 0) /* CPU offline Action */ #define MC_ACTION_CPU_OFFLINE (0x1 << 1) /* L3 cache disable Action */ #define MC_ACTION_CACHE_SHRINK (0x1 << 2) /* Below interface used between XEN/DOM0 for passing XEN's recovery action * information to DOM0. * usage Senario: After offlining broken page, XEN might pass its page offline * recovery action result to DOM0. DOM0 will save the information in * non-volatile memory for further proactive actions, such as offlining the * easy broken page earlier when doing next reboot. 
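 *
 * Hedged sketch of how DOM0 might consume such a record ('rec' is a
 * hypothetical pointer to a struct mcinfo_recovery found in the fetched
 * telemetry, and remember_bad_page() stands for whatever persistent store
 * DOM0 uses):
 *
 *   if ( (rec->action_types & MC_ACTION_PAGE_OFFLINE) &&
 *        (rec->action_flags & REC_ACTION_RECOVERED) )
 *       remember_bad_page(rec->action_info.page_retire.mfn);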
*/ struct page_offline_action { /* Params for passing the offlined page number to DOM0 */ uint64_t mfn; uint64_t status; }; struct cpu_offline_action { /* Params for passing the identity of the offlined CPU to DOM0 */ uint32_t mc_socketid; uint16_t mc_coreid; uint16_t mc_core_threadid; }; #define MAX_UNION_SIZE 16 struct mcinfo_recovery { struct mcinfo_common common; uint16_t mc_bank; /* bank nr */ uint8_t action_flags; uint8_t action_types; union { struct page_offline_action page_retire; struct cpu_offline_action cpu_offline; uint8_t pad[MAX_UNION_SIZE]; } action_info; }; #define MCINFO_HYPERCALLSIZE 1024 #define MCINFO_MAXSIZE 768 #define MCINFO_FLAGS_UNCOMPLETE 0x1 struct mc_info { /* Number of mcinfo_* entries in mi_data */ uint32_t mi_nentries; uint32_t flags; uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8]; }; typedef struct mc_info mc_info_t; DEFINE_XEN_GUEST_HANDLE(mc_info_t); #define __MC_MSR_ARRAYSIZE 8 #define __MC_NMSRS 1 #define MC_NCAPS 7 /* 7 CPU feature flag words */ #define MC_CAPS_STD_EDX 0 /* cpuid level 0x00000001 (%edx) */ #define MC_CAPS_AMD_EDX 1 /* cpuid level 0x80000001 (%edx) */ #define MC_CAPS_TM 2 /* cpuid level 0x80860001 (TransMeta) */ #define MC_CAPS_LINUX 3 /* Linux-defined */ #define MC_CAPS_STD_ECX 4 /* cpuid level 0x00000001 (%ecx) */ #define MC_CAPS_VIA 5 /* cpuid level 0xc0000001 */ #define MC_CAPS_AMD_ECX 6 /* cpuid level 0x80000001 (%ecx) */ struct mcinfo_logical_cpu { uint32_t mc_cpunr; uint32_t mc_chipid; uint16_t mc_coreid; uint16_t mc_threadid; uint32_t mc_apicid; uint32_t mc_clusterid; uint32_t mc_ncores; uint32_t mc_ncores_active; uint32_t mc_nthreads; int32_t mc_cpuid_level; uint32_t mc_family; uint32_t mc_vendor; uint32_t mc_model; uint32_t mc_step; char mc_vendorid[16]; char mc_brandid[64]; uint32_t mc_cpu_caps[MC_NCAPS]; uint32_t mc_cache_size; uint32_t mc_cache_alignment; int32_t mc_nmsrvals; struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE]; }; typedef struct mcinfo_logical_cpu xen_mc_logical_cpu_t; DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_cpu_t); /* * OS's should use these instead of writing their own lookup function * each with its own bugs and drawbacks. * We use macros instead of static inline functions to allow guests * to include this header in assembly files (*.S). */ /* Prototype: * uint32_t x86_mcinfo_nentries(struct mc_info *mi); */ #define x86_mcinfo_nentries(_mi) \ (_mi)->mi_nentries /* Prototype: * struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi); */ #define x86_mcinfo_first(_mi) \ ((struct mcinfo_common *)(_mi)->mi_data) /* Prototype: * struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic); */ #define x86_mcinfo_next(_mic) \ ((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)) /* Prototype: * void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type); */ #define x86_mcinfo_lookup(_ret, _mi, _type) \ do { \ uint32_t found, i; \ struct mcinfo_common *_mic; \ \ found = 0; \ (_ret) = NULL; \ if (_mi == NULL) break; \ _mic = x86_mcinfo_first(_mi); \ for (i = 0; i < x86_mcinfo_nentries(_mi); i++) { \ if (_mic->type == (_type)) { \ found = 1; \ break; \ } \ _mic = x86_mcinfo_next(_mic); \ } \ (_ret) = found ? _mic : NULL; \ } while (0) /* Usecase 1 * Register machine check trap callback handler * (already done via "set_trap_table" hypercall) */ /* Usecase 2 * Dom0 registers machine check event callback handler * done by EVTCHNOP_bind_virq */ /* Usecase 3 * Fetch machine check data from hypervisor. * Note, this hypercall is special, because both Dom0 and DomU must use this. 
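 *
 * Rough usage sketch, illustrative only: it assumes the guest provides a
 * HYPERVISOR_mca() hypercall wrapper and that 'mi' points to a suitably
 * sized mc_info_t buffer:
 *
 *   struct xen_mc mc = { .cmd = XEN_MC_fetch,
 *                        .interface_version = XEN_MCA_INTERFACE_VERSION };
 *   mc.u.mc_fetch.flags = XEN_MC_NONURGENT;
 *   set_xen_guest_handle(mc.u.mc_fetch.data, mi);
 *   if ( HYPERVISOR_mca(&mc) == 0 &&
 *        !(mc.u.mc_fetch.flags & (XEN_MC_NODATA | XEN_MC_FETCHFAILED)) )
 *       walk the records with x86_mcinfo_first() and x86_mcinfo_next()
 *
 * The trailing line is pseudocode; real callers would also ack the fetch
 * afterwards using XEN_MC_ACK together with the returned fetch_id.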
*/ #define XEN_MC_fetch 1 struct xen_mc_fetch { /* IN/OUT variables. */ uint32_t flags; /* IN: XEN_MC_NONURGENT, XEN_MC_URGENT, XEN_MC_ACK if ack'ing an earlier fetch */ /* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */ uint32_t _pad0; uint64_t fetch_id; /* OUT: id for ack, IN: id we are ack'ing */ /* OUT variables. */ XEN_GUEST_HANDLE(mc_info_t) data; }; typedef struct xen_mc_fetch xen_mc_fetch_t; DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t); /* Usecase 4 * This tells the hypervisor to notify a DomU about the machine check error */ #define XEN_MC_notifydomain 2 struct xen_mc_notifydomain { /* IN variables. */ uint16_t mc_domid; /* The unprivileged domain to notify. */ uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify. * Usually echo'd value from the fetch hypercall. */ /* IN/OUT variables. */ uint32_t flags; /* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */ /* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */ }; typedef struct xen_mc_notifydomain xen_mc_notifydomain_t; DEFINE_XEN_GUEST_HANDLE(xen_mc_notifydomain_t); #define XEN_MC_physcpuinfo 3 struct xen_mc_physcpuinfo { /* IN/OUT */ uint32_t ncpus; uint32_t _pad0; /* OUT */ XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info; }; #define XEN_MC_msrinject 4 #define MC_MSRINJ_MAXMSRS 8 struct xen_mc_msrinject { /* IN */ uint32_t mcinj_cpunr; /* target processor id */ uint32_t mcinj_flags; /* see MC_MSRINJ_F_* below */ uint32_t mcinj_count; /* 0 .. count-1 in array are valid */ uint32_t _pad0; struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS]; }; /* Flags for mcinj_flags above; bits 16-31 are reserved */ #define MC_MSRINJ_F_INTERPOSE 0x1 #define XEN_MC_mceinject 5 struct xen_mc_mceinject { unsigned int mceinj_cpunr; /* target processor id */ }; #if defined(__XEN__) || defined(__XEN_TOOLS__) #define XEN_MC_inject_v2 6 #define XEN_MC_INJECT_TYPE_MASK 0x7 #define XEN_MC_INJECT_TYPE_MCE 0x0 #define XEN_MC_INJECT_TYPE_CMCI 0x1 #define XEN_MC_INJECT_CPU_BROADCAST 0x8 struct xen_mc_inject_v2 { uint32_t flags; struct xenctl_bitmap cpumap; }; #endif struct xen_mc { uint32_t cmd; uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */ union { struct xen_mc_fetch mc_fetch; struct xen_mc_notifydomain mc_notifydomain; struct xen_mc_physcpuinfo mc_physcpuinfo; struct xen_mc_msrinject mc_msrinject; struct xen_mc_mceinject mc_mceinject; #if defined(__XEN__) || defined(__XEN_TOOLS__) struct xen_mc_inject_v2 mc_inject_v2; #endif } u; }; typedef struct xen_mc xen_mc_t; DEFINE_XEN_GUEST_HANDLE(xen_mc_t); #endif /* __ASSEMBLY__ */ #endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */ xen-4.4.0/xen/include/public/arch-x86/xen-x86_64.h0000664000175000017500000001531412307313555017401 0ustar smbsmb/****************************************************************************** * xen-x86_64.h * * Guest OS interface to x86 64-bit Xen. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2004-2006, K A Fraser */ #ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ #define __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ /* * Hypercall interface: * Input: %rdi, %rsi, %rdx, %r10, %r8, %r9 (arguments 1-6) * Output: %rax * Access is via hypercall page (set up by guest loader or via a Xen MSR): * call hypercall_page + hypercall-number * 32 * Clobbered: argument registers (e.g., 2-arg hypercall clobbers %rdi,%rsi) */ /* * 64-bit segment selectors * These flat segments are in the Xen-private section of every GDT. Since these * are also present in the initial GDT, many OSes will be able to avoid * installing their own GDT. */ #define FLAT_RING3_CS32 0xe023 /* GDT index 260 */ #define FLAT_RING3_CS64 0xe033 /* GDT index 261 */ #define FLAT_RING3_DS32 0xe02b /* GDT index 262 */ #define FLAT_RING3_DS64 0x0000 /* NULL selector */ #define FLAT_RING3_SS32 0xe02b /* GDT index 262 */ #define FLAT_RING3_SS64 0xe02b /* GDT index 262 */ #define FLAT_KERNEL_DS64 FLAT_RING3_DS64 #define FLAT_KERNEL_DS32 FLAT_RING3_DS32 #define FLAT_KERNEL_DS FLAT_KERNEL_DS64 #define FLAT_KERNEL_CS64 FLAT_RING3_CS64 #define FLAT_KERNEL_CS32 FLAT_RING3_CS32 #define FLAT_KERNEL_CS FLAT_KERNEL_CS64 #define FLAT_KERNEL_SS64 FLAT_RING3_SS64 #define FLAT_KERNEL_SS32 FLAT_RING3_SS32 #define FLAT_KERNEL_SS FLAT_KERNEL_SS64 #define FLAT_USER_DS64 FLAT_RING3_DS64 #define FLAT_USER_DS32 FLAT_RING3_DS32 #define FLAT_USER_DS FLAT_USER_DS64 #define FLAT_USER_CS64 FLAT_RING3_CS64 #define FLAT_USER_CS32 FLAT_RING3_CS32 #define FLAT_USER_CS FLAT_USER_CS64 #define FLAT_USER_SS64 FLAT_RING3_SS64 #define FLAT_USER_SS32 FLAT_RING3_SS32 #define FLAT_USER_SS FLAT_USER_SS64 #define __HYPERVISOR_VIRT_START 0xFFFF800000000000 #define __HYPERVISOR_VIRT_END 0xFFFF880000000000 #define __MACH2PHYS_VIRT_START 0xFFFF800000000000 #define __MACH2PHYS_VIRT_END 0xFFFF804000000000 #ifndef HYPERVISOR_VIRT_START #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) #define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END) #endif #define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) #define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) #define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3) #ifndef machine_to_phys_mapping #define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) #endif /* * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) * @which == SEGBASE_* ; @base == 64-bit base address * Returns 0 on success. */ #define SEGBASE_FS 0 #define SEGBASE_GS_USER 1 #define SEGBASE_GS_KERNEL 2 #define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */ /* * int HYPERVISOR_iret(void) * All arguments are on the kernel stack, in the following format. * Never returns if successful. Current kernel context is lost. * The saved CS is mapped as follows: * RING0 -> RING3 kernel mode. * RING1 -> RING3 kernel mode. * RING2 -> RING3 kernel mode. * RING3 -> RING3 user mode. 
* However RING0 indicates that the guest kernel should return to iteself * directly with * orb $3,1*8(%rsp) * iretq * If flags contains VGCF_in_syscall: * Restore RAX, RIP, RFLAGS, RSP. * Discard R11, RCX, CS, SS. * Otherwise: * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP. * All other registers are saved on hypercall entry and restored to user. */ /* Guest exited in SYSCALL context? Return to guest with SYSRET? */ #define _VGCF_in_syscall 8 #define VGCF_in_syscall (1<<_VGCF_in_syscall) #define VGCF_IN_SYSCALL VGCF_in_syscall #ifndef __ASSEMBLY__ struct iret_context { /* Top of stack (%rsp at point of hypercall). */ uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; /* Bottom of iret stack frame. */ }; #if defined(__GNUC__) && !defined(__STRICT_ANSI__) /* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */ #define __DECL_REG(name) union { \ uint64_t r ## name, e ## name; \ uint32_t _e ## name; \ } #else /* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */ #define __DECL_REG(name) uint64_t r ## name #endif struct cpu_user_regs { uint64_t r15; uint64_t r14; uint64_t r13; uint64_t r12; __DECL_REG(bp); __DECL_REG(bx); uint64_t r11; uint64_t r10; uint64_t r9; uint64_t r8; __DECL_REG(ax); __DECL_REG(cx); __DECL_REG(dx); __DECL_REG(si); __DECL_REG(di); uint32_t error_code; /* private */ uint32_t entry_vector; /* private */ __DECL_REG(ip); uint16_t cs, _pad0[1]; uint8_t saved_upcall_mask; uint8_t _pad1[3]; __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */ __DECL_REG(sp); uint16_t ss, _pad2[3]; uint16_t es, _pad3[3]; uint16_t ds, _pad4[3]; uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */ uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */ }; typedef struct cpu_user_regs cpu_user_regs_t; DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t); #undef __DECL_REG #define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12) #define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12) struct arch_vcpu_info { unsigned long cr2; unsigned long pad; /* sizeof(vcpu_info_t) == 64 */ }; typedef struct arch_vcpu_info arch_vcpu_info_t; typedef unsigned long xen_callback_t; #endif /* !__ASSEMBLY__ */ #endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/kexec.h0000664000175000017500000002146212307313555015413 0ustar smbsmb/****************************************************************************** * kexec.h - Public portion * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Xen port written by: * - Simon 'Horms' Horman * - Magnus Damm */ #ifndef _XEN_PUBLIC_KEXEC_H #define _XEN_PUBLIC_KEXEC_H /* This file describes the Kexec / Kdump hypercall interface for Xen. * * Kexec under vanilla Linux allows a user to reboot the physical machine * into a new user-specified kernel. The Xen port extends this idea * to allow rebooting of the machine from dom0. When kexec for dom0 * is used to reboot, both the hypervisor and the domains get replaced * with some other kernel. It is possible to kexec between vanilla * Linux and Xen and back again. Xen to Xen works well too. * * The hypercall interface for kexec can be divided into three main * types of hypercall operations: * * 1) Range information: * This is used by the dom0 kernel to ask the hypervisor about various * address information. This information is needed to allow kexec-tools * to fill in the ELF headers for /proc/vmcore properly. * * 2) Load and unload of images: * There are no big surprises here, the kexec binary from kexec-tools * runs in userspace in dom0. The tool loads/unloads data into the * dom0 kernel such as new kernel, initramfs and hypervisor. When * loaded the dom0 kernel performs a load hypercall operation, and * before releasing all page references the dom0 kernel calls unload. * * 3) Kexec operation: * This is used to start a previously loaded kernel. */ #include "xen.h" #if defined(__i386__) || defined(__x86_64__) #define KEXEC_XEN_NO_PAGES 17 #endif /* * Prototype for this hypercall is: * int kexec_op(int cmd, void *args) * @cmd == KEXEC_CMD_... * KEXEC operation to perform * @args == Operation-specific extra arguments (NULL if none). */ /* * Kexec supports two types of operation: * - kexec into a regular kernel, very similar to a standard reboot * - KEXEC_TYPE_DEFAULT is used to specify this type * - kexec into a special "crash kernel", aka kexec-on-panic * - KEXEC_TYPE_CRASH is used to specify this type * - parts of our system may be broken at kexec-on-panic time * - the code should be kept as simple and self-contained as possible */ #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 /* The kexec implementation for Xen allows the user to load two * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH. * All data needed for a kexec reboot is kept in one xen_kexec_image_t * per "instance". The data mainly consists of machine address lists to pages * together with destination addresses. The data in xen_kexec_image_t * is passed to the "code page" which is one page of code that performs * the final relocations before jumping to the new kernel. */ typedef struct xen_kexec_image { #if defined(__i386__) || defined(__x86_64__) unsigned long page_list[KEXEC_XEN_NO_PAGES]; #endif unsigned long indirection_page; unsigned long start_address; } xen_kexec_image_t; /* * Perform kexec having previously loaded a kexec or kdump kernel * as appropriate. * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in] * * Control is transferred to the image entry point with the host in * the following state. * * - The image may be executed on any PCPU and all other PCPUs are * stopped. * * - Local interrupts are disabled. * * - Register values are undefined. * * - The image segments have writeable 1:1 virtual to machine * mappings. 
The location of any page tables is undefined and these * page table frames are not be mapped. */ #define KEXEC_CMD_kexec 0 typedef struct xen_kexec_exec { int type; } xen_kexec_exec_t; /* * Load/Unload kernel image for kexec or kdump. * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in] * image == relocation information for kexec (ignored for unload) [in] */ #define KEXEC_CMD_kexec_load_v1 1 /* obsolete since 0x00040400 */ #define KEXEC_CMD_kexec_unload_v1 2 /* obsolete since 0x00040400 */ typedef struct xen_kexec_load_v1 { int type; xen_kexec_image_t image; } xen_kexec_load_v1_t; #define KEXEC_RANGE_MA_CRASH 0 /* machine address and size of crash area */ #define KEXEC_RANGE_MA_XEN 1 /* machine address and size of Xen itself */ #define KEXEC_RANGE_MA_CPU 2 /* machine address and size of a CPU note */ #define KEXEC_RANGE_MA_XENHEAP 3 /* machine address and size of xenheap * Note that although this is adjacent * to Xen it exists in a separate EFI * region on ia64, and thus needs to be * inserted into iomem_machine separately */ #define KEXEC_RANGE_MA_BOOT_PARAM 4 /* Obsolete: machine address and size of * the ia64_boot_param */ #define KEXEC_RANGE_MA_EFI_MEMMAP 5 /* machine address and size of * of the EFI Memory Map */ #define KEXEC_RANGE_MA_VMCOREINFO 6 /* machine address and size of vmcoreinfo */ /* * Find the address and size of certain memory areas * range == KEXEC_RANGE_... [in] * nr == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in] * size == number of bytes reserved in window [out] * start == address of the first byte in the window [out] */ #define KEXEC_CMD_kexec_get_range 3 typedef struct xen_kexec_range { int range; int nr; unsigned long size; unsigned long start; } xen_kexec_range_t; #if __XEN_INTERFACE_VERSION__ >= 0x00040400 /* * A contiguous chunk of a kexec image and it's destination machine * address. */ typedef struct xen_kexec_segment { union { XEN_GUEST_HANDLE(const_void) h; uint64_t _pad; } buf; uint64_t buf_size; uint64_t dest_maddr; uint64_t dest_size; } xen_kexec_segment_t; DEFINE_XEN_GUEST_HANDLE(xen_kexec_segment_t); /* * Load a kexec image into memory. * * For KEXEC_TYPE_DEFAULT images, the segments may be anywhere in RAM. * The image is relocated prior to being executed. * * For KEXEC_TYPE_CRASH images, each segment of the image must reside * in the memory region reserved for kexec (KEXEC_RANGE_MA_CRASH) and * the entry point must be within the image. The caller is responsible * for ensuring that multiple images do not overlap. * * All image segments will be loaded to their destination machine * addresses prior to being executed. The trailing portion of any * segments with a source buffer (from dest_maddr + buf_size to * dest_maddr + dest_size) will be zeroed. * * Segments with no source buffer will be accessible to the image when * it is executed. */ #define KEXEC_CMD_kexec_load 4 typedef struct xen_kexec_load { uint8_t type; /* One of KEXEC_TYPE_* */ uint8_t _pad; uint16_t arch; /* ELF machine type (EM_*). */ uint32_t nr_segments; union { XEN_GUEST_HANDLE(xen_kexec_segment_t) h; uint64_t _pad; } segments; uint64_t entry_maddr; /* image entry point machine address. */ } xen_kexec_load_t; DEFINE_XEN_GUEST_HANDLE(xen_kexec_load_t); /* * Unload a kexec image. * * Type must be one of KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH. 
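 *
 * Illustrative sketch (not part of the interface): with an interface version
 * of 0x00040400 or newer, a dom0 kernel could drop a previously loaded crash
 * image roughly as below. HYPERVISOR_kexec_op() is assumed to be the guest
 * OS's wrapper for the kexec_op hypercall described at the top of this file;
 * error handling is omitted.
 *
 *     xen_kexec_unload_t unload;
 *
 *     memset(&unload, 0, sizeof(unload));
 *     unload.type = KEXEC_TYPE_CRASH;
 *     HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &unload);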
*/ #define KEXEC_CMD_kexec_unload 5 typedef struct xen_kexec_unload { uint8_t type; } xen_kexec_unload_t; DEFINE_XEN_GUEST_HANDLE(xen_kexec_unload_t); #else /* __XEN_INTERFACE_VERSION__ < 0x00040400 */ #define KEXEC_CMD_kexec_load KEXEC_CMD_kexec_load_v1 #define KEXEC_CMD_kexec_unload KEXEC_CMD_kexec_unload_v1 #define xen_kexec_load xen_kexec_load_v1 #define xen_kexec_load_t xen_kexec_load_v1_t #endif #endif /* _XEN_PUBLIC_KEXEC_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/features.h0000664000175000017500000000750412307313555016133 0ustar smbsmb/****************************************************************************** * features.h * * Feature flags, reported by XENVER_get_features. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2006, Keir Fraser */ #ifndef __XEN_PUBLIC_FEATURES_H__ #define __XEN_PUBLIC_FEATURES_H__ /* * `incontents 200 elfnotes_features XEN_ELFNOTE_FEATURES * * The list of all the features the guest supports. They are set by * parsing the XEN_ELFNOTE_FEATURES and XEN_ELFNOTE_SUPPORTED_FEATURES * string. The format is the feature names (as given here without the * "XENFEAT_" prefix) separated by '|' characters. * If a feature is required for the kernel to function then the feature name * must be preceded by a '!' character. * * Note that if XEN_ELFNOTE_SUPPORTED_FEATURES is used, then in the * XENFEAT_dom0 MUST be set if the guest is to be booted as dom0, */ /* * If set, the guest does not need to write-protect its pagetables, and can * update them via direct writes. */ #define XENFEAT_writable_page_tables 0 /* * If set, the guest does not need to write-protect its segment descriptor * tables, and can update them via direct writes. */ #define XENFEAT_writable_descriptor_tables 1 /* * If set, translation between the guest's 'pseudo-physical' address space * and the host's machine address space are handled by the hypervisor. In this * mode the guest does not need to perform phys-to/from-machine translations * when performing page table operations. */ #define XENFEAT_auto_translated_physmap 2 /* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */ #define XENFEAT_supervisor_mode_kernel 3 /* * If set, the guest does not need to allocate x86 PAE page directories * below 4GB. This flag is usually implied by auto_translated_physmap. 
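 *
 * Illustrative sketch (not part of this header): a guest can test this, or any
 * other XENFEAT_* flag, by querying the feature bitmap with XENVER_get_features
 * from version.h. HYPERVISOR_xen_version() is assumed to be the guest OS's
 * hypercall wrapper and xen_feature_info the structure declared in version.h.
 *
 *     struct xen_feature_info fi;
 *
 *     fi.submap_idx = XENFEAT_pae_pgdir_above_4gb / 32;
 *     if (HYPERVISOR_xen_version(XENVER_get_features, &fi) == 0 &&
 *         (fi.submap & (1U << (XENFEAT_pae_pgdir_above_4gb % 32))))
 *         ...the feature is advertised by this hypervisor...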
*/ #define XENFEAT_pae_pgdir_above_4gb 4 /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */ #define XENFEAT_mmu_pt_update_preserve_ad 5 /* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */ #define XENFEAT_highmem_assist 6 /* * If set, GNTTABOP_map_grant_ref honors flags to be placed into guest kernel * available pte bits. */ #define XENFEAT_gnttab_map_avail_bits 7 /* x86: Does this Xen host support the HVM callback vector type? */ #define XENFEAT_hvm_callback_vector 8 /* x86: pvclock algorithm is safe to use on HVM */ #define XENFEAT_hvm_safe_pvclock 9 /* x86: pirq can be used by HVM guests */ #define XENFEAT_hvm_pirqs 10 /* operation as Dom0 is supported */ #define XENFEAT_dom0 11 #define XENFEAT_NR_SUBMAPS 1 #endif /* __XEN_PUBLIC_FEATURES_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/platform.h0000664000175000017500000004603212307313555016140 0ustar smbsmb/****************************************************************************** * platform.h * * Hardware platform operations. Intended for use by domain-0 kernel. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2002-2006, K Fraser */ #ifndef __XEN_PUBLIC_PLATFORM_H__ #define __XEN_PUBLIC_PLATFORM_H__ #include "xen.h" #define XENPF_INTERFACE_VERSION 0x03000001 /* * Set clock such that it would read after 00:00:00 UTC, * 1 January, 1970 if the current system time was . */ #define XENPF_settime 17 struct xenpf_settime { /* IN variables. */ uint32_t secs; uint32_t nsecs; uint64_t system_time; }; typedef struct xenpf_settime xenpf_settime_t; DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t); /* * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type. * On x86, @type is an architecture-defined MTRR memory type. * On success, returns the MTRR that was used (@reg) and a handle that can * be passed to XENPF_DEL_MEMTYPE to accurately tear down the new setting. * (x86-specific). */ #define XENPF_add_memtype 31 struct xenpf_add_memtype { /* IN variables. */ xen_pfn_t mfn; uint64_t nr_mfns; uint32_t type; /* OUT variables. */ uint32_t handle; uint32_t reg; }; typedef struct xenpf_add_memtype xenpf_add_memtype_t; DEFINE_XEN_GUEST_HANDLE(xenpf_add_memtype_t); /* * Tear down an existing memory-range type. If @handle is remembered then it * should be passed in to accurately tear down the correct setting (in case * of overlapping memory regions with differing types). 
If it is not known * then @handle should be set to zero. In all cases @reg must be set. * (x86-specific). */ #define XENPF_del_memtype 32 struct xenpf_del_memtype { /* IN variables. */ uint32_t handle; uint32_t reg; }; typedef struct xenpf_del_memtype xenpf_del_memtype_t; DEFINE_XEN_GUEST_HANDLE(xenpf_del_memtype_t); /* Read current type of an MTRR (x86-specific). */ #define XENPF_read_memtype 33 struct xenpf_read_memtype { /* IN variables. */ uint32_t reg; /* OUT variables. */ xen_pfn_t mfn; uint64_t nr_mfns; uint32_t type; }; typedef struct xenpf_read_memtype xenpf_read_memtype_t; DEFINE_XEN_GUEST_HANDLE(xenpf_read_memtype_t); #define XENPF_microcode_update 35 struct xenpf_microcode_update { /* IN variables. */ XEN_GUEST_HANDLE(const_void) data;/* Pointer to microcode data */ uint32_t length; /* Length of microcode data. */ }; typedef struct xenpf_microcode_update xenpf_microcode_update_t; DEFINE_XEN_GUEST_HANDLE(xenpf_microcode_update_t); #define XENPF_platform_quirk 39 #define QUIRK_NOIRQBALANCING 1 /* Do not restrict IO-APIC RTE targets */ #define QUIRK_IOAPIC_BAD_REGSEL 2 /* IO-APIC REGSEL forgets its value */ #define QUIRK_IOAPIC_GOOD_REGSEL 3 /* IO-APIC REGSEL behaves properly */ struct xenpf_platform_quirk { /* IN variables. */ uint32_t quirk_id; }; typedef struct xenpf_platform_quirk xenpf_platform_quirk_t; DEFINE_XEN_GUEST_HANDLE(xenpf_platform_quirk_t); #define XENPF_efi_runtime_call 49 #define XEN_EFI_get_time 1 #define XEN_EFI_set_time 2 #define XEN_EFI_get_wakeup_time 3 #define XEN_EFI_set_wakeup_time 4 #define XEN_EFI_get_next_high_monotonic_count 5 #define XEN_EFI_get_variable 6 #define XEN_EFI_set_variable 7 #define XEN_EFI_get_next_variable_name 8 #define XEN_EFI_query_variable_info 9 #define XEN_EFI_query_capsule_capabilities 10 #define XEN_EFI_update_capsule 11 struct xenpf_efi_runtime_call { uint32_t function; /* * This field is generally used for per sub-function flags (defined * below), except for the XEN_EFI_get_next_high_monotonic_count case, * where it holds the single returned value. 
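 *
 * Illustrative sketch (not part of the interface): a typical invocation of one
 * of these sub-functions from dom0, here XEN_EFI_get_time, might look roughly
 * as below. HYPERVISOR_platform_op() is assumed to be the guest OS's wrapper
 * for the platform hypercall and use_time() a hypothetical consumer; a zero
 * status is taken to mean EFI_SUCCESS and error handling is omitted.
 *
 *     struct xen_platform_op op;
 *
 *     memset(&op, 0, sizeof(op));
 *     op.cmd = XENPF_efi_runtime_call;
 *     op.interface_version = XENPF_INTERFACE_VERSION;
 *     op.u.efi_runtime_call.function = XEN_EFI_get_time;
 *     if (HYPERVISOR_platform_op(&op) == 0 &&
 *         op.u.efi_runtime_call.status == 0)
 *         use_time(&op.u.efi_runtime_call.u.get_time.time);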
*/ uint32_t misc; unsigned long status; union { #define XEN_EFI_GET_TIME_SET_CLEARS_NS 0x00000001 struct { struct xenpf_efi_time { uint16_t year; uint8_t month; uint8_t day; uint8_t hour; uint8_t min; uint8_t sec; uint32_t ns; int16_t tz; uint8_t daylight; } time; uint32_t resolution; uint32_t accuracy; } get_time; struct xenpf_efi_time set_time; #define XEN_EFI_GET_WAKEUP_TIME_ENABLED 0x00000001 #define XEN_EFI_GET_WAKEUP_TIME_PENDING 0x00000002 struct xenpf_efi_time get_wakeup_time; #define XEN_EFI_SET_WAKEUP_TIME_ENABLE 0x00000001 #define XEN_EFI_SET_WAKEUP_TIME_ENABLE_ONLY 0x00000002 struct xenpf_efi_time set_wakeup_time; #define XEN_EFI_VARIABLE_NON_VOLATILE 0x00000001 #define XEN_EFI_VARIABLE_BOOTSERVICE_ACCESS 0x00000002 #define XEN_EFI_VARIABLE_RUNTIME_ACCESS 0x00000004 struct { XEN_GUEST_HANDLE(void) name; /* UCS-2/UTF-16 string */ unsigned long size; XEN_GUEST_HANDLE(void) data; struct xenpf_efi_guid { uint32_t data1; uint16_t data2; uint16_t data3; uint8_t data4[8]; } vendor_guid; } get_variable, set_variable; struct { unsigned long size; XEN_GUEST_HANDLE(void) name; /* UCS-2/UTF-16 string */ struct xenpf_efi_guid vendor_guid; } get_next_variable_name; #define XEN_EFI_VARINFO_BOOT_SNAPSHOT 0x00000001 struct { uint32_t attr; uint64_t max_store_size; uint64_t remain_store_size; uint64_t max_size; } query_variable_info; struct { XEN_GUEST_HANDLE(void) capsule_header_array; unsigned long capsule_count; uint64_t max_capsule_size; unsigned int reset_type; } query_capsule_capabilities; struct { XEN_GUEST_HANDLE(void) capsule_header_array; unsigned long capsule_count; uint64_t sg_list; /* machine address */ } update_capsule; } u; }; typedef struct xenpf_efi_runtime_call xenpf_efi_runtime_call_t; DEFINE_XEN_GUEST_HANDLE(xenpf_efi_runtime_call_t); #define XENPF_firmware_info 50 #define XEN_FW_DISK_INFO 1 /* from int 13 AH=08/41/48 */ #define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */ #define XEN_FW_VBEDDC_INFO 3 /* from int 10 AX=4f15 */ #define XEN_FW_EFI_INFO 4 /* from EFI */ #define XEN_FW_EFI_VERSION 0 #define XEN_FW_EFI_CONFIG_TABLE 1 #define XEN_FW_EFI_VENDOR 2 #define XEN_FW_EFI_MEM_INFO 3 #define XEN_FW_EFI_RT_VERSION 4 #define XEN_FW_EFI_PCI_ROM 5 #define XEN_FW_KBD_SHIFT_FLAGS 5 struct xenpf_firmware_info { /* IN variables. */ uint32_t type; uint32_t index; /* OUT variables. */ union { struct { /* Int13, Fn48: Check Extensions Present. */ uint8_t device; /* %dl: bios device number */ uint8_t version; /* %ah: major version */ uint16_t interface_support; /* %cx: support bitmap */ /* Int13, Fn08: Legacy Get Device Parameters. */ uint16_t legacy_max_cylinder; /* %cl[7:6]:%ch: max cyl # */ uint8_t legacy_max_head; /* %dh: max head # */ uint8_t legacy_sectors_per_track; /* %cl[5:0]: max sector # */ /* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). */ /* NB. First uint16_t of buffer must be set to buffer size. */ XEN_GUEST_HANDLE(void) edd_params; } disk_info; /* XEN_FW_DISK_INFO */ struct { uint8_t device; /* bios device number */ uint32_t mbr_signature; /* offset 0x1b8 in mbr */ } disk_mbr_signature; /* XEN_FW_DISK_MBR_SIGNATURE */ struct { /* Int10, AX=4F15: Get EDID info. 
*/ uint8_t capabilities; uint8_t edid_transfer_time; /* must refer to 128-byte buffer */ XEN_GUEST_HANDLE(uint8) edid; } vbeddc_info; /* XEN_FW_VBEDDC_INFO */ union xenpf_efi_info { uint32_t version; struct { uint64_t addr; /* EFI_CONFIGURATION_TABLE */ uint32_t nent; } cfg; struct { uint32_t revision; uint32_t bufsz; /* input, in bytes */ XEN_GUEST_HANDLE(void) name; /* UCS-2/UTF-16 string */ } vendor; struct { uint64_t addr; uint64_t size; uint64_t attr; uint32_t type; } mem; struct { /* IN variables */ uint16_t segment; uint8_t bus; uint8_t devfn; uint16_t vendor; uint16_t devid; /* OUT variables */ uint64_t address; xen_ulong_t size; } pci_rom; } efi_info; /* XEN_FW_EFI_INFO */ /* Int16, Fn02: Get keyboard shift flags. */ uint8_t kbd_shift_flags; /* XEN_FW_KBD_SHIFT_FLAGS */ } u; }; typedef struct xenpf_firmware_info xenpf_firmware_info_t; DEFINE_XEN_GUEST_HANDLE(xenpf_firmware_info_t); #define XENPF_enter_acpi_sleep 51 struct xenpf_enter_acpi_sleep { /* IN variables */ #if __XEN_INTERFACE_VERSION__ < 0x00040300 uint16_t pm1a_cnt_val; /* PM1a control value. */ uint16_t pm1b_cnt_val; /* PM1b control value. */ #else uint16_t val_a; /* PM1a control / sleep type A. */ uint16_t val_b; /* PM1b control / sleep type B. */ #endif uint32_t sleep_state; /* Which state to enter (Sn). */ #define XENPF_ACPI_SLEEP_EXTENDED 0x00000001 uint32_t flags; /* XENPF_ACPI_SLEEP_*. */ }; typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t; DEFINE_XEN_GUEST_HANDLE(xenpf_enter_acpi_sleep_t); #define XENPF_change_freq 52 struct xenpf_change_freq { /* IN variables */ uint32_t flags; /* Must be zero. */ uint32_t cpu; /* Physical cpu. */ uint64_t freq; /* New frequency (Hz). */ }; typedef struct xenpf_change_freq xenpf_change_freq_t; DEFINE_XEN_GUEST_HANDLE(xenpf_change_freq_t); /* * Get idle times (nanoseconds since boot) for physical CPUs specified in the * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is * indexed by CPU number; only entries with the corresponding @cpumap_bitmap * bit set are written to. On return, @cpumap_bitmap is modified so that any * non-existent CPUs are cleared. Such CPUs have their @idletime array entry * cleared. */ #define XENPF_getidletime 53 struct xenpf_getidletime { /* IN/OUT variables */ /* IN: CPUs to interrogate; OUT: subset of IN which are present */ XEN_GUEST_HANDLE(uint8) cpumap_bitmap; /* IN variables */ /* Size of cpumap bitmap. */ uint32_t cpumap_nr_cpus; /* Must be indexable for every cpu in cpumap_bitmap. */ XEN_GUEST_HANDLE(uint64) idletime; /* OUT variables */ /* System time when the idletime snapshots were taken. 
*/ uint64_t now; }; typedef struct xenpf_getidletime xenpf_getidletime_t; DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t); #define XENPF_set_processor_pminfo 54 /* ability bits */ #define XEN_PROCESSOR_PM_CX 1 #define XEN_PROCESSOR_PM_PX 2 #define XEN_PROCESSOR_PM_TX 4 /* cmd type */ #define XEN_PM_CX 0 #define XEN_PM_PX 1 #define XEN_PM_TX 2 #define XEN_PM_PDC 3 /* Px sub info type */ #define XEN_PX_PCT 1 #define XEN_PX_PSS 2 #define XEN_PX_PPC 4 #define XEN_PX_PSD 8 struct xen_power_register { uint32_t space_id; uint32_t bit_width; uint32_t bit_offset; uint32_t access_size; uint64_t address; }; struct xen_processor_csd { uint32_t domain; /* domain number of one dependent group */ uint32_t coord_type; /* coordination type */ uint32_t num; /* number of processors in same domain */ }; typedef struct xen_processor_csd xen_processor_csd_t; DEFINE_XEN_GUEST_HANDLE(xen_processor_csd_t); struct xen_processor_cx { struct xen_power_register reg; /* GAS for Cx trigger register */ uint8_t type; /* cstate value, c0: 0, c1: 1, ... */ uint32_t latency; /* worst latency (ms) to enter/exit this cstate */ uint32_t power; /* average power consumption(mW) */ uint32_t dpcnt; /* number of dependency entries */ XEN_GUEST_HANDLE(xen_processor_csd_t) dp; /* NULL if no dependency */ }; typedef struct xen_processor_cx xen_processor_cx_t; DEFINE_XEN_GUEST_HANDLE(xen_processor_cx_t); struct xen_processor_flags { uint32_t bm_control:1; uint32_t bm_check:1; uint32_t has_cst:1; uint32_t power_setup_done:1; uint32_t bm_rld_set:1; }; struct xen_processor_power { uint32_t count; /* number of C state entries in array below */ struct xen_processor_flags flags; /* global flags of this processor */ XEN_GUEST_HANDLE(xen_processor_cx_t) states; /* supported c states */ }; struct xen_pct_register { uint8_t descriptor; uint16_t length; uint8_t space_id; uint8_t bit_width; uint8_t bit_offset; uint8_t reserved; uint64_t address; }; struct xen_processor_px { uint64_t core_frequency; /* megahertz */ uint64_t power; /* milliWatts */ uint64_t transition_latency; /* microseconds */ uint64_t bus_master_latency; /* microseconds */ uint64_t control; /* control value */ uint64_t status; /* success indicator */ }; typedef struct xen_processor_px xen_processor_px_t; DEFINE_XEN_GUEST_HANDLE(xen_processor_px_t); struct xen_psd_package { uint64_t num_entries; uint64_t revision; uint64_t domain; uint64_t coord_type; uint64_t num_processors; }; struct xen_processor_performance { uint32_t flags; /* flag for Px sub info type */ uint32_t platform_limit; /* Platform limitation on freq usage */ struct xen_pct_register control_register; struct xen_pct_register status_register; uint32_t state_count; /* total available performance states */ XEN_GUEST_HANDLE(xen_processor_px_t) states; struct xen_psd_package domain_info; uint32_t shared_type; /* coordination type of this processor */ }; typedef struct xen_processor_performance xen_processor_performance_t; DEFINE_XEN_GUEST_HANDLE(xen_processor_performance_t); struct xenpf_set_processor_pminfo { /* IN variables */ uint32_t id; /* ACPI CPU ID */ uint32_t type; /* {XEN_PM_CX, XEN_PM_PX} */ union { struct xen_processor_power power;/* Cx: _CST/_CSD */ struct xen_processor_performance perf; /* Px: _PPC/_PCT/_PSS/_PSD */ XEN_GUEST_HANDLE(uint32) pdc; /* _PDC */ } u; }; typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t; DEFINE_XEN_GUEST_HANDLE(xenpf_set_processor_pminfo_t); #define XENPF_get_cpuinfo 55 struct xenpf_pcpuinfo { /* IN */ uint32_t xen_cpuid; /* OUT */ /* The maxium cpu_id that is 
present */ uint32_t max_present; #define XEN_PCPU_FLAGS_ONLINE 1 /* Correponding xen_cpuid is not present*/ #define XEN_PCPU_FLAGS_INVALID 2 uint32_t flags; uint32_t apic_id; uint32_t acpi_id; }; typedef struct xenpf_pcpuinfo xenpf_pcpuinfo_t; DEFINE_XEN_GUEST_HANDLE(xenpf_pcpuinfo_t); #define XENPF_get_cpu_version 48 struct xenpf_pcpu_version { /* IN */ uint32_t xen_cpuid; /* OUT */ /* The maxium cpu_id that is present */ uint32_t max_present; char vendor_id[12]; uint32_t family; uint32_t model; uint32_t stepping; }; typedef struct xenpf_pcpu_version xenpf_pcpu_version_t; DEFINE_XEN_GUEST_HANDLE(xenpf_pcpu_version_t); #define XENPF_cpu_online 56 #define XENPF_cpu_offline 57 struct xenpf_cpu_ol { uint32_t cpuid; }; typedef struct xenpf_cpu_ol xenpf_cpu_ol_t; DEFINE_XEN_GUEST_HANDLE(xenpf_cpu_ol_t); #define XENPF_cpu_hotadd 58 struct xenpf_cpu_hotadd { uint32_t apic_id; uint32_t acpi_id; uint32_t pxm; }; #define XENPF_mem_hotadd 59 struct xenpf_mem_hotadd { uint64_t spfn; uint64_t epfn; uint32_t pxm; uint32_t flags; }; #define XENPF_core_parking 60 #define XEN_CORE_PARKING_SET 1 #define XEN_CORE_PARKING_GET 2 struct xenpf_core_parking { /* IN variables */ uint32_t type; /* IN variables: set cpu nums expected to be idled */ /* OUT variables: get cpu nums actually be idled */ uint32_t idle_nums; }; typedef struct xenpf_core_parking xenpf_core_parking_t; DEFINE_XEN_GUEST_HANDLE(xenpf_core_parking_t); /* * ` enum neg_errnoval * ` HYPERVISOR_platform_op(const struct xen_platform_op*); */ struct xen_platform_op { uint32_t cmd; uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ union { struct xenpf_settime settime; struct xenpf_add_memtype add_memtype; struct xenpf_del_memtype del_memtype; struct xenpf_read_memtype read_memtype; struct xenpf_microcode_update microcode; struct xenpf_platform_quirk platform_quirk; struct xenpf_efi_runtime_call efi_runtime_call; struct xenpf_firmware_info firmware_info; struct xenpf_enter_acpi_sleep enter_acpi_sleep; struct xenpf_change_freq change_freq; struct xenpf_getidletime getidletime; struct xenpf_set_processor_pminfo set_pminfo; struct xenpf_pcpuinfo pcpu_info; struct xenpf_pcpu_version pcpu_version; struct xenpf_cpu_ol cpu_ol; struct xenpf_cpu_hotadd cpu_add; struct xenpf_mem_hotadd mem_add; struct xenpf_core_parking core_parking; uint8_t pad[128]; } u; }; typedef struct xen_platform_op xen_platform_op_t; DEFINE_XEN_GUEST_HANDLE(xen_platform_op_t); #endif /* __XEN_PUBLIC_PLATFORM_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/trace.h0000664000175000017500000003272012307313555015411 0ustar smbsmb/****************************************************************************** * include/public/trace.h * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Mark Williamson, (C) 2004 Intel Research Cambridge * Copyright (C) 2005 Bin Ren */ #ifndef __XEN_PUBLIC_TRACE_H__ #define __XEN_PUBLIC_TRACE_H__ #define TRACE_EXTRA_MAX 7 #define TRACE_EXTRA_SHIFT 28 /* Trace classes */ #define TRC_CLS_SHIFT 16 #define TRC_GEN 0x0001f000 /* General trace */ #define TRC_SCHED 0x0002f000 /* Xen Scheduler trace */ #define TRC_DOM0OP 0x0004f000 /* Xen DOM0 operation trace */ #define TRC_HVM 0x0008f000 /* Xen HVM trace */ #define TRC_MEM 0x0010f000 /* Xen memory trace */ #define TRC_PV 0x0020f000 /* Xen PV traces */ #define TRC_SHADOW 0x0040f000 /* Xen shadow tracing */ #define TRC_HW 0x0080f000 /* Xen hardware-related traces */ #define TRC_GUEST 0x0800f000 /* Guest-generated traces */ #define TRC_ALL 0x0ffff000 #define TRC_HD_TO_EVENT(x) ((x)&0x0fffffff) #define TRC_HD_CYCLE_FLAG (1UL<<31) #define TRC_HD_INCLUDES_CYCLE_COUNT(x) ( !!( (x) & TRC_HD_CYCLE_FLAG ) ) #define TRC_HD_EXTRA(x) (((x)>>TRACE_EXTRA_SHIFT)&TRACE_EXTRA_MAX) /* Trace subclasses */ #define TRC_SUBCLS_SHIFT 12 /* trace subclasses for SVM */ #define TRC_HVM_ENTRYEXIT 0x00081000 /* VMENTRY and #VMEXIT */ #define TRC_HVM_HANDLER 0x00082000 /* various HVM handlers */ #define TRC_SCHED_MIN 0x00021000 /* Just runstate changes */ #define TRC_SCHED_CLASS 0x00022000 /* Scheduler-specific */ #define TRC_SCHED_VERBOSE 0x00028000 /* More inclusive scheduling */ /* * The highest 3 bits of the last 12 bits of TRC_SCHED_CLASS above are * reserved for encoding what scheduler produced the information. The * actual event is encoded in the last 9 bits. * * This means we have 8 scheduling IDs available (which means at most 8 * schedulers generating events) and, in each scheduler, up to 512 * different events. */ #define TRC_SCHED_ID_BITS 3 #define TRC_SCHED_ID_SHIFT (TRC_SUBCLS_SHIFT - TRC_SCHED_ID_BITS) #define TRC_SCHED_ID_MASK (((1UL<cpu_offset[cpu]). */ struct t_info { uint16_t tbuf_size; /* Size in pages of each trace buffer */ uint16_t mfn_offset[]; /* Offset within t_info structure of the page list per cpu */ /* MFN lists immediately after the header */ }; #endif /* __XEN_PUBLIC_TRACE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/callback.h0000664000175000017500000000766212307313555016056 0ustar smbsmb/****************************************************************************** * callback.h * * Register guest OS callbacks with Xen. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2006, Ian Campbell */ #ifndef __XEN_PUBLIC_CALLBACK_H__ #define __XEN_PUBLIC_CALLBACK_H__ #include "xen.h" /* * Prototype for this hypercall is: * long callback_op(int cmd, void *extra_args) * @cmd == CALLBACKOP_??? (callback operation). * @extra_args == Operation-specific extra arguments (NULL if none). */ /* x86: Callback for event delivery. */ #define CALLBACKTYPE_event 0 /* x86: Failsafe callback when guest state cannot be restored by Xen. */ #define CALLBACKTYPE_failsafe 1 /* x86/64 hypervisor: Syscall by 64-bit guest app ('64-on-64-on-64'). */ #define CALLBACKTYPE_syscall 2 /* * x86/32 hypervisor: Only available on x86/32 when supervisor_mode_kernel * feature is enabled. Do not use this callback type in new code. */ #define CALLBACKTYPE_sysenter_deprecated 3 /* x86: Callback for NMI delivery. */ #define CALLBACKTYPE_nmi 4 /* * x86: sysenter is only available as follows: * - 32-bit hypervisor: with the supervisor_mode_kernel feature enabled * - 64-bit hypervisor: 32-bit guest applications on Intel CPUs * ('32-on-32-on-64', '32-on-64-on-64') * [nb. also 64-bit guest applications on Intel CPUs * ('64-on-64-on-64'), but syscall is preferred] */ #define CALLBACKTYPE_sysenter 5 /* * x86/64 hypervisor: Syscall by 32-bit guest app on AMD CPUs * ('32-on-32-on-64', '32-on-64-on-64') */ #define CALLBACKTYPE_syscall32 7 /* * Disable event deliver during callback? This flag is ignored for event and * NMI callbacks: event delivery is unconditionally disabled. */ #define _CALLBACKF_mask_events 0 #define CALLBACKF_mask_events (1U << _CALLBACKF_mask_events) /* * Register a callback. */ #define CALLBACKOP_register 0 struct callback_register { uint16_t type; uint16_t flags; xen_callback_t address; }; typedef struct callback_register callback_register_t; DEFINE_XEN_GUEST_HANDLE(callback_register_t); /* * Unregister a callback. * * Not all callbacks can be unregistered. -EINVAL will be returned if * you attempt to unregister such a callback. 
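 *
 * Illustrative sketch (not part of the interface): a 64-bit PV guest typically
 * registers its event-delivery entry point once at boot, roughly as below.
 * HYPERVISOR_callback_op() is assumed to be the guest OS's wrapper for the
 * callback_op hypercall, event_entry a guest code label, and xen_callback_t
 * the unsigned long form from xen-x86_64.h; error handling is omitted.
 *
 *     struct callback_register event = {
 *         .type    = CALLBACKTYPE_event,
 *         .flags   = 0,
 *         .address = (xen_callback_t)event_entry,
 *     };
 *
 *     HYPERVISOR_callback_op(CALLBACKOP_register, &event);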
*/ #define CALLBACKOP_unregister 1 struct callback_unregister { uint16_t type; uint16_t _unused; }; typedef struct callback_unregister callback_unregister_t; DEFINE_XEN_GUEST_HANDLE(callback_unregister_t); #if __XEN_INTERFACE_VERSION__ < 0x00030207 #undef CALLBACKTYPE_sysenter #define CALLBACKTYPE_sysenter CALLBACKTYPE_sysenter_deprecated #endif #endif /* __XEN_PUBLIC_CALLBACK_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/0000775000175000017500000000000012307313555014545 5ustar smbsmbxen-4.4.0/xen/include/public/io/libxenvchan.h0000664000175000017500000000661312307313555017225 0ustar smbsmb/** * @file * @section AUTHORS * * Copyright (C) 2010 Rafal Wojtczuk * * Authors: * Rafal Wojtczuk * Daniel De Graaf * * @section LICENSE * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * @section DESCRIPTION * * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, * this code has been substantially rewritten to use the gntdev and gntalloc * devices instead of raw MFNs and map_foreign_range. * * This is a library for inter-domain communication. A standard Xen ring * buffer is used, with a datagram-based interface built on top. The grant * reference and event channels are shared in XenStore under a user-specified * path. * * The ring.h macros define an asymmetric interface to a shared data structure * that assumes all rings reside in a single contiguous memory space. This is * not suitable for vchan because the interface to the ring is symmetric except * for the setup. Unlike the producer-consumer rings defined in ring.h, the * size of the rings used in vchan are determined at execution time instead of * compile time, so the macros in ring.h cannot be used to access the rings. */ #include #include struct ring_shared { uint32_t cons, prod; }; #define VCHAN_NOTIFY_WRITE 0x1 #define VCHAN_NOTIFY_READ 0x2 /** * vchan_interface: primary shared data structure */ struct vchan_interface { /** * Standard consumer/producer interface, one pair per buffer * left is client write, server read * right is client read, server write */ struct ring_shared left, right; /** * size of the rings, which determines their location * 10 - at offset 1024 in ring's page * 11 - at offset 2048 in ring's page * 12+ - uses 2^(N-12) grants to describe the multi-page ring * These should remain constant once the page is shared. * Only one of the two orders can be 10 (or 11). 
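 *
 * Worked example (illustrative): with the usual 4096-byte shared page, the
 * common small-ring setup of left_order = 10 and right_order = 11 places a
 * 1024-byte left ring at offset 1024 and a 2048-byte right ring at offset
 * 2048 of this page; an order of 12 or more instead means that ring spans
 * 2^(order-12) whole pages described by the grants[] list below.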
*/ uint16_t left_order, right_order; /** * Shutdown detection: * 0: client (or server) has exited * 1: client (or server) is connected * 2: client has not yet connected */ uint8_t cli_live, srv_live; /** * Notification bits: * VCHAN_NOTIFY_WRITE: send notify when data is written * VCHAN_NOTIFY_READ: send notify when data is read (consumed) * cli_notify is used for the client to inform the server of its action */ uint8_t cli_notify, srv_notify; /** * Grant list: ordering is left, right. Must not extend into actual ring * or grow beyond the end of the initial shared page. * These should remain constant once the page is shared, to allow * for possible remapping by a client that restarts. */ uint32_t grants[0]; }; xen-4.4.0/xen/include/public/io/console.h0000664000175000017500000000333512307313555016364 0ustar smbsmb/****************************************************************************** * console.h * * Console I/O interface for Xen guest OSes. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2005, Keir Fraser */ #ifndef __XEN_PUBLIC_IO_CONSOLE_H__ #define __XEN_PUBLIC_IO_CONSOLE_H__ typedef uint32_t XENCONS_RING_IDX; #define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring)-1)) struct xencons_interface { char in[1024]; char out[2048]; XENCONS_RING_IDX in_cons, in_prod; XENCONS_RING_IDX out_cons, out_prod; }; #endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/blkif.h0000664000175000017500000006224212307313555016013 0ustar smbsmb/****************************************************************************** * blkif.h * * Unified block-device I/O interface for Xen guest OSes. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2003-2004, Keir Fraser * Copyright (c) 2012, Spectra Logic Corporation */ #ifndef __XEN_PUBLIC_IO_BLKIF_H__ #define __XEN_PUBLIC_IO_BLKIF_H__ #include "ring.h" #include "../grant_table.h" /* * Front->back notifications: When enqueuing a new request, sending a * notification can be made conditional on req_event (i.e., the generic * hold-off mechanism provided by the ring macros). Backends must set * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()). * * Back->front notifications: When enqueuing a new response, sending a * notification can be made conditional on rsp_event (i.e., the generic * hold-off mechanism provided by the ring macros). Frontends must set * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()). */ #ifndef blkif_vdev_t #define blkif_vdev_t uint16_t #endif #define blkif_sector_t uint64_t /* * Feature and Parameter Negotiation * ================================= * The two halves of a Xen block driver utilize nodes within the XenStore to * communicate capabilities and to negotiate operating parameters. This * section enumerates these nodes which reside in the respective front and * backend portions of the XenStore, following the XenBus convention. * * All data in the XenStore is stored as strings. Nodes specifying numeric * values are encoded in decimal. Integer value ranges listed below are * expressed as fixed sized integer types capable of storing the conversion * of a properly formated node string, without loss of information. * * Any specified default value is in effect if the corresponding XenBus node * is not present in the XenStore. * * XenStore nodes in sections marked "PRIVATE" are solely for use by the * driver side whose XenBus tree contains them. * * XenStore nodes marked "DEPRECATED" in their notes section should only be * used to provide interoperability with legacy implementations. * * See the XenBus state transition diagram below for details on when XenBus * nodes must be published and when they can be queried. * ***************************************************************************** * Backend XenBus Nodes ***************************************************************************** * *------------------ Backend Device Identification (PRIVATE) ------------------ * * mode * Values: "r" (read only), "w" (writable) * * The read or write access permissions to the backing store to be * granted to the frontend. * * params * Values: string * * A free formatted string providing sufficient information for the * backend driver to open the backing device. (e.g. the path to the * file or block device representing the backing store.) * * type * Values: "file", "phy", "tap" * * The type of the backing device/object. * *--------------------------------- Features --------------------------------- * * feature-barrier * Values: 0/1 (boolean) * Default Value: 0 * * A value of "1" indicates that the backend can process requests * containing the BLKIF_OP_WRITE_BARRIER request opcode. 
Requests * of this type may still be returned at any time with the * BLKIF_RSP_EOPNOTSUPP result code. * * feature-flush-cache * Values: 0/1 (boolean) * Default Value: 0 * * A value of "1" indicates that the backend can process requests * containing the BLKIF_OP_FLUSH_DISKCACHE request opcode. Requests * of this type may still be returned at any time with the * BLKIF_RSP_EOPNOTSUPP result code. * * feature-discard * Values: 0/1 (boolean) * Default Value: 0 * * A value of "1" indicates that the backend can process requests * containing the BLKIF_OP_DISCARD request opcode. Requests * of this type may still be returned at any time with the * BLKIF_RSP_EOPNOTSUPP result code. * * feature-persistent * Values: 0/1 (boolean) * Default Value: 0 * Notes: 7 * * A value of "1" indicates that the backend can keep the grants used * by the frontend driver mapped, so the same set of grants should be * used in all transactions. The maximum number of grants the backend * can map persistently depends on the implementation, but ideally it * should be RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. Using this * feature the backend doesn't need to unmap each grant, preventing * costly TLB flushes. The backend driver should only map grants * persistently if the frontend supports it. If a backend driver chooses * to use the persistent protocol when the frontend doesn't support it, * it will probably hit the maximum number of persistently mapped grants * (due to the fact that the frontend won't be reusing the same grants), * and fall back to non-persistent mode. Backend implementations may * shrink or expand the number of persistently mapped grants without * notifying the frontend depending on memory constraints (this might * cause a performance degradation). * * If a backend driver wants to limit the maximum number of persistently * mapped grants to a value less than RING_SIZE * * BLKIF_MAX_SEGMENTS_PER_REQUEST a LRU strategy should be used to * discard the grants that are less commonly used. Using a LRU in the * backend driver paired with a LIFO queue in the frontend will * allow us to have better performance in this scenario. * *----------------------- Request Transport Parameters ------------------------ * * max-ring-page-order * Values: * Default Value: 0 * Notes: 1, 3 * * The maximum supported size of the request ring buffer in units of * lb(machine pages). (e.g. 0 == 1 page, 1 = 2 pages, 2 == 4 pages, * etc.). * * max-ring-pages * Values: * Default Value: 1 * Notes: DEPRECATED, 2, 3 * * The maximum supported size of the request ring buffer in units of * machine pages. The value must be a power of 2. * *------------------------- Backend Device Properties ------------------------- * * discard-alignment * Values: * Default Value: 0 * Notes: 4, 5 * * The offset, in bytes from the beginning of the virtual block device, * to the first, addressable, discard extent on the underlying device. * * discard-granularity * Values: * Default Value: <"sector-size"> * Notes: 4 * * The size, in bytes, of the individually addressable discard extents * of the underlying device. * * discard-secure * Values: 0/1 (boolean) * Default Value: 0 * Notes: 10 * * A value of "1" indicates that the backend can process BLKIF_OP_DISCARD * requests with the BLKIF_DISCARD_SECURE flag set. * * info * Values: (bitmap) * * A collection of bit flags describing attributes of the backing * device. The VDISK_* macros define the meaning of each bit * location. * * sector-size * Values: * * The logical sector size, in bytes, of the backend device. 
* * physical-sector-size * Values: * * The physical sector size, in bytes, of the backend device. * * sectors * Values: * * The size of the backend device, expressed in units of its logical * sector size ("sector-size"). * ***************************************************************************** * Frontend XenBus Nodes ***************************************************************************** * *----------------------- Request Transport Parameters ----------------------- * * event-channel * Values: * * The identifier of the Xen event channel used to signal activity * in the ring buffer. * * ring-ref * Values: * Notes: 6 * * The Xen grant reference granting permission for the backend to map * the sole page in a single page sized ring buffer. * * ring-ref%u * Values: * Notes: 6 * * For a frontend providing a multi-page ring, a "number of ring pages" * sized list of nodes, each containing a Xen grant reference granting * permission for the backend to map the page of the ring located * at page index "%u". Page indexes are zero based. * * protocol * Values: string (XEN_IO_PROTO_ABI_*) * Default Value: XEN_IO_PROTO_ABI_NATIVE * * The machine ABI rules governing the format of all ring request and * response structures. * * ring-page-order * Values: * Default Value: 0 * Maximum Value: MAX(ffs(max-ring-pages) - 1, max-ring-page-order) * Notes: 1, 3 * * The size of the frontend allocated request ring buffer in units * of lb(machine pages). (e.g. 0 == 1 page, 1 = 2 pages, 2 == 4 pages, * etc.). * * num-ring-pages * Values: * Default Value: 1 * Maximum Value: MAX(max-ring-pages,(0x1 << max-ring-page-order)) * Notes: DEPRECATED, 2, 3 * * The size of the frontend allocated request ring buffer in units of * machine pages. The value must be a power of 2. * * feature-persistent * Values: 0/1 (boolean) * Default Value: 0 * Notes: 7, 8, 9 * * A value of "1" indicates that the frontend will reuse the same grants * for all transactions, allowing the backend to map them with write * access (even when it should be read-only). If the frontend hits the * maximum number of allowed persistently mapped grants, it can fallback * to non persistent mode. This will cause a performance degradation, * since the the backend driver will still try to map those grants * persistently. Since the persistent grants protocol is compatible with * the previous protocol, a frontend driver can choose to work in * persistent mode even when the backend doesn't support it. * * It is recommended that the frontend driver stores the persistently * mapped grants in a LIFO queue, so a subset of all persistently mapped * grants gets used commonly. This is done in case the backend driver * decides to limit the maximum number of persistently mapped grants * to a value less than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. * *------------------------- Virtual Device Properties ------------------------- * * device-type * Values: "disk", "cdrom", "floppy", etc. * * virtual-device * Values: * * A value indicating the physical device to virtualize within the * frontend's domain. (e.g. "The first ATA disk", "The third SCSI * disk", etc.) * * See docs/misc/vbd-interface.txt for details on the format of this * value. * * Notes * ----- * (1) Multi-page ring buffer scheme first developed in the Citrix XenServer * PV drivers. * (2) Multi-page ring buffer scheme first used in some RedHat distributions * including a distribution deployed on certain nodes of the Amazon * EC2 cluster. 
* (3) Support for multi-page ring buffers was implemented independently, * in slightly different forms, by both Citrix and RedHat/Amazon. * For full interoperability, block front and backends should publish * identical ring parameters, adjusted for unit differences, to the * XenStore nodes used in both schemes. * (4) Devices that support discard functionality may internally allocate space * (discardable extents) in units that are larger than the exported logical * block size. If the backing device has such discardable extents, the * backend should provide both discard-granularity and discard-alignment. * Providing just one of the two may be considered an error by the frontend. * Backends supporting discard should include discard-granularity and * discard-alignment even if they support discarding individual sectors. * Frontends should assume discard-alignment == 0 and discard-granularity * == sector size if these keys are missing. * (5) The discard-alignment parameter allows a physical device to be * partitioned into virtual devices that do not necessarily begin or * end on a discardable extent boundary. * (6) When there is only a single page allocated to the request ring, * 'ring-ref' is used to communicate the grant reference for this * page to the backend. When using a multi-page ring, the 'ring-ref' * node is not created. Instead 'ring-ref0' - 'ring-refN' are used. * (7) When using persistent grants, data has to be copied from/to the page * where the grant is currently mapped. The overhead of doing this copy, * however, does not cancel out the speed improvement of not having to unmap * the grants. * (8) The frontend driver has to allow the backend driver to map all grants * with write access, even when they should be mapped read-only, since * further requests may reuse these grants and require write permissions. * (9) The Linux implementation doesn't have a limit on the maximum number of * grants that can be persistently mapped in the frontend driver, but * due to the frontend driver implementation it should never be bigger * than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST. *(10) The discard-secure property may be present and will be set to 1 if the * backing device supports secure discard. */ /* * STATE DIAGRAMS * ***************************************************************************** * Startup * ***************************************************************************** * * Tool stack creates front and back nodes with state XenbusStateInitialising. * * Front Back * ================================= ===================================== * XenbusStateInitialising XenbusStateInitialising * o Query virtual device o Query backend device identification * properties. data. * o Setup OS device instance. o Open and validate backend device. * o Publish backend features and * transport parameters. * | * | * V * XenbusStateInitWait * * o Query backend features and * transport parameters. * o Allocate and initialize the * request ring. * o Publish transport parameters * that will be in effect during * this connection. * | * | * V * XenbusStateInitialised * * o Query frontend transport parameters. * o Connect to the request ring and * event channel. * o Publish backend device properties. * | * | * V * XenbusStateConnected * * o Query backend device properties. * o Finalize OS virtual device * instance.
* | * | * V * XenbusStateConnected * * Note: Drivers that do not support any optional features, or the negotiation * of transport parameters, can skip certain states in the state machine: * * o A frontend may transition to XenbusStateInitialised without * waiting for the backend to enter XenbusStateInitWait. In this * case, default transport parameters are in effect and any * transport parameters published by the frontend must contain * their default values. * * o A backend may transition to XenbusStateInitialised, bypassing * XenbusStateInitWait, without waiting for the frontend to first * enter the XenbusStateInitialised state. In this case, default * transport parameters are in effect and any transport parameters * published by the backend must contain their default values. * * Drivers that support optional features and/or transport parameter * negotiation must tolerate these additional state transition paths. * In general this means performing the work of any skipped state * transition, if it has not already been performed, in addition to the * work associated with entry into the current state. */ /* * REQUEST CODES. */ #define BLKIF_OP_READ 0 #define BLKIF_OP_WRITE 1 /* * All writes issued prior to a request with the BLKIF_OP_WRITE_BARRIER * operation code ("barrier request") must be completed prior to the * execution of the barrier request. All writes issued after the barrier * request must not execute until after the completion of the barrier request. * * Optional. See "feature-barrier" XenBus node documentation above. */ #define BLKIF_OP_WRITE_BARRIER 2 /* * Commit any uncommitted contents of the backing device's volatile cache * to stable storage. * * Optional. See "feature-flush-cache" XenBus node documentation above. */ #define BLKIF_OP_FLUSH_DISKCACHE 3 /* * Used in SLES sources for device specific command packet * contained within the request. Reserved for that purpose. */ #define BLKIF_OP_RESERVED_1 4 /* * Indicate to the backend device that a region of storage is no longer in * use, and may be discarded at any time without impact to the client. If * the BLKIF_DISCARD_SECURE flag is set on the request, all copies of the * discarded region on the device must be rendered unrecoverable before the * command returns. * * This operation is analogous to performing a trim (ATA) or unmap (SCSI) * command on a native device. * * More information about trim/unmap operations can be found at: * http://t13.org/Documents/UploadedDocuments/docs2008/ * e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc * http://www.seagate.com/staticfiles/support/disc/manuals/ * Interface%20manuals/100293068c.pdf * * Optional. See "feature-discard", "discard-alignment", * "discard-granularity", and "discard-secure" in the XenBus node * documentation above. */ #define BLKIF_OP_DISCARD 5 /* * Recognized if "feature-max-indirect-segments" is present in the backend * xenbus info. The "feature-max-indirect-segments" node contains the maximum * number of segments allowed by the backend per request. If the node is * present, the frontend might use blkif_request_indirect structs in order to * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The * maximum number of indirect segments is fixed by the backend, but the * frontend can issue requests with any number of indirect segments as long as * it's less than the number provided by the backend.
The indirect_grefs field * in blkif_request_indirect should be filled by the frontend with the * grant references of the pages that are holding the indirect segments. * These pages are filled with an array of blkif_request_segment that hold the * information about the segments. The number of indirect pages to use is * determined by the number of segments an indirect request contains. Every * indirect page can contain a maximum of * (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to * calculate the number of indirect pages to use we have to do * ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))). * * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not* * create the "feature-max-indirect-segments" node! */ #define BLKIF_OP_INDIRECT 6 /* * Maximum scatter/gather segments per request. * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE. * NB. This could be 12 if the ring indexes weren't stored in the same page. */ #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 /* * Maximum number of indirect pages to use per request. */ #define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8 /* * NB. first_sect and last_sect in blkif_request_segment, as well as * sector_number in blkif_request, are always expressed in 512-byte units. * However they must be properly aligned to the real sector size of the * physical disk, which is reported in the "physical-sector-size" node in * the backend xenbus info. Also the xenbus "sectors" node is expressed in * 512-byte units. */ struct blkif_request_segment { grant_ref_t gref; /* reference to I/O buffer frame */ /* @first_sect: first sector in frame to transfer (inclusive). */ /* @last_sect: last sector in frame to transfer (inclusive). */ uint8_t first_sect, last_sect; }; /* * Starting ring element for any I/O request. */ struct blkif_request { uint8_t operation; /* BLKIF_OP_??? 
*/ uint8_t nr_segments; /* number of segments */ blkif_vdev_t handle; /* only for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; typedef struct blkif_request blkif_request_t; /* * Cast to this structure when blkif_request.operation == BLKIF_OP_DISCARD * sizeof(struct blkif_request_discard) <= sizeof(struct blkif_request) */ struct blkif_request_discard { uint8_t operation; /* BLKIF_OP_DISCARD */ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */ #define BLKIF_DISCARD_SECURE (1<<0) /* ignored if discard-secure=0 */ blkif_vdev_t handle; /* same as for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk */ uint64_t nr_sectors; /* number of contiguous sectors to discard*/ }; typedef struct blkif_request_discard blkif_request_discard_t; struct blkif_request_indirect { uint8_t operation; /* BLKIF_OP_INDIRECT */ uint8_t indirect_op; /* BLKIF_OP_{READ/WRITE} */ uint16_t nr_segments; /* number of segments */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ blkif_vdev_t handle; /* same as for read/write requests */ grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; #ifdef __i386__ uint64_t pad; /* Make it 64 byte aligned on i386 */ #endif }; typedef struct blkif_request_indirect blkif_request_indirect_t; struct blkif_response { uint64_t id; /* copied from request */ uint8_t operation; /* copied from request */ int16_t status; /* BLKIF_RSP_??? */ }; typedef struct blkif_response blkif_response_t; /* * STATUS RETURN CODES. */ /* Operation not supported (only happens on barrier writes). */ #define BLKIF_RSP_EOPNOTSUPP -2 /* Operation failed for some unspecified reason (-EIO). */ #define BLKIF_RSP_ERROR -1 /* Operation completed successfully. */ #define BLKIF_RSP_OKAY 0 /* * Generate blkif ring structures and types. */ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response); #define VDISK_CDROM 0x1 #define VDISK_REMOVABLE 0x2 #define VDISK_READONLY 0x4 #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/protocols.h0000664000175000017500000000323512307313555016745 0ustar smbsmb/****************************************************************************** * protocols.h * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_PROTOCOLS_H__ #define __XEN_PROTOCOLS_H__ #define XEN_IO_PROTO_ABI_X86_32 "x86_32-abi" #define XEN_IO_PROTO_ABI_X86_64 "x86_64-abi" #define XEN_IO_PROTO_ABI_ARM "arm-abi" #if defined(__i386__) # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_32 #elif defined(__x86_64__) # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_64 #elif defined(__arm__) || defined(__aarch64__) # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_ARM #else # error arch fixup needed here #endif #endif xen-4.4.0/xen/include/public/io/fsif.h0000664000175000017500000001237612307313555015656 0ustar smbsmb/****************************************************************************** * fsif.h * * Interface to FS level split device drivers. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2007, Grzegorz Milos, . */ #ifndef __XEN_PUBLIC_IO_FSIF_H__ #define __XEN_PUBLIC_IO_FSIF_H__ #include "ring.h" #include "../grant_table.h" #define REQ_FILE_OPEN 1 #define REQ_FILE_CLOSE 2 #define REQ_FILE_READ 3 #define REQ_FILE_WRITE 4 #define REQ_STAT 5 #define REQ_FILE_TRUNCATE 6 #define REQ_REMOVE 7 #define REQ_RENAME 8 #define REQ_CREATE 9 #define REQ_DIR_LIST 10 #define REQ_CHMOD 11 #define REQ_FS_SPACE 12 #define REQ_FILE_SYNC 13 struct fsif_open_request { grant_ref_t gref; }; struct fsif_close_request { uint32_t fd; }; struct fsif_read_request { uint32_t fd; int32_t pad; uint64_t len; uint64_t offset; grant_ref_t grefs[1]; /* Variable length */ }; struct fsif_write_request { uint32_t fd; int32_t pad; uint64_t len; uint64_t offset; grant_ref_t grefs[1]; /* Variable length */ }; struct fsif_stat_request { uint32_t fd; }; /* This structure is a copy of some fields from stat structure, returned * via the ring. 
*/ struct fsif_stat_response { int32_t stat_mode; uint32_t stat_uid; uint32_t stat_gid; int32_t stat_ret; int64_t stat_size; int64_t stat_atime; int64_t stat_mtime; int64_t stat_ctime; }; struct fsif_truncate_request { uint32_t fd; int32_t pad; int64_t length; }; struct fsif_remove_request { grant_ref_t gref; }; struct fsif_rename_request { uint16_t old_name_offset; uint16_t new_name_offset; grant_ref_t gref; }; struct fsif_create_request { int8_t directory; int8_t pad; int16_t pad2; int32_t mode; grant_ref_t gref; }; struct fsif_list_request { uint32_t offset; grant_ref_t gref; }; #define NR_FILES_SHIFT 0 #define NR_FILES_SIZE 16 /* 16 bits for the number of files mask */ #define NR_FILES_MASK (((1ULL << NR_FILES_SIZE) - 1) << NR_FILES_SHIFT) #define ERROR_SIZE 32 /* 32 bits for the error mask */ #define ERROR_SHIFT (NR_FILES_SIZE + NR_FILES_SHIFT) #define ERROR_MASK (((1ULL << ERROR_SIZE) - 1) << ERROR_SHIFT) #define HAS_MORE_SHIFT (ERROR_SHIFT + ERROR_SIZE) #define HAS_MORE_FLAG (1ULL << HAS_MORE_SHIFT) struct fsif_chmod_request { uint32_t fd; int32_t mode; }; struct fsif_space_request { grant_ref_t gref; }; struct fsif_sync_request { uint32_t fd; }; /* FS operation request */ struct fsif_request { uint8_t type; /* Type of the request */ uint8_t pad; uint16_t id; /* Request ID, copied to the response */ uint32_t pad2; union { struct fsif_open_request fopen; struct fsif_close_request fclose; struct fsif_read_request fread; struct fsif_write_request fwrite; struct fsif_stat_request fstat; struct fsif_truncate_request ftruncate; struct fsif_remove_request fremove; struct fsif_rename_request frename; struct fsif_create_request fcreate; struct fsif_list_request flist; struct fsif_chmod_request fchmod; struct fsif_space_request fspace; struct fsif_sync_request fsync; } u; }; typedef struct fsif_request fsif_request_t; /* FS operation response */ struct fsif_response { uint16_t id; uint16_t pad1; uint32_t pad2; union { uint64_t ret_val; struct fsif_stat_response fstat; } u; }; typedef struct fsif_response fsif_response_t; #define FSIF_RING_ENTRY_SIZE 64 #define FSIF_NR_READ_GNTS ((FSIF_RING_ENTRY_SIZE - sizeof(struct fsif_read_request)) / \ sizeof(grant_ref_t) + 1) #define FSIF_NR_WRITE_GNTS ((FSIF_RING_ENTRY_SIZE - sizeof(struct fsif_write_request)) / \ sizeof(grant_ref_t) + 1) DEFINE_RING_TYPES(fsif, struct fsif_request, struct fsif_response); #define STATE_INITIALISED "init" #define STATE_READY "ready" #define STATE_CLOSING "closing" #define STATE_CLOSED "closed" #endif xen-4.4.0/xen/include/public/io/xenbus.h0000664000175000017500000000464112307313555016227 0ustar smbsmb/***************************************************************************** * xenbus.h * * Xenbus protocol details. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (C) 2005 XenSource Ltd. */ #ifndef _XEN_PUBLIC_IO_XENBUS_H #define _XEN_PUBLIC_IO_XENBUS_H /* * The state of either end of the Xenbus, i.e. the current communication * status of initialisation across the bus. States here imply nothing about * the state of the connection between the driver and the kernel's device * layers. */ enum xenbus_state { XenbusStateUnknown = 0, XenbusStateInitialising = 1, /* * InitWait: Finished early initialisation but waiting for information * from the peer or hotplug scripts. */ XenbusStateInitWait = 2, /* * Initialised: Waiting for a connection from the peer. */ XenbusStateInitialised = 3, XenbusStateConnected = 4, /* * Closing: The device is being closed due to an error or an unplug event. */ XenbusStateClosing = 5, XenbusStateClosed = 6, /* * Reconfiguring: The device is being reconfigured. */ XenbusStateReconfiguring = 7, XenbusStateReconfigured = 8 }; typedef enum xenbus_state XenbusState; #endif /* _XEN_PUBLIC_IO_XENBUS_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/ring.h0000664000175000017500000003553612307313555015671 0ustar smbsmb/****************************************************************************** * ring.h * * Shared producer-consumer ring macros. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Tim Deegan and Andrew Warfield November 2004. */ #ifndef __XEN_PUBLIC_IO_RING_H__ #define __XEN_PUBLIC_IO_RING_H__ #include "../xen-compat.h" #if __XEN_INTERFACE_VERSION__ < 0x00030208 #define xen_mb() mb() #define xen_rmb() rmb() #define xen_wmb() wmb() #endif typedef unsigned int RING_IDX; /* Round a 32-bit unsigned constant down to the nearest power of two. */ #define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1)) #define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x)) #define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x)) #define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x)) #define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x)) /* * Calculate size of a shared ring, given the total available space for the * ring and indexes (_sz), and the name tag of the request/response structure. 
* A ring contains as many entries as will fit, rounded down to the nearest * power of two (so we can mask with (size-1) to loop around). */ #define __CONST_RING_SIZE(_s, _sz) \ (__RD32(((_sz) - offsetof(struct _s##_sring, ring)) / \ sizeof(((struct _s##_sring *)0)->ring[0]))) /* * The same for passing in an actual pointer instead of a name tag. */ #define __RING_SIZE(_s, _sz) \ (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) /* * Macros to make the correct C datatypes for a new kind of ring. * * To make a new ring datatype, you need to have two message structures, * let's say request_t, and response_t already defined. * * In a header where you want the ring datatype declared, you then do: * * DEFINE_RING_TYPES(mytag, request_t, response_t); * * These expand out to give you a set of types, as you can see below. * The most important of these are: * * mytag_sring_t - The shared ring. * mytag_front_ring_t - The 'front' half of the ring. * mytag_back_ring_t - The 'back' half of the ring. * * To initialize a ring in your code you need to know the location and size * of the shared memory area (PAGE_SIZE, for instance). To initialise * the front half: * * mytag_front_ring_t front_ring; * SHARED_RING_INIT((mytag_sring_t *)shared_page); * FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE); * * Initializing the back follows similarly (note that only the front * initializes the shared ring): * * mytag_back_ring_t back_ring; * BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE); */ #define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \ \ /* Shared ring entry */ \ union __name##_sring_entry { \ __req_t req; \ __rsp_t rsp; \ }; \ \ /* Shared ring page */ \ struct __name##_sring { \ RING_IDX req_prod, req_event; \ RING_IDX rsp_prod, rsp_event; \ union { \ struct { \ uint8_t smartpoll_active; \ } netif; \ struct { \ uint8_t msg; \ } tapif_user; \ uint8_t pvt_pad[4]; \ } private; \ uint8_t __pad[44]; \ union __name##_sring_entry ring[1]; /* variable-length */ \ }; \ \ /* "Front" end's private variables */ \ struct __name##_front_ring { \ RING_IDX req_prod_pvt; \ RING_IDX rsp_cons; \ unsigned int nr_ents; \ struct __name##_sring *sring; \ }; \ \ /* "Back" end's private variables */ \ struct __name##_back_ring { \ RING_IDX rsp_prod_pvt; \ RING_IDX req_cons; \ unsigned int nr_ents; \ struct __name##_sring *sring; \ }; \ \ /* Syntactic sugar */ \ typedef struct __name##_sring __name##_sring_t; \ typedef struct __name##_front_ring __name##_front_ring_t; \ typedef struct __name##_back_ring __name##_back_ring_t /* * Macros for manipulating rings. * * FRONT_RING_whatever works on the "front end" of a ring: here * requests are pushed on to the ring and responses taken off it. * * BACK_RING_whatever works on the "back end" of a ring: here * requests are taken off the ring and responses put on. * * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL. * This is OK in 1-for-1 request-response situations where the * requestor (front end) never has more than RING_SIZE()-1 * outstanding requests. 
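 *
 * As a purely illustrative sketch (not part of the protocol), reusing the
 * request_t/response_t message types and the "mytag" tag from the
 * DEFINE_RING_TYPES() example above, a back end typically consumes requests
 * and produces responses along the following lines.  Request validation,
 * error handling and the event-channel notification itself are omitted and
 * OS-specific:
 *
 *     mytag_back_ring_t *br = ...;    /* set up with BACK_RING_INIT()    */
 *     RING_IDX rc, rp;
 *     int notify, more_to_do;
 *
 *     do {
 *         rp = br->sring->req_prod;
 *         xen_rmb();                  /* read requests /after/ req_prod  */
 *         for ( rc = br->req_cons;
 *               rc != rp && !RING_REQUEST_CONS_OVERFLOW(br, rc); rc++ )
 *         {
 *             request_t req = *RING_GET_REQUEST(br, rc);  /* copy it out */
 *             response_t *rsp;
 *
 *             br->req_cons = rc + 1;
 *             /* ... process req ... */
 *             rsp = RING_GET_RESPONSE(br, br->rsp_prod_pvt);
 *             /* ... fill in *rsp from req ... */
 *             br->rsp_prod_pvt++;
 *         }
 *         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(br, notify);
 *         if ( notify )
 *             ;                       /* notify front end via event channel */
 *         RING_FINAL_CHECK_FOR_REQUESTS(br, more_to_do);
 *     } while ( more_to_do );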
*/ /* Initialising empty rings */ #define SHARED_RING_INIT(_s) do { \ (_s)->req_prod = (_s)->rsp_prod = 0; \ (_s)->req_event = (_s)->rsp_event = 1; \ (void)memset((_s)->private.pvt_pad, 0, sizeof((_s)->private.pvt_pad)); \ (void)memset((_s)->__pad, 0, sizeof((_s)->__pad)); \ } while(0) #define FRONT_RING_INIT(_r, _s, __size) do { \ (_r)->req_prod_pvt = 0; \ (_r)->rsp_cons = 0; \ (_r)->nr_ents = __RING_SIZE(_s, __size); \ (_r)->sring = (_s); \ } while (0) #define BACK_RING_INIT(_r, _s, __size) do { \ (_r)->rsp_prod_pvt = 0; \ (_r)->req_cons = 0; \ (_r)->nr_ents = __RING_SIZE(_s, __size); \ (_r)->sring = (_s); \ } while (0) /* How big is this ring? */ #define RING_SIZE(_r) \ ((_r)->nr_ents) /* Number of free requests (for use on front side only). */ #define RING_FREE_REQUESTS(_r) \ (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons)) /* Test if there is an empty slot available on the front ring. * (This is only meaningful from the front. ) */ #define RING_FULL(_r) \ (RING_FREE_REQUESTS(_r) == 0) /* Test if there are outstanding messages to be processed on a ring. */ #define RING_HAS_UNCONSUMED_RESPONSES(_r) \ ((_r)->sring->rsp_prod - (_r)->rsp_cons) #ifdef __GNUC__ #define RING_HAS_UNCONSUMED_REQUESTS(_r) ({ \ unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \ unsigned int rsp = RING_SIZE(_r) - \ ((_r)->req_cons - (_r)->rsp_prod_pvt); \ req < rsp ? req : rsp; \ }) #else /* Same as above, but without the nice GCC ({ ... }) syntax. */ #define RING_HAS_UNCONSUMED_REQUESTS(_r) \ ((((_r)->sring->req_prod - (_r)->req_cons) < \ (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) ? \ ((_r)->sring->req_prod - (_r)->req_cons) : \ (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) #endif /* Direct access to individual ring elements, by index. */ #define RING_GET_REQUEST(_r, _idx) \ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req)) #define RING_GET_RESPONSE(_r, _idx) \ (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp)) /* Loop termination condition: Would the specified index overflow the ring? */ #define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) /* Ill-behaved frontend determination: Can there be this many requests? */ #define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \ (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r)) #define RING_PUSH_REQUESTS(_r) do { \ xen_wmb(); /* back sees requests /before/ updated producer index */ \ (_r)->sring->req_prod = (_r)->req_prod_pvt; \ } while (0) #define RING_PUSH_RESPONSES(_r) do { \ xen_wmb(); /* front sees resps /before/ updated producer index */ \ (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \ } while (0) /* * Notification hold-off (req_event and rsp_event): * * When queueing requests or responses on a shared ring, it may not always be * necessary to notify the remote end. For example, if requests are in flight * in a backend, the front may be able to queue further requests without * notifying the back (if the back checks for new requests when it queues * responses). * * When enqueuing requests or responses: * * Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument * is a boolean return value. True indicates that the receiver requires an * asynchronous notification. * * After dequeuing requests or responses (before sleeping the connection): * * Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES(). * The second argument is a boolean return value. 
True indicates that there * are pending messages on the ring (i.e., the connection should not be put * to sleep). * * These macros will set the req_event/rsp_event field to trigger a * notification on the very next message that is enqueued. If you want to * create batches of work (i.e., only receive a notification after several * messages have been enqueued) then you will need to create a customised * version of the FINAL_CHECK macro in your own code, which sets the event * field appropriately. */ #define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \ RING_IDX __old = (_r)->sring->req_prod; \ RING_IDX __new = (_r)->req_prod_pvt; \ xen_wmb(); /* back sees requests /before/ updated producer index */ \ (_r)->sring->req_prod = __new; \ xen_mb(); /* back sees new requests /before/ we check req_event */ \ (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \ (RING_IDX)(__new - __old)); \ } while (0) #define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \ RING_IDX __old = (_r)->sring->rsp_prod; \ RING_IDX __new = (_r)->rsp_prod_pvt; \ xen_wmb(); /* front sees resps /before/ updated producer index */ \ (_r)->sring->rsp_prod = __new; \ xen_mb(); /* front sees new resps /before/ we check rsp_event */ \ (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \ (RING_IDX)(__new - __old)); \ } while (0) #define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \ (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ if (_work_to_do) break; \ (_r)->sring->req_event = (_r)->req_cons + 1; \ xen_mb(); \ (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ } while (0) #define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \ (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ if (_work_to_do) break; \ (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \ xen_mb(); \ (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ } while (0) #endif /* __XEN_PUBLIC_IO_RING_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/vscsiif.h0000664000175000017500000001030412307313555016362 0ustar smbsmb/****************************************************************************** * vscsiif.h * * Based on the blkif.h code. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright(c) FUJITSU Limited 2008. 
*/ #ifndef __XEN__PUBLIC_IO_SCSI_H__ #define __XEN__PUBLIC_IO_SCSI_H__ #include "ring.h" #include "../grant_table.h" /* commands between backend and frontend */ #define VSCSIIF_ACT_SCSI_CDB 1 /* SCSI CDB command */ #define VSCSIIF_ACT_SCSI_ABORT 2 /* SCSI Device(Lun) Abort*/ #define VSCSIIF_ACT_SCSI_RESET 3 /* SCSI Device(Lun) Reset*/ #define VSCSIIF_ACT_SCSI_SG_PRESET 4 /* Preset SG elements */ /* * Maximum scatter/gather segments per request. * * Considering balance between allocating at least 16 "vscsiif_request" * structures on one page (4096 bytes) and the number of scatter/gather * elements needed, we decided to use 26 as a magic number. */ #define VSCSIIF_SG_TABLESIZE 26 /* * based on Linux kernel 2.6.18 */ #define VSCSIIF_MAX_COMMAND_SIZE 16 #define VSCSIIF_SENSE_BUFFERSIZE 96 struct scsiif_request_segment { grant_ref_t gref; uint16_t offset; uint16_t length; }; typedef struct scsiif_request_segment vscsiif_segment_t; struct vscsiif_request { uint16_t rqid; /* private guest value, echoed in resp */ uint8_t act; /* command between backend and frontend */ uint8_t cmd_len; uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; uint16_t timeout_per_command; /* The command is issued by twice the value in Backend. */ uint16_t channel, id, lun; uint16_t padding; uint8_t sc_data_direction; /* for DMA_TO_DEVICE(1) DMA_FROM_DEVICE(2) DMA_NONE(3) requests */ uint8_t nr_segments; /* Number of pieces of scatter-gather */ vscsiif_segment_t seg[VSCSIIF_SG_TABLESIZE]; uint32_t reserved[3]; }; typedef struct vscsiif_request vscsiif_request_t; #define VSCSIIF_SG_LIST_SIZE ((sizeof(vscsiif_request_t) - 4) \ / sizeof(vscsiif_segment_t)) struct vscsiif_sg_list { /* First two fields must match struct vscsiif_request! */ uint16_t rqid; /* private guest value, must match main req */ uint8_t act; /* VSCSIIF_ACT_SCSI_SG_PRESET */ uint8_t nr_segments; /* Number of pieces of scatter-gather */ vscsiif_segment_t seg[VSCSIIF_SG_LIST_SIZE]; }; typedef struct vscsiif_sg_list vscsiif_sg_list_t; struct vscsiif_response { uint16_t rqid; uint8_t act; /* valid only when backend supports SG_PRESET */ uint8_t sense_len; uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; int32_t rslt; uint32_t residual_len; /* request bufflen - return the value from physical device */ uint32_t reserved[36]; }; typedef struct vscsiif_response vscsiif_response_t; DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response); #endif /*__XEN__PUBLIC_IO_SCSI_H__*/ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/xs_wire.h0000664000175000017500000000721312307313555016401 0ustar smbsmb/* * Details of the "wire" protocol between Xen Store Daemon and client * library or guest kernel. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (C) 2005 Rusty Russell IBM Corporation */ #ifndef _XS_WIRE_H #define _XS_WIRE_H enum xsd_sockmsg_type { XS_DEBUG, XS_DIRECTORY, XS_READ, XS_GET_PERMS, XS_WATCH, XS_UNWATCH, XS_TRANSACTION_START, XS_TRANSACTION_END, XS_INTRODUCE, XS_RELEASE, XS_GET_DOMAIN_PATH, XS_WRITE, XS_MKDIR, XS_RM, XS_SET_PERMS, XS_WATCH_EVENT, XS_ERROR, XS_IS_DOMAIN_INTRODUCED, XS_RESUME, XS_SET_TARGET, XS_RESTRICT, XS_RESET_WATCHES }; #define XS_WRITE_NONE "NONE" #define XS_WRITE_CREATE "CREATE" #define XS_WRITE_CREATE_EXCL "CREATE|EXCL" /* We hand errors as strings, for portability. */ struct xsd_errors { int errnum; const char *errstring; }; #ifdef EINVAL #define XSD_ERROR(x) { x, #x } /* LINTED: static unused */ static struct xsd_errors xsd_errors[] #if defined(__GNUC__) __attribute__((unused)) #endif = { XSD_ERROR(EINVAL), XSD_ERROR(EACCES), XSD_ERROR(EEXIST), XSD_ERROR(EISDIR), XSD_ERROR(ENOENT), XSD_ERROR(ENOMEM), XSD_ERROR(ENOSPC), XSD_ERROR(EIO), XSD_ERROR(ENOTEMPTY), XSD_ERROR(ENOSYS), XSD_ERROR(EROFS), XSD_ERROR(EBUSY), XSD_ERROR(EAGAIN), XSD_ERROR(EISCONN), XSD_ERROR(E2BIG) }; #endif struct xsd_sockmsg { uint32_t type; /* XS_??? */ uint32_t req_id;/* Request identifier, echoed in daemon's response. */ uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */ uint32_t len; /* Length of data following this. */ /* Generally followed by nul-terminated string(s). */ }; enum xs_watch_type { XS_WATCH_PATH = 0, XS_WATCH_TOKEN }; /* * `incontents 150 xenstore_struct XenStore wire protocol. * * Inter-domain shared memory communications. */ #define XENSTORE_RING_SIZE 1024 typedef uint32_t XENSTORE_RING_IDX; #define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE-1)) struct xenstore_domain_interface { char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */ char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */ XENSTORE_RING_IDX req_cons, req_prod; XENSTORE_RING_IDX rsp_cons, rsp_prod; }; /* Violating this is very bad. See docs/misc/xenstore.txt. */ #define XENSTORE_PAYLOAD_MAX 4096 /* Violating these just gets you an error back */ #define XENSTORE_ABS_PATH_MAX 3072 #define XENSTORE_REL_PATH_MAX 2048 #endif /* _XS_WIRE_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/pciif.h0000664000175000017500000000750412307313555016016 0ustar smbsmb/* * PCI Backend/Frontend Common Data Structures & Macros * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Author: Ryan Wilson */ #ifndef __XEN_PCI_COMMON_H__ #define __XEN_PCI_COMMON_H__ /* Be sure to bump this number if you change this file */ #define XEN_PCI_MAGIC "7" /* xen_pci_sharedinfo flags */ #define _XEN_PCIF_active (0) #define XEN_PCIF_active (1<<_XEN_PCIF_active) #define _XEN_PCIB_AERHANDLER (1) #define XEN_PCIB_AERHANDLER (1<<_XEN_PCIB_AERHANDLER) #define _XEN_PCIB_active (2) #define XEN_PCIB_active (1<<_XEN_PCIB_active) /* xen_pci_op commands */ #define XEN_PCI_OP_conf_read (0) #define XEN_PCI_OP_conf_write (1) #define XEN_PCI_OP_enable_msi (2) #define XEN_PCI_OP_disable_msi (3) #define XEN_PCI_OP_enable_msix (4) #define XEN_PCI_OP_disable_msix (5) #define XEN_PCI_OP_aer_detected (6) #define XEN_PCI_OP_aer_resume (7) #define XEN_PCI_OP_aer_mmio (8) #define XEN_PCI_OP_aer_slotreset (9) #define XEN_PCI_OP_enable_multi_msi (10) /* xen_pci_op error numbers */ #define XEN_PCI_ERR_success (0) #define XEN_PCI_ERR_dev_not_found (-1) #define XEN_PCI_ERR_invalid_offset (-2) #define XEN_PCI_ERR_access_denied (-3) #define XEN_PCI_ERR_not_implemented (-4) /* XEN_PCI_ERR_op_failed - backend failed to complete the operation */ #define XEN_PCI_ERR_op_failed (-5) /* * it should be PAGE_SIZE-sizeof(struct xen_pci_op))/sizeof(struct msix_entry)) * Should not exceed 128 */ #define SH_INFO_MAX_VEC 128 struct xen_msix_entry { uint16_t vector; uint16_t entry; }; struct xen_pci_op { /* IN: what action to perform: XEN_PCI_OP_* */ uint32_t cmd; /* OUT: will contain an error number (if any) from errno.h */ int32_t err; /* IN: which device to touch */ uint32_t domain; /* PCI Domain/Segment */ uint32_t bus; uint32_t devfn; /* IN: which configuration registers to touch */ int32_t offset; int32_t size; /* IN/OUT: Contains the result after a READ or the value to WRITE */ uint32_t value; /* IN: Contains extra infor for this operation */ uint32_t info; /*IN: param for msi-x */ struct xen_msix_entry msix_entries[SH_INFO_MAX_VEC]; }; /*used for pcie aer handling*/ struct xen_pcie_aer_op { /* IN: what action to perform: XEN_PCI_OP_* */ uint32_t cmd; /*IN/OUT: return aer_op result or carry error_detected state as input*/ int32_t err; /* IN: which device to touch */ uint32_t domain; /* PCI Domain/Segment*/ uint32_t bus; uint32_t devfn; }; struct xen_pci_sharedinfo { /* flags - XEN_PCIF_* */ uint32_t flags; struct xen_pci_op op; struct xen_pcie_aer_op aer_op; }; #endif /* __XEN_PCI_COMMON_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/usbif.h0000664000175000017500000001050612307313555016030 0ustar smbsmb/* * usbif.h * * USB I/O interface for Xen guest OSes. * * Copyright (C) 2009, FUJITSU LABORATORIES LTD. 
* Author: Noboru Iwamatsu * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_PUBLIC_IO_USBIF_H__ #define __XEN_PUBLIC_IO_USBIF_H__ #include "ring.h" #include "../grant_table.h" enum usb_spec_version { USB_VER_UNKNOWN = 0, USB_VER_USB11, USB_VER_USB20, USB_VER_USB30, /* not supported yet */ }; /* * USB pipe in usbif_request * * bits 0-5 are specific bits for virtual USB driver. * bits 7-31 are standard urb pipe. * * - port number(NEW): bits 0-4 * (USB_MAXCHILDREN is 31) * * - operation flag(NEW): bit 5 * (0 = submit urb, * 1 = unlink urb) * * - direction: bit 7 * (0 = Host-to-Device [Out] * 1 = Device-to-Host [In]) * * - device address: bits 8-14 * * - endpoint: bits 15-18 * * - pipe type: bits 30-31 * (00 = isochronous, 01 = interrupt, * 10 = control, 11 = bulk) */ #define usbif_pipeportnum(pipe) ((pipe) & 0x1f) #define usbif_setportnum_pipe(pipe, portnum) \ ((pipe)|(portnum)) #define usbif_pipeunlink(pipe) ((pipe) & 0x20) #define usbif_pipesubmit(pipe) (!usbif_pipeunlink(pipe)) #define usbif_setunlink_pipe(pipe) ((pipe)|(0x20)) #define USBIF_MAX_SEGMENTS_PER_REQUEST (16) /* * RING for transferring urbs. 
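 *
 * Purely as an illustration (the numbers below are made up), a front end
 * submitting a bulk IN transfer for device address 3, endpoint 2, attached
 * through port 1, could encode the pipe field of a request placed on this
 * ring using the bit layout and macros documented above:
 *
 *     uint32_t pipe = (3u << 30)     /* pipe type: bulk (11b)        */
 *                   | (1u << 7)      /* direction: Device-to-Host    */
 *                   | (3u << 8)      /* device address 3             */
 *                   | (2u << 15);    /* endpoint 2                   */
 *     pipe = usbif_setportnum_pipe(pipe, 1);  /* port number 1       */
 *     /* bit 5 is clear, so usbif_pipesubmit(pipe) evaluates true    */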
*/ struct usbif_request_segment { grant_ref_t gref; uint16_t offset; uint16_t length; }; struct usbif_urb_request { uint16_t id; /* request id */ uint16_t nr_buffer_segs; /* number of urb->transfer_buffer segments */ /* basic urb parameter */ uint32_t pipe; uint16_t transfer_flags; uint16_t buffer_length; union { uint8_t ctrl[8]; /* setup_packet (Ctrl) */ struct { uint16_t interval; /* maximum (1024*8) in usb core */ uint16_t start_frame; /* start frame */ uint16_t number_of_packets; /* number of ISO packet */ uint16_t nr_frame_desc_segs; /* number of iso_frame_desc segments */ } isoc; struct { uint16_t interval; /* maximum (1024*8) in usb core */ uint16_t pad[3]; } intr; struct { uint16_t unlink_id; /* unlink request id */ uint16_t pad[3]; } unlink; } u; /* urb data segments */ struct usbif_request_segment seg[USBIF_MAX_SEGMENTS_PER_REQUEST]; }; typedef struct usbif_urb_request usbif_urb_request_t; struct usbif_urb_response { uint16_t id; /* request id */ uint16_t start_frame; /* start frame (ISO) */ int32_t status; /* status (non-ISO) */ int32_t actual_length; /* actual transfer length */ int32_t error_count; /* number of ISO errors */ }; typedef struct usbif_urb_response usbif_urb_response_t; DEFINE_RING_TYPES(usbif_urb, struct usbif_urb_request, struct usbif_urb_response); #define USB_URB_RING_SIZE __CONST_RING_SIZE(usbif_urb, PAGE_SIZE) /* * RING for notifying connect/disconnect events to frontend */ struct usbif_conn_request { uint16_t id; }; typedef struct usbif_conn_request usbif_conn_request_t; struct usbif_conn_response { uint16_t id; /* request id */ uint8_t portnum; /* port number */ uint8_t speed; /* usb_device_speed */ }; typedef struct usbif_conn_response usbif_conn_response_t; DEFINE_RING_TYPES(usbif_conn, struct usbif_conn_request, struct usbif_conn_response); #define USB_CONN_RING_SIZE __CONST_RING_SIZE(usbif_conn, PAGE_SIZE) #endif /* __XEN_PUBLIC_IO_USBIF_H__ */ xen-4.4.0/xen/include/public/io/fbif.h0000664000175000017500000001270612307313555015632 0ustar smbsmb/* * fbif.h -- Xen virtual frame buffer device * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (C) 2005 Anthony Liguori * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster */ #ifndef __XEN_PUBLIC_IO_FBIF_H__ #define __XEN_PUBLIC_IO_FBIF_H__ /* Out events (frontend -> backend) */ /* * Out events may be sent only when requested by backend, and receipt * of an unknown out event is an error. 
*/ /* Event type 1 currently not used */ /* * Framebuffer update notification event * Capable frontend sets feature-update in xenstore. * Backend requests it by setting request-update in xenstore. */ #define XENFB_TYPE_UPDATE 2 struct xenfb_update { uint8_t type; /* XENFB_TYPE_UPDATE */ int32_t x; /* source x */ int32_t y; /* source y */ int32_t width; /* rect width */ int32_t height; /* rect height */ }; /* * Framebuffer resize notification event * Capable backend sets feature-resize in xenstore. */ #define XENFB_TYPE_RESIZE 3 struct xenfb_resize { uint8_t type; /* XENFB_TYPE_RESIZE */ int32_t width; /* width in pixels */ int32_t height; /* height in pixels */ int32_t stride; /* stride in bytes */ int32_t depth; /* depth in bits */ int32_t offset; /* offset of the framebuffer in bytes */ }; #define XENFB_OUT_EVENT_SIZE 40 union xenfb_out_event { uint8_t type; struct xenfb_update update; struct xenfb_resize resize; char pad[XENFB_OUT_EVENT_SIZE]; }; /* In events (backend -> frontend) */ /* * Frontends should ignore unknown in events. */ /* * Framebuffer refresh period advice * Backend sends it to advise the frontend their preferred period of * refresh. Frontends that keep the framebuffer constantly up-to-date * just ignore it. Frontends that use the advice should immediately * refresh the framebuffer (and send an update notification event if * those have been requested), then use the update frequency to guide * their periodical refreshs. */ #define XENFB_TYPE_REFRESH_PERIOD 1 #define XENFB_NO_REFRESH 0 struct xenfb_refresh_period { uint8_t type; /* XENFB_TYPE_UPDATE_PERIOD */ uint32_t period; /* period of refresh, in ms, * XENFB_NO_REFRESH if no refresh is needed */ }; #define XENFB_IN_EVENT_SIZE 40 union xenfb_in_event { uint8_t type; struct xenfb_refresh_period refresh_period; char pad[XENFB_IN_EVENT_SIZE]; }; /* shared page */ #define XENFB_IN_RING_SIZE 1024 #define XENFB_IN_RING_LEN (XENFB_IN_RING_SIZE / XENFB_IN_EVENT_SIZE) #define XENFB_IN_RING_OFFS 1024 #define XENFB_IN_RING(page) \ ((union xenfb_in_event *)((char *)(page) + XENFB_IN_RING_OFFS)) #define XENFB_IN_RING_REF(page, idx) \ (XENFB_IN_RING((page))[(idx) % XENFB_IN_RING_LEN]) #define XENFB_OUT_RING_SIZE 2048 #define XENFB_OUT_RING_LEN (XENFB_OUT_RING_SIZE / XENFB_OUT_EVENT_SIZE) #define XENFB_OUT_RING_OFFS (XENFB_IN_RING_OFFS + XENFB_IN_RING_SIZE) #define XENFB_OUT_RING(page) \ ((union xenfb_out_event *)((char *)(page) + XENFB_OUT_RING_OFFS)) #define XENFB_OUT_RING_REF(page, idx) \ (XENFB_OUT_RING((page))[(idx) % XENFB_OUT_RING_LEN]) struct xenfb_page { uint32_t in_cons, in_prod; uint32_t out_cons, out_prod; int32_t width; /* the width of the framebuffer (in pixels) */ int32_t height; /* the height of the framebuffer (in pixels) */ uint32_t line_length; /* the length of a row of pixels (in bytes) */ uint32_t mem_length; /* the length of the framebuffer (in bytes) */ uint8_t depth; /* the depth of a pixel (in bits) */ /* * Framebuffer page directory * * Each directory page holds PAGE_SIZE / sizeof(*pd) * framebuffer pages, and can thus map up to PAGE_SIZE * * PAGE_SIZE / sizeof(*pd) bytes. With PAGE_SIZE == 4096 and * sizeof(unsigned long) == 4/8, that's 4 Megs 32 bit and 2 Megs * 64 bit. 256 directories give enough room for a 512 Meg * framebuffer with a max resolution of 12,800x10,240. Should * be enough for a while with room leftover for expansion. */ unsigned long pd[256]; }; /* * Wart: xenkbd needs to know default resolution. Put it here until a * better solution is found, but don't leak it to the backend. 
*/ #ifdef __KERNEL__ #define XENFB_WIDTH 800 #define XENFB_HEIGHT 600 #define XENFB_DEPTH 32 #endif #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/kbdif.h0000664000175000017500000000762512307313555016007 0ustar smbsmb/* * kbdif.h -- Xen virtual keyboard/mouse * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (C) 2005 Anthony Liguori * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster */ #ifndef __XEN_PUBLIC_IO_KBDIF_H__ #define __XEN_PUBLIC_IO_KBDIF_H__ /* In events (backend -> frontend) */ /* * Frontends should ignore unknown in events. */ /* Pointer movement event */ #define XENKBD_TYPE_MOTION 1 /* Event type 2 currently not used */ /* Key event (includes pointer buttons) */ #define XENKBD_TYPE_KEY 3 /* * Pointer position event * Capable backend sets feature-abs-pointer in xenstore. * Frontend requests ot instead of XENKBD_TYPE_MOTION by setting * request-abs-update in xenstore. */ #define XENKBD_TYPE_POS 4 struct xenkbd_motion { uint8_t type; /* XENKBD_TYPE_MOTION */ int32_t rel_x; /* relative X motion */ int32_t rel_y; /* relative Y motion */ int32_t rel_z; /* relative Z motion (wheel) */ }; struct xenkbd_key { uint8_t type; /* XENKBD_TYPE_KEY */ uint8_t pressed; /* 1 if pressed; 0 otherwise */ uint32_t keycode; /* KEY_* from linux/input.h */ }; struct xenkbd_position { uint8_t type; /* XENKBD_TYPE_POS */ int32_t abs_x; /* absolute X position (in FB pixels) */ int32_t abs_y; /* absolute Y position (in FB pixels) */ int32_t rel_z; /* relative Z motion (wheel) */ }; #define XENKBD_IN_EVENT_SIZE 40 union xenkbd_in_event { uint8_t type; struct xenkbd_motion motion; struct xenkbd_key key; struct xenkbd_position pos; char pad[XENKBD_IN_EVENT_SIZE]; }; /* Out events (frontend -> backend) */ /* * Out events may be sent only when requested by backend, and receipt * of an unknown out event is an error. * No out events currently defined. 
*/ #define XENKBD_OUT_EVENT_SIZE 40 union xenkbd_out_event { uint8_t type; char pad[XENKBD_OUT_EVENT_SIZE]; }; /* shared page */ #define XENKBD_IN_RING_SIZE 2048 #define XENKBD_IN_RING_LEN (XENKBD_IN_RING_SIZE / XENKBD_IN_EVENT_SIZE) #define XENKBD_IN_RING_OFFS 1024 #define XENKBD_IN_RING(page) \ ((union xenkbd_in_event *)((char *)(page) + XENKBD_IN_RING_OFFS)) #define XENKBD_IN_RING_REF(page, idx) \ (XENKBD_IN_RING((page))[(idx) % XENKBD_IN_RING_LEN]) #define XENKBD_OUT_RING_SIZE 1024 #define XENKBD_OUT_RING_LEN (XENKBD_OUT_RING_SIZE / XENKBD_OUT_EVENT_SIZE) #define XENKBD_OUT_RING_OFFS (XENKBD_IN_RING_OFFS + XENKBD_IN_RING_SIZE) #define XENKBD_OUT_RING(page) \ ((union xenkbd_out_event *)((char *)(page) + XENKBD_OUT_RING_OFFS)) #define XENKBD_OUT_RING_REF(page, idx) \ (XENKBD_OUT_RING((page))[(idx) % XENKBD_OUT_RING_LEN]) struct xenkbd_page { uint32_t in_cons, in_prod; uint32_t out_cons, out_prod; }; #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/tpmif.h0000664000175000017500000001302612307313555016037 0ustar smbsmb/****************************************************************************** * tpmif.h * * TPM I/O interface for Xen guest OSes. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2005, IBM Corporation * * Author: Stefan Berger, stefanb@us.ibm.com * Grant table support: Mahadevan Gomathisankaran * * This code has been derived from tools/libxc/xen/io/netif.h * * Copyright (c) 2003-2004, Keir Fraser */ #ifndef __XEN_PUBLIC_IO_TPMIF_H__ #define __XEN_PUBLIC_IO_TPMIF_H__ #include "../grant_table.h" struct tpmif_tx_request { unsigned long addr; /* Machine address of packet. */ grant_ref_t ref; /* grant table access reference */ uint16_t unused; uint16_t size; /* Packet size in bytes. */ }; typedef struct tpmif_tx_request tpmif_tx_request_t; /* * The TPMIF_TX_RING_SIZE defines the number of pages the * front-end and backend can exchange (= size of array). */ typedef uint32_t TPMIF_RING_IDX; #define TPMIF_TX_RING_SIZE 1 /* This structure must fit in a memory page. 
*/ struct tpmif_ring { struct tpmif_tx_request req; }; typedef struct tpmif_ring tpmif_ring_t; struct tpmif_tx_interface { struct tpmif_ring ring[TPMIF_TX_RING_SIZE]; }; typedef struct tpmif_tx_interface tpmif_tx_interface_t; /****************************************************************************** * TPM I/O interface for Xen guest OSes, v2 * * Author: Daniel De Graaf * * This protocol emulates the request/response behavior of a TPM using a Xen * shared memory interface. All interaction with the TPM is at the direction * of the frontend, since a TPM (hardware or virtual) is a passive device - * the backend only processes commands as requested by the frontend. * * The frontend sends a request to the TPM by populating the shared page with * the request packet, changing the state to TPMIF_STATE_SUBMIT, and sending * and event channel notification. When the backend is finished, it will set * the state to TPMIF_STATE_FINISH and send an event channel notification. * * In order to allow long-running commands to be canceled, the frontend can * at any time change the state to TPMIF_STATE_CANCEL and send a notification. * The TPM can either finish the command (changing state to TPMIF_STATE_FINISH) * or can cancel the command and change the state to TPMIF_STATE_IDLE. The TPM * can also change the state to TPMIF_STATE_IDLE instead of TPMIF_STATE_FINISH * if another reason for cancellation is required - for example, a physical * TPM may cancel a command if the interface is seized by another locality. * * The TPM command format is defined by the TCG, and is available at * http://www.trustedcomputinggroup.org/resources/tpm_main_specification */ enum tpmif_state { TPMIF_STATE_IDLE, /* no contents / vTPM idle / cancel complete */ TPMIF_STATE_SUBMIT, /* request ready / vTPM working */ TPMIF_STATE_FINISH, /* response ready / vTPM idle */ TPMIF_STATE_CANCEL, /* cancel requested / vTPM working */ }; /* Note: The backend should only change state to IDLE or FINISH, while the * frontend should only change to SUBMIT or CANCEL. Status changes do not need * to use atomic operations. */ /* The shared page for vTPM request/response packets looks like: * * Offset Contents * ================================================= * 0 struct tpmif_shared_page * 16 [optional] List of grant IDs * 16+4*nr_extra_pages TPM packet data * * If the TPM packet data extends beyond the end of a single page, the grant IDs * defined in extra_pages are used as if they were mapped immediately following * the primary shared page. The grants are allocated by the frontend and mapped * by the backend. Before sending a request spanning multiple pages, the * frontend should verify that the TPM supports such large requests by querying * the TPM_CAP_PROP_INPUT_BUFFER property from the TPM. 
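 *
 * (Illustrative sketch, not part of the original interface: with "shr"
 *  pointing at the mapped shared page and "cmd"/"cmd_len" standing in
 *  for a request buffer, a frontend whose request fits in the primary
 *  page submits it roughly as
 *
 *      uint8_t *data = (uint8_t *)shr + 16 + 4 * shr->nr_extra_pages;
 *      memcpy(data, cmd, cmd_len);
 *      shr->length = cmd_len;
 *      [write barrier so the packet is visible before the state change]
 *      shr->state = TPMIF_STATE_SUBMIT;
 *      [notify the backend over the event channel]
 *
 *  then waits for the state to reach TPMIF_STATE_FINISH and reads the
 *  response back from the same offset, again honouring shr->length.)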
*/ struct tpmif_shared_page { uint32_t length; /* request/response length in bytes */ uint8_t state; /* enum tpmif_state */ uint8_t locality; /* for the current request */ uint8_t pad; /* should be zero */ uint8_t nr_extra_pages; /* extra pages for long packets; may be zero */ uint32_t extra_pages[0]; /* grant IDs; length is actually nr_extra_pages */ }; typedef struct tpmif_shared_page tpmif_shared_page_t; #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/io/netif.h0000664000175000017500000002216312307313555016027 0ustar smbsmb/****************************************************************************** * netif.h * * Unified network-device I/O interface for Xen guest OSes. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2003-2004, Keir Fraser */ #ifndef __XEN_PUBLIC_IO_NETIF_H__ #define __XEN_PUBLIC_IO_NETIF_H__ #include "ring.h" #include "../grant_table.h" /* * Older implementation of Xen network frontend / backend has an * implicit dependency on the MAX_SKB_FRAGS as the maximum number of * ring slots a skb can use. Netfront / netback may not work as * expected when frontend and backend have different MAX_SKB_FRAGS. * * A better approach is to add mechanism for netfront / netback to * negotiate this value. However we cannot fix all possible * frontends, so we need to define a value which states the minimum * slots backend must support. * * The minimum value derives from older Linux kernel's MAX_SKB_FRAGS * (18), which is proved to work with most frontends. Any new backend * which doesn't negotiate with frontend should expect frontend to * send a valid packet using slots up to this value. */ #define XEN_NETIF_NR_SLOTS_MIN 18 /* * Notifications after enqueuing any type of message should be conditional on * the appropriate req_event or rsp_event field in the shared ring. * If the client sends notification for rx requests then it should specify * feature 'feature-rx-notify' via xenbus. Otherwise the backend will assume * that it cannot safely queue packets (as it may not be kicked to send them). */ /* * "feature-split-event-channels" is introduced to separate guest TX * and RX notification. Backend either doesn't support this feature or * advertises it via xenstore as 0 (disabled) or 1 (enabled). * * To make use of this feature, frontend should allocate two event * channels for TX and RX, advertise them to backend as * "event-channel-tx" and "event-channel-rx" respectively. 
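 *
 * (Illustrative example, not part of the original interface: with split
 *  event channels in use, a frontend's xenstore directory might end up
 *  with nodes along the lines of
 *
 *      event-channel-tx = "<port of the TX event channel>"
 *      event-channel-rx = "<port of the RX event channel>"
 *
 *  next to its ring references, and the backend then delivers TX and RX
 *  notifications on the two ports independently.)
 *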
If frontend * doesn't want to use this feature, it just writes "event-channel" * node as before. */ /* * "feature-no-csum-offload" should be used to turn IPv4 TCP/UDP checksum * offload off or on. If it is missing then the feature is assumed to be on. * "feature-ipv6-csum-offload" should be used to turn IPv6 TCP/UDP checksum * offload on or off. If it is missing then the feature is assumed to be off. */ /* * "feature-gso-tcpv4" and "feature-gso-tcpv6" advertise the capability to * handle large TCP packets (in IPv4 or IPv6 form respectively). Neither * frontends nor backends are assumed to be capable unless the flags are * present. */ /* * This is the 'wire' format for packets: * Request 1: netif_tx_request -- NETTXF_* (any flags) * [Request 2: netif_tx_extra] (only if request 1 has NETTXF_extra_info) * [Request 3: netif_tx_extra] (only if request 2 has XEN_NETIF_EXTRA_MORE) * Request 4: netif_tx_request -- NETTXF_more_data * Request 5: netif_tx_request -- NETTXF_more_data * ... * Request N: netif_tx_request -- 0 */ /* Protocol checksum field is blank in the packet (hardware offload)? */ #define _NETTXF_csum_blank (0) #define NETTXF_csum_blank (1U<<_NETTXF_csum_blank) /* Packet data has been validated against protocol checksum. */ #define _NETTXF_data_validated (1) #define NETTXF_data_validated (1U<<_NETTXF_data_validated) /* Packet continues in the next request descriptor. */ #define _NETTXF_more_data (2) #define NETTXF_more_data (1U<<_NETTXF_more_data) /* Packet to be followed by extra descriptor(s). */ #define _NETTXF_extra_info (3) #define NETTXF_extra_info (1U<<_NETTXF_extra_info) #define XEN_NETIF_MAX_TX_SIZE 0xFFFF struct netif_tx_request { grant_ref_t gref; /* Reference to buffer page */ uint16_t offset; /* Offset within buffer page */ uint16_t flags; /* NETTXF_* */ uint16_t id; /* Echoed in response message. */ uint16_t size; /* Packet size in bytes. */ }; typedef struct netif_tx_request netif_tx_request_t; /* Types of netif_extra_info descriptors. */ #define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */ #define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */ #define XEN_NETIF_EXTRA_TYPE_MCAST_ADD (2) /* u.mcast */ #define XEN_NETIF_EXTRA_TYPE_MCAST_DEL (3) /* u.mcast */ #define XEN_NETIF_EXTRA_TYPE_MAX (4) /* netif_extra_info flags. */ #define _XEN_NETIF_EXTRA_FLAG_MORE (0) #define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE) /* GSO types */ #define XEN_NETIF_GSO_TYPE_NONE (0) #define XEN_NETIF_GSO_TYPE_TCPV4 (1) #define XEN_NETIF_GSO_TYPE_TCPV6 (2) /* * This structure needs to fit within both netif_tx_request and * netif_rx_response for compatibility. */ struct netif_extra_info { uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */ uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */ union { /* * XEN_NETIF_EXTRA_TYPE_GSO: */ struct { /* * Maximum payload size of each segment. For example, for TCP this * is just the path MSS. */ uint16_t size; /* * GSO type. This determines the protocol of the packet and any * extra features required to segment the packet properly. */ uint8_t type; /* XEN_NETIF_GSO_TYPE_* */ /* Future expansion. */ uint8_t pad; /* * GSO features. This specifies any extra GSO features required * to process this packet, such as ECN support for TCPv4. */ uint16_t features; /* XEN_NETIF_GSO_FEAT_* */ } gso; /* * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}: * Backend advertises availability via 'feature-multicast-control' * xenbus node containing value '1'. * Frontend requests this feature by advertising * 'request-multicast-control' xenbus node containing value '1'. 
* If multicast control is requested then multicast flooding is * disabled and the frontend must explicitly register its interest * in multicast groups using dummy transmit requests containing * MCAST_{ADD,DEL} extra-info fragments. */ struct { uint8_t addr[6]; /* Address to add/remove. */ } mcast; uint16_t pad[3]; } u; }; typedef struct netif_extra_info netif_extra_info_t; struct netif_tx_response { uint16_t id; int16_t status; /* NETIF_RSP_* */ }; typedef struct netif_tx_response netif_tx_response_t; struct netif_rx_request { uint16_t id; /* Echoed in response message. */ grant_ref_t gref; /* Reference to incoming granted frame */ }; typedef struct netif_rx_request netif_rx_request_t; /* Packet data has been validated against protocol checksum. */ #define _NETRXF_data_validated (0) #define NETRXF_data_validated (1U<<_NETRXF_data_validated) /* Protocol checksum field is blank in the packet (hardware offload)? */ #define _NETRXF_csum_blank (1) #define NETRXF_csum_blank (1U<<_NETRXF_csum_blank) /* Packet continues in the next request descriptor. */ #define _NETRXF_more_data (2) #define NETRXF_more_data (1U<<_NETRXF_more_data) /* Packet to be followed by extra descriptor(s). */ #define _NETRXF_extra_info (3) #define NETRXF_extra_info (1U<<_NETRXF_extra_info) struct netif_rx_response { uint16_t id; uint16_t offset; /* Offset in page of start of received packet */ uint16_t flags; /* NETRXF_* */ int16_t status; /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */ }; typedef struct netif_rx_response netif_rx_response_t; /* * Generate netif ring structures and types. */ DEFINE_RING_TYPES(netif_tx, struct netif_tx_request, struct netif_tx_response); DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response); #define NETIF_RSP_DROPPED -2 #define NETIF_RSP_ERROR -1 #define NETIF_RSP_OKAY 0 /* No response: used for auxiliary requests (e.g., netif_tx_extra). */ #define NETIF_RSP_NULL 1 #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/nmi.h0000664000175000017500000000566712307313555015110 0ustar smbsmb/****************************************************************************** * nmi.h * * NMI callback registration and reason codes. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2005, Keir Fraser */ #ifndef __XEN_PUBLIC_NMI_H__ #define __XEN_PUBLIC_NMI_H__ #include "xen.h" /* * NMI reason codes: * Currently these are x86-specific, stored in arch_shared_info.nmi_reason. 
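 *
 * (Illustrative sketch, not part of the original header: a dom0 kernel's
 *  NMI callback typically tests the reason bits defined below, e.g.
 *
 *      unsigned long reason = shared_info->arch.nmi_reason;
 *      if ( reason & XEN_NMIREASON_io_error )
 *          [handle the I/O check error]
 *      else if ( reason & XEN_NMIREASON_pci_serr )
 *          [handle the PCI SERR]
 *      else if ( reason & XEN_NMIREASON_unknown )
 *          [handle an unknown NMI]
 *
 *  where the handlers are placeholders and the exact location of
 *  nmi_reason inside arch_shared_info is defined by the architecture
 *  headers, not here.)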
*/ /* I/O-check error reported via ISA port 0x61, bit 6. */ #define _XEN_NMIREASON_io_error 0 #define XEN_NMIREASON_io_error (1UL << _XEN_NMIREASON_io_error) /* PCI SERR reported via ISA port 0x61, bit 7. */ #define _XEN_NMIREASON_pci_serr 1 #define XEN_NMIREASON_pci_serr (1UL << _XEN_NMIREASON_pci_serr) #if __XEN_INTERFACE_VERSION__ < 0x00040300 /* legacy alias of the above */ /* Parity error reported via ISA port 0x61, bit 7. */ #define _XEN_NMIREASON_parity_error 1 #define XEN_NMIREASON_parity_error (1UL << _XEN_NMIREASON_parity_error) #endif /* Unknown hardware-generated NMI. */ #define _XEN_NMIREASON_unknown 2 #define XEN_NMIREASON_unknown (1UL << _XEN_NMIREASON_unknown) /* * long nmi_op(unsigned int cmd, void *arg) * NB. All ops return zero on success, else a negative error code. */ /* * Register NMI callback for this (calling) VCPU. Currently this only makes * sense for domain 0, vcpu 0. All other callers will be returned EINVAL. * arg == pointer to xennmi_callback structure. */ #define XENNMI_register_callback 0 struct xennmi_callback { unsigned long handler_address; unsigned long pad; }; typedef struct xennmi_callback xennmi_callback_t; DEFINE_XEN_GUEST_HANDLE(xennmi_callback_t); /* * Deregister NMI callback for this (calling) VCPU. * arg == NULL. */ #define XENNMI_unregister_callback 1 #endif /* __XEN_PUBLIC_NMI_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/domctl.h0000664000175000017500000011257312307313555015602 0ustar smbsmb/****************************************************************************** * domctl.h * * Domain management operations. For use by node control stack. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2002-2003, B Dragovic * Copyright (c) 2002-2006, K Fraser */ #ifndef __XEN_PUBLIC_DOMCTL_H__ #define __XEN_PUBLIC_DOMCTL_H__ #if !defined(__XEN__) && !defined(__XEN_TOOLS__) #error "domctl operations are intended for use by node control tools only" #endif #include "xen.h" #include "grant_table.h" #include "hvm/save.h" #define XEN_DOMCTL_INTERFACE_VERSION 0x00000009 /* * NB. xen_domctl.domain is an IN/OUT parameter for this operation. * If it is specified as zero, an id is auto-allocated and returned. */ /* XEN_DOMCTL_createdomain */ struct xen_domctl_createdomain { /* IN parameters */ uint32_t ssidref; xen_domain_handle_t handle; /* Is this an HVM guest (as opposed to a PVH or PV guest)? 
*/ #define _XEN_DOMCTL_CDF_hvm_guest 0 #define XEN_DOMCTL_CDF_hvm_guest (1U<<_XEN_DOMCTL_CDF_hvm_guest) /* Use hardware-assisted paging if available? */ #define _XEN_DOMCTL_CDF_hap 1 #define XEN_DOMCTL_CDF_hap (1U<<_XEN_DOMCTL_CDF_hap) /* Should domain memory integrity be verifed by tboot during Sx? */ #define _XEN_DOMCTL_CDF_s3_integrity 2 #define XEN_DOMCTL_CDF_s3_integrity (1U<<_XEN_DOMCTL_CDF_s3_integrity) /* Disable out-of-sync shadow page tables? */ #define _XEN_DOMCTL_CDF_oos_off 3 #define XEN_DOMCTL_CDF_oos_off (1U<<_XEN_DOMCTL_CDF_oos_off) /* Is this a PVH guest (as opposed to an HVM or PV guest)? */ #define _XEN_DOMCTL_CDF_pvh_guest 4 #define XEN_DOMCTL_CDF_pvh_guest (1U<<_XEN_DOMCTL_CDF_pvh_guest) uint32_t flags; }; typedef struct xen_domctl_createdomain xen_domctl_createdomain_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t); /* XEN_DOMCTL_getdomaininfo */ struct xen_domctl_getdomaininfo { /* OUT variables. */ domid_t domain; /* Also echoed in domctl.domain */ /* Domain is scheduled to die. */ #define _XEN_DOMINF_dying 0 #define XEN_DOMINF_dying (1U<<_XEN_DOMINF_dying) /* Domain is an HVM guest (as opposed to a PV guest). */ #define _XEN_DOMINF_hvm_guest 1 #define XEN_DOMINF_hvm_guest (1U<<_XEN_DOMINF_hvm_guest) /* The guest OS has shut down. */ #define _XEN_DOMINF_shutdown 2 #define XEN_DOMINF_shutdown (1U<<_XEN_DOMINF_shutdown) /* Currently paused by control software. */ #define _XEN_DOMINF_paused 3 #define XEN_DOMINF_paused (1U<<_XEN_DOMINF_paused) /* Currently blocked pending an event. */ #define _XEN_DOMINF_blocked 4 #define XEN_DOMINF_blocked (1U<<_XEN_DOMINF_blocked) /* Domain is currently running. */ #define _XEN_DOMINF_running 5 #define XEN_DOMINF_running (1U<<_XEN_DOMINF_running) /* Being debugged. */ #define _XEN_DOMINF_debugged 6 #define XEN_DOMINF_debugged (1U<<_XEN_DOMINF_debugged) /* domain is PVH */ #define _XEN_DOMINF_pvh_guest 7 #define XEN_DOMINF_pvh_guest (1U<<_XEN_DOMINF_pvh_guest) /* XEN_DOMINF_shutdown guest-supplied code. */ #define XEN_DOMINF_shutdownmask 255 #define XEN_DOMINF_shutdownshift 16 uint32_t flags; /* XEN_DOMINF_* */ uint64_aligned_t tot_pages; uint64_aligned_t max_pages; uint64_aligned_t outstanding_pages; uint64_aligned_t shr_pages; uint64_aligned_t paged_pages; uint64_aligned_t shared_info_frame; /* GMFN of shared_info struct */ uint64_aligned_t cpu_time; uint32_t nr_online_vcpus; /* Number of VCPUs currently online. */ uint32_t max_vcpu_id; /* Maximum VCPUID in use by this domain. */ uint32_t ssidref; xen_domain_handle_t handle; uint32_t cpupool; }; typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t); /* XEN_DOMCTL_getmemlist */ struct xen_domctl_getmemlist { /* IN variables. */ /* Max entries to write to output buffer. */ uint64_aligned_t max_pfns; /* Start index in guest's page list. */ uint64_aligned_t start_pfn; XEN_GUEST_HANDLE_64(uint64) buffer; /* OUT variables. 
*/ uint64_aligned_t num_pfns; }; typedef struct xen_domctl_getmemlist xen_domctl_getmemlist_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_getmemlist_t); /* XEN_DOMCTL_getpageframeinfo */ #define XEN_DOMCTL_PFINFO_LTAB_SHIFT 28 #define XEN_DOMCTL_PFINFO_NOTAB (0x0U<<28) #define XEN_DOMCTL_PFINFO_L1TAB (0x1U<<28) #define XEN_DOMCTL_PFINFO_L2TAB (0x2U<<28) #define XEN_DOMCTL_PFINFO_L3TAB (0x3U<<28) #define XEN_DOMCTL_PFINFO_L4TAB (0x4U<<28) #define XEN_DOMCTL_PFINFO_LTABTYPE_MASK (0x7U<<28) #define XEN_DOMCTL_PFINFO_LPINTAB (0x1U<<31) #define XEN_DOMCTL_PFINFO_XTAB (0xfU<<28) /* invalid page */ #define XEN_DOMCTL_PFINFO_XALLOC (0xeU<<28) /* allocate-only page */ #define XEN_DOMCTL_PFINFO_BROKEN (0xdU<<28) /* broken page */ #define XEN_DOMCTL_PFINFO_LTAB_MASK (0xfU<<28) struct xen_domctl_getpageframeinfo { /* IN variables. */ uint64_aligned_t gmfn; /* GMFN to query */ /* OUT variables. */ /* Is the page PINNED to a type? */ uint32_t type; /* see above type defs */ }; typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t); /* XEN_DOMCTL_getpageframeinfo2 */ struct xen_domctl_getpageframeinfo2 { /* IN variables. */ uint64_aligned_t num; /* IN/OUT variables. */ XEN_GUEST_HANDLE_64(uint32) array; }; typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t); /* XEN_DOMCTL_getpageframeinfo3 */ struct xen_domctl_getpageframeinfo3 { /* IN variables. */ uint64_aligned_t num; /* IN/OUT variables. */ XEN_GUEST_HANDLE_64(xen_pfn_t) array; }; /* * Control shadow pagetables operation */ /* XEN_DOMCTL_shadow_op */ /* Disable shadow mode. */ #define XEN_DOMCTL_SHADOW_OP_OFF 0 /* Enable shadow mode (mode contains ORed XEN_DOMCTL_SHADOW_ENABLE_* flags). */ #define XEN_DOMCTL_SHADOW_OP_ENABLE 32 /* Log-dirty bitmap operations. */ /* Return the bitmap and clean internal copy for next round. */ #define XEN_DOMCTL_SHADOW_OP_CLEAN 11 /* Return the bitmap but do not modify internal copy. */ #define XEN_DOMCTL_SHADOW_OP_PEEK 12 /* Memory allocation accessors. */ #define XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION 30 #define XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION 31 /* Legacy enable operations. */ /* Equiv. to ENABLE with no mode flags. */ #define XEN_DOMCTL_SHADOW_OP_ENABLE_TEST 1 /* Equiv. to ENABLE with mode flag ENABLE_LOG_DIRTY. */ #define XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY 2 /* Equiv. to ENABLE with mode flags ENABLE_REFCOUNT and ENABLE_TRANSLATE. */ #define XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE 3 /* Mode flags for XEN_DOMCTL_SHADOW_OP_ENABLE. */ /* * Shadow pagetables are refcounted: guest does not use explicit mmu * operations nor write-protect its pagetables. */ #define XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT (1 << 1) /* * Log pages in a bitmap as they are dirtied. * Used for live relocation to determine which pages must be re-sent. */ #define XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY (1 << 2) /* * Automatically translate GPFNs into MFNs. */ #define XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE (1 << 3) /* * Xen does not steal virtual address space from the guest. * Requires HVM support. */ #define XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL (1 << 4) struct xen_domctl_shadow_op_stats { uint32_t fault_count; uint32_t dirty_count; }; typedef struct xen_domctl_shadow_op_stats xen_domctl_shadow_op_stats_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_stats_t); struct xen_domctl_shadow_op { /* IN variables. 
*/ uint32_t op; /* XEN_DOMCTL_SHADOW_OP_* */ /* OP_ENABLE */ uint32_t mode; /* XEN_DOMCTL_SHADOW_ENABLE_* */ /* OP_GET_ALLOCATION / OP_SET_ALLOCATION */ uint32_t mb; /* Shadow memory allocation in MB */ /* OP_PEEK / OP_CLEAN */ XEN_GUEST_HANDLE_64(uint8) dirty_bitmap; uint64_aligned_t pages; /* Size of buffer. Updated with actual size. */ struct xen_domctl_shadow_op_stats stats; }; typedef struct xen_domctl_shadow_op xen_domctl_shadow_op_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_t); /* XEN_DOMCTL_max_mem */ struct xen_domctl_max_mem { /* IN variables. */ uint64_aligned_t max_memkb; }; typedef struct xen_domctl_max_mem xen_domctl_max_mem_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_mem_t); /* XEN_DOMCTL_setvcpucontext */ /* XEN_DOMCTL_getvcpucontext */ struct xen_domctl_vcpucontext { uint32_t vcpu; /* IN */ XEN_GUEST_HANDLE_64(vcpu_guest_context_t) ctxt; /* IN/OUT */ }; typedef struct xen_domctl_vcpucontext xen_domctl_vcpucontext_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpucontext_t); /* XEN_DOMCTL_getvcpuinfo */ struct xen_domctl_getvcpuinfo { /* IN variables. */ uint32_t vcpu; /* OUT variables. */ uint8_t online; /* currently online (not hotplugged)? */ uint8_t blocked; /* blocked waiting for an event? */ uint8_t running; /* currently scheduled on its CPU? */ uint64_aligned_t cpu_time; /* total cpu time consumed (ns) */ uint32_t cpu; /* current mapping */ }; typedef struct xen_domctl_getvcpuinfo xen_domctl_getvcpuinfo_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t); /* Get/set the NUMA node(s) with which the guest has affinity with. */ /* XEN_DOMCTL_setnodeaffinity */ /* XEN_DOMCTL_getnodeaffinity */ struct xen_domctl_nodeaffinity { struct xenctl_bitmap nodemap;/* IN */ }; typedef struct xen_domctl_nodeaffinity xen_domctl_nodeaffinity_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_nodeaffinity_t); /* Get/set which physical cpus a vcpu can execute on. */ /* XEN_DOMCTL_setvcpuaffinity */ /* XEN_DOMCTL_getvcpuaffinity */ struct xen_domctl_vcpuaffinity { uint32_t vcpu; /* IN */ struct xenctl_bitmap cpumap; /* IN/OUT */ }; typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t); /* XEN_DOMCTL_max_vcpus */ struct xen_domctl_max_vcpus { uint32_t max; /* maximum number of vcpus */ }; typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t); /* XEN_DOMCTL_scheduler_op */ /* Scheduler types. */ #define XEN_SCHEDULER_SEDF 4 #define XEN_SCHEDULER_CREDIT 5 #define XEN_SCHEDULER_CREDIT2 6 #define XEN_SCHEDULER_ARINC653 7 /* Set or get info? 
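 *
 * (Illustrative sketch, not part of the original header: to set a
 *  domain's credit-scheduler parameters, a toolstack fills the domctl
 *  declared further below, with "domid" a placeholder and the weight/cap
 *  values purely exemplary:
 *
 *      struct xen_domctl d = { 0 };
 *      d.cmd               = XEN_DOMCTL_scheduler_op;
 *      d.interface_version = XEN_DOMCTL_INTERFACE_VERSION;
 *      d.domain            = domid;
 *      d.u.scheduler_op.sched_id = XEN_SCHEDULER_CREDIT;
 *      d.u.scheduler_op.cmd      = XEN_DOMCTL_SCHEDOP_putinfo;
 *      d.u.scheduler_op.u.credit.weight = 512;
 *      d.u.scheduler_op.u.credit.cap    = 0;
 *
 *  then passes it to the privileged domctl hypercall; getinfo works the
 *  same way, with the weight/cap fields read back instead.)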
*/ #define XEN_DOMCTL_SCHEDOP_putinfo 0 #define XEN_DOMCTL_SCHEDOP_getinfo 1 struct xen_domctl_scheduler_op { uint32_t sched_id; /* XEN_SCHEDULER_* */ uint32_t cmd; /* XEN_DOMCTL_SCHEDOP_* */ union { struct xen_domctl_sched_sedf { uint64_aligned_t period; uint64_aligned_t slice; uint64_aligned_t latency; uint32_t extratime; uint32_t weight; } sedf; struct xen_domctl_sched_credit { uint16_t weight; uint16_t cap; } credit; struct xen_domctl_sched_credit2 { uint16_t weight; } credit2; } u; }; typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_scheduler_op_t); /* XEN_DOMCTL_setdomainhandle */ struct xen_domctl_setdomainhandle { xen_domain_handle_t handle; }; typedef struct xen_domctl_setdomainhandle xen_domctl_setdomainhandle_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdomainhandle_t); /* XEN_DOMCTL_setdebugging */ struct xen_domctl_setdebugging { uint8_t enable; }; typedef struct xen_domctl_setdebugging xen_domctl_setdebugging_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdebugging_t); /* XEN_DOMCTL_irq_permission */ struct xen_domctl_irq_permission { uint8_t pirq; uint8_t allow_access; /* flag to specify enable/disable of IRQ access */ }; typedef struct xen_domctl_irq_permission xen_domctl_irq_permission_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_irq_permission_t); /* XEN_DOMCTL_iomem_permission */ struct xen_domctl_iomem_permission { uint64_aligned_t first_mfn;/* first page (physical page number) in range */ uint64_aligned_t nr_mfns; /* number of pages in range (>0) */ uint8_t allow_access; /* allow (!0) or deny (0) access to range? */ }; typedef struct xen_domctl_iomem_permission xen_domctl_iomem_permission_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_iomem_permission_t); /* XEN_DOMCTL_ioport_permission */ struct xen_domctl_ioport_permission { uint32_t first_port; /* first port int range */ uint32_t nr_ports; /* size of port range */ uint8_t allow_access; /* allow or deny access to range? 
*/ }; typedef struct xen_domctl_ioport_permission xen_domctl_ioport_permission_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_permission_t); /* XEN_DOMCTL_hypercall_init */ struct xen_domctl_hypercall_init { uint64_aligned_t gmfn; /* GMFN to be initialised */ }; typedef struct xen_domctl_hypercall_init xen_domctl_hypercall_init_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t); /* XEN_DOMCTL_arch_setup */ #define _XEN_DOMAINSETUP_hvm_guest 0 #define XEN_DOMAINSETUP_hvm_guest (1UL<<_XEN_DOMAINSETUP_hvm_guest) #define _XEN_DOMAINSETUP_query 1 /* Get parameters (for save) */ #define XEN_DOMAINSETUP_query (1UL<<_XEN_DOMAINSETUP_query) #define _XEN_DOMAINSETUP_sioemu_guest 2 #define XEN_DOMAINSETUP_sioemu_guest (1UL<<_XEN_DOMAINSETUP_sioemu_guest) typedef struct xen_domctl_arch_setup { uint64_aligned_t flags; /* XEN_DOMAINSETUP_* */ } xen_domctl_arch_setup_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_arch_setup_t); /* XEN_DOMCTL_settimeoffset */ struct xen_domctl_settimeoffset { int32_t time_offset_seconds; /* applied to domain wallclock time */ }; typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t); /* XEN_DOMCTL_gethvmcontext */ /* XEN_DOMCTL_sethvmcontext */ typedef struct xen_domctl_hvmcontext { uint32_t size; /* IN/OUT: size of buffer / bytes filled */ XEN_GUEST_HANDLE_64(uint8) buffer; /* IN/OUT: data, or call * gethvmcontext with NULL * buffer to get size req'd */ } xen_domctl_hvmcontext_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_t); /* XEN_DOMCTL_set_address_size */ /* XEN_DOMCTL_get_address_size */ typedef struct xen_domctl_address_size { uint32_t size; } xen_domctl_address_size_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_address_size_t); /* XEN_DOMCTL_real_mode_area */ struct xen_domctl_real_mode_area { uint32_t log; /* log2 of Real Mode Area size */ }; typedef struct xen_domctl_real_mode_area xen_domctl_real_mode_area_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_real_mode_area_t); /* XEN_DOMCTL_sendtrigger */ #define XEN_DOMCTL_SENDTRIGGER_NMI 0 #define XEN_DOMCTL_SENDTRIGGER_RESET 1 #define XEN_DOMCTL_SENDTRIGGER_INIT 2 #define XEN_DOMCTL_SENDTRIGGER_POWER 3 #define XEN_DOMCTL_SENDTRIGGER_SLEEP 4 struct xen_domctl_sendtrigger { uint32_t trigger; /* IN */ uint32_t vcpu; /* IN */ }; typedef struct xen_domctl_sendtrigger xen_domctl_sendtrigger_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_sendtrigger_t); /* Assign PCI device to HVM guest. Sets up IOMMU structures. */ /* XEN_DOMCTL_assign_device */ /* XEN_DOMCTL_test_assign_device */ /* XEN_DOMCTL_deassign_device */ struct xen_domctl_assign_device { uint32_t machine_sbdf; /* machine PCI ID of assigned device */ }; typedef struct xen_domctl_assign_device xen_domctl_assign_device_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_assign_device_t); /* Retrieve sibling devices infomation of machine_sbdf */ /* XEN_DOMCTL_get_device_group */ struct xen_domctl_get_device_group { uint32_t machine_sbdf; /* IN */ uint32_t max_sdevs; /* IN */ uint32_t num_sdevs; /* OUT */ XEN_GUEST_HANDLE_64(uint32) sdev_array; /* OUT */ }; typedef struct xen_domctl_get_device_group xen_domctl_get_device_group_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_get_device_group_t); /* Pass-through interrupts: bind real irq -> hvm devfn. 
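 *
 * (Illustrative sketch, not part of the original header: to route a host
 *  IRQ to an HVM guest's virtual PCI INTx line, a toolstack fills the
 *  structure declared below, with "machine_irq" a placeholder for the
 *  host IRQ and the bus/device/intx values purely exemplary:
 *
 *      struct xen_domctl_bind_pt_irq bind = { 0 };
 *      bind.machine_irq  = machine_irq;
 *      bind.irq_type     = PT_IRQ_TYPE_PCI;
 *      bind.u.pci.bus    = 0;
 *      bind.u.pci.device = 5;
 *      bind.u.pci.intx   = 0;
 *
 *  and submits it as the u.bind_pt_irq payload of a
 *  XEN_DOMCTL_bind_pt_irq domctl.)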
*/ /* XEN_DOMCTL_bind_pt_irq */ /* XEN_DOMCTL_unbind_pt_irq */ typedef enum pt_irq_type_e { PT_IRQ_TYPE_PCI, PT_IRQ_TYPE_ISA, PT_IRQ_TYPE_MSI, PT_IRQ_TYPE_MSI_TRANSLATE, } pt_irq_type_t; struct xen_domctl_bind_pt_irq { uint32_t machine_irq; pt_irq_type_t irq_type; uint32_t hvm_domid; union { struct { uint8_t isa_irq; } isa; struct { uint8_t bus; uint8_t device; uint8_t intx; } pci; struct { uint8_t gvec; uint32_t gflags; uint64_aligned_t gtable; } msi; } u; }; typedef struct xen_domctl_bind_pt_irq xen_domctl_bind_pt_irq_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_pt_irq_t); /* Bind machine I/O address range -> HVM address range. */ /* XEN_DOMCTL_memory_mapping */ #define DPCI_ADD_MAPPING 1 #define DPCI_REMOVE_MAPPING 0 struct xen_domctl_memory_mapping { uint64_aligned_t first_gfn; /* first page (hvm guest phys page) in range */ uint64_aligned_t first_mfn; /* first page (machine page) in range */ uint64_aligned_t nr_mfns; /* number of pages in range (>0) */ uint32_t add_mapping; /* add or remove mapping */ uint32_t padding; /* padding for 64-bit aligned structure */ }; typedef struct xen_domctl_memory_mapping xen_domctl_memory_mapping_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_memory_mapping_t); /* Bind machine I/O port range -> HVM I/O port range. */ /* XEN_DOMCTL_ioport_mapping */ struct xen_domctl_ioport_mapping { uint32_t first_gport; /* first guest IO port*/ uint32_t first_mport; /* first machine IO port */ uint32_t nr_ports; /* size of port range */ uint32_t add_mapping; /* add or remove mapping */ }; typedef struct xen_domctl_ioport_mapping xen_domctl_ioport_mapping_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_mapping_t); /* * Pin caching type of RAM space for x86 HVM domU. */ /* XEN_DOMCTL_pin_mem_cacheattr */ /* Caching types: these happen to be the same as x86 MTRR/PAT type codes. */ #define XEN_DOMCTL_MEM_CACHEATTR_UC 0 #define XEN_DOMCTL_MEM_CACHEATTR_WC 1 #define XEN_DOMCTL_MEM_CACHEATTR_WT 4 #define XEN_DOMCTL_MEM_CACHEATTR_WP 5 #define XEN_DOMCTL_MEM_CACHEATTR_WB 6 #define XEN_DOMCTL_MEM_CACHEATTR_UCM 7 struct xen_domctl_pin_mem_cacheattr { uint64_aligned_t start, end; uint32_t type; /* XEN_DOMCTL_MEM_CACHEATTR_* */ }; typedef struct xen_domctl_pin_mem_cacheattr xen_domctl_pin_mem_cacheattr_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_pin_mem_cacheattr_t); /* XEN_DOMCTL_set_ext_vcpucontext */ /* XEN_DOMCTL_get_ext_vcpucontext */ struct xen_domctl_ext_vcpucontext { /* IN: VCPU that this call applies to. */ uint32_t vcpu; /* * SET: Size of struct (IN) * GET: Size of struct (OUT, up to 128 bytes) */ uint32_t size; #if defined(__i386__) || defined(__x86_64__) /* SYSCALL from 32-bit mode and SYSENTER callback information. */ /* NB. 
SYSCALL from 64-bit mode is contained in vcpu_guest_context_t */ uint64_aligned_t syscall32_callback_eip; uint64_aligned_t sysenter_callback_eip; uint16_t syscall32_callback_cs; uint16_t sysenter_callback_cs; uint8_t syscall32_disables_events; uint8_t sysenter_disables_events; #if defined(__GNUC__) union { uint64_aligned_t mcg_cap; struct hvm_vmce_vcpu vmce; }; #else struct hvm_vmce_vcpu vmce; #endif #endif }; typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpucontext_t); /* * Set the target domain for a domain */ /* XEN_DOMCTL_set_target */ struct xen_domctl_set_target { domid_t target; }; typedef struct xen_domctl_set_target xen_domctl_set_target_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_target_t); #if defined(__i386__) || defined(__x86_64__) # define XEN_CPUID_INPUT_UNUSED 0xFFFFFFFF /* XEN_DOMCTL_set_cpuid */ struct xen_domctl_cpuid { uint32_t input[2]; uint32_t eax; uint32_t ebx; uint32_t ecx; uint32_t edx; }; typedef struct xen_domctl_cpuid xen_domctl_cpuid_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpuid_t); #endif /* * Arranges that if the domain suspends (specifically, if it shuts * down with code SHUTDOWN_suspend), this event channel will be * notified. * * This is _instead of_ the usual notification to the global * VIRQ_DOM_EXC. (In most systems that pirq is owned by xenstored.) * * Only one subscription per domain is possible. Last subscriber * wins; others are silently displaced. * * NB that contrary to the rather general name, it only applies to * domain shutdown with code suspend. Shutdown for other reasons * (including crash), and domain death, are notified to VIRQ_DOM_EXC * regardless. */ /* XEN_DOMCTL_subscribe */ struct xen_domctl_subscribe { uint32_t port; /* IN */ }; typedef struct xen_domctl_subscribe xen_domctl_subscribe_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_subscribe_t); /* * Define the maximum machine address size which should be allocated * to a guest. */ /* XEN_DOMCTL_set_machine_address_size */ /* XEN_DOMCTL_get_machine_address_size */ /* * Do not inject spurious page faults into this domain. 
*/ /* XEN_DOMCTL_suppress_spurious_page_faults */ /* XEN_DOMCTL_debug_op */ #define XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF 0 #define XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON 1 struct xen_domctl_debug_op { uint32_t op; /* IN */ uint32_t vcpu; /* IN */ }; typedef struct xen_domctl_debug_op xen_domctl_debug_op_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_debug_op_t); /* * Request a particular record from the HVM context */ /* XEN_DOMCTL_gethvmcontext_partial */ typedef struct xen_domctl_hvmcontext_partial { uint32_t type; /* IN: Type of record required */ uint32_t instance; /* IN: Instance of that type */ XEN_GUEST_HANDLE_64(uint8) buffer; /* OUT: buffer to write record into */ } xen_domctl_hvmcontext_partial_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t); /* XEN_DOMCTL_disable_migrate */ typedef struct xen_domctl_disable_migrate { uint32_t disable; /* IN: 1: disable migration and restore */ } xen_domctl_disable_migrate_t; /* XEN_DOMCTL_gettscinfo */ /* XEN_DOMCTL_settscinfo */ struct xen_guest_tsc_info { uint32_t tsc_mode; uint32_t gtsc_khz; uint32_t incarnation; uint32_t pad; uint64_aligned_t elapsed_nsec; }; typedef struct xen_guest_tsc_info xen_guest_tsc_info_t; DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t); typedef struct xen_domctl_tsc_info { XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */ xen_guest_tsc_info_t info; /* IN */ } xen_domctl_tsc_info_t; /* XEN_DOMCTL_gdbsx_guestmemio guest mem io */ struct xen_domctl_gdbsx_memio { /* IN */ uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */ uint64_aligned_t gva; /* guest virtual address */ uint64_aligned_t uva; /* user buffer virtual address */ uint32_t len; /* number of bytes to read/write */ uint8_t gwr; /* 0 = read from guest. 1 = write to guest */ /* OUT */ uint32_t remain; /* bytes remaining to be copied */ }; /* XEN_DOMCTL_gdbsx_pausevcpu */ /* XEN_DOMCTL_gdbsx_unpausevcpu */ struct xen_domctl_gdbsx_pauseunp_vcpu { /* pause/unpause a vcpu */ uint32_t vcpu; /* which vcpu */ }; /* XEN_DOMCTL_gdbsx_domstatus */ struct xen_domctl_gdbsx_domstatus { /* OUT */ uint8_t paused; /* is the domain paused */ uint32_t vcpu_id; /* any vcpu in an event? */ uint32_t vcpu_ev; /* if yes, what event? */ }; /* * Memory event operations */ /* XEN_DOMCTL_mem_event_op */ /* * Domain memory paging * Page memory in and out. * Domctl interface to set up and tear down the * pager<->hypervisor interface. Use XENMEM_paging_op* * to perform per-page operations. * * The XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE domctl returns several * non-standard error codes to indicate why paging could not be enabled: * ENODEV - host lacks HAP support (EPT/NPT) or HAP is disabled in guest * EMLINK - guest has iommu passthrough enabled * EXDEV - guest has PoD enabled * EBUSY - guest has or had paging enabled, ring buffer still active */ #define XEN_DOMCTL_MEM_EVENT_OP_PAGING 1 #define XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE 0 #define XEN_DOMCTL_MEM_EVENT_OP_PAGING_DISABLE 1 /* * Access permissions. * * As with paging, use the domctl for teardown/setup of the * helper<->hypervisor interface. * * There are HVM hypercalls to set the per-page access permissions of every * page in a domain. When one of these permissions--independent, read, * write, and execute--is violated, the VCPU is paused and a memory event * is sent with what happened. (See public/mem_event.h) . * * The memory event handler can then resume the VCPU and redo the access * with a XENMEM_access_op_resume hypercall. 
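 *
 * (Illustrative sketch, not part of the original header: a listener
 *  enables the access ring with the op structure defined further below,
 *  e.g.
 *
 *      struct xen_domctl_mem_event_op meo = { 0 };
 *      meo.op   = XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE;
 *      meo.mode = XEN_DOMCTL_MEM_EVENT_OP_ACCESS;
 *
 *  issued as the u.mem_event_op payload of a XEN_DOMCTL_mem_event_op
 *  domctl; on success meo.port returns the event channel to bind for
 *  ring notifications.)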
* * The XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE domctl returns several * non-standard error codes to indicate why access could not be enabled: * ENODEV - host lacks HAP support (EPT/NPT) or HAP is disabled in guest * EBUSY - guest has or had access enabled, ring buffer still active */ #define XEN_DOMCTL_MEM_EVENT_OP_ACCESS 2 #define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE 0 #define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE 1 /* * Sharing ENOMEM helper. * * As with paging, use the domctl for teardown/setup of the * helper<->hypervisor interface. * * If setup, this ring is used to communicate failed allocations * in the unshare path. XENMEM_sharing_op_resume is used to wake up * vcpus that could not unshare. * * Note that shring can be turned on (as per the domctl below) * *without* this ring being setup. */ #define XEN_DOMCTL_MEM_EVENT_OP_SHARING 3 #define XEN_DOMCTL_MEM_EVENT_OP_SHARING_ENABLE 0 #define XEN_DOMCTL_MEM_EVENT_OP_SHARING_DISABLE 1 /* Use for teardown/setup of helper<->hypervisor interface for paging, * access and sharing.*/ struct xen_domctl_mem_event_op { uint32_t op; /* XEN_DOMCTL_MEM_EVENT_OP_*_* */ uint32_t mode; /* XEN_DOMCTL_MEM_EVENT_OP_* */ uint32_t port; /* OUT: event channel for ring */ }; typedef struct xen_domctl_mem_event_op xen_domctl_mem_event_op_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_event_op_t); /* * Memory sharing operations */ /* XEN_DOMCTL_mem_sharing_op. * The CONTROL sub-domctl is used for bringup/teardown. */ #define XEN_DOMCTL_MEM_SHARING_CONTROL 0 struct xen_domctl_mem_sharing_op { uint8_t op; /* XEN_DOMCTL_MEM_SHARING_* */ union { uint8_t enable; /* CONTROL */ } u; }; typedef struct xen_domctl_mem_sharing_op xen_domctl_mem_sharing_op_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_sharing_op_t); struct xen_domctl_audit_p2m { /* OUT error counts */ uint64_t orphans; uint64_t m2p_bad; uint64_t p2m_bad; }; typedef struct xen_domctl_audit_p2m xen_domctl_audit_p2m_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_audit_p2m_t); struct xen_domctl_set_virq_handler { uint32_t virq; /* IN */ }; typedef struct xen_domctl_set_virq_handler xen_domctl_set_virq_handler_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_virq_handler_t); #if defined(__i386__) || defined(__x86_64__) /* XEN_DOMCTL_setvcpuextstate */ /* XEN_DOMCTL_getvcpuextstate */ struct xen_domctl_vcpuextstate { /* IN: VCPU that this call applies to. */ uint32_t vcpu; /* * SET: xfeature support mask of struct (IN) * GET: xfeature support mask of struct (IN/OUT) * xfeature mask is served as identifications of the saving format * so that compatible CPUs can have a check on format to decide * whether it can restore. 
*/ uint64_aligned_t xfeature_mask; /* * SET: Size of struct (IN) * GET: Size of struct (IN/OUT) */ uint64_aligned_t size; XEN_GUEST_HANDLE_64(uint64) buffer; }; typedef struct xen_domctl_vcpuextstate xen_domctl_vcpuextstate_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuextstate_t); #endif /* XEN_DOMCTL_set_access_required: sets whether a memory event listener * must be present to handle page access events: if false, the page * access will revert to full permissions if no one is listening; * */ struct xen_domctl_set_access_required { uint8_t access_required; }; typedef struct xen_domctl_set_access_required xen_domctl_set_access_required_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_access_required_t); struct xen_domctl_set_broken_page_p2m { uint64_aligned_t pfn; }; typedef struct xen_domctl_set_broken_page_p2m xen_domctl_set_broken_page_p2m_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_broken_page_p2m_t); /* * XEN_DOMCTL_set_max_evtchn: sets the maximum event channel port * number the guest may use. Use this limit the amount of resources * (global mapping space, xenheap) a guest may use for event channels. */ struct xen_domctl_set_max_evtchn { uint32_t max_port; }; typedef struct xen_domctl_set_max_evtchn xen_domctl_set_max_evtchn_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_max_evtchn_t); /* * ARM: Clean and invalidate caches associated with given region of * guest memory. */ struct xen_domctl_cacheflush { /* IN: page range to flush. */ xen_pfn_t start_pfn, nr_pfns; }; typedef struct xen_domctl_cacheflush xen_domctl_cacheflush_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_cacheflush_t); struct xen_domctl { uint32_t cmd; #define XEN_DOMCTL_createdomain 1 #define XEN_DOMCTL_destroydomain 2 #define XEN_DOMCTL_pausedomain 3 #define XEN_DOMCTL_unpausedomain 4 #define XEN_DOMCTL_getdomaininfo 5 #define XEN_DOMCTL_getmemlist 6 #define XEN_DOMCTL_getpageframeinfo 7 #define XEN_DOMCTL_getpageframeinfo2 8 #define XEN_DOMCTL_setvcpuaffinity 9 #define XEN_DOMCTL_shadow_op 10 #define XEN_DOMCTL_max_mem 11 #define XEN_DOMCTL_setvcpucontext 12 #define XEN_DOMCTL_getvcpucontext 13 #define XEN_DOMCTL_getvcpuinfo 14 #define XEN_DOMCTL_max_vcpus 15 #define XEN_DOMCTL_scheduler_op 16 #define XEN_DOMCTL_setdomainhandle 17 #define XEN_DOMCTL_setdebugging 18 #define XEN_DOMCTL_irq_permission 19 #define XEN_DOMCTL_iomem_permission 20 #define XEN_DOMCTL_ioport_permission 21 #define XEN_DOMCTL_hypercall_init 22 #define XEN_DOMCTL_arch_setup 23 #define XEN_DOMCTL_settimeoffset 24 #define XEN_DOMCTL_getvcpuaffinity 25 #define XEN_DOMCTL_real_mode_area 26 #define XEN_DOMCTL_resumedomain 27 #define XEN_DOMCTL_sendtrigger 28 #define XEN_DOMCTL_subscribe 29 #define XEN_DOMCTL_gethvmcontext 33 #define XEN_DOMCTL_sethvmcontext 34 #define XEN_DOMCTL_set_address_size 35 #define XEN_DOMCTL_get_address_size 36 #define XEN_DOMCTL_assign_device 37 #define XEN_DOMCTL_bind_pt_irq 38 #define XEN_DOMCTL_memory_mapping 39 #define XEN_DOMCTL_ioport_mapping 40 #define XEN_DOMCTL_pin_mem_cacheattr 41 #define XEN_DOMCTL_set_ext_vcpucontext 42 #define XEN_DOMCTL_get_ext_vcpucontext 43 #define XEN_DOMCTL_set_opt_feature 44 /* Obsolete IA64 only */ #define XEN_DOMCTL_test_assign_device 45 #define XEN_DOMCTL_set_target 46 #define XEN_DOMCTL_deassign_device 47 #define XEN_DOMCTL_unbind_pt_irq 48 #define XEN_DOMCTL_set_cpuid 49 #define XEN_DOMCTL_get_device_group 50 #define XEN_DOMCTL_set_machine_address_size 51 #define XEN_DOMCTL_get_machine_address_size 52 #define XEN_DOMCTL_suppress_spurious_page_faults 53 #define XEN_DOMCTL_debug_op 54 #define 
XEN_DOMCTL_gethvmcontext_partial 55 #define XEN_DOMCTL_mem_event_op 56 #define XEN_DOMCTL_mem_sharing_op 57 #define XEN_DOMCTL_disable_migrate 58 #define XEN_DOMCTL_gettscinfo 59 #define XEN_DOMCTL_settscinfo 60 #define XEN_DOMCTL_getpageframeinfo3 61 #define XEN_DOMCTL_setvcpuextstate 62 #define XEN_DOMCTL_getvcpuextstate 63 #define XEN_DOMCTL_set_access_required 64 #define XEN_DOMCTL_audit_p2m 65 #define XEN_DOMCTL_set_virq_handler 66 #define XEN_DOMCTL_set_broken_page_p2m 67 #define XEN_DOMCTL_setnodeaffinity 68 #define XEN_DOMCTL_getnodeaffinity 69 #define XEN_DOMCTL_set_max_evtchn 70 #define XEN_DOMCTL_cacheflush 71 #define XEN_DOMCTL_gdbsx_guestmemio 1000 #define XEN_DOMCTL_gdbsx_pausevcpu 1001 #define XEN_DOMCTL_gdbsx_unpausevcpu 1002 #define XEN_DOMCTL_gdbsx_domstatus 1003 uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */ domid_t domain; union { struct xen_domctl_createdomain createdomain; struct xen_domctl_getdomaininfo getdomaininfo; struct xen_domctl_getmemlist getmemlist; struct xen_domctl_getpageframeinfo getpageframeinfo; struct xen_domctl_getpageframeinfo2 getpageframeinfo2; struct xen_domctl_getpageframeinfo3 getpageframeinfo3; struct xen_domctl_nodeaffinity nodeaffinity; struct xen_domctl_vcpuaffinity vcpuaffinity; struct xen_domctl_shadow_op shadow_op; struct xen_domctl_max_mem max_mem; struct xen_domctl_vcpucontext vcpucontext; struct xen_domctl_getvcpuinfo getvcpuinfo; struct xen_domctl_max_vcpus max_vcpus; struct xen_domctl_scheduler_op scheduler_op; struct xen_domctl_setdomainhandle setdomainhandle; struct xen_domctl_setdebugging setdebugging; struct xen_domctl_irq_permission irq_permission; struct xen_domctl_iomem_permission iomem_permission; struct xen_domctl_ioport_permission ioport_permission; struct xen_domctl_hypercall_init hypercall_init; struct xen_domctl_arch_setup arch_setup; struct xen_domctl_settimeoffset settimeoffset; struct xen_domctl_disable_migrate disable_migrate; struct xen_domctl_tsc_info tsc_info; struct xen_domctl_real_mode_area real_mode_area; struct xen_domctl_hvmcontext hvmcontext; struct xen_domctl_hvmcontext_partial hvmcontext_partial; struct xen_domctl_address_size address_size; struct xen_domctl_sendtrigger sendtrigger; struct xen_domctl_get_device_group get_device_group; struct xen_domctl_assign_device assign_device; struct xen_domctl_bind_pt_irq bind_pt_irq; struct xen_domctl_memory_mapping memory_mapping; struct xen_domctl_ioport_mapping ioport_mapping; struct xen_domctl_pin_mem_cacheattr pin_mem_cacheattr; struct xen_domctl_ext_vcpucontext ext_vcpucontext; struct xen_domctl_set_target set_target; struct xen_domctl_subscribe subscribe; struct xen_domctl_debug_op debug_op; struct xen_domctl_mem_event_op mem_event_op; struct xen_domctl_mem_sharing_op mem_sharing_op; #if defined(__i386__) || defined(__x86_64__) struct xen_domctl_cpuid cpuid; struct xen_domctl_vcpuextstate vcpuextstate; #endif struct xen_domctl_set_access_required access_required; struct xen_domctl_audit_p2m audit_p2m; struct xen_domctl_set_virq_handler set_virq_handler; struct xen_domctl_set_max_evtchn set_max_evtchn; struct xen_domctl_gdbsx_memio gdbsx_guest_memio; struct xen_domctl_set_broken_page_p2m set_broken_page_p2m; struct xen_domctl_cacheflush cacheflush; struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu; struct xen_domctl_gdbsx_domstatus gdbsx_domstatus; uint8_t pad[128]; } u; }; typedef struct xen_domctl xen_domctl_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_t); #endif /* __XEN_PUBLIC_DOMCTL_H__ */ /* * Local variables: * mode: C * c-file-style: 
"BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/gcov.h0000664000175000017500000000644412307313555015255 0ustar smbsmb/****************************************************************************** * gcov.h * * Coverage structures exported by Xen. * Structure is different from Gcc one. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2013, Citrix Systems R&D Ltd. */ #ifndef __XEN_PUBLIC_GCOV_H__ #define __XEN_PUBLIC_GCOV_H__ __XEN_PUBLIC_GCOV_H__ #define XENCOV_COUNTERS 5 #define XENCOV_TAG_BASE 0x58544300u #define XENCOV_TAG_FILE (XENCOV_TAG_BASE+0x46u) #define XENCOV_TAG_FUNC (XENCOV_TAG_BASE+0x66u) #define XENCOV_TAG_COUNTER(n) (XENCOV_TAG_BASE+0x30u+((n)&0xfu)) #define XENCOV_TAG_END (XENCOV_TAG_BASE+0x2eu) #define XENCOV_IS_TAG_COUNTER(n) \ ((n) >= XENCOV_TAG_COUNTER(0) && (n) < XENCOV_TAG_COUNTER(XENCOV_COUNTERS)) #define XENCOV_COUNTER_NUM(n) ((n)-XENCOV_TAG_COUNTER(0)) /* * The main structure for the blob is * BLOB := FILE.. END * FILE := TAG_FILE VERSION STAMP FILENAME COUNTERS FUNCTIONS * FILENAME := LEN characters * characters are padded to 32 bit * LEN := 32 bit value * COUNTERS := TAG_COUNTER(n) NUM COUNTER.. * NUM := 32 bit valie * COUNTER := 64 bit value * FUNCTIONS := TAG_FUNC NUM FUNCTION.. 
* FUNCTION := IDENT CHECKSUM NUM_COUNTERS * * All tagged structures are aligned to 8 bytes */ /** * File information * Prefixed with XENCOV_TAG_FILE and a string with filename * Aligned to 8 bytes */ struct xencov_file { uint32_t tag; /* XENCOV_TAG_FILE */ uint32_t version; uint32_t stamp; uint32_t fn_len; char filename[1]; }; /** * Counters information * Prefixed with XENCOV_TAG_COUNTER(n) where n is 0..(XENCOV_COUNTERS-1) * Aligned to 8 bytes */ struct xencov_counter { uint32_t tag; /* XENCOV_TAG_COUNTER(n) */ uint32_t num; uint64_t values[1]; }; /** * Information for each function * Number of counter is equal to the number of counter structures got before */ struct xencov_function { uint32_t ident; uint32_t checksum; uint32_t num_counters[1]; }; /** * Information for all functions * Aligned to 8 bytes */ struct xencov_functions { uint32_t tag; /* XENCOV_TAG_FUNC */ uint32_t num; struct xencov_function xencov_function[1]; }; /** * Terminator */ struct xencov_end { uint32_t tag; /* XENCOV_TAG_END */ }; #endif /* __XEN_PUBLIC_GCOV_H__ */ xen-4.4.0/xen/include/public/physdev.h0000664000175000017500000002547012307313555016001 0ustar smbsmb/* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_PUBLIC_PHYSDEV_H__ #define __XEN_PUBLIC_PHYSDEV_H__ #include "xen.h" /* * Prototype for this hypercall is: * int physdev_op(int cmd, void *args) * @cmd == PHYSDEVOP_??? (physdev operation). * @args == Operation-specific extra arguments (NULL if none). */ /* * Notify end-of-interrupt (EOI) for the specified IRQ. * @arg == pointer to physdev_eoi structure. */ #define PHYSDEVOP_eoi 12 struct physdev_eoi { /* IN */ uint32_t irq; }; typedef struct physdev_eoi physdev_eoi_t; DEFINE_XEN_GUEST_HANDLE(physdev_eoi_t); /* * Register a shared page for the hypervisor to indicate whether the guest * must issue PHYSDEVOP_eoi. The semantics of PHYSDEVOP_eoi change slightly * once the guest used this function in that the associated event channel * will automatically get unmasked. The page registered is used as a bit * array indexed by Xen's PIRQ value. */ #define PHYSDEVOP_pirq_eoi_gmfn_v1 17 /* * Register a shared page for the hypervisor to indicate whether the * guest must issue PHYSDEVOP_eoi. This hypercall is very similar to * PHYSDEVOP_pirq_eoi_gmfn_v1 but it doesn't change the semantics of * PHYSDEVOP_eoi. The page registered is used as a bit array indexed by * Xen's PIRQ value. 
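 *
 * (Illustrative sketch, not part of the original header: once the page
 *  is registered, a guest decides whether an explicit EOI is needed by
 *  testing the bit for its PIRQ, with "registered_page" and "pirq"
 *  placeholders and BITS_PER_LONG the guest's word size in bits:
 *
 *      unsigned long *eoi_map = registered_page;
 *      if ( eoi_map[pirq / BITS_PER_LONG] &
 *           (1UL << (pirq % BITS_PER_LONG)) )
 *          [issue PHYSDEVOP_eoi for this pirq]
 *
 *  a set bit means the EOI is still required.)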
*/ #define PHYSDEVOP_pirq_eoi_gmfn_v2 28 struct physdev_pirq_eoi_gmfn { /* IN */ xen_pfn_t gmfn; }; typedef struct physdev_pirq_eoi_gmfn physdev_pirq_eoi_gmfn_t; DEFINE_XEN_GUEST_HANDLE(physdev_pirq_eoi_gmfn_t); /* * Query the status of an IRQ line. * @arg == pointer to physdev_irq_status_query structure. */ #define PHYSDEVOP_irq_status_query 5 struct physdev_irq_status_query { /* IN */ uint32_t irq; /* OUT */ uint32_t flags; /* XENIRQSTAT_* */ }; typedef struct physdev_irq_status_query physdev_irq_status_query_t; DEFINE_XEN_GUEST_HANDLE(physdev_irq_status_query_t); /* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */ #define _XENIRQSTAT_needs_eoi (0) #define XENIRQSTAT_needs_eoi (1U<<_XENIRQSTAT_needs_eoi) /* IRQ shared by multiple guests? */ #define _XENIRQSTAT_shared (1) #define XENIRQSTAT_shared (1U<<_XENIRQSTAT_shared) /* * Set the current VCPU's I/O privilege level. * @arg == pointer to physdev_set_iopl structure. */ #define PHYSDEVOP_set_iopl 6 struct physdev_set_iopl { /* IN */ uint32_t iopl; }; typedef struct physdev_set_iopl physdev_set_iopl_t; DEFINE_XEN_GUEST_HANDLE(physdev_set_iopl_t); /* * Set the current VCPU's I/O-port permissions bitmap. * @arg == pointer to physdev_set_iobitmap structure. */ #define PHYSDEVOP_set_iobitmap 7 struct physdev_set_iobitmap { /* IN */ #if __XEN_INTERFACE_VERSION__ >= 0x00030205 XEN_GUEST_HANDLE(uint8) bitmap; #else uint8_t *bitmap; #endif uint32_t nr_ports; }; typedef struct physdev_set_iobitmap physdev_set_iobitmap_t; DEFINE_XEN_GUEST_HANDLE(physdev_set_iobitmap_t); /* * Read or write an IO-APIC register. * @arg == pointer to physdev_apic structure. */ #define PHYSDEVOP_apic_read 8 #define PHYSDEVOP_apic_write 9 struct physdev_apic { /* IN */ unsigned long apic_physbase; uint32_t reg; /* IN or OUT */ uint32_t value; }; typedef struct physdev_apic physdev_apic_t; DEFINE_XEN_GUEST_HANDLE(physdev_apic_t); /* * Allocate or free a physical upcall vector for the specified IRQ line. * @arg == pointer to physdev_irq structure. 
*/ #define PHYSDEVOP_alloc_irq_vector 10 #define PHYSDEVOP_free_irq_vector 11 struct physdev_irq { /* IN */ uint32_t irq; /* IN or OUT */ uint32_t vector; }; typedef struct physdev_irq physdev_irq_t; DEFINE_XEN_GUEST_HANDLE(physdev_irq_t); #define MAP_PIRQ_TYPE_MSI 0x0 #define MAP_PIRQ_TYPE_GSI 0x1 #define MAP_PIRQ_TYPE_UNKNOWN 0x2 #define MAP_PIRQ_TYPE_MSI_SEG 0x3 #define MAP_PIRQ_TYPE_MULTI_MSI 0x4 #define PHYSDEVOP_map_pirq 13 struct physdev_map_pirq { domid_t domid; /* IN */ int type; /* IN (ignored for ..._MULTI_MSI) */ int index; /* IN or OUT */ int pirq; /* IN - high 16 bits hold segment for ..._MSI_SEG and ..._MULTI_MSI */ int bus; /* IN */ int devfn; /* IN (also OUT for ..._MULTI_MSI) */ int entry_nr; /* IN */ uint64_t table_base; }; typedef struct physdev_map_pirq physdev_map_pirq_t; DEFINE_XEN_GUEST_HANDLE(physdev_map_pirq_t); #define PHYSDEVOP_unmap_pirq 14 struct physdev_unmap_pirq { domid_t domid; /* IN */ int pirq; }; typedef struct physdev_unmap_pirq physdev_unmap_pirq_t; DEFINE_XEN_GUEST_HANDLE(physdev_unmap_pirq_t); #define PHYSDEVOP_manage_pci_add 15 #define PHYSDEVOP_manage_pci_remove 16 struct physdev_manage_pci { /* IN */ uint8_t bus; uint8_t devfn; }; typedef struct physdev_manage_pci physdev_manage_pci_t; DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_t); #define PHYSDEVOP_restore_msi 19 struct physdev_restore_msi { /* IN */ uint8_t bus; uint8_t devfn; }; typedef struct physdev_restore_msi physdev_restore_msi_t; DEFINE_XEN_GUEST_HANDLE(physdev_restore_msi_t); #define PHYSDEVOP_manage_pci_add_ext 20 struct physdev_manage_pci_ext { /* IN */ uint8_t bus; uint8_t devfn; unsigned is_extfn; unsigned is_virtfn; struct { uint8_t bus; uint8_t devfn; } physfn; }; typedef struct physdev_manage_pci_ext physdev_manage_pci_ext_t; DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_ext_t); /* * Argument to physdev_op_compat() hypercall. Superceded by new physdev_op() * hypercall since 0x00030202. 
*/ struct physdev_op { uint32_t cmd; union { struct physdev_irq_status_query irq_status_query; struct physdev_set_iopl set_iopl; struct physdev_set_iobitmap set_iobitmap; struct physdev_apic apic_op; struct physdev_irq irq_op; } u; }; typedef struct physdev_op physdev_op_t; DEFINE_XEN_GUEST_HANDLE(physdev_op_t); #define PHYSDEVOP_setup_gsi 21 struct physdev_setup_gsi { int gsi; /* IN */ uint8_t triggering; /* IN */ uint8_t polarity; /* IN */ }; typedef struct physdev_setup_gsi physdev_setup_gsi_t; DEFINE_XEN_GUEST_HANDLE(physdev_setup_gsi_t); /* leave PHYSDEVOP 22 free */ /* type is MAP_PIRQ_TYPE_GSI or MAP_PIRQ_TYPE_MSI * the hypercall returns a free pirq */ #define PHYSDEVOP_get_free_pirq 23 struct physdev_get_free_pirq { /* IN */ int type; /* OUT */ uint32_t pirq; }; typedef struct physdev_get_free_pirq physdev_get_free_pirq_t; DEFINE_XEN_GUEST_HANDLE(physdev_get_free_pirq_t); #define XEN_PCI_MMCFG_RESERVED 0x1 #define PHYSDEVOP_pci_mmcfg_reserved 24 struct physdev_pci_mmcfg_reserved { uint64_t address; uint16_t segment; uint8_t start_bus; uint8_t end_bus; uint32_t flags; }; typedef struct physdev_pci_mmcfg_reserved physdev_pci_mmcfg_reserved_t; DEFINE_XEN_GUEST_HANDLE(physdev_pci_mmcfg_reserved_t); #define XEN_PCI_DEV_EXTFN 0x1 #define XEN_PCI_DEV_VIRTFN 0x2 #define XEN_PCI_DEV_PXM 0x4 #define PHYSDEVOP_pci_device_add 25 struct physdev_pci_device_add { /* IN */ uint16_t seg; uint8_t bus; uint8_t devfn; uint32_t flags; struct { uint8_t bus; uint8_t devfn; } physfn; #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L uint32_t optarr[]; #elif defined(__GNUC__) uint32_t optarr[0]; #endif }; typedef struct physdev_pci_device_add physdev_pci_device_add_t; DEFINE_XEN_GUEST_HANDLE(physdev_pci_device_add_t); #define PHYSDEVOP_pci_device_remove 26 #define PHYSDEVOP_restore_msi_ext 27 /* * Dom0 should use these two to announce MMIO resources assigned to * MSI-X capable devices won't (prepare) or may (release) change. */ #define PHYSDEVOP_prepare_msix 30 #define PHYSDEVOP_release_msix 31 struct physdev_pci_device { /* IN */ uint16_t seg; uint8_t bus; uint8_t devfn; }; typedef struct physdev_pci_device physdev_pci_device_t; DEFINE_XEN_GUEST_HANDLE(physdev_pci_device_t); #define PHYSDEVOP_DBGP_RESET_PREPARE 1 #define PHYSDEVOP_DBGP_RESET_DONE 2 #define PHYSDEVOP_DBGP_BUS_UNKNOWN 0 #define PHYSDEVOP_DBGP_BUS_PCI 1 #define PHYSDEVOP_dbgp_op 29 struct physdev_dbgp_op { /* IN */ uint8_t op; uint8_t bus; union { struct physdev_pci_device pci; } u; }; typedef struct physdev_dbgp_op physdev_dbgp_op_t; DEFINE_XEN_GUEST_HANDLE(physdev_dbgp_op_t); /* * Notify that some PIRQ-bound event channels have been unmasked. * ** This command is obsolete since interface version 0x00030202 and is ** * ** unsupported by newer versions of Xen. ** */ #define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4 /* * These all-capitals physdev operation names are superceded by the new names * (defined above) since interface version 0x00030202. 
*/ #define PHYSDEVOP_IRQ_STATUS_QUERY PHYSDEVOP_irq_status_query #define PHYSDEVOP_SET_IOPL PHYSDEVOP_set_iopl #define PHYSDEVOP_SET_IOBITMAP PHYSDEVOP_set_iobitmap #define PHYSDEVOP_APIC_READ PHYSDEVOP_apic_read #define PHYSDEVOP_APIC_WRITE PHYSDEVOP_apic_write #define PHYSDEVOP_ASSIGN_VECTOR PHYSDEVOP_alloc_irq_vector #define PHYSDEVOP_FREE_VECTOR PHYSDEVOP_free_irq_vector #define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi #define PHYSDEVOP_IRQ_SHARED XENIRQSTAT_shared #if __XEN_INTERFACE_VERSION__ < 0x00040200 #define PHYSDEVOP_pirq_eoi_gmfn PHYSDEVOP_pirq_eoi_gmfn_v1 #else #define PHYSDEVOP_pirq_eoi_gmfn PHYSDEVOP_pirq_eoi_gmfn_v2 #endif #endif /* __XEN_PUBLIC_PHYSDEV_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/xen.h0000664000175000017500000010642612307313555015112 0ustar smbsmb/****************************************************************************** * xen.h * * Guest OS interface to Xen. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2004, K A Fraser */ #ifndef __XEN_PUBLIC_XEN_H__ #define __XEN_PUBLIC_XEN_H__ #include "xen-compat.h" #if defined(__i386__) || defined(__x86_64__) #include "arch-x86/xen.h" #elif defined(__arm__) || defined (__aarch64__) #include "arch-arm.h" #else #error "Unsupported architecture" #endif #ifndef __ASSEMBLY__ /* Guest handles for primitive C types. 
*/ DEFINE_XEN_GUEST_HANDLE(char); __DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char); DEFINE_XEN_GUEST_HANDLE(int); __DEFINE_XEN_GUEST_HANDLE(uint, unsigned int); #if __XEN_INTERFACE_VERSION__ < 0x00040300 DEFINE_XEN_GUEST_HANDLE(long); __DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long); #endif DEFINE_XEN_GUEST_HANDLE(void); DEFINE_XEN_GUEST_HANDLE(uint64_t); DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); DEFINE_XEN_GUEST_HANDLE(xen_ulong_t); #endif /* * HYPERCALLS */ /* `incontents 100 hcalls List of hypercalls * ` enum hypercall_num { // __HYPERVISOR_* => HYPERVISOR_*() */ #define __HYPERVISOR_set_trap_table 0 #define __HYPERVISOR_mmu_update 1 #define __HYPERVISOR_set_gdt 2 #define __HYPERVISOR_stack_switch 3 #define __HYPERVISOR_set_callbacks 4 #define __HYPERVISOR_fpu_taskswitch 5 #define __HYPERVISOR_sched_op_compat 6 /* compat since 0x00030101 */ #define __HYPERVISOR_platform_op 7 #define __HYPERVISOR_set_debugreg 8 #define __HYPERVISOR_get_debugreg 9 #define __HYPERVISOR_update_descriptor 10 #define __HYPERVISOR_memory_op 12 #define __HYPERVISOR_multicall 13 #define __HYPERVISOR_update_va_mapping 14 #define __HYPERVISOR_set_timer_op 15 #define __HYPERVISOR_event_channel_op_compat 16 /* compat since 0x00030202 */ #define __HYPERVISOR_xen_version 17 #define __HYPERVISOR_console_io 18 #define __HYPERVISOR_physdev_op_compat 19 /* compat since 0x00030202 */ #define __HYPERVISOR_grant_table_op 20 #define __HYPERVISOR_vm_assist 21 #define __HYPERVISOR_update_va_mapping_otherdomain 22 #define __HYPERVISOR_iret 23 /* x86 only */ #define __HYPERVISOR_vcpu_op 24 #define __HYPERVISOR_set_segment_base 25 /* x86/64 only */ #define __HYPERVISOR_mmuext_op 26 #define __HYPERVISOR_xsm_op 27 #define __HYPERVISOR_nmi_op 28 #define __HYPERVISOR_sched_op 29 #define __HYPERVISOR_callback_op 30 #define __HYPERVISOR_xenoprof_op 31 #define __HYPERVISOR_event_channel_op 32 #define __HYPERVISOR_physdev_op 33 #define __HYPERVISOR_hvm_op 34 #define __HYPERVISOR_sysctl 35 #define __HYPERVISOR_domctl 36 #define __HYPERVISOR_kexec_op 37 #define __HYPERVISOR_tmem_op 38 #define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */ /* Architecture-specific hypercall definitions. */ #define __HYPERVISOR_arch_0 48 #define __HYPERVISOR_arch_1 49 #define __HYPERVISOR_arch_2 50 #define __HYPERVISOR_arch_3 51 #define __HYPERVISOR_arch_4 52 #define __HYPERVISOR_arch_5 53 #define __HYPERVISOR_arch_6 54 #define __HYPERVISOR_arch_7 55 /* ` } */ /* * HYPERCALL COMPATIBILITY. */ /* New sched_op hypercall introduced in 0x00030101. */ #if __XEN_INTERFACE_VERSION__ < 0x00030101 #undef __HYPERVISOR_sched_op #define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat #endif /* New event-channel and physdev hypercalls introduced in 0x00030202. */ #if __XEN_INTERFACE_VERSION__ < 0x00030202 #undef __HYPERVISOR_event_channel_op #define __HYPERVISOR_event_channel_op __HYPERVISOR_event_channel_op_compat #undef __HYPERVISOR_physdev_op #define __HYPERVISOR_physdev_op __HYPERVISOR_physdev_op_compat #endif /* New platform_op hypercall introduced in 0x00030204. */ #if __XEN_INTERFACE_VERSION__ < 0x00030204 #define __HYPERVISOR_dom0_op __HYPERVISOR_platform_op #endif /* * VIRTUAL INTERRUPTS * * Virtual interrupts that a guest OS may receive from Xen. * * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a * global VIRQ. The former can be bound once per VCPU and cannot be re-bound. * The latter can be allocated only once per guest: they must initially be * allocated to VCPU0 but can subsequently be re-bound. 
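 *
 * For illustration: EVTCHNOP_bind_virq and struct evtchn_bind_virq are
 * declared in event_channel.h, HYPERVISOR_event_channel_op() is the guest's
 * hypercall wrapper, and current_vcpu_id() stands for however the guest
 * identifies the VCPU it is running on. Binding the per-VCPU timer VIRQ
 * might then look like:
 *
 *     struct evtchn_bind_virq bind = {
 *         .virq = VIRQ_TIMER,
 *         .vcpu = current_vcpu_id()
 *     };
 *
 *     if ( HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind) == 0 )
 *         timer_port = bind.port;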
*/ /* ` enum virq { */ #define VIRQ_TIMER 0 /* V. Timebase update, and/or requested timeout. */ #define VIRQ_DEBUG 1 /* V. Request guest to dump debug info. */ #define VIRQ_CONSOLE 2 /* G. (DOM0) Bytes received on emergency console. */ #define VIRQ_DOM_EXC 3 /* G. (DOM0) Exceptional event for some domain. */ #define VIRQ_TBUF 4 /* G. (DOM0) Trace buffer has records available. */ #define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */ #define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */ #define VIRQ_CON_RING 8 /* G. (DOM0) Bytes received on console */ #define VIRQ_PCPU_STATE 9 /* G. (DOM0) PCPU state changed */ #define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ #define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 #define VIRQ_ARCH_1 17 #define VIRQ_ARCH_2 18 #define VIRQ_ARCH_3 19 #define VIRQ_ARCH_4 20 #define VIRQ_ARCH_5 21 #define VIRQ_ARCH_6 22 #define VIRQ_ARCH_7 23 /* ` } */ #define NR_VIRQS 24 /* * ` enum neg_errnoval * ` HYPERVISOR_mmu_update(const struct mmu_update reqs[], * ` unsigned count, unsigned *done_out, * ` unsigned foreigndom) * ` * @reqs is an array of mmu_update_t structures ((ptr, val) pairs). * @count is the length of the above array. * @pdone is an output parameter indicating number of completed operations * @foreigndom[15:0]: FD, the expected owner of data pages referenced in this * hypercall invocation. Can be DOMID_SELF. * @foreigndom[31:16]: PFD, the expected owner of pagetable pages referenced * in this hypercall invocation. The value of this field * (x) encodes the PFD as follows: * x == 0 => PFD == DOMID_SELF * x != 0 => PFD == x - 1 * * Sub-commands: ptr[1:0] specifies the appropriate MMU_* command. * ------------- * ptr[1:0] == MMU_NORMAL_PT_UPDATE: * Updates an entry in a page table belonging to PFD. If updating an L1 table, * and the new table entry is valid/present, the mapped frame must belong to * FD. If attempting to map an I/O page then the caller assumes the privilege * of the FD. * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller. * FD == DOMID_XEN: Map restricted areas of Xen's heap space. * ptr[:2] -- Machine address of the page-table entry to modify. * val -- Value to write. * * There also certain implicit requirements when using this hypercall. The * pages that make up a pagetable must be mapped read-only in the guest. * This prevents uncontrolled guest updates to the pagetable. Xen strictly * enforces this, and will disallow any pagetable update which will end up * mapping pagetable page RW, and will disallow using any writable page as a * pagetable. In practice it means that when constructing a page table for a * process, thread, etc, we MUST be very dilligient in following these rules: * 1). Start with top-level page (PGD or in Xen language: L4). Fill out * the entries. * 2). Keep on going, filling out the upper (PUD or L3), and middle (PMD * or L2). * 3). Start filling out the PTE table (L1) with the PTE entries. Once * done, make sure to set each of those entries to RO (so writeable bit * is unset). Once that has been completed, set the PMD (L2) for this * PTE table as RO. * 4). When completed with all of the PMD (L2) entries, and all of them have * been set to RO, make sure to set RO the PUD (L3). Do the same * operation on PGD (L4) pagetable entries that have a PUD (L3) entry. * 5). 
Now before you can use those pages (so setting the cr3), you MUST also * pin them so that the hypervisor can verify the entries. This is done * via the HYPERVISOR_mmuext_op(MMUEXT_PIN_L4_TABLE, guest physical frame * number of the PGD (L4)). And this point the HYPERVISOR_mmuext_op( * MMUEXT_NEW_BASEPTR, guest physical frame number of the PGD (L4)) can be * issued. * For 32-bit guests, the L4 is not used (as there is less pagetables), so * instead use L3. * At this point the pagetables can be modified using the MMU_NORMAL_PT_UPDATE * hypercall. Also if so desired the OS can also try to write to the PTE * and be trapped by the hypervisor (as the PTE entry is RO). * * To deallocate the pages, the operations are the reverse of the steps * mentioned above. The argument is MMUEXT_UNPIN_TABLE for all levels and the * pagetable MUST not be in use (meaning that the cr3 is not set to it). * * ptr[1:0] == MMU_MACHPHYS_UPDATE: * Updates an entry in the machine->pseudo-physical mapping table. * ptr[:2] -- Machine address within the frame whose mapping to modify. * The frame must belong to the FD, if one is specified. * val -- Value to write into the mapping entry. * * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD: * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed * with those in @val. * * @val is usually the machine frame number along with some attributes. * The attributes by default follow the architecture defined bits. Meaning that * if this is a X86_64 machine and four page table layout is used, the layout * of val is: * - 63 if set means No execute (NX) * - 46-13 the machine frame number * - 12 available for guest * - 11 available for guest * - 10 available for guest * - 9 available for guest * - 8 global * - 7 PAT (PSE is disabled, must use hypercall to make 4MB or 2MB pages) * - 6 dirty * - 5 accessed * - 4 page cached disabled * - 3 page write through * - 2 userspace accessible * - 1 writeable * - 0 present * * The one bits that does not fit with the default layout is the PAGE_PSE * also called PAGE_PAT). The MMUEXT_[UN]MARK_SUPER arguments to the * HYPERVISOR_mmuext_op serve as mechanism to set a pagetable to be 4MB * (or 2MB) instead of using the PAGE_PSE bit. * * The reason that the PAGE_PSE (bit 7) is not being utilized is due to Xen * using it as the Page Attribute Table (PAT) bit - for details on it please * refer to Intel SDM 10.12. The PAT allows to set the caching attributes of * pages instead of using MTRRs. * * The PAT MSR is as follows (it is a 64-bit value, each entry is 8 bits): * PAT4 PAT0 * +-----+-----+----+----+----+-----+----+----+ * | UC | UC- | WC | WB | UC | UC- | WC | WB | <= Linux * +-----+-----+----+----+----+-----+----+----+ * | UC | UC- | WT | WB | UC | UC- | WT | WB | <= BIOS (default when machine boots) * +-----+-----+----+----+----+-----+----+----+ * | rsv | rsv | WP | WC | UC | UC- | WT | WB | <= Xen * +-----+-----+----+----+----+-----+----+----+ * * The lookup of this index table translates to looking up * Bit 7, Bit 4, and Bit 3 of val entry: * * PAT/PSE (bit 7) ... PCD (bit 4) .. PWT (bit 3). * * If all bits are off, then we are using PAT0. If bit 3 turned on, * then we are using PAT1, if bit 3 and bit 4, then PAT2.. * * As you can see, the Linux PAT1 translates to PAT4 under Xen. Which means * that if a guest that follows Linux's PAT setup and would like to set Write * Combined on pages it MUST use PAT4 entry. Meaning that Bit 7 (PAGE_PAT) is * set. 
For example, under Linux it only uses PAT0, PAT1, and PAT2 for the * caching as: * * WB = none (so PAT0) * WC = PWT (bit 3 on) * UC = PWT | PCD (bit 3 and 4 are on). * * To make it work with Xen, it needs to translate the WC bit as so: * * PWT (so bit 3 on) --> PAT (so bit 7 is on) and clear bit 3 * * And to translate back it would: * * PAT (bit 7 on) --> PWT (bit 3 on) and clear bit 7. */ #define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ #define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ #define MMU_PT_UPDATE_PRESERVE_AD 2 /* atomically: *ptr = val | (*ptr&(A|D)) */ /* * MMU EXTENDED OPERATIONS * * ` enum neg_errnoval * ` HYPERVISOR_mmuext_op(mmuext_op_t uops[], * ` unsigned int count, * ` unsigned int *pdone, * ` unsigned int foreigndom) */ /* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. * A foreigndom (FD) can be specified (or DOMID_SELF for none). * Where the FD has some effect, it is described below. * * cmd: MMUEXT_(UN)PIN_*_TABLE * mfn: Machine frame number to be (un)pinned as a p.t. page. * The frame must belong to the FD, if one is specified. * * cmd: MMUEXT_NEW_BASEPTR * mfn: Machine frame number of new page-table base to install in MMU. * * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only] * mfn: Machine frame number of new page-table base to install in MMU * when in user space. * * cmd: MMUEXT_TLB_FLUSH_LOCAL * No additional arguments. Flushes local TLB. * * cmd: MMUEXT_INVLPG_LOCAL * linear_addr: Linear address to be flushed from the local TLB. * * cmd: MMUEXT_TLB_FLUSH_MULTI * vcpumask: Pointer to bitmap of VCPUs to be flushed. * * cmd: MMUEXT_INVLPG_MULTI * linear_addr: Linear address to be flushed. * vcpumask: Pointer to bitmap of VCPUs to be flushed. * * cmd: MMUEXT_TLB_FLUSH_ALL * No additional arguments. Flushes all VCPUs' TLBs. * * cmd: MMUEXT_INVLPG_ALL * linear_addr: Linear address to be flushed from all VCPUs' TLBs. * * cmd: MMUEXT_FLUSH_CACHE * No additional arguments. Writes back and flushes cache contents. * * cmd: MMUEXT_FLUSH_CACHE_GLOBAL * No additional arguments. Writes back and flushes cache contents * on all CPUs in the system. * * cmd: MMUEXT_SET_LDT * linear_addr: Linear address of LDT base (NB. must be page-aligned). * nr_ents: Number of entries in LDT. * * cmd: MMUEXT_CLEAR_PAGE * mfn: Machine frame number to be cleared. * * cmd: MMUEXT_COPY_PAGE * mfn: Machine frame number of the destination page. * src_mfn: Machine frame number of the source page. * * cmd: MMUEXT_[UN]MARK_SUPER * mfn: Machine frame number of head of superpage to be [un]marked. 
*/ /* ` enum mmuext_cmd { */ #define MMUEXT_PIN_L1_TABLE 0 #define MMUEXT_PIN_L2_TABLE 1 #define MMUEXT_PIN_L3_TABLE 2 #define MMUEXT_PIN_L4_TABLE 3 #define MMUEXT_UNPIN_TABLE 4 #define MMUEXT_NEW_BASEPTR 5 #define MMUEXT_TLB_FLUSH_LOCAL 6 #define MMUEXT_INVLPG_LOCAL 7 #define MMUEXT_TLB_FLUSH_MULTI 8 #define MMUEXT_INVLPG_MULTI 9 #define MMUEXT_TLB_FLUSH_ALL 10 #define MMUEXT_INVLPG_ALL 11 #define MMUEXT_FLUSH_CACHE 12 #define MMUEXT_SET_LDT 13 #define MMUEXT_NEW_USER_BASEPTR 15 #define MMUEXT_CLEAR_PAGE 16 #define MMUEXT_COPY_PAGE 17 #define MMUEXT_FLUSH_CACHE_GLOBAL 18 #define MMUEXT_MARK_SUPER 19 #define MMUEXT_UNMARK_SUPER 20 /* ` } */ #ifndef __ASSEMBLY__ struct mmuext_op { unsigned int cmd; /* => enum mmuext_cmd */ union { /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR * CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */ xen_pfn_t mfn; /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ unsigned long linear_addr; } arg1; union { /* SET_LDT */ unsigned int nr_ents; /* TLB_FLUSH_MULTI, INVLPG_MULTI */ #if __XEN_INTERFACE_VERSION__ >= 0x00030205 XEN_GUEST_HANDLE(const_void) vcpumask; #else const void *vcpumask; #endif /* COPY_PAGE */ xen_pfn_t src_mfn; } arg2; }; typedef struct mmuext_op mmuext_op_t; DEFINE_XEN_GUEST_HANDLE(mmuext_op_t); #endif /* * ` enum neg_errnoval * ` HYPERVISOR_update_va_mapping(unsigned long va, u64 val, * ` enum uvm_flags flags) * ` * ` enum neg_errnoval * ` HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, u64 val, * ` enum uvm_flags flags, * ` domid_t domid) * ` * ` @va: The virtual address whose mapping we want to change * ` @val: The new page table entry, must contain a machine address * ` @flags: Control TLB flushes */ /* These are passed as 'flags' to update_va_mapping. They can be ORed. */ /* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap. */ /* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer. */ /* ` enum uvm_flags { */ #define UVMF_NONE (0UL<<0) /* No flushing at all. */ #define UVMF_TLB_FLUSH (1UL<<0) /* Flush entire TLB(s). */ #define UVMF_INVLPG (2UL<<0) /* Flush only one entry. */ #define UVMF_FLUSHTYPE_MASK (3UL<<0) #define UVMF_MULTI (0UL<<2) /* Flush subset of TLBs. */ #define UVMF_LOCAL (0UL<<2) /* Flush local TLB. */ #define UVMF_ALL (1UL<<2) /* Flush all TLBs. */ /* ` } */ /* * Commands to HYPERVISOR_console_io(). */ #define CONSOLEIO_write 0 #define CONSOLEIO_read 1 /* * Commands to HYPERVISOR_vm_assist(). */ #define VMASST_CMD_enable 0 #define VMASST_CMD_disable 1 /* x86/32 guests: simulate full 4GB segment limits. */ #define VMASST_TYPE_4gb_segments 0 /* x86/32 guests: trap (vector 15) whenever above vmassist is used. */ #define VMASST_TYPE_4gb_segments_notify 1 /* * x86 guests: support writes to bottom-level PTEs. * NB1. Page-directory entries cannot be written. * NB2. Guest must continue to remove all writable mappings of PTEs. */ #define VMASST_TYPE_writable_pagetables 2 /* x86/PAE guests: support PDPTs above 4GB. */ #define VMASST_TYPE_pae_extended_cr3 3 #define MAX_VMASST_TYPE 3 #ifndef __ASSEMBLY__ typedef uint16_t domid_t; /* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */ #define DOMID_FIRST_RESERVED (0x7FF0U) /* DOMID_SELF is used in certain contexts to refer to oneself. */ #define DOMID_SELF (0x7FF0U) /* * DOMID_IO is used to restrict page-table updates to mapping I/O memory. * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO * is useful to ensure that no mappings to the OS's own heap are accidentally * installed. 
(e.g., in Linux this could cause havoc as reference counts * aren't adjusted on the I/O-mapping code path). * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can * be specified by any calling domain. */ #define DOMID_IO (0x7FF1U) /* * DOMID_XEN is used to allow privileged domains to map restricted parts of * Xen's heap space (e.g., the machine_to_phys table). * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if * the caller is privileged. */ #define DOMID_XEN (0x7FF2U) /* * DOMID_COW is used as the owner of sharable pages */ #define DOMID_COW (0x7FF3U) /* DOMID_INVALID is used to identify pages with unknown owner. */ #define DOMID_INVALID (0x7FF4U) /* Idle domain. */ #define DOMID_IDLE (0x7FFFU) /* * Send an array of these to HYPERVISOR_mmu_update(). * NB. The fields are natural pointer/address size for this architecture. */ struct mmu_update { uint64_t ptr; /* Machine address of PTE. */ uint64_t val; /* New contents of PTE. */ }; typedef struct mmu_update mmu_update_t; DEFINE_XEN_GUEST_HANDLE(mmu_update_t); /* * ` enum neg_errnoval * ` HYPERVISOR_multicall(multicall_entry_t call_list[], * ` unsigned int nr_calls); * * NB. The fields are natural register size for this architecture. */ struct multicall_entry { unsigned long op, result; unsigned long args[6]; }; typedef struct multicall_entry multicall_entry_t; DEFINE_XEN_GUEST_HANDLE(multicall_entry_t); #if __XEN_INTERFACE_VERSION__ < 0x00040400 /* * Event channel endpoints per domain (when using the 2-level ABI): * 1024 if a long is 32 bits; 4096 if a long is 64 bits. */ #define NR_EVENT_CHANNELS EVTCHN_2L_NR_CHANNELS #endif struct vcpu_time_info { /* * Updates to the following values are preceded and followed by an * increment of 'version'. The guest can therefore detect updates by * looking for changes to 'version'. If the least-significant bit of * the version number is set then an update is in progress and the guest * must wait to read a consistent set of values. * The correct way to interact with the version number is similar to * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry. */ uint32_t version; uint32_t pad0; uint64_t tsc_timestamp; /* TSC at last update of time vals. */ uint64_t system_time; /* Time, in nanosecs, since boot. */ /* * Current system time: * system_time + * ((((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul) >> 32) * CPU frequency (Hz): * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift */ uint32_t tsc_to_system_mul; int8_t tsc_shift; int8_t pad1[3]; }; /* 32 bytes */ typedef struct vcpu_time_info vcpu_time_info_t; struct vcpu_info { /* * 'evtchn_upcall_pending' is written non-zero by Xen to indicate * a pending notification for a particular VCPU. It is then cleared * by the guest OS /before/ checking for pending work, thus avoiding * a set-and-check race. Note that the mask is only accessed by Xen * on the CPU that is currently hosting the VCPU. This means that the * pending and mask flags can be updated by the guest without special * synchronisation (i.e., no need for the x86 LOCK prefix). * This may seem suboptimal because if the pending flag is set by * a different CPU then an IPI may be scheduled even when the mask * is set. However, note: * 1. The task of 'interrupt holdoff' is covered by the per-event- * channel mask bits. A 'noisy' event that is continually being * triggered can be masked at source at this very precise * granularity. * 2. 
The main purpose of the per-VCPU mask is therefore to restrict * reentrant execution: whether for concurrency control, or to * prevent unbounded stack usage. Whatever the purpose, we expect * that the mask will be asserted only for short periods at a time, * and so the likelihood of a 'spurious' IPI is suitably small. * The mask is read before making an event upcall to the guest: a * non-zero mask therefore guarantees that the VCPU will not receive * an upcall activation. The mask is cleared when the VCPU requests * to block: this avoids wakeup-waiting races. */ uint8_t evtchn_upcall_pending; #ifdef XEN_HAVE_PV_UPCALL_MASK uint8_t evtchn_upcall_mask; #else /* XEN_HAVE_PV_UPCALL_MASK */ uint8_t pad0; #endif /* XEN_HAVE_PV_UPCALL_MASK */ xen_ulong_t evtchn_pending_sel; struct arch_vcpu_info arch; struct vcpu_time_info time; }; /* 64 bytes (x86) */ #ifndef __XEN__ typedef struct vcpu_info vcpu_info_t; #endif /* * `incontents 200 startofday_shared Start-of-day shared data structure * Xen/kernel shared data -- pointer provided in start_info. * * This structure is defined to be both smaller than a page, and the * only data on the shared page, but may vary in actual size even within * compatible Xen versions; guests should not rely on the size * of this structure remaining constant. */ struct shared_info { struct vcpu_info vcpu_info[XEN_LEGACY_MAX_VCPUS]; /* * A domain can create "event channels" on which it can send and receive * asynchronous event notifications. There are three classes of event that * are delivered by this mechanism: * 1. Bi-directional inter- and intra-domain connections. Domains must * arrange out-of-band to set up a connection (usually by allocating * an unbound 'listener' port and avertising that via a storage service * such as xenstore). * 2. Physical interrupts. A domain with suitable hardware-access * privileges can bind an event-channel port to a physical interrupt * source. * 3. Virtual interrupts ('events'). A domain can bind an event-channel * port to a virtual interrupt source, such as the virtual-timer * device or the emergency console. * * Event channels are addressed by a "port index". Each channel is * associated with two bits of information: * 1. PENDING -- notifies the domain that there is a pending notification * to be processed. This bit is cleared by the guest. * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING * will cause an asynchronous upcall to be scheduled. This bit is only * updated by the guest. It is read-only within Xen. If a channel * becomes pending while the channel is masked then the 'edge' is lost * (i.e., when the channel is unmasked, the guest must manually handle * pending notifications as no upcall will be scheduled by Xen). * * To expedite scanning of pending notifications, any 0->1 pending * transition on an unmasked channel causes a corresponding bit in a * per-vcpu selector word to be set. Each bit in the selector covers a * 'C long' in the PENDING bitfield array. */ xen_ulong_t evtchn_pending[sizeof(xen_ulong_t) * 8]; xen_ulong_t evtchn_mask[sizeof(xen_ulong_t) * 8]; /* * Wallclock time: updated only by control software. Guests should base * their gettimeofday() syscall on this wallclock-base value. */ uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */ uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. 
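 *
 * For illustration, a guest's gettimeofday() might snapshot these fields
 * using the same retry protocol as described for vcpu_time_info above
 * (rmb() stands for whatever read barrier the guest architecture needs):
 *
 *     do {
 *         version = s->wc_version;
 *         rmb();
 *         sec  = s->wc_sec;
 *         nsec = s->wc_nsec;
 *         rmb();
 *     } while ( (version & 1) || version != s->wc_version );
 *
 * where s points at this shared_info structure.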
*/ struct arch_shared_info arch; }; #ifndef __XEN__ typedef struct shared_info shared_info_t; #endif /* * `incontents 200 startofday Start-of-day memory layout * * 1. The domain is started within contiguous virtual-memory region. * 2. The contiguous region ends on an aligned 4MB boundary. * 3. This the order of bootstrap elements in the initial virtual region: * a. relocated kernel image * b. initial ram disk [mod_start, mod_len] * c. list of allocated page frames [mfn_list, nr_pages] * (unless relocated due to XEN_ELFNOTE_INIT_P2M) * d. start_info_t structure [register ESI (x86)] * e. bootstrap page tables [pt_base and CR3 (x86)] * f. bootstrap stack [register ESP (x86)] * 4. Bootstrap elements are packed together, but each is 4kB-aligned. * 5. The initial ram disk may be omitted. * 6. The list of page frames forms a contiguous 'pseudo-physical' memory * layout for the domain. In particular, the bootstrap virtual-memory * region is a 1:1 mapping to the first section of the pseudo-physical map. * 7. All bootstrap elements are mapped read-writable for the guest OS. The * only exception is the bootstrap page table, which is mapped read-only. * 8. There is guaranteed to be at least 512kB padding after the final * bootstrap element. If necessary, the bootstrap virtual region is * extended by an extra 4MB to ensure this. * * Note: Prior to 25833:bb85bbccb1c9. ("x86/32-on-64 adjust Dom0 initial page * table layout") a bug caused the pt_base (3.e above) and cr3 to not point * to the start of the guest page tables (it was offset by two pages). * This only manifested itself on 32-on-64 dom0 kernels and not 32-on-64 domU * or 64-bit kernels of any colour. The page tables for a 32-on-64 dom0 got * allocated in the order: 'first L1','first L2', 'first L3', so the offset * to the page table base is by two pages back. The initial domain if it is * 32-bit and runs under a 64-bit hypervisor should _NOT_ use two of the * pages preceding pt_base and mark them as reserved/unused. */ #ifdef XEN_HAVE_PV_GUEST_ENTRY struct start_info { /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */ char magic[32]; /* "xen--". */ unsigned long nr_pages; /* Total pages allocated to this domain. */ unsigned long shared_info; /* MACHINE address of shared info struct. */ uint32_t flags; /* SIF_xxx flags. */ xen_pfn_t store_mfn; /* MACHINE page number of shared page. */ uint32_t store_evtchn; /* Event channel for store communication. */ union { struct { xen_pfn_t mfn; /* MACHINE page number of console page. */ uint32_t evtchn; /* Event channel for console page. */ } domU; struct { uint32_t info_off; /* Offset of console_info struct. */ uint32_t info_size; /* Size of console_info struct from start.*/ } dom0; } console; /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */ unsigned long pt_base; /* VIRTUAL address of page directory. */ unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */ unsigned long mfn_list; /* VIRTUAL address of page-frame list. */ unsigned long mod_start; /* VIRTUAL address of pre-loaded module */ /* (PFN of pre-loaded module if */ /* SIF_MOD_START_PFN set in flags). */ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ #define MAX_GUEST_CMDLINE 1024 int8_t cmd_line[MAX_GUEST_CMDLINE]; /* The pfn range here covers both page table and p->m table frames. */ unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */ unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. 
*/ }; typedef struct start_info start_info_t; /* New console union for dom0 introduced in 0x00030203. */ #if __XEN_INTERFACE_VERSION__ < 0x00030203 #define console_mfn console.domU.mfn #define console_evtchn console.domU.evtchn #endif #endif /* XEN_HAVE_PV_GUEST_ENTRY */ /* These flags are passed in the 'flags' field of start_info_t. */ #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ #define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ #define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ /* * A multiboot module is a package containing modules very similar to a * multiboot module array. The only differences are: * - the array of module descriptors is by convention simply at the beginning * of the multiboot module, * - addresses in the module descriptors are based on the beginning of the * multiboot module, * - the number of modules is determined by a termination descriptor that has * mod_start == 0. * * This permits to both build it statically and reference it in a configuration * file, and let the PV guest easily rebase the addresses to virtual addresses * and at the same time count the number of modules. */ struct xen_multiboot_mod_list { /* Address of first byte of the module */ uint32_t mod_start; /* Address of last byte of the module (inclusive) */ uint32_t mod_end; /* Address of zero-terminated command line */ uint32_t cmdline; /* Unused, must be zero */ uint32_t pad; }; /* * `incontents 200 startofday_dom0_console Dom0_console * * The console structure in start_info.console.dom0 * * This structure includes a variety of information required to * have a working VGA/VESA console. */ typedef struct dom0_vga_console_info { uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */ #define XEN_VGATYPE_TEXT_MODE_3 0x03 #define XEN_VGATYPE_VESA_LFB 0x23 #define XEN_VGATYPE_EFI_LFB 0x70 union { struct { /* Font height, in pixels. */ uint16_t font_height; /* Cursor location (column, row). */ uint16_t cursor_x, cursor_y; /* Number of rows and columns (dimensions in characters). */ uint16_t rows, columns; } text_mode_3; struct { /* Width and height, in pixels. */ uint16_t width, height; /* Bytes per scan line. */ uint16_t bytes_per_line; /* Bits per pixel. */ uint16_t bits_per_pixel; /* LFB physical address, and size (in units of 64kB). */ uint32_t lfb_base; uint32_t lfb_size; /* RGB mask offsets and sizes, as defined by VBE 1.2+ */ uint8_t red_pos, red_size; uint8_t green_pos, green_size; uint8_t blue_pos, blue_size; uint8_t rsvd_pos, rsvd_size; #if __XEN_INTERFACE_VERSION__ >= 0x00030206 /* VESA capabilities (offset 0xa, VESA command 0x4f00). */ uint32_t gbl_caps; /* Mode attributes (offset 0x0, VESA command 0x4f01). */ uint16_t mode_attrs; #endif } vesa_lfb; } u; } dom0_vga_console_info_t; #define xen_vga_console_info dom0_vga_console_info #define xen_vga_console_info_t dom0_vga_console_info_t typedef uint8_t xen_domain_handle_t[16]; /* Turn a plain number into a C unsigned long constant. */ #define __mk_unsigned_long(x) x ## UL #define mk_unsigned_long(x) __mk_unsigned_long(x) __DEFINE_XEN_GUEST_HANDLE(uint8, uint8_t); __DEFINE_XEN_GUEST_HANDLE(uint16, uint16_t); __DEFINE_XEN_GUEST_HANDLE(uint32, uint32_t); __DEFINE_XEN_GUEST_HANDLE(uint64, uint64_t); #else /* __ASSEMBLY__ */ /* In assembly code we cannot use C numeric constant suffixes. 
*/ #define mk_unsigned_long(x) x #endif /* !__ASSEMBLY__ */ /* Default definitions for macros used by domctl/sysctl. */ #if defined(__XEN__) || defined(__XEN_TOOLS__) #ifndef uint64_aligned_t #define uint64_aligned_t uint64_t #endif #ifndef XEN_GUEST_HANDLE_64 #define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name) #endif #ifndef __ASSEMBLY__ struct xenctl_bitmap { XEN_GUEST_HANDLE_64(uint8) bitmap; uint32_t nr_bits; }; #endif #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ #endif /* __XEN_PUBLIC_XEN_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/arch-x86_32.h0000664000175000017500000000241312307313555016153 0ustar smbsmb/****************************************************************************** * arch-x86_32.h * * Guest OS interface to x86 32-bit Xen. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2004-2006, K A Fraser */ #include "arch-x86/xen.h" xen-4.4.0/xen/include/public/memory.h0000664000175000017500000004077012307313555015627 0ustar smbsmb/****************************************************************************** * memory.h * * Memory reservation and information. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2005, Keir Fraser */ #ifndef __XEN_PUBLIC_MEMORY_H__ #define __XEN_PUBLIC_MEMORY_H__ #include "xen.h" /* * Increase or decrease the specified domain's memory reservation. 
Returns the * number of extents successfully allocated or freed. * arg == addr of struct xen_memory_reservation. */ #define XENMEM_increase_reservation 0 #define XENMEM_decrease_reservation 1 #define XENMEM_populate_physmap 6 #if __XEN_INTERFACE_VERSION__ >= 0x00030209 /* * Maximum # bits addressable by the user of the allocated region (e.g., I/O * devices often have a 32-bit limitation even in 64-bit systems). If zero * then the user has no addressing restriction. This field is not used by * XENMEM_decrease_reservation. */ #define XENMEMF_address_bits(x) (x) #define XENMEMF_get_address_bits(x) ((x) & 0xffu) /* NUMA node to allocate from. */ #define XENMEMF_node(x) (((x) + 1) << 8) #define XENMEMF_get_node(x) ((((x) >> 8) - 1) & 0xffu) /* Flag to populate physmap with populate-on-demand entries */ #define XENMEMF_populate_on_demand (1<<16) /* Flag to request allocation only from the node specified */ #define XENMEMF_exact_node_request (1<<17) #define XENMEMF_exact_node(n) (XENMEMF_node(n) | XENMEMF_exact_node_request) #endif struct xen_memory_reservation { /* * XENMEM_increase_reservation: * OUT: MFN (*not* GMFN) bases of extents that were allocated * XENMEM_decrease_reservation: * IN: GMFN bases of extents to free * XENMEM_populate_physmap: * IN: GPFN bases of extents to populate with memory * OUT: GMFN bases of extents that were allocated * (NB. This command also updates the mach_to_phys translation table) * XENMEM_claim_pages: * IN: must be zero */ XEN_GUEST_HANDLE(xen_pfn_t) extent_start; /* Number of extents, and size/alignment of each (2^extent_order pages). */ xen_ulong_t nr_extents; unsigned int extent_order; #if __XEN_INTERFACE_VERSION__ >= 0x00030209 /* XENMEMF flags. */ unsigned int mem_flags; #else unsigned int address_bits; #endif /* * Domain whose reservation is being changed. * Unprivileged domains can specify only DOMID_SELF. */ domid_t domid; }; typedef struct xen_memory_reservation xen_memory_reservation_t; DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t); /* * An atomic exchange of memory pages. If return code is zero then * @out.extent_list provides GMFNs of the newly-allocated memory. * Returns zero on complete success, otherwise a negative error code. * On complete success then always @nr_exchanged == @in.nr_extents. * On partial success @nr_exchanged indicates how much work was done. */ #define XENMEM_exchange 11 struct xen_memory_exchange { /* * [IN] Details of memory extents to be exchanged (GMFN bases). * Note that @in.address_bits is ignored and unused. */ struct xen_memory_reservation in; /* * [IN/OUT] Details of new memory extents. * We require that: * 1. @in.domid == @out.domid * 2. @in.nr_extents << @in.extent_order == * @out.nr_extents << @out.extent_order * 3. @in.extent_start and @out.extent_start lists must not overlap * 4. @out.extent_start lists GPFN bases to be populated * 5. @out.extent_start is overwritten with allocated GMFN bases */ struct xen_memory_reservation out; /* * [OUT] Number of input extents that were successfully exchanged: * 1. The first @nr_exchanged input extents were successfully * deallocated. * 2. The corresponding first entries in the output extent list correctly * indicate the GMFNs that were successfully exchanged. * 3. All other input and output extents are untouched. * 4. If not all input exents are exchanged then the return code of this * command will be non-zero. * 5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER! 
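 *
 * For illustration (HYPERVISOR_memory_op() is the guest's hypercall wrapper,
 * set_xen_guest_handle() is the usual helper for filling guest handles, and
 * gmfn_to_free / gpfn_to_fill are xen_pfn_t variables chosen by the guest),
 * exchanging a single order-0 extent might look like:
 *
 *     struct xen_memory_exchange xchg = {
 *         .in  = { .nr_extents = 1, .extent_order = 0, .domid = DOMID_SELF },
 *         .out = { .nr_extents = 1, .extent_order = 0, .domid = DOMID_SELF },
 *         .nr_exchanged = 0
 *     };
 *
 *     set_xen_guest_handle(xchg.in.extent_start,  &gmfn_to_free);
 *     set_xen_guest_handle(xchg.out.extent_start, &gpfn_to_fill);
 *     rc = HYPERVISOR_memory_op(XENMEM_exchange, &xchg);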
*/ xen_ulong_t nr_exchanged; }; typedef struct xen_memory_exchange xen_memory_exchange_t; DEFINE_XEN_GUEST_HANDLE(xen_memory_exchange_t); /* * Returns the maximum machine frame number of mapped RAM in this system. * This command always succeeds (it never returns an error code). * arg == NULL. */ #define XENMEM_maximum_ram_page 2 /* * Returns the current or maximum memory reservation, in pages, of the * specified domain (may be DOMID_SELF). Returns -ve errcode on failure. * arg == addr of domid_t. */ #define XENMEM_current_reservation 3 #define XENMEM_maximum_reservation 4 /* * Returns the maximum GPFN in use by the guest, or -ve errcode on failure. */ #define XENMEM_maximum_gpfn 14 /* * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys * mapping table. Architectures which do not have a m2p table do not implement * this command. * arg == addr of xen_machphys_mfn_list_t. */ #define XENMEM_machphys_mfn_list 5 struct xen_machphys_mfn_list { /* * Size of the 'extent_start' array. Fewer entries will be filled if the * machphys table is smaller than max_extents * 2MB. */ unsigned int max_extents; /* * Pointer to buffer to fill with list of extent starts. If there are * any large discontiguities in the machine address space, 2MB gaps in * the machphys table will be represented by an MFN base of zero. */ XEN_GUEST_HANDLE(xen_pfn_t) extent_start; /* * Number of extents written to the above array. This will be smaller * than 'max_extents' if the machphys table is smaller than max_e * 2MB. */ unsigned int nr_extents; }; typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t; DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t); /* * Returns the location in virtual address space of the machine_to_phys * mapping table. Architectures which do not have a m2p table, or which do not * map it by default into guest address space, do not implement this command. * arg == addr of xen_machphys_mapping_t. */ #define XENMEM_machphys_mapping 12 struct xen_machphys_mapping { xen_ulong_t v_start, v_end; /* Start and end virtual addresses. */ xen_ulong_t max_mfn; /* Maximum MFN that can be looked up. */ }; typedef struct xen_machphys_mapping xen_machphys_mapping_t; DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t); /* Source mapping space. */ /* ` enum phys_map_space { */ #define XENMAPSPACE_shared_info 0 /* shared info page */ #define XENMAPSPACE_grant_table 1 /* grant table page */ #define XENMAPSPACE_gmfn 2 /* GMFN */ #define XENMAPSPACE_gmfn_range 3 /* GMFN range, XENMEM_add_to_physmap only. */ #define XENMAPSPACE_gmfn_foreign 4 /* GMFN from another dom, * XENMEM_add_to_physmap_batch only. */ /* ` } */ /* * Sets the GPFN at which a particular page appears in the specified guest's * pseudophysical address space. * arg == addr of xen_add_to_physmap_t. */ #define XENMEM_add_to_physmap 7 struct xen_add_to_physmap { /* Which domain to change the mapping for. */ domid_t domid; /* Number of pages to go through for gmfn_range */ uint16_t size; unsigned int space; /* => enum phys_map_space */ #define XENMAPIDX_grant_table_status 0x80000000 /* Index into space being mapped. */ xen_ulong_t idx; /* GPFN in domid where the source mapping page should appear. */ xen_pfn_t gpfn; }; typedef struct xen_add_to_physmap xen_add_to_physmap_t; DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t); /* A batched version of add_to_physmap. */ #define XENMEM_add_to_physmap_batch 23 struct xen_add_to_physmap_batch { /* IN */ /* Which domain to change the mapping for. 
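 *
 * For illustration, the (non-batched) XENMEM_add_to_physmap above is commonly
 * used to place the shared info page at a guest-chosen GPFN
 * (HYPERVISOR_memory_op() is the guest's hypercall wrapper and chosen_gpfn is
 * picked by the guest):
 *
 *     struct xen_add_to_physmap xatp = {
 *         .domid = DOMID_SELF,
 *         .space = XENMAPSPACE_shared_info,
 *         .idx   = 0,
 *         .gpfn  = chosen_gpfn
 *     };
 *
 *     HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);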
*/ domid_t domid; uint16_t space; /* => enum phys_map_space */ /* Number of pages to go through */ uint16_t size; domid_t foreign_domid; /* IFF gmfn_foreign */ /* Indexes into space being mapped. */ XEN_GUEST_HANDLE(xen_ulong_t) idxs; /* GPFN in domid where the source mapping page should appear. */ XEN_GUEST_HANDLE(xen_pfn_t) gpfns; /* OUT */ /* Per index error code. */ XEN_GUEST_HANDLE(int) errs; }; typedef struct xen_add_to_physmap_batch xen_add_to_physmap_batch_t; DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_batch_t); #if __XEN_INTERFACE_VERSION__ < 0x00040400 #define XENMEM_add_to_physmap_range XENMEM_add_to_physmap_batch #define xen_add_to_physmap_range xen_add_to_physmap_batch typedef struct xen_add_to_physmap_batch xen_add_to_physmap_range_t; DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_range_t); #endif /* * Unmaps the page appearing at a particular GPFN from the specified guest's * pseudophysical address space. * arg == addr of xen_remove_from_physmap_t. */ #define XENMEM_remove_from_physmap 15 struct xen_remove_from_physmap { /* Which domain to change the mapping for. */ domid_t domid; /* GPFN of the current mapping of the page. */ xen_pfn_t gpfn; }; typedef struct xen_remove_from_physmap xen_remove_from_physmap_t; DEFINE_XEN_GUEST_HANDLE(xen_remove_from_physmap_t); /*** REMOVED ***/ /*#define XENMEM_translate_gpfn_list 8*/ /* * Returns the pseudo-physical memory map as it was when the domain * was started (specified by XENMEM_set_memory_map). * arg == addr of xen_memory_map_t. */ #define XENMEM_memory_map 9 struct xen_memory_map { /* * On call the number of entries which can be stored in buffer. On * return the number of entries which have been stored in * buffer. */ unsigned int nr_entries; /* * Entries in the buffer are in the same format as returned by the * BIOS INT 0x15 EAX=0xE820 call. */ XEN_GUEST_HANDLE(void) buffer; }; typedef struct xen_memory_map xen_memory_map_t; DEFINE_XEN_GUEST_HANDLE(xen_memory_map_t); /* * Returns the real physical memory map. Passes the same structure as * XENMEM_memory_map. * arg == addr of xen_memory_map_t. */ #define XENMEM_machine_memory_map 10 /* * Set the pseudo-physical memory map of a domain, as returned by * XENMEM_memory_map. * arg == addr of xen_foreign_memory_map_t. */ #define XENMEM_set_memory_map 13 struct xen_foreign_memory_map { domid_t domid; struct xen_memory_map map; }; typedef struct xen_foreign_memory_map xen_foreign_memory_map_t; DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t); #define XENMEM_set_pod_target 16 #define XENMEM_get_pod_target 17 struct xen_pod_target { /* IN */ uint64_t target_pages; /* OUT */ uint64_t tot_pages; uint64_t pod_cache_pages; uint64_t pod_entries; /* IN */ domid_t domid; }; typedef struct xen_pod_target xen_pod_target_t; #if defined(__XEN__) || defined(__XEN_TOOLS__) #ifndef uint64_aligned_t #define uint64_aligned_t uint64_t #endif /* * Get the number of MFNs saved through memory sharing. * The call never fails. 
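 *
 * For illustration (HYPERVISOR_memory_op() is the caller's hypercall wrapper;
 * neither query takes an argument structure):
 *
 *     long freed  = HYPERVISOR_memory_op(XENMEM_get_sharing_freed_pages, NULL);
 *     long shared = HYPERVISOR_memory_op(XENMEM_get_sharing_shared_pages, NULL);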
*/ #define XENMEM_get_sharing_freed_pages 18 #define XENMEM_get_sharing_shared_pages 19 #define XENMEM_paging_op 20 #define XENMEM_paging_op_nominate 0 #define XENMEM_paging_op_evict 1 #define XENMEM_paging_op_prep 2 #define XENMEM_access_op 21 #define XENMEM_access_op_resume 0 struct xen_mem_event_op { uint8_t op; /* XENMEM_*_op_* */ domid_t domain; /* PAGING_PREP IN: buffer to immediately fill page in */ uint64_aligned_t buffer; /* Other OPs */ uint64_aligned_t gfn; /* IN: gfn of page being operated on */ }; typedef struct xen_mem_event_op xen_mem_event_op_t; DEFINE_XEN_GUEST_HANDLE(xen_mem_event_op_t); #define XENMEM_sharing_op 22 #define XENMEM_sharing_op_nominate_gfn 0 #define XENMEM_sharing_op_nominate_gref 1 #define XENMEM_sharing_op_share 2 #define XENMEM_sharing_op_resume 3 #define XENMEM_sharing_op_debug_gfn 4 #define XENMEM_sharing_op_debug_mfn 5 #define XENMEM_sharing_op_debug_gref 6 #define XENMEM_sharing_op_add_physmap 7 #define XENMEM_sharing_op_audit 8 #define XENMEM_SHARING_OP_S_HANDLE_INVALID (-10) #define XENMEM_SHARING_OP_C_HANDLE_INVALID (-9) /* The following allows sharing of grant refs. This is useful * for sharing utilities sitting as "filters" in IO backends * (e.g. memshr + blktap(2)). The IO backend is only exposed * to grant references, and this allows sharing of the grefs */ #define XENMEM_SHARING_OP_FIELD_IS_GREF_FLAG (1ULL << 62) #define XENMEM_SHARING_OP_FIELD_MAKE_GREF(field, val) \ (field) = (XENMEM_SHARING_OP_FIELD_IS_GREF_FLAG | val) #define XENMEM_SHARING_OP_FIELD_IS_GREF(field) \ ((field) & XENMEM_SHARING_OP_FIELD_IS_GREF_FLAG) #define XENMEM_SHARING_OP_FIELD_GET_GREF(field) \ ((field) & (~XENMEM_SHARING_OP_FIELD_IS_GREF_FLAG)) struct xen_mem_sharing_op { uint8_t op; /* XENMEM_sharing_op_* */ domid_t domain; union { struct mem_sharing_op_nominate { /* OP_NOMINATE_xxx */ union { uint64_aligned_t gfn; /* IN: gfn to nominate */ uint32_t grant_ref; /* IN: grant ref to nominate */ } u; uint64_aligned_t handle; /* OUT: the handle */ } nominate; struct mem_sharing_op_share { /* OP_SHARE/ADD_PHYSMAP */ uint64_aligned_t source_gfn; /* IN: the gfn of the source page */ uint64_aligned_t source_handle; /* IN: handle to the source page */ uint64_aligned_t client_gfn; /* IN: the client gfn */ uint64_aligned_t client_handle; /* IN: handle to the client page */ domid_t client_domain; /* IN: the client domain id */ } share; struct mem_sharing_op_debug { /* OP_DEBUG_xxx */ union { uint64_aligned_t gfn; /* IN: gfn to debug */ uint64_aligned_t mfn; /* IN: mfn to debug */ uint32_t gref; /* IN: gref to debug */ } u; } debug; } u; }; typedef struct xen_mem_sharing_op xen_mem_sharing_op_t; DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t); /* * Attempt to stake a claim for a domain on a quantity of pages * of system RAM, but _not_ assign specific pageframes. Only * arithmetic is performed so the hypercall is very fast and need * not be preemptible, thus sidestepping time-of-check-time-of-use * races for memory allocation. Returns 0 if the hypervisor page * allocator has atomically and successfully claimed the requested * number of pages, else non-zero. * * Any domain may have only one active claim. When sufficient memory * has been allocated to resolve the claim, the claim silently expires. * Claiming zero pages effectively resets any outstanding claim and * is always successful. * * Note that a valid claim may be staked even after memory has been * allocated for a domain. In this case, the claim is not incremental, * i.e. 
if the domain's tot_pages is 3, and a claim is staked for 10, * only 7 additional pages are claimed. * * Caller must be privileged or the hypercall fails. */ #define XENMEM_claim_pages 24 /* * XENMEM_claim_pages flags - the are no flags at this time. * The zero value is appropiate. */ #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ #endif /* __XEN_PUBLIC_MEMORY_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/sched.h0000664000175000017500000001417512307313555015405 0ustar smbsmb/****************************************************************************** * sched.h * * Scheduler state interactions * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2005, Keir Fraser */ #ifndef __XEN_PUBLIC_SCHED_H__ #define __XEN_PUBLIC_SCHED_H__ #include "event_channel.h" /* * `incontents 150 sched Guest Scheduler Operations * * The SCHEDOP interface provides mechanisms for a guest to interact * with the scheduler, including yield, blocking and shutting itself * down. */ /* * The prototype for this hypercall is: * ` long HYPERVISOR_sched_op(enum sched_op cmd, void *arg, ...) * * @cmd == SCHEDOP_??? (scheduler operation). * @arg == Operation-specific extra argument(s), as described below. * ... == Additional Operation-specific extra arguments, described below. * * Versions of Xen prior to 3.0.2 provided only the following legacy version * of this hypercall, supporting only the commands yield, block and shutdown: * long sched_op(int cmd, unsigned long arg) * @cmd == SCHEDOP_??? (scheduler operation). * @arg == 0 (SCHEDOP_yield and SCHEDOP_block) * == SHUTDOWN_* code (SCHEDOP_shutdown) * * This legacy version is available to new guests as: * ` long HYPERVISOR_sched_op_compat(enum sched_op cmd, unsigned long arg) */ /* ` enum sched_op { // SCHEDOP_* => struct sched_* */ /* * Voluntarily yield the CPU. * @arg == NULL. */ #define SCHEDOP_yield 0 /* * Block execution of this VCPU until an event is received for processing. * If called with event upcalls masked, this operation will atomically * reenable event delivery and check for pending events before blocking the * VCPU. This avoids a "wakeup waiting" race. * @arg == NULL. */ #define SCHEDOP_block 1 /* * Halt execution of this domain (all VCPUs) and notify the system controller. * @arg == pointer to sched_shutdown_t structure. 
* * If the sched_shutdown_t reason is SHUTDOWN_suspend then this * hypercall takes an additional extra argument which should be the * MFN of the guest's start_info_t. * * In addition, which reason is SHUTDOWN_suspend this hypercall * returns 1 if suspend was cancelled or the domain was merely * checkpointed, and 0 if it is resuming in a new domain. */ #define SCHEDOP_shutdown 2 /* * Poll a set of event-channel ports. Return when one or more are pending. An * optional timeout may be specified. * @arg == pointer to sched_poll_t structure. */ #define SCHEDOP_poll 3 /* * Declare a shutdown for another domain. The main use of this function is * in interpreting shutdown requests and reasons for fully-virtualized * domains. A para-virtualized domain may use SCHEDOP_shutdown directly. * @arg == pointer to sched_remote_shutdown_t structure. */ #define SCHEDOP_remote_shutdown 4 /* * Latch a shutdown code, so that when the domain later shuts down it * reports this code to the control tools. * @arg == sched_shutdown_t, as for SCHEDOP_shutdown. */ #define SCHEDOP_shutdown_code 5 /* * Setup, poke and destroy a domain watchdog timer. * @arg == pointer to sched_watchdog_t structure. * With id == 0, setup a domain watchdog timer to cause domain shutdown * after timeout, returns watchdog id. * With id != 0 and timeout == 0, destroy domain watchdog timer. * With id != 0 and timeout != 0, poke watchdog timer and set new timeout. */ #define SCHEDOP_watchdog 6 /* ` } */ struct sched_shutdown { unsigned int reason; /* SHUTDOWN_* => enum sched_shutdown_reason */ }; typedef struct sched_shutdown sched_shutdown_t; DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t); struct sched_poll { XEN_GUEST_HANDLE(evtchn_port_t) ports; unsigned int nr_ports; uint64_t timeout; }; typedef struct sched_poll sched_poll_t; DEFINE_XEN_GUEST_HANDLE(sched_poll_t); struct sched_remote_shutdown { domid_t domain_id; /* Remote domain ID */ unsigned int reason; /* SHUTDOWN_* => enum sched_shutdown_reason */ }; typedef struct sched_remote_shutdown sched_remote_shutdown_t; DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t); struct sched_watchdog { uint32_t id; /* watchdog ID */ uint32_t timeout; /* timeout */ }; typedef struct sched_watchdog sched_watchdog_t; DEFINE_XEN_GUEST_HANDLE(sched_watchdog_t); /* * Reason codes for SCHEDOP_shutdown. These may be interpreted by control * software to determine the appropriate action. For the most part, Xen does * not care about the shutdown code. */ /* ` enum sched_shutdown_reason { */ #define SHUTDOWN_poweroff 0 /* Domain exited normally. Clean up and kill. */ #define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */ #define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */ #define SHUTDOWN_crash 3 /* Tell controller we've crashed. */ #define SHUTDOWN_watchdog 4 /* Restart because watchdog time expired. */ #define SHUTDOWN_MAX 4 /* Maximum valid shutdown reason. */ /* ` } */ #endif /* __XEN_PUBLIC_SCHED_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/arch-arm.h0000664000175000017500000003303312307313555016003 0ustar smbsmb/****************************************************************************** * arch-arm.h * * Guest OS interface to ARM Xen. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright 2011 (C) Citrix Systems */ #ifndef __XEN_PUBLIC_ARCH_ARM_H__ #define __XEN_PUBLIC_ARCH_ARM_H__ /* * `incontents 50 arm_abi Hypercall Calling Convention * * A hypercall is issued using the ARM HVC instruction. * * A hypercall can take up to 5 arguments. These are passed in * registers, the first argument in x0/r0 (for arm64/arm32 guests * respectively irrespective of whether the underlying hypervisor is * 32- or 64-bit), the second argument in x1/r1, the third in x2/r2, * the forth in x3/r3 and the fifth in x4/r4. * * The hypercall number is passed in r12 (arm) or x16 (arm64). In both * cases the relevant ARM procedure calling convention specifies this * is an inter-procedure-call scratch register (e.g. for use in linker * stubs). This use does not conflict with use during a hypercall. * * The HVC ISS must contain a Xen specific TAG: XEN_HYPERCALL_TAG. * * The return value is in x0/r0. * * The hypercall will clobber x16/r12 and the argument registers used * by that hypercall (except r0 which is the return value) i.e. in * addition to x16/r12 a 2 argument hypercall will clobber x1/r1 and a * 4 argument hypercall will clobber x1/r1, x2/r2 and x3/r3. * * Parameter structs passed to hypercalls are laid out according to * the Procedure Call Standard for the ARM Architecture (AAPCS, AKA * EABI) and Procedure Call Standard for the ARM 64-bit Architecture * (AAPCS64). Where there is a conflict the 64-bit standard should be * used regardless of guest type. Structures which are passed as * hypercall arguments are always little endian. * * All memory which is shared with other entities in the system * (including the hypervisor and other guests) must reside in memory * which is mapped as Normal Inner-cacheable. This applies to: * - hypercall arguments passed via a pointer to guest memory. * - memory shared via the grant table mechanism (including PV I/O * rings etc). * - memory shared with the hypervisor (struct shared_info, struct * vcpu_info, the grant table, etc). * * Any Inner cache allocation strategy (Write-Back, Write-Through etc) * is acceptable. There is no restriction on the Outer-cacheability. */ /* * `incontents 55 arm_hcall Supported Hypercalls * * Xen on ARM makes extensive use of hardware facilities and therefore * only a subset of the potential hypercalls are required. 
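 *
 * As an illustration of the calling convention described above, a
 * two-argument hypercall issued from an arm64 guest could be coded
 * roughly as follows (a sketch only; 'cmd' and 'arg' are hypothetical
 * locals, the hypercall index constants such as __HYPERVISOR_sched_op
 * live in xen.h, and the 0xEA1 immediate is XEN_HYPERCALL_TAG, defined
 * later in this header):
 *
 *     register uint64_t nr asm("x16") = __HYPERVISOR_sched_op;
 *     register uint64_t a0 asm("x0")  = cmd;
 *     register uint64_t a1 asm("x1")  = (uint64_t)arg;
 *     asm volatile ("hvc #0xEA1"
 *                   : "+r" (a0), "+r" (a1), "+r" (nr) : : "memory");
 *
 * On return the result is in a0 (i.e. x0), as stated above.
 *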
* * Since ARM uses second stage paging any machine/physical addresses * passed to hypercalls are Guest Physical Addresses (Intermediate * Physical Addresses) unless otherwise noted. * * The following hypercalls (and sub operations) are supported on the * ARM platform. Other hypercalls should be considered * unavailable/unsupported. * * HYPERVISOR_memory_op * All generic sub-operations. * * In addition the following arch specific sub-ops: * * XENMEM_add_to_physmap * * XENMEM_add_to_physmap_batch * * HYPERVISOR_domctl * All generic sub-operations, with the exception of: * * XEN_DOMCTL_iomem_permission (not yet implemented) * * XEN_DOMCTL_irq_permission (not yet implemented) * * HYPERVISOR_sched_op * All generic sub-operations, with the exception of: * * SCHEDOP_block -- prefer wfi hardware instruction * * HYPERVISOR_console_io * All generic sub-operations * * HYPERVISOR_xen_version * All generic sub-operations * * HYPERVISOR_event_channel_op * All generic sub-operations * * HYPERVISOR_physdev_op * No sub-operations are currenty supported * * HYPERVISOR_sysctl * All generic sub-operations, with the exception of: * * XEN_SYSCTL_page_offline_op * * XEN_SYSCTL_get_pmstat * * XEN_SYSCTL_pm_op * * HYPERVISOR_hvm_op * Exactly these sub-operations are supported: * * HVMOP_set_param * * HVMOP_get_param * * HYPERVISOR_grant_table_op * All generic sub-operations * * HYPERVISOR_vcpu_op * Exactly these sub-operations are supported: * * VCPUOP_register_vcpu_info * * VCPUOP_register_runstate_memory_area * * * Other notes on the ARM ABI: * * - struct start_info is not exported to ARM guests. * * - struct shared_info is mapped by ARM guests using the * HYPERVISOR_memory_op sub-op XENMEM_add_to_physmap, passing * XENMAPSPACE_shared_info as space parameter. * * - All the per-cpu struct vcpu_info are mapped by ARM guests using the * HYPERVISOR_vcpu_op sub-op VCPUOP_register_vcpu_info, including cpu0 * struct vcpu_info. * * - The grant table is mapped using the HYPERVISOR_memory_op sub-op * XENMEM_add_to_physmap, passing XENMAPSPACE_grant_table as space * parameter. The memory range specified under the Xen compatible * hypervisor node on device tree can be used as target gpfn for the * mapping. * * - Xenstore is initialized by using the two hvm_params * HVM_PARAM_STORE_PFN and HVM_PARAM_STORE_EVTCHN. They can be read * with the HYPERVISOR_hvm_op sub-op HVMOP_get_param. * * - The paravirtualized console is initialized by using the two * hvm_params HVM_PARAM_CONSOLE_PFN and HVM_PARAM_CONSOLE_EVTCHN. They * can be read with the HYPERVISOR_hvm_op sub-op HVMOP_get_param. * * - Event channel notifications are delivered using the percpu GIC * interrupt specified under the Xen compatible hypervisor node on * device tree. * * - The device tree Xen compatible node is fully described under Linux * at Documentation/devicetree/bindings/arm/xen.txt. */ #define XEN_HYPERCALL_TAG 0XEA1 #define uint64_aligned_t uint64_t __attribute__((aligned(8))) #ifndef __ASSEMBLY__ #define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef union { type *p; unsigned long q; } \ __guest_handle_ ## name; \ typedef union { type *p; uint64_aligned_t q; } \ __guest_handle_64_ ## name; /* * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field * in a struct in memory. On ARM is always 8 bytes sizes and 8 bytes * aligned. * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an * hypercall argument. It is 4 bytes on aarch and 8 bytes on aarch64. 
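 *
 * Usage illustration (a sketch only; the macros involved are defined
 * immediately below, 'buf' and 'data' are hypothetical names, and the
 * uint8 handle type is assumed to have been declared elsewhere via
 * DEFINE_XEN_GUEST_HANDLE):
 *
 *     XEN_GUEST_HANDLE(uint8) buf;
 *     set_xen_guest_handle(buf, data);
 *
 * In practice such a handle is normally a field of one of the ABI
 * structures rather than a standalone variable.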
*/ #define __DEFINE_XEN_GUEST_HANDLE(name, type) \ ___DEFINE_XEN_GUEST_HANDLE(name, type); \ ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type) #define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) #define __XEN_GUEST_HANDLE(name) __guest_handle_64_ ## name #define XEN_GUEST_HANDLE(name) __XEN_GUEST_HANDLE(name) /* this is going to be changed on 64 bit */ #define XEN_GUEST_HANDLE_PARAM(name) __guest_handle_ ## name #define set_xen_guest_handle_raw(hnd, val) \ do { \ typeof(&(hnd)) _sxghr_tmp = &(hnd); \ _sxghr_tmp->q = 0; \ _sxghr_tmp->p = val; \ } while ( 0 ) #ifdef __XEN_TOOLS__ #define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) #endif #define set_xen_guest_handle(hnd, val) set_xen_guest_handle_raw(hnd, val) #if defined(__GNUC__) && !defined(__STRICT_ANSI__) /* Anonymous union includes both 32- and 64-bit names (e.g., r0/x0). */ # define __DECL_REG(n64, n32) union { \ uint64_t n64; \ uint32_t n32; \ } #else /* Non-gcc sources must always use the proper 64-bit name (e.g., x0). */ #define __DECL_REG(n64, n32) uint64_t n64 #endif struct vcpu_guest_core_regs { /* Aarch64 Aarch32 */ __DECL_REG(x0, r0_usr); __DECL_REG(x1, r1_usr); __DECL_REG(x2, r2_usr); __DECL_REG(x3, r3_usr); __DECL_REG(x4, r4_usr); __DECL_REG(x5, r5_usr); __DECL_REG(x6, r6_usr); __DECL_REG(x7, r7_usr); __DECL_REG(x8, r8_usr); __DECL_REG(x9, r9_usr); __DECL_REG(x10, r10_usr); __DECL_REG(x11, r11_usr); __DECL_REG(x12, r12_usr); __DECL_REG(x13, sp_usr); __DECL_REG(x14, lr_usr); __DECL_REG(x15, __unused_sp_hyp); __DECL_REG(x16, lr_irq); __DECL_REG(x17, sp_irq); __DECL_REG(x18, lr_svc); __DECL_REG(x19, sp_svc); __DECL_REG(x20, lr_abt); __DECL_REG(x21, sp_abt); __DECL_REG(x22, lr_und); __DECL_REG(x23, sp_und); __DECL_REG(x24, r8_fiq); __DECL_REG(x25, r9_fiq); __DECL_REG(x26, r10_fiq); __DECL_REG(x27, r11_fiq); __DECL_REG(x28, r12_fiq); __DECL_REG(x29, sp_fiq); __DECL_REG(x30, lr_fiq); /* Return address and mode */ __DECL_REG(pc64, pc32); /* ELR_EL2 */ uint32_t cpsr; /* SPSR_EL2 */ union { uint32_t spsr_el1; /* AArch64 */ uint32_t spsr_svc; /* AArch32 */ }; /* AArch32 guests only */ uint32_t spsr_fiq, spsr_irq, spsr_und, spsr_abt; /* AArch64 guests only */ uint64_t sp_el0; uint64_t sp_el1, elr_el1; }; typedef struct vcpu_guest_core_regs vcpu_guest_core_regs_t; DEFINE_XEN_GUEST_HANDLE(vcpu_guest_core_regs_t); #undef __DECL_REG typedef uint64_t xen_pfn_t; #define PRI_xen_pfn PRIx64 /* Maximum number of virtual CPUs in legacy multi-processor guests. */ /* Only one. 
All other VCPUS must use VCPUOP_register_vcpu_info */ #define XEN_LEGACY_MAX_VCPUS 1 typedef uint64_t xen_ulong_t; #define PRI_xen_ulong PRIx64 #if defined(__XEN__) || defined(__XEN_TOOLS__) struct vcpu_guest_context { #define _VGCF_online 0 #define VGCF_online (1<<_VGCF_online) uint32_t flags; /* VGCF_* */ struct vcpu_guest_core_regs user_regs; /* Core CPU registers */ uint32_t sctlr; uint64_t ttbcr, ttbr0, ttbr1; }; typedef struct vcpu_guest_context vcpu_guest_context_t; DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t); #endif struct arch_vcpu_info { }; typedef struct arch_vcpu_info arch_vcpu_info_t; struct arch_shared_info { }; typedef struct arch_shared_info arch_shared_info_t; typedef uint64_t xen_callback_t; #endif #if defined(__XEN__) || defined(__XEN_TOOLS__) /* PSR bits (CPSR, SPSR)*/ #define PSR_THUMB (1<<5) /* Thumb Mode enable */ #define PSR_FIQ_MASK (1<<6) /* Fast Interrupt mask */ #define PSR_IRQ_MASK (1<<7) /* Interrupt mask */ #define PSR_ABT_MASK (1<<8) /* Asynchronous Abort mask */ #define PSR_BIG_ENDIAN (1<<9) /* arm32: Big Endian Mode */ #define PSR_DBG_MASK (1<<9) /* arm64: Debug Exception mask */ #define PSR_IT_MASK (0x0600fc00) /* Thumb If-Then Mask */ #define PSR_JAZELLE (1<<24) /* Jazelle Mode */ /* 32 bit modes */ #define PSR_MODE_USR 0x10 #define PSR_MODE_FIQ 0x11 #define PSR_MODE_IRQ 0x12 #define PSR_MODE_SVC 0x13 #define PSR_MODE_MON 0x16 #define PSR_MODE_ABT 0x17 #define PSR_MODE_HYP 0x1a #define PSR_MODE_UND 0x1b #define PSR_MODE_SYS 0x1f /* 64 bit modes */ #define PSR_MODE_BIT 0x10 /* Set iff AArch32 */ #define PSR_MODE_EL3h 0x0d #define PSR_MODE_EL3t 0x0c #define PSR_MODE_EL2h 0x09 #define PSR_MODE_EL2t 0x08 #define PSR_MODE_EL1h 0x05 #define PSR_MODE_EL1t 0x04 #define PSR_MODE_EL0t 0x00 #define PSR_GUEST32_INIT (PSR_ABT_MASK|PSR_FIQ_MASK|PSR_IRQ_MASK|PSR_MODE_SVC) #define PSR_GUEST64_INIT (PSR_ABT_MASK|PSR_FIQ_MASK|PSR_IRQ_MASK|PSR_MODE_EL1h) #define SCTLR_GUEST_INIT 0x00c50078 /* * Virtual machine platform (memory layout, interrupts) * * These are defined for consistency between the tools and the * hypervisor. Guests must not rely on these hardcoded values but * should instead use the FDT. */ /* Physical Address Space */ #define GUEST_GICD_BASE 0x2c001000ULL #define GUEST_GICD_SIZE 0x1000ULL #define GUEST_GICC_BASE 0x2c002000ULL #define GUEST_GICC_SIZE 0x100ULL #define GUEST_RAM_BASE 0x80000000ULL #define GUEST_GNTTAB_BASE 0xb0000000ULL #define GUEST_GNTTAB_SIZE 0x00020000ULL /* Interrupts */ #define GUEST_TIMER_VIRT_PPI 27 #define GUEST_TIMER_PHYS_S_PPI 29 #define GUEST_TIMER_PHYS_NS_PPI 30 #define GUEST_EVTCHN_PPI 31 /* PSCI functions */ #define PSCI_cpu_suspend 0 #define PSCI_cpu_off 1 #define PSCI_cpu_on 2 #define PSCI_migrate 3 #endif #endif /* __XEN_PUBLIC_ARCH_ARM_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/elfnote.h0000664000175000017500000002014112307313555015741 0ustar smbsmb/****************************************************************************** * elfnote.h * * Definitions used for the Xen ELF notes. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2006, Ian Campbell, XenSource Ltd. */ #ifndef __XEN_PUBLIC_ELFNOTE_H__ #define __XEN_PUBLIC_ELFNOTE_H__ /* * `incontents 200 elfnotes ELF notes * * The notes should live in a PT_NOTE segment and have "Xen" in the * name field. * * Numeric types are either 4 or 8 bytes depending on the content of * the desc field. * * LEGACY indicated the fields in the legacy __xen_guest string which * this a note type replaces. * * String values (for non-legacy) are NULL terminated ASCII, also known * as ASCIZ type. */ /* * NAME=VALUE pair (string). */ #define XEN_ELFNOTE_INFO 0 /* * The virtual address of the entry point (numeric). * * LEGACY: VIRT_ENTRY */ #define XEN_ELFNOTE_ENTRY 1 /* The virtual address of the hypercall transfer page (numeric). * * LEGACY: HYPERCALL_PAGE. (n.b. legacy value is a physical page * number not a virtual address) */ #define XEN_ELFNOTE_HYPERCALL_PAGE 2 /* The virtual address where the kernel image should be mapped (numeric). * * Defaults to 0. * * LEGACY: VIRT_BASE */ #define XEN_ELFNOTE_VIRT_BASE 3 /* * The offset of the ELF paddr field from the actual required * pseudo-physical address (numeric). * * This is used to maintain backwards compatibility with older kernels * which wrote __PAGE_OFFSET into that field. This field defaults to 0 * if not present. * * LEGACY: ELF_PADDR_OFFSET. (n.b. legacy default is VIRT_BASE) */ #define XEN_ELFNOTE_PADDR_OFFSET 4 /* * The version of Xen that we work with (string). * * LEGACY: XEN_VER */ #define XEN_ELFNOTE_XEN_VERSION 5 /* * The name of the guest operating system (string). * * LEGACY: GUEST_OS */ #define XEN_ELFNOTE_GUEST_OS 6 /* * The version of the guest operating system (string). * * LEGACY: GUEST_VER */ #define XEN_ELFNOTE_GUEST_VERSION 7 /* * The loader type (string). * * LEGACY: LOADER */ #define XEN_ELFNOTE_LOADER 8 /* * The kernel supports PAE (x86/32 only, string = "yes", "no" or * "bimodal"). * * For compatibility with Xen 3.0.3 and earlier the "bimodal" setting * may be given as "yes,bimodal" which will cause older Xen to treat * this kernel as PAE. * * LEGACY: PAE (n.b. The legacy interface included a provision to * indicate 'extended-cr3' support allowing L3 page tables to be * placed above 4G. It is assumed that any kernel new enough to use * these ELF notes will include this and therefore "yes" here is * equivalent to "yes[entended-cr3]" in the __xen_guest interface. */ #define XEN_ELFNOTE_PAE_MODE 9 /* * The features supported/required by this kernel (string). 
* * The string must consist of a list of feature names (as given in * features.h, without the "XENFEAT_" prefix) separated by '|' * characters. If a feature is required for the kernel to function * then the feature name must be preceded by a '!' character. * * LEGACY: FEATURES */ #define XEN_ELFNOTE_FEATURES 10 /* * The kernel requires the symbol table to be loaded (string = "yes" or "no") * LEGACY: BSD_SYMTAB (n.b. The legacy treated the presence or absence * of this string as a boolean flag rather than requiring "yes" or * "no". */ #define XEN_ELFNOTE_BSD_SYMTAB 11 /* * The lowest address the hypervisor hole can begin at (numeric). * * This must not be set higher than HYPERVISOR_VIRT_START. Its presence * also indicates to the hypervisor that the kernel can deal with the * hole starting at a higher address. */ #define XEN_ELFNOTE_HV_START_LOW 12 /* * List of maddr_t-sized mask/value pairs describing how to recognize * (non-present) L1 page table entries carrying valid MFNs (numeric). */ #define XEN_ELFNOTE_L1_MFN_VALID 13 /* * Whether or not the guest supports cooperative suspend cancellation. * This is a numeric value. * * Default is 0 */ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 /* * The (non-default) location the initial phys-to-machine map should be * placed at by the hypervisor (Dom0) or the tools (DomU). * The kernel must be prepared for this mapping to be established using * large pages, despite such otherwise not being available to guests. * The kernel must also be able to handle the page table pages used for * this mapping not being accessible through the initial mapping. * (Only x86-64 supports this at present.) */ #define XEN_ELFNOTE_INIT_P2M 15 /* * Whether or not the guest can deal with being passed an initrd not * mapped through its initial page tables. */ #define XEN_ELFNOTE_MOD_START_PFN 16 /* * The features supported by this kernel (numeric). * * Other than XEN_ELFNOTE_FEATURES on pre-4.2 Xen, this note allows a * kernel to specify support for features that older hypervisors don't * know about. The set of features 4.2 and newer hypervisors will * consider supported by the kernel is the combination of the sets * specified through this and the string note. * * LEGACY: FEATURES */ #define XEN_ELFNOTE_SUPPORTED_FEATURES 17 /* * The number of the highest elfnote defined. */ #define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES /* * System information exported through crash notes. * * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_INFO * note in case of a system crash. This note will contain various * information about the system, see xen/include/xen/elfcore.h. */ #define XEN_ELFNOTE_CRASH_INFO 0x1000001 /* * System registers exported through crash notes. * * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_REGS * note per cpu in case of a system crash. This note is architecture * specific and will contain registers not saved in the "CORE" note. * See xen/include/xen/elfcore.h for more information. */ #define XEN_ELFNOTE_CRASH_REGS 0x1000002 /* * xen dump-core none note. * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_NONE * in its dump file to indicate that the file is xen dump-core * file. This note doesn't have any other information. * See tools/libxc/xc_core.h for more information. */ #define XEN_ELFNOTE_DUMPCORE_NONE 0x2000000 /* * xen dump-core header note. * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_HEADER * in its dump file. * See tools/libxc/xc_core.h for more information. 
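 *
 * (For contrast with the tools-generated notes in this range: the
 * guest-visible notes defined near the top of this file are embedded in
 * the kernel image itself. A rough, illustrative sketch of such an
 * embedding in x86-style assembler, using XEN_ELFNOTE_GUEST_OS with the
 * value "linux" and the standard ELF note layout, might look like:
 *
 *     .pushsection .note.Xen, "a", @note
 *     .align 4
 *     .long 4
 *     .long 6
 *     .long 6
 *     .asciz "Xen"
 *     .align 4
 *     .asciz "linux"
 *     .align 4
 *     .popsection
 *
 * where the three .long values are namesz ("Xen" plus NUL), descsz
 * ("linux" plus NUL) and the note type, XEN_ELFNOTE_GUEST_OS.)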
*/ #define XEN_ELFNOTE_DUMPCORE_HEADER 0x2000001 /* * xen dump-core xen version note. * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_XEN_VERSION * in its dump file. It contains the xen version obtained via the * XENVER hypercall. * See tools/libxc/xc_core.h for more information. */ #define XEN_ELFNOTE_DUMPCORE_XEN_VERSION 0x2000002 /* * xen dump-core format version note. * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION * in its dump file. It contains a format version identifier. * See tools/libxc/xc_core.h for more information. */ #define XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION 0x2000003 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/dom0_ops.h0000664000175000017500000000770712307313555016042 0ustar smbsmb/****************************************************************************** * dom0_ops.h * * Process command requests from domain-0 guest OS. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2002-2003, B Dragovic * Copyright (c) 2002-2006, K Fraser */ #ifndef __XEN_PUBLIC_DOM0_OPS_H__ #define __XEN_PUBLIC_DOM0_OPS_H__ #include "xen.h" #include "platform.h" #if __XEN_INTERFACE_VERSION__ >= 0x00030204 #error "dom0_ops.h is a compatibility interface only" #endif #define DOM0_INTERFACE_VERSION XENPF_INTERFACE_VERSION #define DOM0_SETTIME XENPF_settime #define dom0_settime xenpf_settime #define dom0_settime_t xenpf_settime_t #define DOM0_ADD_MEMTYPE XENPF_add_memtype #define dom0_add_memtype xenpf_add_memtype #define dom0_add_memtype_t xenpf_add_memtype_t #define DOM0_DEL_MEMTYPE XENPF_del_memtype #define dom0_del_memtype xenpf_del_memtype #define dom0_del_memtype_t xenpf_del_memtype_t #define DOM0_READ_MEMTYPE XENPF_read_memtype #define dom0_read_memtype xenpf_read_memtype #define dom0_read_memtype_t xenpf_read_memtype_t #define DOM0_MICROCODE XENPF_microcode_update #define dom0_microcode xenpf_microcode_update #define dom0_microcode_t xenpf_microcode_update_t #define DOM0_PLATFORM_QUIRK XENPF_platform_quirk #define dom0_platform_quirk xenpf_platform_quirk #define dom0_platform_quirk_t xenpf_platform_quirk_t typedef uint64_t cpumap_t; /* Unsupported legacy operation -- defined for API compatibility. */ #define DOM0_MSR 15 struct dom0_msr { /* IN variables. */ uint32_t write; cpumap_t cpu_mask; uint32_t msr; uint32_t in1; uint32_t in2; /* OUT variables. 
*/ uint32_t out1; uint32_t out2; }; typedef struct dom0_msr dom0_msr_t; DEFINE_XEN_GUEST_HANDLE(dom0_msr_t); /* Unsupported legacy operation -- defined for API compatibility. */ #define DOM0_PHYSICAL_MEMORY_MAP 40 struct dom0_memory_map_entry { uint64_t start, end; uint32_t flags; /* reserved */ uint8_t is_ram; }; typedef struct dom0_memory_map_entry dom0_memory_map_entry_t; DEFINE_XEN_GUEST_HANDLE(dom0_memory_map_entry_t); struct dom0_op { uint32_t cmd; uint32_t interface_version; /* DOM0_INTERFACE_VERSION */ union { struct dom0_msr msr; struct dom0_settime settime; struct dom0_add_memtype add_memtype; struct dom0_del_memtype del_memtype; struct dom0_read_memtype read_memtype; struct dom0_microcode microcode; struct dom0_platform_quirk platform_quirk; struct dom0_memory_map_entry physical_memory_map; uint8_t pad[128]; } u; }; typedef struct dom0_op dom0_op_t; DEFINE_XEN_GUEST_HANDLE(dom0_op_t); #endif /* __XEN_PUBLIC_DOM0_OPS_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/arch-x86_64.h0000664000175000017500000000354612307313555016170 0ustar smbsmb/****************************************************************************** * arch-x86_64.h * * Guest OS interface to x86 64-bit Xen. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2004-2006, K A Fraser */ #include "arch-x86/xen.h" /* * ` enum neg_errnoval * ` HYPERVISOR_set_callbacks(unsigned long event_selector, * ` unsigned long event_address, * ` unsigned long failsafe_selector, * ` unsigned long failsafe_address); * ` * Register for callbacks on events. When an event (from an event * channel) occurs, event_address is used as the value of eip. * * A similar callback occurs if the segment selectors are invalid. * failsafe_address is used as the value of eip. * * On x86_64, event_selector and failsafe_selector are ignored (???). */ xen-4.4.0/xen/include/public/xen-compat.h0000664000175000017500000000363612307313555016372 0ustar smbsmb/****************************************************************************** * xen-compat.h * * Guest OS interface to Xen. Compatibility layer. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2006, Christian Limpach */ #ifndef __XEN_PUBLIC_XEN_COMPAT_H__ #define __XEN_PUBLIC_XEN_COMPAT_H__ #define __XEN_LATEST_INTERFACE_VERSION__ 0x00040400 #if defined(__XEN__) || defined(__XEN_TOOLS__) /* Xen is built with matching headers and implements the latest interface. */ #define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__ #elif !defined(__XEN_INTERFACE_VERSION__) /* Guests which do not specify a version get the legacy interface. */ #define __XEN_INTERFACE_VERSION__ 0x00000000 #endif #if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__ #error "These header files do not support the requested interface version." #endif #endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */ xen-4.4.0/xen/include/public/sysctl.h0000664000175000017500000006141612307313555015640 0ustar smbsmb/****************************************************************************** * sysctl.h * * System management operations. For use by node control stack. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2002-2006, K Fraser */ #ifndef __XEN_PUBLIC_SYSCTL_H__ #define __XEN_PUBLIC_SYSCTL_H__ #if !defined(__XEN__) && !defined(__XEN_TOOLS__) #error "sysctl operations are intended for use by node control tools only" #endif #include "xen.h" #include "domctl.h" #define XEN_SYSCTL_INTERFACE_VERSION 0x0000000A /* * Read console content from Xen buffer ring. 
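 *
 * Illustrative toolstack-side usage (a sketch only; it assumes a
 * do_sysctl()-style hypercall wrapper such as the one libxc provides,
 * 'buf' is a hypothetical local buffer, and struct xen_sysctl together
 * with the XEN_SYSCTL_readconsole command number is defined at the
 * bottom of this header):
 *
 *     struct xen_sysctl op = {
 *         .cmd = XEN_SYSCTL_readconsole,
 *         .interface_version = XEN_SYSCTL_INTERFACE_VERSION,
 *     };
 *     op.u.readconsole.clear = 0;
 *     op.u.readconsole.count = sizeof(buf);
 *     set_xen_guest_handle(op.u.readconsole.buffer, buf);
 *
 * After issuing the sysctl, op.u.readconsole.count holds the number of
 * bytes actually written and op.u.readconsole.index the updated ring
 * index.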
*/ /* XEN_SYSCTL_readconsole */ struct xen_sysctl_readconsole { /* IN: Non-zero -> clear after reading. */ uint8_t clear; /* IN: Non-zero -> start index specified by @index field. */ uint8_t incremental; uint8_t pad0, pad1; /* * IN: Start index for consuming from ring buffer (if @incremental); * OUT: End index after consuming from ring buffer. */ uint32_t index; /* IN: Virtual address to write console data. */ XEN_GUEST_HANDLE_64(char) buffer; /* IN: Size of buffer; OUT: Bytes written to buffer. */ uint32_t count; }; typedef struct xen_sysctl_readconsole xen_sysctl_readconsole_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_readconsole_t); /* Get trace buffers machine base address */ /* XEN_SYSCTL_tbuf_op */ struct xen_sysctl_tbuf_op { /* IN variables */ #define XEN_SYSCTL_TBUFOP_get_info 0 #define XEN_SYSCTL_TBUFOP_set_cpu_mask 1 #define XEN_SYSCTL_TBUFOP_set_evt_mask 2 #define XEN_SYSCTL_TBUFOP_set_size 3 #define XEN_SYSCTL_TBUFOP_enable 4 #define XEN_SYSCTL_TBUFOP_disable 5 uint32_t cmd; /* IN/OUT variables */ struct xenctl_bitmap cpu_mask; uint32_t evt_mask; /* OUT variables */ uint64_aligned_t buffer_mfn; uint32_t size; /* Also an IN variable! */ }; typedef struct xen_sysctl_tbuf_op xen_sysctl_tbuf_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_op_t); /* * Get physical information about the host machine */ /* XEN_SYSCTL_physinfo */ /* (x86) The platform supports HVM guests. */ #define _XEN_SYSCTL_PHYSCAP_hvm 0 #define XEN_SYSCTL_PHYSCAP_hvm (1u<<_XEN_SYSCTL_PHYSCAP_hvm) /* (x86) The platform supports HVM-guest direct access to I/O devices. */ #define _XEN_SYSCTL_PHYSCAP_hvm_directio 1 #define XEN_SYSCTL_PHYSCAP_hvm_directio (1u<<_XEN_SYSCTL_PHYSCAP_hvm_directio) struct xen_sysctl_physinfo { uint32_t threads_per_core; uint32_t cores_per_socket; uint32_t nr_cpus; /* # CPUs currently online */ uint32_t max_cpu_id; /* Largest possible CPU ID on this host */ uint32_t nr_nodes; /* # nodes currently online */ uint32_t max_node_id; /* Largest possible node ID on this host */ uint32_t cpu_khz; uint64_aligned_t total_pages; uint64_aligned_t free_pages; uint64_aligned_t scrub_pages; uint64_aligned_t outstanding_pages; uint32_t hw_cap[8]; /* XEN_SYSCTL_PHYSCAP_??? */ uint32_t capabilities; }; typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t); /* * Get the ID of the current scheduler. */ /* XEN_SYSCTL_sched_id */ struct xen_sysctl_sched_id { /* OUT variable */ uint32_t sched_id; }; typedef struct xen_sysctl_sched_id xen_sysctl_sched_id_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_sched_id_t); /* Interface for controlling Xen software performance counters. */ /* XEN_SYSCTL_perfc_op */ /* Sub-operations: */ #define XEN_SYSCTL_PERFCOP_reset 1 /* Reset all counters to zero. */ #define XEN_SYSCTL_PERFCOP_query 2 /* Get perfctr information. */ struct xen_sysctl_perfc_desc { char name[80]; /* name of perf counter */ uint32_t nr_vals; /* number of values for this counter */ }; typedef struct xen_sysctl_perfc_desc xen_sysctl_perfc_desc_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t); typedef uint32_t xen_sysctl_perfc_val_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t); struct xen_sysctl_perfc_op { /* IN variables. */ uint32_t cmd; /* XEN_SYSCTL_PERFCOP_??? */ /* OUT variables. 
*/ uint32_t nr_counters; /* number of counters description */ uint32_t nr_vals; /* number of values */ /* counter information (or NULL) */ XEN_GUEST_HANDLE_64(xen_sysctl_perfc_desc_t) desc; /* counter values (or NULL) */ XEN_GUEST_HANDLE_64(xen_sysctl_perfc_val_t) val; }; typedef struct xen_sysctl_perfc_op xen_sysctl_perfc_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_op_t); /* XEN_SYSCTL_getdomaininfolist */ struct xen_sysctl_getdomaininfolist { /* IN variables. */ domid_t first_domain; uint32_t max_domains; XEN_GUEST_HANDLE_64(xen_domctl_getdomaininfo_t) buffer; /* OUT variables. */ uint32_t num_domains; }; typedef struct xen_sysctl_getdomaininfolist xen_sysctl_getdomaininfolist_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getdomaininfolist_t); /* Inject debug keys into Xen. */ /* XEN_SYSCTL_debug_keys */ struct xen_sysctl_debug_keys { /* IN variables. */ XEN_GUEST_HANDLE_64(char) keys; uint32_t nr_keys; }; typedef struct xen_sysctl_debug_keys xen_sysctl_debug_keys_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_debug_keys_t); /* Get physical CPU information. */ /* XEN_SYSCTL_getcpuinfo */ struct xen_sysctl_cpuinfo { uint64_aligned_t idletime; }; typedef struct xen_sysctl_cpuinfo xen_sysctl_cpuinfo_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpuinfo_t); struct xen_sysctl_getcpuinfo { /* IN variables. */ uint32_t max_cpus; XEN_GUEST_HANDLE_64(xen_sysctl_cpuinfo_t) info; /* OUT variables. */ uint32_t nr_cpus; }; typedef struct xen_sysctl_getcpuinfo xen_sysctl_getcpuinfo_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getcpuinfo_t); /* XEN_SYSCTL_availheap */ struct xen_sysctl_availheap { /* IN variables. */ uint32_t min_bitwidth; /* Smallest address width (zero if don't care). */ uint32_t max_bitwidth; /* Largest address width (zero if don't care). */ int32_t node; /* NUMA node of interest (-1 for all nodes). */ /* OUT variables. */ uint64_aligned_t avail_bytes;/* Bytes available in the specified region. 
*/ }; typedef struct xen_sysctl_availheap xen_sysctl_availheap_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_availheap_t); /* XEN_SYSCTL_get_pmstat */ struct pm_px_val { uint64_aligned_t freq; /* Px core frequency */ uint64_aligned_t residency; /* Px residency time */ uint64_aligned_t count; /* Px transition count */ }; typedef struct pm_px_val pm_px_val_t; DEFINE_XEN_GUEST_HANDLE(pm_px_val_t); struct pm_px_stat { uint8_t total; /* total Px states */ uint8_t usable; /* usable Px states */ uint8_t last; /* last Px state */ uint8_t cur; /* current Px state */ XEN_GUEST_HANDLE_64(uint64) trans_pt; /* Px transition table */ XEN_GUEST_HANDLE_64(pm_px_val_t) pt; }; typedef struct pm_px_stat pm_px_stat_t; DEFINE_XEN_GUEST_HANDLE(pm_px_stat_t); struct pm_cx_stat { uint32_t nr; /* entry nr in triggers & residencies, including C0 */ uint32_t last; /* last Cx state */ uint64_aligned_t idle_time; /* idle time from boot */ XEN_GUEST_HANDLE_64(uint64) triggers; /* Cx trigger counts */ XEN_GUEST_HANDLE_64(uint64) residencies; /* Cx residencies */ uint64_aligned_t pc2; uint64_aligned_t pc3; uint64_aligned_t pc6; uint64_aligned_t pc7; uint64_aligned_t cc3; uint64_aligned_t cc6; uint64_aligned_t cc7; }; struct xen_sysctl_get_pmstat { #define PMSTAT_CATEGORY_MASK 0xf0 #define PMSTAT_PX 0x10 #define PMSTAT_CX 0x20 #define PMSTAT_get_max_px (PMSTAT_PX | 0x1) #define PMSTAT_get_pxstat (PMSTAT_PX | 0x2) #define PMSTAT_reset_pxstat (PMSTAT_PX | 0x3) #define PMSTAT_get_max_cx (PMSTAT_CX | 0x1) #define PMSTAT_get_cxstat (PMSTAT_CX | 0x2) #define PMSTAT_reset_cxstat (PMSTAT_CX | 0x3) uint32_t type; uint32_t cpuid; union { struct pm_px_stat getpx; struct pm_cx_stat getcx; /* other struct for tx, etc */ } u; }; typedef struct xen_sysctl_get_pmstat xen_sysctl_get_pmstat_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_get_pmstat_t); /* XEN_SYSCTL_cpu_hotplug */ struct xen_sysctl_cpu_hotplug { /* IN variables */ uint32_t cpu; /* Physical cpu. */ #define XEN_SYSCTL_CPU_HOTPLUG_ONLINE 0 #define XEN_SYSCTL_CPU_HOTPLUG_OFFLINE 1 uint32_t op; /* hotplug opcode */ }; typedef struct xen_sysctl_cpu_hotplug xen_sysctl_cpu_hotplug_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpu_hotplug_t); /* * Get/set xen power management, include * 1. 
cpufreq governors and related parameters */ /* XEN_SYSCTL_pm_op */ struct xen_userspace { uint32_t scaling_setspeed; }; typedef struct xen_userspace xen_userspace_t; struct xen_ondemand { uint32_t sampling_rate_max; uint32_t sampling_rate_min; uint32_t sampling_rate; uint32_t up_threshold; }; typedef struct xen_ondemand xen_ondemand_t; /* * cpufreq para name of this structure named * same as sysfs file name of native linux */ #define CPUFREQ_NAME_LEN 16 struct xen_get_cpufreq_para { /* IN/OUT variable */ uint32_t cpu_num; uint32_t freq_num; uint32_t gov_num; /* for all governors */ /* OUT variable */ XEN_GUEST_HANDLE_64(uint32) affected_cpus; XEN_GUEST_HANDLE_64(uint32) scaling_available_frequencies; XEN_GUEST_HANDLE_64(char) scaling_available_governors; char scaling_driver[CPUFREQ_NAME_LEN]; uint32_t cpuinfo_cur_freq; uint32_t cpuinfo_max_freq; uint32_t cpuinfo_min_freq; uint32_t scaling_cur_freq; char scaling_governor[CPUFREQ_NAME_LEN]; uint32_t scaling_max_freq; uint32_t scaling_min_freq; /* for specific governor */ union { struct xen_userspace userspace; struct xen_ondemand ondemand; } u; int32_t turbo_enabled; }; struct xen_set_cpufreq_gov { char scaling_governor[CPUFREQ_NAME_LEN]; }; struct xen_set_cpufreq_para { #define SCALING_MAX_FREQ 1 #define SCALING_MIN_FREQ 2 #define SCALING_SETSPEED 3 #define SAMPLING_RATE 4 #define UP_THRESHOLD 5 uint32_t ctrl_type; uint32_t ctrl_value; }; struct xen_sysctl_pm_op { #define PM_PARA_CATEGORY_MASK 0xf0 #define CPUFREQ_PARA 0x10 /* cpufreq command type */ #define GET_CPUFREQ_PARA (CPUFREQ_PARA | 0x01) #define SET_CPUFREQ_GOV (CPUFREQ_PARA | 0x02) #define SET_CPUFREQ_PARA (CPUFREQ_PARA | 0x03) #define GET_CPUFREQ_AVGFREQ (CPUFREQ_PARA | 0x04) /* set/reset scheduler power saving option */ #define XEN_SYSCTL_pm_op_set_sched_opt_smt 0x21 /* cpuidle max_cstate access command */ #define XEN_SYSCTL_pm_op_get_max_cstate 0x22 #define XEN_SYSCTL_pm_op_set_max_cstate 0x23 /* set scheduler migration cost value */ #define XEN_SYSCTL_pm_op_set_vcpu_migration_delay 0x24 #define XEN_SYSCTL_pm_op_get_vcpu_migration_delay 0x25 /* enable/disable turbo mode when in dbs governor */ #define XEN_SYSCTL_pm_op_enable_turbo 0x26 #define XEN_SYSCTL_pm_op_disable_turbo 0x27 uint32_t cmd; uint32_t cpuid; union { struct xen_get_cpufreq_para get_para; struct xen_set_cpufreq_gov set_gov; struct xen_set_cpufreq_para set_para; uint64_aligned_t get_avgfreq; uint32_t set_sched_opt_smt; uint32_t get_max_cstate; uint32_t set_max_cstate; uint32_t get_vcpu_migration_delay; uint32_t set_vcpu_migration_delay; } u; }; /* XEN_SYSCTL_page_offline_op */ struct xen_sysctl_page_offline_op { /* IN: range of page to be offlined */ #define sysctl_page_offline 1 #define sysctl_page_online 2 #define sysctl_query_page_offline 3 uint32_t cmd; uint32_t start; uint32_t end; /* OUT: result of page offline request */ /* * bit 0~15: result flags * bit 16~31: owner */ XEN_GUEST_HANDLE(uint32) status; }; #define PG_OFFLINE_STATUS_MASK (0xFFUL) /* The result is invalid, i.e. 
HV does not handle it */ #define PG_OFFLINE_INVALID (0x1UL << 0) #define PG_OFFLINE_OFFLINED (0x1UL << 1) #define PG_OFFLINE_PENDING (0x1UL << 2) #define PG_OFFLINE_FAILED (0x1UL << 3) #define PG_OFFLINE_AGAIN (0x1UL << 4) #define PG_ONLINE_FAILED PG_OFFLINE_FAILED #define PG_ONLINE_ONLINED PG_OFFLINE_OFFLINED #define PG_OFFLINE_STATUS_OFFLINED (0x1UL << 1) #define PG_OFFLINE_STATUS_ONLINE (0x1UL << 2) #define PG_OFFLINE_STATUS_OFFLINE_PENDING (0x1UL << 3) #define PG_OFFLINE_STATUS_BROKEN (0x1UL << 4) #define PG_OFFLINE_MISC_MASK (0xFFUL << 4) /* valid when PG_OFFLINE_FAILED or PG_OFFLINE_PENDING */ #define PG_OFFLINE_XENPAGE (0x1UL << 8) #define PG_OFFLINE_DOM0PAGE (0x1UL << 9) #define PG_OFFLINE_ANONYMOUS (0x1UL << 10) #define PG_OFFLINE_NOT_CONV_RAM (0x1UL << 11) #define PG_OFFLINE_OWNED (0x1UL << 12) #define PG_OFFLINE_BROKEN (0x1UL << 13) #define PG_ONLINE_BROKEN PG_OFFLINE_BROKEN #define PG_OFFLINE_OWNER_SHIFT 16 /* XEN_SYSCTL_lockprof_op */ /* Sub-operations: */ #define XEN_SYSCTL_LOCKPROF_reset 1 /* Reset all profile data to zero. */ #define XEN_SYSCTL_LOCKPROF_query 2 /* Get lock profile information. */ /* Record-type: */ #define LOCKPROF_TYPE_GLOBAL 0 /* global lock, idx meaningless */ #define LOCKPROF_TYPE_PERDOM 1 /* per-domain lock, idx is domid */ #define LOCKPROF_TYPE_N 2 /* number of types */ struct xen_sysctl_lockprof_data { char name[40]; /* lock name (may include up to 2 %d specifiers) */ int32_t type; /* LOCKPROF_TYPE_??? */ int32_t idx; /* index (e.g. domain id) */ uint64_aligned_t lock_cnt; /* # of locking succeeded */ uint64_aligned_t block_cnt; /* # of wait for lock */ uint64_aligned_t lock_time; /* nsecs lock held */ uint64_aligned_t block_time; /* nsecs waited for lock */ }; typedef struct xen_sysctl_lockprof_data xen_sysctl_lockprof_data_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_data_t); struct xen_sysctl_lockprof_op { /* IN variables. */ uint32_t cmd; /* XEN_SYSCTL_LOCKPROF_??? */ uint32_t max_elem; /* size of output buffer */ /* OUT variables (query only). */ uint32_t nr_elem; /* number of elements available */ uint64_aligned_t time; /* nsecs of profile measurement */ /* profile information (or NULL) */ XEN_GUEST_HANDLE_64(xen_sysctl_lockprof_data_t) data; }; typedef struct xen_sysctl_lockprof_op xen_sysctl_lockprof_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_op_t); /* XEN_SYSCTL_topologyinfo */ #define INVALID_TOPOLOGY_ID (~0U) struct xen_sysctl_topologyinfo { /* * IN: maximum addressable entry in the caller-provided arrays. * OUT: largest cpu identifier in the system. * If OUT is greater than IN then the arrays are truncated! * If OUT is leass than IN then the array tails are not written by sysctl. */ uint32_t max_cpu_index; /* * If not NULL, these arrays are filled with core/socket/node identifier * for each cpu. * If a cpu has no core/socket/node information (e.g., cpu not present) * then the sentinel value ~0u is written to each array. * The number of array elements written by the sysctl is: * min(@max_cpu_index_IN,@max_cpu_index_OUT)+1 */ XEN_GUEST_HANDLE_64(uint32) cpu_to_core; XEN_GUEST_HANDLE_64(uint32) cpu_to_socket; XEN_GUEST_HANDLE_64(uint32) cpu_to_node; }; typedef struct xen_sysctl_topologyinfo xen_sysctl_topologyinfo_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_topologyinfo_t); /* XEN_SYSCTL_numainfo */ #define INVALID_NUMAINFO_ID (~0U) struct xen_sysctl_numainfo { /* * IN: maximum addressable entry in the caller-provided arrays. * OUT: largest node identifier in the system. * If OUT is greater than IN then the arrays are truncated! 
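 *
 * For illustration (a sketch only; 'info' names a hypothetical struct
 * xen_sysctl_numainfo being prepared by the caller, using the handle
 * fields declared just below):
 *
 *     uint64_t memsize[8], memfree[8];
 *     info.max_node_index = 7;
 *     set_xen_guest_handle(info.node_to_memsize, memsize);
 *     set_xen_guest_handle(info.node_to_memfree, memfree);
 *
 * On a four-node host such a call comes back with max_node_index == 3
 * and the first four array entries are the meaningful ones; see the
 * note below on how the distance array is indexed when the caller's
 * size and the returned size differ.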
*/ uint32_t max_node_index; /* NB. Entries are 0 if node is not present. */ XEN_GUEST_HANDLE_64(uint64) node_to_memsize; XEN_GUEST_HANDLE_64(uint64) node_to_memfree; /* * Array, of size (max_node_index+1)^2, listing memory access distances * between nodes. If an entry has no node distance information (e.g., node * not present) then the value ~0u is written. * * Note that the array rows must be indexed by multiplying by the minimum * of the caller-provided max_node_index and the returned value of * max_node_index. That is, if the largest node index in the system is * smaller than the caller can handle, a smaller 2-d array is constructed * within the space provided by the caller. When this occurs, trailing * space provided by the caller is not modified. If the largest node index * in the system is larger than the caller can handle, then a 2-d array of * the maximum size handleable by the caller is constructed. */ XEN_GUEST_HANDLE_64(uint32) node_to_node_distance; }; typedef struct xen_sysctl_numainfo xen_sysctl_numainfo_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_numainfo_t); /* XEN_SYSCTL_cpupool_op */ #define XEN_SYSCTL_CPUPOOL_OP_CREATE 1 /* C */ #define XEN_SYSCTL_CPUPOOL_OP_DESTROY 2 /* D */ #define XEN_SYSCTL_CPUPOOL_OP_INFO 3 /* I */ #define XEN_SYSCTL_CPUPOOL_OP_ADDCPU 4 /* A */ #define XEN_SYSCTL_CPUPOOL_OP_RMCPU 5 /* R */ #define XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN 6 /* M */ #define XEN_SYSCTL_CPUPOOL_OP_FREEINFO 7 /* F */ #define XEN_SYSCTL_CPUPOOL_PAR_ANY 0xFFFFFFFF struct xen_sysctl_cpupool_op { uint32_t op; /* IN */ uint32_t cpupool_id; /* IN: CDIARM OUT: CI */ uint32_t sched_id; /* IN: C OUT: I */ uint32_t domid; /* IN: M */ uint32_t cpu; /* IN: AR */ uint32_t n_dom; /* OUT: I */ struct xenctl_bitmap cpumap; /* OUT: IF */ }; typedef struct xen_sysctl_cpupool_op xen_sysctl_cpupool_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpupool_op_t); #define ARINC653_MAX_DOMAINS_PER_SCHEDULE 64 /* * This structure is used to pass a new ARINC653 schedule from a * privileged domain (ie dom0) to Xen. */ struct xen_sysctl_arinc653_schedule { /* major_frame holds the time for the new schedule's major frame * in nanoseconds. */ uint64_aligned_t major_frame; /* num_sched_entries holds how many of the entries in the * sched_entries[] array are valid. */ uint8_t num_sched_entries; /* The sched_entries array holds the actual schedule entries. */ struct { /* dom_handle must match a domain's UUID */ xen_domain_handle_t dom_handle; /* If a domain has multiple VCPUs, vcpu_id specifies which one * this schedule entry applies to. It should be set to 0 if * there is only one VCPU for the domain. */ unsigned int vcpu_id; /* runtime specifies the amount of time that should be allocated * to this VCPU per major frame. It is specified in nanoseconds */ uint64_aligned_t runtime; } sched_entries[ARINC653_MAX_DOMAINS_PER_SCHEDULE]; }; typedef struct xen_sysctl_arinc653_schedule xen_sysctl_arinc653_schedule_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_arinc653_schedule_t); struct xen_sysctl_credit_schedule { /* Length of timeslice in milliseconds */ #define XEN_SYSCTL_CSCHED_TSLICE_MAX 1000 #define XEN_SYSCTL_CSCHED_TSLICE_MIN 1 unsigned tslice_ms; /* Rate limit (minimum timeslice) in microseconds */ #define XEN_SYSCTL_SCHED_RATELIMIT_MAX 500000 #define XEN_SYSCTL_SCHED_RATELIMIT_MIN 100 unsigned ratelimit_us; }; typedef struct xen_sysctl_credit_schedule xen_sysctl_credit_schedule_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_credit_schedule_t); /* XEN_SYSCTL_scheduler_op */ /* Set or get info? 
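 *
 * Illustrative usage for the credit scheduler (a sketch only; 'op' is a
 * struct xen_sysctl prepared as for any other sysctl, with op.cmd set
 * to XEN_SYSCTL_scheduler_op, and XEN_SCHEDULER_CREDIT comes from
 * domctl.h):
 *
 *     op.u.scheduler_op.cpupool_id = 0;
 *     op.u.scheduler_op.sched_id   = XEN_SCHEDULER_CREDIT;
 *     op.u.scheduler_op.cmd        = XEN_SYSCTL_SCHEDOP_putinfo;
 *     op.u.scheduler_op.u.sched_credit.tslice_ms    = 30;
 *     op.u.scheduler_op.u.sched_credit.ratelimit_us = 1000;
 *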
*/ #define XEN_SYSCTL_SCHEDOP_putinfo 0 #define XEN_SYSCTL_SCHEDOP_getinfo 1 struct xen_sysctl_scheduler_op { uint32_t cpupool_id; /* Cpupool whose scheduler is to be targetted. */ uint32_t sched_id; /* XEN_SCHEDULER_* (domctl.h) */ uint32_t cmd; /* XEN_SYSCTL_SCHEDOP_* */ union { struct xen_sysctl_sched_arinc653 { XEN_GUEST_HANDLE_64(xen_sysctl_arinc653_schedule_t) schedule; } sched_arinc653; struct xen_sysctl_credit_schedule sched_credit; } u; }; typedef struct xen_sysctl_scheduler_op xen_sysctl_scheduler_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_scheduler_op_t); /* XEN_SYSCTL_coverage_op */ /* * Get total size of information, to help allocate * the buffer. The pointer points to a 32 bit value. */ #define XEN_SYSCTL_COVERAGE_get_total_size 0 /* * Read coverage information in a single run * You must use a tool to split them. */ #define XEN_SYSCTL_COVERAGE_read 1 /* * Reset all the coverage counters to 0 * No parameters. */ #define XEN_SYSCTL_COVERAGE_reset 2 /* * Like XEN_SYSCTL_COVERAGE_read but reset also * counters to 0 in a single call. */ #define XEN_SYSCTL_COVERAGE_read_and_reset 3 struct xen_sysctl_coverage_op { uint32_t cmd; /* XEN_SYSCTL_COVERAGE_* */ union { uint32_t total_size; /* OUT */ XEN_GUEST_HANDLE_64(uint8) raw_info; /* OUT */ } u; }; typedef struct xen_sysctl_coverage_op xen_sysctl_coverage_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_coverage_op_t); struct xen_sysctl { uint32_t cmd; #define XEN_SYSCTL_readconsole 1 #define XEN_SYSCTL_tbuf_op 2 #define XEN_SYSCTL_physinfo 3 #define XEN_SYSCTL_sched_id 4 #define XEN_SYSCTL_perfc_op 5 #define XEN_SYSCTL_getdomaininfolist 6 #define XEN_SYSCTL_debug_keys 7 #define XEN_SYSCTL_getcpuinfo 8 #define XEN_SYSCTL_availheap 9 #define XEN_SYSCTL_get_pmstat 10 #define XEN_SYSCTL_cpu_hotplug 11 #define XEN_SYSCTL_pm_op 12 #define XEN_SYSCTL_page_offline_op 14 #define XEN_SYSCTL_lockprof_op 15 #define XEN_SYSCTL_topologyinfo 16 #define XEN_SYSCTL_numainfo 17 #define XEN_SYSCTL_cpupool_op 18 #define XEN_SYSCTL_scheduler_op 19 #define XEN_SYSCTL_coverage_op 20 uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */ union { struct xen_sysctl_readconsole readconsole; struct xen_sysctl_tbuf_op tbuf_op; struct xen_sysctl_physinfo physinfo; struct xen_sysctl_topologyinfo topologyinfo; struct xen_sysctl_numainfo numainfo; struct xen_sysctl_sched_id sched_id; struct xen_sysctl_perfc_op perfc_op; struct xen_sysctl_getdomaininfolist getdomaininfolist; struct xen_sysctl_debug_keys debug_keys; struct xen_sysctl_getcpuinfo getcpuinfo; struct xen_sysctl_availheap availheap; struct xen_sysctl_get_pmstat get_pmstat; struct xen_sysctl_cpu_hotplug cpu_hotplug; struct xen_sysctl_pm_op pm_op; struct xen_sysctl_page_offline_op page_offline; struct xen_sysctl_lockprof_op lockprof_op; struct xen_sysctl_cpupool_op cpupool_op; struct xen_sysctl_scheduler_op scheduler_op; struct xen_sysctl_coverage_op coverage_op; uint8_t pad[128]; } u; }; typedef struct xen_sysctl xen_sysctl_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_t); #endif /* __XEN_PUBLIC_SYSCTL_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/version.h0000664000175000017500000000610012307313555015771 0ustar smbsmb/****************************************************************************** * version.h * * Xen version, type, and compile information. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2005, Nguyen Anh Quynh * Copyright (c) 2005, Keir Fraser */ #ifndef __XEN_PUBLIC_VERSION_H__ #define __XEN_PUBLIC_VERSION_H__ #include "xen.h" /* NB. All ops return zero on success, except XENVER_{version,pagesize} */ /* arg == NULL; returns major:minor (16:16). */ #define XENVER_version 0 /* arg == xen_extraversion_t. */ #define XENVER_extraversion 1 typedef char xen_extraversion_t[16]; #define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t)) /* arg == xen_compile_info_t. */ #define XENVER_compile_info 2 struct xen_compile_info { char compiler[64]; char compile_by[16]; char compile_domain[32]; char compile_date[32]; }; typedef struct xen_compile_info xen_compile_info_t; #define XENVER_capabilities 3 typedef char xen_capabilities_info_t[1024]; #define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t)) #define XENVER_changeset 4 typedef char xen_changeset_info_t[64]; #define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t)) #define XENVER_platform_parameters 5 struct xen_platform_parameters { xen_ulong_t virt_start; }; typedef struct xen_platform_parameters xen_platform_parameters_t; #define XENVER_get_features 6 struct xen_feature_info { unsigned int submap_idx; /* IN: which 32-bit submap to return */ uint32_t submap; /* OUT: 32-bit submap */ }; typedef struct xen_feature_info xen_feature_info_t; /* Declares the features reported by XENVER_get_features. */ #include "features.h" /* arg == NULL; returns host memory page size. */ #define XENVER_pagesize 7 /* arg == xen_domain_handle_t. */ #define XENVER_guest_handle 8 #define XENVER_commandline 9 typedef char xen_commandline_t[1024]; #endif /* __XEN_PUBLIC_VERSION_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/grant_table.h0000664000175000017500000005775412307313555016613 0ustar smbsmb/****************************************************************************** * grant_table.h * * Interface for granting foreign access to page frames, and receiving * page-ownership transfers. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2004, K A Fraser */ #ifndef __XEN_PUBLIC_GRANT_TABLE_H__ #define __XEN_PUBLIC_GRANT_TABLE_H__ #include "xen.h" /* * `incontents 150 gnttab Grant Tables * * Xen's grant tables provide a generic mechanism to memory sharing * between domains. This shared memory interface underpins the split * device drivers for block and network IO. * * Each domain has its own grant table. This is a data structure that * is shared with Xen; it allows the domain to tell Xen what kind of * permissions other domains have on its pages. Entries in the grant * table are identified by grant references. A grant reference is an * integer, which indexes into the grant table. It acts as a * capability which the grantee can use to perform operations on the * granter’s memory. * * This capability-based system allows shared-memory communications * between unprivileged domains. A grant reference also encapsulates * the details of a shared page, removing the need for a domain to * know the real machine address of a page it is sharing. This makes * it possible to share memory correctly with domains running in * fully virtualised memory. */ /*********************************** * GRANT TABLE REPRESENTATION */ /* Some rough guidelines on accessing and updating grant-table entries * in a concurrency-safe manner. For more information, Linux contains a * reference implementation for guest OSes (drivers/xen/grant_table.c, see * http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=blob;f=drivers/xen/grant-table.c;hb=HEAD * * NB. WMB is a no-op on current-generation x86 processors. However, a * compiler barrier will still be required. * * Introducing a valid entry into the grant table: * 1. Write ent->domid. * 2. Write ent->frame: * GTF_permit_access: Frame to which access is permitted. * GTF_accept_transfer: Pseudo-phys frame slot being filled by new * frame, or zero if none. * 3. Write memory barrier (WMB). * 4. Write ent->flags, inc. valid type. * * Invalidating an unused GTF_permit_access entry: * 1. flags = ent->flags. * 2. Observe that !(flags & (GTF_reading|GTF_writing)). * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0). * NB. No need for WMB as reuse of entry is control-dependent on success of * step 3, and all architectures guarantee ordering of ctrl-dep writes. * * Invalidating an in-use GTF_permit_access entry: * This cannot be done directly. 
Request assistance from the domain controller * which can set a timeout on the use of a grant entry and take necessary * action. (NB. This is not yet implemented!). * * Invalidating an unused GTF_accept_transfer entry: * 1. flags = ent->flags. * 2. Observe that !(flags & GTF_transfer_committed). [*] * 3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0). * NB. No need for WMB as reuse of entry is control-dependent on success of * step 3, and all architectures guarantee ordering of ctrl-dep writes. * [*] If GTF_transfer_committed is set then the grant entry is 'committed'. * The guest must /not/ modify the grant entry until the address of the * transferred frame is written. It is safe for the guest to spin waiting * for this to occur (detect by observing GTF_transfer_completed in * ent->flags). * * Invalidating a committed GTF_accept_transfer entry: * 1. Wait for (ent->flags & GTF_transfer_completed). * * Changing a GTF_permit_access from writable to read-only: * Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing. * * Changing a GTF_permit_access from read-only to writable: * Use SMP-safe bit-setting instruction. */ /* * Reference to a grant entry in a specified domain's grant table. */ typedef uint32_t grant_ref_t; /* * A grant table comprises a packed array of grant entries in one or more * page frames shared between Xen and a guest. * [XEN]: This field is written by Xen and read by the sharing guest. * [GST]: This field is written by the guest and read by Xen. */ /* * Version 1 of the grant table entry structure is maintained purely * for backwards compatibility. New guests should use version 2. */ #if __XEN_INTERFACE_VERSION__ < 0x0003020a #define grant_entry_v1 grant_entry #define grant_entry_v1_t grant_entry_t #endif struct grant_entry_v1 { /* GTF_xxx: various type and flag information. [XEN,GST] */ uint16_t flags; /* The domain being granted foreign privileges. [GST] */ domid_t domid; /* * GTF_permit_access: Frame that @domid is allowed to map and access. [GST] * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN] */ uint32_t frame; }; typedef struct grant_entry_v1 grant_entry_v1_t; /* The first few grant table entries will be preserved across grant table * version changes and may be pre-populated at domain creation by tools. */ #define GNTTAB_NR_RESERVED_ENTRIES 8 #define GNTTAB_RESERVED_CONSOLE 0 #define GNTTAB_RESERVED_XENSTORE 1 /* * Type of grant entry. * GTF_invalid: This grant entry grants no privileges. * GTF_permit_access: Allow @domid to map/access @frame. * GTF_accept_transfer: Allow @domid to transfer ownership of one page frame * to this guest. Xen writes the page number to @frame. * GTF_transitive: Allow @domid to transitively access a subrange of * @trans_grant in @trans_domid. No mappings are allowed. */ #define GTF_invalid (0U<<0) #define GTF_permit_access (1U<<0) #define GTF_accept_transfer (2U<<0) #define GTF_transitive (3U<<0) #define GTF_type_mask (3U<<0) /* * Subflags for GTF_permit_access. * GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST] * GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN] * GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN] * GTF_PAT, GTF_PWT, GTF_PCD: (x86) cache attribute flags for the grant [GST] * GTF_sub_page: Grant access to only a subrange of the page. @domid * will only be allowed to copy from the grant, and not * map it. 
[GST] */ #define _GTF_readonly (2) #define GTF_readonly (1U<<_GTF_readonly) #define _GTF_reading (3) #define GTF_reading (1U<<_GTF_reading) #define _GTF_writing (4) #define GTF_writing (1U<<_GTF_writing) #define _GTF_PWT (5) #define GTF_PWT (1U<<_GTF_PWT) #define _GTF_PCD (6) #define GTF_PCD (1U<<_GTF_PCD) #define _GTF_PAT (7) #define GTF_PAT (1U<<_GTF_PAT) #define _GTF_sub_page (8) #define GTF_sub_page (1U<<_GTF_sub_page) /* * Subflags for GTF_accept_transfer: * GTF_transfer_committed: Xen sets this flag to indicate that it is committed * to transferring ownership of a page frame. When a guest sees this flag * it must /not/ modify the grant entry until GTF_transfer_completed is * set by Xen. * GTF_transfer_completed: It is safe for the guest to spin-wait on this flag * after reading GTF_transfer_committed. Xen will always write the frame * address, followed by ORing this flag, in a timely manner. */ #define _GTF_transfer_committed (2) #define GTF_transfer_committed (1U<<_GTF_transfer_committed) #define _GTF_transfer_completed (3) #define GTF_transfer_completed (1U<<_GTF_transfer_completed) /* * Version 2 grant table entries. These fulfil the same role as * version 1 entries, but can represent more complicated operations. * Any given domain will have either a version 1 or a version 2 table, * and every entry in the table will be the same version. * * The interface by which domains use grant references does not depend * on the grant table version in use by the other domain. */ #if __XEN_INTERFACE_VERSION__ >= 0x0003020a /* * Version 1 and version 2 grant entries share a common prefix. The * fields of the prefix are documented as part of struct * grant_entry_v1. */ struct grant_entry_header { uint16_t flags; domid_t domid; }; typedef struct grant_entry_header grant_entry_header_t; /* * Version 2 of the grant entry structure. */ union grant_entry_v2 { grant_entry_header_t hdr; /* * This member is used for V1-style full page grants, where either: * * -- hdr.type is GTF_accept_transfer, or * -- hdr.type is GTF_permit_access and GTF_sub_page is not set. * * In that case, the frame field has the same semantics as the * field of the same name in the V1 entry structure. */ struct { grant_entry_header_t hdr; uint32_t pad0; uint64_t frame; } full_page; /* * If the grant type is GTF_grant_access and GTF_sub_page is set, * @domid is allowed to access bytes [@page_off,@page_off+@length) * in frame @frame. */ struct { grant_entry_header_t hdr; uint16_t page_off; uint16_t length; uint64_t frame; } sub_page; /* * If the grant is GTF_transitive, @domid is allowed to use the * grant @gref in domain @trans_domid, as if it was the local * domain. Obviously, the transitive access must be compatible * with the original grant. * * The current version of Xen does not allow transitive grants * to be mapped. */ struct { grant_entry_header_t hdr; domid_t trans_domid; uint16_t pad0; grant_ref_t gref; } transitive; uint32_t __spacer[4]; /* Pad to a power of two */ }; typedef union grant_entry_v2 grant_entry_v2_t; typedef uint16_t grant_status_t; #endif /* __XEN_INTERFACE_VERSION__ */ /*********************************** * GRANT TABLE QUERIES AND USES */ /* ` enum neg_errnoval * ` HYPERVISOR_grant_table_op(enum grant_table_op cmd, * ` void *args, * ` unsigned int count) * ` * * @args points to an array of a per-command data structure. 
The array * has @count members */ /* ` enum grant_table_op { // GNTTABOP_* => struct gnttab_* */ #define GNTTABOP_map_grant_ref 0 #define GNTTABOP_unmap_grant_ref 1 #define GNTTABOP_setup_table 2 #define GNTTABOP_dump_table 3 #define GNTTABOP_transfer 4 #define GNTTABOP_copy 5 #define GNTTABOP_query_size 6 #define GNTTABOP_unmap_and_replace 7 #if __XEN_INTERFACE_VERSION__ >= 0x0003020a #define GNTTABOP_set_version 8 #define GNTTABOP_get_status_frames 9 #define GNTTABOP_get_version 10 #define GNTTABOP_swap_grant_ref 11 #endif /* __XEN_INTERFACE_VERSION__ */ /* ` } */ /* * Handle to track a mapping created via a grant reference. */ typedef uint32_t grant_handle_t; /* * GNTTABOP_map_grant_ref: Map the grant entry (,) for access * by devices and/or host CPUs. If successful, is a tracking number * that must be presented later to destroy the mapping(s). On error, * is a negative status code. * NOTES: * 1. If GNTMAP_device_map is specified then is the address * via which I/O devices may access the granted frame. * 2. If GNTMAP_host_map is specified then a mapping will be added at * either a host virtual address in the current address space, or at * a PTE at the specified machine address. The type of mapping to * perform is selected through the GNTMAP_contains_pte flag, and the * address is specified in . * 3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a * host mapping is destroyed by other means then it is *NOT* guaranteed * to be accounted to the correct grant reference! */ struct gnttab_map_grant_ref { /* IN parameters. */ uint64_t host_addr; uint32_t flags; /* GNTMAP_* */ grant_ref_t ref; domid_t dom; /* OUT parameters. */ int16_t status; /* => enum grant_status */ grant_handle_t handle; uint64_t dev_bus_addr; }; typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t; DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t); /* * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings * tracked by . If or is zero, that * field is ignored. If non-zero, they must refer to a device/host mapping * that is tracked by * NOTES: * 1. The call may fail in an undefined manner if either mapping is not * tracked by . * 3. After executing a batch of unmaps, it is guaranteed that no stale * mappings will remain in the device or host TLBs. */ struct gnttab_unmap_grant_ref { /* IN parameters. */ uint64_t host_addr; uint64_t dev_bus_addr; grant_handle_t handle; /* OUT parameters. */ int16_t status; /* => enum grant_status */ }; typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t; DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t); /* * GNTTABOP_setup_table: Set up a grant table for comprising at least * pages. The frame addresses are written to the . * Only addresses are written, even if the table is larger. * NOTES: * 1. may be specified as DOMID_SELF. * 2. Only a sufficiently-privileged domain may specify != DOMID_SELF. * 3. Xen may not support more than a single grant-table page per domain. */ struct gnttab_setup_table { /* IN parameters. */ domid_t dom; uint32_t nr_frames; /* OUT parameters. */ int16_t status; /* => enum grant_status */ #if __XEN_INTERFACE_VERSION__ < 0x00040300 XEN_GUEST_HANDLE(ulong) frame_list; #else XEN_GUEST_HANDLE(xen_pfn_t) frame_list; #endif }; typedef struct gnttab_setup_table gnttab_setup_table_t; DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t); /* * GNTTABOP_dump_table: Dump the contents of the grant table to the * xen console. Debugging use only. */ struct gnttab_dump_table { /* IN parameters. */ domid_t dom; /* OUT parameters. 
*/ int16_t status; /* => enum grant_status */ }; typedef struct gnttab_dump_table gnttab_dump_table_t; DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t); /* * GNTTABOP_transfer_grant_ref: Transfer to a foreign domain. The * foreign domain has previously registered its interest in the transfer via * . * * Note that, even if the transfer fails, the specified page no longer belongs * to the calling domain *unless* the error is GNTST_bad_page. */ struct gnttab_transfer { /* IN parameters. */ xen_pfn_t mfn; domid_t domid; grant_ref_t ref; /* OUT parameters. */ int16_t status; }; typedef struct gnttab_transfer gnttab_transfer_t; DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t); /* * GNTTABOP_copy: Hypervisor based copy * source and destinations can be eithers MFNs or, for foreign domains, * grant references. the foreign domain has to grant read/write access * in its grant table. * * The flags specify what type source and destinations are (either MFN * or grant reference). * * Note that this can also be used to copy data between two domains * via a third party if the source and destination domains had previously * grant appropriate access to their pages to the third party. * * source_offset specifies an offset in the source frame, dest_offset * the offset in the target frame and len specifies the number of * bytes to be copied. */ #define _GNTCOPY_source_gref (0) #define GNTCOPY_source_gref (1<<_GNTCOPY_source_gref) #define _GNTCOPY_dest_gref (1) #define GNTCOPY_dest_gref (1<<_GNTCOPY_dest_gref) struct gnttab_copy { /* IN parameters. */ struct { union { grant_ref_t ref; xen_pfn_t gmfn; } u; domid_t domid; uint16_t offset; } source, dest; uint16_t len; uint16_t flags; /* GNTCOPY_* */ /* OUT parameters. */ int16_t status; }; typedef struct gnttab_copy gnttab_copy_t; DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t); /* * GNTTABOP_query_size: Query the current and maximum sizes of the shared * grant table. * NOTES: * 1. may be specified as DOMID_SELF. * 2. Only a sufficiently-privileged domain may specify != DOMID_SELF. */ struct gnttab_query_size { /* IN parameters. */ domid_t dom; /* OUT parameters. */ uint32_t nr_frames; uint32_t max_nr_frames; int16_t status; /* => enum grant_status */ }; typedef struct gnttab_query_size gnttab_query_size_t; DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t); /* * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings * tracked by but atomically replace the page table entry with one * pointing to the machine address under . will be * redirected to the null entry. * NOTES: * 1. The call may fail in an undefined manner if either mapping is not * tracked by . * 2. After executing a batch of unmaps, it is guaranteed that no stale * mappings will remain in the device or host TLBs. */ struct gnttab_unmap_and_replace { /* IN parameters. */ uint64_t host_addr; uint64_t new_addr; grant_handle_t handle; /* OUT parameters. */ int16_t status; /* => enum grant_status */ }; typedef struct gnttab_unmap_and_replace gnttab_unmap_and_replace_t; DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t); #if __XEN_INTERFACE_VERSION__ >= 0x0003020a /* * GNTTABOP_set_version: Request a particular version of the grant * table shared table structure. This operation can only be performed * once in any given domain. It must be performed before any grants * are activated; otherwise, the domain will be stuck with version 1. * The only defined versions are 1 and 2. 
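 *
 * Illustrative sketch: a guest wishing to use version 2 entries could issue
 * the request once, early in boot, via the HYPERVISOR_grant_table_op()
 * hypercall described above (the wrapper itself is supplied by the guest OS,
 * not by this header), using the structure defined just below:
 *
 *     struct gnttab_set_version sv = { .version = 2 };
 *     int rc = HYPERVISOR_grant_table_op(GNTTABOP_set_version, &sv, 1);
 *     // On success (rc == 0) sv.version reports the version now in effect;
 *     // this must happen before any grant is activated, otherwise the
 *     // domain remains on version 1.
 *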
*/ struct gnttab_set_version { /* IN/OUT parameters */ uint32_t version; }; typedef struct gnttab_set_version gnttab_set_version_t; DEFINE_XEN_GUEST_HANDLE(gnttab_set_version_t); /* * GNTTABOP_get_status_frames: Get the list of frames used to store grant * status for . In grant format version 2, the status is separated * from the other shared grant fields to allow more efficient synchronization * using barriers instead of atomic cmpexch operations. * specify the size of vector . * The frame addresses are returned in the . * Only addresses are returned, even if the table is larger. * NOTES: * 1. may be specified as DOMID_SELF. * 2. Only a sufficiently-privileged domain may specify != DOMID_SELF. */ struct gnttab_get_status_frames { /* IN parameters. */ uint32_t nr_frames; domid_t dom; /* OUT parameters. */ int16_t status; /* => enum grant_status */ XEN_GUEST_HANDLE(uint64_t) frame_list; }; typedef struct gnttab_get_status_frames gnttab_get_status_frames_t; DEFINE_XEN_GUEST_HANDLE(gnttab_get_status_frames_t); /* * GNTTABOP_get_version: Get the grant table version which is in * effect for domain . */ struct gnttab_get_version { /* IN parameters */ domid_t dom; uint16_t pad; /* OUT parameters */ uint32_t version; }; typedef struct gnttab_get_version gnttab_get_version_t; DEFINE_XEN_GUEST_HANDLE(gnttab_get_version_t); /* * GNTTABOP_swap_grant_ref: Swap the contents of two grant entries. */ struct gnttab_swap_grant_ref { /* IN parameters */ grant_ref_t ref_a; grant_ref_t ref_b; /* OUT parameters */ int16_t status; /* => enum grant_status */ }; typedef struct gnttab_swap_grant_ref gnttab_swap_grant_ref_t; DEFINE_XEN_GUEST_HANDLE(gnttab_swap_grant_ref_t); #endif /* __XEN_INTERFACE_VERSION__ */ /* * Bitfield values for gnttab_map_grant_ref.flags. */ /* Map the grant entry for access by I/O devices. */ #define _GNTMAP_device_map (0) #define GNTMAP_device_map (1<<_GNTMAP_device_map) /* Map the grant entry for access by host CPUs. */ #define _GNTMAP_host_map (1) #define GNTMAP_host_map (1<<_GNTMAP_host_map) /* Accesses to the granted frame will be restricted to read-only access. */ #define _GNTMAP_readonly (2) #define GNTMAP_readonly (1<<_GNTMAP_readonly) /* * GNTMAP_host_map subflag: * 0 => The host mapping is usable only by the guest OS. * 1 => The host mapping is usable by guest OS + current application. */ #define _GNTMAP_application_map (3) #define GNTMAP_application_map (1<<_GNTMAP_application_map) /* * GNTMAP_contains_pte subflag: * 0 => This map request contains a host virtual address. * 1 => This map request contains the machine addess of the PTE to update. */ #define _GNTMAP_contains_pte (4) #define GNTMAP_contains_pte (1<<_GNTMAP_contains_pte) #define _GNTMAP_can_fail (5) #define GNTMAP_can_fail (1<<_GNTMAP_can_fail) /* * Bits to be placed in guest kernel available PTE bits (architecture * dependent; only supported when XENFEAT_gnttab_map_avail_bits is set). */ #define _GNTMAP_guest_avail0 (16) #define GNTMAP_guest_avail_mask ((uint32_t)~0 << _GNTMAP_guest_avail0) /* * Values for error status returns. All errors are -ve. */ /* ` enum grant_status { */ #define GNTST_okay (0) /* Normal return. */ #define GNTST_general_error (-1) /* General undefined error. */ #define GNTST_bad_domain (-2) /* Unrecognsed domain id. */ #define GNTST_bad_gntref (-3) /* Unrecognised or inappropriate gntref. */ #define GNTST_bad_handle (-4) /* Unrecognised or inappropriate handle. */ #define GNTST_bad_virt_addr (-5) /* Inappropriate virtual address to map. 
*/ #define GNTST_bad_dev_addr (-6) /* Inappropriate device address to unmap.*/ #define GNTST_no_device_space (-7) /* Out of space in I/O MMU. */ #define GNTST_permission_denied (-8) /* Not enough privilege for operation. */ #define GNTST_bad_page (-9) /* Specified page was invalid for op. */ #define GNTST_bad_copy_arg (-10) /* copy arguments cross page boundary. */ #define GNTST_address_too_big (-11) /* transfer page address too large. */ #define GNTST_eagain (-12) /* Operation not done; try again. */ /* ` } */ #define GNTTABOP_error_msgs { \ "okay", \ "undefined error", \ "unrecognised domain id", \ "invalid grant reference", \ "invalid mapping handle", \ "invalid virtual address", \ "invalid device address", \ "no spare translation slot in the I/O MMU", \ "permission denied", \ "bad page", \ "copy arguments cross page boundary", \ "page address size too large", \ "operation not done; try again" \ } #endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/arch-arm/0000775000175000017500000000000012307313555015630 5ustar smbsmbxen-4.4.0/xen/include/public/arch-arm/hvm/0000775000175000017500000000000012307313555016422 5ustar smbsmbxen-4.4.0/xen/include/public/arch-arm/hvm/save.h0000664000175000017500000000270712307313555017537 0ustar smbsmb/* * Structure definitions for HVM state that is held by Xen and must * be saved along with the domain's memory and device-model state. * * Copyright (c) 2012 Citrix Systems Ltd. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #ifndef __XEN_PUBLIC_HVM_SAVE_ARM_H__ #define __XEN_PUBLIC_HVM_SAVE_ARM_H__ #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/vcpu.h0000664000175000017500000002234712307313555015274 0ustar smbsmb/****************************************************************************** * vcpu.h * * VCPU initialisation, query, and hotplug. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2005, Keir Fraser */ #ifndef __XEN_PUBLIC_VCPU_H__ #define __XEN_PUBLIC_VCPU_H__ #include "xen.h" /* * Prototype for this hypercall is: * int vcpu_op(int cmd, int vcpuid, void *extra_args) * @cmd == VCPUOP_??? (VCPU operation). * @vcpuid == VCPU to operate on. * @extra_args == Operation-specific extra arguments (NULL if none). */ /* * Initialise a VCPU. Each VCPU can be initialised only once. A * newly-initialised VCPU will not run until it is brought up by VCPUOP_up. * * @extra_arg == pointer to vcpu_guest_context structure containing initial * state for the VCPU. */ #define VCPUOP_initialise 0 /* * Bring up a VCPU. This makes the VCPU runnable. This operation will fail * if the VCPU has not been initialised (VCPUOP_initialise). */ #define VCPUOP_up 1 /* * Bring down a VCPU (i.e., make it non-runnable). * There are a few caveats that callers should observe: * 1. This operation may return, and VCPU_is_up may return false, before the * VCPU stops running (i.e., the command is asynchronous). It is a good * idea to ensure that the VCPU has entered a non-critical loop before * bringing it down. Alternatively, this operation is guaranteed * synchronous if invoked by the VCPU itself. * 2. After a VCPU is initialised, there is currently no way to drop all its * references to domain memory. Even a VCPU that is down still holds * memory references via its pagetable base pointer and GDT. It is good * practise to move a VCPU onto an 'idle' or default page table, LDT and * GDT before bringing it down. */ #define VCPUOP_down 2 /* Returns 1 if the given VCPU is up. */ #define VCPUOP_is_up 3 /* * Return information about the state and running time of a VCPU. * @extra_arg == pointer to vcpu_runstate_info structure. */ #define VCPUOP_get_runstate_info 4 struct vcpu_runstate_info { /* VCPU's current state (RUNSTATE_*). */ int state; /* When was current state entered (system time, ns)? */ uint64_t state_entry_time; /* * Time spent in each RUNSTATE_* (ns). The sum of these times is * guaranteed not to drift from system time. */ uint64_t time[4]; }; typedef struct vcpu_runstate_info vcpu_runstate_info_t; DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t); /* VCPU is currently running on a physical CPU. */ #define RUNSTATE_running 0 /* VCPU is runnable, but not currently scheduled on any physical CPU. */ #define RUNSTATE_runnable 1 /* VCPU is blocked (a.k.a. idle). It is therefore not runnable. 
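 *
 * Illustrative sketch: using the vcpu_op prototype given at the top of this
 * file (via a guest-side wrapper such as HYPERVISOR_vcpu_op(), supplied by
 * the guest OS, not by this header), a guest can sample the runstate of,
 * say, VCPU 0:
 *
 *     struct vcpu_runstate_info ri;
 *     if ( HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, 0, &ri) == 0 )
 *     {
 *         // ri.state is one of the RUNSTATE_* values defined here, and
 *         // ri.time[RUNSTATE_running] is the time (ns) spent running.
 *     }
 *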
*/ #define RUNSTATE_blocked 2 /* * VCPU is not runnable, but it is not blocked. * This is a 'catch all' state for things like hotplug and pauses by the * system administrator (or for critical sections in the hypervisor). * RUNSTATE_blocked dominates this state (it is the preferred state). */ #define RUNSTATE_offline 3 /* * Register a shared memory area from which the guest may obtain its own * runstate information without needing to execute a hypercall. * Notes: * 1. The registered address may be virtual or physical or guest handle, * depending on the platform. Virtual address or guest handle should be * registered on x86 systems. * 2. Only one shared area may be registered per VCPU. The shared area is * updated by the hypervisor each time the VCPU is scheduled. Thus * runstate.state will always be RUNSTATE_running and * runstate.state_entry_time will indicate the system time at which the * VCPU was last scheduled to run. * @extra_arg == pointer to vcpu_register_runstate_memory_area structure. */ #define VCPUOP_register_runstate_memory_area 5 struct vcpu_register_runstate_memory_area { union { XEN_GUEST_HANDLE(vcpu_runstate_info_t) h; struct vcpu_runstate_info *v; uint64_t p; } addr; }; typedef struct vcpu_register_runstate_memory_area vcpu_register_runstate_memory_area_t; DEFINE_XEN_GUEST_HANDLE(vcpu_register_runstate_memory_area_t); /* * Set or stop a VCPU's periodic timer. Every VCPU has one periodic timer * which can be set via these commands. Periods smaller than one millisecond * may not be supported. */ #define VCPUOP_set_periodic_timer 6 /* arg == vcpu_set_periodic_timer_t */ #define VCPUOP_stop_periodic_timer 7 /* arg == NULL */ struct vcpu_set_periodic_timer { uint64_t period_ns; }; typedef struct vcpu_set_periodic_timer vcpu_set_periodic_timer_t; DEFINE_XEN_GUEST_HANDLE(vcpu_set_periodic_timer_t); /* * Set or stop a VCPU's single-shot timer. Every VCPU has one single-shot * timer which can be set via these commands. */ #define VCPUOP_set_singleshot_timer 8 /* arg == vcpu_set_singleshot_timer_t */ #define VCPUOP_stop_singleshot_timer 9 /* arg == NULL */ struct vcpu_set_singleshot_timer { uint64_t timeout_abs_ns; /* Absolute system time value in nanoseconds. */ uint32_t flags; /* VCPU_SSHOTTMR_??? */ }; typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t; DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t); /* Flags to VCPUOP_set_singleshot_timer. */ /* Require the timeout to be in the future (return -ETIME if it's passed). */ #define _VCPU_SSHOTTMR_future (0) #define VCPU_SSHOTTMR_future (1U << _VCPU_SSHOTTMR_future) /* * Register a memory location in the guest address space for the * vcpu_info structure. This allows the guest to place the vcpu_info * structure in a convenient place, such as in a per-cpu data area. * The pointer need not be page aligned, but the structure must not * cross a page boundary. * * This may be called only once per vcpu. */ #define VCPUOP_register_vcpu_info 10 /* arg == vcpu_register_vcpu_info_t */ struct vcpu_register_vcpu_info { uint64_t mfn; /* mfn of page to place vcpu_info */ uint32_t offset; /* offset within page */ uint32_t rsvd; /* unused */ }; typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t; DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t); /* Send an NMI to the specified VCPU. @extra_arg == NULL. */ #define VCPUOP_send_nmi 11 /* * Get the physical ID information for a pinned vcpu's underlying physical * processor. The physical ID informmation is architecture-specific. 
* On x86: id[31:0]=apic_id, id[63:32]=acpi_id. * This command returns -EINVAL if it is not a valid operation for this VCPU. */ #define VCPUOP_get_physid 12 /* arg == vcpu_get_physid_t */ struct vcpu_get_physid { uint64_t phys_id; }; typedef struct vcpu_get_physid vcpu_get_physid_t; DEFINE_XEN_GUEST_HANDLE(vcpu_get_physid_t); #define xen_vcpu_physid_to_x86_apicid(physid) ((uint32_t)(physid)) #define xen_vcpu_physid_to_x86_acpiid(physid) ((uint32_t)((physid) >> 32)) /* * Register a memory location to get a secondary copy of the vcpu time * parameters. The master copy still exists as part of the vcpu shared * memory area, and this secondary copy is updated whenever the master copy * is updated (and using the same versioning scheme for synchronisation). * * The intent is that this copy may be mapped (RO) into userspace so * that usermode can compute system time using the time info and the * tsc. Usermode will see an array of vcpu_time_info structures, one * for each vcpu, and choose the right one by an existing mechanism * which allows it to get the current vcpu number (such as via a * segment limit). It can then apply the normal algorithm to compute * system time from the tsc. * * @extra_arg == pointer to vcpu_register_time_info_memory_area structure. */ #define VCPUOP_register_vcpu_time_memory_area 13 DEFINE_XEN_GUEST_HANDLE(vcpu_time_info_t); struct vcpu_register_time_memory_area { union { XEN_GUEST_HANDLE(vcpu_time_info_t) h; struct vcpu_time_info *v; uint64_t p; } addr; }; typedef struct vcpu_register_time_memory_area vcpu_register_time_memory_area_t; DEFINE_XEN_GUEST_HANDLE(vcpu_register_time_memory_area_t); #endif /* __XEN_PUBLIC_VCPU_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/xsm/0000775000175000017500000000000012307313555014745 5ustar smbsmbxen-4.4.0/xen/include/public/xsm/flask_op.h0000664000175000017500000001327212307313555016721 0ustar smbsmb/* * This file contains the flask_op hypercall commands and definitions. * * Author: George Coker, * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
*/ #ifndef __FLASK_OP_H__ #define __FLASK_OP_H__ #define XEN_FLASK_INTERFACE_VERSION 1 struct xen_flask_load { XEN_GUEST_HANDLE(char) buffer; uint32_t size; }; struct xen_flask_setenforce { uint32_t enforcing; }; struct xen_flask_sid_context { /* IN/OUT: sid to convert to/from string */ uint32_t sid; /* IN: size of the context buffer * OUT: actual size of the output context string */ uint32_t size; XEN_GUEST_HANDLE(char) context; }; struct xen_flask_access { /* IN: access request */ uint32_t ssid; uint32_t tsid; uint32_t tclass; uint32_t req; /* OUT: AVC data */ uint32_t allowed; uint32_t audit_allow; uint32_t audit_deny; uint32_t seqno; }; struct xen_flask_transition { /* IN: transition SIDs and class */ uint32_t ssid; uint32_t tsid; uint32_t tclass; /* OUT: new SID */ uint32_t newsid; }; struct xen_flask_userlist { /* IN: starting SID for list */ uint32_t start_sid; /* IN: size of user string and output buffer * OUT: number of SIDs returned */ uint32_t size; union { /* IN: user to enumerate SIDs */ XEN_GUEST_HANDLE(char) user; /* OUT: SID list */ XEN_GUEST_HANDLE(uint32) sids; } u; }; struct xen_flask_boolean { /* IN/OUT: numeric identifier for boolean [GET/SET] * If -1, name will be used and bool_id will be filled in. */ uint32_t bool_id; /* OUT: current enforcing value of boolean [GET/SET] */ uint8_t enforcing; /* OUT: pending value of boolean [GET/SET] */ uint8_t pending; /* IN: new value of boolean [SET] */ uint8_t new_value; /* IN: commit new value instead of only setting pending [SET] */ uint8_t commit; /* IN: size of boolean name buffer [GET/SET] * OUT: actual size of name [GET only] */ uint32_t size; /* IN: if bool_id is -1, used to find boolean [GET/SET] * OUT: textual name of boolean [GET only] */ XEN_GUEST_HANDLE(char) name; }; struct xen_flask_setavc_threshold { /* IN */ uint32_t threshold; }; struct xen_flask_hash_stats { /* OUT */ uint32_t entries; uint32_t buckets_used; uint32_t buckets_total; uint32_t max_chain_len; }; struct xen_flask_cache_stats { /* IN */ uint32_t cpu; /* OUT */ uint32_t lookups; uint32_t hits; uint32_t misses; uint32_t allocations; uint32_t reclaims; uint32_t frees; }; struct xen_flask_ocontext { /* IN */ uint32_t ocon; uint32_t sid; uint64_t low, high; }; struct xen_flask_peersid { /* IN */ evtchn_port_t evtchn; /* OUT */ uint32_t sid; }; struct xen_flask_relabel { /* IN */ uint32_t domid; uint32_t sid; }; struct xen_flask_op { uint32_t cmd; #define FLASK_LOAD 1 #define FLASK_GETENFORCE 2 #define FLASK_SETENFORCE 3 #define FLASK_CONTEXT_TO_SID 4 #define FLASK_SID_TO_CONTEXT 5 #define FLASK_ACCESS 6 #define FLASK_CREATE 7 #define FLASK_RELABEL 8 #define FLASK_USER 9 #define FLASK_POLICYVERS 10 #define FLASK_GETBOOL 11 #define FLASK_SETBOOL 12 #define FLASK_COMMITBOOLS 13 #define FLASK_MLS 14 #define FLASK_DISABLE 15 #define FLASK_GETAVC_THRESHOLD 16 #define FLASK_SETAVC_THRESHOLD 17 #define FLASK_AVC_HASHSTATS 18 #define FLASK_AVC_CACHESTATS 19 #define FLASK_MEMBER 20 #define FLASK_ADD_OCONTEXT 21 #define FLASK_DEL_OCONTEXT 22 #define FLASK_GET_PEER_SID 23 #define FLASK_RELABEL_DOMAIN 24 uint32_t interface_version; /* XEN_FLASK_INTERFACE_VERSION */ union { struct xen_flask_load load; struct xen_flask_setenforce enforce; /* FLASK_CONTEXT_TO_SID and FLASK_SID_TO_CONTEXT */ struct xen_flask_sid_context sid_context; struct xen_flask_access access; /* FLASK_CREATE, FLASK_RELABEL, FLASK_MEMBER */ struct xen_flask_transition transition; struct xen_flask_userlist userlist; /* FLASK_GETBOOL, FLASK_SETBOOL */ struct xen_flask_boolean boolean; struct 
xen_flask_setavc_threshold setavc_threshold; struct xen_flask_hash_stats hash_stats; struct xen_flask_cache_stats cache_stats; /* FLASK_ADD_OCONTEXT, FLASK_DEL_OCONTEXT */ struct xen_flask_ocontext ocontext; struct xen_flask_peersid peersid; struct xen_flask_relabel relabel; } u; }; typedef struct xen_flask_op xen_flask_op_t; DEFINE_XEN_GUEST_HANDLE(xen_flask_op_t); #endif xen-4.4.0/xen/include/public/event_channel.h0000664000175000017500000003033712307313555017126 0ustar smbsmb/****************************************************************************** * event_channel.h * * Event channels between domains. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2003-2004, K A Fraser. */ #ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__ #define __XEN_PUBLIC_EVENT_CHANNEL_H__ #include "xen.h" /* * `incontents 150 evtchn Event Channels * * Event channels are the basic primitive provided by Xen for event * notifications. An event is the Xen equivalent of a hardware * interrupt. They essentially store one bit of information, the event * of interest is signalled by transitioning this bit from 0 to 1. * * Notifications are received by a guest via an upcall from Xen, * indicating when an event arrives (setting the bit). Further * notifications are masked until the bit is cleared again (therefore, * guests must check the value of the bit after re-enabling event * delivery to ensure no missed notifications). * * Event notifications can be masked by setting a flag; this is * equivalent to disabling interrupts and can be used to ensure * atomicity of certain operations in the guest kernel. * * Event channels are represented by the evtchn_* fields in * struct shared_info and struct vcpu_info. */ /* * ` enum neg_errnoval * ` HYPERVISOR_event_channel_op(enum event_channel_op cmd, void *args) * ` * @cmd == EVTCHNOP_* (event-channel operation). * @args == struct evtchn_* Operation-specific extra arguments (NULL if none). 
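 *
 * Illustrative sketch (frontend_domid is a hypothetical variable naming the
 * peer domain): a backend might allocate an unbound port for a frontend and
 * later notify it, using the EVTCHNOP_* commands and structures defined
 * below; HYPERVISOR_event_channel_op() is the guest-side wrapper for the
 * hypercall described above:
 *
 *     struct evtchn_alloc_unbound alloc = {
 *         .dom        = DOMID_SELF,
 *         .remote_dom = frontend_domid,
 *     };
 *     if ( HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc) == 0 )
 *     {
 *         // Advertise alloc.port to the peer (e.g. via xenstore); once the
 *         // peer has bound it, a notification is a single further call:
 *         struct evtchn_send send = { .port = alloc.port };
 *         HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
 *     }
 *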
*/ /* ` enum event_channel_op { // EVTCHNOP_* => struct evtchn_* */ #define EVTCHNOP_bind_interdomain 0 #define EVTCHNOP_bind_virq 1 #define EVTCHNOP_bind_pirq 2 #define EVTCHNOP_close 3 #define EVTCHNOP_send 4 #define EVTCHNOP_status 5 #define EVTCHNOP_alloc_unbound 6 #define EVTCHNOP_bind_ipi 7 #define EVTCHNOP_bind_vcpu 8 #define EVTCHNOP_unmask 9 #define EVTCHNOP_reset 10 #define EVTCHNOP_init_control 11 #define EVTCHNOP_expand_array 12 #define EVTCHNOP_set_priority 13 /* ` } */ typedef uint32_t evtchn_port_t; DEFINE_XEN_GUEST_HANDLE(evtchn_port_t); /* * EVTCHNOP_alloc_unbound: Allocate a port in domain and mark as * accepting interdomain bindings from domain . A fresh port * is allocated in and returned as . * NOTES: * 1. If the caller is unprivileged then must be DOMID_SELF. * 2. may be DOMID_SELF, allowing loopback connections. */ struct evtchn_alloc_unbound { /* IN parameters */ domid_t dom, remote_dom; /* OUT parameters */ evtchn_port_t port; }; typedef struct evtchn_alloc_unbound evtchn_alloc_unbound_t; /* * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between * the calling domain and . must identify * a port that is unbound and marked as accepting bindings from the calling * domain. A fresh port is allocated in the calling domain and returned as * . * * In case the peer domain has already tried to set our event channel * pending, before it was bound, EVTCHNOP_bind_interdomain always sets * the local event channel pending. * * The usual pattern of use, in the guest's upcall (or subsequent * handler) is as follows: (Re-enable the event channel for subsequent * signalling and then) check for the existence of whatever condition * is being waited for by other means, and take whatever action is * needed (if any). * * NOTES: * 1. may be DOMID_SELF, allowing loopback connections. */ struct evtchn_bind_interdomain { /* IN parameters. */ domid_t remote_dom; evtchn_port_t remote_port; /* OUT parameters. */ evtchn_port_t local_port; }; typedef struct evtchn_bind_interdomain evtchn_bind_interdomain_t; /* * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ on specified * vcpu. * NOTES: * 1. Virtual IRQs are classified as per-vcpu or global. See the VIRQ list * in xen.h for the classification of each VIRQ. * 2. Global VIRQs must be allocated on VCPU0 but can subsequently be * re-bound via EVTCHNOP_bind_vcpu. * 3. Per-vcpu VIRQs may be bound to at most one event channel per vcpu. * The allocated event channel is bound to the specified vcpu and the * binding cannot be changed. */ struct evtchn_bind_virq { /* IN parameters. */ uint32_t virq; /* enum virq */ uint32_t vcpu; /* OUT parameters. */ evtchn_port_t port; }; typedef struct evtchn_bind_virq evtchn_bind_virq_t; /* * EVTCHNOP_bind_pirq: Bind a local event channel to a real IRQ (PIRQ ). * NOTES: * 1. A physical IRQ may be bound to at most one event channel per domain. * 2. Only a sufficiently-privileged domain may bind to a physical IRQ. */ struct evtchn_bind_pirq { /* IN parameters. */ uint32_t pirq; #define BIND_PIRQ__WILL_SHARE 1 uint32_t flags; /* BIND_PIRQ__* */ /* OUT parameters. */ evtchn_port_t port; }; typedef struct evtchn_bind_pirq evtchn_bind_pirq_t; /* * EVTCHNOP_bind_ipi: Bind a local event channel to receive events. * NOTES: * 1. The allocated event channel is bound to the specified vcpu. The binding * may not be changed. */ struct evtchn_bind_ipi { uint32_t vcpu; /* OUT parameters. 
*/ evtchn_port_t port; }; typedef struct evtchn_bind_ipi evtchn_bind_ipi_t; /* * EVTCHNOP_close: Close a local event channel . If the channel is * interdomain then the remote end is placed in the unbound state * (EVTCHNSTAT_unbound), awaiting a new connection. */ struct evtchn_close { /* IN parameters. */ evtchn_port_t port; }; typedef struct evtchn_close evtchn_close_t; /* * EVTCHNOP_send: Send an event to the remote end of the channel whose local * endpoint is . */ struct evtchn_send { /* IN parameters. */ evtchn_port_t port; }; typedef struct evtchn_send evtchn_send_t; /* * EVTCHNOP_status: Get the current status of the communication channel which * has an endpoint at . * NOTES: * 1. may be specified as DOMID_SELF. * 2. Only a sufficiently-privileged domain may obtain the status of an event * channel for which is not DOMID_SELF. */ struct evtchn_status { /* IN parameters */ domid_t dom; evtchn_port_t port; /* OUT parameters */ #define EVTCHNSTAT_closed 0 /* Channel is not in use. */ #define EVTCHNSTAT_unbound 1 /* Channel is waiting interdom connection.*/ #define EVTCHNSTAT_interdomain 2 /* Channel is connected to remote domain. */ #define EVTCHNSTAT_pirq 3 /* Channel is bound to a phys IRQ line. */ #define EVTCHNSTAT_virq 4 /* Channel is bound to a virtual IRQ line */ #define EVTCHNSTAT_ipi 5 /* Channel is bound to a virtual IPI line */ uint32_t status; uint32_t vcpu; /* VCPU to which this channel is bound. */ union { struct { domid_t dom; } unbound; /* EVTCHNSTAT_unbound */ struct { domid_t dom; evtchn_port_t port; } interdomain; /* EVTCHNSTAT_interdomain */ uint32_t pirq; /* EVTCHNSTAT_pirq */ uint32_t virq; /* EVTCHNSTAT_virq */ } u; }; typedef struct evtchn_status evtchn_status_t; /* * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an * event is pending. * NOTES: * 1. IPI-bound channels always notify the vcpu specified at bind time. * This binding cannot be changed. * 2. Per-VCPU VIRQ channels always notify the vcpu specified at bind time. * This binding cannot be changed. * 3. All other channels notify vcpu0 by default. This default is set when * the channel is allocated (a port that is freed and subsequently reused * has its binding reset to vcpu0). */ struct evtchn_bind_vcpu { /* IN parameters. */ evtchn_port_t port; uint32_t vcpu; }; typedef struct evtchn_bind_vcpu evtchn_bind_vcpu_t; /* * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver * a notification to the appropriate VCPU if an event is pending. */ struct evtchn_unmask { /* IN parameters. */ evtchn_port_t port; }; typedef struct evtchn_unmask evtchn_unmask_t; /* * EVTCHNOP_reset: Close all event channels associated with specified domain. * NOTES: * 1. may be specified as DOMID_SELF. * 2. Only a sufficiently-privileged domain may specify other than DOMID_SELF. */ struct evtchn_reset { /* IN parameters. */ domid_t dom; }; typedef struct evtchn_reset evtchn_reset_t; /* * EVTCHNOP_init_control: initialize the control block for the FIFO ABI. * * Note: any events that are currently pending will not be resent and * will be lost. Guests should call this before binding any event to * avoid losing any events. */ struct evtchn_init_control { /* IN parameters. */ uint64_t control_gfn; uint32_t offset; uint32_t vcpu; /* OUT parameters. */ uint8_t link_bits; uint8_t _pad[7]; }; typedef struct evtchn_init_control evtchn_init_control_t; /* * EVTCHNOP_expand_array: add an additional page to the event array. */ struct evtchn_expand_array { /* IN parameters. 
*/ uint64_t array_gfn; }; typedef struct evtchn_expand_array evtchn_expand_array_t; /* * EVTCHNOP_set_priority: set the priority for an event channel. */ struct evtchn_set_priority { /* IN parameters. */ uint32_t port; uint32_t priority; }; typedef struct evtchn_set_priority evtchn_set_priority_t; /* * ` enum neg_errnoval * ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op) * ` * Superceded by new event_channel_op() hypercall since 0x00030202. */ struct evtchn_op { uint32_t cmd; /* enum event_channel_op */ union { struct evtchn_alloc_unbound alloc_unbound; struct evtchn_bind_interdomain bind_interdomain; struct evtchn_bind_virq bind_virq; struct evtchn_bind_pirq bind_pirq; struct evtchn_bind_ipi bind_ipi; struct evtchn_close close; struct evtchn_send send; struct evtchn_status status; struct evtchn_bind_vcpu bind_vcpu; struct evtchn_unmask unmask; } u; }; typedef struct evtchn_op evtchn_op_t; DEFINE_XEN_GUEST_HANDLE(evtchn_op_t); /* * 2-level ABI */ #define EVTCHN_2L_NR_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64) /* * FIFO ABI */ /* Events may have priorities from 0 (highest) to 15 (lowest). */ #define EVTCHN_FIFO_PRIORITY_MAX 0 #define EVTCHN_FIFO_PRIORITY_DEFAULT 7 #define EVTCHN_FIFO_PRIORITY_MIN 15 #define EVTCHN_FIFO_MAX_QUEUES (EVTCHN_FIFO_PRIORITY_MIN + 1) typedef uint32_t event_word_t; #define EVTCHN_FIFO_PENDING 31 #define EVTCHN_FIFO_MASKED 30 #define EVTCHN_FIFO_LINKED 29 #define EVTCHN_FIFO_BUSY 28 #define EVTCHN_FIFO_LINK_BITS 17 #define EVTCHN_FIFO_LINK_MASK ((1 << EVTCHN_FIFO_LINK_BITS) - 1) #define EVTCHN_FIFO_NR_CHANNELS (1 << EVTCHN_FIFO_LINK_BITS) struct evtchn_fifo_control_block { uint32_t ready; uint32_t _rsvd; uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; }; typedef struct evtchn_fifo_control_block evtchn_fifo_control_block_t; #endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/COPYING0000664000175000017500000000326412307313555015176 0ustar smbsmbXEN NOTICE ========== This copyright applies to all files within this subdirectory and its subdirectories: include/public/*.h include/public/hvm/*.h include/public/io/*.h The intention is that these files can be freely copied into the source tree of an operating system when porting that OS to run on Xen. Doing so does *not* cause the OS to become subject to the terms of the GPL. All other files in the Xen source distribution are covered by version 2 of the GNU General Public License except where explicitly stated otherwise within individual source files. -- Keir Fraser (on behalf of the Xen team) ===================================================================== Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. xen-4.4.0/xen/include/public/xencomm.h0000664000175000017500000000321312307313555015754 0ustar smbsmb/* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (C) IBM Corp. 2006 */ #ifndef _XEN_XENCOMM_H_ #define _XEN_XENCOMM_H_ /* A xencomm descriptor is a scatter/gather list containing physical * addresses corresponding to a virtually contiguous memory area. The * hypervisor translates these physical addresses to machine addresses to copy * to and from the virtually contiguous area. */ #define XENCOMM_MAGIC 0x58434F4D /* 'XCOM' */ #define XENCOMM_INVALID (~0UL) struct xencomm_desc { uint32_t magic; uint32_t nr_addrs; /* the number of entries in address[] */ uint64_t address[0]; }; #endif /* _XEN_XENCOMM_H_ */ xen-4.4.0/xen/include/public/mem_event.h0000664000175000017500000000564412307313555016277 0ustar smbsmb/****************************************************************************** * mem_event.h * * Memory event common structures. * * Copyright (c) 2009 by Citrix Systems, Inc. (Patrick Colp) * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
*/ #ifndef _XEN_PUBLIC_MEM_EVENT_H #define _XEN_PUBLIC_MEM_EVENT_H #include "xen.h" #include "io/ring.h" /* Memory event flags */ #define MEM_EVENT_FLAG_VCPU_PAUSED (1 << 0) #define MEM_EVENT_FLAG_DROP_PAGE (1 << 1) #define MEM_EVENT_FLAG_EVICT_FAIL (1 << 2) #define MEM_EVENT_FLAG_FOREIGN (1 << 3) #define MEM_EVENT_FLAG_DUMMY (1 << 4) /* Reasons for the memory event request */ #define MEM_EVENT_REASON_UNKNOWN 0 /* typical reason */ #define MEM_EVENT_REASON_VIOLATION 1 /* access violation, GFN is address */ #define MEM_EVENT_REASON_CR0 2 /* CR0 was hit: gfn is CR0 value */ #define MEM_EVENT_REASON_CR3 3 /* CR3 was hit: gfn is CR3 value */ #define MEM_EVENT_REASON_CR4 4 /* CR4 was hit: gfn is CR4 value */ #define MEM_EVENT_REASON_INT3 5 /* int3 was hit: gla/gfn are RIP */ #define MEM_EVENT_REASON_SINGLESTEP 6 /* single step was invoked: gla/gfn are RIP */ #define MEM_EVENT_REASON_MSR 7 /* MSR was hit: gfn is MSR value, gla is MSR address; does NOT honour HVMPME_onchangeonly */ typedef struct mem_event_st { uint32_t flags; uint32_t vcpu_id; uint64_t gfn; uint64_t offset; uint64_t gla; /* if gla_valid */ uint32_t p2mt; uint16_t access_r:1; uint16_t access_w:1; uint16_t access_x:1; uint16_t gla_valid:1; uint16_t available:12; uint16_t reason; } mem_event_request_t, mem_event_response_t; DEFINE_RING_TYPES(mem_event, mem_event_request_t, mem_event_response_t); #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/public/tmem.h0000664000175000017500000001140612307313555015253 0ustar smbsmb/****************************************************************************** * tmem.h * * Guest OS interface to Xen Transcendent Memory. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
* * Copyright (c) 2004, K A Fraser */ #ifndef __XEN_PUBLIC_TMEM_H__ #define __XEN_PUBLIC_TMEM_H__ #include "xen.h" /* version of ABI */ #define TMEM_SPEC_VERSION 1 /* Commands to HYPERVISOR_tmem_op() */ #define TMEM_CONTROL 0 #define TMEM_NEW_POOL 1 #define TMEM_DESTROY_POOL 2 #define TMEM_PUT_PAGE 4 #define TMEM_GET_PAGE 5 #define TMEM_FLUSH_PAGE 6 #define TMEM_FLUSH_OBJECT 7 #if __XEN_INTERFACE_VERSION__ < 0x00040400 #define TMEM_NEW_PAGE 3 #define TMEM_READ 8 #define TMEM_WRITE 9 #define TMEM_XCHG 10 #endif /* Privileged commands to HYPERVISOR_tmem_op() */ #define TMEM_AUTH 101 #define TMEM_RESTORE_NEW 102 /* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */ #define TMEMC_THAW 0 #define TMEMC_FREEZE 1 #define TMEMC_FLUSH 2 #define TMEMC_DESTROY 3 #define TMEMC_LIST 4 #define TMEMC_SET_WEIGHT 5 #define TMEMC_SET_CAP 6 #define TMEMC_SET_COMPRESS 7 #define TMEMC_QUERY_FREEABLE_MB 8 #define TMEMC_SAVE_BEGIN 10 #define TMEMC_SAVE_GET_VERSION 11 #define TMEMC_SAVE_GET_MAXPOOLS 12 #define TMEMC_SAVE_GET_CLIENT_WEIGHT 13 #define TMEMC_SAVE_GET_CLIENT_CAP 14 #define TMEMC_SAVE_GET_CLIENT_FLAGS 15 #define TMEMC_SAVE_GET_POOL_FLAGS 16 #define TMEMC_SAVE_GET_POOL_NPAGES 17 #define TMEMC_SAVE_GET_POOL_UUID 18 #define TMEMC_SAVE_GET_NEXT_PAGE 19 #define TMEMC_SAVE_GET_NEXT_INV 20 #define TMEMC_SAVE_END 21 #define TMEMC_RESTORE_BEGIN 30 #define TMEMC_RESTORE_PUT_PAGE 32 #define TMEMC_RESTORE_FLUSH_PAGE 33 /* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ #define TMEM_POOL_PERSIST 1 #define TMEM_POOL_SHARED 2 #define TMEM_POOL_PRECOMPRESSED 4 #define TMEM_POOL_PAGESIZE_SHIFT 4 #define TMEM_POOL_PAGESIZE_MASK 0xf #define TMEM_POOL_VERSION_SHIFT 24 #define TMEM_POOL_VERSION_MASK 0xff #define TMEM_POOL_RESERVED_BITS 0x00ffff00 /* Bits for client flags (save/restore) */ #define TMEM_CLIENT_COMPRESS 1 #define TMEM_CLIENT_FROZEN 2 /* Special errno values */ #define EFROZEN 1000 #define EEMPTY 1001 #ifndef __ASSEMBLY__ #if __XEN_INTERFACE_VERSION__ < 0x00040400 typedef xen_pfn_t tmem_cli_mfn_t; #endif typedef XEN_GUEST_HANDLE(char) tmem_cli_va_t; struct tmem_op { uint32_t cmd; int32_t pool_id; union { struct { uint64_t uuid[2]; uint32_t flags; uint32_t arg1; } creat; /* for cmd == TMEM_NEW_POOL, TMEM_AUTH, TMEM_RESTORE_NEW */ struct { uint32_t subop; uint32_t cli_id; uint32_t arg1; uint32_t arg2; uint64_t oid[3]; tmem_cli_va_t buf; } ctrl; /* for cmd == TMEM_CONTROL */ struct { uint64_t oid[3]; uint32_t index; uint32_t tmem_offset; uint32_t pfn_offset; uint32_t len; xen_pfn_t cmfn; /* client machine page frame */ } gen; /* for all other cmd ("generic") */ } u; }; typedef struct tmem_op tmem_op_t; DEFINE_XEN_GUEST_HANDLE(tmem_op_t); struct tmem_handle { uint32_t pool_id; uint32_t index; uint64_t oid[3]; }; #endif #endif /* __XEN_PUBLIC_TMEM_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/acpi/0000775000175000017500000000000012307313555013574 5ustar smbsmbxen-4.4.0/xen/include/acpi/actbl1.h0000664000175000017500000006253512307313555015126 0ustar smbsmb/****************************************************************************** * * Name: actbl1.h - Additional ACPI table definitions * *****************************************************************************/ /* * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACTBL1_H__ #define __ACTBL1_H__ /******************************************************************************* * * Additional ACPI Tables (1) * * These tables are not consumed directly by the ACPICA subsystem, but are * included here to support device drivers and the AML disassembler. * * The tables in this file are fully defined within the ACPI specification. * ******************************************************************************/ /* * Values for description table header signatures for tables defined in this * file. Useful because they make it more difficult to inadvertently type in * the wrong signature. */ #define ACPI_SIG_BERT "BERT" /* Boot Error Record Table */ #define ACPI_SIG_CPEP "CPEP" /* Corrected Platform Error Polling table */ #define ACPI_SIG_ECDT "ECDT" /* Embedded Controller Boot Resources Table */ #define ACPI_SIG_EINJ "EINJ" /* Error Injection table */ #define ACPI_SIG_ERST "ERST" /* Error Record Serialization Table */ #define ACPI_SIG_HEST "HEST" /* Hardware Error Source Table */ #define ACPI_SIG_MADT "APIC" /* Multiple APIC Description Table */ #define ACPI_SIG_MSCT "MSCT" /* Maximum System Characteristics Table */ #define ACPI_SIG_SBST "SBST" /* Smart Battery Specification Table */ #define ACPI_SIG_SLIT "SLIT" /* System Locality Distance Information Table */ #define ACPI_SIG_SRAT "SRAT" /* System Resource Affinity Table */ /* * All tables must be byte-packed to match the ACPI specification, since * the tables are provided by the system BIOS. */ #pragma pack(1) /* * Note about bitfields: The u8 type is used for bitfields in ACPI tables. * This is the only type that is even remotely portable. Anything else is not * portable, so do not use any other bitfield types. 
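 *
 * Editorial sketch (not ACPICA code): several of the tables below (MADT,
 * SRAT, ...) are followed by a sequence of variable-length subtables that
 * begin with struct acpi_subtable_header.  A consumer typically walks them
 * as shown here; `madt` is assumed to be an already-mapped, length-checked
 * table and handle_lapic() is a hypothetical callback:
 *
 *     u8 *p   = (u8 *)madt + sizeof(struct acpi_table_madt);
 *     u8 *end = (u8 *)madt + madt->header.length;
 *
 *     while ( p + sizeof(struct acpi_subtable_header) <= end )
 *     {
 *         struct acpi_subtable_header *sub = (struct acpi_subtable_header *)p;
 *
 *         if ( sub->length < sizeof(*sub) )
 *             break;                                malformed entry, stop
 *         if ( sub->type == ACPI_MADT_TYPE_LOCAL_APIC )
 *             handle_lapic((struct acpi_madt_local_apic *)sub);
 *         p += sub->length;
 *     }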
*/ /******************************************************************************* * * Common subtable headers * ******************************************************************************/ /* Generic subtable header (used in MADT, SRAT, etc.) */ struct acpi_subtable_header { u8 type; u8 length; }; /* Subtable header for WHEA tables (EINJ, ERST, WDAT) */ struct acpi_whea_header { u8 action; u8 instruction; u8 flags; u8 reserved; struct acpi_generic_address register_region; u64 value; /* Value used with Read/Write register */ u64 mask; /* Bitmask required for this register instruction */ }; /******************************************************************************* * * BERT - Boot Error Record Table (ACPI 4.0) * Version 1 * ******************************************************************************/ struct acpi_table_bert { struct acpi_table_header header; /* Common ACPI table header */ u32 region_length; /* Length of the boot error region */ u64 address; /* Physical address of the error region */ }; /* Boot Error Region (not a subtable, pointed to by Address field above) */ struct acpi_bert_region { u32 block_status; /* Type of error information */ u32 raw_data_offset; /* Offset to raw error data */ u32 raw_data_length; /* Length of raw error data */ u32 data_length; /* Length of generic error data */ u32 error_severity; /* Severity code */ }; /* Values for block_status flags above */ #define ACPI_BERT_UNCORRECTABLE (1) #define ACPI_BERT_CORRECTABLE (1<<1) #define ACPI_BERT_MULTIPLE_UNCORRECTABLE (1<<2) #define ACPI_BERT_MULTIPLE_CORRECTABLE (1<<3) #define ACPI_BERT_ERROR_ENTRY_COUNT (0xFF<<4) /* 8 bits, error count */ /* Values for error_severity above */ enum acpi_bert_error_severity { ACPI_BERT_ERROR_CORRECTABLE = 0, ACPI_BERT_ERROR_FATAL = 1, ACPI_BERT_ERROR_CORRECTED = 2, ACPI_BERT_ERROR_NONE = 3, ACPI_BERT_ERROR_RESERVED = 4 /* 4 and greater are reserved */ }; /* * Note: The generic error data that follows the error_severity field above * uses the struct acpi_hest_generic_data defined under the HEST table below */ /******************************************************************************* * * CPEP - Corrected Platform Error Polling table (ACPI 4.0) * Version 1 * ******************************************************************************/ struct acpi_table_cpep { struct acpi_table_header header; /* Common ACPI table header */ u64 reserved; }; /* Subtable */ struct acpi_cpep_polling { struct acpi_subtable_header header; u8 id; /* Processor ID */ u8 eid; /* Processor EID */ u32 interval; /* Polling interval (msec) */ }; /******************************************************************************* * * ECDT - Embedded Controller Boot Resources Table * Version 1 * ******************************************************************************/ struct acpi_table_ecdt { struct acpi_table_header header; /* Common ACPI table header */ struct acpi_generic_address control; /* Address of EC command/status register */ struct acpi_generic_address data; /* Address of EC data register */ u32 uid; /* Unique ID - must be same as the EC _UID method */ u8 gpe; /* The GPE for the EC */ u8 id[1]; /* Full namepath of the EC in the ACPI namespace */ }; /******************************************************************************* * * EINJ - Error Injection Table (ACPI 4.0) * Version 1 * ******************************************************************************/ struct acpi_table_einj { struct acpi_table_header header; /* Common ACPI table header */ u32 header_length; u8 flags; u8 
reserved[3]; u32 entries; }; /* EINJ Injection Instruction Entries (actions) */ struct acpi_einj_entry { struct acpi_whea_header whea_header; /* Common header for WHEA tables */ }; /* Masks for Flags field above */ #define ACPI_EINJ_PRESERVE (1) /* Values for Action field above */ enum acpi_einj_actions { ACPI_EINJ_BEGIN_OPERATION = 0, ACPI_EINJ_GET_TRIGGER_TABLE = 1, ACPI_EINJ_SET_ERROR_TYPE = 2, ACPI_EINJ_GET_ERROR_TYPE = 3, ACPI_EINJ_END_OPERATION = 4, ACPI_EINJ_EXECUTE_OPERATION = 5, ACPI_EINJ_CHECK_BUSY_STATUS = 6, ACPI_EINJ_GET_COMMAND_STATUS = 7, ACPI_EINJ_ACTION_RESERVED = 8, /* 8 and greater are reserved */ ACPI_EINJ_TRIGGER_ERROR = 0xFF /* Except for this value */ }; /* Values for Instruction field above */ enum acpi_einj_instructions { ACPI_EINJ_READ_REGISTER = 0, ACPI_EINJ_READ_REGISTER_VALUE = 1, ACPI_EINJ_WRITE_REGISTER = 2, ACPI_EINJ_WRITE_REGISTER_VALUE = 3, ACPI_EINJ_NOOP = 4, ACPI_EINJ_INSTRUCTION_RESERVED = 5 /* 5 and greater are reserved */ }; /* EINJ Trigger Error Action Table */ struct acpi_einj_trigger { u32 header_size; u32 revision; u32 table_size; u32 entry_count; }; /* Command status return values */ enum acpi_einj_command_status { ACPI_EINJ_SUCCESS = 0, ACPI_EINJ_FAILURE = 1, ACPI_EINJ_INVALID_ACCESS = 2, ACPI_EINJ_STATUS_RESERVED = 3 /* 3 and greater are reserved */ }; /* Error types returned from ACPI_EINJ_GET_ERROR_TYPE (bitfield) */ #define ACPI_EINJ_PROCESSOR_CORRECTABLE (1) #define ACPI_EINJ_PROCESSOR_UNCORRECTABLE (1<<1) #define ACPI_EINJ_PROCESSOR_FATAL (1<<2) #define ACPI_EINJ_MEMORY_CORRECTABLE (1<<3) #define ACPI_EINJ_MEMORY_UNCORRECTABLE (1<<4) #define ACPI_EINJ_MEMORY_FATAL (1<<5) #define ACPI_EINJ_PCIX_CORRECTABLE (1<<6) #define ACPI_EINJ_PCIX_UNCORRECTABLE (1<<7) #define ACPI_EINJ_PCIX_FATAL (1<<8) #define ACPI_EINJ_PLATFORM_CORRECTABLE (1<<9) #define ACPI_EINJ_PLATFORM_UNCORRECTABLE (1<<10) #define ACPI_EINJ_PLATFORM_FATAL (1<<11) /******************************************************************************* * * ERST - Error Record Serialization Table (ACPI 4.0) * Version 1 * ******************************************************************************/ struct acpi_table_erst { struct acpi_table_header header; /* Common ACPI table header */ u32 header_length; u32 reserved; u32 entries; }; /* ERST Serialization Entries (actions) */ struct acpi_erst_entry { struct acpi_whea_header whea_header; /* Common header for WHEA tables */ }; /* Masks for Flags field above */ #define ACPI_ERST_PRESERVE (1) /* Values for Action field above */ enum acpi_erst_actions { ACPI_ERST_BEGIN_WRITE = 0, ACPI_ERST_BEGIN_READ = 1, ACPI_ERST_BEGIN_CLEAR = 2, ACPI_ERST_END = 3, ACPI_ERST_SET_RECORD_OFFSET = 4, ACPI_ERST_EXECUTE_OPERATION = 5, ACPI_ERST_CHECK_BUSY_STATUS = 6, ACPI_ERST_GET_COMMAND_STATUS = 7, ACPI_ERST_GET_RECORD_ID = 8, ACPI_ERST_SET_RECORD_ID = 9, ACPI_ERST_GET_RECORD_COUNT = 10, ACPI_ERST_BEGIN_DUMMY_WRIITE = 11, ACPI_ERST_NOT_USED = 12, ACPI_ERST_GET_ERROR_RANGE = 13, ACPI_ERST_GET_ERROR_LENGTH = 14, ACPI_ERST_GET_ERROR_ATTRIBUTES = 15, ACPI_ERST_ACTION_RESERVED = 16 /* 16 and greater are reserved */ }; /* Values for Instruction field above */ enum acpi_erst_instructions { ACPI_ERST_READ_REGISTER = 0, ACPI_ERST_READ_REGISTER_VALUE = 1, ACPI_ERST_WRITE_REGISTER = 2, ACPI_ERST_WRITE_REGISTER_VALUE = 3, ACPI_ERST_NOOP = 4, ACPI_ERST_LOAD_VAR1 = 5, ACPI_ERST_LOAD_VAR2 = 6, ACPI_ERST_STORE_VAR1 = 7, ACPI_ERST_ADD = 8, ACPI_ERST_SUBTRACT = 9, ACPI_ERST_ADD_VALUE = 10, ACPI_ERST_SUBTRACT_VALUE = 11, ACPI_ERST_STALL = 12, ACPI_ERST_STALL_WHILE_TRUE = 13, 
ACPI_ERST_SKIP_NEXT_IF_TRUE = 14, ACPI_ERST_GOTO = 15, ACPI_ERST_SET_SRC_ADDRESS_BASE = 16, ACPI_ERST_SET_DST_ADDRESS_BASE = 17, ACPI_ERST_MOVE_DATA = 18, ACPI_ERST_INSTRUCTION_RESERVED = 19 /* 19 and greater are reserved */ }; /* Command status return values */ enum acpi_erst_command_status { ACPI_ERST_SUCESS = 0, ACPI_ERST_NO_SPACE = 1, ACPI_ERST_NOT_AVAILABLE = 2, ACPI_ERST_FAILURE = 3, ACPI_ERST_RECORD_EMPTY = 4, ACPI_ERST_NOT_FOUND = 5, ACPI_ERST_STATUS_RESERVED = 6 /* 6 and greater are reserved */ }; /* Error Record Serialization Information */ struct acpi_erst_info { u16 signature; /* Should be "ER" */ u8 data[48]; }; /******************************************************************************* * * HEST - Hardware Error Source Table (ACPI 4.0) * Version 1 * ******************************************************************************/ struct acpi_table_hest { struct acpi_table_header header; /* Common ACPI table header */ u32 error_source_count; }; /* HEST subtable header */ struct acpi_hest_header { u16 type; u16 source_id; }; /* Values for Type field above for subtables */ enum acpi_hest_types { ACPI_HEST_TYPE_IA32_CHECK = 0, ACPI_HEST_TYPE_IA32_CORRECTED_CHECK = 1, ACPI_HEST_TYPE_IA32_NMI = 2, ACPI_HEST_TYPE_NOT_USED3 = 3, ACPI_HEST_TYPE_NOT_USED4 = 4, ACPI_HEST_TYPE_NOT_USED5 = 5, ACPI_HEST_TYPE_AER_ROOT_PORT = 6, ACPI_HEST_TYPE_AER_ENDPOINT = 7, ACPI_HEST_TYPE_AER_BRIDGE = 8, ACPI_HEST_TYPE_GENERIC_ERROR = 9, ACPI_HEST_TYPE_RESERVED = 10 /* 10 and greater are reserved */ }; /* * HEST substructures contained in subtables */ /* * IA32 Error Bank(s) - Follows the struct acpi_hest_ia_machine_check and * struct acpi_hest_ia_corrected structures. */ struct acpi_hest_ia_error_bank { u8 bank_number; u8 clear_status_on_init; u8 status_format; u8 reserved; u32 control_register; u64 control_data; u32 status_register; u32 address_register; u32 misc_register; }; /* Common HEST sub-structure for PCI/AER structures below (6,7,8) */ struct acpi_hest_aer_common { u16 reserved1; u8 flags; u8 enabled; u32 records_to_preallocate; u32 max_sections_per_record; u32 bus; u16 device; u16 function; u16 device_control; u16 reserved2; u32 uncorrectable_mask; u32 uncorrectable_severity; u32 correctable_mask; u32 advanced_capabilities; }; /* Masks for HEST Flags fields */ #define ACPI_HEST_FIRMWARE_FIRST (1) #define ACPI_HEST_GLOBAL (1<<1) /* Hardware Error Notification */ struct acpi_hest_notify { u8 type; u8 length; u16 config_write_enable; u32 poll_interval; u32 vector; u32 polling_threshold_value; u32 polling_threshold_window; u32 error_threshold_value; u32 error_threshold_window; }; /* Values for Notify Type field above */ enum acpi_hest_notify_types { ACPI_HEST_NOTIFY_POLLED = 0, ACPI_HEST_NOTIFY_EXTERNAL = 1, ACPI_HEST_NOTIFY_LOCAL = 2, ACPI_HEST_NOTIFY_SCI = 3, ACPI_HEST_NOTIFY_NMI = 4, ACPI_HEST_NOTIFY_RESERVED = 5 /* 5 and greater are reserved */ }; /* Values for config_write_enable bitfield above */ #define ACPI_HEST_TYPE (1) #define ACPI_HEST_POLL_INTERVAL (1<<1) #define ACPI_HEST_POLL_THRESHOLD_VALUE (1<<2) #define ACPI_HEST_POLL_THRESHOLD_WINDOW (1<<3) #define ACPI_HEST_ERR_THRESHOLD_VALUE (1<<4) #define ACPI_HEST_ERR_THRESHOLD_WINDOW (1<<5) /* * HEST subtables */ /* 0: IA32 Machine Check Exception */ struct acpi_hest_ia_machine_check { struct acpi_hest_header header; u16 reserved1; u8 flags; u8 enabled; u32 records_to_preallocate; u32 max_sections_per_record; u64 global_capability_data; u64 global_control_data; u8 num_hardware_banks; u8 reserved3[7]; }; /* 1: IA32 Corrected Machine Check 
*/ struct acpi_hest_ia_corrected { struct acpi_hest_header header; u16 reserved1; u8 flags; u8 enabled; u32 records_to_preallocate; u32 max_sections_per_record; struct acpi_hest_notify notify; u8 num_hardware_banks; u8 reserved2[3]; }; /* 2: IA32 Non-Maskable Interrupt */ struct acpi_hest_ia_nmi { struct acpi_hest_header header; u32 reserved; u32 records_to_preallocate; u32 max_sections_per_record; u32 max_raw_data_length; }; /* 3,4,5: Not used */ /* 6: PCI Express Root Port AER */ struct acpi_hest_aer_root { struct acpi_hest_header header; struct acpi_hest_aer_common aer; u32 root_error_command; }; /* 7: PCI Express AER (AER Endpoint) */ struct acpi_hest_aer { struct acpi_hest_header header; struct acpi_hest_aer_common aer; }; /* 8: PCI Express/PCI-X Bridge AER */ struct acpi_hest_aer_bridge { struct acpi_hest_header header; struct acpi_hest_aer_common aer; u32 uncorrectable_mask2; u32 uncorrectable_severity2; u32 advanced_capabilities2; }; /* 9: Generic Hardware Error Source */ struct acpi_hest_generic { struct acpi_hest_header header; u16 related_source_id; u8 reserved; u8 enabled; u32 records_to_preallocate; u32 max_sections_per_record; u32 max_raw_data_length; struct acpi_generic_address error_status_address; struct acpi_hest_notify notify; u32 error_block_length; }; /* Generic Error Status block */ struct acpi_hest_generic_status { u32 block_status; u32 raw_data_offset; u32 raw_data_length; u32 data_length; u32 error_severity; }; /* Values for block_status flags above */ #define ACPI_HEST_UNCORRECTABLE (1) #define ACPI_HEST_CORRECTABLE (1<<1) #define ACPI_HEST_MULTIPLE_UNCORRECTABLE (1<<2) #define ACPI_HEST_MULTIPLE_CORRECTABLE (1<<3) #define ACPI_HEST_ERROR_ENTRY_COUNT (0xFF<<4) /* 8 bits, error count */ /* Generic Error Data entry */ struct acpi_hest_generic_data { u8 section_type[16]; u32 error_severity; u16 revision; u8 validation_bits; u8 flags; u32 error_data_length; u8 fru_id[16]; u8 fru_text[20]; }; /******************************************************************************* * * MADT - Multiple APIC Description Table * Version 3 * ******************************************************************************/ struct acpi_table_madt { struct acpi_table_header header; /* Common ACPI table header */ u32 address; /* Physical address of local APIC */ u32 flags; }; /* Masks for Flags field above */ #define ACPI_MADT_PCAT_COMPAT (1) /* 00: System also has dual 8259s */ /* Values for PCATCompat flag */ #define ACPI_MADT_DUAL_PIC 0 #define ACPI_MADT_MULTIPLE_APIC 1 /* Values for MADT subtable type in struct acpi_subtable_header */ enum acpi_madt_type { ACPI_MADT_TYPE_LOCAL_APIC = 0, ACPI_MADT_TYPE_IO_APIC = 1, ACPI_MADT_TYPE_INTERRUPT_OVERRIDE = 2, ACPI_MADT_TYPE_NMI_SOURCE = 3, ACPI_MADT_TYPE_LOCAL_APIC_NMI = 4, ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE = 5, ACPI_MADT_TYPE_IO_SAPIC = 6, ACPI_MADT_TYPE_LOCAL_SAPIC = 7, ACPI_MADT_TYPE_INTERRUPT_SOURCE = 8, ACPI_MADT_TYPE_LOCAL_X2APIC = 9, ACPI_MADT_TYPE_LOCAL_X2APIC_NMI = 10, ACPI_MADT_TYPE_RESERVED = 11 /* 11 and greater are reserved */ }; /* * MADT Sub-tables, correspond to Type in struct acpi_subtable_header */ /* 0: Processor Local APIC */ struct acpi_madt_local_apic { struct acpi_subtable_header header; u8 processor_id; /* ACPI processor id */ u8 id; /* Processor's local APIC id */ u32 lapic_flags; }; /* 1: IO APIC */ struct acpi_madt_io_apic { struct acpi_subtable_header header; u8 id; /* I/O APIC ID */ u8 reserved; /* Reserved - must be zero */ u32 address; /* APIC physical address */ u32 global_irq_base; /* Global system interrupt 
where INTI lines start */ }; /* 2: Interrupt Override */ struct acpi_madt_interrupt_override { struct acpi_subtable_header header; u8 bus; /* 0 - ISA */ u8 source_irq; /* Interrupt source (IRQ) */ u32 global_irq; /* Global system interrupt */ u16 inti_flags; }; /* 3: NMI Source */ struct acpi_madt_nmi_source { struct acpi_subtable_header header; u16 inti_flags; u32 global_irq; /* Global system interrupt */ }; /* 4: Local APIC NMI */ struct acpi_madt_local_apic_nmi { struct acpi_subtable_header header; u8 processor_id; /* ACPI processor id */ u16 inti_flags; u8 lint; /* LINTn to which NMI is connected */ }; /* 5: Address Override */ struct acpi_madt_local_apic_override { struct acpi_subtable_header header; u16 reserved; /* Reserved, must be zero */ u64 address; /* APIC physical address */ }; /* 6: I/O Sapic */ struct acpi_madt_io_sapic { struct acpi_subtable_header header; u8 id; /* I/O SAPIC ID */ u8 reserved; /* Reserved, must be zero */ u32 global_irq_base; /* Global interrupt for SAPIC start */ u64 address; /* SAPIC physical address */ }; /* 7: Local Sapic */ struct acpi_madt_local_sapic { struct acpi_subtable_header header; u8 processor_id; /* ACPI processor id */ u8 id; /* SAPIC ID */ u8 eid; /* SAPIC EID */ u8 reserved[3]; /* Reserved, must be zero */ u32 lapic_flags; u32 uid; /* Numeric UID - ACPI 3.0 */ char uid_string[1]; /* String UID - ACPI 3.0 */ }; /* 8: Platform Interrupt Source */ struct acpi_madt_interrupt_source { struct acpi_subtable_header header; u16 inti_flags; u8 type; /* 1=PMI, 2=INIT, 3=corrected */ u8 id; /* Processor ID */ u8 eid; /* Processor EID */ u8 io_sapic_vector; /* Vector value for PMI interrupts */ u32 global_irq; /* Global system interrupt */ u32 flags; /* Interrupt Source Flags */ }; /* Masks for Flags field above */ #define ACPI_MADT_CPEI_OVERRIDE (1) /* 9: Processor Local X2APIC (ACPI 4.0) */ struct acpi_madt_local_x2apic { struct acpi_subtable_header header; u16 reserved; /* Reserved - must be zero */ u32 local_apic_id; /* Processor x2APIC ID */ u32 lapic_flags; u32 uid; /* ACPI processor UID */ }; /* 10: Local X2APIC NMI (ACPI 4.0) */ struct acpi_madt_local_x2apic_nmi { struct acpi_subtable_header header; u16 inti_flags; u32 uid; /* ACPI processor UID */ u8 lint; /* LINTn to which NMI is connected */ u8 reserved[3]; }; /* * Common flags fields for MADT subtables */ /* MADT Local APIC flags (lapic_flags) */ #define ACPI_MADT_ENABLED (1) /* 00: Processor is usable if set */ /* MADT MPS INTI flags (inti_flags) */ #define ACPI_MADT_POLARITY_MASK (3) /* 00-01: Polarity of APIC I/O input signals */ #define ACPI_MADT_TRIGGER_MASK (3<<2) /* 02-03: Trigger mode of APIC input signals */ /* Values for MPS INTI flags */ #define ACPI_MADT_POLARITY_CONFORMS 0 #define ACPI_MADT_POLARITY_ACTIVE_HIGH 1 #define ACPI_MADT_POLARITY_RESERVED 2 #define ACPI_MADT_POLARITY_ACTIVE_LOW 3 #define ACPI_MADT_TRIGGER_CONFORMS (0) #define ACPI_MADT_TRIGGER_EDGE (1<<2) #define ACPI_MADT_TRIGGER_RESERVED (2<<2) #define ACPI_MADT_TRIGGER_LEVEL (3<<2) /******************************************************************************* * * MSCT - Maximum System Characteristics Table (ACPI 4.0) * Version 1 * ******************************************************************************/ struct acpi_table_msct { struct acpi_table_header header; /* Common ACPI table header */ u32 proximity_offset; /* Location of proximity info struct(s) */ u32 max_proximity_domains; /* Max number of proximity domains */ u32 max_clock_domains; /* Max number of clock domains */ u64 max_address; /* Max physical 
address in system */ }; /* Subtable - Maximum Proximity Domain Information. Version 1 */ struct acpi_msct_proximity { u8 revision; u8 length; u32 range_start; /* Start of domain range */ u32 range_end; /* End of domain range */ u32 processor_capacity; u64 memory_capacity; /* In bytes */ }; /******************************************************************************* * * SBST - Smart Battery Specification Table * Version 1 * ******************************************************************************/ struct acpi_table_sbst { struct acpi_table_header header; /* Common ACPI table header */ u32 warning_level; u32 low_level; u32 critical_level; }; /******************************************************************************* * * SLIT - System Locality Distance Information Table * Version 1 * ******************************************************************************/ struct acpi_table_slit { struct acpi_table_header header; /* Common ACPI table header */ u64 locality_count; u8 entry[1]; /* Real size = localities^2 */ }; /******************************************************************************* * * SRAT - System Resource Affinity Table * Version 3 * ******************************************************************************/ struct acpi_table_srat { struct acpi_table_header header; /* Common ACPI table header */ u32 table_revision; /* Must be value '1' */ u64 reserved; /* Reserved, must be zero */ }; /* Values for subtable type in struct acpi_subtable_header */ enum acpi_srat_type { ACPI_SRAT_TYPE_CPU_AFFINITY = 0, ACPI_SRAT_TYPE_MEMORY_AFFINITY = 1, ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY = 2, ACPI_SRAT_TYPE_RESERVED = 3 /* 3 and greater are reserved */ }; /* * SRAT Sub-tables, correspond to Type in struct acpi_subtable_header */ /* 0: Processor Local APIC/SAPIC Affinity */ struct acpi_srat_cpu_affinity { struct acpi_subtable_header header; u8 proximity_domain_lo; u8 apic_id; u32 flags; u8 local_sapic_eid; u8 proximity_domain_hi[3]; u32 reserved; /* Reserved, must be zero */ }; /* Flags */ #define ACPI_SRAT_CPU_USE_AFFINITY (1) /* 00: Use affinity structure */ /* 1: Memory Affinity */ struct acpi_srat_mem_affinity { struct acpi_subtable_header header; u32 proximity_domain; u16 reserved; /* Reserved, must be zero */ u64 base_address; u64 length; u32 reserved1; u32 flags; u64 reserved2; /* Reserved, must be zero */ }; /* Flags */ #define ACPI_SRAT_MEM_ENABLED (1) /* 00: Use affinity structure */ #define ACPI_SRAT_MEM_HOT_PLUGGABLE (1<<1) /* 01: Memory region is hot pluggable */ #define ACPI_SRAT_MEM_NON_VOLATILE (1<<2) /* 02: Memory region is non-volatile */ /* 2: Processor Local X2_APIC Affinity (ACPI 4.0) */ struct acpi_srat_x2apic_cpu_affinity { struct acpi_subtable_header header; u16 reserved; /* Reserved, must be zero */ u32 proximity_domain; u32 apic_id; u32 flags; u32 clock_domain; u32 reserved2; }; /* Flags for struct acpi_srat_cpu_affinity and struct acpi_srat_x2apic_cpu_affinity */ #define ACPI_SRAT_CPU_ENABLED (1) /* 00: Use affinity structure */ /* Reset to default packing */ #pragma pack() #endif /* __ACTBL1_H__ */ xen-4.4.0/xen/include/acpi/actbl.h0000664000175000017500000004210612307313555015035 0ustar smbsmb/****************************************************************************** * * Name: actbl.h - Basic ACPI Table Definitions * *****************************************************************************/ /* * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACTBL_H__ #define __ACTBL_H__ /******************************************************************************* * * Fundamental ACPI tables * * This file contains definitions for the ACPI tables that are directly consumed * by ACPICA. All other tables are consumed by the OS-dependent ACPI-related * device drivers and other OS support code. * * The RSDP and FACS do not use the common ACPI table header. All other ACPI * tables use the header. * ******************************************************************************/ /* * Values for description table header signatures for tables defined in this * file. Useful because they make it more difficult to inadvertently type in * the wrong signature. */ #define ACPI_SIG_DSDT "DSDT" /* Differentiated System Description Table */ #define ACPI_SIG_FADT "FACP" /* Fixed ACPI Description Table */ #define ACPI_SIG_FACS "FACS" /* Firmware ACPI Control Structure */ #define ACPI_SIG_PSDT "PSDT" /* Persistent System Description Table */ #define ACPI_SIG_RSDP "RSD PTR " /* Root System Description Pointer */ #define ACPI_SIG_RSDT "RSDT" /* Root System Description Table */ #define ACPI_SIG_XSDT "XSDT" /* Extended System Description Table */ #define ACPI_SIG_SSDT "SSDT" /* Secondary System Description Table */ #define ACPI_RSDP_NAME "RSDP" /* Short name for RSDP, not signature */ /* * All tables and structures must be byte-packed to match the ACPI * specification, since the tables are provided by the system BIOS */ #pragma pack(1) /* * Note about bitfields: The u8 type is used for bitfields in ACPI tables. * This is the only type that is even remotely portable. Anything else is not * portable, so do not use any other bitfield types. 
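 *
 * Editorial sketch (not ACPICA code): a minimal example of locating a
 * specific table via the XSDT, using the struct acpi_table_rsdp,
 * struct acpi_table_xsdt and struct acpi_table_header definitions below.
 * It assumes rsdp->revision >= 2, and map_table() is a hypothetical helper
 * that maps a physical address and returns a usable pointer:
 *
 *     struct acpi_table_xsdt *xsdt = map_table(rsdp->xsdt_physical_address);
 *     u32 i, count = (xsdt->header.length - sizeof(struct acpi_table_header))
 *                    / sizeof(u64);
 *
 *     for ( i = 0; i < count; i++ )
 *     {
 *         struct acpi_table_header *t = map_table(xsdt->table_offset_entry[i]);
 *
 *         if ( !memcmp(t->signature, "APIC", ACPI_NAME_SIZE) )
 *             return t;                              found the MADT
 *     }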
*/ /******************************************************************************* * * Master ACPI Table Header. This common header is used by all ACPI tables * except the RSDP and FACS. * ******************************************************************************/ struct acpi_table_header { char signature[ACPI_NAME_SIZE]; /* ASCII table signature */ u32 length; /* Length of table in bytes, including this header */ u8 revision; /* ACPI Specification minor version # */ u8 checksum; /* To make sum of entire table == 0 */ char oem_id[ACPI_OEM_ID_SIZE]; /* ASCII OEM identification */ char oem_table_id[ACPI_OEM_TABLE_ID_SIZE]; /* ASCII OEM table identification */ u32 oem_revision; /* OEM revision number */ char asl_compiler_id[ACPI_NAME_SIZE]; /* ASCII ASL compiler vendor ID */ u32 asl_compiler_revision; /* ASL compiler version */ }; /******************************************************************************* * * GAS - Generic Address Structure (ACPI 2.0+) * * Note: Since this structure is used in the ACPI tables, it is byte aligned. * If misaliged access is not supported by the hardware, accesses to the * 64-bit Address field must be performed with care. * ******************************************************************************/ struct acpi_generic_address { u8 space_id; /* Address space where struct or register exists */ u8 bit_width; /* Size in bits of given register */ u8 bit_offset; /* Bit offset within the register */ u8 access_width; /* Minimum Access size (ACPI 3.0) */ u64 address; /* 64-bit address of struct or register */ }; /******************************************************************************* * * RSDP - Root System Description Pointer (Signature is "RSD PTR ") * Version 2 * ******************************************************************************/ struct acpi_table_rsdp { char signature[8]; /* ACPI signature, contains "RSD PTR " */ u8 checksum; /* ACPI 1.0 checksum */ char oem_id[ACPI_OEM_ID_SIZE]; /* OEM identification */ u8 revision; /* Must be (0) for ACPI 1.0 or (2) for ACPI 2.0+ */ u32 rsdt_physical_address; /* 32-bit physical address of the RSDT */ u32 length; /* Table length in bytes, including header (ACPI 2.0+) */ u64 xsdt_physical_address; /* 64-bit physical address of the XSDT (ACPI 2.0+) */ u8 extended_checksum; /* Checksum of entire table (ACPI 2.0+) */ u8 reserved[3]; /* Reserved, must be zero */ }; #define ACPI_RSDP_REV0_SIZE 20 /* Size of original ACPI 1.0 RSDP */ /******************************************************************************* * * RSDT/XSDT - Root System Description Tables * Version 1 (both) * ******************************************************************************/ struct acpi_table_rsdt { struct acpi_table_header header; /* Common ACPI table header */ u32 table_offset_entry[1]; /* Array of pointers to ACPI tables */ }; struct acpi_table_xsdt { struct acpi_table_header header; /* Common ACPI table header */ u64 table_offset_entry[1]; /* Array of pointers to ACPI tables */ }; /******************************************************************************* * * FACS - Firmware ACPI Control Structure (FACS) * ******************************************************************************/ struct acpi_table_facs { char signature[4]; /* ASCII table signature */ u32 length; /* Length of structure, in bytes */ u32 hardware_signature; /* Hardware configuration signature */ u32 firmware_waking_vector; /* 32-bit physical address of the Firmware Waking Vector */ u32 global_lock; /* Global Lock for shared hardware resources */ 
u32 flags; u64 xfirmware_waking_vector; /* 64-bit version of the Firmware Waking Vector (ACPI 2.0+) */ u8 version; /* Version of this table (ACPI 2.0+) */ u8 reserved[3]; /* Reserved, must be zero */ u32 ospm_flags; /* Flags to be set by OSPM (ACPI 4.0) */ u8 reserved1[24]; /* Reserved, must be zero */ }; /* Masks for global_lock flag field above */ #define ACPI_GLOCK_PENDING (1) /* 00: Pending global lock ownership */ #define ACPI_GLOCK_OWNED (1<<1) /* 01: Global lock is owned */ /* Masks for Flags field above */ #define ACPI_FACS_S4_BIOS_PRESENT (1) /* 00: S4BIOS support is present */ #define ACPI_FACS_64BIT_WAKE (1<<1) /* 01: 64-bit wake vector supported (ACPI 4.0) */ /* Masks for ospm_flags field above */ #define ACPI_FACS_64BIT_ENVIRONMENT (1) /* 00: 64-bit wake environment is required (ACPI 4.0) */ /******************************************************************************* * * FADT - Fixed ACPI Description Table (Signature "FACP") * Version 4 * ******************************************************************************/ /* Fields common to all versions of the FADT */ struct acpi_table_fadt { struct acpi_table_header header; /* Common ACPI table header */ u32 facs; /* 32-bit physical address of FACS */ u32 dsdt; /* 32-bit physical address of DSDT */ u8 model; /* System Interrupt Model (ACPI 1.0) - not used in ACPI 2.0+ */ u8 preferred_profile; /* Conveys preferred power management profile to OSPM. */ u16 sci_interrupt; /* System vector of SCI interrupt */ u32 smi_command; /* 32-bit Port address of SMI command port */ u8 acpi_enable; /* Value to write to smi_cmd to enable ACPI */ u8 acpi_disable; /* Value to write to smi_cmd to disable ACPI */ u8 S4bios_request; /* Value to write to SMI CMD to enter S4BIOS state */ u8 pstate_control; /* Processor performance state control */ u32 pm1a_event_block; /* 32-bit Port address of Power Mgt 1a Event Reg Blk */ u32 pm1b_event_block; /* 32-bit Port address of Power Mgt 1b Event Reg Blk */ u32 pm1a_control_block; /* 32-bit Port address of Power Mgt 1a Control Reg Blk */ u32 pm1b_control_block; /* 32-bit Port address of Power Mgt 1b Control Reg Blk */ u32 pm2_control_block; /* 32-bit Port address of Power Mgt 2 Control Reg Blk */ u32 pm_timer_block; /* 32-bit Port address of Power Mgt Timer Ctrl Reg Blk */ u32 gpe0_block; /* 32-bit Port address of General Purpose Event 0 Reg Blk */ u32 gpe1_block; /* 32-bit Port address of General Purpose Event 1 Reg Blk */ u8 pm1_event_length; /* Byte Length of ports at pm1x_event_block */ u8 pm1_control_length; /* Byte Length of ports at pm1x_control_block */ u8 pm2_control_length; /* Byte Length of ports at pm2_control_block */ u8 pm_timer_length; /* Byte Length of ports at pm_timer_block */ u8 gpe0_block_length; /* Byte Length of ports at gpe0_block */ u8 gpe1_block_length; /* Byte Length of ports at gpe1_block */ u8 gpe1_base; /* Offset in GPE number space where GPE1 events start */ u8 cst_control; /* Support for the _CST object and C States change notification */ u16 C2latency; /* Worst case HW latency to enter/exit C2 state */ u16 C3latency; /* Worst case HW latency to enter/exit C3 state */ u16 flush_size; /* Processor's memory cache line width, in bytes */ u16 flush_stride; /* Number of flush strides that need to be read */ u8 duty_offset; /* Processor duty cycle index in processor's P_CNT reg */ u8 duty_width; /* Processor duty cycle value bit width in P_CNT register */ u8 day_alarm; /* Index to day-of-month alarm in RTC CMOS RAM */ u8 month_alarm; /* Index to month-of-year alarm in RTC CMOS RAM 
*/ u8 century; /* Index to century in RTC CMOS RAM */ u16 boot_flags; /* IA-PC Boot Architecture Flags (see below for individual flags) */ u8 reserved; /* Reserved, must be zero */ u32 flags; /* Miscellaneous flag bits (see below for individual flags) */ struct acpi_generic_address reset_register; /* 64-bit address of the Reset register */ u8 reset_value; /* Value to write to the reset_register port to reset the system */ u8 reserved4[3]; /* Reserved, must be zero */ u64 Xfacs; /* 64-bit physical address of FACS */ u64 Xdsdt; /* 64-bit physical address of DSDT */ struct acpi_generic_address xpm1a_event_block; /* 64-bit Extended Power Mgt 1a Event Reg Blk address */ struct acpi_generic_address xpm1b_event_block; /* 64-bit Extended Power Mgt 1b Event Reg Blk address */ struct acpi_generic_address xpm1a_control_block; /* 64-bit Extended Power Mgt 1a Control Reg Blk address */ struct acpi_generic_address xpm1b_control_block; /* 64-bit Extended Power Mgt 1b Control Reg Blk address */ struct acpi_generic_address xpm2_control_block; /* 64-bit Extended Power Mgt 2 Control Reg Blk address */ struct acpi_generic_address xpm_timer_block; /* 64-bit Extended Power Mgt Timer Ctrl Reg Blk address */ struct acpi_generic_address xgpe0_block; /* 64-bit Extended General Purpose Event 0 Reg Blk address */ struct acpi_generic_address xgpe1_block; /* 64-bit Extended General Purpose Event 1 Reg Blk address */ struct acpi_generic_address sleep_control; /* 64-bit Sleep Control register */ struct acpi_generic_address sleep_status; /* 64-bit Sleep Status register */ }; /* Masks for FADT Boot Architecture Flags (boot_flags) */ #define ACPI_FADT_LEGACY_DEVICES (1) /* 00: [V2] System has LPC or ISA bus devices */ #define ACPI_FADT_8042 (1<<1) /* 01: [V3] System has an 8042 controller on port 60/64 */ #define ACPI_FADT_NO_VGA (1<<2) /* 02: [V4] It is not safe to probe for VGA hardware */ #define ACPI_FADT_NO_MSI (1<<3) /* 03: [V4] Message Signaled Interrupts (MSI) must not be enabled */ #define ACPI_FADT_NO_ASPM (1<<4) /* 04: [V4] PCIe ASPM control must not be enabled */ #define ACPI_FADT_NO_CMOS_RTC (1<<5) /* 05: [V5] No CMOS real-time clock present */ #define FADT2_REVISION_ID 3 /* Masks for FADT flags */ #define ACPI_FADT_WBINVD (1) /* 00: [V1] The wbinvd instruction works properly */ #define ACPI_FADT_WBINVD_FLUSH (1<<1) /* 01: [V1] wbinvd flushes but does not invalidate caches */ #define ACPI_FADT_C1_SUPPORTED (1<<2) /* 02: [V1] All processors support C1 state */ #define ACPI_FADT_C2_MP_SUPPORTED (1<<3) /* 03: [V1] C2 state works on MP system */ #define ACPI_FADT_POWER_BUTTON (1<<4) /* 04: [V1] Power button is handled as a control method device */ #define ACPI_FADT_SLEEP_BUTTON (1<<5) /* 05: [V1] Sleep button is handled as a control method device */ #define ACPI_FADT_FIXED_RTC (1<<6) /* 06: [V1] RTC wakeup status not in fixed register space */ #define ACPI_FADT_S4_RTC_WAKE (1<<7) /* 07: [V1] RTC alarm can wake system from S4 */ #define ACPI_FADT_32BIT_TIMER (1<<8) /* 08: [V1] ACPI timer width is 32-bit (0=24-bit) */ #define ACPI_FADT_DOCKING_SUPPORTED (1<<9) /* 09: [V1] Docking supported */ #define ACPI_FADT_RESET_REGISTER (1<<10) /* 10: [V2] System reset via the FADT RESET_REG supported */ #define ACPI_FADT_SEALED_CASE (1<<11) /* 11: [V3] No internal expansion capabilities and case is sealed */ #define ACPI_FADT_HEADLESS (1<<12) /* 12: [V3] No local video capabilities or local input devices */ #define ACPI_FADT_SLEEP_TYPE (1<<13) /* 13: [V3] Must execute native instruction after writing SLP_TYPx register */ 
#define ACPI_FADT_PCI_EXPRESS_WAKE (1<<14) /* 14: [V4] System supports PCIEXP_WAKE (STS/EN) bits (ACPI 3.0) */ #define ACPI_FADT_PLATFORM_CLOCK (1<<15) /* 15: [V4] OSPM should use platform-provided timer (ACPI 3.0) */ #define ACPI_FADT_S4_RTC_VALID (1<<16) /* 16: [V4] Contents of RTC_STS valid after S4 wake (ACPI 3.0) */ #define ACPI_FADT_REMOTE_POWER_ON (1<<17) /* 17: [V4] System is compatible with remote power on (ACPI 3.0) */ #define ACPI_FADT_APIC_CLUSTER (1<<18) /* 18: [V4] All local APICs must use cluster model (ACPI 3.0) */ #define ACPI_FADT_APIC_PHYSICAL (1<<19) /* 19: [V4] All local x_aPICs must use physical dest mode (ACPI 3.0) */ #define ACPI_FADT_HW_REDUCED (1<<20) /* 20: [V5] ACPI hardware is not implemented (ACPI 5.0) */ #define ACPI_FADT_LOW_POWER_S0 (1<<21) /* 21: [V5] S0 power savings are equal or better than S3 (ACPI 5.0) */ /* Values for preferred_profile (Preferred Power Management Profiles) */ enum acpi_prefered_pm_profiles { PM_UNSPECIFIED = 0, PM_DESKTOP = 1, PM_MOBILE = 2, PM_WORKSTATION = 3, PM_ENTERPRISE_SERVER = 4, PM_SOHO_SERVER = 5, PM_APPLIANCE_PC = 6, PM_PERFORMANCE_SERVER = 7, PM_TABLET = 8 }; /* Values for sleep_status and sleep_control registers (V5 FADT) */ #define ACPI_X_WAKE_STATUS 0x80 #define ACPI_X_SLEEP_TYPE_MASK 0x1C #define ACPI_X_SLEEP_TYPE_POSITION 0x02 #define ACPI_X_SLEEP_ENABLE 0x20 /* Reset to default packing */ #pragma pack() #define ACPI_FADT_OFFSET(f) (u16) ACPI_OFFSET (struct acpi_table_fadt, f) /* * Get the remaining ACPI tables */ #include #include #include /* * Sizes of the various flavors of FADT. We need to look closely * at the FADT length because the version number essentially tells * us nothing because of many BIOS bugs where the version does not * match the expected length. In other words, the length of the * FADT is the bottom line as to what the version really is. * * For reference, the values below are as follows: * FADT V1 size: 0x074 * FADT V2 size: 0x084 * FADT V3 size: 0x0F4 * FADT V4 size: 0x0F4 * FADT V5 size: 0x10C */ #define ACPI_FADT_V1_SIZE (u32) (ACPI_FADT_OFFSET (flags) + 4) #define ACPI_FADT_V2_SIZE (u32) (ACPI_FADT_OFFSET (reserved4[0]) + 3) #define ACPI_FADT_V3_SIZE (u32) (ACPI_FADT_OFFSET (sleep_control)) #define ACPI_FADT_V5_SIZE (u32) (sizeof (struct acpi_table_fadt)) #endif /* __ACTBL_H__ */ xen-4.4.0/xen/include/acpi/actbl2.h0000664000175000017500000006771112307313555015130 0ustar smbsmb/****************************************************************************** * * Name: actbl2.h - ACPI Table Definitions (tables not in ACPI spec) * *****************************************************************************/ /* * Copyright (C) 2000 - 2011, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. 
* * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACTBL2_H__ #define __ACTBL2_H__ /******************************************************************************* * * Additional ACPI Tables (2) * * These tables are not consumed directly by the ACPICA subsystem, but are * included here to support device drivers and the AML disassembler. * * The tables in this file are defined by third-party specifications, and are * not defined directly by the ACPI specification itself. * ******************************************************************************/ /* * Values for description table header signatures for tables defined in this * file. Useful because they make it more difficult to inadvertently type in * the wrong signature. */ #define ACPI_SIG_ASF "ASF!" /* Alert Standard Format table */ #define ACPI_SIG_BOOT "BOOT" /* Simple Boot Flag Table */ #define ACPI_SIG_DBGP "DBGP" /* Debug Port table */ #define ACPI_SIG_DMAR "DMAR" /* DMA Remapping table */ #define ACPI_SIG_HPET "HPET" /* High Precision Event Timer table */ #define ACPI_SIG_IBFT "IBFT" /* i_sCSI Boot Firmware Table */ #define ACPI_SIG_IVRS "IVRS" /* I/O Virtualization Reporting Structure */ #define ACPI_SIG_MCFG "MCFG" /* PCI Memory Mapped Configuration table */ #define ACPI_SIG_MCHI "MCHI" /* Management Controller Host Interface table */ #define ACPI_SIG_SLIC "SLIC" /* Software Licensing Description Table */ #define ACPI_SIG_SPCR "SPCR" /* Serial Port Console Redirection table */ #define ACPI_SIG_SPMI "SPMI" /* Server Platform Management Interface table */ #define ACPI_SIG_TCPA "TCPA" /* Trusted Computing Platform Alliance table */ #define ACPI_SIG_UEFI "UEFI" /* Uefi Boot Optimization Table */ #define ACPI_SIG_WAET "WAET" /* Windows ACPI Emulated devices Table */ #define ACPI_SIG_WDAT "WDAT" /* Watchdog Action Table */ #define ACPI_SIG_WDDT "WDDT" /* Watchdog Timer Description Table */ #define ACPI_SIG_WDRT "WDRT" /* Watchdog Resource Table */ #ifdef ACPI_UNDEFINED_TABLES /* * These tables have been seen in the field, but no definition has been found */ #define ACPI_SIG_ATKG "ATKG" #define ACPI_SIG_GSCI "GSCI" /* GMCH SCI table */ #define ACPI_SIG_IEIT "IEIT" #endif /* * All tables must be byte-packed to match the ACPI specification, since * the tables are provided by the system BIOS. */ #pragma pack(1) /* * Note about bitfields: The u8 type is used for bitfields in ACPI tables. * This is the only type that is even remotely portable. Anything else is not * portable, so do not use any other bitfield types. 
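 *
 * Editorial sketch (not ACPICA code): the MCFG subtable defined later in
 * this file (struct acpi_mcfg_allocation) describes one ECAM region per PCI
 * segment/bus range.  The memory-mapped configuration address of a device
 * is conventionally computed as below; the caller is assumed to have
 * already matched the segment and checked that `bus` lies within
 * [start_bus_number, end_bus_number]:
 *
 *     u64 ecam_address(const struct acpi_mcfg_allocation *a,
 *                      u8 bus, u8 dev, u8 fn, u16 reg)
 *     {
 *         return a->address +
 *                ((u64)(bus - a->start_bus_number) << 20) +
 *                ((u64)dev << 15) + ((u64)fn << 12) + reg;
 *     }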
*/ /******************************************************************************* * * ASF - Alert Standard Format table (Signature "ASF!") * Revision 0x10 * * Conforms to the Alert Standard Format Specification V2.0, 23 April 2003 * ******************************************************************************/ struct acpi_table_asf { struct acpi_table_header header; /* Common ACPI table header */ }; /* ASF subtable header */ struct acpi_asf_header { u8 type; u8 reserved; u16 length; }; /* Values for Type field above */ enum acpi_asf_type { ACPI_ASF_TYPE_INFO = 0, ACPI_ASF_TYPE_ALERT = 1, ACPI_ASF_TYPE_CONTROL = 2, ACPI_ASF_TYPE_BOOT = 3, ACPI_ASF_TYPE_ADDRESS = 4, ACPI_ASF_TYPE_RESERVED = 5 }; /* * ASF subtables */ /* 0: ASF Information */ struct acpi_asf_info { struct acpi_asf_header header; u8 min_reset_value; u8 min_poll_interval; u16 system_id; u32 mfg_id; u8 flags; u8 reserved2[3]; }; /* Masks for Flags field above */ #define ACPI_ASF_SMBUS_PROTOCOLS (1) /* 1: ASF Alerts */ struct acpi_asf_alert { struct acpi_asf_header header; u8 assert_mask; u8 deassert_mask; u8 alerts; u8 data_length; }; struct acpi_asf_alert_data { u8 address; u8 command; u8 mask; u8 value; u8 sensor_type; u8 type; u8 offset; u8 source_type; u8 severity; u8 sensor_number; u8 entity; u8 instance; }; /* 2: ASF Remote Control */ struct acpi_asf_remote { struct acpi_asf_header header; u8 controls; u8 data_length; u16 reserved2; }; struct acpi_asf_control_data { u8 function; u8 address; u8 command; u8 value; }; /* 3: ASF RMCP Boot Options */ struct acpi_asf_rmcp { struct acpi_asf_header header; u8 capabilities[7]; u8 completion_code; u32 enterprise_id; u8 command; u16 parameter; u16 boot_options; u16 oem_parameters; }; /* 4: ASF Address */ struct acpi_asf_address { struct acpi_asf_header header; u8 eprom_address; u8 devices; }; /******************************************************************************* * * BOOT - Simple Boot Flag Table * Version 1 * * Conforms to the "Simple Boot Flag Specification", Version 2.1 * ******************************************************************************/ struct acpi_table_boot { struct acpi_table_header header; /* Common ACPI table header */ u8 cmos_index; /* Index in CMOS RAM for the boot register */ u8 reserved[3]; }; /******************************************************************************* * * DBGP - Debug Port table * Version 1 * * Conforms to the "Debug Port Specification", Version 1.00, 2/9/2000 * ******************************************************************************/ struct acpi_table_dbgp { struct acpi_table_header header; /* Common ACPI table header */ u8 type; /* 0=full 16550, 1=subset of 16550 */ u8 reserved[3]; struct acpi_generic_address debug_port; }; /******************************************************************************* * * DMAR - DMA Remapping table * Version 1 * * Conforms to "Intel Virtualization Technology for Directed I/O", * Version 1.2, Sept. 
2008 * ******************************************************************************/ struct acpi_table_dmar { struct acpi_table_header header; /* Common ACPI table header */ u8 width; /* Host Address Width */ u8 flags; u8 reserved[10]; }; /* Masks for Flags field above */ #define ACPI_DMAR_INTR_REMAP (1) #define ACPI_DMAR_X2APIC_OPT_OUT (1<<1) /* DMAR subtable header */ struct acpi_dmar_header { u16 type; u16 length; }; /* Values for subtable type in struct acpi_dmar_header */ enum acpi_dmar_type { ACPI_DMAR_TYPE_HARDWARE_UNIT = 0, ACPI_DMAR_TYPE_RESERVED_MEMORY = 1, ACPI_DMAR_TYPE_ATSR = 2, ACPI_DMAR_HARDWARE_AFFINITY = 3, ACPI_DMAR_TYPE_RESERVED = 4 /* 4 and greater are reserved */ }; /* DMAR Device Scope structure */ struct acpi_dmar_device_scope { u8 entry_type; u8 length; u16 reserved; u8 enumeration_id; u8 bus; }; /* Values for entry_type in struct acpi_dmar_device_scope */ enum acpi_dmar_scope_type { ACPI_DMAR_SCOPE_TYPE_NOT_USED = 0, ACPI_DMAR_SCOPE_TYPE_ENDPOINT = 1, ACPI_DMAR_SCOPE_TYPE_BRIDGE = 2, ACPI_DMAR_SCOPE_TYPE_IOAPIC = 3, ACPI_DMAR_SCOPE_TYPE_HPET = 4, }; struct acpi_dmar_pci_path { u8 dev; u8 fn; }; /* * DMAR Sub-tables, correspond to Type in struct acpi_dmar_header */ /* 0: Hardware Unit Definition */ struct acpi_dmar_hardware_unit { struct acpi_dmar_header header; u8 flags; u8 reserved; u16 segment; u64 address; /* Register Base Address */ }; /* Masks for Flags field above */ #define ACPI_DMAR_INCLUDE_ALL (1) /* 1: Reserved Memory Defininition */ struct acpi_dmar_reserved_memory { struct acpi_dmar_header header; u16 reserved; u16 segment; u64 base_address; /* 4_k aligned base address */ u64 end_address; /* 4_k aligned limit address */ }; /* Masks for Flags field above */ #define ACPI_DMAR_ALLOW_ALL (1) /* 2: Root Port ATS Capability Reporting Structure */ struct acpi_dmar_atsr { struct acpi_dmar_header header; u8 flags; u8 reserved; u16 segment; }; /* Masks for Flags field above */ #define ACPI_DMAR_ALL_PORTS (1) /* 3: Remapping Hardware Static Affinity Structure */ struct acpi_dmar_rhsa { struct acpi_dmar_header header; u32 reserved; u64 base_address; u32 proximity_domain; }; /******************************************************************************* * * HPET - High Precision Event Timer table * Version 1 * * Conforms to "IA-PC HPET (High Precision Event Timers) Specification", * Version 1.0a, October 2004 * ******************************************************************************/ struct acpi_table_hpet { struct acpi_table_header header; /* Common ACPI table header */ u32 id; /* Hardware ID of event timer block */ struct acpi_generic_address address; /* Address of event timer block */ u8 sequence; /* HPET sequence number */ u16 minimum_tick; /* Main counter min tick, periodic mode */ u8 flags; }; /* Masks for Flags field above */ #define ACPI_HPET_PAGE_PROTECT_MASK (3) /* Values for Page Protect flags */ enum acpi_hpet_page_protect { ACPI_HPET_NO_PAGE_PROTECT = 0, ACPI_HPET_PAGE_PROTECT4 = 1, ACPI_HPET_PAGE_PROTECT64 = 2 }; /******************************************************************************* * * IBFT - Boot Firmware Table * Version 1 * * Conforms to "iSCSI Boot Firmware Table (iBFT) as Defined in ACPI 3.0b * Specification", Version 1.01, March 1, 2007 * * Note: It appears that this table is not intended to appear in the RSDT/XSDT. * Therefore, it is not currently supported by the disassembler. 
* ******************************************************************************/ struct acpi_table_ibft { struct acpi_table_header header; /* Common ACPI table header */ u8 reserved[12]; }; /* IBFT common subtable header */ struct acpi_ibft_header { u8 type; u8 version; u16 length; u8 index; u8 flags; }; /* Values for Type field above */ enum acpi_ibft_type { ACPI_IBFT_TYPE_NOT_USED = 0, ACPI_IBFT_TYPE_CONTROL = 1, ACPI_IBFT_TYPE_INITIATOR = 2, ACPI_IBFT_TYPE_NIC = 3, ACPI_IBFT_TYPE_TARGET = 4, ACPI_IBFT_TYPE_EXTENSIONS = 5, ACPI_IBFT_TYPE_RESERVED = 6 /* 6 and greater are reserved */ }; /* IBFT subtables */ struct acpi_ibft_control { struct acpi_ibft_header header; u16 extensions; u16 initiator_offset; u16 nic0_offset; u16 target0_offset; u16 nic1_offset; u16 target1_offset; }; struct acpi_ibft_initiator { struct acpi_ibft_header header; u8 sns_server[16]; u8 slp_server[16]; u8 primary_server[16]; u8 secondary_server[16]; u16 name_length; u16 name_offset; }; struct acpi_ibft_nic { struct acpi_ibft_header header; u8 ip_address[16]; u8 subnet_mask_prefix; u8 origin; u8 gateway[16]; u8 primary_dns[16]; u8 secondary_dns[16]; u8 dhcp[16]; u16 vlan; u8 mac_address[6]; u16 pci_address; u16 name_length; u16 name_offset; }; struct acpi_ibft_target { struct acpi_ibft_header header; u8 target_ip_address[16]; u16 target_ip_socket; u8 target_boot_lun[8]; u8 chap_type; u8 nic_association; u16 target_name_length; u16 target_name_offset; u16 chap_name_length; u16 chap_name_offset; u16 chap_secret_length; u16 chap_secret_offset; u16 reverse_chap_name_length; u16 reverse_chap_name_offset; u16 reverse_chap_secret_length; u16 reverse_chap_secret_offset; }; /******************************************************************************* * * IVRS - I/O Virtualization Reporting Structure * Version 1 * * Conforms to "AMD I/O Virtualization Technology (IOMMU) Specification", * Revision 1.26, February 2009. 
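 *
 * Editorial sketch (not ACPICA code): the IVRS subtables that follow
 * struct acpi_table_ivrs each begin with struct acpi_ivrs_header (both
 * defined below) and are walked by their Length field; `ivrs` is assumed
 * to be the mapped table and handle_ivhd() is a hypothetical callback:
 *
 *     u8 *p   = (u8 *)ivrs + sizeof(struct acpi_table_ivrs);
 *     u8 *end = (u8 *)ivrs + ivrs->header.length;
 *
 *     while ( p + sizeof(struct acpi_ivrs_header) <= end )
 *     {
 *         struct acpi_ivrs_header *h = (struct acpi_ivrs_header *)p;
 *
 *         if ( h->length < sizeof(*h) )
 *             break;
 *         if ( h->type == ACPI_IVRS_TYPE_HARDWARE )
 *             handle_ivhd((struct acpi_ivrs_hardware *)h);
 *         p += h->length;
 *     }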
* ******************************************************************************/ struct acpi_table_ivrs { struct acpi_table_header header; /* Common ACPI table header */ u32 info; /* Common virtualization info */ u64 reserved; }; /* Values for Info field above */ #define ACPI_IVRS_PHYSICAL_SIZE 0x00007F00 /* 7 bits, physical address size */ #define ACPI_IVRS_VIRTUAL_SIZE 0x003F8000 /* 7 bits, virtual address size */ #define ACPI_IVRS_ATS_RESERVED 0x00400000 /* ATS address translation range reserved */ /* IVRS subtable header */ struct acpi_ivrs_header { u8 type; /* Subtable type */ u8 flags; u16 length; /* Subtable length */ u16 device_id; /* ID of IOMMU */ }; /* Values for subtable Type above */ enum acpi_ivrs_type { ACPI_IVRS_TYPE_HARDWARE = 0x10, ACPI_IVRS_TYPE_MEMORY_ALL /* _MEMORY1 */ = 0x20, ACPI_IVRS_TYPE_MEMORY_ONE /* _MEMORY2 */ = 0x21, ACPI_IVRS_TYPE_MEMORY_RANGE /* _MEMORY3 */ = 0x22, ACPI_IVRS_TYPE_MEMORY_IOMMU = 0x23 }; /* Masks for Flags field above for IVHD subtable */ #define ACPI_IVHD_TT_ENABLE (1) #define ACPI_IVHD_PASS_PW (1<<1) #define ACPI_IVHD_RES_PASS_PW (1<<2) #define ACPI_IVHD_ISOC (1<<3) #define ACPI_IVHD_IOTLB (1<<4) /* Masks for Flags field above for IVMD subtable */ #define ACPI_IVMD_UNITY (1) #define ACPI_IVMD_READ (1<<1) #define ACPI_IVMD_WRITE (1<<2) #define ACPI_IVMD_EXCLUSION_RANGE (1<<3) /* * IVRS subtables, correspond to Type in struct acpi_ivrs_header */ /* 0x10: I/O Virtualization Hardware Definition Block (IVHD) */ struct acpi_ivrs_hardware { struct acpi_ivrs_header header; u16 capability_offset; /* Offset for IOMMU control fields */ u64 base_address; /* IOMMU control registers */ u16 pci_segment_group; u16 info; /* MSI number and unit ID */ u32 reserved; }; /* Masks for Info field above */ #define ACPI_IVHD_MSI_NUMBER_MASK 0x001F /* 5 bits, MSI message number */ #define ACPI_IVHD_UNIT_ID_MASK 0x1F00 /* 5 bits, unit_iD */ /* * Device Entries for IVHD subtable, appear after struct acpi_ivrs_hardware structure. * Upper two bits of the Type field are the (encoded) length of the structure. * Currently, only 4 and 8 byte entries are defined. 16 and 32 byte entries * are reserved for future use but not defined. 
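Because the upper two bits of Type encode the entry size (0 selects a 4-byte entry, 1 an 8-byte entry, 2 and 3 the reserved 16- and 32-byte forms), a scanner can step over device entries it does not recognise. A sketch, using the device-entry header declared just below; the helper names are invented for the example and the #pragma pack(1) in effect for this header is assumed:

/* Illustrative: length of an IVHD device entry, taken from the top two bits
 * of its Type field (the bits selected by ACPI_IVHD_ENTRY_LENGTH). */
static inline unsigned int ivhd_entry_length(u8 type)
{
    return 4U << (type >> 6);               /* 4, 8, 16 or 32 bytes */
}

/* Walk the device entries that follow struct acpi_ivrs_hardware in one IVHD
 * subtable; header.length covers the whole subtable including the entries. */
static void ivhd_for_each_entry(struct acpi_ivrs_hardware *ivhd)
{
    u8 *p   = (u8 *)ivhd + sizeof(*ivhd);
    u8 *end = (u8 *)ivhd + ivhd->header.length;

    while (p + sizeof(struct acpi_ivrs_de_header) <= end) {
        struct acpi_ivrs_de_header *de = (struct acpi_ivrs_de_header *)p;

        /* dispatch on de->type (SELECT, RANGE, ALIAS, SPECIAL, ...) here */
        p += ivhd_entry_length(de->type);
    }
}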
*/ struct acpi_ivrs_de_header { u8 type; u16 id; u8 data_setting; }; /* Length of device entry is in the top two bits of Type field above */ #define ACPI_IVHD_ENTRY_LENGTH 0xC0 /* Values for device entry Type field above */ enum acpi_ivrs_device_entry_type { /* 4-byte device entries, all use struct acpi_ivrs_device4 */ ACPI_IVRS_TYPE_PAD4 = 0, ACPI_IVRS_TYPE_ALL = 1, ACPI_IVRS_TYPE_SELECT = 2, ACPI_IVRS_TYPE_START = 3, ACPI_IVRS_TYPE_END = 4, /* 8-byte device entries */ ACPI_IVRS_TYPE_PAD8 = 64, ACPI_IVRS_TYPE_NOT_USED = 65, ACPI_IVRS_TYPE_ALIAS_SELECT = 66, /* Uses struct acpi_ivrs_device8a */ ACPI_IVRS_TYPE_ALIAS_START = 67, /* Uses struct acpi_ivrs_device8a */ ACPI_IVRS_TYPE_EXT_SELECT = 70, /* Uses struct acpi_ivrs_device8b */ ACPI_IVRS_TYPE_EXT_START = 71, /* Uses struct acpi_ivrs_device8b */ ACPI_IVRS_TYPE_SPECIAL = 72 /* Uses struct acpi_ivrs_device8c */ }; /* Values for Data field above */ #define ACPI_IVHD_INIT_PASS (1) #define ACPI_IVHD_EINT_PASS (1<<1) #define ACPI_IVHD_NMI_PASS (1<<2) #define ACPI_IVHD_SYSTEM_MGMT (3<<4) #define ACPI_IVHD_LINT0_PASS (1<<6) #define ACPI_IVHD_LINT1_PASS (1<<7) /* Types 0-4: 4-byte device entry */ struct acpi_ivrs_device4 { struct acpi_ivrs_de_header header; }; /* Types 66-67: 8-byte device entry */ struct acpi_ivrs_device8a { struct acpi_ivrs_de_header header; u8 reserved1; u16 used_id; u8 reserved2; }; /* Types 70-71: 8-byte device entry */ struct acpi_ivrs_device8b { struct acpi_ivrs_de_header header; u32 extended_data; }; /* Values for extended_data above */ #define ACPI_IVHD_ATS_DISABLED (1<<31) /* Type 72: 8-byte device entry */ struct acpi_ivrs_device8c { struct acpi_ivrs_de_header header; u8 handle; u16 used_id; u8 variety; }; /* Values for Variety field above */ #define ACPI_IVHD_IOAPIC 1 #define ACPI_IVHD_HPET 2 /* 0x20, 0x21, 0x22: I/O Virtualization Memory Definition Block (IVMD) */ struct acpi_ivrs_memory { struct acpi_ivrs_header header; u16 aux_data; u64 reserved; u64 start_address; u64 memory_length; }; /******************************************************************************* * * MCFG - PCI Memory Mapped Configuration table and sub-table * Version 1 * * Conforms to "PCI Firmware Specification", Revision 3.0, June 20, 2005 * ******************************************************************************/ struct acpi_table_mcfg { struct acpi_table_header header; /* Common ACPI table header */ u8 reserved[8]; }; /* Subtable */ struct acpi_mcfg_allocation { u64 address; /* Base address, processor-relative */ u16 pci_segment; /* PCI segment group number */ u8 start_bus_number; /* Starting PCI Bus number */ u8 end_bus_number; /* Final PCI Bus number */ u32 reserved; }; /******************************************************************************* * * MCHI - Management Controller Host Interface Table * Version 1 * * Conforms to "Management Component Transport Protocol (MCTP) Host * Interface Specification", Revision 1.0.0a, October 13, 2009 * ******************************************************************************/ struct acpi_table_mchi { struct acpi_table_header header; /* Common ACPI table header */ u8 interface_type; u8 protocol; u64 protocol_data; u8 interrupt_type; u8 gpe; u8 pci_device_flag; u32 global_interrupt; struct acpi_generic_address control_register; u8 pci_segment; u8 pci_bus; u8 pci_device; u8 pci_function; }; /******************************************************************************* * * SLIC - Software Licensing Description Table * Version 1 * * Conforms to "OEM Activation 2.0 for Windows Vista 
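Each struct acpi_mcfg_allocation above describes one ECAM window, and configuration space for a given function sits at a fixed arithmetic offset inside it (4 KiB per function, as laid out by the PCI Firmware spec). A sketch of that address computation, with only basic bus-range checking; the function name is illustrative:

/* Illustrative: physical address of the ECAM config space for bus/dev/fn
 * within one MCFG allocation; returns 0 if the bus is outside the window. */
static u64 mcfg_ecam_addr(const struct acpi_mcfg_allocation *alloc,
                          u8 bus, u8 dev, u8 fn)
{
    if (bus < alloc->start_bus_number || bus > alloc->end_bus_number)
        return 0;

    return alloc->address +
           ((u64)(bus - alloc->start_bus_number) << 20) +
           ((u64)dev << 15) +
           ((u64)fn  << 12);
}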
Operating Systems", * Copyright 2006 * ******************************************************************************/ /* Basic SLIC table is only the common ACPI header */ struct acpi_table_slic { struct acpi_table_header header; /* Common ACPI table header */ }; /* Common SLIC subtable header */ struct acpi_slic_header { u32 type; u32 length; }; /* Values for Type field above */ enum acpi_slic_type { ACPI_SLIC_TYPE_PUBLIC_KEY = 0, ACPI_SLIC_TYPE_WINDOWS_MARKER = 1, ACPI_SLIC_TYPE_RESERVED = 2 /* 2 and greater are reserved */ }; /* * SLIC Sub-tables, correspond to Type in struct acpi_slic_header */ /* 0: Public Key Structure */ struct acpi_slic_key { struct acpi_slic_header header; u8 key_type; u8 version; u16 reserved; u32 algorithm; char magic[4]; u32 bit_length; u32 exponent; u8 modulus[128]; }; /* 1: Windows Marker Structure */ struct acpi_slic_marker { struct acpi_slic_header header; u32 version; char oem_id[ACPI_OEM_ID_SIZE]; /* ASCII OEM identification */ char oem_table_id[ACPI_OEM_TABLE_ID_SIZE]; /* ASCII OEM table identification */ char windows_flag[8]; u32 slic_version; u8 reserved[16]; u8 signature[128]; }; /******************************************************************************* * * SPCR - Serial Port Console Redirection table * Version 1 * * Conforms to "Serial Port Console Redirection Table", * Version 1.00, January 11, 2002 * ******************************************************************************/ struct acpi_table_spcr { struct acpi_table_header header; /* Common ACPI table header */ u8 interface_type; /* 0=full 16550, 1=subset of 16550 */ u8 reserved[3]; struct acpi_generic_address serial_port; u8 interrupt_type; u8 pc_interrupt; u32 interrupt; u8 baud_rate; u8 parity; u8 stop_bits; u8 flow_control; u8 terminal_type; u8 reserved1; u16 pci_device_id; u16 pci_vendor_id; u8 pci_bus; u8 pci_device; u8 pci_function; u32 pci_flags; u8 pci_segment; u32 reserved2; }; /* Masks for pci_flags field above */ #define ACPI_SPCR_DO_NOT_DISABLE (1) /******************************************************************************* * * SPMI - Server Platform Management Interface table * Version 5 * * Conforms to "Intelligent Platform Management Interface Specification * Second Generation v2.0", Document Revision 1.0, February 12, 2004 with * June 12, 2009 markup. 
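Most SPCR fields above are raw spec encodings; baud_rate in particular is a code, not a rate. A small decode helper, with the mapping as given in the Serial Port Console Redirection document (treat it as an assumption to double-check against that spec):

/* Illustrative: SPCR baud_rate code to bits per second; 0 = unknown/reserved. */
static unsigned int spcr_baud(u8 code)
{
    switch (code) {
    case 3:  return 9600;
    case 4:  return 19200;
    case 6:  return 57600;
    case 7:  return 115200;
    default: return 0;
    }
}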
* ******************************************************************************/ struct acpi_table_spmi { struct acpi_table_header header; /* Common ACPI table header */ u8 interface_type; u8 reserved; /* Must be 1 */ u16 spec_revision; /* Version of IPMI */ u8 interrupt_type; u8 gpe_number; /* GPE assigned */ u8 reserved1; u8 pci_device_flag; u32 interrupt; struct acpi_generic_address ipmi_register; u8 pci_segment; u8 pci_bus; u8 pci_device; u8 pci_function; u8 reserved2; }; /* Values for interface_type above */ enum acpi_spmi_interface_types { ACPI_SPMI_NOT_USED = 0, ACPI_SPMI_KEYBOARD = 1, ACPI_SPMI_SMI = 2, ACPI_SPMI_BLOCK_TRANSFER = 3, ACPI_SPMI_SMBUS = 4, ACPI_SPMI_RESERVED = 5 /* 5 and above are reserved */ }; /******************************************************************************* * * TCPA - Trusted Computing Platform Alliance table * Version 1 * * Conforms to "TCG PC Specific Implementation Specification", * Version 1.1, August 18, 2003 * ******************************************************************************/ struct acpi_table_tcpa { struct acpi_table_header header; /* Common ACPI table header */ u16 reserved; u32 max_log_length; /* Maximum length for the event log area */ u64 log_address; /* Address of the event log area */ }; /******************************************************************************* * * UEFI - UEFI Boot optimization Table * Version 1 * * Conforms to "Unified Extensible Firmware Interface Specification", * Version 2.3, May 8, 2009 * ******************************************************************************/ struct acpi_table_uefi { struct acpi_table_header header; /* Common ACPI table header */ u8 identifier[16]; /* UUID identifier */ u16 data_offset; /* Offset of remaining data in table */ }; /******************************************************************************* * * WAET - Windows ACPI Emulated devices Table * Version 1 * * Conforms to "Windows ACPI Emulated Devices Table", version 1.0, April 6, 2009 * ******************************************************************************/ struct acpi_table_waet { struct acpi_table_header header; /* Common ACPI table header */ u32 flags; }; /* Masks for Flags field above */ #define ACPI_WAET_RTC_NO_ACK (1) /* RTC requires no int acknowledge */ #define ACPI_WAET_TIMER_ONE_READ (1<<1) /* PM timer requires only one read */ /******************************************************************************* * * WDAT - Watchdog Action Table * Version 1 * * Conforms to "Hardware Watchdog Timers Design Specification", * Copyright 2006 Microsoft Corporation. 
* ******************************************************************************/ struct acpi_table_wdat { struct acpi_table_header header; /* Common ACPI table header */ u32 header_length; /* Watchdog Header Length */ u16 pci_segment; /* PCI Segment number */ u8 pci_bus; /* PCI Bus number */ u8 pci_device; /* PCI Device number */ u8 pci_function; /* PCI Function number */ u8 reserved[3]; u32 timer_period; /* Period of one timer count (msec) */ u32 max_count; /* Maximum counter value supported */ u32 min_count; /* Minimum counter value */ u8 flags; u8 reserved2[3]; u32 entries; /* Number of watchdog entries that follow */ }; /* Masks for Flags field above */ #define ACPI_WDAT_ENABLED (1) #define ACPI_WDAT_STOPPED 0x80 /* WDAT Instruction Entries (actions) */ struct acpi_wdat_entry { u8 action; u8 instruction; u16 reserved; struct acpi_generic_address register_region; u32 value; /* Value used with Read/Write register */ u32 mask; /* Bitmask required for this register instruction */ }; /* Values for Action field above */ enum acpi_wdat_actions { ACPI_WDAT_RESET = 1, ACPI_WDAT_GET_CURRENT_COUNTDOWN = 4, ACPI_WDAT_GET_COUNTDOWN = 5, ACPI_WDAT_SET_COUNTDOWN = 6, ACPI_WDAT_GET_RUNNING_STATE = 8, ACPI_WDAT_SET_RUNNING_STATE = 9, ACPI_WDAT_GET_STOPPED_STATE = 10, ACPI_WDAT_SET_STOPPED_STATE = 11, ACPI_WDAT_GET_REBOOT = 16, ACPI_WDAT_SET_REBOOT = 17, ACPI_WDAT_GET_SHUTDOWN = 18, ACPI_WDAT_SET_SHUTDOWN = 19, ACPI_WDAT_GET_STATUS = 32, ACPI_WDAT_SET_STATUS = 33, ACPI_WDAT_ACTION_RESERVED = 34 /* 34 and greater are reserved */ }; /* Values for Instruction field above */ enum acpi_wdat_instructions { ACPI_WDAT_READ_VALUE = 0, ACPI_WDAT_READ_COUNTDOWN = 1, ACPI_WDAT_WRITE_VALUE = 2, ACPI_WDAT_WRITE_COUNTDOWN = 3, ACPI_WDAT_INSTRUCTION_RESERVED = 4, /* 4 and greater are reserved */ ACPI_WDAT_PRESERVE_REGISTER = 0x80 /* Except for this value */ }; /******************************************************************************* * * WDDT - Watchdog Descriptor Table * Version 1 * * Conforms to "Using the Intel ICH Family Watchdog Timer (WDT)", * Version 001, September 2002 * ******************************************************************************/ struct acpi_table_wddt { struct acpi_table_header header; /* Common ACPI table header */ u16 spec_version; u16 table_version; u16 pci_vendor_id; struct acpi_generic_address address; u16 max_count; /* Maximum counter value supported */ u16 min_count; /* Minimum counter value supported */ u16 period; u16 status; u16 capability; }; /* Flags for Status field above */ #define ACPI_WDDT_AVAILABLE (1) #define ACPI_WDDT_ACTIVE (1<<1) #define ACPI_WDDT_TCO_OS_OWNED (1<<2) #define ACPI_WDDT_USER_RESET (1<<11) #define ACPI_WDDT_WDT_RESET (1<<12) #define ACPI_WDDT_POWER_FAIL (1<<13) #define ACPI_WDDT_UNKNOWN_RESET (1<<14) /* Flags for Capability field above */ #define ACPI_WDDT_AUTO_RESET (1) #define ACPI_WDDT_ALERT_SUPPORT (1<<1) /******************************************************************************* * * WDRT - Watchdog Resource Table * Version 1 * * Conforms to "Watchdog Timer Hardware Requirements for Windows Server 2003", * Version 1.01, August 28, 2006 * ******************************************************************************/ struct acpi_table_wdrt { struct acpi_table_header header; /* Common ACPI table header */ struct acpi_generic_address control_register; struct acpi_generic_address count_register; u16 pci_device_id; u16 pci_vendor_id; u8 pci_bus; /* PCI Bus number */ u8 pci_device; /* PCI Device number */ u8 pci_function; /* PCI Function number 
*/ u8 pci_segment; /* PCI Segment number */ u16 max_count; /* Maximum counter value supported */ u8 units; }; /* Reset to default packing */ #pragma pack() #endif /* __ACTBL2_H__ */ xen-4.4.0/xen/include/acpi/acpiosxf.h0000664000175000017500000000635512307313555015572 0ustar smbsmb /****************************************************************************** * * Name: acpiosxf.h - All interfaces to the OS Services Layer (OSL). These * interfaces must be implemented by OSL to interface the * ACPI components to the host operating system. * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. 
*/ #ifndef __ACPIOSXF_H__ #define __ACPIOSXF_H__ #include "platform/acenv.h" #include "actypes.h" /* * ACPI Table interfaces */ acpi_physical_address acpi_os_get_root_pointer(void); /* * Memory mapping */ void __iomem *acpi_os_map_memory(acpi_physical_address where, acpi_native_uint length); void acpi_os_unmap_memory(void __iomem * logical_address, acpi_size size); /* * Platform and hardware-independent I/O interfaces */ acpi_status acpi_os_read_port(acpi_io_address address, u32 * value, u32 width); acpi_status acpi_os_write_port(acpi_io_address address, u32 value, u32 width); /* * Platform and hardware-independent physical memory interfaces */ acpi_status acpi_os_read_memory(acpi_physical_address address, u32 * value, u32 width); acpi_status acpi_os_write_memory(acpi_physical_address address, u32 value, u32 width); /* * Debug print routines */ void ACPI_INTERNAL_VAR_XFACE acpi_os_printf(const char *format, ...); void acpi_os_vprintf(const char *format, va_list args); #endif /* __ACPIOSXF_H__ */ xen-4.4.0/xen/include/acpi/acutils.h0000664000175000017500000001536112307313555015417 0ustar smbsmb/****************************************************************************** * * Name: acutils.h -- prototypes for the common (subsystem-wide) procedures * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. 
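These OSL prototypes are the hooks the ACPI core expects the host to implement, and they can also be used directly. As a usage illustration only, the sketch below maps a table header at a known physical address, reads its advertised length, then remaps the whole table; the function name and error policy are invented, and the __iomem qualifier is handled with plain casts for brevity.

/* Illustrative OSL usage: map an ACPI table header, then the full table. */
static struct acpi_table_header *map_whole_table(acpi_physical_address phys)
{
    struct acpi_table_header *hdr;
    u32 len;

    hdr = (struct acpi_table_header *)acpi_os_map_memory(phys, sizeof(*hdr));
    if (!hdr)
        return NULL;

    len = hdr->length;                       /* total table length in bytes */
    acpi_os_unmap_memory((void __iomem *)hdr, sizeof(*hdr));

    if (len < sizeof(*hdr))
        return NULL;                         /* header claims a bogus length */

    return (struct acpi_table_header *)acpi_os_map_memory(phys, len);
}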
*/ #ifndef _ACUTILS_H #define _ACUTILS_H /* Types for Resource descriptor entries */ #define ACPI_INVALID_RESOURCE 0 #define ACPI_FIXED_LENGTH 1 #define ACPI_VARIABLE_LENGTH 2 #define ACPI_SMALL_VARIABLE_LENGTH 3 /* * utglobal - Global data structures and procedures */ const char *acpi_ut_get_region_name(u8 space_id); /* * utclib - Local implementations of C library functions */ #ifndef ACPI_USE_SYSTEM_CLIBRARY acpi_size acpi_ut_strlen(const char *string); char *acpi_ut_strcpy(char *dst_string, const char *src_string); char *acpi_ut_strncpy(char *dst_string, const char *src_string, acpi_size count); int acpi_ut_memcmp(const char *buffer1, const char *buffer2, acpi_size count); int acpi_ut_strncmp(const char *string1, const char *string2, acpi_size count); int acpi_ut_strcmp(const char *string1, const char *string2); char *acpi_ut_strcat(char *dst_string, const char *src_string); char *acpi_ut_strncat(char *dst_string, const char *src_string, acpi_size count); u32 acpi_ut_strtoul(const char *string, char **terminator, u32 base); char *acpi_ut_strstr(char *string1, char *string2); void *acpi_ut_memcpy(void *dest, const void *src, acpi_size count); void *acpi_ut_memset(void *dest, acpi_native_uint value, acpi_size count); int acpi_ut_to_upper(int c); int acpi_ut_to_lower(int c); extern const u8 _acpi_ctype[]; #define _ACPI_XA 0x00 /* extra alphabetic - not supported */ #define _ACPI_XS 0x40 /* extra space */ #define _ACPI_BB 0x00 /* BEL, BS, etc. - not supported */ #define _ACPI_CN 0x20 /* CR, FF, HT, NL, VT */ #define _ACPI_DI 0x04 /* '0'-'9' */ #define _ACPI_LO 0x02 /* 'a'-'z' */ #define _ACPI_PU 0x10 /* punctuation */ #define _ACPI_SP 0x08 /* space */ #define _ACPI_UP 0x01 /* 'A'-'Z' */ #define _ACPI_XD 0x80 /* '0'-'9', 'A'-'F', 'a'-'f' */ #define ACPI_IS_DIGIT(c) (_acpi_ctype[(unsigned char)(c)] & (_ACPI_DI)) #define ACPI_IS_SPACE(c) (_acpi_ctype[(unsigned char)(c)] & (_ACPI_SP)) #define ACPI_IS_XDIGIT(c) (_acpi_ctype[(unsigned char)(c)] & (_ACPI_XD)) #define ACPI_IS_UPPER(c) (_acpi_ctype[(unsigned char)(c)] & (_ACPI_UP)) #define ACPI_IS_LOWER(c) (_acpi_ctype[(unsigned char)(c)] & (_ACPI_LO)) #define ACPI_IS_PRINT(c) (_acpi_ctype[(unsigned char)(c)] & (_ACPI_LO | _ACPI_UP | _ACPI_DI | _ACPI_SP | _ACPI_PU)) #define ACPI_IS_ALPHA(c) (_acpi_ctype[(unsigned char)(c)] & (_ACPI_LO | _ACPI_UP)) #endif /* ACPI_USE_SYSTEM_CLIBRARY */ /* * utdebug - Debug interfaces */ void acpi_ut_track_stack_ptr(void); void acpi_ut_trace(u32 line_number, const char *function_name, const char *module_name, u32 component_id); void acpi_ut_trace_ptr(u32 line_number, const char *function_name, const char *module_name, u32 component_id, void *pointer); void acpi_ut_trace_u32(u32 line_number, const char *function_name, const char *module_name, u32 component_id, u32 integer); void acpi_ut_trace_str(u32 line_number, const char *function_name, const char *module_name, u32 component_id, char *string); void acpi_ut_exit(u32 line_number, const char *function_name, const char *module_name, u32 component_id); void acpi_ut_status_exit(u32 line_number, const char *function_name, const char *module_name, u32 component_id, acpi_status status); void acpi_ut_value_exit(u32 line_number, const char *function_name, const char *module_name, u32 component_id, acpi_integer value); void acpi_ut_ptr_exit(u32 line_number, const char *function_name, const char *module_name, u32 component_id, u8 * ptr); /* Error and message reporting interfaces */ void ACPI_INTERNAL_VAR_XFACE acpi_ut_debug_print(u32 requested_debug_level, u32 line_number, 
const char *function_name, const char *module_name, u32 component_id, char *format, ...) ACPI_PRINTF_LIKE(6); void ACPI_INTERNAL_VAR_XFACE acpi_ut_debug_print_raw(u32 requested_debug_level, u32 line_number, const char *function_name, const char *module_name, u32 component_id, char *format, ...) ACPI_PRINTF_LIKE(6); void ACPI_INTERNAL_VAR_XFACE acpi_ut_error(const char *module_name, u32 line_number, char *format, ...) ACPI_PRINTF_LIKE(3); void ACPI_INTERNAL_VAR_XFACE acpi_ut_exception(const char *module_name, u32 line_number, acpi_status status, char *format, ...) ACPI_PRINTF_LIKE(4); void ACPI_INTERNAL_VAR_XFACE acpi_ut_warning(const char *module_name, u32 line_number, char *format, ...) ACPI_PRINTF_LIKE(3); void ACPI_INTERNAL_VAR_XFACE acpi_ut_info(const char *module_name, u32 line_number, char *format, ...) ACPI_PRINTF_LIKE(3); /* * utmisc */ const char *acpi_ut_validate_exception(acpi_status status); #endif /* _ACUTILS_H */ xen-4.4.0/xen/include/acpi/acnames.h0000664000175000017500000000663512307313555015366 0ustar smbsmb/****************************************************************************** * * Name: acnames.h - Global names and strings * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. 
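When ACPI_USE_SYSTEM_CLIBRARY is not defined, the acpi_ut_* routines and the _acpi_ctype classification macros declared in acutils.h above stand in for the C library. A small illustration of the classification macros (the function is made up for the example):

/* Illustrative: check that a four-character ACPI name segment is printable,
 * using the ACPI_IS_PRINT classification macro from acutils.h. */
static int name_seg_is_printable(const char seg[4])
{
    int i;

    for (i = 0; i < 4; i++)
        if (!ACPI_IS_PRINT(seg[i]))
            return 0;
    return 1;
}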
*/ #ifndef __ACNAMES_H__ #define __ACNAMES_H__ /* Method names - these methods can appear anywhere in the namespace */ #define METHOD_NAME__HID "_HID" #define METHOD_NAME__CID "_CID" #define METHOD_NAME__UID "_UID" #define METHOD_NAME__ADR "_ADR" #define METHOD_NAME__INI "_INI" #define METHOD_NAME__STA "_STA" #define METHOD_NAME__REG "_REG" #define METHOD_NAME__SEG "_SEG" #define METHOD_NAME__BBN "_BBN" #define METHOD_NAME__PRT "_PRT" #define METHOD_NAME__CRS "_CRS" #define METHOD_NAME__PRS "_PRS" #define METHOD_NAME__PRW "_PRW" #define METHOD_NAME__SRS "_SRS" /* Method names - these methods must appear at the namespace root */ #define METHOD_NAME__BFS "\\_BFS" #define METHOD_NAME__GTS "\\_GTS" #define METHOD_NAME__PTS "\\_PTS" #define METHOD_NAME__SST "\\_SI._SST" #define METHOD_NAME__WAK "\\_WAK" /* Definitions of the predefined namespace names */ #define ACPI_UNKNOWN_NAME (u32) 0x3F3F3F3F /* Unknown name is "????" */ #define ACPI_ROOT_NAME (u32) 0x5F5F5F5C /* Root name is "\___" */ #define ACPI_PREFIX_MIXED (u32) 0x69706341 /* "Acpi" */ #define ACPI_PREFIX_LOWER (u32) 0x69706361 /* "acpi" */ #define ACPI_NS_ROOT_PATH "\\" #define ACPI_NS_SYSTEM_BUS "_SB_" #endif /* __ACNAMES_H__ */ xen-4.4.0/xen/include/acpi/acoutput.h0000664000175000017500000002002012307313555015603 0ustar smbsmb/****************************************************************************** * * Name: acoutput.h -- debug output * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACOUTPUT_H__ #define __ACOUTPUT_H__ /* * Debug levels and component IDs. These are used to control the * granularity of the output of the DEBUG_PRINT macro -- on a per- * component basis and a per-exception-type basis. 
*/ /* Component IDs are used in the global "DebugLayer" */ #define ACPI_UTILITIES 0x00000001 #define ACPI_HARDWARE 0x00000002 #define ACPI_EVENTS 0x00000004 #define ACPI_TABLES 0x00000008 #define ACPI_NAMESPACE 0x00000010 #define ACPI_PARSER 0x00000020 #define ACPI_DISPATCHER 0x00000040 #define ACPI_EXECUTER 0x00000080 #define ACPI_RESOURCES 0x00000100 #define ACPI_CA_DEBUGGER 0x00000200 #define ACPI_OS_SERVICES 0x00000400 #define ACPI_CA_DISASSEMBLER 0x00000800 /* Component IDs for ACPI tools and utilities */ #define ACPI_COMPILER 0x00001000 #define ACPI_TOOLS 0x00002000 #define ACPI_ALL_COMPONENTS 0x00003FFF #define ACPI_COMPONENT_DEFAULT (ACPI_ALL_COMPONENTS) /* Component IDs reserved for ACPI drivers */ #define ACPI_ALL_DRIVERS 0xFFFF0000 /* * Raw debug output levels, do not use these in the DEBUG_PRINT macros */ #define ACPI_LV_ERROR 0x00000001 #define ACPI_LV_WARN 0x00000002 #define ACPI_LV_INIT 0x00000004 #define ACPI_LV_DEBUG_OBJECT 0x00000008 #define ACPI_LV_INFO 0x00000010 #define ACPI_LV_ALL_EXCEPTIONS 0x0000001F /* Trace verbosity level 1 [Standard Trace Level] */ #define ACPI_LV_INIT_NAMES 0x00000020 #define ACPI_LV_PARSE 0x00000040 #define ACPI_LV_LOAD 0x00000080 #define ACPI_LV_DISPATCH 0x00000100 #define ACPI_LV_EXEC 0x00000200 #define ACPI_LV_NAMES 0x00000400 #define ACPI_LV_OPREGION 0x00000800 #define ACPI_LV_BFIELD 0x00001000 #define ACPI_LV_TABLES 0x00002000 #define ACPI_LV_VALUES 0x00004000 #define ACPI_LV_OBJECTS 0x00008000 #define ACPI_LV_RESOURCES 0x00010000 #define ACPI_LV_USER_REQUESTS 0x00020000 #define ACPI_LV_PACKAGE 0x00040000 #define ACPI_LV_VERBOSITY1 0x0007FF40 | ACPI_LV_ALL_EXCEPTIONS /* Trace verbosity level 2 [Function tracing and memory allocation] */ #define ACPI_LV_ALLOCATIONS 0x00100000 #define ACPI_LV_FUNCTIONS 0x00200000 #define ACPI_LV_OPTIMIZATIONS 0x00400000 #define ACPI_LV_VERBOSITY2 0x00700000 | ACPI_LV_VERBOSITY1 #define ACPI_LV_ALL ACPI_LV_VERBOSITY2 /* Trace verbosity level 3 [Threading, I/O, and Interrupts] */ #define ACPI_LV_MUTEX 0x01000000 #define ACPI_LV_THREADS 0x02000000 #define ACPI_LV_IO 0x04000000 #define ACPI_LV_INTERRUPTS 0x08000000 #define ACPI_LV_VERBOSITY3 0x0F000000 | ACPI_LV_VERBOSITY2 /* Exceptionally verbose output -- also used in the global "DebugLevel" */ #define ACPI_LV_AML_DISASSEMBLE 0x10000000 #define ACPI_LV_VERBOSE_INFO 0x20000000 #define ACPI_LV_FULL_TABLES 0x40000000 #define ACPI_LV_EVENTS 0x80000000 #define ACPI_LV_VERBOSE 0xF0000000 /* * Debug level macros that are used in the DEBUG_PRINT macros */ #define ACPI_DEBUG_LEVEL(dl) (u32) dl,ACPI_DEBUG_PARAMETERS /* Exception level -- used in the global "DebugLevel" */ #define ACPI_DB_INIT ACPI_DEBUG_LEVEL (ACPI_LV_INIT) #define ACPI_DB_DEBUG_OBJECT ACPI_DEBUG_LEVEL (ACPI_LV_DEBUG_OBJECT) #define ACPI_DB_INFO ACPI_DEBUG_LEVEL (ACPI_LV_INFO) #define ACPI_DB_ALL_EXCEPTIONS ACPI_DEBUG_LEVEL (ACPI_LV_ALL_EXCEPTIONS) /* * These two levels are essentially obsolete, all instances in the * ACPICA core code have been replaced by ACPI_ERROR and ACPI_WARNING * (Kept here because some drivers may still use them) */ #define ACPI_DB_ERROR ACPI_DEBUG_LEVEL (ACPI_LV_ERROR) #define ACPI_DB_WARN ACPI_DEBUG_LEVEL (ACPI_LV_WARN) /* Trace level -- also used in the global "DebugLevel" */ #define ACPI_DB_INIT_NAMES ACPI_DEBUG_LEVEL (ACPI_LV_INIT_NAMES) #define ACPI_DB_THREADS ACPI_DEBUG_LEVEL (ACPI_LV_THREADS) #define ACPI_DB_PARSE ACPI_DEBUG_LEVEL (ACPI_LV_PARSE) #define ACPI_DB_DISPATCH ACPI_DEBUG_LEVEL (ACPI_LV_DISPATCH) #define ACPI_DB_LOAD ACPI_DEBUG_LEVEL (ACPI_LV_LOAD) #define 
ACPI_DB_EXEC ACPI_DEBUG_LEVEL (ACPI_LV_EXEC) #define ACPI_DB_NAMES ACPI_DEBUG_LEVEL (ACPI_LV_NAMES) #define ACPI_DB_OPREGION ACPI_DEBUG_LEVEL (ACPI_LV_OPREGION) #define ACPI_DB_BFIELD ACPI_DEBUG_LEVEL (ACPI_LV_BFIELD) #define ACPI_DB_TABLES ACPI_DEBUG_LEVEL (ACPI_LV_TABLES) #define ACPI_DB_FUNCTIONS ACPI_DEBUG_LEVEL (ACPI_LV_FUNCTIONS) #define ACPI_DB_OPTIMIZATIONS ACPI_DEBUG_LEVEL (ACPI_LV_OPTIMIZATIONS) #define ACPI_DB_VALUES ACPI_DEBUG_LEVEL (ACPI_LV_VALUES) #define ACPI_DB_OBJECTS ACPI_DEBUG_LEVEL (ACPI_LV_OBJECTS) #define ACPI_DB_ALLOCATIONS ACPI_DEBUG_LEVEL (ACPI_LV_ALLOCATIONS) #define ACPI_DB_RESOURCES ACPI_DEBUG_LEVEL (ACPI_LV_RESOURCES) #define ACPI_DB_IO ACPI_DEBUG_LEVEL (ACPI_LV_IO) #define ACPI_DB_INTERRUPTS ACPI_DEBUG_LEVEL (ACPI_LV_INTERRUPTS) #define ACPI_DB_USER_REQUESTS ACPI_DEBUG_LEVEL (ACPI_LV_USER_REQUESTS) #define ACPI_DB_PACKAGE ACPI_DEBUG_LEVEL (ACPI_LV_PACKAGE) #define ACPI_DB_MUTEX ACPI_DEBUG_LEVEL (ACPI_LV_MUTEX) #define ACPI_DB_ALL ACPI_DEBUG_LEVEL (ACPI_LV_ALL) /* Defaults for debug_level, debug and normal */ #define ACPI_DEBUG_DEFAULT (ACPI_LV_INIT | ACPI_LV_WARN | ACPI_LV_ERROR) #define ACPI_NORMAL_DEFAULT (ACPI_LV_INIT | ACPI_LV_WARN | ACPI_LV_ERROR) #define ACPI_DEBUG_ALL (ACPI_LV_AML_DISASSEMBLE | ACPI_LV_ALL_EXCEPTIONS | ACPI_LV_ALL) #endif /* __ACOUTPUT_H__ */ xen-4.4.0/xen/include/acpi/aclocal.h0000664000175000017500000001673412307313555015356 0ustar smbsmb/****************************************************************************** * * Name: aclocal.h - Internal data types used across the ACPI subsystem * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. 
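Conceptually, a debug statement is printed only when its level bits intersect the global debug level and its component ID intersects the global debug layer; the ACPI_DB_* macros above bundle a level with the enclosing file's component. A sketch of that filter, written with the masks passed in as parameters so nothing outside this header is assumed:

/* Illustrative: would a message at 'level' from 'component' be printed,
 * given the current debug level and layer masks? */
static int dbg_would_print(u32 level, u32 component, u32 dbg_level, u32 dbg_layer)
{
    return (level & dbg_level) && (component & dbg_layer);
}

/* For example, dbg_would_print(ACPI_LV_TABLES, ACPI_TABLES,
 * ACPI_NORMAL_DEFAULT, ACPI_ALL_COMPONENTS) is 0, because the default
 * level mask only enables the INIT, WARN and ERROR bits. */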
*/ #ifndef __ACLOCAL_H__ #define __ACLOCAL_H__ /* acpisrc:struct_defs -- for acpisrc conversion */ /***************************************************************************** * * Namespace typedefs and structs * ****************************************************************************/ union acpi_name_union { u32 integer; char ascii[4]; }; /* * ACPI Table Descriptor. One per ACPI table */ struct acpi_table_desc { acpi_physical_address address; struct acpi_table_header *pointer; u32 length; /* Length fixed at 32 bits */ union acpi_name_union signature; u8 flags; }; /* Flags for above */ #define ACPI_TABLE_ORIGIN_UNKNOWN (0) #define ACPI_TABLE_ORIGIN_MAPPED (1) #define ACPI_TABLE_ORIGIN_ALLOCATED (2) #define ACPI_TABLE_ORIGIN_MASK (3) #define ACPI_TABLE_IS_LOADED (4) /* One internal RSDT for table management */ struct acpi_internal_rsdt { struct acpi_table_desc *tables; u32 count; u32 size; u8 flags; }; /* Flags for above */ #define ACPI_ROOT_ORIGIN_UNKNOWN (0) /* ~ORIGIN_ALLOCATED */ #define ACPI_ROOT_ORIGIN_ALLOCATED (1) #define ACPI_ROOT_ALLOW_RESIZE (2) /* Predefined (fixed) table indexes */ #define ACPI_TABLE_INDEX_DSDT (0) #define ACPI_TABLE_INDEX_FACS (1) /***************************************************************************** * * Hardware (ACPI registers) and PNP * ****************************************************************************/ struct acpi_bit_register_info { u8 parent_register; u8 bit_position; u16 access_bit_mask; }; /* * Some ACPI registers have bits that must be ignored -- meaning that they * must be preserved. */ #define ACPI_PM1_STATUS_PRESERVED_BITS 0x0800 /* Bit 11 */ #define ACPI_PM1_CONTROL_PRESERVED_BITS 0x0200 /* Bit 9 (whatever) */ /* * Register IDs * These are the full ACPI registers */ #define ACPI_REGISTER_PM1_STATUS 0x01 #define ACPI_REGISTER_PM1_ENABLE 0x02 #define ACPI_REGISTER_PM1_CONTROL 0x03 #define ACPI_REGISTER_PM1A_CONTROL 0x04 #define ACPI_REGISTER_PM1B_CONTROL 0x05 #define ACPI_REGISTER_PM2_CONTROL 0x06 #define ACPI_REGISTER_PM_TIMER 0x07 #define ACPI_REGISTER_PROCESSOR_BLOCK 0x08 #define ACPI_REGISTER_SMI_COMMAND_BLOCK 0x09 #define ACPI_REGISTER_SLEEP_CONTROL 0x0a #define ACPI_REGISTER_SLEEP_STATUS 0x0b /* Masks used to access the bit_registers */ #define ACPI_BITMASK_TIMER_STATUS 0x0001 #define ACPI_BITMASK_BUS_MASTER_STATUS 0x0010 #define ACPI_BITMASK_GLOBAL_LOCK_STATUS 0x0020 #define ACPI_BITMASK_POWER_BUTTON_STATUS 0x0100 #define ACPI_BITMASK_SLEEP_BUTTON_STATUS 0x0200 #define ACPI_BITMASK_RT_CLOCK_STATUS 0x0400 #define ACPI_BITMASK_PCIEXP_WAKE_STATUS 0x4000 /* ACPI 3.0 */ #define ACPI_BITMASK_WAKE_STATUS 0x8000 #define ACPI_BITMASK_ALL_FIXED_STATUS (\ ACPI_BITMASK_TIMER_STATUS | \ ACPI_BITMASK_BUS_MASTER_STATUS | \ ACPI_BITMASK_GLOBAL_LOCK_STATUS | \ ACPI_BITMASK_POWER_BUTTON_STATUS | \ ACPI_BITMASK_SLEEP_BUTTON_STATUS | \ ACPI_BITMASK_RT_CLOCK_STATUS | \ ACPI_BITMASK_WAKE_STATUS) #define ACPI_BITMASK_TIMER_ENABLE 0x0001 #define ACPI_BITMASK_GLOBAL_LOCK_ENABLE 0x0020 #define ACPI_BITMASK_POWER_BUTTON_ENABLE 0x0100 #define ACPI_BITMASK_SLEEP_BUTTON_ENABLE 0x0200 #define ACPI_BITMASK_RT_CLOCK_ENABLE 0x0400 #define ACPI_BITMASK_PCIEXP_WAKE_DISABLE 0x4000 /* ACPI 3.0 */ #define ACPI_BITMASK_SCI_ENABLE 0x0001 #define ACPI_BITMASK_BUS_MASTER_RLD 0x0002 #define ACPI_BITMASK_GLOBAL_LOCK_RELEASE 0x0004 #define ACPI_BITMASK_SLEEP_TYPE_X 0x1C00 #define ACPI_BITMASK_SLEEP_ENABLE 0x2000 #define ACPI_BITMASK_ARB_DISABLE 0x0001 /* Raw bit position of each bit_register */ #define ACPI_BITPOSITION_TIMER_STATUS 0x00 #define 
ACPI_BITPOSITION_BUS_MASTER_STATUS 0x04 #define ACPI_BITPOSITION_GLOBAL_LOCK_STATUS 0x05 #define ACPI_BITPOSITION_POWER_BUTTON_STATUS 0x08 #define ACPI_BITPOSITION_SLEEP_BUTTON_STATUS 0x09 #define ACPI_BITPOSITION_RT_CLOCK_STATUS 0x0A #define ACPI_BITPOSITION_PCIEXP_WAKE_STATUS 0x0E /* ACPI 3.0 */ #define ACPI_BITPOSITION_WAKE_STATUS 0x0F #define ACPI_BITPOSITION_TIMER_ENABLE 0x00 #define ACPI_BITPOSITION_GLOBAL_LOCK_ENABLE 0x05 #define ACPI_BITPOSITION_POWER_BUTTON_ENABLE 0x08 #define ACPI_BITPOSITION_SLEEP_BUTTON_ENABLE 0x09 #define ACPI_BITPOSITION_RT_CLOCK_ENABLE 0x0A #define ACPI_BITPOSITION_PCIEXP_WAKE_DISABLE 0x0E /* ACPI 3.0 */ #define ACPI_BITPOSITION_SCI_ENABLE 0x00 #define ACPI_BITPOSITION_BUS_MASTER_RLD 0x01 #define ACPI_BITPOSITION_GLOBAL_LOCK_RELEASE 0x02 #define ACPI_BITPOSITION_SLEEP_TYPE_X 0x0A #define ACPI_BITPOSITION_SLEEP_ENABLE 0x0D #define ACPI_BITPOSITION_ARB_DISABLE 0x00 /***************************************************************************** * * Resource descriptors * ****************************************************************************/ /* resource_type values */ #define ACPI_ADDRESS_TYPE_MEMORY_RANGE 0 #define ACPI_ADDRESS_TYPE_IO_RANGE 1 #define ACPI_ADDRESS_TYPE_BUS_NUMBER_RANGE 2 #endif /* __ACLOCAL_H__ */ xen-4.4.0/xen/include/acpi/pdc_intel.h0000664000175000017500000000256412307313555015715 0ustar smbsmb /* _PDC bit definition for Intel processors */ #ifndef __PDC_INTEL_H__ #define __PDC_INTEL_H__ #define ACPI_PDC_REVISION_ID 1 #define ACPI_PDC_P_FFH (0x0001) #define ACPI_PDC_C_C1_HALT (0x0002) #define ACPI_PDC_T_FFH (0x0004) #define ACPI_PDC_SMP_C1PT (0x0008) #define ACPI_PDC_SMP_C2C3 (0x0010) #define ACPI_PDC_SMP_P_SWCOORD (0x0020) #define ACPI_PDC_SMP_C_SWCOORD (0x0040) #define ACPI_PDC_SMP_T_SWCOORD (0x0080) #define ACPI_PDC_C_C1_FFH (0x0100) #define ACPI_PDC_C_C2C3_FFH (0x0200) #define ACPI_PDC_SMP_P_HWCOORD (0x0800) #define ACPI_PDC_EST_CAPABILITY_SMP (ACPI_PDC_SMP_C1PT | \ ACPI_PDC_C_C1_HALT | \ ACPI_PDC_P_FFH) #define ACPI_PDC_EST_CAPABILITY_SWSMP (ACPI_PDC_SMP_C1PT | \ ACPI_PDC_C_C1_HALT | \ ACPI_PDC_SMP_P_SWCOORD | \ ACPI_PDC_SMP_P_HWCOORD | \ ACPI_PDC_P_FFH) #define ACPI_PDC_C_CAPABILITY_SMP (ACPI_PDC_SMP_C2C3 | \ ACPI_PDC_SMP_C1PT | \ ACPI_PDC_C_C1_HALT | \ ACPI_PDC_C_C1_FFH | \ ACPI_PDC_C_C2C3_FFH) #define ACPI_PDC_C_MASK (ACPI_PDC_C_C1_HALT | \ ACPI_PDC_C_C1_FFH | \ ACPI_PDC_SMP_C2C3 | \ ACPI_PDC_SMP_C_SWCOORD | \ ACPI_PDC_C_C2C3_FFH) #define ACPI_PDC_P_MASK (ACPI_PDC_P_FFH | \ ACPI_PDC_SMP_P_SWCOORD | \ ACPI_PDC_SMP_P_HWCOORD) #define ACPI_PDC_T_MASK (ACPI_PDC_T_FFH | \ ACPI_PDC_SMP_T_SWCOORD) #endif /* __PDC_INTEL_H__ */ xen-4.4.0/xen/include/acpi/acexcep.h0000664000175000017500000003206312307313555015361 0ustar smbsmb/****************************************************************************** * * Name: acexcep.h - Exception codes returned by the ACPI subsystem * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. 
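The _PDC bits in pdc_intel.h above are what an OS hands to firmware to describe its processor power-management capabilities; by convention the _PDC method receives a three-DWORD buffer of revision, count and capability flags. A hedged sketch of filling such a buffer (the layout follows common OSPM practice; verify it against the platform code that actually evaluates _PDC):

/* Illustrative: populate the conventional 3-dword _PDC capability buffer. */
static void fill_pdc_buffer(u32 buf[3])
{
    buf[0] = ACPI_PDC_REVISION_ID;           /* revision of this interface */
    buf[1] = 1;                              /* number of capability dwords */
    buf[2] = ACPI_PDC_C_CAPABILITY_SMP |     /* C-state related capabilities */
             ACPI_PDC_EST_CAPABILITY_SWSMP;  /* EST/P-state capabilities */
}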
Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACEXCEP_H__ #define __ACEXCEP_H__ /* * Exceptions returned by external ACPI interfaces */ #define AE_CODE_ENVIRONMENTAL 0x0000 #define AE_CODE_PROGRAMMER 0x1000 #define AE_CODE_ACPI_TABLES 0x2000 #define AE_CODE_AML 0x3000 #define AE_CODE_CONTROL 0x4000 #define AE_CODE_MASK 0xF000 #define ACPI_SUCCESS(a) (!(a)) #define ACPI_FAILURE(a) (a) #define AE_OK (acpi_status) 0x0000 /* * Environmental exceptions */ #define AE_ERROR (acpi_status) (0x0001 | AE_CODE_ENVIRONMENTAL) #define AE_NO_ACPI_TABLES (acpi_status) (0x0002 | AE_CODE_ENVIRONMENTAL) #define AE_NO_NAMESPACE (acpi_status) (0x0003 | AE_CODE_ENVIRONMENTAL) #define AE_NO_MEMORY (acpi_status) (0x0004 | AE_CODE_ENVIRONMENTAL) #define AE_NOT_FOUND (acpi_status) (0x0005 | AE_CODE_ENVIRONMENTAL) #define AE_NOT_EXIST (acpi_status) (0x0006 | AE_CODE_ENVIRONMENTAL) #define AE_ALREADY_EXISTS (acpi_status) (0x0007 | AE_CODE_ENVIRONMENTAL) #define AE_TYPE (acpi_status) (0x0008 | AE_CODE_ENVIRONMENTAL) #define AE_NULL_OBJECT (acpi_status) (0x0009 | AE_CODE_ENVIRONMENTAL) #define AE_NULL_ENTRY (acpi_status) (0x000A | AE_CODE_ENVIRONMENTAL) #define AE_BUFFER_OVERFLOW (acpi_status) (0x000B | AE_CODE_ENVIRONMENTAL) #define AE_STACK_OVERFLOW (acpi_status) (0x000C | AE_CODE_ENVIRONMENTAL) #define AE_STACK_UNDERFLOW (acpi_status) (0x000D | AE_CODE_ENVIRONMENTAL) #define AE_NOT_IMPLEMENTED (acpi_status) (0x000E | AE_CODE_ENVIRONMENTAL) #define AE_VERSION_MISMATCH (acpi_status) (0x000F | AE_CODE_ENVIRONMENTAL) #define AE_SUPPORT (acpi_status) (0x0010 | AE_CODE_ENVIRONMENTAL) #define AE_SHARE (acpi_status) (0x0011 | AE_CODE_ENVIRONMENTAL) #define AE_LIMIT (acpi_status) (0x0012 | AE_CODE_ENVIRONMENTAL) #define AE_TIME (acpi_status) (0x0013 | AE_CODE_ENVIRONMENTAL) #define AE_UNKNOWN_STATUS (acpi_status) (0x0014 | AE_CODE_ENVIRONMENTAL) #define AE_ACQUIRE_DEADLOCK (acpi_status) (0x0015 | AE_CODE_ENVIRONMENTAL) #define AE_RELEASE_DEADLOCK (acpi_status) (0x0016 | AE_CODE_ENVIRONMENTAL) #define AE_NOT_ACQUIRED (acpi_status) (0x0017 | AE_CODE_ENVIRONMENTAL) #define AE_ALREADY_ACQUIRED (acpi_status) (0x0018 | AE_CODE_ENVIRONMENTAL) #define 
AE_NO_HARDWARE_RESPONSE (acpi_status) (0x0019 | AE_CODE_ENVIRONMENTAL) #define AE_NO_GLOBAL_LOCK (acpi_status) (0x001A | AE_CODE_ENVIRONMENTAL) #define AE_LOGICAL_ADDRESS (acpi_status) (0x001B | AE_CODE_ENVIRONMENTAL) #define AE_ABORT_METHOD (acpi_status) (0x001C | AE_CODE_ENVIRONMENTAL) #define AE_SAME_HANDLER (acpi_status) (0x001D | AE_CODE_ENVIRONMENTAL) #define AE_WAKE_ONLY_GPE (acpi_status) (0x001E | AE_CODE_ENVIRONMENTAL) #define AE_OWNER_ID_LIMIT (acpi_status) (0x001F | AE_CODE_ENVIRONMENTAL) #define AE_CODE_ENV_MAX 0x001F /* * Programmer exceptions */ #define AE_BAD_PARAMETER (acpi_status) (0x0001 | AE_CODE_PROGRAMMER) #define AE_BAD_CHARACTER (acpi_status) (0x0002 | AE_CODE_PROGRAMMER) #define AE_BAD_PATHNAME (acpi_status) (0x0003 | AE_CODE_PROGRAMMER) #define AE_BAD_DATA (acpi_status) (0x0004 | AE_CODE_PROGRAMMER) #define AE_BAD_ADDRESS (acpi_status) (0x0005 | AE_CODE_PROGRAMMER) #define AE_ALIGNMENT (acpi_status) (0x0006 | AE_CODE_PROGRAMMER) #define AE_BAD_HEX_CONSTANT (acpi_status) (0x0007 | AE_CODE_PROGRAMMER) #define AE_BAD_OCTAL_CONSTANT (acpi_status) (0x0008 | AE_CODE_PROGRAMMER) #define AE_BAD_DECIMAL_CONSTANT (acpi_status) (0x0009 | AE_CODE_PROGRAMMER) #define AE_CODE_PGM_MAX 0x0009 /* * Acpi table exceptions */ #define AE_BAD_SIGNATURE (acpi_status) (0x0001 | AE_CODE_ACPI_TABLES) #define AE_BAD_HEADER (acpi_status) (0x0002 | AE_CODE_ACPI_TABLES) #define AE_BAD_CHECKSUM (acpi_status) (0x0003 | AE_CODE_ACPI_TABLES) #define AE_BAD_VALUE (acpi_status) (0x0004 | AE_CODE_ACPI_TABLES) #define AE_TABLE_NOT_SUPPORTED (acpi_status) (0x0005 | AE_CODE_ACPI_TABLES) #define AE_INVALID_TABLE_LENGTH (acpi_status) (0x0006 | AE_CODE_ACPI_TABLES) #define AE_CODE_TBL_MAX 0x0006 /* * AML exceptions. These are caused by problems with * the actual AML byte stream */ #define AE_AML_ERROR (acpi_status) (0x0001 | AE_CODE_AML) #define AE_AML_PARSE (acpi_status) (0x0002 | AE_CODE_AML) #define AE_AML_BAD_OPCODE (acpi_status) (0x0003 | AE_CODE_AML) #define AE_AML_NO_OPERAND (acpi_status) (0x0004 | AE_CODE_AML) #define AE_AML_OPERAND_TYPE (acpi_status) (0x0005 | AE_CODE_AML) #define AE_AML_OPERAND_VALUE (acpi_status) (0x0006 | AE_CODE_AML) #define AE_AML_UNINITIALIZED_LOCAL (acpi_status) (0x0007 | AE_CODE_AML) #define AE_AML_UNINITIALIZED_ARG (acpi_status) (0x0008 | AE_CODE_AML) #define AE_AML_UNINITIALIZED_ELEMENT (acpi_status) (0x0009 | AE_CODE_AML) #define AE_AML_NUMERIC_OVERFLOW (acpi_status) (0x000A | AE_CODE_AML) #define AE_AML_REGION_LIMIT (acpi_status) (0x000B | AE_CODE_AML) #define AE_AML_BUFFER_LIMIT (acpi_status) (0x000C | AE_CODE_AML) #define AE_AML_PACKAGE_LIMIT (acpi_status) (0x000D | AE_CODE_AML) #define AE_AML_DIVIDE_BY_ZERO (acpi_status) (0x000E | AE_CODE_AML) #define AE_AML_BAD_NAME (acpi_status) (0x000F | AE_CODE_AML) #define AE_AML_NAME_NOT_FOUND (acpi_status) (0x0010 | AE_CODE_AML) #define AE_AML_INTERNAL (acpi_status) (0x0011 | AE_CODE_AML) #define AE_AML_INVALID_SPACE_ID (acpi_status) (0x0012 | AE_CODE_AML) #define AE_AML_STRING_LIMIT (acpi_status) (0x0013 | AE_CODE_AML) #define AE_AML_NO_RETURN_VALUE (acpi_status) (0x0014 | AE_CODE_AML) #define AE_AML_METHOD_LIMIT (acpi_status) (0x0015 | AE_CODE_AML) #define AE_AML_NOT_OWNER (acpi_status) (0x0016 | AE_CODE_AML) #define AE_AML_MUTEX_ORDER (acpi_status) (0x0017 | AE_CODE_AML) #define AE_AML_MUTEX_NOT_ACQUIRED (acpi_status) (0x0018 | AE_CODE_AML) #define AE_AML_INVALID_RESOURCE_TYPE (acpi_status) (0x0019 | AE_CODE_AML) #define AE_AML_INVALID_INDEX (acpi_status) (0x001A | AE_CODE_AML) #define AE_AML_REGISTER_LIMIT (acpi_status) 
(0x001B | AE_CODE_AML) #define AE_AML_NO_WHILE (acpi_status) (0x001C | AE_CODE_AML) #define AE_AML_ALIGNMENT (acpi_status) (0x001D | AE_CODE_AML) #define AE_AML_NO_RESOURCE_END_TAG (acpi_status) (0x001E | AE_CODE_AML) #define AE_AML_BAD_RESOURCE_VALUE (acpi_status) (0x001F | AE_CODE_AML) #define AE_AML_CIRCULAR_REFERENCE (acpi_status) (0x0020 | AE_CODE_AML) #define AE_AML_BAD_RESOURCE_LENGTH (acpi_status) (0x0021 | AE_CODE_AML) #define AE_AML_ILLEGAL_ADDRESS (acpi_status) (0x0022 | AE_CODE_AML) #define AE_CODE_AML_MAX 0x0022 /* * Internal exceptions used for control */ #define AE_CTRL_RETURN_VALUE (acpi_status) (0x0001 | AE_CODE_CONTROL) #define AE_CTRL_PENDING (acpi_status) (0x0002 | AE_CODE_CONTROL) #define AE_CTRL_TERMINATE (acpi_status) (0x0003 | AE_CODE_CONTROL) #define AE_CTRL_TRUE (acpi_status) (0x0004 | AE_CODE_CONTROL) #define AE_CTRL_FALSE (acpi_status) (0x0005 | AE_CODE_CONTROL) #define AE_CTRL_DEPTH (acpi_status) (0x0006 | AE_CODE_CONTROL) #define AE_CTRL_END (acpi_status) (0x0007 | AE_CODE_CONTROL) #define AE_CTRL_TRANSFER (acpi_status) (0x0008 | AE_CODE_CONTROL) #define AE_CTRL_BREAK (acpi_status) (0x0009 | AE_CODE_CONTROL) #define AE_CTRL_CONTINUE (acpi_status) (0x000A | AE_CODE_CONTROL) #define AE_CTRL_SKIP (acpi_status) (0x000B | AE_CODE_CONTROL) #define AE_CTRL_PARSE_CONTINUE (acpi_status) (0x000C | AE_CODE_CONTROL) #define AE_CTRL_PARSE_PENDING (acpi_status) (0x000D | AE_CODE_CONTROL) #define AE_CODE_CTRL_MAX 0x000D #ifdef DEFINE_ACPI_GLOBALS /* * String versions of the exception codes above * These strings must match the corresponding defines exactly */ char const *__initdata acpi_gbl_exception_names_env[] = { "AE_OK", "AE_ERROR", "AE_NO_ACPI_TABLES", "AE_NO_NAMESPACE", "AE_NO_MEMORY", "AE_NOT_FOUND", "AE_NOT_EXIST", "AE_ALREADY_EXISTS", "AE_TYPE", "AE_NULL_OBJECT", "AE_NULL_ENTRY", "AE_BUFFER_OVERFLOW", "AE_STACK_OVERFLOW", "AE_STACK_UNDERFLOW", "AE_NOT_IMPLEMENTED", "AE_VERSION_MISMATCH", "AE_SUPPORT", "AE_SHARE", "AE_LIMIT", "AE_TIME", "AE_UNKNOWN_STATUS", "AE_ACQUIRE_DEADLOCK", "AE_RELEASE_DEADLOCK", "AE_NOT_ACQUIRED", "AE_ALREADY_ACQUIRED", "AE_NO_HARDWARE_RESPONSE", "AE_NO_GLOBAL_LOCK", "AE_LOGICAL_ADDRESS", "AE_ABORT_METHOD", "AE_SAME_HANDLER", "AE_WAKE_ONLY_GPE", "AE_OWNER_ID_LIMIT" }; char const *__initdata acpi_gbl_exception_names_pgm[] = { "AE_BAD_PARAMETER", "AE_BAD_CHARACTER", "AE_BAD_PATHNAME", "AE_BAD_DATA", "AE_BAD_ADDRESS", "AE_ALIGNMENT", "AE_BAD_HEX_CONSTANT", "AE_BAD_OCTAL_CONSTANT", "AE_BAD_DECIMAL_CONSTANT" }; char const *__initdata acpi_gbl_exception_names_tbl[] = { "AE_BAD_SIGNATURE", "AE_BAD_HEADER", "AE_BAD_CHECKSUM", "AE_BAD_VALUE", "AE_TABLE_NOT_SUPPORTED", "AE_INVALID_TABLE_LENGTH" }; char const *__initdata acpi_gbl_exception_names_aml[] = { "AE_AML_ERROR", "AE_AML_PARSE", "AE_AML_BAD_OPCODE", "AE_AML_NO_OPERAND", "AE_AML_OPERAND_TYPE", "AE_AML_OPERAND_VALUE", "AE_AML_UNINITIALIZED_LOCAL", "AE_AML_UNINITIALIZED_ARG", "AE_AML_UNINITIALIZED_ELEMENT", "AE_AML_NUMERIC_OVERFLOW", "AE_AML_REGION_LIMIT", "AE_AML_BUFFER_LIMIT", "AE_AML_PACKAGE_LIMIT", "AE_AML_DIVIDE_BY_ZERO", "AE_AML_BAD_NAME", "AE_AML_NAME_NOT_FOUND", "AE_AML_INTERNAL", "AE_AML_INVALID_SPACE_ID", "AE_AML_STRING_LIMIT", "AE_AML_NO_RETURN_VALUE", "AE_AML_METHOD_LIMIT", "AE_AML_NOT_OWNER", "AE_AML_MUTEX_ORDER", "AE_AML_MUTEX_NOT_ACQUIRED", "AE_AML_INVALID_RESOURCE_TYPE", "AE_AML_INVALID_INDEX", "AE_AML_REGISTER_LIMIT", "AE_AML_NO_WHILE", "AE_AML_ALIGNMENT", "AE_AML_NO_RESOURCE_END_TAG", "AE_AML_BAD_RESOURCE_VALUE", "AE_AML_CIRCULAR_REFERENCE", "AE_AML_BAD_RESOURCE_LENGTH", 
"AE_AML_ILLEGAL_ADDRESS" }; char const *__initdata acpi_gbl_exception_names_ctrl[] = { "AE_CTRL_RETURN_VALUE", "AE_CTRL_PENDING", "AE_CTRL_TERMINATE", "AE_CTRL_TRUE", "AE_CTRL_FALSE", "AE_CTRL_DEPTH", "AE_CTRL_END", "AE_CTRL_TRANSFER", "AE_CTRL_BREAK", "AE_CTRL_CONTINUE", "AE_CTRL_SKIP", "AE_CTRL_PARSE_CONTINUE", "AE_CTRL_PARSE_PENDING" }; #endif /* ACPI GLOBALS */ #endif /* __ACEXCEP_H__ */ xen-4.4.0/xen/include/acpi/achware.h0000664000175000017500000000476512307313555015373 0ustar smbsmb/****************************************************************************** * * Name: achware.h -- hardware specific interfaces * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACHWARE_H__ #define __ACHWARE_H__ /* * hwregs - ACPI Register I/O */ acpi_status acpi_hw_register_read(u32 register_id, u32 * return_value); acpi_status acpi_hw_register_write(u32 register_id, u32 value); acpi_status acpi_hw_low_level_read(u32 width, u32 * value, struct acpi_generic_address *reg); acpi_status acpi_hw_low_level_write(u32 width, u32 value, struct acpi_generic_address *reg); #endif /* __ACHWARE_H__ */ xen-4.4.0/xen/include/acpi/acpixf.h0000664000175000017500000001040312307313555015215 0ustar smbsmb /****************************************************************************** * * Name: acpixf.h - External interfaces to the ACPI subsystem * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACXFACE_H__ #define __ACXFACE_H__ #include "actypes.h" #include "actbl.h" /* * Global interfaces */ acpi_status acpi_initialize_tables(struct acpi_table_desc *initial_storage, u32 initial_table_count, u8 allow_resize); const char *acpi_format_exception(acpi_status exception); /* * ACPI table manipulation interfaces */ acpi_status acpi_reallocate_root_table(void); acpi_status acpi_find_root_pointer(acpi_native_uint * rsdp_address); acpi_status acpi_load_tables(void); acpi_status acpi_load_table(struct acpi_table_header *table_ptr); acpi_status acpi_get_table_header(acpi_string signature, acpi_native_uint instance, struct acpi_table_header *out_table_header); acpi_status acpi_get_table(acpi_string signature, acpi_native_uint instance, struct acpi_table_header **out_table); acpi_status acpi_get_table_phys(acpi_string signature, acpi_native_uint instance, acpi_physical_address *addr, acpi_native_uint *len); /* * Namespace and name interfaces */ acpi_status acpi_get_handle(acpi_handle parent, acpi_string pathname, acpi_handle * ret_handle); acpi_status acpi_debug_trace(char *name, u32 debug_level, u32 debug_layer, u32 flags); acpi_status acpi_get_object_info(acpi_handle handle, struct acpi_buffer *return_buffer); acpi_status acpi_get_type(acpi_handle object, acpi_object_type * out_type); acpi_status acpi_get_parent(acpi_handle object, acpi_handle * out_handle); /* * Hardware (ACPI device) interfaces */ acpi_status acpi_get_register(u32 register_id, u32 * return_value); acpi_status acpi_set_register(u32 register_id, u32 value); acpi_status acpi_set_firmware_waking_vector(acpi_physical_address physical_address); #ifdef ACPI_FUTURE_USAGE acpi_status acpi_get_firmware_waking_vector(acpi_physical_address * physical_address); #endif acpi_status acpi_get_sleep_type_data(u8 sleep_state, u8 * slp_typ_a, u8 * slp_typ_b); acpi_status acpi_enter_sleep_state_prep(u8 sleep_state); acpi_status asmlinkage acpi_enter_sleep_state(u8 
sleep_state); acpi_status asmlinkage acpi_enter_sleep_state_s4bios(void); acpi_status acpi_leave_sleep_state_prep(u8 sleep_state); acpi_status acpi_leave_sleep_state(u8 sleep_state); #endif /* __ACXFACE_H__ */ xen-4.4.0/xen/include/acpi/acpi.h0000664000175000017500000000610512307313555014663 0ustar smbsmb/****************************************************************************** * * Name: acpi.h - Master include file, Publics and external data. * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACPI_H__ #define __ACPI_H__ #define PREFIX "ACPI: " /* * Common includes for all ACPI driver files * We put them here because we don't want to duplicate them * in the rest of the source code again and again. 
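 *
 * For example, an ACPI CA source file or driver needs only
 *
 *     #include "acpi.h"
 *
 * rather than listing each of the ac*.h headers below individually.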
*/ #include "acnames.h" /* Global ACPI names and strings */ #include "acconfig.h" /* Configuration constants */ #include "platform/acenv.h" /* Target environment specific items */ #include "actypes.h" /* Fundamental common data types */ #include "acexcep.h" /* ACPI exception codes */ #include "acmacros.h" /* C macros */ #include "actbl.h" /* ACPI table definitions */ #include "aclocal.h" /* Internal data types */ #include "acoutput.h" /* Error output and Debug macros */ #include "acpiosxf.h" /* Interfaces to the ACPI-to-OS layer */ #include "acpixf.h" /* ACPI core subsystem external interfaces */ #include "acglobal.h" /* All global variables */ #include "achware.h" /* Hardware defines and interfaces */ #include "acutils.h" /* Utility interfaces */ #endif /* __ACPI_H__ */ xen-4.4.0/xen/include/acpi/actypes.h0000664000175000017500000005623612307313555015431 0ustar smbsmb/****************************************************************************** * * Name: actypes.h - Common data types for the entire ACPI subsystem * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACTYPES_H__ #define __ACTYPES_H__ /* acpisrc:struct_defs -- for acpisrc conversion */ /* * ACPI_MACHINE_WIDTH must be specified in an OS- or compiler-dependent header * and must be either 32 or 64. 16-bit ACPICA is no longer supported, as of * 12/2006. */ #ifndef ACPI_MACHINE_WIDTH #error ACPI_MACHINE_WIDTH not defined #endif /*! [Begin] no source code translation */ /* * Data type ranges * Note: These macros are designed to be compiler independent as well as * working around problems that some 32-bit compilers have with 64-bit * constants. 
*/ #define ACPI_UINT8_MAX (UINT8) (~((UINT8) 0)) /* 0xFF */ #define ACPI_UINT16_MAX (UINT16)(~((UINT16) 0)) /* 0xFFFF */ #define ACPI_UINT32_MAX (UINT32)(~((UINT32) 0)) /* 0xFFFFFFFF */ #define ACPI_UINT64_MAX (UINT64)(~((UINT64) 0)) /* 0xFFFFFFFFFFFFFFFF */ #define ACPI_ASCII_MAX 0x7F /* * Architecture-specific ACPICA Subsystem Data Types * * The goal of these types is to provide source code portability across * 16-bit, 32-bit, and 64-bit targets. * * 1) The following types are of fixed size for all targets (16/32/64): * * BOOLEAN Logical boolean * * UINT8 8-bit (1 byte) unsigned value * UINT16 16-bit (2 byte) unsigned value * UINT32 32-bit (4 byte) unsigned value * UINT64 64-bit (8 byte) unsigned value * * INT16 16-bit (2 byte) signed value * INT32 32-bit (4 byte) signed value * INT64 64-bit (8 byte) signed value * * COMPILER_DEPENDENT_UINT64/INT64 - These types are defined in the * compiler-dependent header(s) and were introduced because there is no common * 64-bit integer type across the various compilation models, as shown in * the table below. * * Datatype LP64 ILP64 LLP64 ILP32 LP32 16bit * char 8 8 8 8 8 8 * short 16 16 16 16 16 16 * _int32 32 * int 32 64 32 32 16 16 * long 64 64 32 32 32 32 * long long 64 64 * pointer 64 64 64 32 32 32 * * Note: ILP64 and LP32 are currently not supported. * * * 2) These types represent the native word size of the target mode of the * processor, and may be 16-bit, 32-bit, or 64-bit as required. They are * usually used for memory allocation, efficient loop counters, and array * indexes. The types are similar to the size_t type in the C library and are * required because there is no C type that consistently represents the native * data width. * * ACPI_SIZE 16/32/64-bit unsigned value * ACPI_NATIVE_UINT 16/32/64-bit unsigned value * ACPI_NATIVE_INT 16/32/64-bit signed value * */ /******************************************************************************* * * Common types for all compilers, all targets * ******************************************************************************/ typedef unsigned char BOOLEAN; typedef unsigned char UINT8; typedef unsigned short UINT16; typedef COMPILER_DEPENDENT_UINT64 UINT64; typedef COMPILER_DEPENDENT_INT64 INT64; /*! [End] no source code translation !*/ /******************************************************************************* * * Types specific to 64-bit targets * ******************************************************************************/ #if ACPI_MACHINE_WIDTH == 64 /*! [Begin] no source code translation (keep the typedefs as-is) */ typedef unsigned int UINT32; typedef int INT32; /*! [End] no source code translation !*/ typedef u64 acpi_native_uint; typedef s64 acpi_native_int; typedef u64 acpi_io_address; typedef u64 acpi_physical_address; #define ACPI_MAX_PTR ACPI_UINT64_MAX #define ACPI_SIZE_MAX ACPI_UINT64_MAX #define ACPI_USE_NATIVE_DIVIDE /* Has native 64-bit integer support */ /* * In the case of the Itanium Processor Family (IPF), the hardware does not * support misaligned memory transfers. Set the MISALIGNMENT_NOT_SUPPORTED flag * to indicate that special precautions must be taken to avoid alignment faults. * (IA64 or ia64 is currently used by existing compilers to indicate IPF.) * * Note: Em64_t and other X86-64 processors support misaligned transfers, * so there is no need to define this flag. 
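 *
 * When ACPI_MISALIGNMENT_NOT_SUPPORTED is defined, the ACPI_MOVE_* and
 * ACPI_COMPARE_NAME macros in acmacros.h fall back to byte-at-a-time
 * copies and string compares instead of direct word-sized accesses.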
*/ #if defined (__IA64__) || defined (__ia64__) #define ACPI_MISALIGNMENT_NOT_SUPPORTED #endif /******************************************************************************* * * Types specific to 32-bit targets * ******************************************************************************/ #elif ACPI_MACHINE_WIDTH == 32 /*! [Begin] no source code translation (keep the typedefs as-is) */ typedef unsigned int UINT32; typedef int INT32; /*! [End] no source code translation !*/ typedef u32 acpi_native_uint; typedef s32 acpi_native_int; typedef u32 acpi_io_address; typedef u32 acpi_physical_address; #define ACPI_MAX_PTR ACPI_UINT32_MAX #define ACPI_SIZE_MAX ACPI_UINT32_MAX #else /* ACPI_MACHINE_WIDTH must be either 64 or 32 */ #error unknown ACPI_MACHINE_WIDTH #endif /* Variable-width type, used instead of clib size_t */ typedef acpi_native_uint acpi_size; /******************************************************************************* * * OS-dependent and compiler-dependent types * * If the defaults below are not appropriate for the host system, they can * be defined in the compiler-specific or OS-specific header, and this will * take precedence. * ******************************************************************************/ /* Value returned by acpi_os_get_thread_id */ #ifndef acpi_thread_id #define acpi_thread_id acpi_native_uint #endif /* Use C99 uintptr_t for pointer casting if available, "void *" otherwise */ #ifndef acpi_uintptr_t #define acpi_uintptr_t void * #endif /* * ACPI_PRINTF_LIKE is used to tag functions as "printf-like" because * some compilers can catch printf format string problems */ #ifndef ACPI_PRINTF_LIKE #define ACPI_PRINTF_LIKE(c) #endif /* * Some compilers complain about unused variables. Sometimes we don't want to * use all the variables (for example, _acpi_module_name). This allows us * to to tell the compiler in a per-variable manner that a variable * is unused */ #ifndef ACPI_UNUSED_VAR #define ACPI_UNUSED_VAR #endif /* * All ACPICA functions that are available to the rest of the kernel are * tagged with this macro which can be defined as appropriate for the host. */ #ifndef ACPI_EXPORT_SYMBOL #define ACPI_EXPORT_SYMBOL(symbol) #endif /******************************************************************************* * * Independent types * ******************************************************************************/ /* Logical defines and NULL */ #ifdef FALSE #undef FALSE #endif #define FALSE (1 == 0) #ifdef TRUE #undef TRUE #endif #define TRUE (1 == 1) #ifndef NULL #define NULL (void *) 0 #endif /* * Mescellaneous types */ typedef u32 acpi_status; /* All ACPI Exceptions */ typedef u32 acpi_name; /* 4-byte ACPI name */ typedef char *acpi_string; /* Null terminated ASCII string */ typedef void *acpi_handle; /* Actually a ptr to a NS Node */ struct uint64_struct { u32 lo; u32 hi; }; union uint64_overlay { u64 full; struct uint64_struct part; }; struct uint32_struct { u32 lo; u32 hi; }; /* Synchronization objects */ #define acpi_mutex void * #define acpi_semaphore void * /* * Acpi integer width. In ACPI version 1, integers are * 32 bits. In ACPI version 2, integers are 64 bits. * Note that this pertains to the ACPI integer type only, not * other integers used in the implementation of the ACPI CA * subsystem. 
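 *
 * For example, in the default 64-bit configuration below, acpi_integer is
 * an unsigned long long, ACPI_INTEGER_MAX equals ACPI_UINT64_MAX, and a
 * value may need up to ACPI_MAX_DECIMAL_DIGITS (20) characters to print
 * in decimal.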
*/ #ifdef ACPI_NO_INTEGER64_SUPPORT /* 32-bit integers only, no 64-bit support */ typedef u32 acpi_integer; #define ACPI_INTEGER_MAX ACPI_UINT32_MAX #define ACPI_INTEGER_BIT_SIZE 32 #define ACPI_MAX_DECIMAL_DIGITS 10 /* 2^32 = 4,294,967,296 */ #define ACPI_USE_NATIVE_DIVIDE /* Use compiler native 32-bit divide */ #else /* 64-bit integers */ typedef unsigned long long acpi_integer; #define ACPI_INTEGER_MAX ACPI_UINT64_MAX #define ACPI_INTEGER_BIT_SIZE 64 #define ACPI_MAX_DECIMAL_DIGITS 20 /* 2^64 = 18,446,744,073,709,551,616 */ #if ACPI_MACHINE_WIDTH == 64 #define ACPI_USE_NATIVE_DIVIDE /* Use compiler native 64-bit divide */ #endif #endif #define ACPI_MAX64_DECIMAL_DIGITS 20 #define ACPI_MAX32_DECIMAL_DIGITS 10 #define ACPI_MAX16_DECIMAL_DIGITS 5 #define ACPI_MAX8_DECIMAL_DIGITS 3 /* * Constants with special meanings */ #define ACPI_ROOT_OBJECT ACPI_ADD_PTR (acpi_handle, NULL, ACPI_MAX_PTR) /* * Initialization sequence */ #define ACPI_FULL_INITIALIZATION 0x00 #define ACPI_NO_ADDRESS_SPACE_INIT 0x01 #define ACPI_NO_HARDWARE_INIT 0x02 #define ACPI_NO_EVENT_INIT 0x04 #define ACPI_NO_HANDLER_INIT 0x08 #define ACPI_NO_ACPI_ENABLE 0x10 #define ACPI_NO_DEVICE_INIT 0x20 #define ACPI_NO_OBJECT_INIT 0x40 /* * Initialization state */ #define ACPI_SUBSYSTEM_INITIALIZE 0x01 #define ACPI_INITIALIZED_OK 0x02 /* * Power state values */ #define ACPI_STATE_UNKNOWN (u8) 0xFF #define ACPI_STATE_S0 (u8) 0 #define ACPI_STATE_S1 (u8) 1 #define ACPI_STATE_S2 (u8) 2 #define ACPI_STATE_S3 (u8) 3 #define ACPI_STATE_S4 (u8) 4 #define ACPI_STATE_S5 (u8) 5 #define ACPI_S_STATES_MAX ACPI_STATE_S5 #define ACPI_S_STATE_COUNT 6 #define ACPI_STATE_D0 (u8) 0 #define ACPI_STATE_D1 (u8) 1 #define ACPI_STATE_D2 (u8) 2 #define ACPI_STATE_D3 (u8) 3 #define ACPI_D_STATES_MAX ACPI_STATE_D3 #define ACPI_D_STATE_COUNT 4 #define ACPI_STATE_C0 (u8) 0 #define ACPI_STATE_C1 (u8) 1 #define ACPI_STATE_C2 (u8) 2 #define ACPI_STATE_C3 (u8) 3 #define ACPI_C_STATES_MAX ACPI_STATE_C3 #define ACPI_C_STATE_COUNT 4 /* * Sleep type invalid value */ #define ACPI_SLEEP_TYPE_MAX 0x7 #define ACPI_SLEEP_TYPE_INVALID 0xFF /* * Standard notify values */ #define ACPI_NOTIFY_BUS_CHECK (u8) 0 #define ACPI_NOTIFY_DEVICE_CHECK (u8) 1 #define ACPI_NOTIFY_DEVICE_WAKE (u8) 2 #define ACPI_NOTIFY_EJECT_REQUEST (u8) 3 #define ACPI_NOTIFY_DEVICE_CHECK_LIGHT (u8) 4 #define ACPI_NOTIFY_FREQUENCY_MISMATCH (u8) 5 #define ACPI_NOTIFY_BUS_MODE_MISMATCH (u8) 6 #define ACPI_NOTIFY_POWER_FAULT (u8) 7 /* * Types associated with ACPI names and objects. The first group of * values (up to ACPI_TYPE_EXTERNAL_MAX) correspond to the definition * of the ACPI object_type() operator (See the ACPI Spec). Therefore, * only add to the first group if the spec changes. * * NOTE: Types must be kept in sync with the global acpi_ns_properties * and acpi_ns_type_names arrays. 
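 *
 * A typical consumer obtains one of these values for a namespace handle
 * via acpi_get_type() (declared in acpixf.h), e.g.
 *
 *     acpi_object_type type;
 *     if (ACPI_SUCCESS(acpi_get_type(handle, &type)) &&
 *         type == ACPI_TYPE_METHOD) {
 *         ... the handle refers to a control method ...
 *     }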
*/ typedef u32 acpi_object_type; #define ACPI_TYPE_ANY 0x00 #define ACPI_TYPE_INTEGER 0x01 /* Byte/Word/Dword/Zero/One/Ones */ #define ACPI_TYPE_STRING 0x02 #define ACPI_TYPE_BUFFER 0x03 #define ACPI_TYPE_PACKAGE 0x04 /* byte_const, multiple data_term/Constant/super_name */ #define ACPI_TYPE_FIELD_UNIT 0x05 #define ACPI_TYPE_DEVICE 0x06 /* Name, multiple Node */ #define ACPI_TYPE_EVENT 0x07 #define ACPI_TYPE_METHOD 0x08 /* Name, byte_const, multiple Code */ #define ACPI_TYPE_MUTEX 0x09 #define ACPI_TYPE_REGION 0x0A #define ACPI_TYPE_POWER 0x0B /* Name,byte_const,word_const,multi Node */ #define ACPI_TYPE_PROCESSOR 0x0C /* Name,byte_const,Dword_const,byte_const,multi nm_o */ #define ACPI_TYPE_THERMAL 0x0D /* Name, multiple Node */ #define ACPI_TYPE_BUFFER_FIELD 0x0E #define ACPI_TYPE_DDB_HANDLE 0x0F #define ACPI_TYPE_DEBUG_OBJECT 0x10 #define ACPI_TYPE_EXTERNAL_MAX 0x10 /* * These are object types that do not map directly to the ACPI * object_type() operator. They are used for various internal purposes only. * If new predefined ACPI_TYPEs are added (via the ACPI specification), these * internal types must move upwards. (There is code that depends on these * values being contiguous with the external types above.) */ #define ACPI_TYPE_LOCAL_REGION_FIELD 0x11 #define ACPI_TYPE_LOCAL_BANK_FIELD 0x12 #define ACPI_TYPE_LOCAL_INDEX_FIELD 0x13 #define ACPI_TYPE_LOCAL_REFERENCE 0x14 /* Arg#, Local#, Name, Debug, ref_of, Index */ #define ACPI_TYPE_LOCAL_ALIAS 0x15 #define ACPI_TYPE_LOCAL_METHOD_ALIAS 0x16 #define ACPI_TYPE_LOCAL_NOTIFY 0x17 #define ACPI_TYPE_LOCAL_ADDRESS_HANDLER 0x18 #define ACPI_TYPE_LOCAL_RESOURCE 0x19 #define ACPI_TYPE_LOCAL_RESOURCE_FIELD 0x1A #define ACPI_TYPE_LOCAL_SCOPE 0x1B /* 1 Name, multiple object_list Nodes */ #define ACPI_TYPE_NS_NODE_MAX 0x1B /* Last typecode used within a NS Node */ /* * These are special object types that never appear in * a Namespace node, only in an union acpi_operand_object */ #define ACPI_TYPE_LOCAL_EXTRA 0x1C #define ACPI_TYPE_LOCAL_DATA 0x1D #define ACPI_TYPE_LOCAL_MAX 0x1D /* All types above here are invalid */ #define ACPI_TYPE_INVALID 0x1E #define ACPI_TYPE_NOT_FOUND 0xFF /* * All I/O */ #define ACPI_READ 0 #define ACPI_WRITE 1 #define ACPI_IO_MASK 1 /* * Event Types: Fixed & General Purpose */ typedef u32 acpi_event_type; /* * Fixed events */ #define ACPI_EVENT_PMTIMER 0 #define ACPI_EVENT_GLOBAL 1 #define ACPI_EVENT_POWER_BUTTON 2 #define ACPI_EVENT_SLEEP_BUTTON 3 #define ACPI_EVENT_RTC 4 #define ACPI_EVENT_MAX 4 #define ACPI_NUM_FIXED_EVENTS ACPI_EVENT_MAX + 1 /* * Event Status - Per event * ------------- * The encoding of acpi_event_status is illustrated below. * Note that a set bit (1) indicates the property is TRUE * (e.g. if bit 0 is set then the event is enabled). * +-------------+-+-+-+ * | Bits 31:3 |2|1|0| * +-------------+-+-+-+ * | | | | * | | | +- Enabled? * | | +--- Enabled for wake? * | +----- Set? 
* +----------- */ typedef u32 acpi_event_status; #define ACPI_EVENT_FLAG_DISABLED (acpi_event_status) 0x00 #define ACPI_EVENT_FLAG_ENABLED (acpi_event_status) 0x01 #define ACPI_EVENT_FLAG_WAKE_ENABLED (acpi_event_status) 0x02 #define ACPI_EVENT_FLAG_SET (acpi_event_status) 0x04 /* Notify types */ #define ACPI_SYSTEM_NOTIFY 0x1 #define ACPI_DEVICE_NOTIFY 0x2 #define ACPI_ALL_NOTIFY 0x3 #define ACPI_MAX_NOTIFY_HANDLER_TYPE 0x3 #define ACPI_MAX_SYS_NOTIFY 0x7f /* Address Space (Operation Region) Types */ typedef u8 acpi_adr_space_type; #define ACPI_ADR_SPACE_SYSTEM_MEMORY (acpi_adr_space_type) 0 #define ACPI_ADR_SPACE_SYSTEM_IO (acpi_adr_space_type) 1 #define ACPI_ADR_SPACE_PCI_CONFIG (acpi_adr_space_type) 2 #define ACPI_ADR_SPACE_EC (acpi_adr_space_type) 3 #define ACPI_ADR_SPACE_SMBUS (acpi_adr_space_type) 4 #define ACPI_ADR_SPACE_CMOS (acpi_adr_space_type) 5 #define ACPI_ADR_SPACE_PCI_BAR_TARGET (acpi_adr_space_type) 6 #define ACPI_ADR_SPACE_DATA_TABLE (acpi_adr_space_type) 7 #define ACPI_ADR_SPACE_FIXED_HARDWARE (acpi_adr_space_type) 127 /* * bit_register IDs * These are bitfields defined within the full ACPI registers */ #define ACPI_BITREG_TIMER_STATUS 0x00 #define ACPI_BITREG_BUS_MASTER_STATUS 0x01 #define ACPI_BITREG_GLOBAL_LOCK_STATUS 0x02 #define ACPI_BITREG_POWER_BUTTON_STATUS 0x03 #define ACPI_BITREG_SLEEP_BUTTON_STATUS 0x04 #define ACPI_BITREG_RT_CLOCK_STATUS 0x05 #define ACPI_BITREG_WAKE_STATUS 0x06 #define ACPI_BITREG_PCIEXP_WAKE_STATUS 0x07 #define ACPI_BITREG_TIMER_ENABLE 0x08 #define ACPI_BITREG_GLOBAL_LOCK_ENABLE 0x09 #define ACPI_BITREG_POWER_BUTTON_ENABLE 0x0A #define ACPI_BITREG_SLEEP_BUTTON_ENABLE 0x0B #define ACPI_BITREG_RT_CLOCK_ENABLE 0x0C #define ACPI_BITREG_WAKE_ENABLE 0x0D #define ACPI_BITREG_PCIEXP_WAKE_DISABLE 0x0E #define ACPI_BITREG_SCI_ENABLE 0x0F #define ACPI_BITREG_BUS_MASTER_RLD 0x10 #define ACPI_BITREG_GLOBAL_LOCK_RELEASE 0x11 #define ACPI_BITREG_SLEEP_TYPE_A 0x12 #define ACPI_BITREG_SLEEP_TYPE_B 0x13 #define ACPI_BITREG_SLEEP_ENABLE 0x14 #define ACPI_BITREG_ARB_DISABLE 0x15 #define ACPI_BITREG_MAX 0x15 #define ACPI_NUM_BITREG ACPI_BITREG_MAX + 1 /* * External ACPI object definition */ union acpi_object { acpi_object_type type; /* See definition of acpi_ns_type for values */ struct { acpi_object_type type; acpi_integer value; /* The actual number */ } integer; struct { acpi_object_type type; u32 length; /* # of bytes in string, excluding trailing null */ char *pointer; /* points to the string value */ } string; struct { acpi_object_type type; u32 length; /* # of bytes in buffer */ u8 *pointer; /* points to the buffer */ } buffer; struct { acpi_object_type type; u32 fill1; acpi_handle handle; /* object reference */ } reference; struct { acpi_object_type type; u32 count; /* # of elements in package */ union acpi_object *elements; /* Pointer to an array of ACPI_OBJECTs */ } package; struct { acpi_object_type type; u32 proc_id; acpi_io_address pblk_address; u32 pblk_length; } processor; struct { acpi_object_type type; u32 system_level; u32 resource_order; } power_resource; }; /* * List of objects, used as a parameter list for control method evaluation */ struct acpi_object_list { u32 count; union acpi_object *pointer; }; /* * Miscellaneous common Data Structures used by the interfaces */ #define ACPI_NO_BUFFER 0 #define ACPI_ALLOCATE_BUFFER (acpi_size) (-1) #define ACPI_ALLOCATE_LOCAL_BUFFER (acpi_size) (-2) struct acpi_buffer { acpi_size length; /* Length in bytes of the buffer */ void *pointer; /* pointer to buffer */ }; /* * Memory Attributes */ #define 
ACPI_READ_ONLY_MEMORY (u8) 0x00 #define ACPI_READ_WRITE_MEMORY (u8) 0x01 #define ACPI_NON_CACHEABLE_MEMORY (u8) 0x00 #define ACPI_CACHABLE_MEMORY (u8) 0x01 #define ACPI_WRITE_COMBINING_MEMORY (u8) 0x02 #define ACPI_PREFETCHABLE_MEMORY (u8) 0x03 /* * IO Attributes * The ISA IO ranges are: n000-n0_fFh, n400-n4_fFh, n800-n8_fFh, n_c00-n_cFFh. * The non-ISA IO ranges are: n100-n3_fFh, n500-n7_fFh, n900-n_bFFh, n_cd0-n_fFFh. */ #define ACPI_NON_ISA_ONLY_RANGES (u8) 0x01 #define ACPI_ISA_ONLY_RANGES (u8) 0x02 #define ACPI_ENTIRE_RANGE (ACPI_NON_ISA_ONLY_RANGES | ACPI_ISA_ONLY_RANGES) /* Type of translation - 1=Sparse, 0=Dense */ #define ACPI_SPARSE_TRANSLATION (u8) 0x01 /* * IO Port Descriptor Decode */ #define ACPI_DECODE_10 (u8) 0x00 /* 10-bit IO address decode */ #define ACPI_DECODE_16 (u8) 0x01 /* 16-bit IO address decode */ /* * IRQ Attributes */ #define ACPI_LEVEL_SENSITIVE (u8) 0x00 #define ACPI_EDGE_SENSITIVE (u8) 0x01 #define ACPI_ACTIVE_HIGH (u8) 0x00 #define ACPI_ACTIVE_LOW (u8) 0x01 #define ACPI_EXCLUSIVE (u8) 0x00 #define ACPI_SHARED (u8) 0x01 /* * DMA Attributes */ #define ACPI_COMPATIBILITY (u8) 0x00 #define ACPI_TYPE_A (u8) 0x01 #define ACPI_TYPE_B (u8) 0x02 #define ACPI_TYPE_F (u8) 0x03 #define ACPI_NOT_BUS_MASTER (u8) 0x00 #define ACPI_BUS_MASTER (u8) 0x01 #define ACPI_TRANSFER_8 (u8) 0x00 #define ACPI_TRANSFER_8_16 (u8) 0x01 #define ACPI_TRANSFER_16 (u8) 0x02 /* * Start Dependent Functions Priority definitions */ #define ACPI_GOOD_CONFIGURATION (u8) 0x00 #define ACPI_ACCEPTABLE_CONFIGURATION (u8) 0x01 #define ACPI_SUB_OPTIMAL_CONFIGURATION (u8) 0x02 /* * 16, 32 and 64-bit Address Descriptor resource types */ #define ACPI_MEMORY_RANGE (u8) 0x00 #define ACPI_IO_RANGE (u8) 0x01 #define ACPI_BUS_NUMBER_RANGE (u8) 0x02 #define ACPI_ADDRESS_NOT_FIXED (u8) 0x00 #define ACPI_ADDRESS_FIXED (u8) 0x01 #define ACPI_POS_DECODE (u8) 0x00 #define ACPI_SUB_DECODE (u8) 0x01 #define ACPI_PRODUCER (u8) 0x00 #define ACPI_CONSUMER (u8) 0x01 #endif /* __ACTYPES_H__ */ xen-4.4.0/xen/include/acpi/acmacros.h0000664000175000017500000006232412307313555015544 0ustar smbsmb/****************************************************************************** * * Name: acmacros.h - C macros for the entire subsystem. * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. 
* * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACMACROS_H__ #define __ACMACROS_H__ /* * Data manipulation macros */ #define ACPI_LOWORD(l) ((u16)(u32)(l)) #define ACPI_HIWORD(l) ((u16)((((u32)(l)) >> 16) & 0xFFFF)) #define ACPI_LOBYTE(l) ((u8)(u16)(l)) #define ACPI_HIBYTE(l) ((u8)((((u16)(l)) >> 8) & 0xFF)) #define ACPI_SET_BIT(target,bit) ((target) |= (bit)) #define ACPI_CLEAR_BIT(target,bit) ((target) &= ~(bit)) #define ACPI_MIN(a,b) (((a)<(b))?(a):(b)) #define ACPI_MAX(a,b) (((a)>(b))?(a):(b)) /* Size calculation */ #define ACPI_ARRAY_LENGTH(x) (sizeof(x) / sizeof((x)[0])) #ifdef ACPI_NO_INTEGER64_SUPPORT /* * acpi_integer is 32-bits, no 64-bit support on this platform */ #define ACPI_LODWORD(l) ((u32)(l)) #define ACPI_HIDWORD(l) ((u32)(0)) #else /* * Full 64-bit address/integer on both 32-bit and 64-bit platforms */ #define ACPI_LODWORD(l) ((u32)(u64)(l)) #define ACPI_HIDWORD(l) ((u32)(((*(struct uint64_struct *)(void *)(&l))).hi)) #endif /* * printf() format helpers */ /* Split 64-bit integer into two 32-bit values. Use with %8.8_x%8.8_x */ #define ACPI_FORMAT_UINT64(i) ACPI_HIDWORD(i),ACPI_LODWORD(i) /* * Extract data using a pointer. Any more than a byte and we * get into potential aligment issues -- see the STORE macros below. * Use with care. */ #define ACPI_GET8(ptr) *ACPI_CAST_PTR (u8, ptr) #define ACPI_GET16(ptr) *ACPI_CAST_PTR (u16, ptr) #define ACPI_GET32(ptr) *ACPI_CAST_PTR (u32, ptr) #define ACPI_GET64(ptr) *ACPI_CAST_PTR (u64, ptr) #define ACPI_SET8(ptr) *ACPI_CAST_PTR (u8, ptr) #define ACPI_SET16(ptr) *ACPI_CAST_PTR (u16, ptr) #define ACPI_SET32(ptr) *ACPI_CAST_PTR (u32, ptr) #define ACPI_SET64(ptr) *ACPI_CAST_PTR (u64, ptr) /* * Pointer manipulation */ #define ACPI_CAST_PTR(t, p) ((t *) (acpi_uintptr_t) (p)) #define ACPI_CAST_INDIRECT_PTR(t, p) ((t **) (acpi_uintptr_t) (p)) #define ACPI_ADD_PTR(t,a,b) ACPI_CAST_PTR (t, (ACPI_CAST_PTR (u8,(a)) + (acpi_native_uint)(b))) #define ACPI_PTR_DIFF(a,b) (acpi_native_uint) (ACPI_CAST_PTR (u8,(a)) - ACPI_CAST_PTR (u8,(b))) /* Pointer/Integer type conversions */ #define ACPI_TO_POINTER(i) ACPI_ADD_PTR (void,(void *) NULL,(acpi_native_uint) i) #define ACPI_TO_INTEGER(p) ACPI_PTR_DIFF (p,(void *) NULL) #define ACPI_OFFSET(d,f) (acpi_size) ACPI_PTR_DIFF (&(((d *)0)->f),(void *) NULL) #define ACPI_PHYSADDR_TO_PTR(i) ACPI_TO_POINTER(i) #define ACPI_PTR_TO_PHYSADDR(i) ACPI_TO_INTEGER(i) #ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED #define ACPI_COMPARE_NAME(a,b) (*ACPI_CAST_PTR (u32,(a)) == *ACPI_CAST_PTR (u32,(b))) #else #define ACPI_COMPARE_NAME(a,b) (!ACPI_STRNCMP (ACPI_CAST_PTR (char,(a)), ACPI_CAST_PTR (char,(b)), ACPI_NAME_SIZE)) #endif /* * Macros for moving data around to/from buffers that are possibly unaligned. * If the hardware supports the transfer of unaligned data, just do the store. 
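 * (On such hardware the little-endian ACPI_MOVE_32_TO_32(d,s) below is
 * simply *(u32 *)(void *)(d) = *(u32 *)(void *)(s).)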
* Otherwise, we have to move one byte at a time. */ #ifdef ACPI_BIG_ENDIAN /* * Macros for big-endian machines */ /* This macro sets a buffer index, starting from the end of the buffer */ #define ACPI_BUFFER_INDEX(buf_len,buf_offset,byte_gran) ((buf_len) - (((buf_offset)+1) * (byte_gran))) /* These macros reverse the bytes during the move, converting little-endian to big endian */ /* Big Endian <== Little Endian */ /* Hi...Lo Lo...Hi */ /* 16-bit source, 16/32/64 destination */ #define ACPI_MOVE_16_TO_16(d,s) {(( u8 *)(void *)(d))[0] = ((u8 *)(void *)(s))[1];\ (( u8 *)(void *)(d))[1] = ((u8 *)(void *)(s))[0];} #define ACPI_MOVE_16_TO_32(d,s) {(*(u32 *)(void *)(d))=0;\ ((u8 *)(void *)(d))[2] = ((u8 *)(void *)(s))[1];\ ((u8 *)(void *)(d))[3] = ((u8 *)(void *)(s))[0];} #define ACPI_MOVE_16_TO_64(d,s) {(*(u64 *)(void *)(d))=0;\ ((u8 *)(void *)(d))[6] = ((u8 *)(void *)(s))[1];\ ((u8 *)(void *)(d))[7] = ((u8 *)(void *)(s))[0];} /* 32-bit source, 16/32/64 destination */ #define ACPI_MOVE_32_TO_16(d,s) ACPI_MOVE_16_TO_16(d,s) /* Truncate to 16 */ #define ACPI_MOVE_32_TO_32(d,s) {(( u8 *)(void *)(d))[0] = ((u8 *)(void *)(s))[3];\ (( u8 *)(void *)(d))[1] = ((u8 *)(void *)(s))[2];\ (( u8 *)(void *)(d))[2] = ((u8 *)(void *)(s))[1];\ (( u8 *)(void *)(d))[3] = ((u8 *)(void *)(s))[0];} #define ACPI_MOVE_32_TO_64(d,s) {(*(u64 *)(void *)(d))=0;\ ((u8 *)(void *)(d))[4] = ((u8 *)(void *)(s))[3];\ ((u8 *)(void *)(d))[5] = ((u8 *)(void *)(s))[2];\ ((u8 *)(void *)(d))[6] = ((u8 *)(void *)(s))[1];\ ((u8 *)(void *)(d))[7] = ((u8 *)(void *)(s))[0];} /* 64-bit source, 16/32/64 destination */ #define ACPI_MOVE_64_TO_16(d,s) ACPI_MOVE_16_TO_16(d,s) /* Truncate to 16 */ #define ACPI_MOVE_64_TO_32(d,s) ACPI_MOVE_32_TO_32(d,s) /* Truncate to 32 */ #define ACPI_MOVE_64_TO_64(d,s) {(( u8 *)(void *)(d))[0] = ((u8 *)(void *)(s))[7];\ (( u8 *)(void *)(d))[1] = ((u8 *)(void *)(s))[6];\ (( u8 *)(void *)(d))[2] = ((u8 *)(void *)(s))[5];\ (( u8 *)(void *)(d))[3] = ((u8 *)(void *)(s))[4];\ (( u8 *)(void *)(d))[4] = ((u8 *)(void *)(s))[3];\ (( u8 *)(void *)(d))[5] = ((u8 *)(void *)(s))[2];\ (( u8 *)(void *)(d))[6] = ((u8 *)(void *)(s))[1];\ (( u8 *)(void *)(d))[7] = ((u8 *)(void *)(s))[0];} #else /* * Macros for little-endian machines */ /* This macro sets a buffer index, starting from the beginning of the buffer */ #define ACPI_BUFFER_INDEX(buf_len,buf_offset,byte_gran) (buf_offset) #ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED /* The hardware supports unaligned transfers, just do the little-endian move */ /* 16-bit source, 16/32/64 destination */ #define ACPI_MOVE_16_TO_16(d,s) *(u16 *)(void *)(d) = *(u16 *)(void *)(s) #define ACPI_MOVE_16_TO_32(d,s) *(u32 *)(void *)(d) = *(u16 *)(void *)(s) #define ACPI_MOVE_16_TO_64(d,s) *(u64 *)(void *)(d) = *(u16 *)(void *)(s) /* 32-bit source, 16/32/64 destination */ #define ACPI_MOVE_32_TO_16(d,s) ACPI_MOVE_16_TO_16(d,s) /* Truncate to 16 */ #define ACPI_MOVE_32_TO_32(d,s) *(u32 *)(void *)(d) = *(u32 *)(void *)(s) #define ACPI_MOVE_32_TO_64(d,s) *(u64 *)(void *)(d) = *(u32 *)(void *)(s) /* 64-bit source, 16/32/64 destination */ #define ACPI_MOVE_64_TO_16(d,s) ACPI_MOVE_16_TO_16(d,s) /* Truncate to 16 */ #define ACPI_MOVE_64_TO_32(d,s) ACPI_MOVE_32_TO_32(d,s) /* Truncate to 32 */ #define ACPI_MOVE_64_TO_64(d,s) *(u64 *)(void *)(d) = *(u64 *)(void *)(s) #else /* * The hardware does not support unaligned transfers. We must move the * data one byte at a time. These macros work whether the source or * the destination (or both) is/are unaligned. 
(Little-endian move) */ /* 16-bit source, 16/32/64 destination */ #define ACPI_MOVE_16_TO_16(d,s) {(( u8 *)(void *)(d))[0] = ((u8 *)(void *)(s))[0];\ (( u8 *)(void *)(d))[1] = ((u8 *)(void *)(s))[1];} #define ACPI_MOVE_16_TO_32(d,s) {(*(u32 *)(void *)(d)) = 0; ACPI_MOVE_16_TO_16(d,s);} #define ACPI_MOVE_16_TO_64(d,s) {(*(u64 *)(void *)(d)) = 0; ACPI_MOVE_16_TO_16(d,s);} /* 32-bit source, 16/32/64 destination */ #define ACPI_MOVE_32_TO_16(d,s) ACPI_MOVE_16_TO_16(d,s) /* Truncate to 16 */ #define ACPI_MOVE_32_TO_32(d,s) {(( u8 *)(void *)(d))[0] = ((u8 *)(void *)(s))[0];\ (( u8 *)(void *)(d))[1] = ((u8 *)(void *)(s))[1];\ (( u8 *)(void *)(d))[2] = ((u8 *)(void *)(s))[2];\ (( u8 *)(void *)(d))[3] = ((u8 *)(void *)(s))[3];} #define ACPI_MOVE_32_TO_64(d,s) {(*(u64 *)(void *)(d)) = 0; ACPI_MOVE_32_TO_32(d,s);} /* 64-bit source, 16/32/64 destination */ #define ACPI_MOVE_64_TO_16(d,s) ACPI_MOVE_16_TO_16(d,s) /* Truncate to 16 */ #define ACPI_MOVE_64_TO_32(d,s) ACPI_MOVE_32_TO_32(d,s) /* Truncate to 32 */ #define ACPI_MOVE_64_TO_64(d,s) {(( u8 *)(void *)(d))[0] = ((u8 *)(void *)(s))[0];\ (( u8 *)(void *)(d))[1] = ((u8 *)(void *)(s))[1];\ (( u8 *)(void *)(d))[2] = ((u8 *)(void *)(s))[2];\ (( u8 *)(void *)(d))[3] = ((u8 *)(void *)(s))[3];\ (( u8 *)(void *)(d))[4] = ((u8 *)(void *)(s))[4];\ (( u8 *)(void *)(d))[5] = ((u8 *)(void *)(s))[5];\ (( u8 *)(void *)(d))[6] = ((u8 *)(void *)(s))[6];\ (( u8 *)(void *)(d))[7] = ((u8 *)(void *)(s))[7];} #endif #endif /* Macros based on machine integer width */ #if ACPI_MACHINE_WIDTH == 32 #define ACPI_MOVE_SIZE_TO_16(d,s) ACPI_MOVE_32_TO_16(d,s) #elif ACPI_MACHINE_WIDTH == 64 #define ACPI_MOVE_SIZE_TO_16(d,s) ACPI_MOVE_64_TO_16(d,s) #else #error unknown ACPI_MACHINE_WIDTH #endif /* * Fast power-of-two math macros for non-optimized compilers */ #define _ACPI_DIV(value,power_of2) ((u32) ((value) >> (power_of2))) #define _ACPI_MUL(value,power_of2) ((u32) ((value) << (power_of2))) #define _ACPI_MOD(value,divisor) ((u32) ((value) & ((divisor) -1))) #define ACPI_DIV_2(a) _ACPI_DIV(a,1) #define ACPI_MUL_2(a) _ACPI_MUL(a,1) #define ACPI_MOD_2(a) _ACPI_MOD(a,2) #define ACPI_DIV_4(a) _ACPI_DIV(a,2) #define ACPI_MUL_4(a) _ACPI_MUL(a,2) #define ACPI_MOD_4(a) _ACPI_MOD(a,4) #define ACPI_DIV_8(a) _ACPI_DIV(a,3) #define ACPI_MUL_8(a) _ACPI_MUL(a,3) #define ACPI_MOD_8(a) _ACPI_MOD(a,8) #define ACPI_DIV_16(a) _ACPI_DIV(a,4) #define ACPI_MUL_16(a) _ACPI_MUL(a,4) #define ACPI_MOD_16(a) _ACPI_MOD(a,16) #define ACPI_DIV_32(a) _ACPI_DIV(a,5) #define ACPI_MUL_32(a) _ACPI_MUL(a,5) #define ACPI_MOD_32(a) _ACPI_MOD(a,32) /* * Rounding macros (Power of two boundaries only) */ #define ACPI_ROUND_DOWN(value,boundary) (((acpi_native_uint)(value)) & \ (~(((acpi_native_uint) boundary)-1))) #define ACPI_ROUND_UP(value,boundary) ((((acpi_native_uint)(value)) + \ (((acpi_native_uint) boundary)-1)) & \ (~(((acpi_native_uint) boundary)-1))) /* Note: sizeof(acpi_native_uint) evaluates to either 2, 4, or 8 */ #define ACPI_ROUND_DOWN_TO_32BIT(a) ACPI_ROUND_DOWN(a,4) #define ACPI_ROUND_DOWN_TO_64BIT(a) ACPI_ROUND_DOWN(a,8) #define ACPI_ROUND_DOWN_TO_NATIVE_WORD(a) ACPI_ROUND_DOWN(a,sizeof(acpi_native_uint)) #define ACPI_ROUND_UP_TO_32BIT(a) ACPI_ROUND_UP(a,4) #define ACPI_ROUND_UP_TO_64BIT(a) ACPI_ROUND_UP(a,8) #define ACPI_ROUND_UP_TO_NATIVE_WORD(a) ACPI_ROUND_UP(a,sizeof(acpi_native_uint)) #define ACPI_ROUND_BITS_UP_TO_BYTES(a) ACPI_DIV_8((a) + 7) #define ACPI_ROUND_BITS_DOWN_TO_BYTES(a) ACPI_DIV_8((a)) #define ACPI_ROUND_UP_TO_1K(a) (((a) + 1023) >> 10) /* Generic (non-power-of-two) rounding */ #define 
ACPI_ROUND_UP_TO(value,boundary) (((value) + ((boundary)-1)) / (boundary)) #define ACPI_IS_MISALIGNED(value) (((acpi_native_uint)value) & (sizeof(acpi_native_uint)-1)) /* * Bitmask creation * Bit positions start at zero. * MASK_BITS_ABOVE creates a mask starting AT the position and above * MASK_BITS_BELOW creates a mask starting one bit BELOW the position */ #define ACPI_MASK_BITS_ABOVE(position) (~((ACPI_INTEGER_MAX) << ((u32) (position)))) #define ACPI_MASK_BITS_BELOW(position) ((ACPI_INTEGER_MAX) << ((u32) (position))) #define ACPI_IS_OCTAL_DIGIT(d) (((char)(d) >= '0') && ((char)(d) <= '7')) /* Bitfields within ACPI registers */ #define ACPI_REGISTER_PREPARE_BITS(val, pos, mask) ((val << pos) & mask) #define ACPI_REGISTER_INSERT_VALUE(reg, pos, mask, val) reg = (reg & (~(mask))) | ACPI_REGISTER_PREPARE_BITS(val, pos, mask) #define ACPI_INSERT_BITS(target, mask, source) target = ((target & (~(mask))) | (source & mask)) /* Generate a UUID */ #define ACPI_INIT_UUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \ (a) & 0xFF, ((a) >> 8) & 0xFF, ((a) >> 16) & 0xFF, ((a) >> 24) & 0xFF, \ (b) & 0xFF, ((b) >> 8) & 0xFF, \ (c) & 0xFF, ((c) >> 8) & 0xFF, \ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) /* * An struct acpi_namespace_node * can appear in some contexts, * where a pointer to an union acpi_operand_object can also * appear. This macro is used to distinguish them. * * The "Descriptor" field is the first field in both structures. */ #define ACPI_GET_DESCRIPTOR_TYPE(d) (((union acpi_descriptor *)(void *)(d))->common.descriptor_type) #define ACPI_SET_DESCRIPTOR_TYPE(d,t) (((union acpi_descriptor *)(void *)(d))->common.descriptor_type = t) /* Macro to test the object type */ #define ACPI_GET_OBJECT_TYPE(d) (((union acpi_operand_object *)(void *)(d))->common.type) /* Macro to check the table flags for SINGLE or MULTIPLE tables are allowed */ #define ACPI_IS_SINGLE_TABLE(x) (((x) & 0x01) == ACPI_TABLE_SINGLE ? 
1 : 0) /* * Macros for the master AML opcode table */ #if defined(ACPI_DISASSEMBLER) || defined (ACPI_DEBUG_OUTPUT) #define ACPI_OP(name,Pargs,Iargs,obj_type,class,type,flags) {name,(u32)(Pargs),(u32)(Iargs),(u32)(flags),obj_type,class,type} #else #define ACPI_OP(name,Pargs,Iargs,obj_type,class,type,flags) {(u32)(Pargs),(u32)(Iargs),(u32)(flags),obj_type,class,type} #endif #ifdef ACPI_DISASSEMBLER #define ACPI_DISASM_ONLY_MEMBERS(a) a; #else #define ACPI_DISASM_ONLY_MEMBERS(a) #endif #define ARG_TYPE_WIDTH 5 #define ARG_1(x) ((u32)(x)) #define ARG_2(x) ((u32)(x) << (1 * ARG_TYPE_WIDTH)) #define ARG_3(x) ((u32)(x) << (2 * ARG_TYPE_WIDTH)) #define ARG_4(x) ((u32)(x) << (3 * ARG_TYPE_WIDTH)) #define ARG_5(x) ((u32)(x) << (4 * ARG_TYPE_WIDTH)) #define ARG_6(x) ((u32)(x) << (5 * ARG_TYPE_WIDTH)) #define ARGI_LIST1(a) (ARG_1(a)) #define ARGI_LIST2(a,b) (ARG_1(b)|ARG_2(a)) #define ARGI_LIST3(a,b,c) (ARG_1(c)|ARG_2(b)|ARG_3(a)) #define ARGI_LIST4(a,b,c,d) (ARG_1(d)|ARG_2(c)|ARG_3(b)|ARG_4(a)) #define ARGI_LIST5(a,b,c,d,e) (ARG_1(e)|ARG_2(d)|ARG_3(c)|ARG_4(b)|ARG_5(a)) #define ARGI_LIST6(a,b,c,d,e,f) (ARG_1(f)|ARG_2(e)|ARG_3(d)|ARG_4(c)|ARG_5(b)|ARG_6(a)) #define ARGP_LIST1(a) (ARG_1(a)) #define ARGP_LIST2(a,b) (ARG_1(a)|ARG_2(b)) #define ARGP_LIST3(a,b,c) (ARG_1(a)|ARG_2(b)|ARG_3(c)) #define ARGP_LIST4(a,b,c,d) (ARG_1(a)|ARG_2(b)|ARG_3(c)|ARG_4(d)) #define ARGP_LIST5(a,b,c,d,e) (ARG_1(a)|ARG_2(b)|ARG_3(c)|ARG_4(d)|ARG_5(e)) #define ARGP_LIST6(a,b,c,d,e,f) (ARG_1(a)|ARG_2(b)|ARG_3(c)|ARG_4(d)|ARG_5(e)|ARG_6(f)) #define GET_CURRENT_ARG_TYPE(list) (list & ((u32) 0x1F)) #define INCREMENT_ARG_LIST(list) (list >>= ((u32) ARG_TYPE_WIDTH)) #if defined (ACPI_DEBUG_OUTPUT) || !defined (ACPI_NO_ERROR_MESSAGES) /* * Module name is include in both debug and non-debug versions primarily for * error messages. The __FILE__ macro is not very useful for this, because it * often includes the entire pathname to the module */ #define ACPI_MODULE_NAME(name) static const char ACPI_UNUSED_VAR _acpi_module_name[] = name; #else #define ACPI_MODULE_NAME(name) #endif /* * Ascii error messages can be configured out */ #ifndef ACPI_NO_ERROR_MESSAGES #define AE_INFO _acpi_module_name, __LINE__ /* * Error reporting. Callers module and line number are inserted by AE_INFO, * the plist contains a set of parens to allow variable-length lists. * These macros are used for both the debug and non-debug versions of the code. */ #define ACPI_INFO(plist) acpi_ut_info plist #define ACPI_WARNING(plist) acpi_ut_warning plist #define ACPI_ERROR(plist) acpi_ut_error plist #define ACPI_ERROR_NAMESPACE(s,e) acpi_ns_report_error (AE_INFO, s, e); #define ACPI_ERROR_METHOD(s,n,p,e) acpi_ns_report_method_error (AE_INFO, s, n, p, e); #else /* No error messages */ #define ACPI_INFO(plist) #define ACPI_WARNING(plist) #define ACPI_ERROR(plist) #define ACPI_ERROR_NAMESPACE(s,e) #define ACPI_ERROR_METHOD(s,n,p,e) #endif /* * Debug macros that are conditionally compiled */ #ifdef ACPI_DEBUG_OUTPUT /* * Common parameters used for debug output functions: * line number, function name, module(file) name, component ID */ #define ACPI_DEBUG_PARAMETERS __LINE__, ACPI_GET_FUNCTION_NAME, _acpi_module_name, _COMPONENT /* * Function entry tracing */ /* * If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header, * define it now. This is the case where there the compiler does not support * a __FUNCTION__ macro or equivalent. We save the function name on the * local stack. 
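 *
 * In that case ACPI_FUNCTION_NAME(my_function) below declares a local
 *     const char *_acpi_function_name = "my_function";
 * which the trace and error macros then reference.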
*/ #ifndef ACPI_GET_FUNCTION_NAME #define ACPI_GET_FUNCTION_NAME _acpi_function_name /* * The Name parameter should be the procedure name as a quoted string. * This is declared as a local string ("MyFunctionName") so that it can * be also used by the function exit macros below. * Note: (const char) is used to be compatible with the debug interfaces * and macros such as __FUNCTION__. */ #define ACPI_FUNCTION_NAME(name) const char *_acpi_function_name = #name; #else /* Compiler supports __FUNCTION__ (or equivalent) -- Ignore this macro */ #define ACPI_FUNCTION_NAME(name) #endif #ifdef CONFIG_ACPI_DEBUG_FUNC_TRACE #define ACPI_FUNCTION_TRACE(a) ACPI_FUNCTION_NAME(a) \ acpi_ut_trace(ACPI_DEBUG_PARAMETERS) #define ACPI_FUNCTION_TRACE_PTR(a,b) ACPI_FUNCTION_NAME(a) \ acpi_ut_trace_ptr(ACPI_DEBUG_PARAMETERS,(void *)b) #define ACPI_FUNCTION_TRACE_U32(a,b) ACPI_FUNCTION_NAME(a) \ acpi_ut_trace_u32(ACPI_DEBUG_PARAMETERS,(u32)b) #define ACPI_FUNCTION_TRACE_STR(a,b) ACPI_FUNCTION_NAME(a) \ acpi_ut_trace_str(ACPI_DEBUG_PARAMETERS,(char *)b) #define ACPI_FUNCTION_ENTRY() acpi_ut_track_stack_ptr() /* * Function exit tracing. * WARNING: These macros include a return statement. This is usually considered * bad form, but having a separate exit macro is very ugly and difficult to maintain. * One of the FUNCTION_TRACE macros above must be used in conjunction with these macros * so that "_AcpiFunctionName" is defined. * * Note: the DO_WHILE0 macro is used to prevent some compilers from complaining * about these constructs. */ #ifdef ACPI_USE_DO_WHILE_0 #define ACPI_DO_WHILE0(a) do a while(0) #else #define ACPI_DO_WHILE0(a) a #endif #define return_VOID ACPI_DO_WHILE0 ({ \ acpi_ut_exit (ACPI_DEBUG_PARAMETERS); \ return;}) /* * There are two versions of most of the return macros. The default version is * safer, since it avoids side-effects by guaranteeing that the argument will * not be evaluated twice. * * A less-safe version of the macros is provided for optional use if the * compiler uses excessive CPU stack (for example, this may happen in the * debug case if code optimzation is disabled.) 
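 *
 * For example, with the less-safe macros a statement like
 *     return_ACPI_STATUS(some_status_returning_call());
 * expands the call twice (once for acpi_ut_status_exit, once for the
 * return), whereas the default macros capture it in a register variable
 * first.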
*/ #ifndef ACPI_SIMPLE_RETURN_MACROS #define return_ACPI_STATUS(s) ACPI_DO_WHILE0 ({ \ register acpi_status _s = (s); \ acpi_ut_status_exit (ACPI_DEBUG_PARAMETERS, _s); \ return (_s); }) #define return_PTR(s) ACPI_DO_WHILE0 ({ \ register void *_s = (void *) (s); \ acpi_ut_ptr_exit (ACPI_DEBUG_PARAMETERS, (u8 *) _s); \ return (_s); }) #define return_VALUE(s) ACPI_DO_WHILE0 ({ \ register acpi_integer _s = (s); \ acpi_ut_value_exit (ACPI_DEBUG_PARAMETERS, _s); \ return (_s); }) #define return_UINT8(s) ACPI_DO_WHILE0 ({ \ register u8 _s = (u8) (s); \ acpi_ut_value_exit (ACPI_DEBUG_PARAMETERS, (acpi_integer) _s); \ return (_s); }) #define return_UINT32(s) ACPI_DO_WHILE0 ({ \ register u32 _s = (u32) (s); \ acpi_ut_value_exit (ACPI_DEBUG_PARAMETERS, (acpi_integer) _s); \ return (_s); }) #else /* Use original less-safe macros */ #define return_ACPI_STATUS(s) ACPI_DO_WHILE0 ({ \ acpi_ut_status_exit (ACPI_DEBUG_PARAMETERS, (s)); \ return((s)); }) #define return_PTR(s) ACPI_DO_WHILE0 ({ \ acpi_ut_ptr_exit (ACPI_DEBUG_PARAMETERS, (u8 *) (s)); \ return((s)); }) #define return_VALUE(s) ACPI_DO_WHILE0 ({ \ acpi_ut_value_exit (ACPI_DEBUG_PARAMETERS, (acpi_integer) (s)); \ return((s)); }) #define return_UINT8(s) return_VALUE(s) #define return_UINT32(s) return_VALUE(s) #endif /* ACPI_SIMPLE_RETURN_MACROS */ #else /* !CONFIG_ACPI_DEBUG_FUNC_TRACE */ #define ACPI_FUNCTION_TRACE(a) #define ACPI_FUNCTION_TRACE_PTR(a,b) #define ACPI_FUNCTION_TRACE_U32(a,b) #define ACPI_FUNCTION_TRACE_STR(a,b) #define ACPI_FUNCTION_EXIT #define ACPI_FUNCTION_STATUS_EXIT(s) #define ACPI_FUNCTION_VALUE_EXIT(s) #define ACPI_FUNCTION_TRACE(a) #define ACPI_FUNCTION_ENTRY() #define return_VOID return #define return_ACPI_STATUS(s) return(s) #define return_VALUE(s) return(s) #define return_UINT8(s) return(s) #define return_UINT32(s) return(s) #define return_PTR(s) return(s) #endif /* CONFIG_ACPI_DEBUG_FUNC_TRACE */ /* Conditional execution */ #define ACPI_DEBUG_EXEC(a) a #define ACPI_NORMAL_EXEC(a) #define ACPI_DEBUG_DEFINE(a) a; #define ACPI_DEBUG_ONLY_MEMBERS(a) a; #define _VERBOSE_STRUCTURES /* * Master debug print macros * Print iff: * 1) Debug print for the current component is enabled * 2) Debug error level or trace level for the print statement is enabled */ #define ACPI_DEBUG_PRINT(plist) acpi_ut_debug_print plist #define ACPI_DEBUG_PRINT_RAW(plist) acpi_ut_debug_print_raw plist #else /* * This is the non-debug case -- make everything go away, * leaving no executable debug code! 
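 * Every ACPI_DEBUG_PRINT(...), ACPI_FUNCTION_TRACE(...) and related
 * invocation below therefore compiles to an empty do { } while(0)
 * statement, and return_ACPI_STATUS(s) becomes a plain return(s).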
*/ #define ACPI_DEBUG_EXEC(a) #define ACPI_NORMAL_EXEC(a) a; #define ACPI_DEBUG_DEFINE(a) do { } while(0) #define ACPI_DEBUG_ONLY_MEMBERS(a) do { } while(0) #define ACPI_FUNCTION_NAME(a) do { } while(0) #define ACPI_FUNCTION_TRACE(a) do { } while(0) #define ACPI_FUNCTION_TRACE_PTR(a,b) do { } while(0) #define ACPI_FUNCTION_TRACE_U32(a,b) do { } while(0) #define ACPI_FUNCTION_TRACE_STR(a,b) do { } while(0) #define ACPI_FUNCTION_EXIT do { } while(0) #define ACPI_FUNCTION_STATUS_EXIT(s) do { } while(0) #define ACPI_FUNCTION_VALUE_EXIT(s) do { } while(0) #define ACPI_FUNCTION_ENTRY() do { } while(0) #define ACPI_DEBUG_PRINT(pl) do { } while(0) #define ACPI_DEBUG_PRINT_RAW(pl) do { } while(0) #define return_VOID return #define return_ACPI_STATUS(s) return(s) #define return_VALUE(s) return(s) #define return_UINT8(s) return(s) #define return_UINT32(s) return(s) #define return_PTR(s) return(s) #endif #endif /* ACMACROS_H */ xen-4.4.0/xen/include/acpi/acglobal.h0000664000175000017500000001022112307313555015505 0ustar smbsmb/****************************************************************************** * * Name: acglobal.h - Declarations for global variables * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACGLOBAL_H__ #define __ACGLOBAL_H__ /* * Ensure that the globals are actually defined and initialized only once. * * The use of these macros allows a single list of globals (here) in order * to simplify maintenance of the code. 
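 *
 * For example, a line such as
 *     ACPI_EXTERN struct acpi_table_fadt acpi_gbl_FADT;
 * becomes a definition in the single translation unit that defines
 * DEFINE_ACPI_GLOBALS before including this header, and an ordinary
 * extern declaration everywhere else.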
*/ #ifdef DEFINE_ACPI_GLOBALS #define ACPI_EXTERN #define ACPI_INIT_GLOBAL(a,b) a=b #else #define ACPI_EXTERN extern #define ACPI_INIT_GLOBAL(a,b) a #endif /***************************************************************************** * * ACPI Table globals * ****************************************************************************/ /* * acpi_gbl_root_table_list is the master list of ACPI tables found in the * RSDT/XSDT. * * acpi_gbl_FADT is a local copy of the FADT, converted to a common format. */ ACPI_EXTERN struct acpi_internal_rsdt acpi_gbl_root_table_list; ACPI_EXTERN struct acpi_table_fadt acpi_gbl_FADT; /* These addresses are calculated from FADT address values */ ACPI_EXTERN struct acpi_generic_address acpi_gbl_xpm1a_enable; ACPI_EXTERN struct acpi_generic_address acpi_gbl_xpm1b_enable; /* * ACPI 5.0 introduces the concept of a "reduced hardware platform", meaning * that the ACPI hardware is no longer required. A flag in the FADT indicates * a reduced HW machine, and that flag is duplicated here for convenience. */ ACPI_EXTERN u8 acpi_gbl_reduced_hardware; /***************************************************************************** * * Miscellaneous globals * ****************************************************************************/ #ifndef DEFINE_ACPI_GLOBALS extern char const *acpi_gbl_exception_names_env[]; extern char const *acpi_gbl_exception_names_pgm[]; extern char const *acpi_gbl_exception_names_tbl[]; extern char const *acpi_gbl_exception_names_aml[]; extern char const *acpi_gbl_exception_names_ctrl[]; #endif /***************************************************************************** * * Hardware globals * ****************************************************************************/ extern struct acpi_bit_register_info acpi_gbl_bit_register_info[ACPI_NUM_BITREG]; #endif /* __ACGLOBAL_H__ */ xen-4.4.0/xen/include/acpi/actbl3.h0000664000175000017500000003373712307313555015132 0ustar smbsmb/****************************************************************************** * * Name: actbl3.h - ACPI Table Definitions * *****************************************************************************/ /* * Copyright (C) 2000 - 2012, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACTBL3_H__ #define __ACTBL3_H__ /******************************************************************************* * * Additional ACPI Tables (3) * * These tables are not consumed directly by the ACPICA subsystem, but are * included here to support device drivers and the AML disassembler. * * The tables in this file are fully defined within the ACPI specification. * ******************************************************************************/ /* * Values for description table header signatures for tables defined in this * file. Useful because they make it more difficult to inadvertently type in * the wrong signature. */ #define ACPI_SIG_BGRT "BGRT" /* Boot Graphics Resource Table */ #define ACPI_SIG_DRTM "DRTM" /* Dynamic Root of Trust for Measurement table */ #define ACPI_SIG_FPDT "FPDT" /* Firmware Performance Data Table */ #define ACPI_SIG_GTDT "GTDT" /* Generic Timer Description Table */ #define ACPI_SIG_MPST "MPST" /* Memory Power State Table */ #define ACPI_SIG_PCCT "PCCT" /* Platform Communications Channel Table */ #define ACPI_SIG_PMTT "PMTT" /* Platform Memory Topology Table */ #define ACPI_SIG_RASF "RASF" /* RAS Feature table */ #define ACPI_SIG_S3PT "S3PT" /* S3 Performance (sub)Table */ #define ACPI_SIG_PCCS "PCC" /* PCC Shared Memory Region */ /* Reserved table signatures */ #define ACPI_SIG_CSRT "CSRT" /* Core System Resources Table */ #define ACPI_SIG_MATR "MATR" /* Memory Address Translation Table */ #define ACPI_SIG_MSDM "MSDM" /* Microsoft Data Management Table */ #define ACPI_SIG_WPBT "WPBT" /* Windows Platform Binary Table */ /* * All tables must be byte-packed to match the ACPI specification, since * the tables are provided by the system BIOS. */ #pragma pack(1) /* * Note: C bitfields are not used for this reason: * * "Bitfields are great and easy to read, but unfortunately the C language * does not specify the layout of bitfields in memory, which means they are * essentially useless for dealing with packed data in on-disk formats or * binary wire protocols." (Or ACPI tables and buffers.) "If you ask me, * this decision was a design error in C. Ritchie could have picked an order * and stuck with it." Norman Ramsey. 
* See http://stackoverflow.com/a/1053662/41661 */ /******************************************************************************* * * BGRT - Boot Graphics Resource Table (ACPI 5.0) * Version 1 * ******************************************************************************/ struct acpi_table_bgrt { struct acpi_table_header header; /* Common ACPI table header */ u16 version; u8 status; u8 image_type; u64 image_address; u32 image_offset_x; u32 image_offset_y; }; /******************************************************************************* * * DRTM - Dynamic Root of Trust for Measurement table * ******************************************************************************/ struct acpi_table_drtm { struct acpi_table_header header; /* Common ACPI table header */ u64 entry_base_address; u64 entry_length; u32 entry_address32; u64 entry_address64; u64 exit_address; u64 log_area_address; u32 log_area_length; u64 arch_dependent_address; u32 flags; }; /* 1) Validated Tables List */ struct acpi_drtm_vtl_list { u32 validated_table_list_count; }; /* 2) Resources List */ struct acpi_drtm_resource_list { u32 resource_list_count; }; /* 3) Platform-specific Identifiers List */ struct acpi_drtm_id_list { u32 id_list_count; }; /******************************************************************************* * * FPDT - Firmware Performance Data Table (ACPI 5.0) * Version 1 * ******************************************************************************/ struct acpi_table_fpdt { struct acpi_table_header header; /* Common ACPI table header */ }; /* FPDT subtable header */ struct acpi_fpdt_header { u16 type; u8 length; u8 revision; }; /* Values for Type field above */ enum acpi_fpdt_type { ACPI_FPDT_TYPE_BOOT = 0, ACPI_FPDT_TYPE_S3PERF = 1, }; /* * FPDT subtables */ /* 0: Firmware Basic Boot Performance Record */ struct acpi_fpdt_boot { struct acpi_fpdt_header header; u8 reserved[4]; u64 reset_end; u64 load_start; u64 startup_start; u64 exit_services_entry; u64 exit_services_exit; }; /* 1: S3 Performance Table Pointer Record */ struct acpi_fpdt_s3pt_ptr { struct acpi_fpdt_header header; u8 reserved[4]; u64 address; }; /* * S3PT - S3 Performance Table. This table is pointed to by the * FPDT S3 Pointer Record above. 
*/ struct acpi_table_s3pt { u8 signature[4]; /* "S3PT" */ u32 length; }; /* * S3PT Subtables */ struct acpi_s3pt_header { u16 type; u8 length; u8 revision; }; /* Values for Type field above */ enum acpi_s3pt_type { ACPI_S3PT_TYPE_RESUME = 0, ACPI_S3PT_TYPE_SUSPEND = 1, }; struct acpi_s3pt_resume { struct acpi_s3pt_header header; u32 resume_count; u64 full_resume; u64 average_resume; }; struct acpi_s3pt_suspend { struct acpi_s3pt_header header; u64 suspend_start; u64 suspend_end; }; /******************************************************************************* * * GTDT - Generic Timer Description Table (ACPI 5.0) * Version 1 * ******************************************************************************/ struct acpi_table_gtdt { struct acpi_table_header header; /* Common ACPI table header */ u64 address; u32 flags; u32 secure_pl1_interrupt; u32 secure_pl1_flags; u32 non_secure_pl1_interrupt; u32 non_secure_pl1_flags; u32 virtual_timer_interrupt; u32 virtual_timer_flags; u32 non_secure_pl2_interrupt; u32 non_secure_pl2_flags; }; /* Values for Flags field above */ #define ACPI_GTDT_MAPPED_BLOCK_PRESENT 1 /* Values for all "TimerFlags" fields above */ #define ACPI_GTDT_INTERRUPT_MODE 1 #define ACPI_GTDT_INTERRUPT_POLARITY 2 /******************************************************************************* * * MPST - Memory Power State Table (ACPI 5.0) * Version 1 * ******************************************************************************/ #define ACPI_MPST_CHANNEL_INFO \ u16 reserved1; \ u8 channel_id; \ u8 reserved2; \ u16 power_node_count; /* Main table */ struct acpi_table_mpst { struct acpi_table_header header; /* Common ACPI table header */ ACPI_MPST_CHANNEL_INFO /* Platform Communication Channel */ }; /* Memory Platform Communication Channel Info */ struct acpi_mpst_channel { ACPI_MPST_CHANNEL_INFO /* Platform Communication Channel */ }; /* Memory Power Node Structure */ struct acpi_mpst_power_node { u8 flags; u8 reserved1; u16 node_id; u32 length; u64 range_address; u64 range_length; u8 num_power_states; u8 num_physical_components; u16 reserved2; }; /* Values for Flags field above */ #define ACPI_MPST_ENABLED 1 #define ACPI_MPST_POWER_MANAGED 2 #define ACPI_MPST_HOT_PLUG_CAPABLE 4 /* Memory Power State Structure (follows POWER_NODE above) */ struct acpi_mpst_power_state { u8 power_state; u8 info_index; }; /* Physical Component ID Structure (follows POWER_STATE above) */ struct acpi_mpst_component { u16 component_id; }; /* Memory Power State Characteristics Structure (follows all POWER_NODEs) */ struct acpi_mpst_data_hdr { u16 characteristics_count; }; struct acpi_mpst_power_data { u8 revision; u8 flags; u16 reserved1; u32 average_power; u32 power_saving; u64 exit_latency; u64 reserved2; }; /* Values for Flags field above */ #define ACPI_MPST_PRESERVE 1 #define ACPI_MPST_AUTOENTRY 2 #define ACPI_MPST_AUTOEXIT 4 /* Shared Memory Region (not part of an ACPI table) */ struct acpi_mpst_shared { u32 signature; u16 pcc_command; u16 pcc_status; u16 command_register; u16 status_register; u16 power_state_id; u16 power_node_id; u64 energy_consumed; u64 average_power; }; /******************************************************************************* * * PCCT - Platform Communications Channel Table (ACPI 5.0) * Version 1 * ******************************************************************************/ struct acpi_table_pcct { struct acpi_table_header header; /* Common ACPI table header */ u32 flags; u32 latency; u32 reserved; }; /* Values for Flags field above */ #define ACPI_PCCT_DOORBELL 1 
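/*
 * [Editor's illustrative sketch -- not part of the upstream ACPICA header.]
 * Because every table in this file is a byte-packed image handed over by the
 * firmware (see the #pragma pack(1) note above), a consumer normally overlays
 * these structs directly on the mapped table and validates the image by
 * summing all of its bytes: per the ACPI specification the sum must be zero
 * modulo 256. The helper below is a minimal, hypothetical sketch of that
 * check; the function name is invented for illustration only (ACPICA's real
 * helper, acpi_tb_checksum(), is declared in actables.h later in this tree).
 */
static inline u8 example_acpi_table_sum(const void *table, u32 length)
{
	const u8 *p = table;	/* walk the table byte by byte */
	u8 sum = 0;

	while (length--)
		sum += *p++;

	return sum;		/* 0 => checksum is valid */
}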
/* * PCCT subtables */ /* 0: Generic Communications Subspace */ struct acpi_pcct_subspace { struct acpi_subtable_header header; u8 reserved[6]; u64 base_address; u64 length; struct acpi_generic_address doorbell_register; u64 preserve_mask; u64 write_mask; }; /* * PCC memory structures (not part of the ACPI table) */ /* Shared Memory Region */ struct acpi_pcct_shared_memory { u32 signature; u16 command; u16 status; }; /******************************************************************************* * * PMTT - Platform Memory Topology Table (ACPI 5.0) * Version 1 * ******************************************************************************/ struct acpi_table_pmtt { struct acpi_table_header header; /* Common ACPI table header */ u32 reserved; }; /* Common header for PMTT subtables that follow main table */ struct acpi_pmtt_header { u8 type; u8 reserved1; u16 length; u16 flags; u16 reserved2; }; /* Values for Type field above */ #define ACPI_PMTT_TYPE_SOCKET 0 #define ACPI_PMTT_TYPE_CONTROLLER 1 #define ACPI_PMTT_TYPE_DIMM 2 #define ACPI_PMTT_TYPE_RESERVED 3 /* 0x03-0xFF are reserved */ /* Values for Flags field above */ #define ACPI_PMTT_TOP_LEVEL 0x0001 #define ACPI_PMTT_PHYSICAL 0x0002 #define ACPI_PMTT_MEMORY_TYPE 0x000C /* * PMTT subtables, correspond to Type in struct acpi_pmtt_header */ /* 0: Socket Structure */ struct acpi_pmtt_socket { struct acpi_pmtt_header header; u16 socket_id; u16 reserved; }; /* 1: Memory Controller subtable */ struct acpi_pmtt_controller { struct acpi_pmtt_header header; u32 read_latency; u32 write_latency; u32 read_bandwidth; u32 write_bandwidth; u16 access_width; u16 alignment; u16 reserved; u16 domain_count; }; /* 1a: Proximity Domain substructure */ struct acpi_pmtt_domain { u32 proximity_domain; }; /* 2: Physical Component Identifier (DIMM) */ struct acpi_pmtt_physical_component { struct acpi_pmtt_header header; u16 component_id; u16 reserved; u32 memory_size; u32 bios_handle; }; /******************************************************************************* * * RASF - RAS Feature Table (ACPI 5.0) * Version 1 * ******************************************************************************/ struct acpi_table_rasf { struct acpi_table_header header; /* Common ACPI table header */ u8 channel_id[12]; }; /* RASF Platform Communication Channel Shared Memory Region */ struct acpi_rasf_shared_memory { u32 signature; u16 command; u16 status; u64 requested_address; u64 requested_length; u64 actual_address; u64 actual_length; u16 flags; u8 speed; }; /* Masks for Flags and Speed fields above */ #define ACPI_RASF_SCRUBBER_RUNNING 1 #define ACPI_RASF_SPEED (7<<1) /* Channel Commands */ enum acpi_rasf_commands { ACPI_RASF_GET_RAS_CAPABILITIES = 1, ACPI_RASF_GET_PATROL_PARAMETERS = 2, ACPI_RASF_START_PATROL_SCRUBBER = 3, ACPI_RASF_STOP_PATROL_SCRUBBER = 4 }; /* Channel Command flags */ #define ACPI_RASF_GENERATE_SCI (1<<15) /* Status values */ enum acpi_rasf_status { ACPI_RASF_SUCCESS = 0, ACPI_RASF_NOT_VALID = 1, ACPI_RASF_NOT_SUPPORTED = 2, ACPI_RASF_BUSY = 3, ACPI_RASF_FAILED = 4, ACPI_RASF_ABORTED = 5, ACPI_RASF_INVALID_DATA = 6 }; /* Status flags */ #define ACPI_RASF_COMMAND_COMPLETE (1) #define ACPI_RASF_SCI_DOORBELL (1<<1) #define ACPI_RASF_ERROR (1<<2) #define ACPI_RASF_STATUS (0x1F<<3) /* Reset to default packing */ #pragma pack() #endif /* __ACTBL3_H__ */ xen-4.4.0/xen/include/acpi/acconfig.h0000664000175000017500000001626712307313555015532 0ustar smbsmb/****************************************************************************** * * Name: acconfig.h - 
Global configuration constants * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef _ACCONFIG_H #define _ACCONFIG_H /****************************************************************************** * * Configuration options * *****************************************************************************/ /* * ACPI_DEBUG_OUTPUT - This switch enables all the debug facilities of the * ACPI subsystem. This includes the DEBUG_PRINT output * statements. When disabled, all DEBUG_PRINT * statements are compiled out. * * ACPI_APPLICATION - Use this switch if the subsystem is going to be run * at the application level. * */ /* Current ACPICA subsystem version in YYYYMMDD format */ #define ACPI_CA_VERSION 0x20070126 /* * OS name, used for the _OS object. The _OS object is essentially obsolete, * but there is a large base of ASL/AML code in existing machines that check * for the string below. The use of this string usually guarantees that * the ASL will execute down the most tested code path. Also, there is some * code that will not execute the _OSI method unless _OS matches the string * below. Therefore, change this string at your own risk. */ #define ACPI_OS_NAME "Microsoft Windows NT" /* Maximum objects in the various object caches */ #define ACPI_MAX_STATE_CACHE_DEPTH 96 /* State objects */ #define ACPI_MAX_PARSE_CACHE_DEPTH 96 /* Parse tree objects */ #define ACPI_MAX_EXTPARSE_CACHE_DEPTH 96 /* Parse tree objects */ #define ACPI_MAX_OBJECT_CACHE_DEPTH 96 /* Interpreter operand objects */ #define ACPI_MAX_NAMESPACE_CACHE_DEPTH 96 /* Namespace objects */ /* * Should the subsystem abort the loading of an ACPI table if the * table checksum is incorrect? 
*/ #define ACPI_CHECKSUM_ABORT FALSE /****************************************************************************** * * Subsystem Constants * *****************************************************************************/ /* Version of ACPI supported */ #define ACPI_CA_SUPPORT_LEVEL 3 /* Maximum count for a semaphore object */ #define ACPI_MAX_SEMAPHORE_COUNT 256 /* Maximum object reference count (detects object deletion issues) */ #define ACPI_MAX_REFERENCE_COUNT 0x1000 /* Size of cached memory mapping for system memory operation region */ #define ACPI_SYSMEM_REGION_WINDOW_SIZE 4096 /* owner_id tracking. 8 entries allow for 255 owner_ids */ #define ACPI_NUM_OWNERID_MASKS 8 /* Size of the root table array is increased by this increment */ #define ACPI_ROOT_TABLE_SIZE_INCREMENT 4 /****************************************************************************** * * ACPI Specification constants (Do not change unless the specification changes) * *****************************************************************************/ /* Number of distinct GPE register blocks and register width */ #define ACPI_MAX_GPE_BLOCKS 2 #define ACPI_GPE_REGISTER_WIDTH 8 /* Method info (in WALK_STATE), containing local variables and arguments */ #define ACPI_METHOD_NUM_LOCALS 8 #define ACPI_METHOD_MAX_LOCAL 7 #define ACPI_METHOD_NUM_ARGS 7 #define ACPI_METHOD_MAX_ARG 6 /* Length of _HID, _UID, _CID, and UUID values */ #define ACPI_DEVICE_ID_LENGTH 0x09 #define ACPI_MAX_CID_LENGTH 48 #define ACPI_UUID_LENGTH 16 /* * Operand Stack (in WALK_STATE), must be large enough to contain METHOD_MAX_ARG */ #define ACPI_OBJ_NUM_OPERANDS 8 #define ACPI_OBJ_MAX_OPERAND 7 /* Names within the namespace are 4 bytes long */ #define ACPI_NAME_SIZE 4 #define ACPI_PATH_SEGMENT_LENGTH 5 /* 4 chars for name + 1 char for separator */ #define ACPI_PATH_SEPARATOR '.' /* Sizes for ACPI table headers */ #define ACPI_OEM_ID_SIZE 6 #define ACPI_OEM_TABLE_ID_SIZE 8 /* Constants used in searching for the RSDP in low memory */ #define ACPI_EBDA_PTR_LOCATION 0x0000040E /* Physical Address */ #define ACPI_EBDA_PTR_LENGTH 2 #define ACPI_EBDA_WINDOW_SIZE 1024 #define ACPI_HI_RSDP_WINDOW_BASE 0x000E0000 /* Physical Address */ #define ACPI_HI_RSDP_WINDOW_SIZE 0x00020000 #define ACPI_RSDP_SCAN_STEP 16 /* Operation regions */ #define ACPI_NUM_PREDEFINED_REGIONS 8 #define ACPI_USER_REGION_BEGIN 0x80 /* Maximum space_ids for Operation Regions */ #define ACPI_MAX_ADDRESS_SPACE 255 /* Array sizes. 
Used for range checking also */ #define ACPI_MAX_MATCH_OPCODE 5 /* RSDP checksums */ #define ACPI_RSDP_CHECKSUM_LENGTH 20 #define ACPI_RSDP_XCHECKSUM_LENGTH 36 /* SMBus bidirectional buffer size */ #define ACPI_SMBUS_BUFFER_SIZE 34 /****************************************************************************** * * ACPI AML Debugger * *****************************************************************************/ #define ACPI_DEBUGGER_MAX_ARGS 8 /* Must be max method args + 1 */ #define ACPI_DEBUGGER_COMMAND_PROMPT '-' #define ACPI_DEBUGGER_EXECUTE_PROMPT '%' #endif /* _ACCONFIG_H */ xen-4.4.0/xen/include/acpi/cpufreq/0000775000175000017500000000000012307313555015241 5ustar smbsmbxen-4.4.0/xen/include/acpi/cpufreq/processor_perf.h0000664000175000017500000000324512307313555020451 0ustar smbsmb#ifndef __XEN_PROCESSOR_PM_H__ #define __XEN_PROCESSOR_PM_H__ #include #include #include #define XEN_PX_INIT 0x80000000 int powernow_cpufreq_init(void); unsigned int powernow_register_driver(void); unsigned int get_measured_perf(unsigned int cpu, unsigned int flag); void cpufreq_residency_update(unsigned int, uint8_t); void cpufreq_statistic_update(unsigned int, uint8_t, uint8_t); int cpufreq_statistic_init(unsigned int); void cpufreq_statistic_exit(unsigned int); void cpufreq_statistic_reset(unsigned int); int cpufreq_limit_change(unsigned int); int cpufreq_add_cpu(unsigned int); int cpufreq_del_cpu(unsigned int); struct processor_performance { uint32_t state; uint32_t platform_limit; struct xen_pct_register control_register; struct xen_pct_register status_register; uint32_t state_count; struct xen_processor_px *states; struct xen_psd_package domain_info; uint32_t shared_type; uint32_t init; }; struct processor_pminfo { uint32_t acpi_id; uint32_t id; struct processor_performance perf; }; extern struct processor_pminfo *processor_pminfo[NR_CPUS]; struct px_stat { uint8_t total; /* total Px states */ uint8_t usable; /* usable Px states */ uint8_t last; /* last Px state */ uint8_t cur; /* current Px state */ uint64_t *trans_pt; /* Px transition table */ pm_px_val_t *pt; }; struct pm_px { struct px_stat u; uint64_t prev_state_wall; uint64_t prev_idle_wall; }; DECLARE_PER_CPU(struct pm_px *, cpufreq_statistic_data); int cpufreq_cpu_init(unsigned int cpuid); #endif /* __XEN_PROCESSOR_PM_H__ */ xen-4.4.0/xen/include/acpi/cpufreq/cpufreq.h0000664000175000017500000002106212307313555017060 0ustar smbsmb/* * xen/include/acpi/cpufreq/cpufreq.h * * Copyright (C) 2001 Russell King * (C) 2002 - 2003 Dominik Brodowski * * $Id: cpufreq.h,v 1.36 2003/01/20 17:31:48 db Exp $ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ #ifndef __XEN_CPUFREQ_PM_H__ #define __XEN_CPUFREQ_PM_H__ #include #include #include #include "processor_perf.h" DECLARE_PER_CPU(spinlock_t, cpufreq_statistic_lock); extern bool_t cpufreq_verbose; struct cpufreq_governor; struct acpi_cpufreq_data { struct processor_performance *acpi_data; struct cpufreq_frequency_table *freq_table; unsigned int arch_cpu_flags; }; extern struct acpi_cpufreq_data *cpufreq_drv_data[NR_CPUS]; struct cpufreq_cpuinfo { unsigned int max_freq; unsigned int second_max_freq; /* P1 if Turbo Mode is on */ unsigned int min_freq; unsigned int transition_latency; /* in 10^(-9) s = nanoseconds */ }; struct cpufreq_policy { cpumask_var_t cpus; /* affected CPUs */ unsigned int shared_type; /* ANY or ALL affected CPUs should set cpufreq */ unsigned int cpu; /* cpu nr of registered CPU */ struct cpufreq_cpuinfo cpuinfo; unsigned int min; /* in kHz */ unsigned int max; /* in kHz */ unsigned int cur; /* in kHz, only needed if cpufreq * governors are used */ struct cpufreq_governor *governor; bool_t resume; /* flag for cpufreq 1st run * S3 wakeup, hotplug cpu, etc */ s8 turbo; /* tristate flag: 0 for unsupported * -1 for disable, 1 for enabled * See CPUFREQ_TURBO_* below for defines */ bool_t aperf_mperf; /* CPU has APERF/MPERF MSRs */ }; DECLARE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_policy); extern int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_policy *policy); void cpufreq_cmdline_parse(char *); #define CPUFREQ_SHARED_TYPE_NONE (0) /* None */ #define CPUFREQ_SHARED_TYPE_HW (1) /* HW does needed coordination */ #define CPUFREQ_SHARED_TYPE_ALL (2) /* All dependent CPUs should set freq */ #define CPUFREQ_SHARED_TYPE_ANY (3) /* Freq can be set from any dependent CPU*/ /******************** cpufreq transition notifiers *******************/ struct cpufreq_freqs { unsigned int cpu; /* cpu nr */ unsigned int old; unsigned int new; u8 flags; /* flags of cpufreq_driver, see below. 
*/ }; /********************************************************************* * CPUFREQ GOVERNORS * *********************************************************************/ #define CPUFREQ_GOV_START 1 #define CPUFREQ_GOV_STOP 2 #define CPUFREQ_GOV_LIMITS 3 struct cpufreq_governor { char name[CPUFREQ_NAME_LEN]; int (*governor)(struct cpufreq_policy *policy, unsigned int event); bool_t (*handle_option)(const char *name, const char *value); struct list_head governor_list; }; extern struct cpufreq_governor *cpufreq_opt_governor; extern struct cpufreq_governor cpufreq_gov_dbs; extern struct cpufreq_governor cpufreq_gov_userspace; extern struct cpufreq_governor cpufreq_gov_performance; extern struct cpufreq_governor cpufreq_gov_powersave; extern struct list_head cpufreq_governor_list; extern int cpufreq_register_governor(struct cpufreq_governor *governor); extern struct cpufreq_governor *__find_governor(const char *governor); #define CPUFREQ_DEFAULT_GOVERNOR &cpufreq_gov_dbs /* pass a target to the cpufreq driver */ extern int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation); #define GOV_GETAVG 1 #define USR_GETAVG 2 extern int cpufreq_driver_getavg(unsigned int cpu, unsigned int flag); #define CPUFREQ_TURBO_DISABLED -1 #define CPUFREQ_TURBO_UNSUPPORTED 0 #define CPUFREQ_TURBO_ENABLED 1 extern int cpufreq_update_turbo(int cpuid, int new_state); extern int cpufreq_get_turbo_status(int cpuid); static __inline__ int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) { return policy->governor->governor(policy, event); } /********************************************************************* * CPUFREQ DRIVER INTERFACE * *********************************************************************/ #define CPUFREQ_RELATION_L 0 /* lowest frequency at or above target */ #define CPUFREQ_RELATION_H 1 /* highest frequency below or at target */ struct cpufreq_driver { char name[CPUFREQ_NAME_LEN]; int (*init)(struct cpufreq_policy *policy); int (*verify)(struct cpufreq_policy *policy); int (*update)(int cpuid, struct cpufreq_policy *policy); int (*target)(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation); unsigned int (*get)(unsigned int cpu); unsigned int (*getavg)(unsigned int cpu, unsigned int flag); int (*exit)(struct cpufreq_policy *policy); }; extern struct cpufreq_driver *cpufreq_driver; static __inline__ int cpufreq_register_driver(struct cpufreq_driver *driver_data) { if (!driver_data || !driver_data->init || !driver_data->exit || !driver_data->verify || !driver_data->target) return -EINVAL; if (cpufreq_driver) return -EBUSY; cpufreq_driver = driver_data; return 0; } static __inline__ int cpufreq_unregister_driver(struct cpufreq_driver *driver) { if (!cpufreq_driver || (driver != cpufreq_driver)) return -EINVAL; cpufreq_driver = NULL; return 0; } static __inline__ void cpufreq_verify_within_limits(struct cpufreq_policy *policy, unsigned int min, unsigned int max) { if (policy->min < min) policy->min = min; if (policy->max < min) policy->max = min; if (policy->min > max) policy->min = max; if (policy->max > max) policy->max = max; if (policy->min > policy->max) policy->min = policy->max; return; } /********************************************************************* * FREQUENCY TABLE HELPERS * *********************************************************************/ #define CPUFREQ_ENTRY_INVALID ~0 #define CPUFREQ_TABLE_END ~1 struct cpufreq_frequency_table { unsigned int index; /* any */ unsigned int frequency; 
/* kHz - doesn't need to be in ascending * order */ }; int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table); int cpufreq_frequency_table_verify(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table); int cpufreq_frequency_table_target(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int target_freq, unsigned int relation, unsigned int *index); /********************************************************************* * UNIFIED DEBUG HELPERS * *********************************************************************/ struct cpu_dbs_info_s { uint64_t prev_cpu_idle; uint64_t prev_cpu_wall; struct cpufreq_policy *cur_policy; struct cpufreq_frequency_table *freq_table; int cpu; unsigned int enable:1; unsigned int stoppable:1; unsigned int turbo_enabled:1; }; int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event); int get_cpufreq_ondemand_para(uint32_t *sampling_rate_max, uint32_t *sampling_rate_min, uint32_t *sampling_rate, uint32_t *up_threshold); int write_ondemand_sampling_rate(unsigned int sampling_rate); int write_ondemand_up_threshold(unsigned int up_threshold); int write_userspace_scaling_setspeed(unsigned int cpu, unsigned int freq); void cpufreq_dbs_timer_suspend(void); void cpufreq_dbs_timer_resume(void); #endif /* __XEN_CPUFREQ_PM_H__ */ xen-4.4.0/xen/include/acpi/apei.h0000664000175000017500000000153612307313555014670 0ustar smbsmb/* * apei.h - ACPI Platform Error Interface */ #ifndef ACPI_APEI_H #define ACPI_APEI_H #include #include #define APEI_ERST_INVALID_RECORD_ID 0xffffffffffffffffULL #define FIX_APEI_RANGE_MAX 64 int erst_write(const struct cper_record_header *record); size_t erst_get_record_count(void); int erst_get_next_record_id(u64 *record_id); size_t erst_read(u64 record_id, struct cper_record_header *record, size_t buflen); size_t erst_read_next(struct cper_record_header *record, size_t buflen); int erst_clear(u64 record_id); void __iomem *apei_pre_map(paddr_t paddr, unsigned long size); int apei_pre_map_gar(struct acpi_generic_address *reg); int apei_post_unmap_gar(struct acpi_generic_address *reg); int apei_read(u64 *val, struct acpi_generic_address *reg); int apei_write(u64 val, struct acpi_generic_address *reg); #endif xen-4.4.0/xen/include/acpi/platform/0000775000175000017500000000000012307313555015420 5ustar smbsmbxen-4.4.0/xen/include/acpi/platform/acenv.h0000664000175000017500000002630712307313555016675 0ustar smbsmb/****************************************************************************** * * Name: acenv.h - Generation environment specific items * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. 
Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACENV_H__ #define __ACENV_H__ /* * Configuration for ACPI tools and utilities */ #ifdef ACPI_LIBRARY /* * Note: The non-debug version of the acpi_library does not contain any * debug support, for minimimal size. The debug version uses ACPI_FULL_DEBUG */ #define ACPI_USE_LOCAL_CACHE #endif #ifdef ACPI_ASL_COMPILER #define ACPI_DEBUG_OUTPUT #define ACPI_APPLICATION #define ACPI_DISASSEMBLER #define ACPI_CONSTANT_EVAL_ONLY #define ACPI_LARGE_NAMESPACE_NODE #define ACPI_DATA_TABLE_DISASSEMBLY #endif #ifdef ACPI_EXEC_APP #undef DEBUGGER_THREADING #define DEBUGGER_THREADING DEBUGGER_SINGLE_THREADED #define ACPI_FULL_DEBUG #define ACPI_APPLICATION #define ACPI_DEBUGGER #define ACPI_MUTEX_DEBUG #define ACPI_DBG_TRACK_ALLOCATIONS #endif #ifdef ACPI_DASM_APP #ifndef MSDOS #define ACPI_DEBUG_OUTPUT #endif #define ACPI_APPLICATION #define ACPI_DISASSEMBLER #define ACPI_NO_METHOD_EXECUTION #define ACPI_LARGE_NAMESPACE_NODE #define ACPI_DATA_TABLE_DISASSEMBLY #endif #ifdef ACPI_APPLICATION #define ACPI_USE_SYSTEM_CLIBRARY #define ACPI_USE_LOCAL_CACHE #endif #ifdef ACPI_FULL_DEBUG #define ACPI_DEBUGGER #define ACPI_DEBUG_OUTPUT #define ACPI_DISASSEMBLER #endif /* * Environment configuration. The purpose of this file is to interface to the * local generation environment. * * 1) ACPI_USE_SYSTEM_CLIBRARY - Define this if linking to an actual C library. * Otherwise, local versions of string/memory functions will be used. * 2) ACPI_USE_STANDARD_HEADERS - Define this if linking to a C library and * the standard header files may be used. * * The ACPI subsystem only uses low level C library functions that do not call * operating system services and may therefore be inlined in the code. * * It may be necessary to tailor these include files to the target * generation environment. * * * Functions and constants used from each header: * * string.h: memcpy * memset * strcat * strcmp * strcpy * strlen * strncmp * strncat * strncpy * * stdlib.h: strtoul * * stdarg.h: va_list * va_arg * va_start * va_end * */ /*! 
[Begin] no source code translation */ #if defined(__XEN_TOOLS__) && defined(__ia64__) #include "ac_ia64_tools.h" #elif 1 /*defined(_LINUX) || defined(__linux__)*/ #include "aclinux.h" #elif defined(_AED_EFI) #include "acefi.h" #elif defined(WIN32) #include "acwin.h" #elif defined(WIN64) #include "acwin64.h" #elif defined(MSDOS) /* Must appear after WIN32 and WIN64 check */ #include "acdos16.h" #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) #include "acfreebsd.h" #elif defined(__NetBSD__) #include "acnetbsd.h" #elif defined(MODESTO) #include "acmodesto.h" #elif defined(NETWARE) #include "acnetware.h" #elif defined(__sun) #include "acsolaris.h" #else /* All other environments */ #define ACPI_USE_STANDARD_HEADERS #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long #endif /*! [End] no source code translation !*/ /* * Debugger threading model * Use single threaded if the entire subsystem is contained in an application * Use multiple threaded when the subsystem is running in the kernel. * * By default the model is single threaded if ACPI_APPLICATION is set, * multi-threaded if ACPI_APPLICATION is not set. */ #define DEBUGGER_SINGLE_THREADED 0 #define DEBUGGER_MULTI_THREADED 1 #ifndef DEBUGGER_THREADING #ifdef ACPI_APPLICATION #define DEBUGGER_THREADING DEBUGGER_SINGLE_THREADED #else #define DEBUGGER_THREADING DEBUGGER_MULTI_THREADED #endif #endif /* !DEBUGGER_THREADING */ /****************************************************************************** * * C library configuration * *****************************************************************************/ #define ACPI_IS_ASCII(c) ((c) < 0x80) #ifdef ACPI_USE_SYSTEM_CLIBRARY /* * Use the standard C library headers. * We want to keep these to a minimum. */ #ifdef ACPI_USE_STANDARD_HEADERS /* * Use the standard headers from the standard locations */ #include #include #include #include #endif /* ACPI_USE_STANDARD_HEADERS */ /* * We will be linking to the standard Clib functions */ #define ACPI_STRSTR(s1,s2) strstr((s1), (s2)) #define ACPI_STRCHR(s1,c) strchr((s1), (c)) #define ACPI_STRLEN(s) (acpi_size) strlen((s)) #define ACPI_STRCPY(d,s) (void) strcpy((d), (s)) #define ACPI_STRNCPY(d,s,n) (void) strncpy((d), (s), (acpi_size)(n)) #define ACPI_STRNCMP(d,s,n) strncmp((d), (s), (acpi_size)(n)) #define ACPI_STRCMP(d,s) strcmp((d), (s)) #define ACPI_STRCAT(d,s) (void) strcat((d), (s)) #define ACPI_STRNCAT(d,s,n) strncat((d), (s), (acpi_size)(n)) #define ACPI_STRTOUL(d,s,n) strtoul((d), (s), (acpi_size)(n)) #define ACPI_MEMCMP(s1,s2,n) memcmp((const char *)(s1), (const char *)(s2), (acpi_size)(n)) #define ACPI_MEMCPY(d,s,n) (void) memcpy((d), (s), (acpi_size)(n)) #define ACPI_MEMSET(d,s,n) (void) memset((d), (s), (acpi_size)(n)) #define ACPI_TOUPPER(i) toupper((int) (i)) #define ACPI_TOLOWER(i) tolower((int) (i)) #define ACPI_IS_XDIGIT(i) isxdigit((int) (i)) #define ACPI_IS_DIGIT(i) isdigit((int) (i)) #define ACPI_IS_SPACE(i) isspace((int) (i)) #define ACPI_IS_UPPER(i) isupper((int) (i)) #define ACPI_IS_PRINT(i) isprint((int) (i)) #define ACPI_IS_ALPHA(i) isalpha((int) (i)) #else /****************************************************************************** * * Not using native C library, use local implementations * *****************************************************************************/ /* * Use local definitions of C library macros and functions * NOTE: The function implementations may not be as efficient * as an inline or assembly code implementation provided by a * native C library. 
*/ #ifndef va_arg #ifndef _VALIST #define _VALIST typedef char *va_list; #endif /* _VALIST */ /* * Storage alignment properties */ #define _AUPBND (sizeof (acpi_native_int) - 1) #define _ADNBND (sizeof (acpi_native_int) - 1) /* * Variable argument list macro definitions */ #define _bnd(X, bnd) (((sizeof (X)) + (bnd)) & (~(bnd))) #define va_arg(ap, T) (*(T *)(((ap) += (_bnd (T, _AUPBND))) - (_bnd (T,_ADNBND)))) #define va_end(ap) (void) 0 #define va_start(ap, A) (void) ((ap) = (((char *) &(A)) + (_bnd (A,_AUPBND)))) #endif /* va_arg */ #define ACPI_STRSTR(s1,s2) acpi_ut_strstr ((s1), (s2)) #define ACPI_STRCHR(s1,c) acpi_ut_strchr ((s1), (c)) #define ACPI_STRLEN(s) (acpi_size) acpi_ut_strlen ((s)) #define ACPI_STRCPY(d,s) (void) acpi_ut_strcpy ((d), (s)) #define ACPI_STRNCPY(d,s,n) (void) acpi_ut_strncpy ((d), (s), (acpi_size)(n)) #define ACPI_STRNCMP(d,s,n) acpi_ut_strncmp ((d), (s), (acpi_size)(n)) #define ACPI_STRCMP(d,s) acpi_ut_strcmp ((d), (s)) #define ACPI_STRCAT(d,s) (void) acpi_ut_strcat ((d), (s)) #define ACPI_STRNCAT(d,s,n) acpi_ut_strncat ((d), (s), (acpi_size)(n)) #define ACPI_STRTOUL(d,s,n) acpi_ut_strtoul ((d), (s), (acpi_size)(n)) #define ACPI_MEMCMP(s1,s2,n) acpi_ut_memcmp((const char *)(s1), (const char *)(s2), (acpi_size)(n)) #define ACPI_MEMCPY(d,s,n) (void) acpi_ut_memcpy ((d), (s), (acpi_size)(n)) #define ACPI_MEMSET(d,v,n) (void) acpi_ut_memset ((d), (v), (acpi_size)(n)) #define ACPI_TOUPPER acpi_ut_to_upper #define ACPI_TOLOWER acpi_ut_to_lower #endif /* ACPI_USE_SYSTEM_CLIBRARY */ /****************************************************************************** * * Assembly code macros * *****************************************************************************/ /* * Handle platform- and compiler-specific assembly language differences. * These should already have been defined by the platform includes above. * * Notes: * 1) Interrupt 3 is used to break into a debugger * 2) Interrupts are turned off during ACPI register setup */ /* Unrecognized compiler, use defaults */ #ifndef ACPI_ASM_MACROS /* * Calling conventions: * * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) * ACPI_EXTERNAL_XFACE - External ACPI interfaces * ACPI_INTERNAL_XFACE - Internal ACPI interfaces * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces */ #define ACPI_SYSTEM_XFACE #define ACPI_EXTERNAL_XFACE #define ACPI_INTERNAL_XFACE #define ACPI_INTERNAL_VAR_XFACE #define ACPI_ASM_MACROS #define BREAKPOINT3 #define ACPI_DISABLE_IRQS() #define ACPI_ENABLE_IRQS() #define ACPI_ACQUIRE_GLOBAL_LOCK(Glptr, acq) #define ACPI_RELEASE_GLOBAL_LOCK(Glptr, acq) #endif /* ACPI_ASM_MACROS */ #ifdef ACPI_APPLICATION /* Don't want software interrupts within a ring3 application */ #undef BREAKPOINT3 #define BREAKPOINT3 #endif /****************************************************************************** * * Compiler-specific information is contained in the compiler-specific * headers. * *****************************************************************************/ #endif /* __ACENV_H__ */ xen-4.4.0/xen/include/acpi/platform/acgcc.h0000664000175000017500000000537612307313555016644 0ustar smbsmb/****************************************************************************** * * Name: acgcc.h - GCC specific defines, etc. * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACGCC_H__ #define __ACGCC_H__ /* Function name is used for debug output. Non-ANSI, compiler-dependent */ #define ACPI_GET_FUNCTION_NAME __FUNCTION__ /* * This macro is used to tag functions as "printf-like" because * some compilers (like GCC) can catch printf format string problems. */ #define ACPI_PRINTF_LIKE(c) __attribute__ ((__format__ (__printf__, c, c+1))) /* * Some compilers complain about unused variables. Sometimes we don't want to * use all the variables (for example, _acpi_module_name). This allows us * to to tell the compiler warning in a per-variable manner that a variable * is unused. */ #define ACPI_UNUSED_VAR __attribute__ ((unused)) #endif /* __ACGCC_H__ */ xen-4.4.0/xen/include/acpi/platform/aclinux.h0000664000175000017500000000631512307313555017241 0ustar smbsmb/****************************************************************************** * * Name: aclinux.h - OS specific defines, etc. * *****************************************************************************/ /* * Copyright (C) 2000 - 2007, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. 
Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACLINUX_H__ #define __ACLINUX_H__ #define ACPI_USE_SYSTEM_CLIBRARY #define ACPI_USE_DO_WHILE_0 #include #include #include #include #include #include #include #include #include #include #include /* Host-dependent types and defines */ #define ACPI_MACHINE_WIDTH BITS_PER_LONG #define acpi_cache_t void /*struct kmem_cache*/ #define acpi_spinlock spinlock_t * #define ACPI_EXPORT_SYMBOL(symbol) EXPORT_SYMBOL(symbol); #define strtoul simple_strtoul /* Full namespace pathname length limit - arbitrary */ #define ACPI_PATHNAME_MAX 256 #include "acgcc.h" #define acpi_cpu_flags unsigned long #define acpi_thread_id struct vcpu * void *acpi_os_alloc_memory(size_t); void *acpi_os_zalloc_memory(size_t); void acpi_os_free_memory(void *); #define ACPI_ALLOCATE(a) acpi_os_alloc_memory(a) #define ACPI_ALLOCATE_ZEROED(a) acpi_os_zalloc_memory(a) #define ACPI_FREE(a) acpi_os_free_memory(a) #endif /* __ACLINUX_H__ */ xen-4.4.0/xen/include/acpi/actables.h0000664000175000017500000000726012307313555015530 0ustar smbsmb/****************************************************************************** * * Name: actables.h - ACPI table management * *****************************************************************************/ /* * Copyright (C) 2000 - 2008, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. 
* * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #ifndef __ACTABLES_H__ #define __ACTABLES_H__ acpi_status acpi_allocate_root_table(u32 initial_table_count); /* * tbfadt - FADT parse/convert/validate */ void acpi_tb_parse_fadt(acpi_native_uint table_index, u8 flags); void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length); /* * tbfind - find ACPI table */ acpi_status acpi_tb_find_table(char *signature, char *oem_id, char *oem_table_id, acpi_native_uint * table_index); /* * tbinstal - Table removal and deletion */ acpi_status acpi_tb_resize_root_table_list(void); acpi_status acpi_tb_verify_table(struct acpi_table_desc *table_desc); acpi_status acpi_tb_add_table(struct acpi_table_desc *table_desc, acpi_native_uint * table_index); acpi_status acpi_tb_store_table(acpi_physical_address address, struct acpi_table_header *table, u32 length, u8 flags, acpi_native_uint * table_index); void acpi_tb_delete_table(struct acpi_table_desc *table_desc); void acpi_tb_terminate(void); u8 acpi_tb_is_table_loaded(acpi_native_uint table_index); void acpi_tb_set_table_loaded_flag(acpi_native_uint table_index, u8 is_loaded); /* * tbutils - table manager utilities */ void acpi_tb_print_table_header(acpi_physical_address address, struct acpi_table_header *header); u8 acpi_tb_checksum(u8 * buffer, acpi_native_uint length); acpi_status acpi_tb_verify_checksum(struct acpi_table_header *table, u32 length); void acpi_tb_install_table(acpi_physical_address address, u8 flags, char *signature, acpi_native_uint table_index); acpi_status acpi_tb_parse_root_table(acpi_physical_address rsdp_address, u8 flags); #endif /* __ACTABLES_H__ */ xen-4.4.0/xen/include/asm-arm/0000775000175000017500000000000012307313555014215 5ustar smbsmbxen-4.4.0/xen/include/asm-arm/procinfo.h0000664000175000017500000000157212307313555016212 0ustar smbsmb/* * include/asm-arm/procinfo.h * * Bamvor Jian Zhang * Copyright (c) 2013 SUSE * * base on linux/arch/arm/include/asm/procinfo.h * Copyright (C) 1996-1999 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #ifndef __ASM_ARM_PROCINFO_H #define __ASM_ARM_PROCINFO_H struct proc_info_list { unsigned int cpu_val; unsigned int cpu_mask; void (*cpu_init)(void); }; #endif xen-4.4.0/xen/include/asm-arm/softirq.h0000664000175000017500000000035112307313555016054 0ustar smbsmb#ifndef __ASM_SOFTIRQ_H__ #define __ASM_SOFTIRQ_H__ #define NR_ARCH_SOFTIRQS 0 #endif /* __ASM_SOFTIRQ_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/hypercall.h0000664000175000017500000000104512307313555016351 0ustar smbsmb#ifndef __ASM_ARM_HYPERCALL_H__ #define __ASM_ARM_HYPERCALL_H__ #include /* for arch_do_domctl */ int do_physdev_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); long do_arm_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg); long subarch_do_domctl(struct xen_domctl *domctl, struct domain *d, XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl); #endif /* __ASM_ARM_HYPERCALL_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/xenoprof.h0000664000175000017500000000031212307313555016222 0ustar smbsmb#ifndef __ASM_XENOPROF_H__ #define __ASM_XENOPROF_H__ #endif /* __ASM_XENOPROF_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/mm.h0000664000175000017500000002774512307313555015016 0ustar smbsmb#ifndef __ARCH_ARM_MM__ #define __ARCH_ARM_MM__ #include #include #include #include /* Align Xen to a 2 MiB boundary. */ #define XEN_PADDR_ALIGN (1 << 21) /* * Per-page-frame information. * * Every architecture must ensure the following: * 1. 'struct page_info' contains a 'struct page_list_entry list'. * 2. Provide a PFN_ORDER() macro for accessing the order of a free page. */ #define PFN_ORDER(_pfn) ((_pfn)->v.free.order) struct page_info { /* Each frame can be threaded onto a doubly-linked list. */ struct page_list_entry list; /* Reference count and various PGC_xxx flags and fields. */ unsigned long count_info; /* Context-dependent fields follow... */ union { /* Page is in use: ((count_info & PGC_count_mask) != 0). */ struct { /* Type reference count and various PGT_xxx flags and fields. */ unsigned long type_info; } inuse; /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */ struct { /* Do TLBs need flushing for safety before next page use? */ bool_t need_tlbflush; } free; } u; union { /* Page is in use, but not as a shadow. */ struct { /* Owner of this page (zero if page is anonymous). */ struct domain *domain; } inuse; /* Page is on a free list. */ struct { /* Order-size of the free chunk this page is the head of. */ unsigned int order; } free; } v; union { /* * Timestamp from 'TLB clock', used to avoid extra safety flushes. * Only valid for: a) free pages, and b) pages with zero type count */ u32 tlbflush_timestamp; }; u64 pad; }; #define PG_shift(idx) (BITS_PER_LONG - (idx)) #define PG_mask(x, idx) (x ## UL << PG_shift(idx)) #define PGT_none PG_mask(0, 4) /* no special uses of this page */ #define PGT_writable_page PG_mask(7, 4) /* has writable mappings? */ #define PGT_type_mask PG_mask(15, 4) /* Bits 28-31 or 60-63. */ /* Owning guest has pinned this page to its current type? */ #define _PGT_pinned PG_shift(5) #define PGT_pinned PG_mask(1, 5) /* Has this page been validated for use as its current type? 
*/ #define _PGT_validated PG_shift(6) #define PGT_validated PG_mask(1, 6) /* Count of uses of this frame as its current type. */ #define PGT_count_width PG_shift(9) #define PGT_count_mask ((1UL<count_info&PGC_state) == PGC_state_##st) /* Count of references to this frame. */ #define PGC_count_width PG_shift(9) #define PGC_count_mask ((1UL<= xenheap_mfn_start && _mfn < xenheap_mfn_end); \ }) #else #define is_xen_heap_page(page) ((page)->count_info & PGC_xen_heap) #define is_xen_heap_mfn(mfn) \ (mfn_valid(mfn) && is_xen_heap_page(__mfn_to_page(mfn))) #endif #define is_xen_fixed_mfn(mfn) \ ((pfn_to_paddr(mfn) >= virt_to_maddr(&_start)) && \ (pfn_to_paddr(mfn) <= virt_to_maddr(&_end))) #define page_get_owner(_p) (_p)->v.inuse.domain #define page_set_owner(_p,_d) ((_p)->v.inuse.domain = (_d)) #define maddr_get_owner(ma) (page_get_owner(maddr_to_page((ma)))) #define XENSHARE_writable 0 #define XENSHARE_readonly 1 extern void share_xen_page_with_guest( struct page_info *page, struct domain *d, int readonly); extern void share_xen_page_with_privileged_guests( struct page_info *page, int readonly); #define frame_table ((struct page_info *)FRAMETABLE_VIRT_START) /* MFN of the first page in the frame table. */ extern unsigned long frametable_base_mfn; extern unsigned long max_page; extern unsigned long total_pages; /* Boot-time pagetable setup */ extern void setup_pagetables(unsigned long boot_phys_offset, paddr_t xen_paddr); /* Remove early mappings */ extern void remove_early_mappings(void); /* Allocate and initialise pagetables for a secondary CPU. Sets init_ttbr to the * new page table */ extern int __cpuinit init_secondary_pagetables(int cpu); /* Switch secondary CPUS to its own pagetables and finalise MMU setup */ extern void __cpuinit mmu_init_secondary_cpu(void); /* Second stage paging setup, to be called on all CPUs */ extern void __cpuinit setup_virt_paging(void); /* Set up the xenheap: up to 1GB of contiguous, always-mapped memory. * Base must be 32MB aligned and size a multiple of 32MB. */ extern void setup_xenheap_mappings(unsigned long base_mfn, unsigned long nr_mfns); /* Map a frame table to cover physical addresses ps through pe */ extern void setup_frametable_mappings(paddr_t ps, paddr_t pe); /* Map a 4k page in a fixmap entry */ extern void set_fixmap(unsigned map, unsigned long mfn, unsigned attributes); /* Remove a mapping from a fixmap entry */ extern void clear_fixmap(unsigned map); /* map a physical range in virtual memory */ void __iomem *ioremap_attr(paddr_t start, size_t len, unsigned attributes); static inline void __iomem *ioremap_nocache(paddr_t start, size_t len) { return ioremap_attr(start, len, PAGE_HYPERVISOR_NOCACHE); } static inline void __iomem *ioremap_cache(paddr_t start, size_t len) { return ioremap_attr(start, len, PAGE_HYPERVISOR); } static inline void __iomem *ioremap_wc(paddr_t start, size_t len) { return ioremap_attr(start, len, PAGE_HYPERVISOR_WC); } #define mfn_valid(mfn) ({ \ unsigned long __m_f_n = (mfn); \ likely(__m_f_n >= frametable_base_mfn && __m_f_n < max_page); \ }) #define max_pdx max_page #define pfn_to_pdx(pfn) (pfn) #define pdx_to_pfn(pdx) (pdx) #define virt_to_pdx(va) virt_to_mfn(va) #define pdx_to_virt(pdx) mfn_to_virt(pdx) /* Convert between machine frame numbers and page-info structures. 
*/ #define mfn_to_page(mfn) (frame_table + (pfn_to_pdx(mfn) - frametable_base_mfn)) #define page_to_mfn(pg) pdx_to_pfn((unsigned long)((pg) - frame_table) + frametable_base_mfn) #define __page_to_mfn(pg) page_to_mfn(pg) #define __mfn_to_page(mfn) mfn_to_page(mfn) /* Convert between machine addresses and page-info structures. */ #define maddr_to_page(ma) __mfn_to_page((ma) >> PAGE_SHIFT) #define page_to_maddr(pg) ((paddr_t)__page_to_mfn(pg) << PAGE_SHIFT) /* Convert between frame number and address formats. */ #define pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT) #define paddr_to_pfn(pa) ((unsigned long)((pa) >> PAGE_SHIFT)) #define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa)) static inline paddr_t __virt_to_maddr(vaddr_t va) { uint64_t par = va_to_par(va); return (par & PADDR_MASK & PAGE_MASK) | (va & ~PAGE_MASK); } #define virt_to_maddr(va) __virt_to_maddr((vaddr_t)(va)) #ifdef CONFIG_ARM_32 static inline void *maddr_to_virt(paddr_t ma) { ASSERT(is_xen_heap_mfn(ma >> PAGE_SHIFT)); ma -= pfn_to_paddr(xenheap_mfn_start); return (void *)(unsigned long) ma + XENHEAP_VIRT_START; } #else static inline void *maddr_to_virt(paddr_t ma) { ASSERT((ma >> PAGE_SHIFT) < (DIRECTMAP_SIZE >> PAGE_SHIFT)); ma -= pfn_to_paddr(xenheap_mfn_start); return (void *)(unsigned long) ma + DIRECTMAP_VIRT_START; } #endif static inline int gvirt_to_maddr(vaddr_t va, paddr_t *pa) { uint64_t par = gva_to_ma_par(va); if ( par & PAR_F ) return -EFAULT; *pa = (par & PADDR_MASK & PAGE_MASK) | ((unsigned long) va & ~PAGE_MASK); return 0; } /* Convert between Xen-heap virtual addresses and machine addresses. */ #define __pa(x) (virt_to_maddr(x)) #define __va(x) (maddr_to_virt(x)) /* Convert between Xen-heap virtual addresses and machine frame numbers. */ #define virt_to_mfn(va) (virt_to_maddr(va) >> PAGE_SHIFT) #define mfn_to_virt(mfn) (maddr_to_virt((paddr_t)(mfn) << PAGE_SHIFT)) /* Convert between Xen-heap virtual addresses and page-info structures. */ static inline struct page_info *virt_to_page(const void *v) { unsigned long va = (unsigned long)v; ASSERT(va >= XENHEAP_VIRT_START); ASSERT(va < xenheap_virt_end); return frame_table + ((va - XENHEAP_VIRT_START) >> PAGE_SHIFT) + xenheap_mfn_start - frametable_base_mfn; } static inline void *page_to_virt(const struct page_info *pg) { return mfn_to_virt(page_to_mfn(pg)); } struct domain *page_get_owner_and_reference(struct page_info *page); void put_page(struct page_info *page); int get_page(struct page_info *page, struct domain *domain); /* * The MPT (machine->physical mapping table) is an array of word-sized * values, indexed on machine frame number. It is expected that guest OSes * will use it to store a "physical" frame number to give the appearance of * contiguous (or near contiguous) physical memory. 
*/ #undef machine_to_phys_mapping #define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START) #define INVALID_M2P_ENTRY (~0UL) #define VALID_M2P(_e) (!((_e) & (1UL<<(BITS_PER_LONG-1)))) #define SHARED_M2P_ENTRY (~0UL - 1UL) #define SHARED_M2P(_e) ((_e) == SHARED_M2P_ENTRY) #define _set_gpfn_from_mfn(mfn, pfn) ({ \ struct domain *d = page_get_owner(__mfn_to_page(mfn)); \ if(d && (d == dom_cow)) \ machine_to_phys_mapping[(mfn)] = SHARED_M2P_ENTRY; \ else \ machine_to_phys_mapping[(mfn)] = (pfn); \ }) static inline void put_gfn(struct domain *d, unsigned long gfn) {} static inline void mem_event_cleanup(struct domain *d) {} static inline int relinquish_shared_pages(struct domain *d) { return 0; } #define INVALID_MFN (~0UL) /* Xen always owns P2M on ARM */ #define set_gpfn_from_mfn(mfn, pfn) do { (void) (mfn), (void)(pfn); } while (0) #define mfn_to_gmfn(_d, mfn) (mfn) /* Arch-specific portion of memory_op hypercall. */ long arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg); int steal_page( struct domain *d, struct page_info *page, unsigned int memflags); int donate_page( struct domain *d, struct page_info *page, unsigned int memflags); #define domain_set_alloc_bitsize(d) ((void)0) #define domain_clamp_alloc_bitsize(d, b) (b) unsigned long domain_get_maximum_gpfn(struct domain *d); extern struct domain *dom_xen, *dom_io, *dom_cow; #define memguard_init(_s) (_s) #define memguard_guard_stack(_p) ((void)0) #define memguard_guard_range(_p,_l) ((void)0) #define memguard_unguard_range(_p,_l) ((void)0) /* Release all __init and __initdata ranges to be reused */ void free_init_memory(void); int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn, unsigned int order); extern void put_page_type(struct page_info *page); static inline void put_page_and_type(struct page_info *page) { put_page_type(page); put_page(page); } #endif /* __ARCH_ARM_MM__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/p2m.h0000664000175000017500000001344612307313555015074 0ustar smbsmb#ifndef _XEN_P2M_H #define _XEN_P2M_H #include struct domain; /* Per-p2m-table state */ struct p2m_domain { /* Lock that protects updates to the p2m */ spinlock_t lock; /* Pages used to construct the p2m */ struct page_list_head pages; /* Root of p2m page tables, 2 contiguous pages */ struct page_info *first_level; /* Current VMID in use */ uint8_t vmid; /* Highest guest frame that's ever been mapped in the p2m. * Only takes into account ram and foreign mappings */ unsigned long max_mapped_gfn; /* Lowest mapped gfn in the p2m. When releasing mapped gfns in a * preemptible manner this is updated to track where to * resume the search. Apart from during teardown this can only * decrease. */ unsigned long lowest_mapped_gfn; }; /* List of possible types for each page in the p2m entry. * Only 4 bits per page are available in the pte for this purpose, * so at most 16 distinct types are possible. If we run out of values in the * future, it's possible to use higher values for pseudo-types and not store * them in the p2m entry. 
*/ typedef enum { p2m_invalid = 0, /* Nothing mapped here */ p2m_ram_rw, /* Normal read/write guest RAM */ p2m_ram_ro, /* Read-only; writes are silently dropped */ p2m_mmio_direct, /* Read/write mapping of genuine MMIO area */ p2m_map_foreign, /* Ram pages from foreign domain */ p2m_grant_map_rw, /* Read/write grant mapping */ p2m_grant_map_ro, /* Read-only grant mapping */ p2m_max_real_type, /* Types after this won't be store in the p2m */ } p2m_type_t; #define p2m_is_foreign(_t) ((_t) == p2m_map_foreign) #define p2m_is_ram(_t) ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro) /* Initialise vmid allocator */ void p2m_vmid_allocator_init(void); /* Init the datastructures for later use by the p2m code */ int p2m_init(struct domain *d); /* Return all the p2m resources to Xen. */ void p2m_teardown(struct domain *d); /* Remove mapping refcount on each mapping page in the p2m * * TODO: For the moment only foreign mappings are handled */ int relinquish_p2m_mapping(struct domain *d); /* Allocate a new p2m table for a domain. * * Returns 0 for success or -errno. */ int p2m_alloc_table(struct domain *d); /* */ void p2m_load_VTTBR(struct domain *d); /* Look up the MFN corresponding to a domain's PFN. */ paddr_t p2m_lookup(struct domain *d, paddr_t gpfn, p2m_type_t *t); /* Clean & invalidate caches corresponding to a region of guest address space */ int p2m_cache_flush(struct domain *d, xen_pfn_t start_mfn, xen_pfn_t end_mfn); /* Setup p2m RAM mapping for domain d from start-end. */ int p2m_populate_ram(struct domain *d, paddr_t start, paddr_t end); /* Map MMIO regions in the p2m: start_gaddr and end_gaddr is the range * in the guest physical address space to map, starting from the machine * address maddr. */ int map_mmio_regions(struct domain *d, paddr_t start_gaddr, paddr_t end_gaddr, paddr_t maddr); int guest_physmap_add_entry(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned long page_order, p2m_type_t t); /* Untyped version for RAM only, for compatibility */ static inline int guest_physmap_add_page(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int page_order) { return guest_physmap_add_entry(d, gfn, mfn, page_order, p2m_ram_rw); } void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn, unsigned int page_order); unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn); /* * Populate-on-demand */ /* Call when decreasing memory reservation to handle PoD entries properly. * Will return '1' if all entries were handled and nothing more need be done.*/ int p2m_pod_decrease_reservation(struct domain *d, xen_pfn_t gpfn, unsigned int order); /* Look up a GFN and take a reference count on the backing page. */ typedef unsigned int p2m_query_t; #define P2M_ALLOC (1u<<0) /* Populate PoD and paged-out entries */ #define P2M_UNSHARE (1u<<1) /* Break CoW sharing */ static inline struct page_info *get_page_from_gfn( struct domain *d, unsigned long gfn, p2m_type_t *t, p2m_query_t q) { struct page_info *page; p2m_type_t p2mt; paddr_t maddr = p2m_lookup(d, pfn_to_paddr(gfn), &p2mt); unsigned long mfn = maddr >> PAGE_SHIFT; if (t) *t = p2mt; if ( p2mt == p2m_invalid || p2mt == p2m_mmio_direct ) return NULL; if ( !mfn_valid(mfn) ) return NULL; page = mfn_to_page(mfn); /* get_page won't work on foreign mapping because the page doesn't * belong to the current domain. 
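 *
 * A minimal caller-side sketch of get_page_from_gfn() itself, assuming a
 * guest frame number 'gfn' (hypothetical); the reference is dropped with
 * put_page() whichever branch below took it:
 *
 *   p2m_type_t t;
 *   struct page_info *pg = get_page_from_gfn(d, gfn, &t, P2M_ALLOC);
 *   if ( pg != NULL )
 *   {
 *       -- use the page --
 *       put_page(pg);
 *   }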
*/ if ( p2mt == p2m_map_foreign ) { struct domain *fdom = page_get_owner_and_reference(page); ASSERT(fdom != NULL); ASSERT(fdom != d); return page; } if ( !get_page(page, d) ) return NULL; return page; } int get_page_type(struct page_info *page, unsigned long type); int is_iomem_page(unsigned long mfn); static inline int get_page_and_type(struct page_info *page, struct domain *domain, unsigned long type) { int rc = get_page(page, domain); if ( likely(rc) && unlikely(!get_page_type(page, type)) ) { put_page(page); rc = 0; } return rc; } #endif /* _XEN_P2M_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/page.h0000664000175000017500000003034712307313555015311 0ustar smbsmb#ifndef __ARM_PAGE_H__ #define __ARM_PAGE_H__ #include #include #include #include #define PADDR_BITS 40 #define PADDR_MASK ((1ULL << PADDR_BITS)-1) #define VADDR_BITS 32 #define VADDR_MASK (~0UL) /* Shareability values for the LPAE entries */ #define LPAE_SH_NON_SHAREABLE 0x0 #define LPAE_SH_UNPREDICTALE 0x1 #define LPAE_SH_OUTER 0x2 #define LPAE_SH_INNER 0x3 /* LPAE Memory region attributes, to match Linux's (non-LPAE) choices. * Indexed by the AttrIndex bits of a LPAE entry; * the 8-bit fields are packed little-endian into MAIR0 and MAIR1 * * ai encoding * UNCACHED 000 0000 0000 -- Strongly Ordered * BUFFERABLE 001 0100 0100 -- Non-Cacheable * WRITETHROUGH 010 1010 1010 -- Write-through * WRITEBACK 011 1110 1110 -- Write-back * DEV_SHARED 100 0000 0100 -- Device * ?? 101 * reserved 110 * WRITEALLOC 111 1111 1111 -- Write-back write-allocate * * DEV_NONSHARED 100 (== DEV_SHARED) * DEV_WC 001 (== BUFFERABLE) * DEV_CACHED 011 (== WRITEBACK) */ #define MAIR0VAL 0xeeaa4400 #define MAIR1VAL 0xff000004 #define MAIRVAL (MAIR0VAL|MAIR1VAL<<32) /* * Attribute Indexes. * * These are valid in the AttrIndx[2:0] field of an LPAE stage 1 page * table entry. They are indexes into the bytes of the MAIR* * registers, as defined above. * */ #define UNCACHED 0x0 #define BUFFERABLE 0x1 #define WRITETHROUGH 0x2 #define WRITEBACK 0x3 #define DEV_SHARED 0x4 #define WRITEALLOC 0x7 #define DEV_NONSHARED DEV_SHARED #define DEV_WC BUFFERABLE #define DEV_CACHED WRITEBACK #define PAGE_HYPERVISOR (WRITEALLOC) #define PAGE_HYPERVISOR_NOCACHE (DEV_SHARED) #define PAGE_HYPERVISOR_WC (DEV_WC) #define MAP_SMALL_PAGES PAGE_HYPERVISOR /* * Stage 2 Memory Type. * * These are valid in the MemAttr[3:0] field of an LPAE stage 2 page * table entry. * */ #define MATTR_DEV 0x1 #define MATTR_MEM 0xf #ifndef __ASSEMBLY__ #include #include /* WARNING! Unlike the Intel pagetable code, where l1 is the lowest * level and l4 is the root of the trie, the ARM pagetables follow ARM's * documentation: the levels are called first, second &c in the order * that the MMU walks them (i.e. "first" is the root of the trie). */ /****************************************************************************** * ARMv7-A LPAE pagetables: 3-level trie, mapping 40-bit input to * 40-bit output addresses. Tables at all levels have 512 64-bit entries * (i.e. are 4Kb long). * * The bit-shuffling that has the permission bits in branch nodes in a * different place from those in leaf nodes seems to be to allow linear * pagetable tricks. If we're not doing that then the set of permission * bits that's not in use in a given node type can be used as * extra software-defined bits. */ typedef struct { /* These are used in all kinds of entry. 
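 *
 * A minimal decode sketch for these two bits, assuming a first- or
 * second-level entry 'e':
 *
 *   if ( !e.pt.valid )        -- not mapped, the walk faults here
 *   else if ( e.pt.table )    -- points to a next-level table
 *   else                      -- block (superpage) mapping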
*/ unsigned long valid:1; /* Valid mapping */ unsigned long table:1; /* == 1 in 4k map entries too */ /* These ten bits are only used in Block entries and are ignored * in Table entries. */ unsigned long ai:3; /* Attribute Index */ unsigned long ns:1; /* Not-Secure */ unsigned long user:1; /* User-visible */ unsigned long ro:1; /* Read-Only */ unsigned long sh:2; /* Shareability */ unsigned long af:1; /* Access Flag */ unsigned long ng:1; /* Not-Global */ /* The base address must be appropriately aligned for Block entries */ unsigned long base:28; /* Base address of block or next table */ unsigned long sbz:12; /* Must be zero */ /* These seven bits are only used in Block entries and are ignored * in Table entries. */ unsigned long contig:1; /* In a block of 16 contiguous entries */ unsigned long pxn:1; /* Privileged-XN */ unsigned long xn:1; /* eXecute-Never */ unsigned long avail:4; /* Ignored by hardware */ /* These 5 bits are only used in Table entries and are ignored in * Block entries */ unsigned long pxnt:1; /* Privileged-XN */ unsigned long xnt:1; /* eXecute-Never */ unsigned long apt:2; /* Access Permissions */ unsigned long nst:1; /* Not-Secure */ } __attribute__((__packed__)) lpae_pt_t; /* The p2m tables have almost the same layout, but some of the permission * and cache-control bits are laid out differently (or missing) */ typedef struct { /* These are used in all kinds of entry. */ unsigned long valid:1; /* Valid mapping */ unsigned long table:1; /* == 1 in 4k map entries too */ /* These ten bits are only used in Block entries and are ignored * in Table entries. */ unsigned long mattr:4; /* Memory Attributes */ unsigned long read:1; /* Read access */ unsigned long write:1; /* Write access */ unsigned long sh:2; /* Shareability */ unsigned long af:1; /* Access Flag */ unsigned long sbz4:1; /* The base address must be appropriately aligned for Block entries */ unsigned long base:28; /* Base address of block or next table */ unsigned long sbz3:12; /* These seven bits are only used in Block entries and are ignored * in Table entries. */ unsigned long contig:1; /* In a block of 16 contiguous entries */ unsigned long sbz2:1; unsigned long xn:1; /* eXecute-Never */ unsigned long type:4; /* Ignore by hardware. Used to store p2m types */ unsigned long sbz1:5; } __attribute__((__packed__)) lpae_p2m_t; /* * Walk is the common bits of p2m and pt entries which are needed to * simply walk the table (e.g. for debug). */ typedef struct { /* These are used in all kinds of entry. */ unsigned long valid:1; /* Valid mapping */ unsigned long table:1; /* == 1 in 4k map entries too */ unsigned long pad2:10; /* The base address must be appropriately aligned for Block entries */ unsigned long base:28; /* Base address of block or next table */ unsigned long pad1:24; } __attribute__((__packed__)) lpae_walk_t; typedef union { uint64_t bits; lpae_pt_t pt; lpae_p2m_t p2m; lpae_walk_t walk; } lpae_t; /* Standard entry type that we'll use to build Xen's own pagetables. * We put the same permissions at every level, because they're ignored * by the walker in non-leaf entries. 
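 *
 * A minimal usage sketch, assuming a machine frame 'mfn' to be mapped at
 * the third (4K) level through an existing table slot 'entry'; write_pte()
 * is the per-architecture helper declared in the arm32/arm64 page.h:
 *
 *   lpae_t pte = mfn_to_xen_entry(mfn);
 *   pte.pt.table = 1;        -- leaf entries at the third level set this
 *   write_pte(entry, pte);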
*/ static inline lpae_t mfn_to_xen_entry(unsigned long mfn) { paddr_t pa = ((paddr_t) mfn) << PAGE_SHIFT; lpae_t e = (lpae_t) { .pt = { .xn = 1, /* No need to execute outside .text */ .ng = 1, /* Makes TLB flushes easier */ .af = 1, /* No need for access tracking */ .sh = LPAE_SH_OUTER, /* Xen mappings are globally coherent */ .ns = 1, /* Hyp mode is in the non-secure world */ .user = 1, /* See below */ .ai = WRITEALLOC, .table = 0, /* Set to 1 for links and 4k maps */ .valid = 1, /* Mappings are present */ }};; /* Setting the User bit is strange, but the ATS1H[RW] instructions * don't seem to work otherwise, and since we never run on Xen * pagetables un User mode it's OK. If this changes, remember * to update the hard-coded values in head.S too */ ASSERT(!(pa & ~PAGE_MASK)); ASSERT(!(pa & ~PADDR_MASK)); // XXX shifts e.bits |= pa; return e; } #if defined(CONFIG_ARM_32) # include #elif defined(CONFIG_ARM_64) # include #else # error "unknown ARM variant" #endif /* Architectural minimum cacheline size is 4 32-bit words. */ #define MIN_CACHELINE_BYTES 16 /* Actual cacheline size on the boot CPU. */ extern size_t cacheline_bytes; /* Function for flushing medium-sized areas. * if 'range' is large enough we might want to use model-specific * full-cache flushes. */ static inline void clean_xen_dcache_va_range(void *p, unsigned long size) { void *end; dsb(); /* So the CPU issues all writes to the range */ for ( end = p + size; p < end; p += cacheline_bytes ) asm volatile (__clean_xen_dcache_one(0) : : "r" (p)); dsb(); /* So we know the flushes happen before continuing */ } /* Macro for flushing a single small item. The predicate is always * compile-time constant so this will compile down to 3 instructions in * the common case. */ #define clean_xen_dcache(x) do { \ typeof(x) *_p = &(x); \ if ( sizeof(x) > MIN_CACHELINE_BYTES || sizeof(x) > alignof(x) ) \ clean_xen_dcache_va_range(_p, sizeof(x)); \ else \ asm volatile ( \ "dsb sy;" /* Finish all earlier writes */ \ __clean_xen_dcache_one(0) \ "dsb sy;" /* Finish flush before continuing */ \ : : "r" (_p), "m" (*_p)); \ } while (0) /* Flush the dcache for an entire page. */ void flush_page_to_ram(unsigned long mfn); /* Print a walk of an arbitrary page table */ void dump_pt_walk(lpae_t *table, paddr_t addr); /* Print a walk of the hypervisor's page tables for a virtual addr. */ extern void dump_hyp_walk(vaddr_t addr); /* Print a walk of the p2m for a domain for a physical address. */ extern void dump_p2m_lookup(struct domain *d, paddr_t addr); static inline uint64_t va_to_par(vaddr_t va) { uint64_t par = __va_to_par(va); /* It is not OK to call this with an invalid VA */ if ( par & PAR_F ) { dump_hyp_walk(va); panic_PAR(par); } return par; } static inline int gva_to_ipa(vaddr_t va, paddr_t *paddr) { uint64_t par = gva_to_ipa_par(va); if ( par & PAR_F ) return -EFAULT; *paddr = (par & PADDR_MASK & PAGE_MASK) | ((unsigned long) va & ~PAGE_MASK); return 0; } /* Bits in the PAR returned by va_to_par */ #define PAR_FAULT 0x1 #endif /* __ASSEMBLY__ */ /* * These numbers add up to a 48-bit input address space. * * On 32-bit the zeroeth level does not exist, therefore the total is * 39-bits. The ARMv7-A architecture actually specifies a 40-bit input * address space for the p2m, with an 8K (1024-entry) top-level table. * However Xen only supports 16GB of RAM on 32-bit ARM systems and * therefore 39-bits are sufficient. 
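 *
 * A worked sketch of the arithmetic, using the constants defined below:
 * with 4K pages (PAGE_SHIFT == 12) each level resolves LPAE_SHIFT == 9
 * bits, giving THIRD_SHIFT == 12, SECOND_SHIFT == 21, FIRST_SHIFT == 30
 * and ZEROETH_SHIFT == 39; so e.g. first_table_offset(va) extracts bits
 * [38:30] of the address, masked to the 512 entries of one table.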
*/ #define LPAE_SHIFT 9 #define LPAE_ENTRIES (1u << LPAE_SHIFT) #define LPAE_ENTRY_MASK (LPAE_ENTRIES - 1) #define THIRD_SHIFT (PAGE_SHIFT) #define THIRD_SIZE ((paddr_t)1 << THIRD_SHIFT) #define THIRD_MASK (~(THIRD_SIZE - 1)) #define SECOND_SHIFT (THIRD_SHIFT + LPAE_SHIFT) #define SECOND_SIZE ((paddr_t)1 << SECOND_SHIFT) #define SECOND_MASK (~(SECOND_SIZE - 1)) #define FIRST_SHIFT (SECOND_SHIFT + LPAE_SHIFT) #define FIRST_SIZE ((paddr_t)1 << FIRST_SHIFT) #define FIRST_MASK (~(FIRST_SIZE - 1)) #define ZEROETH_SHIFT (FIRST_SHIFT + LPAE_SHIFT) #define ZEROETH_SIZE ((paddr_t)1 << ZEROETH_SHIFT) #define ZEROETH_MASK (~(ZEROETH_SIZE - 1)) /* Calculate the offsets into the pagetables for a given VA */ #define zeroeth_linear_offset(va) ((va) >> ZEROETH_SHIFT) #define first_linear_offset(va) ((va) >> FIRST_SHIFT) #define second_linear_offset(va) ((va) >> SECOND_SHIFT) #define third_linear_offset(va) ((va) >> THIRD_SHIFT) #define TABLE_OFFSET(offs) ((unsigned int)(offs) & LPAE_ENTRY_MASK) #define first_table_offset(va) TABLE_OFFSET(first_linear_offset(va)) #define second_table_offset(va) TABLE_OFFSET(second_linear_offset(va)) #define third_table_offset(va) TABLE_OFFSET(third_linear_offset(va)) #define zeroeth_table_offset(va) TABLE_OFFSET(zeroeth_linear_offset(va)) #define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK) #endif /* __ARM_PAGE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/init.h0000664000175000017500000000057612307313555015341 0ustar smbsmb#ifndef _XEN_ASM_INIT_H #define _XEN_ASM_INIT_H struct init_info { /* Pointer to the stack, used by head.S when entering in C */ unsigned char *stack; /* Logical CPU ID, used by start_secondary */ unsigned int cpuid; }; #endif /* _XEN_ASM_INIT_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/asm_defns.h0000664000175000017500000000053212307313555016325 0ustar smbsmb#ifndef __ARM_ASM_DEFNS_H__ #define __ARM_ASM_DEFNS_H__ #ifndef COMPILE_OFFSETS /* NB. 
Auto-generated from arch/.../asm-offsets.c */ #include #endif #include #endif /* __ARM_ASM_DEFNS_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/platforms/0000775000175000017500000000000012307313555016224 5ustar smbsmbxen-4.4.0/xen/include/asm-arm/platforms/vexpress.h0000664000175000017500000000207012307313555020253 0ustar smbsmb#ifndef __ASM_ARM_PLATFORMS_VEXPRESS_H #define __ASM_ARM_PLATFORMS_VEXPRESS_H /* V2M */ #define V2M_SYS_MMIO_BASE (0x1c010000) #define V2M_SYS_FLAGSSET (0x30) #define V2M_SYS_FLAGSCLR (0x34) #define V2M_SYS_CFGDATA (0x00A0) #define V2M_SYS_CFGCTRL (0x00A4) #define V2M_SYS_CFGSTAT (0x00A8) #define V2M_SYS_CFG_START (1<<31) #define V2M_SYS_CFG_WRITE (1<<30) #define V2M_SYS_CFG_ERROR (1<<1) #define V2M_SYS_CFG_COMPLETE (1<<0) #define V2M_SYS_CFG_OSC_FUNC 1 #define V2M_SYS_CFG_OSC0 0 #define V2M_SYS_CFG_OSC1 1 #define V2M_SYS_CFG_OSC2 2 #define V2M_SYS_CFG_OSC3 3 #define V2M_SYS_CFG_OSC4 4 #define V2M_SYS_CFG_OSC5 5 /* Board-specific: base address of system controller */ #define SP810_ADDRESS 0x1C020000 #ifndef __ASSEMBLY__ #include int vexpress_syscfg(int write, int function, int device, uint32_t *data); #endif #endif /* __ASM_ARM_PLATFORMS_VEXPRESS_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/platforms/exynos5.h0000664000175000017500000000126712307313555020015 0ustar smbsmb#ifndef __ASM_ARM_PLATFORMS_EXYNOS5_H #define __ASM_ASM_PLATFORMS_EXYSNO5_H #define EXYNOS5_MCT_BASE 0x101c0000 #define EXYNOS5_MCT_G_TCON 0x240 /* Relative to MCT_BASE */ #define EXYNOS5_MCT_G_TCON_START (1 << 8) #define EXYNOS5_PA_CHIPID 0x10000000 #define EXYNOS5_PA_TIMER 0x12dd0000 /* Base address of system controller */ #define EXYNOS5_PA_PMU 0x10040000 #define EXYNOS5_SWRESET 0x0400 /* Relative to PA_PMU */ #define S5P_PA_SYSRAM 0x02020000 #endif /* __ASM_ARM_PLATFORMS_EXYNOS5_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/platforms/omap5.h0000664000175000017500000000213712307313555017421 0ustar smbsmb#ifndef __ASM_ARM_PLATFORMS_OMAP5_H #define __ASM_ASM_PLATFORMS_OMAP5_H #define REALTIME_COUNTER_BASE 0x48243200 #define INCREMENTER_NUMERATOR_OFFSET 0x10 #define INCREMENTER_DENUMERATOR_RELOAD_OFFSET 0x14 #define NUMERATOR_DENUMERATOR_MASK 0xfffff000 #define PRM_FRAC_INCREMENTER_DENUMERATOR_RELOAD 0x00010000 #define OMAP5_L4_WKUP 0x4AE00000 #define OMAP5_PRM_BASE (OMAP5_L4_WKUP + 0x6000) #define OMAP5_CKGEN_PRM_BASE (OMAP5_PRM_BASE + 0x100) #define OMAP5_CM_CLKSEL_SYS 0x10 #define SYS_CLKSEL_MASK 0xfffffff8 #define OMAP5_PRCM_MPU_BASE 0x48243000 #define OMAP5_WKUPGEN_BASE 0x48281000 #define OMAP5_SRAM_PA 0x40300000 #define OMAP_AUX_CORE_BOOT_0_OFFSET 0x800 #define OMAP_AUX_CORE_BOOT_1_OFFSET 0x804 #endif /* __ASM_ARM_PLATFORMS_OMAP5_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/platforms/midway.h0000664000175000017500000000103012307313555017661 0ustar smbsmb#ifndef __ASM_ARM_PLATFORMS_MIDWAY_H #define __ASM_ASM_PLATFORMS_MIDWAY_H /* addresses of SREG registers for resetting the SoC */ #define MW_SREG_PWR_REQ 0xfff3cf00 #define MW_SREG_A15_PWR_CTRL 0xfff3c200 #define MW_PWR_SUSPEND 0 #define MW_PWR_SOFT_RESET 1 #define MW_PWR_HARD_RESET 2 #define MW_PWR_SHUTDOWN 3 #endif /* __ASM_ARM_PLATFORMS_MIDWAY_H */ /* * 
Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/early_printk.h0000664000175000017500000000214012307313555017066 0ustar smbsmb/* * printk() for use before the final page tables are setup. * * Copyright (C) 2012 Citrix Systems, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #ifndef __ARM_EARLY_PRINTK_H__ #define __ARM_EARLY_PRINTK_H__ #include #ifdef EARLY_PRINTK /* need to add the uart address offset in page to the fixmap address */ #define EARLY_UART_VIRTUAL_ADDRESS \ (FIXMAP_ADDR(FIXMAP_CONSOLE) +(EARLY_UART_BASE_ADDRESS & ~PAGE_MASK)) #endif #ifndef __ASSEMBLY__ #ifdef EARLY_PRINTK void early_printk(const char *fmt, ...) __attribute__((format (printf, 1, 2))); void early_panic(const char *fmt, ...) __attribute__((noreturn)) __attribute__((format (printf, 1, 2))); #else static inline __attribute__((format (printf, 1, 2))) void early_printk(const char *fmt, ...) {} static inline void __attribute__((noreturn)) __attribute__((format (printf, 1, 2))) early_panic(const char *fmt, ...) {while(1);} #endif #endif /* __ASSEMBLY__ */ #endif xen-4.4.0/xen/include/asm-arm/arm64/0000775000175000017500000000000012307313555015146 5ustar smbsmbxen-4.4.0/xen/include/asm-arm/arm64/page.h0000664000175000017500000000641312307313555016237 0ustar smbsmb#ifndef __ARM_ARM64_PAGE_H__ #define __ARM_ARM64_PAGE_H__ #ifndef __ASSEMBLY__ /* Write a pagetable entry */ static inline void write_pte(lpae_t *p, lpae_t pte) { asm volatile ( /* Ensure any writes have completed with the old mappings. */ "dsb sy;" "str %0, [%1];" /* Write the entry */ "dsb sy;" : : "r" (pte.bits), "r" (p) : "memory"); } /* Inline ASM to flush dcache on register R (may be an inline asm operand) */ #define __clean_xen_dcache_one(R) "dc cvac, %" #R ";" /* Inline ASM to clean and invalidate dcache on register R (may be an * inline asm operand) */ #define __clean_and_invalidate_xen_dcache_one(R) "dc civac, %" #R ";" /* * Flush all hypervisor mappings from the TLB * This is needed after changing Xen code mappings. * * The caller needs to issue the necessary DSB and D-cache flushes * before calling flush_xen_text_tlb. */ static inline void flush_xen_text_tlb(void) { asm volatile ( "isb;" /* Ensure synchronization with previous changes to text */ "tlbi alle2;" /* Flush hypervisor TLB */ "ic iallu;" /* Flush I-cache */ "dsb sy;" /* Ensure completion of TLB flush */ "isb;" : : : "memory"); } /* * Flush all hypervisor mappings from the data TLB. This is not * sufficient when changing code mappings or for self modifying code. */ static inline void flush_xen_data_tlb(void) { asm volatile ( "dsb sy;" /* Ensure visibility of PTE writes */ "tlbi alle2;" /* Flush hypervisor TLB */ "dsb sy;" /* Ensure completion of TLB flush */ "isb;" : : : "memory"); } /* * Flush a range of VA's hypervisor mappings from the data TLB. This is not * sufficient when changing code mappings or for self modifying code. 
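 *
 * A minimal pairing sketch, assuming a hypervisor mapping for 'va' has
 * just been changed through its page-table slot 'entry':
 *
 *   write_pte(entry, pte);
 *   flush_xen_data_tlb_range_va(va, PAGE_SIZE);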
*/ static inline void flush_xen_data_tlb_range_va(unsigned long va, unsigned long size) { unsigned long end = va + size; dsb(); /* Ensure preceding are visible */ while ( va < end ) { asm volatile("tlbi vae2, %0;" : : "r" (va>>PAGE_SHIFT) : "memory"); va += PAGE_SIZE; } dsb(); /* Ensure completion of the TLB flush */ isb(); } /* Ask the MMU to translate a VA for us */ static inline uint64_t __va_to_par(vaddr_t va) { uint64_t par, tmp = READ_SYSREG64(PAR_EL1); asm volatile ("at s1e2r, %0;" : : "r" (va)); isb(); par = READ_SYSREG64(PAR_EL1); WRITE_SYSREG64(tmp, PAR_EL1); return par; } /* Ask the MMU to translate a Guest VA for us */ static inline uint64_t gva_to_ma_par(vaddr_t va) { uint64_t par, tmp = READ_SYSREG64(PAR_EL1); asm volatile ("at s12e1r, %0;" : : "r" (va)); isb(); par = READ_SYSREG64(PAR_EL1); WRITE_SYSREG64(tmp, PAR_EL1); return par; } static inline uint64_t gva_to_ipa_par(vaddr_t va) { uint64_t par, tmp = READ_SYSREG64(PAR_EL1); asm volatile ("at s1e1r, %0;" : : "r" (va)); isb(); par = READ_SYSREG64(PAR_EL1); WRITE_SYSREG64(tmp, PAR_EL1); return par; } #endif /* __ASSEMBLY__ */ #endif /* __ARM_ARM64_PAGE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm64/flushtlb.h0000664000175000017500000000211012307313555017134 0ustar smbsmb#ifndef __ASM_ARM_ARM64_FLUSHTLB_H__ #define __ASM_ARM_ARM64_FLUSHTLB_H__ /* Flush local TLBs, current VMID only */ static inline void flush_tlb_local(void) { asm volatile( "dsb sy;" "tlbi vmalle1;" "dsb sy;" "isb;" : : : "memory"); } /* Flush innershareable TLBs, current VMID only */ static inline void flush_tlb(void) { asm volatile( "dsb sy;" "tlbi vmalle1is;" "dsb sy;" "isb;" : : : "memory"); } /* Flush local TLBs, all VMIDs, non-hypervisor mode */ static inline void flush_tlb_all_local(void) { asm volatile( "dsb sy;" "tlbi alle1;" "dsb sy;" "isb;" : : : "memory"); } /* Flush innershareable TLBs, all VMIDs, non-hypervisor mode */ static inline void flush_tlb_all(void) { asm volatile( "dsb sy;" "tlbi alle1is;" "dsb sy;" "isb;" : : : "memory"); } #endif /* __ASM_ARM_ARM64_FLUSHTLB_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm64/processor.h0000664000175000017500000000706512307313555017346 0ustar smbsmb#ifndef __ASM_ARM_ARM64_PROCESSOR_H #define __ASM_ARM_ARM64_PROCESSOR_H #ifndef __ASSEMBLY__ /* Anonymous union includes both 32- and 64-bit names (e.g., r0/x0). 
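 *
 * A minimal sketch of the aliasing, assuming a struct cpu_user_regs
 * pointer 'regs': regs->x0 is the full 64-bit register while regs->r0
 * names the 32-bit view of the same slot, so 32-bit guest state can be
 * read and written without extra casts, e.g.
 *
 *   regs->r0 = ret;    -- same storage as regs->x0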
*/ #define __DECL_REG(n64, n32) union { \ uint64_t n64; \ uint32_t n32; \ } /* On stack VCPU state */ struct cpu_user_regs { /* Aarch64 Aarch32 */ __DECL_REG(x0, r0/*_usr*/); __DECL_REG(x1, r1/*_usr*/); __DECL_REG(x2, r2/*_usr*/); __DECL_REG(x3, r3/*_usr*/); __DECL_REG(x4, r4/*_usr*/); __DECL_REG(x5, r5/*_usr*/); __DECL_REG(x6, r6/*_usr*/); __DECL_REG(x7, r7/*_usr*/); __DECL_REG(x8, r8/*_usr*/); __DECL_REG(x9, r9/*_usr*/); __DECL_REG(x10, r10/*_usr*/); __DECL_REG(x11 , r11/*_usr*/); __DECL_REG(x12, r12/*_usr*/); __DECL_REG(x13, /* r13_usr */ sp_usr); __DECL_REG(x14, /* r14_usr */ lr_usr); __DECL_REG(x15, /* r13_hyp */ __unused_sp_hyp); __DECL_REG(x16, /* r14_irq */ lr_irq); __DECL_REG(x17, /* r13_irq */ sp_irq); __DECL_REG(x18, /* r14_svc */ lr_svc); __DECL_REG(x19, /* r13_svc */ sp_svc); __DECL_REG(x20, /* r14_abt */ lr_abt); __DECL_REG(x21, /* r13_abt */ sp_abt); __DECL_REG(x22, /* r14_und */ lr_und); __DECL_REG(x23, /* r13_und */ sp_und); __DECL_REG(x24, r8_fiq); __DECL_REG(x25, r9_fiq); __DECL_REG(x26, r10_fiq); __DECL_REG(x27, r11_fiq); __DECL_REG(x28, r12_fiq); __DECL_REG(/* x29 */ fp, /* r13_fiq */ sp_fiq); __DECL_REG(/* x30 */ lr, /* r14_fiq */ lr_fiq); register_t sp; /* Valid for hypervisor frames */ /* Return address and mode */ __DECL_REG(pc, pc32); /* ELR_EL2 */ uint32_t cpsr; /* SPSR_EL2 */ uint32_t pad0; /* Align end of kernel frame. */ /* Outer guest frame only from here on... */ union { uint32_t spsr_el1; /* AArch64 */ uint32_t spsr_svc; /* AArch32 */ }; uint32_t pad1; /* Doubleword-align the user half of the frame */ /* AArch32 guests only */ uint32_t spsr_fiq, spsr_irq, spsr_und, spsr_abt; /* AArch64 guests only */ uint64_t sp_el0; uint64_t sp_el1, elr_el1; }; #undef __DECL_REG /* Access to system registers */ #define READ_SYSREG32(name) ({ \ uint32_t _r; \ asm volatile("mrs %0, "#name : "=r" (_r)); \ _r; }) #define WRITE_SYSREG32(v, name) do { \ uint32_t _r = v; \ asm volatile("msr "#name", %0" : : "r" (_r)); \ } while (0) #define WRITE_SYSREG64(v, name) do { \ uint64_t _r = v; \ asm volatile("msr "#name", %0" : : "r" (_r)); \ } while (0) #define READ_SYSREG64(name) ({ \ uint64_t _r; \ asm volatile("mrs %0, "#name : "=r" (_r)); \ _r; }) #define READ_SYSREG(name) READ_SYSREG64(name) #define WRITE_SYSREG(v, name) WRITE_SYSREG64(v, name) #define cpu_has_erratum_766422() 0 #endif /* __ASSEMBLY__ */ #endif /* __ASM_ARM_ARM64_PROCESSOR_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm64/io.h0000664000175000017500000000763012307313555015734 0ustar smbsmb/* * Based on linux arch/arm64/include/asm/io.h which is in turn * Based on arch/arm/include/asm/io.h * * Copyright (C) 1996-2000 Russell King * Copyright (C) 2012 ARM Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef _ARM_ARM64_IO_H #define _ARM_ARM64_IO_H #include /* * Generic IO read/write. These perform native-endian accesses. 
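 *
 * A minimal usage sketch of the accessors defined further down, assuming
 * 'regs' is an ioremapped device base and CTRL/STATUS are hypothetical
 * register offsets:
 *
 *   writel(val, regs + CTRL);              -- ordered against prior writes
 *   status = readl_relaxed(regs + STATUS); -- no implicit barrier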
*/ static inline void __raw_writeb(u8 val, volatile void __iomem *addr) { asm volatile("strb %w0, [%1]" : : "r" (val), "r" (addr)); } static inline void __raw_writew(u16 val, volatile void __iomem *addr) { asm volatile("strh %w0, [%1]" : : "r" (val), "r" (addr)); } static inline void __raw_writel(u32 val, volatile void __iomem *addr) { asm volatile("str %w0, [%1]" : : "r" (val), "r" (addr)); } static inline void __raw_writeq(u64 val, volatile void __iomem *addr) { asm volatile("str %0, [%1]" : : "r" (val), "r" (addr)); } static inline u8 __raw_readb(const volatile void __iomem *addr) { u8 val; asm volatile("ldrb %w0, [%1]" : "=r" (val) : "r" (addr)); return val; } static inline u16 __raw_readw(const volatile void __iomem *addr) { u16 val; asm volatile("ldrh %w0, [%1]" : "=r" (val) : "r" (addr)); return val; } static inline u32 __raw_readl(const volatile void __iomem *addr) { u32 val; asm volatile("ldr %w0, [%1]" : "=r" (val) : "r" (addr)); return val; } static inline u64 __raw_readq(const volatile void __iomem *addr) { u64 val; asm volatile("ldr %0, [%1]" : "=r" (val) : "r" (addr)); return val; } /* IO barriers */ #define __iormb() rmb() #define __iowmb() wmb() #define mmiowb() do { } while (0) /* * Relaxed I/O memory access primitives. These follow the Device memory * ordering rules but do not guarantee any ordering relative to Normal memory * accesses. */ #define readb_relaxed(c) ({ u8 __v = __raw_readb(c); __v; }) #define readw_relaxed(c) ({ u16 __v = le16_to_cpu((__force __le16)__raw_readw(c)); __v; }) #define readl_relaxed(c) ({ u32 __v = le32_to_cpu((__force __le32)__raw_readl(c)); __v; }) #define readq_relaxed(c) ({ u64 __v = le64_to_cpu((__force __le64)__raw_readq(c)); __v; }) #define writeb_relaxed(v,c) ((void)__raw_writeb((v),(c))) #define writew_relaxed(v,c) ((void)__raw_writew((__force u16)cpu_to_le16(v),(c))) #define writel_relaxed(v,c) ((void)__raw_writel((__force u32)cpu_to_le32(v),(c))) #define writeq_relaxed(v,c) ((void)__raw_writeq((__force u64)cpu_to_le64(v),(c))) /* * I/O memory access primitives. Reads are ordered relative to any * following Normal memory access. Writes are ordered relative to any prior * Normal memory access. */ #define readb(c) ({ u8 __v = readb_relaxed(c); __iormb(); __v; }) #define readw(c) ({ u16 __v = readw_relaxed(c); __iormb(); __v; }) #define readl(c) ({ u32 __v = readl_relaxed(c); __iormb(); __v; }) #define readq(c) ({ u64 __v = readq_relaxed(c); __iormb(); __v; }) #define writeb(v,c) ({ __iowmb(); writeb_relaxed((v),(c)); }) #define writew(v,c) ({ __iowmb(); writew_relaxed((v),(c)); }) #define writel(v,c) ({ __iowmb(); writel_relaxed((v),(c)); }) #define writeq(v,c) ({ __iowmb(); writeq_relaxed((v),(c)); }) #endif /* _ARM_ARM64_IO_H */ xen-4.4.0/xen/include/asm-arm/arm64/bitops.h0000664000175000017500000000525512307313555016626 0ustar smbsmb#ifndef _ARM_ARM64_BITOPS_H #define _ARM_ARM64_BITOPS_H /* * Little endian assembly atomic bitops. */ extern void set_bit(int nr, volatile void *p); extern void clear_bit(int nr, volatile void *p); extern void change_bit(int nr, volatile void *p); extern int test_and_set_bit(int nr, volatile void *p); extern int test_and_clear_bit(int nr, volatile void *p); extern int test_and_change_bit(int nr, volatile void *p); /* Based on linux/include/asm-generic/bitops/builtin-__ffs.h */ /** * __ffs - find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. 
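 *
 * A minimal usage sketch, assuming a non-zero bitmask 'pending':
 *
 *   if ( pending )
 *       first = __ffs(pending);   -- index of the lowest set bit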
*/ static /*__*/always_inline unsigned long __ffs(unsigned long word) { return __builtin_ctzl(word); } /* Based on linux/include/asm-generic/bitops/ffz.h */ /* * ffz - find first zero in word. * @word: The word to search * * Undefined if no zero exists, so code should check against ~0UL first. */ #define ffz(x) __ffs(~(x)) /* Based on linux/include/asm-generic/bitops/find.h */ #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at * @size: The bitmap size in bits */ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset); #endif #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at * @size: The bitmap size in bits */ extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset); #endif #ifdef CONFIG_GENERIC_FIND_FIRST_BIT /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum size to search * * Returns the bit number of the first set bit. */ extern unsigned long find_first_bit(const unsigned long *addr, unsigned long size); /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at * @size: The maximum size to search * * Returns the bit number of the first cleared bit. */ extern unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size); #else /* CONFIG_GENERIC_FIND_FIRST_BIT */ #define find_first_bit(addr, size) find_next_bit((addr), (size), 0) #define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ #endif /* _ARM_ARM64_BITOPS_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm64/atomic.h0000664000175000017500000000745612307313555016607 0ustar smbsmb/* * Based on arch/arm64/include/asm/atomic.h * which in turn is * Based on arch/arm/include/asm/atomic.h * * Copyright (C) 1996 Russell King. * Copyright (C) 2002 Deep Blue Solutions Ltd. * Copyright (C) 2012 ARM Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __ARCH_ARM_ARM64_ATOMIC #define __ARCH_ARM_ARM64_ATOMIC /* * AArch64 UP and SMP safe atomic ops. We use load exclusive and * store exclusive to ensure that these are atomic. We may loop * to ensure that the update happens. 
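 *
 * A minimal usage sketch, assuming an atomic_t reference count 'refcnt'
 * initialised elsewhere:
 *
 *   atomic_inc(&refcnt);                 -- take a reference
 *   if ( atomic_dec_and_test(&refcnt) )  -- drop it; true when it hits 0
 *       -- free the object --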
*/ static inline void atomic_add(int i, atomic_t *v) { unsigned long tmp; int result; asm volatile("// atomic_add\n" "1: ldxr %w0, %2\n" " add %w0, %w0, %w3\n" " stxr %w1, %w0, %2\n" " cbnz %w1, 1b" : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) : "Ir" (i) : "cc"); } static inline int atomic_add_return(int i, atomic_t *v) { unsigned long tmp; int result; asm volatile("// atomic_add_return\n" "1: ldaxr %w0, %2\n" " add %w0, %w0, %w3\n" " stlxr %w1, %w0, %2\n" " cbnz %w1, 1b" : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) : "Ir" (i) : "cc", "memory"); return result; } static inline void atomic_sub(int i, atomic_t *v) { unsigned long tmp; int result; asm volatile("// atomic_sub\n" "1: ldxr %w0, %2\n" " sub %w0, %w0, %w3\n" " stxr %w1, %w0, %2\n" " cbnz %w1, 1b" : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) : "Ir" (i) : "cc"); } static inline int atomic_sub_return(int i, atomic_t *v) { unsigned long tmp; int result; asm volatile("// atomic_sub_return\n" "1: ldaxr %w0, %2\n" " sub %w0, %w0, %w3\n" " stlxr %w1, %w0, %2\n" " cbnz %w1, 1b" : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) : "Ir" (i) : "cc", "memory"); return result; } static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) { unsigned long tmp; int oldval; asm volatile("// atomic_cmpxchg\n" "1: ldaxr %w1, %2\n" " cmp %w1, %w3\n" " b.ne 2f\n" " stlxr %w0, %w4, %2\n" " cbnz %w0, 1b\n" "2:" : "=&r" (tmp), "=&r" (oldval), "+Q" (ptr->counter) : "Ir" (old), "r" (new) : "cc", "memory"); return oldval; } static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) { unsigned long tmp, tmp2; asm volatile("// atomic_clear_mask\n" "1: ldxr %0, %2\n" " bic %0, %0, %3\n" " stxr %w1, %0, %2\n" " cbnz %w1, 1b" : "=&r" (tmp), "=&r" (tmp2), "+Q" (*addr) : "Ir" (mask) : "cc"); } #define atomic_xchg(v, new) (xchg(&((v)->counter), new)) static inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c, old; c = atomic_read(v); while (c != u && (old = atomic_cmpxchg((v), c, c + a)) != c) c = old; return c; } #define atomic_inc(v) atomic_add(1, v) #define atomic_dec(v) atomic_sub(1, v) #define atomic_inc_and_test(v) (atomic_add_return(1, v) == 0) #define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) #define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0) #define atomic_add_negative(i,v) (atomic_add_return(i, v) < 0) #define smp_mb__before_atomic_dec() smp_mb() #define smp_mb__after_atomic_dec() smp_mb() #define smp_mb__before_atomic_inc() smp_mb() #define smp_mb__after_atomic_inc() smp_mb() #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm64/system.h0000664000175000017500000002111512307313555016643 0ustar smbsmb/* Portions taken from Linux arch arm64 */ #ifndef __ASM_ARM64_SYSTEM_H #define __ASM_ARM64_SYSTEM_H #define sev() asm volatile("sev" : : : "memory") #define wfe() asm volatile("wfe" : : : "memory") #define wfi() asm volatile("wfi" : : : "memory") #define isb() asm volatile("isb" : : : "memory") #define dsb() asm volatile("dsb sy" : : : "memory") #define dmb() asm volatile("dmb sy" : : : "memory") #define mb() dsb() #define rmb() dsb() #define wmb() mb() #define smp_mb() mb() #define smp_rmb() rmb() #define smp_wmb() wmb() extern void __bad_xchg(volatile void *, int); static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size) { unsigned long ret, tmp; switch 
(size) { case 1: asm volatile("// __xchg1\n" "1: ldaxrb %w0, %2\n" " stlxrb %w1, %w3, %2\n" " cbnz %w1, 1b\n" : "=&r" (ret), "=&r" (tmp), "+Q" (*(u8 *)ptr) : "r" (x) : "cc", "memory"); break; case 2: asm volatile("// __xchg2\n" "1: ldaxrh %w0, %2\n" " stlxrh %w1, %w3, %2\n" " cbnz %w1, 1b\n" : "=&r" (ret), "=&r" (tmp), "+Q" (*(u16 *)ptr) : "r" (x) : "cc", "memory"); break; case 4: asm volatile("// __xchg4\n" "1: ldaxr %w0, %2\n" " stlxr %w1, %w3, %2\n" " cbnz %w1, 1b\n" : "=&r" (ret), "=&r" (tmp), "+Q" (*(u32 *)ptr) : "r" (x) : "cc", "memory"); break; case 8: asm volatile("// __xchg8\n" "1: ldaxr %0, %2\n" " stlxr %w1, %3, %2\n" " cbnz %w1, 1b\n" : "=&r" (ret), "=&r" (tmp), "+Q" (*(u64 *)ptr) : "r" (x) : "cc", "memory"); break; default: __bad_xchg(ptr, size), ret = 0; break; } return ret; } #define xchg(ptr,x) \ ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) extern void __bad_cmpxchg(volatile void *ptr, int size); static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) { unsigned long oldval = 0, res; switch (size) { case 1: do { asm volatile("// __cmpxchg1\n" " ldxrb %w1, %2\n" " mov %w0, #0\n" " cmp %w1, %w3\n" " b.ne 1f\n" " stxrb %w0, %w4, %2\n" "1:\n" : "=&r" (res), "=&r" (oldval), "+Q" (*(u8 *)ptr) : "Ir" (old), "r" (new) : "cc"); } while (res); break; case 2: do { asm volatile("// __cmpxchg2\n" " ldxrh %w1, %2\n" " mov %w0, #0\n" " cmp %w1, %w3\n" " b.ne 1f\n" " stxrh %w0, %w4, %2\n" "1:\n" : "=&r" (res), "=&r" (oldval), "+Q" (*(u16 *)ptr) : "Ir" (old), "r" (new) : "cc"); } while (res); break; case 4: do { asm volatile("// __cmpxchg4\n" " ldxr %w1, %2\n" " mov %w0, #0\n" " cmp %w1, %w3\n" " b.ne 1f\n" " stxr %w0, %w4, %2\n" "1:\n" : "=&r" (res), "=&r" (oldval), "+Q" (*(u32 *)ptr) : "Ir" (old), "r" (new) : "cc"); } while (res); break; case 8: do { asm volatile("// __cmpxchg8\n" " ldxr %1, %2\n" " mov %w0, #0\n" " cmp %1, %3\n" " b.ne 1f\n" " stxr %w0, %4, %2\n" "1:\n" : "=&r" (res), "=&r" (oldval), "+Q" (*(u64 *)ptr) : "Ir" (old), "r" (new) : "cc"); } while (res); break; default: __bad_cmpxchg(ptr, size); oldval = 0; } return oldval; } static inline unsigned long __cmpxchg_mb(volatile void *ptr, unsigned long old, unsigned long new, int size) { unsigned long ret; smp_mb(); ret = __cmpxchg(ptr, old, new, size); smp_mb(); return ret; } #define cmpxchg(ptr,o,n) \ ((__typeof__(*(ptr)))__cmpxchg_mb((ptr), \ (unsigned long)(o), \ (unsigned long)(n), \ sizeof(*(ptr)))) #define cmpxchg_local(ptr,o,n) \ ((__typeof__(*(ptr)))__cmpxchg((ptr), \ (unsigned long)(o), \ (unsigned long)(n), \ sizeof(*(ptr)))) /* Uses uimm4 as a bitmask to select the clearing of one or more of * the DAIF exception mask bits: * bit 3 selects the D mask, * bit 2 the A mask, * bit 1 the I mask and * bit 0 the F mask. 
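 *
 * A minimal usage sketch of the helpers below, assuming a short critical
 * section that must run with IRQs masked:
 *
 *   unsigned long flags;
 *   local_irq_save(flags);
 *   -- critical section --
 *   local_irq_restore(flags);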
*/ #define local_fiq_disable() asm volatile ( "msr daifset, #1\n" ::: "memory" ) #define local_fiq_enable() asm volatile ( "msr daifclr, #1\n" ::: "memory" ) #define local_irq_disable() asm volatile ( "msr daifset, #2\n" ::: "memory" ) #define local_irq_enable() asm volatile ( "msr daifclr, #2\n" ::: "memory" ) #define local_abort_disable() asm volatile ( "msr daifset, #4\n" ::: "memory" ) #define local_abort_enable() asm volatile ( "msr daifclr, #4\n" ::: "memory" ) #define local_save_flags(x) \ ({ \ BUILD_BUG_ON(sizeof(x) != sizeof(long)); \ asm volatile( \ "mrs %0, daif // local_save_flags\n" \ : "=r" (x) \ : \ : "memory"); \ }) #define local_irq_save(x) \ ({ \ local_save_flags(x); \ local_irq_disable(); \ }) #define local_irq_restore(x) \ ({ \ BUILD_BUG_ON(sizeof(x) != sizeof(long)); \ asm volatile ( \ "msr daif, %0 // local_irq_restore" \ : \ : "r" (flags) \ : "memory"); \ }) static inline int local_irq_is_enabled(void) { unsigned long flags; local_save_flags(flags); return !(flags & PSR_IRQ_MASK); } static inline int local_fiq_is_enabled(void) { unsigned long flags; local_save_flags(flags); return !(flags & PSR_FIQ_MASK); } #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm64/spinlock.h0000664000175000017500000000602512307313555017144 0ustar smbsmb/* * Derived from Linux arch64 spinlock.h which is: * Copyright (C) 2012 ARM Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __ASM_ARM64_SPINLOCK_H #define __ASM_ARM64_SPINLOCK_H typedef struct { volatile unsigned int lock; } raw_spinlock_t; #define _RAW_SPIN_LOCK_UNLOCKED { 0 } #define _raw_spin_is_locked(x) ((x)->lock != 0) static always_inline void _raw_spin_unlock(raw_spinlock_t *lock) { ASSERT(_raw_spin_is_locked(lock)); asm volatile( " stlr %w1, %0\n" : "=Q" (lock->lock) : "r" (0) : "memory"); } static always_inline int _raw_spin_trylock(raw_spinlock_t *lock) { unsigned int tmp; asm volatile( "2: ldaxr %w0, %1\n" " cbnz %w0, 1f\n" " stxr %w0, %w2, %1\n" " cbnz %w0, 2b\n" "1:\n" : "=&r" (tmp), "+Q" (lock->lock) : "r" (1) : "cc", "memory"); return !tmp; } typedef struct { volatile unsigned int lock; } raw_rwlock_t; #define _RAW_RW_LOCK_UNLOCKED { 0 } static always_inline int _raw_read_trylock(raw_rwlock_t *rw) { unsigned int tmp, tmp2 = 1; asm volatile( " ldaxr %w0, %2\n" " add %w0, %w0, #1\n" " tbnz %w0, #31, 1f\n" " stxr %w1, %w0, %2\n" "1:\n" : "=&r" (tmp), "+r" (tmp2), "+Q" (rw->lock) : : "cc", "memory"); return !tmp2; } static always_inline int _raw_write_trylock(raw_rwlock_t *rw) { unsigned int tmp; asm volatile( " ldaxr %w0, %1\n" " cbnz %w0, 1f\n" " stxr %w0, %w2, %1\n" "1:\n" : "=&r" (tmp), "+Q" (rw->lock) : "r" (0x80000000) : "cc", "memory"); return !tmp; } static inline void _raw_read_unlock(raw_rwlock_t *rw) { unsigned int tmp, tmp2; asm volatile( " 1: ldxr %w0, %2\n" " sub %w0, %w0, #1\n" " stlxr %w1, %w0, %2\n" " cbnz %w1, 1b\n" : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock) : : "cc", "memory"); } static inline void _raw_write_unlock(raw_rwlock_t *rw) { asm volatile( " stlr %w1, %0\n" : "=Q" (rw->lock) : "r" (0) : "memory"); } #define _raw_rw_is_locked(x) ((x)->lock != 0) #define _raw_rw_is_write_locked(x) ((x)->lock == 0x80000000) #endif /* __ASM_SPINLOCK_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm64/vfp.h0000664000175000017500000000046412307313555016116 0ustar smbsmb#ifndef _ARM_ARM64_VFP_H #define _ARM_ARM64_VFP_H struct vfp_state { uint64_t fpregs[64]; uint32_t fpcr; uint32_t fpexc32_el2; uint32_t fpsr; }; #endif /* _ARM_ARM64_VFP_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/time.h0000664000175000017500000000174212307313555015330 0ustar smbsmb#ifndef __ARM_TIME_H__ #define __ARM_TIME_H__ #define DT_MATCH_TIMER \ DT_MATCH_COMPATIBLE("arm,armv7-timer"), \ DT_MATCH_COMPATIBLE("arm,armv8-timer") typedef unsigned long cycles_t; static inline cycles_t get_cycles (void) { return 0; } struct tm; struct tm wallclock_time(void); /* List of timer's IRQ */ enum timer_ppi { TIMER_PHYS_SECURE_PPI = 0, TIMER_PHYS_NONSECURE_PPI = 1, TIMER_VIRT_PPI = 2, TIMER_HYP_PPI = 3, MAX_TIMER_PPI = 4, }; /* Get one of the timer IRQ description */ const struct dt_irq* timer_dt_irq(enum timer_ppi ppi); /* Route timer's IRQ on this CPU */ extern void __cpuinit route_timer_interrupt(void); /* Set up the timer interrupt on this CPU */ extern void __cpuinit init_timer_interrupt(void); /* Counter value at boot time */ extern uint64_t boot_count; #endif /* __ARM_TIME_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/cache.h0000664000175000017500000000057412307313555015437 0ustar smbsmb#ifndef __ARCH_ARM_CACHE_H #define __ARCH_ARM_CACHE_H #include /* L1 cache line size */ #define L1_CACHE_SHIFT (CONFIG_ARM_L1_CACHE_SHIFT) 
#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) #define __read_mostly __section(".data.read_mostly") #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/flushtlb.h0000664000175000017500000000155512307313555016217 0ustar smbsmb#ifndef __ASM_ARM_FLUSHTLB_H__ #define __ASM_ARM_FLUSHTLB_H__ #include /* * Filter the given set of CPUs, removing those that definitely flushed their * TLB since @page_timestamp. */ /* XXX lazy implementation just doesn't clear anything.... */ #define tlbflush_filter(mask, page_timestamp) \ do { \ } while ( 0 ) #define tlbflush_current_time() (0) #if defined(CONFIG_ARM_32) # include #elif defined(CONFIG_ARM_64) # include #else # error "unknown ARM variant" #endif /* Flush specified CPUs' TLBs */ void flush_tlb_mask(const cpumask_t *mask); #endif /* __ASM_ARM_FLUSHTLB_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/event.h0000664000175000017500000000327012307313555015511 0ustar smbsmb#ifndef __ASM_EVENT_H__ #define __ASM_EVENT_H__ #include #include void vcpu_kick(struct vcpu *v); void vcpu_mark_events_pending(struct vcpu *v); static inline int vcpu_event_delivery_is_enabled(struct vcpu *v) { struct cpu_user_regs *regs = &v->arch.cpu_info->guest_cpu_user_regs; return !(regs->cpsr & PSR_IRQ_MASK); } static inline int local_events_need_delivery_nomask(void) { struct pending_irq *p = irq_to_pending(current, current->domain->arch.evtchn_irq); /* XXX: if the first interrupt has already been delivered, we should * check whether any other interrupts with priority higher than the * one in GICV_IAR are in the lr_pending queue or in the LR * registers and return 1 only in that case. * In practice the guest interrupt handler should run with * interrupts disabled so this shouldn't be a problem in the general * case. */ if ( gic_events_need_delivery() ) return 1; if ( vcpu_info(current, evtchn_upcall_pending) && list_empty(&p->inflight) ) return 1; return 0; } static inline int local_events_need_delivery(void) { if ( !vcpu_event_delivery_is_enabled(current) ) return 0; return local_events_need_delivery_nomask(); } static inline void local_event_delivery_enable(void) { struct cpu_user_regs *regs = guest_cpu_user_regs(); regs->cpsr &= ~PSR_IRQ_MASK; } /* No arch specific virq definition now. Default to global. */ static inline int arch_virq_is_global(int virq) { return 1; } #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/domain.h0000664000175000017500000002021612307313555015636 0ustar smbsmb#ifndef __ASM_DOMAIN_H__ #define __ASM_DOMAIN_H__ #include #include #include #include #include #include #include #include /* Represents state corresponding to a block of 32 interrupts */ struct vgic_irq_rank { spinlock_t lock; /* Covers access to all other members of this struct */ uint32_t ienable, iactive, ipend, pendsgi; uint32_t icfg[2]; uint32_t ipriority[8]; uint32_t itargets[8]; }; struct pending_irq { int irq; /* * The following two states track the lifecycle of the guest irq. * However because we are not sure and we don't want to track * whether an irq added to an LR register is PENDING or ACTIVE, the * following states are just an approximation. * * GIC_IRQ_GUEST_PENDING: the irq is asserted * * GIC_IRQ_GUEST_VISIBLE: the irq has been added to an LR register, * therefore the guest is aware of it. 
From the guest point of view * the irq can be pending (if the guest has not acked the irq yet) * or active (after acking the irq). * * In order for the state machine to be fully accurate, for level * interrupts, we should keep the GIC_IRQ_GUEST_PENDING state until * the guest deactivates the irq. However because we are not sure * when that happens, we simply remove the GIC_IRQ_GUEST_PENDING * state when we add the irq to an LR register. We add it back when * we receive another interrupt notification. * Therefore it is possible to set GIC_IRQ_GUEST_PENDING while the * irq is GIC_IRQ_GUEST_VISIBLE. We could also change the state of * the guest irq in the LR register from active to active and * pending, but for simplicity we simply inject a second irq after * the guest EOIs the first one. * * * An additional state is used to keep track of whether the guest * irq is enabled at the vgicd level: * * GIC_IRQ_GUEST_ENABLED: the guest IRQ is enabled at the VGICD * level (GICD_ICENABLER/GICD_ISENABLER). * */ #define GIC_IRQ_GUEST_PENDING 0 #define GIC_IRQ_GUEST_VISIBLE 1 #define GIC_IRQ_GUEST_ENABLED 2 unsigned long status; struct irq_desc *desc; /* only set it the irq corresponds to a physical irq */ uint8_t priority; /* inflight is used to append instances of pending_irq to * vgic.inflight_irqs */ struct list_head inflight; /* lr_queue is used to append instances of pending_irq to * gic.lr_pending */ struct list_head lr_queue; }; struct hvm_domain { uint64_t params[HVM_NR_PARAMS]; } __cacheline_aligned; #ifdef CONFIG_ARM_64 enum domain_type { DOMAIN_PV32, DOMAIN_PV64, }; #define is_pv32_domain(d) ((d)->arch.type == DOMAIN_PV32) #define is_pv64_domain(d) ((d)->arch.type == DOMAIN_PV64) #else #define is_pv32_domain(d) (1) #define is_pv64_domain(d) (0) #endif extern int dom0_11_mapping; #define is_domain_direct_mapped(d) ((d) == dom0 && dom0_11_mapping) struct vtimer { struct vcpu *v; int irq; struct timer timer; uint32_t ctl; uint64_t cval; }; struct arch_domain { #ifdef CONFIG_ARM_64 enum domain_type type; #endif /* Virtual MMU */ struct p2m_domain p2m; uint64_t vttbr; struct hvm_domain hvm_domain; xen_pfn_t *grant_table_gpfn; /* Continuable domain_relinquish_resources(). */ enum { RELMEM_not_started, RELMEM_xen, RELMEM_page, RELMEM_mapping, RELMEM_done, } relmem; /* Virtual CPUID */ uint32_t vpidr; struct { uint64_t offset; } phys_timer_base; struct { uint64_t offset; } virt_timer_base; struct { /* * Covers access to other members of this struct _except_ for * shared_irqs where each member contains its own locking. * * If both class of lock is required then this lock must be * taken first. If multiple rank locks are required (including * the per-vcpu private_irqs rank) then they must be taken in * rank order. */ spinlock_t lock; int ctlr; int nr_lines; /* Number of SPIs */ struct vgic_irq_rank *shared_irqs; /* * SPIs are domain global, SGIs and PPIs are per-VCPU and stored in * struct arch_vcpu. 
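 *
 * A minimal lookup sketch, assuming the usual split implemented via
 * irq_to_pending(): interrupts below 32 (SGIs/PPIs) come from the vcpu's
 * private pending_irqs[] in struct arch_vcpu, while anything above comes
 * from this domain-global array.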
*/ struct pending_irq *pending_irqs; /* Base address for guest GIC */ paddr_t dbase; /* Distributor base address */ paddr_t cbase; /* CPU base address */ } vgic; struct vuart { #define VUART_BUF_SIZE 128 char *buf; int idx; const struct vuart_info *info; spinlock_t lock; } vuart; unsigned int evtchn_irq; } __cacheline_aligned; struct arch_vcpu { struct { #ifdef CONFIG_ARM_32 register_t r4; register_t r5; register_t r6; register_t r7; register_t r8; register_t r9; register_t sl; #else register_t x19; register_t x20; register_t x21; register_t x22; register_t x23; register_t x24; register_t x25; register_t x26; register_t x27; register_t x28; #endif register_t fp; register_t sp; register_t pc; } saved_context; void *stack; /* * Points into ->stack, more convenient than doing pointer arith * all the time. */ struct cpu_info *cpu_info; /* Fault Status */ #ifdef CONFIG_ARM_32 uint32_t dfsr; uint32_t dfar, ifar; #else uint64_t far; uint32_t esr; #endif uint32_t ifsr; /* 32-bit guests only */ uint32_t afsr0, afsr1; /* MMU */ register_t vbar; register_t ttbcr; uint64_t ttbr0, ttbr1; uint32_t dacr; /* 32-bit guests only */ uint64_t par; #ifdef CONFIG_ARM_32 uint32_t mair0, mair1; uint32_t amair0, amair1; #else uint64_t mair; uint64_t amair; #endif /* Control Registers */ uint32_t actlr, sctlr; uint32_t cpacr; uint32_t contextidr; register_t tpidr_el0; register_t tpidr_el1; register_t tpidrro_el0; uint32_t teecr, teehbr; /* ThumbEE, 32-bit guests only */ #ifdef CONFIG_ARM_32 /* * ARMv8 only supports a trivial implementation on Jazelle when in AArch32 * mode and therefore has no extended control registers. */ uint32_t joscr, jmcr; #endif /* Float-pointer */ struct vfp_state vfp; /* CP 15 */ uint32_t csselr; register_t vmpidr; uint32_t gic_hcr, gic_vmcr, gic_apr; uint32_t gic_lr[64]; uint64_t event_mask; uint64_t lr_mask; struct { /* * SGIs and PPIs are per-VCPU, SPIs are domain global and in * struct arch_domain. */ struct pending_irq pending_irqs[32]; struct vgic_irq_rank private_irqs; /* This list is ordered by IRQ priority and it is used to keep * track of the IRQs that the VGIC injected into the guest. * Depending on the availability of LR registers, the IRQs might * actually be in an LR, and therefore injected into the guest, * or queued in gic.lr_pending. * As soon as an IRQ is EOI'd by the guest and removed from the * corresponding LR it is also removed from this list. */ struct list_head inflight_irqs; /* lr_pending is used to queue IRQs (struct pending_irq) that the * vgic tried to inject in the guest (calling gic_set_guest_irq) but * no LRs were available at the time. * As soon as an LR is freed we remove the first IRQ from this * list and write it to the LR register. * lr_pending is a subset of vgic.inflight_irqs. 
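 *
 * A minimal lifecycle sketch implied by the comments above: an asserted
 * IRQ is appended to inflight_irqs; if no LR register is free it is also
 * queued on lr_pending; when an LR frees up the head of lr_pending is
 * written to it, and once the guest EOIs the IRQ it is removed from
 * inflight_irqs as well.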
*/ struct list_head lr_pending; spinlock_t lock; } vgic; struct vtimer phys_timer; struct vtimer virt_timer; } __cacheline_aligned; void vcpu_show_execution_state(struct vcpu *); void vcpu_show_registers(const struct vcpu *); #endif /* __ASM_DOMAIN_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/iocap.h0000664000175000017500000000066012307313555015463 0ustar smbsmb#ifndef __X86_IOCAP_H__ #define __X86_IOCAP_H__ #define cache_flush_permitted(d) \ (!rangeset_is_empty((d)->iomem_caps)) #define multipage_allocation_permitted(d, order) \ (((order) <= 9) || /* allow 2MB superpages */ \ !rangeset_is_empty((d)->iomem_caps)) #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/platform.h0000664000175000017500000000466712307313555016227 0ustar smbsmb#ifndef __ASM_ARM_PLATFORM_H #define __ASM_ARM_PLATFORM_H #include #include #include #include /* Describe specific operation for a board */ struct platform_desc { /* Platform name */ const char *name; /* Array of device tree 'compatible' strings */ const char *const *compatible; /* Platform initialization */ int (*init)(void); int (*init_time)(void); #ifdef CONFIG_ARM_32 /* SMP */ int (*smp_init)(void); int (*cpu_up)(int cpu); #endif /* Specific mapping for dom0 */ int (*specific_mapping)(struct domain *d); /* Platform reset */ void (*reset)(void); /* Platform power-off */ void (*poweroff)(void); /* * Platform quirks * Defined has a function because a platform can support multiple * board with different quirk on each */ uint32_t (*quirks)(void); /* * Platform blacklist devices * List of devices which must not pass-through to a guest */ const struct dt_device_match *blacklist_dev; /* * The IRQ (PPI) to use to inject event channels to dom0. */ unsigned int dom0_evtchn_ppi; /* * The location of a region of physical address space which dom0 * can use for grant table mappings. If size is zero defaults to * 0xb0000000-0xb0020000. */ paddr_t dom0_gnttab_start, dom0_gnttab_size; }; /* * Quirk for platforms where the 4K GIC register ranges are placed at * 64K stride. 
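 *
 * A minimal usage sketch, assuming platform code choosing a register
 * stride ('stride' and the literal sizes are illustrative):
 *
 *   if ( platform_has_quirk(PLATFORM_QUIRK_GIC_64K_STRIDE) )
 *       stride = 0x10000;
 *   else
 *       stride = 0x1000;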
*/ #define PLATFORM_QUIRK_GIC_64K_STRIDE (1 << 0) void __init platform_init(void); int __init platform_init_time(void); int __init platform_specific_mapping(struct domain *d); #ifdef CONFIG_ARM_32 int platform_smp_init(void); int platform_cpu_up(int cpu); #endif void platform_reset(void); void platform_poweroff(void); bool_t platform_has_quirk(uint32_t quirk); bool_t platform_device_is_blacklisted(const struct dt_device_node *node); unsigned int platform_dom0_evtchn_ppi(void); void platform_dom0_gnttab(paddr_t *start, paddr_t *size); #define PLATFORM_START(_name, _namestr) \ static const struct platform_desc __plat_desc_##_name __used \ __attribute__((__section__(".arch.info"))) = { \ .name = _namestr, #define PLATFORM_END \ }; #endif /* __ASM_ARM_PLATFORM_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/irq.h0000664000175000017500000000214512307313555015163 0ustar smbsmb#ifndef _ASM_HW_IRQ_H #define _ASM_HW_IRQ_H #include #include #define NR_VECTORS 256 /* XXX */ typedef struct { DECLARE_BITMAP(_bits,NR_VECTORS); } vmask_t; struct arch_pirq { }; struct irq_cfg { #define arch_irq_desc irq_cfg int eoi_cpu; }; #define NR_LOCAL_IRQS 32 #define NR_IRQS 1024 #define nr_irqs NR_IRQS #define nr_irqs NR_IRQS #define nr_static_irqs NR_IRQS struct irq_desc; struct irqaction; struct irq_desc *__irq_to_desc(int irq); #define irq_to_desc(irq) __irq_to_desc(irq) void do_IRQ(struct cpu_user_regs *regs, unsigned int irq, int is_fiq); #define domain_pirq_to_irq(d, pirq) (pirq) void init_IRQ(void); void init_secondary_IRQ(void); int __init request_dt_irq(const struct dt_irq *irq, void (*handler)(int, void *, struct cpu_user_regs *), const char *devname, void *dev_id); int __init setup_dt_irq(const struct dt_irq *irq, struct irqaction *new); #endif /* _ASM_HW_IRQ_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/trace.h0000664000175000017500000000030112307313555015456 0ustar smbsmb#ifndef __ASM_TRACE_H__ #define __ASM_TRACE_H__ #endif /* __ASM_TRACE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/hardirq.h0000664000175000017500000000123512307313555016021 0ustar smbsmb#ifndef __ASM_HARDIRQ_H #define __ASM_HARDIRQ_H #include #include #include typedef struct { unsigned long __softirq_pending; unsigned int __local_irq_count; } __cacheline_aligned irq_cpustat_t; #include /* Standard mappings for irq_cpustat_t above */ #define in_irq() (local_irq_count(smp_processor_id()) != 0) #define irq_enter() (local_irq_count(smp_processor_id())++) #define irq_exit() (local_irq_count(smp_processor_id())--) #endif /* __ASM_HARDIRQ_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/processor.h0000664000175000017500000003706612307313555016421 0ustar smbsmb#ifndef __ASM_ARM_PROCESSOR_H #define __ASM_ARM_PROCESSOR_H #include #include /* MIDR Main ID Register */ #define MIDR_MASK 0xff0ffff0 /* MPIDR Multiprocessor Affinity Register */ #define _MPIDR_UP (30) #define MPIDR_UP (_AC(1,U) << _MPIDR_UP) #define _MPIDR_SMP (31) #define MPIDR_SMP (_AC(1,U) << _MPIDR_SMP) #define MPIDR_AFF0_SHIFT (0) #define MPIDR_AFF0_MASK (_AC(0xff,U) << MPIDR_AFF0_SHIFT) #define MPIDR_HWID_MASK _AC(0xffffff,U) #define MPIDR_INVALID (~MPIDR_HWID_MASK) /* TTBCR Translation Table Base 
Control Register */ #define TTBCR_EAE _AC(0x80000000,U) #define TTBCR_N_MASK _AC(0x07,U) #define TTBCR_N_16KB _AC(0x00,U) #define TTBCR_N_8KB _AC(0x01,U) #define TTBCR_N_4KB _AC(0x02,U) #define TTBCR_N_2KB _AC(0x03,U) #define TTBCR_N_1KB _AC(0x04,U) /* SCTLR System Control Register. */ /* HSCTLR is a subset of this. */ #define SCTLR_TE (_AC(1,U)<<30) #define SCTLR_AFE (_AC(1,U)<<29) #define SCTLR_TRE (_AC(1,U)<<28) #define SCTLR_NMFI (_AC(1,U)<<27) #define SCTLR_EE (_AC(1,U)<<25) #define SCTLR_VE (_AC(1,U)<<24) #define SCTLR_U (_AC(1,U)<<22) #define SCTLR_FI (_AC(1,U)<<21) #define SCTLR_WXN (_AC(1,U)<<19) #define SCTLR_HA (_AC(1,U)<<17) #define SCTLR_RR (_AC(1,U)<<14) #define SCTLR_V (_AC(1,U)<<13) #define SCTLR_I (_AC(1,U)<<12) #define SCTLR_Z (_AC(1,U)<<11) #define SCTLR_SW (_AC(1,U)<<10) #define SCTLR_B (_AC(1,U)<<7) #define SCTLR_C (_AC(1,U)<<2) #define SCTLR_A (_AC(1,U)<<1) #define SCTLR_M (_AC(1,U)<<0) #define HSCTLR_BASE _AC(0x30c51878,U) /* HCR Hyp Configuration Register */ #define HCR_RW (_AC(1,UL)<<31) /* Register Width, ARM64 only */ #define HCR_TGE (_AC(1,UL)<<27) /* Trap General Exceptions */ #define HCR_TVM (_AC(1,UL)<<26) /* Trap Virtual Memory Controls */ #define HCR_TTLB (_AC(1,UL)<<25) /* Trap TLB Maintenance Operations */ #define HCR_TPU (_AC(1,UL)<<24) /* Trap Cache Maintenance Operations to PoU */ #define HCR_TPC (_AC(1,UL)<<23) /* Trap Cache Maintenance Operations to PoC */ #define HCR_TSW (_AC(1,UL)<<22) /* Trap Set/Way Cache Maintenance Operations */ #define HCR_TAC (_AC(1,UL)<<21) /* Trap ACTLR Accesses */ #define HCR_TIDCP (_AC(1,UL)<<20) /* Trap lockdown */ #define HCR_TSC (_AC(1,UL)<<19) /* Trap SMC instruction */ #define HCR_TID3 (_AC(1,UL)<<18) /* Trap ID Register Group 3 */ #define HCR_TID2 (_AC(1,UL)<<17) /* Trap ID Register Group 2 */ #define HCR_TID1 (_AC(1,UL)<<16) /* Trap ID Register Group 1 */ #define HCR_TID0 (_AC(1,UL)<<15) /* Trap ID Register Group 0 */ #define HCR_TWE (_AC(1,UL)<<14) /* Trap WFE instruction */ #define HCR_TWI (_AC(1,UL)<<13) /* Trap WFI instruction */ #define HCR_DC (_AC(1,UL)<<12) /* Default cacheable */ #define HCR_BSU_MASK (_AC(3,UL)<<10) /* Barrier Shareability Upgrade */ #define HCR_BSU_NONE (_AC(0,UL)<<10) #define HCR_BSU_INNER (_AC(1,UL)<<10) #define HCR_BSU_OUTER (_AC(2,UL)<<10) #define HCR_BSU_FULL (_AC(3,UL)<<10) #define HCR_FB (_AC(1,UL)<<9) /* Force Broadcast of Cache/BP/TLB operations */ #define HCR_VA (_AC(1,UL)<<8) /* Virtual Asynchronous Abort */ #define HCR_VI (_AC(1,UL)<<7) /* Virtual IRQ */ #define HCR_VF (_AC(1,UL)<<6) /* Virtual FIQ */ #define HCR_AMO (_AC(1,UL)<<5) /* Override CPSR.A */ #define HCR_IMO (_AC(1,UL)<<4) /* Override CPSR.I */ #define HCR_FMO (_AC(1,UL)<<3) /* Override CPSR.F */ #define HCR_PTW (_AC(1,UL)<<2) /* Protected Walk */ #define HCR_SWIO (_AC(1,UL)<<1) /* Set/Way Invalidation Override */ #define HCR_VM (_AC(1,UL)<<0) /* Virtual MMU Enable */ #define HSR_EC_UNKNOWN 0x00 #define HSR_EC_WFI_WFE 0x01 #define HSR_EC_CP15_32 0x03 #define HSR_EC_CP15_64 0x04 #define HSR_EC_CP14_32 0x05 #define HSR_EC_CP14_DBG 0x06 #define HSR_EC_CP 0x07 #define HSR_EC_CP10 0x08 #define HSR_EC_JAZELLE 0x09 #define HSR_EC_BXJ 0x0a #define HSR_EC_CP14_64 0x0c #define HSR_EC_SVC32 0x11 #define HSR_EC_HVC32 0x12 #define HSR_EC_SMC32 0x13 #ifdef CONFIG_ARM_64 #define HSR_EC_HVC64 0x16 #define HSR_EC_SMC64 0x17 #define HSR_EC_SYSREG 0x18 #endif #define HSR_EC_INSTR_ABORT_LOWER_EL 0x20 #define HSR_EC_INSTR_ABORT_CURR_EL 0x21 #define HSR_EC_DATA_ABORT_LOWER_EL 0x24 #define HSR_EC_DATA_ABORT_CURR_EL 0x25 /* FSR format, 
common */ #define FSR_LPAE (_AC(1,UL)<<9) /* FSR short format */ #define FSRS_FS_DEBUG (_AC(0,UL)<<10|_AC(0x2,UL)<<0) /* FSR long format */ #define FSRL_STATUS_DEBUG (_AC(0x22,UL)<<0) #ifndef __ASSEMBLY__ #include struct cpuinfo_arm { union { uint32_t bits; struct { unsigned long revision:4; unsigned long part_number:12; unsigned long architecture:4; unsigned long variant:4; unsigned long implementer:8; }; } midr; union { register_t bits; struct { unsigned long aff0:8; unsigned long aff1:8; unsigned long aff2:8; unsigned long mt:1; /* Multi-thread, iff MP == 1 */ unsigned long __res0:5; unsigned long up:1; /* UP system, iff MP == 1 */ unsigned long mp:1; /* MP extensions */ #ifdef CONFIG_ARM_64 unsigned long aff3:8; unsigned long __res1:24; #endif }; } mpidr; #ifdef CONFIG_ARM_64 /* 64-bit CPUID registers. */ union { uint64_t bits[2]; struct { unsigned long el0:4; unsigned long el1:4; unsigned long el2:4; unsigned long el3:4; unsigned long fp:4; /* Floating Point */ unsigned long simd:4; /* Advanced SIMD */ unsigned long __res0:8; unsigned long __res1; }; } pfr64; struct { uint64_t bits[2]; } dbg64; struct { uint64_t bits[2]; } aux64; struct { uint64_t bits[2]; } mm64; struct { uint64_t bits[2]; } isa64; #endif /* * 32-bit CPUID registers. On ARMv8 these describe the properties * when running in 32-bit mode. */ union { uint32_t bits[2]; struct { unsigned long arm:4; unsigned long thumb:4; unsigned long jazelle:4; unsigned long thumbee:4; unsigned long __res0:16; unsigned long progmodel:4; unsigned long security:4; unsigned long mprofile:4; unsigned long virt:4; unsigned long gentimer:4; unsigned long __res1:12; }; } pfr32; struct { uint32_t bits[1]; } dbg32; struct { uint32_t bits[1]; } aux32; struct { uint32_t bits[4]; } mm32; struct { uint32_t bits[6]; } isa32; }; /* * capabilities of CPUs */ extern struct cpuinfo_arm boot_cpu_data; extern void identify_cpu(struct cpuinfo_arm *); extern struct cpuinfo_arm cpu_data[]; #define current_cpu_data cpu_data[smp_processor_id()] extern u32 __cpu_logical_map[]; #define cpu_logical_map(cpu) __cpu_logical_map[cpu] union hsr { uint32_t bits; struct { unsigned long iss:25; /* Instruction Specific Syndrome */ unsigned long len:1; /* Instruction length */ unsigned long ec:6; /* Exception Class */ }; /* Common to all conditional exception classes (0x0N, except 0x00). */ struct hsr_cond { unsigned long iss:20; /* Instruction Specific Syndrome */ unsigned long cc:4; /* Condition Code */ unsigned long ccvalid:1;/* CC Valid */ unsigned long len:1; /* Instruction length */ unsigned long ec:6; /* Exception Class */ } cond; /* reg, reg0, reg1 are 4 bits on AArch32, the fifth bit is sbzp. 
*/ struct hsr_cp32 { unsigned long read:1; /* Direction */ unsigned long crm:4; /* CRm */ unsigned long reg:5; /* Rt */ unsigned long crn:4; /* CRn */ unsigned long op1:3; /* Op1 */ unsigned long op2:3; /* Op2 */ unsigned long cc:4; /* Condition Code */ unsigned long ccvalid:1;/* CC Valid */ unsigned long len:1; /* Instruction length */ unsigned long ec:6; /* Exception Class */ } cp32; /* HSR_EC_CP15_32, CP14_32, CP10 */ struct hsr_cp64 { unsigned long read:1; /* Direction */ unsigned long crm:4; /* CRm */ unsigned long reg1:5; /* Rt1 */ unsigned long reg2:5; /* Rt2 */ unsigned long sbzp2:1; unsigned long op1:4; /* Op1 */ unsigned long cc:4; /* Condition Code */ unsigned long ccvalid:1;/* CC Valid */ unsigned long len:1; /* Instruction length */ unsigned long ec:6; /* Exception Class */ } cp64; /* HSR_EC_CP15_64, HSR_EC_CP14_64 */ #ifdef CONFIG_ARM_64 struct hsr_sysreg { unsigned long read:1; /* Direction */ unsigned long crm:4; /* CRm */ unsigned long reg:5; /* Rt */ unsigned long crn:4; /* CRn */ unsigned long op1:3; /* Op1 */ unsigned long op2:3; /* Op2 */ unsigned long op0:2; /* Op0 */ unsigned long res0:3; unsigned long len:1; /* Instruction length */ unsigned long ec:6; } sysreg; /* HSR_EC_SYSREG */ #endif struct hsr_dabt { unsigned long dfsc:6; /* Data Fault Status Code */ unsigned long write:1; /* Write / not Read */ unsigned long s1ptw:1; /* */ unsigned long cache:1; /* Cache Maintenance */ unsigned long eat:1; /* External Abort Type */ #ifdef CONFIG_ARM_32 unsigned long sbzp0:6; #else unsigned long sbzp0:4; unsigned long ar:1; /* Acquire Release */ unsigned long sf:1; /* Sixty Four bit register */ #endif unsigned long reg:5; /* Register */ unsigned long sign:1; /* Sign extend */ unsigned long size:2; /* Access Size */ unsigned long valid:1; /* Syndrome Valid */ unsigned long len:1; /* Instruction length */ unsigned long ec:6; /* Exception Class */ } dabt; /* HSR_EC_DATA_ABORT_* */ }; #endif /* HSR.EC == HSR_CP{15,14,10}_32 */ #define HSR_CP32_OP2_MASK (0x000e0000) #define HSR_CP32_OP2_SHIFT (17) #define HSR_CP32_OP1_MASK (0x0001c000) #define HSR_CP32_OP1_SHIFT (14) #define HSR_CP32_CRN_MASK (0x00003c00) #define HSR_CP32_CRN_SHIFT (10) #define HSR_CP32_CRM_MASK (0x0000001e) #define HSR_CP32_CRM_SHIFT (1) #define HSR_CP32_REGS_MASK (HSR_CP32_OP1_MASK|HSR_CP32_OP2_MASK|\ HSR_CP32_CRN_MASK|HSR_CP32_CRM_MASK) /* HSR.EC == HSR_CP{15,14}_64 */ #define HSR_CP64_OP1_MASK (0x000f0000) #define HSR_CP64_OP1_SHIFT (16) #define HSR_CP64_CRM_MASK (0x0000001e) #define HSR_CP64_CRM_SHIFT (1) #define HSR_CP64_REGS_MASK (HSR_CP64_OP1_MASK|HSR_CP64_CRM_MASK) /* HSR.EC == HSR_SYSREG */ #define HSR_SYSREG_OP0_MASK (0x00300000) #define HSR_SYSREG_OP0_SHIFT (20) #define HSR_SYSREG_OP1_MASK (0x0001c000) #define HSR_SYSREG_OP1_SHIFT (14) #define HSR_SYSREG_CRN_MASK (0x00003c00) #define HSR_SYSREG_CRN_SHIFT (10) #define HSR_SYSREG_CRM_MASK (0x0000001e) #define HSR_SYSREG_CRM_SHIFT (1) #define HSR_SYSREG_OP2_MASK (0x000e0000) #define HSR_SYSREG_OP2_SHIFT (17) #define HSR_SYSREG_REGS_MASK (HSR_SYSREG_OP0_MASK|HSR_SYSREG_OP1_MASK|\ HSR_SYSREG_CRN_MASK|HSR_SYSREG_CRM_MASK|\ HSR_SYSREG_OP2_MASK) /* Physical Address Register */ #define PAR_F (_AC(1,U)<<0) /* .... 
If F == 1 */ #define PAR_FSC_SHIFT (1) #define PAR_FSC_MASK (_AC(0x3f,U)< #elif defined(CONFIG_ARM_64) # include #else # error "unknown ARM variant" #endif #ifndef __ASSEMBLY__ extern uint32_t hyp_traps_vector[]; void init_traps(void); void panic_PAR(uint64_t par); void show_execution_state(struct cpu_user_regs *regs); void show_registers(struct cpu_user_regs *regs); //#define dump_execution_state() run_in_exception_handler(show_execution_state) #define dump_execution_state() asm volatile (".word 0xe7f000f0\n"); /* XXX */ #define cpu_relax() barrier() /* Could yield? */ /* All a bit UP for the moment */ #define cpu_to_core(_cpu) (0) #define cpu_to_socket(_cpu) (0) void do_unexpected_trap(const char *msg, struct cpu_user_regs *regs); void vcpu_regs_hyp_to_user(const struct vcpu *vcpu, struct vcpu_guest_core_regs *regs); void vcpu_regs_user_to_hyp(struct vcpu *vcpu, const struct vcpu_guest_core_regs *regs); struct cpuinfo_x86 { uint32_t pfr32[2]; }; #endif /* __ASSEMBLY__ */ #endif /* __ASM_ARM_PROCESSOR_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/config.h0000664000175000017500000001347712307313555015647 0ustar smbsmb/****************************************************************************** * config.h * * A Linux-style configuration list. */ #ifndef __ARM_CONFIG_H__ #define __ARM_CONFIG_H__ #if defined(__aarch64__) # define CONFIG_ARM_64 1 #elif defined(__arm__) # define CONFIG_ARM_32 1 #endif #if defined(CONFIG_ARM_64) # define LONG_BYTEORDER 3 #else # define LONG_BYTEORDER 2 #endif #define BYTES_PER_LONG (1 << LONG_BYTEORDER) #define BITS_PER_LONG (BYTES_PER_LONG << 3) /* xen_ulong_t is always 64 bits */ #define BITS_PER_XEN_ULONG 64 #define CONFIG_PAGING_ASSISTANCE 1 #define CONFIG_PAGING_LEVELS 3 #define CONFIG_ARM 1 #define CONFIG_ARM_L1_CACHE_SHIFT 7 /* XXX */ #define CONFIG_SMP 1 #define CONFIG_VIDEO 1 #define OPT_CONSOLE_STR "dtuart" #ifdef MAX_PHYS_CPUS #define NR_CPUS MAX_PHYS_CPUS #else #define NR_CPUS 128 #endif #define MAX_VIRT_CPUS 8 #define MAX_HVM_VCPUS MAX_VIRT_CPUS #define asmlinkage /* Nothing needed */ #define __LINUX_ARM_ARCH__ 7 #define CONFIG_AEABI /* Linkage for ARM */ #define __ALIGN .align 2 #define __ALIGN_STR ".align 2" #ifdef __ASSEMBLY__ #define ALIGN __ALIGN #define ALIGN_STR __ALIGN_STR #define ENTRY(name) \ .globl name; \ ALIGN; \ name: #define GLOBAL(name) \ .globl name; \ name: #define END(name) \ .size name, .-name #define ENDPROC(name) \ .type name, %function; \ END(name) #endif #include /* * Common ARM32 and ARM64 layout: * 0 - 2M Unmapped * 2M - 4M Xen text, data, bss * 4M - 6M Fixmap: special-purpose 4K mapping slots * 6M - 8M Early boot mapping of FDT * 8M - 10M Early relocation address (used when relocating Xen) * * ARM32 layout: * 0 - 8M * * 32M - 128M Frametable: 24 bytes per page for 16GB of RAM * 256M - 1G VMAP: ioremap and early_ioremap use this virtual address * space * * 1G - 2G Xenheap: always-mapped memory * 2G - 4G Domheap: on-demand-mapped * * ARM64 layout: * 0x0000000000000000 - 0x0000007fffffffff (512GB, L0 slot [0]) * 0 - 8M * * 1G - 2G VMAP: ioremap and early_ioremap * * 32G - 64G Frametable: 24 bytes per page for 5.3TB of RAM * * 0x0000008000000000 - 0x00007fffffffffff (127.5TB, L0 slots [1..255]) * Unused * * 0x0000800000000000 - 0x000084ffffffffff (5TB, L0 slots [256..265]) * 1:1 mapping of RAM * * 0x0000850000000000 - 0x0000ffffffffffff (123TB, L0 slots [266..511]) * Unused */ #define XEN_VIRT_START _AT(vaddr_t,0x00200000) 
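/*
 * Worked example (illustrative only, not part of the original header):
 * under the ARM32 layout documented above the fixmap slots are 4K apart
 * starting at the 4M mark, so with the definitions just below
 *
 *     FIXMAP_ADDR(0) == 0x00400000   (FIXMAP_CONSOLE, the primary UART)
 *     FIXMAP_ADDR(1) == 0x00401000   (FIXMAP_PT)
 *
 * since PAGE_SHIFT is 12 and therefore PAGE_SIZE is 4K.
 */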
#define FIXMAP_ADDR(n) (_AT(vaddr_t,0x00400000) + (n) * PAGE_SIZE) #define BOOT_FDT_VIRT_START _AT(vaddr_t,0x00600000) #define BOOT_RELOC_VIRT_START _AT(vaddr_t,0x00800000) #define HYPERVISOR_VIRT_START XEN_VIRT_START #ifdef CONFIG_ARM_32 #define CONFIG_DOMAIN_PAGE 1 #define CONFIG_SEPARATE_XENHEAP 1 #define FRAMETABLE_VIRT_START _AT(vaddr_t,0x02000000) #define VMAP_VIRT_START _AT(vaddr_t,0x10000000) #define XENHEAP_VIRT_START _AT(vaddr_t,0x40000000) #define XENHEAP_VIRT_END _AT(vaddr_t,0x7fffffff) #define DOMHEAP_VIRT_START _AT(vaddr_t,0x80000000) #define DOMHEAP_VIRT_END _AT(vaddr_t,0xffffffff) #define VMAP_VIRT_END XENHEAP_VIRT_START #define DOMHEAP_ENTRIES 1024 /* 1024 2MB mapping slots */ /* Number of domheap pagetable pages required at the second level (2MB mappings) */ #define DOMHEAP_SECOND_PAGES ((DOMHEAP_VIRT_END - DOMHEAP_VIRT_START + 1) >> FIRST_SHIFT) #else /* ARM_64 */ #define SLOT0_ENTRY_BITS 39 #define SLOT0(slot) (_AT(vaddr_t,slot) << SLOT0_ENTRY_BITS) #define SLOT0_ENTRY_SIZE SLOT0(1) #define VMAP_VIRT_START GB(1) #define VMAP_VIRT_END (VMAP_VIRT_START + GB(1) - 1) #define FRAMETABLE_VIRT_START GB(32) #define FRAMETABLE_VIRT_END (FRAMETABLE_VIRT_START + GB(32) - 1) #define DIRECTMAP_VIRT_START SLOT0(256) #define DIRECTMAP_SIZE (SLOT0_ENTRY_SIZE * (265-256)) #define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + DIRECTMAP_SIZE - 1) #define XENHEAP_VIRT_START DIRECTMAP_VIRT_START #define HYPERVISOR_VIRT_END DIRECTMAP_VIRT_END #endif /* Fixmap slots */ #define FIXMAP_CONSOLE 0 /* The primary UART */ #define FIXMAP_PT 1 /* Temporary mappings of pagetable pages */ #define FIXMAP_MISC 2 /* Ephemeral mappings of hardware */ #define FIXMAP_GICD 3 /* Interrupt controller: distributor registers */ #define FIXMAP_GICC1 4 /* Interrupt controller: CPU registers (first page) */ #define FIXMAP_GICC2 5 /* Interrupt controller: CPU registers (second page) */ #define FIXMAP_GICH 6 /* Interrupt controller: virtual interface control registers */ #define PAGE_SHIFT 12 #ifndef __ASSEMBLY__ #define PAGE_SIZE (1L << PAGE_SHIFT) #else #define PAGE_SIZE (1 << PAGE_SHIFT) #endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define PAGE_FLAG_MASK (~0) #define STACK_ORDER 3 #define STACK_SIZE (PAGE_SIZE << STACK_ORDER) #ifndef __ASSEMBLY__ extern unsigned long xen_phys_start; extern unsigned long xenheap_phys_end; extern unsigned long frametable_virt_end; #endif #define supervisor_mode_kernel (0) #define watchdog_disable() ((void)0) #define watchdog_enable() ((void)0) #ifdef __ASSEMBLY__ /* Board-specific: regs base address for the GIC * Theses constants are only intend to be used in assembly file * because the DT is not yet parsed. 
*/ #define GIC_DR_OFFSET 0x1000 #define GIC_CR_OFFSET 0x2000 #define GIC_HR_OFFSET 0x4000 /* Guess work http://lists.infradead.org/pipermail/linux-arm-kernel/2011-September/064219.html */ #define GIC_VR_OFFSET 0x6000 /* Virtual Machine CPU interface) */ #endif /* __ASSEMBLY__ */ #endif /* __ARM_CONFIG_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/setup.h0000664000175000017500000000067212307313555015533 0ustar smbsmb#ifndef __ARM_SETUP_H_ #define __ARM_SETUP_H_ #include void arch_init_memory(void); void copy_from_paddr(void *dst, paddr_t paddr, unsigned long len, int attrindx); void arch_get_xen_caps(xen_capabilities_info_t *info); int construct_dom0(struct domain *d); void discard_initial_modules(void); #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/div64.h0000664000175000017500000002016512307313555015326 0ustar smbsmb/* Taken from Linux arch/arm */ #ifndef __ASM_ARM_DIV64 #define __ASM_ARM_DIV64 #include #include /* * The semantics of do_div() are: * * uint32_t do_div(uint64_t *n, uint32_t base) * { * uint32_t remainder = *n % base; * *n = *n / base; * return remainder; * } * * In other words, a 64-bit dividend with a 32-bit divisor producing * a 64-bit result and a 32-bit remainder. To accomplish this optimally * we call a special __do_div64 helper with completely non standard * calling convention for arguments and results (beware). */ #if BITS_PER_LONG == 64 # define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ __rem = ((uint64_t)(n)) % __base; \ (n) = ((uint64_t)(n)) / __base; \ __rem; \ }) #elif BITS_PER_LONG == 32 #ifdef __ARMEB__ #define __xh "r0" #define __xl "r1" #else #define __xl "r0" #define __xh "r1" #endif #define __do_div_asm(n, base) \ ({ \ register unsigned int __base asm("r4") = base; \ register unsigned long long __n asm("r0") = n; \ register unsigned long long __res asm("r2"); \ register unsigned int __rem asm(__xh); \ asm( __asmeq("%0", __xh) \ __asmeq("%1", "r2") \ __asmeq("%2", "r0") \ __asmeq("%3", "r4") \ "bl __do_div64" \ : "=r" (__rem), "=r" (__res) \ : "r" (__n), "r" (__base) \ : "ip", "lr", "cc"); \ n = __res; \ __rem; \ }) #if __GNUC__ < 4 /* * gcc versions earlier than 4.0 are simply too problematic for the * optimized implementation below. First there is gcc PR 15089 that * tend to trig on more complex constructs, spurious .global __udivsi3 * are inserted even if none of those symbols are referenced in the * generated code, and those gcc versions are not able to do constant * propagation on long long values anyway. */ #define do_div(n, base) __do_div_asm(n, base) #elif __GNUC__ >= 4 #include /* * If the divisor happens to be constant, we determine the appropriate * inverse at compile time to turn the division into a few inline * multiplications instead which is much faster. And yet only if compiling * for ARMv4 or higher (we need umull/umlal) and if the gcc version is * sufficiently recent to perform proper long long constant propagation. * (It is unfortunate that gcc doesn't perform all this internally.) */ #define do_div(n, base) \ ({ \ unsigned int __r, __b = (base); \ if (!__builtin_constant_p(__b) || __b == 0) { \ /* non-constant divisor (or zero): slow path */ \ __r = __do_div_asm(n, __b); \ } else if ((__b & (__b - 1)) == 0) { \ /* Trivial: __b is constant and a power of 2 */ \ /* gcc does the right thing with this code. 
*/ \ __r = n; \ __r &= (__b - 1); \ n /= __b; \ } else { \ /* Multiply by inverse of __b: n/b = n*(p/b)/p */ \ /* We rely on the fact that most of this code gets */ \ /* optimized away at compile time due to constant */ \ /* propagation and only a couple inline assembly */ \ /* instructions should remain. Better avoid any */ \ /* code construct that might prevent that. */ \ unsigned long long __res, __x, __t, __m, __n = n; \ unsigned int __c, __p, __z = 0; \ /* preserve low part of n for reminder computation */ \ __r = __n; \ /* determine number of bits to represent __b */ \ __p = 1 << __div64_fls(__b); \ /* compute __m = ((__p << 64) + __b - 1) / __b */ \ __m = (~0ULL / __b) * __p; \ __m += (((~0ULL % __b + 1) * __p) + __b - 1) / __b; \ /* compute __res = __m*(~0ULL/__b*__b-1)/(__p << 64) */ \ __x = ~0ULL / __b * __b - 1; \ __res = (__m & 0xffffffff) * (__x & 0xffffffff); \ __res >>= 32; \ __res += (__m & 0xffffffff) * (__x >> 32); \ __t = __res; \ __res += (__x & 0xffffffff) * (__m >> 32); \ __t = (__res < __t) ? (1ULL << 32) : 0; \ __res = (__res >> 32) + __t; \ __res += (__m >> 32) * (__x >> 32); \ __res /= __p; \ /* Now sanitize and optimize what we've got. */ \ if (~0ULL % (__b / (__b & -__b)) == 0) { \ /* those cases can be simplified with: */ \ __n /= (__b & -__b); \ __m = ~0ULL / (__b / (__b & -__b)); \ __p = 1; \ __c = 1; \ } else if (__res != __x / __b) { \ /* We can't get away without a correction */ \ /* to compensate for bit truncation errors. */ \ /* To avoid it we'd need an additional bit */ \ /* to represent __m which would overflow it. */ \ /* Instead we do m=p/b and n/b=(n*m+m)/p. */ \ __c = 1; \ /* Compute __m = (__p << 64) / __b */ \ __m = (~0ULL / __b) * __p; \ __m += ((~0ULL % __b + 1) * __p) / __b; \ } else { \ /* Reduce __m/__p, and try to clear bit 31 */ \ /* of __m when possible otherwise that'll */ \ /* need extra overflow handling later. */ \ unsigned int __bits = -(__m & -__m); \ __bits |= __m >> 32; \ __bits = (~__bits) << 1; \ /* If __bits == 0 then setting bit 31 is */ \ /* unavoidable. Simply apply the maximum */ \ /* possible reduction in that case. */ \ /* Otherwise the MSB of __bits indicates the */ \ /* best reduction we should apply. */ \ if (!__bits) { \ __p /= (__m & -__m); \ __m /= (__m & -__m); \ } else { \ __p >>= __div64_fls(__bits); \ __m >>= __div64_fls(__bits); \ } \ /* No correction needed. */ \ __c = 0; \ } \ /* Now we have a combination of 2 conditions: */ \ /* 1) whether or not we need a correction (__c), and */ \ /* 2) whether or not there might be an overflow in */ \ /* the cross product (__m & ((1<<63) | (1<<31))) */ \ /* Select the best insn combination to perform the */ \ /* actual __m * __n / (__p << 64) operation. 
*/ \ if (!__c) { \ asm ( "umull %Q0, %R0, %1, %Q2\n\t" \ "mov %Q0, #0" \ : "=&r" (__res) \ : "r" (__m), "r" (__n) \ : "cc" ); \ } else if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \ __res = __m; \ asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t" \ "mov %Q0, #0" \ : "+&r" (__res) \ : "r" (__m), "r" (__n) \ : "cc" ); \ } else { \ asm ( "umull %Q0, %R0, %Q1, %Q2\n\t" \ "cmn %Q0, %Q1\n\t" \ "adcs %R0, %R0, %R1\n\t" \ "adc %Q0, %3, #0" \ : "=&r" (__res) \ : "r" (__m), "r" (__n), "r" (__z) \ : "cc" ); \ } \ if (!(__m & ((1ULL << 63) | (1ULL << 31)))) { \ asm ( "umlal %R0, %Q0, %R1, %Q2\n\t" \ "umlal %R0, %Q0, %Q1, %R2\n\t" \ "mov %R0, #0\n\t" \ "umlal %Q0, %R0, %R1, %R2" \ : "+&r" (__res) \ : "r" (__m), "r" (__n) \ : "cc" ); \ } else { \ asm ( "umlal %R0, %Q0, %R2, %Q3\n\t" \ "umlal %R0, %1, %Q2, %R3\n\t" \ "mov %R0, #0\n\t" \ "adds %Q0, %1, %Q0\n\t" \ "adc %R0, %R0, #0\n\t" \ "umlal %Q0, %R0, %R2, %R3" \ : "+&r" (__res), "+&r" (__z) \ : "r" (__m), "r" (__n) \ : "cc" ); \ } \ __res /= __p; \ /* The reminder can be computed with 32-bit regs */ \ /* only, and gcc is good at that. */ \ { \ unsigned int __res0 = __res; \ unsigned int __b0 = __b; \ __r -= __res0 * __b0; \ } \ /* BUG_ON(__r >= __b || __res * __b + __r != n); */ \ n = __res; \ } \ __r; \ }) /* our own fls implementation to make sure constant propagation is fine */ #define __div64_fls(bits) \ ({ \ unsigned int __left = (bits), __nr = 0; \ if (__left & 0xffff0000) __nr += 16, __left >>= 16; \ if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \ if (__left & 0x000000f0) __nr += 4, __left >>= 4; \ if (__left & 0x0000000c) __nr += 2, __left >>= 2; \ if (__left & 0x00000002) __nr += 1; \ __nr; \ }) #endif /* GCC version */ #endif /* BITS_PER_LONG */ #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/paging.h0000664000175000017500000000044012307313555015631 0ustar smbsmb#ifndef _XEN_PAGING_H #define _XEN_PAGING_H #define paging_mode_translate(d) (1) #define paging_mode_external(d) (1) #endif /* XEN_PAGING_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/elf.h0000664000175000017500000000112012307313555015126 0ustar smbsmb#ifndef __ARM_ELF_H__ #define __ARM_ELF_H__ typedef struct { unsigned long r0; unsigned long r1; unsigned long r2; unsigned long r3; unsigned long r4; unsigned long r5; unsigned long r6; unsigned long r7; unsigned long r8; unsigned long r9; unsigned long r10; unsigned long r11; unsigned long r12; unsigned long sp; unsigned long lr; unsigned long pc; } ELF_Gregset; #endif /* __ARM_ELF_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/delay.h0000664000175000017500000000035312307313555015465 0ustar smbsmb#ifndef _ARM_DELAY_H #define _ARM_DELAY_H extern void udelay(unsigned long usecs); #endif /* defined(_ARM_DELAY_H) */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/io.h0000664000175000017500000000047112307313555014777 0ustar smbsmb#ifndef _ASM_IO_H #define _ASM_IO_H #if defined(CONFIG_ARM_32) # include #elif defined(CONFIG_ARM_64) # include #else # error "unknown ARM variant" #endif #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ 
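/*
 * Illustrative usage sketch, not part of the original tree: the do_div()
 * macro from asm-arm/div64.h above divides a 64-bit dividend by a 32-bit
 * divisor in place and returns the 32-bit remainder.  The helper name
 * split_by_1000() is hypothetical and assumes xen/types.h and asm/div64.h
 * are in scope.
 */
static inline uint32_t split_by_1000(uint64_t *n_inout)
{
    uint64_t n = *n_inout;
    uint32_t rem = do_div(n, 1000);   /* n now holds the quotient */

    *n_inout = n;                     /* write the quotient back */
    return rem;                       /* remainder in the range 0..999 */
}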
xen-4.4.0/xen/include/asm-arm/bug.h0000664000175000017500000000041612307313555015144 0ustar smbsmb#ifndef __ARM_BUG_H__ #define __ARM_BUG_H__ #define BUG() __bug(__FILE__, __LINE__) #define WARN() __warn(__FILE__, __LINE__) #endif /* __X86_BUG_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/nmi.h0000664000175000017500000000042412307313555015151 0ustar smbsmb#ifndef ASM_NMI_H #define ASM_NMI_H #define register_guest_nmi_callback(a) (-ENOSYS) #define unregister_guest_nmi_callback() (-ENOSYS) #endif /* ASM_NMI_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/bitops.h0000664000175000017500000001117212307313555015670 0ustar smbsmb/* * Copyright 1995, Russell King. * Various bits and pieces copyrights include: * Linus Torvalds (test_bit). * Big endian support: Copyright 2001, Nicolas Pitre * reworked by rmk. */ #ifndef _ARM_BITOPS_H #define _ARM_BITOPS_H /* * Non-atomic bit manipulation. * * Implemented using atomics to be interrupt safe. Could alternatively * implement with local interrupt masking. */ #define __set_bit(n,p) set_bit(n,p) #define __clear_bit(n,p) clear_bit(n,p) #define BIT(nr) (1UL << (nr)) #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) #define BITS_PER_BYTE 8 #define ADDR (*(volatile long *) addr) #define CONST_ADDR (*(const volatile long *) addr) #if defined(CONFIG_ARM_32) # include #elif defined(CONFIG_ARM_64) # include #else # error "unknown ARM variant" #endif /** * __test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ static inline int __test_and_set_bit(int nr, volatile void *addr) { unsigned long mask = BIT_MASK(nr); volatile unsigned long *p = ((volatile unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old | mask; return (old & mask) != 0; } /** * __test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ static inline int __test_and_clear_bit(int nr, volatile void *addr) { unsigned long mask = BIT_MASK(nr); volatile unsigned long *p = ((volatile unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old & ~mask; return (old & mask) != 0; } /* WARNING: non atomic and it can be reordered! 
*/ static inline int __test_and_change_bit(int nr, volatile void *addr) { unsigned long mask = BIT_MASK(nr); volatile unsigned long *p = ((volatile unsigned long *)addr) + BIT_WORD(nr); unsigned long old = *p; *p = old ^ mask; return (old & mask) != 0; } /** * test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static inline int test_bit(int nr, const volatile void *addr) { const volatile unsigned long *p = (const volatile unsigned long *)addr; return 1UL & (p[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } static inline int constant_fls(int x) { int r = 32; if (!x) return 0; if (!(x & 0xffff0000u)) { x <<= 16; r -= 16; } if (!(x & 0xff000000u)) { x <<= 8; r -= 8; } if (!(x & 0xf0000000u)) { x <<= 4; r -= 4; } if (!(x & 0xc0000000u)) { x <<= 2; r -= 2; } if (!(x & 0x80000000u)) { x <<= 1; r -= 1; } return r; } /* * On ARMv5 and above those functions can be implemented around * the clz instruction for much better code efficiency. */ static inline int fls(int x) { int ret; if (__builtin_constant_p(x)) return constant_fls(x); asm("clz\t%0, %1" : "=r" (ret) : "r" (x)); ret = BITS_PER_LONG - ret; return ret; } #define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); }) /** * find_first_set_bit - find the first set bit in @word * @word: the word to search * * Returns the bit-number of the first set bit (first bit being 0). * The input must *not* be zero. */ static inline unsigned int find_first_set_bit(unsigned long word) { return ffs(word) - 1; } /** * hweightN - returns the hamming weight of a N-bit word * @x: the word to weigh * * The Hamming Weight of a number is the total number of bits set in it. */ #define hweight64(x) generic_hweight64(x) #define hweight32(x) generic_hweight32(x) #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) #endif /* _ARM_BITOPS_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/smp.h0000664000175000017500000000154212307313555015167 0ustar smbsmb#ifndef __ASM_SMP_H #define __ASM_SMP_H #ifndef __ASSEMBLY__ #include #include #include #include #endif DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask); DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask); #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) #define raw_smp_processor_id() (get_processor_id()) extern void stop_cpu(void); extern int arch_smp_init(void); extern int arch_cpu_init(int cpu, struct dt_device_node *dn); extern int arch_cpu_up(int cpu); int cpu_up_send_sgi(int cpu); /* Secondary CPU entry point */ extern void init_secondary(void); extern void smp_init_cpus(void); extern void smp_clear_cpu_maps (void); extern int smp_get_max_cpus (void); #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/string.h0000664000175000017500000000256712307313555015706 0ustar smbsmb#ifndef __ARM_STRING_H__ #define __ARM_STRING_H__ #include #if defined(CONFIG_ARM_32) #define __HAVE_ARCH_MEMCPY extern void * memcpy(void *, const void *, __kernel_size_t); /* Some versions of gcc don't have this builtin. It's non-critical anyway. 
*/ #define __HAVE_ARCH_MEMMOVE extern void *memmove(void *dest, const void *src, size_t n); #define __HAVE_ARCH_MEMSET extern void * memset(void *, int, __kernel_size_t); extern void __memzero(void *ptr, __kernel_size_t n); #define memset(p,v,n) \ ({ \ void *__p = (p); size_t __n = n; \ if ((__n) != 0) { \ if (__builtin_constant_p((v)) && (v) == 0) \ __memzero((__p),(__n)); \ else \ memset((__p),(v),(__n)); \ } \ (__p); \ }) #endif #endif /* __ARM_STRING_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/psci.h0000664000175000017500000000127712307313555015333 0ustar smbsmb#ifndef __ASM_PSCI_H__ #define __ASM_PSCI_H__ #define PSCI_SUCCESS 0 #define PSCI_ENOSYS -1 #define PSCI_EINVAL -2 #define PSCI_DENIED -3 /* availability of PSCI on the host for SMP bringup */ extern bool_t psci_available; int psci_init(void); int call_psci_cpu_on(int cpu); /* functions to handle guest PSCI requests */ int do_psci_cpu_on(uint32_t vcpuid, register_t entry_point); int do_psci_cpu_off(uint32_t power_state); int do_psci_cpu_suspend(uint32_t power_state, register_t entry_point); int do_psci_migrate(uint32_t vcpuid); #endif /* __ASM_PSCI_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/atomic.h0000664000175000017500000001021012307313555015634 0ustar smbsmb#ifndef __ARCH_ARM_ATOMIC__ #define __ARCH_ARM_ATOMIC__ #include #include #define build_atomic_read(name, size, width, type, reg)\ static inline type name(const volatile type *addr) \ { \ type ret; \ asm volatile("ldr" size " %" width "0,%1" \ : reg (ret) \ : "m" (*(volatile type *)addr)); \ return ret; \ } #define build_atomic_write(name, size, width, type, reg) \ static inline void name(volatile type *addr, type val) \ { \ asm volatile("str" size " %"width"1,%0" \ : "=m" (*(volatile type *)addr) \ : reg (val)); \ } #if defined (CONFIG_ARM_32) #define BYTE "" #define WORD "" #elif defined (CONFIG_ARM_64) #define BYTE "w" #define WORD "w" #endif build_atomic_read(read_u8_atomic, "b", BYTE, uint8_t, "=r") build_atomic_read(read_u16_atomic, "h", WORD, uint16_t, "=r") build_atomic_read(read_u32_atomic, "", WORD, uint32_t, "=r") build_atomic_read(read_int_atomic, "", WORD, int, "=r") build_atomic_write(write_u8_atomic, "b", BYTE, uint8_t, "r") build_atomic_write(write_u16_atomic, "h", WORD, uint16_t, "r") build_atomic_write(write_u32_atomic, "", WORD, uint32_t, "r") build_atomic_write(write_int_atomic, "", WORD, int, "r") #if 0 /* defined (CONFIG_ARM_64) */ build_atomic_read(read_u64_atomic, "x", uint64_t, "=r") build_atomic_write(write_u64_atomic, "x", uint64_t, "r") #endif void __bad_atomic_size(void); #define read_atomic(p) ({ \ typeof(*p) __x; \ switch ( sizeof(*p) ) { \ case 1: __x = (typeof(*p))read_u8_atomic((uint8_t *)p); break; \ case 2: __x = (typeof(*p))read_u16_atomic((uint16_t *)p); break; \ case 4: __x = (typeof(*p))read_u32_atomic((uint32_t *)p); break; \ default: __x = 0; __bad_atomic_size(); break; \ } \ __x; \ }) #define write_atomic(p, x) ({ \ typeof(*p) __x = (x); \ switch ( sizeof(*p) ) { \ case 1: write_u8_atomic((uint8_t *)p, (uint8_t)__x); break; \ case 2: write_u16_atomic((uint16_t *)p, (uint16_t)__x); break; \ case 4: write_u32_atomic((uint32_t *)p, (uint32_t)__x); break; \ default: __bad_atomic_size(); break; \ } \ __x; \ }) /* * NB. I've pushed the volatile qualifier into the operations. 
This allows * fast accessors such as _atomic_read() and _atomic_set() which don't give * the compiler a fit. */ typedef struct { int counter; } atomic_t; #define ATOMIC_INIT(i) { (i) } /* * On ARM, ordinary assignment (str instruction) doesn't clear the local * strex/ldrex monitor on some implementations. The reason we can use it for * atomic_set() is the clrex or dummy strex done on every exception return. */ #define _atomic_read(v) ((v).counter) #define atomic_read(v) (*(volatile int *)&(v)->counter) #define _atomic_set(v,i) (((v).counter) = (i)) #define atomic_set(v,i) (((v)->counter) = (i)) #if defined(CONFIG_ARM_32) # include #elif defined(CONFIG_ARM_64) # include #else # error "unknown ARM variant" #endif static inline atomic_t atomic_compareandswap( atomic_t old, atomic_t new, atomic_t *v) { atomic_t rc; rc.counter = __cmpxchg(&v->counter, old.counter, new.counter, sizeof(int)); return rc; } #endif /* __ARCH_ARM_ATOMIC__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/sysregs.h0000664000175000017500000000413612307313555016071 0ustar smbsmb#ifndef __ASM_ARM_SYSREGS_H #define __ASM_ARM_SYSREGS_H #ifdef CONFIG_ARM_64 #include /* AArch 64 System Register Encodings */ #define __HSR_SYSREG_c0 0 #define __HSR_SYSREG_c1 1 #define __HSR_SYSREG_c2 2 #define __HSR_SYSREG_c3 3 #define __HSR_SYSREG_c4 4 #define __HSR_SYSREG_c5 5 #define __HSR_SYSREG_c6 6 #define __HSR_SYSREG_c7 7 #define __HSR_SYSREG_c8 8 #define __HSR_SYSREG_c9 9 #define __HSR_SYSREG_c10 10 #define __HSR_SYSREG_c11 11 #define __HSR_SYSREG_c12 12 #define __HSR_SYSREG_c13 13 #define __HSR_SYSREG_c14 14 #define __HSR_SYSREG_c15 15 #define __HSR_SYSREG_0 0 #define __HSR_SYSREG_1 1 #define __HSR_SYSREG_2 2 #define __HSR_SYSREG_3 3 #define __HSR_SYSREG_4 4 #define __HSR_SYSREG_5 5 #define __HSR_SYSREG_6 6 #define __HSR_SYSREG_7 7 /* These are used to decode traps with HSR.EC==HSR_EC_SYSREG */ #define HSR_SYSREG(op0,op1,crn,crm,op2) \ ((__HSR_SYSREG_##op0) << HSR_SYSREG_OP0_SHIFT) | \ ((__HSR_SYSREG_##op1) << HSR_SYSREG_OP1_SHIFT) | \ ((__HSR_SYSREG_##crn) << HSR_SYSREG_CRN_SHIFT) | \ ((__HSR_SYSREG_##crm) << HSR_SYSREG_CRM_SHIFT) | \ ((__HSR_SYSREG_##op2) << HSR_SYSREG_OP2_SHIFT) #define HSR_SYSREG_SCTLR_EL1 HSR_SYSREG(3,0,c1, c0,0) #define HSR_SYSREG_TTBR0_EL1 HSR_SYSREG(3,0,c2, c0,0) #define HSR_SYSREG_TTBR1_EL1 HSR_SYSREG(3,0,c2, c0,1) #define HSR_SYSREG_TCR_EL1 HSR_SYSREG(3,0,c2, c0,2) #define HSR_SYSREG_AFSR0_EL1 HSR_SYSREG(3,0,c5, c1,0) #define HSR_SYSREG_AFSR1_EL1 HSR_SYSREG(3,0,c5, c1,1) #define HSR_SYSREG_ESR_EL1 HSR_SYSREG(3,0,c5, c2,0) #define HSR_SYSREG_FAR_EL1 HSR_SYSREG(3,0,c6, c0,0) #define HSR_SYSREG_MAIR_EL1 HSR_SYSREG(3,0,c10,c2,0) #define HSR_SYSREG_AMAIR_EL1 HSR_SYSREG(3,0,c10,c3,0) #define HSR_SYSREG_CONTEXTIDR_EL1 HSR_SYSREG(3,0,c13,c0,1) #define HSR_SYSREG_CNTPCT_EL0 HSR_SYSREG(3,3,c14,c0,0) #define HSR_SYSREG_CNTP_CTL_EL0 HSR_SYSREG(3,3,c14,c2,1) #define HSR_SYSREG_CNTP_TVAL_EL0 HSR_SYSREG(3,3,c14,c2,0) #endif #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/current.h0000664000175000017500000000230712307313555016052 0ustar smbsmb#ifndef __ARM_CURRENT_H__ #define __ARM_CURRENT_H__ #include #include #include #include #include #ifndef __ASSEMBLY__ struct vcpu; /* Which VCPU is "current" on this PCPU. 
*/ DECLARE_PER_CPU(struct vcpu *, curr_vcpu); #define current (this_cpu(curr_vcpu)) #define set_current(vcpu) do { current = (vcpu); } while (0) /* Per-VCPU state that lives at the top of the stack */ struct cpu_info { struct cpu_user_regs guest_cpu_user_regs; unsigned long elr; unsigned int pad; }; static inline struct cpu_info *get_cpu_info(void) { register unsigned long sp asm ("sp"); return (struct cpu_info *)((sp & ~(STACK_SIZE - 1)) + STACK_SIZE - sizeof(struct cpu_info)); } #define guest_cpu_user_regs() (&get_cpu_info()->guest_cpu_user_regs) #define switch_stack_and_jump(stack, fn) \ asm volatile ("mov sp,%0; b " STR(fn) : : "r" (stack) : "memory" ) #define reset_stack_and_jump(fn) switch_stack_and_jump(get_cpu_info(), fn) #endif #endif /* __ARM_CURRENT_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/types.h0000664000175000017500000000275712307313555015545 0ustar smbsmb#ifndef __ARM_TYPES_H__ #define __ARM_TYPES_H__ #ifndef __ASSEMBLY__ #include typedef __signed__ char __s8; typedef unsigned char __u8; typedef __signed__ short __s16; typedef unsigned short __u16; typedef __signed__ int __s32; typedef unsigned int __u32; #if defined(__GNUC__) && !defined(__STRICT_ANSI__) #if defined(CONFIG_ARM_32) typedef __signed__ long long __s64; typedef unsigned long long __u64; #elif defined (CONFIG_ARM_64) typedef __signed__ long __s64; typedef unsigned long __u64; #endif #endif typedef signed char s8; typedef unsigned char u8; typedef signed short s16; typedef unsigned short u16; typedef signed int s32; typedef unsigned int u32; #if defined(CONFIG_ARM_32) typedef signed long long s64; typedef unsigned long long u64; typedef u32 vaddr_t; #define PRIvaddr PRIx32 typedef u64 paddr_t; #define INVALID_PADDR (~0ULL) #define PRIpaddr "016llx" typedef u32 register_t; #define PRIregister "x" #elif defined (CONFIG_ARM_64) typedef signed long s64; typedef unsigned long u64; typedef u64 vaddr_t; #define PRIvaddr PRIx64 typedef u64 paddr_t; #define INVALID_PADDR (~0UL) #define PRIpaddr "016lx" typedef u64 register_t; #define PRIregister "lx" #endif typedef unsigned long size_t; typedef char bool_t; #define test_and_set_bool(b) xchg(&(b), 1) #define test_and_clear_bool(b) xchg(&(b), 0) #endif /* __ASSEMBLY__ */ #endif /* __ARM_TYPES_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/pl011-uart.h0000664000175000017500000000531212307313555016175 0ustar smbsmb/* * xen/include/asm-arm/pl011-uart.h * * Common constant definition between early printk and the UART driver * for the pl011 UART * * Tim Deegan * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #ifndef __ASM_ARM_PL011_H #define __ASM_ARM_PL011_H /* PL011 register addresses */ #define DR (0x00) #define RSR (0x04) #define FR (0x18) #define ILPR (0x20) #define IBRD (0x24) #define FBRD (0x28) #define LCR_H (0x2c) #define CR (0x30) #define IFLS (0x34) #define IMSC (0x38) #define RIS (0x3c) #define MIS (0x40) #define ICR (0x44) #define DMACR (0x48) /* CR bits */ #define CTSEN (1<<15) /* automatic CTS hardware flow control */ #define RTSEN (1<<14) /* automatic RTS hardware flow control */ #define RTS (1<<11) /* RTS signal */ #define DTR (1<<10) /* DTR signal */ #define RXE (1<<9) /* Receive enable */ #define TXE (1<<8) /* Transmit enable */ #define UARTEN (1<<0) /* UART enable */ /* FR bits */ #define TXFE (1<<7) /* TX FIFO empty */ #define RXFE (1<<4) /* RX FIFO empty */ #define BUSY (1<<3) /* Transmit is not complete */ /* LCR_H bits */ #define SPS (1<<7) /* Stick parity select */ #define FEN (1<<4) /* FIFO enable */ #define STP2 (1<<3) /* Two stop bits select */ #define EPS (1<<2) /* Even parity select */ #define PEN (1<<1) /* Parity enable */ #define BRK (1<<0) /* Send break */ /* Interrupt bits (IMSC, MIS, ICR) */ #define OEI (1<<10) /* Overrun Error interrupt mask */ #define BEI (1<<9) /* Break Error interrupt mask */ #define PEI (1<<8) /* Parity Error interrupt mask */ #define FEI (1<<7) /* Framing Error interrupt mask */ #define RTI (1<<6) /* Receive Timeout interrupt mask */ #define TXI (1<<5) /* Transmit interrupt mask */ #define RXI (1<<4) /* Receive interrupt mask */ #define DSRMI (1<<3) /* nUARTDSR Modem interrupt mask */ #define DCDMI (1<<2) /* nUARTDCD Modem interrupt mask */ #define CTSMI (1<<1) /* nUARTCTS Modem interrupt mask */ #define RIMI (1<<0) /* nUARTRI Modem interrupt mask */ #define ALLI OEI|BEI|PEI|FEI|RTI|TXI|RXI|DSRMI|DCDMI|CTSMI|RIMI #endif /* __ASM_ARM_PL011_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm32/0000775000175000017500000000000012307313555015141 5ustar smbsmbxen-4.4.0/xen/include/asm-arm/arm32/page.h0000664000175000017500000000713512307313555016234 0ustar smbsmb#ifndef __ARM_ARM32_PAGE_H__ #define __ARM_ARM32_PAGE_H__ #ifndef __ASSEMBLY__ /* Write a pagetable entry. * * If the table entry is changing a text mapping, it is responsibility * of the caller to issue an ISB after write_pte. */ static inline void write_pte(lpae_t *p, lpae_t pte) { asm volatile ( /* Ensure any writes have completed with the old mappings. */ "dsb;" /* Safely write the entry (STRD is atomic on CPUs that support LPAE) */ "strd %0, %H0, [%1];" "dsb;" : : "r" (pte.bits), "r" (p) : "memory"); } /* Inline ASM to flush dcache on register R (may be an inline asm operand) */ #define __clean_xen_dcache_one(R) STORE_CP32(R, DCCMVAC) /* Inline ASM to clean and invalidate dcache on register R (may be an * inline asm operand) */ #define __clean_and_invalidate_xen_dcache_one(R) STORE_CP32(R, DCCIMVAC) /* * Flush all hypervisor mappings from the TLB and branch predictor. * This is needed after changing Xen code mappings. * * The caller needs to issue the necessary DSB and D-cache flushes * before calling flush_xen_text_tlb. 
*/ static inline void flush_xen_text_tlb(void) { register unsigned long r0 asm ("r0"); asm volatile ( "isb;" /* Ensure synchronization with previous changes to text */ STORE_CP32(0, TLBIALLH) /* Flush hypervisor TLB */ STORE_CP32(0, ICIALLU) /* Flush I-cache */ STORE_CP32(0, BPIALL) /* Flush branch predictor */ "dsb;" /* Ensure completion of TLB+BP flush */ "isb;" : : "r" (r0) /*dummy*/ : "memory"); } /* * Flush all hypervisor mappings from the data TLB. This is not * sufficient when changing code mappings or for self modifying code. */ static inline void flush_xen_data_tlb(void) { register unsigned long r0 asm ("r0"); asm volatile("dsb;" /* Ensure preceding are visible */ STORE_CP32(0, TLBIALLH) "dsb;" /* Ensure completion of the TLB flush */ "isb;" : : "r" (r0) /* dummy */: "memory"); } /* * Flush a range of VA's hypervisor mappings from the data TLB. This is not * sufficient when changing code mappings or for self modifying code. */ static inline void flush_xen_data_tlb_range_va(unsigned long va, unsigned long size) { unsigned long end = va + size; dsb(); /* Ensure preceding are visible */ while ( va < end ) { asm volatile(STORE_CP32(0, TLBIMVAH) : : "r" (va) : "memory"); va += PAGE_SIZE; } dsb(); /* Ensure completion of the TLB flush */ isb(); } /* Ask the MMU to translate a VA for us */ static inline uint64_t __va_to_par(vaddr_t va) { uint64_t par, tmp; tmp = READ_CP64(PAR); WRITE_CP32(va, ATS1HR); isb(); /* Ensure result is available. */ par = READ_CP64(PAR); WRITE_CP64(tmp, PAR); return par; } /* Ask the MMU to translate a Guest VA for us */ static inline uint64_t gva_to_ma_par(vaddr_t va) { uint64_t par, tmp; tmp = READ_CP64(PAR); WRITE_CP32(va, ATS12NSOPR); isb(); /* Ensure result is available. */ par = READ_CP64(PAR); WRITE_CP64(tmp, PAR); return par; } static inline uint64_t gva_to_ipa_par(vaddr_t va) { uint64_t par, tmp; tmp = READ_CP64(PAR); WRITE_CP32(va, ATS1CPR); isb(); /* Ensure result is available. 
*/ par = READ_CP64(PAR); WRITE_CP64(tmp, PAR); return par; } #endif /* __ASSEMBLY__ */ #endif /* __ARM_ARM32_PAGE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm32/flushtlb.h0000664000175000017500000000164512307313555017143 0ustar smbsmb#ifndef __ASM_ARM_ARM32_FLUSHTLB_H__ #define __ASM_ARM_ARM32_FLUSHTLB_H__ /* Flush local TLBs, current VMID only */ static inline void flush_tlb_local(void) { dsb(); WRITE_CP32((uint32_t) 0, TLBIALL); dsb(); isb(); } /* Flush inner shareable TLBs, current VMID only */ static inline void flush_tlb(void) { dsb(); WRITE_CP32((uint32_t) 0, TLBIALLIS); dsb(); isb(); } /* Flush local TLBs, all VMIDs, non-hypervisor mode */ static inline void flush_tlb_all_local(void) { dsb(); WRITE_CP32((uint32_t) 0, TLBIALLNSNH); dsb(); isb(); } /* Flush innershareable TLBs, all VMIDs, non-hypervisor mode */ static inline void flush_tlb_all(void) { dsb(); WRITE_CP32((uint32_t) 0, TLBIALLNSNHIS); dsb(); isb(); } #endif /* __ASM_ARM_ARM32_FLUSHTLB_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm32/processor.h0000664000175000017500000000755312307313555017343 0ustar smbsmb#ifndef __ASM_ARM_ARM32_PROCESSOR_H #define __ASM_ARM_ARM32_PROCESSOR_H #define ACTLR_V7_SMP (1<<6) #ifndef __ASSEMBLY__ /* On stack VCPU state */ struct cpu_user_regs { uint32_t r0; uint32_t r1; uint32_t r2; uint32_t r3; uint32_t r4; uint32_t r5; uint32_t r6; uint32_t r7; uint32_t r8; uint32_t r9; uint32_t r10; union { uint32_t r11; uint32_t fp; }; uint32_t r12; uint32_t sp; /* r13 - SP: Valid for Hyp. frames only, o/w banked (see below) */ /* r14 - LR: is the same physical register as LR_usr */ union { uint32_t lr; /* r14 - LR: Valid for Hyp. Same physical register as lr_usr. */ uint32_t lr_usr; }; union { /* Return IP, pc32 is used to allow code to be common with 64-bit */ uint32_t pc, pc32; }; uint32_t cpsr; /* Return mode */ uint32_t pad0; /* Doubleword-align the kernel half of the frame */ /* Outer guest frame only from here on... */ uint32_t sp_usr; /* LR_usr is the same register as LR, see above */ uint32_t sp_irq, lr_irq; uint32_t sp_svc, lr_svc; uint32_t sp_abt, lr_abt; uint32_t sp_und, lr_und; uint32_t r8_fiq, r9_fiq, r10_fiq, r11_fiq, r12_fiq; uint32_t sp_fiq, lr_fiq; uint32_t spsr_svc, spsr_abt, spsr_und, spsr_irq, spsr_fiq; uint32_t pad1; /* Doubleword-align the user half of the frame */ }; #endif /* Layout as used in assembly, with src/dest registers mixed in */ #define __CP32(r, coproc, opc1, crn, crm, opc2) coproc, opc1, r, crn, crm, opc2 #define __CP64(r1, r2, coproc, opc, crm) coproc, opc, r1, r2, crm #define CP32(r, name...) __CP32(r, name) #define CP64(r, name...) __CP64(r, name) /* Stringified for inline assembly */ #define LOAD_CP32(r, name...) "mrc " __stringify(CP32(%r, name)) ";" #define STORE_CP32(r, name...) "mcr " __stringify(CP32(%r, name)) ";" #define LOAD_CP64(r, name...) "mrrc " __stringify(CP64(%r, %H##r, name)) ";" #define STORE_CP64(r, name...) "mcrr " __stringify(CP64(%r, %H##r, name)) ";" #ifndef __ASSEMBLY__ /* C wrappers */ #define READ_CP32(name...) ({ \ register uint32_t _r; \ asm volatile(LOAD_CP32(0, name) : "=r" (_r)); \ _r; }) #define WRITE_CP32(v, name...) do { \ register uint32_t _r = (v); \ asm volatile(STORE_CP32(0, name) : : "r" (_r)); \ } while (0) #define READ_CP64(name...) 
({ \ register uint64_t _r; \ asm volatile(LOAD_CP64(0, name) : "=r" (_r)); \ _r; }) #define WRITE_CP64(v, name...) do { \ register uint64_t _r = (v); \ asm volatile(STORE_CP64(0, name) : : "r" (_r)); \ } while (0) /* * C wrappers for accessing system registers. * * Registers come in 3 types: * - those which are always 32-bit regardless of AArch32 vs AArch64 * (use {READ,WRITE}_SYSREG32). * - those which are always 64-bit regardless of AArch32 vs AArch64 * (use {READ,WRITE}_SYSREG64). * - those which vary between AArch32 and AArch64 (use {READ,WRITE}_SYSREG). */ #define READ_SYSREG32(R...) READ_CP32(R) #define WRITE_SYSREG32(V, R...) WRITE_CP32(V, R) #define READ_SYSREG64(R...) READ_CP64(R) #define WRITE_SYSREG64(V, R...) WRITE_CP64(V, R) #define READ_SYSREG(R...) READ_SYSREG32(R) #define WRITE_SYSREG(V, R...) WRITE_SYSREG32(V, R) /* Erratum 766422: only Cortex A15 r0p4 is affected */ #define cpu_has_erratum_766422() \ (unlikely(current_cpu_data.midr.bits == 0x410fc0f4)) #endif /* __ASSEMBLY__ */ #endif /* __ASM_ARM_ARM32_PROCESSOR_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm32/io.h0000664000175000017500000000654412307313555015732 0ustar smbsmb/* * Based on linux arch/arm/include/asm/io.h * * Copyright (C) 1996-2000 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * Modifications: * 16-Sep-1996 RMK Inlined the inx/outx functions & optimised for both * constant addresses and variable addresses. * 04-Dec-1997 RMK Moved a lot of this stuff to the new architecture * specific IO header files. * 27-Mar-1999 PJB Second parameter of memcpy_toio is const.. * 04-Apr-1999 PJB Added check_signature. 
* 12-Dec-1999 RMK More cleanups * 18-Jun-2000 RMK Removed virt_to_* and friends definitions * 05-Oct-2004 BJD Moved memory string functions to use void __iomem */ #ifndef _ARM_ARM32_IO_H #define _ARM_ARM32_IO_H #include #include static inline void __raw_writeb(u8 val, volatile void __iomem *addr) { asm volatile("strb %1, %0" : "+Qo" (*(volatile u8 __force *)addr) : "r" (val)); } static inline void __raw_writew(u16 val, volatile void __iomem *addr) { asm volatile("strh %1, %0" : "+Q" (*(volatile u16 __force *)addr) : "r" (val)); } static inline void __raw_writel(u32 val, volatile void __iomem *addr) { asm volatile("str %1, %0" : "+Qo" (*(volatile u32 __force *)addr) : "r" (val)); } static inline u8 __raw_readb(const volatile void __iomem *addr) { u8 val; asm volatile("ldrb %1, %0" : "+Qo" (*(volatile u8 __force *)addr), "=r" (val)); return val; } static inline u16 __raw_readw(const volatile void __iomem *addr) { u16 val; asm volatile("ldrh %1, %0" : "+Q" (*(volatile u16 __force *)addr), "=r" (val)); return val; } static inline u32 __raw_readl(const volatile void __iomem *addr) { u32 val; asm volatile("ldr %1, %0" : "+Qo" (*(volatile u32 __force *)addr), "=r" (val)); return val; } #define __iormb() rmb() #define __iowmb() wmb() #define readb_relaxed(c) ({ u8 __r = __raw_readb(c); __r; }) #define readw_relaxed(c) ({ u16 __r = le16_to_cpu((__force __le16) \ __raw_readw(c)); __r; }) #define readl_relaxed(c) ({ u32 __r = le32_to_cpu((__force __le32) \ __raw_readl(c)); __r; }) #define writeb_relaxed(v,c) __raw_writeb(v,c) #define writew_relaxed(v,c) __raw_writew((__force u16) cpu_to_le16(v),c) #define writel_relaxed(v,c) __raw_writel((__force u32) cpu_to_le32(v),c) #define readb(c) ({ u8 __v = readb_relaxed(c); __iormb(); __v; }) #define readw(c) ({ u16 __v = readw_relaxed(c); __iormb(); __v; }) #define readl(c) ({ u32 __v = readl_relaxed(c); __iormb(); __v; }) #define writeb(v,c) ({ __iowmb(); writeb_relaxed(v,c); }) #define writew(v,c) ({ __iowmb(); writew_relaxed(v,c); }) #define writel(v,c) ({ __iowmb(); writel_relaxed(v,c); }) #endif /* _ARM_ARM32_IO_H */ xen-4.4.0/xen/include/asm-arm/arm32/bitops.h0000664000175000017500000000406112307313555016613 0ustar smbsmb#ifndef _ARM_ARM32_BITOPS_H #define _ARM_ARM32_BITOPS_H extern void _set_bit(int nr, volatile void * p); extern void _clear_bit(int nr, volatile void * p); extern void _change_bit(int nr, volatile void * p); extern int _test_and_set_bit(int nr, volatile void * p); extern int _test_and_clear_bit(int nr, volatile void * p); extern int _test_and_change_bit(int nr, volatile void * p); #define set_bit(n,p) _set_bit(n,p) #define clear_bit(n,p) _clear_bit(n,p) #define change_bit(n,p) _change_bit(n,p) #define test_and_set_bit(n,p) _test_and_set_bit(n,p) #define test_and_clear_bit(n,p) _test_and_clear_bit(n,p) #define test_and_change_bit(n,p) _test_and_change_bit(n,p) /* * Little endian assembly bitops. nr = 0 -> byte 0 bit 0. */ extern int _find_first_zero_bit_le(const void * p, unsigned size); extern int _find_next_zero_bit_le(const void * p, int size, int offset); extern int _find_first_bit_le(const unsigned long *p, unsigned size); extern int _find_next_bit_le(const unsigned long *p, int size, int offset); /* * Big endian assembly bitops. nr = 0 -> byte 3 bit 0. 
*/ extern int _find_first_zero_bit_be(const void * p, unsigned size); extern int _find_next_zero_bit_be(const void * p, int size, int offset); extern int _find_first_bit_be(const unsigned long *p, unsigned size); extern int _find_next_bit_be(const unsigned long *p, int size, int offset); #ifndef __ARMEB__ /* * These are the little endian, atomic definitions. */ #define find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) #define find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off) #define find_first_bit(p,sz) _find_first_bit_le(p,sz) #define find_next_bit(p,sz,off) _find_next_bit_le(p,sz,off) #else /* * These are the big endian, atomic definitions. */ #define find_first_zero_bit(p,sz) _find_first_zero_bit_be(p,sz) #define find_next_zero_bit(p,sz,off) _find_next_zero_bit_be(p,sz,off) #define find_first_bit(p,sz) _find_first_bit_be(p,sz) #define find_next_bit(p,sz,off) _find_next_bit_be(p,sz,off) #endif #endif /* _ARM_ARM32_BITOPS_H */ xen-4.4.0/xen/include/asm-arm/arm32/atomic.h0000664000175000017500000000750712307313555016577 0ustar smbsmb/* * arch/arm/include/asm/atomic.h * * Copyright (C) 1996 Russell King. * Copyright (C) 2002 Deep Blue Solutions Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #ifndef __ARCH_ARM_ARM32_ATOMIC__ #define __ARCH_ARM_ARM32_ATOMIC__ /* * ARMv6 UP and SMP safe atomic ops. We use load exclusive and * store exclusive to ensure that these are atomic. We may loop * to ensure that the update happens. */ static inline void atomic_add(int i, atomic_t *v) { unsigned long tmp; int result; __asm__ __volatile__("@ atomic_add\n" "1: ldrex %0, [%3]\n" " add %0, %0, %4\n" " strex %1, %0, [%3]\n" " teq %1, #0\n" " bne 1b" : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) : "r" (&v->counter), "Ir" (i) : "cc"); } static inline int atomic_add_return(int i, atomic_t *v) { unsigned long tmp; int result; smp_mb(); __asm__ __volatile__("@ atomic_add_return\n" "1: ldrex %0, [%3]\n" " add %0, %0, %4\n" " strex %1, %0, [%3]\n" " teq %1, #0\n" " bne 1b" : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) : "r" (&v->counter), "Ir" (i) : "cc"); smp_mb(); return result; } static inline void atomic_sub(int i, atomic_t *v) { unsigned long tmp; int result; __asm__ __volatile__("@ atomic_sub\n" "1: ldrex %0, [%3]\n" " sub %0, %0, %4\n" " strex %1, %0, [%3]\n" " teq %1, #0\n" " bne 1b" : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) : "r" (&v->counter), "Ir" (i) : "cc"); } static inline int atomic_sub_return(int i, atomic_t *v) { unsigned long tmp; int result; smp_mb(); __asm__ __volatile__("@ atomic_sub_return\n" "1: ldrex %0, [%3]\n" " sub %0, %0, %4\n" " strex %1, %0, [%3]\n" " teq %1, #0\n" " bne 1b" : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) : "r" (&v->counter), "Ir" (i) : "cc"); smp_mb(); return result; } static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) { unsigned long oldval, res; smp_mb(); do { __asm__ __volatile__("@ atomic_cmpxchg\n" "ldrex %1, [%3]\n" "mov %0, #0\n" "teq %1, %4\n" "strexeq %0, %5, [%3]\n" : "=&r" (res), "=&r" (oldval), "+Qo" (ptr->counter) : "r" (&ptr->counter), "Ir" (old), "r" (new) : "cc"); } while (res); smp_mb(); return oldval; } static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) { unsigned long tmp, tmp2; __asm__ __volatile__("@ atomic_clear_mask\n" "1: ldrex %0, [%3]\n" " bic %0, %0, %4\n" " strex %1, %0, [%3]\n" " teq %1, #0\n" " bne 1b" : "=&r" (tmp), 
"=&r" (tmp2), "+Qo" (*addr) : "r" (addr), "Ir" (mask) : "cc"); } #define atomic_inc(v) atomic_add(1, v) #define atomic_dec(v) atomic_sub(1, v) #define atomic_inc_and_test(v) (atomic_add_return(1, v) == 0) #define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) #define atomic_inc_return(v) (atomic_add_return(1, v)) #define atomic_dec_return(v) (atomic_sub_return(1, v)) #define atomic_sub_and_test(i, v) (atomic_sub_return(i, v) == 0) #define atomic_add_negative(i,v) (atomic_add_return(i, v) < 0) #endif /* __ARCH_ARM_ARM32_ATOMIC__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm32/system.h0000664000175000017500000001453612307313555016647 0ustar smbsmb/* Portions taken from Linux arch arm */ #ifndef __ASM_ARM32_SYSTEM_H #define __ASM_ARM32_SYSTEM_H #define sev() __asm__ __volatile__ ("sev" : : : "memory") #define wfe() __asm__ __volatile__ ("wfe" : : : "memory") #define wfi() __asm__ __volatile__ ("wfi" : : : "memory") #define isb() __asm__ __volatile__ ("isb" : : : "memory") #define dsb() __asm__ __volatile__ ("dsb" : : : "memory") #define dmb() __asm__ __volatile__ ("dmb" : : : "memory") #define mb() dsb() #define rmb() dsb() #define wmb() mb() #define smp_mb() mb() #define smp_rmb() rmb() #define smp_wmb() wmb() extern void __bad_xchg(volatile void *, int); static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size) { unsigned long ret; unsigned int tmp; smp_mb(); switch (size) { case 1: asm volatile("@ __xchg1\n" "1: ldrexb %0, [%3]\n" " strexb %1, %2, [%3]\n" " teq %1, #0\n" " bne 1b" : "=&r" (ret), "=&r" (tmp) : "r" (x), "r" (ptr) : "memory", "cc"); break; case 4: asm volatile("@ __xchg4\n" "1: ldrex %0, [%3]\n" " strex %1, %2, [%3]\n" " teq %1, #0\n" " bne 1b" : "=&r" (ret), "=&r" (tmp) : "r" (x), "r" (ptr) : "memory", "cc"); break; default: __bad_xchg(ptr, size), ret = 0; break; } smp_mb(); return ret; } /* * Atomic compare and exchange. Compare OLD with MEM, if identical, * store NEW in MEM. Return the initial value in MEM. Success is * indicated by comparing RETURN with OLD. 
*/ extern void __bad_cmpxchg(volatile void *ptr, int size); static always_inline unsigned long __cmpxchg( volatile void *ptr, unsigned long old, unsigned long new, int size) { unsigned long /*long*/ oldval, res; switch (size) { case 1: do { asm volatile("@ __cmpxchg1\n" " ldrexb %1, [%2]\n" " mov %0, #0\n" " teq %1, %3\n" " strexbeq %0, %4, [%2]\n" : "=&r" (res), "=&r" (oldval) : "r" (ptr), "Ir" (old), "r" (new) : "memory", "cc"); } while (res); break; case 2: do { asm volatile("@ __cmpxchg2\n" " ldrexh %1, [%2]\n" " mov %0, #0\n" " teq %1, %3\n" " strexheq %0, %4, [%2]\n" : "=&r" (res), "=&r" (oldval) : "r" (ptr), "Ir" (old), "r" (new) : "memory", "cc"); } while (res); break; case 4: do { asm volatile("@ __cmpxchg4\n" " ldrex %1, [%2]\n" " mov %0, #0\n" " teq %1, %3\n" " strexeq %0, %4, [%2]\n" : "=&r" (res), "=&r" (oldval) : "r" (ptr), "Ir" (old), "r" (new) : "memory", "cc"); } while (res); break; #if 0 case 8: do { asm volatile("@ __cmpxchg8\n" " ldrexd %1, [%2]\n" " mov %0, #0\n" " teq %1, %3\n" " strexdeq %0, %4, [%2]\n" : "=&r" (res), "=&r" (oldval) : "r" (ptr), "Ir" (old), "r" (new) : "memory", "cc"); } while (res); break; #endif default: __bad_cmpxchg(ptr, size); oldval = 0; } return oldval; } #define cmpxchg(ptr,o,n) \ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o), \ (unsigned long)(n),sizeof(*(ptr)))) #define local_irq_disable() asm volatile ( "cpsid i @ local_irq_disable\n" : : : "cc" ) #define local_irq_enable() asm volatile ( "cpsie i @ local_irq_enable\n" : : : "cc" ) #define local_save_flags(x) \ ({ \ BUILD_BUG_ON(sizeof(x) != sizeof(long)); \ asm volatile ( "mrs %0, cpsr @ local_save_flags\n" \ : "=r" (x) :: "memory", "cc" ); \ }) #define local_irq_save(x) \ ({ \ local_save_flags(x); \ local_irq_disable(); \ }) #define local_irq_restore(x) \ ({ \ BUILD_BUG_ON(sizeof(x) != sizeof(long)); \ asm volatile ( \ "msr cpsr_c, %0 @ local_irq_restore\n" \ : \ : "r" (flags) \ : "memory", "cc"); \ }) static inline int local_irq_is_enabled(void) { unsigned long flags; local_save_flags(flags); return !(flags & PSR_IRQ_MASK); } #define local_fiq_enable() __asm__("cpsie f @ __stf\n" : : : "memory", "cc") #define local_fiq_disable() __asm__("cpsid f @ __clf\n" : : : "memory", "cc") #define local_abort_enable() __asm__("cpsie a @ __sta\n" : : : "memory", "cc") #define local_abort_disable() __asm__("cpsid a @ __sta\n" : : : "memory", "cc") static inline int local_fiq_is_enabled(void) { unsigned long flags; local_save_flags(flags); return !(flags & PSR_FIQ_MASK); } #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm32/spinlock.h0000664000175000017500000000505312307313555017137 0ustar smbsmb#ifndef __ASM_ARM32_SPINLOCK_H #define __ASM_ARM32_SPINLOCK_H static inline void dsb_sev(void) { __asm__ __volatile__ ( "dsb\n" "sev\n" ); } typedef struct { volatile unsigned int lock; } raw_spinlock_t; #define _RAW_SPIN_LOCK_UNLOCKED { 0 } #define _raw_spin_is_locked(x) ((x)->lock != 0) static always_inline void _raw_spin_unlock(raw_spinlock_t *lock) { ASSERT(_raw_spin_is_locked(lock)); smp_mb(); __asm__ __volatile__( " str %1, [%0]\n" : : "r" (&lock->lock), "r" (0) : "cc"); dsb_sev(); } static always_inline int _raw_spin_trylock(raw_spinlock_t *lock) { unsigned long contended, res; do { __asm__ __volatile__( " ldrex %0, [%2]\n" " teq %0, #0\n" " strexeq %1, %3, [%2]\n" " movne %1, #0\n" : "=&r" (contended), "=r" (res) : "r" (&lock->lock), "r" (1) : "cc"); } while (res); if (!contended) { smp_mb(); 
return 1; } else { return 0; } } typedef struct { volatile unsigned int lock; } raw_rwlock_t; #define _RAW_RW_LOCK_UNLOCKED { 0 } static always_inline int _raw_read_trylock(raw_rwlock_t *rw) { unsigned long tmp, tmp2 = 1; __asm__ __volatile__( "1: ldrex %0, [%2]\n" " adds %0, %0, #1\n" " strexpl %1, %0, [%2]\n" : "=&r" (tmp), "+r" (tmp2) : "r" (&rw->lock) : "cc"); smp_mb(); return tmp2 == 0; } static always_inline int _raw_write_trylock(raw_rwlock_t *rw) { unsigned long tmp; __asm__ __volatile__( "1: ldrex %0, [%1]\n" " teq %0, #0\n" " strexeq %0, %2, [%1]" : "=&r" (tmp) : "r" (&rw->lock), "r" (0x80000000) : "cc"); if (tmp == 0) { smp_mb(); return 1; } else { return 0; } } static inline void _raw_read_unlock(raw_rwlock_t *rw) { unsigned long tmp, tmp2; smp_mb(); __asm__ __volatile__( "1: ldrex %0, [%2]\n" " sub %0, %0, #1\n" " strex %1, %0, [%2]\n" " teq %1, #0\n" " bne 1b" : "=&r" (tmp), "=&r" (tmp2) : "r" (&rw->lock) : "cc"); if (tmp == 0) dsb_sev(); } static inline void _raw_write_unlock(raw_rwlock_t *rw) { smp_mb(); __asm__ __volatile__( "str %1, [%0]\n" : : "r" (&rw->lock), "r" (0) : "cc"); dsb_sev(); } #define _raw_rw_is_locked(x) ((x)->lock != 0) #define _raw_rw_is_write_locked(x) ((x)->lock == 0x80000000) #endif /* __ASM_SPINLOCK_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/arm32/vfp.h0000664000175000017500000000206612307313555016111 0ustar smbsmb#ifndef _ARM_ARM32_VFP_H #define _ARM_ARM32_VFP_H #define FPEXC_EX (1u << 31) #define FPEXC_EN (1u << 30) #define FPEXC_FP2V (1u << 28) #define MVFR0_A_SIMD_MASK (0xf << 0) #define FPSID_IMPLEMENTER_BIT (24) #define FPSID_IMPLEMENTER_MASK (0xff << FPSID_IMPLEMENTER_BIT) #define FPSID_ARCH_BIT (16) #define FPSID_ARCH_MASK (0xf << FPSID_ARCH_BIT) #define FPSID_PART_BIT (8) #define FPSID_PART_MASK (0xff << FPSID_PART_BIT) #define FPSID_VARIANT_BIT (4) #define FPSID_VARIANT_MASK (0xf << FPSID_VARIANT_BIT) #define FPSID_REV_BIT (0) #define FPSID_REV_MASK (0xf << FPSID_REV_BIT) struct vfp_state { uint64_t fpregs1[16]; /* {d0-d15} */ uint64_t fpregs2[16]; /* {d16-d31} */ uint32_t fpexc; uint32_t fpscr; /* VFP implementation specific state */ uint32_t fpinst; uint32_t fpinst2; }; #endif /* _ARM_ARM32_VFP_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/system.h0000664000175000017500000000211512307313555015711 0ustar smbsmb/* Portions taken from Linux arch arm */ #ifndef __ASM_SYSTEM_H #define __ASM_SYSTEM_H #include #include #define nop() \ asm volatile ( "nop" ) #define xchg(ptr,x) \ ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) /* * This is used to ensure the compiler did actually allocate the register we * asked it for some inline assembly sequences. Apparently we can't trust * the compiler from one version to another so a bit of paranoia won't hurt. * This string is meant to be concatenated with the inline asm string and * will cause compilation to stop on mismatch. 
* (for details, see gcc PR 15089) */ #define __asmeq(x, y) ".ifnc " x "," y " ; .err ; .endif\n\t" #if defined(CONFIG_ARM_32) # include #elif defined(CONFIG_ARM_64) # include #else # error "unknown ARM variant" #endif extern struct vcpu *__context_switch(struct vcpu *prev, struct vcpu *next); #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/processor-ca7.h0000664000175000017500000000104512307313555017055 0ustar smbsmb#ifndef __ASM_ARM_PROCESSOR_CA7_H #define __ASM_ARM_PROCESSOR_CA7_H /* ACTLR Auxiliary Control Register, Cortex A7 */ #define ACTLR_CA7_DDI (1<<28) #define ACTLR_CA7_DDVM (1<<15) #define ACTLR_CA7_L1RADIS (1<<12) #define ACTLR_CA7_L2RADIS (1<<11) #define ACTLR_CA7_DODMBS (1<<10) #define ACTLR_CA7_SMP (1<<6) #endif /* __ASM_ARM_PROCESSOR_CA7_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/processor-ca15.h0000664000175000017500000000302412307313555017133 0ustar smbsmb#ifndef __ASM_ARM_PROCESSOR_CA15_H #define __ASM_ARM_PROCESSOR_CA15_H /* ACTLR Auxiliary Control Register, Cortex A15 */ #define ACTLR_CA15_SNOOP_DELAYED (1<<31) #define ACTLR_CA15_MAIN_CLOCK (1<<30) #define ACTLR_CA15_NEON_CLOCK (1<<29) #define ACTLR_CA15_NONCACHE (1<<24) #define ACTLR_CA15_INORDER_REQ (1<<23) #define ACTLR_CA15_INORDER_LOAD (1<<22) #define ACTLR_CA15_L2_TLB_PREFETCH (1<<21) #define ACTLR_CA15_L2_IPA_PA_CACHE (1<<20) #define ACTLR_CA15_L2_CACHE (1<<19) #define ACTLR_CA15_L2_PA_CACHE (1<<18) #define ACTLR_CA15_TLB (1<<17) #define ACTLR_CA15_STRONGY_ORDERED (1<<16) #define ACTLR_CA15_INORDER (1<<15) #define ACTLR_CA15_FORCE_LIM (1<<14) #define ACTLR_CA15_CP_FLUSH (1<<13) #define ACTLR_CA15_CP_PUSH (1<<12) #define ACTLR_CA15_LIM (1<<11) #define ACTLR_CA15_SER (1<<10) #define ACTLR_CA15_OPT (1<<9) #define ACTLR_CA15_WFI (1<<8) #define ACTLR_CA15_WFE (1<<7) #define ACTLR_CA15_SMP (1<<6) #define ACTLR_CA15_PLD (1<<5) #define ACTLR_CA15_IP (1<<4) #define ACTLR_CA15_MICRO_BTB (1<<3) #define ACTLR_CA15_LOOP_ONE (1<<2) #define ACTLR_CA15_LOOP_DISABLE (1<<1) #define ACTLR_CA15_BTB (1<<0) #endif /* __ASM_ARM_PROCESSOR_CA15_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/cpufeature.h0000664000175000017500000000265612307313555016542 0ustar smbsmb#ifndef __ASM_ARM_CPUFEATURE_H #define __ASM_ARM_CPUFEATURE_H #ifdef CONFIG_ARM_64 #define cpu_feature64(c, feat) ((c)->pfr64.feat) #define boot_cpu_feature64(feat) (boot_cpu_data.pfr64.feat) #define cpu_has_el0_32 (boot_cpu_feature64(el0) == 2) #define cpu_has_el0_64 (boot_cpu_feature64(el0) >= 1) #define cpu_has_el1_32 (boot_cpu_feature64(el1) == 2) #define cpu_has_el1_64 (boot_cpu_feature64(el1) >= 1) #define cpu_has_el2_32 (boot_cpu_feature64(el2) == 2) #define cpu_has_el2_64 (boot_cpu_feature64(el2) >= 1) #define cpu_has_el3_32 (boot_cpu_feature64(el3) == 2) #define cpu_has_el3_64 (boot_cpu_feature64(el3) >= 1) #define cpu_has_fp (boot_cpu_feature64(fp) == 0) #define cpu_has_simd (boot_cpu_feature64(simd) == 0) #endif #define cpu_feature32(c, feat) ((c)->pfr32.feat) #define boot_cpu_feature32(feat) (boot_cpu_data.pfr32.feat) #define cpu_has_aarch32 (boot_cpu_feature32(arm) == 1) #define cpu_has_thumb (boot_cpu_feature32(thumb) >= 1) #define cpu_has_thumb2 (boot_cpu_feature32(thumb) >= 3) #define cpu_has_jazelle (boot_cpu_feature32(jazelle) >= 0) #define cpu_has_thumbee 
(boot_cpu_feature32(thumbee) == 1) #define cpu_has_gentimer (boot_cpu_feature32(gentimer) == 1) #define cpu_has_security (boot_cpu_feature32(security) > 0) #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/spinlock.h0000664000175000017500000000063012307313555016207 0ustar smbsmb#ifndef __ASM_SPINLOCK_H #define __ASM_SPINLOCK_H #include #include #if defined(CONFIG_ARM_32) # include #elif defined(CONFIG_ARM_64) # include #else # error "unknown ARM variant" #endif #endif /* __ASM_SPINLOCK_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/debugger.h0000664000175000017500000000044512307313555016155 0ustar smbsmb#ifndef __ARM_DEBUGGER_H__ #define __ARM_DEBUGGER_H__ #define debugger_trap_fatal(v, r) ((void) 0) #define debugger_trap_immediate() ((void) 0) #endif /* __ARM_DEBUGGER_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/byteorder.h0000664000175000017500000000044212307313555016365 0ustar smbsmb#ifndef __ASM_ARM_BYTEORDER_H__ #define __ASM_ARM_BYTEORDER_H__ #define __BYTEORDER_HAS_U64__ #include #endif /* __ASM_ARM_BYTEORDER_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/device.h0000664000175000017500000000250312307313555015625 0ustar smbsmb#ifndef __ASM_ARM_DEVICE_H #define __ASM_ARM_DEVICE_H #include #include enum device_type { DEVICE_SERIAL }; struct device_desc { /* Device name */ const char *name; /* Device type */ enum device_type type; /* Array of device tree 'compatible' strings */ const char *const *compatible; /* Device initialization */ int (*init)(struct dt_device_node *dev, const void *data); }; /** * device_init - Initialize a device * @dev: device to initialize * @type: type of the device (serial, network...) * @data: specific data for initializing the device * * Return 0 on success. 
*/ int __init device_init(struct dt_device_node *dev, enum device_type type, const void *data); #define DT_DEVICE_START(_name, _namestr, _type) \ static const struct device_desc __dev_desc_##_name __used \ __attribute__((__section__(".dev.info"))) = { \ .name = _namestr, \ .type = _type, \ #define DT_DEVICE_END \ }; #endif /* __ASM_ARM_DEVICE_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/regs.h0000664000175000017500000000405212307313555015327 0ustar smbsmb#ifndef __ARM_REGS_H__ #define __ARM_REGS_H__ #define PSR_MODE_MASK 0x1f #ifndef __ASSEMBLY__ #include #include #include #define psr_mode(psr,m) (((psr) & PSR_MODE_MASK) == m) #define psr_mode_is_32bit(psr) !!((psr) & PSR_MODE_BIT) #define usr_mode(r) psr_mode((r)->cpsr,PSR_MODE_USR) #define fiq_mode(r) psr_mode((r)->cpsr,PSR_MODE_FIQ) #define irq_mode(r) psr_mode((r)->cpsr,PSR_MODE_IRQ) #define svc_mode(r) psr_mode((r)->cpsr,PSR_MODE_SVC) #define mon_mode(r) psr_mode((r)->cpsr,PSR_MODE_MON) #define abt_mode(r) psr_mode((r)->cpsr,PSR_MODE_ABT) #define und_mode(r) psr_mode((r)->cpsr,PSR_MODE_UND) #define sys_mode(r) psr_mode((r)->cpsr,PSR_MODE_SYS) #ifdef CONFIG_ARM_32 #define hyp_mode(r) psr_mode((r)->cpsr,PSR_MODE_HYP) #else #define hyp_mode(r) (psr_mode((r)->cpsr,PSR_MODE_EL2h) || \ psr_mode((r)->cpsr,PSR_MODE_EL2t)) #endif #define guest_mode(r) \ ({ \ unsigned long diff = (char *)guest_cpu_user_regs() - (char *)(r); \ /* Frame pointer must point into current CPU stack. */ \ ASSERT(diff < STACK_SIZE); \ /* If not a guest frame, it must be a hypervisor frame. */ \ ASSERT((diff == 0) || hyp_mode(r)); \ /* Return TRUE if it's a guest frame. */ \ (diff == 0); \ }) #define return_reg(v) ((v)->arch.cpu_info->guest_cpu_user_regs.r0) /* * Returns a pointer to the given register value in regs, taking the * processor mode (CPSR) into account. 
*/ extern register_t *select_user_reg(struct cpu_user_regs *regs, int reg); #endif #endif /* __ARM_REGS_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/pci.h0000664000175000017500000000014212307313555015136 0ustar smbsmb#ifndef __X86_PCI_H__ #define __X86_PCI_H__ struct arch_pci_dev { }; #endif /* __X86_PCI_H__ */ xen-4.4.0/xen/include/asm-arm/multicall.h0000664000175000017500000000042712307313555016357 0ustar smbsmb#ifndef __ASM_ARM_MULTICALL_H__ #define __ASM_ARM_MULTICALL_H__ extern void do_multicall_call(struct multicall_entry *call); #endif /* __ASM_ARM_MULTICALL_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/grant_table.h0000664000175000017500000000303512307313555016651 0ustar smbsmb#ifndef __ASM_GRANT_TABLE_H__ #define __ASM_GRANT_TABLE_H__ #include #define INVALID_GFN (-1UL) #define INITIAL_NR_GRANT_FRAMES 1 void gnttab_clear_flag(unsigned long nr, uint16_t *addr); int create_grant_host_mapping(unsigned long gpaddr, unsigned long mfn, unsigned int flags, unsigned int cache_flags); #define gnttab_host_mapping_get_page_type(op, d, rd) (0) int replace_grant_host_mapping(unsigned long gpaddr, unsigned long mfn, unsigned long new_gpaddr, unsigned int flags); void gnttab_mark_dirty(struct domain *d, unsigned long l); #define gnttab_create_status_page(d, t, i) do {} while (0) #define gnttab_status_gmfn(d, t, i) (0) #define gnttab_release_host_mappings(domain) 1 static inline int replace_grant_supported(void) { return 1; } #define gnttab_create_shared_page(d, t, i) \ do { \ share_xen_page_with_guest( \ virt_to_page((char *)(t)->shared_raw[i]), \ (d), XENSHARE_writable); \ } while ( 0 ) #define gnttab_shared_gmfn(d, t, i) \ ( ((i >= nr_grant_frames(d->grant_table)) && \ (i < max_nr_grant_frames)) ? 0 : (d->arch.grant_table_gpfn[i])) #endif /* __ASM_GRANT_TABLE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/exynos4210-uart.h0000664000175000017500000000564312307313555017203 0ustar smbsmb/* * xen/include/asm-arm/exynos4210-uart.h * * Common constant definition between early printk and the UART driver * for the exynos 4210 UART * * Julien Grall * Copyright (c) 2013 Linaro Limited. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #ifndef __ASM_ARM_EXYNOS4210_H #define __ASM_ARM_EXYNOS4210_H /* * this value is only valid for UART 2 and UART 3 * XXX: define per UART */ #define FIFO_MAX_SIZE 16 /* register addresses */ #define ULCON (0x00) #define UCON (0x04) #define UFCON (0x08) #define UMCON (0x0c) #define UTRSTAT (0x10) #define UERSTAT (0x14) #define UFSTAT (0x18) #define UMSTAT (0x1c) #define UTXH (0x20) #define URXH (0x24) #define UBRDIV (0x28) #define UFRACVAL (0x2c) #define UINTP (0x30) #define UINTS (0x34) #define UINTM (0x38) /* UCON */ #define UCON_RX_IRQ (1 << 0) #define UCON_TX_IRQ (1 << 2) #define UCON_RX_TIMEOUT (1 << 7) /* * FIXME: IRQ_LEVEL should be 1 << n but with this value, the IRQ * handler will never end... */ #define UCON_RX_IRQ_LEVEL (0 << 8) #define UCON_TX_IRQ_LEVEL (0 << 9) /* ULCON */ #define ULCON_STOPB_SHIFT 2 #define ULCON_PARITY_SHIFT 3 /* UFCON */ #define UFCON_FIFO_TX_RESET (1 << 2) #define UFCON_FIFO_RX_RESET (1 << 1) #define UFCON_FIFO_RESET (UFCON_FIFO_TX_RESET | UFCON_FIFO_RX_RESET) #define UFCON_FIFO_EN (1 << 0) #define UFCON_FIFO_TX_TRIGGER (0x6 << 8) /* UMCON */ #define UMCON_INT_EN (1 << 3) /* UERSTAT */ #define UERSTAT_OVERRUN (1 << 0) #define UERSTAT_PARITY (1 << 1) #define UERSTAT_FRAME (1 << 2) #define UERSTAT_BREAK (1 << 3) /* UFSTAT */ #define UFSTAT_TX_FULL (1 << 24) #define UFSTAT_TX_COUNT_SHIFT (16) #define UFSTAT_TX_COUNT_MASK (0xff << UFSTAT_TX_COUNT_SHIFT) #define UFSTAT_RX_FULL (1 << 8) #define UFSTAT_RX_COUNT_SHIFT (0) #define UFSTAT_RX_COUNT_MASK (0xff << UFSTAT_RX_COUNT_SHIFT) /* UTRSTAT */ #define UTRSTAT_TXFE (1 << 1) #define UTRSTAT_TXE (1 << 2) /* URHX */ #define URXH_DATA_MASK (0xff) /* Interrupt bits (UINTP, UINTS, UINTM) */ #define UINTM_MODEM (1 << 3) #define UINTM_TXD (1 << 2) #define UINTM_ERROR (1 << 1) #define UINTM_RXD (1 << 0) #define UINTM_ALLI (UINTM_MODEM | UINTM_TXD | UINTM_ERROR | UINTM_RXD) #endif /* __ASM_ARM_EXYNOS4210_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/random.h0000664000175000017500000000022512307313555015645 0ustar smbsmb#ifndef __ASM_RANDOM_H__ #define __ASM_RANDOM_H__ static inline unsigned int arch_get_random(void) { return 0; } #endif /* __ASM_RANDOM_H__ */ xen-4.4.0/xen/include/asm-arm/cpregs.h0000664000175000017500000003656512307313555015670 0ustar smbsmb#ifndef __ASM_ARM_CPREGS_H #define __ASM_ARM_CPREGS_H #include /* * AArch32 Co-processor registers. * * Note that AArch64 requires many of these definitions in order to * support 32-bit guests. 
*/ #define __HSR_CPREG_c0 0 #define __HSR_CPREG_c1 1 #define __HSR_CPREG_c2 2 #define __HSR_CPREG_c3 3 #define __HSR_CPREG_c4 4 #define __HSR_CPREG_c5 5 #define __HSR_CPREG_c6 6 #define __HSR_CPREG_c7 7 #define __HSR_CPREG_c8 8 #define __HSR_CPREG_c9 9 #define __HSR_CPREG_c10 10 #define __HSR_CPREG_c11 11 #define __HSR_CPREG_c12 12 #define __HSR_CPREG_c13 13 #define __HSR_CPREG_c14 14 #define __HSR_CPREG_c15 15 #define __HSR_CPREG_0 0 #define __HSR_CPREG_1 1 #define __HSR_CPREG_2 2 #define __HSR_CPREG_3 3 #define __HSR_CPREG_4 4 #define __HSR_CPREG_5 5 #define __HSR_CPREG_6 6 #define __HSR_CPREG_7 7 #define _HSR_CPREG32(cp,op1,crn,crm,op2) \ ((__HSR_CPREG_##crn) << HSR_CP32_CRN_SHIFT) | \ ((__HSR_CPREG_##crm) << HSR_CP32_CRM_SHIFT) | \ ((__HSR_CPREG_##op1) << HSR_CP32_OP1_SHIFT) | \ ((__HSR_CPREG_##op2) << HSR_CP32_OP2_SHIFT) #define _HSR_CPREG64(cp,op1,crm) \ ((__HSR_CPREG_##crm) << HSR_CP64_CRM_SHIFT) | \ ((__HSR_CPREG_##op1) << HSR_CP64_OP1_SHIFT) /* Encode a register as per HSR ISS pattern */ #define HSR_CPREG32(X) _HSR_CPREG32(X) #define HSR_CPREG64(X) _HSR_CPREG64(X) /* * Order registers by Coprocessor-> CRn-> Opcode 1-> CRm-> Opcode 2 * * This matches the ordering used in the ARM as well as the groupings * which the CP registers are allocated in. * * This is slightly different to the form of the instruction * arguments, which are cp,opc1,crn,crm,opc2. */ /* Coprocessor 10 */ #define FPSID p10,7,c0,c0,0 /* Floating-Point System ID Register */ #define FPSCR p10,7,c1,c0,0 /* Floating-Point Status and Control Register */ #define MVFR0 p10,7,c7,c0,0 /* Media and VFP Feature Register 0 */ #define FPEXC p10,7,c8,c0,0 /* Floating-Point Exception Control Register */ #define FPINST p10,7,c9,c0,0 /* Floating-Point Instruction Register */ #define FPINST2 p10,7,c10,c0,0 /* Floating-point Instruction Register 2 */ /* Coprocessor 14 */ /* CP14 CR0: */ #define TEECR p14,6,c0,c0,0 /* ThumbEE Configuration Register */ /* CP14 CR1: */ #define TEEHBR p14,6,c1,c0,0 /* ThumbEE Handler Base Register */ #define JOSCR p14,7,c1,c0,0 /* Jazelle OS Control Register */ /* CP14 CR2: */ #define JMCR p14,7,c2,c0,0 /* Jazelle Main Configuration Register */ /* Coprocessor 15 */ /* CP15 CR0: CPUID and Cache Type Registers */ #define MIDR p15,0,c0,c0,0 /* Main ID Register */ #define MPIDR p15,0,c0,c0,5 /* Multiprocessor Affinity Register */ #define ID_PFR0 p15,0,c0,c1,0 /* Processor Feature Register 0 */ #define ID_PFR1 p15,0,c0,c1,1 /* Processor Feature Register 1 */ #define ID_DFR0 p15,0,c0,c1,2 /* Debug Feature Register 0 */ #define ID_AFR0 p15,0,c0,c1,3 /* Auxiliary Feature Register 0 */ #define ID_MMFR0 p15,0,c0,c1,4 /* Memory Model Feature Register 0 */ #define ID_MMFR1 p15,0,c0,c1,5 /* Memory Model Feature Register 1 */ #define ID_MMFR2 p15,0,c0,c1,6 /* Memory Model Feature Register 2 */ #define ID_MMFR3 p15,0,c0,c1,7 /* Memory Model Feature Register 3 */ #define ID_ISAR0 p15,0,c0,c2,0 /* ISA Feature Register 0 */ #define ID_ISAR1 p15,0,c0,c2,1 /* ISA Feature Register 1 */ #define ID_ISAR2 p15,0,c0,c2,2 /* ISA Feature Register 2 */ #define ID_ISAR3 p15,0,c0,c2,3 /* ISA Feature Register 3 */ #define ID_ISAR4 p15,0,c0,c2,4 /* ISA Feature Register 4 */ #define ID_ISAR5 p15,0,c0,c2,5 /* ISA Feature Register 5 */ #define CCSIDR p15,1,c0,c0,0 /* Cache Size ID Registers */ #define CLIDR p15,1,c0,c0,1 /* Cache Level ID Register */ #define CSSELR p15,2,c0,c0,0 /* Cache Size Selection Register */ #define VPIDR p15,4,c0,c0,0 /* Virtualization Processor ID Register */ #define VMPIDR p15,4,c0,c0,5 /* Virtualization 
Multiprocessor ID Register */ /* CP15 CR1: System Control Registers */ #define SCTLR p15,0,c1,c0,0 /* System Control Register */ #define ACTLR p15,0,c1,c0,1 /* Auxiliary Control Register */ #define CPACR p15,0,c1,c0,2 /* Coprocessor Access Control Register */ #define SCR p15,0,c1,c1,0 /* Secure Configuration Register */ #define NSACR p15,0,c1,c1,2 /* Non-Secure Access Control Register */ #define HSCTLR p15,4,c1,c0,0 /* Hyp. System Control Register */ #define HCR p15,4,c1,c1,0 /* Hyp. Configuration Register */ /* CP15 CR2: Translation Table Base and Control Registers */ #define TTBCR p15,0,c2,c0,2 /* Translatation Table Base Control Register */ #define TTBR0 p15,0,c2 /* Translation Table Base Reg. 0 */ #define TTBR1 p15,1,c2 /* Translation Table Base Reg. 1 */ #define HTTBR p15,4,c2 /* Hyp. Translation Table Base Register */ #define TTBR0_32 p15,0,c2,c0,0 /* 32-bit access to TTBR0 */ #define TTBR1_32 p15,0,c2,c0,1 /* 32-bit access to TTBR1 */ #define HTCR p15,4,c2,c0,2 /* Hyp. Translation Control Register */ #define VTCR p15,4,c2,c1,2 /* Virtualization Translation Control Register */ #define VTTBR p15,6,c2 /* Virtualization Translation Table Base Register */ /* CP15 CR3: Domain Access Control Register */ #define DACR p15,0,c3,c0,0 /* Domain Access Control Register */ /* CP15 CR4: */ /* CP15 CR5: Fault Status Registers */ #define DFSR p15,0,c5,c0,0 /* Data Fault Status Register */ #define IFSR p15,0,c5,c0,1 /* Instruction Fault Status Register */ #define ADFSR p15,0,c5,c1,0 /* Auxiliary Data Fault Status Register */ #define AIFSR p15,0,c5,c1,1 /* Auxiliary Instruction Fault Status Register */ #define HSR p15,4,c5,c2,0 /* Hyp. Syndrome Register */ /* CP15 CR6: Fault Address Registers */ #define DFAR p15,0,c6,c0,0 /* Data Fault Address Register */ #define IFAR p15,0,c6,c0,2 /* Instruction Fault Address Register */ #define HDFAR p15,4,c6,c0,0 /* Hyp. Data Fault Address Register */ #define HIFAR p15,4,c6,c0,2 /* Hyp. Instruction Fault Address Register */ #define HPFAR p15,4,c6,c0,4 /* Hyp. IPA Fault Address Register */ /* CP15 CR7: Cache and address translation operations */ #define PAR p15,0,c7 /* Physical Address Register */ #define ICIALLUIS p15,0,c7,c1,0 /* Invalidate all instruction caches to PoU inner shareable */ #define BPIALLIS p15,0,c7,c1,6 /* Invalidate entire branch predictor array inner shareable */ #define ICIALLU p15,0,c7,c5,0 /* Invalidate all instruction caches to PoU */ #define ICIMVAU p15,0,c7,c5,1 /* Invalidate instruction caches by MVA to PoU */ #define BPIALL p15,0,c7,c5,6 /* Invalidate entire branch predictor array */ #define BPIMVA p15,0,c7,c5,7 /* Invalidate MVA from branch predictor array */ #define DCIMVAC p15,0,c7,c6,1 /* Invalidate data cache line by MVA to PoC */ #define DCISW p15,0,c7,c6,2 /* Invalidate data cache line by set/way */ #define ATS1CPR p15,0,c7,c8,0 /* Address Translation Stage 1. Non-Secure Kernel Read */ #define ATS1CPW p15,0,c7,c8,1 /* Address Translation Stage 1. Non-Secure Kernel Write */ #define ATS1CUR p15,0,c7,c8,2 /* Address Translation Stage 1. Non-Secure User Read */ #define ATS1CUW p15,0,c7,c8,3 /* Address Translation Stage 1. 
Non-Secure User Write */ #define ATS12NSOPR p15,0,c7,c8,4 /* Address Translation Stage 1+2 Non-Secure Kernel Read */ #define ATS12NSOPW p15,0,c7,c8,5 /* Address Translation Stage 1+2 Non-Secure Kernel Write */ #define ATS12NSOUR p15,0,c7,c8,6 /* Address Translation Stage 1+2 Non-Secure User Read */ #define ATS12NSOUW p15,0,c7,c8,7 /* Address Translation Stage 1+2 Non-Secure User Write */ #define DCCMVAC p15,0,c7,c10,1 /* Clean data or unified cache line by MVA to PoC */ #define DCCSW p15,0,c7,c10,2 /* Clean data cache line by set/way */ #define DCCMVAU p15,0,c7,c11,1 /* Clean data cache line by MVA to PoU */ #define DCCIMVAC p15,0,c7,c14,1 /* Data cache clean and invalidate by MVA */ #define DCCISW p15,0,c7,c14,2 /* Clean and invalidate data cache line by set/way */ #define ATS1HR p15,4,c7,c8,0 /* Address Translation Stage 1 Hyp. Read */ #define ATS1HW p15,4,c7,c8,1 /* Address Translation Stage 1 Hyp. Write */ /* CP15 CR8: TLB maintenance operations */ #define TLBIALLIS p15,0,c8,c3,0 /* Invalidate entire TLB innrer shareable */ #define TLBIMVAIS p15,0,c8,c3,1 /* Invalidate unified TLB entry by MVA inner shareable */ #define TLBIASIDIS p15,0,c8,c3,2 /* Invalidate unified TLB by ASID match inner shareable */ #define TLBIMVAAIS p15,0,c8,c3,3 /* Invalidate unified TLB entry by MVA all ASID inner shareable */ #define ITLBIALL p15,0,c8,c5,0 /* Invalidate instruction TLB */ #define ITLBIMVA p15,0,c8,c5,1 /* Invalidate instruction TLB entry by MVA */ #define ITLBIASID p15,0,c8,c5,2 /* Invalidate instruction TLB by ASID match */ #define DTLBIALL p15,0,c8,c6,0 /* Invalidate data TLB */ #define DTLBIMVA p15,0,c8,c6,1 /* Invalidate data TLB entry by MVA */ #define DTLBIASID p15,0,c8,c6,2 /* Invalidate data TLB by ASID match */ #define TLBIALL p15,0,c8,c7,0 /* invalidate unified TLB */ #define TLBIMVA p15,0,c8,c7,1 /* invalidate unified TLB entry by MVA */ #define TLBIASID p15,0,c8,c7,2 /* invalid unified TLB by ASID match */ #define TLBIMVAA p15,0,c8,c7,3 /* invalidate unified TLB entries by MVA all ASID */ #define TLBIALLHIS p15,4,c8,c3,0 /* Invalidate Entire Hyp. Unified TLB inner shareable */ #define TLBIMVAHIS p15,4,c8,c3,1 /* Invalidate Unified Hyp. TLB by MVA inner shareable */ #define TLBIALLNSNHIS p15,4,c8,c3,4 /* Invalidate Entire Non-Secure Non-Hyp. Unified TLB inner shareable */ #define TLBIALLH p15,4,c8,c7,0 /* Invalidate Entire Hyp. Unified TLB */ #define TLBIMVAH p15,4,c8,c7,1 /* Invalidate Unified Hyp. TLB by MVA */ #define TLBIALLNSNH p15,4,c8,c7,4 /* Invalidate Entire Non-Secure Non-Hyp. Unified TLB */ /* CP15 CR9: */ /* CP15 CR10: */ #define MAIR0 p15,0,c10,c2,0 /* Memory Attribute Indirection Register 0 AKA PRRR */ #define MAIR1 p15,0,c10,c2,1 /* Memory Attribute Indirection Register 1 AKA NMRR */ #define HMAIR0 p15,4,c10,c2,0 /* Hyp. Memory Attribute Indirection Register 0 */ #define HMAIR1 p15,4,c10,c2,1 /* Hyp. Memory Attribute Indirection Register 1 */ #define AMAIR0 p15,0,c10,c3,0 /* Aux. Memory Attribute Indirection Register 0 */ #define AMAIR1 p15,0,c10,c3,1 /* Aux. Memory Attribute Indirection Register 1 */ /* CP15 CR11: DMA Operations for TCM Access */ /* CP15 CR12: */ #define VBAR p15,0,c12,c0,0 /* Vector Base Address Register */ #define HVBAR p15,4,c12,c0,0 /* Hyp. 
Vector Base Address Register */ /* CP15 CR13: */ #define FCSEIDR p15,0,c13,c0,0 /* FCSE Process ID Register */ #define CONTEXTIDR p15,0,c13,c0,1 /* Context ID Register */ #define TPIDRURW p15,0,c13,c0,2 /* Software Thread ID, User, R/W */ #define TPIDRURO p15,0,c13,c0,3 /* Software Thread ID, User, R/O */ #define TPIDRPRW p15,0,c13,c0,4 /* Software Thread ID, Priveleged */ #define HTPIDR p15,4,c13,c0,2 /* HYp Software Thread Id Register */ /* CP15 CR14: */ #define CNTPCT p15,0,c14 /* Time counter value */ #define CNTFRQ p15,0,c14,c0,0 /* Time counter frequency */ #define CNTKCTL p15,0,c14,c1,0 /* Time counter kernel control */ #define CNTP_TVAL p15,0,c14,c2,0 /* Physical Timer value */ #define CNTP_CTL p15,0,c14,c2,1 /* Physical Timer control register */ #define CNTVCT p15,1,c14 /* Time counter value + offset */ #define CNTP_CVAL p15,2,c14 /* Physical Timer comparator */ #define CNTV_CVAL p15,3,c14 /* Virt. Timer comparator */ #define CNTVOFF p15,4,c14 /* Time counter offset */ #define CNTHCTL p15,4,c14,c1,0 /* Time counter hyp. control */ #define CNTHP_TVAL p15,4,c14,c2,0 /* Hyp. Timer value */ #define CNTHP_CTL p15,4,c14,c2,1 /* Hyp. Timer control register */ #define CNTV_TVAL p15,0,c14,c3,0 /* Virt. Timer value */ #define CNTV_CTL p15,0,c14,c3,1 /* Virt. TImer control register */ #define CNTHP_CVAL p15,6,c14 /* Hyp. Timer comparator */ /* CP15 CR15: Implementation Defined Registers */ /* Aliases of AArch64 names for use in common code when building for AArch32 */ #ifdef CONFIG_ARM_32 /* Alphabetically... */ #define ACTLR_EL1 ACTLR #define AFSR0_EL1 ADFSR #define AFSR1_EL1 AIFSR #define CCSIDR_EL1 CCSIDR #define CLIDR_EL1 CLIDR #define CNTFRQ_EL0 CNTFRQ #define CNTHCTL_EL2 CNTHCTL #define CNTHP_CTL_EL2 CNTHP_CTL #define CNTHP_CVAL_EL2 CNTHP_CVAL #define CNTKCTL_EL1 CNTKCTL #define CNTPCT_EL0 CNTPCT #define CNTP_CTL_EL0 CNTP_CTL #define CNTP_CVAL_EL0 CNTP_CVAL #define CNTVCT_EL0 CNTVCT #define CNTVOFF_EL2 CNTVOFF #define CNTV_CTL_EL0 CNTV_CTL #define CNTV_CVAL_EL0 CNTV_CVAL #define CONTEXTIDR_EL1 CONTEXTIDR #define CPACR_EL1 CPACR #define CSSELR_EL1 CSSELR #define DACR32_EL2 DACR #define ESR_EL1 DFSR #define ESR_EL2 HSR #define FAR_EL1 HIFAR #define FAR_EL2 HIFAR #define HCR_EL2 HCR #define HPFAR_EL2 HPFAR #define ID_AFR0_EL1 ID_AFR0 #define ID_DFR0_EL1 ID_DFR0 #define ID_ISAR0_EL1 ID_ISAR0 #define ID_ISAR1_EL1 ID_ISAR1 #define ID_ISAR2_EL1 ID_ISAR2 #define ID_ISAR3_EL1 ID_ISAR3 #define ID_ISAR4_EL1 ID_ISAR4 #define ID_ISAR5_EL1 ID_ISAR5 #define ID_MMFR0_EL1 ID_MMFR0 #define ID_MMFR1_EL1 ID_MMFR1 #define ID_MMFR2_EL1 ID_MMFR2 #define ID_MMFR3_EL1 ID_MMFR3 #define ID_PFR0_EL1 ID_PFR0 #define ID_PFR1_EL1 ID_PFR1 #define IFSR32_EL2 IFSR #define MIDR_EL1 MIDR #define MPIDR_EL1 MPIDR #define PAR_EL1 PAR #define SCTLR_EL1 SCTLR #define SCTLR_EL2 HSCTLR #define TCR_EL1 TTBCR #define TEECR32_EL1 TEECR #define TEEHBR32_EL1 TEEHBR #define TPIDRRO_EL0 TPIDRURO #define TPIDR_EL0 TPIDRURW #define TPIDR_EL1 TPIDRPRW #define TPIDR_EL2 HTPIDR #define TTBR0_EL1 TTBR0 #define TTBR0_EL2 HTTBR #define TTBR1_EL1 TTBR1 #define VBAR_EL1 VBAR #define VBAR_EL2 HVBAR #define VMPIDR_EL2 VMPIDR #define VPIDR_EL2 VPIDR #define VTCR_EL2 VTCR #define VTTBR_EL2 VTTBR #endif #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/numa.h0000664000175000017500000000102412307313555015323 0ustar smbsmb#ifndef __ARCH_ARM_NUMA_H #define __ARCH_ARM_NUMA_H /* Fake one node for now. See also node_online_map. 
*/ #define cpu_to_node(cpu) 0 #define node_to_cpumask(node) (cpu_online_map) static inline __attribute__((pure)) int phys_to_nid(paddr_t addr) { return 0; } /* XXX: implement NUMA support */ #define node_spanned_pages(nid) (total_pages) #define __node_distance(a, b) (20) #endif /* __ARCH_ARM_NUMA_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/desc.h0000664000175000017500000000027312307313555015306 0ustar smbsmb#ifndef __ARCH_DESC_H #define __ARCH_DESC_H #endif /* __ARCH_DESC_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/vfp.h0000664000175000017500000000066412307313555015167 0ustar smbsmb#ifndef _ASM_VFP_H #define _ASM_VFP_H #include #if defined(CONFIG_ARM_32) # include #elif defined(CONFIG_ARM_64) # include #else # error "Unknown ARM variant" #endif void vfp_save_state(struct vcpu *v); void vfp_restore_state(struct vcpu *v); #endif /* _ASM_VFP_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/guest_access.h0000664000175000017500000001364312307313555017045 0ustar smbsmb#ifndef __ASM_ARM_GUEST_ACCESS_H__ #define __ASM_ARM_GUEST_ACCESS_H__ #include #include /* Guests have their own comlete address space */ #define access_ok(addr,size) (1) #define array_access_ok(addr,count,size) \ (likely(count < (~0UL/size)) && access_ok(addr,count*size)) unsigned long raw_copy_to_guest(void *to, const void *from, unsigned len); unsigned long raw_copy_to_guest_flush_dcache(void *to, const void *from, unsigned len); unsigned long raw_copy_from_guest(void *to, const void *from, unsigned len); unsigned long raw_clear_guest(void *to, unsigned len); #define __raw_copy_to_guest raw_copy_to_guest #define __raw_copy_from_guest raw_copy_from_guest #define __raw_clear_guest raw_clear_guest /* Remainder copied from x86 -- could be common? */ /* Is the guest handle a NULL reference? */ #define guest_handle_is_null(hnd) ((hnd).p == NULL) /* Offset the given guest handle into the array it refers to. */ #define guest_handle_add_offset(hnd, nr) ((hnd).p += (nr)) #define guest_handle_subtract_offset(hnd, nr) ((hnd).p -= (nr)) /* Cast a guest handle (either XEN_GUEST_HANDLE or XEN_GUEST_HANDLE_PARAM) * to the specified type of XEN_GUEST_HANDLE_PARAM. 
*/ #define guest_handle_cast(hnd, type) ({ \ type *_x = (hnd).p; \ (XEN_GUEST_HANDLE_PARAM(type)) { _x }; \ }) /* Cast a XEN_GUEST_HANDLE to XEN_GUEST_HANDLE_PARAM */ #define guest_handle_to_param(hnd, type) ({ \ typeof((hnd).p) _x = (hnd).p; \ XEN_GUEST_HANDLE_PARAM(type) _y = { _x }; \ /* type checking: make sure that the pointers inside \ * XEN_GUEST_HANDLE and XEN_GUEST_HANDLE_PARAM are of \ * the same type, then return hnd */ \ (void)(&_x == &_y.p); \ _y; \ }) /* Cast a XEN_GUEST_HANDLE_PARAM to XEN_GUEST_HANDLE */ #define guest_handle_from_param(hnd, type) ({ \ typeof((hnd).p) _x = (hnd).p; \ XEN_GUEST_HANDLE(type) _y = { _x }; \ /* type checking: make sure that the pointers inside \ * XEN_GUEST_HANDLE and XEN_GUEST_HANDLE_PARAM are of \ * the same type, then return hnd */ \ (void)(&_x == &_y.p); \ _y; \ }) #define guest_handle_from_ptr(ptr, type) \ ((XEN_GUEST_HANDLE_PARAM(type)) { (type *)ptr }) #define const_guest_handle_from_ptr(ptr, type) \ ((XEN_GUEST_HANDLE_PARAM(const_##type)) { (const type *)ptr }) /* * Copy an array of objects to guest context via a guest handle, * specifying an offset into the guest array. */ #define copy_to_guest_offset(hnd, off, ptr, nr) ({ \ const typeof(*(ptr)) *_s = (ptr); \ char (*_d)[sizeof(*_s)] = (void *)(hnd).p; \ ((void)((hnd).p == (ptr))); \ raw_copy_to_guest(_d+(off), _s, sizeof(*_s)*(nr)); \ }) /* * Clear an array of objects in guest context via a guest handle, * specifying an offset into the guest array. */ #define clear_guest_offset(hnd, off, nr) ({ \ void *_d = (hnd).p; \ raw_clear_guest(_d+(off), nr); \ }) /* * Copy an array of objects from guest context via a guest handle, * specifying an offset into the guest array. */ #define copy_from_guest_offset(ptr, hnd, off, nr) ({ \ const typeof(*(ptr)) *_s = (hnd).p; \ typeof(*(ptr)) *_d = (ptr); \ raw_copy_from_guest(_d, _s+(off), sizeof(*_d)*(nr));\ }) /* Copy sub-field of a structure to guest context via a guest handle. */ #define copy_field_to_guest(hnd, ptr, field) ({ \ const typeof(&(ptr)->field) _s = &(ptr)->field; \ void *_d = &(hnd).p->field; \ ((void)(&(hnd).p->field == &(ptr)->field)); \ raw_copy_to_guest(_d, _s, sizeof(*_s)); \ }) /* Copy sub-field of a structure from guest context via a guest handle. */ #define copy_field_from_guest(ptr, hnd, field) ({ \ const typeof(&(ptr)->field) _s = &(hnd).p->field; \ typeof(&(ptr)->field) _d = &(ptr)->field; \ raw_copy_from_guest(_d, _s, sizeof(*_d)); \ }) /* * Pre-validate a guest handle. * Allows use of faster __copy_* functions. 
*/ /* All ARM guests are paging mode external and hence safe */ #define guest_handle_okay(hnd, nr) (1) #define guest_handle_subrange_okay(hnd, first, last) (1) #define __copy_to_guest_offset(hnd, off, ptr, nr) ({ \ const typeof(*(ptr)) *_s = (ptr); \ char (*_d)[sizeof(*_s)] = (void *)(hnd).p; \ ((void)((hnd).p == (ptr))); \ __raw_copy_to_guest(_d+(off), _s, sizeof(*_s)*(nr));\ }) #define __clear_guest_offset(hnd, off, ptr, nr) ({ \ __raw_clear_guest(_d+(off), nr); \ }) #define __copy_from_guest_offset(ptr, hnd, off, nr) ({ \ const typeof(*(ptr)) *_s = (hnd).p; \ typeof(*(ptr)) *_d = (ptr); \ __raw_copy_from_guest(_d, _s+(off), sizeof(*_d)*(nr));\ }) #define __copy_field_to_guest(hnd, ptr, field) ({ \ const typeof(&(ptr)->field) _s = &(ptr)->field; \ void *_d = &(hnd).p->field; \ ((void)(&(hnd).p->field == &(ptr)->field)); \ __raw_copy_to_guest(_d, _s, sizeof(*_s)); \ }) #define __copy_field_from_guest(ptr, hnd, field) ({ \ const typeof(&(ptr)->field) _s = &(hnd).p->field; \ typeof(&(ptr)->field) _d = &(ptr)->field; \ __raw_copy_from_guest(_d, _s, sizeof(*_d)); \ }) #endif /* __ASM_ARM_GUEST_ACCESS_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/percpu.h0000664000175000017500000000245112307313555015666 0ustar smbsmb#ifndef __ARM_PERCPU_H__ #define __ARM_PERCPU_H__ #ifndef __ASSEMBLY__ #include #include #if defined(CONFIG_ARM_32) # include #elif defined(CONFIG_ARM_64) # include #else # error "unknown ARM variant" #endif extern char __per_cpu_start[], __per_cpu_data_end[]; extern unsigned long __per_cpu_offset[NR_CPUS]; void percpu_init_areas(void); /* Separate out the type, so (int[3], foo) works. */ #define __DEFINE_PER_CPU(type, name, suffix) \ __section(".bss.percpu" #suffix) \ __typeof__(type) per_cpu_##name #define per_cpu(var, cpu) \ (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) #define __get_cpu_var(var) \ (*RELOC_HIDE(&per_cpu__##var, READ_SYSREG(TPIDR_EL2))) #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name DECLARE_PER_CPU(unsigned int, cpu_id); #define get_processor_id() (this_cpu(cpu_id)) #define set_processor_id(id) do { \ WRITE_SYSREG(__per_cpu_offset[id], TPIDR_EL2); \ this_cpu(cpu_id) = (id); \ } while(0) #endif #endif /* __ARM_PERCPU_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-arm/gic.h0000664000175000017500000001576612307313555015147 0ustar smbsmb/* * ARM Generic Interrupt Controller support * * Tim Deegan * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #ifndef __ASM_ARM_GIC_H__ #define __ASM_ARM_GIC_H__ #define GICD_CTLR (0x000/4) #define GICD_TYPER (0x004/4) #define GICD_IIDR (0x008/4) #define GICD_IGROUPR (0x080/4) #define GICD_IGROUPRN (0x0FC/4) #define GICD_ISENABLER (0x100/4) #define GICD_ISENABLERN (0x17C/4) #define GICD_ICENABLER (0x180/4) #define GICD_ICENABLERN (0x1fC/4) #define GICD_ISPENDR (0x200/4) #define GICD_ISPENDRN (0x27C/4) #define GICD_ICPENDR (0x280/4) #define GICD_ICPENDRN (0x2FC/4) #define GICD_ISACTIVER (0x300/4) #define GICD_ISACTIVERN (0x37C/4) #define GICD_ICACTIVER (0x380/4) #define GICD_ICACTIVERN (0x3FC/4) #define GICD_IPRIORITYR (0x400/4) #define GICD_IPRIORITYRN (0x7F8/4) #define GICD_ITARGETSR (0x800/4) #define GICD_ITARGETSRN (0xBF8/4) #define GICD_ICFGR (0xC00/4) #define GICD_ICFGRN (0xCFC/4) #define GICD_NSACR (0xE00/4) #define GICD_NSACRN (0xEFC/4) #define GICD_SGIR (0xF00/4) #define GICD_CPENDSGIR (0xF10/4) #define GICD_CPENDSGIRN (0xF1C/4) #define GICD_SPENDSGIR (0xF20/4) #define GICD_SPENDSGIRN (0xF2C/4) #define GICD_ICPIDR2 (0xFE8/4) #define GICD_SGI_TARGET_LIST_SHIFT (24) #define GICD_SGI_TARGET_LIST_MASK (0x3UL << GICD_SGI_TARGET_LIST_SHIFT) #define GICD_SGI_TARGET_LIST (0UL< #define DT_MATCH_GIC DT_MATCH_COMPATIBLE("arm,cortex-a15-gic"), \ DT_MATCH_COMPATIBLE("arm,cortex-a7-gic") extern int domain_vgic_init(struct domain *d); extern void domain_vgic_free(struct domain *d); extern int vcpu_vgic_init(struct vcpu *v); extern void vgic_vcpu_inject_irq(struct vcpu *v, unsigned int irq,int virtual); extern void vgic_clear_pending_irqs(struct vcpu *v); extern struct pending_irq *irq_to_pending(struct vcpu *v, unsigned int irq); /* Program the GIC to route an interrupt with a dt_irq */ extern void gic_route_dt_irq(const struct dt_irq *irq, const cpumask_t *cpu_mask, unsigned int priority); extern void gic_route_ppis(void); extern void gic_route_spis(void); extern void gic_inject(void); extern void gic_clear_pending_irqs(struct vcpu *v); extern int gic_events_need_delivery(void); extern void __cpuinit init_maintenance_interrupt(void); extern void gic_set_guest_irq(struct vcpu *v, unsigned int irq, unsigned int state, unsigned int priority); extern void gic_remove_from_queues(struct vcpu *v, unsigned int virtual_irq); extern int gic_route_irq_to_guest(struct domain *d, const struct dt_irq *irq, const char * devname); /* Accept an interrupt from the GIC and dispatch its handler */ extern void gic_interrupt(struct cpu_user_regs *regs, int is_fiq); /* Bring up the interrupt controller, and report # cpus attached */ extern void gic_init(void); /* Bring up a secondary CPU's per-CPU GIC interface */ extern void gic_init_secondary_cpu(void); /* Take down a CPU's per-CPU GIC interface */ extern void gic_disable_cpu(void); /* setup the gic virtual interface for a guest */ extern int gicv_setup(struct domain *d); /* Context switch */ extern void gic_save_state(struct vcpu *v); extern void gic_restore_state(struct vcpu *v); /* SGI (AKA IPIs) */ enum gic_sgi { GIC_SGI_EVENT_CHECK = 0, GIC_SGI_DUMP_STATE = 1, GIC_SGI_CALL_FUNCTION = 2, }; extern void send_SGI_mask(const cpumask_t *cpumask, enum gic_sgi sgi); extern void send_SGI_one(unsigned int cpu, enum gic_sgi sgi); extern void send_SGI_self(enum gic_sgi sgi); extern void send_SGI_allbutself(enum gic_sgi sgi); /* print useful debug info */ extern void gic_dump_info(struct vcpu *v); /* Number of interrupt lines */ extern unsigned int gic_number_lines(void); /* IRQ translation function for the device tree */ int gic_irq_xlate(const u32 *intspec, unsigned int 
intsize, unsigned int *out_hwirq, unsigned int *out_type); #endif /* __ASSEMBLY__ */ #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/0000775000175000017500000000000012307313555014063 5ustar smbsmbxen-4.4.0/xen/include/asm-x86/softirq.h0000664000175000017500000000061712307313555015727 0ustar smbsmb#ifndef __ASM_SOFTIRQ_H__ #define __ASM_SOFTIRQ_H__ #define NMI_MCE_SOFTIRQ (NR_COMMON_SOFTIRQS + 0) #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1) #define VCPU_KICK_SOFTIRQ (NR_COMMON_SOFTIRQS + 2) #define MACHINE_CHECK_SOFTIRQ (NR_COMMON_SOFTIRQS + 3) #define PCI_SERR_SOFTIRQ (NR_COMMON_SOFTIRQS + 4) #define NR_ARCH_SOFTIRQS 5 #endif /* __ASM_SOFTIRQ_H__ */ xen-4.4.0/xen/include/asm-x86/hypercall.h0000664000175000017500000000436412307313555016226 0ustar smbsmb/****************************************************************************** * asm-x86/hypercall.h */ #ifndef __ASM_X86_HYPERCALL_H__ #define __ASM_X86_HYPERCALL_H__ #include #include /* for do_mca */ #include /* * Both do_mmuext_op() and do_mmu_update(): * We steal the m.s.b. of the @count parameter to indicate whether this * invocation of do_mmu_update() is resuming a previously preempted call. */ #define MMU_UPDATE_PREEMPTED (~(~0U>>1)) extern long do_event_channel_op_compat( XEN_GUEST_HANDLE_PARAM(evtchn_op_t) uop); extern long do_set_trap_table( XEN_GUEST_HANDLE_PARAM(const_trap_info_t) traps); extern long do_mmu_update( XEN_GUEST_HANDLE_PARAM(mmu_update_t) ureqs, unsigned int count, XEN_GUEST_HANDLE_PARAM(uint) pdone, unsigned int foreigndom); extern long do_set_gdt( XEN_GUEST_HANDLE_PARAM(xen_ulong_t) frame_list, unsigned int entries); extern long do_stack_switch( unsigned long ss, unsigned long esp); extern long do_fpu_taskswitch( int set); extern long do_set_debugreg( int reg, unsigned long value); extern unsigned long do_get_debugreg( int reg); extern long do_update_descriptor( u64 pa, u64 desc); extern long do_mca(XEN_GUEST_HANDLE_PARAM(xen_mc_t) u_xen_mc); extern long do_update_va_mapping( unsigned long va, u64 val64, unsigned long flags); extern long do_physdev_op( int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); extern long do_update_va_mapping_otherdomain( unsigned long va, u64 val64, unsigned long flags, domid_t domid); extern long do_mmuext_op( XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops, unsigned int count, XEN_GUEST_HANDLE_PARAM(uint) pdone, unsigned int foreigndom); extern unsigned long do_iret( void); extern long do_set_callbacks( unsigned long event_address, unsigned long failsafe_address, unsigned long syscall_address); extern long do_set_segment_base( unsigned int which, unsigned long base); extern int compat_physdev_op( int cmd, XEN_GUEST_HANDLE_PARAM(void) arg); extern int arch_compat_vcpu_op( int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg); #endif /* __ASM_X86_HYPERCALL_H__ */ xen-4.4.0/xen/include/asm-x86/xenoprof.h0000664000175000017500000000604512307313555016101 0ustar smbsmb/****************************************************************************** * asm-x86/xenoprof.h * xenoprof x86 arch specific header file * * Copyright (c) 2006 Isaku Yamahata * VA Linux Systems Japan K.K. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef __ASM_X86_XENOPROF_H__ #define __ASM_X86_XENOPROF_H__ int nmi_reserve_counters(void); int nmi_setup_events(void); int nmi_enable_virq(void); int nmi_start(void); void nmi_stop(void); void nmi_disable_virq(void); void nmi_release_counters(void); int xenoprof_arch_init(int *num_events, char *cpu_type); #define xenoprof_arch_reserve_counters() nmi_reserve_counters() #define xenoprof_arch_setup_events() nmi_setup_events() #define xenoprof_arch_enable_virq() nmi_enable_virq() #define xenoprof_arch_start() nmi_start() #define xenoprof_arch_stop() nmi_stop() #define xenoprof_arch_disable_virq() nmi_disable_virq() #define xenoprof_arch_release_counters() nmi_release_counters() int xenoprof_arch_counter(XEN_GUEST_HANDLE_PARAM(void) arg); int compat_oprof_arch_counter(XEN_GUEST_HANDLE_PARAM(void) arg); int xenoprof_arch_ibs_counter(XEN_GUEST_HANDLE_PARAM(void) arg); struct vcpu; struct cpu_user_regs; /* AMD IBS support */ void ibs_init(void); extern u32 ibs_caps; int xenoprofile_get_mode(struct vcpu *, const struct cpu_user_regs *); static inline int xenoprof_backtrace_supported(void) { return 1; } void xenoprof_backtrace(struct vcpu *, const struct cpu_user_regs *, unsigned long depth, int mode); #define xenoprof_shared_gmfn(d, gmaddr, maddr) \ do { \ (void)(maddr); \ gdprintk(XENLOG_WARNING, \ "xenoprof/x86 with autotranslated mode enabled" \ "isn't supported yet\n"); \ } while (0) int passive_domain_do_rdmsr(unsigned int msr, uint64_t *msr_content); int passive_domain_do_wrmsr(unsigned int msr, uint64_t msr_content); void passive_domain_destroy(struct vcpu *v); #endif /* __ASM_X86_XENOPROF_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/mm.h0000664000175000017500000005653012307313555014656 0ustar smbsmb #ifndef __ASM_X86_MM_H__ #define __ASM_X86_MM_H__ #include #include #include #include #include /* * Per-page-frame information. * * Every architecture must ensure the following: * 1. 'struct page_info' contains a 'struct page_list_entry list'. * 2. Provide a PFN_ORDER() macro for accessing the order of a free page. */ #define PFN_ORDER(_pfn) ((_pfn)->v.free.order) /* * This definition is solely for the use in struct page_info (and * struct page_list_head), intended to allow easy adjustment once x86-64 * wants to support more than 16TB. * 'unsigned long' should be used for MFNs everywhere else. */ #define __pdx_t unsigned int #undef page_list_entry struct page_list_entry { __pdx_t next, prev; }; struct page_sharing_info; struct page_info { union { /* Each frame can be threaded onto a doubly-linked list. * * For unused shadow pages, a list of free shadow pages; * for multi-page shadows, links to the other pages in this shadow; * for pinnable shadows, if pinned, a list of all pinned shadows * (see sh_type_is_pinnable() for the definition of "pinnable" * shadow types). N.B. a shadow may be both pinnable and multi-page. 
* In that case the pages are inserted in order in the list of * pinned shadows and walkers of that list must be prepared * to keep them all together during updates. */ struct page_list_entry list; /* For non-pinnable single-page shadows, a higher entry that points * at us. */ paddr_t up; /* For shared/sharable pages, we use a doubly-linked list * of all the {pfn,domain} pairs that map this page. We also include * an opaque handle, which is effectively a version, so that clients * of sharing share the version they expect to. * This list is allocated and freed when a page is shared/unshared. */ struct page_sharing_info *sharing; }; /* Reference count and various PGC_xxx flags and fields. */ unsigned long count_info; /* Context-dependent fields follow... */ union { /* Page is in use: ((count_info & PGC_count_mask) != 0). */ struct { /* Type reference count and various PGT_xxx flags and fields. */ unsigned long type_info; } inuse; /* Page is in use as a shadow: count_info == 0. */ struct { unsigned long type:5; /* What kind of shadow is this? */ unsigned long pinned:1; /* Is the shadow pinned? */ unsigned long head:1; /* Is this the first page of the shadow? */ unsigned long count:25; /* Reference count */ } sh; /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */ struct { /* Do TLBs need flushing for safety before next page use? */ bool_t need_tlbflush; } free; } u; union { /* Page is in use, but not as a shadow. */ struct { /* Owner of this page (zero if page is anonymous). */ __pdx_t _domain; } inuse; /* Page is in use as a shadow. */ struct { /* GMFN of guest page we're a shadow of. */ __pdx_t back; } sh; /* Page is on a free list. */ struct { /* Order-size of the free chunk this page is the head of. */ unsigned int order; } free; } v; union { /* * Timestamp from 'TLB clock', used to avoid extra safety flushes. * Only valid for: a) free pages, and b) pages with zero type count * (except page table pages when the guest is in shadow mode). */ u32 tlbflush_timestamp; /* * When PGT_partial is true then this field is valid and indicates * that PTEs in the range [0, @nr_validated_ptes) have been validated. * An extra page reference must be acquired (or not dropped) whenever * PGT_partial gets set, and it must be dropped when the flag gets * cleared. This is so that a get() leaving a page in partially * validated state (where the caller would drop the reference acquired * due to the getting of the type [apparently] failing [-EAGAIN]) * would not accidentally result in a page left with zero general * reference count, but non-zero type reference count (possible when * the partial get() is followed immediately by domain destruction). * Likewise, the ownership of the single type reference for partially * (in-)validated pages is tied to this flag, i.e. the instance * setting the flag must not drop that reference, whereas the instance * clearing it will have to. * * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has * been partially validated. This implies that the general reference * to the page (acquired from get_page_from_lNe()) would be dropped * (again due to the apparent failure) and hence must be re-acquired * when resuming the validation, but must not be dropped when picking * up the page for invalidation. * * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has * been partially invalidated. This is basically the opposite case of * above, i.e. 
the general reference to the page was not dropped in * put_page_from_lNe() (due to the apparent failure), and hence it * must be dropped when the put operation is resumed (and completes), * but it must not be acquired if picking up the page for validation. */ struct { u16 nr_validated_ptes; s8 partial_pte; }; /* * Guest pages with a shadow. This does not conflict with * tlbflush_timestamp since page table pages are explicitly not * tracked for TLB-flush avoidance when a guest runs in shadow mode. */ u32 shadow_flags; /* When in use as a shadow, next shadow in this hash chain. */ __pdx_t next_shadow; }; }; #undef __pdx_t #define PG_shift(idx) (BITS_PER_LONG - (idx)) #define PG_mask(x, idx) (x ## UL << PG_shift(idx)) /* The following page types are MUTUALLY EXCLUSIVE. */ #define PGT_none PG_mask(0, 4) /* no special uses of this page */ #define PGT_l1_page_table PG_mask(1, 4) /* using as an L1 page table? */ #define PGT_l2_page_table PG_mask(2, 4) /* using as an L2 page table? */ #define PGT_l3_page_table PG_mask(3, 4) /* using as an L3 page table? */ #define PGT_l4_page_table PG_mask(4, 4) /* using as an L4 page table? */ #define PGT_seg_desc_page PG_mask(5, 4) /* using this page in a GDT/LDT? */ #define PGT_writable_page PG_mask(7, 4) /* has writable mappings? */ #define PGT_shared_page PG_mask(8, 4) /* CoW sharable page */ #define PGT_type_mask PG_mask(15, 4) /* Bits 28-31 or 60-63. */ /* Owning guest has pinned this page to its current type? */ #define _PGT_pinned PG_shift(5) #define PGT_pinned PG_mask(1, 5) /* Has this page been validated for use as its current type? */ #define _PGT_validated PG_shift(6) #define PGT_validated PG_mask(1, 6) /* PAE only: is this an L2 page directory containing Xen-private mappings? */ #define _PGT_pae_xen_l2 PG_shift(7) #define PGT_pae_xen_l2 PG_mask(1, 7) /* Has this page been *partially* validated for use as its current type? */ #define _PGT_partial PG_shift(8) #define PGT_partial PG_mask(1, 8) /* Page is locked? */ #define _PGT_locked PG_shift(9) #define PGT_locked PG_mask(1, 9) /* Count of uses of this frame as its current type. */ #define PGT_count_width PG_shift(9) #define PGT_count_mask ((1UL<count_info&PGC_state) == PGC_state_##st) /* Count of references to this frame. */ #define PGC_count_width PG_shift(9) #define PGC_count_mask ((1UL<count_info & PGC_xen_heap) #define is_xen_heap_mfn(mfn) \ (__mfn_valid(mfn) && is_xen_heap_page(__mfn_to_page(mfn))) #define is_xen_fixed_mfn(mfn) \ ((((mfn) << PAGE_SHIFT) >= __pa(&_start)) && \ (((mfn) << PAGE_SHIFT) <= __pa(&_end))) #define PRtype_info "016lx"/* should only be used for printk's */ /* The number of out-of-sync shadows we allow per vcpu (prime, please) */ #define SHADOW_OOS_PAGES 3 /* OOS fixup entries */ #define SHADOW_OOS_FIXUPS 2 #define page_get_owner(_p) \ ((struct domain *)((_p)->v.inuse._domain ? \ pdx_to_virt((_p)->v.inuse._domain) : NULL)) #define page_set_owner(_p,_d) \ ((_p)->v.inuse._domain = (_d) ? 
virt_to_pdx(_d) : 0) #define maddr_get_owner(ma) (page_get_owner(maddr_to_page((ma)))) #define XENSHARE_writable 0 #define XENSHARE_readonly 1 extern void share_xen_page_with_guest( struct page_info *page, struct domain *d, int readonly); extern void share_xen_page_with_privileged_guests( struct page_info *page, int readonly); #define frame_table ((struct page_info *)FRAMETABLE_VIRT_START) #define spage_table ((struct spage_info *)SPAGETABLE_VIRT_START) int get_superpage(unsigned long mfn, struct domain *d); extern unsigned long max_page; extern unsigned long total_pages; void init_frametable(void); #define PDX_GROUP_COUNT ((1 << L2_PAGETABLE_SHIFT) / \ (sizeof(*frame_table) & -sizeof(*frame_table))) extern unsigned long pdx_group_valid[]; /* Convert between Xen-heap virtual addresses and page-info structures. */ static inline struct page_info *__virt_to_page(const void *v) { unsigned long va = (unsigned long)v; ASSERT(va >= XEN_VIRT_START); ASSERT(va < DIRECTMAP_VIRT_END); if ( va < XEN_VIRT_END ) va += DIRECTMAP_VIRT_START - XEN_VIRT_START + xen_phys_start; else ASSERT(va >= DIRECTMAP_VIRT_START); return frame_table + ((va - DIRECTMAP_VIRT_START) >> PAGE_SHIFT); } static inline void *__page_to_virt(const struct page_info *pg) { ASSERT((unsigned long)pg - FRAMETABLE_VIRT_START < FRAMETABLE_SIZE); /* * (sizeof(*pg) & -sizeof(*pg)) selects the LS bit of sizeof(*pg). The * division and re-multiplication avoids one shift when sizeof(*pg) is a * power of two (otherwise there would be a right shift followed by a * left shift, which the compiler can't know it can fold into one). */ return (void *)(DIRECTMAP_VIRT_START + ((unsigned long)pg - FRAMETABLE_VIRT_START) / (sizeof(*pg) / (sizeof(*pg) & -sizeof(*pg))) * (PAGE_SIZE / (sizeof(*pg) & -sizeof(*pg)))); } int free_page_type(struct page_info *page, unsigned long type, int preemptible); void init_guest_l4_table(l4_pgentry_t[], const struct domain *); int is_iomem_page(unsigned long mfn); void clear_superpage_mark(struct page_info *page); const unsigned long *get_platform_badpages(unsigned int *array_size); /* Per page locks: * page_lock() is used for two purposes: pte serialization, and memory sharing. * * All users of page lock for pte serialization live in mm.c, use it * to lock a page table page during pte updates, do not take other locks within * the critical section delimited by page_lock/unlock, and perform no * nesting. * * All users of page lock for memory sharing live in mm/mem_sharing.c. Page_lock * is used in memory sharing to protect addition (share) and removal (unshare) * of (gfn,domain) tupples to a list of gfn's that the shared page is currently * backing. Nesting may happen when sharing (and locking) two pages -- deadlock * is avoided by locking pages in increasing order. * All memory sharing code paths take the p2m lock of the affected gfn before * taking the lock for the underlying page. We enforce ordering between page_lock * and p2m_lock using an mm-locks.h construct. * * These two users (pte serialization and memory sharing) do not collide, since * sharing is only supported for hvm guests, which do not perform pv pte updates. 
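/*
 * Hedged sketch of the pte-serialisation discipline described above: take
 * the per-page lock (page_lock()/page_unlock(), declared immediately below)
 * around a single page-table update, with no other locks and no nesting
 * inside the critical section. The wrapper and its error value are
 * illustrative only.
 */
static inline int example_locked_pt_update(struct page_info *pt_page)
{
    if ( !page_lock(pt_page) )
        return -EBUSY;               /* could not take the per-page lock */

    /* ... perform the single PTE update on the page-table page here ... */

    page_unlock(pt_page);
    return 0;
}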
* */ int page_lock(struct page_info *page); void page_unlock(struct page_info *page); struct domain *page_get_owner_and_reference(struct page_info *page); void put_page(struct page_info *page); int get_page(struct page_info *page, struct domain *domain); void put_page_type(struct page_info *page); int get_page_type(struct page_info *page, unsigned long type); int put_page_type_preemptible(struct page_info *page); int get_page_type_preemptible(struct page_info *page, unsigned long type); int put_old_guest_table(struct vcpu *); int get_page_from_l1e( l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner); void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner); static inline void put_page_and_type(struct page_info *page) { put_page_type(page); put_page(page); } static inline int put_page_and_type_preemptible(struct page_info *page) { int rc = put_page_type_preemptible(page); if ( likely(rc == 0) ) put_page(page); return rc; } static inline int get_page_and_type(struct page_info *page, struct domain *domain, unsigned long type) { int rc = get_page(page, domain); if ( likely(rc) && unlikely(!get_page_type(page, type)) ) { put_page(page); rc = 0; } return rc; } #define ASSERT_PAGE_IS_TYPE(_p, _t) \ ASSERT(((_p)->u.inuse.type_info & PGT_type_mask) == (_t)); \ ASSERT(((_p)->u.inuse.type_info & PGT_count_mask) != 0) #define ASSERT_PAGE_IS_DOMAIN(_p, _d) \ ASSERT(((_p)->count_info & PGC_count_mask) != 0); \ ASSERT(page_get_owner(_p) == (_d)) int check_descriptor(const struct domain *, struct desc_struct *d); extern bool_t opt_allow_superpage; extern paddr_t mem_hotplug; /****************************************************************************** * With shadow pagetables, the different kinds of address start * to get get confusing. * * Virtual addresses are what they usually are: the addresses that are used * to accessing memory while the guest is running. The MMU translates from * virtual addresses to machine addresses. * * (Pseudo-)physical addresses are the abstraction of physical memory the * guest uses for allocation and so forth. For the purposes of this code, * we can largely ignore them. * * Guest frame numbers (gfns) are the entries that the guest puts in its * pagetables. For normal paravirtual guests, they are actual frame numbers, * with the translation done by the guest. * * Machine frame numbers (mfns) are the entries that the hypervisor puts * in the shadow page tables. * * Elsewhere in the xen code base, the name "gmfn" is generally used to refer * to a "machine frame number, from the guest's perspective", or in other * words, pseudo-physical frame numbers. However, in the shadow code, the * term "gmfn" means "the mfn of a guest page"; this combines naturally with * other terms such as "smfn" (the mfn of a shadow page), gl2mfn (the mfn of a * guest L2 page), etc... */ /* With this defined, we do some ugly things to force the compiler to * give us type safety between mfns and gfns and other integers. * TYPE_SAFE(int foo) defines a foo_t, and _foo() and foo_x() functions * that translate beween int and foo_t. * * It does have some performance cost because the types now have * a different storage attribute, so may not want it on all the time. 
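/*
 * Illustrative use of the reference helpers defined above: take both a
 * general and a type reference before treating a frame as writable, then
 * drop them again. PGT_writable_page and the helpers are the real ones; the
 * wrapper itself is only a sketch.
 */
static inline int example_with_writable_ref(struct page_info *pg,
                                            struct domain *d)
{
    if ( !get_page_and_type(pg, d, PGT_writable_page) )
        return -EINVAL;

    /* ... frame holds a writable type reference; it cannot change type ... */

    put_page_and_type(pg);
    return 0;
}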
*/ #ifndef NDEBUG #define TYPE_SAFETY 1 #endif #ifdef TYPE_SAFETY #define TYPE_SAFE(_type,_name) \ typedef struct { _type _name; } _name##_t; \ static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \ static inline _type _name##_x(_name##_t n) { return n._name; } #else #define TYPE_SAFE(_type,_name) \ typedef _type _name##_t; \ static inline _name##_t _##_name(_type n) { return n; } \ static inline _type _name##_x(_name##_t n) { return n; } #endif TYPE_SAFE(unsigned long,mfn); #ifndef mfn_t #define mfn_t /* Grep fodder: mfn_t, _mfn() and mfn_x() are defined above */ #undef mfn_t #endif /* Macro for printk formats: use as printk("%"PRI_mfn"\n", mfn_x(foo)); */ #define PRI_mfn "05lx" /* * The MPT (machine->physical mapping table) is an array of word-sized * values, indexed on machine frame number. It is expected that guest OSes * will use it to store a "physical" frame number to give the appearance of * contiguous (or near contiguous) physical memory. */ #undef machine_to_phys_mapping #define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START) #define INVALID_M2P_ENTRY (~0UL) #define VALID_M2P(_e) (!((_e) & (1UL<<(BITS_PER_LONG-1)))) #define SHARED_M2P_ENTRY (~0UL - 1UL) #define SHARED_M2P(_e) ((_e) == SHARED_M2P_ENTRY) #define compat_machine_to_phys_mapping ((unsigned int *)RDWR_COMPAT_MPT_VIRT_START) #define _set_gpfn_from_mfn(mfn, pfn) ({ \ struct domain *d = page_get_owner(__mfn_to_page(mfn)); \ unsigned long entry = (d && (d == dom_cow)) ? \ SHARED_M2P_ENTRY : (pfn); \ ((void)((mfn) >= (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) / 4 || \ (compat_machine_to_phys_mapping[(mfn)] = (unsigned int)(entry))), \ machine_to_phys_mapping[(mfn)] = (entry)); \ }) /* * Disable some users of set_gpfn_from_mfn() (e.g., free_heap_pages()) until * the machine_to_phys_mapping is actually set up. */ extern bool_t machine_to_phys_mapping_valid; #define set_gpfn_from_mfn(mfn, pfn) do { \ if ( machine_to_phys_mapping_valid ) \ _set_gpfn_from_mfn(mfn, pfn); \ } while (0) extern struct rangeset *mmio_ro_ranges; #define get_gpfn_from_mfn(mfn) (machine_to_phys_mapping[(mfn)]) #define mfn_to_gmfn(_d, mfn) \ ( (paging_mode_translate(_d)) \ ? 
get_gpfn_from_mfn(mfn) \ : (mfn) ) #define INVALID_MFN (~0UL) #define compat_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) #define compat_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) #ifdef MEMORY_GUARD void memguard_init(void); void memguard_guard_range(void *p, unsigned long l); void memguard_unguard_range(void *p, unsigned long l); #else #define memguard_init() ((void)0) #define memguard_guard_range(_p,_l) ((void)0) #define memguard_unguard_range(_p,_l) ((void)0) #endif void memguard_guard_stack(void *p); void memguard_unguard_stack(void *p); int ptwr_do_page_fault(struct vcpu *, unsigned long, struct cpu_user_regs *); int mmio_ro_do_page_fault(struct vcpu *, unsigned long, struct cpu_user_regs *); int audit_adjust_pgtables(struct domain *d, int dir, int noisy); extern int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs); extern int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs); #ifndef NDEBUG #define AUDIT_SHADOW_ALREADY_LOCKED ( 1u << 0 ) #define AUDIT_ERRORS_OK ( 1u << 1 ) #define AUDIT_QUIET ( 1u << 2 ) void _audit_domain(struct domain *d, int flags); #define audit_domain(_d) _audit_domain((_d), AUDIT_ERRORS_OK) void audit_domains(void); #else #define _audit_domain(_d, _f) ((void)0) #define audit_domain(_d) ((void)0) #define audit_domains() ((void)0) #endif int new_guest_cr3(unsigned long pfn); void make_cr3(struct vcpu *v, unsigned long mfn); void update_cr3(struct vcpu *v); int vcpu_destroy_pagetables(struct vcpu *); struct trap_bounce *propagate_page_fault(unsigned long addr, u16 error_code); void *do_page_walk(struct vcpu *v, unsigned long addr); int __sync_local_execstate(void); /* Arch-specific portion of memory_op hypercall. */ long arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg); long subarch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg); int compat_arch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void)); int compat_subarch_memory_op(int op, XEN_GUEST_HANDLE_PARAM(void)); int steal_page( struct domain *d, struct page_info *page, unsigned int memflags); int donate_page( struct domain *d, struct page_info *page, unsigned int memflags); int map_ldt_shadow_page(unsigned int); #define NIL(type) ((type *)-sizeof(type)) #define IS_NIL(ptr) (!((uintptr_t)(ptr) + sizeof(*(ptr)))) int create_perdomain_mapping(struct domain *, unsigned long va, unsigned int nr, l1_pgentry_t **, struct page_info **); void destroy_perdomain_mapping(struct domain *, unsigned long va, unsigned int nr); void free_perdomain_mappings(struct domain *); extern int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm); void domain_set_alloc_bitsize(struct domain *d); unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits); unsigned long domain_get_maximum_gpfn(struct domain *d); void mem_event_cleanup(struct domain *d); extern struct domain *dom_xen, *dom_io, *dom_cow; /* for vmcoreinfo */ /* Definition of an mm lock: spinlock with extra fields for debugging */ typedef struct mm_lock { spinlock_t lock; int unlock_level; int locker; /* processor which holds the lock */ const char *locker_function; /* func that took it */ } mm_lock_t; typedef struct mm_rwlock { rwlock_t lock; int unlock_level; int recurse_count; int locker; /* CPU that holds the write lock */ const char *locker_function; /* func that took it */ } mm_rwlock_t; #endif /* __ASM_X86_MM_H__ */ xen-4.4.0/xen/include/asm-x86/p2m.h0000664000175000017500000006604412307313555014744 0ustar 
smbsmb/****************************************************************************** * include/asm-x86/paging.h * * physical-to-machine mappings for automatically-translated domains. * * Copyright (c) 2011 GridCentric Inc. (Andres Lagar-Cavilla) * Copyright (c) 2007 Advanced Micro Devices (Wei Huang) * Parts of this code are Copyright (c) 2006-2007 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _XEN_P2M_H #define _XEN_P2M_H #include #include #include #include /* for pagetable_t */ extern bool_t opt_hap_1gb, opt_hap_2mb; /* * The upper levels of the p2m pagetable always contain full rights; all * variation in the access control bits is made in the level-1 PTEs. * * In addition to the phys-to-machine translation, each p2m PTE contains * *type* information about the gfn it translates, helping Xen to decide * on the correct course of action when handling a page-fault to that * guest frame. We store the type in the "available" bits of the PTEs * in the table, which gives us 8 possible types on 32-bit systems. * Further expansions of the type system will only be supported on * 64-bit Xen. */ /* * AMD IOMMU: When we share p2m table with iommu, bit 52 -bit 58 in pte * cannot be non-zero, otherwise, hardware generates io page faults when * device access those pages. Therefore, p2m_ram_rw has to be defined as 0. */ typedef enum { p2m_ram_rw = 0, /* Normal read/write guest RAM */ p2m_invalid = 1, /* Nothing mapped here */ p2m_ram_logdirty = 2, /* Temporarily read-only for log-dirty */ p2m_ram_ro = 3, /* Read-only; writes are silently dropped */ p2m_mmio_dm = 4, /* Reads and write go to the device model */ p2m_mmio_direct = 5, /* Read/write mapping of genuine MMIO area */ p2m_populate_on_demand = 6, /* Place-holder for empty memory */ /* Although these are defined in all builds, they can only * be used in 64-bit builds */ p2m_grant_map_rw = 7, /* Read/write grant mapping */ p2m_grant_map_ro = 8, /* Read-only grant mapping */ p2m_ram_paging_out = 9, /* Memory that is being paged out */ p2m_ram_paged = 10, /* Memory that has been paged out */ p2m_ram_paging_in = 11, /* Memory that is being paged in */ p2m_ram_shared = 12, /* Shared or sharable memory */ p2m_ram_broken = 13, /* Broken page, access cause domain crash */ } p2m_type_t; /* * Additional access types, which are used to further restrict * the permissions given my the p2m_type_t memory type. Violations * caused by p2m_access_t restrictions are sent to the mem_event * interface. * * The access permissions are soft state: when any ambigious change of page * type or use occurs, or when pages are flushed, swapped, or at any other * convenient type, the access permissions can get reset to the p2m_domain * default. 
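/*
 * Illustrative helper over the p2m_type_t values defined above (the helper
 * name is made up): paged-out or shared entries must be brought back in or
 * unshared before a caller may write through them.
 */
static inline int example_gfn_needs_recovery(p2m_type_t t)
{
    return t == p2m_ram_paged || t == p2m_ram_paging_out ||
           t == p2m_ram_shared;
}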
*/ typedef enum { p2m_access_n = 0, /* No access permissions allowed */ p2m_access_r = 1, p2m_access_w = 2, p2m_access_rw = 3, p2m_access_x = 4, p2m_access_rx = 5, p2m_access_wx = 6, p2m_access_rwx = 7, p2m_access_rx2rw = 8, /* Special: page goes from RX to RW on write */ p2m_access_n2rwx = 9, /* Special: page goes from N to RWX on access, * * generates an event but does not pause the * vcpu */ /* NOTE: Assumed to be only 4 bits right now */ } p2m_access_t; /* Modifiers to the query */ typedef unsigned int p2m_query_t; #define P2M_ALLOC (1u<<0) /* Populate PoD and paged-out entries */ #define P2M_UNSHARE (1u<<1) /* Break CoW sharing */ /* We use bitmaps and maks to handle groups of types */ #define p2m_to_mask(_t) (1UL << (_t)) /* RAM types, which map to real machine frames */ #define P2M_RAM_TYPES (p2m_to_mask(p2m_ram_rw) \ | p2m_to_mask(p2m_ram_logdirty) \ | p2m_to_mask(p2m_ram_ro) \ | p2m_to_mask(p2m_ram_paging_out) \ | p2m_to_mask(p2m_ram_paged) \ | p2m_to_mask(p2m_ram_paging_in) \ | p2m_to_mask(p2m_ram_shared)) /* Types that represent a physmap hole that is ok to replace with a shared * entry */ #define P2M_HOLE_TYPES (p2m_to_mask(p2m_mmio_dm) \ | p2m_to_mask(p2m_invalid) \ | p2m_to_mask(p2m_ram_paging_in) \ | p2m_to_mask(p2m_ram_paged)) /* Grant mapping types, which map to a real machine frame in another * VM */ #define P2M_GRANT_TYPES (p2m_to_mask(p2m_grant_map_rw) \ | p2m_to_mask(p2m_grant_map_ro) ) /* MMIO types, which don't have to map to anything in the frametable */ #define P2M_MMIO_TYPES (p2m_to_mask(p2m_mmio_dm) \ | p2m_to_mask(p2m_mmio_direct)) /* Read-only types, which must have the _PAGE_RW bit clear in their PTEs */ #define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty) \ | p2m_to_mask(p2m_ram_ro) \ | p2m_to_mask(p2m_grant_map_ro) \ | p2m_to_mask(p2m_ram_shared) ) #define P2M_POD_TYPES (p2m_to_mask(p2m_populate_on_demand)) /* Pageable types */ #define P2M_PAGEABLE_TYPES (p2m_to_mask(p2m_ram_rw) \ | p2m_to_mask(p2m_ram_logdirty) ) #define P2M_PAGING_TYPES (p2m_to_mask(p2m_ram_paging_out) \ | p2m_to_mask(p2m_ram_paged) \ | p2m_to_mask(p2m_ram_paging_in)) #define P2M_PAGED_TYPES (p2m_to_mask(p2m_ram_paged)) /* Shared types */ /* XXX: Sharable types could include p2m_ram_ro too, but we would need to * reinit the type correctly after fault */ #define P2M_SHARABLE_TYPES (p2m_to_mask(p2m_ram_rw) \ | p2m_to_mask(p2m_ram_logdirty) ) #define P2M_SHARED_TYPES (p2m_to_mask(p2m_ram_shared)) /* Broken type: the frame backing this pfn has failed in hardware * and must not be touched. */ #define P2M_BROKEN_TYPES (p2m_to_mask(p2m_ram_broken)) /* Useful predicates */ #define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES) #define p2m_is_hole(_t) (p2m_to_mask(_t) & P2M_HOLE_TYPES) #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES) #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES) #define p2m_is_pod(_t) (p2m_to_mask(_t) & P2M_POD_TYPES) #define p2m_is_grant(_t) (p2m_to_mask(_t) & P2M_GRANT_TYPES) /* Grant types are *not* considered valid, because they can be unmapped at any time and, unless you happen to be the shadow or p2m implementations, there's no way of synchronising against that. 
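/*
 * Sketch: the p2m_to_mask() scheme above lets a whole group of types be
 * tested with a single AND. A custom group is built exactly like the
 * P2M_*_TYPES macros; EXAMPLE_WRITE_OK_TYPES and the predicate are made up
 * for illustration.
 */
#define EXAMPLE_WRITE_OK_TYPES (p2m_to_mask(p2m_ram_rw) | \
                                p2m_to_mask(p2m_grant_map_rw))

static inline int example_is_direct_writable(p2m_type_t t)
{
    return !!(p2m_to_mask(t) & EXAMPLE_WRITE_OK_TYPES);
}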
*/ #define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES)) #define p2m_has_emt(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct))) #define p2m_is_pageable(_t) (p2m_to_mask(_t) & P2M_PAGEABLE_TYPES) #define p2m_is_paging(_t) (p2m_to_mask(_t) & P2M_PAGING_TYPES) #define p2m_is_paged(_t) (p2m_to_mask(_t) & P2M_PAGED_TYPES) #define p2m_is_sharable(_t) (p2m_to_mask(_t) & P2M_SHARABLE_TYPES) #define p2m_is_shared(_t) (p2m_to_mask(_t) & P2M_SHARED_TYPES) #define p2m_is_broken(_t) (p2m_to_mask(_t) & P2M_BROKEN_TYPES) /* Per-p2m-table state */ struct p2m_domain { /* Lock that protects updates to the p2m */ mm_rwlock_t lock; /* Shadow translated domain: p2m mapping */ pagetable_t phys_table; /* Same as domain_dirty_cpumask but limited to * this p2m and those physical cpus whose vcpu's are in * guestmode. */ cpumask_var_t dirty_cpumask; struct domain *domain; /* back pointer to domain */ /* Nested p2ms only: nested p2m base value that this p2m shadows. * This can be cleared to P2M_BASE_EADDR under the per-p2m lock but * needs both the per-p2m lock and the per-domain nestedp2m lock * to set it to any other value. */ #define P2M_BASE_EADDR (~0ULL) uint64_t np2m_base; /* Nested p2ms: linked list of n2pms allocated to this domain. * The host p2m hasolds the head of the list and the np2ms are * threaded on in LRU order. */ struct list_head np2m_list; /* Host p2m: when this flag is set, don't flush all the nested-p2m * tables on every host-p2m change. The setter of this flag * is responsible for performing the full flush before releasing the * host p2m's lock. */ int defer_nested_flush; /* Pages used to construct the p2m */ struct page_list_head pages; int (*set_entry )(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma); mfn_t (*get_entry )(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *p2mt, p2m_access_t *p2ma, p2m_query_t q, unsigned int *page_order); void (*change_entry_type_global)(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt); void (*write_p2m_entry)(struct p2m_domain *p2m, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level); long (*audit_p2m)(struct p2m_domain *p2m); /* Default P2M access type for each page in the the domain: new pages, * swapped in pages, cleared pages, and pages that are ambiquously * retyped get this access type. See definition of p2m_access_t. */ p2m_access_t default_access; /* If true, and an access fault comes in and there is no mem_event listener, * pause domain. Otherwise, remove access restrictions. */ bool_t access_required; /* Highest guest frame that's ever been mapped in the p2m */ unsigned long max_mapped_pfn; /* When releasing shared gfn's in a preemptible manner, recall where * to resume the search */ unsigned long next_shared_gfn_to_relinquish; /* Populate-on-demand variables * All variables are protected with the pod lock. We cannot rely on * the p2m lock if it's turned into a fine-grained lock. * We only use the domain page_alloc lock for additions and * deletions to the domain's page list. Because we use it nested * within the PoD lock, we enforce it's ordering (by remembering * the unlock level in the arch_domain sub struct). 
*/ struct { struct page_list_head super, /* List of superpages */ single; /* Non-super lists */ long count, /* # of pages in cache lists */ entry_count; /* # of pages in p2m marked pod */ unsigned long reclaim_single; /* Last gpfn of a scan */ unsigned long max_guest; /* gpfn of max guest demand-populate */ #define POD_HISTORY_MAX 128 /* gpfn of last guest superpage demand-populated */ unsigned long last_populated[POD_HISTORY_MAX]; unsigned int last_populated_index; mm_lock_t lock; /* Locking of private pod structs, * * not relying on the p2m lock. */ } pod; union { struct ept_data ept; /* NPT-equivalent structure could be added here. */ }; }; /* get host p2m table */ #define p2m_get_hostp2m(d) ((d)->arch.p2m) /* Get p2m table (re)usable for specified np2m base. * Automatically destroys and re-initializes a p2m if none found. * If np2m_base == 0 then v->arch.hvm_vcpu.guest_cr[3] is used. */ struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t np2m_base); /* If vcpu is in host mode then behaviour matches p2m_get_hostp2m(). * If vcpu is in guest mode then behaviour matches p2m_get_nestedp2m(). */ struct p2m_domain *p2m_get_p2m(struct vcpu *v); #define p2m_is_nestedp2m(p2m) ((p2m) != p2m_get_hostp2m((p2m->domain))) #define p2m_get_pagetable(p2m) ((p2m)->phys_table) /**** p2m query accessors. They lock p2m_lock, and thus serialize * lookups wrt modifications. They _do not_ release the lock on exit. * After calling any of the variants below, caller needs to use * put_gfn. ****/ mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, p2m_query_t q, unsigned int *page_order, bool_t locked); /* Read a particular P2M table, mapping pages as we go. Most callers * should _not_ call this directly; use the other get_gfn* functions * below unless you know you want to walk a p2m that isn't a domain's * main one. * If the lookup succeeds, the return value is != INVALID_MFN and * *page_order is filled in with the order of the superpage (if any) that * the entry was found in. */ #define get_gfn_type_access(p, g, t, a, q, o) \ __get_gfn_type_access((p), (g), (t), (a), (q), (o), 1) /* General conversion function from gfn to mfn */ static inline mfn_t get_gfn_type(struct domain *d, unsigned long gfn, p2m_type_t *t, p2m_query_t q) { p2m_access_t a; return get_gfn_type_access(p2m_get_hostp2m(d), gfn, t, &a, q, NULL); } /* Syntactic sugar: most callers will use one of these. * N.B. get_gfn_query() is the _only_ one guaranteed not to take the * p2m lock; none of the others can be called with the p2m or paging * lock held. */ #define get_gfn(d, g, t) get_gfn_type((d), (g), (t), P2M_ALLOC) #define get_gfn_query(d, g, t) get_gfn_type((d), (g), (t), 0) #define get_gfn_unshare(d, g, t) get_gfn_type((d), (g), (t), \ P2M_ALLOC | P2M_UNSHARE) /* Will release the p2m_lock for this gfn entry. */ void __put_gfn(struct p2m_domain *p2m, unsigned long gfn); #define put_gfn(d, gfn) __put_gfn(p2m_get_hostp2m((d)), (gfn)) /* The intent of the "unlocked" accessor is to have the caller not worry about * put_gfn. They apply to very specific situations: debug printk's, dumps * during a domain crash, or to peek at a p2m entry/type. Caller is not * holding the p2m entry exclusively during or after calling this. * * This is also used in the shadow code whenever the paging lock is * held -- in those cases, the caller is protected against concurrent * p2m updates by the fact that shadow_write_p2m_entry() also takes * the paging lock. 
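/*
 * Sketch of the lookup discipline described above: every get_gfn() must be
 * matched by a put_gfn() once the caller is done with the translation. The
 * inspection helper and its body are illustrative only.
 */
static inline void example_inspect_gfn(struct domain *d, unsigned long gfn)
{
    p2m_type_t t;
    mfn_t mfn = get_gfn(d, gfn, &t);

    if ( mfn_valid(mfn_x(mfn)) && p2m_is_ram(t) )
    {
        /* ... act on the translation while the p2m entry is held ... */
    }

    put_gfn(d, gfn);
}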
* * Note that an unlocked accessor only makes sense for a "query" lookup. * Any other type of query can cause a change in the p2m and may need to * perform locking. */ static inline mfn_t get_gfn_query_unlocked(struct domain *d, unsigned long gfn, p2m_type_t *t) { p2m_access_t a; return __get_gfn_type_access(p2m_get_hostp2m(d), gfn, t, &a, 0, NULL, 0); } /* Atomically look up a GFN and take a reference count on the backing page. * This makes sure the page doesn't get freed (or shared) underfoot, * and should be used by any path that intends to write to the backing page. * Returns NULL if the page is not backed by RAM. * The caller is responsible for calling put_page() afterwards. */ struct page_info *get_page_from_gfn_p2m(struct domain *d, struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, p2m_query_t q); static inline struct page_info *get_page_from_gfn( struct domain *d, unsigned long gfn, p2m_type_t *t, p2m_query_t q) { struct page_info *page; if ( paging_mode_translate(d) ) return get_page_from_gfn_p2m(d, p2m_get_hostp2m(d), gfn, t, NULL, q); /* Non-translated guests see 1-1 RAM mappings everywhere */ if (t) *t = p2m_ram_rw; page = __mfn_to_page(gfn); return mfn_valid(gfn) && get_page(page, d) ? page : NULL; } /* General conversion function from mfn to gfn */ static inline unsigned long mfn_to_gfn(struct domain *d, mfn_t mfn) { if ( paging_mode_translate(d) ) return get_gpfn_from_mfn(mfn_x(mfn)); else return mfn_x(mfn); } /* Deadlock-avoidance scheme when calling get_gfn on different gfn's */ struct two_gfns { struct domain *first_domain; unsigned long first_gfn; struct domain *second_domain; unsigned long second_gfn; }; /* Returns mfn, type and access for potential caller consumption, but any * of those can be NULL */ static inline void get_two_gfns(struct domain *rd, unsigned long rgfn, p2m_type_t *rt, p2m_access_t *ra, mfn_t *rmfn, struct domain *ld, unsigned long lgfn, p2m_type_t *lt, p2m_access_t *la, mfn_t *lmfn, p2m_query_t q, struct two_gfns *rval) { mfn_t *first_mfn, *second_mfn, scratch_mfn; p2m_access_t *first_a, *second_a, scratch_a; p2m_type_t *first_t, *second_t, scratch_t; /* Sort by domain, if same domain by gfn */ #define assign_pointers(dest, source) \ do { \ rval-> dest ## _domain = source ## d; \ rval-> dest ## _gfn = source ## gfn; \ dest ## _mfn = (source ## mfn) ?: &scratch_mfn; \ dest ## _a = (source ## a) ?: &scratch_a; \ dest ## _t = (source ## t) ?: &scratch_t; \ } while (0) if ( (rd->domain_id <= ld->domain_id) || ((rd == ld) && (rgfn <= lgfn)) ) { assign_pointers(first, r); assign_pointers(second, l); } else { assign_pointers(first, l); assign_pointers(second, r); } #undef assign_pointers /* Now do the gets */ *first_mfn = get_gfn_type_access(p2m_get_hostp2m(rval->first_domain), rval->first_gfn, first_t, first_a, q, NULL); *second_mfn = get_gfn_type_access(p2m_get_hostp2m(rval->second_domain), rval->second_gfn, second_t, second_a, q, NULL); } static inline void put_two_gfns(struct two_gfns *arg) { if ( !arg ) return; put_gfn(arg->second_domain, arg->second_gfn); put_gfn(arg->first_domain, arg->first_gfn); } /* Init the datastructures for later use by the p2m code */ int p2m_init(struct domain *d); /* Allocate a new p2m table for a domain. * * Returns 0 for success or -errno. */ int p2m_alloc_table(struct p2m_domain *p2m); /* Return all the p2m resources to Xen. 
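/*
 * Illustrative pairing for get_page_from_gfn() defined above: the page
 * reference it returns must be dropped again with put_page(). The wrapper
 * name and error value are a sketch only.
 */
static inline int example_hold_backing_page(struct domain *d,
                                            unsigned long gfn)
{
    p2m_type_t t;
    struct page_info *page = get_page_from_gfn(d, gfn, &t, P2M_ALLOC);

    if ( !page )
        return -EINVAL;        /* not backed by RAM (MMIO, paged out, ...) */

    /* ... frame cannot be freed or shared while the reference is held ... */

    put_page(page);
    return 0;
}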
*/ void p2m_teardown(struct p2m_domain *p2m); void p2m_final_teardown(struct domain *d); /* Add a page to a domain's p2m table */ int guest_physmap_add_entry(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int page_order, p2m_type_t t); /* Untyped version for RAM only, for compatibility */ static inline int guest_physmap_add_page(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int page_order) { return guest_physmap_add_entry(d, gfn, mfn, page_order, p2m_ram_rw); } /* Remove a page from a domain's p2m table */ void guest_physmap_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int page_order); /* Set a p2m range as populate-on-demand */ int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn, unsigned int order); /* Change types across all p2m entries in a domain */ void p2m_change_entry_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt); /* Change types across a range of p2m entries (start ... end-1) */ void p2m_change_type_range(struct domain *d, unsigned long start, unsigned long end, p2m_type_t ot, p2m_type_t nt); /* Compare-exchange the type of a single p2m entry */ p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn, p2m_type_t ot, p2m_type_t nt); /* Set mmio addresses in the p2m table (for pass-through) */ int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn); int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn); /* * Populate-on-demand */ /* Dump PoD information about the domain */ void p2m_pod_dump_data(struct domain *d); /* Move all pages from the populate-on-demand cache to the domain page_list * (usually in preparation for domain destruction) */ void p2m_pod_empty_cache(struct domain *d); /* Set populate-on-demand cache size so that the total memory allocated to a * domain matches target */ int p2m_pod_set_mem_target(struct domain *d, unsigned long target); /* Call when decreasing memory reservation to handle PoD entries properly. * Will return '1' if all entries were handled and nothing more need be done.*/ int p2m_pod_decrease_reservation(struct domain *d, xen_pfn_t gpfn, unsigned int order); /* Scan pod cache when offline/broken page triggered */ int p2m_pod_offline_or_broken_hit(struct page_info *p); /* Replace pod cache when offline/broken page triggered */ void p2m_pod_offline_or_broken_replace(struct page_info *p); /* * Paging to disk and page-sharing */ /* Modify p2m table for shared gfn */ int set_shared_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn); /* Check if a nominated gfn is valid to be paged out */ int p2m_mem_paging_nominate(struct domain *d, unsigned long gfn); /* Evict a frame */ int p2m_mem_paging_evict(struct domain *d, unsigned long gfn); /* Tell xenpaging to drop a paged out frame */ void p2m_mem_paging_drop_page(struct domain *d, unsigned long gfn, p2m_type_t p2mt); /* Start populating a paged out frame */ void p2m_mem_paging_populate(struct domain *d, unsigned long gfn); /* Prepare the p2m for paging a frame in */ int p2m_mem_paging_prep(struct domain *d, unsigned long gfn, uint64_t buffer); /* Resume normal operation (in case a domain was paused) */ void p2m_mem_paging_resume(struct domain *d); /* Send mem event based on the access (gla is -1ull if not available). Handles * the rw2rx conversion. Boolean return value indicates if access rights have * been promoted with no underlying vcpu pause. 
If the req_ptr has been populated, * then the caller must put the event in the ring (once having released get_gfn* * locks -- caller must also xfree the request. */ bool_t p2m_mem_access_check(paddr_t gpa, bool_t gla_valid, unsigned long gla, bool_t access_r, bool_t access_w, bool_t access_x, mem_event_request_t **req_ptr); /* Resumes the running of the VCPU, restarting the last instruction */ void p2m_mem_access_resume(struct domain *d); /* Set access type for a region of pfns. * If start_pfn == -1ul, sets the default access type */ int p2m_set_mem_access(struct domain *d, unsigned long start_pfn, uint32_t nr, hvmmem_access_t access); /* Get access type for a pfn * If pfn == -1ul, gets the default access type */ int p2m_get_mem_access(struct domain *d, unsigned long pfn, hvmmem_access_t *access); /* * Internal functions, only called by other p2m code */ struct page_info *p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type); void p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg); /* Directly set a p2m entry: only for use by p2m code. Does not need * a call to put_gfn afterwards/ */ int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma); /* Set up function pointers for PT implementation: only for use by p2m code */ extern void p2m_pt_init(struct p2m_domain *p2m); /* Debugging and auditing of the P2M code? */ #ifndef NDEBUG #define P2M_AUDIT 1 #else #define P2M_AUDIT 0 #endif #define P2M_DEBUGGING 0 #if P2M_AUDIT extern void audit_p2m(struct domain *d, uint64_t *orphans, uint64_t *m2p_bad, uint64_t *p2m_bad); #endif /* P2M_AUDIT */ /* Printouts */ #define P2M_PRINTK(f, a...) \ debugtrace_printk("p2m: %s(): " f, __func__, ##a) #define P2M_ERROR(f, a...) \ printk(XENLOG_G_ERR "pg error: %s(): " f, __func__, ##a) #if P2M_DEBUGGING #define P2M_DEBUG(f, a...) \ debugtrace_printk("p2mdebug: %s(): " f, __func__, ##a) #else #define P2M_DEBUG(f, a...) do { (void)(f); } while(0) #endif /* Called by p2m code when demand-populating a PoD page */ int p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn, unsigned int order, p2m_query_t q); /* * Functions specific to the p2m-pt implementation */ /* Extract the type from the PTE flags that store it */ static inline p2m_type_t p2m_flags_to_type(unsigned long flags) { /* For AMD IOMMUs we need to use type 0 for plain RAM, but we need * to make sure that an entirely empty PTE doesn't have RAM type */ if ( flags == 0 ) return p2m_invalid; /* AMD IOMMUs use bits 9-11 to encode next io page level and bits * 59-62 for iommu flags so we can't use them to store p2m type info. */ return (flags >> 12) & 0x7f; } /* * Nested p2m: shadow p2m tables used for nested HVM virtualization */ /* Flushes specified p2m table */ void p2m_flush(struct vcpu *v, struct p2m_domain *p2m); /* Flushes all nested p2m tables */ void p2m_flush_nestedp2m(struct domain *d); void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level); #endif /* _XEN_P2M_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/amd-iommu.h0000664000175000017500000001126312307313555016124 0ustar smbsmb/* * Copyright (C) 2007 Advanced Micro Devices, Inc. 
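/*
 * Minimal sketch of the inverse of p2m_flags_to_type() defined in
 * asm-x86/p2m.h above, showing only where the type field sits in the PTE
 * flag word. The real encoder lives in the p2m-pt implementation and also
 * fills in permission and IOMMU-related bits, so this is illustrative only.
 */
static inline unsigned long example_type_to_flag_bits(p2m_type_t t)
{
    return ((unsigned long)t & 0x7f) << 12;
}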
* Author: Leo Duran * Author: Wei Wang - adapted to xen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _ASM_X86_64_AMD_IOMMU_H #define _ASM_X86_64_AMD_IOMMU_H #include #include #include #include #include #include #include #define iommu_found() (!list_empty(&amd_iommu_head)) extern struct list_head amd_iommu_head; #pragma pack(1) typedef struct event_entry { uint32_t data[4]; } event_entry_t; typedef struct ppr_entry { uint32_t data[4]; } ppr_entry_t; typedef struct cmd_entry { uint32_t data[4]; } cmd_entry_t; typedef struct dev_entry { uint32_t data[8]; } dev_entry_t; #pragma pack() struct table_struct { void *buffer; unsigned long entries; unsigned long alloc_size; }; struct ring_buffer { void *buffer; unsigned long entries; unsigned long alloc_size; uint32_t tail; uint32_t head; spinlock_t lock; /* protect buffer pointers */ }; typedef struct iommu_cap { uint32_t header; /* offset 00h */ uint32_t base_low; /* offset 04h */ uint32_t base_hi; /* offset 08h */ uint32_t range; /* offset 0Ch */ uint32_t misc; /* offset 10h */ } iommu_cap_t; struct amd_iommu { struct list_head list; spinlock_t lock; /* protect iommu */ u16 seg; u16 bdf; struct msi_desc msi; u16 cap_offset; iommu_cap_t cap; u8 ht_flags; u64 features; void *mmio_base; unsigned long mmio_base_phys; struct table_struct dev_table; struct ring_buffer cmd_buffer; struct ring_buffer event_log; struct ring_buffer ppr_log; int exclusion_enable; int exclusion_allow_all; uint64_t exclusion_base; uint64_t exclusion_limit; int enabled; }; struct ivrs_mappings { u16 dte_requestor_id; u8 dte_allow_exclusion; u8 unity_map_enable; u8 write_permission; u8 read_permission; unsigned long addr_range_start; unsigned long addr_range_length; struct amd_iommu *iommu; /* per device interrupt remapping table */ void *intremap_table; unsigned long *intremap_inuse; spinlock_t intremap_lock; /* ivhd device data settings */ u8 device_flags; }; extern unsigned int ivrs_bdf_entries; struct ivrs_mappings *get_ivrs_mappings(u16 seg); int iterate_ivrs_mappings(int (*)(u16 seg, struct ivrs_mappings *)); int iterate_ivrs_entries(int (*)(u16 seg, struct ivrs_mappings *)); /* iommu tables in guest space */ struct mmio_reg { uint32_t lo; uint32_t hi; }; struct guest_dev_table { struct mmio_reg reg_base; uint32_t size; }; struct guest_buffer { struct mmio_reg reg_base; struct mmio_reg reg_tail; struct mmio_reg reg_head; uint32_t entries; }; struct guest_iommu_msi { uint8_t vector; uint8_t dest; uint8_t dest_mode; uint8_t delivery_mode; uint8_t trig_mode; }; /* virtual IOMMU structure */ struct guest_iommu { struct domain *domain; spinlock_t lock; bool_t enabled; struct guest_dev_table dev_table; struct guest_buffer cmd_buffer; struct guest_buffer event_log; struct guest_buffer ppr_log; struct tasklet cmd_buffer_tasklet; uint64_t mmio_base; /* MMIO base address */ /* MMIO regs */ struct mmio_reg reg_ctrl; /* MMIO offset 
0018h */ struct mmio_reg reg_status; /* MMIO offset 2020h */ struct mmio_reg reg_ext_feature; /* MMIO offset 0030h */ /* guest interrupt settings */ struct guest_iommu_msi msi; }; extern bool_t iommuv2_enabled; #endif /* _ASM_X86_64_AMD_IOMMU_H */ xen-4.4.0/xen/include/asm-x86/page.h0000664000175000017500000003676212307313555015166 0ustar smbsmb#ifndef __X86_PAGE_H__ #define __X86_PAGE_H__ #include /* * It is important that the masks are signed quantities. This ensures that * the compiler sign-extends a 32-bit mask to 64 bits if that is required. */ #define PAGE_SIZE (_AC(1,L) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) #define PAGE_FLAG_MASK (~0) #define PAGE_ORDER_4K 0 #define PAGE_ORDER_2M 9 #define PAGE_ORDER_1G 18 #ifndef __ASSEMBLY__ # include # include #endif #include /* Read a pte atomically from memory. */ #define l1e_read_atomic(l1ep) \ l1e_from_intpte(pte_read_atomic(&l1e_get_intpte(*(l1ep)))) #define l2e_read_atomic(l2ep) \ l2e_from_intpte(pte_read_atomic(&l2e_get_intpte(*(l2ep)))) #define l3e_read_atomic(l3ep) \ l3e_from_intpte(pte_read_atomic(&l3e_get_intpte(*(l3ep)))) #define l4e_read_atomic(l4ep) \ l4e_from_intpte(pte_read_atomic(&l4e_get_intpte(*(l4ep)))) /* Write a pte atomically to memory. */ #define l1e_write_atomic(l1ep, l1e) \ pte_write_atomic(&l1e_get_intpte(*(l1ep)), l1e_get_intpte(l1e)) #define l2e_write_atomic(l2ep, l2e) \ pte_write_atomic(&l2e_get_intpte(*(l2ep)), l2e_get_intpte(l2e)) #define l3e_write_atomic(l3ep, l3e) \ pte_write_atomic(&l3e_get_intpte(*(l3ep)), l3e_get_intpte(l3e)) #define l4e_write_atomic(l4ep, l4e) \ pte_write_atomic(&l4e_get_intpte(*(l4ep)), l4e_get_intpte(l4e)) /* * Write a pte safely but non-atomically to memory. * The PTE may become temporarily not-present during the update. */ #define l1e_write(l1ep, l1e) \ pte_write(&l1e_get_intpte(*(l1ep)), l1e_get_intpte(l1e)) #define l2e_write(l2ep, l2e) \ pte_write(&l2e_get_intpte(*(l2ep)), l2e_get_intpte(l2e)) #define l3e_write(l3ep, l3e) \ pte_write(&l3e_get_intpte(*(l3ep)), l3e_get_intpte(l3e)) #define l4e_write(l4ep, l4e) \ pte_write(&l4e_get_intpte(*(l4ep)), l4e_get_intpte(l4e)) /* Get direct integer representation of a pte's contents (intpte_t). */ #define l1e_get_intpte(x) ((x).l1) #define l2e_get_intpte(x) ((x).l2) #define l3e_get_intpte(x) ((x).l3) #define l4e_get_intpte(x) ((x).l4) /* Get pfn mapped by pte (unsigned long). */ #define l1e_get_pfn(x) \ ((unsigned long)(((x).l1 & (PADDR_MASK&PAGE_MASK)) >> PAGE_SHIFT)) #define l2e_get_pfn(x) \ ((unsigned long)(((x).l2 & (PADDR_MASK&PAGE_MASK)) >> PAGE_SHIFT)) #define l3e_get_pfn(x) \ ((unsigned long)(((x).l3 & (PADDR_MASK&PAGE_MASK)) >> PAGE_SHIFT)) #define l4e_get_pfn(x) \ ((unsigned long)(((x).l4 & (PADDR_MASK&PAGE_MASK)) >> PAGE_SHIFT)) /* Get physical address of page mapped by pte (paddr_t). */ #define l1e_get_paddr(x) \ ((paddr_t)(((x).l1 & (PADDR_MASK&PAGE_MASK)))) #define l2e_get_paddr(x) \ ((paddr_t)(((x).l2 & (PADDR_MASK&PAGE_MASK)))) #define l3e_get_paddr(x) \ ((paddr_t)(((x).l3 & (PADDR_MASK&PAGE_MASK)))) #define l4e_get_paddr(x) \ ((paddr_t)(((x).l4 & (PADDR_MASK&PAGE_MASK)))) /* Get pointer to info structure of page mapped by pte (struct page_info *). */ #define l1e_get_page(x) (mfn_to_page(l1e_get_pfn(x))) #define l2e_get_page(x) (mfn_to_page(l2e_get_pfn(x))) #define l3e_get_page(x) (mfn_to_page(l3e_get_pfn(x))) #define l4e_get_page(x) (mfn_to_page(l4e_get_pfn(x))) /* Get pte access flags (unsigned int). 
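/*
 * Small sketch relating the PAGE_ORDER_* constants above to mapping sizes:
 * an order-n mapping covers 2^n 4k frames. The helper name is hypothetical.
 */
static inline unsigned long example_order_to_bytes(unsigned int order)
{
    return PAGE_SIZE << order;    /* 4k, 2M, 1G for orders 0, 9 and 18 */
}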
*/ #define l1e_get_flags(x) (get_pte_flags((x).l1)) #define l2e_get_flags(x) (get_pte_flags((x).l2)) #define l3e_get_flags(x) (get_pte_flags((x).l3)) #define l4e_get_flags(x) (get_pte_flags((x).l4)) /* Construct an empty pte. */ #define l1e_empty() ((l1_pgentry_t) { 0 }) #define l2e_empty() ((l2_pgentry_t) { 0 }) #define l3e_empty() ((l3_pgentry_t) { 0 }) #define l4e_empty() ((l4_pgentry_t) { 0 }) /* Construct a pte from a pfn and access flags. */ #define l1e_from_pfn(pfn, flags) \ ((l1_pgentry_t) { ((intpte_t)(pfn) << PAGE_SHIFT) | put_pte_flags(flags) }) #define l2e_from_pfn(pfn, flags) \ ((l2_pgentry_t) { ((intpte_t)(pfn) << PAGE_SHIFT) | put_pte_flags(flags) }) #define l3e_from_pfn(pfn, flags) \ ((l3_pgentry_t) { ((intpte_t)(pfn) << PAGE_SHIFT) | put_pte_flags(flags) }) #define l4e_from_pfn(pfn, flags) \ ((l4_pgentry_t) { ((intpte_t)(pfn) << PAGE_SHIFT) | put_pte_flags(flags) }) /* Construct a pte from a physical address and access flags. */ #ifndef __ASSEMBLY__ static inline l1_pgentry_t l1e_from_paddr(paddr_t pa, unsigned int flags) { ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0); return (l1_pgentry_t) { pa | put_pte_flags(flags) }; } static inline l2_pgentry_t l2e_from_paddr(paddr_t pa, unsigned int flags) { ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0); return (l2_pgentry_t) { pa | put_pte_flags(flags) }; } static inline l3_pgentry_t l3e_from_paddr(paddr_t pa, unsigned int flags) { ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0); return (l3_pgentry_t) { pa | put_pte_flags(flags) }; } static inline l4_pgentry_t l4e_from_paddr(paddr_t pa, unsigned int flags) { ASSERT((pa & ~(PADDR_MASK & PAGE_MASK)) == 0); return (l4_pgentry_t) { pa | put_pte_flags(flags) }; } #endif /* !__ASSEMBLY__ */ /* Construct a pte from its direct integer representation. */ #define l1e_from_intpte(intpte) ((l1_pgentry_t) { (intpte_t)(intpte) }) #define l2e_from_intpte(intpte) ((l2_pgentry_t) { (intpte_t)(intpte) }) #define l3e_from_intpte(intpte) ((l3_pgentry_t) { (intpte_t)(intpte) }) #define l4e_from_intpte(intpte) ((l4_pgentry_t) { (intpte_t)(intpte) }) /* Construct a pte from a page pointer and access flags. */ #define l1e_from_page(page, flags) (l1e_from_pfn(page_to_mfn(page),(flags))) #define l2e_from_page(page, flags) (l2e_from_pfn(page_to_mfn(page),(flags))) #define l3e_from_page(page, flags) (l3e_from_pfn(page_to_mfn(page),(flags))) #define l4e_from_page(page, flags) (l4e_from_pfn(page_to_mfn(page),(flags))) /* Add extra flags to an existing pte. */ #define l1e_add_flags(x, flags) ((x).l1 |= put_pte_flags(flags)) #define l2e_add_flags(x, flags) ((x).l2 |= put_pte_flags(flags)) #define l3e_add_flags(x, flags) ((x).l3 |= put_pte_flags(flags)) #define l4e_add_flags(x, flags) ((x).l4 |= put_pte_flags(flags)) /* Remove flags from an existing pte. */ #define l1e_remove_flags(x, flags) ((x).l1 &= ~put_pte_flags(flags)) #define l2e_remove_flags(x, flags) ((x).l2 &= ~put_pte_flags(flags)) #define l3e_remove_flags(x, flags) ((x).l3 &= ~put_pte_flags(flags)) #define l4e_remove_flags(x, flags) ((x).l4 &= ~put_pte_flags(flags)) /* Check if a pte's page mapping or significant access flags have changed. 
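/*
 * Minimal sketch exercising the PTE constructors/accessors above; the frame
 * number is arbitrary, and __PAGE_HYPERVISOR/_PAGE_PRESENT are the flag
 * definitions that appear further down in this header.
 */
static inline void example_l1e_roundtrip(void)
{
    l1_pgentry_t l1e = l1e_from_pfn(0x1234UL, __PAGE_HYPERVISOR);

    ASSERT(l1e_get_pfn(l1e) == 0x1234UL);
    ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
}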
*/ #define l1e_has_changed(x,y,flags) \ ( !!(((x).l1 ^ (y).l1) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags(flags))) ) #define l2e_has_changed(x,y,flags) \ ( !!(((x).l2 ^ (y).l2) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags(flags))) ) #define l3e_has_changed(x,y,flags) \ ( !!(((x).l3 ^ (y).l3) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags(flags))) ) #define l4e_has_changed(x,y,flags) \ ( !!(((x).l4 ^ (y).l4) & ((PADDR_MASK&PAGE_MASK)|put_pte_flags(flags))) ) /* Pagetable walking. */ #define l2e_to_l1e(x) ((l1_pgentry_t *)__va(l2e_get_paddr(x))) #define l3e_to_l2e(x) ((l2_pgentry_t *)__va(l3e_get_paddr(x))) #define l4e_to_l3e(x) ((l3_pgentry_t *)__va(l4e_get_paddr(x))) #define map_l1t_from_l2e(x) ((l1_pgentry_t *)map_domain_page(l2e_get_pfn(x))) #define map_l2t_from_l3e(x) ((l2_pgentry_t *)map_domain_page(l3e_get_pfn(x))) #define map_l3t_from_l4e(x) ((l3_pgentry_t *)map_domain_page(l4e_get_pfn(x))) /* Given a virtual address, get an entry offset into a page table. */ #define l1_table_offset(a) \ (((a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1)) #define l2_table_offset(a) \ (((a) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1)) #define l3_table_offset(a) \ (((a) >> L3_PAGETABLE_SHIFT) & (L3_PAGETABLE_ENTRIES - 1)) #define l4_table_offset(a) \ (((a) >> L4_PAGETABLE_SHIFT) & (L4_PAGETABLE_ENTRIES - 1)) /* Convert a pointer to a page-table entry into pagetable slot index. */ #define pgentry_ptr_to_slot(_p) \ (((unsigned long)(_p) & ~PAGE_MASK) / sizeof(*(_p))) #ifndef __ASSEMBLY__ /* Page-table type. */ typedef struct { u64 pfn; } pagetable_t; #define pagetable_get_paddr(x) ((paddr_t)(x).pfn << PAGE_SHIFT) #define pagetable_get_page(x) mfn_to_page((x).pfn) #define pagetable_get_pfn(x) ((x).pfn) #define pagetable_get_mfn(x) _mfn(((x).pfn)) #define pagetable_is_null(x) ((x).pfn == 0) #define pagetable_from_pfn(pfn) ((pagetable_t) { (pfn) }) #define pagetable_from_mfn(mfn) ((pagetable_t) { mfn_x(mfn) }) #define pagetable_from_page(pg) pagetable_from_pfn(page_to_mfn(pg)) #define pagetable_from_paddr(p) pagetable_from_pfn((p)>>PAGE_SHIFT) #define pagetable_null() pagetable_from_pfn(0) void clear_page_sse2(void *); #define clear_page(_p) (cpu_has_xmm2 ? \ clear_page_sse2((void *)(_p)) : \ (void)memset((void *)(_p), 0, PAGE_SIZE)) void copy_page_sse2(void *, const void *); #define copy_page(_t,_f) (cpu_has_xmm2 ? \ copy_page_sse2(_t, _f) : \ (void)memcpy(_t, _f, PAGE_SIZE)) /* Convert between Xen-heap virtual addresses and machine addresses. */ #define __pa(x) (virt_to_maddr(x)) #define __va(x) (maddr_to_virt(x)) /* Convert between Xen-heap virtual addresses and machine frame numbers. */ #define __virt_to_mfn(va) (virt_to_maddr(va) >> PAGE_SHIFT) #define __mfn_to_virt(mfn) (maddr_to_virt((paddr_t)(mfn) << PAGE_SHIFT)) /* Convert between machine frame numbers and page-info structures. */ #define __mfn_to_page(mfn) (frame_table + pfn_to_pdx(mfn)) #define __page_to_mfn(pg) pdx_to_pfn((unsigned long)((pg) - frame_table)) /* Convert between machine addresses and page-info structures. */ #define __maddr_to_page(ma) __mfn_to_page((ma) >> PAGE_SHIFT) #define __page_to_maddr(pg) ((paddr_t)__page_to_mfn(pg) << PAGE_SHIFT) /* Convert between frame number and address formats. */ #define __pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT) #define __paddr_to_pfn(pa) ((unsigned long)((pa) >> PAGE_SHIFT)) /* Convert between machine frame numbers and spage-info structures. 
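/*
 * Sketch showing how the per-level *_table_offset() helpers above decompose
 * a virtual address into page-table slot indices; the dump helper itself is
 * illustrative.
 */
static inline void example_dump_slots(unsigned long va)
{
    printk("va %lx -> l4 %lu, l3 %lu, l2 %lu, l1 %lu\n", va,
           l4_table_offset(va), l3_table_offset(va),
           l2_table_offset(va), l1_table_offset(va));
}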
*/ #define __mfn_to_spage(mfn) (spage_table + pfn_to_sdx(mfn)) #define __spage_to_mfn(pg) sdx_to_pfn((unsigned long)((pg) - spage_table)) /* Convert between page-info structures and spage-info structures. */ #define page_to_spage(page) (spage_table+(((page)-frame_table)>>(SUPERPAGE_SHIFT-PAGE_SHIFT))) #define spage_to_page(spage) (frame_table+(((spage)-spage_table)<<(SUPERPAGE_SHIFT-PAGE_SHIFT))) /* * We define non-underscored wrappers for above conversion functions. These are * overridden in various source files while underscored versions remain intact. */ #define mfn_valid(mfn) __mfn_valid(mfn) #define virt_to_mfn(va) __virt_to_mfn(va) #define mfn_to_virt(mfn) __mfn_to_virt(mfn) #define virt_to_maddr(va) __virt_to_maddr((unsigned long)(va)) #define maddr_to_virt(ma) __maddr_to_virt((unsigned long)(ma)) #define mfn_to_page(mfn) __mfn_to_page(mfn) #define page_to_mfn(pg) __page_to_mfn(pg) #define mfn_to_spage(mfn) __mfn_to_spage(mfn) #define spage_to_mfn(pg) __spage_to_mfn(pg) #define maddr_to_page(ma) __maddr_to_page(ma) #define page_to_maddr(pg) __page_to_maddr(pg) #define virt_to_page(va) __virt_to_page(va) #define page_to_virt(pg) __page_to_virt(pg) #define pfn_to_paddr(pfn) __pfn_to_paddr(pfn) #define paddr_to_pfn(pa) __paddr_to_pfn(pa) #define paddr_to_pdx(pa) pfn_to_pdx(paddr_to_pfn(pa)) #endif /* !defined(__ASSEMBLY__) */ /* Where to find each level of the linear mapping */ #define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) #define __linear_l2_table \ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) #define __linear_l3_table \ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) #define __linear_l4_table \ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) #ifndef __ASSEMBLY__ extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES]; extern l2_pgentry_t *compat_idle_pg_table_l2; extern unsigned int m2p_compat_vstart; extern l2_pgentry_t l2_xenmap[L2_PAGETABLE_ENTRIES], l2_bootmap[L2_PAGETABLE_ENTRIES]; extern l3_pgentry_t l3_bootmap[L3_PAGETABLE_ENTRIES]; extern l2_pgentry_t l2_identmap[4*L2_PAGETABLE_ENTRIES]; extern l1_pgentry_t l1_identmap[L1_PAGETABLE_ENTRIES], l1_fixmap[L1_PAGETABLE_ENTRIES]; void paging_init(void); #endif /* !defined(__ASSEMBLY__) */ #define _PAGE_NONE _AC(0x000,U) #define _PAGE_PRESENT _AC(0x001,U) #define _PAGE_RW _AC(0x002,U) #define _PAGE_USER _AC(0x004,U) #define _PAGE_PWT _AC(0x008,U) #define _PAGE_PCD _AC(0x010,U) #define _PAGE_ACCESSED _AC(0x020,U) #define _PAGE_DIRTY _AC(0x040,U) #define _PAGE_PAT _AC(0x080,U) #define _PAGE_PSE _AC(0x080,U) #define _PAGE_GLOBAL _AC(0x100,U) #define _PAGE_AVAIL0 _AC(0x200,U) #define _PAGE_AVAIL1 _AC(0x400,U) #define _PAGE_AVAIL2 _AC(0x800,U) #define _PAGE_AVAIL _AC(0xE00,U) #define _PAGE_PSE_PAT _AC(0x1000,U) /* non-architectural flags */ #define _PAGE_PAGED 0x2000U #define _PAGE_SHARED 0x4000U /* * Debug option: Ensure that granted mappings are not implicitly unmapped. * WARNING: This will need to be disabled to run OSes that use the spare PTE * bits themselves (e.g., *BSD). */ #ifdef NDEBUG #undef _PAGE_GNTTAB #endif #ifndef _PAGE_GNTTAB #define _PAGE_GNTTAB 0 #endif #define __PAGE_HYPERVISOR \ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) #define __PAGE_HYPERVISOR_NOCACHE \ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED) #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */ #ifndef __ASSEMBLY__ /* Allocator functions for Xen pagetables. 
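/*
 * Illustrative combination of the flag sets above with the PTE constructors
 * from earlier in this header: an uncacheable hypervisor mapping simply
 * swaps __PAGE_HYPERVISOR for __PAGE_HYPERVISOR_NOCACHE. The helper is
 * hypothetical.
 */
static inline l1_pgentry_t example_uncached_l1e(unsigned long mfn)
{
    return l1e_from_pfn(mfn, __PAGE_HYPERVISOR_NOCACHE);
}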
*/ void *alloc_xen_pagetable(void); void free_xen_pagetable(void *v); l1_pgentry_t *virt_to_xen_l1e(unsigned long v); extern void set_pdx_range(unsigned long smfn, unsigned long emfn); /* Convert between PAT/PCD/PWT embedded in PTE flags and 3-bit cacheattr. */ static inline uint32_t pte_flags_to_cacheattr(uint32_t flags) { return ((flags >> 5) & 4) | ((flags >> 3) & 3); } static inline uint32_t cacheattr_to_pte_flags(uint32_t cacheattr) { return ((cacheattr & 4) << 5) | ((cacheattr & 3) << 3); } /* No cache maintenance required on x86 architecture. */ static inline void flush_page_to_ram(unsigned long mfn) {} /* return true if permission increased */ static inline bool_t perms_strictly_increased(uint32_t old_flags, uint32_t new_flags) /* Given the flags of two entries, are the new flags a strict * increase in rights over the old ones? */ { uint32_t of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT); uint32_t nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT); /* Flip the NX bit, since it's the only one that decreases rights; * we calculate as if it were an "X" bit. */ of ^= _PAGE_NX_BIT; nf ^= _PAGE_NX_BIT; /* If the changed bits are all set in the new flags, then rights strictly * increased between old and new. */ return ((of | (of ^ nf)) == nf); } #endif /* !__ASSEMBLY__ */ #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK) #endif /* __X86_PAGE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/init.h0000664000175000017500000000011612307313555015175 0ustar smbsmb#ifndef _XEN_ASM_INIT_H #define _XEN_ASM_INIT_H #endif /* _XEN_ASM_INIT_H */ xen-4.4.0/xen/include/asm-x86/mpspec.h0000664000175000017500000000433412307313555015527 0ustar smbsmb#ifndef __ASM_MPSPEC_H #define __ASM_MPSPEC_H #include #include #include extern unsigned char mp_bus_id_to_type[MAX_MP_BUSSES]; extern bool_t def_to_bigsmp; extern unsigned int boot_cpu_physical_apicid; extern bool_t smp_found_config; extern void find_smp_config (void); extern void get_smp_config (void); extern unsigned char apic_version [MAX_APICS]; extern int mp_irq_entries; extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES]; extern int mpc_default_type; extern unsigned long mp_lapic_addr; extern bool_t pic_mode; #ifdef CONFIG_ACPI extern int mp_register_lapic(u32 id, bool_t enabled, bool_t hotplug); extern void mp_unregister_lapic(uint32_t apic_id, uint32_t cpu); extern void mp_register_lapic_address (u64 address); extern void mp_register_ioapic (u8 id, u32 address, u32 gsi_base); extern void mp_override_legacy_irq (u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs (void); extern int mp_register_gsi (u32 gsi, int edge_level, int active_high_low); #endif /* CONFIG_ACPI */ #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) struct physid_mask { unsigned long mask[PHYSID_ARRAY_SIZE]; }; typedef struct physid_mask physid_mask_t; #define physid_set(physid, map) set_bit(physid, (map).mask) #define physid_clear(physid, map) clear_bit(physid, (map).mask) #define physid_isset(physid, map) test_bit(physid, (map).mask) #define physid_test_and_set(physid, map) test_and_set_bit(physid, (map).mask) #define physids_and(dst, src1, src2) bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) #define physids_or(dst, src1, src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) #define physids_clear(map) bitmap_zero((map).mask, MAX_APICS) #define physids_complement(dst, 
src) bitmap_complement((dst).mask,(src).mask, MAX_APICS) #define physids_empty(map) bitmap_empty((map).mask, MAX_APICS) #define physids_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, MAX_APICS) #define physids_weight(map) bitmap_weight((map).mask, MAX_APICS) #define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } #define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} } extern physid_mask_t phys_cpu_present_map; #endif xen-4.4.0/xen/include/asm-x86/asm_defns.h0000664000175000017500000000557012307313555016202 0ustar smbsmb #ifndef __X86_ASM_DEFNS_H__ #define __X86_ASM_DEFNS_H__ #ifndef COMPILE_OFFSETS /* NB. Auto-generated from arch/.../asm-offsets.c */ #include #endif #include #ifndef __ASSEMBLY__ void ret_from_intr(void); #endif #include /* Exception table entry */ #ifdef __ASSEMBLY__ # define _ASM__EXTABLE(sfx, from, to) \ .section .ex_table##sfx, "a" ; \ .balign 4 ; \ .long _ASM_EX(from), _ASM_EX(to) ; \ .previous #else # define _ASM__EXTABLE(sfx, from, to) \ " .section .ex_table" #sfx ",\"a\"\n" \ " .balign 4\n" \ " .long " _ASM_EX(from) ", " _ASM_EX(to) "\n" \ " .previous\n" #endif #define _ASM_EXTABLE(from, to) _ASM__EXTABLE(, from, to) #define _ASM_PRE_EXTABLE(from, to) _ASM__EXTABLE(.pre, from, to) #ifdef __ASSEMBLY__ #define UNLIKELY_START(cond, tag) \ .Ldispatch.tag: \ j##cond .Lunlikely.tag; \ .subsection 1; \ .Lunlikely.tag: #define UNLIKELY_DISPATCH_LABEL(tag) \ .Ldispatch.tag #define UNLIKELY_DONE(cond, tag) \ j##cond .Llikely.tag #define __UNLIKELY_END(tag) \ .subsection 0; \ .Llikely.tag: #define UNLIKELY_END(tag) \ UNLIKELY_DONE(mp, tag); \ __UNLIKELY_END(tag) #define STACK_CPUINFO_FIELD(field) (STACK_SIZE-CPUINFO_sizeof+CPUINFO_##field) #define GET_STACK_BASE(reg) \ movq $~(STACK_SIZE-1),reg; \ andq %rsp,reg #define GET_CPUINFO_FIELD(field, reg) \ GET_STACK_BASE(reg); \ addq $STACK_CPUINFO_FIELD(field),reg #define __GET_CURRENT(reg) \ movq STACK_CPUINFO_FIELD(current_vcpu)(reg),reg #define GET_CURRENT(reg) \ GET_STACK_BASE(reg); \ __GET_CURRENT(reg) #ifndef NDEBUG #define ASSERT_NOT_IN_ATOMIC \ sti; /* sometimes called with interrupts disabled: safe to enable */ \ call ASSERT_NOT_IN_ATOMIC #else #define ASSERT_NOT_IN_ATOMIC #endif #else #ifdef __clang__ /* clang's builtin assember can't do .subsection */ #define UNLIKELY_START_SECTION ".pushsection .fixup,\"ax\"" #define UNLIKELY_END_SECTION ".popsection" #else #define UNLIKELY_START_SECTION ".subsection 1" #define UNLIKELY_END_SECTION ".subsection 0" #endif #define UNLIKELY_START(cond, tag) \ "j" #cond " .Lunlikely%=.tag;\n\t" \ UNLIKELY_START_SECTION "\n" \ ".Lunlikely%=.tag:" #define UNLIKELY_END(tag) \ "jmp .Llikely%=.tag;\n\t" \ UNLIKELY_END_SECTION "\n" \ ".Llikely%=.tag:" #endif #endif /* __X86_ASM_DEFNS_H__ */ xen-4.4.0/xen/include/asm-x86/mce.h0000664000175000017500000000211612307313555015000 0ustar smbsmb#include #include #ifndef _XEN_X86_MCE_H #define _XEN_X86_MCE_H /* * Emulate 2 banks for guest * Bank0: reserved for 'bank0 quirk' occur at some very old processors: * 1). Intel cpu whose family-model value < 06-1A; * 2). 
AMD K7 * Bank1: used to transfer error info to guest */ #define GUEST_MC_BANK_NUM 2 /* Filter MSCOD model specific error code to guest */ #define MCi_STATUS_MSCOD_MASK (~(0xffffULL << 16)) /* No mci_ctl since it stick all 1's */ struct vmce_bank { uint64_t mci_status; uint64_t mci_addr; uint64_t mci_misc; uint64_t mci_ctl2; }; /* No mcg_ctl since it not expose to guest */ struct vmce { uint64_t mcg_cap; uint64_t mcg_status; spinlock_t lock; struct vmce_bank bank[GUEST_MC_BANK_NUM]; }; /* Guest vMCE MSRs virtualization */ extern void vmce_init_vcpu(struct vcpu *); extern int vmce_restore_vcpu(struct vcpu *, const struct hvm_vmce_vcpu *); extern int vmce_wrmsr(uint32_t msr, uint64_t val); extern int vmce_rdmsr(uint32_t msr, uint64_t *val); extern unsigned int nr_mce_banks; #endif xen-4.4.0/xen/include/asm-x86/edd.h0000664000175000017500000001243212307313555014772 0ustar smbsmb/****************************************************************************** * edd.h * * Copyright (C) 2002, 2003, 2004 Dell Inc. * by Matt Domsch * * structures and definitions for the int 13h, ax={41,48}h * BIOS Enhanced Disk Drive Services * This is based on the T13 group document D1572 Revision 0 (August 14 2002) * available at http://www.t13.org/docs2002/d1572r0.pdf. It is * very similar to D1484 Revision 3 http://www.t13.org/docs2002/d1484r3.pdf * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License v2.0 as published by * the Free Software Foundation * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #ifndef __XEN_EDD_H__ #define __XEN_EDD_H__ #ifndef __ASSEMBLY__ struct edd_info { /* Int13, Fn48: Check Extensions Present. */ u8 device; /* %dl: device */ u8 version; /* %ah: major version */ u16 interface_support; /* %cx: interface support bitmap */ /* Int13, Fn08: Legacy Get Device Parameters. */ u16 legacy_max_cylinder; /* %cl[7:6]:%ch: maximum cylinder number */ u8 legacy_max_head; /* %dh: maximum head number */ u8 legacy_sectors_per_track; /* %cl[5:0]: maximum sector number */ /* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). 
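 * (The BIOS fills the edd_device_params structure below; per the EDD
 *  specification a 'key' value of 0xBEDD means the trailing device-path
 *  information is valid.)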
*/ struct edd_device_params { u16 length; u16 info_flags; u32 num_default_cylinders; u32 num_default_heads; u32 sectors_per_track; u64 number_of_sectors; u16 bytes_per_sector; u32 dpte_ptr; /* 0xFFFFFFFF for our purposes */ u16 key; /* = 0xBEDD */ u8 device_path_info_length; u8 reserved2; u16 reserved3; u8 host_bus_type[4]; u8 interface_type[8]; union { struct { u16 base_address; u16 reserved1; u32 reserved2; } __attribute__ ((packed)) isa; struct { u8 bus; u8 slot; u8 function; u8 channel; u32 reserved; } __attribute__ ((packed)) pci; /* pcix is same as pci */ struct { u64 reserved; } __attribute__ ((packed)) ibnd; struct { u64 reserved; } __attribute__ ((packed)) xprs; struct { u64 reserved; } __attribute__ ((packed)) htpt; struct { u64 reserved; } __attribute__ ((packed)) unknown; } interface_path; union { struct { u8 device; u8 reserved1; u16 reserved2; u32 reserved3; u64 reserved4; } __attribute__ ((packed)) ata; struct { u8 device; u8 lun; u8 reserved1; u8 reserved2; u32 reserved3; u64 reserved4; } __attribute__ ((packed)) atapi; struct { u16 id; u64 lun; u16 reserved1; u32 reserved2; } __attribute__ ((packed)) scsi; struct { u64 serial_number; u64 reserved; } __attribute__ ((packed)) usb; struct { u64 eui; u64 reserved; } __attribute__ ((packed)) i1394; struct { u64 wwid; u64 lun; } __attribute__ ((packed)) fibre; struct { u64 identity_tag; u64 reserved; } __attribute__ ((packed)) i2o; struct { u32 array_number; u32 reserved1; u64 reserved2; } __attribute__ ((packed)) raid; struct { u8 device; u8 reserved1; u16 reserved2; u32 reserved3; u64 reserved4; } __attribute__ ((packed)) sata; struct { u64 reserved1; u64 reserved2; } __attribute__ ((packed)) unknown; } device_path; u8 reserved4; u8 checksum; } __attribute__ ((packed)) edd_device_params; } __attribute__ ((packed)); struct mbr_signature { u8 device; u8 pad[3]; u32 signature; } __attribute__ ((packed)); /* These all reside in the boot trampoline. Access via bootsym(). */ extern struct mbr_signature boot_mbr_signature[]; extern u8 boot_mbr_signature_nr; extern struct edd_info boot_edd_info[]; extern u8 boot_edd_info_nr; #endif /* __ASSEMBLY__ */ /* Maximum number of EDD information structures at boot_edd_info. */ #define EDD_INFO_MAX 6 /* Maximum number of MBR signatures at boot_mbr_signature. */ #define EDD_MBR_SIG_MAX 16 /* Size of components of EDD information structure. */ #define EDDEXTSIZE 8 #define EDDPARMSIZE 74 #endif /* __XEN_EDD_H__ */ xen-4.4.0/xen/include/asm-x86/genapic.h0000664000175000017500000000564412307313555015653 0ustar smbsmb#ifndef _ASM_GENAPIC_H #define _ASM_GENAPIC_H 1 /* * Generic APIC driver interface. * * An straight forward mapping of the APIC related parts of the * x86 subarchitecture interface to a dynamic object. * * This is used by the "generic" x86 subarchitecture. * * Copyright 2003 Andi Kleen, SuSE Labs. */ struct mpc_config_translation; struct mpc_config_bus; struct mp_config_table; struct mpc_config_processor; struct genapic { const char *name; int (*probe)(void); /* When one of the next two hooks returns 1 the genapic is switched to this. Essentially they are additional probe functions. */ int (*mps_oem_check)(struct mp_config_table *mpc, char *oem, char *productid); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); /* Interrupt delivery parameters ('physical' vs. 'logical flat'). 
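 * (In 'logical flat' mode the destination is an 8-bit LDR bitmap, so a single
 *  IPI can address up to 8 CPUs at once; 'physical' mode targets one APIC ID
 *  per destination and is what larger systems fall back to.)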
*/ int int_delivery_mode; int int_dest_mode; void (*init_apic_ldr)(void); void (*clustered_apic_check)(void); const cpumask_t *(*target_cpus)(void); const cpumask_t *(*vector_allocation_cpumask)(int cpu); unsigned int (*cpu_mask_to_apicid)(const cpumask_t *cpumask); void (*send_IPI_mask)(const cpumask_t *mask, int vector); void (*send_IPI_self)(int vector); }; #define APICFUNC(x) .x = x #define APIC_INIT(aname, aprobe) \ .name = aname, \ .probe = aprobe, \ APICFUNC(mps_oem_check), \ APICFUNC(acpi_madt_oem_check) extern const struct genapic *genapic; extern const struct genapic apic_default; const cpumask_t *target_cpus_all(void); void init_apic_ldr_flat(void); void clustered_apic_check_flat(void); unsigned int cpu_mask_to_apicid_flat(const cpumask_t *cpumask); void send_IPI_mask_flat(const cpumask_t *mask, int vector); void send_IPI_self_flat(int vector); const cpumask_t *vector_allocation_cpumask_flat(int cpu); #define GENAPIC_FLAT \ .int_delivery_mode = dest_LowestPrio, \ .int_dest_mode = 1 /* logical delivery */, \ .init_apic_ldr = init_apic_ldr_flat, \ .clustered_apic_check = clustered_apic_check_flat, \ .target_cpus = target_cpus_all, \ .vector_allocation_cpumask = vector_allocation_cpumask_flat, \ .cpu_mask_to_apicid = cpu_mask_to_apicid_flat, \ .send_IPI_mask = send_IPI_mask_flat, \ .send_IPI_self = send_IPI_self_flat void init_apic_ldr_phys(void); void clustered_apic_check_phys(void); unsigned int cpu_mask_to_apicid_phys(const cpumask_t *cpumask); void send_IPI_mask_phys(const cpumask_t *mask, int vector); void send_IPI_self_phys(int vector); const cpumask_t *vector_allocation_cpumask_phys(int cpu); #define GENAPIC_PHYS \ .int_delivery_mode = dest_Fixed, \ .int_dest_mode = 0 /* physical delivery */, \ .init_apic_ldr = init_apic_ldr_phys, \ .clustered_apic_check = clustered_apic_check_phys, \ .target_cpus = target_cpus_all, \ .vector_allocation_cpumask = vector_allocation_cpumask_phys, \ .cpu_mask_to_apicid = cpu_mask_to_apicid_phys, \ .send_IPI_mask = send_IPI_mask_phys, \ .send_IPI_self = send_IPI_self_phys void send_IPI_self_x2apic(int vector); #endif xen-4.4.0/xen/include/asm-x86/amd.h0000664000175000017500000001562712307313555015010 0ustar smbsmb/* * amd.h - AMD processor specific definitions */ #ifndef __AMD_H__ #define __AMD_H__ #include /* CPUID masked for use by AMD-V Extended Migration */ #define X86_FEATURE_BITPOS(_feature_) ((_feature_) % 32) #define __bit(_x_) (1U << X86_FEATURE_BITPOS(_x_)) /* Family 0Fh, Revision C */ #define AMD_FEATURES_K8_REV_C_ECX 0 #define AMD_FEATURES_K8_REV_C_EDX ( \ __bit(X86_FEATURE_FPU) | __bit(X86_FEATURE_VME) | \ __bit(X86_FEATURE_DE) | __bit(X86_FEATURE_PSE) | \ __bit(X86_FEATURE_TSC) | __bit(X86_FEATURE_MSR) | \ __bit(X86_FEATURE_PAE) | __bit(X86_FEATURE_MCE) | \ __bit(X86_FEATURE_CX8) | __bit(X86_FEATURE_APIC) | \ __bit(X86_FEATURE_SEP) | __bit(X86_FEATURE_MTRR) | \ __bit(X86_FEATURE_PGE) | __bit(X86_FEATURE_MCA) | \ __bit(X86_FEATURE_CMOV) | __bit(X86_FEATURE_PAT) | \ __bit(X86_FEATURE_PSE36) | __bit(X86_FEATURE_CLFLSH)| \ __bit(X86_FEATURE_MMX) | __bit(X86_FEATURE_FXSR) | \ __bit(X86_FEATURE_XMM) | __bit(X86_FEATURE_XMM2)) #define AMD_EXTFEATURES_K8_REV_C_ECX 0 #define AMD_EXTFEATURES_K8_REV_C_EDX ( \ __bit(X86_FEATURE_FPU) | __bit(X86_FEATURE_VME) | \ __bit(X86_FEATURE_DE) | __bit(X86_FEATURE_PSE) | \ __bit(X86_FEATURE_TSC) | __bit(X86_FEATURE_MSR) | \ __bit(X86_FEATURE_PAE) | __bit(X86_FEATURE_MCE) | \ __bit(X86_FEATURE_CX8) | __bit(X86_FEATURE_APIC) | \ __bit(X86_FEATURE_SYSCALL) | __bit(X86_FEATURE_MTRR) | \ __bit(X86_FEATURE_PGE) 
| __bit(X86_FEATURE_MCA) | \ __bit(X86_FEATURE_CMOV) | __bit(X86_FEATURE_PAT) | \ __bit(X86_FEATURE_PSE36) | __bit(X86_FEATURE_NX) | \ __bit(X86_FEATURE_MMXEXT) | __bit(X86_FEATURE_MMX) | \ __bit(X86_FEATURE_FXSR) | __bit(X86_FEATURE_LM) | \ __bit(X86_FEATURE_3DNOWEXT) | __bit(X86_FEATURE_3DNOW)) /* Family 0Fh, Revision D */ #define AMD_FEATURES_K8_REV_D_ECX AMD_FEATURES_K8_REV_C_ECX #define AMD_FEATURES_K8_REV_D_EDX AMD_FEATURES_K8_REV_C_EDX #define AMD_EXTFEATURES_K8_REV_D_ECX (AMD_EXTFEATURES_K8_REV_C_ECX |\ __bit(X86_FEATURE_LAHF_LM)) #define AMD_EXTFEATURES_K8_REV_D_EDX (AMD_EXTFEATURES_K8_REV_C_EDX |\ __bit(X86_FEATURE_FFXSR)) /* Family 0Fh, Revision E */ #define AMD_FEATURES_K8_REV_E_ECX (AMD_FEATURES_K8_REV_D_ECX | \ __bit(X86_FEATURE_XMM3)) #define AMD_FEATURES_K8_REV_E_EDX (AMD_FEATURES_K8_REV_D_EDX | \ __bit(X86_FEATURE_HT)) #define AMD_EXTFEATURES_K8_REV_E_ECX (AMD_EXTFEATURES_K8_REV_D_ECX |\ __bit(X86_FEATURE_CMP_LEGACY)) #define AMD_EXTFEATURES_K8_REV_E_EDX AMD_EXTFEATURES_K8_REV_D_EDX /* Family 0Fh, Revision F */ #define AMD_FEATURES_K8_REV_F_ECX (AMD_FEATURES_K8_REV_E_ECX | \ __bit(X86_FEATURE_CX16)) #define AMD_FEATURES_K8_REV_F_EDX AMD_FEATURES_K8_REV_E_EDX #define AMD_EXTFEATURES_K8_REV_F_ECX (AMD_EXTFEATURES_K8_REV_E_ECX |\ __bit(X86_FEATURE_SVM) | __bit(X86_FEATURE_EXTAPIC) | \ __bit(X86_FEATURE_CR8_LEGACY)) #define AMD_EXTFEATURES_K8_REV_F_EDX (AMD_EXTFEATURES_K8_REV_E_EDX |\ __bit(X86_FEATURE_RDTSCP)) /* Family 0Fh, Revision G */ #define AMD_FEATURES_K8_REV_G_ECX AMD_FEATURES_K8_REV_F_ECX #define AMD_FEATURES_K8_REV_G_EDX AMD_FEATURES_K8_REV_F_EDX #define AMD_EXTFEATURES_K8_REV_G_ECX (AMD_EXTFEATURES_K8_REV_F_ECX |\ __bit(X86_FEATURE_3DNOWPREFETCH)) #define AMD_EXTFEATURES_K8_REV_G_EDX AMD_EXTFEATURES_K8_REV_F_EDX /* Family 10h, Revision B */ #define AMD_FEATURES_FAM10h_REV_B_ECX (AMD_FEATURES_K8_REV_F_ECX | \ __bit(X86_FEATURE_POPCNT) | __bit(X86_FEATURE_MWAIT)) #define AMD_FEATURES_FAM10h_REV_B_EDX AMD_FEATURES_K8_REV_F_EDX #define AMD_EXTFEATURES_FAM10h_REV_B_ECX (AMD_EXTFEATURES_K8_REV_F_ECX |\ __bit(X86_FEATURE_ABM) | __bit(X86_FEATURE_SSE4A) | \ __bit(X86_FEATURE_MISALIGNSSE) | __bit(X86_FEATURE_OSVW) | \ __bit(X86_FEATURE_IBS)) #define AMD_EXTFEATURES_FAM10h_REV_B_EDX (AMD_EXTFEATURES_K8_REV_F_EDX |\ __bit(X86_FEATURE_PAGE1GB)) /* Family 10h, Revision C */ #define AMD_FEATURES_FAM10h_REV_C_ECX AMD_FEATURES_FAM10h_REV_B_ECX #define AMD_FEATURES_FAM10h_REV_C_EDX AMD_FEATURES_FAM10h_REV_B_EDX #define AMD_EXTFEATURES_FAM10h_REV_C_ECX (AMD_EXTFEATURES_FAM10h_REV_B_ECX |\ __bit(X86_FEATURE_SKINIT) | __bit(X86_FEATURE_WDT)) #define AMD_EXTFEATURES_FAM10h_REV_C_EDX AMD_EXTFEATURES_FAM10h_REV_B_EDX /* Family 11h, Revision B */ #define AMD_FEATURES_FAM11h_REV_B_ECX AMD_FEATURES_K8_REV_G_ECX #define AMD_FEATURES_FAM11h_REV_B_EDX AMD_FEATURES_K8_REV_G_EDX #define AMD_EXTFEATURES_FAM11h_REV_B_ECX (AMD_EXTFEATURES_K8_REV_G_ECX |\ __bit(X86_FEATURE_SKINIT)) #define AMD_EXTFEATURES_FAM11h_REV_B_EDX AMD_EXTFEATURES_K8_REV_G_EDX /* AMD errata checking * * Errata are defined using the AMD_LEGACY_ERRATUM() or AMD_OSVW_ERRATUM() * macros. The latter is intended for newer errata that have an OSVW id * assigned, which it takes as first argument. Both take a variable number * of family-specific model-stepping ranges created by AMD_MODEL_RANGE(). 
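 * Each range packs its bounds as
 *   (family << 24) | (model_start << 16) | (stepping_start << 12) |
 *   (model_end << 4) | stepping_end
 * which is exactly what AMD_MODEL_RANGE() below expands to.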
* * Example 1: * #define AMD_ERRATUM_319 \ * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), \ * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), \ * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)) * Example 2: * #define AMD_ERRATUM_400 \ * AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), \ * AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)) * */ #define AMD_LEGACY_ERRATUM(...) -1 /* legacy */, __VA_ARGS__, 0 #define AMD_OSVW_ERRATUM(osvw_id, ...) osvw_id, __VA_ARGS__, 0 #define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) #define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) #define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) #define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) #define AMD_ERRATUM_121 \ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x0f, 0x0, 0x0, 0x3f, 0xf)) #define AMD_ERRATUM_170 \ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x0f, 0x0, 0x0, 0x67, 0xf)) #define AMD_ERRATUM_383 \ AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf), \ AMD_MODEL_RANGE(0x12, 0x0, 0x0, 0x1, 0x0)) #define AMD_ERRATUM_573 \ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x0f, 0x0, 0x0, 0xff, 0xf), \ AMD_MODEL_RANGE(0x10, 0x0, 0x0, 0xff, 0xf), \ AMD_MODEL_RANGE(0x11, 0x0, 0x0, 0xff, 0xf), \ AMD_MODEL_RANGE(0x12, 0x0, 0x0, 0xff, 0xf)) struct cpuinfo_x86; int cpu_has_amd_erratum(const struct cpuinfo_x86 *, int, ...); extern s8 opt_allow_unsafe; void fam10h_check_enable_mmcfg(void); void check_enable_amd_mmconf_dmi(void); #endif /* __AMD_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/0000775000175000017500000000000012307313555014655 5ustar smbsmbxen-4.4.0/xen/include/asm-x86/hvm/vpmu.h0000664000175000017500000000744612307313555016030 0ustar smbsmb/* * vpmu.h: PMU virtualization for HVM domain. * * Copyright (c) 2007, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Author: Haitao Shan */ #ifndef __ASM_X86_HVM_VPMU_H_ #define __ASM_X86_HVM_VPMU_H_ /* * Flag bits given as a string on the hypervisor boot parameter 'vpmu'. * See arch/x86/hvm/vpmu.c. */ #define VPMU_BOOT_ENABLED 0x1 /* vpmu generally enabled. */ #define VPMU_BOOT_BTS 0x2 /* Intel BTS feature wanted. 
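 * (i.e. booting with "vpmu=bts" on the Xen command line requests Branch
 *  Trace Store support on top of the base vPMU.)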
*/ #define msraddr_to_bitpos(x) (((x)&0xffff) + ((x)>>31)*0x2000) #define vcpu_vpmu(vcpu) (&((vcpu)->arch.hvm_vcpu.vpmu)) #define vpmu_vcpu(vpmu) (container_of((vpmu), struct vcpu, \ arch.hvm_vcpu.vpmu)) #define vpmu_domain(vpmu) (vpmu_vcpu(vpmu)->domain) #define MSR_TYPE_COUNTER 0 #define MSR_TYPE_CTRL 1 #define MSR_TYPE_GLOBAL 2 #define MSR_TYPE_ARCH_COUNTER 3 #define MSR_TYPE_ARCH_CTRL 4 /* Arch specific operations shared by all vpmus */ struct arch_vpmu_ops { int (*do_wrmsr)(unsigned int msr, uint64_t msr_content); int (*do_rdmsr)(unsigned int msr, uint64_t *msr_content); int (*do_interrupt)(struct cpu_user_regs *regs); void (*do_cpuid)(unsigned int input, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); void (*arch_vpmu_destroy)(struct vcpu *v); int (*arch_vpmu_save)(struct vcpu *v); void (*arch_vpmu_load)(struct vcpu *v); void (*arch_vpmu_dump)(const struct vcpu *); }; int vmx_vpmu_initialise(struct vcpu *, unsigned int flags); int svm_vpmu_initialise(struct vcpu *, unsigned int flags); struct vpmu_struct { u32 flags; u32 last_pcpu; u32 hw_lapic_lvtpc; void *context; struct arch_vpmu_ops *arch_vpmu_ops; }; /* VPMU states */ #define VPMU_CONTEXT_ALLOCATED 0x1 #define VPMU_CONTEXT_LOADED 0x2 #define VPMU_RUNNING 0x4 #define VPMU_CONTEXT_SAVE 0x8 /* Force context save */ #define VPMU_FROZEN 0x10 /* Stop counters while VCPU is not running */ #define VPMU_PASSIVE_DOMAIN_ALLOCATED 0x20 /* VPMU features */ #define VPMU_CPU_HAS_DS 0x100 /* Has Debug Store */ #define VPMU_CPU_HAS_BTS 0x200 /* Has Branch Trace Store */ #define vpmu_set(_vpmu, _x) ((_vpmu)->flags |= (_x)) #define vpmu_reset(_vpmu, _x) ((_vpmu)->flags &= ~(_x)) #define vpmu_is_set(_vpmu, _x) ((_vpmu)->flags & (_x)) #define vpmu_clear(_vpmu) ((_vpmu)->flags = 0) int vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content); int vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content); int vpmu_do_interrupt(struct cpu_user_regs *regs); void vpmu_do_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); void vpmu_initialise(struct vcpu *v); void vpmu_destroy(struct vcpu *v); void vpmu_save(struct vcpu *v); void vpmu_load(struct vcpu *v); void vpmu_dump(struct vcpu *v); extern int acquire_pmu_ownership(int pmu_ownership); extern void release_pmu_ownership(int pmu_ownership); #endif /* __ASM_X86_HVM_VPMU_H_*/ xen-4.4.0/xen/include/asm-x86/hvm/vpt.h0000664000175000017500000001521212307313555015640 0ustar smbsmb/* * vpt.h: Virtual Platform Timer definitions * * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef __ASM_X86_HVM_VPT_H__ #define __ASM_X86_HVM_VPT_H__ #include #include #include #include #include #include #include #include #include #include #include /* * Abstract layer of periodic time, one short time. 
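 * A struct periodic_time models a virtual timer that fires either
 * periodically or as a single one-shot event and injects the corresponding
 * guest interrupt; see create_periodic_time() further down.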
*/ typedef void time_cb(struct vcpu *v, void *opaque); struct periodic_time { struct list_head list; bool_t on_list; bool_t one_shot; bool_t do_not_freeze; bool_t irq_issued; bool_t warned_timeout_too_short; #define PTSRC_isa 1 /* ISA time source */ #define PTSRC_lapic 2 /* LAPIC time source */ u8 source; /* PTSRC_ */ u8 irq; struct vcpu *vcpu; /* vcpu timer interrupt delivers to */ u32 pending_intr_nr; /* pending timer interrupts */ u64 period; /* frequency in ns */ s_time_t scheduled; /* scheduled timer interrupt */ u64 last_plt_gtime; /* platform time when last IRQ is injected */ struct timer timer; /* ac_timer */ time_cb *cb; void *priv; /* point back to platform time source */ }; #define PIT_FREQ 1193182 #define PIT_BASE 0x40 typedef struct PITState { /* Hardware state */ struct hvm_hw_pit hw; /* Last time the counters read zero, for calcuating counter reads */ int64_t count_load_time[3]; /* Channel 0 IRQ handling. */ struct periodic_time pt0; spinlock_t lock; } PITState; struct hpet_registers { /* Memory-mapped, software visible registers */ uint64_t capability; /* capabilities */ uint64_t config; /* configuration */ uint64_t isr; /* interrupt status reg */ uint64_t mc64; /* main counter */ struct { /* timers */ uint64_t config; /* configuration/cap */ uint64_t cmp; /* comparator */ uint64_t fsb; /* FSB route, not supported now */ } timers[HPET_TIMER_NUM]; /* Hidden register state */ uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */ uint64_t comparator64[HPET_TIMER_NUM]; /* 64 bit running comparator */ }; typedef struct HPETState { struct hpet_registers hpet; uint64_t stime_freq; uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */ uint64_t hpet_to_ns_limit; /* max hpet ticks convertable to ns */ uint64_t mc_offset; struct periodic_time pt[HPET_TIMER_NUM]; spinlock_t lock; } HPETState; typedef struct RTCState { /* Hardware state */ struct hvm_hw_rtc hw; /* RTC's idea of the current time */ struct tm current_tm; /* update-ended timer */ struct timer update_timer; struct timer update_timer2; uint64_t next_update_time; /* alarm timer */ struct timer alarm_timer; /* periodic timer */ struct periodic_time pt; s_time_t start_time; int pt_code; uint8_t pt_dead_ticks; uint32_t use_timer; spinlock_t lock; } RTCState; #define FREQUENCE_PMTIMER 3579545 /* Timer should run at 3.579545 MHz */ typedef struct PMTState { struct hvm_hw_pmtimer pm; /* 32bit timer value */ struct vcpu *vcpu; /* Keeps sync with this vcpu's guest-time */ uint64_t last_gtime; /* Last (guest) time we updated the timer */ uint32_t not_accounted; /* time not accounted at last update */ uint64_t scale; /* Multiplier to get from tsc to timer ticks */ struct timer timer; /* To make sure we send SCIs */ spinlock_t lock; } PMTState; struct pl_time { /* platform time */ struct RTCState vrtc; struct HPETState vhpet; struct PMTState vpmt; /* guest_time = Xen sys time + stime_offset */ int64_t stime_offset; /* Ensures monotonicity in appropriate timer modes. */ uint64_t last_guest_time; spinlock_t pl_time_lock; }; void pt_save_timer(struct vcpu *v); void pt_restore_timer(struct vcpu *v); int pt_update_irq(struct vcpu *v); void pt_intr_post(struct vcpu *v, struct hvm_intack intack); void pt_migrate(struct vcpu *v); void pt_adjust_global_vcpu_target(struct vcpu *v); #define pt_global_vcpu_target(d) \ (is_hvm_domain(d) && (d)->arch.hvm_domain.i8259_target ? \ (d)->arch.hvm_domain.i8259_target : \ (d)->vcpu ? 
(d)->vcpu[0] : NULL) void pt_may_unmask_irq(struct domain *d, struct periodic_time *vlapic_pt); /* Is given periodic timer active? */ #define pt_active(pt) ((pt)->on_list || (pt)->pending_intr_nr) /* * Create/destroy a periodic (or one-shot!) timer. * The given periodic timer structure must be initialised with zero bytes, * except for the 'source' field which must be initialised with the * correct PTSRC_ value. The initialised timer structure can then be passed * to {create,destroy}_periodic_time() any number of times and in any order. * Note that, for a given periodic timer, invocations of these functions MUST * be serialised. */ void create_periodic_time( struct vcpu *v, struct periodic_time *pt, uint64_t delta, uint64_t period, uint8_t irq, time_cb *cb, void *data); void destroy_periodic_time(struct periodic_time *pt); int pv_pit_handler(int port, int data, int write); void pit_reset(struct domain *d); void pit_init(struct vcpu *v, unsigned long cpu_khz); void pit_stop_channel0_irq(PITState * pit); void pit_deinit(struct domain *d); void rtc_init(struct domain *d); void rtc_migrate_timers(struct vcpu *v); void rtc_deinit(struct domain *d); void rtc_reset(struct domain *d); void rtc_update_clock(struct domain *d); bool_t rtc_periodic_interrupt(void *); void pmtimer_init(struct vcpu *v); void pmtimer_deinit(struct domain *d); void pmtimer_reset(struct domain *d); int pmtimer_change_ioport(struct domain *d, unsigned int version); void hpet_init(struct vcpu *v); void hpet_deinit(struct domain *d); void hpet_reset(struct domain *d); #endif /* __ASM_X86_HVM_VPT_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/vmx/0000775000175000017500000000000012307313555015467 5ustar smbsmbxen-4.4.0/xen/include/asm-x86/hvm/vmx/vvmx.h0000664000175000017500000001700012307313555016636 0ustar smbsmb /* * vvmx.h: Support virtual VMX for nested virtualization. * * Copyright (c) 2010, Intel Corporation. * Author: Qing He * Eddie Dong * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
* */ #ifndef __ASM_X86_HVM_VVMX_H__ #define __ASM_X86_HVM_VVMX_H__ struct vvmcs_list { unsigned long vvmcs_mfn; struct list_head node; }; struct nestedvmx { paddr_t vmxon_region_pa; void *iobitmap[2]; /* map (va) of L1 guest I/O bitmap */ void *msrbitmap; /* map (va) of L1 guest MSR bitmap */ /* deferred nested interrupt */ struct { unsigned long intr_info; u32 error_code; u8 source; } intr; struct { bool_t enabled; uint32_t exit_reason; uint32_t exit_qual; } ept; uint32_t guest_vpid; struct list_head launched_list; }; #define vcpu_2_nvmx(v) (vcpu_nestedhvm(v).u.nvmx) /* bit 1, 2, 4 must be 1 */ #define VMX_PINBASED_CTLS_DEFAULT1 0x16 /* bit 1, 4-6,8,13-16,26 must be 1 */ #define VMX_PROCBASED_CTLS_DEFAULT1 0x401e172 /* bit 0-8, 10,11,13,14,16,17 must be 1 */ #define VMX_EXIT_CTLS_DEFAULT1 0x36dff /* bit 0-8, and 12 must be 1 */ #define VMX_ENTRY_CTLS_DEFAULT1 0x11ff /* * Encode of VMX instructions base on Table 24-11 & 24-12 of SDM 3B */ enum vmx_regs_enc { VMX_REG_RAX, VMX_REG_RCX, VMX_REG_RDX, VMX_REG_RBX, VMX_REG_RSP, VMX_REG_RBP, VMX_REG_RSI, VMX_REG_RDI, VMX_REG_R8, VMX_REG_R9, VMX_REG_R10, VMX_REG_R11, VMX_REG_R12, VMX_REG_R13, VMX_REG_R14, VMX_REG_R15, }; enum vmx_sregs_enc { VMX_SREG_ES, VMX_SREG_CS, VMX_SREG_SS, VMX_SREG_DS, VMX_SREG_FS, VMX_SREG_GS, }; union vmx_inst_info { struct { unsigned int scaling :2; /* bit 0-1 */ unsigned int __rsvd0 :1; /* bit 2 */ unsigned int reg1 :4; /* bit 3-6 */ unsigned int addr_size :3; /* bit 7-9 */ unsigned int memreg :1; /* bit 10 */ unsigned int __rsvd1 :4; /* bit 11-14 */ unsigned int segment :3; /* bit 15-17 */ unsigned int index_reg :4; /* bit 18-21 */ unsigned int index_reg_invalid :1; /* bit 22 */ unsigned int base_reg :4; /* bit 23-26 */ unsigned int base_reg_invalid :1; /* bit 27 */ unsigned int reg2 :4; /* bit 28-31 */ } fields; u32 word; }; int nvmx_vcpu_initialise(struct vcpu *v); void nvmx_vcpu_destroy(struct vcpu *v); int nvmx_vcpu_reset(struct vcpu *v); uint64_t nvmx_vcpu_guestcr3(struct vcpu *v); uint64_t nvmx_vcpu_eptp_base(struct vcpu *v); uint32_t nvmx_vcpu_asid(struct vcpu *v); enum hvm_intblk nvmx_intr_blocked(struct vcpu *v); int nvmx_intercepts_exception(struct vcpu *v, unsigned int trap, int error_code); void nvmx_domain_relinquish_resources(struct domain *d); bool_t nvmx_ept_enabled(struct vcpu *v); int nvmx_handle_vmxon(struct cpu_user_regs *regs); int nvmx_handle_vmxoff(struct cpu_user_regs *regs); #define EPT_TRANSLATE_SUCCEED 0 #define EPT_TRANSLATE_VIOLATION 1 #define EPT_TRANSLATE_MISCONFIG 2 #define EPT_TRANSLATE_RETRY 3 int nvmx_hap_walk_L1_p2m(struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa, unsigned int *page_order, uint8_t *p2m_acc, bool_t access_r, bool_t access_w, bool_t access_x); /* * Virtual VMCS layout * * Since physical VMCS layout is unknown, a custom layout is used * for virtual VMCS seen by guest. It occupies a 4k page, and the * field is offset by an 9-bit offset into u64[], The offset is as * follow, which means every pair has a max of 32 * fields available. * * 9 7 5 0 * -------------------------------- * offset: | width | type | index | * -------------------------------- * * Also, since the lower range has only one * field: VPID, it is moved to a higher offset (63), and leaves the * lower range to non-indexed field like VMCS revision. 
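 * Read off the diagram above, a field's slot in the u64[] array would be
 *   offset = (width << 7) | (type << 5) | index
 * with index in bits 0-4, type in bits 5-6 and width in bits 7-8.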
* */ struct vvmcs_header { u32 revision; u32 abort; }; union vmcs_encoding { struct { u32 access_type : 1; u32 index : 9; u32 type : 2; u32 rsv1 : 1; u32 width : 2; u32 rsv2 : 17; }; u32 word; }; enum vvmcs_encoding_width { VVMCS_WIDTH_16 = 0, VVMCS_WIDTH_64, VVMCS_WIDTH_32, VVMCS_WIDTH_NATURAL, }; enum vvmcs_encoding_type { VVMCS_TYPE_CONTROL = 0, VVMCS_TYPE_RO, VVMCS_TYPE_GSTATE, VVMCS_TYPE_HSTATE, }; u64 __get_vvmcs_virtual(void *vvmcs, u32 vmcs_encoding); u64 __get_vvmcs_real(void *vvmcs, u32 vmcs_encoding); void __set_vvmcs_virtual(void *vvmcs, u32 vmcs_encoding, u64 val); void __set_vvmcs_real(void *vvmcs, u32 vmcs_encoding, u64 val); #define __get_vvmcs(_vvmcs, _vmcs_encoding) \ (cpu_has_vmx_vmcs_shadowing ? __get_vvmcs_real(_vvmcs, _vmcs_encoding) \ : __get_vvmcs_virtual(_vvmcs, _vmcs_encoding)) #define __set_vvmcs(_vvmcs, _vmcs_encoding, _val) \ (cpu_has_vmx_vmcs_shadowing ? __set_vvmcs_real(_vvmcs, _vmcs_encoding, _val) \ : __set_vvmcs_virtual(_vvmcs, _vmcs_encoding, _val)) uint64_t get_shadow_eptp(struct vcpu *v); void nvmx_destroy_vmcs(struct vcpu *v); int nvmx_handle_vmptrld(struct cpu_user_regs *regs); int nvmx_handle_vmptrst(struct cpu_user_regs *regs); int nvmx_handle_vmclear(struct cpu_user_regs *regs); int nvmx_handle_vmread(struct cpu_user_regs *regs); int nvmx_handle_vmwrite(struct cpu_user_regs *regs); int nvmx_handle_vmresume(struct cpu_user_regs *regs); int nvmx_handle_vmlaunch(struct cpu_user_regs *regs); int nvmx_handle_invept(struct cpu_user_regs *regs); int nvmx_handle_invvpid(struct cpu_user_regs *regs); int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content); int nvmx_msr_write_intercept(unsigned int msr, u64 msr_content); void nvmx_update_exec_control(struct vcpu *v, u32 value); void nvmx_update_secondary_exec_control(struct vcpu *v, unsigned long value); void nvmx_update_exception_bitmap(struct vcpu *v, unsigned long value); void nvmx_switch_guest(void); void nvmx_idtv_handling(void); u64 nvmx_get_tsc_offset(struct vcpu *v); int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs, unsigned int exit_reason); void nvmx_set_cr_read_shadow(struct vcpu *v, unsigned int cr); uint64_t nept_get_ept_vpid_cap(void); int nept_translate_l2ga(struct vcpu *v, paddr_t l2ga, unsigned int *page_order, uint32_t rwx_acc, unsigned long *l1gfn, uint8_t *p2m_acc, uint64_t *exit_qual, uint32_t *exit_reason); int nvmx_cpu_up_prepare(unsigned int cpu); void nvmx_cpu_dead(unsigned int cpu); #endif /* __ASM_X86_HVM_VVMX_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/vmx/vpmu_core2.h0000664000175000017500000000301212307313555017715 0ustar smbsmb /* * vpmu_core2.h: CORE 2 specific PMU virtualization for HVM domain. * * Copyright (c) 2007, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Author: Haitao Shan */ #ifndef __ASM_X86_HVM_VPMU_CORE_H_ #define __ASM_X86_HVM_VPMU_CORE_H_ /* Currently only 3 fixed counters are supported. 
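 * (the architectural fixed-function counters 0-2: instructions retired,
 *  unhalted core cycles and unhalted reference cycles)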
*/ #define VPMU_CORE2_NUM_FIXED 3 /* Currently only 3 Non-architectual Performance Control MSRs */ #define VPMU_CORE2_NUM_CTRLS 3 struct arch_msr_pair { u64 counter; u64 control; }; struct core2_pmu_enable { char ds_area_enable; char fixed_ctr_enable[VPMU_CORE2_NUM_FIXED]; char arch_pmc_enable[1]; }; struct core2_vpmu_context { struct core2_pmu_enable *pmu_enable; u64 fix_counters[VPMU_CORE2_NUM_FIXED]; u64 ctrls[VPMU_CORE2_NUM_CTRLS]; u64 global_ovf_status; struct arch_msr_pair arch_msr_pair[1]; }; #endif /* __ASM_X86_HVM_VPMU_CORE_H_ */ xen-4.4.0/xen/include/asm-x86/hvm/vmx/vmx.h0000664000175000017500000004402712307313555016461 0ustar smbsmb/* * vmx.h: VMX Architecture related definitions * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #ifndef __ASM_X86_HVM_VMX_VMX_H__ #define __ASM_X86_HVM_VMX_VMX_H__ #include #include #include #include #include #include #include #include #include typedef union { struct { u64 r : 1, /* bit 0 - Read permission */ w : 1, /* bit 1 - Write permission */ x : 1, /* bit 2 - Execute permission */ emt : 3, /* bits 5:3 - EPT Memory type */ ipat : 1, /* bit 6 - Ignore PAT memory type */ sp : 1, /* bit 7 - Is this a superpage? 
*/ rsvd1 : 2, /* bits 9:8 - Reserved for future use */ avail1 : 1, /* bit 10 - Software available 1 */ rsvd2_snp : 1, /* bit 11 - Used for VT-d snoop control in shared EPT/VT-d usage */ mfn : 40, /* bits 51:12 - Machine physical frame number */ sa_p2mt : 6, /* bits 57:52 - Software available 2 */ access : 4, /* bits 61:58 - p2m_access_t */ rsvd3_tm : 1, /* bit 62 - Used for VT-d transient-mapping hint in shared EPT/VT-d usage */ avail3 : 1; /* bit 63 - Software available 3 */ }; u64 epte; } ept_entry_t; typedef struct { /*use lxe[0] to save result */ ept_entry_t lxe[5]; } ept_walk_t; typedef enum { ept_access_n = 0, /* No access permissions allowed */ ept_access_r = 1, /* Read only */ ept_access_w = 2, /* Write only */ ept_access_rw = 3, /* Read & Write */ ept_access_x = 4, /* Exec Only */ ept_access_rx = 5, /* Read & Exec */ ept_access_wx = 6, /* Write & Exec*/ ept_access_all = 7, /* Full permissions */ } ept_access_t; #define EPT_TABLE_ORDER 9 #define EPTE_SUPER_PAGE_MASK 0x80 #define EPTE_MFN_MASK 0xffffffffff000ULL #define EPTE_AVAIL1_MASK 0xF00 #define EPTE_EMT_MASK 0x38 #define EPTE_IGMT_MASK 0x40 #define EPTE_AVAIL1_SHIFT 8 #define EPTE_EMT_SHIFT 3 #define EPTE_IGMT_SHIFT 6 #define EPTE_RWX_MASK 0x7 #define EPTE_FLAG_MASK 0x7f #define EPT_EMT_UC 0 #define EPT_EMT_WC 1 #define EPT_EMT_RSV0 2 #define EPT_EMT_RSV1 3 #define EPT_EMT_WT 4 #define EPT_EMT_WP 5 #define EPT_EMT_WB 6 #define EPT_EMT_RSV2 7 void vmx_asm_vmexit_handler(struct cpu_user_regs); void vmx_asm_do_vmentry(void); void vmx_intr_assist(void); void vmx_do_resume(struct vcpu *); void vmx_vlapic_msr_changed(struct vcpu *v); void vmx_realmode(struct cpu_user_regs *regs); void vmx_update_debug_state(struct vcpu *v); void vmx_update_exception_bitmap(struct vcpu *v); void vmx_update_cpu_exec_control(struct vcpu *v); void vmx_update_secondary_exec_control(struct vcpu *v); #define POSTED_INTR_ON 0 static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, pi_desc->pir); } static inline int pi_test_and_set_on(struct pi_desc *pi_desc) { return test_and_set_bit(POSTED_INTR_ON, &pi_desc->control); } static inline void pi_set_on(struct pi_desc *pi_desc) { set_bit(POSTED_INTR_ON, &pi_desc->control); } static inline int pi_test_and_clear_on(struct pi_desc *pi_desc) { return test_and_clear_bit(POSTED_INTR_ON, &pi_desc->control); } static inline unsigned long pi_get_pir(struct pi_desc *pi_desc, int group) { return xchg(&pi_desc->pir[group], 0); } /* * Exit Reasons */ #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT 3 #define EXIT_REASON_SIPI 4 #define EXIT_REASON_IO_SMI 5 #define EXIT_REASON_OTHER_SMI 6 #define EXIT_REASON_PENDING_VIRT_INTR 7 #define EXIT_REASON_PENDING_VIRT_NMI 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 #define EXIT_REASON_GETSEC 11 #define EXIT_REASON_HLT 12 #define EXIT_REASON_INVD 13 #define EXIT_REASON_INVLPG 14 #define EXIT_REASON_RDPMC 15 #define EXIT_REASON_RDTSC 16 #define EXIT_REASON_RSM 17 #define EXIT_REASON_VMCALL 18 #define EXIT_REASON_VMCLEAR 19 #define EXIT_REASON_VMLAUNCH 20 #define EXIT_REASON_VMPTRLD 21 #define EXIT_REASON_VMPTRST 22 #define EXIT_REASON_VMREAD 23 #define EXIT_REASON_VMRESUME 24 #define EXIT_REASON_VMWRITE 25 #define EXIT_REASON_VMXOFF 26 #define EXIT_REASON_VMXON 27 #define EXIT_REASON_CR_ACCESS 28 #define EXIT_REASON_DR_ACCESS 29 #define EXIT_REASON_IO_INSTRUCTION 30 #define 
EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 #define EXIT_REASON_INVALID_GUEST_STATE 33 #define EXIT_REASON_MSR_LOADING 34 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_MONITOR_TRAP_FLAG 37 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 #define EXIT_REASON_MCE_DURING_VMENTRY 41 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_EOI_INDUCED 45 #define EXIT_REASON_ACCESS_GDTR_OR_IDTR 46 #define EXIT_REASON_ACCESS_LDTR_OR_TR 47 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 #define EXIT_REASON_RDTSCP 51 #define EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED 52 #define EXIT_REASON_INVVPID 53 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 #define EXIT_REASON_INVPCID 58 /* * Interruption-information format */ #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ #define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ #define INTR_INFO_NMI_UNBLOCKED_BY_IRET 0x1000 /* 12 */ #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ #define INTR_INFO_RESVD_BITS_MASK 0x7ffff000 /* * Exit Qualifications for MOV for Control Register Access */ /* 3:0 - control register number (CRn) */ #define VMX_CONTROL_REG_ACCESS_NUM(eq) ((eq) & 0xf) /* 5:4 - access type (CR write, CR read, CLTS, LMSW) */ #define VMX_CONTROL_REG_ACCESS_TYPE(eq) (((eq) >> 4) & 0x3) # define VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR 0 # define VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR 1 # define VMX_CONTROL_REG_ACCESS_TYPE_CLTS 2 # define VMX_CONTROL_REG_ACCESS_TYPE_LMSW 3 /* 10:8 - general purpose register operand */ #define VMX_CONTROL_REG_ACCESS_GPR(eq) (((eq) >> 8) & 0xf) /* * Access Rights */ #define X86_SEG_AR_SEG_TYPE 0xf /* 3:0, segment type */ #define X86_SEG_AR_DESC_TYPE (1u << 4) /* 4, descriptor type */ #define X86_SEG_AR_DPL 0x60 /* 6:5, descriptor privilege level */ #define X86_SEG_AR_SEG_PRESENT (1u << 7) /* 7, segment present */ #define X86_SEG_AR_AVL (1u << 12) /* 12, available for system software */ #define X86_SEG_AR_CS_LM_ACTIVE (1u << 13) /* 13, long mode active (CS only) */ #define X86_SEG_AR_DEF_OP_SIZE (1u << 14) /* 14, default operation size */ #define X86_SEG_AR_GRANULARITY (1u << 15) /* 15, granularity */ #define X86_SEG_AR_SEG_UNUSABLE (1u << 16) /* 16, segment unusable */ #define VMCALL_OPCODE ".byte 0x0f,0x01,0xc1\n" #define VMCLEAR_OPCODE ".byte 0x66,0x0f,0xc7\n" /* reg/opcode: /6 */ #define VMLAUNCH_OPCODE ".byte 0x0f,0x01,0xc2\n" #define VMPTRLD_OPCODE ".byte 0x0f,0xc7\n" /* reg/opcode: /6 */ #define VMPTRST_OPCODE ".byte 0x0f,0xc7\n" /* reg/opcode: /7 */ #define VMREAD_OPCODE ".byte 0x0f,0x78\n" #define VMRESUME_OPCODE ".byte 0x0f,0x01,0xc3\n" #define VMWRITE_OPCODE ".byte 0x0f,0x79\n" #define INVEPT_OPCODE ".byte 0x66,0x0f,0x38,0x80\n" /* m128,r64/32 */ #define INVVPID_OPCODE ".byte 0x66,0x0f,0x38,0x81\n" /* m128,r64/32 */ #define VMXOFF_OPCODE ".byte 0x0f,0x01,0xc4\n" #define VMXON_OPCODE ".byte 0xf3,0x0f,0xc7\n" #define MODRM_EAX_08 ".byte 0x08\n" /* ECX, [EAX] */ #define MODRM_EAX_06 ".byte 0x30\n" /* [EAX], with reg/opcode: /6 */ #define MODRM_EAX_07 ".byte 0x38\n" /* [EAX], with reg/opcode: /7 */ #define MODRM_EAX_ECX ".byte 0xc1\n" /* EAX, ECX */ extern u64 vmx_ept_vpid_cap; extern uint8_t posted_intr_vector; #define cpu_has_vmx_ept_exec_only_supported \ (vmx_ept_vpid_cap & VMX_EPT_EXEC_ONLY_SUPPORTED) #define cpu_has_vmx_ept_wl4_supported \ (vmx_ept_vpid_cap & 
VMX_EPT_WALK_LENGTH_4_SUPPORTED) #define cpu_has_vmx_ept_mt_uc \ (vmx_ept_vpid_cap & VMX_EPT_MEMORY_TYPE_UC) #define cpu_has_vmx_ept_mt_wb \ (vmx_ept_vpid_cap & VMX_EPT_MEMORY_TYPE_WB) #define cpu_has_vmx_ept_1gb \ (vmx_ept_vpid_cap & VMX_EPT_SUPERPAGE_1GB) #define cpu_has_vmx_ept_2mb \ (vmx_ept_vpid_cap & VMX_EPT_SUPERPAGE_2MB) #define cpu_has_vmx_ept_invept_single_context \ (vmx_ept_vpid_cap & VMX_EPT_INVEPT_SINGLE_CONTEXT) #define EPT_2MB_SHIFT 16 #define EPT_1GB_SHIFT 17 #define ept_has_2mb(c) ((c >> EPT_2MB_SHIFT) & 1) #define ept_has_1gb(c) ((c >> EPT_1GB_SHIFT) & 1) #define INVEPT_SINGLE_CONTEXT 1 #define INVEPT_ALL_CONTEXT 2 #define cpu_has_vmx_vpid_invvpid_individual_addr \ (vmx_ept_vpid_cap & VMX_VPID_INVVPID_INDIVIDUAL_ADDR) #define cpu_has_vmx_vpid_invvpid_single_context \ (vmx_ept_vpid_cap & VMX_VPID_INVVPID_SINGLE_CONTEXT) #define cpu_has_vmx_vpid_invvpid_single_context_retaining_global \ (vmx_ept_vpid_cap & VMX_VPID_INVVPID_SINGLE_CONTEXT_RETAINING_GLOBAL) #define INVVPID_INDIVIDUAL_ADDR 0 #define INVVPID_SINGLE_CONTEXT 1 #define INVVPID_ALL_CONTEXT 2 #define INVVPID_SINGLE_CONTEXT_RETAINING_GLOBAL 3 static inline void __vmptrld(u64 addr) { asm volatile ( #ifdef HAVE_GAS_VMX "vmptrld %0\n" #else VMPTRLD_OPCODE MODRM_EAX_06 #endif /* CF==1 or ZF==1 --> crash (ud2) */ UNLIKELY_START(be, vmptrld) "\tud2\n" UNLIKELY_END_SECTION : #ifdef HAVE_GAS_VMX : "m" (addr) #else : "a" (&addr) #endif : "memory"); } static inline void __vmpclear(u64 addr) { asm volatile ( #ifdef HAVE_GAS_VMX "vmclear %0\n" #else VMCLEAR_OPCODE MODRM_EAX_06 #endif /* CF==1 or ZF==1 --> crash (ud2) */ UNLIKELY_START(be, vmclear) "\tud2\n" UNLIKELY_END_SECTION : #ifdef HAVE_GAS_VMX : "m" (addr) #else : "a" (&addr) #endif : "memory"); } static inline void __vmread(unsigned long field, unsigned long *value) { asm volatile ( #ifdef HAVE_GAS_VMX "vmread %1, %0\n\t" #else VMREAD_OPCODE MODRM_EAX_ECX #endif /* CF==1 or ZF==1 --> crash (ud2) */ UNLIKELY_START(be, vmread) "\tud2\n" UNLIKELY_END_SECTION #ifdef HAVE_GAS_VMX : "=rm" (*value) : "r" (field)); #else : "=c" (*value) : "a" (field)); #endif } static inline void __vmwrite(unsigned long field, unsigned long value) { asm volatile ( #ifdef HAVE_GAS_VMX "vmwrite %1, %0\n" #else VMWRITE_OPCODE MODRM_EAX_ECX #endif /* CF==1 or ZF==1 --> crash (ud2) */ UNLIKELY_START(be, vmwrite) "\tud2\n" UNLIKELY_END_SECTION : #ifdef HAVE_GAS_VMX : "r" (field) , "rm" (value)); #else : "a" (field) , "c" (value)); #endif } static inline bool_t __vmread_safe(unsigned long field, unsigned long *value) { bool_t okay; asm volatile ( #ifdef HAVE_GAS_VMX "vmread %2, %1\n\t" #else VMREAD_OPCODE MODRM_EAX_ECX #endif /* CF==1 or ZF==1 --> rc = 0 */ "setnbe %0" #ifdef HAVE_GAS_VMX : "=qm" (okay), "=rm" (*value) : "r" (field)); #else : "=qm" (okay), "=c" (*value) : "a" (field)); #endif return okay; } static inline void __invept(unsigned long type, u64 eptp, u64 gpa) { struct { u64 eptp, gpa; } operand = {eptp, gpa}; /* * If single context invalidation is not supported, we escalate to * use all context invalidation. 
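 * (INVEPT_ALL_CONTEXT flushes guest-physical mappings for every EPTP, so it
 *  is always a safe superset of the single-context flush.)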
*/ if ( (type == INVEPT_SINGLE_CONTEXT) && !cpu_has_vmx_ept_invept_single_context ) type = INVEPT_ALL_CONTEXT; asm volatile ( #ifdef HAVE_GAS_EPT "invept %0, %1\n" #else INVEPT_OPCODE MODRM_EAX_08 #endif /* CF==1 or ZF==1 --> crash (ud2) */ UNLIKELY_START(be, invept) "\tud2\n" UNLIKELY_END_SECTION : #ifdef HAVE_GAS_EPT : "m" (operand), "r" (type) #else : "a" (&operand), "c" (type) #endif : "memory" ); } static inline void __invvpid(unsigned long type, u16 vpid, u64 gva) { struct { u64 vpid:16; u64 rsvd:48; u64 gva; } __attribute__ ((packed)) operand = {vpid, 0, gva}; /* Fix up #UD exceptions which occur when TLBs are flushed before VMXON. */ asm volatile ( "1: " #ifdef HAVE_GAS_EPT "invvpid %0, %1\n" #else INVVPID_OPCODE MODRM_EAX_08 #endif /* CF==1 or ZF==1 --> crash (ud2) */ UNLIKELY_START(be, invvpid) "\tud2\n" UNLIKELY_END_SECTION "\n" "2:" _ASM_EXTABLE(1b, 2b) : #ifdef HAVE_GAS_EPT : "m" (operand), "r" (type) #else : "a" (&operand), "c" (type) #endif : "memory" ); } static inline void ept_sync_all(void) { __invept(INVEPT_ALL_CONTEXT, 0, 0); } void ept_sync_domain(struct p2m_domain *p2m); static inline void vpid_sync_vcpu_gva(struct vcpu *v, unsigned long gva) { int type = INVVPID_INDIVIDUAL_ADDR; /* * If individual address invalidation is not supported, we escalate to * use single context invalidation. */ if ( likely(cpu_has_vmx_vpid_invvpid_individual_addr) ) goto execute_invvpid; type = INVVPID_SINGLE_CONTEXT; /* * If single context invalidation is not supported, we escalate to * use all context invalidation. */ if ( !cpu_has_vmx_vpid_invvpid_single_context ) type = INVVPID_ALL_CONTEXT; execute_invvpid: __invvpid(type, v->arch.hvm_vcpu.n1asid.asid, (u64)gva); } static inline void vpid_sync_all(void) { __invvpid(INVVPID_ALL_CONTEXT, 0, 0); } static inline void __vmxoff(void) { asm volatile ( VMXOFF_OPCODE : : : "memory" ); } static inline int __vmxon(u64 addr) { int rc; asm volatile ( "1: " VMXON_OPCODE MODRM_EAX_06 "\n" " setna %b0 ; neg %0\n" /* CF==1 or ZF==1 --> rc = -1 */ "2:\n" ".section .fixup,\"ax\"\n" "3: sub $2,%0 ; jmp 2b\n" /* #UD or #GP --> rc = -2 */ ".previous\n" _ASM_EXTABLE(1b, 3b) : "=q" (rc) : "0" (0), "a" (&addr) : "memory"); return rc; } void vmx_get_segment_register(struct vcpu *, enum x86_segment, struct segment_register *); void vmx_inject_extint(int trap, uint8_t source); void vmx_inject_nmi(void); int ept_p2m_init(struct p2m_domain *p2m); void ept_p2m_uninit(struct p2m_domain *p2m); void ept_walk_table(struct domain *d, unsigned long gfn); void setup_ept_dump(void); void update_guest_eip(void); int alloc_p2m_hap_data(struct p2m_domain *p2m); void free_p2m_hap_data(struct p2m_domain *p2m); void p2m_init_hap_data(struct p2m_domain *p2m); /* EPT violation qualifications definitions */ #define _EPT_READ_VIOLATION 0 #define EPT_READ_VIOLATION (1UL<<_EPT_READ_VIOLATION) #define _EPT_WRITE_VIOLATION 1 #define EPT_WRITE_VIOLATION (1UL<<_EPT_WRITE_VIOLATION) #define _EPT_EXEC_VIOLATION 2 #define EPT_EXEC_VIOLATION (1UL<<_EPT_EXEC_VIOLATION) #define _EPT_EFFECTIVE_READ 3 #define EPT_EFFECTIVE_READ (1UL<<_EPT_EFFECTIVE_READ) #define _EPT_EFFECTIVE_WRITE 4 #define EPT_EFFECTIVE_WRITE (1UL<<_EPT_EFFECTIVE_WRITE) #define _EPT_EFFECTIVE_EXEC 5 #define EPT_EFFECTIVE_EXEC (1UL<<_EPT_EFFECTIVE_EXEC) #define _EPT_GLA_VALID 7 #define EPT_GLA_VALID (1UL<<_EPT_GLA_VALID) #define _EPT_GLA_FAULT 8 #define EPT_GLA_FAULT (1UL<<_EPT_GLA_FAULT) #define EPT_L4_PAGETABLE_SHIFT 39 #define EPT_PAGETABLE_ENTRIES 512 #endif /* __ASM_X86_HVM_VMX_VMX_H__ */ 
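/*
 * Illustrative sketch, not part of the upstream vmx.h above: one way the EPT
 * violation exit-qualification bits defined at the end of that header could
 * be decoded. The helper names are hypothetical and exist only for this
 * example; they assume the EPT_*_VIOLATION, EPT_EFFECTIVE_* and EPT_GLA_VALID
 * masks exactly as given above.
 */
static inline bool_t ept_qual_is_access_violation(unsigned long qual)
{
    /* Bits 0-2 report which kind of access faulted: read, write or execute. */
    return !!(qual & (EPT_READ_VIOLATION | EPT_WRITE_VIOLATION |
                      EPT_EXEC_VIOLATION));
}

static inline bool_t ept_qual_is_write_denied(unsigned long qual)
{
    /* A write faulted and the translation's effective permissions lack W. */
    return (qual & EPT_WRITE_VIOLATION) && !(qual & EPT_EFFECTIVE_WRITE);
}

static inline bool_t ept_qual_gla_valid(unsigned long qual)
{
    /* Bit 7: the guest-linear-address field of the exit is meaningful. */
    return !!(qual & EPT_GLA_VALID);
}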
xen-4.4.0/xen/include/asm-x86/hvm/vmx/vmcs.h0000664000175000017500000005024212307313555016613 0ustar smbsmb/* * vmcs.h: VMCS related definitions * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #ifndef __ASM_X86_HVM_VMX_VMCS_H__ #define __ASM_X86_HVM_VMX_VMCS_H__ #include #include #include extern void vmcs_dump_vcpu(struct vcpu *v); extern void setup_vmcs_dump(void); extern int vmx_cpu_up_prepare(unsigned int cpu); extern void vmx_cpu_dead(unsigned int cpu); extern int vmx_cpu_up(void); extern void vmx_cpu_down(void); extern void vmx_save_host_msrs(void); struct vmcs_struct { u32 vmcs_revision_id; unsigned char data [0]; /* vmcs size is read from MSR */ }; struct vmx_msr_entry { u32 index; u32 mbz; u64 data; }; enum { VMX_INDEX_MSR_LSTAR = 0, VMX_INDEX_MSR_STAR, VMX_INDEX_MSR_SYSCALL_MASK, VMX_MSR_COUNT }; struct vmx_msr_state { unsigned long flags; unsigned long msrs[VMX_MSR_COUNT]; }; #define EPT_DEFAULT_MT MTRR_TYPE_WRBACK struct ept_data { union { struct { u64 ept_mt :3, ept_wl :3, rsvd :6, asr :52; }; u64 eptp; }; cpumask_var_t synced_mask; }; struct vmx_domain { unsigned long apic_access_mfn; }; struct pi_desc { DECLARE_BITMAP(pir, NR_VECTORS); u32 control; u32 rsvd[7]; } __attribute__ ((aligned (64))); #define ept_get_wl(ept) ((ept)->ept_wl) #define ept_get_asr(ept) ((ept)->asr) #define ept_get_eptp(ept) ((ept)->eptp) #define ept_get_synced_mask(ept) ((ept)->synced_mask) struct arch_vmx_struct { /* Virtual address of VMCS. */ struct vmcs_struct *vmcs; /* VMCS shadow machine address. */ paddr_t vmcs_shadow_maddr; /* Protects remote usage of VMCS (VMPTRLD/VMCLEAR). */ spinlock_t vmcs_lock; /* * Activation and launch status of this VMCS. * - Activated on a CPU by VMPTRLD. Deactivated by VMCLEAR. * - Launched on active CPU by VMLAUNCH when current VMCS. */ struct list_head active_list; int active_cpu; int launched; /* Cache of cpu execution control. */ u32 exec_control; u32 secondary_exec_control; u32 exception_bitmap; struct vmx_msr_state msr_state; unsigned long shadow_gs; unsigned long cstar; unsigned long *msr_bitmap; unsigned int msr_count; struct vmx_msr_entry *msr_area; unsigned int host_msr_count; struct vmx_msr_entry *host_msr_area; unsigned long eoi_exitmap_changed; DECLARE_BITMAP(eoi_exit_bitmap, NR_VECTORS); struct pi_desc pi_desc; unsigned long host_cr0; /* Is the guest in real mode? */ uint8_t vmx_realmode; /* Are we emulating rather than VMENTERing? 
*/ uint8_t vmx_emulate; /* Bitmask of segments that we can't safely use in virtual 8086 mode */ uint16_t vm86_segment_mask; /* Shadow CS, SS, DS, ES, FS, GS, TR while in virtual 8086 mode */ struct segment_register vm86_saved_seg[x86_seg_tr + 1]; /* Remember EFLAGS while in virtual 8086 mode */ uint32_t vm86_saved_eflags; int hostenv_migrated; /* Bitmap to control vmexit policy for Non-root VMREAD/VMWRITE */ struct page_info *vmread_bitmap; struct page_info *vmwrite_bitmap; }; int vmx_create_vmcs(struct vcpu *v); void vmx_destroy_vmcs(struct vcpu *v); void vmx_vmcs_enter(struct vcpu *v); bool_t __must_check vmx_vmcs_try_enter(struct vcpu *v); void vmx_vmcs_exit(struct vcpu *v); #define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004 #define CPU_BASED_USE_TSC_OFFSETING 0x00000008 #define CPU_BASED_HLT_EXITING 0x00000080 #define CPU_BASED_INVLPG_EXITING 0x00000200 #define CPU_BASED_MWAIT_EXITING 0x00000400 #define CPU_BASED_RDPMC_EXITING 0x00000800 #define CPU_BASED_RDTSC_EXITING 0x00001000 #define CPU_BASED_CR3_LOAD_EXITING 0x00008000 #define CPU_BASED_CR3_STORE_EXITING 0x00010000 #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 #define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000 #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_ACTIVATE_IO_BITMAP 0x02000000 #define CPU_BASED_MONITOR_TRAP_FLAG 0x08000000 #define CPU_BASED_ACTIVATE_MSR_BITMAP 0x10000000 #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 extern u32 vmx_cpu_based_exec_control; #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 #define PIN_BASED_PREEMPT_TIMER 0x00000040 #define PIN_BASED_POSTED_INTERRUPT 0x00000080 extern u32 vmx_pin_based_exec_control; #define VM_EXIT_SAVE_DEBUG_CNTRLS 0x00000004 #define VM_EXIT_IA32E_MODE 0x00000200 #define VM_EXIT_LOAD_PERF_GLOBAL_CTRL 0x00001000 #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 #define VM_EXIT_SAVE_GUEST_PAT 0x00040000 #define VM_EXIT_LOAD_HOST_PAT 0x00080000 #define VM_EXIT_SAVE_GUEST_EFER 0x00100000 #define VM_EXIT_LOAD_HOST_EFER 0x00200000 #define VM_EXIT_SAVE_PREEMPT_TIMER 0x00400000 extern u32 vmx_vmexit_control; #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 #define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL 0x00002000 #define VM_ENTRY_LOAD_GUEST_PAT 0x00004000 #define VM_ENTRY_LOAD_GUEST_EFER 0x00008000 extern u32 vmx_vmentry_control; #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING 0x00000004 #define SECONDARY_EXEC_ENABLE_RDTSCP 0x00000008 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_ENABLE_VMCS_SHADOWING 0x00004000 extern u32 vmx_secondary_exec_control; #define VMX_EPT_EXEC_ONLY_SUPPORTED 0x00000001 #define VMX_EPT_WALK_LENGTH_4_SUPPORTED 0x00000040 #define VMX_EPT_MEMORY_TYPE_UC 0x00000100 #define VMX_EPT_MEMORY_TYPE_WB 
0x00004000 #define VMX_EPT_SUPERPAGE_2MB 0x00010000 #define VMX_EPT_SUPERPAGE_1GB 0x00020000 #define VMX_EPT_INVEPT_INSTRUCTION 0x00100000 #define VMX_EPT_INVEPT_SINGLE_CONTEXT 0x02000000 #define VMX_EPT_INVEPT_ALL_CONTEXT 0x04000000 #define VMX_MISC_VMWRITE_ALL 0x20000000 #define VMX_VPID_INVVPID_INSTRUCTION 0x100000000ULL #define VMX_VPID_INVVPID_INDIVIDUAL_ADDR 0x10000000000ULL #define VMX_VPID_INVVPID_SINGLE_CONTEXT 0x20000000000ULL #define VMX_VPID_INVVPID_ALL_CONTEXT 0x40000000000ULL #define VMX_VPID_INVVPID_SINGLE_CONTEXT_RETAINING_GLOBAL 0x80000000000ULL #define VMX_MISC_CR3_TARGET 0x1ff0000 #define cpu_has_wbinvd_exiting \ (vmx_secondary_exec_control & SECONDARY_EXEC_WBINVD_EXITING) #define cpu_has_vmx_virtualize_apic_accesses \ (vmx_secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) #define cpu_has_vmx_tpr_shadow \ (vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) #define cpu_has_vmx_vnmi \ (vmx_pin_based_exec_control & PIN_BASED_VIRTUAL_NMIS) #define cpu_has_vmx_msr_bitmap \ (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_MSR_BITMAP) #define cpu_has_vmx_secondary_exec_control \ (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) #define cpu_has_vmx_ept \ (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) #define cpu_has_vmx_vpid \ (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) #define cpu_has_monitor_trap_flag \ (vmx_cpu_based_exec_control & CPU_BASED_MONITOR_TRAP_FLAG) #define cpu_has_vmx_pat \ (vmx_vmentry_control & VM_ENTRY_LOAD_GUEST_PAT) #define cpu_has_vmx_unrestricted_guest \ (vmx_secondary_exec_control & SECONDARY_EXEC_UNRESTRICTED_GUEST) #define vmx_unrestricted_guest(v) \ ((v)->arch.hvm_vmx.secondary_exec_control & \ SECONDARY_EXEC_UNRESTRICTED_GUEST) #define cpu_has_vmx_ple \ (vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) #define cpu_has_vmx_apic_reg_virt \ (vmx_secondary_exec_control & SECONDARY_EXEC_APIC_REGISTER_VIRT) #define cpu_has_vmx_virtual_intr_delivery \ (vmx_secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) #define cpu_has_vmx_virtualize_x2apic_mode \ (vmx_secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE) #define cpu_has_vmx_posted_intr_processing \ (vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT) #define cpu_has_vmx_vmcs_shadowing \ (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VMCS_SHADOWING) #define VMCS_RID_TYPE_MASK 0x80000000 /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define VMX_INTR_SHADOW_STI 0x00000001 #define VMX_INTR_SHADOW_MOV_SS 0x00000002 #define VMX_INTR_SHADOW_SMI 0x00000004 #define VMX_INTR_SHADOW_NMI 0x00000008 #define VMX_BASIC_REVISION_MASK 0x7fffffff #define VMX_BASIC_VMCS_SIZE_MASK (0x1fffULL << 32) #define VMX_BASIC_32BIT_ADDRESSES (1ULL << 48) #define VMX_BASIC_DUAL_MONITOR (1ULL << 49) #define VMX_BASIC_MEMORY_TYPE_MASK (0xfULL << 50) #define VMX_BASIC_INS_OUT_INFO (1ULL << 54) /* * bit 55 of IA32_VMX_BASIC MSR, indicating whether any VMX controls that * default to 1 may be cleared to 0. */ #define VMX_BASIC_DEFAULT1_ZERO (1ULL << 55) extern u64 vmx_basic_msr; #define cpu_has_vmx_ins_outs_instr_info \ (!!(vmx_basic_msr & VMX_BASIC_INS_OUT_INFO)) /* Guest interrupt status */ #define VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK 0x0FF #define VMX_GUEST_INTR_STATUS_SVI_OFFSET 8 /* VMCS field encodings. 
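 * Each 16-bit encoding below follows the layout defined by the Intel SDM:
 *   bit      0 - access type (0 = full, 1 = high half of a 64-bit field)
 *   bits   9:1 - index within the field group
 *   bits 11:10 - field type (0 control, 1 exit info, 2 guest state, 3 host state)
 *   bits 14:13 - width (0 16-bit, 1 64-bit, 2 32-bit, 3 natural width)
 * For example GUEST_RIP (0x681e) is a natural-width guest-state field with
 * index 15 and full access.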
*/ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, POSTED_INTR_NOTIFICATION_VECTOR = 0x00000002, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, GUEST_DS_SELECTOR = 0x00000806, GUEST_FS_SELECTOR = 0x00000808, GUEST_GS_SELECTOR = 0x0000080a, GUEST_LDTR_SELECTOR = 0x0000080c, GUEST_TR_SELECTOR = 0x0000080e, GUEST_INTR_STATUS = 0x00000810, HOST_ES_SELECTOR = 0x00000c00, HOST_CS_SELECTOR = 0x00000c02, HOST_SS_SELECTOR = 0x00000c04, HOST_DS_SELECTOR = 0x00000c06, HOST_FS_SELECTOR = 0x00000c08, HOST_GS_SELECTOR = 0x00000c0a, HOST_TR_SELECTOR = 0x00000c0c, IO_BITMAP_A = 0x00002000, IO_BITMAP_A_HIGH = 0x00002001, IO_BITMAP_B = 0x00002002, IO_BITMAP_B_HIGH = 0x00002003, MSR_BITMAP = 0x00002004, MSR_BITMAP_HIGH = 0x00002005, VM_EXIT_MSR_STORE_ADDR = 0x00002006, VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007, VM_EXIT_MSR_LOAD_ADDR = 0x00002008, VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009, VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a, VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b, TSC_OFFSET = 0x00002010, TSC_OFFSET_HIGH = 0x00002011, VIRTUAL_APIC_PAGE_ADDR = 0x00002012, VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, APIC_ACCESS_ADDR_HIGH = 0x00002015, PI_DESC_ADDR = 0x00002016, PI_DESC_ADDR_HIGH = 0x00002017, EPT_POINTER = 0x0000201a, EPT_POINTER_HIGH = 0x0000201b, EOI_EXIT_BITMAP0 = 0x0000201c, #define EOI_EXIT_BITMAP(n) (EOI_EXIT_BITMAP0 + (n) * 2) /* n = 0...3 */ VMREAD_BITMAP = 0x00002026, VMREAD_BITMAP_HIGH = 0x00002027, VMWRITE_BITMAP = 0x00002028, VMWRITE_BITMAP_HIGH = 0x00002029, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, GUEST_PAT = 0x00002804, GUEST_PAT_HIGH = 0x00002805, GUEST_EFER = 0x00002806, GUEST_EFER_HIGH = 0x00002807, GUEST_PERF_GLOBAL_CTRL = 0x00002808, GUEST_PERF_GLOBAL_CTRL_HIGH = 0x00002809, GUEST_PDPTR0 = 0x0000280a, GUEST_PDPTR0_HIGH = 0x0000280b, GUEST_PDPTR1 = 0x0000280c, GUEST_PDPTR1_HIGH = 0x0000280d, GUEST_PDPTR2 = 0x0000280e, GUEST_PDPTR2_HIGH = 0x0000280f, GUEST_PDPTR3 = 0x00002810, GUEST_PDPTR3_HIGH = 0x00002811, HOST_PAT = 0x00002c00, HOST_PAT_HIGH = 0x00002c01, HOST_EFER = 0x00002c02, HOST_EFER_HIGH = 0x00002c03, HOST_PERF_GLOBAL_CTRL = 0x00002c04, HOST_PERF_GLOBAL_CTRL_HIGH = 0x00002c05, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, PAGE_FAULT_ERROR_CODE_MASK = 0x00004006, PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008, CR3_TARGET_COUNT = 0x0000400a, VM_EXIT_CONTROLS = 0x0000400c, VM_EXIT_MSR_STORE_COUNT = 0x0000400e, VM_EXIT_MSR_LOAD_COUNT = 0x00004010, VM_ENTRY_CONTROLS = 0x00004012, VM_ENTRY_MSR_LOAD_COUNT = 0x00004014, VM_ENTRY_INTR_INFO = 0x00004016, VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018, VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, TPR_THRESHOLD = 0x0000401c, SECONDARY_VM_EXEC_CONTROL = 0x0000401e, PLE_GAP = 0x00004020, PLE_WINDOW = 0x00004022, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, VM_EXIT_INTR_ERROR_CODE = 0x00004406, IDT_VECTORING_INFO = 0x00004408, IDT_VECTORING_ERROR_CODE = 0x0000440a, VM_EXIT_INSTRUCTION_LEN = 0x0000440c, VMX_INSTRUCTION_INFO = 0x0000440e, GUEST_ES_LIMIT = 0x00004800, GUEST_CS_LIMIT = 0x00004802, GUEST_SS_LIMIT = 0x00004804, GUEST_DS_LIMIT = 0x00004806, GUEST_FS_LIMIT = 0x00004808, GUEST_GS_LIMIT = 0x0000480a, GUEST_LDTR_LIMIT = 0x0000480c, GUEST_TR_LIMIT = 0x0000480e, GUEST_GDTR_LIMIT = 
0x00004810, GUEST_IDTR_LIMIT = 0x00004812, GUEST_ES_AR_BYTES = 0x00004814, GUEST_CS_AR_BYTES = 0x00004816, GUEST_SS_AR_BYTES = 0x00004818, GUEST_DS_AR_BYTES = 0x0000481a, GUEST_FS_AR_BYTES = 0x0000481c, GUEST_GS_AR_BYTES = 0x0000481e, GUEST_LDTR_AR_BYTES = 0x00004820, GUEST_TR_AR_BYTES = 0x00004822, GUEST_INTERRUPTIBILITY_INFO = 0x00004824, GUEST_ACTIVITY_STATE = 0x00004826, GUEST_SYSENTER_CS = 0x0000482A, GUEST_PREEMPTION_TIMER = 0x0000482e, HOST_SYSENTER_CS = 0x00004c00, CR0_GUEST_HOST_MASK = 0x00006000, CR4_GUEST_HOST_MASK = 0x00006002, CR0_READ_SHADOW = 0x00006004, CR4_READ_SHADOW = 0x00006006, CR3_TARGET_VALUE0 = 0x00006008, CR3_TARGET_VALUE1 = 0x0000600a, CR3_TARGET_VALUE2 = 0x0000600c, CR3_TARGET_VALUE3 = 0x0000600e, EXIT_QUALIFICATION = 0x00006400, GUEST_LINEAR_ADDRESS = 0x0000640a, GUEST_CR0 = 0x00006800, GUEST_CR3 = 0x00006802, GUEST_CR4 = 0x00006804, GUEST_ES_BASE = 0x00006806, GUEST_CS_BASE = 0x00006808, GUEST_SS_BASE = 0x0000680a, GUEST_DS_BASE = 0x0000680c, GUEST_FS_BASE = 0x0000680e, GUEST_GS_BASE = 0x00006810, GUEST_LDTR_BASE = 0x00006812, GUEST_TR_BASE = 0x00006814, GUEST_GDTR_BASE = 0x00006816, GUEST_IDTR_BASE = 0x00006818, GUEST_DR7 = 0x0000681a, GUEST_RSP = 0x0000681c, GUEST_RIP = 0x0000681e, GUEST_RFLAGS = 0x00006820, GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, GUEST_SYSENTER_ESP = 0x00006824, GUEST_SYSENTER_EIP = 0x00006826, HOST_CR0 = 0x00006c00, HOST_CR3 = 0x00006c02, HOST_CR4 = 0x00006c04, HOST_FS_BASE = 0x00006c06, HOST_GS_BASE = 0x00006c08, HOST_TR_BASE = 0x00006c0a, HOST_GDTR_BASE = 0x00006c0c, HOST_IDTR_BASE = 0x00006c0e, HOST_SYSENTER_ESP = 0x00006c10, HOST_SYSENTER_EIP = 0x00006c12, HOST_RSP = 0x00006c14, HOST_RIP = 0x00006c16, }; #define VMCS_VPID_WIDTH 16 #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr, int type); void vmx_enable_intercept_for_msr(struct vcpu *v, u32 msr, int type); int vmx_read_guest_msr(u32 msr, u64 *val); int vmx_write_guest_msr(u32 msr, u64 val); int vmx_add_guest_msr(u32 msr); int vmx_add_host_load_msr(u32 msr); void vmx_vmcs_switch(struct vmcs_struct *from, struct vmcs_struct *to); void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector); void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector); int vmx_check_msr_bitmap(unsigned long *msr_bitmap, u32 msr, int access_type); void virtual_vmcs_enter(void *vvmcs); void virtual_vmcs_exit(void *vvmcs); u64 virtual_vmcs_vmread(void *vvmcs, u32 vmcs_encoding); void virtual_vmcs_vmwrite(void *vvmcs, u32 vmcs_encoding, u64 val); #endif /* ASM_X86_HVM_VMX_VMCS_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/hvm/domain.h0000664000175000017500000000563512307313555016306 0ustar smbsmb/* * domain.h: HVM per domain definitions * * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005, International Business Machines Corporation * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
* * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef __ASM_X86_HVM_DOMAIN_H__ #define __ASM_X86_HVM_DOMAIN_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include struct hvm_ioreq_page { spinlock_t lock; struct page_info *page; void *va; }; struct hvm_domain { struct hvm_ioreq_page ioreq; struct hvm_ioreq_page buf_ioreq; struct pl_time pl_time; struct hvm_io_handler *io_handler; /* Lock protects access to irq, vpic and vioapic. */ spinlock_t irq_lock; struct hvm_irq irq; struct hvm_hw_vpic vpic[2]; /* 0=master; 1=slave */ struct hvm_vioapic *vioapic; struct hvm_hw_stdvga stdvga; /* VCPU which is current target for 8259 interrupts. */ struct vcpu *i8259_target; /* emulated irq to pirq */ struct radix_tree_root emuirq_pirq; uint64_t *params; /* Memory ranges with pinned cache attributes. */ struct list_head pinned_cacheattr_ranges; /* VRAM dirty support. */ struct sh_dirty_vram *dirty_vram; /* If one of vcpus of this domain is in no_fill_mode or * mtrr/pat between vcpus is not the same, set is_in_uc_mode */ spinlock_t uc_lock; bool_t is_in_uc_mode; /* Pass-through */ struct hvm_iommu hvm_iommu; /* hypervisor intercepted msix table */ struct list_head msixtbl_list; spinlock_t msixtbl_list_lock; struct viridian_domain viridian; bool_t hap_enabled; bool_t mem_sharing_enabled; bool_t qemu_mapcache_invalidate; bool_t is_s3_suspended; union { struct vmx_domain vmx; struct svm_domain svm; }; }; #define hap_enabled(d) ((d)->arch.hvm_domain.hap_enabled) #endif /* __ASM_X86_HVM_DOMAIN_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/support.h0000664000175000017500000001304312307313555016543 0ustar smbsmb/* * support.h: HVM support routines used by VT-x and SVM. * * Leendert van Doorn, leendert@watson.ibm.com * Copyright (c) 2005, International Business Machines Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef __ASM_X86_HVM_SUPPORT_H__ #define __ASM_X86_HVM_SUPPORT_H__ #include #include #include #include #include static inline ioreq_t *get_ioreq(struct vcpu *v) { struct domain *d = v->domain; shared_iopage_t *p = d->arch.hvm_domain.ioreq.va; ASSERT((v == current) || spin_is_locked(&d->arch.hvm_domain.ioreq.lock)); return p ? 
&p->vcpu_ioreq[v->vcpu_id] : NULL; } #define HVM_DELIVER_NO_ERROR_CODE -1 #ifndef NDEBUG #define DBG_LEVEL_0 (1 << 0) #define DBG_LEVEL_1 (1 << 1) #define DBG_LEVEL_2 (1 << 2) #define DBG_LEVEL_3 (1 << 3) #define DBG_LEVEL_IO (1 << 4) #define DBG_LEVEL_VMMU (1 << 5) #define DBG_LEVEL_VLAPIC (1 << 6) #define DBG_LEVEL_VLAPIC_TIMER (1 << 7) #define DBG_LEVEL_VLAPIC_INTERRUPT (1 << 8) #define DBG_LEVEL_IOAPIC (1 << 9) #define DBG_LEVEL_HCALL (1 << 10) #define DBG_LEVEL_MSR (1 << 11) extern unsigned int opt_hvm_debug_level; #define HVM_DBG_LOG(level, _f, _a...) \ do { \ if ( unlikely((level) & opt_hvm_debug_level) ) \ printk("[HVM:%d.%d] <%s> " _f "\n", \ current->domain->domain_id, current->vcpu_id, __func__, \ ## _a); \ } while (0) #else #define HVM_DBG_LOG(level, _f, _a...) do {} while (0) #endif extern unsigned long hvm_io_bitmap[]; enum hvm_copy_result { HVMCOPY_okay = 0, HVMCOPY_bad_gva_to_gfn, HVMCOPY_bad_gfn_to_mfn, HVMCOPY_unhandleable, HVMCOPY_gfn_paged_out, HVMCOPY_gfn_shared, }; /* * Copy to/from a guest physical address. * Returns HVMCOPY_okay, else HVMCOPY_bad_gfn_to_mfn if the given physical * address range does not map entirely onto ordinary machine memory. */ enum hvm_copy_result hvm_copy_to_guest_phys( paddr_t paddr, void *buf, int size); enum hvm_copy_result hvm_copy_from_guest_phys( void *buf, paddr_t paddr, int size); /* * Copy to/from a guest virtual address. @pfec should include PFEC_user_mode * if emulating a user-mode access (CPL=3). All other flags in @pfec are * managed by the called function: it is therefore optional for the caller * to set them. * * Returns: * HVMCOPY_okay: Copy was entirely successful. * HVMCOPY_bad_gfn_to_mfn: Some guest physical address did not map to * ordinary machine memory. * HVMCOPY_bad_gva_to_gfn: Some guest virtual address did not have a valid * mapping to a guest physical address. In this case * a page fault exception is automatically queued * for injection into the current HVM VCPU. */ enum hvm_copy_result hvm_copy_to_guest_virt( unsigned long vaddr, void *buf, int size, uint32_t pfec); enum hvm_copy_result hvm_copy_from_guest_virt( void *buf, unsigned long vaddr, int size, uint32_t pfec); enum hvm_copy_result hvm_fetch_from_guest_virt( void *buf, unsigned long vaddr, int size, uint32_t pfec); /* * As above (copy to/from a guest virtual address), but no fault is generated * when HVMCOPY_bad_gva_to_gfn is returned. */ enum hvm_copy_result hvm_copy_to_guest_virt_nofault( unsigned long vaddr, void *buf, int size, uint32_t pfec); enum hvm_copy_result hvm_copy_from_guest_virt_nofault( void *buf, unsigned long vaddr, int size, uint32_t pfec); enum hvm_copy_result hvm_fetch_from_guest_virt_nofault( void *buf, unsigned long vaddr, int size, uint32_t pfec); #define HVM_HCALL_completed 0 /* hypercall completed - no further action */ #define HVM_HCALL_preempted 1 /* hypercall preempted - re-execute VMCALL */ #define HVM_HCALL_invalidate 2 /* invalidate ioemu-dm memory cache */ int hvm_do_hypercall(struct cpu_user_regs *pregs); void hvm_hlt(unsigned long rflags); void hvm_triple_fault(void); void hvm_rdtsc_intercept(struct cpu_user_regs *regs); int __must_check hvm_handle_xsetbv(u32 index, u64 new_bv); void hvm_shadow_handle_cd(struct vcpu *v, unsigned long value); /* These functions all return X86EMUL return codes. 
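 * (X86EMUL_OKAY, X86EMUL_EXCEPTION, ...), so intercept handlers can pass the
 * result straight back to the emulation layer.  Illustrative caller sketch
 * (not taken verbatim from any one call site):
 *
 *     rc = hvm_set_cr0(value);
 *     if ( rc == X86EMUL_EXCEPTION )
 *         return rc;    // the helper has already injected a fault
 *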
*/ int hvm_set_efer(uint64_t value); int hvm_set_cr0(unsigned long value); int hvm_set_cr3(unsigned long value); int hvm_set_cr4(unsigned long value); int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content); int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content); int hvm_mov_to_cr(unsigned int cr, unsigned int gpr); int hvm_mov_from_cr(unsigned int cr, unsigned int gpr); #endif /* __ASM_X86_HVM_SUPPORT_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/irq.h0000664000175000017500000000657512307313555015636 0ustar smbsmb/****************************************************************************** * irq.h * * Interrupt distribution and delivery logic. * * Copyright (c) 2006, K A Fraser, XenSource Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef __ASM_X86_HVM_IRQ_H__ #define __ASM_X86_HVM_IRQ_H__ #include #include #include #include struct hvm_irq { /* * Virtual interrupt wires for a single PCI bus. * Indexed by: device*4 + INTx#. */ struct hvm_hw_pci_irqs pci_intx; /* * Virtual interrupt wires for ISA devices. * Indexed by ISA IRQ (assumes no ISA-device IRQ sharing). */ struct hvm_hw_isa_irqs isa_irq; /* * PCI-ISA interrupt router. * Each PCI is 'wire-ORed' into one of four links using * the traditional 'barber's pole' mapping ((device + INTx#) & 3). * The router provides a programmable mapping from each link to a GSI. */ struct hvm_hw_pci_link pci_link; /* Virtual interrupt and via-link for paravirtual platform driver. */ uint32_t callback_via_asserted; union { enum { HVMIRQ_callback_none, HVMIRQ_callback_gsi, HVMIRQ_callback_pci_intx, HVMIRQ_callback_vector } callback_via_type; }; union { uint32_t gsi; struct { uint8_t dev, intx; } pci; uint32_t vector; } callback_via; /* Number of INTx wires asserting each PCI-ISA link. */ u8 pci_link_assert_count[4]; /* * Number of wires asserting each GSI. * * GSIs 0-15 are the ISA IRQs. ISA devices map directly into this space * except ISA IRQ 0, which is connected to GSI 2. * PCI links map into this space via the PCI-ISA bridge. * * GSIs 16+ are used only be PCI devices. The mapping from PCI device to * GSI is as follows: ((device*4 + device/8 + INTx#) & 31) + 16 */ u8 gsi_assert_count[VIOAPIC_NUM_PINS]; /* * GSIs map onto PIC/IO-APIC in the usual way: * 0-7: Master 8259 PIC, IO-APIC pins 0-7 * 8-15: Slave 8259 PIC, IO-APIC pins 8-15 * 16+ : IO-APIC pins 16+ */ /* Last VCPU that was delivered a LowestPrio interrupt. */ u8 round_robin_prev_vcpu; struct hvm_irq_dpci *dpci; }; #define hvm_pci_intx_gsi(dev, intx) \ (((((dev)<<2) + ((dev)>>3) + (intx)) & 31) + 16) #define hvm_pci_intx_link(dev, intx) \ (((dev) + (intx)) & 3) #define hvm_isa_irq_to_gsi(isa_irq) ((isa_irq) ? : 2) /* Check/Acknowledge next pending interrupt. 
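 * Delivery is a two step handshake: first query the highest priority pending
 * source, then acknowledge it once it is known that it can actually be
 * injected.  Typical caller pattern (sketch modelled on the VMX/SVM
 * interrupt assist paths):
 *
 *     struct hvm_intack intack = hvm_vcpu_has_pending_irq(v);
 *     if ( intack.source != hvm_intsrc_none &&
 *          hvm_interrupt_blocked(v, intack) == hvm_intblk_none )
 *         intack = hvm_vcpu_ack_pending_irq(v, intack);
 *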
*/ struct hvm_intack hvm_vcpu_has_pending_irq(struct vcpu *v); struct hvm_intack hvm_vcpu_ack_pending_irq(struct vcpu *v, struct hvm_intack intack); #endif /* __ASM_X86_HVM_IRQ_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/trace.h0000664000175000017500000001210212307313555016120 0ustar smbsmb#ifndef __ASM_X86_HVM_TRACE_H__ #define __ASM_X86_HVM_TRACE_H__ #include #define DEFAULT_HVM_TRACE_ON 1 #define DEFAULT_HVM_TRACE_OFF 0 #define DEFAULT_HVM_VMSWITCH DEFAULT_HVM_TRACE_ON #define DEFAULT_HVM_PF DEFAULT_HVM_TRACE_ON #define DEFAULT_HVM_INJECT DEFAULT_HVM_TRACE_ON #define DEFAULT_HVM_IO DEFAULT_HVM_TRACE_ON #define DEFAULT_HVM_REGACCESS DEFAULT_HVM_TRACE_ON #define DEFAULT_HVM_MISC DEFAULT_HVM_TRACE_ON #define DEFAULT_HVM_INTR DEFAULT_HVM_TRACE_ON #define DO_TRC_HVM_VMENTRY DEFAULT_HVM_VMSWITCH #define DO_TRC_HVM_VMEXIT DEFAULT_HVM_VMSWITCH #define DO_TRC_HVM_VMEXIT64 DEFAULT_HVM_VMSWITCH #define DO_TRC_HVM_PF_XEN DEFAULT_HVM_PF #define DO_TRC_HVM_PF_XEN64 DEFAULT_HVM_PF #define DO_TRC_HVM_PF_INJECT DEFAULT_HVM_PF #define DO_TRC_HVM_PF_INJECT64 DEFAULT_HVM_PF #define DO_TRC_HVM_INJ_EXC DEFAULT_HVM_INJECT #define DO_TRC_HVM_INJ_VIRQ DEFAULT_HVM_INJECT #define DO_TRC_HVM_REINJ_VIRQ DEFAULT_HVM_INJECT #define DO_TRC_HVM_INTR_WINDOW DEFAULT_HVM_INJECT #define DO_TRC_HVM_IO_READ DEFAULT_HVM_IO #define DO_TRC_HVM_IO_WRITE DEFAULT_HVM_IO #define DO_TRC_HVM_CR_READ DEFAULT_HVM_REGACCESS #define DO_TRC_HVM_CR_READ64 DEFAULT_HVM_REGACCESS #define DO_TRC_HVM_CR_WRITE DEFAULT_HVM_REGACCESS #define DO_TRC_HVM_CR_WRITE64 DEFAULT_HVM_REGACCESS #define DO_TRC_HVM_DR_READ DEFAULT_HVM_REGACCESS #define DO_TRC_HVM_DR_WRITE DEFAULT_HVM_REGACCESS #define DO_TRC_HVM_MSR_READ DEFAULT_HVM_REGACCESS #define DO_TRC_HVM_MSR_WRITE DEFAULT_HVM_REGACCESS #define DO_TRC_HVM_RDTSC DEFAULT_HVM_REGACCESS #define DO_TRC_HVM_CPUID DEFAULT_HVM_MISC #define DO_TRC_HVM_INTR DEFAULT_HVM_INTR #define DO_TRC_HVM_NMI DEFAULT_HVM_INTR #define DO_TRC_HVM_MCE DEFAULT_HVM_INTR #define DO_TRC_HVM_SMI DEFAULT_HVM_INTR #define DO_TRC_HVM_VMMCALL DEFAULT_HVM_MISC #define DO_TRC_HVM_HLT DEFAULT_HVM_MISC #define DO_TRC_HVM_INVLPG DEFAULT_HVM_MISC #define DO_TRC_HVM_INVLPG64 DEFAULT_HVM_MISC #define DO_TRC_HVM_IO_ASSIST DEFAULT_HVM_MISC #define DO_TRC_HVM_MMIO_ASSIST DEFAULT_HVM_MISC #define DO_TRC_HVM_CLTS DEFAULT_HVM_MISC #define DO_TRC_HVM_LMSW DEFAULT_HVM_MISC #define DO_TRC_HVM_LMSW64 DEFAULT_HVM_MISC #define DO_TRC_HVM_REALMODE_EMULATE DEFAULT_HVM_MISC #define DO_TRC_HVM_TRAP DEFAULT_HVM_MISC #define DO_TRC_HVM_TRAP_DEBUG DEFAULT_HVM_MISC #define DO_TRC_HVM_VLAPIC DEFAULT_HVM_MISC #define TRC_PAR_LONG(par) ((par)&0xFFFFFFFF),((par)>>32) #define HVMTRACE_ND(evt, modifier, cycles, count, d1, d2, d3, d4, d5, d6) \ do { \ if ( unlikely(tb_init_done) && DO_TRC_HVM_ ## evt ) \ { \ struct { \ u32 d[6]; \ } _d; \ _d.d[0]=(d1); \ _d.d[1]=(d2); \ _d.d[2]=(d3); \ _d.d[3]=(d4); \ _d.d[4]=(d5); \ _d.d[5]=(d6); \ __trace_var(TRC_HVM_ ## evt | (modifier), cycles, \ sizeof(*_d.d) * count, &_d); \ } \ } while(0) #define HVMTRACE_6D(evt, d1, d2, d3, d4, d5, d6) \ HVMTRACE_ND(evt, 0, 0, 6, d1, d2, d3, d4, d5, d6) #define HVMTRACE_5D(evt, d1, d2, d3, d4, d5) \ HVMTRACE_ND(evt, 0, 0, 5, d1, d2, d3, d4, d5, 0) #define HVMTRACE_4D(evt, d1, d2, d3, d4) \ HVMTRACE_ND(evt, 0, 0, 4, d1, d2, d3, d4, 0, 0) #define HVMTRACE_3D(evt, d1, d2, d3) \ HVMTRACE_ND(evt, 0, 0, 3, d1, d2, d3, 0, 0, 0) #define HVMTRACE_2D(evt, d1, d2) \ HVMTRACE_ND(evt, 0, 0, 2, d1, d2, 0, 0, 0, 0) #define HVMTRACE_1D(evt, d1) \ HVMTRACE_ND(evt, 0, 0, 1, d1, 0, 0, 0, 0, 0) #define HVMTRACE_0D(evt) 
\ HVMTRACE_ND(evt, 0, 0, 0, 0, 0, 0, 0, 0, 0) #define HVMTRACE_LONG_1D(evt, d1) \ HVMTRACE_2D(evt ## 64, (d1) & 0xFFFFFFFF, (d1) >> 32) #define HVMTRACE_LONG_2D(evt, d1, d2, ...) \ HVMTRACE_3D(evt ## 64, d1, d2) #define HVMTRACE_LONG_3D(evt, d1, d2, d3, ...) \ HVMTRACE_4D(evt ## 64, d1, d2, d3) #define HVMTRACE_LONG_4D(evt, d1, d2, d3, d4, ...) \ HVMTRACE_5D(evt ## 64, d1, d2, d3, d4) #endif /* __ASM_X86_HVM_TRACE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/hvm/asid.h0000664000175000017500000000325512307313555015753 0ustar smbsmb/* * asid.h: ASID management * Copyright (c) 2007, Advanced Micro Devices, Inc. * Copyright (c) 2009, Citrix Systems, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef __ASM_X86_HVM_ASID_H__ #define __ASM_X86_HVM_ASID_H__ #include struct vcpu; struct hvm_vcpu_asid; /* Initialise ASID management for the current physical CPU. */ void hvm_asid_init(int nasids); /* Invalidate a particular ASID allocation: forces re-allocation. */ void hvm_asid_flush_vcpu_asid(struct hvm_vcpu_asid *asid); /* Invalidate all ASID allocations for specified VCPU: forces re-allocation. */ void hvm_asid_flush_vcpu(struct vcpu *v); /* Flush all ASIDs on this processor core. */ void hvm_asid_flush_core(void); /* Called before entry to guest context. Checks ASID allocation, returns a * boolean indicating whether all ASIDs must be flushed. */ bool_t hvm_asid_handle_vmenter(struct hvm_vcpu_asid *asid); #endif /* __ASM_X86_HVM_ASID_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/hvm/viridian.h0000664000175000017500000000254512307313555016641 0ustar smbsmb/***************************************************************************** * * include/xen/viridian.h * * Copyright (c) 2008 Citrix Corp. 
* */ #ifndef __ASM_X86_HVM_VIRIDIAN_H__ #define __ASM_X86_HVM_VIRIDIAN_H__ union viridian_apic_assist { uint64_t raw; struct { uint64_t enabled:1; uint64_t reserved_preserved:11; uint64_t pfn:48; } fields; }; struct viridian_vcpu { union viridian_apic_assist apic_assist; }; union viridian_guest_os_id { uint64_t raw; struct { uint64_t build_number:16; uint64_t service_pack:8; uint64_t minor:8; uint64_t major:8; uint64_t os:8; uint64_t vendor:16; } fields; }; union viridian_hypercall_gpa { uint64_t raw; struct { uint64_t enabled:1; uint64_t reserved_preserved:11; uint64_t pfn:48; } fields; }; struct viridian_domain { union viridian_guest_os_id guest_os_id; union viridian_hypercall_gpa hypercall_gpa; }; int cpuid_viridian_leaves( unsigned int leaf, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); int wrmsr_viridian_regs( uint32_t idx, uint64_t val); int rdmsr_viridian_regs( uint32_t idx, uint64_t *val); int viridian_hypercall(struct cpu_user_regs *regs); #endif /* __ASM_X86_HVM_VIRIDIAN_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/vpic.h0000664000175000017500000000317512307313555015775 0ustar smbsmb/* * i8259 interrupt controller emulation * * Copyright (c) 2003 Fabrice Bellard * Copyright (c) 2005 Intel Corp * Copyright (c) 2006 Keir Fraser, XenSource Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
*/ #ifndef __ASM_X86_HVM_VPIC_H__ #define __ASM_X86_HVM_VPIC_H__ #include void vpic_irq_positive_edge(struct domain *d, int irq); void vpic_irq_negative_edge(struct domain *d, int irq); void vpic_init(struct domain *d); void vpic_reset(struct domain *d); int vpic_ack_pending_irq(struct vcpu *v); int is_periodic_irq(struct vcpu *v, int irq, int type); #endif /* __ASM_X86_HVM_VPIC_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/iommu.h0000664000175000017500000000152612307313555016160 0ustar smbsmb#ifndef __ASM_X86_HVM_IOMMU_H__ #define __ASM_X86_HVM_IOMMU_H__ #include struct iommu_ops; extern const struct iommu_ops intel_iommu_ops; extern const struct iommu_ops amd_iommu_ops; extern int intel_vtd_setup(void); extern int amd_iov_detect(void); static inline const struct iommu_ops *iommu_get_ops(void) { switch ( boot_cpu_data.x86_vendor ) { case X86_VENDOR_INTEL: return &intel_iommu_ops; case X86_VENDOR_AMD: return &amd_iommu_ops; default: BUG(); } return NULL; } static inline int iommu_hardware_setup(void) { switch ( boot_cpu_data.x86_vendor ) { case X86_VENDOR_INTEL: return intel_vtd_setup(); case X86_VENDOR_AMD: return amd_iov_detect(); default: return -ENODEV; } return 0; } #endif /* __ASM_X86_HVM_IOMMU_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/io.h0000664000175000017500000001067412307313555015445 0ustar smbsmb/* * io.h: HVM IO support * * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
*/ #ifndef __ASM_X86_HVM_IO_H__ #define __ASM_X86_HVM_IO_H__ #include #include #include #include #define MAX_IO_HANDLER 16 #define HVM_PORTIO 0 #define HVM_BUFFERED_IO 2 typedef int (*hvm_mmio_read_t)(struct vcpu *v, unsigned long addr, unsigned long length, unsigned long *val); typedef int (*hvm_mmio_write_t)(struct vcpu *v, unsigned long addr, unsigned long length, unsigned long val); typedef int (*hvm_mmio_check_t)(struct vcpu *v, unsigned long addr); typedef int (*portio_action_t)( int dir, uint32_t port, uint32_t bytes, uint32_t *val); typedef int (*mmio_action_t)(ioreq_t *); struct io_handler { int type; unsigned long addr; unsigned long size; union { portio_action_t portio; mmio_action_t mmio; void *ptr; } action; }; struct hvm_io_handler { int num_slot; struct io_handler hdl_list[MAX_IO_HANDLER]; }; struct hvm_mmio_handler { hvm_mmio_check_t check_handler; hvm_mmio_read_t read_handler; hvm_mmio_write_t write_handler; }; extern const struct hvm_mmio_handler hpet_mmio_handler; extern const struct hvm_mmio_handler vlapic_mmio_handler; extern const struct hvm_mmio_handler vioapic_mmio_handler; extern const struct hvm_mmio_handler msixtbl_mmio_handler; extern const struct hvm_mmio_handler iommu_mmio_handler; #define HVM_MMIO_HANDLER_NR 5 int hvm_io_intercept(ioreq_t *p, int type); void register_io_handler( struct domain *d, unsigned long addr, unsigned long size, void *action, int type); void relocate_io_handler( struct domain *d, unsigned long old_addr, unsigned long new_addr, unsigned long size, int type); static inline int hvm_portio_intercept(ioreq_t *p) { return hvm_io_intercept(p, HVM_PORTIO); } static inline int hvm_buffered_io_intercept(ioreq_t *p) { return hvm_io_intercept(p, HVM_BUFFERED_IO); } int hvm_mmio_intercept(ioreq_t *p); int hvm_buffered_io_send(ioreq_t *p); static inline void register_portio_handler( struct domain *d, unsigned long addr, unsigned long size, portio_action_t action) { register_io_handler(d, addr, size, action, HVM_PORTIO); } static inline void relocate_portio_handler( struct domain *d, unsigned long old_addr, unsigned long new_addr, unsigned long size) { relocate_io_handler(d, old_addr, new_addr, size, HVM_PORTIO); } static inline void register_buffered_io_handler( struct domain *d, unsigned long addr, unsigned long size, mmio_action_t action) { register_io_handler(d, addr, size, action, HVM_BUFFERED_IO); } void send_timeoffset_req(unsigned long timeoff); void send_invalidate_req(void); int handle_mmio(void); int handle_mmio_with_translation(unsigned long gva, unsigned long gpfn); int handle_pio(uint16_t port, unsigned int size, int dir); void hvm_interrupt_post(struct vcpu *v, int vector, int type); void hvm_io_assist(ioreq_t *p); void hvm_dpci_eoi(struct domain *d, unsigned int guest_irq, union vioapic_redir_entry *ent); void msix_write_completion(struct vcpu *); struct hvm_hw_stdvga { uint8_t sr_index; uint8_t sr[8]; uint8_t gr_index; uint8_t gr[9]; bool_t stdvga; bool_t cache; uint32_t latch; struct page_info *vram_page[64]; /* shadow of 0xa0000-0xaffff */ spinlock_t lock; }; void stdvga_init(struct domain *d); void stdvga_deinit(struct domain *d); extern void hvm_dpci_msi_eoi(struct domain *d, int vector); #endif /* __ASM_X86_HVM_IO_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/vlapic.h0000664000175000017500000001134212307313555016305 0ustar smbsmb/* * hvm_vlapic.h: virtualize LAPIC definitions. * * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2006 Keir Fraser, XenSource Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef __ASM_X86_HVM_VLAPIC_H__ #define __ASM_X86_HVM_VLAPIC_H__ #include #include #include #include #define vcpu_vlapic(x) (&(x)->arch.hvm_vcpu.vlapic) #define vlapic_vcpu(x) (container_of((x), struct vcpu, arch.hvm_vcpu.vlapic)) #define vlapic_domain(x) (vlapic_vcpu(x)->domain) #define VLAPIC_ID(vlapic) \ (GET_xAPIC_ID(vlapic_get_reg((vlapic), APIC_ID))) /* * APIC can be disabled in two ways: * 1. 'Hardware disable': via IA32_APIC_BASE_MSR[11] * CPU should behave as if it does not have an APIC. * 2. 'Software disable': via APIC_SPIV[8]. * APIC is visible but does not respond to interrupt messages. */ #define VLAPIC_HW_DISABLED 0x1 #define VLAPIC_SW_DISABLED 0x2 #define vlapic_sw_disabled(vlapic) ((vlapic)->hw.disabled & VLAPIC_SW_DISABLED) #define vlapic_hw_disabled(vlapic) ((vlapic)->hw.disabled & VLAPIC_HW_DISABLED) #define vlapic_disabled(vlapic) ((vlapic)->hw.disabled) #define vlapic_enabled(vlapic) (!vlapic_disabled(vlapic)) #define vlapic_base_address(vlapic) \ ((vlapic)->hw.apic_base_msr & MSR_IA32_APICBASE_BASE) #define vlapic_x2apic_mode(vlapic) \ ((vlapic)->hw.apic_base_msr & MSR_IA32_APICBASE_EXTD) /* * Generic APIC bitmap vector update & search routines. */ #define VEC_POS(v) ((v) % 32) #define REG_POS(v) (((v) / 32) * 0x10) #define vlapic_test_and_set_vector(vec, bitmap) \ test_and_set_bit(VEC_POS(vec), (uint32_t *)((bitmap) + REG_POS(vec))) #define vlapic_test_and_clear_vector(vec, bitmap) \ test_and_clear_bit(VEC_POS(vec), (uint32_t *)((bitmap) + REG_POS(vec))) #define vlapic_set_vector(vec, bitmap) \ set_bit(VEC_POS(vec), (uint32_t *)((bitmap) + REG_POS(vec))) #define vlapic_clear_vector(vec, bitmap) \ clear_bit(VEC_POS(vec), (uint32_t *)((bitmap) + REG_POS(vec))) struct vlapic { struct hvm_hw_lapic hw; struct hvm_hw_lapic_regs *regs; struct periodic_time pt; s_time_t timer_last_update; struct page_info *regs_page; /* INIT-SIPI-SIPI work gets deferred to a tasklet. 
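 * Acting on an INIT or SIPI may require pausing and resetting the target
 * VCPU, which cannot safely be done from the context in which the ICR write
 * is intercepted, so the request is recorded here and completed later from
 * tasklet context.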
*/ struct { uint32_t icr, dest; struct tasklet tasklet; } init_sipi; }; /* vlapic's frequence is 100 MHz */ #define APIC_BUS_CYCLE_NS 10 static inline uint32_t vlapic_get_reg(struct vlapic *vlapic, uint32_t reg) { return *((uint32_t *)(&vlapic->regs->data[reg])); } static inline void vlapic_set_reg( struct vlapic *vlapic, uint32_t reg, uint32_t val) { *((uint32_t *)(&vlapic->regs->data[reg])) = val; } bool_t is_vlapic_lvtpc_enabled(struct vlapic *vlapic); void vlapic_set_irq(struct vlapic *vlapic, uint8_t vec, uint8_t trig); int vlapic_has_pending_irq(struct vcpu *v); int vlapic_ack_pending_irq(struct vcpu *v, int vector, bool_t force_ack); int vlapic_init(struct vcpu *v); void vlapic_destroy(struct vcpu *v); void vlapic_reset(struct vlapic *vlapic); void vlapic_msr_set(struct vlapic *vlapic, uint64_t value); void vlapic_tdt_msr_set(struct vlapic *vlapic, uint64_t value); uint64_t vlapic_tdt_msr_get(struct vlapic *vlapic); int vlapic_accept_pic_intr(struct vcpu *v); uint32_t vlapic_set_ppr(struct vlapic *vlapic); void vlapic_adjust_i8259_target(struct domain *d); void vlapic_EOI_set(struct vlapic *vlapic); void vlapic_handle_EOI_induced_exit(struct vlapic *vlapic, int vector); void vlapic_ipi(struct vlapic *vlapic, uint32_t icr_low, uint32_t icr_high); int vlapic_apicv_write(struct vcpu *v, unsigned int offset); struct vlapic *vlapic_lowest_prio( struct domain *d, struct vlapic *source, int short_hand, uint8_t dest, uint8_t dest_mode); bool_t vlapic_match_dest( struct vlapic *target, struct vlapic *source, int short_hand, uint8_t dest, uint8_t dest_mode); #endif /* __ASM_X86_HVM_VLAPIC_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/cacheattr.h0000664000175000017500000000137012307313555016765 0ustar smbsmb#ifndef __HVM_CACHEATTR_H__ #define __HVM_CACHEATTR_H__ struct hvm_mem_pinned_cacheattr_range { struct list_head list; uint64_t start, end; uint32_t type; }; void hvm_init_cacheattr_region_list( struct domain *d); void hvm_destroy_cacheattr_region_list( struct domain *d); /* * To see guest_fn is in the pinned range or not, * if yes, return 1, and set type to value in this range * if no, return 0, and set type to 0 */ int32_t hvm_get_mem_pinned_cacheattr( struct domain *d, uint64_t guest_fn, uint32_t *type); /* Set pinned caching type for a domain. */ int32_t hvm_set_mem_pinned_cacheattr( struct domain *d, uint64_t gfn_start, uint64_t gfn_end, uint32_t type); #endif /* __HVM_CACHEATTR_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/nestedhvm.h0000664000175000017500000000563312307313555017032 0ustar smbsmb/* * Nested HVM * Copyright (c) 2011, Advanced Micro Devices, Inc. * Author: Christoph Egger * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
*/ #ifndef _HVM_NESTEDHVM_H #define _HVM_NESTEDHVM_H #include /* for uintNN_t */ #include /* for struct vcpu, struct domain */ #include /* for vcpu_nestedhvm */ enum nestedhvm_vmexits { NESTEDHVM_VMEXIT_ERROR = 0, /* inject VMEXIT w/ invalid VMCB */ NESTEDHVM_VMEXIT_FATALERROR = 1, /* crash first level guest */ NESTEDHVM_VMEXIT_HOST = 2, /* exit handled on host level */ NESTEDHVM_VMEXIT_CONTINUE = 3, /* further handling */ NESTEDHVM_VMEXIT_INJECT = 4, /* inject VMEXIT */ NESTEDHVM_VMEXIT_DONE = 5, /* VMEXIT handled */ }; /* Nested HVM on/off per domain */ bool_t nestedhvm_enabled(struct domain *d); /* Nested VCPU */ int nestedhvm_vcpu_initialise(struct vcpu *v); void nestedhvm_vcpu_destroy(struct vcpu *v); void nestedhvm_vcpu_reset(struct vcpu *v); bool_t nestedhvm_vcpu_in_guestmode(struct vcpu *v); #define nestedhvm_vcpu_enter_guestmode(v) \ vcpu_nestedhvm(v).nv_guestmode = 1 #define nestedhvm_vcpu_exit_guestmode(v) \ vcpu_nestedhvm(v).nv_guestmode = 0 /* Nested paging */ #define NESTEDHVM_PAGEFAULT_DONE 0 #define NESTEDHVM_PAGEFAULT_INJECT 1 #define NESTEDHVM_PAGEFAULT_L1_ERROR 2 #define NESTEDHVM_PAGEFAULT_L0_ERROR 3 #define NESTEDHVM_PAGEFAULT_MMIO 4 #define NESTEDHVM_PAGEFAULT_RETRY 5 #define NESTEDHVM_PAGEFAULT_DIRECT_MMIO 6 int nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t *L2_gpa, bool_t access_r, bool_t access_w, bool_t access_x); /* IO permission map */ unsigned long *nestedhvm_vcpu_iomap_get(bool_t ioport_80, bool_t ioport_ed); /* Misc */ #define nestedhvm_paging_mode_hap(v) (!!nhvm_vmcx_hap_enabled(v)) #define nestedhvm_vmswitch_in_progress(v) \ (!!vcpu_nestedhvm((v)).nv_vmswitch_in_progress) void nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m); bool_t nestedhvm_is_n2(struct vcpu *v); static inline void nestedhvm_set_cr(struct vcpu *v, unsigned int cr, unsigned long value) { if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) ) v->arch.hvm_vcpu.nvcpu.guest_cr[cr] = value; } #endif /* _HVM_NESTEDHVM_H */ xen-4.4.0/xen/include/asm-x86/hvm/hvm.h0000664000175000017500000004434512307313555015632 0ustar smbsmb/* * hvm.h: Hardware virtual machine assist interface definitions. * * Leendert van Doorn, leendert@watson.ibm.com * Copyright (c) 2005, International Business Machines Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef __ASM_X86_HVM_HVM_H__ #define __ASM_X86_HVM_HVM_H__ #include #include #include #include #include #include /* Interrupt acknowledgement sources. 
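 * Each pending event is described as a (source, vector) pair; see the
 * struct hvm_intack definition and the hvm_intack_*() helpers just below,
 * e.g. hvm_intack_lapic(vec) or hvm_intack_nmi.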
*/ enum hvm_intsrc { hvm_intsrc_none, hvm_intsrc_pic, hvm_intsrc_lapic, hvm_intsrc_nmi, hvm_intsrc_mce, hvm_intsrc_vector }; struct hvm_intack { uint8_t source; /* enum hvm_intsrc */ uint8_t vector; }; #define hvm_intack(src, vec) ((struct hvm_intack) { hvm_intsrc_##src, vec }) #define hvm_intack_none hvm_intack(none, 0) #define hvm_intack_pic(vec) hvm_intack(pic, vec) #define hvm_intack_lapic(vec) hvm_intack(lapic, vec) #define hvm_intack_nmi hvm_intack(nmi, 2) #define hvm_intack_mce hvm_intack(mce, 18) #define hvm_intack_vector(vec) hvm_intack(vector, vec) enum hvm_intblk { hvm_intblk_none, /* not blocked (deliverable) */ hvm_intblk_shadow, /* MOV-SS or STI shadow */ hvm_intblk_rflags_ie, /* RFLAGS.IE == 0 */ hvm_intblk_tpr, /* LAPIC TPR too high */ hvm_intblk_nmi_iret, /* NMI blocked until IRET */ hvm_intblk_arch, /* SVM/VMX specific reason */ }; /* These happen to be the same as the VMX interrupt shadow definitions. */ #define HVM_INTR_SHADOW_STI 0x00000001 #define HVM_INTR_SHADOW_MOV_SS 0x00000002 #define HVM_INTR_SHADOW_SMI 0x00000004 #define HVM_INTR_SHADOW_NMI 0x00000008 /* * HAP super page capabilities: * bit0: if 2MB super page is allowed? * bit1: if 1GB super page is allowed? */ #define HVM_HAP_SUPERPAGE_2MB 0x00000001 #define HVM_HAP_SUPERPAGE_1GB 0x00000002 struct hvm_trap { int vector; unsigned int type; /* X86_EVENTTYPE_* */ int error_code; /* HVM_DELIVER_NO_ERROR_CODE if n/a */ int insn_len; /* Instruction length */ unsigned long cr2; /* Only for TRAP_page_fault h/w exception */ }; /* * The hardware virtual machine (HVM) interface abstracts away from the * x86/x86_64 CPU virtualization assist specifics. Currently this interface * supports Intel's VT-x and AMD's SVM extensions. */ struct hvm_function_table { char *name; /* Support Hardware-Assisted Paging? */ int hap_supported; /* Necessary hardware support for PVH mode? */ int pvh_supported; /* Indicate HAP capabilities. */ int hap_capabilities; /* * Initialise/destroy HVM domain/vcpu resources */ int (*domain_initialise)(struct domain *d); void (*domain_destroy)(struct domain *d); int (*vcpu_initialise)(struct vcpu *v); void (*vcpu_destroy)(struct vcpu *v); /* save and load hvm guest cpu context for save/restore */ void (*save_cpu_ctxt)(struct vcpu *v, struct hvm_hw_cpu *ctxt); int (*load_cpu_ctxt)(struct vcpu *v, struct hvm_hw_cpu *ctxt); /* Examine specifics of the guest state. */ unsigned int (*get_interrupt_shadow)(struct vcpu *v); void (*set_interrupt_shadow)(struct vcpu *v, unsigned int intr_shadow); int (*guest_x86_mode)(struct vcpu *v); void (*get_segment_register)(struct vcpu *v, enum x86_segment seg, struct segment_register *reg); void (*set_segment_register)(struct vcpu *v, enum x86_segment seg, struct segment_register *reg); unsigned long (*get_shadow_gs_base)(struct vcpu *v); /* * Re-set the value of CR3 that Xen runs on when handling VM exits. */ void (*update_host_cr3)(struct vcpu *v); /* * Called to inform HVM layer that a guest CRn or EFER has changed. 
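 * The common HVM code updates the cached register value first and then
 * invokes the hook so the VMX/SVM implementation can propagate the new
 * value into the VMCS/VMCB.  Roughly (sketch of the usual calling pattern):
 *
 *     v->arch.hvm_vcpu.guest_cr[4] = value;
 *     hvm_update_guest_cr(v, 4);
 *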
*/ void (*update_guest_cr)(struct vcpu *v, unsigned int cr); void (*update_guest_efer)(struct vcpu *v); int (*get_guest_pat)(struct vcpu *v, u64 *); int (*set_guest_pat)(struct vcpu *v, u64); void (*set_tsc_offset)(struct vcpu *v, u64 offset); void (*inject_trap)(struct hvm_trap *trap); void (*init_hypercall_page)(struct domain *d, void *hypercall_page); int (*event_pending)(struct vcpu *v); int (*cpu_up_prepare)(unsigned int cpu); void (*cpu_dead)(unsigned int cpu); int (*cpu_up)(void); void (*cpu_down)(void); /* Copy up to 15 bytes from cached instruction bytes at current rIP. */ unsigned int (*get_insn_bytes)(struct vcpu *v, uint8_t *buf); /* Instruction intercepts: non-void return values are X86EMUL codes. */ void (*cpuid_intercept)( unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); void (*wbinvd_intercept)(void); void (*fpu_dirty_intercept)(void); int (*msr_read_intercept)(unsigned int msr, uint64_t *msr_content); int (*msr_write_intercept)(unsigned int msr, uint64_t msr_content); void (*invlpg_intercept)(unsigned long vaddr); void (*handle_cd)(struct vcpu *v, unsigned long value); void (*set_info_guest)(struct vcpu *v); void (*set_rdtsc_exiting)(struct vcpu *v, bool_t); /* Nested HVM */ int (*nhvm_vcpu_initialise)(struct vcpu *v); void (*nhvm_vcpu_destroy)(struct vcpu *v); int (*nhvm_vcpu_reset)(struct vcpu *v); int (*nhvm_vcpu_hostrestore)(struct vcpu *v, struct cpu_user_regs *regs); int (*nhvm_vcpu_vmexit)(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode); int (*nhvm_vcpu_vmexit_trap)(struct vcpu *v, struct hvm_trap *trap); uint64_t (*nhvm_vcpu_guestcr3)(struct vcpu *v); uint64_t (*nhvm_vcpu_p2m_base)(struct vcpu *v); uint32_t (*nhvm_vcpu_asid)(struct vcpu *v); int (*nhvm_vmcx_guest_intercepts_trap)(struct vcpu *v, unsigned int trapnr, int errcode); bool_t (*nhvm_vmcx_hap_enabled)(struct vcpu *v); enum hvm_intblk (*nhvm_intr_blocked)(struct vcpu *v); void (*nhvm_domain_relinquish_resources)(struct domain *d); /* Virtual interrupt delivery */ void (*update_eoi_exit_bitmap)(struct vcpu *v, u8 vector, u8 trig); int (*virtual_intr_delivery_enabled)(void); void (*process_isr)(int isr, struct vcpu *v); void (*deliver_posted_intr)(struct vcpu *v, u8 vector); void (*sync_pir_to_irr)(struct vcpu *v); void (*handle_eoi)(u8 vector); /*Walk nested p2m */ int (*nhvm_hap_walk_L1_p2m)(struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa, unsigned int *page_order, uint8_t *p2m_acc, bool_t access_r, bool_t access_w, bool_t access_x); }; extern struct hvm_function_table hvm_funcs; extern bool_t hvm_enabled; extern bool_t cpu_has_lmsl; extern s8 hvm_port80_allowed; extern const struct hvm_function_table *start_svm(void); extern const struct hvm_function_table *start_vmx(void); int hvm_domain_initialise(struct domain *d); void hvm_domain_relinquish_resources(struct domain *d); void hvm_domain_destroy(struct domain *d); int hvm_vcpu_initialise(struct vcpu *v); void hvm_vcpu_destroy(struct vcpu *v); void hvm_vcpu_down(struct vcpu *v); int hvm_vcpu_cacheattr_init(struct vcpu *v); void hvm_vcpu_cacheattr_destroy(struct vcpu *v); void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip); /* Prepare/destroy a ring for a dom0 helper. Helper with talk * with Xen on behalf of this hvm domain. 
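 * Typical usage (sketch): map the shared frame once and keep the page
 * reference until the ring is torn down again:
 *
 *     rc = prepare_ring_for_helper(d, gmfn, &page, &va);
 *     if ( rc == 0 )
 *     {
 *         ... use the ring mapped at va ...
 *         destroy_ring_for_helper(&va, page);
 *     }
 *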
*/ int prepare_ring_for_helper(struct domain *d, unsigned long gmfn, struct page_info **_page, void **_va); void destroy_ring_for_helper(void **_va, struct page_info *page); bool_t hvm_send_assist_req(struct vcpu *v); void hvm_get_guest_pat(struct vcpu *v, u64 *guest_pat); int hvm_set_guest_pat(struct vcpu *v, u64 guest_pat); void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc); u64 hvm_get_guest_tsc(struct vcpu *v); void hvm_init_guest_time(struct domain *d); void hvm_set_guest_time(struct vcpu *v, u64 guest_time); u64 hvm_get_guest_time(struct vcpu *v); int vmsi_deliver( struct domain *d, int vector, uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode, uint8_t trig_mode); struct hvm_pirq_dpci; void vmsi_deliver_pirq(struct domain *d, const struct hvm_pirq_dpci *); int hvm_girq_dest_2_vcpu_id(struct domain *d, uint8_t dest, uint8_t dest_mode); #define hvm_paging_enabled(v) \ (!!((v)->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG)) #define hvm_wp_enabled(v) \ (!!((v)->arch.hvm_vcpu.guest_cr[0] & X86_CR0_WP)) #define hvm_pcid_enabled(v) \ (!!((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PCIDE)) #define hvm_pae_enabled(v) \ (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE)) #define hvm_smep_enabled(v) \ (hvm_paging_enabled(v) && ((v)->arch.hvm_vcpu.guest_cr[4] & X86_CR4_SMEP)) #define hvm_nx_enabled(v) \ (!!((v)->arch.hvm_vcpu.guest_efer & EFER_NX)) /* Can we use superpages in the HAP p2m table? */ #define hvm_hap_has_1gb(d) \ (hvm_funcs.hap_capabilities & HVM_HAP_SUPERPAGE_1GB) #define hvm_hap_has_2mb(d) \ (hvm_funcs.hap_capabilities & HVM_HAP_SUPERPAGE_2MB) /* Can the guest use 1GB superpages in its own pagetables? */ #define hvm_pse1gb_supported(d) \ (cpu_has_page1gb && paging_mode_hap(d)) #define hvm_long_mode_enabled(v) \ ((v)->arch.hvm_vcpu.guest_efer & EFER_LMA) enum hvm_intblk hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack); static inline int hvm_guest_x86_mode(struct vcpu *v) { ASSERT(v == current); return hvm_funcs.guest_x86_mode(v); } static inline void hvm_update_host_cr3(struct vcpu *v) { if ( hvm_funcs.update_host_cr3 ) hvm_funcs.update_host_cr3(v); } static inline void hvm_update_guest_cr(struct vcpu *v, unsigned int cr) { hvm_funcs.update_guest_cr(v, cr); } static inline void hvm_update_guest_efer(struct vcpu *v) { hvm_funcs.update_guest_efer(v); } /* * Called to ensure than all guest-specific mappings in a tagged TLB are * flushed; does *not* flush Xen's TLB entries, and on processors without a * tagged TLB it will be a noop. 
*/ static inline void hvm_flush_guest_tlbs(void) { if ( hvm_enabled ) hvm_asid_flush_core(); } void hvm_hypercall_page_initialise(struct domain *d, void *hypercall_page); static inline void hvm_get_segment_register(struct vcpu *v, enum x86_segment seg, struct segment_register *reg) { hvm_funcs.get_segment_register(v, seg, reg); } static inline void hvm_set_segment_register(struct vcpu *v, enum x86_segment seg, struct segment_register *reg) { hvm_funcs.set_segment_register(v, seg, reg); } static inline unsigned long hvm_get_shadow_gs_base(struct vcpu *v) { return hvm_funcs.get_shadow_gs_base(v); } #define is_viridian_domain(_d) \ (is_hvm_domain(_d) && ((_d)->arch.hvm_domain.params[HVM_PARAM_VIRIDIAN])) void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); void hvm_migrate_timers(struct vcpu *v); void hvm_do_resume(struct vcpu *v); void hvm_migrate_pirqs(struct vcpu *v); void hvm_inject_trap(struct hvm_trap *trap); void hvm_inject_hw_exception(unsigned int trapnr, int errcode); void hvm_inject_page_fault(int errcode, unsigned long cr2); static inline int hvm_event_pending(struct vcpu *v) { return hvm_funcs.event_pending(v); } /* These reserved bits in lower 32 remain 0 after any load of CR0 */ #define HVM_CR0_GUEST_RESERVED_BITS \ (~((unsigned long) \ (X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | \ X86_CR0_TS | X86_CR0_ET | X86_CR0_NE | \ X86_CR0_WP | X86_CR0_AM | X86_CR0_NW | \ X86_CR0_CD | X86_CR0_PG))) /* These bits in CR4 are owned by the host. */ #define HVM_CR4_HOST_MASK (mmu_cr4_features & \ (X86_CR4_VMXE | X86_CR4_PAE | X86_CR4_MCE)) /* These bits in CR4 cannot be set by the guest. */ #define HVM_CR4_GUEST_RESERVED_BITS(_v) \ (~((unsigned long) \ (X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | \ X86_CR4_DE | X86_CR4_PSE | X86_CR4_PAE | \ X86_CR4_MCE | X86_CR4_PGE | X86_CR4_PCE | \ X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT | \ (cpu_has_smep ? X86_CR4_SMEP : 0) | \ (cpu_has_fsgsbase ? X86_CR4_FSGSBASE : 0) | \ ((nestedhvm_enabled((_v)->domain) && cpu_has_vmx)\ ? X86_CR4_VMXE : 0) | \ (cpu_has_pcid ? X86_CR4_PCIDE : 0) | \ (cpu_has_xsave ? X86_CR4_OSXSAVE : 0)))) /* These exceptions must always be intercepted. */ #define HVM_TRAP_MASK ((1U << TRAP_machine_check) | (1U << TRAP_invalid_op)) /* * x86 event types. This enumeration is valid for: * Intel VMX: {VM_ENTRY,VM_EXIT,IDT_VECTORING}_INTR_INFO[10:8] * AMD SVM: eventinj[10:8] and exitintinfo[10:8] (types 0-4 only) */ #define X86_EVENTTYPE_EXT_INTR 0 /* external interrupt */ #define X86_EVENTTYPE_NMI 2 /* NMI */ #define X86_EVENTTYPE_HW_EXCEPTION 3 /* hardware exception */ #define X86_EVENTTYPE_SW_INTERRUPT 4 /* software interrupt (CD nn) */ #define X86_EVENTTYPE_PRI_SW_EXCEPTION 5 /* ICEBP (F1) */ #define X86_EVENTTYPE_SW_EXCEPTION 6 /* INT3 (CC), INTO (CE) */ int hvm_event_needs_reinjection(uint8_t type, uint8_t vector); uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2); void hvm_set_rdtsc_exiting(struct domain *d, bool_t enable); static inline int hvm_cpu_up(void) { return (hvm_funcs.cpu_up ? hvm_funcs.cpu_up() : 0); } static inline void hvm_cpu_down(void) { if ( hvm_funcs.cpu_down ) hvm_funcs.cpu_down(); } static inline unsigned int hvm_get_insn_bytes(struct vcpu *v, uint8_t *buf) { return (hvm_funcs.get_insn_bytes ? 
hvm_funcs.get_insn_bytes(v, buf) : 0); } enum hvm_task_switch_reason { TSW_jmp, TSW_iret, TSW_call_or_int }; void hvm_task_switch( uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason, int32_t errcode); enum hvm_access_type { hvm_access_insn_fetch, hvm_access_none, hvm_access_read, hvm_access_write }; int hvm_virtual_to_linear_addr( enum x86_segment seg, struct segment_register *reg, unsigned long offset, unsigned int bytes, enum hvm_access_type access_type, unsigned int addr_size, unsigned long *linear_addr); void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent); void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent); void hvm_unmap_guest_frame(void *p, bool_t permanent); static inline void hvm_set_info_guest(struct vcpu *v) { if ( hvm_funcs.set_info_guest ) return hvm_funcs.set_info_guest(v); } int hvm_debug_op(struct vcpu *v, int32_t op); static inline void hvm_invalidate_regs_fields(struct cpu_user_regs *regs) { #ifndef NDEBUG regs->error_code = 0xbeef; regs->entry_vector = 0xbeef; regs->saved_upcall_mask = 0xbf; regs->cs = 0xbeef; regs->ss = 0xbeef; regs->ds = 0xbeef; regs->es = 0xbeef; regs->fs = 0xbeef; regs->gs = 0xbeef; #endif } int hvm_hap_nested_page_fault(paddr_t gpa, bool_t gla_valid, unsigned long gla, bool_t access_r, bool_t access_w, bool_t access_x); #define hvm_msr_tsc_aux(v) ({ \ struct domain *__d = (v)->domain; \ (__d->arch.tsc_mode == TSC_MODE_PVRDTSCP) \ ? (u32)__d->arch.incarnation : (u32)(v)->arch.hvm_vcpu.msr_tsc_aux; \ }) int hvm_x2apic_msr_read(struct vcpu *v, unsigned int msr, uint64_t *msr_content); int hvm_x2apic_msr_write(struct vcpu *v, unsigned int msr, uint64_t msr_content); /* Called for current VCPU on crX changes by guest */ void hvm_memory_event_cr0(unsigned long value, unsigned long old); void hvm_memory_event_cr3(unsigned long value, unsigned long old); void hvm_memory_event_cr4(unsigned long value, unsigned long old); void hvm_memory_event_msr(unsigned long msr, unsigned long value); /* Called for current VCPU on int3: returns -1 if no listener */ int hvm_memory_event_int3(unsigned long gla); /* Called for current VCPU on single step: returns -1 if no listener */ int hvm_memory_event_single_step(unsigned long gla); /* * Nested HVM */ /* Restores l1 guest state */ int nhvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs); /* Fill l1 guest's VMCB/VMCS with data provided by generic exit codes * (do conversion as needed), other misc SVM/VMX specific tweaks to make * it work */ int nhvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode); /* inject vmexit into l1 guest. l1 guest will see a VMEXIT due to * 'trapnr' exception. */ int nhvm_vcpu_vmexit_trap(struct vcpu *v, struct hvm_trap *trap); /* returns l2 guest cr3 in l2 guest physical address space. */ uint64_t nhvm_vcpu_guestcr3(struct vcpu *v); /* returns l1 guest's cr3 that points to the page table used to * translate l2 guest physical address to l1 guest physical address. 
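 * (i.e. the base of the nested paging structures: nCR3 on SVM, the EPT
 * pointer on VMX).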
*/ uint64_t nhvm_vcpu_p2m_base(struct vcpu *v); /* returns the asid number l1 guest wants to use to run the l2 guest */ uint32_t nhvm_vcpu_asid(struct vcpu *v); /* returns true, when l1 guest intercepts the specified trap */ int nhvm_vmcx_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr, int errcode); /* returns true when l1 guest wants to use hap to run l2 guest */ bool_t nhvm_vmcx_hap_enabled(struct vcpu *v); /* interrupt */ enum hvm_intblk nhvm_interrupt_blocked(struct vcpu *v); #endif /* __ASM_X86_HVM_HVM_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/vioapic.h0000664000175000017500000000422012307313555016456 0ustar smbsmb/* * Copyright (C) 2001 MandrakeSoft S.A. * * MandrakeSoft S.A. * 43, rue d'Aboukir * 75002 Paris - France * http://www.linux-mandrake.com/ * http://www.mandrakesoft.com/ * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef __ASM_X86_HVM_VIOAPIC_H__ #define __ASM_X86_HVM_VIOAPIC_H__ #include #include #include #include #define VIOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define VIOAPIC_EDGE_TRIG 0 #define VIOAPIC_LEVEL_TRIG 1 #define VIOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 #define VIOAPIC_MEM_LENGTH 0x100 /* Direct registers. */ #define VIOAPIC_REG_SELECT 0x00 #define VIOAPIC_REG_WINDOW 0x10 #define VIOAPIC_REG_EOI 0x40 /* Indirect registers. */ #define VIOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */ #define VIOAPIC_REG_VERSION 0x01 #define VIOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */ struct hvm_vioapic { struct hvm_hw_vioapic hvm_hw_vioapic; struct domain *domain; }; #define domain_vioapic(d) (&(d)->arch.hvm_domain.vioapic->hvm_hw_vioapic) #define vioapic_domain(v) (container_of((v), struct hvm_vioapic, \ hvm_hw_vioapic)->domain) int vioapic_init(struct domain *d); void vioapic_deinit(struct domain *d); void vioapic_reset(struct domain *d); void vioapic_irq_positive_edge(struct domain *d, unsigned int irq); void vioapic_update_EOI(struct domain *d, int vector); #endif /* __ASM_X86_HVM_VIOAPIC_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/vcpu.h0000664000175000017500000001316512307313555016011 0ustar smbsmb/* * vcpu.h: HVM per vcpu definitions * * Copyright (c) 2005, International Business Machines Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
*/ #ifndef __ASM_X86_HVM_VCPU_H__ #define __ASM_X86_HVM_VCPU_H__ #include #include #include #include #include #include #include #include #include enum hvm_io_state { HVMIO_none = 0, HVMIO_dispatched, HVMIO_awaiting_completion, HVMIO_handle_mmio_awaiting_completion, HVMIO_handle_pio_awaiting_completion, HVMIO_completed }; struct hvm_vcpu_asid { uint64_t generation; uint32_t asid; }; struct hvm_vcpu_io { /* I/O request in flight to device model. */ enum hvm_io_state io_state; unsigned long io_data; int io_size; /* * HVM emulation: * Virtual address @mmio_gva maps to MMIO physical frame @mmio_gpfn. * The latter is known to be an MMIO frame (not RAM). * This translation is only valid if @mmio_gva is non-zero. */ unsigned long mmio_gva; unsigned long mmio_gpfn; /* We may read up to m256 as a number of device-model transactions. */ paddr_t mmio_large_read_pa; uint8_t mmio_large_read[32]; unsigned int mmio_large_read_bytes; /* We may write up to m256 as a number of device-model transactions. */ unsigned int mmio_large_write_bytes; paddr_t mmio_large_write_pa; /* For retries we shouldn't re-fetch the instruction. */ unsigned int mmio_insn_bytes; unsigned char mmio_insn[16]; /* * For string instruction emulation we need to be able to signal a * necessary retry through other than function return codes. */ bool_t mmio_retry, mmio_retrying; unsigned long msix_unmask_address; }; #define VMCX_EADDR (~0ULL) struct nestedvcpu { bool_t nv_guestmode; /* vcpu in guestmode? */ void *nv_vvmcx; /* l1 guest virtual VMCB/VMCS */ void *nv_n1vmcx; /* VMCB/VMCS used to run l1 guest */ void *nv_n2vmcx; /* shadow VMCB/VMCS used to run l2 guest */ uint64_t nv_vvmcxaddr; /* l1 guest physical address of nv_vvmcx */ uint64_t nv_n1vmcx_pa; /* host physical address of nv_n1vmcx */ uint64_t nv_n2vmcx_pa; /* host physical address of nv_n2vmcx */ /* SVM/VMX arch specific */ union { struct nestedsvm nsvm; struct nestedvmx nvmx; } u; bool_t nv_flushp2m; /* True, when p2m table must be flushed */ struct p2m_domain *nv_p2m; /* used p2m table for this vcpu */ struct hvm_vcpu_asid nv_n2asid; bool_t nv_vmentry_pending; bool_t nv_vmexit_pending; bool_t nv_vmswitch_in_progress; /* true during vmentry/vmexit emulation */ /* Does l1 guest intercept io ports 0x80 and/or 0xED ? * Useful to optimize io permission handling. */ bool_t nv_ioport80; bool_t nv_ioportED; /* L2's control-resgister, just as the L2 sees them. */ unsigned long guest_cr[5]; }; #define vcpu_nestedhvm(v) ((v)->arch.hvm_vcpu.nvcpu) struct hvm_vcpu { /* Guest control-register and EFER values, just as the guest sees them. */ unsigned long guest_cr[5]; unsigned long guest_efer; /* * Processor-visible control-register values, while guest executes. * CR0, CR4: Used as a cache of VMCS contents by VMX only. * CR1, CR2: Never used (guest_cr[2] is always processor-visible CR2). * CR3: Always used and kept up to date by paging subsystem. */ unsigned long hw_cr[5]; struct vlapic vlapic; s64 cache_tsc_offset; u64 guest_time; /* Lock and list for virtual platform timers. */ spinlock_t tm_lock; struct list_head tm_list; int xen_port; bool_t flag_dr_dirty; bool_t debug_state_latch; bool_t single_step; bool_t hcall_preempted; bool_t hcall_64bit; struct hvm_vcpu_asid n1asid; u32 msr_tsc_aux; u64 msr_tsc_adjust; /* VPMU */ struct vpmu_struct vpmu; union { struct arch_vmx_struct vmx; struct arch_svm_struct svm; } u; struct tasklet assert_evtchn_irq_tasklet; struct nestedvcpu nvcpu; struct mtrr_state mtrr; u64 pat_cr; /* In mode delay_for_missed_ticks, VCPUs have differing guest times. 
*/ int64_t stime_offset; /* Which cache mode is this VCPU in (CR0:CD/NW)? */ u8 cache_mode; struct hvm_vcpu_io hvm_io; /* Callback into x86_emulate when emulating FPU/MMX/XMM instructions. */ void (*fpu_exception_callback)(void *, struct cpu_user_regs *); void *fpu_exception_callback_arg; /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */ struct hvm_trap inject_trap; struct viridian_vcpu viridian; }; #endif /* __ASM_X86_HVM_VCPU_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/emulate.h0000664000175000017500000000251112307313555016461 0ustar smbsmb/****************************************************************************** * hvm/emulate.h * * HVM instruction emulation. Used for MMIO and VMX real mode. * * Copyright (c) 2008 Citrix Systems, Inc. * * Authors: * Keir Fraser */ #ifndef __ASM_X86_HVM_EMULATE_H__ #define __ASM_X86_HVM_EMULATE_H__ #include #include struct hvm_emulate_ctxt { struct x86_emulate_ctxt ctxt; /* Cache of 16 bytes of instruction. */ uint8_t insn_buf[16]; unsigned long insn_buf_eip; unsigned int insn_buf_bytes; struct segment_register seg_reg[10]; unsigned long seg_reg_accessed; unsigned long seg_reg_dirty; bool_t exn_pending; uint8_t exn_vector; uint8_t exn_insn_len; int32_t exn_error_code; uint32_t intr_shadow; }; int hvm_emulate_one( struct hvm_emulate_ctxt *hvmemul_ctxt); void hvm_emulate_prepare( struct hvm_emulate_ctxt *hvmemul_ctxt, struct cpu_user_regs *regs); void hvm_emulate_writeback( struct hvm_emulate_ctxt *hvmemul_ctxt); struct segment_register *hvmemul_get_seg_reg( enum x86_segment seg, struct hvm_emulate_ctxt *hvmemul_ctxt); int hvmemul_do_pio( unsigned long port, unsigned long *reps, int size, paddr_t ram_gpa, int dir, int df, void *p_data); #endif /* __ASM_X86_HVM_EMULATE_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/svm/0000775000175000017500000000000012307313555015462 5ustar smbsmbxen-4.4.0/xen/include/asm-x86/hvm/svm/asid.h0000664000175000017500000000256212307313555016560 0ustar smbsmb/* * asid.h: handling ASIDs in SVM. * Copyright (c) 2007, Advanced Micro Devices, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef __ASM_X86_HVM_SVM_ASID_H__ #define __ASM_X86_HVM_SVM_ASID_H__ #include #include #include #include void svm_asid_init(struct cpuinfo_x86 *c); static inline void svm_asid_g_invlpg(struct vcpu *v, unsigned long g_vaddr) { #if 0 /* Optimization? */ svm_invlpga(g_vaddr, v->arch.hvm_svm.vmcb->guest_asid); #endif /* Safe fallback. Take a new ASID. */ hvm_asid_flush_vcpu(v); } #endif /* __ASM_X86_HVM_SVM_ASID_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/hvm/svm/vmcb.h0000664000175000017500000005243012307313555016566 0ustar smbsmb/* * vmcb.h: VMCB related definitions * Copyright (c) 2005-2007, Advanced Micro Devices, Inc * Copyright (c) 2004, Intel Corporation. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #ifndef __ASM_X86_HVM_SVM_VMCB_H__ #define __ASM_X86_HVM_SVM_VMCB_H__ #include #include #include /* general 1 intercepts */ enum GenericIntercept1bits { GENERAL1_INTERCEPT_INTR = 1 << 0, GENERAL1_INTERCEPT_NMI = 1 << 1, GENERAL1_INTERCEPT_SMI = 1 << 2, GENERAL1_INTERCEPT_INIT = 1 << 3, GENERAL1_INTERCEPT_VINTR = 1 << 4, GENERAL1_INTERCEPT_CR0_SEL_WRITE = 1 << 5, GENERAL1_INTERCEPT_IDTR_READ = 1 << 6, GENERAL1_INTERCEPT_GDTR_READ = 1 << 7, GENERAL1_INTERCEPT_LDTR_READ = 1 << 8, GENERAL1_INTERCEPT_TR_READ = 1 << 9, GENERAL1_INTERCEPT_IDTR_WRITE = 1 << 10, GENERAL1_INTERCEPT_GDTR_WRITE = 1 << 11, GENERAL1_INTERCEPT_LDTR_WRITE = 1 << 12, GENERAL1_INTERCEPT_TR_WRITE = 1 << 13, GENERAL1_INTERCEPT_RDTSC = 1 << 14, GENERAL1_INTERCEPT_RDPMC = 1 << 15, GENERAL1_INTERCEPT_PUSHF = 1 << 16, GENERAL1_INTERCEPT_POPF = 1 << 17, GENERAL1_INTERCEPT_CPUID = 1 << 18, GENERAL1_INTERCEPT_RSM = 1 << 19, GENERAL1_INTERCEPT_IRET = 1 << 20, GENERAL1_INTERCEPT_SWINT = 1 << 21, GENERAL1_INTERCEPT_INVD = 1 << 22, GENERAL1_INTERCEPT_PAUSE = 1 << 23, GENERAL1_INTERCEPT_HLT = 1 << 24, GENERAL1_INTERCEPT_INVLPG = 1 << 25, GENERAL1_INTERCEPT_INVLPGA = 1 << 26, GENERAL1_INTERCEPT_IOIO_PROT = 1 << 27, GENERAL1_INTERCEPT_MSR_PROT = 1 << 28, GENERAL1_INTERCEPT_TASK_SWITCH = 1 << 29, GENERAL1_INTERCEPT_FERR_FREEZE = 1 << 30, GENERAL1_INTERCEPT_SHUTDOWN_EVT = 1 << 31 }; /* general 2 intercepts */ enum GenericIntercept2bits { GENERAL2_INTERCEPT_VMRUN = 1 << 0, GENERAL2_INTERCEPT_VMMCALL = 1 << 1, GENERAL2_INTERCEPT_VMLOAD = 1 << 2, GENERAL2_INTERCEPT_VMSAVE = 1 << 3, GENERAL2_INTERCEPT_STGI = 1 << 4, GENERAL2_INTERCEPT_CLGI = 1 << 5, GENERAL2_INTERCEPT_SKINIT = 1 << 6, GENERAL2_INTERCEPT_RDTSCP = 1 << 7, GENERAL2_INTERCEPT_ICEBP = 1 << 8, GENERAL2_INTERCEPT_WBINVD = 1 << 9, GENERAL2_INTERCEPT_MONITOR = 1 << 10, GENERAL2_INTERCEPT_MWAIT = 1 << 11, GENERAL2_INTERCEPT_MWAIT_CONDITIONAL = 1 << 12, GENERAL2_INTERCEPT_XSETBV = 1 << 13 }; /* control register intercepts */ enum CRInterceptBits { CR_INTERCEPT_CR0_READ = 1 << 0, CR_INTERCEPT_CR1_READ = 1 << 1, CR_INTERCEPT_CR2_READ = 1 << 2, CR_INTERCEPT_CR3_READ = 1 << 3, CR_INTERCEPT_CR4_READ = 1 << 4, CR_INTERCEPT_CR5_READ = 1 << 5, CR_INTERCEPT_CR6_READ = 1 << 6, CR_INTERCEPT_CR7_READ = 1 << 7, CR_INTERCEPT_CR8_READ = 1 << 8, CR_INTERCEPT_CR9_READ = 1 << 9, CR_INTERCEPT_CR10_READ = 1 << 10, CR_INTERCEPT_CR11_READ = 1 << 11, CR_INTERCEPT_CR12_READ = 1 << 12, CR_INTERCEPT_CR13_READ = 1 << 13, CR_INTERCEPT_CR14_READ = 1 << 14, CR_INTERCEPT_CR15_READ = 1 << 15, CR_INTERCEPT_CR0_WRITE = 1 << 16, CR_INTERCEPT_CR1_WRITE = 1 << 17, CR_INTERCEPT_CR2_WRITE = 1 << 18, CR_INTERCEPT_CR3_WRITE = 1 << 19, CR_INTERCEPT_CR4_WRITE = 1 << 20, CR_INTERCEPT_CR5_WRITE = 1 << 21, CR_INTERCEPT_CR6_WRITE = 1 << 22, CR_INTERCEPT_CR7_WRITE = 1 << 23, CR_INTERCEPT_CR8_WRITE = 1 << 24, CR_INTERCEPT_CR9_WRITE = 1 << 25, CR_INTERCEPT_CR10_WRITE = 1 << 26, 
CR_INTERCEPT_CR11_WRITE = 1 << 27, CR_INTERCEPT_CR12_WRITE = 1 << 28, CR_INTERCEPT_CR13_WRITE = 1 << 29, CR_INTERCEPT_CR14_WRITE = 1 << 30, CR_INTERCEPT_CR15_WRITE = 1 << 31, }; /* debug register intercepts */ enum DRInterceptBits { DR_INTERCEPT_DR0_READ = 1 << 0, DR_INTERCEPT_DR1_READ = 1 << 1, DR_INTERCEPT_DR2_READ = 1 << 2, DR_INTERCEPT_DR3_READ = 1 << 3, DR_INTERCEPT_DR4_READ = 1 << 4, DR_INTERCEPT_DR5_READ = 1 << 5, DR_INTERCEPT_DR6_READ = 1 << 6, DR_INTERCEPT_DR7_READ = 1 << 7, DR_INTERCEPT_DR8_READ = 1 << 8, DR_INTERCEPT_DR9_READ = 1 << 9, DR_INTERCEPT_DR10_READ = 1 << 10, DR_INTERCEPT_DR11_READ = 1 << 11, DR_INTERCEPT_DR12_READ = 1 << 12, DR_INTERCEPT_DR13_READ = 1 << 13, DR_INTERCEPT_DR14_READ = 1 << 14, DR_INTERCEPT_DR15_READ = 1 << 15, DR_INTERCEPT_DR0_WRITE = 1 << 16, DR_INTERCEPT_DR1_WRITE = 1 << 17, DR_INTERCEPT_DR2_WRITE = 1 << 18, DR_INTERCEPT_DR3_WRITE = 1 << 19, DR_INTERCEPT_DR4_WRITE = 1 << 20, DR_INTERCEPT_DR5_WRITE = 1 << 21, DR_INTERCEPT_DR6_WRITE = 1 << 22, DR_INTERCEPT_DR7_WRITE = 1 << 23, DR_INTERCEPT_DR8_WRITE = 1 << 24, DR_INTERCEPT_DR9_WRITE = 1 << 25, DR_INTERCEPT_DR10_WRITE = 1 << 26, DR_INTERCEPT_DR11_WRITE = 1 << 27, DR_INTERCEPT_DR12_WRITE = 1 << 28, DR_INTERCEPT_DR13_WRITE = 1 << 29, DR_INTERCEPT_DR14_WRITE = 1 << 30, DR_INTERCEPT_DR15_WRITE = 1 << 31, }; enum VMEXIT_EXITCODE { /* control register read exitcodes */ VMEXIT_CR0_READ = 0, /* 0x0 */ VMEXIT_CR1_READ = 1, /* 0x1 */ VMEXIT_CR2_READ = 2, /* 0x2 */ VMEXIT_CR3_READ = 3, /* 0x3 */ VMEXIT_CR4_READ = 4, /* 0x4 */ VMEXIT_CR5_READ = 5, /* 0x5 */ VMEXIT_CR6_READ = 6, /* 0x6 */ VMEXIT_CR7_READ = 7, /* 0x7 */ VMEXIT_CR8_READ = 8, /* 0x8 */ VMEXIT_CR9_READ = 9, /* 0x9 */ VMEXIT_CR10_READ = 10, /* 0xa */ VMEXIT_CR11_READ = 11, /* 0xb */ VMEXIT_CR12_READ = 12, /* 0xc */ VMEXIT_CR13_READ = 13, /* 0xd */ VMEXIT_CR14_READ = 14, /* 0xe */ VMEXIT_CR15_READ = 15, /* 0xf */ /* control register write exitcodes */ VMEXIT_CR0_WRITE = 16, /* 0x10 */ VMEXIT_CR1_WRITE = 17, /* 0x11 */ VMEXIT_CR2_WRITE = 18, /* 0x12 */ VMEXIT_CR3_WRITE = 19, /* 0x13 */ VMEXIT_CR4_WRITE = 20, /* 0x14 */ VMEXIT_CR5_WRITE = 21, /* 0x15 */ VMEXIT_CR6_WRITE = 22, /* 0x16 */ VMEXIT_CR7_WRITE = 23, /* 0x17 */ VMEXIT_CR8_WRITE = 24, /* 0x18 */ VMEXIT_CR9_WRITE = 25, /* 0x19 */ VMEXIT_CR10_WRITE = 26, /* 0x1a */ VMEXIT_CR11_WRITE = 27, /* 0x1b */ VMEXIT_CR12_WRITE = 28, /* 0x1c */ VMEXIT_CR13_WRITE = 29, /* 0x1d */ VMEXIT_CR14_WRITE = 30, /* 0x1e */ VMEXIT_CR15_WRITE = 31, /* 0x1f */ /* debug register read exitcodes */ VMEXIT_DR0_READ = 32, /* 0x20 */ VMEXIT_DR1_READ = 33, /* 0x21 */ VMEXIT_DR2_READ = 34, /* 0x22 */ VMEXIT_DR3_READ = 35, /* 0x23 */ VMEXIT_DR4_READ = 36, /* 0x24 */ VMEXIT_DR5_READ = 37, /* 0x25 */ VMEXIT_DR6_READ = 38, /* 0x26 */ VMEXIT_DR7_READ = 39, /* 0x27 */ VMEXIT_DR8_READ = 40, /* 0x28 */ VMEXIT_DR9_READ = 41, /* 0x29 */ VMEXIT_DR10_READ = 42, /* 0x2a */ VMEXIT_DR11_READ = 43, /* 0x2b */ VMEXIT_DR12_READ = 44, /* 0x2c */ VMEXIT_DR13_READ = 45, /* 0x2d */ VMEXIT_DR14_READ = 46, /* 0x2e */ VMEXIT_DR15_READ = 47, /* 0x2f */ /* debug register write exitcodes */ VMEXIT_DR0_WRITE = 48, /* 0x30 */ VMEXIT_DR1_WRITE = 49, /* 0x31 */ VMEXIT_DR2_WRITE = 50, /* 0x32 */ VMEXIT_DR3_WRITE = 51, /* 0x33 */ VMEXIT_DR4_WRITE = 52, /* 0x34 */ VMEXIT_DR5_WRITE = 53, /* 0x35 */ VMEXIT_DR6_WRITE = 54, /* 0x36 */ VMEXIT_DR7_WRITE = 55, /* 0x37 */ VMEXIT_DR8_WRITE = 56, /* 0x38 */ VMEXIT_DR9_WRITE = 57, /* 0x39 */ VMEXIT_DR10_WRITE = 58, /* 0x3a */ VMEXIT_DR11_WRITE = 59, /* 0x3b */ VMEXIT_DR12_WRITE = 60, /* 0x3c */ VMEXIT_DR13_WRITE = 61, /* 0x3d */ 
VMEXIT_DR14_WRITE = 62, /* 0x3e */ VMEXIT_DR15_WRITE = 63, /* 0x3f */ /* processor exception exitcodes (VMEXIT_EXCP[0-31]) */ VMEXIT_EXCEPTION_DE = 64, /* 0x40, divide-by-zero-error */ VMEXIT_EXCEPTION_DB = 65, /* 0x41, debug */ VMEXIT_EXCEPTION_NMI = 66, /* 0x42, non-maskable-interrupt */ VMEXIT_EXCEPTION_BP = 67, /* 0x43, breakpoint */ VMEXIT_EXCEPTION_OF = 68, /* 0x44, overflow */ VMEXIT_EXCEPTION_BR = 69, /* 0x45, bound-range */ VMEXIT_EXCEPTION_UD = 70, /* 0x46, invalid-opcode*/ VMEXIT_EXCEPTION_NM = 71, /* 0x47, device-not-available */ VMEXIT_EXCEPTION_DF = 72, /* 0x48, double-fault */ VMEXIT_EXCEPTION_09 = 73, /* 0x49, unsupported (reserved) */ VMEXIT_EXCEPTION_TS = 74, /* 0x4a, invalid-tss */ VMEXIT_EXCEPTION_NP = 75, /* 0x4b, segment-not-present */ VMEXIT_EXCEPTION_SS = 76, /* 0x4c, stack */ VMEXIT_EXCEPTION_GP = 77, /* 0x4d, general-protection */ VMEXIT_EXCEPTION_PF = 78, /* 0x4f, page-fault */ VMEXIT_EXCEPTION_15 = 79, /* 0x50, reserved */ VMEXIT_EXCEPTION_MF = 80, /* 0x51, x87 floating-point exception-pending */ VMEXIT_EXCEPTION_AC = 81, /* 0x52, alignment-check */ VMEXIT_EXCEPTION_MC = 82, /* 0x53, machine-check */ VMEXIT_EXCEPTION_XF = 83, /* 0x54, simd floating-point */ /* exceptions 20-31 (exitcodes 84-95) are reserved */ /* ...and the rest of the #VMEXITs */ VMEXIT_INTR = 96, /* 0x60 */ VMEXIT_NMI = 97, /* 0x61 */ VMEXIT_SMI = 98, /* 0x62 */ VMEXIT_INIT = 99, /* 0x63 */ VMEXIT_VINTR = 100, /* 0x64 */ VMEXIT_CR0_SEL_WRITE = 101, /* 0x65 */ VMEXIT_IDTR_READ = 102, /* 0x66 */ VMEXIT_GDTR_READ = 103, /* 0x67 */ VMEXIT_LDTR_READ = 104, /* 0x68 */ VMEXIT_TR_READ = 105, /* 0x69 */ VMEXIT_IDTR_WRITE = 106, /* 0x6a */ VMEXIT_GDTR_WRITE = 107, /* 0x6b */ VMEXIT_LDTR_WRITE = 108, /* 0x6c */ VMEXIT_TR_WRITE = 109, /* 0x6d */ VMEXIT_RDTSC = 110, /* 0x6e */ VMEXIT_RDPMC = 111, /* 0x6f */ VMEXIT_PUSHF = 112, /* 0x70 */ VMEXIT_POPF = 113, /* 0x71 */ VMEXIT_CPUID = 114, /* 0x72 */ VMEXIT_RSM = 115, /* 0x73 */ VMEXIT_IRET = 116, /* 0x74 */ VMEXIT_SWINT = 117, /* 0x75 */ VMEXIT_INVD = 118, /* 0x76 */ VMEXIT_PAUSE = 119, /* 0x77 */ VMEXIT_HLT = 120, /* 0x78 */ VMEXIT_INVLPG = 121, /* 0x79 */ VMEXIT_INVLPGA = 122, /* 0x7a */ VMEXIT_IOIO = 123, /* 0x7b */ VMEXIT_MSR = 124, /* 0x7c */ VMEXIT_TASK_SWITCH = 125, /* 0x7d */ VMEXIT_FERR_FREEZE = 126, /* 0x7e */ VMEXIT_SHUTDOWN = 127, /* 0x7f */ VMEXIT_VMRUN = 128, /* 0x80 */ VMEXIT_VMMCALL = 129, /* 0x81 */ VMEXIT_VMLOAD = 130, /* 0x82 */ VMEXIT_VMSAVE = 131, /* 0x83 */ VMEXIT_STGI = 132, /* 0x84 */ VMEXIT_CLGI = 133, /* 0x85 */ VMEXIT_SKINIT = 134, /* 0x86 */ VMEXIT_RDTSCP = 135, /* 0x87 */ VMEXIT_ICEBP = 136, /* 0x88 */ VMEXIT_WBINVD = 137, /* 0x89 */ VMEXIT_MONITOR = 138, /* 0x8a */ VMEXIT_MWAIT = 139, /* 0x8b */ VMEXIT_MWAIT_CONDITIONAL= 140, /* 0x8c */ VMEXIT_XSETBV = 141, /* 0x8d */ VMEXIT_NPF = 1024, /* 0x400, nested paging fault */ VMEXIT_INVALID = -1 }; /* Definition of segment state is borrowed by the generic HVM code. 
*/ typedef struct segment_register svm_segment_register_t; typedef union { u64 bytes; struct { u64 vector: 8; u64 type: 3; u64 ev: 1; u64 resvd1: 19; u64 v: 1; u64 errorcode:32; } fields; } __attribute__ ((packed)) eventinj_t; typedef union { u64 bytes; struct { u64 tpr: 8; u64 irq: 1; u64 rsvd0: 7; u64 prio: 4; u64 ign_tpr: 1; u64 rsvd1: 3; u64 intr_masking: 1; u64 rsvd2: 7; u64 vector: 8; u64 rsvd3: 24; } fields; } __attribute__ ((packed)) vintr_t; typedef union { u64 bytes; struct { u64 type: 1; u64 rsv0: 1; u64 str: 1; u64 rep: 1; u64 sz8: 1; u64 sz16: 1; u64 sz32: 1; u64 rsv1: 9; u64 port: 16; } fields; } __attribute__ ((packed)) ioio_info_t; typedef union { u64 bytes; struct { u64 enable:1; } fields; } __attribute__ ((packed)) lbrctrl_t; typedef union { uint32_t bytes; struct { /* cr_intercepts, dr_intercepts, exception_intercepts, * general{1,2}_intercepts, pause_filter_count, tsc_offset */ uint32_t intercepts: 1; /* iopm_base_pa, msrpm_base_pa */ uint32_t iopm: 1; /* guest_asid */ uint32_t asid: 1; /* vintr */ uint32_t tpr: 1; /* np_enable, h_cr3, g_pat */ uint32_t np: 1; /* cr0, cr3, cr4, efer */ uint32_t cr: 1; /* dr6, dr7 */ uint32_t dr: 1; /* gdtr, idtr */ uint32_t dt: 1; /* cs, ds, es, ss, cpl */ uint32_t seg: 1; /* cr2 */ uint32_t cr2: 1; /* debugctlmsr, last{branch,int}{to,from}ip */ uint32_t lbr: 1; uint32_t resv: 21; } fields; } __attribute__ ((packed)) vmcbcleanbits_t; #define IOPM_SIZE (12 * 1024) #define MSRPM_SIZE (8 * 1024) struct vmcb_struct { u32 _cr_intercepts; /* offset 0x00 - cleanbit 0 */ u32 _dr_intercepts; /* offset 0x04 - cleanbit 0 */ u32 _exception_intercepts; /* offset 0x08 - cleanbit 0 */ u32 _general1_intercepts; /* offset 0x0C - cleanbit 0 */ u32 _general2_intercepts; /* offset 0x10 - cleanbit 0 */ u32 res01; /* offset 0x14 */ u64 res02; /* offset 0x18 */ u64 res03; /* offset 0x20 */ u64 res04; /* offset 0x28 */ u64 res05; /* offset 0x30 */ u32 res06; /* offset 0x38 */ u16 res06a; /* offset 0x3C */ u16 _pause_filter_count; /* offset 0x3E - cleanbit 0 */ u64 _iopm_base_pa; /* offset 0x40 - cleanbit 1 */ u64 _msrpm_base_pa; /* offset 0x48 - cleanbit 1 */ u64 _tsc_offset; /* offset 0x50 - cleanbit 0 */ u32 _guest_asid; /* offset 0x58 - cleanbit 2 */ u8 tlb_control; /* offset 0x5C */ u8 res07[3]; vintr_t _vintr; /* offset 0x60 - cleanbit 3 */ u64 interrupt_shadow; /* offset 0x68 */ u64 exitcode; /* offset 0x70 */ u64 exitinfo1; /* offset 0x78 */ u64 exitinfo2; /* offset 0x80 */ eventinj_t exitintinfo; /* offset 0x88 */ u64 _np_enable; /* offset 0x90 - cleanbit 4 */ u64 res08[2]; eventinj_t eventinj; /* offset 0xA8 */ u64 _h_cr3; /* offset 0xB0 - cleanbit 4 */ lbrctrl_t lbr_control; /* offset 0xB8 */ vmcbcleanbits_t cleanbits; /* offset 0xC0 */ u32 res09; /* offset 0xC4 */ u64 nextrip; /* offset 0xC8 */ u8 guest_ins_len; /* offset 0xD0 */ u8 guest_ins[15]; /* offset 0xD1 */ u64 res10a[100]; /* offset 0xE0 pad to save area */ svm_segment_register_t es; /* offset 1024 - cleanbit 8 */ svm_segment_register_t cs; /* cleanbit 8 */ svm_segment_register_t ss; /* cleanbit 8 */ svm_segment_register_t ds; /* cleanbit 8 */ svm_segment_register_t fs; svm_segment_register_t gs; svm_segment_register_t gdtr; /* cleanbit 7 */ svm_segment_register_t ldtr; svm_segment_register_t idtr; /* cleanbit 7 */ svm_segment_register_t tr; u64 res10[5]; u8 res11[3]; u8 _cpl; /* cleanbit 8 */ u32 res12; u64 _efer; /* offset 1024 + 0xD0 - cleanbit 5 */ u64 res13[14]; u64 _cr4; /* offset 1024 + 0x148 - cleanbit 5 */ u64 _cr3; /* cleanbit 5 */ u64 _cr0; /* cleanbit 5 */ u64 _dr7; /* cleanbit 
6 */ u64 _dr6; /* cleanbit 6 */ u64 rflags; u64 rip; u64 res14[11]; u64 rsp; u64 res15[3]; u64 rax; u64 star; u64 lstar; u64 cstar; u64 sfmask; u64 kerngsbase; u64 sysenter_cs; u64 sysenter_esp; u64 sysenter_eip; u64 _cr2; /* cleanbit 9 */ u64 pdpe0; u64 pdpe1; u64 pdpe2; u64 pdpe3; u64 _g_pat; /* cleanbit 4 */ u64 _debugctlmsr; /* cleanbit 10 */ u64 _lastbranchfromip; /* cleanbit 10 */ u64 _lastbranchtoip; /* cleanbit 10 */ u64 _lastintfromip; /* cleanbit 10 */ u64 _lastinttoip; /* cleanbit 10 */ u64 res16[301]; } __attribute__ ((packed)); struct svm_domain { }; struct arch_svm_struct { struct vmcb_struct *vmcb; u64 vmcb_pa; unsigned long *msrpm; int launch_core; bool_t vmcb_in_sync; /* VMCB sync'ed with VMSAVE? */ /* VMCB has a cached instruction from #PF/#NPF Decode Assist? */ uint8_t cached_insn_len; /* Zero if no cached instruction. */ /* Upper four bytes are undefined in the VMCB, therefore we can't * use the fields in the VMCB. Write a 64bit value and then read a 64bit * value is fine unless there's a VMRUN/VMEXIT in between which clears * the upper four bytes. */ uint64_t guest_sysenter_cs; uint64_t guest_sysenter_esp; uint64_t guest_sysenter_eip; /* AMD lightweight profiling MSR */ uint64_t guest_lwp_cfg; /* guest version */ uint64_t cpu_lwp_cfg; /* CPU version */ /* OSVW MSRs */ struct { u64 length; u64 status; } osvw; }; struct vmcb_struct *alloc_vmcb(void); struct host_save_area *alloc_host_save_area(void); void free_vmcb(struct vmcb_struct *vmcb); int svm_create_vmcb(struct vcpu *v); void svm_destroy_vmcb(struct vcpu *v); void setup_vmcb_dump(void); #define MSR_INTERCEPT_NONE 0 #define MSR_INTERCEPT_READ 1 #define MSR_INTERCEPT_WRITE 2 #define MSR_INTERCEPT_RW (MSR_INTERCEPT_WRITE | MSR_INTERCEPT_READ) void svm_intercept_msr(struct vcpu *v, uint32_t msr, int enable); #define svm_disable_intercept_for_msr(v, msr) svm_intercept_msr((v), (msr), MSR_INTERCEPT_NONE) #define svm_enable_intercept_for_msr(v, msr) svm_intercept_msr((v), (msr), MSR_INTERCEPT_RW) /* * VMCB accessor functions. */ #define VMCB_ACCESSORS(_type, _name, _cleanbit) \ static inline void vmcb_set_##_name(struct vmcb_struct *vmcb, _type value) \ { \ vmcb->_##_name = value; \ vmcb->cleanbits.fields._cleanbit = 0; \ } \ static inline _type vmcb_get_##_name(struct vmcb_struct *vmcb) \ { \ return vmcb->_##_name; \ } VMCB_ACCESSORS(u32, cr_intercepts, intercepts) VMCB_ACCESSORS(u32, dr_intercepts, intercepts) VMCB_ACCESSORS(u32, exception_intercepts, intercepts) VMCB_ACCESSORS(u32, general1_intercepts, intercepts) VMCB_ACCESSORS(u32, general2_intercepts, intercepts) VMCB_ACCESSORS(u16, pause_filter_count, intercepts) VMCB_ACCESSORS(u64, tsc_offset, intercepts) VMCB_ACCESSORS(u64, iopm_base_pa, iopm) VMCB_ACCESSORS(u64, msrpm_base_pa, iopm) VMCB_ACCESSORS(u32, guest_asid, asid) VMCB_ACCESSORS(vintr_t, vintr, tpr) VMCB_ACCESSORS(u64, np_enable, np) VMCB_ACCESSORS(u64, h_cr3, np) VMCB_ACCESSORS(u64, g_pat, np) VMCB_ACCESSORS(u64, cr0, cr) VMCB_ACCESSORS(u64, cr3, cr) VMCB_ACCESSORS(u64, cr4, cr) VMCB_ACCESSORS(u64, efer, cr) VMCB_ACCESSORS(u64, dr6, dr) VMCB_ACCESSORS(u64, dr7, dr) /* Updates are all via hvm_set_segment_register(). 
*/ /* VMCB_ACCESSORS(svm_segment_register_t, gdtr, dt) */ /* VMCB_ACCESSORS(svm_segment_register_t, idtr, dt) */ /* VMCB_ACCESSORS(svm_segment_register_t, cs, seg) */ /* VMCB_ACCESSORS(svm_segment_register_t, ds, seg) */ /* VMCB_ACCESSORS(svm_segment_register_t, es, seg) */ /* VMCB_ACCESSORS(svm_segment_register_t, ss, seg) */ VMCB_ACCESSORS(u8, cpl, seg) VMCB_ACCESSORS(u64, cr2, cr2) VMCB_ACCESSORS(u64, debugctlmsr, lbr) VMCB_ACCESSORS(u64, lastbranchfromip, lbr) VMCB_ACCESSORS(u64, lastbranchtoip, lbr) VMCB_ACCESSORS(u64, lastintfromip, lbr) VMCB_ACCESSORS(u64, lastinttoip, lbr) #undef VMCB_ACCESSORS #endif /* ASM_X86_HVM_SVM_VMCS_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h0000664000175000017500000002201612307313555020662 0ustar smbsmb/* * Copyright (C) 2007 Advanced Micro Devices, Inc. * Author: Leo Duran * Author: Wei Wang - adapted to xen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _ASM_X86_64_AMD_IOMMU_PROTO_H #define _ASM_X86_64_AMD_IOMMU_PROTO_H #include #include #include #include struct acpi_ivrs_hardware; #define for_each_amd_iommu(amd_iommu) \ list_for_each_entry(amd_iommu, \ &amd_iommu_head, list) #define DMA_32BIT_MASK 0x00000000ffffffffULL #define AMD_IOMMU_DEBUG(fmt, args...) 
\ do \ { \ if ( iommu_debug ) \ printk(XENLOG_INFO "AMD-Vi: " fmt, ## args); \ } while(0) /* amd-iommu-detect functions */ int amd_iommu_get_ivrs_dev_entries(void); int amd_iommu_detect_one_acpi(const struct acpi_ivrs_hardware *); int amd_iommu_detect_acpi(void); void get_iommu_features(struct amd_iommu *iommu); /* amd-iommu-init functions */ int amd_iommu_init(void); int amd_iommu_update_ivrs_mapping_acpi(void); /* mapping functions */ int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int flags); int amd_iommu_unmap_page(struct domain *d, unsigned long gfn); u64 amd_iommu_get_next_table_from_pte(u32 *entry); int amd_iommu_reserve_domain_unity_map(struct domain *domain, u64 phys_addr, unsigned long size, int iw, int ir); /* Share p2m table with iommu */ void amd_iommu_share_p2m(struct domain *d); /* device table functions */ int get_dma_requestor_id(u16 seg, u16 bdf); void amd_iommu_set_intremap_table( u32 *dte, u64 intremap_ptr, u8 int_valid); void amd_iommu_set_root_page_table( u32 *dte, u64 root_ptr, u16 domain_id, u8 paging_mode, u8 valid); void iommu_dte_set_iotlb(u32 *dte, u8 i); void iommu_dte_add_device_entry(u32 *dte, struct ivrs_mappings *ivrs_dev); void iommu_dte_set_guest_cr3(u32 *dte, u16 dom_id, u64 gcr3, int gv, unsigned int glx); /* send cmd to iommu */ void amd_iommu_flush_all_pages(struct domain *d); void amd_iommu_flush_pages(struct domain *d, unsigned long gfn, unsigned int order); void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev *pdev, uint64_t gaddr, unsigned int order); void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf); void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf); void amd_iommu_flush_all_caches(struct amd_iommu *iommu); /* find iommu for bdf */ struct amd_iommu *find_iommu_for_device(int seg, int bdf); /* interrupt remapping */ int amd_iommu_setup_ioapic_remapping(void); void *amd_iommu_alloc_intremap_table(unsigned long **); int amd_iommu_free_intremap_table(u16 seg, struct ivrs_mappings *); void amd_iommu_ioapic_update_ire( unsigned int apic, unsigned int reg, unsigned int value); unsigned int amd_iommu_read_ioapic_from_ire( unsigned int apic, unsigned int reg); int amd_iommu_msi_msg_update_ire( struct msi_desc *msi_desc, struct msi_msg *msg); void amd_iommu_read_msi_from_ire( struct msi_desc *msi_desc, struct msi_msg *msg); int amd_setup_hpet_msi(struct msi_desc *msi_desc); extern struct ioapic_sbdf { u16 bdf, seg; u16 *pin_2_idx; } ioapic_sbdf[MAX_IO_APICS]; extern struct hpet_sbdf { u16 bdf, seg, id; enum { HPET_NONE, HPET_CMDL, HPET_IVHD, } init; } hpet_sbdf; extern void *shared_intremap_table; extern unsigned long *shared_intremap_inuse; /* power management support */ void amd_iommu_resume(void); void amd_iommu_suspend(void); void amd_iommu_crash_shutdown(void); /* guest iommu support */ void amd_iommu_send_guest_cmd(struct amd_iommu *iommu, u32 cmd[]); void guest_iommu_add_ppr_log(struct domain *d, u32 entry[]); void guest_iommu_add_event_log(struct domain *d, u32 entry[]); int guest_iommu_init(struct domain* d); void guest_iommu_destroy(struct domain *d); int guest_iommu_set_base(struct domain *d, uint64_t base); static inline u32 get_field_from_reg_u32(u32 reg_value, u32 mask, u32 shift) { u32 field; field = (reg_value & mask) >> shift; return field; } static inline u32 set_field_in_reg_u32(u32 field, u32 reg_value, u32 mask, u32 shift, u32 *reg) { reg_value &= ~mask; reg_value |= (field << shift) & mask; if (reg) *reg = reg_value; return reg_value; } static inline u8 
get_field_from_byte(u8 value, u8 mask) { return (value & mask) / (mask & -mask); } static inline unsigned long region_to_pages(unsigned long addr, unsigned long size) { return (PAGE_ALIGN(addr + size) - (addr & PAGE_MASK)) >> PAGE_SHIFT; } static inline struct page_info* alloc_amd_iommu_pgtable(void) { struct page_info *pg; void *vaddr; pg = alloc_domheap_page(NULL, 0); if ( pg == NULL ) return 0; vaddr = __map_domain_page(pg); memset(vaddr, 0, PAGE_SIZE); unmap_domain_page(vaddr); return pg; } static inline void free_amd_iommu_pgtable(struct page_info *pg) { if ( pg != 0 ) free_domheap_page(pg); } static inline void* __alloc_amd_iommu_tables(int order) { void *buf; buf = alloc_xenheap_pages(order, 0); return buf; } static inline void __free_amd_iommu_tables(void *table, int order) { free_xenheap_pages(table, order); } static inline void iommu_set_bit(uint32_t *reg, uint32_t bit) { set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, *reg, 1U << bit, bit, reg); } static inline void iommu_clear_bit(uint32_t *reg, uint32_t bit) { set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, *reg, 1U << bit, bit, reg); } static inline uint32_t iommu_get_bit(uint32_t reg, uint32_t bit) { return get_field_from_reg_u32(reg, 1U << bit, bit); } static inline int iommu_has_cap(struct amd_iommu *iommu, uint32_t bit) { return !!(iommu->cap.header & (1u << bit)); } static inline int iommu_has_feature(struct amd_iommu *iommu, uint32_t bit) { if ( !iommu_has_cap(iommu, PCI_CAP_EFRSUP_SHIFT) ) return 0; return !!(iommu->features & (1U << bit)); } /* access tail or head pointer of ring buffer */ static inline uint32_t iommu_get_rb_pointer(uint32_t reg) { return get_field_from_reg_u32(reg, IOMMU_RING_BUFFER_PTR_MASK, IOMMU_RING_BUFFER_PTR_SHIFT); } static inline void iommu_set_rb_pointer(uint32_t *reg, uint32_t val) { set_field_in_reg_u32(val, *reg, IOMMU_RING_BUFFER_PTR_MASK, IOMMU_RING_BUFFER_PTR_SHIFT, reg); } /* access device id field from iommu cmd */ static inline uint16_t iommu_get_devid_from_cmd(uint32_t cmd) { return get_field_from_reg_u32(cmd, IOMMU_CMD_DEVICE_ID_MASK, IOMMU_CMD_DEVICE_ID_SHIFT); } static inline void iommu_set_devid_to_cmd(uint32_t *cmd, uint16_t id) { set_field_in_reg_u32(id, *cmd, IOMMU_CMD_DEVICE_ID_MASK, IOMMU_CMD_DEVICE_ID_SHIFT, cmd); } /* access address field from iommu cmd */ static inline uint32_t iommu_get_addr_lo_from_cmd(uint32_t cmd) { return get_field_from_reg_u32(cmd, IOMMU_CMD_ADDR_LOW_MASK, IOMMU_CMD_ADDR_LOW_SHIFT); } static inline uint32_t iommu_get_addr_hi_from_cmd(uint32_t cmd) { return get_field_from_reg_u32(cmd, IOMMU_CMD_ADDR_LOW_MASK, IOMMU_CMD_ADDR_HIGH_SHIFT); } /* access address field from event log entry */ #define iommu_get_devid_from_event iommu_get_devid_from_cmd /* access iommu base addresses field from mmio regs */ static inline void iommu_set_addr_lo_to_reg(uint32_t *reg, uint32_t addr) { set_field_in_reg_u32(addr, *reg, IOMMU_REG_BASE_ADDR_LOW_MASK, IOMMU_REG_BASE_ADDR_LOW_SHIFT, reg); } static inline void iommu_set_addr_hi_to_reg(uint32_t *reg, uint32_t addr) { set_field_in_reg_u32(addr, *reg, IOMMU_REG_BASE_ADDR_HIGH_MASK, IOMMU_REG_BASE_ADDR_HIGH_SHIFT, reg); } static inline int iommu_is_pte_present(const u32 *entry) { return get_field_from_reg_u32(entry[0], IOMMU_PDE_PRESENT_MASK, IOMMU_PDE_PRESENT_SHIFT); } static inline unsigned int iommu_next_level(const u32 *entry) { return get_field_from_reg_u32(entry[0], IOMMU_PDE_NEXT_LEVEL_MASK, IOMMU_PDE_NEXT_LEVEL_SHIFT); } #endif /* _ASM_X86_64_AMD_IOMMU_PROTO_H */ 
xen-4.4.0/xen/include/asm-x86/hvm/svm/nestedsvm.h0000664000175000017500000001274212307313555017651 0ustar smbsmb/* * nestedsvm.h: Nested Virtualization * Copyright (c) 2011, Advanced Micro Devices, Inc * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #ifndef __ASM_X86_HVM_SVM_NESTEDSVM_H__ #define __ASM_X86_HVM_SVM_NESTEDSVM_H__ #include #include /* SVM specific intblk types, cannot be an enum because gcc 4.5 complains */ /* GIF cleared */ #define hvm_intblk_svm_gif hvm_intblk_arch struct nestedsvm { bool_t ns_gif; uint64_t ns_msr_hsavepa; /* MSR HSAVE_PA value */ /* l1 guest physical address of virtual vmcb used by prior VMRUN. * Needed for VMCB Cleanbit emulation. */ uint64_t ns_ovvmcb_pa; /* virtual tscratio holding the value l1 guest writes to the * MSR_AMD64_TSC_RATIO MSR. */ uint64_t ns_tscratio; /* Cached real intercepts of the l2 guest */ uint32_t ns_cr_intercepts; uint32_t ns_dr_intercepts; uint32_t ns_exception_intercepts; uint32_t ns_general1_intercepts; uint32_t ns_general2_intercepts; /* Cached real lbr of the l2 guest */ lbrctrl_t ns_lbr_control; /* Cached real MSR permission bitmaps of the l2 guest */ unsigned long *ns_cached_msrpm; /* Merged MSR permission bitmap */ unsigned long *ns_merged_msrpm; /* guest physical address of virtual io permission map */ paddr_t ns_iomap_pa, ns_oiomap_pa; /* Shadow io permission map */ unsigned long *ns_iomap; uint64_t ns_cr0; /* Cached guest_cr[0] of l1 guest while l2 guest runs. * Needed to handle FPU context switching */ /* Cache guest cr3/host cr3 the guest sets up for the l2 guest. * Used by Shadow-on-Shadow and Nested-on-Nested. 
* ns_vmcb_guestcr3: in l2 guest physical address space and points to * the l2 guest page table * ns_vmcb_hostcr3: in l1 guest physical address space and points to * the l1 guest nested page table */ uint64_t ns_vmcb_guestcr3, ns_vmcb_hostcr3; uint32_t ns_guest_asid; bool_t ns_hap_enabled; /* Only meaningful when vmexit_pending flag is set */ struct { uint64_t exitcode; /* native exitcode to inject into l1 guest */ uint64_t exitinfo1; /* additional information to the exitcode */ uint64_t exitinfo2; /* additional information to the exitcode */ } ns_vmexit; union { uint32_t bytes; struct { uint32_t rflagsif: 1; uint32_t vintrmask: 1; uint32_t reserved: 30; } fields; } ns_hostflags; }; #define vcpu_nestedsvm(v) (vcpu_nestedhvm(v).u.nsvm) /* True when l1 guest enabled SVM in EFER */ #define nsvm_efer_svm_enabled(v) \ (!!((v)->arch.hvm_vcpu.guest_efer & EFER_SVME)) int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr); void nestedsvm_vmexit_defer(struct vcpu *v, uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2); enum nestedhvm_vmexits nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs); enum nestedhvm_vmexits nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode); /* Interface methods */ void nsvm_vcpu_destroy(struct vcpu *v); int nsvm_vcpu_initialise(struct vcpu *v); int nsvm_vcpu_reset(struct vcpu *v); int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs); int nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs); int nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode); int nsvm_vcpu_vmexit_trap(struct vcpu *v, struct hvm_trap *trap); uint64_t nsvm_vcpu_guestcr3(struct vcpu *v); uint64_t nsvm_vcpu_hostcr3(struct vcpu *v); uint32_t nsvm_vcpu_asid(struct vcpu *v); int nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode); int nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr, int errcode); bool_t nsvm_vmcb_hap_enabled(struct vcpu *v); enum hvm_intblk nsvm_intr_blocked(struct vcpu *v); /* MSRs */ int nsvm_rdmsr(struct vcpu *v, unsigned int msr, uint64_t *msr_content); int nsvm_wrmsr(struct vcpu *v, unsigned int msr, uint64_t msr_content); /* Interrupts, vGIF */ void svm_vmexit_do_clgi(struct cpu_user_regs *regs, struct vcpu *v); void svm_vmexit_do_stgi(struct cpu_user_regs *regs, struct vcpu *v); bool_t nestedsvm_gif_isset(struct vcpu *v); int nsvm_hap_walk_L1_p2m(struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa, unsigned int *page_order, uint8_t *p2m_acc, bool_t access_r, bool_t access_w, bool_t access_x); #define NSVM_INTR_NOTHANDLED 3 #define NSVM_INTR_NOTINTERCEPTED 2 #define NSVM_INTR_FORCEVMEXIT 1 #define NSVM_INTR_MASKED 0 int nestedsvm_vcpu_interrupt(struct vcpu *v, const struct hvm_intack intack); #endif /* ASM_X86_HVM_SVM_NESTEDSVM_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/hvm/svm/amd-iommu-defs.h0000664000175000017500000005222212307313555020442 0ustar smbsmb/* * Copyright (C) 2007 Advanced Micro Devices, Inc. * Author: Leo Duran * Author: Wei Wang - adapted to xen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _ASM_X86_64_AMD_IOMMU_DEFS_H #define _ASM_X86_64_AMD_IOMMU_DEFS_H /* IOMMU Command Buffer entries: in power of 2 increments, minimum of 256 */ #define IOMMU_CMD_BUFFER_DEFAULT_ENTRIES 512 /* IOMMU Event Log entries: in power of 2 increments, minimum of 256 */ #define IOMMU_EVENT_LOG_DEFAULT_ENTRIES 512 /* IOMMU PPR Log entries: in power of 2 increments, minimum of 256 */ #define IOMMU_PPR_LOG_DEFAULT_ENTRIES 512 #define PTE_PER_TABLE_SHIFT 9 #define PTE_PER_TABLE_SIZE (1 << PTE_PER_TABLE_SHIFT) #define PTE_PER_TABLE_MASK (~(PTE_PER_TABLE_SIZE - 1)) #define PTE_PER_TABLE_ALIGN(entries) \ (((entries) + PTE_PER_TABLE_SIZE - 1) & PTE_PER_TABLE_MASK) #define PTE_PER_TABLE_ALLOC(entries) \ PAGE_SIZE * (PTE_PER_TABLE_ALIGN(entries) >> PTE_PER_TABLE_SHIFT) #define amd_offset_level_address(offset, level) \ ((u64)(offset) << (12 + (PTE_PER_TABLE_SHIFT * \ (level - IOMMU_PAGING_MODE_LEVEL_1)))) #define PCI_MIN_CAP_OFFSET 0x40 #define PCI_MAX_CAP_BLOCKS 48 #define PCI_CAP_PTR_MASK 0xFC /* IOMMU Capability */ #define PCI_CAP_ID_MASK 0x000000FF #define PCI_CAP_ID_SHIFT 0 #define PCI_CAP_NEXT_PTR_MASK 0x0000FF00 #define PCI_CAP_NEXT_PTR_SHIFT 8 #define PCI_CAP_TYPE_MASK 0x00070000 #define PCI_CAP_TYPE_SHIFT 16 #define PCI_CAP_REV_MASK 0x00F80000 #define PCI_CAP_REV_SHIFT 19 #define PCI_CAP_IOTLB_MASK 0x01000000 #define PCI_CAP_IOTLB_SHIFT 24 #define PCI_CAP_HT_TUNNEL_MASK 0x02000000 #define PCI_CAP_HT_TUNNEL_SHIFT 25 #define PCI_CAP_NP_CACHE_MASK 0x04000000 #define PCI_CAP_NP_CACHE_SHIFT 26 #define PCI_CAP_EFRSUP_SHIFT 27 #define PCI_CAP_RESET_MASK 0x80000000 #define PCI_CAP_RESET_SHIFT 31 #define PCI_CAP_TYPE_IOMMU 0x3 #define PCI_CAP_MMIO_BAR_LOW_OFFSET 0x04 #define PCI_CAP_MMIO_BAR_HIGH_OFFSET 0x08 #define PCI_CAP_MMIO_BAR_LOW_MASK 0xFFFFC000 #define IOMMU_MMIO_REGION_LENGTH 0x4000 #define PCI_CAP_RANGE_OFFSET 0x0C #define PCI_CAP_BUS_NUMBER_MASK 0x0000FF00 #define PCI_CAP_BUS_NUMBER_SHIFT 8 #define PCI_CAP_FIRST_DEVICE_MASK 0x00FF0000 #define PCI_CAP_FIRST_DEVICE_SHIFT 16 #define PCI_CAP_LAST_DEVICE_MASK 0xFF000000 #define PCI_CAP_LAST_DEVICE_SHIFT 24 #define PCI_CAP_UNIT_ID_MASK 0x0000001F #define PCI_CAP_UNIT_ID_SHIFT 0 #define PCI_CAP_MISC_INFO_OFFSET 0x10 #define PCI_CAP_MSI_NUMBER_MASK 0x0000001F #define PCI_CAP_MSI_NUMBER_SHIFT 0 /* Device Table */ #define IOMMU_DEV_TABLE_BASE_LOW_OFFSET 0x00 #define IOMMU_DEV_TABLE_BASE_HIGH_OFFSET 0x04 #define IOMMU_DEV_TABLE_SIZE_MASK 0x000001FF #define IOMMU_DEV_TABLE_SIZE_SHIFT 0 #define IOMMU_DEV_TABLE_ENTRIES_PER_BUS 256 #define IOMMU_DEV_TABLE_ENTRY_SIZE 32 #define IOMMU_DEV_TABLE_U32_PER_ENTRY (IOMMU_DEV_TABLE_ENTRY_SIZE / 4) #define IOMMU_DEV_TABLE_SYS_MGT_DMA_ABORTED 0x0 #define IOMMU_DEV_TABLE_SYS_MGT_MSG_FORWARDED 0x1 #define IOMMU_DEV_TABLE_SYS_MGT_INT_FORWARDED 0x2 #define IOMMU_DEV_TABLE_SYS_MGT_DMA_FORWARDED 0x3 #define IOMMU_DEV_TABLE_IO_CONTROL_ABORTED 0x0 #define IOMMU_DEV_TABLE_IO_CONTROL_FORWARDED 0x1 #define IOMMU_DEV_TABLE_IO_CONTROL_TRANSLATED 0x2 #define IOMMU_DEV_TABLE_INT_CONTROL_ABORTED 0x0 #define IOMMU_DEV_TABLE_INT_CONTROL_FORWARDED 0x1 #define 
IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED 0x2 /* DeviceTable Entry[31:0] */ #define IOMMU_DEV_TABLE_VALID_MASK 0x00000001 #define IOMMU_DEV_TABLE_VALID_SHIFT 0 #define IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK 0x00000002 #define IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT 1 #define IOMMU_DEV_TABLE_PAGING_MODE_MASK 0x00000E00 #define IOMMU_DEV_TABLE_PAGING_MODE_SHIFT 9 #define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK 0xFFFFF000 #define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT 12 /* DeviceTable Entry[63:32] */ #define IOMMU_DEV_TABLE_GV_SHIFT 23 #define IOMMU_DEV_TABLE_GV_MASK 0x800000 #define IOMMU_DEV_TABLE_GLX_SHIFT 24 #define IOMMU_DEV_TABLE_GLX_MASK 0x3000000 #define IOMMU_DEV_TABLE_GCR3_1_SHIFT 26 #define IOMMU_DEV_TABLE_GCR3_1_MASK 0x1c000000 #define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK 0x000FFFFF #define IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT 0 #define IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK 0x20000000 #define IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT 29 #define IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_MASK 0x40000000 #define IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_SHIFT 30 /* DeviceTable Entry[95:64] */ #define IOMMU_DEV_TABLE_DOMAIN_ID_MASK 0x0000FFFF #define IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT 0 #define IOMMU_DEV_TABLE_GCR3_2_SHIFT 16 #define IOMMU_DEV_TABLE_GCR3_2_MASK 0xFFFF0000 /* DeviceTable Entry[127:96] */ #define IOMMU_DEV_TABLE_IOTLB_SUPPORT_MASK 0x00000001 #define IOMMU_DEV_TABLE_IOTLB_SUPPORT_SHIFT 0 #define IOMMU_DEV_TABLE_SUPRESS_LOGGED_PAGES_MASK 0x00000002 #define IOMMU_DEV_TABLE_SUPRESS_LOGGED_PAGES_SHIFT 1 #define IOMMU_DEV_TABLE_SUPRESS_ALL_PAGES_MASK 0x00000004 #define IOMMU_DEV_TABLE_SUPRESS_ALL_PAGES_SHIFT 2 #define IOMMU_DEV_TABLE_IO_CONTROL_MASK 0x00000018 #define IOMMU_DEV_TABLE_IO_CONTROL_SHIFT 3 #define IOMMU_DEV_TABLE_IOTLB_CACHE_HINT_MASK 0x00000020 #define IOMMU_DEV_TABLE_IOTLB_CACHE_HINT_SHIFT 5 #define IOMMU_DEV_TABLE_SNOOP_DISABLE_MASK 0x00000040 #define IOMMU_DEV_TABLE_SNOOP_DISABLE_SHIFT 6 #define IOMMU_DEV_TABLE_ALLOW_EXCLUSION_MASK 0x00000080 #define IOMMU_DEV_TABLE_ALLOW_EXCLUSION_SHIFT 7 #define IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_MASK 0x00000300 #define IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_SHIFT 8 /* DeviceTable Entry[159:128] */ #define IOMMU_DEV_TABLE_INT_VALID_MASK 0x00000001 #define IOMMU_DEV_TABLE_INT_VALID_SHIFT 0 #define IOMMU_DEV_TABLE_INT_TABLE_LENGTH_MASK 0x0000001E #define IOMMU_DEV_TABLE_INT_TABLE_LENGTH_SHIFT 1 #define IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_MASK 0x0000000020 #define IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_SHIFT 5 #define IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_MASK 0xFFFFFFC0 #define IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_SHIFT 6 #define IOMMU_DEV_TABLE_GCR3_3_SHIFT 11 #define IOMMU_DEV_TABLE_GCR3_3_MASK 0xfffff800 /* DeviceTable Entry[191:160] */ #define IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_MASK 0x000FFFFF #define IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_SHIFT 0 #define IOMMU_DEV_TABLE_IVHD_FLAGS_SHIFT 24 #define IOMMU_DEV_TABLE_IVHD_FLAGS_MASK 0xC7000000 #define IOMMU_DEV_TABLE_INT_CONTROL_MASK 0x30000000 #define IOMMU_DEV_TABLE_INT_CONTROL_SHIFT 28 /* Command Buffer */ #define IOMMU_CMD_BUFFER_BASE_LOW_OFFSET 0x08 #define IOMMU_CMD_BUFFER_BASE_HIGH_OFFSET 0x0C #define IOMMU_CMD_BUFFER_HEAD_OFFSET 0x2000 #define IOMMU_CMD_BUFFER_TAIL_OFFSET 0x2008 #define IOMMU_CMD_BUFFER_LENGTH_MASK 0x0F000000 #define IOMMU_CMD_BUFFER_LENGTH_SHIFT 24 #define IOMMU_CMD_BUFFER_ENTRY_SIZE 16 #define IOMMU_CMD_BUFFER_POWER_OF2_ENTRIES_PER_PAGE 8 #define IOMMU_CMD_BUFFER_U32_PER_ENTRY (IOMMU_CMD_BUFFER_ENTRY_SIZE / 4) #define IOMMU_CMD_OPCODE_MASK 0xF0000000 #define 
IOMMU_CMD_OPCODE_SHIFT 28 #define IOMMU_CMD_COMPLETION_WAIT 0x1 #define IOMMU_CMD_INVALIDATE_DEVTAB_ENTRY 0x2 #define IOMMU_CMD_INVALIDATE_IOMMU_PAGES 0x3 #define IOMMU_CMD_INVALIDATE_IOTLB_PAGES 0x4 #define IOMMU_CMD_INVALIDATE_INT_TABLE 0x5 #define IOMMU_CMD_COMPLETE_PPR_REQUEST 0x7 #define IOMMU_CMD_INVALIDATE_IOMMU_ALL 0x8 /* COMPLETION_WAIT command */ #define IOMMU_COMP_WAIT_DATA_BUFFER_SIZE 8 #define IOMMU_COMP_WAIT_DATA_BUFFER_ALIGNMENT 8 #define IOMMU_COMP_WAIT_S_FLAG_MASK 0x00000001 #define IOMMU_COMP_WAIT_S_FLAG_SHIFT 0 #define IOMMU_COMP_WAIT_I_FLAG_MASK 0x00000002 #define IOMMU_COMP_WAIT_I_FLAG_SHIFT 1 #define IOMMU_COMP_WAIT_F_FLAG_MASK 0x00000004 #define IOMMU_COMP_WAIT_F_FLAG_SHIFT 2 #define IOMMU_COMP_WAIT_ADDR_LOW_MASK 0xFFFFFFF8 #define IOMMU_COMP_WAIT_ADDR_LOW_SHIFT 3 #define IOMMU_COMP_WAIT_ADDR_HIGH_MASK 0x000FFFFF #define IOMMU_COMP_WAIT_ADDR_HIGH_SHIFT 0 /* INVALIDATE_IOMMU_PAGES command */ #define IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK 0x0000FFFF #define IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT 0 #define IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK 0x00000001 #define IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT 0 #define IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK 0x00000002 #define IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT 1 #define IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK 0xFFFFF000 #define IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT 12 #define IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK 0xFFFFFFFF #define IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT 0 /* INVALIDATE_DEVTAB_ENTRY command */ #define IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_MASK 0x0000FFFF #define IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_SHIFT 0 /* INVALIDATE_INTERRUPT_TABLE command */ #define IOMMU_INV_INT_TABLE_DEVICE_ID_MASK 0x0000FFFF #define IOMMU_INV_INT_TABLE_DEVICE_ID_SHIFT 0 /* INVALIDATE_IOTLB_PAGES command */ #define IOMMU_INV_IOTLB_PAGES_MAXPEND_MASK 0xff000000 #define IOMMU_INV_IOTLB_PAGES_MAXPEND_SHIFT 24 #define IOMMU_INV_IOTLB_PAGES_PASID1_MASK 0x00ff0000 #define IOMMU_INV_IOTLB_PAGES_PASID1_SHIFT 16 #define IOMMU_INV_IOTLB_PAGES_PASID2_MASK 0x0fff0000 #define IOMMU_INV_IOTLB_PAGES_PASID2_SHIFT 16 #define IOMMU_INV_IOTLB_PAGES_QUEUEID_MASK 0x0000ffff #define IOMMU_INV_IOTLB_PAGES_QUEUEID_SHIFT 0 #define IOMMU_INV_IOTLB_PAGES_DEVICE_ID_MASK 0x0000FFFF #define IOMMU_INV_IOTLB_PAGES_DEVICE_ID_SHIFT 0 #define IOMMU_INV_IOTLB_PAGES_ADDR_LOW_MASK 0xFFFFF000 #define IOMMU_INV_IOTLB_PAGES_ADDR_LOW_SHIFT 12 #define IOMMU_INV_IOTLB_PAGES_ADDR_HIGH_MASK 0xFFFFFFFF #define IOMMU_INV_IOTLB_PAGES_ADDR_HIGH_SHIFT 0 #define IOMMU_INV_IOTLB_PAGES_S_FLAG_MASK 0x00000001 #define IOMMU_INV_IOTLB_PAGES_S_FLAG_SHIFT 0 /* Event Log */ #define IOMMU_EVENT_LOG_BASE_LOW_OFFSET 0x10 #define IOMMU_EVENT_LOG_BASE_HIGH_OFFSET 0x14 #define IOMMU_EVENT_LOG_HEAD_OFFSET 0x2010 #define IOMMU_EVENT_LOG_TAIL_OFFSET 0x2018 #define IOMMU_EVENT_LOG_LENGTH_MASK 0x0F000000 #define IOMMU_EVENT_LOG_LENGTH_SHIFT 24 #define IOMMU_EVENT_LOG_HEAD_MASK 0x0007FFF0 #define IOMMU_EVENT_LOG_HEAD_SHIFT 4 #define IOMMU_EVENT_LOG_TAIL_MASK 0x0007FFF0 #define IOMMU_EVENT_LOG_TAIL_SHIFT 4 #define IOMMU_EVENT_LOG_ENTRY_SIZE 16 #define IOMMU_EVENT_LOG_POWER_OF2_ENTRIES_PER_PAGE 8 #define IOMMU_EVENT_LOG_U32_PER_ENTRY (IOMMU_EVENT_LOG_ENTRY_SIZE / 4) #define IOMMU_EVENT_CODE_MASK 0xF0000000 #define IOMMU_EVENT_CODE_SHIFT 28 #define IOMMU_EVENT_ILLEGAL_DEV_TABLE_ENTRY 0x1 #define IOMMU_EVENT_IO_PAGE_FAULT 0x2 #define IOMMU_EVENT_DEV_TABLE_HW_ERROR 0x3 #define IOMMU_EVENT_PAGE_TABLE_HW_ERROR 0x4 #define IOMMU_EVENT_ILLEGAL_COMMAND_ERROR 0x5 #define IOMMU_EVENT_COMMAND_HW_ERROR 0x6 #define IOMMU_EVENT_IOTLB_INV_TIMEOUT 0x7 
#define IOMMU_EVENT_INVALID_DEV_REQUEST 0x8 #define IOMMU_EVENT_DOMAIN_ID_MASK 0x0000FFFF #define IOMMU_EVENT_DOMAIN_ID_SHIFT 0 #define IOMMU_EVENT_DEVICE_ID_MASK 0x0000FFFF #define IOMMU_EVENT_DEVICE_ID_SHIFT 0 #define IOMMU_EVENT_FLAGS_SHIFT 16 #define IOMMU_EVENT_FLAGS_MASK 0x0FFF0000 /* PPR Log */ #define IOMMU_PPR_LOG_ENTRY_SIZE 16 #define IOMMU_PPR_LOG_POWER_OF2_ENTRIES_PER_PAGE 8 #define IOMMU_PPR_LOG_U32_PER_ENTRY (IOMMU_PPR_LOG_ENTRY_SIZE / 4) #define IOMMU_PPR_LOG_BASE_LOW_OFFSET 0x0038 #define IOMMU_PPR_LOG_BASE_HIGH_OFFSET 0x003C #define IOMMU_PPR_LOG_BASE_LOW_MASK 0xFFFFF000 #define IOMMU_PPR_LOG_BASE_LOW_SHIFT 12 #define IOMMU_PPR_LOG_BASE_HIGH_MASK 0x000FFFFF #define IOMMU_PPR_LOG_BASE_HIGH_SHIFT 0 #define IOMMU_PPR_LOG_LENGTH_MASK 0x0F000000 #define IOMMU_PPR_LOG_LENGTH_SHIFT 24 #define IOMMU_PPR_LOG_HEAD_MASK 0x0007FFF0 #define IOMMU_PPR_LOG_HEAD_SHIFT 4 #define IOMMU_PPR_LOG_TAIL_MASK 0x0007FFF0 #define IOMMU_PPR_LOG_TAIL_SHIFT 4 #define IOMMU_PPR_LOG_HEAD_OFFSET 0x2030 #define IOMMU_PPR_LOG_TAIL_OFFSET 0x2038 #define IOMMU_PPR_LOG_DEVICE_ID_MASK 0x0000FFFF #define IOMMU_PPR_LOG_DEVICE_ID_SHIFT 0 #define IOMMU_PPR_LOG_CODE_MASK 0xF0000000 #define IOMMU_PPR_LOG_CODE_SHIFT 28 #define IOMMU_LOG_ENTRY_TIMEOUT 1000 /* Control Register */ #define IOMMU_CONTROL_MMIO_OFFSET 0x18 #define IOMMU_CONTROL_TRANSLATION_ENABLE_MASK 0x00000001 #define IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT 0 #define IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_MASK 0x00000002 #define IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT 1 #define IOMMU_CONTROL_EVENT_LOG_ENABLE_MASK 0x00000004 #define IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT 2 #define IOMMU_CONTROL_EVENT_LOG_INT_MASK 0x00000008 #define IOMMU_CONTROL_EVENT_LOG_INT_SHIFT 3 #define IOMMU_CONTROL_COMP_WAIT_INT_MASK 0x00000010 #define IOMMU_CONTROL_COMP_WAIT_INT_SHIFT 4 #define IOMMU_CONTROL_INVALIDATION_TIMEOUT_MASK 0x000000E0 #define IOMMU_CONTROL_INVALIDATION_TIMEOUT_SHIFT 5 #define IOMMU_CONTROL_PASS_POSTED_WRITE_MASK 0x00000100 #define IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT 8 #define IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_MASK 0x00000200 #define IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT 9 #define IOMMU_CONTROL_COHERENT_MASK 0x00000400 #define IOMMU_CONTROL_COHERENT_SHIFT 10 #define IOMMU_CONTROL_ISOCHRONOUS_MASK 0x00000800 #define IOMMU_CONTROL_ISOCHRONOUS_SHIFT 11 #define IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_MASK 0x00001000 #define IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT 12 #define IOMMU_CONTROL_PPR_LOG_ENABLE_MASK 0x00002000 #define IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT 13 #define IOMMU_CONTROL_PPR_LOG_INT_MASK 0x00004000 #define IOMMU_CONTROL_PPR_LOG_INT_SHIFT 14 #define IOMMU_CONTROL_PPR_ENABLE_MASK 0x00008000 #define IOMMU_CONTROL_PPR_ENABLE_SHIFT 15 #define IOMMU_CONTROL_GT_ENABLE_MASK 0x00010000 #define IOMMU_CONTROL_GT_ENABLE_SHIFT 16 #define IOMMU_CONTROL_RESTART_MASK 0x80000000 #define IOMMU_CONTROL_RESTART_SHIFT 31 /* Exclusion Register */ #define IOMMU_EXCLUSION_BASE_LOW_OFFSET 0x20 #define IOMMU_EXCLUSION_BASE_HIGH_OFFSET 0x24 #define IOMMU_EXCLUSION_LIMIT_LOW_OFFSET 0x28 #define IOMMU_EXCLUSION_LIMIT_HIGH_OFFSET 0x2C #define IOMMU_EXCLUSION_BASE_LOW_MASK 0xFFFFF000 #define IOMMU_EXCLUSION_BASE_LOW_SHIFT 12 #define IOMMU_EXCLUSION_BASE_HIGH_MASK 0xFFFFFFFF #define IOMMU_EXCLUSION_BASE_HIGH_SHIFT 0 #define IOMMU_EXCLUSION_RANGE_ENABLE_MASK 0x00000001 #define IOMMU_EXCLUSION_RANGE_ENABLE_SHIFT 0 #define IOMMU_EXCLUSION_ALLOW_ALL_MASK 0x00000002 #define IOMMU_EXCLUSION_ALLOW_ALL_SHIFT 1 #define IOMMU_EXCLUSION_LIMIT_LOW_MASK 0xFFFFF000 #define 
IOMMU_EXCLUSION_LIMIT_LOW_SHIFT 12 #define IOMMU_EXCLUSION_LIMIT_HIGH_MASK 0xFFFFFFFF #define IOMMU_EXCLUSION_LIMIT_HIGH_SHIFT 0 /* Extended Feature Register*/ #define IOMMU_EXT_FEATURE_MMIO_OFFSET 0x30 #define IOMMU_EXT_FEATURE_PREFSUP_SHIFT 0x0 #define IOMMU_EXT_FEATURE_PPRSUP_SHIFT 0x1 #define IOMMU_EXT_FEATURE_XTSUP_SHIFT 0x2 #define IOMMU_EXT_FEATURE_NXSUP_SHIFT 0x3 #define IOMMU_EXT_FEATURE_GTSUP_SHIFT 0x4 #define IOMMU_EXT_FEATURE_IASUP_SHIFT 0x6 #define IOMMU_EXT_FEATURE_GASUP_SHIFT 0x7 #define IOMMU_EXT_FEATURE_HESUP_SHIFT 0x8 #define IOMMU_EXT_FEATURE_PCSUP_SHIFT 0x9 #define IOMMU_EXT_FEATURE_HATS_SHIFT 0x10 #define IOMMU_EXT_FEATURE_HATS_MASK 0x00000C00 #define IOMMU_EXT_FEATURE_GATS_SHIFT 0x12 #define IOMMU_EXT_FEATURE_GATS_MASK 0x00003000 #define IOMMU_EXT_FEATURE_GLXSUP_SHIFT 0x14 #define IOMMU_EXT_FEATURE_GLXSUP_MASK 0x0000C000 #define IOMMU_EXT_FEATURE_PASMAX_SHIFT 0x0 #define IOMMU_EXT_FEATURE_PASMAX_MASK 0x0000001F /* Status Register*/ #define IOMMU_STATUS_MMIO_OFFSET 0x2020 #define IOMMU_STATUS_EVENT_OVERFLOW_MASK 0x00000001 #define IOMMU_STATUS_EVENT_OVERFLOW_SHIFT 0 #define IOMMU_STATUS_EVENT_LOG_INT_MASK 0x00000002 #define IOMMU_STATUS_EVENT_LOG_INT_SHIFT 1 #define IOMMU_STATUS_COMP_WAIT_INT_MASK 0x00000004 #define IOMMU_STATUS_COMP_WAIT_INT_SHIFT 2 #define IOMMU_STATUS_EVENT_LOG_RUN_MASK 0x00000008 #define IOMMU_STATUS_EVENT_LOG_RUN_SHIFT 3 #define IOMMU_STATUS_CMD_BUFFER_RUN_MASK 0x00000010 #define IOMMU_STATUS_CMD_BUFFER_RUN_SHIFT 4 #define IOMMU_STATUS_PPR_LOG_OVERFLOW_MASK 0x00000020 #define IOMMU_STATUS_PPR_LOG_OVERFLOW_SHIFT 5 #define IOMMU_STATUS_PPR_LOG_INT_MASK 0x00000040 #define IOMMU_STATUS_PPR_LOG_INT_SHIFT 6 #define IOMMU_STATUS_PPR_LOG_RUN_MASK 0x00000080 #define IOMMU_STATUS_PPR_LOG_RUN_SHIFT 7 #define IOMMU_STATUS_GAPIC_LOG_OVERFLOW_MASK 0x00000100 #define IOMMU_STATUS_GAPIC_LOG_OVERFLOW_SHIFT 8 #define IOMMU_STATUS_GAPIC_LOG_INT_MASK 0x00000200 #define IOMMU_STATUS_GAPIC_LOG_INT_SHIFT 9 #define IOMMU_STATUS_GAPIC_LOG_RUN_MASK 0x00000400 #define IOMMU_STATUS_GAPIC_LOG_RUN_SHIFT 10 /* I/O Page Table */ #define IOMMU_PAGE_TABLE_ENTRY_SIZE 8 #define IOMMU_PAGE_TABLE_U32_PER_ENTRY (IOMMU_PAGE_TABLE_ENTRY_SIZE / 4) #define IOMMU_PAGE_TABLE_ALIGNMENT 4096 #define IOMMU_PTE_PRESENT_MASK 0x00000001 #define IOMMU_PTE_PRESENT_SHIFT 0 #define IOMMU_PTE_NEXT_LEVEL_MASK 0x00000E00 #define IOMMU_PTE_NEXT_LEVEL_SHIFT 9 #define IOMMU_PTE_ADDR_LOW_MASK 0xFFFFF000 #define IOMMU_PTE_ADDR_LOW_SHIFT 12 #define IOMMU_PTE_ADDR_HIGH_MASK 0x000FFFFF #define IOMMU_PTE_ADDR_HIGH_SHIFT 0 #define IOMMU_PTE_U_MASK 0x08000000 #define IOMMU_PTE_U_SHIFT 7 #define IOMMU_PTE_FC_MASK 0x10000000 #define IOMMU_PTE_FC_SHIFT 28 #define IOMMU_PTE_IO_READ_PERMISSION_MASK 0x20000000 #define IOMMU_PTE_IO_READ_PERMISSION_SHIFT 29 #define IOMMU_PTE_IO_WRITE_PERMISSION_MASK 0x40000000 #define IOMMU_PTE_IO_WRITE_PERMISSION_SHIFT 30 /* I/O Page Directory */ #define IOMMU_PAGE_DIRECTORY_ENTRY_SIZE 8 #define IOMMU_PAGE_DIRECTORY_ALIGNMENT 4096 #define IOMMU_PDE_PRESENT_MASK 0x00000001 #define IOMMU_PDE_PRESENT_SHIFT 0 #define IOMMU_PDE_NEXT_LEVEL_MASK 0x00000E00 #define IOMMU_PDE_NEXT_LEVEL_SHIFT 9 #define IOMMU_PDE_ADDR_LOW_MASK 0xFFFFF000 #define IOMMU_PDE_ADDR_LOW_SHIFT 12 #define IOMMU_PDE_ADDR_HIGH_MASK 0x000FFFFF #define IOMMU_PDE_ADDR_HIGH_SHIFT 0 #define IOMMU_PDE_IO_READ_PERMISSION_MASK 0x20000000 #define IOMMU_PDE_IO_READ_PERMISSION_SHIFT 29 #define IOMMU_PDE_IO_WRITE_PERMISSION_MASK 0x40000000 #define IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT 30 /* Paging modes */ #define 
IOMMU_PAGING_MODE_DISABLED 0x0 #define IOMMU_PAGING_MODE_LEVEL_0 0x0 #define IOMMU_PAGING_MODE_LEVEL_1 0x1 #define IOMMU_PAGING_MODE_LEVEL_2 0x2 #define IOMMU_PAGING_MODE_LEVEL_3 0x3 #define IOMMU_PAGING_MODE_LEVEL_4 0x4 #define IOMMU_PAGING_MODE_LEVEL_5 0x5 #define IOMMU_PAGING_MODE_LEVEL_6 0x6 #define IOMMU_PAGING_MODE_LEVEL_7 0x7 /* Flags */ #define IOMMU_CONTROL_DISABLED 0 #define IOMMU_CONTROL_ENABLED 1 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 /* interrupt remapping table */ #define INT_REMAP_ENTRY_REMAPEN_MASK 0x00000001 #define INT_REMAP_ENTRY_REMAPEN_SHIFT 0 #define INT_REMAP_ENTRY_SUPIOPF_MASK 0x00000002 #define INT_REMAP_ENTRY_SUPIOPF_SHIFT 1 #define INT_REMAP_ENTRY_INTTYPE_MASK 0x0000001C #define INT_REMAP_ENTRY_INTTYPE_SHIFT 2 #define INT_REMAP_ENTRY_REQEOI_MASK 0x00000020 #define INT_REMAP_ENTRY_REQEOI_SHIFT 5 #define INT_REMAP_ENTRY_DM_MASK 0x00000040 #define INT_REMAP_ENTRY_DM_SHIFT 6 #define INT_REMAP_ENTRY_DEST_MAST 0x0000FF00 #define INT_REMAP_ENTRY_DEST_SHIFT 8 #define INT_REMAP_ENTRY_VECTOR_MASK 0x00FF0000 #define INT_REMAP_ENTRY_VECTOR_SHIFT 16 #define INV_IOMMU_ALL_PAGES_ADDRESS ((1ULL << 63) - 1) #define IOMMU_RING_BUFFER_PTR_MASK 0x0007FFF0 #define IOMMU_RING_BUFFER_PTR_SHIFT 4 #define IOMMU_CMD_DEVICE_ID_MASK 0x0000FFFF #define IOMMU_CMD_DEVICE_ID_SHIFT 0 #define IOMMU_CMD_ADDR_LOW_MASK 0xFFFFF000 #define IOMMU_CMD_ADDR_LOW_SHIFT 12 #define IOMMU_CMD_ADDR_HIGH_MASK 0xFFFFFFFF #define IOMMU_CMD_ADDR_HIGH_SHIFT 0 #define IOMMU_REG_BASE_ADDR_LOW_MASK 0xFFFFF000 #define IOMMU_REG_BASE_ADDR_LOW_SHIFT 12 #define IOMMU_REG_BASE_ADDR_HIGH_MASK 0x000FFFFF #define IOMMU_REG_BASE_ADDR_HIGH_SHIFT 0 #endif /* _ASM_X86_64_AMD_IOMMU_DEFS_H */ xen-4.4.0/xen/include/asm-x86/hvm/svm/emulate.h0000664000175000017500000000340312307313555017267 0ustar smbsmb/* * emulate.h: SVM instruction emulation bits. * Copyright (c) 2005, AMD Corporation. * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
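 *
 * (A minimal usage sketch for the AMD IOMMU register definitions above:
 * each field is described by a _MASK/_SHIFT pair and is extracted from,
 * or folded into, a raw 32-bit MMIO word.  The "iommu_mmio", "readl"
 * and "writel" names below are stand-ins for the driver's real
 * accessors, not definitions made by these headers.
 *
 *     uint32_t ctrl = readl(iommu_mmio + IOMMU_CONTROL_MMIO_OFFSET);
 *
 *     unsigned int inv_timeout =
 *         (ctrl & IOMMU_CONTROL_INVALIDATION_TIMEOUT_MASK) >>
 *         IOMMU_CONTROL_INVALIDATION_TIMEOUT_SHIFT;
 *
 *     ctrl |= IOMMU_CONTROL_EVENT_LOG_ENABLE_MASK;
 *     writel(ctrl, iommu_mmio + IOMMU_CONTROL_MMIO_OFFSET);
 *
 * The same pattern applies to the event log and PPR log fields defined
 * above.)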
*/ #ifndef __ASM_X86_HVM_SVM_EMULATE_H__ #define __ASM_X86_HVM_SVM_EMULATE_H__ /* Enumerate some standard instructions that we support */ enum instruction_index { INSTR_INVD, INSTR_WBINVD, INSTR_CPUID, INSTR_RDMSR, INSTR_WRMSR, INSTR_VMCALL, INSTR_HLT, INSTR_INT3, INSTR_RDTSC, INSTR_PAUSE, INSTR_XSETBV, INSTR_VMRUN, INSTR_VMLOAD, INSTR_VMSAVE, INSTR_STGI, INSTR_CLGI, INSTR_INVLPGA, INSTR_MAX_COUNT /* Must be last - Number of instructions supported */ }; struct vcpu; int __get_instruction_length_from_list( struct vcpu *, const enum instruction_index *, unsigned int list_count); static inline int __get_instruction_length( struct vcpu *v, enum instruction_index instr) { return __get_instruction_length_from_list(v, &instr, 1); } #endif /* __ASM_X86_HVM_SVM_EMULATE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/hvm/svm/intr.h0000664000175000017500000000166012307313555016612 0ustar smbsmb/* * intr.h: SVM Architecture related definitions * Copyright (c) 2005, AMD Corporation. * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #ifndef __ASM_X86_HVM_SVM_INTR_H__ #define __ASM_X86_HVM_SVM_INTR_H__ void svm_intr_assist(void); #endif /* __ASM_X86_HVM_SVM_INTR_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/svm/svm.h0000664000175000017500000000746112307313555016450 0ustar smbsmb/* * svm.h: SVM Architecture related definitions * Copyright (c) 2005, AMD Corporation. * Copyright (c) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
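 *
 * (Usage sketch for the emulate.h helpers above: an SVM intercept
 * handler looks up the length of the instruction it intercepted and
 * advances the guest RIP past it.  "regs" is assumed to be the guest's
 * struct cpu_user_regs; __update_guest_eip() is declared in svm.h
 * below; a zero return is treated as a failed match.
 *
 *     int inst_len = __get_instruction_length(v, INSTR_CPUID);
 *
 *     if ( inst_len != 0 )
 *         __update_guest_eip(regs, inst_len);
 *
 * Other intercepts use the corresponding INSTR_* value from the enum
 * above.)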
* */ #ifndef __ASM_X86_HVM_SVM_H__ #define __ASM_X86_HVM_SVM_H__ #include #include #define SVM_REG_EAX (0) #define SVM_REG_ECX (1) #define SVM_REG_EDX (2) #define SVM_REG_EBX (3) #define SVM_REG_ESP (4) #define SVM_REG_EBP (5) #define SVM_REG_ESI (6) #define SVM_REG_EDI (7) #define SVM_REG_R8 (8) #define SVM_REG_R9 (9) #define SVM_REG_R10 (10) #define SVM_REG_R11 (11) #define SVM_REG_R12 (12) #define SVM_REG_R13 (13) #define SVM_REG_R14 (14) #define SVM_REG_R15 (15) #define svm_vmload(x) svm_vmload_pa(__pa(x)) #define svm_vmsave(x) svm_vmsave_pa(__pa(x)) static inline void svm_vmload_pa(paddr_t vmcb) { asm volatile ( ".byte 0x0f,0x01,0xda" /* vmload */ : : "a" (vmcb) : "memory" ); } static inline void svm_vmsave_pa(paddr_t vmcb) { asm volatile ( ".byte 0x0f,0x01,0xdb" /* vmsave */ : : "a" (vmcb) : "memory" ); } static inline void svm_invlpga(unsigned long vaddr, uint32_t asid) { asm volatile ( ".byte 0x0f,0x01,0xdf" : /* output */ : /* input */ "a" (vaddr), "c" (asid)); } unsigned long *svm_msrbit(unsigned long *msr_bitmap, uint32_t msr); void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len); void svm_update_guest_cr(struct vcpu *, unsigned int cr); extern u32 svm_feature_flags; #define SVM_FEATURE_NPT 0 /* Nested page table support */ #define SVM_FEATURE_LBRV 1 /* LBR virtualization support */ #define SVM_FEATURE_SVML 2 /* SVM locking MSR support */ #define SVM_FEATURE_NRIPS 3 /* Next RIP save on VMEXIT support */ #define SVM_FEATURE_TSCRATEMSR 4 /* TSC ratio MSR support */ #define SVM_FEATURE_VMCBCLEAN 5 /* VMCB clean bits support */ #define SVM_FEATURE_FLUSHBYASID 6 /* TLB flush by ASID support */ #define SVM_FEATURE_DECODEASSISTS 7 /* Decode assists support */ #define SVM_FEATURE_PAUSEFILTER 10 /* Pause intercept filter support */ #define cpu_has_svm_feature(f) test_bit(f, &svm_feature_flags) #define cpu_has_svm_npt cpu_has_svm_feature(SVM_FEATURE_NPT) #define cpu_has_svm_lbrv cpu_has_svm_feature(SVM_FEATURE_LBRV) #define cpu_has_svm_svml cpu_has_svm_feature(SVM_FEATURE_SVML) #define cpu_has_svm_nrips cpu_has_svm_feature(SVM_FEATURE_NRIPS) #define cpu_has_svm_cleanbits cpu_has_svm_feature(SVM_FEATURE_VMCBCLEAN) #define cpu_has_svm_decode cpu_has_svm_feature(SVM_FEATURE_DECODEASSISTS) #define cpu_has_pause_filter cpu_has_svm_feature(SVM_FEATURE_PAUSEFILTER) #define cpu_has_tsc_ratio cpu_has_svm_feature(SVM_FEATURE_TSCRATEMSR) #define SVM_PAUSEFILTER_INIT 3000 /* TSC rate */ #define DEFAULT_TSC_RATIO 0x0000000100000000ULL #define TSC_RATIO_RSVD_BITS 0xffffff0000000000ULL #define TSC_RATIO(g_khz, h_khz) ( (((u64)(g_khz)<<32)/(u64)(h_khz)) & \ ~TSC_RATIO_RSVD_BITS ) #define vcpu_tsc_ratio(v) TSC_RATIO((v)->domain->arch.tsc_khz, cpu_khz) extern void svm_host_osvw_reset(void); extern void svm_host_osvw_init(void); #endif /* __ASM_X86_HVM_SVM_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/svm/svmdebug.h0000664000175000017500000000212512307313555017447 0ustar smbsmb/* * svmdebug.h: SVM related debug defintions * Copyright (c) 2011, AMD Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. 
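 *
 * (Usage sketch for the svm.h feature tests and TSC-ratio macros above,
 * e.g. a context-switch path would only load a scaled TSC ratio when
 * the hardware supports it.  The MSR name below is an assumption taken
 * from msr-index.h, not something these headers define; "v" is the vcpu
 * being switched to.
 *
 *     if ( cpu_has_tsc_ratio )
 *         wrmsrl(MSR_AMD64_TSC_RATIO, vcpu_tsc_ratio(v));
 *
 * DEFAULT_TSC_RATIO is 1 in the MSR's 32.32 fixed-point format, i.e. a
 * guest TSC running at the host frequency.)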
* * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #ifndef __ASM_X86_HVM_SVM_SVMDEBUG_H__ #define __ASM_X86_HVM_SVM_SVMDEBUG_H__ #include #include void svm_vmcb_dump(const char *from, struct vmcb_struct *vmcb); bool_t svm_vmcb_isvalid(const char *from, struct vmcb_struct *vmcb, bool_t verbose); #endif /* __ASM_X86_HVM_SVM_SVMDEBUG_H__ */ xen-4.4.0/xen/include/asm-x86/hvm/guest_access.h0000664000175000017500000000051412307313555017476 0ustar smbsmb#ifndef __ASM_X86_HVM_GUEST_ACCESS_H__ #define __ASM_X86_HVM_GUEST_ACCESS_H__ unsigned long copy_to_user_hvm(void *to, const void *from, unsigned len); unsigned long clear_user_hvm(void *to, unsigned int len); unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len); #endif /* __ASM_X86_HVM_GUEST_ACCESS_H__ */ xen-4.4.0/xen/include/asm-x86/time.h0000664000175000017500000000453612307313555015202 0ustar smbsmb #ifndef __X86_TIME_H__ #define __X86_TIME_H__ #include /* * PV TSC emulation modes: * 0 = guest rdtsc/p executed natively when monotonicity can be guaranteed * and emulated otherwise (with frequency scaled if necessary) * 1 = guest rdtsc/p always emulated at 1GHz (kernel and user) * 2 = guest rdtsc always executed natively (no monotonicity/frequency * guarantees); guest rdtscp emulated at native frequency if * unsupported by h/w, else executed natively * 3 = same as 2, except xen manages TSC_AUX register so guest can * determine when a restore/migration has occurred and assumes * guest obtains/uses pvclock-like mechanism to adjust for * monotonicity and frequency changes */ #define TSC_MODE_DEFAULT 0 #define TSC_MODE_ALWAYS_EMULATE 1 #define TSC_MODE_NEVER_EMULATE 2 #define TSC_MODE_PVRDTSCP 3 typedef u64 cycles_t; extern bool_t disable_tsc_sync; static inline cycles_t get_cycles(void) { cycles_t c; rdtscll(c); return c; } unsigned long mktime (unsigned int year, unsigned int mon, unsigned int day, unsigned int hour, unsigned int min, unsigned int sec); int time_suspend(void); int time_resume(void); void init_percpu_time(void); struct ioreq; int dom0_pit_access(struct ioreq *ioreq); int cpu_frequency_change(u64 freq); struct tm; struct tm wallclock_time(void); void pit_broadcast_enter(void); void pit_broadcast_exit(void); int pit_broadcast_is_available(void); uint64_t acpi_pm_tick_to_ns(uint64_t ticks); uint64_t ns_to_acpi_pm_tick(uint64_t ns); uint64_t tsc_ticks2ns(uint64_t ticks); void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs, int rdtscp); u64 gtime_to_gtsc(struct domain *d, u64 time); u64 gtsc_to_gtime(struct domain *d, u64 tsc); void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec, uint32_t gtsc_khz, uint32_t incarnation); void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec, uint32_t *gtsc_khz, uint32_t *incarnation); void force_update_vcpu_system_time(struct vcpu *v); int host_tsc_is_safe(void); void cpuid_time_leaf(uint32_t sub_idx, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); u64 stime2tsc(s_time_t stime); #endif /* __X86_TIME_H__ */ xen-4.4.0/xen/include/asm-x86/cache.h0000664000175000017500000000045112307313555015277 0ustar smbsmb/* * include/asm-x86/cache.h */ #ifndef __ARCH_X86_CACHE_H #define __ARCH_X86_CACHE_H #include /* L1 cache line size */ #define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) #define 
__read_mostly __section(".data.read_mostly") #endif xen-4.4.0/xen/include/asm-x86/mach-generic/0000775000175000017500000000000012307313555016405 5ustar smbsmbxen-4.4.0/xen/include/asm-x86/mach-generic/mach_apic.h0000664000175000017500000000426412307313555020470 0ustar smbsmb#ifndef __ASM_MACH_APIC_H #define __ASM_MACH_APIC_H #include #include #include #include /* ESR was originally disabled in Linux for NUMA-Q. Do we really need to? */ #define esr_disable (0) /* The following are dependent on APIC delivery mode (logical vs. physical). */ #define INT_DELIVERY_MODE (genapic->int_delivery_mode) #define INT_DEST_MODE (genapic->int_dest_mode) #define TARGET_CPUS (genapic->target_cpus()) #define init_apic_ldr (genapic->init_apic_ldr) #define clustered_apic_check (genapic->clustered_apic_check) #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) #define vector_allocation_cpumask(cpu) (genapic->vector_allocation_cpumask(cpu)) static inline void enable_apic_mode(void) { /* Not needed for modern ES7000 which boot in Virtual Wire mode. */ /*es7000_sw_apic();*/ } #define apicid_to_node(apicid) ((int)apicid_to_node[(u32)apicid]) extern u32 bios_cpu_apicid[]; static inline int mpc_apic_id(struct mpc_config_processor *m, u32 apicid) { printk("Processor #%d %d:%d APIC version %d\n", apicid, (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, m->mpc_apicver); return apicid; } static inline int multi_timer_check(int apic, int irq) { return 0; } extern void generic_apic_probe(void); extern void generic_bigsmp_probe(void); /* * The following functions based around phys_cpu_present_map are disabled in * some i386 Linux subarchitectures, and in x86_64 'cluster' genapic mode. I'm * really not sure why, since all local APICs should have distinct physical * IDs, and we need to know what they are. */ static inline int apic_id_registered(void) { return physid_isset(get_apic_id(), phys_cpu_present_map); } static inline void ioapic_phys_id_map(physid_mask_t *map) { *map = phys_cpu_present_map; } static inline int check_apicid_used(const physid_mask_t *map, int apicid) { return physid_isset(apicid, *map); } static inline int check_apicid_present(int apicid) { return physid_isset(apicid, phys_cpu_present_map); } static inline void set_apicid(int phys_apicid, physid_mask_t *map) { physid_set(phys_apicid, *map); } #endif /* __ASM_MACH_APIC_H */ xen-4.4.0/xen/include/asm-x86/mach-generic/mach_mpparse.h0000664000175000017500000000030212307313555021210 0ustar smbsmb#ifndef _MACH_MPPARSE_H #define _MACH_MPPARSE_H 1 int mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid); int acpi_madt_oem_check(char *oem_id, char *oem_table_id); #endif xen-4.4.0/xen/include/asm-x86/flushtlb.h0000664000175000017500000001005712307313555016062 0ustar smbsmb/****************************************************************************** * flushtlb.h * * TLB flushes are timestamped using a global virtual 'clock' which ticks * on any TLB flush on any processor. * * Copyright (c) 2003-2004, K A Fraser */ #ifndef __FLUSHTLB_H__ #define __FLUSHTLB_H__ #include #include #include #include /* The current time as shown by the virtual TLB clock. */ extern u32 tlbflush_clock; /* Time at which each CPU's TLB was last flushed. */ DECLARE_PER_CPU(u32, tlbflush_time); #define tlbflush_current_time() tlbflush_clock /* * @cpu_stamp is the timestamp at last TLB flush for the CPU we are testing. 
* @lastuse_stamp is a timestamp taken when the PFN we are testing was last * used for a purpose that may have caused the CPU's TLB to become tainted. */ static inline int NEED_FLUSH(u32 cpu_stamp, u32 lastuse_stamp) { u32 curr_time = tlbflush_current_time(); /* * Two cases: * 1. During a wrap, the clock ticks over to 0 while CPUs catch up. For * safety during this period, we force a flush if @curr_time == 0. * 2. Otherwise, we look to see if @cpu_stamp <= @lastuse_stamp. * To detect false positives because @cpu_stamp has wrapped, we * also check @curr_time. If less than @lastuse_stamp we definitely * wrapped, so there's no need for a flush (one is forced every wrap). */ return ((curr_time == 0) || ((cpu_stamp <= lastuse_stamp) && (lastuse_stamp <= curr_time))); } /* * Filter the given set of CPUs, removing those that definitely flushed their * TLB since @page_timestamp. */ #define tlbflush_filter(mask, page_timestamp) \ do { \ unsigned int cpu; \ for_each_cpu ( cpu, &(mask) ) \ if ( !NEED_FLUSH(per_cpu(tlbflush_time, cpu), page_timestamp) ) \ cpumask_clear_cpu(cpu, &(mask)); \ } while ( 0 ) void new_tlbflush_clock_period(void); /* Read pagetable base. */ static inline unsigned long read_cr3(void) { unsigned long cr3; __asm__ __volatile__ ( "mov %%cr3, %0" : "=r" (cr3) : ); return cr3; } /* Write pagetable base and implicitly tick the tlbflush clock. */ void write_cr3(unsigned long cr3); /* flush_* flag fields: */ /* * Area to flush: 2^flush_order pages. Default is flush entire address space. * NB. Multi-page areas do not need to have been mapped with a superpage. */ #define FLUSH_ORDER_MASK 0xff #define FLUSH_ORDER(x) ((x)+1) /* Flush TLBs (or parts thereof) */ #define FLUSH_TLB 0x100 /* Flush TLBs (or parts thereof) including global mappings */ #define FLUSH_TLB_GLOBAL 0x200 /* Flush data caches */ #define FLUSH_CACHE 0x400 /* Flush local TLBs/caches. */ void flush_area_local(const void *va, unsigned int flags); #define flush_local(flags) flush_area_local(NULL, flags) /* Flush specified CPUs' TLBs/caches */ void flush_area_mask(const cpumask_t *, const void *va, unsigned int flags); #define flush_mask(mask, flags) flush_area_mask(mask, NULL, flags) /* Flush all CPUs' TLBs/caches */ #define flush_area_all(va, flags) flush_area_mask(&cpu_online_map, va, flags) #define flush_all(flags) flush_mask(&cpu_online_map, flags) /* Flush local TLBs */ #define flush_tlb_local() \ flush_local(FLUSH_TLB) #define flush_tlb_one_local(v) \ flush_area_local((const void *)(v), FLUSH_TLB|FLUSH_ORDER(0)) /* Flush specified CPUs' TLBs */ #define flush_tlb_mask(mask) \ flush_mask(mask, FLUSH_TLB) #define flush_tlb_one_mask(mask,v) \ flush_area_mask(mask, (const void *)(v), FLUSH_TLB|FLUSH_ORDER(0)) /* Flush all CPUs' TLBs */ #define flush_tlb_all() \ flush_tlb_mask(&cpu_online_map) #define flush_tlb_one_all(v) \ flush_tlb_one_mask(&cpu_online_map, v) #endif /* __FLUSHTLB_H__ */ xen-4.4.0/xen/include/asm-x86/event.h0000664000175000017500000000216712307313555015363 0ustar smbsmb/****************************************************************************** * event.h * * A nice interface for passing asynchronous events to guest OSes. 
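 *
 * (A typical pattern built from the flushtlb.h definitions above, shown
 * here only as a sketch: "page_timestamp" and "stale_cpus" stand in for
 * whatever the caller tracks, e.g. a page's tlbflush_timestamp and the
 * set of CPUs that may still have it mapped.
 *
 *     cpumask_t mask;
 *
 *     cpumask_copy(&mask, &stale_cpus);
 *     tlbflush_filter(mask, page_timestamp);
 *     if ( !cpumask_empty(&mask) )
 *         flush_tlb_mask(&mask);
 *
 * CPUs whose per-cpu tlbflush_time already postdates the page's
 * timestamp are dropped by the filter, so only the remainder receive a
 * flush IPI.)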
* (architecture-dependent part) * */ #ifndef __ASM_EVENT_H__ #define __ASM_EVENT_H__ #include void vcpu_kick(struct vcpu *v); void vcpu_mark_events_pending(struct vcpu *v); static inline int vcpu_event_delivery_is_enabled(struct vcpu *v) { return !vcpu_info(v, evtchn_upcall_mask); } int hvm_local_events_need_delivery(struct vcpu *v); static inline int local_events_need_delivery(void) { struct vcpu *v = current; return (has_hvm_container_vcpu(v) ? hvm_local_events_need_delivery(v) : (vcpu_info(v, evtchn_upcall_pending) && !vcpu_info(v, evtchn_upcall_mask))); } static inline void local_event_delivery_disable(void) { vcpu_info(current, evtchn_upcall_mask) = 1; } static inline void local_event_delivery_enable(void) { vcpu_info(current, evtchn_upcall_mask) = 0; } /* No arch specific virq definition now. Default to global. */ static inline int arch_virq_is_global(uint32_t virq) { return 1; } #endif xen-4.4.0/xen/include/asm-x86/domain.h0000664000175000017500000003743112307313555015513 0ustar smbsmb#ifndef __ASM_DOMAIN_H__ #define __ASM_DOMAIN_H__ #include #include #include #include #include #include #include #include #define has_32bit_shinfo(d) ((d)->arch.has_32bit_shinfo) #define is_pv_32bit_domain(d) ((d)->arch.is_32bit_pv) #define is_pv_32bit_vcpu(v) (is_pv_32bit_domain((v)->domain)) #define is_pv_32on64_domain(d) (is_pv_32bit_domain(d)) #define is_pv_32on64_vcpu(v) (is_pv_32on64_domain((v)->domain)) #define is_hvm_pv_evtchn_domain(d) (has_hvm_container_domain(d) && \ d->arch.hvm_domain.irq.callback_via_type == HVMIRQ_callback_vector) #define is_hvm_pv_evtchn_vcpu(v) (is_hvm_pv_evtchn_domain(v->domain)) #define VCPU_TRAP_NMI 1 #define VCPU_TRAP_MCE 2 #define VCPU_TRAP_LAST VCPU_TRAP_MCE #define nmi_state async_exception_state(VCPU_TRAP_NMI) #define mce_state async_exception_state(VCPU_TRAP_MCE) #define nmi_pending nmi_state.pending #define mce_pending mce_state.pending struct trap_bounce { uint32_t error_code; uint8_t flags; /* TBF_ */ uint16_t cs; unsigned long eip; }; #define MAPHASH_ENTRIES 8 #define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1)) #define MAPHASHENT_NOTINUSE ((u32)~0U) struct mapcache_vcpu { /* Shadow of mapcache_domain.epoch. */ unsigned int shadow_epoch; /* Lock-free per-VCPU hash of recently-used mappings. */ struct vcpu_maphash_entry { unsigned long mfn; uint32_t idx; uint32_t refcnt; } hash[MAPHASH_ENTRIES]; }; struct mapcache_domain { /* The number of array entries, and a cursor into the array. */ unsigned int entries; unsigned int cursor; /* Protects map_domain_page(). */ spinlock_t lock; /* Garbage mappings are flushed from TLBs in batches called 'epochs'. */ unsigned int epoch; u32 tlbflush_timestamp; /* Which mappings are in use, and which are garbage to reap next epoch? */ unsigned long *inuse; unsigned long *garbage; }; int mapcache_domain_init(struct domain *); int mapcache_vcpu_init(struct vcpu *); void mapcache_override_current(struct vcpu *); /* x86/64: toggle guest between kernel and user modes. */ void toggle_guest_mode(struct vcpu *); /* * Initialise a hypercall-transfer page. The given pointer must be mapped * in Xen virtual address space (accesses are not validated or checked). 
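 *
 * (Related sketch, with names taken on trust from xen/sched.h rather
 * than defined here: local_events_need_delivery() from event.h above is
 * what hypercall_preempt_check() consults, so a long-running hypercall
 * can bail out and be restarted later, roughly:
 *
 *     if ( hypercall_preempt_check() )
 *         return hypercall_create_continuation(op, "h", arg);
 *
 * where "op" and the argument list are placeholders.)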
*/ void hypercall_page_initialise(struct domain *d, void *); /************************************************/ /* shadow paging extension */ /************************************************/ struct shadow_domain { unsigned int opt_flags; /* runtime tunable optimizations on/off */ struct page_list_head pinned_shadows; /* Memory allocation */ struct page_list_head freelist; unsigned int total_pages; /* number of pages allocated */ unsigned int free_pages; /* number of pages on freelists */ unsigned int p2m_pages; /* number of pages allocates to p2m */ /* 1-to-1 map for use when HVM vcpus have paging disabled */ pagetable_t unpaged_pagetable; /* reflect guest table dirty status, incremented by write * emulation and remove write permission */ atomic_t gtable_dirty_version; /* Shadow hashtable */ struct page_info **hash_table; bool_t hash_walking; /* Some function is walking the hash table */ /* Fast MMIO path heuristic */ bool_t has_fast_mmio_entries; /* OOS */ bool_t oos_active; bool_t oos_off; /* Has this domain ever used HVMOP_pagetable_dying? */ bool_t pagetable_dying_op; }; struct shadow_vcpu { /* PAE guests: per-vcpu shadow top-level table */ l3_pgentry_t l3table[4] __attribute__((__aligned__(32))); /* PAE guests: per-vcpu cache of the top-level *guest* entries */ l3_pgentry_t gl3e[4] __attribute__((__aligned__(32))); /* Non-PAE guests: pointer to guest top-level pagetable */ void *guest_vtable; /* Last MFN that we emulated a write to as unshadow heuristics. */ unsigned long last_emulated_mfn_for_unshadow; /* MFN of the last shadow that we shot a writeable mapping in */ unsigned long last_writeable_pte_smfn; /* Last frame number that we emulated a write to. */ unsigned long last_emulated_frame; /* Last MFN that we emulated a write successfully */ unsigned long last_emulated_mfn; /* Shadow out-of-sync: pages that this vcpu has let go out of sync */ mfn_t oos[SHADOW_OOS_PAGES]; mfn_t oos_snapshot[SHADOW_OOS_PAGES]; struct oos_fixup { int next; mfn_t smfn[SHADOW_OOS_FIXUPS]; unsigned long off[SHADOW_OOS_FIXUPS]; } oos_fixup[SHADOW_OOS_PAGES]; bool_t pagetable_dying; }; /************************************************/ /* hardware assisted paging */ /************************************************/ struct hap_domain { struct page_list_head freelist; unsigned int total_pages; /* number of pages allocated */ unsigned int free_pages; /* number of pages on freelists */ unsigned int p2m_pages; /* number of pages allocates to p2m */ }; /************************************************/ /* common paging data structure */ /************************************************/ struct log_dirty_domain { /* log-dirty radix tree to record dirty pages */ mfn_t top; unsigned int allocs; unsigned int failed_allocs; /* log-dirty mode stats */ unsigned int fault_count; unsigned int dirty_count; /* functions which are paging mode specific */ int (*enable_log_dirty )(struct domain *d, bool_t log_global); int (*disable_log_dirty )(struct domain *d); void (*clean_dirty_bitmap )(struct domain *d); }; struct paging_domain { /* paging lock */ mm_lock_t lock; /* flags to control paging operation */ u32 mode; /* extension for shadow paging support */ struct shadow_domain shadow; /* extension for hardware-assited paging */ struct hap_domain hap; /* log dirty support */ struct log_dirty_domain log_dirty; /* alloc/free pages from the pool for paging-assistance structures * (used by p2m and log-dirty code for their tries) */ struct page_info * (*alloc_page)(struct domain *d); void (*free_page)(struct domain *d, struct 
page_info *pg); /* Has that pool ever run out of memory? */ bool_t p2m_alloc_failed; }; struct paging_vcpu { /* Pointers to mode-specific entry points. */ const struct paging_mode *mode; /* Nested Virtualization: paging mode of nested guest */ const struct paging_mode *nestedmode; /* HVM guest: last emulate was to a pagetable */ unsigned int last_write_was_pt:1; /* HVM guest: last write emulation succeeds */ unsigned int last_write_emul_ok:1; /* Translated guest: virtual TLB */ struct shadow_vtlb *vtlb; spinlock_t vtlb_lock; /* paging support extension */ struct shadow_vcpu shadow; }; #define MAX_CPUID_INPUT 40 typedef xen_domctl_cpuid_t cpuid_input_t; #define MAX_NESTEDP2M 10 struct p2m_domain; struct time_scale { int shift; u32 mul_frac; }; struct pv_domain { l1_pgentry_t **gdt_ldt_l1tab; /* Shared page for notifying that explicit PIRQ EOI is required. */ unsigned long *pirq_eoi_map; unsigned long pirq_eoi_map_mfn; /* set auto_unmask to 1 if you want PHYSDEVOP_eoi to automatically * unmask the event channel */ bool_t auto_unmask; /* map_domain_page() mapping cache. */ struct mapcache_domain mapcache; }; struct arch_domain { struct page_info *perdomain_l3_pg; unsigned int hv_compat_vstart; bool_t s3_integrity; /* I/O-port admin-specified access capabilities. */ struct rangeset *ioport_caps; uint32_t pci_cf8; uint8_t cmos_idx; struct list_head pdev_list; union { struct pv_domain pv_domain; struct hvm_domain hvm_domain; }; struct paging_domain paging; struct p2m_domain *p2m; /* To enforce lock ordering in the pod code wrt the * page_alloc lock */ int page_alloc_unlock_level; /* nestedhvm: translate l2 guest physical to host physical */ struct p2m_domain *nested_p2m[MAX_NESTEDP2M]; mm_lock_t nested_p2m_lock; /* NB. protected by d->event_lock and by irq_desc[irq].lock */ struct radix_tree_root irq_pirq; /* Maximum physical-address bitwidth supported by this guest. */ unsigned int physaddr_bitsize; /* Is a 32-bit PV (non-HVM) guest? */ bool_t is_32bit_pv; /* Is shared-info page in 32-bit format? */ bool_t has_32bit_shinfo; /* Domain cannot handle spurious page faults? */ bool_t suppress_spurious_page_faults; /* Continuable domain_relinquish_resources(). */ enum { RELMEM_not_started, RELMEM_shared, RELMEM_xen, RELMEM_l4, RELMEM_l3, RELMEM_l2, RELMEM_done, } relmem; struct page_list_head relmem_list; cpuid_input_t *cpuids; struct PITState vpit; /* TSC management (emulation, pv, scaling, stats) */ int tsc_mode; /* see include/asm-x86/time.h */ bool_t vtsc; /* tsc is emulated (may change after migrate) */ s_time_t vtsc_last; /* previous TSC value (guarantee monotonicity) */ spinlock_t vtsc_lock; uint64_t vtsc_offset; /* adjustment for save/restore/migrate */ uint32_t tsc_khz; /* cached khz for certain emulated cases */ struct time_scale vtsc_to_ns; /* scaling for certain emulated cases */ struct time_scale ns_to_vtsc; /* scaling for certain emulated cases */ uint32_t incarnation; /* incremented every restore or live migrate (possibly other cases in the future */ uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */ uint64_t vtsc_usercount; /* not used for hvm */ /* Pseudophysical e820 map (XENMEM_memory_map). 
*/ spinlock_t e820_lock; struct e820entry *e820; unsigned int nr_e820; } __cacheline_aligned; #define has_arch_pdevs(d) (!list_empty(&(d)->arch.pdev_list)) #define gdt_ldt_pt_idx(v) \ ((v)->vcpu_id >> (PAGETABLE_ORDER - GDT_LDT_VCPU_SHIFT)) #define gdt_ldt_ptes(d, v) \ ((d)->arch.pv_domain.gdt_ldt_l1tab[gdt_ldt_pt_idx(v)] + \ (((v)->vcpu_id << GDT_LDT_VCPU_SHIFT) & (L1_PAGETABLE_ENTRIES - 1))) struct pv_vcpu { /* map_domain_page() mapping cache. */ struct mapcache_vcpu mapcache; struct trap_info *trap_ctxt; unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE]; unsigned long ldt_base; unsigned int gdt_ents, ldt_ents; unsigned long kernel_ss, kernel_sp; unsigned long ctrlreg[8]; unsigned long event_callback_eip; unsigned long failsafe_callback_eip; union { unsigned long syscall_callback_eip; struct { unsigned int event_callback_cs; unsigned int failsafe_callback_cs; }; }; unsigned long vm_assist; unsigned long syscall32_callback_eip; unsigned long sysenter_callback_eip; unsigned short syscall32_callback_cs; unsigned short sysenter_callback_cs; bool_t syscall32_disables_events; bool_t sysenter_disables_events; /* Segment base addresses. */ unsigned long fs_base; unsigned long gs_base_kernel; unsigned long gs_base_user; /* Bounce information for propagating an exception to guest OS. */ struct trap_bounce trap_bounce; struct trap_bounce int80_bounce; /* I/O-port access bitmap. */ XEN_GUEST_HANDLE(uint8) iobmp; /* Guest kernel vaddr of the bitmap. */ unsigned int iobmp_limit; /* Number of ports represented in the bitmap. */ unsigned int iopl; /* Current IOPL for this VCPU. */ /* Current LDT details. */ unsigned long shadow_ldt_mapcnt; spinlock_t shadow_ldt_lock; /* Deferred VA-based update state. */ bool_t need_update_runstate_area; struct vcpu_time_info pending_system_time; }; struct arch_vcpu { /* * guest context (mirroring struct vcpu_guest_context) common * between pv and hvm guests */ void *fpu_ctxt; unsigned long vgc_flags; struct cpu_user_regs user_regs; unsigned long debugreg[8]; /* other state */ unsigned long flags; /* TF_ */ void (*schedule_tail) (struct vcpu *); void (*ctxt_switch_from) (struct vcpu *); void (*ctxt_switch_to) (struct vcpu *); /* Virtual Machine Extensions */ union { struct pv_vcpu pv_vcpu; struct hvm_vcpu hvm_vcpu; }; pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */ pagetable_t guest_table; /* (MFN) guest notion of cr3 */ struct page_info *old_guest_table; /* partially destructed pagetable */ /* guest_table holds a ref to the page, and also a type-count unless * shadow refcounts are in use */ pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */ pagetable_t monitor_table; /* (MFN) hypervisor PT (for HVM) */ unsigned long cr3; /* (MA) value to install in HW CR3 */ /* * The save area for Processor Extended States and the bitmask of the * XSAVE/XRSTOR features. They are used by: 1) when a vcpu (which has * dirtied FPU/SSE) is scheduled out we XSAVE the states here; 2) in * #NM handler, we XRSTOR the states we XSAVE-ed; */ struct xsave_struct *xsave_area; uint64_t xcr0; /* Accumulated eXtended features mask for using XSAVE/XRESTORE by Xen * itself, as we can never know whether guest OS depends on content * preservation whenever guest OS clears one feature flag (for example, * temporarily). * However, processor should not be able to touch eXtended states before * it explicitly enables it via xcr0. */ uint64_t xcr0_accum; /* This variable determines whether nonlazy extended state has been used, * and thus should be saved/restored. 
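 * As a rough sketch of the intended use (the real call sites live in
 * the FPU/xstate code, not in this header): on context-switch-out,
 *
 *     if ( v->arch.nonlazy_xstate_used )
 *         xsave(v, XSTATE_NONLAZY);
 *
 * while the lazy FP/SSE/YMM state can wait for the first #NM fault
 * after the vcpu runs again, as described for xsave_area above.
 * XSTATE_NONLAZY and xsave() come from xstate.h.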
*/ bool_t nonlazy_xstate_used; struct vmce vmce; struct paging_vcpu paging; uint32_t gdbsx_vcpu_event; /* A secondary copy of the vcpu time info. */ XEN_GUEST_HANDLE(vcpu_time_info_t) time_info_guest; } __cacheline_aligned; /* Shorthands to improve code legibility. */ #define hvm_vmx hvm_vcpu.u.vmx #define hvm_svm hvm_vcpu.u.svm bool_t update_runstate_area(const struct vcpu *); bool_t update_secondary_system_time(const struct vcpu *, struct vcpu_time_info *); void vcpu_show_execution_state(struct vcpu *); void vcpu_show_registers(const struct vcpu *); /* Clean up CR4 bits that are not under guest control. */ unsigned long pv_guest_cr4_fixup(const struct vcpu *, unsigned long guest_cr4); /* Convert between guest-visible and real CR4 values. */ #define pv_guest_cr4_to_real_cr4(v) \ (((v)->arch.pv_vcpu.ctrlreg[4] \ | (mmu_cr4_features \ & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP | \ X86_CR4_OSXSAVE | X86_CR4_FSGSBASE)) \ | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)) \ & ~X86_CR4_DE) #define real_cr4_to_pv_guest_cr4(c) \ ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD | \ X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE)) void domain_cpuid(struct domain *d, unsigned int input, unsigned int sub_input, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); #endif /* __ASM_DOMAIN_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/mc146818rtc.h0000664000175000017500000000751012307313555016043 0ustar smbsmb/* * Machine dependent access functions for RTC registers. */ #ifndef _ASM_MC146818RTC_H #define _ASM_MC146818RTC_H #include #include extern spinlock_t rtc_lock; /* serialize CMOS RAM access */ /********************************************************************** * register summary **********************************************************************/ #define RTC_SECONDS 0 #define RTC_SECONDS_ALARM 1 #define RTC_MINUTES 2 #define RTC_MINUTES_ALARM 3 #define RTC_HOURS 4 #define RTC_HOURS_ALARM 5 /* RTC_*_alarm is always true if 2 MSBs are set */ # define RTC_ALARM_DONT_CARE 0xC0 #define RTC_DAY_OF_WEEK 6 #define RTC_DAY_OF_MONTH 7 #define RTC_MONTH 8 #define RTC_YEAR 9 /* control registers - Moto names */ #define RTC_REG_A 10 #define RTC_REG_B 11 #define RTC_REG_C 12 #define RTC_REG_D 13 /********************************************************************** * register details **********************************************************************/ #define RTC_FREQ_SELECT RTC_REG_A /* update-in-progress - set to "1" 244 microsecs before RTC goes off the bus, * reset after update (may take 1.984ms @ 32768Hz RefClock) is complete, * totalling to a max high interval of 2.228 ms. */ # define RTC_UIP 0x80 # define RTC_DIV_CTL 0x70 /* divider control: refclock values 4.194 / 1.049 MHz / 32.768 kHz */ # define RTC_REF_CLCK_4MHZ 0x00 # define RTC_REF_CLCK_1MHZ 0x10 # define RTC_REF_CLCK_32KHZ 0x20 /* 2 values for divider stage reset, others for "testing purposes only" */ # define RTC_DIV_RESET1 0x60 # define RTC_DIV_RESET2 0x70 /* Periodic intr. / Square wave rate select. 0=none, 1=32.8kHz,... 
15=2Hz */ # define RTC_RATE_SELECT 0x0F /**********************************************************************/ #define RTC_CONTROL RTC_REG_B # define RTC_SET 0x80 /* disable updates for clock setting */ # define RTC_PIE 0x40 /* periodic interrupt enable */ # define RTC_AIE 0x20 /* alarm interrupt enable */ # define RTC_UIE 0x10 /* update-finished interrupt enable */ # define RTC_SQWE 0x08 /* enable square-wave output */ # define RTC_DM_BINARY 0x04 /* all time/date values are BCD if clear */ # define RTC_24H 0x02 /* 24 hour mode - else hours bit 7 means pm */ # define RTC_DST_EN 0x01 /* auto switch DST - works f. USA only */ /**********************************************************************/ #define RTC_INTR_FLAGS RTC_REG_C /* caution - cleared by read */ # define RTC_IRQF 0x80 /* any of the following 3 is active */ # define RTC_PF 0x40 # define RTC_AF 0x20 # define RTC_UF 0x10 /**********************************************************************/ #define RTC_VALID RTC_REG_D # define RTC_VRT 0x80 /* valid RAM and time */ /**********************************************************************/ /* example: !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) * determines if the following two #defines are needed */ #ifndef BCD_TO_BIN #define BCD_TO_BIN(val) ((val)=((val)&15) + ((val)>>4)*10) #endif #ifndef BIN_TO_BCD #define BIN_TO_BCD(val) ((val)=(((val)/10)<<4) + (val)%10) #endif #ifndef RTC_PORT #define RTC_PORT(x) (0x70 + (x)) #define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */ #endif /* * The yet supported machines all access the RTC index register via * an ISA port access but the way to access the date register differs ... */ #define CMOS_READ(addr) ({ \ outb_p((addr),RTC_PORT(0)); \ inb_p(RTC_PORT(1)); \ }) #define CMOS_WRITE(val, addr) ({ \ outb_p((addr),RTC_PORT(0)); \ outb_p((val),RTC_PORT(1)); \ }) #define RTC_IRQ 8 #endif /* _ASM_MC146818RTC_H */ xen-4.4.0/xen/include/asm-x86/xstate.h0000664000175000017500000000517212307313555015551 0ustar smbsmb/* * include/asm-i386/xstate.h * * x86 extended state (xsave/xrstor) related definitions * */ #ifndef __ASM_XSTATE_H #define __ASM_XSTATE_H #include #define FCW_DEFAULT 0x037f #define FCW_RESET 0x0040 #define MXCSR_DEFAULT 0x1f80 #define XSTATE_CPUID 0x0000000d #define XSTATE_FEATURE_XSAVEOPT (1 << 0) /* sub-leaf 1, eax[bit 0] */ #define XCR_XFEATURE_ENABLED_MASK 0x00000000 /* index of XCR0 */ #define XSTATE_YMM_SIZE 256 #define XSTATE_YMM_OFFSET XSAVE_AREA_MIN_SIZE #define XSTATE_AREA_MIN_SIZE (512 + 64) /* FP/SSE + XSAVE.HEADER */ #define XSTATE_FP (1ULL << 0) #define XSTATE_SSE (1ULL << 1) #define XSTATE_YMM (1ULL << 2) #define XSTATE_LWP (1ULL << 62) /* AMD lightweight profiling */ #define XSTATE_FP_SSE (XSTATE_FP | XSTATE_SSE) #define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE | XSTATE_YMM | XSTATE_LWP) #define XSTATE_ALL (~0) #define XSTATE_NONLAZY (XSTATE_LWP) #define XSTATE_LAZY (XSTATE_ALL & ~XSTATE_NONLAZY) extern u64 xfeature_mask; /* extended state save area */ struct xsave_struct { union { /* FPU/MMX, SSE */ char x[512]; struct { uint16_t fcw; uint16_t fsw; uint8_t ftw; uint8_t rsvd1; uint16_t fop; union { uint64_t addr; struct { uint32_t offs; uint16_t sel; uint16_t rsvd; }; } fip, fdp; uint32_t mxcsr; uint32_t mxcsr_mask; /* data registers follow here */ }; } fpu_sse; struct { u64 xstate_bv; u64 reserved[7]; } xsave_hdr; /* The 64-byte header */ struct { char x[XSTATE_YMM_SIZE]; } ymm; /* YMM */ char data[]; /* Future new states */ } __attribute__ ((packed, aligned (64))); /* extended state operations */ bool_t __must_check 
set_xcr0(u64 xfeatures); uint64_t get_xcr0(void); void xsave(struct vcpu *v, uint64_t mask); void xrstor(struct vcpu *v, uint64_t mask); bool_t xsave_enabled(const struct vcpu *v); int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, u64 xfeat_mask); int __must_check handle_xsetbv(u32 index, u64 new_bv); /* extended state init and cleanup functions */ void xstate_free_save_area(struct vcpu *v); int xstate_alloc_save_area(struct vcpu *v); void xstate_init(bool_t bsp); unsigned int xstate_ctxt_size(u64 xcr0); #endif /* __ASM_XSTATE_H */ xen-4.4.0/xen/include/asm-x86/iocap.h0000664000175000017500000000167712307313555015342 0ustar smbsmb/****************************************************************************** * iocap.h * * Architecture-specific per-domain I/O capabilities. */ #ifndef __X86_IOCAP_H__ #define __X86_IOCAP_H__ #define ioports_permit_access(d, s, e) \ rangeset_add_range((d)->arch.ioport_caps, s, e) #define ioports_deny_access(d, s, e) \ rangeset_remove_range((d)->arch.ioport_caps, s, e) #define ioports_access_permitted(d, s, e) \ rangeset_contains_range((d)->arch.ioport_caps, s, e) #define cache_flush_permitted(d) \ (!rangeset_is_empty((d)->iomem_caps) || \ !rangeset_is_empty((d)->arch.ioport_caps)) #define multipage_allocation_permitted(d, order) \ (((order) <= 9) || /* allow 2MB superpages */ \ !rangeset_is_empty((d)->iomem_caps) || \ !rangeset_is_empty((d)->arch.ioport_caps)) #endif /* __X86_IOCAP_H__ */ xen-4.4.0/xen/include/asm-x86/cpuidle.h0000664000175000017500000000173312307313555015665 0ustar smbsmb#ifndef __ASM_X86_CPUIDLE_H__ #define __ASM_X86_CPUIDLE_H__ #include #include #include #include extern struct acpi_processor_power *processor_powers[]; extern void (*pm_idle_save)(void); bool_t lapic_timer_init(void); extern void (*lapic_timer_off)(void); extern void (*lapic_timer_on)(void); extern uint64_t (*cpuidle_get_tick)(void); int mwait_idle_init(struct notifier_block *); int cpuidle_init_cpu(unsigned int cpu); void default_dead_idle(void); void acpi_dead_idle(void); void trace_exit_reason(u32 *irq_traced); void update_idle_stats(struct acpi_processor_power *, struct acpi_processor_cx *, uint64_t, uint64_t); /* * vcpu is urgent if vcpu is polling event channel * * if urgent vcpu exists, CPU should not enter deep C state */ static inline int sched_has_urgent_vcpu(void) { return atomic_read(&this_cpu(schedule_data).urgent_count); } #endif /* __X86_ASM_CPUIDLE_H__ */ xen-4.4.0/xen/include/asm-x86/guest_pt.h0000664000175000017500000003046712307313555016100 0ustar smbsmb/****************************************************************************** * xen/asm-x86/guest_pt.h * * Types and accessors for guest pagetable entries, as distinct from * Xen's pagetable types. * * Users must #define GUEST_PAGING_LEVELS to 2, 3 or 4 before including * this file. * * Parts of this code are Copyright (c) 2006 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
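 *
 * (Usage sketch for the iocap.h helpers a little further up.  The port
 * range is purely illustrative; real callers take it from dom0 or
 * toolstack configuration, and must check the return value since the
 * underlying rangeset_add_range() can fail.
 *
 *     if ( ioports_permit_access(d, 0x3f8, 0x3ff) )
 *         ... propagate the error ...
 *
 *     if ( !ioports_access_permitted(d, port, port + bytes - 1) )
 *         ... reject the guest's I/O attempt ...
 *
 * Both expand to rangeset operations on d->arch.ioport_caps.)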
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _XEN_ASM_GUEST_PT_H #define _XEN_ASM_GUEST_PT_H #if !defined(GUEST_PAGING_LEVELS) #error GUEST_PAGING_LEVELS not defined #endif /* Type of the guest's frame numbers */ TYPE_SAFE(unsigned long,gfn) #define PRI_gfn "05lx" #ifndef gfn_t #define gfn_t /* Grep fodder: gfn_t, _gfn() and gfn_x() are defined above */ #undef gfn_t #endif #define VALID_GFN(m) (m != INVALID_GFN) static inline int valid_gfn(gfn_t m) { return VALID_GFN(gfn_x(m)); } static inline paddr_t gfn_to_paddr(gfn_t gfn) { return ((paddr_t)gfn_x(gfn)) << PAGE_SHIFT; } /* Override get_gfn to work with gfn_t */ #undef get_gfn #define get_gfn(d, g, t) get_gfn_type((d), gfn_x(g), (t), P2M_ALLOC) /* Types of the guest's page tables and access functions for them */ #if GUEST_PAGING_LEVELS == 2 #define GUEST_L1_PAGETABLE_ENTRIES 1024 #define GUEST_L2_PAGETABLE_ENTRIES 1024 #define GUEST_L1_PAGETABLE_SHIFT 12 #define GUEST_L2_PAGETABLE_SHIFT 22 typedef uint32_t guest_intpte_t; typedef struct { guest_intpte_t l1; } guest_l1e_t; typedef struct { guest_intpte_t l2; } guest_l2e_t; #define PRI_gpte "08x" static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) { return ((paddr_t) gl1e.l1) & (PADDR_MASK & PAGE_MASK); } static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) { return ((paddr_t) gl2e.l2) & (PADDR_MASK & PAGE_MASK); } static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) { return _gfn(guest_l1e_get_paddr(gl1e) >> PAGE_SHIFT); } static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) { return _gfn(guest_l2e_get_paddr(gl2e) >> PAGE_SHIFT); } static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) { return gl1e.l1 & 0xfff; } static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) { return gl2e.l2 & 0xfff; } static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) { return (guest_l1e_t) { (gfn_x(gfn) << PAGE_SHIFT) | flags }; } static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) { return (guest_l2e_t) { (gfn_x(gfn) << PAGE_SHIFT) | flags }; } #define guest_l1_table_offset(_va) \ (((_va) >> GUEST_L1_PAGETABLE_SHIFT) & (GUEST_L1_PAGETABLE_ENTRIES - 1)) #define guest_l2_table_offset(_va) \ (((_va) >> GUEST_L2_PAGETABLE_SHIFT) & (GUEST_L2_PAGETABLE_ENTRIES - 1)) #else /* GUEST_PAGING_LEVELS != 2 */ #if GUEST_PAGING_LEVELS == 3 #define GUEST_L1_PAGETABLE_ENTRIES 512 #define GUEST_L2_PAGETABLE_ENTRIES 512 #define GUEST_L3_PAGETABLE_ENTRIES 4 #define GUEST_L1_PAGETABLE_SHIFT 12 #define GUEST_L2_PAGETABLE_SHIFT 21 #define GUEST_L3_PAGETABLE_SHIFT 30 #else /* GUEST_PAGING_LEVELS == 4 */ #define GUEST_L1_PAGETABLE_ENTRIES 512 #define GUEST_L2_PAGETABLE_ENTRIES 512 #define GUEST_L3_PAGETABLE_ENTRIES 512 #define GUEST_L4_PAGETABLE_ENTRIES 512 #define GUEST_L1_PAGETABLE_SHIFT 12 #define GUEST_L2_PAGETABLE_SHIFT 21 #define GUEST_L3_PAGETABLE_SHIFT 30 #define GUEST_L4_PAGETABLE_SHIFT 39 #endif typedef l1_pgentry_t guest_l1e_t; typedef l2_pgentry_t guest_l2e_t; typedef l3_pgentry_t guest_l3e_t; #if GUEST_PAGING_LEVELS >= 4 typedef l4_pgentry_t guest_l4e_t; #endif typedef intpte_t guest_intpte_t; #define PRI_gpte "016"PRIx64 static inline paddr_t guest_l1e_get_paddr(guest_l1e_t gl1e) { return l1e_get_paddr(gl1e); } static inline paddr_t guest_l2e_get_paddr(guest_l2e_t gl2e) { return l2e_get_paddr(gl2e); } static inline paddr_t guest_l3e_get_paddr(guest_l3e_t gl3e) { return l3e_get_paddr(gl3e); } #if 
GUEST_PAGING_LEVELS >= 4 static inline paddr_t guest_l4e_get_paddr(guest_l4e_t gl4e) { return l4e_get_paddr(gl4e); } #endif static inline gfn_t guest_l1e_get_gfn(guest_l1e_t gl1e) { return _gfn(l1e_get_paddr(gl1e) >> PAGE_SHIFT); } static inline gfn_t guest_l2e_get_gfn(guest_l2e_t gl2e) { return _gfn(l2e_get_paddr(gl2e) >> PAGE_SHIFT); } static inline gfn_t guest_l3e_get_gfn(guest_l3e_t gl3e) { return _gfn(l3e_get_paddr(gl3e) >> PAGE_SHIFT); } #if GUEST_PAGING_LEVELS >= 4 static inline gfn_t guest_l4e_get_gfn(guest_l4e_t gl4e) { return _gfn(l4e_get_paddr(gl4e) >> PAGE_SHIFT); } #endif static inline u32 guest_l1e_get_flags(guest_l1e_t gl1e) { return l1e_get_flags(gl1e); } static inline u32 guest_l2e_get_flags(guest_l2e_t gl2e) { return l2e_get_flags(gl2e); } static inline u32 guest_l3e_get_flags(guest_l3e_t gl3e) { return l3e_get_flags(gl3e); } #if GUEST_PAGING_LEVELS >= 4 static inline u32 guest_l4e_get_flags(guest_l4e_t gl4e) { return l4e_get_flags(gl4e); } #endif static inline guest_l1e_t guest_l1e_from_gfn(gfn_t gfn, u32 flags) { return l1e_from_pfn(gfn_x(gfn), flags); } static inline guest_l2e_t guest_l2e_from_gfn(gfn_t gfn, u32 flags) { return l2e_from_pfn(gfn_x(gfn), flags); } static inline guest_l3e_t guest_l3e_from_gfn(gfn_t gfn, u32 flags) { return l3e_from_pfn(gfn_x(gfn), flags); } #if GUEST_PAGING_LEVELS >= 4 static inline guest_l4e_t guest_l4e_from_gfn(gfn_t gfn, u32 flags) { return l4e_from_pfn(gfn_x(gfn), flags); } #endif #define guest_l1_table_offset(a) l1_table_offset(a) #define guest_l2_table_offset(a) l2_table_offset(a) #define guest_l3_table_offset(a) l3_table_offset(a) #define guest_l4_table_offset(a) l4_table_offset(a) #endif /* GUEST_PAGING_LEVELS != 2 */ /* Mask of the GFNs covered by an L2 or L3 superpage */ #define GUEST_L2_GFN_MASK (GUEST_L1_PAGETABLE_ENTRIES - 1) #define GUEST_L3_GFN_MASK \ ((GUEST_L2_PAGETABLE_ENTRIES * GUEST_L1_PAGETABLE_ENTRIES) - 1) /* Which pagetable features are supported on this vcpu? */ static inline int guest_supports_superpages(struct vcpu *v) { /* The _PAGE_PSE bit must be honoured in HVM guests, whenever * CR4.PSE is set or the guest is in PAE or long mode. * It's also used in the dummy PT for vcpus with CR4.PG cleared. */ return (is_pv_vcpu(v) ? opt_allow_superpage : (GUEST_PAGING_LEVELS != 2 || !hvm_paging_enabled(v) || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE))); } static inline int guest_supports_1G_superpages(struct vcpu *v) { return (GUEST_PAGING_LEVELS >= 4 && hvm_pse1gb_supported(v->domain)); } static inline int guest_supports_nx(struct vcpu *v) { if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx ) return 0; if ( is_pv_vcpu(v) ) return cpu_has_nx; return hvm_nx_enabled(v); } /* Some bits are invalid in any pagetable entry. */ #if GUEST_PAGING_LEVELS == 2 #define _PAGE_INVALID_BITS (0) #elif GUEST_PAGING_LEVELS == 3 #define _PAGE_INVALID_BITS \ get_pte_flags(((1ull<<63) - 1) & ~((1ull<= 3 #if GUEST_PAGING_LEVELS >= 4 guest_l4e_t l4e; /* Guest's level 4 entry */ #endif guest_l3e_t l3e; /* Guest's level 3 entry */ #endif guest_l2e_t l2e; /* Guest's level 2 entry */ guest_l1e_t l1e; /* Guest's level 1 entry (or fabrication) */ #if GUEST_PAGING_LEVELS >= 4 mfn_t l4mfn; /* MFN that the level 4 entry was in */ mfn_t l3mfn; /* MFN that the level 3 entry was in */ #endif mfn_t l2mfn; /* MFN that the level 2 entry was in */ mfn_t l1mfn; /* MFN that the level 1 entry was in */ }; /* Given a walk_t, translate the gw->va into the guest's notion of the * corresponding frame number. 
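 * A condensed sketch of how the pieces fit together (guest_walk_tables()
 * and its arguments are documented further down; the caller is assumed
 * to have located the guest's top-level table already):
 *
 *     walk_t gw;
 *     uint32_t missing = guest_walk_tables(v, p2m, va, &gw, pfec,
 *                                          top_mfn, top_map);
 *
 *     if ( missing == 0 )
 *     {
 *         gfn_t gfn = guest_walk_to_gfn(&gw);
 *         paddr_t gpa = guest_walk_to_gpa(&gw);
 *         ... translate or emulate using gfn and gpa ...
 *     }
 *     else
 *         ... fault: "missing" holds the permission bits the walk
 *             failed on ...
 *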
*/ static inline gfn_t guest_walk_to_gfn(walk_t *gw) { if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) return _gfn(INVALID_GFN); return guest_l1e_get_gfn(gw->l1e); } /* Given a walk_t, translate the gw->va into the guest's notion of the * corresponding physical address. */ static inline paddr_t guest_walk_to_gpa(walk_t *gw) { if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) return 0; return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK); } /* Given a walk_t from a successful walk, return the page-order of the * page or superpage that the virtual address is in. */ static inline unsigned int guest_walk_to_page_order(walk_t *gw) { /* This is only valid for successful walks - otherwise the * PSE bits might be invalid. */ ASSERT(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT); #if GUEST_PAGING_LEVELS >= 3 if ( guest_l3e_get_flags(gw->l3e) & _PAGE_PSE ) return GUEST_L3_PAGETABLE_SHIFT - PAGE_SHIFT; #endif if ( guest_l2e_get_flags(gw->l2e) & _PAGE_PSE ) return GUEST_L2_PAGETABLE_SHIFT - PAGE_SHIFT; return GUEST_L1_PAGETABLE_SHIFT - PAGE_SHIFT; } /* Walk the guest pagetables, after the manner of a hardware walker. * * Inputs: a vcpu, a virtual address, a walk_t to fill, a * pointer to a pagefault code, the MFN of the guest's * top-level pagetable, and a mapping of the * guest's top-level pagetable. * * We walk the vcpu's guest pagetables, filling the walk_t with what we * see and adding any Accessed and Dirty bits that are needed in the * guest entries. Using the pagefault code, we check the permissions as * we go. For the purposes of reading pagetables we treat all non-RAM * memory as contining zeroes. * * Returns 0 for success, or the set of permission bits that we failed on * if the walk did not complete. */ /* Macro-fu so you can call guest_walk_tables() and get the right one. */ #define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels #define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l) #define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS) #define map_domain_gfn GPT_RENAME(map_domain_gfn, GUEST_PAGING_LEVELS) void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn, p2m_type_t *p2mt, p2m_query_t q, uint32_t *rc); extern uint32_t guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, unsigned long va, walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map); /* Pretty-print the contents of a guest-walk */ static inline void print_gw(walk_t *gw) { gdprintk(XENLOG_INFO, "GUEST WALK TO %#lx:\n", gw->va); #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ gdprintk(XENLOG_INFO, " l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn)); gdprintk(XENLOG_INFO, " l4e=%" PRI_gpte "\n", gw->l4e.l4); gdprintk(XENLOG_INFO, " l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn)); #endif /* PAE or 64... */ gdprintk(XENLOG_INFO, " l3e=%" PRI_gpte "\n", gw->l3e.l3); #endif /* All levels... 
*/ gdprintk(XENLOG_INFO, " l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn)); gdprintk(XENLOG_INFO, " l2e=%" PRI_gpte "\n", gw->l2e.l2); gdprintk(XENLOG_INFO, " l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn)); gdprintk(XENLOG_INFO, " l1e=%" PRI_gpte "\n", gw->l1e.l1); } #endif /* _XEN_ASM_GUEST_PT_H */ xen-4.4.0/xen/include/asm-x86/bzimage.h0000664000175000017500000000047112307313555015654 0ustar smbsmb#ifndef __X86_BZIMAGE_H__ #define __X86_BZIMAGE_H__ #include #include unsigned long bzimage_headroom(char *image_start, unsigned long image_length); int bzimage_parse(char *image_base, char **image_start, unsigned long *image_len); #endif /* __X86_BZIMAGE_H__ */ xen-4.4.0/xen/include/asm-x86/msr.h0000664000175000017500000001033712307313555015041 0ustar smbsmb#ifndef __ASM_MSR_H #define __ASM_MSR_H #include "msr-index.h" #ifndef __ASSEMBLY__ #include #include #include #include #include #define rdmsr(msr,val1,val2) \ __asm__ __volatile__("rdmsr" \ : "=a" (val1), "=d" (val2) \ : "c" (msr)) #define rdmsrl(msr,val) do { unsigned long a__,b__; \ __asm__ __volatile__("rdmsr" \ : "=a" (a__), "=d" (b__) \ : "c" (msr)); \ val = a__ | ((u64)b__<<32); \ } while(0) #define wrmsr(msr,val1,val2) \ __asm__ __volatile__("wrmsr" \ : /* no outputs */ \ : "c" (msr), "a" (val1), "d" (val2)) static inline void wrmsrl(unsigned int msr, __u64 val) { __u32 lo, hi; lo = (__u32)val; hi = (__u32)(val >> 32); wrmsr(msr, lo, hi); } /* rdmsr with exception handling */ #define rdmsr_safe(msr,val) ({\ int _rc; \ uint32_t lo, hi; \ __asm__ __volatile__( \ "1: rdmsr\n2:\n" \ ".section .fixup,\"ax\"\n" \ "3: xorl %0,%0\n; xorl %1,%1\n" \ " movl %5,%2\n; jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE(1b, 3b) \ : "=a" (lo), "=d" (hi), "=&r" (_rc) \ : "c" (msr), "2" (0), "i" (-EFAULT)); \ val = lo | ((uint64_t)hi << 32); \ _rc; }) /* wrmsr with exception handling */ static inline int wrmsr_safe(unsigned int msr, uint64_t val) { int _rc; uint32_t lo, hi; lo = (uint32_t)val; hi = (uint32_t)(val >> 32); __asm__ __volatile__( "1: wrmsr\n2:\n" ".section .fixup,\"ax\"\n" "3: movl %5,%0\n; jmp 2b\n" ".previous\n" _ASM_EXTABLE(1b, 3b) : "=&r" (_rc) : "c" (msr), "a" (lo), "d" (hi), "0" (0), "i" (-EFAULT)); return _rc; } #define rdtsc(low,high) \ __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high)) #define rdtscl(low) \ __asm__ __volatile__("rdtsc" : "=a" (low) : : "edx") #define rdtscll(val) do { \ unsigned int a,d; \ asm volatile("rdtsc" : "=a" (a), "=d" (d)); \ (val) = ((unsigned long)a) | (((unsigned long)d)<<32); \ } while(0) #define __write_tsc(val) wrmsrl(MSR_IA32_TSC, val) #define write_tsc(val) ({ \ /* Reliable TSCs are in lockstep across all CPUs. We should \ * never write to them. 
*/ \ ASSERT(!boot_cpu_has(X86_FEATURE_TSC_RELIABLE)); \ __write_tsc(val); \ }) #define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0) #define rdpmc(counter,low,high) \ __asm__ __volatile__("rdpmc" \ : "=a" (low), "=d" (high) \ : "c" (counter)) static inline unsigned long __rdfsbase(void) { unsigned long base; #ifdef HAVE_GAS_FSGSBASE asm volatile ( "rdfsbase %0" : "=r" (base) ); #else asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc0" : "=a" (base) ); #endif return base; } static inline unsigned long __rdgsbase(void) { unsigned long base; #ifdef HAVE_GAS_FSGSBASE asm volatile ( "rdgsbase %0" : "=r" (base) ); #else asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xc8" : "=a" (base) ); #endif return base; } static inline unsigned long rdfsbase(void) { unsigned long base; if ( cpu_has_fsgsbase ) return __rdfsbase(); rdmsrl(MSR_FS_BASE, base); return base; } static inline unsigned long rdgsbase(void) { unsigned long base; if ( cpu_has_fsgsbase ) return __rdgsbase(); rdmsrl(MSR_GS_BASE, base); return base; } static inline void wrfsbase(unsigned long base) { if ( cpu_has_fsgsbase ) #ifdef HAVE_GAS_FSGSBASE asm volatile ( "wrfsbase %0" :: "r" (base) ); #else asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xd0" :: "a" (base) ); #endif else wrmsrl(MSR_FS_BASE, base); } static inline void wrgsbase(unsigned long base) { if ( cpu_has_fsgsbase ) #ifdef HAVE_GAS_FSGSBASE asm volatile ( "wrgsbase %0" :: "r" (base) ); #else asm volatile ( ".byte 0xf3, 0x48, 0x0f, 0xae, 0xd8" :: "a" (base) ); #endif else wrmsrl(MSR_GS_BASE, base); } DECLARE_PER_CPU(u64, efer); u64 read_efer(void); void write_efer(u64 val); DECLARE_PER_CPU(u32, ler_msr); #endif /* !__ASSEMBLY__ */ #endif /* __ASM_MSR_H */ xen-4.4.0/xen/include/asm-x86/irq.h0000664000175000017500000001353512307313555015036 0ustar smbsmb#ifndef _ASM_HW_IRQ_H #define _ASM_HW_IRQ_H /* (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar */ #include #include #include #include #include #include #include extern unsigned int nr_irqs_gsi; extern unsigned int nr_irqs; #define nr_static_irqs nr_irqs_gsi #define IO_APIC_IRQ(irq) (platform_legacy_irq(irq) ? 
\ (1 << (irq)) & io_apic_irqs : \ (irq) < nr_irqs_gsi) #define MSI_IRQ(irq) ((irq) >= nr_irqs_gsi && (irq) < nr_irqs) #define LEGACY_VECTOR(irq) ((irq) + FIRST_LEGACY_VECTOR) typedef struct { DECLARE_BITMAP(_bits,NR_VECTORS); } vmask_t; struct irq_desc; struct arch_irq_desc { s16 vector; /* vector itself is only 8 bits, */ s16 old_vector; /* but we use -1 for unassigned */ cpumask_var_t cpu_mask; cpumask_var_t old_cpu_mask; cpumask_var_t pending_mask; unsigned move_cleanup_count; vmask_t *used_vectors; u8 move_in_progress : 1; s8 used; }; /* For use with irq_desc.arch.used */ #define IRQ_UNUSED (0) #define IRQ_USED (1) #define IRQ_RESERVED (-1) #define IRQ_VECTOR_UNASSIGNED (-1) typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); extern bool_t opt_noirqbalance; #define OPT_IRQ_VECTOR_MAP_DEFAULT 0 /* Do the default thing */ #define OPT_IRQ_VECTOR_MAP_NONE 1 /* None */ #define OPT_IRQ_VECTOR_MAP_GLOBAL 2 /* One global vector map (no vector sharing) */ #define OPT_IRQ_VECTOR_MAP_PERDEV 3 /* Per-device vetor map (no vector sharing w/in a device) */ extern int opt_irq_vector_map; /* * Per-cpu current frame pointer - the location of the last exception frame on * the stack */ DECLARE_PER_CPU(struct cpu_user_regs *, __irq_regs); static inline struct cpu_user_regs *get_irq_regs(void) { return __get_cpu_var(__irq_regs); } static inline struct cpu_user_regs *set_irq_regs(struct cpu_user_regs *new_regs) { struct cpu_user_regs *old_regs, **pp_regs = &__get_cpu_var(__irq_regs); old_regs = *pp_regs; *pp_regs = new_regs; return old_regs; } #define platform_legacy_irq(irq) ((irq) < 16) void event_check_interrupt(struct cpu_user_regs *regs); void invalidate_interrupt(struct cpu_user_regs *regs); void call_function_interrupt(struct cpu_user_regs *regs); void apic_timer_interrupt(struct cpu_user_regs *regs); void error_interrupt(struct cpu_user_regs *regs); void pmu_apic_interrupt(struct cpu_user_regs *regs); void spurious_interrupt(struct cpu_user_regs *regs); void irq_move_cleanup_interrupt(struct cpu_user_regs *regs); uint8_t alloc_hipriority_vector(void); void set_direct_apic_vector( uint8_t vector, void (*handler)(struct cpu_user_regs *)); void alloc_direct_apic_vector( uint8_t *vector, void (*handler)(struct cpu_user_regs *)); void do_IRQ(struct cpu_user_regs *regs); void disable_8259A_irq(struct irq_desc *); void enable_8259A_irq(struct irq_desc *); int i8259A_irq_pending(unsigned int irq); void mask_8259A(void); void unmask_8259A(void); void init_8259A(int aeoi); void make_8259A_irq(unsigned int irq); bool_t bogus_8259A_irq(unsigned int irq); int i8259A_suspend(void); int i8259A_resume(void); void setup_IO_APIC(void); void disable_IO_APIC(void); void setup_ioapic_dest(void); vmask_t *io_apic_get_used_vector_map(unsigned int irq); extern unsigned int io_apic_irqs; DECLARE_PER_CPU(unsigned int, irq_count); struct pirq; struct arch_pirq { int irq; union { struct hvm_pirq { int emuirq; struct hvm_pirq_dpci dpci; } hvm; }; }; #define pirq_dpci(pirq) ((pirq) ? 
&(pirq)->arch.hvm.dpci : NULL) #define dpci_pirq(pd) container_of(pd, struct pirq, arch.hvm.dpci) int pirq_shared(struct domain *d , int irq); int map_domain_pirq(struct domain *d, int pirq, int irq, int type, void *data); int unmap_domain_pirq(struct domain *d, int pirq); int get_free_pirq(struct domain *d, int type); int get_free_pirqs(struct domain *, unsigned int nr); void free_domain_pirqs(struct domain *d); int map_domain_emuirq_pirq(struct domain *d, int pirq, int irq); int unmap_domain_pirq_emuirq(struct domain *d, int pirq); bool_t hvm_domain_use_pirq(const struct domain *, const struct pirq *); /* A cpu has been removed from cpu_online_mask. Re-set irq affinities. */ void fixup_irqs(void); int init_irq_data(void); void clear_irq_vector(int irq); int irq_to_vector(int irq); int create_irq(int node); void destroy_irq(unsigned int irq); int assign_irq_vector(int irq, const cpumask_t *); extern void irq_complete_move(struct irq_desc *); extern struct irq_desc *irq_desc; void lock_vector_lock(void); void unlock_vector_lock(void); void __setup_vector_irq(int cpu); void move_native_irq(struct irq_desc *); void move_masked_irq(struct irq_desc *); int bind_irq_vector(int irq, int vector, const cpumask_t *); void irq_set_affinity(struct irq_desc *, const cpumask_t *mask); int init_domain_irq_mapping(struct domain *); void cleanup_domain_irq_mapping(struct domain *); #define domain_pirq_to_irq(d, pirq) pirq_field(d, pirq, arch.irq, 0) #define domain_irq_to_pirq(d, irq) ({ \ void *__ret = radix_tree_lookup(&(d)->arch.irq_pirq, irq); \ __ret ? radix_tree_ptr_to_int(__ret) : 0; \ }) #define PIRQ_ALLOCATED -1 #define domain_pirq_to_emuirq(d, pirq) pirq_field(d, pirq, \ arch.hvm.emuirq, IRQ_UNBOUND) #define domain_emuirq_to_pirq(d, emuirq) ({ \ void *__ret = radix_tree_lookup(&(d)->arch.hvm_domain.emuirq_pirq, \ emuirq); \ __ret ? radix_tree_ptr_to_int(__ret) : IRQ_UNBOUND; \ }) #define IRQ_UNBOUND -1 #define IRQ_PT -2 #define IRQ_MSI_EMU -3 bool_t cpu_has_pending_apic_eoi(void); #endif /* _ASM_HW_IRQ_H */ xen-4.4.0/xen/include/asm-x86/mtrr.h0000664000175000017500000000564112307313555015226 0ustar smbsmb#ifndef __ASM_X86_MTRR_H__ #define __ASM_X86_MTRR_H__ #include #include /* These are the region types. They match the architectural specification. 
*/ #define MTRR_TYPE_UNCACHABLE 0 #define MTRR_TYPE_WRCOMB 1 #define MTRR_TYPE_WRTHROUGH 4 #define MTRR_TYPE_WRPROT 5 #define MTRR_TYPE_WRBACK 6 #define MTRR_NUM_TYPES 7 #define MEMORY_NUM_TYPES MTRR_NUM_TYPES #define NO_HARDCODE_MEM_TYPE MTRR_NUM_TYPES #define NORMAL_CACHE_MODE 0 #define NO_FILL_CACHE_MODE 2 enum { PAT_TYPE_UNCACHABLE=0, PAT_TYPE_WRCOMB=1, PAT_TYPE_RESERVED=2, PAT_TYPE_WRTHROUGH=4, PAT_TYPE_WRPROT=5, PAT_TYPE_WRBACK=6, PAT_TYPE_UC_MINUS=7, PAT_TYPE_NUMS }; #define INVALID_MEM_TYPE PAT_TYPE_NUMS /* In the Intel processor's MTRR interface, the MTRR type is always held in an 8 bit field: */ typedef u8 mtrr_type; struct mtrr_var_range { uint64_t base; uint64_t mask; }; #define NUM_FIXED_RANGES 88 #define NUM_FIXED_MSR 11 struct mtrr_state { struct mtrr_var_range *var_ranges; mtrr_type fixed_ranges[NUM_FIXED_RANGES]; unsigned char enabled; unsigned char have_fixed; mtrr_type def_type; u64 mtrr_cap; /* ranges in var MSRs are overlapped or not:0(no overlapped) */ bool_t overlapped; }; extern struct mtrr_state mtrr_state; extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); extern int mtrr_add(unsigned long base, unsigned long size, unsigned int type, char increment); extern int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, char increment); extern int mtrr_del(int reg, unsigned long base, unsigned long size); extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern u32 get_pat_flags(struct vcpu *v, u32 gl1e_flags, paddr_t gpaddr, paddr_t spaddr, uint8_t gmtrr_mtype); extern uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn, uint8_t *ipat, bool_t direct_mmio); extern void ept_change_entry_emt_with_range( struct domain *d, unsigned long start_gfn, unsigned long end_gfn); extern unsigned char pat_type_2_pte_flags(unsigned char pat_type); extern int hold_mtrr_updates_on_aps; extern void mtrr_aps_sync_begin(void); extern void mtrr_aps_sync_end(void); extern void mtrr_bp_restore(void); extern bool_t mtrr_var_range_msr_set( struct domain *d, struct mtrr_state *m, uint32_t msr, uint64_t msr_content); extern bool_t mtrr_fix_range_msr_set(struct mtrr_state *v, uint32_t row, uint64_t msr_content); extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, uint64_t msr_content); extern bool_t pat_msr_set(uint64_t *pat, uint64_t msr); bool_t is_var_mtrr_overlapped(struct mtrr_state *m); bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs); #endif /* __ASM_X86_MTRR_H__ */ xen-4.4.0/xen/include/asm-x86/trace.h0000664000175000017500000000274512307313555015342 0ustar smbsmb#ifndef __ASM_TRACE_H__ #define __ASM_TRACE_H__ #include void __trace_pv_trap(int trapnr, unsigned long eip, int use_error_code, unsigned error_code); static inline void trace_pv_trap(int trapnr, unsigned long eip, int use_error_code, unsigned error_code) { if ( unlikely(tb_init_done) ) __trace_pv_trap(trapnr, eip, use_error_code, error_code); } void __trace_pv_page_fault(unsigned long addr, unsigned error_code); static inline void trace_pv_page_fault(unsigned long addr, unsigned error_code) { if ( unlikely(tb_init_done) ) __trace_pv_page_fault(addr, error_code); } void __trace_trap_one_addr(unsigned event, unsigned long va); static inline void trace_trap_one_addr(unsigned event, unsigned long va) { if ( unlikely(tb_init_done) ) __trace_trap_one_addr(event, va); } void __trace_trap_two_addr(unsigned event, unsigned long va1, unsigned long va2); static inline 
void trace_trap_two_addr(unsigned event, unsigned long va1, unsigned long va2) { if ( unlikely(tb_init_done) ) __trace_trap_two_addr(event, va1, va2); } void __trace_ptwr_emulation(unsigned long addr, l1_pgentry_t npte); static inline void trace_ptwr_emulation(unsigned long addr, l1_pgentry_t npte) { if ( unlikely(tb_init_done) ) __trace_ptwr_emulation(addr, npte); } #endif /* __ASM_TRACE_H__ */ xen-4.4.0/xen/include/asm-x86/hardirq.h0000664000175000017500000000125512307313555015671 0ustar smbsmb#ifndef __ASM_HARDIRQ_H #define __ASM_HARDIRQ_H #include #include #include typedef struct { unsigned int __softirq_pending; unsigned int __local_irq_count; unsigned int __nmi_count; bool_t __mwait_wakeup; } __cacheline_aligned irq_cpustat_t; #include /* Standard mappings for irq_cpustat_t above */ #define in_irq() (local_irq_count(smp_processor_id()) != 0) #define irq_enter() (local_irq_count(smp_processor_id())++) #define irq_exit() (local_irq_count(smp_processor_id())--) void ack_bad_irq(unsigned int irq); extern void apic_intr_init(void); extern void smp_intr_init(void); #endif /* __ASM_HARDIRQ_H */ xen-4.4.0/xen/include/asm-x86/processor.h0000664000175000017500000004116312307313555016260 0ustar smbsmb /* Portions are: Copyright (c) 1994 Linus Torvalds */ #ifndef __ASM_X86_PROCESSOR_H #define __ASM_X86_PROCESSOR_H #ifndef __ASSEMBLY__ #include #include #include #include #include #include #include #include #include #endif /* * CPU vendor IDs */ #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 #define X86_VENDOR_UMC 3 #define X86_VENDOR_NEXGEN 4 #define X86_VENDOR_CENTAUR 5 #define X86_VENDOR_RISE 6 #define X86_VENDOR_TRANSMETA 7 #define X86_VENDOR_NSC 8 #define X86_VENDOR_NUM 9 #define X86_VENDOR_UNKNOWN 0xff /* * EFLAGS bits */ #define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ #define X86_EFLAGS_MBS 0x00000002 /* Resvd bit */ #define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ #define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ #define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ #define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ #define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ #define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ #define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ #define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ #define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ #define X86_EFLAGS_NT 0x00004000 /* Nested Task */ #define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ #define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ #define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ #define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ /* * Intel CPU flags in CR0 */ #define X86_CR0_PE 0x00000001 /* Enable Protected Mode (RW) */ #define X86_CR0_MP 0x00000002 /* Monitor Coprocessor (RW) */ #define X86_CR0_EM 0x00000004 /* Require FPU Emulation (RO) */ #define X86_CR0_TS 0x00000008 /* Task Switched (RW) */ #define X86_CR0_ET 0x00000010 /* Extension type (RO) */ #define X86_CR0_NE 0x00000020 /* Numeric Error Reporting (RW) */ #define X86_CR0_WP 0x00010000 /* Supervisor Write Protect (RW) */ #define X86_CR0_AM 0x00040000 /* Alignment Checking (RW) */ #define X86_CR0_NW 0x20000000 /* Not Write-Through (RW) */ #define X86_CR0_CD 0x40000000 /* Cache Disable (RW) */ #define X86_CR0_PG 0x80000000 /* Paging (RW) */ /* * Intel CPU features in CR4 */ #define X86_CR4_VME 0x0001 /* enable vm86 extensions */ #define X86_CR4_PVI 0x0002 /* 
virtual interrupts flag enable */ #define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ #define X86_CR4_DE 0x0008 /* enable debugging extensions */ #define X86_CR4_PSE 0x0010 /* enable page size extensions */ #define X86_CR4_PAE 0x0020 /* enable physical address extensions */ #define X86_CR4_MCE 0x0040 /* Machine check enable */ #define X86_CR4_PGE 0x0080 /* enable global pages */ #define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ #define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ #define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x2000 /* enable VMX */ #define X86_CR4_SMXE 0x4000 /* enable SMX */ #define X86_CR4_FSGSBASE 0x10000 /* enable {rd,wr}{fs,gs}base */ #define X86_CR4_PCIDE 0x20000 /* enable PCID */ #define X86_CR4_OSXSAVE 0x40000 /* enable XSAVE/XRSTOR */ #define X86_CR4_SMEP 0x100000/* enable SMEP */ #define X86_CR4_SMAP 0x200000/* enable SMAP */ /* * Trap/fault mnemonics. */ #define TRAP_divide_error 0 #define TRAP_debug 1 #define TRAP_nmi 2 #define TRAP_int3 3 #define TRAP_overflow 4 #define TRAP_bounds 5 #define TRAP_invalid_op 6 #define TRAP_no_device 7 #define TRAP_double_fault 8 #define TRAP_copro_seg 9 #define TRAP_invalid_tss 10 #define TRAP_no_segment 11 #define TRAP_stack_error 12 #define TRAP_gp_fault 13 #define TRAP_page_fault 14 #define TRAP_spurious_int 15 #define TRAP_copro_error 16 #define TRAP_alignment_check 17 #define TRAP_machine_check 18 #define TRAP_simd_error 19 #define TRAP_last_reserved 31 /* Set for entry via SYSCALL. Informs return code to use SYSRETQ not IRETQ. */ /* NB. Same as VGCF_in_syscall. No bits in common with any other TRAP_ defn. */ #define TRAP_syscall 256 /* Boolean return code: the reason for a fault has been fixed. */ #define EXCRET_fault_fixed 1 /* 'trap_bounce' flags values */ #define TBF_EXCEPTION 1 #define TBF_EXCEPTION_ERRCODE 2 #define TBF_INTERRUPT 8 #define TBF_FAILSAFE 16 /* 'arch_vcpu' flags values */ #define _TF_kernel_mode 0 #define TF_kernel_mode (1<<_TF_kernel_mode) /* #PF error code values. */ #define PFEC_page_present (1U<<0) #define PFEC_write_access (1U<<1) #define PFEC_user_mode (1U<<2) #define PFEC_reserved_bit (1U<<3) #define PFEC_insn_fetch (1U<<4) #define PFEC_page_paged (1U<<5) #define PFEC_page_shared (1U<<6) #ifndef __ASSEMBLY__ struct domain; struct vcpu; /* * Default implementation of macro that returns current * instruction pointer ("program counter"). 
*/ #define current_text_addr() ({ \ void *pc; \ asm ( "leaq 1f(%%rip),%0\n1:" : "=r" (pc) ); \ pc; \ }) struct cpuinfo_x86 { __u8 x86; /* CPU family */ __u8 x86_vendor; /* CPU vendor */ __u8 x86_model; __u8 x86_mask; int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ unsigned int x86_capability[NCAPINTS]; char x86_vendor_id[16]; char x86_model_id[64]; int x86_cache_size; /* in KB - valid for CPUS which support this call */ int x86_cache_alignment; /* In bytes */ int x86_power; __u32 x86_max_cores; /* cpuid returned max cores value */ __u32 booted_cores; /* number of cores as seen by OS */ __u32 x86_num_siblings; /* cpuid logical cpus per chip value */ __u32 apicid; int phys_proc_id; /* package ID of each logical CPU */ int cpu_core_id; /* core ID of each logical CPU*/ int compute_unit_id; /* AMD compute unit ID of each logical CPU */ unsigned short x86_clflush_size; } __cacheline_aligned; /* * capabilities of CPUs */ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 cpu_data[]; #define current_cpu_data cpu_data[smp_processor_id()] extern void set_cpuid_faulting(bool_t enable); extern u64 host_pat; extern bool_t opt_cpu_info; /* Maximum width of physical addresses supported by the hardware */ extern unsigned int paddr_bits; extern void identify_cpu(struct cpuinfo_x86 *); extern void setup_clear_cpu_cap(unsigned int); extern void print_cpu_info(unsigned int cpu); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void dodgy_tsc(void); extern void detect_extended_topology(struct cpuinfo_x86 *c); #ifdef CONFIG_X86_HT extern void detect_ht(struct cpuinfo_x86 *c); #else static always_inline void detect_ht(struct cpuinfo_x86 *c) {} #endif #define cpu_to_core(_cpu) (cpu_data[_cpu].cpu_core_id) #define cpu_to_socket(_cpu) (cpu_data[_cpu].phys_proc_id) /* * Generic CPUID function * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx * resulting in stale register contents being returned. 
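 * Illustrative use (the variable names are placeholders, not part of the
 * interface); note the macro takes the *addresses* of the output variables:
 *
 *     unsigned int eax, ebx, ecx, edx;
 *     cpuid(1, &eax, &ebx, &ecx, &edx);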
*/ #define cpuid(_op,_eax,_ebx,_ecx,_edx) \ asm volatile ( "cpuid" \ : "=a" (*(int *)(_eax)), \ "=b" (*(int *)(_ebx)), \ "=c" (*(int *)(_ecx)), \ "=d" (*(int *)(_edx)) \ : "0" (_op), "2" (0) ) /* Some CPUID calls want 'count' to be placed in ecx */ static inline void cpuid_count( int op, int count, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { asm volatile ( "cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "c" (count) ); } /* * CPUID functions returning a single datum */ static always_inline unsigned int cpuid_eax(unsigned int op) { unsigned int eax; asm volatile ( "cpuid" : "=a" (eax) : "0" (op) : "bx", "cx", "dx" ); return eax; } static always_inline unsigned int cpuid_ebx(unsigned int op) { unsigned int eax, ebx; asm volatile ( "cpuid" : "=a" (eax), "=b" (ebx) : "0" (op) : "cx", "dx" ); return ebx; } static always_inline unsigned int cpuid_ecx(unsigned int op) { unsigned int eax, ecx; asm volatile ( "cpuid" : "=a" (eax), "=c" (ecx) : "0" (op) : "bx", "dx" ); return ecx; } static always_inline unsigned int cpuid_edx(unsigned int op) { unsigned int eax, edx; asm volatile ( "cpuid" : "=a" (eax), "=d" (edx) : "0" (op) : "bx", "cx" ); return edx; } static inline unsigned long read_cr0(void) { unsigned long cr0; asm volatile ( "mov %%cr0,%0\n\t" : "=r" (cr0) ); return cr0; } static inline void write_cr0(unsigned long val) { asm volatile ( "mov %0,%%cr0" : : "r" ((unsigned long)val) ); } static inline unsigned long read_cr2(void) { unsigned long cr2; asm volatile ( "mov %%cr2,%0\n\t" : "=r" (cr2) ); return cr2; } DECLARE_PER_CPU(unsigned long, cr4); static inline unsigned long read_cr4(void) { return this_cpu(cr4); } static inline void write_cr4(unsigned long val) { this_cpu(cr4) = val; asm volatile ( "mov %0,%%cr4" : : "r" (val) ); } /* Clear and set 'TS' bit respectively */ static inline void clts(void) { asm volatile ( "clts" ); } static inline void stts(void) { write_cr0(X86_CR0_TS|read_cr0()); } /* * Save the cr4 feature set we're using (ie * Pentium 4MB enable and PPro Global page * enable), so that any CPU's that boot up * after us can get the correct flags. 
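 * For instance (illustrative only), set_in_cr4(X86_CR4_OSFXSR) below both
 * ORs the bit into mmu_cr4_features and writes it into %cr4 on the calling
 * CPU via write_cr4().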
*/ extern unsigned long mmu_cr4_features; static always_inline void set_in_cr4 (unsigned long mask) { mmu_cr4_features |= mask; write_cr4(read_cr4() | mask); } static always_inline void clear_in_cr4 (unsigned long mask) { mmu_cr4_features &= ~mask; write_cr4(read_cr4() & ~mask); } /* * NSC/Cyrix CPU configuration register indexes */ #define CX86_PCR0 0x20 #define CX86_GCR 0xb8 #define CX86_CCR0 0xc0 #define CX86_CCR1 0xc1 #define CX86_CCR2 0xc2 #define CX86_CCR3 0xc3 #define CX86_CCR4 0xe8 #define CX86_CCR5 0xe9 #define CX86_CCR6 0xea #define CX86_CCR7 0xeb #define CX86_PCR1 0xf0 #define CX86_DIR0 0xfe #define CX86_DIR1 0xff #define CX86_ARR_BASE 0xc4 #define CX86_RCR_BASE 0xdc /* * NSC/Cyrix CPU indexed register access macros */ #define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); }) #define setCx86(reg, data) do { \ outb((reg), 0x22); \ outb((data), 0x23); \ } while (0) /* Stop speculative execution */ static inline void sync_core(void) { int tmp; asm volatile ( "cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory" ); } static always_inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) { /* "monitor %eax,%ecx,%edx;" */ asm volatile ( ".byte 0x0f,0x01,0xc8;" : : "a" (eax), "c" (ecx), "d"(edx) ); } static always_inline void __mwait(unsigned long eax, unsigned long ecx) { /* "mwait %eax,%ecx;" */ asm volatile ( ".byte 0x0f,0x01,0xc9;" : : "a" (eax), "c" (ecx) ); } #define IOBMP_BYTES 8192 #define IOBMP_INVALID_OFFSET 0x8000 struct tss_struct { unsigned short back_link,__blh; union { u64 rsp0, esp0; }; union { u64 rsp1, esp1; }; union { u64 rsp2, esp2; }; u64 reserved1; u64 ist[7]; /* Interrupt Stack Table is 1-based so tss->ist[0] * corresponds to an IST value of 1 in an Interrupt * Descriptor */ u64 reserved2; u16 reserved3; u16 bitmap; /* Pads the TSS to be cacheline-aligned (total size is 0x80). */ u8 __cacheline_filler[24]; } __cacheline_aligned __attribute__((packed)); #define IST_NONE 0UL #define IST_DF 1UL #define IST_NMI 2UL #define IST_MCE 3UL #define IST_MAX 3UL /* Set the interrupt stack table used by a particular interrupt * descriptor table entry. */ static always_inline void set_ist(idt_entry_t *idt, unsigned long ist) { /* IST is a 3 bit field, 32 bits into the IDT entry. */ ASSERT(ist <= IST_MAX); idt->a = (idt->a & ~(7UL << 32)) | (ist << 32); } #define IDT_ENTRIES 256 extern idt_entry_t idt_table[]; extern idt_entry_t *idt_tables[]; DECLARE_PER_CPU(struct tss_struct, init_tss); extern void init_int80_direct_trap(struct vcpu *v); #define set_int80_direct_trap(_ed) ((void)0) extern int gpf_emulate_4gb(struct cpu_user_regs *regs); extern void write_ptbase(struct vcpu *v); void destroy_gdt(struct vcpu *d); long set_gdt(struct vcpu *d, unsigned long *frames, unsigned int entries); #define write_debugreg(reg, val) do { \ unsigned long __val = val; \ asm volatile ( "mov %0,%%db" #reg : : "r" (__val) ); \ } while (0) #define read_debugreg(reg) ({ \ unsigned long __val; \ asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) ); \ __val; \ }) long set_debugreg(struct vcpu *p, int reg, unsigned long value); /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
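 * A minimal, illustrative spin-wait using the cpu_relax() wrapper defined
 * below, with 'flag' standing in for whatever condition is being polled:
 *
 *     while ( !flag )
 *         cpu_relax();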
*/ static always_inline void rep_nop(void) { asm volatile ( "rep;nop" : : : "memory" ); } #define cpu_relax() rep_nop() /* Prefetch instructions for Pentium III and AMD Athlon */ #ifdef CONFIG_MPENTIUMIII #define ARCH_HAS_PREFETCH extern always_inline void prefetch(const void *x) { asm volatile ( "prefetchnta (%0)" : : "r"(x) ); } #elif CONFIG_X86_USE_3DNOW #define ARCH_HAS_PREFETCH #define ARCH_HAS_PREFETCHW #define ARCH_HAS_SPINLOCK_PREFETCH extern always_inline void prefetch(const void *x) { asm volatile ( "prefetch (%0)" : : "r"(x) ); } extern always_inline void prefetchw(const void *x) { asm volatile ( "prefetchw (%0)" : : "r"(x) ); } #define spin_lock_prefetch(x) prefetchw(x) #endif void show_stack(struct cpu_user_regs *regs); void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs); void show_registers(struct cpu_user_regs *regs); void show_execution_state(struct cpu_user_regs *regs); #define dump_execution_state() run_in_exception_handler(show_execution_state) void show_page_walk(unsigned long addr); void fatal_trap(int trapnr, struct cpu_user_regs *regs); void compat_show_guest_stack(struct vcpu *, struct cpu_user_regs *, int lines); extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); void mcheck_init(struct cpuinfo_x86 *c, bool_t bsp); #define DECLARE_TRAP_HANDLER(_name) \ void _name(void); \ void do_ ## _name(struct cpu_user_regs *regs) DECLARE_TRAP_HANDLER(divide_error); DECLARE_TRAP_HANDLER(debug); DECLARE_TRAP_HANDLER(nmi); DECLARE_TRAP_HANDLER(nmi_crash); DECLARE_TRAP_HANDLER(int3); DECLARE_TRAP_HANDLER(overflow); DECLARE_TRAP_HANDLER(bounds); DECLARE_TRAP_HANDLER(invalid_op); DECLARE_TRAP_HANDLER(device_not_available); DECLARE_TRAP_HANDLER(coprocessor_segment_overrun); DECLARE_TRAP_HANDLER(invalid_TSS); DECLARE_TRAP_HANDLER(segment_not_present); DECLARE_TRAP_HANDLER(stack_segment); DECLARE_TRAP_HANDLER(general_protection); DECLARE_TRAP_HANDLER(page_fault); DECLARE_TRAP_HANDLER(coprocessor_error); DECLARE_TRAP_HANDLER(simd_coprocessor_error); DECLARE_TRAP_HANDLER(machine_check); DECLARE_TRAP_HANDLER(alignment_check); DECLARE_TRAP_HANDLER(spurious_interrupt_bug); #undef DECLARE_TRAP_HANDLER void trap_nop(void); void enable_nmis(void); void syscall_enter(void); void sysenter_entry(void); void sysenter_eflags_saved(void); void compat_hypercall(void); void int80_direct_trap(void); extern int hypercall(void); int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val); int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val); void microcode_set_module(unsigned int); int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void), unsigned long len); int microcode_resume_cpu(int cpu); void pv_cpuid(struct cpu_user_regs *regs); #endif /* !__ASSEMBLY__ */ #endif /* __ASM_X86_PROCESSOR_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/config.h0000664000175000017500000003144612307313555015511 0ustar smbsmb/****************************************************************************** * config.h * * A Linux-style configuration list. 
*/ #ifndef __X86_CONFIG_H__ #define __X86_CONFIG_H__ #define LONG_BYTEORDER 3 #define CONFIG_PAGING_LEVELS 4 #define BYTES_PER_LONG (1 << LONG_BYTEORDER) #define BITS_PER_LONG (BYTES_PER_LONG << 3) #define BITS_PER_BYTE 8 #define BITS_PER_XEN_ULONG BITS_PER_LONG #define CONFIG_X86 1 #define CONFIG_X86_HT 1 #define CONFIG_PAGING_ASSISTANCE 1 #define CONFIG_X86_LOCAL_APIC 1 #define CONFIG_X86_GOOD_APIC 1 #define CONFIG_X86_IO_APIC 1 #define CONFIG_X86_PM_TIMER 1 #define CONFIG_HPET_TIMER 1 #define CONFIG_X86_MCE_THERMAL 1 #define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1 #define CONFIG_NUMA 1 #define CONFIG_DISCONTIGMEM 1 #define CONFIG_NUMA_EMU 1 #define CONFIG_PAGEALLOC_MAX_ORDER (2 * PAGETABLE_ORDER) #define CONFIG_DOMAIN_PAGE 1 /* Intel P4 currently has largest cache line (L2 line size is 128 bytes). */ #define CONFIG_X86_L1_CACHE_SHIFT 7 #define CONFIG_ACPI 1 #define CONFIG_ACPI_BOOT 1 #define CONFIG_ACPI_SLEEP 1 #define CONFIG_ACPI_NUMA 1 #define CONFIG_ACPI_SRAT 1 #define CONFIG_ACPI_CSTATE 1 #define CONFIG_VGA 1 #define CONFIG_VIDEO 1 #define CONFIG_HOTPLUG 1 #define CONFIG_HOTPLUG_CPU 1 #define CONFIG_XENOPROF 1 #define CONFIG_KEXEC 1 #define CONFIG_WATCHDOG 1 #define HZ 100 #define OPT_CONSOLE_STR "vga" #ifdef MAX_PHYS_CPUS #define NR_CPUS MAX_PHYS_CPUS #else #define NR_CPUS 256 #endif /* Maximum we can support with current vLAPIC ID mapping. */ #define MAX_HVM_VCPUS 128 #ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL # define supervisor_mode_kernel (1) #else # define supervisor_mode_kernel (0) #endif /* Linkage for x86 */ #define __ALIGN .align 16,0x90 #define __ALIGN_STR ".align 16,0x90" #ifdef __ASSEMBLY__ #define ALIGN __ALIGN #define ALIGN_STR __ALIGN_STR #define ENTRY(name) \ .globl name; \ ALIGN; \ name: #define GLOBAL(name) \ .globl name; \ name: #endif #define NR_hypercalls 64 #ifndef NDEBUG #define MEMORY_GUARD #endif #define STACK_ORDER 3 #define STACK_SIZE (PAGE_SIZE << STACK_ORDER) /* Primary stack is restricted to 8kB by guard pages. */ #define PRIMARY_STACK_SIZE 8192 /* Return value for zero-size _xmalloc(), distinguished from NULL. */ #define ZERO_BLOCK_PTR ((void *)0xBAD0BAD0BAD0BAD0UL) #ifndef __ASSEMBLY__ extern unsigned long trampoline_phys; #define bootsym_phys(sym) \ (((unsigned long)&(sym)-(unsigned long)&trampoline_start)+trampoline_phys) #define bootsym(sym) \ (*RELOC_HIDE((typeof(&(sym)))__va(__pa(&(sym))), \ trampoline_phys-__pa(trampoline_start))) extern char trampoline_start[], trampoline_end[]; extern char trampoline_realmode_entry[]; extern unsigned int trampoline_xen_phys_start; extern unsigned char trampoline_cpu_started; extern char wakeup_start[]; extern unsigned int video_mode, video_flags; extern unsigned short boot_edid_caps; extern unsigned char boot_edid_info[128]; #endif #define asmlinkage #define CONFIG_COMPAT 1 #include #define PML4_ENTRY_BITS 39 #define PML4_ENTRY_BYTES (_AC(1,UL) << PML4_ENTRY_BITS) #define PML4_ADDR(_slot) \ (((_AC(_slot, UL) >> 8) * _AC(0xffff000000000000,UL)) | \ (_AC(_slot, UL) << PML4_ENTRY_BITS)) /* * Memory layout: * 0x0000000000000000 - 0x00007fffffffffff [128TB, 2^47 bytes, PML4:0-255] * Guest-defined use (see below for compatibility mode guests). * 0x0000800000000000 - 0xffff7fffffffffff [16EB] * Inaccessible: current arch only supports 48-bit sign-extended VAs. * 0xffff800000000000 - 0xffff803fffffffff [256GB, 2^38 bytes, PML4:256] * Read-only machine-to-phys translation table (GUEST ACCESSIBLE). 
* 0xffff804000000000 - 0xffff807fffffffff [256GB, 2^38 bytes, PML4:256] * Reserved for future shared info with the guest OS (GUEST ACCESSIBLE). * 0xffff808000000000 - 0xffff80ffffffffff [512GB, 2^39 bytes, PML4:257] * ioremap for PCI mmconfig space * 0xffff810000000000 - 0xffff817fffffffff [512GB, 2^39 bytes, PML4:258] * Guest linear page table. * 0xffff818000000000 - 0xffff81ffffffffff [512GB, 2^39 bytes, PML4:259] * Shadow linear page table. * 0xffff820000000000 - 0xffff827fffffffff [512GB, 2^39 bytes, PML4:260] * Per-domain mappings (e.g., GDT, LDT). * 0xffff828000000000 - 0xffff82bfffffffff [256GB, 2^38 bytes, PML4:261] * Machine-to-phys translation table. * 0xffff82c000000000 - 0xffff82cfffffffff [64GB, 2^36 bytes, PML4:261] * vmap()/ioremap()/fixmap area. * 0xffff82d000000000 - 0xffff82d03fffffff [1GB, 2^30 bytes, PML4:261] * Compatibility machine-to-phys translation table. * 0xffff82d040000000 - 0xffff82d07fffffff [1GB, 2^30 bytes, PML4:261] * High read-only compatibility machine-to-phys translation table. * 0xffff82d080000000 - 0xffff82d0bfffffff [1GB, 2^30 bytes, PML4:261] * Xen text, static data, bss. * 0xffff82d0c0000000 - 0xffff82dffbffffff [61GB - 64MB, PML4:261] * Reserved for future use. * 0xffff82dffc000000 - 0xffff82dfffffffff [64MB, 2^26 bytes, PML4:261] * Super-page information array. * 0xffff82e000000000 - 0xffff82ffffffffff [128GB, 2^37 bytes, PML4:261] * Page-frame information array. * 0xffff830000000000 - 0xffff87ffffffffff [5TB, 5*2^40 bytes, PML4:262-271] * 1:1 direct mapping of all physical memory. * 0xffff880000000000 - 0xffffffffffffffff [120TB, PML4:272-511] * PV: Guest-defined use. * 0xffff880000000000 - 0xffffff7fffffffff [119.5TB, PML4:272-510] * HVM/idle: continuation of 1:1 mapping * 0xffffff8000000000 - 0xffffffffffffffff [512GB, 2^39 bytes PML4:511] * HVM/idle: unused * * Compatibility guest area layout: * 0x0000000000000000 - 0x00000000f57fffff [3928MB, PML4:0] * Guest-defined use. * 0x00000000f5800000 - 0x00000000ffffffff [168MB, PML4:0] * Read-only machine-to-phys translation table (GUEST ACCESSIBLE). * 0x0000000100000000 - 0x0000007fffffffff [508GB, PML4:0] * Unused. * 0x0000008000000000 - 0x000000ffffffffff [512GB, 2^39 bytes, PML4:1] * Hypercall argument translation area. * 0x0000010000000000 - 0x00007fffffffffff [127TB, 2^46 bytes, PML4:2-255] * Reserved for future use. */ #define ROOT_PAGETABLE_FIRST_XEN_SLOT 256 #define ROOT_PAGETABLE_LAST_XEN_SLOT 271 #define ROOT_PAGETABLE_XEN_SLOTS \ (L4_PAGETABLE_ENTRIES - ROOT_PAGETABLE_FIRST_XEN_SLOT - 1) #define ROOT_PAGETABLE_PV_XEN_SLOTS \ (ROOT_PAGETABLE_LAST_XEN_SLOT - ROOT_PAGETABLE_FIRST_XEN_SLOT + 1) /* Hypervisor reserves PML4 slots 256 to 271 inclusive. */ #define HYPERVISOR_VIRT_START (PML4_ADDR(256)) #define HYPERVISOR_VIRT_END (HYPERVISOR_VIRT_START + PML4_ENTRY_BYTES*16) /* Slot 256: read-only guest-accessible machine-to-phys translation table. */ #define RO_MPT_VIRT_START (PML4_ADDR(256)) #define MPT_VIRT_SIZE (PML4_ENTRY_BYTES / 2) #define RO_MPT_VIRT_END (RO_MPT_VIRT_START + MPT_VIRT_SIZE) /* Slot 257: ioremap for PCI mmconfig space for 2048 segments (512GB) * - full 16-bit segment support needs 44 bits * - since PML4 slot has 39 bits, we limit segments to 2048 (11-bits) */ #define PCI_MCFG_VIRT_START (PML4_ADDR(257)) #define PCI_MCFG_VIRT_END (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES) /* Slot 258: linear page table (guest table). 
*/ #define LINEAR_PT_VIRT_START (PML4_ADDR(258)) #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) /* Slot 259: linear page table (shadow table). */ #define SH_LINEAR_PT_VIRT_START (PML4_ADDR(259)) #define SH_LINEAR_PT_VIRT_END (SH_LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) /* Slot 260: per-domain mappings (including map cache). */ #define PERDOMAIN_VIRT_START (PML4_ADDR(260)) #define PERDOMAIN_SLOT_MBYTES (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER)) #define PERDOMAIN_SLOTS 3 #define PERDOMAIN_VIRT_SLOT(s) (PERDOMAIN_VIRT_START + (s) * \ (PERDOMAIN_SLOT_MBYTES << 20)) /* Slot 261: machine-to-phys conversion table (256GB). */ #define RDWR_MPT_VIRT_START (PML4_ADDR(261)) #define RDWR_MPT_VIRT_END (RDWR_MPT_VIRT_START + MPT_VIRT_SIZE) /* Slot 261: vmap()/ioremap()/fixmap area (64GB). */ #define VMAP_VIRT_START RDWR_MPT_VIRT_END #define VMAP_VIRT_END (VMAP_VIRT_START + GB(64)) /* Slot 261: compatibility machine-to-phys conversion table (1GB). */ #define RDWR_COMPAT_MPT_VIRT_START VMAP_VIRT_END #define RDWR_COMPAT_MPT_VIRT_END (RDWR_COMPAT_MPT_VIRT_START + GB(1)) /* Slot 261: high read-only compat machine-to-phys conversion table (1GB). */ #define HIRO_COMPAT_MPT_VIRT_START RDWR_COMPAT_MPT_VIRT_END #define HIRO_COMPAT_MPT_VIRT_END (HIRO_COMPAT_MPT_VIRT_START + GB(1)) /* Slot 261: xen text, static data and bss (1GB). */ #define XEN_VIRT_START (HIRO_COMPAT_MPT_VIRT_END) #define XEN_VIRT_END (XEN_VIRT_START + GB(1)) /* Slot 261: superpage information array (64MB). */ #define SPAGETABLE_VIRT_END FRAMETABLE_VIRT_START #define SPAGETABLE_NR (((FRAMETABLE_NR - 1) >> (SUPERPAGE_SHIFT - \ PAGE_SHIFT)) + 1) #define SPAGETABLE_SIZE (SPAGETABLE_NR * sizeof(struct spage_info)) #define SPAGETABLE_VIRT_START ((SPAGETABLE_VIRT_END - SPAGETABLE_SIZE) & \ (_AC(-1,UL) << SUPERPAGE_SHIFT)) /* Slot 261: page-frame information array (128GB). */ #define FRAMETABLE_VIRT_END DIRECTMAP_VIRT_START #define FRAMETABLE_SIZE GB(128) #define FRAMETABLE_NR (FRAMETABLE_SIZE / sizeof(*frame_table)) #define FRAMETABLE_VIRT_START (FRAMETABLE_VIRT_END - FRAMETABLE_SIZE) /* Slot 262-271/510: A direct 1:1 mapping of all of physical memory. */ #define DIRECTMAP_VIRT_START (PML4_ADDR(262)) #define DIRECTMAP_SIZE (PML4_ENTRY_BYTES * (511 - 262)) #define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + DIRECTMAP_SIZE) #ifndef __ASSEMBLY__ /* This is not a fixed value, just a lower limit. */ #define __HYPERVISOR_COMPAT_VIRT_START 0xF5800000 #define HYPERVISOR_COMPAT_VIRT_START(d) ((d)->arch.hv_compat_vstart) #define MACH2PHYS_COMPAT_VIRT_START HYPERVISOR_COMPAT_VIRT_START #define MACH2PHYS_COMPAT_VIRT_END 0xFFE00000 #define MACH2PHYS_COMPAT_NR_ENTRIES(d) \ ((MACH2PHYS_COMPAT_VIRT_END-MACH2PHYS_COMPAT_VIRT_START(d))>>2) #define COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d) \ l2_table_offset(HYPERVISOR_COMPAT_VIRT_START(d)) #define COMPAT_L2_PAGETABLE_LAST_XEN_SLOT l2_table_offset(~0U) #define COMPAT_L2_PAGETABLE_XEN_SLOTS(d) \ (COMPAT_L2_PAGETABLE_LAST_XEN_SLOT - COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d) + 1) #define COMPAT_LEGACY_MAX_VCPUS XEN_LEGACY_MAX_VCPUS #define COMPAT_HAVE_PV_GUEST_ENTRY XEN_HAVE_PV_GUEST_ENTRY #define COMPAT_HAVE_PV_UPCALL_MASK XEN_HAVE_PV_UPCALL_MASK #endif #define __HYPERVISOR_CS64 0xe008 #define __HYPERVISOR_CS32 0xe038 #define __HYPERVISOR_CS __HYPERVISOR_CS64 #define __HYPERVISOR_DS64 0x0000 #define __HYPERVISOR_DS32 0xe010 #define __HYPERVISOR_DS __HYPERVISOR_DS64 #define SYMBOLS_ORIGIN XEN_VIRT_START /* For generic assembly code: use macros to define operation/operand sizes. 
*/ #define __OS "q" /* Operation Suffix */ #define __OP "r" /* Operand Prefix */ #ifndef __ASSEMBLY__ extern unsigned long xen_phys_start; #endif /* GDT/LDT shadow mapping area. The first per-domain-mapping sub-area. */ #define GDT_LDT_VCPU_SHIFT 5 #define GDT_LDT_VCPU_VA_SHIFT (GDT_LDT_VCPU_SHIFT + PAGE_SHIFT) #define GDT_LDT_MBYTES PERDOMAIN_SLOT_MBYTES #define MAX_VIRT_CPUS (GDT_LDT_MBYTES << (20-GDT_LDT_VCPU_VA_SHIFT)) #define GDT_LDT_VIRT_START PERDOMAIN_VIRT_SLOT(0) #define GDT_LDT_VIRT_END (GDT_LDT_VIRT_START + (GDT_LDT_MBYTES << 20)) /* The address of a particular VCPU's GDT or LDT. */ #define GDT_VIRT_START(v) \ (PERDOMAIN_VIRT_START + ((v)->vcpu_id << GDT_LDT_VCPU_VA_SHIFT)) #define LDT_VIRT_START(v) \ (GDT_VIRT_START(v) + (64*1024)) /* map_domain_page() map cache. The second per-domain-mapping sub-area. */ #define MAPCACHE_VCPU_ENTRIES (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS) #define MAPCACHE_ENTRIES (MAX_VIRT_CPUS * MAPCACHE_VCPU_ENTRIES) #define MAPCACHE_VIRT_START PERDOMAIN_VIRT_SLOT(1) #define MAPCACHE_VIRT_END (MAPCACHE_VIRT_START + \ MAPCACHE_ENTRIES * PAGE_SIZE) /* Argument translation area. The third per-domain-mapping sub-area. */ #define ARG_XLAT_VIRT_START PERDOMAIN_VIRT_SLOT(2) /* Allow for at least one guard page (COMPAT_ARG_XLAT_SIZE being 2 pages): */ #define ARG_XLAT_VA_SHIFT (2 + PAGE_SHIFT) #define ARG_XLAT_START(v) \ (ARG_XLAT_VIRT_START + ((v)->vcpu_id << ARG_XLAT_VA_SHIFT)) #define ELFSIZE 64 #define ARCH_CRASH_SAVE_VMCOREINFO #endif /* __X86_CONFIG_H__ */ xen-4.4.0/xen/include/asm-x86/setup.h0000664000175000017500000000221212307313555015371 0ustar smbsmb#ifndef __X86_SETUP_H_ #define __X86_SETUP_H_ #include extern unsigned long xenheap_initial_phys_start; void early_cpu_init(void); void early_time_init(void); void early_page_fault(void); int intel_cpu_init(void); int amd_init_cpu(void); int cyrix_init_cpu(void); int nsc_init_cpu(void); int centaur_init_cpu(void); int transmeta_init_cpu(void); void set_nr_cpu_ids(unsigned int max_cpus); void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn); void arch_init_memory(void); void subarch_init_memory(void); void init_IRQ(void); void vesa_init(void); void vesa_mtrr_init(void); int construct_dom0( struct domain *d, const module_t *kernel, unsigned long kernel_headroom, module_t *initrd, void *(*bootstrap_map)(const module_t *), char *cmdline); unsigned long initial_images_nrpages(void); void discard_initial_images(void); int xen_in_range(unsigned long mfn); void microcode_grab_module( unsigned long *, const multiboot_info_t *, void *(*)(const module_t *)); extern uint8_t kbd_shift_flags; #ifdef NDEBUG # define highmem_start 0 #else extern unsigned long highmem_start; #endif #endif xen-4.4.0/xen/include/asm-x86/div64.h0000664000175000017500000000057012307313555015172 0ustar smbsmb#ifndef __X86_DIV64 #define __X86_DIV64 #include #define do_div(n,base) ({ \ uint32_t __base = (base); \ uint32_t __rem; \ __rem = ((uint64_t)(n)) % __base; \ (n) = ((uint64_t)(n)) / __base; \ __rem; \ }) #endif xen-4.4.0/xen/include/asm-x86/paging.h0000664000175000017500000004257712307313555015520 0ustar smbsmb/****************************************************************************** * include/asm-x86/paging.h * * Common interface for paging support * Copyright (c) 2007 Advanced Micro Devices (Wei Huang) * Parts of this code are Copyright (c) 2006 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _XEN_PAGING_H #define _XEN_PAGING_H #include #include #include #include #include #include #include /***************************************************************************** * Macros to tell which paging mode a domain is in */ #define PG_SH_shift 20 #define PG_HAP_shift 21 /* We're in one of the shadow modes */ #define PG_SH_enable (1U << PG_SH_shift) #define PG_HAP_enable (1U << PG_HAP_shift) /* common paging mode bits */ #define PG_mode_shift 10 /* Refcounts based on shadow tables instead of guest tables */ #define PG_refcounts (XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT << PG_mode_shift) /* Enable log dirty mode */ #define PG_log_dirty (XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY << PG_mode_shift) /* Xen does p2m translation, not guest */ #define PG_translate (XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE << PG_mode_shift) /* Xen does not steal address space from the domain for its own booking; * requires VT or similar mechanisms */ #define PG_external (XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL << PG_mode_shift) #define paging_mode_enabled(_d) ((_d)->arch.paging.mode) #define paging_mode_shadow(_d) ((_d)->arch.paging.mode & PG_SH_enable) #define paging_mode_hap(_d) ((_d)->arch.paging.mode & PG_HAP_enable) #define paging_mode_refcounts(_d) ((_d)->arch.paging.mode & PG_refcounts) #define paging_mode_log_dirty(_d) ((_d)->arch.paging.mode & PG_log_dirty) #define paging_mode_translate(_d) ((_d)->arch.paging.mode & PG_translate) #define paging_mode_external(_d) ((_d)->arch.paging.mode & PG_external) /* flags used for paging debug */ #define PAGING_DEBUG_LOGDIRTY 0 /***************************************************************************** * Mode-specific entry points into the shadow code. * * These shouldn't be used directly by callers; rather use the functions * below which will indirect through this table as appropriate. 
*/ struct sh_emulate_ctxt; struct shadow_paging_mode { void (*detach_old_tables )(struct vcpu *v); int (*x86_emulate_write )(struct vcpu *v, unsigned long va, void *src, u32 bytes, struct sh_emulate_ctxt *sh_ctxt); int (*x86_emulate_cmpxchg )(struct vcpu *v, unsigned long va, unsigned long old, unsigned long new, unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt); mfn_t (*make_monitor_table )(struct vcpu *v); void (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn); int (*guess_wrmap )(struct vcpu *v, unsigned long vaddr, mfn_t gmfn); void (*pagetable_dying )(struct vcpu *v, paddr_t gpa); /* For outsiders to tell what mode we're in */ unsigned int shadow_levels; }; /************************************************/ /* common paging interface */ /************************************************/ struct paging_mode { int (*page_fault )(struct vcpu *v, unsigned long va, struct cpu_user_regs *regs); int (*invlpg )(struct vcpu *v, unsigned long va); unsigned long (*gva_to_gfn )(struct vcpu *v, struct p2m_domain *p2m, unsigned long va, uint32_t *pfec); unsigned long (*p2m_ga_to_gfn )(struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3, paddr_t ga, uint32_t *pfec, unsigned int *page_order); void (*update_cr3 )(struct vcpu *v, int do_locking); void (*update_paging_modes )(struct vcpu *v); void (*write_p2m_entry )(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level); int (*write_guest_entry )(struct vcpu *v, intpte_t *p, intpte_t new, mfn_t gmfn); int (*cmpxchg_guest_entry )(struct vcpu *v, intpte_t *p, intpte_t *old, intpte_t new, mfn_t gmfn); void * (*guest_map_l1e )(struct vcpu *v, unsigned long va, unsigned long *gl1mfn); void (*guest_get_eff_l1e )(struct vcpu *v, unsigned long va, void *eff_l1e); unsigned int guest_levels; /* paging support extension */ struct shadow_paging_mode shadow; }; /***************************************************************************** * Log dirty code */ /* free log dirty bitmap resource */ void paging_free_log_dirty_bitmap(struct domain *d); /* get the dirty bitmap for a specific range of pfns */ void paging_log_dirty_range(struct domain *d, unsigned long begin_pfn, unsigned long nr, uint8_t *dirty_bitmap); /* enable log dirty */ int paging_log_dirty_enable(struct domain *d, bool_t log_global); /* disable log dirty */ int paging_log_dirty_disable(struct domain *d); /* log dirty initialization */ void paging_log_dirty_init(struct domain *d, int (*enable_log_dirty)(struct domain *d, bool_t log_global), int (*disable_log_dirty)(struct domain *d), void (*clean_dirty_bitmap)(struct domain *d)); /* mark a page as dirty */ void paging_mark_dirty(struct domain *d, unsigned long guest_mfn); /* is this guest page dirty? * This is called from inside paging code, with the paging lock held. */ int paging_mfn_is_dirty(struct domain *d, mfn_t gmfn); /* * Log-dirty radix tree indexing: * All tree nodes are PAGE_SIZE bytes, mapped on-demand. * Leaf nodes are simple bitmaps; 1 bit per guest pfn. * Interior nodes are arrays of LOGDIRTY_NODE_ENTRIES mfns. * TODO: Dynamic radix tree height. Most guests will only need 2 levels. * The fourth level is basically unusable on 32-bit Xen. * TODO2: Abstract out the radix-tree mechanics? 
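 * Worked example (assuming 4kB pages, i.e. PAGE_SHIFT 12 and
 * PAGETABLE_ORDER 9 as defined elsewhere in this tree): a leaf bitmap
 * covers PAGE_SIZE * 8 = 2^15 pfns, so L1_LOGDIRTY_IDX() keeps the low
 * 15 bits of the pfn and L2_LOGDIRTY_IDX() extracts the next 9 bits to
 * select one of the LOGDIRTY_NODE_ENTRIES (512) slots in the next-level
 * node.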
*/ #define LOGDIRTY_NODE_ENTRIES (1 << PAGETABLE_ORDER) #define L1_LOGDIRTY_IDX(pfn) ((pfn) & ((1 << (PAGE_SHIFT+3)) - 1)) #define L2_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3)) & \ (LOGDIRTY_NODE_ENTRIES-1)) #define L3_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER)) & \ (LOGDIRTY_NODE_ENTRIES-1)) #define L4_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER*2)) & \ (LOGDIRTY_NODE_ENTRIES-1)) /* VRAM dirty tracking support */ struct sh_dirty_vram { unsigned long begin_pfn; unsigned long end_pfn; paddr_t *sl1ma; uint8_t *dirty_bitmap; s_time_t last_dirty; }; /***************************************************************************** * Entry points into the paging-assistance code */ /* Initialize the paging resource for vcpu struct. It is called by * vcpu_initialise() in domain.c */ void paging_vcpu_init(struct vcpu *v); /* Set up the paging-assistance-specific parts of a domain struct at * start of day. Called for every domain from arch_domain_create() */ int paging_domain_init(struct domain *d, unsigned int domcr_flags); /* Handler for paging-control ops: operations from user-space to enable * and disable ephemeral shadow modes (test mode and log-dirty mode) and * manipulate the log-dirty bitmap. */ int paging_domctl(struct domain *d, xen_domctl_shadow_op_t *sc, XEN_GUEST_HANDLE_PARAM(void) u_domctl); /* Call when destroying a domain */ void paging_teardown(struct domain *d); /* Call once all of the references to the domain have gone away */ void paging_final_teardown(struct domain *d); /* Enable an arbitrary paging-assistance mode. Call once at domain * creation. */ int paging_enable(struct domain *d, u32 mode); #define paging_get_hostmode(v) ((v)->arch.paging.mode) #define paging_get_nestedmode(v) ((v)->arch.paging.nestedmode) const struct paging_mode *paging_get_mode(struct vcpu *v); void paging_update_nestedmode(struct vcpu *v); /* Page fault handler * Called from pagefault handler in Xen, and from the HVM trap handlers * for pagefaults. Returns 1 if this fault was an artefact of the * paging code (and the guest should retry) or 0 if it is not (and the * fault should be handled elsewhere or passed to the guest). * * Note: under shadow paging, this function handles all page faults; * however, for hardware-assisted paging, this function handles only * host page faults (i.e. nested page faults). */ static inline int paging_fault(unsigned long va, struct cpu_user_regs *regs) { struct vcpu *v = current; return paging_get_hostmode(v)->page_fault(v, va, regs); } /* Handle invlpg requests on vcpus. * Returns 1 if the invlpg instruction should be issued on the hardware, * or 0 if it's safe not to do so. */ static inline int paging_invlpg(struct vcpu *v, unsigned long va) { return paging_get_hostmode(v)->invlpg(v, va); } /* Translate a guest virtual address to the frame number that the * *guest* pagetables would map it to. Returns INVALID_GFN if the guest * tables don't map this address for this kind of access. * pfec[0] is used to determine which kind of access this is when * walking the tables. The caller should set the PFEC_page_present bit * in pfec[0]; in the failure case, that bit will be cleared if appropriate. */ #define INVALID_GFN (-1UL) unsigned long paging_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec); /* Translate a guest address using a particular CR3 value. This is used * to by nested HAP code, to walk the guest-supplied NPT tables as if * they were pagetables. 
* Use 'paddr_t' for the guest address so it won't overflow when * l1 or l2 guest is in 32bit PAE mode. * If the GFN returned is not INVALID_GFN, *page_order gives * the size of the superpage (if any) it was found in. */ static inline unsigned long paging_ga_to_gfn_cr3(struct vcpu *v, unsigned long cr3, paddr_t ga, uint32_t *pfec, unsigned int *page_order) { struct p2m_domain *p2m = v->domain->arch.p2m; return paging_get_hostmode(v)->p2m_ga_to_gfn(v, p2m, cr3, ga, pfec, page_order); } /* Update all the things that are derived from the guest's CR3. * Called when the guest changes CR3; the caller can then use v->arch.cr3 * as the value to load into the host CR3 to schedule this vcpu */ static inline void paging_update_cr3(struct vcpu *v) { paging_get_hostmode(v)->update_cr3(v, 1); } /* Update all the things that are derived from the guest's CR0/CR3/CR4. * Called to initialize paging structures if the paging mode * has changed, and when bringing up a VCPU for the first time. */ static inline void paging_update_paging_modes(struct vcpu *v) { paging_get_hostmode(v)->update_paging_modes(v); } /* Write a new value into the guest pagetable, and update the * paging-assistance state appropriately. Returns 0 if we page-faulted, * 1 for success. */ static inline int paging_write_guest_entry(struct vcpu *v, intpte_t *p, intpte_t new, mfn_t gmfn) { if ( unlikely(paging_mode_enabled(v->domain) && v->arch.paging.mode != NULL) ) return paging_get_hostmode(v)->write_guest_entry(v, p, new, gmfn); else return (!__copy_to_user(p, &new, sizeof(new))); } /* Cmpxchg a new value into the guest pagetable, and update the * paging-assistance state appropriately. Returns 0 if we page-faulted, * 1 if not. N.B. caller should check the value of "old" to see if the * cmpxchg itself was successful. */ static inline int paging_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p, intpte_t *old, intpte_t new, mfn_t gmfn) { if ( unlikely(paging_mode_enabled(v->domain) && v->arch.paging.mode != NULL) ) return paging_get_hostmode(v)->cmpxchg_guest_entry(v, p, old, new, gmfn); else return (!cmpxchg_user(p, *old, new)); } /* Helper function that writes a pte in such a way that a concurrent read * never sees a half-written entry that has _PAGE_PRESENT set */ static inline void safe_write_pte(l1_pgentry_t *p, l1_pgentry_t new) { *p = new; } /* Atomically write a P2M entry and update the paging-assistance state * appropriately. * Arguments: the domain in question, the GFN whose mapping is being updated, * a pointer to the entry to be written, the MFN in which the entry resides, * the new contents of the entry, and the level in the p2m tree at which * we are writing. */ struct p2m_domain; void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level); /* Called from the guest to indicate that the a process is being * torn down and its pagetables will soon be discarded */ void pagetable_dying(struct domain *d, paddr_t gpa); /* Print paging-assistance info to the console */ void paging_dump_domain_info(struct domain *d); void paging_dump_vcpu_info(struct vcpu *v); /***************************************************************************** * Access to the guest pagetables */ /* Get a mapping of a PV guest's l1e for this virtual address. 
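 * Returns NULL if the address is out of range or the l1e cannot safely be
 * read; a non-NULL mapping should be released with guest_unmap_l1e() below.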
*/ static inline l1_pgentry_t * guest_map_l1e(struct vcpu *v, unsigned long addr, unsigned long *gl1mfn) { l2_pgentry_t l2e; if ( unlikely(!__addr_ok(addr)) ) return NULL; if ( unlikely(paging_mode_translate(v->domain)) ) return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn); /* Find this l1e and its enclosing l1mfn in the linear map */ if ( __copy_from_user(&l2e, &__linear_l2_table[l2_linear_offset(addr)], sizeof(l2_pgentry_t)) != 0 ) return NULL; /* Check flags that it will be safe to read the l1e */ if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT ) return NULL; *gl1mfn = l2e_get_pfn(l2e); return (l1_pgentry_t *)map_domain_page(*gl1mfn) + l1_table_offset(addr); } /* Pull down the mapping we got from guest_map_l1e() */ static inline void guest_unmap_l1e(struct vcpu *v, void *p) { unmap_domain_page(p); } /* Read the guest's l1e that maps this address. */ static inline void guest_get_eff_l1e(struct vcpu *v, unsigned long addr, l1_pgentry_t *eff_l1e) { if ( unlikely(!__addr_ok(addr)) ) { *eff_l1e = l1e_empty(); return; } if ( likely(!paging_mode_translate(v->domain)) ) { ASSERT(!paging_mode_external(v->domain)); if ( __copy_from_user(eff_l1e, &__linear_l1_table[l1_linear_offset(addr)], sizeof(l1_pgentry_t)) != 0 ) *eff_l1e = l1e_empty(); return; } paging_get_hostmode(v)->guest_get_eff_l1e(v, addr, eff_l1e); } /* Read the guest's l1e that maps this address, from the kernel-mode * pagetables. */ static inline void guest_get_eff_kern_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e) { int user_mode = !(v->arch.flags & TF_kernel_mode); #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v) TOGGLE_MODE(); guest_get_eff_l1e(v, addr, eff_l1e); TOGGLE_MODE(); } #endif /* XEN_PAGING_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/mach-default/0000775000175000017500000000000012307313555016415 5ustar smbsmbxen-4.4.0/xen/include/asm-x86/mach-default/smpboot_hooks.h0000664000175000017500000000217112307313555021455 0ustar smbsmb/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws * which needs to alter them. */ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(0xa, 0xf); spin_unlock_irqrestore(&rtc_lock, flags); flush_tlb_local(); Dprintk("1.\n"); *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; Dprintk("2.\n"); *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; Dprintk("3.\n"); } static inline void smpboot_restore_warm_reset_vector(void) { unsigned long flags; /* * Install writable page 0 entry to set BIOS data area. */ flush_tlb_local(); /* * Paranoid: Set warm reset code and vector here back * to default values. */ spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(0, 0xf); spin_unlock_irqrestore(&rtc_lock, flags); *((volatile int *) maddr_to_virt(0x467)) = 0; } static inline void smpboot_setup_io_apic(void) { /* * Here we can be sure that there is an IO-APIC in the system. Let's * go and set it up: */ if (!skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); } xen-4.4.0/xen/include/asm-x86/mach-default/bios_ebda.h0000664000175000017500000000053012307313555020473 0ustar smbsmb#ifndef _MACH_BIOS_EBDA_H #define _MACH_BIOS_EBDA_H /* * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E. 
*/ static inline unsigned int get_bios_ebda(void) { unsigned int address = *(unsigned short *)maddr_to_virt(0x40E); address <<= 4; return address; /* 0 means none */ } #endif /* _MACH_BIOS_EBDA_H */ xen-4.4.0/xen/include/asm-x86/mach-default/mach_wakecpu.h0000664000175000017500000000044412307313555021217 0ustar smbsmb#ifndef __ASM_MACH_WAKECPU_H #define __ASM_MACH_WAKECPU_H /* * This file copes with machines that wakeup secondary CPUs by the * INIT, INIT, STARTUP sequence. */ #define TRAMPOLINE_LOW maddr_to_virt(0x467) #define TRAMPOLINE_HIGH maddr_to_virt(0x469) #endif /* __ASM_MACH_WAKECPU_H */ xen-4.4.0/xen/include/asm-x86/mach-default/io_ports.h0000664000175000017500000000135212307313555020425 0ustar smbsmb/* * arch/i386/mach-generic/io_ports.h * * Machine specific IO port address definition for generic. * Written by Osamu Tomita */ #ifndef _MACH_IO_PORTS_H #define _MACH_IO_PORTS_H /* i8253A PIT registers */ #define PIT_MODE 0x43 #define PIT_CH0 0x40 #define PIT_CH2 0x42 /* i8259A PIC registers */ #define PIC_MASTER_CMD 0x20 #define PIC_MASTER_IMR 0x21 #define PIC_MASTER_ISR PIC_MASTER_CMD #define PIC_MASTER_POLL PIC_MASTER_ISR #define PIC_MASTER_OCW3 PIC_MASTER_ISR #define PIC_SLAVE_CMD 0xa0 #define PIC_SLAVE_IMR 0xa1 /* i8259A PIC related value */ #define PIC_CASCADE_IR 2 #define MASTER_ICW4_DEFAULT 0x01 #define SLAVE_ICW4_DEFAULT 0x01 #define PIC_ICW4_AEOI 2 #endif /* !_MACH_IO_PORTS_H */ xen-4.4.0/xen/include/asm-x86/mach-default/irq_vectors.h0000664000175000017500000000220212307313555021122 0ustar smbsmb#ifndef _ASM_IRQ_VECTORS_H #define _ASM_IRQ_VECTORS_H /* Processor-initiated interrupts are all high priority. */ #define SPURIOUS_APIC_VECTOR 0xff #define ERROR_APIC_VECTOR 0xfe #define INVALIDATE_TLB_VECTOR 0xfd #define EVENT_CHECK_VECTOR 0xfc #define CALL_FUNCTION_VECTOR 0xfb #define LOCAL_TIMER_VECTOR 0xfa #define PMU_APIC_VECTOR 0xf9 /* * High-priority dynamically-allocated vectors. For interrupts that * must be higher priority than any guest-bound interrupt. */ #define FIRST_HIPRIORITY_VECTOR 0xf1 #define LAST_HIPRIORITY_VECTOR 0xf8 /* IRQ0 (timer) is statically allocated but must be high priority. */ #define IRQ0_VECTOR 0xf0 /* Legacy PIC uses vectors 0xe0-0xef. */ #define FIRST_LEGACY_VECTOR 0xe0 #define LAST_LEGACY_VECTOR 0xef #define HYPERCALL_VECTOR 0x82 #define LEGACY_SYSCALL_VECTOR 0x80 /* Dynamically-allocated vectors available to any driver. */ #define FIRST_DYNAMIC_VECTOR 0x20 #define LAST_DYNAMIC_VECTOR 0xdf #define NR_DYNAMIC_VECTORS (LAST_DYNAMIC_VECTOR - FIRST_DYNAMIC_VECTOR + 1) #define IRQ_MOVE_CLEANUP_VECTOR FIRST_DYNAMIC_VECTOR #define NR_VECTORS 256 #endif /* _ASM_IRQ_VECTORS_H */ xen-4.4.0/xen/include/asm-x86/mach-default/mach_mpspec.h0000664000175000017500000000042712307313555021050 0ustar smbsmb#ifndef __ASM_MACH_MPSPEC_H #define __ASM_MACH_MPSPEC_H #define MAX_IRQ_SOURCES 256 /* Generic (i.e. installer) kernels need lots of bus entries. */ /* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. 
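 * (256 + 4 = 260, hence the value of MAX_MP_BUSSES below.)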
*/ #define MAX_MP_BUSSES 260 #endif /* __ASM_MACH_MPSPEC_H */ xen-4.4.0/xen/include/asm-x86/mach-default/mach_mpparse.h0000664000175000017500000000053612307313555021231 0ustar smbsmb#ifndef __ASM_MACH_MPPARSE_H #define __ASM_MACH_MPPARSE_H static inline int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid) { return 0; } /* Hook from generic ACPI tables.c */ static inline int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; } #endif /* __ASM_MACH_MPPARSE_H */ xen-4.4.0/xen/include/asm-x86/perfc.h0000664000175000017500000000022712307313555015334 0ustar smbsmb#ifndef __ASM_PERFC_H__ #define __ASM_PERFC_H__ static inline void arch_perfc_reset(void) { } static inline void arch_perfc_gather(void) { } #endif xen-4.4.0/xen/include/asm-x86/shadow.h0000664000175000017500000001166612307313555015533 0ustar smbsmb/****************************************************************************** * include/asm-x86/shadow.h * * Parts of this code are Copyright (c) 2006 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _XEN_SHADOW_H #define _XEN_SHADOW_H #include #include #include #include #include #include #include /***************************************************************************** * Macros to tell which shadow paging mode a domain is in*/ #define shadow_mode_enabled(_d) paging_mode_shadow(_d) #define shadow_mode_refcounts(_d) (paging_mode_shadow(_d) && \ paging_mode_refcounts(_d)) #define shadow_mode_log_dirty(_d) (paging_mode_shadow(_d) && \ paging_mode_log_dirty(_d)) #define shadow_mode_translate(_d) (paging_mode_shadow(_d) && \ paging_mode_translate(_d)) #define shadow_mode_external(_d) (paging_mode_shadow(_d) && \ paging_mode_external(_d)) /* Xen traps & emulates all reads of all page table pages: * not yet supported */ #define shadow_mode_trap_reads(_d) ({ (void)(_d); 0; }) /***************************************************************************** * Entry points into the shadow code */ /* Set up the shadow-specific parts of a domain struct at start of day. * Called from paging_domain_init(). */ void shadow_domain_init(struct domain *d, unsigned int domcr_flags); /* Setup the shadow-specific parts of a vcpu struct. It is called by * paging_vcpu_init() in paging.c */ void shadow_vcpu_init(struct vcpu *v); /* Enable an arbitrary shadow mode. Call once at domain creation. */ int shadow_enable(struct domain *d, u32 mode); /* Enable VRAM dirty bit tracking. 
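 * A hedged usage sketch (caller-side names are hypothetical, not from this
 * tree): a query for the VRAM pages written since the previous call might
 * look like
 *
 *     rc = shadow_track_dirty_vram(d, vram_first_pfn, vram_nr_frames,
 *                                  dirty_bitmap_handle);
 *
 * with each set bit in the returned bitmap marking one dirtied page.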
*/ int shadow_track_dirty_vram(struct domain *d, unsigned long first_pfn, unsigned long nr, XEN_GUEST_HANDLE_64(uint8) dirty_bitmap); /* Handler for shadow control ops: operations from user-space to enable * and disable ephemeral shadow modes (test mode and log-dirty mode) and * manipulate the log-dirty bitmap. */ int shadow_domctl(struct domain *d, xen_domctl_shadow_op_t *sc, XEN_GUEST_HANDLE_PARAM(void) u_domctl); /* Call when destroying a domain */ void shadow_teardown(struct domain *d); /* Call once all of the references to the domain have gone away */ void shadow_final_teardown(struct domain *d); /* shadow code to call when log dirty is enabled */ int shadow_enable_log_dirty(struct domain *d, bool_t log_global); /* shadow code to call when log dirty is disabled */ int shadow_disable_log_dirty(struct domain *d); /* shadow code to call when bitmap is being cleaned */ void shadow_clean_dirty_bitmap(struct domain *d); /* Update all the things that are derived from the guest's CR0/CR3/CR4. * Called to initialize paging structures if the paging mode * has changed, and when bringing up a VCPU for the first time. */ void shadow_update_paging_modes(struct vcpu *v); /* Remove all mappings of the guest page from the shadows. * This is called from common code. It does not flush TLBs. */ int sh_remove_all_mappings(struct vcpu *v, mfn_t target_mfn); static inline void shadow_drop_references(struct domain *d, struct page_info *p) { if ( unlikely(shadow_mode_enabled(d)) ) /* See the comment about locking in sh_remove_all_mappings */ sh_remove_all_mappings(d->vcpu[0], _mfn(page_to_mfn(p))); } /* Remove all shadows of the guest mfn. */ void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int fast, int all); static inline void shadow_remove_all_shadows(struct vcpu *v, mfn_t gmfn) { /* See the comment about locking in sh_remove_shadows */ sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */); } /* Discard _all_ mappings from the domain's shadows. */ void shadow_blow_tables_per_domain(struct domain *d); #endif /* _XEN_SHADOW_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/x86_64/0000775000175000017500000000000012307313555015021 5ustar smbsmbxen-4.4.0/xen/include/asm-x86/x86_64/page.h0000664000175000017500000001445412307313555016116 0ustar smbsmb #ifndef __X86_64_PAGE_H__ #define __X86_64_PAGE_H__ #define L1_PAGETABLE_SHIFT 12 #define L2_PAGETABLE_SHIFT 21 #define L3_PAGETABLE_SHIFT 30 #define L4_PAGETABLE_SHIFT 39 #define PAGE_SHIFT L1_PAGETABLE_SHIFT #define SUPERPAGE_SHIFT L2_PAGETABLE_SHIFT #define ROOT_PAGETABLE_SHIFT L4_PAGETABLE_SHIFT #define PAGETABLE_ORDER 9 #define L1_PAGETABLE_ENTRIES (1<> 47) == ((long)(x) >> 63)) #ifndef __ASSEMBLY__ #include #include extern unsigned long xen_virt_end; extern unsigned long max_pdx; extern unsigned long pfn_pdx_bottom_mask, ma_va_bottom_mask; extern unsigned int pfn_pdx_hole_shift; extern unsigned long pfn_hole_mask; extern unsigned long pfn_top_mask, ma_top_mask; extern void pfn_pdx_hole_setup(unsigned long); #define page_to_pdx(pg) ((pg) - frame_table) #define pdx_to_page(pdx) (frame_table + (pdx)) #define spage_to_pdx(spg) (((spg) - spage_table)<<(SUPERPAGE_SHIFT-PAGE_SHIFT)) #define pdx_to_spage(pdx) (spage_table + ((pdx)>>(SUPERPAGE_SHIFT-PAGE_SHIFT))) /* * Note: These are solely for the use by page_{get,set}_owner(), and * therefore don't need to handle the XEN_VIRT_{START,END} range. 
*/ #define virt_to_pdx(va) (((unsigned long)(va) - DIRECTMAP_VIRT_START) >> \ PAGE_SHIFT) #define pdx_to_virt(pdx) ((void *)(DIRECTMAP_VIRT_START + \ ((unsigned long)(pdx) << PAGE_SHIFT))) extern int __mfn_valid(unsigned long mfn); static inline unsigned long pfn_to_pdx(unsigned long pfn) { return (pfn & pfn_pdx_bottom_mask) | ((pfn & pfn_top_mask) >> pfn_pdx_hole_shift); } static inline unsigned long pdx_to_pfn(unsigned long pdx) { return (pdx & pfn_pdx_bottom_mask) | ((pdx << pfn_pdx_hole_shift) & pfn_top_mask); } static inline unsigned long pfn_to_sdx(unsigned long pfn) { return pfn_to_pdx(pfn) >> (SUPERPAGE_SHIFT-PAGE_SHIFT); } static inline unsigned long sdx_to_pfn(unsigned long sdx) { return pdx_to_pfn(sdx << (SUPERPAGE_SHIFT-PAGE_SHIFT)); } static inline unsigned long __virt_to_maddr(unsigned long va) { ASSERT(va >= XEN_VIRT_START); ASSERT(va < DIRECTMAP_VIRT_END); if ( va >= DIRECTMAP_VIRT_START ) va -= DIRECTMAP_VIRT_START; else { ASSERT(va < XEN_VIRT_END); va += xen_phys_start - XEN_VIRT_START; } return (va & ma_va_bottom_mask) | ((va << pfn_pdx_hole_shift) & ma_top_mask); } static inline void *__maddr_to_virt(unsigned long ma) { ASSERT(pfn_to_pdx(ma >> PAGE_SHIFT) < (DIRECTMAP_SIZE >> PAGE_SHIFT)); return (void *)(DIRECTMAP_VIRT_START + ((ma & ma_va_bottom_mask) | ((ma & ma_top_mask) >> pfn_pdx_hole_shift))); } /* read access (should only be used for debug printk's) */ typedef u64 intpte_t; #define PRIpte "016lx" typedef struct { intpte_t l1; } l1_pgentry_t; typedef struct { intpte_t l2; } l2_pgentry_t; typedef struct { intpte_t l3; } l3_pgentry_t; typedef struct { intpte_t l4; } l4_pgentry_t; typedef l4_pgentry_t root_pgentry_t; #endif /* !__ASSEMBLY__ */ #define pte_read_atomic(ptep) read_atomic(ptep) #define pte_write_atomic(ptep, pte) write_atomic(ptep, pte) #define pte_write(ptep, pte) write_atomic(ptep, pte) /* Given a virtual address, get an entry offset into a linear page table. */ #define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> L1_PAGETABLE_SHIFT) #define l2_linear_offset(_a) (((_a) & VADDR_MASK) >> L2_PAGETABLE_SHIFT) #define l3_linear_offset(_a) (((_a) & VADDR_MASK) >> L3_PAGETABLE_SHIFT) #define l4_linear_offset(_a) (((_a) & VADDR_MASK) >> L4_PAGETABLE_SHIFT) #define is_guest_l1_slot(_s) (1) #define is_guest_l2_slot(_d, _t, _s) \ ( !is_pv_32bit_domain(_d) || \ !((_t) & PGT_pae_xen_l2) || \ ((_s) < COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(_d)) ) #define is_guest_l3_slot(_s) (1) #define is_guest_l4_slot(_d, _s) \ ( is_pv_32bit_domain(_d) \ ? ((_s) == 0) \ : (((_s) < ROOT_PAGETABLE_FIRST_XEN_SLOT) || \ ((_s) > ROOT_PAGETABLE_LAST_XEN_SLOT))) #define root_get_pfn l4e_get_pfn #define root_get_flags l4e_get_flags #define root_get_intpte l4e_get_intpte #define root_empty l4e_empty #define root_from_paddr l4e_from_paddr #define PGT_root_page_table PGT_l4_page_table /* * PTE pfn and flags: * 40-bit pfn = (pte[51:12]) * 24-bit flags = (pte[63:52],pte[11:0]) */ /* Extract flags into 24-bit integer, or turn 24-bit flags into a pte mask. */ #define get_pte_flags(x) (((int)((x) >> 40) & ~0xFFF) | ((int)(x) & 0xFFF)) #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF)) /* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/ #define _PAGE_NX_BIT (1U<<23) /* Bit 22 of a 24-bit flag mask. 
This corresponds to bit 62 of a pte.*/ #define _PAGE_GNTTAB (1U<<22) #define PAGE_HYPERVISOR (__PAGE_HYPERVISOR | _PAGE_GLOBAL) #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL) #define USER_MAPPINGS_ARE_GLOBAL #ifdef USER_MAPPINGS_ARE_GLOBAL /* * Bit 12 of a 24-bit flag mask. This corresponds to bit 52 of a pte. * This is needed to distinguish between user and kernel PTEs since _PAGE_USER * is asserted for both. */ #define _PAGE_GUEST_KERNEL (1U<<12) #else #define _PAGE_GUEST_KERNEL 0 #endif #endif /* __X86_64_PAGE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/x86_64/asm_defns.h0000664000175000017500000001562612307313555017143 0ustar smbsmb#ifndef __X86_64_ASM_DEFNS_H__ #define __X86_64_ASM_DEFNS_H__ #include #ifdef CONFIG_FRAME_POINTER /* Indicate special exception stack frame by inverting the frame pointer. */ #define SETUP_EXCEPTION_FRAME_POINTER(offs) \ leaq offs(%rsp),%rbp; \ notq %rbp #else #define SETUP_EXCEPTION_FRAME_POINTER(offs) #endif #ifndef NDEBUG #define ASSERT_INTERRUPT_STATUS(x) \ pushf; \ testb $X86_EFLAGS_IF>>8,1(%rsp); \ j##x 1f; \ ud2a; \ 1: addq $8,%rsp; #else #define ASSERT_INTERRUPT_STATUS(x) #endif #define ASSERT_INTERRUPTS_ENABLED ASSERT_INTERRUPT_STATUS(nz) #define ASSERT_INTERRUPTS_DISABLED ASSERT_INTERRUPT_STATUS(z) /* * This flag is set in an exception frame when registers R12-R15 did not get * saved. */ #define _TRAP_regs_partial 16 #define TRAP_regs_partial (1 << _TRAP_regs_partial) /* * This flag gets set in an exception frame when registers R12-R15 possibly * get modified from their originally saved values and hence need to be * restored even if the normal call flow would restore register values. * * The flag being set implies _TRAP_regs_partial to be unset. Restoring * R12-R15 thus is * - required when this flag is set, * - safe when _TRAP_regs_partial is unset. */ #define _TRAP_regs_dirty 17 #define TRAP_regs_dirty (1 << _TRAP_regs_dirty) #define mark_regs_dirty(r) ({ \ struct cpu_user_regs *r__ = (r); \ ASSERT(!((r__)->entry_vector & TRAP_regs_partial)); \ r__->entry_vector |= TRAP_regs_dirty; \ }) #define SAVE_ALL \ addq $-(UREGS_error_code-UREGS_r15), %rsp; \ cld; \ movq %rdi,UREGS_rdi(%rsp); \ movq %rsi,UREGS_rsi(%rsp); \ movq %rdx,UREGS_rdx(%rsp); \ movq %rcx,UREGS_rcx(%rsp); \ movq %rax,UREGS_rax(%rsp); \ movq %r8,UREGS_r8(%rsp); \ movq %r9,UREGS_r9(%rsp); \ movq %r10,UREGS_r10(%rsp); \ movq %r11,UREGS_r11(%rsp); \ movq %rbx,UREGS_rbx(%rsp); \ movq %rbp,UREGS_rbp(%rsp); \ SETUP_EXCEPTION_FRAME_POINTER(UREGS_rbp); \ movq %r12,UREGS_r12(%rsp); \ movq %r13,UREGS_r13(%rsp); \ movq %r14,UREGS_r14(%rsp); \ movq %r15,UREGS_r15(%rsp); \ #ifdef __ASSEMBLY__ /* * Save all registers not preserved by C code or used in entry/exit code. Mark * the frame as partial. 
* * @type: exception type * @compat: R8-R15 don't need saving, and the frame nevertheless is complete */ .macro SAVE_VOLATILE type compat=0 .if \compat movl $\type,UREGS_entry_vector-UREGS_error_code(%rsp) .else movl $\type|TRAP_regs_partial,\ UREGS_entry_vector-UREGS_error_code(%rsp) .endif addq $-(UREGS_error_code-UREGS_r15),%rsp cld movq %rdi,UREGS_rdi(%rsp) movq %rsi,UREGS_rsi(%rsp) movq %rdx,UREGS_rdx(%rsp) movq %rcx,UREGS_rcx(%rsp) movq %rax,UREGS_rax(%rsp) .if !\compat movq %r8,UREGS_r8(%rsp) movq %r9,UREGS_r9(%rsp) movq %r10,UREGS_r10(%rsp) movq %r11,UREGS_r11(%rsp) .endif movq %rbx,UREGS_rbx(%rsp) movq %rbp,UREGS_rbp(%rsp) SETUP_EXCEPTION_FRAME_POINTER(UREGS_rbp) .endm /* * Complete a frame potentially only partially saved. */ .macro SAVE_PRESERVED btrl $_TRAP_regs_partial,UREGS_entry_vector(%rsp) jnc 987f movq %r12,UREGS_r12(%rsp) movq %r13,UREGS_r13(%rsp) movq %r14,UREGS_r14(%rsp) movq %r15,UREGS_r15(%rsp) 987: .endm /* * Reload registers not preserved by C code from frame. * * @compat: R8-R11 don't need reloading * * For the way it is used in RESTORE_ALL, this macro must preserve EFLAGS.ZF. */ .macro LOAD_C_CLOBBERED compat=0 .if !\compat movq UREGS_r11(%rsp),%r11 movq UREGS_r10(%rsp),%r10 movq UREGS_r9(%rsp),%r9 movq UREGS_r8(%rsp),%r8 .endif movq UREGS_rax(%rsp),%rax movq UREGS_rcx(%rsp),%rcx movq UREGS_rdx(%rsp),%rdx movq UREGS_rsi(%rsp),%rsi movq UREGS_rdi(%rsp),%rdi .endm /* * Restore all previously saved registers. * * @adj: extra stack pointer adjustment to be folded into the adjustment done * anyway at the end of the macro * @compat: R8-R15 don't need reloading */ .macro RESTORE_ALL adj=0 compat=0 .if !\compat testl $TRAP_regs_dirty,UREGS_entry_vector(%rsp) .endif LOAD_C_CLOBBERED \compat .if !\compat jz 987f movq UREGS_r15(%rsp),%r15 movq UREGS_r14(%rsp),%r14 movq UREGS_r13(%rsp),%r13 movq UREGS_r12(%rsp),%r12 #ifndef NDEBUG .subsection 1 987: testl $TRAP_regs_partial,UREGS_entry_vector(%rsp) jnz 987f cmpq UREGS_r15(%rsp),%r15 jne 789f cmpq UREGS_r14(%rsp),%r14 jne 789f cmpq UREGS_r13(%rsp),%r13 jne 789f cmpq UREGS_r12(%rsp),%r12 je 987f 789: ud2 .subsection 0 #endif .endif 987: movq UREGS_rbp(%rsp),%rbp movq UREGS_rbx(%rsp),%rbx subq $-(UREGS_error_code-UREGS_r15+\adj), %rsp .endm #endif #ifdef PERF_COUNTERS #define PERFC_INCR(_name,_idx,_cur) \ pushq _cur; \ movslq VCPU_processor(_cur),_cur; \ pushq %rdx; \ leaq __per_cpu_offset(%rip),%rdx; \ movq (%rdx,_cur,8),_cur; \ leaq per_cpu__perfcounters(%rip),%rdx; \ addq %rdx,_cur; \ popq %rdx; \ incl ASM_PERFC_##_name*4(_cur,_idx,4); \ popq _cur #else #define PERFC_INCR(_name,_idx,_cur) #endif /* Work around AMD erratum #88 */ #define safe_swapgs \ "mfence; swapgs;" #ifdef __sun__ #define REX64_PREFIX "rex64\\" #elif defined(__clang__) #define REX64_PREFIX ".byte 0x48; " #else #define REX64_PREFIX "rex64/" #endif #define BUILD_COMMON_IRQ() \ __asm__( \ "\n" __ALIGN_STR"\n" \ "common_interrupt:\n\t" \ STR(SAVE_ALL) "\n\t" \ "movq %rsp,%rdi\n\t" \ "callq " STR(do_IRQ) "\n\t" \ "jmp ret_from_intr\n"); #define BUILD_IRQ(nr) \ "pushq $0\n\t" \ "movl $"#nr",4(%rsp)\n\t" \ "jmp common_interrupt" #ifdef __ASSEMBLY__ # define _ASM_EX(p) p-. #else # define _ASM_EX(p) #p "-." 
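/*
 * Illustrative note, not part of the original header: in C code _ASM_EX(p)
 * stringises its argument and appends "-.", yielding a location-relative
 * expression for exception-table entries emitted from inline assembly.  A
 * hedged sketch of that kind of use -- the section name, labels and
 * operands here are assumptions for illustration only:
 *
 *     asm volatile ( "1: movq (%[p]), %%rax\n"
 *                    "2:\n"
 *                    ".pushsection .ex_table, \"a\"\n"
 *                    " .long " _ASM_EX(1b) ", " _ASM_EX(2b) "\n"
 *                    ".popsection"
 *                    : : [p] "r" (ptr) : "rax" );
 *
 * Each entry then records "label minus the entry's own address", so the
 * table stays position-independent.
 */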
#endif #endif /* __X86_64_ASM_DEFNS_H__ */ xen-4.4.0/xen/include/asm-x86/x86_64/elf.h0000664000175000017500000000552112307313555015743 0ustar smbsmb#ifndef __X86_64_ELF_H__ #define __X86_64_ELF_H__ typedef struct { unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long rbp; unsigned long rbx; unsigned long r11; unsigned long r10; unsigned long r9; unsigned long r8; unsigned long rax; unsigned long rcx; unsigned long rdx; unsigned long rsi; unsigned long rdi; unsigned long orig_rax; unsigned long rip; unsigned long cs; unsigned long rflags; unsigned long rsp; unsigned long ss; unsigned long thread_fs; unsigned long thread_gs; unsigned long ds; unsigned long es; unsigned long fs; unsigned long gs; } ELF_Gregset; static inline void elf_core_save_regs(ELF_Gregset *core_regs, crash_xen_core_t *xen_core_regs) { unsigned long tmp; asm volatile("movq %%r15,%0" : "=m"(core_regs->r15)); asm volatile("movq %%r14,%0" : "=m"(core_regs->r14)); asm volatile("movq %%r13,%0" : "=m"(core_regs->r13)); asm volatile("movq %%r12,%0" : "=m"(core_regs->r12)); asm volatile("movq %%rbp,%0" : "=m"(core_regs->rbp)); asm volatile("movq %%rbx,%0" : "=m"(core_regs->rbx)); asm volatile("movq %%r11,%0" : "=m"(core_regs->r11)); asm volatile("movq %%r10,%0" : "=m"(core_regs->r10)); asm volatile("movq %%r9,%0" : "=m"(core_regs->r9)); asm volatile("movq %%r8,%0" : "=m"(core_regs->r8)); asm volatile("movq %%rax,%0" : "=m"(core_regs->rax)); asm volatile("movq %%rcx,%0" : "=m"(core_regs->rcx)); asm volatile("movq %%rdx,%0" : "=m"(core_regs->rdx)); asm volatile("movq %%rsi,%0" : "=m"(core_regs->rsi)); asm volatile("movq %%rdi,%0" : "=m"(core_regs->rdi)); /* orig_rax not filled in for now */ core_regs->rip = (unsigned long)elf_core_save_regs; asm volatile("movl %%cs, %%eax;" :"=a"(core_regs->cs)); asm volatile("pushfq; popq %0" :"=m"(core_regs->rflags)); asm volatile("movq %%rsp,%0" : "=m"(core_regs->rsp)); asm volatile("movl %%ss, %%eax;" :"=a"(core_regs->ss)); /* thread_fs not filled in for now */ /* thread_gs not filled in for now */ asm volatile("movl %%ds, %%eax;" :"=a"(core_regs->ds)); asm volatile("movl %%es, %%eax;" :"=a"(core_regs->es)); asm volatile("movl %%fs, %%eax;" :"=a"(core_regs->fs)); asm volatile("movl %%gs, %%eax;" :"=a"(core_regs->gs)); asm volatile("mov %%cr0, %0" : "=r" (tmp) : ); xen_core_regs->cr0 = tmp; asm volatile("mov %%cr2, %0" : "=r" (tmp) : ); xen_core_regs->cr2 = tmp; asm volatile("mov %%cr3, %0" : "=r" (tmp) : ); xen_core_regs->cr3 = tmp; asm volatile("mov %%cr4, %0" : "=r" (tmp) : ); xen_core_regs->cr4 = tmp; } #endif /* __X86_64_ELF_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/x86_64/efibind.h0000664000175000017500000002125112307313555016573 0ustar smbsmb/*++ Copyright (c) 1998 Intel Corporation Module Name: efefind.h Abstract: EFI to compile bindings Revision History --*/ #ifndef __GNUC__ #pragma pack() #endif // // Basic int types of various widths // #if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 199901L ) // No ANSI C 1999/2000 stdint.h integer width declarations #if defined(_MSC_EXTENSIONS) // Use Microsoft C compiler integer width declarations typedef unsigned __int64 uint64_t; typedef __int64 int64_t; typedef unsigned __int32 uint32_t; typedef __int32 int32_t; typedef unsigned short uint16_t; typedef short int16_t; typedef unsigned char uint8_t; typedef char int8_t; #elif defined(__GNUC__) typedef unsigned long long uint64_t 
__attribute__((aligned (8))); typedef long long int64_t __attribute__((aligned (8))); typedef unsigned int uint32_t; typedef int int32_t; typedef unsigned short uint16_t; typedef short int16_t; typedef unsigned char uint8_t; typedef char int8_t; #elif defined(UNIX_LP64) /* Use LP64 programming model from C_FLAGS for integer width declarations */ typedef unsigned long uint64_t; typedef long int64_t; typedef unsigned int uint32_t; typedef int int32_t; typedef unsigned short uint16_t; typedef short int16_t; typedef unsigned char uint8_t; typedef char int8_t; #else /* Assume P64 programming model from C_FLAGS for integer width declarations */ typedef unsigned long long uint64_t __attribute__((aligned (8))); typedef long long int64_t __attribute__((aligned (8))); typedef unsigned int uint32_t; typedef int int32_t; typedef unsigned short uint16_t; typedef short int16_t; typedef unsigned char uint8_t; typedef char int8_t; #endif #endif // // Basic EFI types of various widths // #ifndef __WCHAR_TYPE__ # define __WCHAR_TYPE__ short #endif typedef uint64_t UINT64; typedef int64_t INT64; #ifndef _BASETSD_H_ typedef uint32_t UINT32; typedef int32_t INT32; #endif typedef uint16_t UINT16; typedef int16_t INT16; typedef uint8_t UINT8; typedef int8_t INT8; typedef __WCHAR_TYPE__ WCHAR; #undef VOID #define VOID void typedef int64_t INTN; typedef uint64_t UINTN; #ifdef EFI_NT_EMULATOR #define POST_CODE(_Data) #else #ifdef EFI_DEBUG #define POST_CODE(_Data) __asm mov eax,(_Data) __asm out 0x80,al #else #define POST_CODE(_Data) #endif #endif #define EFIERR(a) (0x8000000000000000 | a) #define EFI_ERROR_MASK 0x8000000000000000 #define EFIERR_OEM(a) (0xc000000000000000 | a) #define BAD_POINTER 0xFBFBFBFBFBFBFBFB #define MAX_ADDRESS 0xFFFFFFFFFFFFFFFF #ifdef EFI_NT_EMULATOR #define BREAKPOINT() __asm { int 3 } #else #define BREAKPOINT() while (TRUE); // Make it hang on Bios[Dbg]32 #endif // // Pointers must be aligned to these address to function // #define MIN_ALIGNMENT_SIZE 4 #define ALIGN_VARIABLE(Value ,Adjustment) \ (UINTN)Adjustment = 0; \ if((UINTN)Value % MIN_ALIGNMENT_SIZE) \ (UINTN)Adjustment = MIN_ALIGNMENT_SIZE - ((UINTN)Value % MIN_ALIGNMENT_SIZE); \ Value = (UINTN)Value + (UINTN)Adjustment // // Define macros to build data structure signatures from characters. 
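// For example (purely illustrative, not a signature used by this header):
//   EFI_SIGNATURE_32('P','E','3','2') == 'P' | ('E' << 8) | ('3' << 16) | ('2' << 24)
// i.e. the four characters packed little-endian into one UINT32.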
// #define EFI_SIGNATURE_16(A,B) ((A) | (B<<8)) #define EFI_SIGNATURE_32(A,B,C,D) (EFI_SIGNATURE_16(A,B) | (EFI_SIGNATURE_16(C,D) << 16)) #define EFI_SIGNATURE_64(A,B,C,D,E,F,G,H) (EFI_SIGNATURE_32(A,B,C,D) | ((UINT64)(EFI_SIGNATURE_32(E,F,G,H)) << 32)) // // To export & import functions in the EFI emulator environment // #ifdef EFI_NT_EMULATOR #define EXPORTAPI __declspec( dllexport ) #else #define EXPORTAPI #endif // // EFIAPI - prototype calling convention for EFI function pointers // BOOTSERVICE - prototype for implementation of a boot service interface // RUNTIMESERVICE - prototype for implementation of a runtime service interface // RUNTIMEFUNCTION - prototype for implementation of a runtime function that is not a service // RUNTIME_CODE - pragma macro for declaring runtime code // #ifndef EFIAPI // Forces EFI calling conventions reguardless of compiler options #ifdef _MSC_EXTENSIONS #define EFIAPI __cdecl // Force C calling convention for Microsoft C compiler #elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) #define EFIAPI __attribute__((__ms_abi__)) // Force Microsoft ABI #else #define EFIAPI // Substitute expresion to force C calling convention #endif #endif #define BOOTSERVICE //#define RUNTIMESERVICE(proto,a) alloc_text("rtcode",a); proto a //#define RUNTIMEFUNCTION(proto,a) alloc_text("rtcode",a); proto a #define RUNTIMESERVICE #define RUNTIMEFUNCTION #define RUNTIME_CODE(a) alloc_text("rtcode", a) #define BEGIN_RUNTIME_DATA() data_seg("rtdata") #define END_RUNTIME_DATA() data_seg("") #define VOLATILE volatile #define MEMORY_FENCE() #ifdef EFI_NT_EMULATOR // // To help ensure proper coding of integrated drivers, they are // compiled as DLLs. In NT they require a dll init entry pointer. // The macro puts a stub entry point into the DLL so it will load. // #define EFI_DRIVER_ENTRY_POINT(InitFunction) \ UINTN \ __stdcall \ _DllMainCRTStartup ( \ UINTN Inst, \ UINTN reason_for_call, \ VOID *rserved \ ) \ { \ return 1; \ } \ \ int \ EXPORTAPI \ __cdecl \ InitializeDriver ( \ void *ImageHandle, \ void *SystemTable \ ) \ { \ return InitFunction(ImageHandle, SystemTable); \ } #define LOAD_INTERNAL_DRIVER(_if, type, name, entry) \ (_if)->LoadInternal(type, name, NULL) #else // EFI_NT_EMULATOR // // When build similiar to FW, then link everything together as // one big module. // #define EFI_DRIVER_ENTRY_POINT(InitFunction) \ UINTN \ InitializeDriver ( \ VOID *ImageHandle, \ VOID *SystemTable \ ) \ { \ return InitFunction(ImageHandle, \ SystemTable); \ } \ \ EFI_STATUS efi_main( \ EFI_HANDLE image, \ EFI_SYSTEM_TABLE *systab \ ) __attribute__((weak, \ alias ("InitializeDriver"))); #define LOAD_INTERNAL_DRIVER(_if, type, name, entry) \ (_if)->LoadInternal(type, name, entry) #endif // EFI_FW_NT // // Some compilers don't support the forward reference construct: // typedef struct XXXXX // // The following macro provide a workaround for such cases. // #ifdef NO_INTERFACE_DECL #define INTERFACE_DECL(x) #else #ifdef __GNUC__ #define INTERFACE_DECL(x) struct x #else #define INTERFACE_DECL(x) typedef struct x #endif #endif #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) #define uefi_call_wrapper(func, va_num, ...) 
func(__VA_ARGS__) #else /* for x86_64, EFI_FUNCTION_WRAPPER must be defined */ #ifdef EFI_FUNCTION_WRAPPER UINTN uefi_call_wrapper(void *func, unsigned long va_num, ...); #else #error "EFI_FUNCTION_WRAPPER must be defined for x86_64 architecture" #endif #endif #ifdef _MSC_EXTENSIONS #pragma warning ( disable : 4731 ) // Suppress warnings about modification of EBP #endif xen-4.4.0/xen/include/asm-x86/x86_64/uaccess.h0000664000175000017500000000610112307313555016616 0ustar smbsmb#ifndef __X86_64_UACCESS_H #define __X86_64_UACCESS_H #define COMPAT_ARG_XLAT_VIRT_BASE ((void *)ARG_XLAT_START(current)) #define COMPAT_ARG_XLAT_SIZE (2*PAGE_SIZE) struct vcpu; int setup_compat_arg_xlat(struct vcpu *v); void free_compat_arg_xlat(struct vcpu *v); #define is_compat_arg_xlat_range(addr, size) ({ \ unsigned long __off; \ __off = (unsigned long)(addr) - (unsigned long)COMPAT_ARG_XLAT_VIRT_BASE; \ (__off < COMPAT_ARG_XLAT_SIZE) && \ ((__off + (unsigned long)(size)) <= COMPAT_ARG_XLAT_SIZE); \ }) #define xlat_page_start ((unsigned long)COMPAT_ARG_XLAT_VIRT_BASE) #define xlat_page_size COMPAT_ARG_XLAT_SIZE #define xlat_page_left_size(xlat_page_current) \ (xlat_page_start + xlat_page_size - xlat_page_current) #define xlat_malloc_init(xlat_page_current) do { \ xlat_page_current = xlat_page_start; \ } while (0) extern void *xlat_malloc(unsigned long *xlat_page_current, size_t size); #define xlat_malloc_array(_p, _t, _c) ((_t *) xlat_malloc(&_p, sizeof(_t) * _c)) /* * Valid if in +ve half of 48-bit address space, or above Xen-reserved area. * This is also valid for range checks (addr, addr+size). As long as the * start address is outside the Xen-reserved area then we will access a * non-canonical address (and thus fault) before ever reaching VIRT_START. */ #define __addr_ok(addr) \ (((unsigned long)(addr) < (1UL<<47)) || \ ((unsigned long)(addr) >= HYPERVISOR_VIRT_END)) #define access_ok(addr, size) \ (__addr_ok(addr) || is_compat_arg_xlat_range(addr, size)) #define array_access_ok(addr, count, size) \ (access_ok(addr, (count)*(size))) #define __compat_addr_ok(d, addr) \ ((unsigned long)(addr) < HYPERVISOR_COMPAT_VIRT_START(d)) #define __compat_access_ok(d, addr, size) \ __compat_addr_ok(d, (unsigned long)(addr) + ((size) ? 
(size) - 1 : 0)) #define compat_access_ok(addr, size) \ __compat_access_ok(current->domain, addr, size) #define compat_array_access_ok(addr,count,size) \ (likely((count) < (~0U / (size))) && \ compat_access_ok(addr, (count) * (size))) #define __put_user_size(x,ptr,size,retval,errret) \ do { \ retval = 0; \ switch (size) { \ case 1: __put_user_asm(x,ptr,retval,"b","b","iq",errret);break; \ case 2: __put_user_asm(x,ptr,retval,"w","w","ir",errret);break; \ case 4: __put_user_asm(x,ptr,retval,"l","k","ir",errret);break; \ case 8: __put_user_asm(x,ptr,retval,"q","","ir",errret);break; \ default: __put_user_bad(); \ } \ } while (0) #define __get_user_size(x,ptr,size,retval,errret) \ do { \ retval = 0; \ switch (size) { \ case 1: __get_user_asm(x,ptr,retval,"b","b","=q",errret);break; \ case 2: __get_user_asm(x,ptr,retval,"w","w","=r",errret);break; \ case 4: __get_user_asm(x,ptr,retval,"l","k","=r",errret);break; \ case 8: __get_user_asm(x,ptr,retval,"q","","=r",errret); break; \ default: __get_user_bad(); \ } \ } while (0) #endif /* __X86_64_UACCESS_H */ xen-4.4.0/xen/include/asm-x86/x86_64/system.h0000664000175000017500000000546612307313555016531 0ustar smbsmb#ifndef __X86_64_SYSTEM_H__ #define __X86_64_SYSTEM_H__ #define cmpxchg(ptr,o,n) \ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o), \ (unsigned long)(n),sizeof(*(ptr)))) /* * This function causes value _o to be changed to _n at location _p. * If this access causes a fault then we return 1, otherwise we return 0. * If no fault occurs then _o is updated to the value we saw at _p. If this * is the same as the initial value of _o then _n is written to location _p. */ #define __cmpxchg_user(_p,_o,_n,_isuff,_oppre,_regtype) \ asm volatile ( \ "1: lock; cmpxchg"_isuff" %"_oppre"2,%3\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: movl $1,%1\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE(1b, 3b) \ : "=a" (_o), "=r" (_rc) \ : _regtype (_n), "m" (*__xg((volatile void *)_p)), "0" (_o), "1" (0) \ : "memory"); #define cmpxchg_user(_p,_o,_n) \ ({ \ int _rc; \ switch ( sizeof(*(_p)) ) { \ case 1: \ __cmpxchg_user(_p,_o,_n,"b","b","q"); \ break; \ case 2: \ __cmpxchg_user(_p,_o,_n,"w","w","r"); \ break; \ case 4: \ __cmpxchg_user(_p,_o,_n,"l","k","r"); \ break; \ case 8: \ __cmpxchg_user(_p,_o,_n,"q","","r"); \ break; \ } \ _rc; \ }) #define mb() \ asm volatile ( "mfence" : : : "memory" ) #endif /* __X86_64_SYSTEM_H__ */ xen-4.4.0/xen/include/asm-x86/x86_64/regs.h0000664000175000017500000000200012307313555016122 0ustar smbsmb#ifndef _X86_64_REGS_H #define _X86_64_REGS_H #include #include #define vm86_mode(r) (0) /* No VM86 support in long mode. */ #define ring_0(r) (((r)->cs & 3) == 0) #define ring_1(r) (((r)->cs & 3) == 1) #define ring_2(r) (((r)->cs & 3) == 2) #define ring_3(r) (((r)->cs & 3) == 3) #define guest_kernel_mode(v, r) \ (!is_pv_32bit_vcpu(v) ? \ (ring_3(r) && ((v)->arch.flags & TF_kernel_mode)) : \ (ring_1(r))) #define permit_softint(dpl, v, r) \ ((dpl) >= (guest_kernel_mode(v, r) ? 1 : 3)) /* Check for null trap callback handler: Is the EIP null? */ #define null_trap_bounce(v, tb) \ (!is_pv_32bit_vcpu(v) ? ((tb)->eip == 0) : (((tb)->cs & ~3) == 0)) /* Number of bytes of on-stack execution state to be context-switched. */ /* NB. Segment registers and bases are not saved/restored on x86/64 stack. 
*/ #define CTXT_SWITCH_STACK_BYTES (offsetof(struct cpu_user_regs, es)) #endif xen-4.4.0/xen/include/asm-x86/elf.h0000664000175000017500000000046512307313555015007 0ustar smbsmb#ifndef __X86_ELF_H__ #define __X86_ELF_H__ typedef struct { unsigned long cr0, cr2, cr3, cr4; } crash_xen_core_t; #include #endif /* __X86_ELF_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/apicdef.h0000664000175000017500000002355712307313555015643 0ustar smbsmb#ifndef __ASM_APICDEF_H #define __ASM_APICDEF_H /* * Constants for various Intel APICs. (local APIC, IOAPIC, etc.) * * Alan Cox , 1995. * Ingo Molnar , 1999, 2000 */ #define APIC_DEFAULT_PHYS_BASE 0xfee00000 #define APIC_ID 0x20 #define APIC_ID_MASK (0xFFu<<24) #define GET_xAPIC_ID(x) (((x)>>24)&0xFFu) #define SET_xAPIC_ID(x) (((x)<<24)) #define APIC_LVR 0x30 #define APIC_LVR_MASK 0xFF00FF #define APIC_LVR_DIRECTED_EOI (1 << 24) #define GET_APIC_VERSION(x) ((x)&0xFF) #define GET_APIC_MAXLVT(x) (((x)>>16)&0xFF) #define APIC_INTEGRATED(x) ((x)&0xF0) #define APIC_XAPIC(x) ((x) >= 0x14) #define APIC_TASKPRI 0x80 #define APIC_TPRI_MASK 0xFF #define APIC_ARBPRI 0x90 #define APIC_ARBPRI_MASK 0xFF #define APIC_PROCPRI 0xA0 #define APIC_EOI 0xB0 #define APIC_EIO_ACK 0x0 /* Write this to the EOI register */ #define APIC_RRR 0xC0 #define APIC_LDR 0xD0 #define APIC_LDR_MASK (0xFF<<24) #define GET_xAPIC_LOGICAL_ID(x) (((x)>>24)&0xFF) #define SET_xAPIC_LOGICAL_ID(x) (((x)<<24)) #define APIC_ALL_CPUS 0xFF #define APIC_DFR 0xE0 #define APIC_DFR_CLUSTER 0x0FFFFFFFul #define APIC_DFR_FLAT 0xFFFFFFFFul #define APIC_SPIV 0xF0 #define APIC_SPIV_FOCUS_DISABLED (1<<9) #define APIC_SPIV_APIC_ENABLED (1<<8) #define APIC_SPIV_DIRECTED_EOI (1<<12) #define APIC_ISR 0x100 #define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. 
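 * Worked out: 256 possible vectors / 32 bits per register = 8 registers,
 * hence the value 0x8 of APIC_ISR_NR; each of the eight sits at its own
 * 16-byte aligned offset (0x100, 0x110, ... 0x170).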
*/ #define APIC_TMR 0x180 #define APIC_IRR 0x200 #define APIC_ESR 0x280 #define APIC_ESR_SEND_CS 0x00001 #define APIC_ESR_RECV_CS 0x00002 #define APIC_ESR_SEND_ACC 0x00004 #define APIC_ESR_RECV_ACC 0x00008 #define APIC_ESR_SENDILL 0x00020 #define APIC_ESR_RECVILL 0x00040 #define APIC_ESR_ILLREGA 0x00080 #define APIC_ICR 0x300 #define APIC_DEST_SELF 0x40000 #define APIC_DEST_ALLINC 0x80000 #define APIC_DEST_ALLBUT 0xC0000 #define APIC_ICR_RR_MASK 0x30000 #define APIC_ICR_RR_INVALID 0x00000 #define APIC_ICR_RR_INPROG 0x10000 #define APIC_ICR_RR_VALID 0x20000 #define APIC_INT_LEVELTRIG 0x08000 #define APIC_INT_ASSERT 0x04000 #define APIC_ICR_BUSY 0x01000 #define APIC_DEST_LOGICAL 0x00800 #define APIC_DEST_PHYSICAL 0x00000 #define APIC_DM_FIXED 0x00000 #define APIC_DM_LOWEST 0x00100 #define APIC_DM_SMI 0x00200 #define APIC_DM_REMRD 0x00300 #define APIC_DM_NMI 0x00400 #define APIC_DM_INIT 0x00500 #define APIC_DM_STARTUP 0x00600 #define APIC_DM_EXTINT 0x00700 #define APIC_VECTOR_MASK 0x000FF #define APIC_ICR2 0x310 #define GET_xAPIC_DEST_FIELD(x) (((x)>>24)&0xFF) #define SET_xAPIC_DEST_FIELD(x) ((x)<<24) #define APIC_LVTT 0x320 #define APIC_LVTTHMR 0x330 #define APIC_LVTPC 0x340 #define APIC_LVT0 0x350 #define APIC_CMCI 0x2F0 #define APIC_LVT_TIMER_BASE_MASK (0x3<<18) #define GET_APIC_TIMER_BASE(x) (((x)>>18)&0x3) #define SET_APIC_TIMER_BASE(x) (((x)<<18)) #define APIC_TIMER_BASE_CLKIN 0x0 #define APIC_TIMER_BASE_TMBASE 0x1 #define APIC_TIMER_BASE_DIV 0x2 #define APIC_TIMER_MODE_MASK (0x3<<17) #define APIC_TIMER_MODE_ONESHOT (0x0<<17) #define APIC_TIMER_MODE_PERIODIC (0x1<<17) #define APIC_TIMER_MODE_TSC_DEADLINE (0x2<<17) #define APIC_LVT_MASKED (1<<16) #define APIC_LVT_LEVEL_TRIGGER (1<<15) #define APIC_LVT_REMOTE_IRR (1<<14) #define APIC_INPUT_POLARITY (1<<13) #define APIC_SEND_PENDING (1<<12) #define APIC_MODE_MASK 0x700 #define GET_APIC_DELIVERY_MODE(x) (((x)>>8)&0x7) #define SET_APIC_DELIVERY_MODE(x,y) (((x)&~0x700)|((y)<<8)) #define APIC_MODE_FIXED 0x0 #define APIC_MODE_NMI 0x4 #define APIC_MODE_EXTINT 0x7 #define APIC_LVT1 0x360 #define APIC_LVTERR 0x370 #define APIC_TMICT 0x380 #define APIC_TMCCT 0x390 #define APIC_TDCR 0x3E0 /* Only available in x2APIC mode */ #define APIC_SELF_IPI 0x3F0 #define APIC_TDR_DIV_TMBASE (1<<2) #define APIC_TDR_DIV_1 0xB #define APIC_TDR_DIV_2 0x0 #define APIC_TDR_DIV_4 0x1 #define APIC_TDR_DIV_8 0x2 #define APIC_TDR_DIV_16 0x3 #define APIC_TDR_DIV_32 0x8 #define APIC_TDR_DIV_64 0x9 #define APIC_TDR_DIV_128 0xA #define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) /* It's only used in x2APIC mode of an x2APIC unit. */ #define APIC_MSR_BASE 0x800 #define MAX_IO_APICS 128 /* * the local APIC register structure, memory mapped. Not terribly well * tested, but we might eventually use this one in the future - the * problem why we cannot use it right now is the P5 APIC, it has an * errata which cannot take 8-bit reads and writes, only 32-bit ones ... 
*/ #define u32 unsigned int #define lapic ((volatile struct local_apic *)APIC_BASE) #ifndef __ASSEMBLY__ struct local_apic { /*000*/ struct { u32 __reserved[4]; } __reserved_01; /*010*/ struct { u32 __reserved[4]; } __reserved_02; /*020*/ struct { /* APIC ID Register */ u32 __reserved_1 : 24, phys_apic_id : 4, __reserved_2 : 4; u32 __reserved[3]; } id; /*030*/ const struct { /* APIC Version Register */ u32 version : 8, __reserved_1 : 8, max_lvt : 8, __reserved_2 : 8; u32 __reserved[3]; } version; /*040*/ struct { u32 __reserved[4]; } __reserved_03; /*050*/ struct { u32 __reserved[4]; } __reserved_04; /*060*/ struct { u32 __reserved[4]; } __reserved_05; /*070*/ struct { u32 __reserved[4]; } __reserved_06; /*080*/ struct { /* Task Priority Register */ u32 priority : 8, __reserved_1 : 24; u32 __reserved_2[3]; } tpr; /*090*/ const struct { /* Arbitration Priority Register */ u32 priority : 8, __reserved_1 : 24; u32 __reserved_2[3]; } apr; /*0A0*/ const struct { /* Processor Priority Register */ u32 priority : 8, __reserved_1 : 24; u32 __reserved_2[3]; } ppr; /*0B0*/ struct { /* End Of Interrupt Register */ u32 eoi; u32 __reserved[3]; } eoi; /*0C0*/ struct { u32 __reserved[4]; } __reserved_07; /*0D0*/ struct { /* Logical Destination Register */ u32 __reserved_1 : 24, logical_dest : 8; u32 __reserved_2[3]; } ldr; /*0E0*/ struct { /* Destination Format Register */ u32 __reserved_1 : 28, model : 4; u32 __reserved_2[3]; } dfr; /*0F0*/ struct { /* Spurious Interrupt Vector Register */ u32 spurious_vector : 8, apic_enabled : 1, focus_cpu : 1, __reserved_2 : 22; u32 __reserved_3[3]; } svr; /*100*/ struct { /* In Service Register */ /*170*/ u32 bitfield; u32 __reserved[3]; } isr [8]; /*180*/ struct { /* Trigger Mode Register */ /*1F0*/ u32 bitfield; u32 __reserved[3]; } tmr [8]; /*200*/ struct { /* Interrupt Request Register */ /*270*/ u32 bitfield; u32 __reserved[3]; } irr [8]; /*280*/ union { /* Error Status Register */ struct { u32 send_cs_error : 1, receive_cs_error : 1, send_accept_error : 1, receive_accept_error : 1, __reserved_1 : 1, send_illegal_vector : 1, receive_illegal_vector : 1, illegal_register_address : 1, __reserved_2 : 24; u32 __reserved_3[3]; } error_bits; struct { u32 errors; u32 __reserved_3[3]; } all_errors; } esr; /*290*/ struct { u32 __reserved[4]; } __reserved_08; /*2A0*/ struct { u32 __reserved[4]; } __reserved_09; /*2B0*/ struct { u32 __reserved[4]; } __reserved_10; /*2C0*/ struct { u32 __reserved[4]; } __reserved_11; /*2D0*/ struct { u32 __reserved[4]; } __reserved_12; /*2E0*/ struct { u32 __reserved[4]; } __reserved_13; /*2F0*/ struct { u32 __reserved[4]; } __reserved_14; /*300*/ struct { /* Interrupt Command Register 1 */ u32 vector : 8, delivery_mode : 3, destination_mode : 1, delivery_status : 1, __reserved_1 : 1, level : 1, trigger : 1, __reserved_2 : 2, shorthand : 2, __reserved_3 : 12; u32 __reserved_4[3]; } icr1; /*310*/ struct { /* Interrupt Command Register 2 */ union { u32 __reserved_1 : 24, phys_dest : 4, __reserved_2 : 4; u32 __reserved_3 : 24, logical_dest : 8; } dest; u32 __reserved_4[3]; } icr2; /*320*/ struct { /* LVT - Timer */ u32 vector : 8, __reserved_1 : 4, delivery_status : 1, __reserved_2 : 3, mask : 1, timer_mode : 1, __reserved_3 : 14; u32 __reserved_4[3]; } lvt_timer; /*330*/ struct { /* LVT - Thermal Sensor */ u32 vector : 8, delivery_mode : 3, __reserved_1 : 1, delivery_status : 1, __reserved_2 : 3, mask : 1, __reserved_3 : 15; u32 __reserved_4[3]; } lvt_thermal; /*340*/ struct { /* LVT - Performance Counter */ u32 vector : 8, delivery_mode : 3, 
__reserved_1 : 1, delivery_status : 1, __reserved_2 : 3, mask : 1, __reserved_3 : 15; u32 __reserved_4[3]; } lvt_pc; /*350*/ struct { /* LVT - LINT0 */ u32 vector : 8, delivery_mode : 3, __reserved_1 : 1, delivery_status : 1, polarity : 1, remote_irr : 1, trigger : 1, mask : 1, __reserved_2 : 15; u32 __reserved_3[3]; } lvt_lint0; /*360*/ struct { /* LVT - LINT1 */ u32 vector : 8, delivery_mode : 3, __reserved_1 : 1, delivery_status : 1, polarity : 1, remote_irr : 1, trigger : 1, mask : 1, __reserved_2 : 15; u32 __reserved_3[3]; } lvt_lint1; /*370*/ struct { /* LVT - Error */ u32 vector : 8, __reserved_1 : 4, delivery_status : 1, __reserved_2 : 3, mask : 1, __reserved_3 : 15; u32 __reserved_4[3]; } lvt_error; /*380*/ struct { /* Timer Initial Count Register */ u32 initial_count; u32 __reserved_2[3]; } timer_icr; /*390*/ const struct { /* Timer Current Count Register */ u32 curr_count; u32 __reserved_2[3]; } timer_ccr; /*3A0*/ struct { u32 __reserved[4]; } __reserved_16; /*3B0*/ struct { u32 __reserved[4]; } __reserved_17; /*3C0*/ struct { u32 __reserved[4]; } __reserved_18; /*3D0*/ struct { u32 __reserved[4]; } __reserved_19; /*3E0*/ struct { /* Timer Divide Configuration Register */ u32 divisor : 4, __reserved_1 : 28; u32 __reserved_2[3]; } timer_dcr; /*3F0*/ struct { u32 __reserved[4]; } __reserved_20; } __attribute__ ((packed)); #endif /* !__ASSEMBLY__ */ #undef u32 #endif xen-4.4.0/xen/include/asm-x86/hap.h0000664000175000017500000000517512307313555015014 0ustar smbsmb/****************************************************************************** * include/asm-x86/hap.h * * hardware-assisted paging * Copyright (c) 2007 Advanced Micro Devices (Wei Huang) * * Parts of this code are Copyright (c) 2006 by XenSource Inc. * Parts of this code are Copyright (c) 2006 by Michael A Fetterman * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _XEN_HAP_H #define _XEN_HAP_H #define HAP_PRINTK(_f, _a...) \ debugtrace_printk("hap: %s(): " _f, __func__, ##_a) #define HAP_ERROR(_f, _a...) 
\ printk("hap error: %s(): " _f, __func__, ##_a) /************************************************/ /* hap domain page mapping */ /************************************************/ static inline void * hap_map_domain_page(mfn_t mfn) { return map_domain_page(mfn_x(mfn)); } static inline void hap_unmap_domain_page(void *p) { unmap_domain_page(p); } /************************************************/ /* hap domain level functions */ /************************************************/ void hap_domain_init(struct domain *d); int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc, XEN_GUEST_HANDLE_PARAM(void) u_domctl); int hap_enable(struct domain *d, u32 mode); void hap_final_teardown(struct domain *d); void hap_teardown(struct domain *d); void hap_vcpu_init(struct vcpu *v); void hap_logdirty_init(struct domain *d); int hap_track_dirty_vram(struct domain *d, unsigned long begin_pfn, unsigned long nr, XEN_GUEST_HANDLE_64(uint8) dirty_bitmap); extern const struct paging_mode *hap_paging_get_mode(struct vcpu *); #endif /* XEN_HAP_H */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/delay.h0000664000175000017500000000040512307313555015331 0ustar smbsmb#ifndef _X86_DELAY_H #define _X86_DELAY_H /* * Copyright (C) 1993 Linus Torvalds * * Delay routines calling functions in arch/i386/lib/delay.c */ extern void __udelay(unsigned long usecs); #define udelay(n) __udelay(n) #endif /* defined(_X86_DELAY_H) */ xen-4.4.0/xen/include/asm-x86/e820.h0000664000175000017500000000216112307313555014712 0ustar smbsmb#ifndef __E820_HEADER #define __E820_HEADER /* * PC BIOS standard E820 types and structure. */ #define E820_RAM 1 #define E820_RESERVED 2 #define E820_ACPI 3 #define E820_NVS 4 #define E820_UNUSABLE 5 struct e820entry { uint64_t addr; uint64_t size; uint32_t type; } __attribute__((packed)); #define E820MAX 128 struct e820map { int nr_map; struct e820entry map[E820MAX]; }; extern int e820_all_mapped(u64 start, u64 end, unsigned type); extern int reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e); extern int e820_change_range_type( struct e820map *e820, uint64_t s, uint64_t e, uint32_t orig_type, uint32_t new_type); extern int e820_add_range( struct e820map *, uint64_t s, uint64_t e, uint32_t type); extern unsigned long init_e820(const char *, struct e820entry *, int *); extern struct e820map e820; /* These symbols live in the boot trampoline. 
*/ extern struct e820entry e820map[]; extern int e820nr; extern unsigned int lowmem_kb, highmem_kb; #define e820_raw bootsym(e820map) #define e820_raw_nr bootsym(e820nr) #endif /*__E820_HEADER*/ xen-4.4.0/xen/include/asm-x86/io.h0000664000175000017500000000320112307313555014637 0ustar smbsmb#ifndef _ASM_IO_H #define _ASM_IO_H #include #include #include #define readb(x) (*(volatile uint8_t *)(x)) #define readw(x) (*(volatile uint16_t *)(x)) #define readl(x) (*(volatile uint32_t *)(x)) #define readq(x) (*(volatile uint64_t *)(x)) #define writeb(d,x) (*(volatile uint8_t *)(x) = (d)) #define writew(d,x) (*(volatile uint16_t *)(x) = (d)) #define writel(d,x) (*(volatile uint32_t *)(x) = (d)) #define writeq(d,x) (*(volatile uint64_t *)(x) = (d)) #define __OUT1(s,x) \ static inline void out##s(unsigned x value, unsigned short port) { #define __OUT2(s,s1,s2) \ __asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1" #define __OUT(s,s1,x) \ __OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \ __OUT1(s##_p,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port));} #define __IN1(s) \ static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v; #define __IN2(s,s1,s2) \ __asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0" #define __IN(s,s1,i...) \ __IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ __IN1(s##_p) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } #define RETURN_TYPE unsigned char __IN(b,"") #undef RETURN_TYPE #define RETURN_TYPE unsigned short __IN(w,"") #undef RETURN_TYPE #define RETURN_TYPE unsigned int __IN(l,"") #undef RETURN_TYPE __OUT(b,"b",char) __OUT(w,"w",short) __OUT(l,,int) extern void (*pv_post_outb_hook)(unsigned int port, u8 value); /* Function pointer used to handle platform specific I/O port emulation. */ extern void (*ioemul_handle_quirk)( u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs); #endif xen-4.4.0/xen/include/asm-x86/msr-index.h0000664000175000017500000004324212307313555016147 0ustar smbsmb#ifndef __ASM_MSR_INDEX_H #define __ASM_MSR_INDEX_H /* CPU model specific register (MSR) numbers */ /* x86-64 specific MSRs */ #define MSR_EFER 0xc0000080 /* extended feature register */ #define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */ #define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */ #define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */ #define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ #define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ #define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ #define MSR_SHADOW_GS_BASE 0xc0000102 /* SwapGS GS shadow */ #define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */ /* EFER bits: */ #define _EFER_SCE 0 /* SYSCALL/SYSRET */ #define _EFER_LME 8 /* Long mode enable */ #define _EFER_LMA 10 /* Long mode active (read-only) */ #define _EFER_NX 11 /* No execute enable */ #define _EFER_SVME 12 /* AMD: SVM enable */ #define _EFER_LMSLE 13 /* AMD: Long-mode segment limit enable */ #define _EFER_FFXSE 14 /* AMD: Fast FXSAVE/FXRSTOR enable */ #define EFER_SCE (1<<_EFER_SCE) #define EFER_LME (1<<_EFER_LME) #define EFER_LMA (1<<_EFER_LMA) #define EFER_NX (1<<_EFER_NX) #define EFER_SVME (1<<_EFER_SVME) #define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSE (1<<_EFER_FFXSE) /* Intel MSRs. 
Some also available on other CPUs */ #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_A_PERFCTR0 0x000004c1 #define MSR_FSB_FREQ 0x000000cd #define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) #define NHM_C1_AUTO_DEMOTE (1UL << 26) #define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25) #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 #define MSR_IA32_MCG_CAP 0x00000179 #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 #define MSR_MTRRfix64K_00000 0x00000250 #define MSR_MTRRfix16K_80000 0x00000258 #define MSR_MTRRfix16K_A0000 0x00000259 #define MSR_MTRRfix4K_C0000 0x00000268 #define MSR_MTRRfix4K_C8000 0x00000269 #define MSR_MTRRfix4K_D0000 0x0000026a #define MSR_MTRRfix4K_D8000 0x0000026b #define MSR_MTRRfix4K_E0000 0x0000026c #define MSR_MTRRfix4K_E8000 0x0000026d #define MSR_MTRRfix4K_F0000 0x0000026e #define MSR_MTRRfix4K_F8000 0x0000026f #define MSR_MTRRdefType 0x000002ff #define MSR_IA32_DEBUGCTLMSR 0x000001d9 #define IA32_DEBUGCTLMSR_LBR (1<<0) /* Last Branch Record */ #define IA32_DEBUGCTLMSR_BTF (1<<1) /* Single Step on Branches */ #define IA32_DEBUGCTLMSR_TR (1<<6) /* Trace Message Enable */ #define IA32_DEBUGCTLMSR_BTS (1<<7) /* Branch Trace Store */ #define IA32_DEBUGCTLMSR_BTINT (1<<8) /* Branch Trace Interrupt */ #define IA32_DEBUGCTLMSR_BTS_OFF_OS (1<<9) /* BTS off if CPL 0 */ #define IA32_DEBUGCTLMSR_BTS_OFF_USR (1<<10) /* BTS off if CPL > 0 */ #define MSR_IA32_LASTBRANCHFROMIP 0x000001db #define MSR_IA32_LASTBRANCHTOIP 0x000001dc #define MSR_IA32_LASTINTFROMIP 0x000001dd #define MSR_IA32_LASTINTTOIP 0x000001de #define MSR_IA32_POWER_CTL 0x000001fc #define MSR_IA32_MTRR_PHYSBASE0 0x00000200 #define MSR_IA32_MTRR_PHYSMASK0 0x00000201 #define MSR_IA32_MTRR_PHYSBASE1 0x00000202 #define MSR_IA32_MTRR_PHYSMASK1 0x00000203 #define MSR_IA32_MTRR_PHYSBASE2 0x00000204 #define MSR_IA32_MTRR_PHYSMASK2 0x00000205 #define MSR_IA32_MTRR_PHYSBASE3 0x00000206 #define MSR_IA32_MTRR_PHYSMASK3 0x00000207 #define MSR_IA32_MTRR_PHYSBASE4 0x00000208 #define MSR_IA32_MTRR_PHYSMASK4 0x00000209 #define MSR_IA32_MTRR_PHYSBASE5 0x0000020a #define MSR_IA32_MTRR_PHYSMASK5 0x0000020b #define MSR_IA32_MTRR_PHYSBASE6 0x0000020c #define MSR_IA32_MTRR_PHYSMASK6 0x0000020d #define MSR_IA32_MTRR_PHYSBASE7 0x0000020e #define MSR_IA32_MTRR_PHYSMASK7 0x0000020f #define MSR_IA32_CR_PAT 0x00000277 #define MSR_IA32_CR_PAT_RESET 0x0007040600070406ULL #define MSR_IA32_MC0_CTL 0x00000400 #define MSR_IA32_MC0_STATUS 0x00000401 #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 #define MSR_IA32_MC0_CTL2 0x00000280 #define CMCI_EN (1UL<<30) #define CMCI_THRESHOLD_MASK 0x7FFF #define MSR_AMD64_MC0_MASK 0xc0010044 #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) #define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) #define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) #define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) #define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 #define MSR_P6_EVNTSEL1 0x00000187 /* MSRs & bits used for VMX enabling */ #define MSR_IA32_VMX_BASIC 0x480 #define 
MSR_IA32_VMX_PINBASED_CTLS 0x481 #define MSR_IA32_VMX_PROCBASED_CTLS 0x482 #define MSR_IA32_VMX_EXIT_CTLS 0x483 #define MSR_IA32_VMX_ENTRY_CTLS 0x484 #define MSR_IA32_VMX_MISC 0x485 #define MSR_IA32_VMX_CR0_FIXED0 0x486 #define MSR_IA32_VMX_CR0_FIXED1 0x487 #define MSR_IA32_VMX_CR4_FIXED0 0x488 #define MSR_IA32_VMX_CR4_FIXED1 0x489 #define MSR_IA32_VMX_VMCS_ENUM 0x48a #define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b #define MSR_IA32_VMX_EPT_VPID_CAP 0x48c #define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x48d #define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x48e #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x48f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x490 #define IA32_FEATURE_CONTROL_MSR 0x3a #define IA32_FEATURE_CONTROL_MSR_LOCK 0x0001 #define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_INSIDE_SMX 0x0002 #define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON_OUTSIDE_SMX 0x0004 #define IA32_FEATURE_CONTROL_MSR_SENTER_PARAM_CTL 0x7f00 #define IA32_FEATURE_CONTROL_MSR_ENABLE_SENTER 0x8000 /* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */ #define MSR_K7_EVNTSEL0 0xc0010000 #define MSR_K7_PERFCTR0 0xc0010004 #define MSR_K7_EVNTSEL1 0xc0010001 #define MSR_K7_PERFCTR1 0xc0010005 #define MSR_K7_EVNTSEL2 0xc0010002 #define MSR_K7_PERFCTR2 0xc0010006 #define MSR_K7_EVNTSEL3 0xc0010003 #define MSR_K7_PERFCTR3 0xc0010007 #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K7_CLK_CTL 0xc001001b #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 #define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */ #define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */ #define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */ #define MSR_K7_HWCR 0xc0010015 #define MSR_K8_HWCR 0xc0010015 #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 #define MSR_K8_PSTATE_LIMIT 0xc0010061 #define MSR_K8_PSTATE_CTRL 0xc0010062 #define MSR_K8_PSTATE_STATUS 0xc0010063 #define MSR_K8_PSTATE0 0xc0010064 #define MSR_K8_PSTATE1 0xc0010065 #define MSR_K8_PSTATE2 0xc0010066 #define MSR_K8_PSTATE3 0xc0010067 #define MSR_K8_PSTATE4 0xc0010068 #define MSR_K8_PSTATE5 0xc0010069 #define MSR_K8_PSTATE6 0xc001006A #define MSR_K8_PSTATE7 0xc001006B #define MSR_K8_ENABLE_C1E 0xc0010055 #define MSR_K8_VM_CR 0xc0010114 #define MSR_K8_VM_HSAVE_PA 0xc0010117 #define MSR_AMD_FAM15H_EVNTSEL0 0xc0010200 #define MSR_AMD_FAM15H_PERFCTR0 0xc0010201 #define MSR_AMD_FAM15H_EVNTSEL1 0xc0010202 #define MSR_AMD_FAM15H_PERFCTR1 0xc0010203 #define MSR_AMD_FAM15H_EVNTSEL2 0xc0010204 #define MSR_AMD_FAM15H_PERFCTR2 0xc0010205 #define MSR_AMD_FAM15H_EVNTSEL3 0xc0010206 #define MSR_AMD_FAM15H_PERFCTR3 0xc0010207 #define MSR_AMD_FAM15H_EVNTSEL4 0xc0010208 #define MSR_AMD_FAM15H_PERFCTR4 0xc0010209 #define MSR_AMD_FAM15H_EVNTSEL5 0xc001020a #define MSR_AMD_FAM15H_PERFCTR5 0xc001020b #define MSR_K8_FEATURE_MASK 0xc0011004 #define MSR_K8_EXT_FEATURE_MASK 0xc0011005 /* MSR_K8_VM_CR bits: */ #define _K8_VMCR_SVME_DISABLE 4 #define K8_VMCR_SVME_DISABLE (1 << _K8_VMCR_SVME_DISABLE) /* AMD64 MSRs */ #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_IC_CFG 0xc0011021 #define MSR_AMD64_DC_CFG 0xc0011022 #define AMD64_NB_CFG_CF8_EXT_ENABLE_BIT 46 /* AMD Family10h machine check MSRs */ #define MSR_F10_MC4_MISC1 0xc0000408 #define MSR_F10_MC4_MISC2 0xc0000409 #define MSR_F10_MC4_MISC3 0xc000040A /* AMD Family10h Bus Unit MSRs */ #define MSR_F10_BU_CFG 0xc0011023 #define MSR_F10_BU_CFG2 0xc001102a /* Other AMD Fam10h MSRs */ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 
#define FAM10H_MMIO_CONF_ENABLE (1<<0) #define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf #define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 /* AMD Microcode MSRs */ #define MSR_AMD_PATCHLEVEL 0x0000008b #define MSR_AMD_PATCHLOADER 0xc0010020 /* AMD TSC RATE MSR */ #define MSR_AMD64_TSC_RATIO 0xc0000104 /* AMD Lightweight Profiling MSRs */ #define MSR_AMD64_LWP_CFG 0xc0000105 #define MSR_AMD64_LWP_CBADDR 0xc0000106 /* AMD OS Visible Workaround MSRs */ #define MSR_AMD_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD_OSVW_STATUS 0xc0010141 /* K6 MSRs */ #define MSR_K6_EFER 0xc0000080 #define MSR_K6_STAR 0xc0000081 #define MSR_K6_WHCR 0xc0000082 #define MSR_K6_UWCCR 0xc0000085 #define MSR_K6_EPMR 0xc0000086 #define MSR_K6_PSOR 0xc0000087 #define MSR_K6_PFIR 0xc0000088 /* Centaur-Hauls/IDT defined MSRs. */ #define MSR_IDT_FCR1 0x00000107 #define MSR_IDT_FCR2 0x00000108 #define MSR_IDT_FCR3 0x00000109 #define MSR_IDT_FCR4 0x0000010a #define MSR_IDT_MCR0 0x00000110 #define MSR_IDT_MCR1 0x00000111 #define MSR_IDT_MCR2 0x00000112 #define MSR_IDT_MCR3 0x00000113 #define MSR_IDT_MCR4 0x00000114 #define MSR_IDT_MCR5 0x00000115 #define MSR_IDT_MCR6 0x00000116 #define MSR_IDT_MCR7 0x00000117 #define MSR_IDT_MCR_CTRL 0x00000120 /* VIA Cyrix defined MSRs*/ #define MSR_VIA_FCR 0x00001107 #define MSR_VIA_LONGHAUL 0x0000110a #define MSR_VIA_RNG 0x0000110b #define MSR_VIA_BCR2 0x00001147 /* Transmeta defined MSRs */ #define MSR_TMTA_LONGRUN_CTRL 0x80868010 #define MSR_TMTA_LONGRUN_FLAGS 0x80868011 #define MSR_TMTA_LRTI_READOUT 0x80868018 #define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a /* Intel defined MSRs. */ #define MSR_IA32_P5_MC_ADDR 0x00000000 #define MSR_IA32_P5_MC_TYPE 0x00000001 #define MSR_IA32_TSC 0x00000010 #define MSR_IA32_PLATFORM_ID 0x00000017 #define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_IA32_EBC_FREQUENCY_ID 0x0000002c #define MSR_IA32_TSC_ADJUST 0x0000003b #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) #define MSR_IA32_APICBASE_EXTD (1<<10) #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) #define MSR_IA32_APICBASE_MSR 0x800 #define MSR_IA32_APICTPR_MSR 0x808 #define MSR_IA32_APICPPR_MSR 0x80a #define MSR_IA32_APICEOI_MSR 0x80b #define MSR_IA32_APICTMICT_MSR 0x838 #define MSR_IA32_APICTMCCT_MSR 0x839 #define MSR_IA32_APICSELF_MSR 0x83f #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 #define MSR_IA32_THERM_CONTROL 0x0000019a #define MSR_IA32_THERM_INTERRUPT 0x0000019b #define MSR_IA32_THERM_STATUS 0x0000019c #define MSR_IA32_MISC_ENABLE 0x000001a0 #define MSR_IA32_MISC_ENABLE_PERF_AVAIL (1<<7) #define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1<<11) #define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1<<12) #define MSR_IA32_MISC_ENABLE_MONITOR_ENABLE (1<<18) #define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1<<22) #define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1<<23) #define MSR_IA32_TSC_DEADLINE 0x000006E0 #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 /* Intel Model 6 */ #define MSR_P6_EVNTSEL0 0x00000186 #define MSR_P6_EVNTSEL1 0x00000187 /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 #define MSR_IA32_MCG_EBX 0x00000181 #define MSR_IA32_MCG_ECX 0x00000182 #define MSR_IA32_MCG_EDX 0x00000183 #define MSR_IA32_MCG_ESI 0x00000184 #define MSR_IA32_MCG_EDI 0x00000185 #define MSR_IA32_MCG_EBP 0x00000186 #define 
MSR_IA32_MCG_ESP 0x00000187 #define MSR_IA32_MCG_EFLAGS 0x00000188 #define MSR_IA32_MCG_EIP 0x00000189 #define MSR_IA32_MCG_MISC 0x0000018a #define MSR_IA32_MCG_R8 0x00000190 #define MSR_IA32_MCG_R9 0x00000191 #define MSR_IA32_MCG_R10 0x00000192 #define MSR_IA32_MCG_R11 0x00000193 #define MSR_IA32_MCG_R12 0x00000194 #define MSR_IA32_MCG_R13 0x00000195 #define MSR_IA32_MCG_R14 0x00000196 #define MSR_IA32_MCG_R15 0x00000197 /* Pentium IV performance counter MSRs */ #define MSR_P4_BPU_PERFCTR0 0x00000300 #define MSR_P4_BPU_PERFCTR1 0x00000301 #define MSR_P4_BPU_PERFCTR2 0x00000302 #define MSR_P4_BPU_PERFCTR3 0x00000303 #define MSR_P4_MS_PERFCTR0 0x00000304 #define MSR_P4_MS_PERFCTR1 0x00000305 #define MSR_P4_MS_PERFCTR2 0x00000306 #define MSR_P4_MS_PERFCTR3 0x00000307 #define MSR_P4_FLAME_PERFCTR0 0x00000308 #define MSR_P4_FLAME_PERFCTR1 0x00000309 #define MSR_P4_FLAME_PERFCTR2 0x0000030a #define MSR_P4_FLAME_PERFCTR3 0x0000030b #define MSR_P4_IQ_PERFCTR0 0x0000030c #define MSR_P4_IQ_PERFCTR1 0x0000030d #define MSR_P4_IQ_PERFCTR2 0x0000030e #define MSR_P4_IQ_PERFCTR3 0x0000030f #define MSR_P4_IQ_PERFCTR4 0x00000310 #define MSR_P4_IQ_PERFCTR5 0x00000311 #define MSR_P4_BPU_CCCR0 0x00000360 #define MSR_P4_BPU_CCCR1 0x00000361 #define MSR_P4_BPU_CCCR2 0x00000362 #define MSR_P4_BPU_CCCR3 0x00000363 #define MSR_P4_MS_CCCR0 0x00000364 #define MSR_P4_MS_CCCR1 0x00000365 #define MSR_P4_MS_CCCR2 0x00000366 #define MSR_P4_MS_CCCR3 0x00000367 #define MSR_P4_FLAME_CCCR0 0x00000368 #define MSR_P4_FLAME_CCCR1 0x00000369 #define MSR_P4_FLAME_CCCR2 0x0000036a #define MSR_P4_FLAME_CCCR3 0x0000036b #define MSR_P4_IQ_CCCR0 0x0000036c #define MSR_P4_IQ_CCCR1 0x0000036d #define MSR_P4_IQ_CCCR2 0x0000036e #define MSR_P4_IQ_CCCR3 0x0000036f #define MSR_P4_IQ_CCCR4 0x00000370 #define MSR_P4_IQ_CCCR5 0x00000371 #define MSR_P4_ALF_ESCR0 0x000003ca #define MSR_P4_ALF_ESCR1 0x000003cb #define MSR_P4_BPU_ESCR0 0x000003b2 #define MSR_P4_BPU_ESCR1 0x000003b3 #define MSR_P4_BSU_ESCR0 0x000003a0 #define MSR_P4_BSU_ESCR1 0x000003a1 #define MSR_P4_CRU_ESCR0 0x000003b8 #define MSR_P4_CRU_ESCR1 0x000003b9 #define MSR_P4_CRU_ESCR2 0x000003cc #define MSR_P4_CRU_ESCR3 0x000003cd #define MSR_P4_CRU_ESCR4 0x000003e0 #define MSR_P4_CRU_ESCR5 0x000003e1 #define MSR_P4_DAC_ESCR0 0x000003a8 #define MSR_P4_DAC_ESCR1 0x000003a9 #define MSR_P4_FIRM_ESCR0 0x000003a4 #define MSR_P4_FIRM_ESCR1 0x000003a5 #define MSR_P4_FLAME_ESCR0 0x000003a6 #define MSR_P4_FLAME_ESCR1 0x000003a7 #define MSR_P4_FSB_ESCR0 0x000003a2 #define MSR_P4_FSB_ESCR1 0x000003a3 #define MSR_P4_IQ_ESCR0 0x000003ba #define MSR_P4_IQ_ESCR1 0x000003bb #define MSR_P4_IS_ESCR0 0x000003b4 #define MSR_P4_IS_ESCR1 0x000003b5 #define MSR_P4_ITLB_ESCR0 0x000003b6 #define MSR_P4_ITLB_ESCR1 0x000003b7 #define MSR_P4_IX_ESCR0 0x000003c8 #define MSR_P4_IX_ESCR1 0x000003c9 #define MSR_P4_MOB_ESCR0 0x000003aa #define MSR_P4_MOB_ESCR1 0x000003ab #define MSR_P4_MS_ESCR0 0x000003c0 #define MSR_P4_MS_ESCR1 0x000003c1 #define MSR_P4_PMH_ESCR0 0x000003ac #define MSR_P4_PMH_ESCR1 0x000003ad #define MSR_P4_RAT_ESCR0 0x000003bc #define MSR_P4_RAT_ESCR1 0x000003bd #define MSR_P4_SAAT_ESCR0 0x000003ae #define MSR_P4_SAAT_ESCR1 0x000003af #define MSR_P4_SSU_ESCR0 0x000003be #define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */ #define MSR_P4_TBPU_ESCR0 0x000003c2 #define MSR_P4_TBPU_ESCR1 0x000003c3 #define MSR_P4_TC_ESCR0 0x000003c4 #define MSR_P4_TC_ESCR1 0x000003c5 #define MSR_P4_U2L_ESCR0 0x000003b0 #define MSR_P4_U2L_ESCR1 0x000003b1 /* Netburst (P4) last-branch recording */ #define 
MSR_P4_LER_FROM_LIP 0x000001d7 #define MSR_P4_LER_TO_LIP 0x000001d8 #define MSR_P4_LASTBRANCH_TOS 0x000001da #define MSR_P4_LASTBRANCH_0 0x000001db #define NUM_MSR_P4_LASTBRANCH 4 #define MSR_P4_LASTBRANCH_0_FROM_LIP 0x00000680 #define MSR_P4_LASTBRANCH_0_TO_LIP 0x000006c0 #define NUM_MSR_P4_LASTBRANCH_FROM_TO 16 /* Pentium M (and Core) last-branch recording */ #define MSR_PM_LASTBRANCH_TOS 0x000001c9 #define MSR_PM_LASTBRANCH_0 0x00000040 #define NUM_MSR_PM_LASTBRANCH 8 /* Core 2 and Atom last-branch recording */ #define MSR_C2_LASTBRANCH_TOS 0x000001c9 #define MSR_C2_LASTBRANCH_0_FROM_IP 0x00000040 #define MSR_C2_LASTBRANCH_0_TO_IP 0x00000060 #define NUM_MSR_C2_LASTBRANCH_FROM_TO 4 #define NUM_MSR_ATOM_LASTBRANCH_FROM_TO 8 /* Intel Core-based CPU performance counters */ #define MSR_CORE_PERF_FIXED_CTR0 0x00000309 #define MSR_CORE_PERF_FIXED_CTR1 0x0000030a #define MSR_CORE_PERF_FIXED_CTR2 0x0000030b #define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d #define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e #define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 /* Intel cpuid spoofing MSRs */ #define MSR_INTEL_CPUID_FEATURE_MASK 0x00000478 #define MSR_INTEL_CPUID1_FEATURE_MASK 0x00000130 #define MSR_INTEL_CPUID80000001_FEATURE_MASK 0x00000131 #define MSR_INTEL_CPUID1_FEATURE_MASK_V2 0x00000132 #define MSR_INTEL_CPUID80000001_FEATURE_MASK_V2 0x00000133 #define MSR_INTEL_CPUIDD_01_FEATURE_MASK 0x00000134 /* Intel cpuid faulting MSRs */ #define MSR_INTEL_PLATFORM_INFO 0x000000ce #define MSR_INTEL_MISC_FEATURES_ENABLES 0x00000140 /* Geode defined MSRs */ #define MSR_GEODE_BUSCONT_CONF0 0x00001900 #endif /* __ASM_MSR_INDEX_H */ xen-4.4.0/xen/include/asm-x86/bug.h0000664000175000017500000000626712307313555015024 0ustar smbsmb#ifndef __X86_BUG_H__ #define __X86_BUG_H__ #define BUG_DISP_WIDTH 24 #define BUG_LINE_LO_WIDTH (31 - BUG_DISP_WIDTH) #define BUG_LINE_HI_WIDTH (31 - BUG_DISP_WIDTH) struct bug_frame { signed int loc_disp:BUG_DISP_WIDTH; unsigned int line_hi:BUG_LINE_HI_WIDTH; signed int ptr_disp:BUG_DISP_WIDTH; unsigned int line_lo:BUG_LINE_LO_WIDTH; signed int msg_disp[]; }; #define bug_loc(b) ((const void *)(b) + (b)->loc_disp) #define bug_ptr(b) ((const void *)(b) + (b)->ptr_disp) #define bug_line(b) (((((b)->line_hi + ((b)->loc_disp < 0)) & \ ((1 << BUG_LINE_HI_WIDTH) - 1)) << \ BUG_LINE_LO_WIDTH) + \ (((b)->line_lo + ((b)->ptr_disp < 0)) & \ ((1 << BUG_LINE_LO_WIDTH) - 1))) #define bug_msg(b) ((const char *)(b) + (b)->msg_disp[1]) #define BUGFRAME_run_fn 0 #define BUGFRAME_warn 1 #define BUGFRAME_bug 2 #define BUGFRAME_assert 3 #define BUG_FRAME(type, line, ptr, second_frame, msg) do { \ BUILD_BUG_ON((line) >> (BUG_LINE_LO_WIDTH + BUG_LINE_HI_WIDTH)); \ asm volatile ( ".Lbug%=: ud2\n" \ ".pushsection .bug_frames.%c0, \"a\", @progbits\n" \ ".p2align 2\n" \ ".Lfrm%=:\n" \ ".long (.Lbug%= - .Lfrm%=) + %c4\n" \ ".long (%c1 - .Lfrm%=) + %c3\n" \ ".if " #second_frame "\n" \ ".long 0, %c2 - .Lfrm%=\n" \ ".endif\n" \ ".popsection" \ : \ : "i" (type), "i" (ptr), "i" (msg), \ "i" ((line & ((1 << BUG_LINE_LO_WIDTH) - 1)) \ << BUG_DISP_WIDTH), \ "i" (((line) >> BUG_LINE_LO_WIDTH) << BUG_DISP_WIDTH)); \ } while (0) #define WARN() BUG_FRAME(BUGFRAME_warn, __LINE__, __FILE__, 0, NULL) #define BUG() do { \ BUG_FRAME(BUGFRAME_bug, __LINE__, __FILE__, 0, NULL); \ unreachable(); \ } while (0) #define run_in_exception_handler(fn) BUG_FRAME(BUGFRAME_run_fn, 0, fn, 0, NULL) #define assert_failed(msg) do { \ BUG_FRAME(BUGFRAME_assert, __LINE__, __FILE__, 1, msg); \ unreachable(); \ 
} while (0) extern const struct bug_frame __start_bug_frames[], __stop_bug_frames_0[], __stop_bug_frames_1[], __stop_bug_frames_2[], __stop_bug_frames_3[]; #endif /* __X86_BUG_H__ */ xen-4.4.0/xen/include/asm-x86/tboot.h0000664000175000017500000001235512307313555015371 0ustar smbsmb/* * tboot.h: shared data structure with MLE and kernel and functions * used by kernel for runtime support * * Copyright (c) 2006-2007, Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * Neither the name of the Intel Corporation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. * */ #ifndef __TBOOT_H__ #define __TBOOT_H__ #include #ifndef __packed #define __packed __attribute__ ((packed)) #endif typedef struct __packed { uint32_t data1; uint16_t data2; uint16_t data3; uint16_t data4; uint8_t data5[6]; } uuid_t; /* used to communicate between tboot and the launched kernel (i.e. 
Xen) */ #define TB_KEY_SIZE 64 /* 512 bits */ #define MAX_TB_MAC_REGIONS 32 typedef struct __packed { uint64_t start; /* must be 64 byte -aligned */ uint32_t size; /* must be 64 byte -granular */ } tboot_mac_region_t; /* GAS - Generic Address Structure (ACPI 2.0+) */ typedef struct __packed { uint8_t space_id; uint8_t bit_width; uint8_t bit_offset; uint8_t access_width; uint64_t address; } tboot_acpi_generic_address_t; typedef struct __packed { tboot_acpi_generic_address_t pm1a_cnt_blk; tboot_acpi_generic_address_t pm1b_cnt_blk; tboot_acpi_generic_address_t pm1a_evt_blk; tboot_acpi_generic_address_t pm1b_evt_blk; uint16_t pm1a_cnt_val; uint16_t pm1b_cnt_val; uint64_t wakeup_vector; uint32_t vector_width; uint64_t kernel_s3_resume_vector; } tboot_acpi_sleep_info_t; typedef struct __packed { /* version 3+ fields: */ uuid_t uuid; /* {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} */ uint32_t version; /* Version number; currently supports 0.6 */ uint32_t log_addr; /* physical addr of tb_log_t log */ uint32_t shutdown_entry; /* entry point for tboot shutdown */ uint32_t shutdown_type; /* type of shutdown (TB_SHUTDOWN_*) */ tboot_acpi_sleep_info_t acpi_sinfo; /* where kernel put acpi sleep info in Sx */ uint32_t tboot_base; /* starting addr for tboot */ uint32_t tboot_size; /* size of tboot */ uint8_t num_mac_regions; /* number mem regions to MAC on S3 */ /* contig regions memory to MAC on S3 */ tboot_mac_region_t mac_regions[MAX_TB_MAC_REGIONS]; /* version 4+ fields: */ /* populated by tboot; will be encrypted */ uint8_t s3_key[TB_KEY_SIZE]; /* version 5+ fields: */ uint8_t reserved_align[3]; /* used to 4byte-align num_in_wfs */ uint32_t num_in_wfs; /* number of processors in wait-for-SIPI */ /* version 6+ fields: */ uint32_t flags; uint64_t ap_wake_addr; /* phys addr of kernel/VMM SIPI vector */ uint32_t ap_wake_trigger; /* kernel/VMM writes APIC ID to wake AP */ } tboot_shared_t; #define TB_SHUTDOWN_REBOOT 0 #define TB_SHUTDOWN_S5 1 #define TB_SHUTDOWN_S4 2 #define TB_SHUTDOWN_S3 3 #define TB_SHUTDOWN_HALT 4 #define TB_FLAG_AP_WAKE_SUPPORT 0x00000001 /* kernel/VMM use INIT-SIPI-SIPI if clear, ap_wake_* if set */ /* {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} */ #define TBOOT_SHARED_UUID { 0x663c8dff, 0xe8b3, 0x4b82, 0xaabf, \ { 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8 } }; extern tboot_shared_t *g_tboot_shared; void tboot_probe(void); void tboot_shutdown(uint32_t shutdown_type); int tboot_in_measured_env(void); int tboot_protect_mem_regions(void); int tboot_parse_dmar_table(acpi_table_handler dmar_handler); int tboot_s3_resume(void); void tboot_s3_error(int error); int tboot_wake_ap(int apicid, unsigned long sipi_vec); #endif /* __TBOOT_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/io_apic.h0000664000175000017500000001241312307313555015640 0ustar smbsmb#ifndef __ASM_IO_APIC_H #define __ASM_IO_APIC_H #include #include #include #include #include #include /* * Intel IO-APIC support for SMP and UP systems. 
* * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar */ #ifdef CONFIG_X86_IO_APIC #define IO_APIC_BASE(idx) \ ((volatile int *)(__fix_to_virt(FIX_IO_APIC_BASE_0 + idx) \ + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK))) #define IO_APIC_ID(idx) (mp_ioapics[idx].mpc_apicid) /* I/O Unit Redirection Table */ #define IO_APIC_REDIR_VECTOR_MASK 0x000FF #define IO_APIC_REDIR_DEST_LOGICAL 0x00800 #define IO_APIC_REDIR_DEST_PHYSICAL 0x00000 #define IO_APIC_REDIR_SEND_PENDING (1 << 12) #define IO_APIC_REDIR_REMOTE_IRR (1 << 14) #define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) #define IO_APIC_REDIR_MASKED (1 << 16) /* * The structure of the IO-APIC: */ union IO_APIC_reg_00 { u32 raw; struct { u32 __reserved_2 : 14, LTS : 1, delivery_type : 1, __reserved_1 : 8, ID : 8; } __attribute__ ((packed)) bits; }; union IO_APIC_reg_01 { u32 raw; struct { u32 version : 8, __reserved_2 : 7, PRQ : 1, entries : 8, __reserved_1 : 8; } __attribute__ ((packed)) bits; }; union IO_APIC_reg_02 { u32 raw; struct { u32 __reserved_2 : 24, arbitration : 4, __reserved_1 : 4; } __attribute__ ((packed)) bits; }; union IO_APIC_reg_03 { u32 raw; struct { u32 boot_DT : 1, __reserved_1 : 31; } __attribute__ ((packed)) bits; }; /* * # of IO-APICs and # of IRQ routing registers */ extern int nr_ioapics; extern int nr_ioapic_entries[MAX_IO_APICS]; enum ioapic_irq_destination_types { dest_Fixed = 0, dest_LowestPrio = 1, dest_SMI = 2, dest__reserved_1 = 3, dest_NMI = 4, dest_INIT = 5, dest__reserved_2 = 6, dest_ExtINT = 7 }; struct IO_APIC_route_entry { __u32 vector : 8, delivery_mode : 3, /* 000: FIXED * 001: lowest prio * 111: ExtINT */ dest_mode : 1, /* 0: physical, 1: logical */ delivery_status : 1, polarity : 1, irr : 1, trigger : 1, /* 0: edge, 1: level */ mask : 1, /* 0: enabled, 1: disabled */ __reserved_2 : 15; union { struct { __u32 __reserved_1 : 24, physical_dest : 4, __reserved_2 : 4; } physical; struct { __u32 __reserved_1 : 24, logical_dest : 8; } logical; /* used when Interrupt Remapping with EIM is enabled */ __u32 dest32; } dest; } __attribute__ ((packed)); /* * MP-BIOS irq configuration table structures: */ /* I/O APIC entries */ extern struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; /* Only need to remap ioapic RTE (reg: 10~3Fh) */ #define ioapic_reg_remapped(reg) (iommu_intremap && ((reg) >= 0x10)) static inline unsigned int __io_apic_read(unsigned int apic, unsigned int reg) { *IO_APIC_BASE(apic) = reg; return *(IO_APIC_BASE(apic)+4); } static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) { if (ioapic_reg_remapped(reg)) return iommu_read_apic_from_ire(apic, reg); return __io_apic_read(apic, reg); } static inline void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { *IO_APIC_BASE(apic) = reg; *(IO_APIC_BASE(apic)+4) = value; } static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { if (ioapic_reg_remapped(reg)) return iommu_update_ire_from_apic(apic, reg, value); __io_apic_write(apic, reg, value); } /* * Re-write a value: to be used for read-modify-write * cycles where the read already set up the index register. 
*/ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { if (ioapic_reg_remapped(reg)) return iommu_update_ire_from_apic(apic, reg, value); *(IO_APIC_BASE(apic)+4) = value; } /* 1 if "noapic" boot option passed */ extern bool_t skip_ioapic_setup; extern bool_t ioapic_ack_new; extern bool_t ioapic_ack_forced; #ifdef CONFIG_ACPI_BOOT extern int io_apic_get_unique_id (int ioapic, int apic_id); extern int io_apic_get_version (int ioapic); extern int io_apic_get_redir_entries (int ioapic); extern int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low); #endif /*CONFIG_ACPI_BOOT*/ extern void init_ioapic_mappings(void); extern void ioapic_suspend(void); extern void ioapic_resume(void); extern void dump_ioapic_irq_info(void); extern struct IO_APIC_route_entry __ioapic_read_entry( unsigned int apic, unsigned int pin, bool_t raw); void __ioapic_write_entry( unsigned int apic, unsigned int pin, bool_t raw, struct IO_APIC_route_entry); extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); #else /* !CONFIG_X86_IO_APIC */ static inline void init_ioapic_mappings(void) {} static inline void ioapic_suspend(void) {} static inline void ioapic_resume(void) {} #endif unsigned highest_gsi(void); int ioapic_guest_read( unsigned long physbase, unsigned int reg, u32 *pval); int ioapic_guest_write(unsigned long physbase, unsigned int reg, u32 pval); #endif xen-4.4.0/xen/include/asm-x86/fixmap.h0000664000175000017500000000542712307313555015530 0ustar smbsmb/* * fixmap.h: compile-time virtual memory allocation * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive * for more details. * * Copyright (C) 1998 Ingo Molnar * Modifications for Xen are copyright (c) 2002-2004, K A Fraser */ #ifndef _ASM_FIXMAP_H #define _ASM_FIXMAP_H #include #include #define FIXADDR_TOP (VMAP_VIRT_END - PAGE_SIZE) #ifndef __ASSEMBLY__ #include #include #include #include #include #include #include #include /* * Here we define all the compile-time 'special' virtual * addresses. The point is to have a constant address at * compile time, but to set the physical address only * in the boot process. We allocate these special addresses * from the end of virtual memory backwards. */ enum fixed_addresses { /* Index 0 is reserved since fix_to_virt(0) == FIXADDR_TOP. */ FIX_RESERVED, /* * Indexes using the page tables set up before entering __start_xen() * must be among the first (L1_PAGETABLE_ENTRIES - 1) entries. * These are generally those needed by the various console drivers. */ FIX_COM_BEGIN, FIX_COM_END, FIX_EHCI_DBGP, /* Everything else should go further down. 
*/ FIX_VGC_END, FIX_VGC_BEGIN = FIX_VGC_END + PFN_UP(sizeof(struct vcpu_guest_context)) * NR_CPUS - 1, FIX_APIC_BASE, FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, FIX_HPET_BASE, FIX_TBOOT_SHARED_BASE, FIX_MSIX_IO_RESERV_BASE, FIX_MSIX_IO_RESERV_END = FIX_MSIX_IO_RESERV_BASE + FIX_MSIX_MAX_PAGES -1, FIX_TBOOT_MAP_ADDRESS, FIX_APEI_RANGE_BASE, FIX_APEI_RANGE_END = FIX_APEI_RANGE_BASE + FIX_APEI_RANGE_MAX -1, FIX_IGD_MMIO, FIX_EFI_MPF, __end_of_fixed_addresses }; #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) extern void __set_fixmap( enum fixed_addresses idx, unsigned long mfn, unsigned long flags); #define set_fixmap(idx, phys) \ __set_fixmap(idx, (phys)>>PAGE_SHIFT, PAGE_HYPERVISOR) #define set_fixmap_nocache(idx, phys) \ __set_fixmap(idx, (phys)>>PAGE_SHIFT, PAGE_HYPERVISOR_NOCACHE) #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) #define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) #define fix_to_virt(x) (__fix_to_virt(x)) static inline unsigned long virt_to_fix(const unsigned long vaddr) { BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); return __virt_to_fix(vaddr); } #endif /* __ASSEMBLY__ */ #endif xen-4.4.0/xen/include/asm-x86/mpspec_def.h0000664000175000017500000001062512307313555016345 0ustar smbsmb#ifndef __ASM_MPSPEC_DEF_H #define __ASM_MPSPEC_DEF_H /* * Structure definitions for SMP machines following the * Intel Multiprocessing Specification 1.1 and 1.4. */ /* * This tag identifies where the SMP configuration * information is. */ #define SMP_MAGIC_IDENT (('_'<<24)|('P'<<16)|('M'<<8)|'_') #define MAX_MPC_ENTRY 1024 #define MAX_APICS MAX(256, 4 * NR_CPUS) struct intel_mp_floating { char mpf_signature[4]; /* "_MP_" */ unsigned int mpf_physptr; /* Configuration table address */ unsigned char mpf_length; /* Our length (paragraphs) */ unsigned char mpf_specification;/* Specification version */ unsigned char mpf_checksum; /* Checksum (makes sum 0) */ unsigned char mpf_feature1; /* Standard or configuration ? 
*/ unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */ unsigned char mpf_feature3; /* Unused (0) */ unsigned char mpf_feature4; /* Unused (0) */ unsigned char mpf_feature5; /* Unused (0) */ }; struct mp_config_table { char mpc_signature[4]; #define MPC_SIGNATURE "PCMP" unsigned short mpc_length; /* Size of table */ char mpc_spec; /* 0x01 */ char mpc_checksum; char mpc_oem[8]; char mpc_productid[12]; unsigned int mpc_oemptr; /* 0 if not present */ unsigned short mpc_oemsize; /* 0 if not present */ unsigned short mpc_oemcount; unsigned int mpc_lapic; /* APIC address */ unsigned int reserved; }; /* Followed by entries */ #define MP_PROCESSOR 0 #define MP_BUS 1 #define MP_IOAPIC 2 #define MP_INTSRC 3 #define MP_LINTSRC 4 #define MP_TRANSLATION 192 /* Used by IBM NUMA-Q to describe node locality */ struct mpc_config_processor { unsigned char mpc_type; unsigned char mpc_apicid; /* Local APIC number */ unsigned char mpc_apicver; /* Its versions */ unsigned char mpc_cpuflag; #define CPU_ENABLED 1 /* Processor is available */ #define CPU_BOOTPROCESSOR 2 /* Processor is the BP */ unsigned int mpc_cpufeature; #define CPU_STEPPING_MASK 0x0F #define CPU_MODEL_MASK 0xF0 #define CPU_FAMILY_MASK 0xF00 unsigned int mpc_featureflag; /* CPUID feature value */ unsigned int mpc_reserved[2]; }; struct mpc_config_bus { unsigned char mpc_type; unsigned char mpc_busid; unsigned char mpc_bustype[6]; }; /* List of Bus Type string values, Intel MP Spec. */ #define BUSTYPE_EISA "EISA" #define BUSTYPE_ISA "ISA" #define BUSTYPE_INTERN "INTERN" /* Internal BUS */ #define BUSTYPE_MCA "MCA" #define BUSTYPE_VL "VL" /* Local bus */ #define BUSTYPE_PCI "PCI" #define BUSTYPE_PCMCIA "PCMCIA" #define BUSTYPE_CBUS "CBUS" #define BUSTYPE_CBUSII "CBUSII" #define BUSTYPE_FUTURE "FUTURE" #define BUSTYPE_MBI "MBI" #define BUSTYPE_MBII "MBII" #define BUSTYPE_MPI "MPI" #define BUSTYPE_MPSA "MPSA" #define BUSTYPE_NUBUS "NUBUS" #define BUSTYPE_TC "TC" #define BUSTYPE_VME "VME" #define BUSTYPE_XPRESS "XPRESS" #define BUSTYPE_NEC98 "NEC98" struct mpc_config_ioapic { unsigned char mpc_type; unsigned char mpc_apicid; unsigned char mpc_apicver; unsigned char mpc_flags; #define MPC_APIC_USABLE 0x01 unsigned int mpc_apicaddr; }; struct mpc_config_intsrc { unsigned char mpc_type; unsigned char mpc_irqtype; unsigned short mpc_irqflag; unsigned char mpc_srcbus; unsigned char mpc_srcbusirq; unsigned char mpc_dstapic; unsigned char mpc_dstirq; }; enum mp_irq_source_types { mp_INT = 0, mp_NMI = 1, mp_SMI = 2, mp_ExtINT = 3 }; #define MP_IRQDIR_DEFAULT 0 #define MP_IRQDIR_HIGH 1 #define MP_IRQDIR_LOW 3 struct mpc_config_lintsrc { unsigned char mpc_type; unsigned char mpc_irqtype; unsigned short mpc_irqflag; unsigned char mpc_srcbusid; unsigned char mpc_srcbusirq; unsigned char mpc_destapic; #define MP_APIC_ALL 0xFF unsigned char mpc_destapiclint; }; struct mp_config_oemtable { char oem_signature[4]; #define MPC_OEM_SIGNATURE "_OEM" unsigned short oem_length; /* Size of table */ char oem_rev; /* 0x01 */ char oem_checksum; char mpc_oem[8]; }; struct mpc_config_translation { unsigned char mpc_type; unsigned char trans_len; unsigned char trans_type; unsigned char trans_quad; unsigned char trans_global; unsigned char trans_local; unsigned short trans_reserved; }; /* * Default configurations * * 1 2 CPU ISA 82489DX * 2 2 CPU EISA 82489DX neither IRQ 0 timer nor IRQ 13 DMA chaining * 3 2 CPU EISA 82489DX * 4 2 CPU MCA 82489DX * 5 2 CPU ISA+PCI * 6 2 CPU EISA+PCI * 7 2 CPU MCA+PCI */ enum mp_bustype { MP_BUS_ISA = 1, MP_BUS_EISA, MP_BUS_PCI, MP_BUS_MCA, 
MP_BUS_NEC98 }; #endif xen-4.4.0/xen/include/asm-x86/nmi.h0000664000175000017500000000155512307313555015025 0ustar smbsmb #ifndef ASM_NMI_H #define ASM_NMI_H #include struct cpu_user_regs; /* Watchdog boolean from the command line */ extern bool_t opt_watchdog; typedef int (*nmi_callback_t)(struct cpu_user_regs *regs, int cpu); /** * set_nmi_callback * * Set a handler for an NMI. Only one handler may be * set. Return 1 if the NMI was handled. */ void set_nmi_callback(nmi_callback_t callback); /** * unset_nmi_callback * * Remove the handler previously set. */ void unset_nmi_callback(void); /** * register_guest_nmi_callback * * The default NMI handler passes the NMI to a guest callback. This * function registers the address of that callback. */ long register_guest_nmi_callback(unsigned long address); /** * unregister_guest_nmi_callback * * Unregister a guest NMI handler. */ long unregister_guest_nmi_callback(void); #endif /* ASM_NMI_H */ xen-4.4.0/xen/include/asm-x86/bitops.h0000664000175000017500000003172112307313555015540 0ustar smbsmb#ifndef _X86_BITOPS_H #define _X86_BITOPS_H /* * Copyright 1992, Linus Torvalds. */ #include /* * We specify the memory operand as both input and output because the memory * operand is both read from and written to. Since the operand is in fact a * word array, we also specify "memory" in the clobbers list to indicate that * words other than the one directly addressed by the memory operand may be * modified. We don't use "+m" because the gcc manual says that it should be * used only when the constraint allows the operand to reside in a register. */ #define ADDR (*(volatile long *) addr) #define CONST_ADDR (*(const volatile long *) addr) extern void __bitop_bad_size(void); #define bitop_bad_size(addr) (sizeof(*(addr)) < 4) /** * set_bit - Atomically set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * This function is atomic and may not be reordered. See __set_bit() * if you do not require the atomic guarantees. * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ static inline void set_bit(int nr, volatile void *addr) { asm volatile ( "lock; btsl %1,%0" : "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); } #define set_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ set_bit(nr, addr); \ }) /** * __set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * Unlike set_bit(), this function is non-atomic and may be reordered. * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ static inline void __set_bit(int nr, volatile void *addr) { asm volatile ( "btsl %1,%0" : "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); } #define __set_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ __set_bit(nr, addr); \ }) /** * clear_bit - Clears a bit in memory * @nr: Bit to clear * @addr: Address to start counting from * * clear_bit() is atomic and may not be reordered. */ static inline void clear_bit(int nr, volatile void *addr) { asm volatile ( "lock; btrl %1,%0" : "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); } #define clear_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ clear_bit(nr, addr); \ }) /** * __clear_bit - Clears a bit in memory * @nr: Bit to clear * @addr: Address to start counting from * * Unlike clear_bit(), this function is non-atomic and may be reordered. 
* If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ static inline void __clear_bit(int nr, volatile void *addr) { asm volatile ( "btrl %1,%0" : "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); } #define __clear_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ __clear_bit(nr, addr); \ }) /** * __change_bit - Toggle a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * Unlike change_bit(), this function is non-atomic and may be reordered. * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ static inline void __change_bit(int nr, volatile void *addr) { asm volatile ( "btcl %1,%0" : "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); } #define __change_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ __change_bit(nr, addr); \ }) /** * change_bit - Toggle a bit in memory * @nr: Bit to clear * @addr: Address to start counting from * * change_bit() is atomic and may not be reordered. * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ static inline void change_bit(int nr, volatile void *addr) { asm volatile ( "lock; btcl %1,%0" : "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); } #define change_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ change_bit(nr, addr); \ }) /** * test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ static inline int test_and_set_bit(int nr, volatile void *addr) { int oldbit; asm volatile ( "lock; btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); return oldbit; } #define test_and_set_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ test_and_set_bit(nr, addr); \ }) /** * __test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. */ static inline int __test_and_set_bit(int nr, volatile void *addr) { int oldbit; asm volatile ( "btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); return oldbit; } #define __test_and_set_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ __test_and_set_bit(nr, addr); \ }) /** * test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ static inline int test_and_clear_bit(int nr, volatile void *addr) { int oldbit; asm volatile ( "lock; btrl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); return oldbit; } #define test_and_clear_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ test_and_clear_bit(nr, addr); \ }) /** * __test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. 
*/ static inline int __test_and_clear_bit(int nr, volatile void *addr) { int oldbit; asm volatile ( "btrl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); return oldbit; } #define __test_and_clear_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ __test_and_clear_bit(nr, addr); \ }) /* WARNING: non atomic and it can be reordered! */ static inline int __test_and_change_bit(int nr, volatile void *addr) { int oldbit; asm volatile ( "btcl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); return oldbit; } #define __test_and_change_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ __test_and_change_bit(nr, addr); \ }) /** * test_and_change_bit - Change a bit and return its new value * @nr: Bit to set * @addr: Address to count from * * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ static inline int test_and_change_bit(int nr, volatile void *addr) { int oldbit; asm volatile ( "lock; btcl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr), "m" (ADDR) : "memory"); return oldbit; } #define test_and_change_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ test_and_change_bit(nr, addr); \ }) static inline int constant_test_bit(int nr, const volatile void *addr) { return ((1U << (nr & 31)) & (((const volatile unsigned int *)addr)[nr >> 5])) != 0; } static inline int variable_test_bit(int nr, const volatile void *addr) { int oldbit; asm volatile ( "btl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit) : "m" (CONST_ADDR), "Ir" (nr) : "memory" ); return oldbit; } #define test_bit(nr, addr) ({ \ if ( bitop_bad_size(addr) ) __bitop_bad_size(); \ (__builtin_constant_p(nr) ? \ constant_test_bit((nr),(addr)) : \ variable_test_bit((nr),(addr))); \ }) extern unsigned int __find_first_bit( const unsigned long *addr, unsigned int size); extern unsigned int __find_next_bit( const unsigned long *addr, unsigned int size, unsigned int offset); extern unsigned int __find_first_zero_bit( const unsigned long *addr, unsigned int size); extern unsigned int __find_next_zero_bit( const unsigned long *addr, unsigned int size, unsigned int offset); static inline unsigned int __scanbit(unsigned long val, unsigned long max) { asm ( "bsf %1,%0 ; cmovz %2,%0" : "=&r" (val) : "r" (val), "r" (max) ); return (unsigned int)val; } /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum size to search * * Returns the bit-number of the first set bit, not the number of the byte * containing a bit. */ #define find_first_bit(addr, size) find_next_bit(addr, size, 0) /** * find_next_bit - find the first set bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at * @size: The maximum size to search */ #define find_next_bit(addr, size, off) ({ \ unsigned int r__ = (size); \ unsigned int o__ = (off); \ switch ( -!__builtin_constant_p(size) | r__ ) \ { \ case 0: (void)(addr); break; \ case 1 ... 
BITS_PER_LONG: \ r__ = o__ + __scanbit(*(const unsigned long *)(addr) >> o__, r__); \ break; \ default: \ if ( __builtin_constant_p(off) && !o__ ) \ r__ = __find_first_bit(addr, r__); \ else \ r__ = __find_next_bit(addr, r__, o__); \ break; \ } \ r__; \ }) /** * find_first_zero_bit - find the first zero bit in a memory region * @addr: The address to start the search at * @size: The maximum size to search * * Returns the bit-number of the first zero bit, not the number of the byte * containing a bit. */ #define find_first_zero_bit(addr, size) find_next_zero_bit(addr, size, 0) /** * find_next_zero_bit - find the first zero bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at * @size: The maximum size to search */ #define find_next_zero_bit(addr, size, off) ({ \ unsigned int r__ = (size); \ unsigned int o__ = (off); \ switch ( -!__builtin_constant_p(size) | r__ ) \ { \ case 0: (void)(addr); break; \ case 1 ... BITS_PER_LONG: \ r__ = o__ + __scanbit(~*(const unsigned long *)(addr) >> o__, r__); \ break; \ default: \ if ( __builtin_constant_p(off) && !o__ ) \ r__ = __find_first_zero_bit(addr, r__); \ else \ r__ = __find_next_zero_bit(addr, r__, o__); \ break; \ } \ r__; \ }) /** * find_first_set_bit - find the first set bit in @word * @word: the word to search * * Returns the bit-number of the first set bit. The input must *not* be zero. */ static inline unsigned int find_first_set_bit(unsigned long word) { asm ( "bsf %1,%0" : "=r" (word) : "r" (word) ); return (unsigned int)word; } /** * ffs - find first bit set * @x: the word to search * * This is defined the same way as the libc and compiler builtin ffs routines. */ static inline int ffs(unsigned long x) { long r; asm ( "bsf %1,%0\n\t" "jnz 1f\n\t" "mov $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return (int)r+1; } /** * fls - find last bit set * @x: the word to search * * This is defined the same way as ffs. */ static inline int fls(unsigned long x) { long r; asm ( "bsr %1,%0\n\t" "jnz 1f\n\t" "mov $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); return (int)r+1; } /** * hweightN - returns the hamming weight of a N-bit word * @x: the word to weigh * * The Hamming Weight of a number is the total number of bits set in it. */ #define hweight64(x) generic_hweight64(x) #define hweight32(x) generic_hweight32(x) #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) #endif /* _X86_BITOPS_H */ xen-4.4.0/xen/include/asm-x86/smp.h0000664000175000017500000000254612307313555015042 0ustar smbsmb#ifndef __ASM_SMP_H #define __ASM_SMP_H /* * We need the APIC definitions automatically as part of 'smp.h' */ #ifndef __ASSEMBLY__ #include #include #include #include #endif #ifndef __ASSEMBLY__ #include #include #endif #define BAD_APICID -1U #ifndef __ASSEMBLY__ /* * Private routines/data */ extern void smp_alloc_memory(void); DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask); DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask); void smp_send_nmi_allbutself(void); void send_IPI_mask(const cpumask_t *, int vector); void send_IPI_self(int vector); extern void (*mtrr_hook) (void); extern void zap_low_mappings(void); #define MAX_APICID 256 extern u32 x86_cpu_to_apicid[]; #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) extern void cpu_exit_clear(unsigned int cpu); extern void cpu_uninit(unsigned int cpu); int cpu_add(uint32_t apic_id, uint32_t acpi_id, uint32_t pxm); /* * This function is needed by all SMP systems. 
It must _always_ be valid * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ #define raw_smp_processor_id() (get_processor_id()) int hard_smp_processor_id(void); void __stop_this_cpu(void); #endif /* !__ASSEMBLY__ */ #endif xen-4.4.0/xen/include/asm-x86/string.h0000664000175000017500000000066612307313555015552 0ustar smbsmb#ifndef __X86_STRING_H__ #define __X86_STRING_H__ #include #define __HAVE_ARCH_MEMCPY #define memcpy(t,f,n) (__builtin_memcpy((t),(f),(n))) /* Some versions of gcc don't have this builtin. It's non-critical anyway. */ #define __HAVE_ARCH_MEMMOVE extern void *memmove(void *dest, const void *src, size_t n); #define __HAVE_ARCH_MEMSET #define memset(s,c,n) (__builtin_memset((s),(c),(n))) #endif /* __X86_STRING_H__ */ xen-4.4.0/xen/include/asm-x86/mem_paging.h0000664000175000017500000000212212307313555016334 0ustar smbsmb/****************************************************************************** * include/asm-x86/mem_paging.h * * Memory paging support. * * Copyright (c) 2009 Citrix Systems, Inc. (Patrick Colp) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ int mem_paging_memop(struct domain *d, xen_mem_event_op_t *meo); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/traps.h0000664000175000017500000000355212307313555015372 0ustar smbsmb/* * Copyright (c) 2007, 2008 Advanced Micro Devices, Inc. * Author: Christoph Egger * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef ASM_TRAP_H #define ASM_TRAP_H struct softirq_trap { struct domain *domain; /* domain to inject trap */ struct vcpu *vcpu; /* vcpu to inject trap */ int processor; /* physical cpu to inject trap */ }; struct cpu_user_regs; extern void machine_check_vector(struct cpu_user_regs *regs, long error_code); void async_exception_cleanup(struct vcpu *); /** * guest_has_trap_callback * * returns true (non-zero) if guest registered a trap handler */ extern int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr); /** * send_guest_trap * * delivers trap to guest analogous to send_guest_global_virq * return 0 on successful delivery */ extern int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr); uint32_t guest_io_read(unsigned int port, unsigned int bytes, struct vcpu *, struct cpu_user_regs *); void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data, struct vcpu *, struct cpu_user_regs *); #endif /* ASM_TRAP_H */ xen-4.4.0/xen/include/asm-x86/atomic.h0000664000175000017500000001532412307313555015515 0ustar smbsmb#ifndef __ARCH_X86_ATOMIC__ #define __ARCH_X86_ATOMIC__ #include #include #define build_read_atomic(name, size, type, reg, barrier) \ static inline type name(const volatile type *addr) \ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ :"m" (*(volatile type *)addr) barrier); return ret; } #define build_write_atomic(name, size, type, reg, barrier) \ static inline void name(volatile type *addr, type val) \ { asm volatile("mov" size " %1,%0": "=m" (*(volatile type *)addr) \ :reg (val) barrier); } build_read_atomic(read_u8_atomic, "b", uint8_t, "=q", ) build_read_atomic(read_u16_atomic, "w", uint16_t, "=r", ) build_read_atomic(read_u32_atomic, "l", uint32_t, "=r", ) build_write_atomic(write_u8_atomic, "b", uint8_t, "q", ) build_write_atomic(write_u16_atomic, "w", uint16_t, "r", ) build_write_atomic(write_u32_atomic, "l", uint32_t, "r", ) build_read_atomic(read_u64_atomic, "q", uint64_t, "=r", ) build_write_atomic(write_u64_atomic, "q", uint64_t, "r", ) #undef build_read_atomic #undef build_write_atomic void __bad_atomic_size(void); #define read_atomic(p) ({ \ typeof(*p) __x; \ switch ( sizeof(*p) ) { \ case 1: __x = (typeof(*p))read_u8_atomic((uint8_t *)p); break; \ case 2: __x = (typeof(*p))read_u16_atomic((uint16_t *)p); break; \ case 4: __x = (typeof(*p))read_u32_atomic((uint32_t *)p); break; \ case 8: __x = (typeof(*p))read_u64_atomic((uint64_t *)p); break; \ default: __x = 0; __bad_atomic_size(); break; \ } \ __x; \ }) #define write_atomic(p, x) ({ \ typeof(*p) __x = (x); \ switch ( sizeof(*p) ) { \ case 1: write_u8_atomic((uint8_t *)p, (uint8_t)__x); break; \ case 2: write_u16_atomic((uint16_t *)p, (uint16_t)__x); break; \ case 4: write_u32_atomic((uint32_t *)p, (uint32_t)__x); break; \ case 8: write_u64_atomic((uint64_t *)p, (uint64_t)__x); break; \ default: __bad_atomic_size(); break; \ } \ __x; \ }) /* * NB. I've pushed the volatile qualifier into the operations. This allows * fast accessors such as _atomic_read() and _atomic_set() which don't give * the compiler a fit. */ typedef struct { int counter; } atomic_t; #define ATOMIC_INIT(i) { (i) } /** * atomic_read - read atomic variable * @v: pointer of type atomic_t * * Atomically reads the value of @v. 
*/ #define _atomic_read(v) ((v).counter) #define atomic_read(v) read_atomic(&((v)->counter)) /** * atomic_set - set atomic variable * @v: pointer of type atomic_t * @i: required value * * Atomically sets the value of @v to @i. */ #define _atomic_set(v,i) (((v).counter) = (i)) #define atomic_set(v,i) write_atomic(&((v)->counter), (i)) /** * atomic_add - add integer to atomic variable * @i: integer value to add * @v: pointer of type atomic_t * * Atomically adds @i to @v. */ static inline void atomic_add(int i, atomic_t *v) { asm volatile ( "lock; addl %1,%0" : "=m" (*(volatile int *)&v->counter) : "ir" (i), "m" (*(volatile int *)&v->counter) ); } /** * atomic_sub - subtract the atomic variable * @i: integer value to subtract * @v: pointer of type atomic_t * * Atomically subtracts @i from @v. */ static inline void atomic_sub(int i, atomic_t *v) { asm volatile ( "lock; subl %1,%0" : "=m" (*(volatile int *)&v->counter) : "ir" (i), "m" (*(volatile int *)&v->counter) ); } /** * atomic_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer of type atomic_t * * Atomically subtracts @i from @v and returns * true if the result is zero, or false for all * other cases. */ static inline int atomic_sub_and_test(int i, atomic_t *v) { unsigned char c; asm volatile ( "lock; subl %2,%0; sete %1" : "=m" (*(volatile int *)&v->counter), "=qm" (c) : "ir" (i), "m" (*(volatile int *)&v->counter) : "memory" ); return c; } /** * atomic_inc - increment atomic variable * @v: pointer of type atomic_t * * Atomically increments @v by 1. */ static inline void atomic_inc(atomic_t *v) { asm volatile ( "lock; incl %0" : "=m" (*(volatile int *)&v->counter) : "m" (*(volatile int *)&v->counter) ); } /** * atomic_dec - decrement atomic variable * @v: pointer of type atomic_t * * Atomically decrements @v by 1. */ static inline void atomic_dec(atomic_t *v) { asm volatile ( "lock; decl %0" : "=m" (*(volatile int *)&v->counter) : "m" (*(volatile int *)&v->counter) ); } /** * atomic_dec_and_test - decrement and test * @v: pointer of type atomic_t * * Atomically decrements @v by 1 and * returns true if the result is 0, or false for all other * cases. */ static inline int atomic_dec_and_test(atomic_t *v) { unsigned char c; asm volatile ( "lock; decl %0; sete %1" : "=m" (*(volatile int *)&v->counter), "=qm" (c) : "m" (*(volatile int *)&v->counter) : "memory" ); return c != 0; } /** * atomic_inc_and_test - increment and test * @v: pointer of type atomic_t * * Atomically increments @v by 1 * and returns true if the result is zero, or false for all * other cases. */ static inline int atomic_inc_and_test(atomic_t *v) { unsigned char c; asm volatile ( "lock; incl %0; sete %1" : "=m" (*(volatile int *)&v->counter), "=qm" (c) : "m" (*(volatile int *)&v->counter) : "memory" ); return c != 0; } /** * atomic_add_negative - add and test if negative * @v: pointer of type atomic_t * @i: integer value to add * * Atomically adds @i to @v and returns true * if the result is negative, or false when * result is greater than or equal to zero. 
*/ static inline int atomic_add_negative(int i, atomic_t *v) { unsigned char c; asm volatile ( "lock; addl %2,%0; sets %1" : "=m" (*(volatile int *)&v->counter), "=qm" (c) : "ir" (i), "m" (*(volatile int *)&v->counter) : "memory" ); return c; } static inline atomic_t atomic_compareandswap( atomic_t old, atomic_t new, atomic_t *v) { atomic_t rc; rc.counter = __cmpxchg(&v->counter, old.counter, new.counter, sizeof(int)); return rc; } #endif /* __ARCH_X86_ATOMIC__ */ xen-4.4.0/xen/include/asm-x86/efibind.h0000664000175000017500000000006712307313555015637 0ustar smbsmb#include #include xen-4.4.0/xen/include/asm-x86/uaccess.h0000664000175000017500000002211012307313555015656 0ustar smbsmb #ifndef __X86_UACCESS_H__ #define __X86_UACCESS_H__ #include #include #include #include #include #include #include unsigned long copy_to_user(void *to, const void *from, unsigned len); unsigned long clear_user(void *to, unsigned len); unsigned long copy_from_user(void *to, const void *from, unsigned len); /* Handles exceptions in both to and from, but doesn't do access_ok */ unsigned long __copy_to_user_ll(void *to, const void *from, unsigned n); unsigned long __copy_from_user_ll(void *to, const void *from, unsigned n); extern long __get_user_bad(void); extern void __put_user_bad(void); /** * get_user: - Get a simple variable from user space. * @x: Variable to store result. * @ptr: Source address, in user space. * * Context: User context only. This function may sleep. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ #define get_user(x,ptr) \ __get_user_check((x),(ptr),sizeof(*(ptr))) /** * put_user: - Write a simple value into user space. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * * Context: User context only. This function may sleep. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * * Returns zero on success, or -EFAULT on error. */ #define put_user(x,ptr) \ __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) /** * __get_user: - Get a simple variable from user space, with less checking. * @x: Variable to store result. * @ptr: Source address, in user space. * * Context: User context only. This function may sleep. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * * Caller must check the pointer with access_ok() before calling this * function. * * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ #define __get_user(x,ptr) \ __get_user_nocheck((x),(ptr),sizeof(*(ptr))) /** * __put_user: - Write a simple value into user space, with less checking. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * * Context: User context only. 
This function may sleep. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * * Caller must check the pointer with access_ok() before calling this * function. * * Returns zero on success, or -EFAULT on error. */ #define __put_user(x,ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) #define __put_user_nocheck(x,ptr,size) \ ({ \ long __pu_err; \ __put_user_size((x),(ptr),(size),__pu_err,-EFAULT); \ __pu_err; \ }) #define __put_user_check(x,ptr,size) \ ({ \ long __pu_err = -EFAULT; \ __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ if (access_ok(__pu_addr,size)) \ __put_user_size((x),__pu_addr,(size),__pu_err,-EFAULT); \ __pu_err; \ }) #define __get_user_nocheck(x,ptr,size) \ ({ \ long __gu_err; \ __get_user_size((x),(ptr),(size),__gu_err,-EFAULT); \ __gu_err; \ }) #define __get_user_check(x,ptr,size) \ ({ \ long __gu_err; \ __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __get_user_size((x),__gu_addr,(size),__gu_err,-EFAULT); \ if (!access_ok(__gu_addr,size)) __gu_err = -EFAULT; \ __gu_err; \ }) struct __large_struct { unsigned long buf[100]; }; #define __m(x) (*(const struct __large_struct *)(x)) /* * Tell gcc we read from memory instead of writing: this is because * we do not write to any memory gcc knows about, so there are no * aliasing issues. */ #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \ __asm__ __volatile__( \ "1: mov"itype" %"rtype"1,%2\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE(1b, 3b) \ : "=r"(err) \ : ltype (x), "m"(__m(addr)), "i"(errret), "0"(err)) #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \ __asm__ __volatile__( \ "1: mov"itype" %2,%"rtype"1\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %3,%0\n" \ " xor"itype" %"rtype"1,%"rtype"1\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE(1b, 3b) \ : "=r"(err), ltype (x) \ : "m"(__m(addr)), "i"(errret), "0"(err)) /** * __copy_to_user: - Copy a block of data into user space, with less checking * @to: Destination address, in user space. * @from: Source address, in kernel space. * @n: Number of bytes to copy. * * Context: User context only. This function may sleep. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. * * Returns number of bytes that could not be copied. * On success, this will be zero. */ static always_inline unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { if (__builtin_constant_p(n)) { unsigned long ret; switch (n) { case 1: __put_user_size(*(const u8 *)from, (u8 __user *)to, 1, ret, 1); return ret; case 2: __put_user_size(*(const u16 *)from, (u16 __user *)to, 2, ret, 2); return ret; case 4: __put_user_size(*(const u32 *)from, (u32 __user *)to, 4, ret, 4); return ret; case 8: __put_user_size(*(const u64 *)from, (u64 __user *)to, 8, ret, 8); return ret; } } return __copy_to_user_ll(to, from, n); } /** * __copy_from_user: - Copy a block of data from user space, with less checking * @to: Destination address, in kernel space. * @from: Source address, in user space. * @n: Number of bytes to copy. * * Context: User context only. This function may sleep. * * Copy data from user space to kernel space. 
Caller must check * the specified block with access_ok() before calling this function. * * Returns number of bytes that could not be copied. * On success, this will be zero. * * If some data could not be copied, this function will pad the copied * data to the requested size using zero bytes. */ static always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { if (__builtin_constant_p(n)) { unsigned long ret; switch (n) { case 1: __get_user_size(*(u8 *)to, from, 1, ret, 1); return ret; case 2: __get_user_size(*(u16 *)to, from, 2, ret, 2); return ret; case 4: __get_user_size(*(u32 *)to, from, 4, ret, 4); return ret; case 8: __get_user_size(*(u64*)to, from, 8, ret, 8); return ret; } } return __copy_from_user_ll(to, from, n); } /* * The exception table consists of pairs of addresses: the first is the * address of an instruction that is allowed to fault, and the second is * the address at which the program should continue. No registers are * modified, so it is entirely up to the continuation code to figure out * what to do. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, * we don't even have to jump over them. Further, they do not intrude * on our cache or tlb entries. */ struct exception_table_entry { s32 addr, cont; }; extern struct exception_table_entry __start___ex_table[]; extern struct exception_table_entry __stop___ex_table[]; extern struct exception_table_entry __start___pre_ex_table[]; extern struct exception_table_entry __stop___pre_ex_table[]; extern unsigned long search_exception_table(unsigned long); extern void sort_exception_tables(void); #endif /* __X86_UACCESS_H__ */ xen-4.4.0/xen/include/asm-x86/current.h0000664000175000017500000000500412307313555015715 0ustar smbsmb/****************************************************************************** * current.h * * Information structure that lives at the bottom of the per-cpu Xen stack. */ #ifndef __X86_CURRENT_H__ #define __X86_CURRENT_H__ #include #include #include #include struct vcpu; struct cpu_info { struct cpu_user_regs guest_cpu_user_regs; unsigned int processor_id; struct vcpu *current_vcpu; unsigned long per_cpu_offset; /* get_stack_bottom() must be 16-byte aligned */ unsigned long __pad_for_stack_bottom; }; static inline struct cpu_info *get_cpu_info(void) { unsigned long tos; __asm__ ( "and %%rsp,%0" : "=r" (tos) : "0" (~(STACK_SIZE-1)) ); return (struct cpu_info *)(tos + STACK_SIZE) - 1; } #define get_current() (get_cpu_info()->current_vcpu) #define set_current(vcpu) (get_cpu_info()->current_vcpu = (vcpu)) #define current (get_current()) #define get_processor_id() (get_cpu_info()->processor_id) #define set_processor_id(id) do { \ struct cpu_info *ci__ = get_cpu_info(); \ ci__->per_cpu_offset = __per_cpu_offset[ci__->processor_id = (id)]; \ } while (0) #define guest_cpu_user_regs() (&get_cpu_info()->guest_cpu_user_regs) /* * Get the bottom-of-stack, as stored in the per-CPU TSS. This actually points * into the middle of cpu_info.guest_cpu_user_regs, at the section that * precisely corresponds to a CPU trap frame. */ #define get_stack_bottom() \ ((unsigned long)&get_cpu_info()->guest_cpu_user_regs.es) /* * Get the bottom-of-stack, as useful for printing stack traces. This is the * highest word on the stack which might be part of a stack trace, and is the * adjacent word to a struct cpu_info on the stack. 
*/ #define get_printable_stack_bottom(sp) \ ((sp & (~(STACK_SIZE-1))) + \ (STACK_SIZE - sizeof(struct cpu_info) - sizeof(unsigned long))) #define reset_stack_and_jump(__fn) \ __asm__ __volatile__ ( \ "mov %0,%%"__OP"sp; jmp %c1" \ : : "r" (guest_cpu_user_regs()), "i" (__fn) : "memory" ) #define schedule_tail(vcpu) (((vcpu)->arch.schedule_tail)(vcpu)) /* * Which VCPU's state is currently running on each CPU? * This is not necesasrily the same as 'current' as a CPU may be * executing a lazy state switch. */ DECLARE_PER_CPU(struct vcpu *, curr_vcpu); #endif /* __X86_CURRENT_H__ */ xen-4.4.0/xen/include/asm-x86/types.h0000664000175000017500000000170012307313555015376 0ustar smbsmb#ifndef __X86_TYPES_H__ #define __X86_TYPES_H__ #ifndef __ASSEMBLY__ #include typedef __signed__ char __s8; typedef unsigned char __u8; typedef __signed__ short __s16; typedef unsigned short __u16; typedef __signed__ int __s32; typedef unsigned int __u32; #if defined(__GNUC__) && !defined(__STRICT_ANSI__) typedef __signed__ long __s64; typedef unsigned long __u64; #endif typedef signed char s8; typedef unsigned char u8; typedef signed short s16; typedef unsigned short u16; typedef signed int s32; typedef unsigned int u32; typedef signed long s64; typedef unsigned long u64; typedef unsigned long paddr_t; #define INVALID_PADDR (~0UL) #define PRIpaddr "016lx" #if defined(__SIZE_TYPE__) typedef __SIZE_TYPE__ size_t; #else typedef unsigned long size_t; #endif typedef char bool_t; #define test_and_set_bool(b) xchg(&(b), 1) #define test_and_clear_bool(b) xchg(&(b), 0) #endif /* __ASSEMBLY__ */ #endif /* __X86_TYPES_H__ */ xen-4.4.0/xen/include/asm-x86/acpi.h0000664000175000017500000001151112307313555015147 0ustar smbsmb#ifndef _ASM_X86_ACPI_H #define _ASM_X86_ACPI_H /* * Copyright (C) 2001 Paul Diefenbaugh * Copyright (C) 2001 Patrick Mochel * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include #include #include #include #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long /* * Calling conventions: * * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) * ACPI_EXTERNAL_XFACE - External ACPI interfaces * ACPI_INTERNAL_XFACE - Internal ACPI interfaces * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces */ #define ACPI_SYSTEM_XFACE #define ACPI_EXTERNAL_XFACE #define ACPI_INTERNAL_XFACE #define ACPI_INTERNAL_VAR_XFACE /* Asm macros */ #define ACPI_ASM_MACROS #define BREAKPOINT3 #define ACPI_DISABLE_IRQS() local_irq_disable() #define ACPI_ENABLE_IRQS() local_irq_enable() #define ACPI_FLUSH_CPU_CACHE() wbinvd() int __acpi_acquire_global_lock(unsigned int *lock); int __acpi_release_global_lock(unsigned int *lock); #define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \ ((Acq) = __acpi_acquire_global_lock(&facs->global_lock)) #define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \ ((Acq) = __acpi_release_global_lock(&facs->global_lock)) /* * Math helper asm macros */ #define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \ asm("divl %2;" \ :"=a"(q32), "=d"(r32) \ :"r"(d32), \ "0"(n_lo), "1"(n_hi)) #define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \ asm("shrl $1,%2 ;" \ "rcrl $1,%3;" \ :"=r"(n_hi), "=r"(n_lo) \ :"0"(n_hi), "1"(n_lo)) extern bool_t acpi_lapic, acpi_ioapic, acpi_noirq; extern bool_t acpi_force, acpi_ht, acpi_disabled; extern bool_t acpi_skip_timer_override; extern u32 acpi_smi_cmd; extern u8 acpi_enable_value, acpi_disable_value; void acpi_pic_sci_set_trigger(unsigned int, u16); static inline void disable_acpi(void) { acpi_disabled = 1; acpi_ht = 0; acpi_noirq = 1; } /* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */ #define FIX_ACPI_PAGES 4 static inline void acpi_noirq_set(void) { acpi_noirq = 1; } /* routines for saving/restoring kernel state */ extern int acpi_save_state_mem(void); extern int acpi_save_state_disk(void); extern void acpi_restore_state_mem(void); extern unsigned long acpi_wakeup_address; /* early initialization routine */ extern void acpi_reserve_bootmem(void); #define ARCH_HAS_POWER_INIT 1 extern int acpi_numa; extern int acpi_scan_nodes(u64 start, u64 end); #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) #ifdef CONFIG_ACPI_SLEEP extern struct acpi_sleep_info acpi_sinfo; #define acpi_video_flags bootsym(video_flags) struct xenpf_enter_acpi_sleep; extern int acpi_enter_sleep(struct xenpf_enter_acpi_sleep *sleep); extern int acpi_enter_state(u32 state); struct acpi_sleep_info { struct acpi_generic_address pm1a_cnt_blk; struct acpi_generic_address pm1b_cnt_blk; struct acpi_generic_address pm1a_evt_blk; struct acpi_generic_address pm1b_evt_blk; struct acpi_generic_address sleep_control; struct acpi_generic_address sleep_status; union { uint16_t pm1a_cnt_val; uint8_t sleep_type_a; }; union { uint16_t pm1b_cnt_val; uint8_t sleep_type_b; }; uint32_t sleep_state; uint64_t wakeup_vector; uint32_t vector_width; bool_t sleep_extended; }; #endif /* CONFIG_ACPI_SLEEP */ #define MAX_MADT_ENTRIES MAX(256, 2 * NR_CPUS) extern u32 x86_acpiid_to_apicid[]; #define MAX_LOCAL_APIC MAX(256, 4 * NR_CPUS) #define INVALID_ACPIID (-1U) extern u32 pmtmr_ioport; int acpi_dmar_init(void); void acpi_mmcfg_init(void); /* Incremented 
whenever we transition through S3. Value is 1 during boot. */ extern uint32_t system_reset_counter; void hvm_acpi_power_button(struct domain *d); void hvm_acpi_sleep_button(struct domain *d); /* suspend/resume */ void save_rest_processor_state(void); void restore_rest_processor_state(void); #endif /*__X86_ASM_ACPI_H*/ xen-4.4.0/xen/include/asm-x86/system.h0000664000175000017500000001443312307313555015565 0ustar smbsmb#ifndef __ASM_SYSTEM_H #define __ASM_SYSTEM_H #include #include #include #define read_segment_register(name) \ ({ u16 __sel; \ asm volatile ( "movw %%" STR(name) ",%0" : "=r" (__sel) ); \ __sel; \ }) #define wbinvd() \ asm volatile ( "wbinvd" : : : "memory" ) #define clflush(a) \ asm volatile ( "clflush (%0)" : : "r"(a) ) #define nop() \ asm volatile ( "nop" ) #define xchg(ptr,v) \ ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) struct __xchg_dummy { unsigned long a[100]; }; #define __xg(x) ((volatile struct __xchg_dummy *)(x)) #include /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway * Note 2: xchg has side effect, so that attribute volatile is necessary, * but generally the primitive is invalid, *ptr is output argument. --ANK */ static always_inline unsigned long __xchg( unsigned long x, volatile void *ptr, int size) { switch ( size ) { case 1: asm volatile ( "xchgb %b0,%1" : "=q" (x) : "m" (*__xg((volatile void *)ptr)), "0" (x) : "memory" ); break; case 2: asm volatile ( "xchgw %w0,%1" : "=r" (x) : "m" (*__xg((volatile void *)ptr)), "0" (x) : "memory" ); break; case 4: asm volatile ( "xchgl %k0,%1" : "=r" (x) : "m" (*__xg((volatile void *)ptr)), "0" (x) : "memory" ); break; case 8: asm volatile ( "xchgq %0,%1" : "=r" (x) : "m" (*__xg((volatile void *)ptr)), "0" (x) : "memory" ); break; } return x; } /* * Atomic compare and exchange. Compare OLD with MEM, if identical, * store NEW in MEM. Return the initial value in MEM. Success is * indicated by comparing RETURN with OLD. */ static always_inline unsigned long __cmpxchg( volatile void *ptr, unsigned long old, unsigned long new, int size) { unsigned long prev; switch ( size ) { case 1: asm volatile ( "lock; cmpxchgb %b1,%2" : "=a" (prev) : "q" (new), "m" (*__xg((volatile void *)ptr)), "0" (old) : "memory" ); return prev; case 2: asm volatile ( "lock; cmpxchgw %w1,%2" : "=a" (prev) : "r" (new), "m" (*__xg((volatile void *)ptr)), "0" (old) : "memory" ); return prev; case 4: asm volatile ( "lock; cmpxchgl %k1,%2" : "=a" (prev) : "r" (new), "m" (*__xg((volatile void *)ptr)), "0" (old) : "memory" ); return prev; case 8: asm volatile ( "lock; cmpxchgq %1,%2" : "=a" (prev) : "r" (new), "m" (*__xg((volatile void *)ptr)), "0" (old) : "memory" ); return prev; } return old; } #define cmpxchgptr(ptr,o,n) ({ \ const __typeof__(**(ptr)) *__o = (o); \ __typeof__(**(ptr)) *__n = (n); \ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)__o, \ (unsigned long)__n,sizeof(*(ptr)))); \ }) /* * Both Intel and AMD agree that, from a programmer's viewpoint: * Loads cannot be reordered relative to other loads. * Stores cannot be reordered relative to other stores. 
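 * (Illustrative note, not part of the original comment: these guarantees are
 *  why the plain rmb()/wmb() below expand to compiler-only barrier()s.  A
 *  hypothetical producer/consumer pair would still simply be
 *
 *      data = val; wmb(); flag = 1;            -- producer
 *      if ( flag ) { rmb(); consume(data); }   -- consumer
 *
 *  with the hardware providing the ordering described above; "data", "flag",
 *  "val" and consume() are invented names.)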
* * Intel64 Architecture Memory Ordering White Paper * * * AMD64 Architecture Programmer's Manual, Volume 2: System Programming * */ #define rmb() barrier() #define wmb() barrier() #define smp_mb() mb() #define smp_rmb() rmb() #define smp_wmb() wmb() #define set_mb(var, value) do { xchg(&var, value); } while (0) #define set_wmb(var, value) do { var = value; wmb(); } while (0) #define local_irq_disable() asm volatile ( "cli" : : : "memory" ) #define local_irq_enable() asm volatile ( "sti" : : : "memory" ) /* used in the idle loop; sti takes one instruction cycle to complete */ #define safe_halt() asm volatile ( "sti; hlt" : : : "memory" ) /* used when interrupts are already enabled or to shutdown the processor */ #define halt() asm volatile ( "hlt" : : : "memory" ) #define local_save_flags(x) \ ({ \ BUILD_BUG_ON(sizeof(x) != sizeof(long)); \ asm volatile ( "pushf" __OS " ; pop" __OS " %0" : "=g" (x)); \ }) #define local_irq_save(x) \ ({ \ local_save_flags(x); \ local_irq_disable(); \ }) #define local_irq_restore(x) \ ({ \ BUILD_BUG_ON(sizeof(x) != sizeof(long)); \ asm volatile ( "pushfq\n\t" \ "andq %0, (%%rsp)\n\t" \ "orq %1, (%%rsp)\n\t" \ "popfq" \ : : "i?r" ( ~X86_EFLAGS_IF ), \ "ri" ( (x) & X86_EFLAGS_IF ) ); \ }) static inline int local_irq_is_enabled(void) { unsigned long flags; local_save_flags(flags); return !!(flags & X86_EFLAGS_IF); } #define BROKEN_ACPI_Sx 0x0001 #define BROKEN_INIT_AFTER_S1 0x0002 void trap_init(void); void percpu_traps_init(void); void subarch_percpu_traps_init(void); #endif xen-4.4.0/xen/include/asm-x86/debugreg.h0000664000175000017500000000556312307313555016031 0ustar smbsmb#ifndef _X86_DEBUGREG_H #define _X86_DEBUGREG_H /* Indicate the register numbers for a number of the specific debug registers. Registers 0-3 contain the addresses we wish to trap on */ #define DR_FIRSTADDR 0 #define DR_LASTADDR 3 #define DR_STATUS 6 #define DR_CONTROL 7 /* Define a few things for the status register. We can use this to determine which debugging register was responsible for the trap. The other bits are either reserved or not of interest to us. */ #define DR_TRAP0 (0x1) /* db0 */ #define DR_TRAP1 (0x2) /* db1 */ #define DR_TRAP2 (0x4) /* db2 */ #define DR_TRAP3 (0x8) /* db3 */ #define DR_STEP (0x4000) /* single-step */ #define DR_SWITCH (0x8000) /* task switch */ /* Now define a bunch of things for manipulating the control register. The top two bytes of the control register consist of 4 fields of 4 bits - each field corresponds to one of the four debug registers, and indicates what types of access we trap on, and how large the data field is that we are looking at */ #define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */ #define DR_CONTROL_SIZE 4 /* 4 control bits per register */ #define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */ #define DR_RW_WRITE (0x1) #define DR_IO (0x2) #define DR_RW_READ (0x3) #define DR_LEN_1 (0x0) /* Settings for data length to trap on */ #define DR_LEN_2 (0x4) #define DR_LEN_4 (0xC) #define DR_LEN_8 (0x8) /* The low byte to the control register determine which registers are enabled. There are 4 fields of two bits. One bit is "local", meaning that the processor will reset the bit after a task switch and the other is global meaning that we have to explicitly reset the bit. 
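 * (Illustrative example, not part of the original comment: to locally enable
 *  breakpoint 0 as a 4-byte data-write watchpoint on the address held in DR0,
 *  a hypothetical caller would compose
 *
 *      dr7 = ((DR_RW_WRITE | DR_LEN_4)
 *             << (DR_CONTROL_SHIFT + 0 * DR_CONTROL_SIZE))
 *            | (1ul << (DR_LOCAL_ENABLE_SHIFT + 0 * DR_ENABLE_SIZE));
 *
 *  i.e. dr7 == 0xd0001, built entirely from the constants in this header.)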
*/ #define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ #define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ #define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ #define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ #define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */ #define DR7_ACTIVE_MASK (DR_LOCAL_ENABLE_MASK|DR_GLOBAL_ENABLE_MASK) /* The second byte to the control register has a few special things. We can slow the instruction pipeline for instructions coming via the gdt or the ldt if we want to. I am not sure why this is an advantage */ #define DR_CONTROL_RESERVED_ZERO (~0xffff27fful) /* Reserved, read as zero */ #define DR_CONTROL_RESERVED_ONE (0x00000400ul) /* Reserved, read as one */ #define DR_LOCAL_EXACT_ENABLE (0x00000100ul) /* Local exact enable */ #define DR_GLOBAL_EXACT_ENABLE (0x00000200ul) /* Global exact enable */ #define DR_GENERAL_DETECT (0x00002000ul) /* General detect enable */ #endif /* _X86_DEBUGREG_H */ xen-4.4.0/xen/include/asm-x86/mwait.h0000664000175000017500000000062212307313555015355 0ustar smbsmb#ifndef __ASM_X86_MWAIT_H__ #define __ASM_X86_MWAIT_H__ #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 #define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 #define CPUID5_ECX_INTERRUPT_BREAK 0x2 #define MWAIT_ECX_INTERRUPT_BREAK 0x1 void mwait_idle_with_hints(unsigned int eax, unsigned int ecx); #endif /* __ASM_X86_MWAIT_H__ */ xen-4.4.0/xen/include/asm-x86/cpufeature.h0000664000175000017500000002564712307313555016415 0ustar smbsmb/* * cpufeature.h * * Defines x86 CPU feature bits */ #ifndef __ASM_I386_CPUFEATURE_H #define __ASM_I386_CPUFEATURE_H #include #define NCAPINTS 8 /* N 32-bit words worth of info */ /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ #define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */ #define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */ #define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */ #define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */ #define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ #define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers, RDMSR, WRMSR */ #define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ #define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */ #define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ #define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ #define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ #define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */ #define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */ #define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */ #define X86_FEATURE_CMOV (0*32+15) /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */ #define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */ #define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */ #define X86_FEATURE_PN (0*32+18) /* Processor serial number */ #define X86_FEATURE_CLFLSH (0*32+19) /* Supports the CLFLUSH instruction */ #define X86_FEATURE_DS (0*32+21) /* Debug Store */ #define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */ #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ #define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */ /* of FPU context), and CR4.OSFXSR available */ #define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ #define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ 
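/*
 * (Editorial note, not part of the original header: each X86_FEATURE_* value
 * encodes "capability word * 32 + bit"; e.g. X86_FEATURE_XMM2 above is bit 26
 * of word 0.  A hypothetical feature test therefore reads
 *
 *     if ( boot_cpu_has(X86_FEATURE_XMM2) )
 *         ...;
 *
 * with cpu_has()/boot_cpu_has()/cpufeat_mask() defined near the end of this
 * file.)
 */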
#define X86_FEATURE_SELFSNOOP (0*32+27) /* CPU self snoop */ #define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */ #define X86_FEATURE_ACC (0*32+29) /* Automatic clock control */ #define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */ #define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ #define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */ #define X86_FEATURE_MP (1*32+19) /* MP Capable. */ #define X86_FEATURE_NX (1*32+20) /* Execute Disable */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ #define X86_FEATURE_FFXSR (1*32+25) /* FFXSR instruction optimizations */ #define X86_FEATURE_PAGE1GB (1*32+26) /* 1Gb large page support */ #define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */ #define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */ #define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */ #define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ #define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */ #define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */ #define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */ #define X86_FEATURE_NONSTOP_TSC (3*32+ 9) /* TSC does not stop in C states */ #define X86_FEATURE_ARAT (3*32+ 10) /* Always running APIC timer */ #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ #define X86_FEATURE_TSC_RELIABLE (3*32+12) /* TSC is known to be reliable */ #define X86_FEATURE_XTOPOLOGY (3*32+13) /* cpu topology enum extensions */ #define X86_FEATURE_CPUID_FAULTING (3*32+14) /* cpuid faulting */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ #define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* Carry-less mulitplication */ #define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */ #define X86_FEATURE_MWAIT (4*32+ 3) /* Monitor/Mwait support */ #define X86_FEATURE_DSCPL (4*32+ 4) /* CPL Qualified Debug Store */ #define X86_FEATURE_VMXE (4*32+ 5) /* Virtual Machine Extensions */ #define X86_FEATURE_SMXE (4*32+ 6) /* Safer Mode Extensions */ #define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */ #define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */ #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental Streaming SIMD Extensions-3 */ #define X86_FEATURE_CID (4*32+10) /* Context ID */ #define X86_FEATURE_FMA (4*32+12) /* Fused Multiply Add */ #define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */ #define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ #define X86_FEATURE_PDCM (4*32+15) /* Perf/Debug Capability MSR */ #define X86_FEATURE_PCID (4*32+17) /* Process Context ID */ #define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ #define X86_FEATURE_SSE4_1 (4*32+19) /* Streaming SIMD Extensions 4.1 */ #define X86_FEATURE_SSE4_2 (4*32+20) /* Streaming SIMD Extensions 4.2 */ #define X86_FEATURE_X2APIC (4*32+21) /* Extended xAPIC */ #define X86_FEATURE_MOVBE (4*32+22) /* movbe instruction */ #define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ #define X86_FEATURE_TSC_DEADLINE (4*32+24) /* "tdt" TSC Deadline Timer */ #define X86_FEATURE_AES (4*32+25) /* AES instructions */ #define 
X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* OSXSAVE */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ #define X86_FEATURE_F16C (4*32+29) /* Half-precision convert instruction */ #define X86_FEATURE_RDRAND (4*32+30) /* Digital Random Number Generator */ #define X86_FEATURE_HYPERVISOR (4*32+31) /* Running under some hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE (5*32+ 2) /* on-CPU RNG present (xstore insn) */ #define X86_FEATURE_XSTORE_EN (5*32+ 3) /* on-CPU RNG enabled */ #define X86_FEATURE_XCRYPT (5*32+ 6) /* on-CPU crypto (xcrypt insn) */ #define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* on-CPU crypto enabled */ #define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */ #define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */ #define X86_FEATURE_PHE (5*32+ 10) /* PadLock Hash Engine */ #define X86_FEATURE_PHE_EN (5*32+ 11) /* PHE enabled */ #define X86_FEATURE_PMM (5*32+ 12) /* PadLock Montgomery Multiplier */ #define X86_FEATURE_PMM_EN (5*32+ 13) /* PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ #define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */ #define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */ #define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */ #define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */ #define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */ #define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */ #define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */ #define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */ #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ #define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ #define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ #define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ #define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ #define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ #define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ #define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 7 */ #define X86_FEATURE_FSGSBASE (7*32+ 0) /* {RD,WR}{FS,GS}BASE instructions */ #define X86_FEATURE_BMI1 (7*32+ 3) /* 1st bit manipulation extensions */ #define X86_FEATURE_HLE (7*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 (7*32+ 5) /* AVX2 instructions */ #define X86_FEATURE_SMEP (7*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_BMI2 (7*32+ 8) /* 2nd bit manipulation extensions */ #define X86_FEATURE_ERMS (7*32+ 9) /* Enhanced REP MOVSB/STOSB */ #define X86_FEATURE_INVPCID (7*32+10) /* Invalidate Process Context ID */ #define X86_FEATURE_RTM (7*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_NO_FPU_SEL (7*32+13) /* FPU CS/DS stored as zero */ #define X86_FEATURE_SMAP (7*32+20) /* Supervisor Mode Access Prevention */ #define cpu_has(c, bit) test_bit(bit, (c)->x86_capability) #define boot_cpu_has(bit) test_bit(bit, boot_cpu_data.x86_capability) #define cpufeat_mask(idx) (1u << ((idx) & 31)) #define CPUID_MWAIT_LEAF 5 #define 
CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 #define CPUID5_ECX_INTERRUPT_BREAK 0x2 #define cpu_has_vme 0 #define cpu_has_de 1 #define cpu_has_pse 1 #define cpu_has_tsc 1 #define cpu_has_pge 1 #define cpu_has_pat 1 #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) #define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP) #define cpu_has_mtrr 1 #define cpu_has_mmx 1 #define cpu_has_fxsr 1 #define cpu_has_xmm 1 #define cpu_has_xmm2 1 #define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) #define cpu_has_syscall 1 #define cpu_has_mp 1 #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) #define cpu_has_k6_mtrr 0 #define cpu_has_cyrix_arr 0 #define cpu_has_centaur_mcr 0 #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) #define cpu_has_page1gb boot_cpu_has(X86_FEATURE_PAGE1GB) #define cpu_has_efer 1 #define cpu_has_fsgsbase boot_cpu_has(X86_FEATURE_FSGSBASE) #define cpu_has_smep boot_cpu_has(X86_FEATURE_SMEP) #define cpu_has_fpu_sel (!boot_cpu_has(X86_FEATURE_NO_FPU_SEL)) #define cpu_has_ffxsr ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) \ && boot_cpu_has(X86_FEATURE_FFXSR)) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_pcid boot_cpu_has(X86_FEATURE_PCID) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_lwp boot_cpu_has(X86_FEATURE_LWP) #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) #define cpu_has_rdtscp boot_cpu_has(X86_FEATURE_RDTSCP) #define cpu_has_svm boot_cpu_has(X86_FEATURE_SVM) #define cpu_has_vmx boot_cpu_has(X86_FEATURE_VMXE) #define cpu_has_cpuid_faulting boot_cpu_has(X86_FEATURE_CPUID_FAULTING) #endif /* __ASM_I386_CPUFEATURE_H */ /* * Local Variables: * mode:c * comment-column:42 * End: */ xen-4.4.0/xen/include/asm-x86/spinlock.h0000664000175000017500000000406712307313555016065 0ustar smbsmb#ifndef __ASM_SPINLOCK_H #define __ASM_SPINLOCK_H #include #include #include typedef struct { volatile s16 lock; } raw_spinlock_t; #define _RAW_SPIN_LOCK_UNLOCKED /*(raw_spinlock_t)*/ { 1 } #define _raw_spin_is_locked(x) ((x)->lock <= 0) static always_inline void _raw_spin_unlock(raw_spinlock_t *lock) { ASSERT(_raw_spin_is_locked(lock)); asm volatile ( "movw $1,%0" : "=m" (lock->lock) : : "memory" ); } static always_inline int _raw_spin_trylock(raw_spinlock_t *lock) { s16 oldval; asm volatile ( "xchgw %w0,%1" :"=r" (oldval), "=m" (lock->lock) :"0" ((s16)0) : "memory" ); return (oldval > 0); } typedef struct { volatile int lock; } raw_rwlock_t; #define RW_WRITE_BIAS 0x7fffffff #define _RAW_RW_LOCK_UNLOCKED /*(raw_rwlock_t)*/ { 0 } static always_inline int _raw_read_trylock(raw_rwlock_t *rw) { int acquired; asm volatile ( " lock; decl %0 \n" " jns 2f \n" #ifdef __clang__ /* clang's builtin assember can't do .subsection */ "1: .pushsection .fixup,\"ax\"\n" #else "1: .subsection 1 \n" #endif "2: lock; incl %0 \n" " decl %1 \n" " jmp 1b \n" #ifdef __clang__ " .popsection \n" #else " .subsection 0 \n" #endif : "=m" (rw->lock), "=r" (acquired) : "1" (1) : "memory" ); return acquired; } static always_inline int _raw_write_trylock(raw_rwlock_t *rw) { return (cmpxchg(&rw->lock, 0, RW_WRITE_BIAS) == 0); } static always_inline void _raw_read_unlock(raw_rwlock_t *rw) { asm volatile ( "lock ; incl %0" : "=m" ((rw)->lock) : : "memory" ); } static always_inline void _raw_write_unlock(raw_rwlock_t *rw) { asm volatile ( "lock ; subl %1,%0" : "=m" ((rw)->lock) : "i" (RW_WRITE_BIAS) : "memory" ); } #define _raw_rw_is_locked(x) ((x)->lock != 0) #define 
_raw_rw_is_write_locked(x) ((x)->lock > 0) #endif /* __ASM_SPINLOCK_H */ xen-4.4.0/xen/include/asm-x86/debugger.h0000664000175000017500000000545612307313555016032 0ustar smbsmb/****************************************************************************** * asm/debugger.h * * Generic hooks into arch-dependent Xen. * * Each debugger should define two functions here: * * 1. debugger_trap_entry(): * Called at start of any synchronous fault or trap, before any other work * is done. The idea is that if your debugger deliberately caused the trap * (e.g. to implement breakpoints or data watchpoints) then you can take * appropriate action and return a non-zero value to cause early exit from * the trap function. * * 2. debugger_trap_fatal(): * Called when Xen is about to give up and crash. Typically you will use this * hook to drop into a debug session. It can also be used to hook off * deliberately caused traps (which you then handle and return non-zero). * * 3. debugger_trap_immediate(): * Called if we want to drop into a debugger now. This is essentially the * same as debugger_trap_fatal, except that we use the current register state * rather than the state which was in effect when we took the trap. * For example: if we're dying because of an unhandled exception, we call * debugger_trap_fatal; if we're dying because of a panic() we call * debugger_trap_immediate(). */ #ifndef __X86_DEBUGGER_H__ #define __X86_DEBUGGER_H__ #include #include #include /* The main trap handlers use these helper macros which include early bail. */ #define DEBUGGER_trap_entry(_v, _r) \ if ( debugger_trap_entry(_v, _r) ) return; #define DEBUGGER_trap_fatal(_v, _r) \ if ( debugger_trap_fatal(_v, _r) ) return; #if defined(CRASH_DEBUG) #include static inline int debugger_trap_fatal( unsigned int vector, struct cpu_user_regs *regs) { int rc = __trap_to_gdb(regs, vector); return ((rc == 0) || (vector == TRAP_int3)); } /* Int3 is a trivial way to gather cpu_user_regs context. */ #define debugger_trap_immediate() __asm__ __volatile__ ( "int3" ); #else static inline int debugger_trap_fatal( unsigned int vector, struct cpu_user_regs *regs) { return 0; } #define debugger_trap_immediate() ((void)0) #endif static inline int debugger_trap_entry( unsigned int vector, struct cpu_user_regs *regs) { struct vcpu *v = current; if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached && ((vector == TRAP_int3) || (vector == TRAP_debug)) ) { if ( vector != TRAP_debug ) /* domain pause is good enough */ current->arch.gdbsx_vcpu_event = vector; domain_pause_for_debugger(); return 1; } return 0; } typedef unsigned long dbgva_t; typedef unsigned char dbgbyte_t; extern int dbg_rw_mem(dbgva_t addr, dbgbyte_t *buf, int len, domid_t domid, int toaddr, uint64_t pgd3); #endif /* __X86_DEBUGGER_H__ */ xen-4.4.0/xen/include/asm-x86/byteorder.h0000664000175000017500000000153612307313555016240 0ustar smbsmb#ifndef __ASM_X86_BYTEORDER_H__ #define __ASM_X86_BYTEORDER_H__ #include #include static inline __attribute_const__ __u32 ___arch__swab32(__u32 x) { asm("bswap %0" : "=r" (x) : "0" (x)); return x; } static inline __attribute_const__ __u64 ___arch__swab64(__u64 val) { union { struct { __u32 a,b; } s; __u64 u; } v; v.u = val; asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b)); return v.u; } /* Do not define swab16. Gcc is smart enough to recognize "C" version and convert it into rotation or exhange. 
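   (Illustrative note, not part of the original comment: the generic C
   fallback is essentially

       static inline __u16 swab16(__u16 x) { return (x << 8) | (x >> 8); }

   which current compilers typically turn into a single rotate or byte-swap
   instruction, so no asm helper is provided here.)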
*/ #define __arch__swab64(x) ___arch__swab64(x) #define __arch__swab32(x) ___arch__swab32(x) #define __BYTEORDER_HAS_U64__ #include #endif /* __ASM_X86_BYTEORDER_H__ */ xen-4.4.0/xen/include/asm-x86/i387.h0000664000175000017500000000202312307313555014723 0ustar smbsmb/* * include/asm-i386/i387.h * * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes , May 2000 */ #ifndef __ASM_I386_I387_H #define __ASM_I386_I387_H #include #include /* Byte offset of the stored word size within the FXSAVE area/portion. */ #define FPU_WORD_SIZE_OFFSET 511 struct ix87_state { struct ix87_env { uint16_t fcw, _res0; uint16_t fsw, _res1; uint16_t ftw, _res2; uint32_t fip; uint16_t fcs; uint16_t fop; uint32_t fdp; uint16_t fds, _res6; } env; struct ix87_reg { uint64_t mantissa; uint16_t exponent:15; uint16_t sign:1; } __attribute__((__packed__)) r[8]; }; void vcpu_restore_fpu_eager(struct vcpu *v); void vcpu_restore_fpu_lazy(struct vcpu *v); void vcpu_save_fpu(struct vcpu *v); int vcpu_init_fpu(struct vcpu *v); void vcpu_destroy_fpu(struct vcpu *v); #endif /* __ASM_I386_I387_H */ xen-4.4.0/xen/include/asm-x86/x86_emulate.h0000664000175000017500000000100412307313555016370 0ustar smbsmb/****************************************************************************** * x86_emulate.h * * Wrapper for generic x86 instruction decoder and emulator. * * Copyright (c) 2008, Citrix Systems, Inc. * * Authors: * Keir Fraser */ #ifndef __ASM_X86_X86_EMULATE_H__ #define __ASM_X86_X86_EMULATE_H__ #include #include #include #include #include "../../arch/x86/x86_emulate/x86_emulate.h" #endif /* __ASM_X86_X86_EMULATE_H__ */ xen-4.4.0/xen/include/asm-x86/regs.h0000664000175000017500000000155712307313555015204 0ustar smbsmb #ifndef __X86_REGS_H__ #define __X86_REGS_H__ #include #define guest_mode(r) \ ({ \ unsigned long diff = (char *)guest_cpu_user_regs() - (char *)(r); \ /* Frame pointer must point into current CPU stack. */ \ ASSERT(diff < STACK_SIZE); \ /* If not a guest frame, it must be a hypervisor frame. */ \ ASSERT((diff == 0) || (!vm86_mode(r) && (r->cs == __HYPERVISOR_CS))); \ /* Return TRUE if it's a guest frame. */ \ (diff == 0); \ }) #define return_reg(v) ((v)->arch.user_regs.eax) #endif /* __X86_REGS_H__ */ xen-4.4.0/xen/include/asm-x86/mem_sharing.h0000664000175000017500000000746112307313555016535 0ustar smbsmb/****************************************************************************** * include/asm-x86/mem_sharing.h * * Memory sharing support. * * Copyright (c) 2009 Citrix Systems, Inc. (Grzegorz Milos) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef __MEM_SHARING_H__ #define __MEM_SHARING_H__ #include #include /* Auditing of memory sharing code? 
*/ #define MEM_SHARING_AUDIT 1 typedef uint64_t shr_handle_t; typedef struct rmap_hashtab { struct list_head *bucket; /* Overlaps with prev pointer of list_head in union below. * Unlike the prev pointer, this can be NULL. */ void *flag; } rmap_hashtab_t; struct page_sharing_info { struct page_info *pg; /* Back pointer to the page. */ shr_handle_t handle; /* Globally unique version / handle. */ #if MEM_SHARING_AUDIT struct list_head entry; /* List of all shared pages (entry). */ struct rcu_head rcu_head; /* List of all shared pages (entry). */ #endif /* Reverse map of tuples for this shared frame. */ union { struct list_head gfns; rmap_hashtab_t hash_table; }; }; #define sharing_supported(_d) \ (is_hvm_domain(_d) && paging_mode_hap(_d)) unsigned int mem_sharing_get_nr_saved_mfns(void); unsigned int mem_sharing_get_nr_shared_mfns(void); int mem_sharing_nominate_page(struct domain *d, unsigned long gfn, int expected_refcnt, shr_handle_t *phandle); #define MEM_SHARING_DESTROY_GFN (1<<1) /* Only fails with -ENOMEM. Enforce it with a BUG_ON wrapper. */ int __mem_sharing_unshare_page(struct domain *d, unsigned long gfn, uint16_t flags); static inline int mem_sharing_unshare_page(struct domain *d, unsigned long gfn, uint16_t flags) { int rc = __mem_sharing_unshare_page(d, gfn, flags); BUG_ON( rc && (rc != -ENOMEM) ); return rc; } /* If called by a foreign domain, possible errors are * -EBUSY -> ring full * -ENOSYS -> no ring to begin with * and the foreign mapper is responsible for retrying. * * If called by the guest vcpu itself and allow_sleep is set, may * sleep on a wait queue, so the caller is responsible for not * holding locks on entry. It may only fail with ENOSYS * * If called by the guest vcpu itself and allow_sleep is not set, * then it's the same as a foreign domain. */ int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn, bool_t allow_sleep); int mem_sharing_sharing_resume(struct domain *d); int mem_sharing_memop(struct domain *d, xen_mem_sharing_op_t *mec); int mem_sharing_domctl(struct domain *d, xen_domctl_mem_sharing_op_t *mec); int mem_sharing_audit(void); void mem_sharing_init(void); /* Scans the p2m and relinquishes any shared pages, destroying * those for which this domain holds the final reference. * Preemptible. */ int relinquish_shared_pages(struct domain *d); #endif /* __MEM_SHARING_H__ */ xen-4.4.0/xen/include/asm-x86/pci.h0000664000175000017500000000055612307313555015015 0ustar smbsmb#ifndef __X86_PCI_H__ #define __X86_PCI_H__ #define IS_SNB_GFX(id) (id == 0x01068086 || id == 0x01168086 \ || id == 0x01268086 || id == 0x01028086 \ || id == 0x01128086 || id == 0x01228086 \ || id == 0x010A8086 ) struct arch_pci_dev { vmask_t used_vectors; }; #endif /* __X86_PCI_H__ */ xen-4.4.0/xen/include/asm-x86/perfc_defn.h0000664000175000017500000001676612307313555016347 0ustar smbsmb/* This file is legitimately included multiple times. 
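   (Editorial sketch, not part of the original comment: the entries below use
   the usual "x-macro" pattern -- each includer supplies its own definitions
   of PERFCOUNTER()/PERFCOUNTER_ARRAY()/PERFSTATUS() before including this
   file, along the lines of

       #define PERFCOUNTER(var, name)  PERFC_ ## var,
       #include <asm/perfc_defn.h>

   to emit enum values, and then includes it again with different definitions
   to emit the matching name-string table; the real wrappers live in the
   generic perfc header, presumably xen/include/xen/perfc.h.)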
*/ /*#ifndef __XEN_PERFC_DEFN_H__*/ /*#define __XEN_PERFC_DEFN_H__*/ PERFCOUNTER_ARRAY(exceptions, "exceptions", 32) #define VMX_PERF_EXIT_REASON_SIZE 56 #define VMX_PERF_VECTOR_SIZE 0x20 PERFCOUNTER_ARRAY(vmexits, "vmexits", VMX_PERF_EXIT_REASON_SIZE) PERFCOUNTER_ARRAY(cause_vector, "cause vector", VMX_PERF_VECTOR_SIZE) #define VMEXIT_NPF_PERFC 141 #define SVM_PERF_EXIT_REASON_SIZE (1+141) PERFCOUNTER_ARRAY(svmexits, "SVMexits", SVM_PERF_EXIT_REASON_SIZE) PERFCOUNTER(seg_fixups, "segmentation fixups") PERFCOUNTER(apic_timer, "apic timer interrupts") PERFCOUNTER(domain_page_tlb_flush, "domain page tlb flushes") PERFCOUNTER(calls_to_mmuext_op, "calls to mmuext_op") PERFCOUNTER(num_mmuext_ops, "mmuext ops") PERFCOUNTER(calls_to_mmu_update, "calls to mmu_update") PERFCOUNTER(num_page_updates, "page updates") PERFCOUNTER(writable_mmu_updates, "mmu_updates of writable pages") PERFCOUNTER(calls_to_update_va, "calls to update_va_map") PERFCOUNTER(page_faults, "page faults") PERFCOUNTER(copy_user_faults, "copy_user faults") PERFCOUNTER(map_domain_page_count, "map_domain_page count") PERFCOUNTER(ptwr_emulations, "writable pt emulations") PERFCOUNTER(exception_fixed, "pre-exception fixed") PERFCOUNTER(guest_walk, "guest pagetable walks") /* Shadow counters */ PERFCOUNTER(shadow_alloc, "calls to shadow_alloc") PERFCOUNTER(shadow_alloc_tlbflush, "shadow_alloc flushed TLBs") /* STATUS counters do not reset when 'P' is hit */ PERFSTATUS(shadow_alloc_count, "number of shadow pages in use") PERFCOUNTER(shadow_free, "calls to shadow_free") PERFCOUNTER(shadow_prealloc_1, "shadow recycles old shadows") PERFCOUNTER(shadow_prealloc_2, "shadow recycles in-use shadows") PERFCOUNTER(shadow_linear_map_failed, "shadow hit read-only linear map") PERFCOUNTER(shadow_a_update, "shadow A bit update") PERFCOUNTER(shadow_ad_update, "shadow A&D bit update") PERFCOUNTER(shadow_fault, "calls to shadow_fault") PERFCOUNTER(shadow_fault_fast_gnp, "shadow_fault fast path n/p") PERFCOUNTER(shadow_fault_fast_mmio, "shadow_fault fast path mmio") PERFCOUNTER(shadow_fault_fast_fail, "shadow_fault fast path error") PERFCOUNTER(shadow_fault_bail_bad_gfn, "shadow_fault guest bad gfn") PERFCOUNTER(shadow_fault_bail_real_fault, "shadow_fault really guest fault") PERFCOUNTER(shadow_fault_emulate_read, "shadow_fault emulates a read") PERFCOUNTER(shadow_fault_emulate_write, "shadow_fault emulates a write") PERFCOUNTER(shadow_fault_emulate_failed, "shadow_fault emulator fails") PERFCOUNTER(shadow_fault_emulate_stack, "shadow_fault emulate stack write") PERFCOUNTER(shadow_fault_emulate_wp, "shadow_fault emulate for CR0.WP=0") PERFCOUNTER(shadow_fault_fast_emulate, "shadow_fault fast emulate") PERFCOUNTER(shadow_fault_fast_emulate_fail, "shadow_fault fast emulate failed") PERFCOUNTER(shadow_fault_mmio, "shadow_fault handled as mmio") PERFCOUNTER(shadow_fault_fixed, "shadow_fault fixed fault") PERFCOUNTER(shadow_ptwr_emulate, "shadow causes ptwr to emulate") PERFCOUNTER(shadow_validate_gl1e_calls, "calls to shadow_validate_gl1e") PERFCOUNTER(shadow_validate_gl2e_calls, "calls to shadow_validate_gl2e") PERFCOUNTER(shadow_validate_gl3e_calls, "calls to shadow_validate_gl3e") PERFCOUNTER(shadow_validate_gl4e_calls, "calls to shadow_validate_gl4e") PERFCOUNTER(shadow_hash_lookups, "calls to shadow_hash_lookup") PERFCOUNTER(shadow_hash_lookup_head, "shadow hash hit in bucket head") PERFCOUNTER(shadow_hash_lookup_miss, "shadow hash misses") PERFCOUNTER(shadow_get_shadow_status, "calls to get_shadow_status") PERFCOUNTER(shadow_hash_inserts, "calls to 
shadow_hash_insert") PERFCOUNTER(shadow_hash_deletes, "calls to shadow_hash_delete") PERFCOUNTER(shadow_writeable, "shadow removes write access") PERFCOUNTER(shadow_writeable_h_1, "shadow writeable: 32b w2k3") PERFCOUNTER(shadow_writeable_h_2, "shadow writeable: 32pae w2k3") PERFCOUNTER(shadow_writeable_h_3, "shadow writeable: 64b w2k3") PERFCOUNTER(shadow_writeable_h_4, "shadow writeable: linux low/solaris") PERFCOUNTER(shadow_writeable_h_5, "shadow writeable: linux high") PERFCOUNTER(shadow_writeable_h_6, "shadow writeable: FreeBSD") PERFCOUNTER(shadow_writeable_h_7, "shadow writeable: sl1p") PERFCOUNTER(shadow_writeable_h_8, "shadow writeable: sl1p failed") PERFCOUNTER(shadow_writeable_bf, "shadow writeable brute-force") PERFCOUNTER(shadow_writeable_bf_1, "shadow writeable resync bf") PERFCOUNTER(shadow_mappings, "shadow removes all mappings") PERFCOUNTER(shadow_mappings_bf, "shadow rm-mappings brute-force") PERFCOUNTER(shadow_early_unshadow, "shadow unshadows for fork/exit") PERFCOUNTER(shadow_unshadow, "shadow unshadows a page") PERFCOUNTER(shadow_up_pointer, "shadow unshadow by up-pointer") PERFCOUNTER(shadow_unshadow_bf, "shadow unshadow brute-force") PERFCOUNTER(shadow_get_page_fail, "shadow_get_page_from_l1e failed") PERFCOUNTER(shadow_check_gwalk, "shadow checks gwalk") PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk") PERFCOUNTER(shadow_rm_write_flush_tlb, "shadow flush tlb by removing write perm") PERFCOUNTER(shadow_invlpg, "shadow emulates invlpg") PERFCOUNTER(shadow_invlpg_fault, "shadow invlpg faults") PERFCOUNTER(shadow_em_ex_pt, "shadow extra pt write") PERFCOUNTER(shadow_em_ex_non_pt, "shadow extra non-pt-write op") PERFCOUNTER(shadow_em_ex_fail, "shadow extra emulation failed") PERFCOUNTER(shadow_oos_fixup_add, "shadow OOS fixup adds") PERFCOUNTER(shadow_oos_fixup_evict,"shadow OOS fixup evictions") PERFCOUNTER(shadow_unsync, "shadow OOS unsyncs") PERFCOUNTER(shadow_unsync_evict, "shadow OOS evictions") PERFCOUNTER(shadow_resync, "shadow OOS resyncs") PERFCOUNTER(mshv_call_sw_addr_space, "MS Hv Switch Address Space") PERFCOUNTER(mshv_call_flush_tlb_list, "MS Hv Flush TLB list") PERFCOUNTER(mshv_call_flush_tlb_all, "MS Hv Flush TLB all") PERFCOUNTER(mshv_call_long_wait, "MS Hv Notify long wait") PERFCOUNTER(mshv_rdmsr_osid, "MS Hv rdmsr Guest OS ID") PERFCOUNTER(mshv_rdmsr_hc_page, "MS Hv rdmsr hypercall page") PERFCOUNTER(mshv_rdmsr_vp_index, "MS Hv rdmsr vp index") PERFCOUNTER(mshv_rdmsr_tsc_frequency, "MS Hv rdmsr TSC frequency") PERFCOUNTER(mshv_rdmsr_apic_frequency, "MS Hv rdmsr APIC frequency") PERFCOUNTER(mshv_rdmsr_icr, "MS Hv rdmsr icr") PERFCOUNTER(mshv_rdmsr_tpr, "MS Hv rdmsr tpr") PERFCOUNTER(mshv_rdmsr_apic_assist, "MS Hv rdmsr APIC assist") PERFCOUNTER(mshv_rdmsr_apic_msr, "MS Hv rdmsr APIC msr") PERFCOUNTER(mshv_wrmsr_osid, "MS Hv wrmsr Guest OS ID") PERFCOUNTER(mshv_wrmsr_hc_page, "MS Hv wrmsr hypercall page") PERFCOUNTER(mshv_wrmsr_vp_index, "MS Hv wrmsr vp index") PERFCOUNTER(mshv_wrmsr_icr, "MS Hv wrmsr icr") PERFCOUNTER(mshv_wrmsr_tpr, "MS Hv wrmsr tpr") PERFCOUNTER(mshv_wrmsr_eoi, "MS Hv wrmsr eoi") PERFCOUNTER(mshv_wrmsr_apic_assist, "MS Hv wrmsr APIC assist") PERFCOUNTER(mshv_wrmsr_apic_msr, "MS Hv wrmsr APIC msr") PERFCOUNTER(realmode_emulations, "realmode instructions emulated") PERFCOUNTER(realmode_exits, "vmexits from realmode") PERFCOUNTER(pauseloop_exits, "vmexits from Pause-Loop Detection") /*#endif*/ /* __XEN_PERFC_DEFN_H__ */ xen-4.4.0/xen/include/asm-x86/multicall.h0000664000175000017500000000740512307313555016230 
0ustar smbsmb/****************************************************************************** * asm-x86/multicall.h */ #ifndef __ASM_X86_MULTICALL_H__ #define __ASM_X86_MULTICALL_H__ #include #define do_multicall_call(_call) \ do { \ __asm__ __volatile__ ( \ " movq %c1(%0),%%rax; " \ " leaq hypercall_table(%%rip),%%rdi; " \ " cmpq $("STR(NR_hypercalls)"),%%rax; " \ " jae 2f; " \ " movq (%%rdi,%%rax,8),%%rax; " \ " movq %c2+0*%c3(%0),%%rdi; " \ " movq %c2+1*%c3(%0),%%rsi; " \ " movq %c2+2*%c3(%0),%%rdx; " \ " movq %c2+3*%c3(%0),%%rcx; " \ " movq %c2+4*%c3(%0),%%r8; " \ " movq %c2+5*%c3(%0),%%r9; " \ " callq *%%rax; " \ "1: movq %%rax,%c4(%0)\n" \ ".section .fixup,\"ax\"\n" \ "2: movq $-"STR(ENOSYS)",%%rax\n" \ " jmp 1b\n" \ ".previous\n" \ : \ : "b" (_call), \ "i" (offsetof(__typeof__(*_call), op)), \ "i" (offsetof(__typeof__(*_call), args)), \ "i" (sizeof(*(_call)->args)), \ "i" (offsetof(__typeof__(*_call), result)) \ /* all the caller-saves registers */ \ : "rax", "rcx", "rdx", "rsi", "rdi", \ "r8", "r9", "r10", "r11" ); \ } while ( 0 ) #define compat_multicall_call(_call) \ __asm__ __volatile__ ( \ " movl %c1(%0),%%eax; " \ " leaq compat_hypercall_table(%%rip),%%rdi; "\ " cmpl $("STR(NR_hypercalls)"),%%eax; " \ " jae 2f; " \ " movq (%%rdi,%%rax,8),%%rax; " \ " movl %c2+0*%c3(%0),%%edi; " \ " movl %c2+1*%c3(%0),%%esi; " \ " movl %c2+2*%c3(%0),%%edx; " \ " movl %c2+3*%c3(%0),%%ecx; " \ " movl %c2+4*%c3(%0),%%r8d; " \ " movl %c2+5*%c3(%0),%%r9d; " \ " callq *%%rax; " \ "1: movl %%eax,%c4(%0)\n" \ ".section .fixup,\"ax\"\n" \ "2: movl $-"STR(ENOSYS)",%%eax\n" \ " jmp 1b\n" \ ".previous\n" \ : \ : "b" (_call), \ "i" (offsetof(__typeof__(*_call), op)), \ "i" (offsetof(__typeof__(*_call), args)), \ "i" (sizeof(*(_call)->args)), \ "i" (offsetof(__typeof__(*_call), result)) \ /* all the caller-saves registers */ \ : "rax", "rcx", "rdx", "rsi", "rdi", \ "r8", "r9", "r10", "r11" ) \ #endif /* __ASM_X86_MULTICALL_H__ */ xen-4.4.0/xen/include/asm-x86/grant_table.h0000664000175000017500000000513512307313555016522 0ustar smbsmb/****************************************************************************** * include/asm-x86/grant_table.h * * Copyright (c) 2004-2005 K A Fraser */ #ifndef __ASM_GRANT_TABLE_H__ #define __ASM_GRANT_TABLE_H__ #define INITIAL_NR_GRANT_FRAMES 4 /* * Caller must own caller's BIGLOCK, is responsible for flushing the TLB, and * must hold a reference to the page. */ int create_grant_host_mapping(uint64_t addr, unsigned long frame, unsigned int flags, unsigned int cache_flags); int replace_grant_host_mapping( uint64_t addr, unsigned long frame, uint64_t new_addr, unsigned int flags); #define gnttab_create_shared_page(d, t, i) \ do { \ share_xen_page_with_guest( \ virt_to_page((char *)(t)->shared_raw[i]), \ (d), XENSHARE_writable); \ } while ( 0 ) #define gnttab_create_status_page(d, t, i) \ do { \ share_xen_page_with_guest( \ virt_to_page((char *)(t)->status[i]), \ (d), XENSHARE_writable); \ } while ( 0 ) #define gnttab_shared_mfn(d, t, i) \ ((virt_to_maddr((t)->shared_raw[i]) >> PAGE_SHIFT)) #define gnttab_shared_gmfn(d, t, i) \ (mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i))) #define gnttab_status_mfn(t, i) \ ((virt_to_maddr((t)->status[i]) >> PAGE_SHIFT)) #define gnttab_status_gmfn(d, t, i) \ (mfn_to_gmfn(d, gnttab_status_mfn(t, i))) #define gnttab_mark_dirty(d, f) paging_mark_dirty((d), (f)) static inline void gnttab_clear_flag(unsigned int nr, uint16_t *st) { /* * Note that this cannot be clear_bit(), as the access must be * confined to the specified 2 bytes. 
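 * (Illustrative note, not part of the original comment: grant status entries
 *  are an array of 16-bit words, one per grant, and a generic clear_bit() is
 *  not guaranteed to confine its read-modify-write to those 2 bytes, so it
 *  could disturb a neighbouring grant's status word; hence the explicit
 *  word-sized "lock btrw" below.)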
*/ asm volatile ("lock btrw %w1,%0" : "=m" (*st) : "Ir" (nr), "m" (*st)); } /* Foreign mappings of HHVM-guest pages do not modify the type count. */ #define gnttab_host_mapping_get_page_type(op, ld, rd) \ (!((op)->flags & GNTMAP_readonly) && \ (((ld) == (rd)) || !paging_mode_external(rd))) /* Done implicitly when page tables are destroyed. */ #define gnttab_release_host_mappings(domain) ( paging_mode_external(domain) ) static inline int replace_grant_supported(void) { return 1; } #endif /* __ASM_GRANT_TABLE_H__ */ xen-4.4.0/xen/include/asm-x86/compat.h0000664000175000017500000000030312307313555015513 0ustar smbsmb/****************************************************************************** * compat.h */ #define COMPAT_BITS_PER_LONG 32 typedef uint32_t compat_ptr_t; typedef unsigned long full_ptr_t; xen-4.4.0/xen/include/asm-x86/shared.h0000664000175000017500000000452212307313555015505 0ustar smbsmb#ifndef __XEN_X86_SHARED_H__ #define __XEN_X86_SHARED_H__ #define nmi_reason(d) (!has_32bit_shinfo(d) ? \ (u32 *)&(d)->shared_info->native.arch.nmi_reason : \ (u32 *)&(d)->shared_info->compat.arch.nmi_reason) #define GET_SET_SHARED(type, field) \ static inline type arch_get_##field(const struct domain *d) \ { \ return !has_32bit_shinfo(d) ? \ d->shared_info->native.arch.field : \ d->shared_info->compat.arch.field; \ } \ static inline void arch_set_##field(struct domain *d, \ type val) \ { \ if ( !has_32bit_shinfo(d) ) \ d->shared_info->native.arch.field = val; \ else \ d->shared_info->compat.arch.field = val; \ } #define GET_SET_VCPU(type, field) \ static inline type arch_get_##field(const struct vcpu *v) \ { \ return !has_32bit_shinfo(v->domain) ? \ v->vcpu_info->native.arch.field : \ v->vcpu_info->compat.arch.field; \ } \ static inline void arch_set_##field(struct vcpu *v, \ type val) \ { \ if ( !has_32bit_shinfo(v->domain) ) \ v->vcpu_info->native.arch.field = val; \ else \ v->vcpu_info->compat.arch.field = val; \ } GET_SET_SHARED(unsigned long, max_pfn) GET_SET_SHARED(xen_pfn_t, pfn_to_mfn_frame_list_list) GET_SET_SHARED(unsigned long, nmi_reason) GET_SET_VCPU(unsigned long, cr2) #undef GET_SET_VCPU #undef GET_SET_SHARED #endif /* __XEN_X86_SHARED_H__ */ xen-4.4.0/xen/include/asm-x86/machine_kexec.h0000664000175000017500000000065012307313555017020 0ustar smbsmb#ifndef __X86_MACHINE_KEXEC_H__ #define __X86_MACHINE_KEXEC_H__ #define KEXEC_RELOC_FLAG_COMPAT 0x1 /* 32-bit image */ #ifndef __ASSEMBLY__ extern void kexec_reloc(unsigned long reloc_code, unsigned long reloc_pt, unsigned long ind_maddr, unsigned long entry_maddr, unsigned long flags); extern unsigned int kexec_reloc_size; #endif #endif /* __X86_MACHINE_KEXEC_H__ */ xen-4.4.0/xen/include/asm-x86/ldt.h0000664000175000017500000000140612307313555015020 0ustar smbsmb #ifndef __ARCH_LDT_H #define __ARCH_LDT_H #ifndef __ASSEMBLY__ static inline void load_LDT(struct vcpu *v) { struct desc_struct *desc; unsigned long ents; if ( (ents = v->arch.pv_vcpu.ldt_ents) == 0 ) { __asm__ __volatile__ ( "lldt %%ax" : : "a" (0) ); } else { desc = (!is_pv_32on64_vcpu(v) ? 
this_cpu(gdt_table) : this_cpu(compat_gdt_table)) + LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY; _set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2); __asm__ __volatile__ ( "lldt %%ax" : : "a" (LDT_ENTRY << 3) ); } } #endif /* !__ASSEMBLY__ */ #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/random.h0000664000175000017500000000050712307313555015516 0ustar smbsmb#ifndef __ASM_RANDOM_H__ #define __ASM_RANDOM_H__ #include static inline unsigned int arch_get_random(void) { unsigned int val = 0; if ( cpu_has(¤t_cpu_data, X86_FEATURE_RDRAND) ) asm volatile ( ".byte 0x0f,0xc7,0xf0" : "+a" (val) ); return val; } #endif /* __ASM_RANDOM_H__ */ xen-4.4.0/xen/include/asm-x86/msi.h0000664000175000017500000002023512307313555015026 0ustar smbsmb#ifndef __ASM_MSI_H #define __ASM_MSI_H #include #include #include /* * Constants for Intel APIC based MSI messages. */ /* * Shifts for MSI data */ #define MSI_DATA_VECTOR_SHIFT 0 #define MSI_DATA_VECTOR_MASK 0x000000ff #define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK) #define MSI_DATA_DELIVERY_MODE_SHIFT 8 #define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT) #define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT) #define MSI_DATA_DELIVERY_MODE_MASK 0x00000700 #define MSI_DATA_LEVEL_SHIFT 14 #define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT) #define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT) #define MSI_DATA_TRIGGER_SHIFT 15 #define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT) #define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT) #define MSI_DATA_TRIGGER_MASK 0x00008000 /* * Shift/mask fields for msi address */ #define MSI_ADDR_BASE_HI 0 #define MSI_ADDR_BASE_LO 0xfee00000 #define MSI_ADDR_HEADER MSI_ADDR_BASE_LO #define MSI_ADDR_DESTMODE_SHIFT 2 #define MSI_ADDR_DESTMODE_PHYS (0 << MSI_ADDR_DESTMODE_SHIFT) #define MSI_ADDR_DESTMODE_LOGIC (1 << MSI_ADDR_DESTMODE_SHIFT) #define MSI_ADDR_DESTMODE_MASK 0x4 #define MSI_ADDR_REDIRECTION_SHIFT 3 #define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT) #define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT) #define MSI_ADDR_DEST_ID_SHIFT 12 #define MSI_ADDR_DEST_ID_MASK 0x00ffff0 #define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & MSI_ADDR_DEST_ID_MASK) /* MAX fixed pages reserved for mapping MSIX tables. 
*/ #define FIX_MSIX_MAX_PAGES 512 struct msi_info { u16 seg; u8 bus; u8 devfn; int irq; int entry_nr; uint64_t table_base; }; struct msi_msg { u32 address_lo; /* low 32 bits of msi message address */ u32 address_hi; /* high 32 bits of msi message address */ u32 data; /* 16 bits of msi message data */ u32 dest32; /* used when Interrupt Remapping with EIM is enabled */ }; struct irq_desc; struct hw_interrupt_type; struct msi_desc; /* Helper functions */ extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc); extern void pci_disable_msi(struct msi_desc *desc); extern int pci_prepare_msix(u16 seg, u8 bus, u8 devfn, bool_t off); extern void pci_cleanup_msi(struct pci_dev *pdev); extern int setup_msi_irq(struct irq_desc *, struct msi_desc *); extern int __setup_msi_irq(struct irq_desc *, struct msi_desc *, const struct hw_interrupt_type *); extern void teardown_msi_irq(int irq); extern int msi_free_vector(struct msi_desc *entry); extern int pci_restore_msi_state(struct pci_dev *pdev); extern unsigned int pci_msix_get_table_len(struct pci_dev *pdev); struct msi_desc { struct msi_attrib { __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */ __u8 maskbit : 1; /* mask-pending bit supported ? */ __u8 masked : 1; __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ __u8 pos; /* Location of the msi capability */ __u16 entry_nr; /* specific enabled entry */ } msi_attrib; struct list_head list; union { void __iomem *mask_base;/* va for the entry in mask table */ struct { unsigned int nvec;/* number of vectors */ unsigned int mpos;/* location of mask register */ } msi; unsigned int hpet_id; /* HPET (dev is NULL) */ }; struct pci_dev *dev; int irq; struct msi_msg msg; /* Last set MSI message */ int remap_index; /* index in interrupt remapping table */ }; /* * Values stored into msi_desc.msi_attrib.pos for non-PCI devices * (msi_desc.msi_attrib.type is zero): */ #define MSI_TYPE_UNKNOWN 0 #define MSI_TYPE_HPET 1 #define MSI_TYPE_IOMMU 2 int msi_maskable_irq(const struct msi_desc *); int msi_free_irq(struct msi_desc *entry); /* * Assume the maximum number of hot plug slots supported by the system is about * ten. The worstcase is that each of these slots is hot-added with a device, * which has two MSI/MSI-X capable functions. To avoid any MSI-X driver, which * attempts to request all available vectors, NR_HP_RESERVED_VECTORS is defined * as below to ensure at least one message is assigned to each detected MSI/ * MSI-X device function. */ #define NR_HP_RESERVED_VECTORS 20 #define msi_control_reg(base) (base + PCI_MSI_FLAGS) #define msi_lower_address_reg(base) (base + PCI_MSI_ADDRESS_LO) #define msi_upper_address_reg(base) (base + PCI_MSI_ADDRESS_HI) #define msi_data_reg(base, is64bit) \ ( (is64bit == 1) ? base+PCI_MSI_DATA_64 : base+PCI_MSI_DATA_32 ) #define msi_mask_bits_reg(base, is64bit) \ ( (is64bit == 1) ? 
base+PCI_MSI_MASK_BIT : base+PCI_MSI_MASK_BIT-4) #define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE #define multi_msi_capable(control) \ (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1)) #define multi_msi_enable(control, num) \ control |= (((fls(num) - 1) << 4) & PCI_MSI_FLAGS_QSIZE); #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) #define msi_enable(control, num) multi_msi_enable(control, num); \ control |= PCI_MSI_FLAGS_ENABLE #define msix_control_reg(base) (base + PCI_MSIX_FLAGS) #define msix_table_offset_reg(base) (base + PCI_MSIX_TABLE) #define msix_pba_offset_reg(base) (base + PCI_MSIX_PBA) #define msix_enable(control) control |= PCI_MSIX_FLAGS_ENABLE #define msix_disable(control) control &= ~PCI_MSIX_FLAGS_ENABLE #define msix_table_size(control) ((control & PCI_MSIX_FLAGS_QSIZE)+1) #define multi_msix_capable msix_table_size #define msix_unmask(address) (address & ~PCI_MSIX_VECTOR_BITMASK) #define msix_mask(address) (address | PCI_MSIX_VECTOR_BITMASK) /* * MSI Defined Data Structures */ struct msg_data { #if defined(__LITTLE_ENDIAN_BITFIELD) __u32 vector : 8; __u32 delivery_mode : 3; /* 000b: FIXED | 001b: lowest prior */ __u32 reserved_1 : 3; __u32 level : 1; /* 0: deassert | 1: assert */ __u32 trigger : 1; /* 0: edge | 1: level */ __u32 reserved_2 : 16; #elif defined(__BIG_ENDIAN_BITFIELD) __u32 reserved_2 : 16; __u32 trigger : 1; /* 0: edge | 1: level */ __u32 level : 1; /* 0: deassert | 1: assert */ __u32 reserved_1 : 3; __u32 delivery_mode : 3; /* 000b: FIXED | 001b: lowest prior */ __u32 vector : 8; #else #error "Bitfield endianness not defined! Check your byteorder.h" #endif } __attribute__ ((packed)); struct msg_address { union { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) __u32 reserved_1 : 2; __u32 dest_mode : 1; /*0:physic | 1:logic */ __u32 redirection_hint: 1; /*0: dedicated CPU 1: lowest priority */ __u32 reserved_2 : 4; __u32 dest_id : 24; /* Destination ID */ #elif defined(__BIG_ENDIAN_BITFIELD) __u32 dest_id : 24; /* Destination ID */ __u32 reserved_2 : 4; __u32 redirection_hint: 1; /*0: dedicated CPU 1: lowest priority */ __u32 dest_mode : 1; /*0:physic | 1:logic */ __u32 reserved_1 : 2; #else #error "Bitfield endianness not defined! 
Check your byteorder.h" #endif }u; __u32 value; }lo_address; __u32 hi_address; } __attribute__ ((packed)); #define MAX_MSIX_TABLE_ENTRIES (PCI_MSIX_FLAGS_QSIZE + 1) #define MAX_MSIX_TABLE_PAGES PFN_UP(MAX_MSIX_TABLE_ENTRIES * \ PCI_MSIX_ENTRY_SIZE + \ (~PCI_MSIX_BIRMASK & (PAGE_SIZE - 1))) struct arch_msix { unsigned int nr_entries, used_entries; struct { unsigned long first, last; } table, pba; int table_refcnt[MAX_MSIX_TABLE_PAGES]; int table_idx[MAX_MSIX_TABLE_PAGES]; spinlock_t table_lock; domid_t warned; }; void early_msi_init(void); void msi_compose_msg(unsigned vector, const cpumask_t *mask, struct msi_msg *msg); void __msi_set_enable(u16 seg, u8 bus, u8 slot, u8 func, int pos, int enable); void mask_msi_irq(struct irq_desc *); void unmask_msi_irq(struct irq_desc *); void ack_nonmaskable_msi_irq(struct irq_desc *); void end_nonmaskable_msi_irq(struct irq_desc *, u8 vector); void set_msi_affinity(struct irq_desc *, const cpumask_t *); #endif /* __ASM_MSI_H */ xen-4.4.0/xen/include/asm-x86/numa.h0000664000175000017500000000443112307313555015176 0ustar smbsmb#ifndef _ASM_X8664_NUMA_H #define _ASM_X8664_NUMA_H 1 #include #define NODES_SHIFT 6 extern int srat_rev; extern unsigned char cpu_to_node[]; extern cpumask_t node_to_cpumask[]; #define cpu_to_node(cpu) (cpu_to_node[cpu]) #define parent_node(node) (node) #define node_to_first_cpu(node) (__ffs(node_to_cpumask[node])) #define node_to_cpumask(node) (node_to_cpumask[node]) struct node { u64 start,end; }; extern int compute_hash_shift(struct node *nodes, int numnodes, int *nodeids); extern int pxm_to_node(int nid); #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) #define VIRTUAL_BUG_ON(x) extern void numa_add_cpu(int cpu); extern void numa_init_array(void); extern int numa_off; extern int srat_disabled(void); extern void numa_set_node(int cpu, int node); extern int setup_node(int pxm); extern void srat_detect_node(int cpu); extern void setup_node_bootmem(int nodeid, u64 start, u64 end); extern unsigned char apicid_to_node[]; #ifdef CONFIG_NUMA extern void init_cpu_to_node(void); static inline void clear_node_cpumask(int cpu) { cpumask_clear_cpu(cpu, &node_to_cpumask[cpu_to_node(cpu)]); } /* Simple perfect hash to map pdx to node numbers */ extern int memnode_shift; extern unsigned long memnodemapsize; extern u8 *memnodemap; struct node_data { unsigned long node_start_pfn; unsigned long node_spanned_pages; unsigned int node_id; }; extern struct node_data node_data[]; static inline __attribute__((pure)) int phys_to_nid(paddr_t addr) { unsigned nid; VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize); nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); return nid; } #define NODE_DATA(nid) (&(node_data[nid])) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) extern int valid_numa_range(u64 start, u64 end, int node); #else #define init_cpu_to_node() do {} while (0) #define clear_node_cpumask(cpu) do {} while (0) #define valid_numa_range(start, end, node) 1 #endif void srat_parse_regions(u64 addr); extern int __node_distance(int a, int b); #endif xen-4.4.0/xen/include/asm-x86/desc.h0000664000175000017500000001716412307313555015163 0ustar smbsmb#ifndef __ARCH_DESC_H #define __ARCH_DESC_H /* * Xen reserves a memory page of GDT entries. 
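 * (Worked example, assuming the usual 4 KiB x86 PAGE_SIZE: one reserved
 * page holds PAGE_SIZE / 8 = 512 eight-byte descriptors, which is the value
 * the NR_RESERVED_GDT_ENTRIES macro below evaluates to.)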
* No guest GDT entries exist beyond the Xen reserved area. */ #define NR_RESERVED_GDT_PAGES 1 #define NR_RESERVED_GDT_BYTES (NR_RESERVED_GDT_PAGES * PAGE_SIZE) #define NR_RESERVED_GDT_ENTRIES (NR_RESERVED_GDT_BYTES / 8) #define LAST_RESERVED_GDT_PAGE \ (FIRST_RESERVED_GDT_PAGE + NR_RESERVED_GDT_PAGES - 1) #define LAST_RESERVED_GDT_BYTE \ (FIRST_RESERVED_GDT_BYTE + NR_RESERVED_GDT_BYTES - 1) #define LAST_RESERVED_GDT_ENTRY \ (FIRST_RESERVED_GDT_ENTRY + NR_RESERVED_GDT_ENTRIES - 1) #define LDT_ENTRY_SIZE 8 #define FLAT_COMPAT_RING1_CS 0xe019 /* GDT index 259 */ #define FLAT_COMPAT_RING1_DS 0xe021 /* GDT index 260 */ #define FLAT_COMPAT_RING1_SS 0xe021 /* GDT index 260 */ #define FLAT_COMPAT_RING3_CS 0xe02b /* GDT index 261 */ #define FLAT_COMPAT_RING3_DS 0xe033 /* GDT index 262 */ #define FLAT_COMPAT_RING3_SS 0xe033 /* GDT index 262 */ #define FLAT_COMPAT_KERNEL_DS FLAT_COMPAT_RING1_DS #define FLAT_COMPAT_KERNEL_CS FLAT_COMPAT_RING1_CS #define FLAT_COMPAT_KERNEL_SS FLAT_COMPAT_RING1_SS #define FLAT_COMPAT_USER_DS FLAT_COMPAT_RING3_DS #define FLAT_COMPAT_USER_CS FLAT_COMPAT_RING3_CS #define FLAT_COMPAT_USER_SS FLAT_COMPAT_RING3_SS #define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8) #define LDT_ENTRY (TSS_ENTRY + 2) #define PER_CPU_GDT_ENTRY (LDT_ENTRY + 2) #ifndef __ASSEMBLY__ #define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3) /* Fix up the RPL of a guest segment selector. */ #define __fixup_guest_selector(d, sel) \ ({ \ uint16_t _rpl = GUEST_KERNEL_RPL(d); \ (sel) = (((sel) & 3) >= _rpl) ? (sel) : (((sel) & ~3) | _rpl); \ }) /* Stack selectors don't need fixing up if the kernel runs in ring 0. */ #ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL #define fixup_guest_stack_selector(d, ss) ((void)0) #else #define fixup_guest_stack_selector(d, ss) __fixup_guest_selector(d, ss) #endif /* * Code selectors are always fixed up. It allows the Xen exit stub to detect * return to guest context, even when the guest kernel runs in ring 0. */ #define fixup_guest_code_selector(d, cs) __fixup_guest_selector(d, cs) /* * We need this function because enforcing the correct guest kernel RPL is * insufficient if the selector is poked into an interrupt, trap or call gate. * The selector RPL is ignored when a gate is accessed. We must therefore make * sure that the selector does not reference a Xen-private segment. * * Note that selectors used only by IRET do not need to be checked. If the * descriptor DPL differs from CS RPL then we'll #GP. * * Stack and data selectors do not need to be checked. If DS, ES, FS, GS are * DPL < CPL then they'll be cleared automatically. If SS RPL or DPL differs * from CS RPL then we'll #GP. */ #define guest_gate_selector_okay(d, sel) \ ((((sel)>>3) < FIRST_RESERVED_GDT_ENTRY) || /* Guest seg? */ \ ((sel) == (!is_pv_32on64_domain(d) ? \ FLAT_KERNEL_CS : /* Xen default seg? */ \ FLAT_COMPAT_KERNEL_CS)) || \ ((sel) & 4)) /* LDT seg? */ #endif /* __ASSEMBLY__ */ /* These are bitmasks for the high 32 bits of a descriptor table entry.
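 * As a sketch of how they combine (illustrative only; d stands for any
 * struct desc_struct, defined further down in this header): a present,
 * non-system, DPL-3 code segment satisfies
 *   (d.b & (_SEGMENT_P | _SEGMENT_S | _SEGMENT_CODE | _SEGMENT_DPL)) ==
 *       (_SEGMENT_P | _SEGMENT_S | _SEGMENT_CODE | _SEGMENT_DPL)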
*/ #define _SEGMENT_TYPE (15<< 8) #define _SEGMENT_WR ( 1<< 9) /* Writeable (data) or Readable (code) segment */ #define _SEGMENT_EC ( 1<<10) /* Expand-down or Conforming segment */ #define _SEGMENT_CODE ( 1<<11) /* Code (vs data) segment for non-system segments */ #define _SEGMENT_S ( 1<<12) /* System descriptor (yes iff S==0) */ #define _SEGMENT_DPL ( 3<<13) /* Descriptor Privilege Level */ #define _SEGMENT_P ( 1<<15) /* Segment Present */ #define _SEGMENT_L ( 1<<21) /* 64-bit segment */ #define _SEGMENT_DB ( 1<<22) /* 16- or 32-bit segment */ #define _SEGMENT_G ( 1<<23) /* Granularity */ #ifndef __ASSEMBLY__ struct desc_struct { u32 a, b; }; typedef struct { u64 a, b; } idt_entry_t; /* Write the lower 64 bits of an IDT Entry. This relies on the upper 32 * bits of the address not changing, which is a safe assumption as all * functions we are likely to load will live inside the 1GB * code/data/bss address range. * * Ideally, we would use cmpxchg16b, but this is not supported on some * old AMD 64bit capable processors, and has no safe equivalent. */ static inline void _write_gate_lower(volatile idt_entry_t *gate, const idt_entry_t *new) { ASSERT(gate->b == new->b); gate->a = new->a; } #define _set_gate(gate_addr,type,dpl,addr) \ do { \ (gate_addr)->a = 0; \ wmb(); /* disable gate /then/ rewrite */ \ (gate_addr)->b = \ ((unsigned long)(addr) >> 32); \ wmb(); /* rewrite /then/ enable gate */ \ (gate_addr)->a = \ (((unsigned long)(addr) & 0xFFFF0000UL) << 32) | \ ((unsigned long)(dpl) << 45) | \ ((unsigned long)(type) << 40) | \ ((unsigned long)(addr) & 0xFFFFUL) | \ ((unsigned long)__HYPERVISOR_CS64 << 16) | \ (1UL << 47); \ } while (0) static inline void _set_gate_lower(idt_entry_t *gate, unsigned long type, unsigned long dpl, void *addr) { idt_entry_t idte; idte.b = gate->b; idte.a = (((unsigned long)(addr) & 0xFFFF0000UL) << 32) | ((unsigned long)(dpl) << 45) | ((unsigned long)(type) << 40) | ((unsigned long)(addr) & 0xFFFFUL) | ((unsigned long)__HYPERVISOR_CS64 << 16) | (1UL << 47); _write_gate_lower(gate, &idte); } /* Update the lower half handler of an IDT Entry, without changing any * other configuration. */ static inline void _update_gate_addr_lower(idt_entry_t *gate, void *addr) { idt_entry_t idte; idte.a = gate->a; idte.b = ((unsigned long)(addr) >> 32); idte.a &= 0x0000FFFFFFFF0000ULL; idte.a |= (((unsigned long)(addr) & 0xFFFF0000UL) << 32) | ((unsigned long)(addr) & 0xFFFFUL); _write_gate_lower(gate, &idte); } #define _set_tssldt_desc(desc,addr,limit,type) \ do { \ (desc)[0].b = (desc)[1].b = 0; \ wmb(); /* disable entry /then/ rewrite */ \ (desc)[0].a = \ ((u32)(addr) << 16) | ((u32)(limit) & 0xFFFF); \ (desc)[1].a = (u32)(((unsigned long)(addr)) >> 32); \ wmb(); /* rewrite /then/ enable entry */ \ (desc)[0].b = \ ((u32)(addr) & 0xFF000000U) | \ ((u32)(type) << 8) | 0x8000U | \ (((u32)(addr) & 0x00FF0000U) >> 16); \ } while (0) struct desc_ptr { unsigned short limit; unsigned long base; } __attribute__((__packed__)) ; extern struct desc_struct boot_cpu_gdt_table[]; DECLARE_PER_CPU(struct desc_struct *, gdt_table); extern struct desc_struct boot_cpu_compat_gdt_table[]; DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table); extern void set_intr_gate(unsigned int irq, void * addr); extern void load_TR(void); #endif /* !__ASSEMBLY__ */ #endif /* __ARCH_DESC_H */ xen-4.4.0/xen/include/asm-x86/apic.h0000664000175000017500000001330312307313555015150 0ustar smbsmb#ifndef __ASM_APIC_H #define __ASM_APIC_H #include #include #include #include #define Dprintk(x...) 
do {} while (0) /* * Debugging macros */ #define APIC_QUIET 0 #define APIC_VERBOSE 1 #define APIC_DEBUG 2 #define SET_APIC_LOGICAL_ID(x) (((x)<<24)) #define IO_APIC_REDIR_VECTOR_MASK 0x000FF #define IO_APIC_REDIR_DEST_LOGICAL 0x00800 #define IO_APIC_REDIR_DEST_PHYSICAL 0x00000 /* Possible APIC states */ enum apic_mode { APIC_MODE_INVALID, /* Not set yet */ APIC_MODE_DISABLED, /* If uniprocessor, or MP in uniprocessor mode */ APIC_MODE_XAPIC, /* xAPIC mode - default upon chipset reset */ APIC_MODE_X2APIC /* x2APIC mode - common for large MP machines */ }; extern u8 apic_verbosity; extern bool_t x2apic_enabled; extern bool_t directed_eoi_enabled; void check_x2apic_preenabled(void); void x2apic_bsp_setup(void); void x2apic_ap_setup(void); const struct genapic *apic_x2apic_probe(void); /* * Define the default level of output to be very little * This can be turned up by using apic=verbose for more * information and apic=debug for _lots_ of information. * apic_verbosity is defined in apic.c */ #define apic_printk(v, s, a...) do { \ if ((v) <= apic_verbosity) \ printk(s, ##a); \ } while (0) #ifdef CONFIG_X86_LOCAL_APIC /* * Basic functions accessing APICs. */ static __inline void apic_mem_write(unsigned long reg, u32 v) { *((volatile u32 *)(APIC_BASE+reg)) = v; } static __inline void apic_mem_write_atomic(unsigned long reg, u32 v) { (void)xchg((volatile u32 *)(APIC_BASE+reg), v); } static __inline u32 apic_mem_read(unsigned long reg) { return *((volatile u32 *)(APIC_BASE+reg)); } /* NOTE: in x2APIC mode, we should use apic_icr_write()/apic_icr_read() to * access the 64-bit ICR register. */ static __inline void apic_wrmsr(unsigned long reg, uint64_t msr_content) { if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || reg == APIC_LVR) return; wrmsrl(APIC_MSR_BASE + (reg >> 4), msr_content); } static __inline uint64_t apic_rdmsr(unsigned long reg) { uint64_t msr_content; if (reg == APIC_DFR) return -1u; rdmsrl(APIC_MSR_BASE + (reg >> 4), msr_content); return msr_content; } static __inline void apic_write(unsigned long reg, u32 v) { if ( x2apic_enabled ) apic_wrmsr(reg, v); else apic_mem_write(reg, v); } static __inline void apic_write_atomic(unsigned long reg, u32 v) { if ( x2apic_enabled ) apic_wrmsr(reg, v); else apic_mem_write_atomic(reg, v); } static __inline u32 apic_read(unsigned long reg) { if ( x2apic_enabled ) return apic_rdmsr(reg); else return apic_mem_read(reg); } static __inline u64 apic_icr_read(void) { u32 lo, hi; if ( x2apic_enabled ) return apic_rdmsr(APIC_ICR); else { lo = apic_mem_read(APIC_ICR); hi = apic_mem_read(APIC_ICR2); } return ((u64)lo) | (((u64)hi) << 32); } static __inline void apic_icr_write(u32 low, u32 dest) { if ( x2apic_enabled ) apic_wrmsr(APIC_ICR, low | ((uint64_t)dest << 32)); else { apic_mem_write(APIC_ICR2, dest << 24); apic_mem_write(APIC_ICR, low); } } static __inline bool_t apic_isr_read(u8 vector) { return (apic_read(APIC_ISR + ((vector & ~0x1f) >> 1)) >> (vector & 0x1f)) & 1; } static __inline u32 get_apic_id(void) /* Get the physical APIC id */ { u32 id = apic_read(APIC_ID); return x2apic_enabled ? 
id : GET_xAPIC_ID(id); } void apic_wait_icr_idle(void); int get_physical_broadcast(void); #ifdef CONFIG_X86_GOOD_APIC # define FORCE_READ_AROUND_WRITE 0 # define apic_read_around(x) # define apic_write_around(x,y) apic_write((x),(y)) #else # define FORCE_READ_AROUND_WRITE 1 # define apic_read_around(x) apic_read(x) # define apic_write_around(x,y) apic_write_atomic((x),(y)) #endif static inline void ack_APIC_irq(void) { /* * ack_APIC_irq() actually gets compiled as a single instruction: * - a single rmw on Pentium/82489DX * - a single write on P6+ cores (CONFIG_X86_GOOD_APIC) * ... yummie. */ /* Docs say use 0 for future compatibility */ apic_write_around(APIC_EOI, 0); } extern int get_maxlvt(void); extern void clear_local_APIC(void); extern void connect_bsp_APIC (void); extern void disconnect_bsp_APIC (int virt_wire_setup); extern void disable_local_APIC (void); extern int verify_local_APIC (void); extern void cache_APIC_registers (void); extern void sync_Arb_IDs (void); extern void init_bsp_APIC (void); extern void setup_local_APIC (void); extern void init_apic_mappings (void); extern void smp_local_timer_interrupt (struct cpu_user_regs *regs); extern void setup_boot_APIC_clock (void); extern void setup_secondary_APIC_clock (void); extern void setup_apic_nmi_watchdog (void); extern void disable_lapic_nmi_watchdog(void); extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); extern void self_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); extern void nmi_watchdog_tick (struct cpu_user_regs *regs); extern int APIC_init_uniprocessor (void); extern void disable_APIC_timer(void); extern void enable_APIC_timer(void); extern int lapic_suspend(void); extern int lapic_resume(void); extern void record_boot_APIC_mode(void); extern enum apic_mode current_local_apic_mode(void); extern int check_nmi_watchdog (void); extern unsigned int nmi_watchdog; #define NMI_NONE 0 #define NMI_IO_APIC 1 #define NMI_LOCAL_APIC 2 #define NMI_INVALID 3 #else /* !CONFIG_X86_LOCAL_APIC */ static inline int lapic_suspend(void) {return 0;} static inline int lapic_resume(void) {return 0;} #endif /* !CONFIG_X86_LOCAL_APIC */ #endif /* __ASM_APIC_H */ xen-4.4.0/xen/include/asm-x86/microcode.h0000664000175000017500000000151512307313555016202 0ustar smbsmb#ifndef ASM_X86__MICROCODE_H #define ASM_X86__MICROCODE_H #include struct cpu_signature; struct ucode_cpu_info; struct microcode_ops { int (*microcode_resume_match)(int cpu, const void *mc); int (*cpu_request_microcode)(int cpu, const void *buf, size_t size); int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); int (*apply_microcode)(int cpu); int (*start_update)(void); }; struct cpu_signature { unsigned int sig; unsigned int pf; unsigned int rev; }; struct ucode_cpu_info { struct cpu_signature cpu_sig; union { struct microcode_intel *mc_intel; struct microcode_amd *mc_amd; void *mc_valid; } mc; }; DECLARE_PER_CPU(struct ucode_cpu_info, ucode_cpu_info); extern const struct microcode_ops *microcode_ops; #endif /* ASM_X86__MICROCODE_H */ xen-4.4.0/xen/include/asm-x86/guest_access.h0000664000175000017500000001532112307313555016706 0ustar smbsmb/****************************************************************************** * guest_access.h * * Copyright (c) 2006, K A Fraser */ #ifndef __ASM_X86_GUEST_ACCESS_H__ #define __ASM_X86_GUEST_ACCESS_H__ #include #include #include #include /* Raw access functions: no type checking. 
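 * (Usage sketch, with placeholder names: most callers go through the typed
 * wrappers defined later in this header rather than these raw helpers, e.g.
 *     if ( copy_to_guest_offset(hnd, i, &val, 1) )
 *         return -EFAULT;
 * where a non-zero return means some bytes could not be copied.)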
*/ #define raw_copy_to_guest(dst, src, len) \ (has_hvm_container_vcpu(current) ? \ copy_to_user_hvm((dst), (src), (len)) : \ copy_to_user((dst), (src), (len))) #define raw_copy_from_guest(dst, src, len) \ (has_hvm_container_vcpu(current) ? \ copy_from_user_hvm((dst), (src), (len)) : \ copy_from_user((dst), (src), (len))) #define raw_clear_guest(dst, len) \ (has_hvm_container_vcpu(current) ? \ clear_user_hvm((dst), (len)) : \ clear_user((dst), (len))) #define __raw_copy_to_guest(dst, src, len) \ (has_hvm_container_vcpu(current) ? \ copy_to_user_hvm((dst), (src), (len)) : \ __copy_to_user((dst), (src), (len))) #define __raw_copy_from_guest(dst, src, len) \ (has_hvm_container_vcpu(current) ? \ copy_from_user_hvm((dst), (src), (len)) : \ __copy_from_user((dst), (src), (len))) #define __raw_clear_guest(dst, len) \ (has_hvm_container_vcpu(current) ? \ clear_user_hvm((dst), (len)) : \ clear_user((dst), (len))) /* Is the guest handle a NULL reference? */ #define guest_handle_is_null(hnd) ((hnd).p == NULL) /* Offset the given guest handle into the array it refers to. */ #define guest_handle_add_offset(hnd, nr) ((hnd).p += (nr)) #define guest_handle_subtract_offset(hnd, nr) ((hnd).p -= (nr)) /* Cast a guest handle (either XEN_GUEST_HANDLE or XEN_GUEST_HANDLE_PARAM) * to the specified type of XEN_GUEST_HANDLE_PARAM. */ #define guest_handle_cast(hnd, type) ({ \ type *_x = (hnd).p; \ (XEN_GUEST_HANDLE_PARAM(type)) { _x }; \ }) /* Cast a XEN_GUEST_HANDLE to XEN_GUEST_HANDLE_PARAM */ #define guest_handle_to_param(hnd, type) ({ \ /* type checking: make sure that the pointers inside \ * XEN_GUEST_HANDLE and XEN_GUEST_HANDLE_PARAM are of \ * the same type, then return hnd */ \ (void)((typeof(&(hnd).p)) 0 == \ (typeof(&((XEN_GUEST_HANDLE_PARAM(type)) {}).p)) 0); \ (hnd); \ }) /* Cast a XEN_GUEST_HANDLE_PARAM to XEN_GUEST_HANDLE */ #define guest_handle_from_param(hnd, type) ({ \ /* type checking: make sure that the pointers inside \ * XEN_GUEST_HANDLE and XEN_GUEST_HANDLE_PARAM are of \ * the same type, then return hnd */ \ (void)((typeof(&(hnd).p)) 0 == \ (typeof(&((XEN_GUEST_HANDLE_PARAM(type)) {}).p)) 0); \ (hnd); \ }) #define guest_handle_for_field(hnd, type, fld) \ ((XEN_GUEST_HANDLE(type)) { &(hnd).p->fld }) #define guest_handle_from_ptr(ptr, type) \ ((XEN_GUEST_HANDLE_PARAM(type)) { (type *)ptr }) #define const_guest_handle_from_ptr(ptr, type) \ ((XEN_GUEST_HANDLE_PARAM(const_##type)) { (const type *)ptr }) /* * Copy an array of objects to guest context via a guest handle, * specifying an offset into the guest array. */ #define copy_to_guest_offset(hnd, off, ptr, nr) ({ \ const typeof(*(ptr)) *_s = (ptr); \ char (*_d)[sizeof(*_s)] = (void *)(hnd).p; \ ((void)((hnd).p == (ptr))); \ raw_copy_to_guest(_d+(off), _s, sizeof(*_s)*(nr)); \ }) /* * Copy an array of objects from guest context via a guest handle, * specifying an offset into the guest array. */ #define copy_from_guest_offset(ptr, hnd, off, nr) ({ \ const typeof(*(ptr)) *_s = (hnd).p; \ typeof(*(ptr)) *_d = (ptr); \ raw_copy_from_guest(_d, _s+(off), sizeof(*_d)*(nr));\ }) #define clear_guest_offset(hnd, off, nr) ({ \ void *_d = (hnd).p; \ raw_clear_guest(_d+(off), nr); \ }) /* Copy sub-field of a structure to guest context via a guest handle. */ #define copy_field_to_guest(hnd, ptr, field) ({ \ const typeof(&(ptr)->field) _s = &(ptr)->field; \ void *_d = &(hnd).p->field; \ ((void)(&(hnd).p->field == &(ptr)->field)); \ raw_copy_to_guest(_d, _s, sizeof(*_s)); \ }) /* Copy sub-field of a structure from guest context via a guest handle. 
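 * (Sketch with placeholder names: copy_field_from_guest(&local, hnd, field)
 * reads hnd.p->field into local.field; a non-zero return indicates that the
 * guest memory could not be accessed.)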
*/ #define copy_field_from_guest(ptr, hnd, field) ({ \ const typeof(&(ptr)->field) _s = &(hnd).p->field; \ typeof(&(ptr)->field) _d = &(ptr)->field; \ raw_copy_from_guest(_d, _s, sizeof(*_d)); \ }) /* * Pre-validate a guest handle. * Allows use of faster __copy_* functions. */ #define guest_handle_okay(hnd, nr) \ (paging_mode_external(current->domain) || \ array_access_ok((hnd).p, (nr), sizeof(*(hnd).p))) #define guest_handle_subrange_okay(hnd, first, last) \ (paging_mode_external(current->domain) || \ array_access_ok((hnd).p + (first), \ (last)-(first)+1, \ sizeof(*(hnd).p))) #define __copy_to_guest_offset(hnd, off, ptr, nr) ({ \ const typeof(*(ptr)) *_s = (ptr); \ char (*_d)[sizeof(*_s)] = (void *)(hnd).p; \ ((void)((hnd).p == (ptr))); \ __raw_copy_to_guest(_d+(off), _s, sizeof(*_s)*(nr));\ }) #define __copy_from_guest_offset(ptr, hnd, off, nr) ({ \ const typeof(*(ptr)) *_s = (hnd).p; \ typeof(*(ptr)) *_d = (ptr); \ __raw_copy_from_guest(_d, _s+(off), sizeof(*_d)*(nr));\ }) #define __clear_guest_offset(hnd, off, nr) ({ \ void *_d = (hnd).p; \ __raw_clear_guest(_d+(off), nr); \ }) #define __copy_field_to_guest(hnd, ptr, field) ({ \ const typeof(&(ptr)->field) _s = &(ptr)->field; \ void *_d = &(hnd).p->field; \ ((void)(&(hnd).p->field == &(ptr)->field)); \ __raw_copy_to_guest(_d, _s, sizeof(*_s)); \ }) #define __copy_field_from_guest(ptr, hnd, field) ({ \ const typeof(&(ptr)->field) _s = &(hnd).p->field; \ typeof(&(ptr)->field) _d = &(ptr)->field; \ __raw_copy_from_guest(_d, _s, sizeof(*_d)); \ }) #endif /* __ASM_X86_GUEST_ACCESS_H__ */ xen-4.4.0/xen/include/asm-x86/percpu.h0000664000175000017500000000143012307313555015530 0ustar smbsmb#ifndef __X86_PERCPU_H__ #define __X86_PERCPU_H__ #ifndef __ASSEMBLY__ extern char __per_cpu_start[], __per_cpu_data_end[]; extern unsigned long __per_cpu_offset[NR_CPUS]; void percpu_init_areas(void); #endif /* Separate out the type, so (int[3], foo) works. */ #define __DEFINE_PER_CPU(type, name, suffix) \ __section(".bss.percpu" #suffix) \ __typeof__(type) per_cpu_##name /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) \ (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) #define __get_cpu_var(var) \ (*RELOC_HIDE(&per_cpu__##var, get_cpu_info()->per_cpu_offset)) #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name #endif /* __X86_PERCPU_H__ */ xen-4.4.0/xen/include/asm-x86/mem_event.h0000664000175000017500000000546012307313555016220 0ustar smbsmb/****************************************************************************** * include/asm-x86/mem_event.h * * Common interface for memory event support. * * Copyright (c) 2009 Citrix Systems, Inc. (Patrick Colp) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef __MEM_EVENT_H__ #define __MEM_EVENT_H__ /* Returns whether a ring has been set up */ bool_t mem_event_check_ring(struct mem_event_domain *med); /* Returns 0 on success, -ENOSYS if there is no ring, -EBUSY if there is no * available space and the caller is a foreign domain. If the guest itself * is the caller, -EBUSY is avoided by sleeping on a wait queue to ensure * that the ring does not lose future events. * * However, the allow_sleep flag can be set to false in cases in which it is ok * to lose future events, and thus -EBUSY can be returned to guest vcpus * (handle with care!). * * In general, you must follow a claim_slot() call with either put_request() or * cancel_slot(), both of which are guaranteed to * succeed. */ int __mem_event_claim_slot(struct domain *d, struct mem_event_domain *med, bool_t allow_sleep); static inline int mem_event_claim_slot(struct domain *d, struct mem_event_domain *med) { return __mem_event_claim_slot(d, med, 1); } static inline int mem_event_claim_slot_nosleep(struct domain *d, struct mem_event_domain *med) { return __mem_event_claim_slot(d, med, 0); } void mem_event_cancel_slot(struct domain *d, struct mem_event_domain *med); void mem_event_put_request(struct domain *d, struct mem_event_domain *med, mem_event_request_t *req); int mem_event_get_response(struct domain *d, struct mem_event_domain *med, mem_event_response_t *rsp); int do_mem_event_op(int op, uint32_t domain, void *arg); int mem_event_domctl(struct domain *d, xen_domctl_mem_event_op_t *mec, XEN_GUEST_HANDLE_PARAM(void) u_domctl); #endif /* __MEM_EVENT_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/mem_access.h0000664000175000017500000000220212307313555016327 0ustar smbsmb/****************************************************************************** * include/asm-x86/mem_paging.h * * Memory access support. * * Copyright (c) 2011 Virtuata, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ int mem_access_memop(struct domain *d, xen_mem_event_op_t *meo); int mem_access_send_req(struct domain *d, mem_event_request_t *req); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/include/asm-x86/hpet.h0000664000175000017500000000450012307313555015173 0ustar smbsmb#ifndef __X86_HPET_H__ #define __X86_HPET_H__ /* * Documentation on HPET can be found at: * http://www.intel.com/content/dam/www/public/us/en/documents/ * technical-specifications/software-developers-hpet-spec-1-0a.pdf */ #define HPET_MMAP_SIZE 1024 #define HPET_ID 0x000 #define HPET_PERIOD 0x004 #define HPET_CFG 0x010 #define HPET_STATUS 0x020 #define HPET_COUNTER 0x0f0 #define HPET_Tn_CFG(n) (0x100 + (n) * 0x20) #define HPET_Tn_CMP(n) (0x108 + (n) * 0x20) #define HPET_Tn_ROUTE(n) (0x110 + (n) * 0x20) #define HPET_ID_VENDOR 0xffff0000 #define HPET_ID_LEGSUP 0x00008000 #define HPET_ID_64BIT 0x00002000 #define HPET_ID_NUMBER 0x00001f00 #define HPET_ID_REV 0x000000ff #define HPET_ID_NUMBER_SHIFT 8 #define HPET_ID_VENDOR_SHIFT 16 #define HPET_CFG_ENABLE 0x001 #define HPET_CFG_LEGACY 0x002 #define HPET_LEGACY_8254 2 #define HPET_LEGACY_RTC 8 #define HPET_TN_LEVEL 0x002 #define HPET_TN_ENABLE 0x004 #define HPET_TN_PERIODIC 0x008 #define HPET_TN_PERIODIC_CAP 0x010 #define HPET_TN_64BIT_CAP 0x020 #define HPET_TN_SETVAL 0x040 #define HPET_TN_32BIT 0x100 #define HPET_TN_ROUTE 0x3e00 #define HPET_TN_FSB 0x4000 #define HPET_TN_FSB_CAP 0x8000 #define HPET_TN_RESERVED 0xffff0081 #define HPET_TN_ROUTE_SHIFT 9 #define hpet_read32(x) \ (*(volatile u32 *)(fix_to_virt(FIX_HPET_BASE) + (x))) #define hpet_write32(y,x) \ (*(volatile u32 *)(fix_to_virt(FIX_HPET_BASE) + (x)) = (y)) extern unsigned long hpet_address; extern u8 hpet_blockid; /* * Detect and initialise HPET hardware: return counter update frequency. * Return value is zero if HPET is unavailable. */ u64 hpet_setup(void); void hpet_resume(u32 *); /* * Disable HPET hardware: restore it to boot time state. */ void hpet_disable(void); /* * Callback from legacy timer (PIT channel 0) IRQ handler. * Returns 1 if tick originated from HPET; else 0. */ int hpet_legacy_irq_tick(void); /* * Temporarily use an HPET event counter for timer interrupt handling, * rather than using the LAPIC timer. Used for Cx state entry. */ void hpet_broadcast_init(void); void hpet_broadcast_resume(void); void hpet_broadcast_enter(void); void hpet_broadcast_exit(void); int hpet_broadcast_is_available(void); void hpet_disable_legacy_broadcast(void); extern void (*pv_rtc_handler)(uint8_t reg, uint8_t value); #endif /* __X86_HPET_H__ */ xen-4.4.0/xen/include/xsm/0000775000175000017500000000000012307313555013467 5ustar smbsmbxen-4.4.0/xen/include/xsm/dummy.h0000664000175000017500000004512012307313555014775 0ustar smbsmb/* * Default XSM hooks - IS_PRIV and IS_PRIV_FOR checks * * Author: Daniel De Graaf * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. * * * Each XSM hook implementing an access check should have its first parameter * preceded by XSM_DEFAULT_ARG (or use XSM_DEFAULT_VOID if it has no * arguments). 
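 * As an illustration of this convention (xsm_example_op is a hypothetical
 * hook name, not one of the real hooks below), a hook would be written as:
 *   static XSM_INLINE int xsm_example_op(XSM_DEFAULT_ARG struct domain *d)
 *   {
 *       XSM_ASSERT_ACTION(XSM_TARGET);
 *       return xsm_default_action(action, current->domain, d);
 *   }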
The first non-declaration statement should be XSM_ASSERT_ACTION * with the expected type of the hook, which will either define or check the * value of action. */ #include #include /* Cannot use BUILD_BUG_ON here because the expressions we check are not * considered constant at compile time. Instead, rely on constant propagation to * inline out the calls to this invalid function, which will cause linker errors * if references remain at link time. */ #define LINKER_BUG_ON(x) do { if (x) __xsm_action_mismatch_detected(); } while (0) /* DO NOT implement this function; it is supposed to trigger link errors */ void __xsm_action_mismatch_detected(void); #ifdef XSM_ENABLE /* In XSM_ENABLE builds, this header file is included from xsm/dummy.c, and * contains static (not inline) functions compiled to the dummy XSM module. * There is no xsm_default_t argument available, so the value from the assertion * is used to initialize the variable. */ #define XSM_INLINE /* */ #define XSM_DEFAULT_ARG /* */ #define XSM_DEFAULT_VOID void #define XSM_ASSERT_ACTION(def) xsm_default_t action = def; (void)action #else /* XSM_ENABLE */ /* In !XSM_ENABLE builds, this header file is included from xsm/xsm.h, and * contains inline functions for each XSM hook. These functions also perform * compile-time checks on the xsm_default_t argument to ensure that the behavior * of the dummy XSM module is the same as the behavior with XSM disabled. */ #define XSM_INLINE always_inline #define XSM_DEFAULT_ARG xsm_default_t action, #define XSM_DEFAULT_VOID xsm_default_t action #define XSM_ASSERT_ACTION(def) LINKER_BUG_ON(def != action) #endif /* XSM_ENABLE */ static always_inline int xsm_default_action( xsm_default_t action, struct domain *src, struct domain *target) { switch ( action ) { case XSM_HOOK: return 0; case XSM_DM_PRIV: if ( src->is_privileged ) return 0; if ( target && src->target == target ) return 0; return -EPERM; case XSM_TARGET: if ( src == target ) return 0; if ( src->is_privileged ) return 0; if ( target && src->target == target ) return 0; return -EPERM; case XSM_PRIV: if ( src->is_privileged ) return 0; return -EPERM; default: LINKER_BUG_ON(1); return -EPERM; } } static XSM_INLINE void xsm_security_domaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info) { return; } static XSM_INLINE int xsm_domain_create(XSM_DEFAULT_ARG struct domain *d, u32 ssidref) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_getdomaininfo(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_domctl_scheduler_op(XSM_DEFAULT_ARG struct domain *d, int cmd) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_sysctl_scheduler_op(XSM_DEFAULT_ARG int cmd) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_set_target(XSM_DEFAULT_ARG struct domain *d, struct domain *e) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_domctl(XSM_DEFAULT_ARG struct domain *d, int cmd) { XSM_ASSERT_ACTION(XSM_OTHER); switch ( cmd ) { case XEN_DOMCTL_ioport_mapping: case XEN_DOMCTL_memory_mapping: case XEN_DOMCTL_bind_pt_irq: case XEN_DOMCTL_unbind_pt_irq: return xsm_default_action(XSM_DM_PRIV, current->domain, d); default: return xsm_default_action(XSM_PRIV, current->domain, d); } } static XSM_INLINE int
xsm_sysctl(XSM_DEFAULT_ARG int cmd) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_readconsole(XSM_DEFAULT_ARG uint32_t clear) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_do_mca(XSM_DEFAULT_VOID) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_alloc_security_domain(struct domain *d) { return 0; } static XSM_INLINE void xsm_free_security_domain(struct domain *d) { return; } static XSM_INLINE int xsm_grant_mapref(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2, uint32_t flags) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_grant_unmapref(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_grant_setup(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_grant_transfer(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_grant_copy(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_grant_query_size(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_memory_exchange(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_memory_adjust_reservation(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_memory_stat_reservation(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_console_io(XSM_DEFAULT_ARG struct domain *d, int cmd) { XSM_ASSERT_ACTION(XSM_OTHER); #ifdef VERBOSE if ( cmd == CONSOLEIO_write ) return xsm_default_action(XSM_HOOK, d, NULL); #endif return xsm_default_action(XSM_PRIV, d, NULL); } static XSM_INLINE int xsm_profile(XSM_DEFAULT_ARG struct domain *d, int op) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, d, NULL); } static XSM_INLINE int xsm_kexec(XSM_DEFAULT_VOID) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_schedop_shutdown(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_DM_PRIV); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_memory_pin_page(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2, struct page_info *page) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_claim_pages(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_evtchn_unbound(XSM_DEFAULT_ARG struct domain *d, struct evtchn *chn, domid_t id2) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_evtchn_interdomain(XSM_DEFAULT_ARG struct domain *d1, struct evtchn *chan1, struct 
domain *d2, struct evtchn *chan2) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, d1, d2); } static XSM_INLINE void xsm_evtchn_close_post(struct evtchn *chn) { return; } static XSM_INLINE int xsm_evtchn_send(XSM_DEFAULT_ARG struct domain *d, struct evtchn *chn) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, d, NULL); } static XSM_INLINE int xsm_evtchn_status(XSM_DEFAULT_ARG struct domain *d, struct evtchn *chn) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_evtchn_reset(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_alloc_security_evtchn(struct evtchn *chn) { return 0; } static XSM_INLINE void xsm_free_security_evtchn(struct evtchn *chn) { return; } static XSM_INLINE char *xsm_show_security_evtchn(struct domain *d, const struct evtchn *chn) { return NULL; } static XSM_INLINE int xsm_get_pod_target(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_set_pod_target(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_get_device_group(XSM_DEFAULT_ARG uint32_t machine_bdf) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_test_assign_device(XSM_DEFAULT_ARG uint32_t machine_bdf) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_assign_device(XSM_DEFAULT_ARG struct domain *d, uint32_t machine_bdf) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_deassign_device(XSM_DEFAULT_ARG struct domain *d, uint32_t machine_bdf) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_resource_plug_core(XSM_DEFAULT_VOID) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_resource_unplug_core(XSM_DEFAULT_VOID) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_resource_plug_pci(XSM_DEFAULT_ARG uint32_t machine_bdf) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_resource_unplug_pci(XSM_DEFAULT_ARG uint32_t machine_bdf) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_resource_setup_pci(XSM_DEFAULT_ARG uint32_t machine_bdf) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_resource_setup_gsi(XSM_DEFAULT_ARG int gsi) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_resource_setup_misc(XSM_DEFAULT_VOID) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_page_offline(XSM_DEFAULT_ARG uint32_t cmd) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_tmem_op(XSM_DEFAULT_VOID) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_tmem_control(XSM_DEFAULT_VOID) { 
XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE long xsm_do_xsm_op(XEN_GUEST_HANDLE_PARAM(xsm_op_t) op) { return -ENOSYS; } static XSM_INLINE char *xsm_show_irq_sid(int irq) { return NULL; } static XSM_INLINE int xsm_map_domain_pirq(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_map_domain_irq(XSM_DEFAULT_ARG struct domain *d, int irq, void *data) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_unmap_domain_pirq(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_unmap_domain_irq(XSM_DEFAULT_ARG struct domain *d, int irq, void *data) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_irq_permission(XSM_DEFAULT_ARG struct domain *d, int pirq, uint8_t allow) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_iomem_permission(XSM_DEFAULT_ARG struct domain *d, uint64_t s, uint64_t e, uint8_t allow) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_iomem_mapping(XSM_DEFAULT_ARG struct domain *d, uint64_t s, uint64_t e, uint8_t allow) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_pci_config_permission(XSM_DEFAULT_ARG struct domain *d, uint32_t machine_bdf, uint16_t start, uint16_t end, uint8_t access) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_add_to_physmap(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_remove_from_physmap(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d1, d2); } static XSM_INLINE int xsm_hvm_param(XSM_DEFAULT_ARG struct domain *d, unsigned long op) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_hvm_param_nested(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, d); } #ifdef CONFIG_X86 static XSM_INLINE int xsm_shadow_control(XSM_DEFAULT_ARG struct domain *d, uint32_t op) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_hvm_set_pci_intx_level(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_DM_PRIV); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_hvm_set_isa_irq_level(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_DM_PRIV); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_hvm_set_pci_link_route(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_DM_PRIV); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_hvm_inject_msi(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_DM_PRIV); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_mem_event_control(XSM_DEFAULT_ARG struct domain *d, int mode, int op) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, d); } static 
XSM_INLINE int xsm_mem_event_op(XSM_DEFAULT_ARG struct domain *d, int op) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_mem_sharing_op(XSM_DEFAULT_ARG struct domain *d, struct domain *cd, int op) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, current->domain, cd); } static XSM_INLINE int xsm_apic(XSM_DEFAULT_ARG struct domain *d, int cmd) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, d, NULL); } static XSM_INLINE int xsm_platform_op(XSM_DEFAULT_ARG uint32_t op) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_machine_memory_map(XSM_DEFAULT_VOID) { XSM_ASSERT_ACTION(XSM_PRIV); return xsm_default_action(action, current->domain, NULL); } static XSM_INLINE int xsm_domain_memory_map(XSM_DEFAULT_ARG struct domain *d) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_mmu_update(XSM_DEFAULT_ARG struct domain *d, struct domain *t, struct domain *f, uint32_t flags) { int rc = 0; XSM_ASSERT_ACTION(XSM_TARGET); if ( f != dom_io ) rc = xsm_default_action(action, d, f); if ( t && !rc ) rc = xsm_default_action(action, d, t); return rc; } static XSM_INLINE int xsm_mmuext_op(XSM_DEFAULT_ARG struct domain *d, struct domain *f) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d, f); } static XSM_INLINE int xsm_update_va_mapping(XSM_DEFAULT_ARG struct domain *d, struct domain *f, l1_pgentry_t pte) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d, f); } static XSM_INLINE int xsm_priv_mapping(XSM_DEFAULT_ARG struct domain *d, struct domain *t) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d, t); } static XSM_INLINE int xsm_bind_pt_irq(XSM_DEFAULT_ARG struct domain *d, struct xen_domctl_bind_pt_irq *bind) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_unbind_pt_irq(XSM_DEFAULT_ARG struct domain *d, struct xen_domctl_bind_pt_irq *bind) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_ioport_permission(XSM_DEFAULT_ARG struct domain *d, uint32_t s, uint32_t e, uint8_t allow) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } static XSM_INLINE int xsm_ioport_mapping(XSM_DEFAULT_ARG struct domain *d, uint32_t s, uint32_t e, uint8_t allow) { XSM_ASSERT_ACTION(XSM_HOOK); return xsm_default_action(action, current->domain, d); } #endif /* CONFIG_X86 */ #ifdef CONFIG_ARM static XSM_INLINE int xsm_map_gmfn_foreign(XSM_DEFAULT_ARG struct domain *d, struct domain *t) { XSM_ASSERT_ACTION(XSM_TARGET); return xsm_default_action(action, d, t); } #endif xen-4.4.0/xen/include/xsm/xsm.h0000664000175000017500000005002412307313555014450 0ustar smbsmb/* * This file contains the XSM hook definitions for Xen. * * This work is based on the LSM implementation in Linux 2.6.13.4. * * Author: George Coker, * * Contributors: Michael LeMay, * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. 
*/ #ifndef __XSM_H__ #define __XSM_H__ #include #include typedef void xsm_op_t; DEFINE_XEN_GUEST_HANDLE(xsm_op_t); /* policy magic number (defined by XSM_MAGIC) */ typedef u32 xsm_magic_t; #ifndef XSM_MAGIC #define XSM_MAGIC 0x00000000 #endif /* These annotations are used by callers and in dummy.h to document the * default actions of XSM hooks. They should be compiled out otherwise. */ enum xsm_default { XSM_HOOK, /* Guests can normally access the hypercall */ XSM_DM_PRIV, /* Device model can perform on its target domain */ XSM_TARGET, /* Can perform on self or your target domain */ XSM_PRIV, /* Privileged - normally restricted to dom0 */ XSM_OTHER /* Something more complex */ }; typedef enum xsm_default xsm_default_t; extern char *policy_buffer; extern u32 policy_size; typedef int (*xsm_initcall_t)(void); extern xsm_initcall_t __xsm_initcall_start[], __xsm_initcall_end[]; #define xsm_initcall(fn) \ static xsm_initcall_t __initcall_##fn \ __used_section(".xsm_initcall.init") = fn struct xsm_operations { void (*security_domaininfo) (struct domain *d, struct xen_domctl_getdomaininfo *info); int (*domain_create) (struct domain *d, u32 ssidref); int (*getdomaininfo) (struct domain *d); int (*domctl_scheduler_op) (struct domain *d, int op); int (*sysctl_scheduler_op) (int op); int (*set_target) (struct domain *d, struct domain *e); int (*domctl) (struct domain *d, int cmd); int (*sysctl) (int cmd); int (*readconsole) (uint32_t clear); int (*do_mca) (void); int (*evtchn_unbound) (struct domain *d, struct evtchn *chn, domid_t id2); int (*evtchn_interdomain) (struct domain *d1, struct evtchn *chn1, struct domain *d2, struct evtchn *chn2); void (*evtchn_close_post) (struct evtchn *chn); int (*evtchn_send) (struct domain *d, struct evtchn *chn); int (*evtchn_status) (struct domain *d, struct evtchn *chn); int (*evtchn_reset) (struct domain *d1, struct domain *d2); int (*grant_mapref) (struct domain *d1, struct domain *d2, uint32_t flags); int (*grant_unmapref) (struct domain *d1, struct domain *d2); int (*grant_setup) (struct domain *d1, struct domain *d2); int (*grant_transfer) (struct domain *d1, struct domain *d2); int (*grant_copy) (struct domain *d1, struct domain *d2); int (*grant_query_size) (struct domain *d1, struct domain *d2); int (*alloc_security_domain) (struct domain *d); void (*free_security_domain) (struct domain *d); int (*alloc_security_evtchn) (struct evtchn *chn); void (*free_security_evtchn) (struct evtchn *chn); char *(*show_security_evtchn) (struct domain *d, const struct evtchn *chn); int (*get_pod_target) (struct domain *d); int (*set_pod_target) (struct domain *d); int (*memory_exchange) (struct domain *d); int (*memory_adjust_reservation) (struct domain *d1, struct domain *d2); int (*memory_stat_reservation) (struct domain *d1, struct domain *d2); int (*memory_pin_page) (struct domain *d1, struct domain *d2, struct page_info *page); int (*add_to_physmap) (struct domain *d1, struct domain *d2); int (*remove_from_physmap) (struct domain *d1, struct domain *d2); int (*claim_pages) (struct domain *d); int (*console_io) (struct domain *d, int cmd); int (*profile) (struct domain *d, int op); int (*kexec) (void); int (*schedop_shutdown) (struct domain *d1, struct domain *d2); char *(*show_irq_sid) (int irq); int (*map_domain_pirq) (struct domain *d); int (*map_domain_irq) (struct domain *d, int irq, void *data); int (*unmap_domain_pirq) (struct domain *d); int (*unmap_domain_irq) (struct domain *d, int irq, void *data); int (*irq_permission) (struct domain *d, int pirq, uint8_t 
allow); int (*iomem_permission) (struct domain *d, uint64_t s, uint64_t e, uint8_t allow); int (*iomem_mapping) (struct domain *d, uint64_t s, uint64_t e, uint8_t allow); int (*pci_config_permission) (struct domain *d, uint32_t machine_bdf, uint16_t start, uint16_t end, uint8_t access); int (*get_device_group) (uint32_t machine_bdf); int (*test_assign_device) (uint32_t machine_bdf); int (*assign_device) (struct domain *d, uint32_t machine_bdf); int (*deassign_device) (struct domain *d, uint32_t machine_bdf); int (*resource_plug_core) (void); int (*resource_unplug_core) (void); int (*resource_plug_pci) (uint32_t machine_bdf); int (*resource_unplug_pci) (uint32_t machine_bdf); int (*resource_setup_pci) (uint32_t machine_bdf); int (*resource_setup_gsi) (int gsi); int (*resource_setup_misc) (void); int (*page_offline)(uint32_t cmd); int (*tmem_op)(void); int (*tmem_control)(void); long (*do_xsm_op) (XEN_GUEST_HANDLE_PARAM(xsm_op_t) op); int (*hvm_param) (struct domain *d, unsigned long op); int (*hvm_param_nested) (struct domain *d); #ifdef CONFIG_X86 int (*shadow_control) (struct domain *d, uint32_t op); int (*hvm_set_pci_intx_level) (struct domain *d); int (*hvm_set_isa_irq_level) (struct domain *d); int (*hvm_set_pci_link_route) (struct domain *d); int (*hvm_inject_msi) (struct domain *d); int (*mem_event_control) (struct domain *d, int mode, int op); int (*mem_event_op) (struct domain *d, int op); int (*mem_sharing_op) (struct domain *d, struct domain *cd, int op); int (*apic) (struct domain *d, int cmd); int (*memtype) (uint32_t access); int (*platform_op) (uint32_t cmd); int (*machine_memory_map) (void); int (*domain_memory_map) (struct domain *d); #define XSM_MMU_UPDATE_READ 1 #define XSM_MMU_UPDATE_WRITE 2 #define XSM_MMU_NORMAL_UPDATE 4 #define XSM_MMU_MACHPHYS_UPDATE 8 int (*mmu_update) (struct domain *d, struct domain *t, struct domain *f, uint32_t flags); int (*mmuext_op) (struct domain *d, struct domain *f); int (*update_va_mapping) (struct domain *d, struct domain *f, l1_pgentry_t pte); int (*priv_mapping) (struct domain *d, struct domain *t); int (*bind_pt_irq) (struct domain *d, struct xen_domctl_bind_pt_irq *bind); int (*unbind_pt_irq) (struct domain *d, struct xen_domctl_bind_pt_irq *bind); int (*ioport_permission) (struct domain *d, uint32_t s, uint32_t e, uint8_t allow); int (*ioport_mapping) (struct domain *d, uint32_t s, uint32_t e, uint8_t allow); #endif #ifdef CONFIG_ARM int (*map_gmfn_foreign) (struct domain *d, struct domain *t); #endif }; #ifdef XSM_ENABLE extern struct xsm_operations *xsm_ops; #ifndef XSM_NO_WRAPPERS static inline void xsm_security_domaininfo (struct domain *d, struct xen_domctl_getdomaininfo *info) { xsm_ops->security_domaininfo(d, info); } static inline int xsm_domain_create (xsm_default_t def, struct domain *d, u32 ssidref) { return xsm_ops->domain_create(d, ssidref); } static inline int xsm_getdomaininfo (xsm_default_t def, struct domain *d) { return xsm_ops->getdomaininfo(d); } static inline int xsm_domctl_scheduler_op (xsm_default_t def, struct domain *d, int cmd) { return xsm_ops->domctl_scheduler_op(d, cmd); } static inline int xsm_sysctl_scheduler_op (xsm_default_t def, int cmd) { return xsm_ops->sysctl_scheduler_op(cmd); } static inline int xsm_set_target (xsm_default_t def, struct domain *d, struct domain *e) { return xsm_ops->set_target(d, e); } static inline int xsm_domctl (xsm_default_t def, struct domain *d, int cmd) { return xsm_ops->domctl(d, cmd); } static inline int xsm_sysctl (xsm_default_t def, int cmd) { return 
xsm_ops->sysctl(cmd); } static inline int xsm_readconsole (xsm_default_t def, uint32_t clear) { return xsm_ops->readconsole(clear); } static inline int xsm_do_mca(xsm_default_t def) { return xsm_ops->do_mca(); } static inline int xsm_evtchn_unbound (xsm_default_t def, struct domain *d1, struct evtchn *chn, domid_t id2) { return xsm_ops->evtchn_unbound(d1, chn, id2); } static inline int xsm_evtchn_interdomain (xsm_default_t def, struct domain *d1, struct evtchn *chan1, struct domain *d2, struct evtchn *chan2) { return xsm_ops->evtchn_interdomain(d1, chan1, d2, chan2); } static inline void xsm_evtchn_close_post (struct evtchn *chn) { xsm_ops->evtchn_close_post(chn); } static inline int xsm_evtchn_send (xsm_default_t def, struct domain *d, struct evtchn *chn) { return xsm_ops->evtchn_send(d, chn); } static inline int xsm_evtchn_status (xsm_default_t def, struct domain *d, struct evtchn *chn) { return xsm_ops->evtchn_status(d, chn); } static inline int xsm_evtchn_reset (xsm_default_t def, struct domain *d1, struct domain *d2) { return xsm_ops->evtchn_reset(d1, d2); } static inline int xsm_grant_mapref (xsm_default_t def, struct domain *d1, struct domain *d2, uint32_t flags) { return xsm_ops->grant_mapref(d1, d2, flags); } static inline int xsm_grant_unmapref (xsm_default_t def, struct domain *d1, struct domain *d2) { return xsm_ops->grant_unmapref(d1, d2); } static inline int xsm_grant_setup (xsm_default_t def, struct domain *d1, struct domain *d2) { return xsm_ops->grant_setup(d1, d2); } static inline int xsm_grant_transfer (xsm_default_t def, struct domain *d1, struct domain *d2) { return xsm_ops->grant_transfer(d1, d2); } static inline int xsm_grant_copy (xsm_default_t def, struct domain *d1, struct domain *d2) { return xsm_ops->grant_copy(d1, d2); } static inline int xsm_grant_query_size (xsm_default_t def, struct domain *d1, struct domain *d2) { return xsm_ops->grant_query_size(d1, d2); } static inline int xsm_alloc_security_domain (struct domain *d) { return xsm_ops->alloc_security_domain(d); } static inline void xsm_free_security_domain (struct domain *d) { xsm_ops->free_security_domain(d); } static inline int xsm_alloc_security_evtchn (struct evtchn *chn) { return xsm_ops->alloc_security_evtchn(chn); } static inline void xsm_free_security_evtchn (struct evtchn *chn) { (void)xsm_ops->free_security_evtchn(chn); } static inline char *xsm_show_security_evtchn (struct domain *d, const struct evtchn *chn) { return xsm_ops->show_security_evtchn(d, chn); } static inline int xsm_get_pod_target (xsm_default_t def, struct domain *d) { return xsm_ops->get_pod_target(d); } static inline int xsm_set_pod_target (xsm_default_t def, struct domain *d) { return xsm_ops->set_pod_target(d); } static inline int xsm_memory_exchange (xsm_default_t def, struct domain *d) { return xsm_ops->memory_exchange(d); } static inline int xsm_memory_adjust_reservation (xsm_default_t def, struct domain *d1, struct domain *d2) { return xsm_ops->memory_adjust_reservation(d1, d2); } static inline int xsm_memory_stat_reservation (xsm_default_t def, struct domain *d1, struct domain *d2) { return xsm_ops->memory_stat_reservation(d1, d2); } static inline int xsm_memory_pin_page(xsm_default_t def, struct domain *d1, struct domain *d2, struct page_info *page) { return xsm_ops->memory_pin_page(d1, d2, page); } static inline int xsm_add_to_physmap(xsm_default_t def, struct domain *d1, struct domain *d2) { return xsm_ops->add_to_physmap(d1, d2); } static inline int xsm_remove_from_physmap(xsm_default_t def, struct domain *d1, struct 
domain *d2) { return xsm_ops->remove_from_physmap(d1, d2); } static inline int xsm_claim_pages(xsm_default_t def, struct domain *d) { return xsm_ops->claim_pages(d); } static inline int xsm_console_io (xsm_default_t def, struct domain *d, int cmd) { return xsm_ops->console_io(d, cmd); } static inline int xsm_profile (xsm_default_t def, struct domain *d, int op) { return xsm_ops->profile(d, op); } static inline int xsm_kexec (xsm_default_t def) { return xsm_ops->kexec(); } static inline int xsm_schedop_shutdown (xsm_default_t def, struct domain *d1, struct domain *d2) { return xsm_ops->schedop_shutdown(d1, d2); } static inline char *xsm_show_irq_sid (int irq) { return xsm_ops->show_irq_sid(irq); } static inline int xsm_map_domain_pirq (xsm_default_t def, struct domain *d) { return xsm_ops->map_domain_pirq(d); } static inline int xsm_map_domain_irq (xsm_default_t def, struct domain *d, int irq, void *data) { return xsm_ops->map_domain_irq(d, irq, data); } static inline int xsm_unmap_domain_pirq (xsm_default_t def, struct domain *d) { return xsm_ops->unmap_domain_pirq(d); } static inline int xsm_unmap_domain_irq (xsm_default_t def, struct domain *d, int irq, void *data) { return xsm_ops->unmap_domain_irq(d, irq, data); } static inline int xsm_irq_permission (xsm_default_t def, struct domain *d, int pirq, uint8_t allow) { return xsm_ops->irq_permission(d, pirq, allow); } static inline int xsm_iomem_permission (xsm_default_t def, struct domain *d, uint64_t s, uint64_t e, uint8_t allow) { return xsm_ops->iomem_permission(d, s, e, allow); } static inline int xsm_iomem_mapping (xsm_default_t def, struct domain *d, uint64_t s, uint64_t e, uint8_t allow) { return xsm_ops->iomem_mapping(d, s, e, allow); } static inline int xsm_pci_config_permission (xsm_default_t def, struct domain *d, uint32_t machine_bdf, uint16_t start, uint16_t end, uint8_t access) { return xsm_ops->pci_config_permission(d, machine_bdf, start, end, access); } static inline int xsm_get_device_group(xsm_default_t def, uint32_t machine_bdf) { return xsm_ops->get_device_group(machine_bdf); } static inline int xsm_test_assign_device(xsm_default_t def, uint32_t machine_bdf) { return xsm_ops->test_assign_device(machine_bdf); } static inline int xsm_assign_device(xsm_default_t def, struct domain *d, uint32_t machine_bdf) { return xsm_ops->assign_device(d, machine_bdf); } static inline int xsm_deassign_device(xsm_default_t def, struct domain *d, uint32_t machine_bdf) { return xsm_ops->deassign_device(d, machine_bdf); } static inline int xsm_resource_plug_pci (xsm_default_t def, uint32_t machine_bdf) { return xsm_ops->resource_plug_pci(machine_bdf); } static inline int xsm_resource_unplug_pci (xsm_default_t def, uint32_t machine_bdf) { return xsm_ops->resource_unplug_pci(machine_bdf); } static inline int xsm_resource_plug_core (xsm_default_t def) { return xsm_ops->resource_plug_core(); } static inline int xsm_resource_unplug_core (xsm_default_t def) { return xsm_ops->resource_unplug_core(); } static inline int xsm_resource_setup_pci (xsm_default_t def, uint32_t machine_bdf) { return xsm_ops->resource_setup_pci(machine_bdf); } static inline int xsm_resource_setup_gsi (xsm_default_t def, int gsi) { return xsm_ops->resource_setup_gsi(gsi); } static inline int xsm_resource_setup_misc (xsm_default_t def) { return xsm_ops->resource_setup_misc(); } static inline int xsm_page_offline(xsm_default_t def, uint32_t cmd) { return xsm_ops->page_offline(cmd); } static inline int xsm_tmem_op(xsm_default_t def) { return xsm_ops->tmem_op(); } static inline 
int xsm_tmem_control(xsm_default_t def) { return xsm_ops->tmem_control(); } static inline long xsm_do_xsm_op (XEN_GUEST_HANDLE_PARAM(xsm_op_t) op) { return xsm_ops->do_xsm_op(op); } static inline int xsm_hvm_param (xsm_default_t def, struct domain *d, unsigned long op) { return xsm_ops->hvm_param(d, op); } static inline int xsm_hvm_param_nested (xsm_default_t def, struct domain *d) { return xsm_ops->hvm_param_nested(d); } #ifdef CONFIG_X86 static inline int xsm_shadow_control (xsm_default_t def, struct domain *d, uint32_t op) { return xsm_ops->shadow_control(d, op); } static inline int xsm_hvm_set_pci_intx_level (xsm_default_t def, struct domain *d) { return xsm_ops->hvm_set_pci_intx_level(d); } static inline int xsm_hvm_set_isa_irq_level (xsm_default_t def, struct domain *d) { return xsm_ops->hvm_set_isa_irq_level(d); } static inline int xsm_hvm_set_pci_link_route (xsm_default_t def, struct domain *d) { return xsm_ops->hvm_set_pci_link_route(d); } static inline int xsm_hvm_inject_msi (xsm_default_t def, struct domain *d) { return xsm_ops->hvm_inject_msi(d); } static inline int xsm_mem_event_control (xsm_default_t def, struct domain *d, int mode, int op) { return xsm_ops->mem_event_control(d, mode, op); } static inline int xsm_mem_event_op (xsm_default_t def, struct domain *d, int op) { return xsm_ops->mem_event_op(d, op); } static inline int xsm_mem_sharing_op (xsm_default_t def, struct domain *d, struct domain *cd, int op) { return xsm_ops->mem_sharing_op(d, cd, op); } static inline int xsm_apic (xsm_default_t def, struct domain *d, int cmd) { return xsm_ops->apic(d, cmd); } static inline int xsm_memtype (xsm_default_t def, uint32_t access) { return xsm_ops->memtype(access); } static inline int xsm_platform_op (xsm_default_t def, uint32_t op) { return xsm_ops->platform_op(op); } static inline int xsm_machine_memory_map(xsm_default_t def) { return xsm_ops->machine_memory_map(); } static inline int xsm_domain_memory_map(xsm_default_t def, struct domain *d) { return xsm_ops->domain_memory_map(d); } static inline int xsm_mmu_update (xsm_default_t def, struct domain *d, struct domain *t, struct domain *f, uint32_t flags) { return xsm_ops->mmu_update(d, t, f, flags); } static inline int xsm_mmuext_op (xsm_default_t def, struct domain *d, struct domain *f) { return xsm_ops->mmuext_op(d, f); } static inline int xsm_update_va_mapping(xsm_default_t def, struct domain *d, struct domain *f, l1_pgentry_t pte) { return xsm_ops->update_va_mapping(d, f, pte); } static inline int xsm_priv_mapping(xsm_default_t def, struct domain *d, struct domain *t) { return xsm_ops->priv_mapping(d, t); } static inline int xsm_bind_pt_irq(xsm_default_t def, struct domain *d, struct xen_domctl_bind_pt_irq *bind) { return xsm_ops->bind_pt_irq(d, bind); } static inline int xsm_unbind_pt_irq(xsm_default_t def, struct domain *d, struct xen_domctl_bind_pt_irq *bind) { return xsm_ops->unbind_pt_irq(d, bind); } static inline int xsm_ioport_permission (xsm_default_t def, struct domain *d, uint32_t s, uint32_t e, uint8_t allow) { return xsm_ops->ioport_permission(d, s, e, allow); } static inline int xsm_ioport_mapping (xsm_default_t def, struct domain *d, uint32_t s, uint32_t e, uint8_t allow) { return xsm_ops->ioport_mapping(d, s, e, allow); } #endif /* CONFIG_X86 */ #ifdef CONFIG_ARM static inline int xsm_map_gmfn_foreign (struct domain *d, struct domain *t) { return xsm_ops->map_gmfn_foreign(d, t); } #endif /* CONFIG_ARM */ #endif /* XSM_NO_WRAPPERS */ extern int xsm_init(unsigned long *module_map, const multiboot_info_t *mbi, 
void *(*bootstrap_map)(const module_t *)); extern int xsm_policy_init(unsigned long *module_map, const multiboot_info_t *mbi, void *(*bootstrap_map)(const module_t *)); extern int register_xsm(struct xsm_operations *ops); extern int unregister_xsm(struct xsm_operations *ops); extern struct xsm_operations dummy_xsm_ops; extern void xsm_fixup_ops(struct xsm_operations *ops); #else /* XSM_ENABLE */ #include static inline int xsm_init (unsigned long *module_map, const multiboot_info_t *mbi, void *(*bootstrap_map)(const module_t *)) { return 0; } #endif /* XSM_ENABLE */ #endif /* __XSM_H */ xen-4.4.0/xen/include/efi/0000775000175000017500000000000012307313555013423 5ustar smbsmbxen-4.4.0/xen/include/efi/eficon.h0000664000175000017500000002155412307313555015046 0ustar smbsmb#ifndef _EFI_CON_H #define _EFI_CON_H /*++ Copyright (c) 1998 Intel Corporation Module Name: eficon.h Abstract: EFI console protocols Revision History --*/ // // Text output protocol // #define SIMPLE_TEXT_OUTPUT_PROTOCOL \ { 0x387477c2, 0x69c7, 0x11d2, {0x8e, 0x39, 0x0, 0xa0, 0xc9, 0x69, 0x72, 0x3b} } INTERFACE_DECL(_SIMPLE_TEXT_OUTPUT_INTERFACE); typedef EFI_STATUS (EFIAPI *EFI_TEXT_RESET) ( IN struct _SIMPLE_TEXT_OUTPUT_INTERFACE *This, IN BOOLEAN ExtendedVerification ); typedef EFI_STATUS (EFIAPI *EFI_TEXT_OUTPUT_STRING) ( IN struct _SIMPLE_TEXT_OUTPUT_INTERFACE *This, IN CHAR16 *String ); typedef EFI_STATUS (EFIAPI *EFI_TEXT_TEST_STRING) ( IN struct _SIMPLE_TEXT_OUTPUT_INTERFACE *This, IN CHAR16 *String ); typedef EFI_STATUS (EFIAPI *EFI_TEXT_QUERY_MODE) ( IN struct _SIMPLE_TEXT_OUTPUT_INTERFACE *This, IN UINTN ModeNumber, OUT UINTN *Columns, OUT UINTN *Rows ); typedef EFI_STATUS (EFIAPI *EFI_TEXT_SET_MODE) ( IN struct _SIMPLE_TEXT_OUTPUT_INTERFACE *This, IN UINTN ModeNumber ); typedef EFI_STATUS (EFIAPI *EFI_TEXT_SET_ATTRIBUTE) ( IN struct _SIMPLE_TEXT_OUTPUT_INTERFACE *This, IN UINTN Attribute ); #define EFI_BLACK 0x00 #define EFI_BLUE 0x01 #define EFI_GREEN 0x02 #define EFI_CYAN (EFI_BLUE | EFI_GREEN) #define EFI_RED 0x04 #define EFI_MAGENTA (EFI_BLUE | EFI_RED) #define EFI_BROWN (EFI_GREEN | EFI_RED) #define EFI_LIGHTGRAY (EFI_BLUE | EFI_GREEN | EFI_RED) #define EFI_BRIGHT 0x08 #define EFI_DARKGRAY (EFI_BRIGHT) #define EFI_LIGHTBLUE (EFI_BLUE | EFI_BRIGHT) #define EFI_LIGHTGREEN (EFI_GREEN | EFI_BRIGHT) #define EFI_LIGHTCYAN (EFI_CYAN | EFI_BRIGHT) #define EFI_LIGHTRED (EFI_RED | EFI_BRIGHT) #define EFI_LIGHTMAGENTA (EFI_MAGENTA | EFI_BRIGHT) #define EFI_YELLOW (EFI_BROWN | EFI_BRIGHT) #define EFI_WHITE (EFI_BLUE | EFI_GREEN | EFI_RED | EFI_BRIGHT) #define EFI_TEXT_ATTR(f,b) ((f) | ((b) << 4)) #define EFI_BACKGROUND_BLACK 0x00 #define EFI_BACKGROUND_BLUE 0x10 #define EFI_BACKGROUND_GREEN 0x20 #define EFI_BACKGROUND_CYAN (EFI_BACKGROUND_BLUE | EFI_BACKGROUND_GREEN) #define EFI_BACKGROUND_RED 0x40 #define EFI_BACKGROUND_MAGENTA (EFI_BACKGROUND_BLUE | EFI_BACKGROUND_RED) #define EFI_BACKGROUND_BROWN (EFI_BACKGROUND_GREEN | EFI_BACKGROUND_RED) #define EFI_BACKGROUND_LIGHTGRAY (EFI_BACKGROUND_BLUE | EFI_BACKGROUND_GREEN | EFI_BACKGROUND_RED) typedef EFI_STATUS (EFIAPI *EFI_TEXT_CLEAR_SCREEN) ( IN struct _SIMPLE_TEXT_OUTPUT_INTERFACE *This ); typedef EFI_STATUS (EFIAPI *EFI_TEXT_SET_CURSOR_POSITION) ( IN struct _SIMPLE_TEXT_OUTPUT_INTERFACE *This, IN UINTN Column, IN UINTN Row ); typedef EFI_STATUS (EFIAPI *EFI_TEXT_ENABLE_CURSOR) ( IN struct _SIMPLE_TEXT_OUTPUT_INTERFACE *This, IN BOOLEAN Enable ); typedef struct { INT32 MaxMode; // current settings INT32 Mode; INT32 Attribute; INT32 CursorColumn; INT32 CursorRow; BOOLEAN 
CursorVisible; } SIMPLE_TEXT_OUTPUT_MODE; typedef struct _SIMPLE_TEXT_OUTPUT_INTERFACE { EFI_TEXT_RESET Reset; EFI_TEXT_OUTPUT_STRING OutputString; EFI_TEXT_TEST_STRING TestString; EFI_TEXT_QUERY_MODE QueryMode; EFI_TEXT_SET_MODE SetMode; EFI_TEXT_SET_ATTRIBUTE SetAttribute; EFI_TEXT_CLEAR_SCREEN ClearScreen; EFI_TEXT_SET_CURSOR_POSITION SetCursorPosition; EFI_TEXT_ENABLE_CURSOR EnableCursor; // Current mode SIMPLE_TEXT_OUTPUT_MODE *Mode; } SIMPLE_TEXT_OUTPUT_INTERFACE; // // Define's for required EFI Unicode Box Draw character // #define BOXDRAW_HORIZONTAL 0x2500 #define BOXDRAW_VERTICAL 0x2502 #define BOXDRAW_DOWN_RIGHT 0x250c #define BOXDRAW_DOWN_LEFT 0x2510 #define BOXDRAW_UP_RIGHT 0x2514 #define BOXDRAW_UP_LEFT 0x2518 #define BOXDRAW_VERTICAL_RIGHT 0x251c #define BOXDRAW_VERTICAL_LEFT 0x2524 #define BOXDRAW_DOWN_HORIZONTAL 0x252c #define BOXDRAW_UP_HORIZONTAL 0x2534 #define BOXDRAW_VERTICAL_HORIZONTAL 0x253c #define BOXDRAW_DOUBLE_HORIZONTAL 0x2550 #define BOXDRAW_DOUBLE_VERTICAL 0x2551 #define BOXDRAW_DOWN_RIGHT_DOUBLE 0x2552 #define BOXDRAW_DOWN_DOUBLE_RIGHT 0x2553 #define BOXDRAW_DOUBLE_DOWN_RIGHT 0x2554 #define BOXDRAW_DOWN_LEFT_DOUBLE 0x2555 #define BOXDRAW_DOWN_DOUBLE_LEFT 0x2556 #define BOXDRAW_DOUBLE_DOWN_LEFT 0x2557 #define BOXDRAW_UP_RIGHT_DOUBLE 0x2558 #define BOXDRAW_UP_DOUBLE_RIGHT 0x2559 #define BOXDRAW_DOUBLE_UP_RIGHT 0x255a #define BOXDRAW_UP_LEFT_DOUBLE 0x255b #define BOXDRAW_UP_DOUBLE_LEFT 0x255c #define BOXDRAW_DOUBLE_UP_LEFT 0x255d #define BOXDRAW_VERTICAL_RIGHT_DOUBLE 0x255e #define BOXDRAW_VERTICAL_DOUBLE_RIGHT 0x255f #define BOXDRAW_DOUBLE_VERTICAL_RIGHT 0x2560 #define BOXDRAW_VERTICAL_LEFT_DOUBLE 0x2561 #define BOXDRAW_VERTICAL_DOUBLE_LEFT 0x2562 #define BOXDRAW_DOUBLE_VERTICAL_LEFT 0x2563 #define BOXDRAW_DOWN_HORIZONTAL_DOUBLE 0x2564 #define BOXDRAW_DOWN_DOUBLE_HORIZONTAL 0x2565 #define BOXDRAW_DOUBLE_DOWN_HORIZONTAL 0x2566 #define BOXDRAW_UP_HORIZONTAL_DOUBLE 0x2567 #define BOXDRAW_UP_DOUBLE_HORIZONTAL 0x2568 #define BOXDRAW_DOUBLE_UP_HORIZONTAL 0x2569 #define BOXDRAW_VERTICAL_HORIZONTAL_DOUBLE 0x256a #define BOXDRAW_VERTICAL_DOUBLE_HORIZONTAL 0x256b #define BOXDRAW_DOUBLE_VERTICAL_HORIZONTAL 0x256c // // EFI Required Block Elements Code Chart // #define BLOCKELEMENT_FULL_BLOCK 0x2588 #define BLOCKELEMENT_LIGHT_SHADE 0x2591 // // EFI Required Geometric Shapes Code Chart // #define GEOMETRICSHAPE_UP_TRIANGLE 0x25b2 #define GEOMETRICSHAPE_RIGHT_TRIANGLE 0x25ba #define GEOMETRICSHAPE_DOWN_TRIANGLE 0x25bc #define GEOMETRICSHAPE_LEFT_TRIANGLE 0x25c4 // // EFI Required Arrow shapes // #define ARROW_UP 0x2191 #define ARROW_DOWN 0x2193 // // Text input protocol // #define SIMPLE_TEXT_INPUT_PROTOCOL \ { 0x387477c1, 0x69c7, 0x11d2, {0x8e, 0x39, 0x0, 0xa0, 0xc9, 0x69, 0x72, 0x3b} } INTERFACE_DECL(_SIMPLE_INPUT_INTERFACE); typedef struct { UINT16 ScanCode; CHAR16 UnicodeChar; } EFI_INPUT_KEY; // // Baseline unicode control chars // #define CHAR_NULL 0x0000 #define CHAR_BACKSPACE 0x0008 #define CHAR_TAB 0x0009 #define CHAR_LINEFEED 0x000A #define CHAR_CARRIAGE_RETURN 0x000D // // Scan codes for base line keys // #define SCAN_NULL 0x0000 #define SCAN_UP 0x0001 #define SCAN_DOWN 0x0002 #define SCAN_RIGHT 0x0003 #define SCAN_LEFT 0x0004 #define SCAN_HOME 0x0005 #define SCAN_END 0x0006 #define SCAN_INSERT 0x0007 #define SCAN_DELETE 0x0008 #define SCAN_PAGE_UP 0x0009 #define SCAN_PAGE_DOWN 0x000A #define SCAN_F1 0x000B #define SCAN_F2 0x000C #define SCAN_F3 0x000D #define SCAN_F4 0x000E #define SCAN_F5 0x000F #define SCAN_F6 0x0010 #define SCAN_F7 0x0011 #define SCAN_F8 0x0012 
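/*
 * Illustrative sketch of driving the Simple Text Output protocol declared
 * above. It is not part of the protocol definition itself: the function name
 * and the "cout" parameter are hypothetical, and a real caller would normally
 * obtain the interface pointer from the EFI system table (see efiapi.h later
 * in this tree). EFI_STATUS, CHAR16 and FALSE come from efidef.h.
 */
static inline EFI_STATUS example_print_banner(SIMPLE_TEXT_OUTPUT_INTERFACE *cout)
{
    /* Build the CHAR16 (UCS-2) string explicitly rather than relying on L"". */
    static CHAR16 banner[] = { 'X', 'e', 'n', '\r', '\n', 0 };

    /* Reset the console; FALSE skips extended hardware verification. */
    cout->Reset(cout, FALSE);

    /* White text on a blue background: EFI_TEXT_ATTR() packs fg | (bg << 4). */
    cout->SetAttribute(cout, EFI_TEXT_ATTR(EFI_WHITE, EFI_BLUE));

    /* OutputString() expects a NUL-terminated CHAR16 string. */
    return cout->OutputString(cout, banner);
}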
#define SCAN_F9 0x0013 #define SCAN_F10 0x0014 #define SCAN_ESC 0x0017 typedef EFI_STATUS (EFIAPI *EFI_INPUT_RESET) ( IN struct _SIMPLE_INPUT_INTERFACE *This, IN BOOLEAN ExtendedVerification ); typedef EFI_STATUS (EFIAPI *EFI_INPUT_READ_KEY) ( IN struct _SIMPLE_INPUT_INTERFACE *This, OUT EFI_INPUT_KEY *Key ); typedef struct _SIMPLE_INPUT_INTERFACE { EFI_INPUT_RESET Reset; EFI_INPUT_READ_KEY ReadKeyStroke; EFI_EVENT WaitForKey; } SIMPLE_INPUT_INTERFACE; #endif xen-4.4.0/xen/include/efi/efipciio.h0000664000175000017500000001515612307313555015373 0ustar smbsmb#ifndef _EFI_PCI_IO_H #define _EFI_PCI_IO_H #define EFI_PCI_IO_PROTOCOL \ { 0x4cf5b200, 0x68b8, 0x4ca5, {0x9e, 0xec, 0xb2, 0x3e, 0x3f, 0x50, 0x02, 0x9a} } INTERFACE_DECL(_EFI_PCI_IO); typedef enum { EfiPciIoWidthUint8, EfiPciIoWidthUint16, EfiPciIoWidthUint32, EfiPciIoWidthUint64, EfiPciIoWidthFifoUint8, EfiPciIoWidthFifoUint16, EfiPciIoWidthFifoUint32, EfiPciIoWidthFifoUint64, EfiPciIoWidthFillUint8, EfiPciIoWidthFillUint16, EfiPciIoWidthFillUint32, EfiPciIoWidthFillUint64, EfiPciIoWidthMaximum } EFI_PCI_IO_PROTOCOL_WIDTH; #define EFI_PCI_IO_PASS_THROUGH_BAR 0xff typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_POLL_IO_MEM) ( IN struct _EFI_PCI_IO *This, IN EFI_PCI_IO_PROTOCOL_WIDTH Width, IN UINT8 BarIndex, IN UINT64 Offset, IN UINT64 Mask, IN UINT64 Value, IN UINT64 Delay, OUT UINT64 *Result ); typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_IO_MEM) ( IN struct _EFI_PCI_IO *This, IN EFI_PCI_IO_PROTOCOL_WIDTH Width, IN UINT8 BarIndex, IN UINT64 Offset, IN UINTN Count, IN OUT VOID *Buffer ); typedef struct { EFI_PCI_IO_PROTOCOL_IO_MEM Read; EFI_PCI_IO_PROTOCOL_IO_MEM Write; } EFI_PCI_IO_PROTOCOL_ACCESS; typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_CONFIG) ( IN struct _EFI_PCI_IO *This, IN EFI_PCI_IO_PROTOCOL_WIDTH Width, IN UINT32 Offset, IN UINTN Count, IN OUT VOID *Buffer ); typedef struct { EFI_PCI_IO_PROTOCOL_CONFIG Read; EFI_PCI_IO_PROTOCOL_CONFIG Write; } EFI_PCI_IO_PROTOCOL_CONFIG_ACCESS; typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_COPY_MEM) ( IN struct _EFI_PCI_IO *This, IN EFI_PCI_IO_PROTOCOL_WIDTH Width, IN UINT8 DestBarIndex, IN UINT64 DestOffset, IN UINT8 SrcBarIndex, IN UINT64 SrcOffset, IN UINTN Count ); typedef enum { EfiPciIoOperationBusMasterRead, EfiPciIoOperationBusMasterWrite, EfiPciIoOperationBusMasterCommonBuffer, EfiPciIoOperationMaximum } EFI_PCI_IO_PROTOCOL_OPERATION; typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_MAP) ( IN struct _EFI_PCI_IO *This, IN EFI_PCI_IO_PROTOCOL_OPERATION Operation, IN VOID *HostAddress, IN OUT UINTN *NumberOfBytes, OUT EFI_PHYSICAL_ADDRESS *DeviceAddress, OUT VOID **Mapping ); typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_UNMAP) ( IN struct _EFI_PCI_IO *This, IN VOID *Mapping ); typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_ALLOCATE_BUFFER) ( IN struct _EFI_PCI_IO *This, IN EFI_ALLOCATE_TYPE Type, IN EFI_MEMORY_TYPE MemoryType, IN UINTN Pages, OUT VOID **HostAddress, IN UINT64 Attributes ); typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_FREE_BUFFER) ( IN struct _EFI_PCI_IO *This, IN UINTN Pages, IN VOID *HostAddress ); typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_FLUSH) ( IN struct _EFI_PCI_IO *This ); typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_GET_LOCATION) ( IN struct _EFI_PCI_IO *This, OUT UINTN *SegmentNumber, OUT UINTN *BusNumber, OUT UINTN *DeviceNumber, OUT UINTN *FunctionNumber ); #define EFI_PCI_IO_ATTRIBUTE_ISA_IO 0x0002 #define EFI_PCI_IO_ATTRIBUTE_VGA_PALETTE_IO 0x0004 #define EFI_PCI_IO_ATTRIBUTE_VGA_MEMORY 0x0008 #define EFI_PCI_IO_ATTRIBUTE_VGA_IO 
0x0010 #define EFI_PCI_IO_ATTRIBUTE_IDE_PRIMARY_IO 0x0020 #define EFI_PCI_IO_ATTRIBUTE_IDE_SECONDARY_IO 0x0040 #define EFI_PCI_IO_ATTRIBUTE_MEMORY_WRITE_COMBINE 0x0080 #define EFI_PCI_IO_ATTRIBUTE_IO 0x0100 #define EFI_PCI_IO_ATTRIBUTE_MEMORY 0x0200 #define EFI_PCI_IO_ATTRIBUTE_BUS_MASTER 0x0400 #define EFI_PCI_IO_ATTRIBUTE_MEMORY_CACHED 0x0800 #define EFI_PCI_IO_ATTRIBUTE_MEMORY_DISABLE 0x1000 #define EFI_PCI_IO_ATTRIBUTE_EMBEDDED_DEVICE 0x2000 #define EFI_PCI_IO_ATTRIBUTE_EMBEDDED_ROM 0x4000 #define EFI_PCI_IO_ATTRIBUTE_DUAL_ADDRESS_CYCLE 0x8000 #define EFI_PCI_IO_ATTRIBUTE_ISA_IO_16 0x10000 #define EFI_PCI_IO_ATTRIBUTE_VGA_PALETTE_IO_16 0x20000 #define EFI_PCI_IO_ATTRIBUTE_VGA_IO_16 0x40000 typedef enum { EfiPciIoAttributeOperationGet, EfiPciIoAttributeOperationSet, EfiPciIoAttributeOperationEnable, EfiPciIoAttributeOperationDisable, EfiPciIoAttributeOperationSupported, EfiPciIoAttributeOperationMaximum } EFI_PCI_IO_PROTOCOL_ATTRIBUTE_OPERATION; typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_ATTRIBUTES) ( IN struct _EFI_PCI_IO *This, IN EFI_PCI_IO_PROTOCOL_ATTRIBUTE_OPERATION Operation, IN UINT64 Attributes, OUT UINT64 *Result OPTIONAL ); typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_GET_BAR_ATTRIBUTES) ( IN struct _EFI_PCI_IO *This, IN UINT8 BarIndex, OUT UINT64 *Supports OPTIONAL, OUT VOID **Resources OPTIONAL ); typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_SET_BAR_ATTRIBUTES) ( IN struct _EFI_PCI_IO *This, IN UINT64 Attributes, IN UINT8 BarIndex, IN OUT UINT64 *Offset, IN OUT UINT64 *Length ); typedef struct _EFI_PCI_IO { EFI_PCI_IO_PROTOCOL_POLL_IO_MEM PollMem; EFI_PCI_IO_PROTOCOL_POLL_IO_MEM PollIo; EFI_PCI_IO_PROTOCOL_ACCESS Mem; EFI_PCI_IO_PROTOCOL_ACCESS Io; EFI_PCI_IO_PROTOCOL_CONFIG_ACCESS Pci; EFI_PCI_IO_PROTOCOL_COPY_MEM CopyMem; EFI_PCI_IO_PROTOCOL_MAP Map; EFI_PCI_IO_PROTOCOL_UNMAP Unmap; EFI_PCI_IO_PROTOCOL_ALLOCATE_BUFFER AllocateBuffer; EFI_PCI_IO_PROTOCOL_FREE_BUFFER FreeBuffer; EFI_PCI_IO_PROTOCOL_FLUSH Flush; EFI_PCI_IO_PROTOCOL_GET_LOCATION GetLocation; EFI_PCI_IO_PROTOCOL_ATTRIBUTES Attributes; EFI_PCI_IO_PROTOCOL_GET_BAR_ATTRIBUTES GetBarAttributes; EFI_PCI_IO_PROTOCOL_SET_BAR_ATTRIBUTES SetBarAttributes; UINT64 RomSize; VOID *RomImage; } EFI_PCI_IO; #endif /* _EFI_PCI_IO_H */ xen-4.4.0/xen/include/efi/eficapsule.h0000664000175000017500000000430312307313555015714 0ustar smbsmb/*++ Copyright (c) 2004 - 2007, Intel Corporation All rights reserved. This program and the accompanying materials are licensed and made available under the terms and conditions of the BSD License which accompanies this distribution. The full text of the license may be found at http://opensource.org/licenses/bsd-license.php THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. 
Module Name: EfiCapsule.h Abstract: Defines for the EFI Capsule functionality --*/ #ifndef _EFI_CAPSULE_H #define _EFI_CAPSULE_H #define CAPSULE_BLOCK_DESCRIPTOR_SIGNATURE EFI_SIGNATURE_32 ('C', 'B', 'D', 'S') typedef struct { EFI_GUID OemGuid; UINT32 HeaderSize; // // UINT8 OemHdrData[]; // } EFI_CAPSULE_OEM_HEADER; #define MAX_SUPPORT_CAPSULE_NUM 50 #define CAPSULE_FLAGS_PERSIST_ACROSS_RESET 0x00010000 #define CAPSULE_FLAGS_POPULATE_SYSTEM_TABLE 0x00020000 typedef struct { UINT64 Length; union { EFI_PHYSICAL_ADDRESS DataBlock; EFI_PHYSICAL_ADDRESS ContinuationPointer; } Union; } EFI_CAPSULE_BLOCK_DESCRIPTOR; typedef struct { EFI_GUID CapsuleGuid; UINT32 HeaderSize; UINT32 Flags; UINT32 CapsuleImageSize; } EFI_CAPSULE_HEADER; typedef struct { UINT32 CapsuleArrayNumber; VOID* CapsulePtr[1]; } EFI_CAPSULE_TABLE; // // Bits in the flags field of the capsule header // #define EFI_CAPSULE_HEADER_FLAG_SETUP 0x00000001 // supports setup changes // // This is the GUID of the capsule header of the image on disk. // #define EFI_CAPSULE_GUID \ { \ 0x3B6686BD, 0x0D76, 0x4030, 0xB7, 0x0E, 0xB5, 0x51, 0x9E, 0x2F, 0xC5, 0xA0 \ } // // This is the GUID of the file created by the capsule application that contains // the path to the device(s) to update. // #define EFI_PATH_FILE_NAME_GUID \ { \ 0x7644C181, 0xFA6E, 0x46DA, 0x80, 0xCB, 0x04, 0xB9, 0x90, 0x40, 0x62, 0xE8 \ } // // This is the GUID of the configuration results file created by the capsule // application. // #define EFI_CONFIG_FILE_NAME_GUID \ { \ 0x98B8D59B, 0xE8BA, 0x48EE, 0x98, 0xDD, 0xC2, 0x95, 0x39, 0x2F, 0x1E, 0xDB \ } #endif xen-4.4.0/xen/include/efi/efidevp.h0000664000175000017500000003332412307313555015223 0ustar smbsmb#ifndef _DEVPATH_H #define _DEVPATH_H /*++ Copyright (c) 1998 Intel Corporation Module Name: devpath.h Abstract: Defines for parsing the EFI Device Path structures Revision History --*/ // // Device Path structures - Section C // typedef struct _EFI_DEVICE_PATH { UINT8 Type; UINT8 SubType; UINT8 Length[2]; } EFI_DEVICE_PATH; #define EFI_DP_TYPE_MASK 0x7F #define EFI_DP_TYPE_UNPACKED 0x80 //#define END_DEVICE_PATH_TYPE 0xff #define END_DEVICE_PATH_TYPE 0x7f //#define END_DEVICE_PATH_TYPE_UNPACKED 0x7f #define END_ENTIRE_DEVICE_PATH_SUBTYPE 0xff #define END_INSTANCE_DEVICE_PATH_SUBTYPE 0x01 #define END_DEVICE_PATH_LENGTH (sizeof(EFI_DEVICE_PATH)) #define DP_IS_END_TYPE(a) #define DP_IS_END_SUBTYPE(a) ( ((a)->SubType == END_ENTIRE_DEVICE_PATH_SUBTYPE ) #define DevicePathType(a) ( ((a)->Type) & EFI_DP_TYPE_MASK ) #define DevicePathSubType(a) ( (a)->SubType ) #define DevicePathNodeLength(a) ( ((a)->Length[0]) | ((a)->Length[1] << 8) ) #define NextDevicePathNode(a) ( (EFI_DEVICE_PATH *) ( ((UINT8 *) (a)) + DevicePathNodeLength(a))) //#define IsDevicePathEndType(a) ( DevicePathType(a) == END_DEVICE_PATH_TYPE_UNPACKED ) #define IsDevicePathEndType(a) ( DevicePathType(a) == END_DEVICE_PATH_TYPE ) #define IsDevicePathEndSubType(a) ( (a)->SubType == END_ENTIRE_DEVICE_PATH_SUBTYPE ) #define IsDevicePathEnd(a) ( IsDevicePathEndType(a) && IsDevicePathEndSubType(a) ) #define IsDevicePathUnpacked(a) ( (a)->Type & EFI_DP_TYPE_UNPACKED ) #define SetDevicePathNodeLength(a,l) { \ (a)->Length[0] = (UINT8) (l); \ (a)->Length[1] = (UINT8) ((l) >> 8); \ } #define SetDevicePathEndNode(a) { \ (a)->Type = END_DEVICE_PATH_TYPE; \ (a)->SubType = END_ENTIRE_DEVICE_PATH_SUBTYPE; \ (a)->Length[0] = sizeof(EFI_DEVICE_PATH); \ (a)->Length[1] = 0; \ } /* * */ #define HARDWARE_DEVICE_PATH 0x01 #define HW_PCI_DP 0x01 typedef struct _PCI_DEVICE_PATH { 
EFI_DEVICE_PATH Header; UINT8 Function; UINT8 Device; } PCI_DEVICE_PATH; #define HW_PCCARD_DP 0x02 typedef struct _PCCARD_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT8 SocketNumber; } PCCARD_DEVICE_PATH; #define HW_MEMMAP_DP 0x03 typedef struct _MEMMAP_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT32 MemoryType; EFI_PHYSICAL_ADDRESS StartingAddress; EFI_PHYSICAL_ADDRESS EndingAddress; } MEMMAP_DEVICE_PATH; #define HW_VENDOR_DP 0x04 typedef struct _VENDOR_DEVICE_PATH { EFI_DEVICE_PATH Header; EFI_GUID Guid; } VENDOR_DEVICE_PATH; #define UNKNOWN_DEVICE_GUID \ { 0xcf31fac5, 0xc24e, 0x11d2, {0x85, 0xf3, 0x0, 0xa0, 0xc9, 0x3e, 0xc9, 0x3b} } typedef struct _UKNOWN_DEVICE_VENDOR_DP { VENDOR_DEVICE_PATH DevicePath; UINT8 LegacyDriveLetter; } UNKNOWN_DEVICE_VENDOR_DEVICE_PATH; #define HW_CONTROLLER_DP 0x05 typedef struct _CONTROLLER_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT32 Controller; } CONTROLLER_DEVICE_PATH; /* * */ #define ACPI_DEVICE_PATH 0x02 #define ACPI_DP 0x01 typedef struct _ACPI_HID_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT32 HID; UINT32 UID; } ACPI_HID_DEVICE_PATH; #define EXPANDED_ACPI_DP 0x02 typedef struct _EXPANDED_ACPI_HID_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT32 HID; UINT32 UID; UINT32 CID; UINT8 HidStr[1]; } EXPANDED_ACPI_HID_DEVICE_PATH; // // EISA ID Macro // EISA ID Definition 32-bits // bits[15:0] - three character compressed ASCII EISA ID. // bits[31:16] - binary number // Compressed ASCII is 5 bits per character 0b00001 = 'A' 0b11010 = 'Z' // #define PNP_EISA_ID_CONST 0x41d0 #define EISA_ID(_Name, _Num) ((UINT32) ((_Name) | (_Num) << 16)) #define EISA_PNP_ID(_PNPId) (EISA_ID(PNP_EISA_ID_CONST, (_PNPId))) #define PNP_EISA_ID_MASK 0xffff #define EISA_ID_TO_NUM(_Id) ((_Id) >> 16) /* * */ #define MESSAGING_DEVICE_PATH 0x03 #define MSG_ATAPI_DP 0x01 typedef struct _ATAPI_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT8 PrimarySecondary; UINT8 SlaveMaster; UINT16 Lun; } ATAPI_DEVICE_PATH; #define MSG_SCSI_DP 0x02 typedef struct _SCSI_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT16 Pun; UINT16 Lun; } SCSI_DEVICE_PATH; #define MSG_FIBRECHANNEL_DP 0x03 typedef struct _FIBRECHANNEL_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT32 Reserved; UINT64 WWN; UINT64 Lun; } FIBRECHANNEL_DEVICE_PATH; #define MSG_1394_DP 0x04 typedef struct _F1394_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT32 Reserved; UINT64 Guid; } F1394_DEVICE_PATH; #define MSG_USB_DP 0x05 typedef struct _USB_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT8 Port; UINT8 Endpoint; } USB_DEVICE_PATH; #define MSG_USB_CLASS_DP 0x0F typedef struct _USB_CLASS_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT16 VendorId; UINT16 ProductId; UINT8 DeviceClass; UINT8 DeviceSubclass; UINT8 DeviceProtocol; } USB_CLASS_DEVICE_PATH; #define MSG_I2O_DP 0x06 typedef struct _I2O_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT32 Tid; } I2O_DEVICE_PATH; #define MSG_MAC_ADDR_DP 0x0b typedef struct _MAC_ADDR_DEVICE_PATH { EFI_DEVICE_PATH Header; EFI_MAC_ADDRESS MacAddress; UINT8 IfType; } MAC_ADDR_DEVICE_PATH; #define MSG_IPv4_DP 0x0c typedef struct _IPv4_DEVICE_PATH { EFI_DEVICE_PATH Header; EFI_IPv4_ADDRESS LocalIpAddress; EFI_IPv4_ADDRESS RemoteIpAddress; UINT16 LocalPort; UINT16 RemotePort; UINT16 Protocol; BOOLEAN StaticIpAddress; } IPv4_DEVICE_PATH; #define MSG_IPv6_DP 0x0d typedef struct _IPv6_DEVICE_PATH { EFI_DEVICE_PATH Header; EFI_IPv6_ADDRESS LocalIpAddress; EFI_IPv6_ADDRESS RemoteIpAddress; UINT16 LocalPort; UINT16 RemotePort; UINT16 Protocol; BOOLEAN StaticIpAddress; } IPv6_DEVICE_PATH; #define MSG_INFINIBAND_DP 0x09 typedef struct _INFINIBAND_DEVICE_PATH { 
EFI_DEVICE_PATH Header; UINT32 Reserved; UINT64 NodeGuid; UINT64 IocGuid; UINT64 DeviceId; } INFINIBAND_DEVICE_PATH; #define MSG_UART_DP 0x0e typedef struct _UART_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT32 Reserved; UINT64 BaudRate; UINT8 DataBits; UINT8 Parity; UINT8 StopBits; } UART_DEVICE_PATH; #define MSG_VENDOR_DP 0x0A /* Use VENDOR_DEVICE_PATH struct */ #define DEVICE_PATH_MESSAGING_PC_ANSI \ { 0xe0c14753, 0xf9be, 0x11d2, {0x9a, 0x0c, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d} } #define DEVICE_PATH_MESSAGING_VT_100 \ { 0xdfa66065, 0xb419, 0x11d3, {0x9a, 0x2d, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d} } #define MEDIA_DEVICE_PATH 0x04 #define MEDIA_HARDDRIVE_DP 0x01 typedef struct _HARDDRIVE_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT32 PartitionNumber; UINT64 PartitionStart; UINT64 PartitionSize; UINT8 Signature[16]; UINT8 MBRType; UINT8 SignatureType; } HARDDRIVE_DEVICE_PATH; #define MBR_TYPE_PCAT 0x01 #define MBR_TYPE_EFI_PARTITION_TABLE_HEADER 0x02 #define SIGNATURE_TYPE_MBR 0x01 #define SIGNATURE_TYPE_GUID 0x02 #define MEDIA_CDROM_DP 0x02 typedef struct _CDROM_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT32 BootEntry; UINT64 PartitionStart; UINT64 PartitionSize; } CDROM_DEVICE_PATH; #define MEDIA_VENDOR_DP 0x03 /* Use VENDOR_DEVICE_PATH struct */ #define MEDIA_FILEPATH_DP 0x04 typedef struct _FILEPATH_DEVICE_PATH { EFI_DEVICE_PATH Header; CHAR16 PathName[1]; } FILEPATH_DEVICE_PATH; #define SIZE_OF_FILEPATH_DEVICE_PATH EFI_FIELD_OFFSET(FILEPATH_DEVICE_PATH,PathName) #define MEDIA_PROTOCOL_DP 0x05 typedef struct _MEDIA_PROTOCOL_DEVICE_PATH { EFI_DEVICE_PATH Header; EFI_GUID Protocol; } MEDIA_PROTOCOL_DEVICE_PATH; #define BBS_DEVICE_PATH 0x05 #define BBS_BBS_DP 0x01 typedef struct _BBS_BBS_DEVICE_PATH { EFI_DEVICE_PATH Header; UINT16 DeviceType; UINT16 StatusFlag; CHAR8 String[1]; } BBS_BBS_DEVICE_PATH; /* DeviceType definitions - from BBS specification */ #define BBS_TYPE_FLOPPY 0x01 #define BBS_TYPE_HARDDRIVE 0x02 #define BBS_TYPE_CDROM 0x03 #define BBS_TYPE_PCMCIA 0x04 #define BBS_TYPE_USB 0x05 #define BBS_TYPE_EMBEDDED_NETWORK 0x06 #define BBS_TYPE_DEV 0x80 #define BBS_TYPE_UNKNOWN 0xFF typedef union { EFI_DEVICE_PATH DevPath; PCI_DEVICE_PATH Pci; PCCARD_DEVICE_PATH PcCard; MEMMAP_DEVICE_PATH MemMap; VENDOR_DEVICE_PATH Vendor; UNKNOWN_DEVICE_VENDOR_DEVICE_PATH UnknownVendor; CONTROLLER_DEVICE_PATH Controller; ACPI_HID_DEVICE_PATH Acpi; ATAPI_DEVICE_PATH Atapi; SCSI_DEVICE_PATH Scsi; FIBRECHANNEL_DEVICE_PATH FibreChannel; F1394_DEVICE_PATH F1394; USB_DEVICE_PATH Usb; USB_CLASS_DEVICE_PATH UsbClass; I2O_DEVICE_PATH I2O; MAC_ADDR_DEVICE_PATH MacAddr; IPv4_DEVICE_PATH Ipv4; IPv6_DEVICE_PATH Ipv6; INFINIBAND_DEVICE_PATH InfiniBand; UART_DEVICE_PATH Uart; HARDDRIVE_DEVICE_PATH HardDrive; CDROM_DEVICE_PATH CD; FILEPATH_DEVICE_PATH FilePath; MEDIA_PROTOCOL_DEVICE_PATH MediaProtocol; BBS_BBS_DEVICE_PATH Bbs; } EFI_DEV_PATH; typedef union { EFI_DEVICE_PATH *DevPath; PCI_DEVICE_PATH *Pci; PCCARD_DEVICE_PATH *PcCard; MEMMAP_DEVICE_PATH *MemMap; VENDOR_DEVICE_PATH *Vendor; UNKNOWN_DEVICE_VENDOR_DEVICE_PATH *UnknownVendor; CONTROLLER_DEVICE_PATH *Controller; ACPI_HID_DEVICE_PATH *Acpi; ATAPI_DEVICE_PATH *Atapi; SCSI_DEVICE_PATH *Scsi; FIBRECHANNEL_DEVICE_PATH *FibreChannel; F1394_DEVICE_PATH *F1394; USB_DEVICE_PATH *Usb; USB_CLASS_DEVICE_PATH *UsbClass; I2O_DEVICE_PATH *I2O; MAC_ADDR_DEVICE_PATH *MacAddr; IPv4_DEVICE_PATH *Ipv4; IPv6_DEVICE_PATH *Ipv6; INFINIBAND_DEVICE_PATH *InfiniBand; UART_DEVICE_PATH *Uart; HARDDRIVE_DEVICE_PATH *HardDrive; FILEPATH_DEVICE_PATH *FilePath; MEDIA_PROTOCOL_DEVICE_PATH 
*MediaProtocol; CDROM_DEVICE_PATH *CD; BBS_BBS_DEVICE_PATH *Bbs; } EFI_DEV_PATH_PTR; #endif xen-4.4.0/xen/include/efi/efierr.h0000664000175000017500000000407712307313555015060 0ustar smbsmb#ifndef _EFI_ERR_H #define _EFI_ERR_H /*++ Copyright (c) 1998 Intel Corporation Module Name: efierr.h Abstract: EFI error codes Revision History --*/ #define EFIWARN(a) (a) #define EFI_ERROR(a) (((INTN) a) < 0) #define EFI_SUCCESS 0 #define EFI_LOAD_ERROR EFIERR(1) #define EFI_INVALID_PARAMETER EFIERR(2) #define EFI_UNSUPPORTED EFIERR(3) #define EFI_BAD_BUFFER_SIZE EFIERR(4) #define EFI_BUFFER_TOO_SMALL EFIERR(5) #define EFI_NOT_READY EFIERR(6) #define EFI_DEVICE_ERROR EFIERR(7) #define EFI_WRITE_PROTECTED EFIERR(8) #define EFI_OUT_OF_RESOURCES EFIERR(9) #define EFI_VOLUME_CORRUPTED EFIERR(10) #define EFI_VOLUME_FULL EFIERR(11) #define EFI_NO_MEDIA EFIERR(12) #define EFI_MEDIA_CHANGED EFIERR(13) #define EFI_NOT_FOUND EFIERR(14) #define EFI_ACCESS_DENIED EFIERR(15) #define EFI_NO_RESPONSE EFIERR(16) #define EFI_NO_MAPPING EFIERR(17) #define EFI_TIMEOUT EFIERR(18) #define EFI_NOT_STARTED EFIERR(19) #define EFI_ALREADY_STARTED EFIERR(20) #define EFI_ABORTED EFIERR(21) #define EFI_ICMP_ERROR EFIERR(22) #define EFI_TFTP_ERROR EFIERR(23) #define EFI_PROTOCOL_ERROR EFIERR(24) #define EFI_INCOMPATIBLE_VERSION EFIERR(25) #define EFI_SECURITY_VIOLATION EFIERR(26) #define EFI_CRC_ERROR EFIERR(27) #define EFI_END_OF_MEDIA EFIERR(28) #define EFI_END_OF_FILE EFIERR(31) #define EFI_INVALID_LANGUAGE EFIERR(32) #define EFI_COMPROMISED_DATA EFIERR(33) #define EFI_WARN_UNKOWN_GLYPH EFIWARN(1) #define EFI_WARN_DELETE_FAILURE EFIWARN(2) #define EFI_WARN_WRITE_FAILURE EFIWARN(3) #define EFI_WARN_BUFFER_TOO_SMALL EFIWARN(4) #endif xen-4.4.0/xen/include/efi/efidef.h0000664000175000017500000001025712307313555015023 0ustar smbsmb#ifndef _EFI_DEF_H #define _EFI_DEF_H /*++ Copyright (c) 1998 Intel Corporation Module Name: efidef.h Abstract: EFI definitions Revision History --*/ typedef UINT16 CHAR16; typedef UINT8 CHAR8; typedef UINT8 BOOLEAN; #ifndef TRUE #define TRUE ((BOOLEAN) 1) #define FALSE ((BOOLEAN) 0) #endif #ifndef NULL #define NULL ((VOID *) 0) #endif typedef UINTN EFI_STATUS; typedef UINT64 EFI_LBA; typedef UINTN EFI_TPL; typedef VOID *EFI_HANDLE; typedef VOID *EFI_EVENT; // // Prototype argument decoration for EFI parameters to indicate // their direction // // IN - argument is passed into the function // OUT - argument (pointer) is returned from the function // OPTIONAL - argument is optional // #ifndef IN #define IN #define OUT #define OPTIONAL #endif // // A GUID // typedef struct { UINT32 Data1; UINT16 Data2; UINT16 Data3; UINT8 Data4[8]; } EFI_GUID; // // Time // typedef struct { UINT16 Year; // 1998 - 20XX UINT8 Month; // 1 - 12 UINT8 Day; // 1 - 31 UINT8 Hour; // 0 - 23 UINT8 Minute; // 0 - 59 UINT8 Second; // 0 - 59 UINT8 Pad1; UINT32 Nanosecond; // 0 - 999,999,999 INT16 TimeZone; // -1440 to 1440 or 2047 UINT8 Daylight; UINT8 Pad2; } EFI_TIME; // Bit definitions for EFI_TIME.Daylight #define EFI_TIME_ADJUST_DAYLIGHT 0x01 #define EFI_TIME_IN_DAYLIGHT 0x02 // Value definition for EFI_TIME.TimeZone #define EFI_UNSPECIFIED_TIMEZONE 0x07FF // // Networking // typedef struct { UINT8 Addr[4]; } EFI_IPv4_ADDRESS; typedef struct { UINT8 Addr[16]; } EFI_IPv6_ADDRESS; typedef struct { UINT8 Addr[32]; } EFI_MAC_ADDRESS; // // Memory // typedef UINT64 EFI_PHYSICAL_ADDRESS; typedef UINT64 EFI_VIRTUAL_ADDRESS; typedef enum { AllocateAnyPages, AllocateMaxAddress, AllocateAddress, MaxAllocateType } EFI_ALLOCATE_TYPE; //Preseve 
the attr on any range supplied. //ConventialMemory must have WB,SR,SW when supplied. //When allocating from ConventialMemory always make it WB,SR,SW //When returning to ConventialMemory always make it WB,SR,SW //When getting the memory map, or on RT for runtime types typedef enum { EfiReservedMemoryType, EfiLoaderCode, EfiLoaderData, EfiBootServicesCode, EfiBootServicesData, EfiRuntimeServicesCode, EfiRuntimeServicesData, EfiConventionalMemory, EfiUnusableMemory, EfiACPIReclaimMemory, EfiACPIMemoryNVS, EfiMemoryMappedIO, EfiMemoryMappedIOPortSpace, EfiPalCode, EfiMaxMemoryType } EFI_MEMORY_TYPE; // possible caching types for the memory range #define EFI_MEMORY_UC 0x0000000000000001 #define EFI_MEMORY_WC 0x0000000000000002 #define EFI_MEMORY_WT 0x0000000000000004 #define EFI_MEMORY_WB 0x0000000000000008 #define EFI_MEMORY_UCE 0x0000000000000010 // physical memory protection on range #define EFI_MEMORY_WP 0x0000000000001000 #define EFI_MEMORY_RP 0x0000000000002000 #define EFI_MEMORY_XP 0x0000000000004000 // range requires a runtime mapping #define EFI_MEMORY_RUNTIME 0x8000000000000000 #define EFI_MEMORY_DESCRIPTOR_VERSION 1 typedef struct { UINT32 Type; // Field size is 32 bits followed by 32 bit pad UINT32 Pad; EFI_PHYSICAL_ADDRESS PhysicalStart; // Field size is 64 bits EFI_VIRTUAL_ADDRESS VirtualStart; // Field size is 64 bits UINT64 NumberOfPages; // Field size is 64 bits UINT64 Attribute; // Field size is 64 bits } EFI_MEMORY_DESCRIPTOR; // // International Language // typedef UINT8 ISO_639_2; #define ISO_639_2_ENTRY_SIZE 3 // // // #define EFI_PAGE_SIZE 4096 #define EFI_PAGE_MASK 0xFFF #define EFI_PAGE_SHIFT 12 #define EFI_SIZE_TO_PAGES(a) \ ( ((a) >> EFI_PAGE_SHIFT) + ((a) & EFI_PAGE_MASK ? 1 : 0) ) #endif xen-4.4.0/xen/include/efi/efiprot.h0000664000175000017500000005022312307313555015246 0ustar smbsmb#ifndef _EFI_PROT_H #define _EFI_PROT_H /*++ Copyright (c) 1998 Intel Corporation Module Name: efiprot.h Abstract: EFI Protocols Revision History --*/ // // FPSWA library protocol // #define FPSWA_PROTOCOL \ { 0xc41b6531, 0x97b9, 0x11d3, {0x9a, 0x29, 0x0, 0x90, 0x27, 0x3f, 0xc1, 0x4d} } // // Device Path protocol // #define DEVICE_PATH_PROTOCOL \ { 0x9576e91, 0x6d3f, 0x11d2, {0x8e, 0x39, 0x0, 0xa0, 0xc9, 0x69, 0x72, 0x3b} } // // Block IO protocol // #define BLOCK_IO_PROTOCOL \ { 0x964e5b21, 0x6459, 0x11d2, {0x8e, 0x39, 0x0, 0xa0, 0xc9, 0x69, 0x72, 0x3b} } #define EFI_BLOCK_IO_INTERFACE_REVISION 0x00010000 INTERFACE_DECL(_EFI_BLOCK_IO); typedef EFI_STATUS (EFIAPI *EFI_BLOCK_RESET) ( IN struct _EFI_BLOCK_IO *This, IN BOOLEAN ExtendedVerification ); typedef EFI_STATUS (EFIAPI *EFI_BLOCK_READ) ( IN struct _EFI_BLOCK_IO *This, IN UINT32 MediaId, IN EFI_LBA LBA, IN UINTN BufferSize, OUT VOID *Buffer ); typedef EFI_STATUS (EFIAPI *EFI_BLOCK_WRITE) ( IN struct _EFI_BLOCK_IO *This, IN UINT32 MediaId, IN EFI_LBA LBA, IN UINTN BufferSize, IN VOID *Buffer ); typedef EFI_STATUS (EFIAPI *EFI_BLOCK_FLUSH) ( IN struct _EFI_BLOCK_IO *This ); typedef struct { UINT32 MediaId; BOOLEAN RemovableMedia; BOOLEAN MediaPresent; BOOLEAN LogicalPartition; BOOLEAN ReadOnly; BOOLEAN WriteCaching; UINT32 BlockSize; UINT32 IoAlign; EFI_LBA LastBlock; } EFI_BLOCK_IO_MEDIA; typedef struct _EFI_BLOCK_IO { UINT64 Revision; EFI_BLOCK_IO_MEDIA *Media; EFI_BLOCK_RESET Reset; EFI_BLOCK_READ ReadBlocks; EFI_BLOCK_WRITE WriteBlocks; EFI_BLOCK_FLUSH FlushBlocks; } EFI_BLOCK_IO; // // Disk Block IO protocol // #define DISK_IO_PROTOCOL \ { 0xce345171, 0xba0b, 0x11d2, {0x8e, 0x4f, 0x0, 0xa0, 0xc9, 0x69, 0x72, 0x3b} } #define 
EFI_DISK_IO_INTERFACE_REVISION 0x00010000 INTERFACE_DECL(_EFI_DISK_IO); typedef EFI_STATUS (EFIAPI *EFI_DISK_READ) ( IN struct _EFI_DISK_IO *This, IN UINT32 MediaId, IN UINT64 Offset, IN UINTN BufferSize, OUT VOID *Buffer ); typedef EFI_STATUS (EFIAPI *EFI_DISK_WRITE) ( IN struct _EFI_DISK_IO *This, IN UINT32 MediaId, IN UINT64 Offset, IN UINTN BufferSize, IN VOID *Buffer ); typedef struct _EFI_DISK_IO { UINT64 Revision; EFI_DISK_READ ReadDisk; EFI_DISK_WRITE WriteDisk; } EFI_DISK_IO; // // Simple file system protocol // #define SIMPLE_FILE_SYSTEM_PROTOCOL \ { 0x964e5b22, 0x6459, 0x11d2, {0x8e, 0x39, 0x0, 0xa0, 0xc9, 0x69, 0x72, 0x3b} } INTERFACE_DECL(_EFI_FILE_IO_INTERFACE); INTERFACE_DECL(_EFI_FILE_HANDLE); typedef EFI_STATUS (EFIAPI *EFI_VOLUME_OPEN) ( IN struct _EFI_FILE_IO_INTERFACE *This, OUT struct _EFI_FILE_HANDLE **Root ); #define EFI_FILE_IO_INTERFACE_REVISION 0x00010000 typedef struct _EFI_FILE_IO_INTERFACE { UINT64 Revision; EFI_VOLUME_OPEN OpenVolume; } EFI_FILE_IO_INTERFACE; // // // typedef EFI_STATUS (EFIAPI *EFI_FILE_OPEN) ( IN struct _EFI_FILE_HANDLE *File, OUT struct _EFI_FILE_HANDLE **NewHandle, IN CHAR16 *FileName, IN UINT64 OpenMode, IN UINT64 Attributes ); // Open modes #define EFI_FILE_MODE_READ 0x0000000000000001 #define EFI_FILE_MODE_WRITE 0x0000000000000002 #define EFI_FILE_MODE_CREATE 0x8000000000000000 // File attributes #define EFI_FILE_READ_ONLY 0x0000000000000001 #define EFI_FILE_HIDDEN 0x0000000000000002 #define EFI_FILE_SYSTEM 0x0000000000000004 #define EFI_FILE_RESERVIED 0x0000000000000008 #define EFI_FILE_DIRECTORY 0x0000000000000010 #define EFI_FILE_ARCHIVE 0x0000000000000020 #define EFI_FILE_VALID_ATTR 0x0000000000000037 typedef EFI_STATUS (EFIAPI *EFI_FILE_CLOSE) ( IN struct _EFI_FILE_HANDLE *File ); typedef EFI_STATUS (EFIAPI *EFI_FILE_DELETE) ( IN struct _EFI_FILE_HANDLE *File ); typedef EFI_STATUS (EFIAPI *EFI_FILE_READ) ( IN struct _EFI_FILE_HANDLE *File, IN OUT UINTN *BufferSize, OUT VOID *Buffer ); typedef EFI_STATUS (EFIAPI *EFI_FILE_WRITE) ( IN struct _EFI_FILE_HANDLE *File, IN OUT UINTN *BufferSize, IN VOID *Buffer ); typedef EFI_STATUS (EFIAPI *EFI_FILE_SET_POSITION) ( IN struct _EFI_FILE_HANDLE *File, IN UINT64 Position ); typedef EFI_STATUS (EFIAPI *EFI_FILE_GET_POSITION) ( IN struct _EFI_FILE_HANDLE *File, OUT UINT64 *Position ); typedef EFI_STATUS (EFIAPI *EFI_FILE_GET_INFO) ( IN struct _EFI_FILE_HANDLE *File, IN EFI_GUID *InformationType, IN OUT UINTN *BufferSize, OUT VOID *Buffer ); typedef EFI_STATUS (EFIAPI *EFI_FILE_SET_INFO) ( IN struct _EFI_FILE_HANDLE *File, IN EFI_GUID *InformationType, IN UINTN BufferSize, IN VOID *Buffer ); typedef EFI_STATUS (EFIAPI *EFI_FILE_FLUSH) ( IN struct _EFI_FILE_HANDLE *File ); #define EFI_FILE_HANDLE_REVISION 0x00010000 typedef struct _EFI_FILE_HANDLE { UINT64 Revision; EFI_FILE_OPEN Open; EFI_FILE_CLOSE Close; EFI_FILE_DELETE Delete; EFI_FILE_READ Read; EFI_FILE_WRITE Write; EFI_FILE_GET_POSITION GetPosition; EFI_FILE_SET_POSITION SetPosition; EFI_FILE_GET_INFO GetInfo; EFI_FILE_SET_INFO SetInfo; EFI_FILE_FLUSH Flush; } EFI_FILE, *EFI_FILE_HANDLE; // // File information types // #define EFI_FILE_INFO_ID \ { 0x9576e92, 0x6d3f, 0x11d2, {0x8e, 0x39, 0x0, 0xa0, 0xc9, 0x69, 0x72, 0x3b} } typedef struct { UINT64 Size; UINT64 FileSize; UINT64 PhysicalSize; EFI_TIME CreateTime; EFI_TIME LastAccessTime; EFI_TIME ModificationTime; UINT64 Attribute; CHAR16 FileName[1]; } EFI_FILE_INFO; // // The FileName field of the EFI_FILE_INFO data structure is variable length. 
// Whenever code needs to know the size of the EFI_FILE_INFO data structure, it needs to // be the size of the data structure without the FileName field. The following macro // computes this size correctly no matter how big the FileName array is declared. // This is required to make the EFI_FILE_INFO data structure ANSI compliant. // #define SIZE_OF_EFI_FILE_INFO EFI_FIELD_OFFSET(EFI_FILE_INFO,FileName) #define EFI_FILE_SYSTEM_INFO_ID \ { 0x9576e93, 0x6d3f, 0x11d2, {0x8e, 0x39, 0x0, 0xa0, 0xc9, 0x69, 0x72, 0x3b} } typedef struct { UINT64 Size; BOOLEAN ReadOnly; UINT64 VolumeSize; UINT64 FreeSpace; UINT32 BlockSize; CHAR16 VolumeLabel[1]; } EFI_FILE_SYSTEM_INFO; // // The VolumeLabel field of the EFI_FILE_SYSTEM_INFO data structure is variable length. // Whenever code needs to know the size of the EFI_FILE_SYSTEM_INFO data structure, it needs // to be the size of the data structure without the VolumeLabel field. The following macro // computes this size correctly no matter how big the VolumeLabel array is declared. // This is required to make the EFI_FILE_SYSTEM_INFO data structure ANSI compliant. // #define SIZE_OF_EFI_FILE_SYSTEM_INFO EFI_FIELD_OFFSET(EFI_FILE_SYSTEM_INFO,VolumeLabel) #define EFI_FILE_SYSTEM_VOLUME_LABEL_INFO_ID \ { 0xDB47D7D3,0xFE81, 0x11d3, {0x9A, 0x35, 0x00, 0x90, 0x27, 0x3F, 0xC1, 0x4D} } typedef struct { CHAR16 VolumeLabel[1]; } EFI_FILE_SYSTEM_VOLUME_LABEL_INFO; #define SIZE_OF_EFI_FILE_SYSTEM_VOLUME_LABEL_INFO EFI_FIELD_OFFSET(EFI_FILE_SYSTEM_VOLUME_LABEL_INFO,VolumeLabel) // // Load file protocol // #define LOAD_FILE_PROTOCOL \ { 0x56EC3091, 0x954C, 0x11d2, {0x8E, 0x3F, 0x00, 0xA0, 0xC9, 0x69, 0x72, 0x3B} } INTERFACE_DECL(_EFI_LOAD_FILE_INTERFACE); typedef EFI_STATUS (EFIAPI *EFI_LOAD_FILE) ( IN struct _EFI_LOAD_FILE_INTERFACE *This, IN EFI_DEVICE_PATH *FilePath, IN BOOLEAN BootPolicy, IN OUT UINTN *BufferSize, IN VOID *Buffer OPTIONAL ); typedef struct _EFI_LOAD_FILE_INTERFACE { EFI_LOAD_FILE LoadFile; } EFI_LOAD_FILE_INTERFACE; // // Device IO protocol // #define DEVICE_IO_PROTOCOL \ { 0xaf6ac311, 0x84c3, 0x11d2, {0x8e, 0x3c, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b} } INTERFACE_DECL(_EFI_DEVICE_IO_INTERFACE); typedef enum { IO_UINT8, IO_UINT16, IO_UINT32, IO_UINT64, // // Specification Change: Copy from MMIO to MMIO vs.
MMIO to buffer, buffer to MMIO // MMIO_COPY_UINT8, MMIO_COPY_UINT16, MMIO_COPY_UINT32, MMIO_COPY_UINT64 } EFI_IO_WIDTH; #define EFI_PCI_ADDRESS(_bus,_dev,_func) \ ( (UINT64) ( (((UINTN)_bus) << 24) + (((UINTN)_dev) << 16) + (((UINTN)_func) << 8) ) ) typedef EFI_STATUS (EFIAPI *EFI_DEVICE_IO) ( IN struct _EFI_DEVICE_IO_INTERFACE *This, IN EFI_IO_WIDTH Width, IN UINT64 Address, IN UINTN Count, IN OUT VOID *Buffer ); typedef struct { EFI_DEVICE_IO Read; EFI_DEVICE_IO Write; } EFI_IO_ACCESS; typedef EFI_STATUS (EFIAPI *EFI_PCI_DEVICE_PATH) ( IN struct _EFI_DEVICE_IO_INTERFACE *This, IN UINT64 Address, IN OUT EFI_DEVICE_PATH **PciDevicePath ); typedef enum { EfiBusMasterRead, EfiBusMasterWrite, EfiBusMasterCommonBuffer } EFI_IO_OPERATION_TYPE; typedef EFI_STATUS (EFIAPI *EFI_IO_MAP) ( IN struct _EFI_DEVICE_IO_INTERFACE *This, IN EFI_IO_OPERATION_TYPE Operation, IN EFI_PHYSICAL_ADDRESS *HostAddress, IN OUT UINTN *NumberOfBytes, OUT EFI_PHYSICAL_ADDRESS *DeviceAddress, OUT VOID **Mapping ); typedef EFI_STATUS (EFIAPI *EFI_IO_UNMAP) ( IN struct _EFI_DEVICE_IO_INTERFACE *This, IN VOID *Mapping ); typedef EFI_STATUS (EFIAPI *EFI_IO_ALLOCATE_BUFFER) ( IN struct _EFI_DEVICE_IO_INTERFACE *This, IN EFI_ALLOCATE_TYPE Type, IN EFI_MEMORY_TYPE MemoryType, IN UINTN Pages, IN OUT EFI_PHYSICAL_ADDRESS *HostAddress ); typedef EFI_STATUS (EFIAPI *EFI_IO_FLUSH) ( IN struct _EFI_DEVICE_IO_INTERFACE *This ); typedef EFI_STATUS (EFIAPI *EFI_IO_FREE_BUFFER) ( IN struct _EFI_DEVICE_IO_INTERFACE *This, IN UINTN Pages, IN EFI_PHYSICAL_ADDRESS HostAddress ); typedef struct _EFI_DEVICE_IO_INTERFACE { EFI_IO_ACCESS Mem; EFI_IO_ACCESS Io; EFI_IO_ACCESS Pci; EFI_IO_MAP Map; EFI_PCI_DEVICE_PATH PciDevicePath; EFI_IO_UNMAP Unmap; EFI_IO_ALLOCATE_BUFFER AllocateBuffer; EFI_IO_FLUSH Flush; EFI_IO_FREE_BUFFER FreeBuffer; } EFI_DEVICE_IO_INTERFACE; // // Unicode Collation protocol // #define UNICODE_COLLATION_PROTOCOL \ { 0x1d85cd7f, 0xf43d, 0x11d2, {0x9a, 0xc, 0x0, 0x90, 0x27, 0x3f, 0xc1, 0x4d} } #define UNICODE_BYTE_ORDER_MARK (CHAR16)(0xfeff) INTERFACE_DECL(_EFI_UNICODE_COLLATION_INTERFACE); typedef INTN (EFIAPI *EFI_UNICODE_STRICOLL) ( IN struct _EFI_UNICODE_COLLATION_INTERFACE *This, IN CHAR16 *s1, IN CHAR16 *s2 ); typedef BOOLEAN (EFIAPI *EFI_UNICODE_METAIMATCH) ( IN struct _EFI_UNICODE_COLLATION_INTERFACE *This, IN CHAR16 *String, IN CHAR16 *Pattern ); typedef VOID (EFIAPI *EFI_UNICODE_STRLWR) ( IN struct _EFI_UNICODE_COLLATION_INTERFACE *This, IN OUT CHAR16 *Str ); typedef VOID (EFIAPI *EFI_UNICODE_STRUPR) ( IN struct _EFI_UNICODE_COLLATION_INTERFACE *This, IN OUT CHAR16 *Str ); typedef VOID (EFIAPI *EFI_UNICODE_FATTOSTR) ( IN struct _EFI_UNICODE_COLLATION_INTERFACE *This, IN UINTN FatSize, IN CHAR8 *Fat, OUT CHAR16 *String ); typedef BOOLEAN (EFIAPI *EFI_UNICODE_STRTOFAT) ( IN struct _EFI_UNICODE_COLLATION_INTERFACE *This, IN CHAR16 *String, IN UINTN FatSize, OUT CHAR8 *Fat ); typedef struct _EFI_UNICODE_COLLATION_INTERFACE { // general EFI_UNICODE_STRICOLL StriColl; EFI_UNICODE_METAIMATCH MetaiMatch; EFI_UNICODE_STRLWR StrLwr; EFI_UNICODE_STRUPR StrUpr; // for supporting fat volumes EFI_UNICODE_FATTOSTR FatToStr; EFI_UNICODE_STRTOFAT StrToFat; CHAR8 *SupportedLanguages; } EFI_UNICODE_COLLATION_INTERFACE; /* Graphics output protocol */ #define EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID \ { \ 0x9042a9de, 0x23dc, 0x4a38, {0x96, 0xfb, 0x7a, 0xde, 0xd0, 0x80, 0x51, 0x6a } \ } typedef struct _EFI_GRAPHICS_OUTPUT_PROTOCOL EFI_GRAPHICS_OUTPUT_PROTOCOL; typedef struct { UINT32 RedMask; UINT32 GreenMask; UINT32 BlueMask; UINT32 
ReservedMask; } EFI_PIXEL_BITMASK; typedef enum { PixelRedGreenBlueReserved8BitPerColor, PixelBlueGreenRedReserved8BitPerColor, PixelBitMask, PixelBltOnly, PixelFormatMax } EFI_GRAPHICS_PIXEL_FORMAT; typedef struct { UINT32 Version; UINT32 HorizontalResolution; UINT32 VerticalResolution; EFI_GRAPHICS_PIXEL_FORMAT PixelFormat; EFI_PIXEL_BITMASK PixelInformation; UINT32 PixelsPerScanLine; } EFI_GRAPHICS_OUTPUT_MODE_INFORMATION; /** Return the current video mode information. @param This Protocol instance pointer. @param ModeNumber The mode number to return information on. @param SizeOfInfo A pointer to the size, in bytes, of the Info buffer. @param Info A pointer to callee allocated buffer that returns information about ModeNumber. @retval EFI_SUCCESS Mode information returned. @retval EFI_BUFFER_TOO_SMALL The Info buffer was too small. @retval EFI_DEVICE_ERROR A hardware error occurred trying to retrieve the video mode. @retval EFI_NOT_STARTED Video display is not initialized. Call SetMode () @retval EFI_INVALID_PARAMETER One of the input args was NULL. **/ typedef EFI_STATUS (EFIAPI *EFI_GRAPHICS_OUTPUT_PROTOCOL_QUERY_MODE) ( IN EFI_GRAPHICS_OUTPUT_PROTOCOL *This, IN UINT32 ModeNumber, OUT UINTN *SizeOfInfo, OUT EFI_GRAPHICS_OUTPUT_MODE_INFORMATION **Info ) ; /** Return the current video mode information. @param This Protocol instance pointer. @param ModeNumber The mode number to be set. @retval EFI_SUCCESS Graphics mode was changed. @retval EFI_DEVICE_ERROR The device had an error and could not complete the request. @retval EFI_UNSUPPORTED ModeNumber is not supported by this device. **/ typedef EFI_STATUS (EFIAPI *EFI_GRAPHICS_OUTPUT_PROTOCOL_SET_MODE) ( IN EFI_GRAPHICS_OUTPUT_PROTOCOL *This, IN UINT32 ModeNumber ) ; typedef struct { UINT8 Blue; UINT8 Green; UINT8 Red; UINT8 Reserved; } EFI_GRAPHICS_OUTPUT_BLT_PIXEL; typedef union { EFI_GRAPHICS_OUTPUT_BLT_PIXEL Pixel; UINT32 Raw; } EFI_GRAPHICS_OUTPUT_BLT_PIXEL_UNION; typedef enum { EfiBltVideoFill, EfiBltVideoToBltBuffer, EfiBltBufferToVideo, EfiBltVideoToVideo, EfiGraphicsOutputBltOperationMax } EFI_GRAPHICS_OUTPUT_BLT_OPERATION; /** The following table defines actions for BltOperations: EfiBltVideoFill - Write data from the BltBuffer pixel (SourceX, SourceY) directly to every pixel of the video display rectangle (DestinationX, DestinationY) (DestinationX + Width, DestinationY + Height). Only one pixel will be used from the BltBuffer. Delta is NOT used. EfiBltVideoToBltBuffer - Read data from the video display rectangle (SourceX, SourceY) (SourceX + Width, SourceY + Height) and place it in the BltBuffer rectangle (DestinationX, DestinationY ) (DestinationX + Width, DestinationY + Height). If DestinationX or DestinationY is not zero then Delta must be set to the length in bytes of a row in the BltBuffer. EfiBltBufferToVideo - Write data from the BltBuffer rectangle (SourceX, SourceY) (SourceX + Width, SourceY + Height) directly to the video display rectangle (DestinationX, DestinationY) (DestinationX + Width, DestinationY + Height). If SourceX or SourceY is not zero then Delta must be set to the length in bytes of a row in the BltBuffer. EfiBltVideoToVideo - Copy from the video display rectangle (SourceX, SourceY) (SourceX + Width, SourceY + Height) .to the video display rectangle (DestinationX, DestinationY) (DestinationX + Width, DestinationY + Height). The BltBuffer and Delta are not used in this mode. @param This Protocol instance pointer. @param BltBuffer Buffer containing data to blit into video buffer. 
This buffer has a size of Width*Height*sizeof(EFI_GRAPHICS_OUTPUT_BLT_PIXEL) @param BltOperation Operation to perform on BlitBuffer and video memory @param SourceX X coordinate of source for the BltBuffer. @param SourceY Y coordinate of source for the BltBuffer. @param DestinationX X coordinate of destination for the BltBuffer. @param DestinationY Y coordinate of destination for the BltBuffer. @param Width Width of rectangle in BltBuffer in pixels. @param Height Hight of rectangle in BltBuffer in pixels. @param Delta OPTIONAL @retval EFI_SUCCESS The Blt operation completed. @retval EFI_INVALID_PARAMETER BltOperation is not valid. @retval EFI_DEVICE_ERROR A hardware error occured writting to the video buffer. **/ typedef EFI_STATUS (EFIAPI *EFI_GRAPHICS_OUTPUT_PROTOCOL_BLT) ( IN EFI_GRAPHICS_OUTPUT_PROTOCOL *This, IN EFI_GRAPHICS_OUTPUT_BLT_PIXEL *BltBuffer, OPTIONAL IN EFI_GRAPHICS_OUTPUT_BLT_OPERATION BltOperation, IN UINTN SourceX, IN UINTN SourceY, IN UINTN DestinationX, IN UINTN DestinationY, IN UINTN Width, IN UINTN Height, IN UINTN Delta OPTIONAL ); typedef struct { UINT32 MaxMode; UINT32 Mode; EFI_GRAPHICS_OUTPUT_MODE_INFORMATION *Info; UINTN SizeOfInfo; EFI_PHYSICAL_ADDRESS FrameBufferBase; UINTN FrameBufferSize; } EFI_GRAPHICS_OUTPUT_PROTOCOL_MODE; struct _EFI_GRAPHICS_OUTPUT_PROTOCOL { EFI_GRAPHICS_OUTPUT_PROTOCOL_QUERY_MODE QueryMode; EFI_GRAPHICS_OUTPUT_PROTOCOL_SET_MODE SetMode; EFI_GRAPHICS_OUTPUT_PROTOCOL_BLT Blt; EFI_GRAPHICS_OUTPUT_PROTOCOL_MODE *Mode; }; #endif xen-4.4.0/xen/include/efi/efiapi.h0000664000175000017500000005506412307313555015043 0ustar smbsmb#ifndef _EFI_API_H #define _EFI_API_H /*++ Copyright (c) 1998 Intel Corporation Module Name: efiapi.h Abstract: Global EFI runtime & boot service interfaces Revision History --*/ // // EFI Specification Revision // #define EFI_SPECIFICATION_MAJOR_REVISION 1 #define EFI_SPECIFICATION_MINOR_REVISION 02 // // Declare forward referenced data structures // INTERFACE_DECL(_EFI_SYSTEM_TABLE); // // EFI Memory // typedef EFI_STATUS (EFIAPI *EFI_ALLOCATE_PAGES) ( IN EFI_ALLOCATE_TYPE Type, IN EFI_MEMORY_TYPE MemoryType, IN UINTN NoPages, OUT EFI_PHYSICAL_ADDRESS *Memory ); typedef EFI_STATUS (EFIAPI *EFI_FREE_PAGES) ( IN EFI_PHYSICAL_ADDRESS Memory, IN UINTN NoPages ); typedef EFI_STATUS (EFIAPI *EFI_GET_MEMORY_MAP) ( IN OUT UINTN *MemoryMapSize, IN OUT EFI_MEMORY_DESCRIPTOR *MemoryMap, OUT UINTN *MapKey, OUT UINTN *DescriptorSize, OUT UINT32 *DescriptorVersion ); #define NextMemoryDescriptor(Ptr,Size) ((EFI_MEMORY_DESCRIPTOR *) (((UINT8 *) Ptr) + Size)) typedef EFI_STATUS (EFIAPI *EFI_ALLOCATE_POOL) ( IN EFI_MEMORY_TYPE PoolType, IN UINTN Size, OUT VOID **Buffer ); typedef EFI_STATUS (EFIAPI *EFI_FREE_POOL) ( IN VOID *Buffer ); typedef EFI_STATUS (EFIAPI *EFI_SET_VIRTUAL_ADDRESS_MAP) ( IN UINTN MemoryMapSize, IN UINTN DescriptorSize, IN UINT32 DescriptorVersion, IN EFI_MEMORY_DESCRIPTOR *VirtualMap ); #define EFI_OPTIONAL_PTR 0x00000001 #define EFI_INTERNAL_FNC 0x00000002 // Pointer to internal runtime fnc #define EFI_INTERNAL_PTR 0x00000004 // Pointer to internal runtime data typedef EFI_STATUS (EFIAPI *EFI_CONVERT_POINTER) ( IN UINTN DebugDisposition, IN OUT VOID **Address ); // // EFI Events // #define EVT_TIMER 0x80000000 #define EVT_RUNTIME 0x40000000 #define EVT_RUNTIME_CONTEXT 0x20000000 #define EVT_NOTIFY_WAIT 0x00000100 #define EVT_NOTIFY_SIGNAL 0x00000200 #define EVT_SIGNAL_EXIT_BOOT_SERVICES 0x00000201 #define EVT_SIGNAL_VIRTUAL_ADDRESS_CHANGE 0x60000202 #define EVT_EFI_SIGNAL_MASK 0x000000FF #define 
EVT_EFI_SIGNAL_MAX 2 typedef VOID (EFIAPI *EFI_EVENT_NOTIFY) ( IN EFI_EVENT Event, IN VOID *Context ); typedef EFI_STATUS (EFIAPI *EFI_CREATE_EVENT) ( IN UINT32 Type, IN EFI_TPL NotifyTpl, IN EFI_EVENT_NOTIFY NotifyFunction, IN VOID *NotifyContext, OUT EFI_EVENT *Event ); typedef enum { TimerCancel, TimerPeriodic, TimerRelative, TimerTypeMax } EFI_TIMER_DELAY; typedef EFI_STATUS (EFIAPI *EFI_SET_TIMER) ( IN EFI_EVENT Event, IN EFI_TIMER_DELAY Type, IN UINT64 TriggerTime ); typedef EFI_STATUS (EFIAPI *EFI_SIGNAL_EVENT) ( IN EFI_EVENT Event ); typedef EFI_STATUS (EFIAPI *EFI_WAIT_FOR_EVENT) ( IN UINTN NumberOfEvents, IN EFI_EVENT *Event, OUT UINTN *Index ); typedef EFI_STATUS (EFIAPI *EFI_CLOSE_EVENT) ( IN EFI_EVENT Event ); typedef EFI_STATUS (EFIAPI *EFI_CHECK_EVENT) ( IN EFI_EVENT Event ); // // Task priority level // #define TPL_APPLICATION 4 #define TPL_CALLBACK 8 #define TPL_NOTIFY 16 #define TPL_HIGH_LEVEL 31 typedef EFI_TPL (EFIAPI *EFI_RAISE_TPL) ( IN EFI_TPL NewTpl ); typedef VOID (EFIAPI *EFI_RESTORE_TPL) ( IN EFI_TPL OldTpl ); // // EFI platform varibles // #define EFI_GLOBAL_VARIABLE \ { 0x8BE4DF61, 0x93CA, 0x11d2, {0xAA, 0x0D, 0x00, 0xE0, 0x98, 0x03, 0x2B, 0x8C} } // Variable attributes #define EFI_VARIABLE_NON_VOLATILE 0x00000001 #define EFI_VARIABLE_BOOTSERVICE_ACCESS 0x00000002 #define EFI_VARIABLE_RUNTIME_ACCESS 0x00000004 #define EFI_VARIABLE_HARDWARE_ERROR_RECORD 0x00000008 #define EFI_VARIABLE_AUTHENTICATED_WRITE_ACCESS 0x00000010 #define EFI_VARIABLE_TIME_BASED_AUTHENTICATED_WRITE_ACCESS 0x00000020 #define EFI_VARIABLE_APPEND_WRITE 0x00000040 // Variable size limitation #define EFI_MAXIMUM_VARIABLE_SIZE 1024 typedef EFI_STATUS (EFIAPI *EFI_GET_VARIABLE) ( IN CHAR16 *VariableName, IN EFI_GUID *VendorGuid, OUT UINT32 *Attributes OPTIONAL, IN OUT UINTN *DataSize, OUT VOID *Data ); typedef EFI_STATUS (EFIAPI *EFI_GET_NEXT_VARIABLE_NAME) ( IN OUT UINTN *VariableNameSize, IN OUT CHAR16 *VariableName, IN OUT EFI_GUID *VendorGuid ); typedef EFI_STATUS (EFIAPI *EFI_SET_VARIABLE) ( IN CHAR16 *VariableName, IN EFI_GUID *VendorGuid, IN UINT32 Attributes, IN UINTN DataSize, IN VOID *Data ); typedef EFI_STATUS (EFIAPI *EFI_QUERY_VARIABLE_INFO) ( IN UINT32 Attributes, OUT UINT64 *MaximumVariableStorageSize, OUT UINT64 *RemainingVariableStorageSize, OUT UINT64 *MaximumVariableSize ); // // EFI Time // typedef struct { UINT32 Resolution; // 1e-6 parts per million UINT32 Accuracy; // hertz BOOLEAN SetsToZero; // Set clears sub-second time } EFI_TIME_CAPABILITIES; typedef EFI_STATUS (EFIAPI *EFI_GET_TIME) ( OUT EFI_TIME *Time, OUT EFI_TIME_CAPABILITIES *Capabilities OPTIONAL ); typedef EFI_STATUS (EFIAPI *EFI_SET_TIME) ( IN EFI_TIME *Time ); typedef EFI_STATUS (EFIAPI *EFI_GET_WAKEUP_TIME) ( OUT BOOLEAN *Enabled, OUT BOOLEAN *Pending, OUT EFI_TIME *Time ); typedef EFI_STATUS (EFIAPI *EFI_SET_WAKEUP_TIME) ( IN BOOLEAN Enable, IN EFI_TIME *Time OPTIONAL ); // // Image functions // // PE32+ Subsystem type for EFI images #if !defined(IMAGE_SUBSYSTEM_EFI_APPLICATION) #define IMAGE_SUBSYSTEM_EFI_APPLICATION 10 #define IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER 11 #define IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER 12 #endif // PE32+ Machine type for EFI images #if !defined(EFI_IMAGE_MACHINE_IA32) #define EFI_IMAGE_MACHINE_IA32 0x014c #endif #if !defined(EFI_IMAGE_MACHINE_IA64) #define EFI_IMAGE_MACHINE_IA64 0x0200 #endif // Image Entry prototype typedef EFI_STATUS (EFIAPI *EFI_IMAGE_ENTRY_POINT) ( IN EFI_HANDLE ImageHandle, IN struct _EFI_SYSTEM_TABLE *SystemTable ); typedef EFI_STATUS (EFIAPI 
*EFI_IMAGE_LOAD) ( IN BOOLEAN BootPolicy, IN EFI_HANDLE ParentImageHandle, IN EFI_DEVICE_PATH *FilePath, IN VOID *SourceBuffer OPTIONAL, IN UINTN SourceSize, OUT EFI_HANDLE *ImageHandle ); typedef EFI_STATUS (EFIAPI *EFI_IMAGE_START) ( IN EFI_HANDLE ImageHandle, OUT UINTN *ExitDataSize, OUT CHAR16 **ExitData OPTIONAL ); typedef EFI_STATUS (EFIAPI *EFI_EXIT) ( IN EFI_HANDLE ImageHandle, IN EFI_STATUS ExitStatus, IN UINTN ExitDataSize, IN CHAR16 *ExitData OPTIONAL ); typedef EFI_STATUS (EFIAPI *EFI_IMAGE_UNLOAD) ( IN EFI_HANDLE ImageHandle ); // Image handle #define LOADED_IMAGE_PROTOCOL \ { 0x5B1B31A1, 0x9562, 0x11d2, {0x8E, 0x3F, 0x00, 0xA0, 0xC9, 0x69, 0x72, 0x3B} } #define EFI_IMAGE_INFORMATION_REVISION 0x1000 typedef struct { UINT32 Revision; EFI_HANDLE ParentHandle; struct _EFI_SYSTEM_TABLE *SystemTable; // Source location of image EFI_HANDLE DeviceHandle; EFI_DEVICE_PATH *FilePath; VOID *Reserved; // Images load options UINT32 LoadOptionsSize; VOID *LoadOptions; // Location of where image was loaded VOID *ImageBase; UINT64 ImageSize; EFI_MEMORY_TYPE ImageCodeType; EFI_MEMORY_TYPE ImageDataType; // If the driver image supports a dynamic unload request EFI_IMAGE_UNLOAD Unload; } EFI_LOADED_IMAGE; typedef EFI_STATUS (EFIAPI *EFI_EXIT_BOOT_SERVICES) ( IN EFI_HANDLE ImageHandle, IN UINTN MapKey ); // // Misc // typedef EFI_STATUS (EFIAPI *EFI_STALL) ( IN UINTN Microseconds ); typedef EFI_STATUS (EFIAPI *EFI_SET_WATCHDOG_TIMER) ( IN UINTN Timeout, IN UINT64 WatchdogCode, IN UINTN DataSize, IN CHAR16 *WatchdogData OPTIONAL ); typedef EFI_STATUS (EFIAPI *EFI_CONNECT_CONTROLLER) ( IN EFI_HANDLE ControllerHandle, IN EFI_HANDLE *DriverImageHandle OPTIONAL, IN EFI_DEVICE_PATH *RemainingDevicePath OPTIONAL, IN BOOLEAN Recursive ); typedef EFI_STATUS (EFIAPI *EFI_DISCONNECT_CONTROLLER) ( IN EFI_HANDLE ControllerHandle, IN EFI_HANDLE DriverImageHandle OPTIONAL, IN EFI_HANDLE ChildHandle OPTIONAL ); #define EFI_OPEN_PROTOCOL_BY_HANDLE_PROTOCOL 0x00000001 #define EFI_OPEN_PROTOCOL_GET_PROTOCOL 0x00000002 #define EFI_OPEN_PROTOCOL_TEST_PROTOCOL 0x00000004 #define EFI_OPEN_PROTOCOL_BY_CHILD_CONTROLLER 0x00000008 #define EFI_OPEN_PROTOCOL_BY_DRIVER 0x00000010 #define EFI_OPEN_PROTOCOL_EXCLUSIVE 0x00000020 typedef EFI_STATUS (EFIAPI *EFI_OPEN_PROTOCOL) ( IN EFI_HANDLE Handle, IN EFI_GUID *Protocol, OUT VOID **Interface OPTIONAL, IN EFI_HANDLE AgentHandle, IN EFI_HANDLE ControllerHandle, IN UINT32 Attributes ); typedef EFI_STATUS (EFIAPI *EFI_CLOSE_PROTOCOL) ( IN EFI_HANDLE Handle, IN EFI_GUID *Protocol, IN EFI_HANDLE AgentHandle, IN EFI_HANDLE ControllerHandle ); typedef struct { EFI_HANDLE AgentHandle; EFI_HANDLE ControllerHandle; UINT32 Attributes; UINT32 OpenCount; } EFI_OPEN_PROTOCOL_INFORMATION_ENTRY; typedef EFI_STATUS (EFIAPI *EFI_OPEN_PROTOCOL_INFORMATION) ( IN EFI_HANDLE Handle, IN EFI_GUID *Protocol, OUT EFI_OPEN_PROTOCOL_INFORMATION_ENTRY **EntryBuffer, OUT UINTN *EntryCount ); typedef EFI_STATUS (EFIAPI *EFI_PROTOCOLS_PER_HANDLE) ( IN EFI_HANDLE Handle, OUT EFI_GUID ***ProtocolBuffer, OUT UINTN *ProtocolBufferCount ); typedef enum { AllHandles, ByRegisterNotify, ByProtocol } EFI_LOCATE_SEARCH_TYPE; typedef EFI_STATUS (EFIAPI *EFI_LOCATE_HANDLE_BUFFER) ( IN EFI_LOCATE_SEARCH_TYPE SearchType, IN EFI_GUID *Protocol OPTIONAL, IN VOID *SearchKey OPTIONAL, IN OUT UINTN *NoHandles, OUT EFI_HANDLE **Buffer ); typedef EFI_STATUS (EFIAPI *EFI_LOCATE_PROTOCOL) ( IN EFI_GUID *Protocol, IN VOID *Registration OPTIONAL, OUT VOID **Interface ); typedef EFI_STATUS (EFIAPI 
*EFI_INSTALL_MULTIPLE_PROTOCOL_INTERFACES) ( IN OUT EFI_HANDLE *Handle, ... ); typedef EFI_STATUS (EFIAPI *EFI_UNINSTALL_MULTIPLE_PROTOCOL_INTERFACES) ( IN OUT EFI_HANDLE Handle, ... ); typedef EFI_STATUS (EFIAPI *EFI_CALCULATE_CRC32) ( IN VOID *Data, IN UINTN DataSize, OUT UINT32 *Crc32 ); typedef VOID (EFIAPI *EFI_COPY_MEM) ( IN VOID *Destination, IN VOID *Source, IN UINTN Length ); typedef VOID (EFIAPI *EFI_SET_MEM) ( IN VOID *Buffer, IN UINTN Size, IN UINT8 Value ); typedef EFI_STATUS (EFIAPI *EFI_CREATE_EVENT_EX) ( IN UINT32 Type, IN EFI_TPL NotifyTpl, IN EFI_EVENT_NOTIFY NotifyFunction OPTIONAL, IN const VOID *NotifyContext OPTIONAL, IN const EFI_GUID EventGroup OPTIONAL, OUT EFI_EVENT *Event ); typedef enum { EfiResetCold, EfiResetWarm, EfiResetShutdown } EFI_RESET_TYPE; typedef EFI_STATUS (EFIAPI *EFI_RESET_SYSTEM) ( IN EFI_RESET_TYPE ResetType, IN EFI_STATUS ResetStatus, IN UINTN DataSize, IN CHAR16 *ResetData OPTIONAL ); typedef EFI_STATUS (EFIAPI *EFI_GET_NEXT_MONOTONIC_COUNT) ( OUT UINT64 *Count ); typedef EFI_STATUS (EFIAPI *EFI_GET_NEXT_HIGH_MONO_COUNT) ( OUT UINT32 *HighCount ); typedef EFI_STATUS (EFIAPI *EFI_UPDATE_CAPSULE) ( IN EFI_CAPSULE_HEADER **CapsuleHeaderArray, IN UINTN CapsuleCount, IN EFI_PHYSICAL_ADDRESS ScatterGatherList OPTIONAL ); typedef EFI_STATUS (EFIAPI *EFI_QUERY_CAPSULE_CAPABILITIES) ( IN EFI_CAPSULE_HEADER **CapsuleHeaderArray, IN UINTN CapsuleCount, OUT UINT64 *MaxiumCapsuleSize, OUT EFI_RESET_TYPE *ResetType ); // // Protocol handler functions // typedef enum { EFI_NATIVE_INTERFACE, EFI_PCODE_INTERFACE } EFI_INTERFACE_TYPE; typedef EFI_STATUS (EFIAPI *EFI_INSTALL_PROTOCOL_INTERFACE) ( IN OUT EFI_HANDLE *Handle, IN EFI_GUID *Protocol, IN EFI_INTERFACE_TYPE InterfaceType, IN VOID *Interface ); typedef EFI_STATUS (EFIAPI *EFI_REINSTALL_PROTOCOL_INTERFACE) ( IN EFI_HANDLE Handle, IN EFI_GUID *Protocol, IN VOID *OldInterface, IN VOID *NewInterface ); typedef EFI_STATUS (EFIAPI *EFI_UNINSTALL_PROTOCOL_INTERFACE) ( IN EFI_HANDLE Handle, IN EFI_GUID *Protocol, IN VOID *Interface ); typedef EFI_STATUS (EFIAPI *EFI_HANDLE_PROTOCOL) ( IN EFI_HANDLE Handle, IN EFI_GUID *Protocol, OUT VOID **Interface ); typedef EFI_STATUS (EFIAPI *EFI_REGISTER_PROTOCOL_NOTIFY) ( IN EFI_GUID *Protocol, IN EFI_EVENT Event, OUT VOID **Registration ); typedef EFI_STATUS (EFIAPI *EFI_LOCATE_HANDLE) ( IN EFI_LOCATE_SEARCH_TYPE SearchType, IN EFI_GUID *Protocol OPTIONAL, IN VOID *SearchKey OPTIONAL, IN OUT UINTN *BufferSize, OUT EFI_HANDLE *Buffer ); typedef EFI_STATUS (EFIAPI *EFI_LOCATE_DEVICE_PATH) ( IN EFI_GUID *Protocol, IN OUT EFI_DEVICE_PATH **DevicePath, OUT EFI_HANDLE *Device ); typedef EFI_STATUS (EFIAPI *EFI_INSTALL_CONFIGURATION_TABLE) ( IN EFI_GUID *Guid, IN VOID *Table ); typedef VOID *EFI_RESERVED_SERVICE; // // Standard EFI table header // typedef struct _EFI_TABLE_HEARDER { UINT64 Signature; UINT32 Revision; UINT32 HeaderSize; UINT32 CRC32; UINT32 Reserved; } EFI_TABLE_HEADER; // // EFI Runtime Serivces Table // #define EFI_RUNTIME_SERVICES_SIGNATURE 0x56524553544e5552 #define EFI_RUNTIME_SERVICES_REVISION (EFI_SPECIFICATION_MAJOR_REVISION<<16) | (EFI_SPECIFICATION_MINOR_REVISION) typedef struct { EFI_TABLE_HEADER Hdr; // // Time services // EFI_GET_TIME GetTime; EFI_SET_TIME SetTime; EFI_GET_WAKEUP_TIME GetWakeupTime; EFI_SET_WAKEUP_TIME SetWakeupTime; // // Virtual memory services // EFI_SET_VIRTUAL_ADDRESS_MAP SetVirtualAddressMap; EFI_CONVERT_POINTER ConvertPointer; // // Variable serviers // EFI_GET_VARIABLE GetVariable; EFI_GET_NEXT_VARIABLE_NAME 
GetNextVariableName; EFI_SET_VARIABLE SetVariable; // // Misc // EFI_GET_NEXT_HIGH_MONO_COUNT GetNextHighMonotonicCount; EFI_RESET_SYSTEM ResetSystem; // // New Boot Service added by UEFI 2.0 // EFI_UPDATE_CAPSULE UpdateCapsule; EFI_QUERY_CAPSULE_CAPABILITIES QueryCapsuleCapabilities; EFI_QUERY_VARIABLE_INFO QueryVariableInfo; } EFI_RUNTIME_SERVICES; // // EFI Boot Services Table // #define EFI_BOOT_SERVICES_SIGNATURE 0x56524553544f4f42 #define EFI_BOOT_SERVICES_REVISION (EFI_SPECIFICATION_MAJOR_REVISION<<16) | (EFI_SPECIFICATION_MINOR_REVISION) typedef struct _EFI_BOOT_SERVICES { EFI_TABLE_HEADER Hdr; // // Task priority functions // EFI_RAISE_TPL RaiseTPL; EFI_RESTORE_TPL RestoreTPL; // // Memory functions // EFI_ALLOCATE_PAGES AllocatePages; EFI_FREE_PAGES FreePages; EFI_GET_MEMORY_MAP GetMemoryMap; EFI_ALLOCATE_POOL AllocatePool; EFI_FREE_POOL FreePool; // // Event & timer functions // EFI_CREATE_EVENT CreateEvent; EFI_SET_TIMER SetTimer; EFI_WAIT_FOR_EVENT WaitForEvent; EFI_SIGNAL_EVENT SignalEvent; EFI_CLOSE_EVENT CloseEvent; EFI_CHECK_EVENT CheckEvent; // // Protocol handler functions // EFI_INSTALL_PROTOCOL_INTERFACE InstallProtocolInterface; EFI_REINSTALL_PROTOCOL_INTERFACE ReinstallProtocolInterface; EFI_UNINSTALL_PROTOCOL_INTERFACE UninstallProtocolInterface; EFI_HANDLE_PROTOCOL HandleProtocol; EFI_HANDLE_PROTOCOL PCHandleProtocol; EFI_REGISTER_PROTOCOL_NOTIFY RegisterProtocolNotify; EFI_LOCATE_HANDLE LocateHandle; EFI_LOCATE_DEVICE_PATH LocateDevicePath; EFI_INSTALL_CONFIGURATION_TABLE InstallConfigurationTable; // // Image functions // EFI_IMAGE_LOAD LoadImage; EFI_IMAGE_START StartImage; EFI_EXIT Exit; EFI_IMAGE_UNLOAD UnloadImage; EFI_EXIT_BOOT_SERVICES ExitBootServices; // // Misc functions // EFI_GET_NEXT_MONOTONIC_COUNT GetNextMonotonicCount; EFI_STALL Stall; EFI_SET_WATCHDOG_TIMER SetWatchdogTimer; // // DriverSupport Services // EFI_CONNECT_CONTROLLER ConnectController; EFI_DISCONNECT_CONTROLLER DisconnectController; // // Open and Close Protocol Services // EFI_OPEN_PROTOCOL OpenProtocol; EFI_CLOSE_PROTOCOL CloseProtocol; EFI_OPEN_PROTOCOL_INFORMATION OpenProtocolInformation; // // Library Services // EFI_PROTOCOLS_PER_HANDLE ProtocolsPerHandle; EFI_LOCATE_HANDLE_BUFFER LocateHandleBuffer; EFI_LOCATE_PROTOCOL LocateProtocol; EFI_INSTALL_MULTIPLE_PROTOCOL_INTERFACES InstallMultipleProtocolInterfaces; EFI_UNINSTALL_MULTIPLE_PROTOCOL_INTERFACES UninstallMultipleProtocolInterfaces; // // 32-bit CRC Services // EFI_CALCULATE_CRC32 CalculateCrc32; // // Misc Services // EFI_COPY_MEM CopyMem; EFI_SET_MEM SetMem; EFI_CREATE_EVENT_EX CreateEventEx; } EFI_BOOT_SERVICES; // // EFI Configuration Table and GUID definitions // #define MPS_TABLE_GUID \ { 0xeb9d2d2f, 0x2d88, 0x11d3, {0x9a, 0x16, 0x0, 0x90, 0x27, 0x3f, 0xc1, 0x4d} } #define ACPI_TABLE_GUID \ { 0xeb9d2d30, 0x2d88, 0x11d3, {0x9a, 0x16, 0x0, 0x90, 0x27, 0x3f, 0xc1, 0x4d} } #define ACPI_20_TABLE_GUID \ { 0x8868e871, 0xe4f1, 0x11d3, {0xbc, 0x22, 0x0, 0x80, 0xc7, 0x3c, 0x88, 0x81} } #define SMBIOS_TABLE_GUID \ { 0xeb9d2d31, 0x2d88, 0x11d3, {0x9a, 0x16, 0x0, 0x90, 0x27, 0x3f, 0xc1, 0x4d} } #define SAL_SYSTEM_TABLE_GUID \ { 0xeb9d2d32, 0x2d88, 0x11d3, {0x9a, 0x16, 0x0, 0x90, 0x27, 0x3f, 0xc1, 0x4d} } typedef struct _EFI_CONFIGURATION_TABLE { EFI_GUID VendorGuid; VOID *VendorTable; } EFI_CONFIGURATION_TABLE; // // EFI System Table // #define EFI_SYSTEM_TABLE_SIGNATURE 0x5453595320494249 #define EFI_SYSTEM_TABLE_REVISION (EFI_SPECIFICATION_MAJOR_REVISION<<16) | (EFI_SPECIFICATION_MINOR_REVISION) typedef struct _EFI_SYSTEM_TABLE 
{ EFI_TABLE_HEADER Hdr; CHAR16 *FirmwareVendor; UINT32 FirmwareRevision; EFI_HANDLE ConsoleInHandle; SIMPLE_INPUT_INTERFACE *ConIn; EFI_HANDLE ConsoleOutHandle; SIMPLE_TEXT_OUTPUT_INTERFACE *ConOut; EFI_HANDLE StandardErrorHandle; SIMPLE_TEXT_OUTPUT_INTERFACE *StdErr; EFI_RUNTIME_SERVICES *RuntimeServices; EFI_BOOT_SERVICES *BootServices; UINTN NumberOfTableEntries; EFI_CONFIGURATION_TABLE *ConfigurationTable; } EFI_SYSTEM_TABLE; #endif xen-4.4.0/xen/xsm/0000775000175000017500000000000012307313555012044 5ustar smbsmbxen-4.4.0/xen/xsm/Makefile0000664000175000017500000000017112307313555013503 0ustar smbsmbobj-y += xsm_core.o ifeq ($(XSM_ENABLE),y) obj-y += xsm_policy.o obj-y += dummy.o endif subdir-$(FLASK_ENABLE) += flask xen-4.4.0/xen/xsm/xsm_core.c0000664000175000017500000000467712307313555014045 0ustar smbsmb/* * This work is based on the LSM implementation in Linux 2.6.13.4. * * Author: George Coker, * * Contributors: Michael LeMay, * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. */ #include #include #include #include #include #ifdef XSM_ENABLE #define XSM_FRAMEWORK_VERSION "1.0.0" struct xsm_operations *xsm_ops; static inline int verify(struct xsm_operations *ops) { /* verify the security_operations structure exists */ if ( !ops ) return -EINVAL; xsm_fixup_ops(ops); return 0; } static void __init do_xsm_initcalls(void) { xsm_initcall_t *call; call = __xsm_initcall_start; while ( call < __xsm_initcall_end ) { (*call) (); call++; } } int __init xsm_init(unsigned long *module_map, const multiboot_info_t *mbi, void *(*bootstrap_map)(const module_t *)) { int ret = 0; printk("XSM Framework v" XSM_FRAMEWORK_VERSION " initialized\n"); if ( XSM_MAGIC ) { ret = xsm_policy_init(module_map, mbi, bootstrap_map); if ( ret ) { bootstrap_map(NULL); printk("%s: Error initializing policy.\n", __FUNCTION__); return -EINVAL; } } if ( verify(&dummy_xsm_ops) ) { bootstrap_map(NULL); printk("%s could not verify " "dummy_xsm_ops structure.\n", __FUNCTION__); return -EIO; } xsm_ops = &dummy_xsm_ops; do_xsm_initcalls(); bootstrap_map(NULL); return 0; } int register_xsm(struct xsm_operations *ops) { if ( verify(ops) ) { printk("%s could not verify " "security_operations structure.\n", __FUNCTION__); return -EINVAL; } if ( xsm_ops != &dummy_xsm_ops ) return -EAGAIN; xsm_ops = ops; return 0; } int unregister_xsm(struct xsm_operations *ops) { if ( ops != xsm_ops ) { printk("%s: trying to unregister " "a security_opts structure that is not " "registered, failing.\n", __FUNCTION__); return -EINVAL; } xsm_ops = &dummy_xsm_ops; return 0; } #endif long do_xsm_op (XEN_GUEST_HANDLE_PARAM(xsm_op_t) op) { return xsm_do_xsm_op(op); } xen-4.4.0/xen/xsm/flask/0000775000175000017500000000000012307313555013144 5ustar smbsmbxen-4.4.0/xen/xsm/flask/Makefile0000664000175000017500000000152112307313555014603 0ustar smbsmbobj-y += avc.o obj-y += hooks.o obj-y += flask_op.o subdir-y += ss CFLAGS += -I./include AWK = awk CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ else if [ -x /bin/bash ]; then echo /bin/bash; \ else echo sh; fi ; fi) FLASK_H_DEPEND = policy/security_classes policy/initial_sids AV_H_DEPEND = policy/access_vectors FLASK_H_FILES = include/flask.h include/class_to_string.h include/initial_sid_to_string.h AV_H_FILES = include/av_perm_to_string.h include/av_permissions.h ALL_H_FILES = $(FLASK_H_FILES) $(AV_H_FILES) $(obj-y) ss/built_in.o: $(ALL_H_FILES) 
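# The flask.h, class_to_string.h, initial_sid_to_string.h, av_perm_to_string.h and av_permissions.h headers are generated from the policy description files by the mkflask.sh and mkaccess_vector.sh rules below, and are removed again by the clean target.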
$(FLASK_H_FILES): $(FLASK_H_DEPEND) $(CONFIG_SHELL) policy/mkflask.sh $(AWK) $(FLASK_H_DEPEND) $(AV_H_FILES): $(AV_H_DEPEND) $(CONFIG_SHELL) policy/mkaccess_vector.sh $(AWK) $(AV_H_DEPEND) .PHONY: clean clean:: rm -f $(ALL_H_FILES) *.o $(DEPS) xen-4.4.0/xen/xsm/flask/ss/0000775000175000017500000000000012307313555013571 5ustar smbsmbxen-4.4.0/xen/xsm/flask/ss/conditional.h0000664000175000017500000000437412307313555016255 0ustar smbsmb/* Authors: Karl MacMillan * Frank Mayer * * Copyright (C) 2003 - 2004 Tresys Technology, LLC * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, version 2. */ #ifndef _CONDITIONAL_H_ #define _CONDITIONAL_H_ #include "avtab.h" #include "symtab.h" #include "policydb.h" #define COND_EXPR_MAXDEPTH 10 /* * A conditional expression is a list of operators and operands * in reverse polish notation. */ struct cond_expr { #define COND_BOOL 1 /* plain bool */ #define COND_NOT 2 /* !bool */ #define COND_OR 3 /* bool || bool */ #define COND_AND 4 /* bool && bool */ #define COND_XOR 5 /* bool ^ bool */ #define COND_EQ 6 /* bool == bool */ #define COND_NEQ 7 /* bool != bool */ #define COND_LAST COND_NEQ __u32 expr_type; __u32 bool; struct cond_expr *next; }; /* * Each cond_node contains a list of rules to be enabled/disabled * depending on the current value of the conditional expression. This * struct is for that list. */ struct cond_av_list { struct avtab_node *node; struct cond_av_list *next; }; /* * A cond node represents a conditional block in a policy. It * contains a conditional expression, the current state of the expression, * two lists of rules to enable/disable depending on the value of the * expression (the true list corresponds to if and the false list corresponds * to else).. */ struct cond_node { int cur_state; struct cond_expr *expr; struct cond_av_list *true_list; struct cond_av_list *false_list; struct cond_node *next; }; int cond_policydb_init(struct policydb* p); void cond_policydb_destroy(struct policydb* p); int cond_init_bool_indexes(struct policydb* p); int cond_destroy_bool(void *key, void *datum, void *p); int cond_index_bool(void *key, void *datum, void *datap); int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp); int cond_read_list(struct policydb *p, void *fp); void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd); int evaluate_cond_node(struct policydb *p, struct cond_node *node); #endif /* _CONDITIONAL_H_ */ xen-4.4.0/xen/xsm/flask/ss/Makefile0000664000175000017500000000030112307313555015223 0ustar smbsmbobj-y += ebitmap.o obj-y += hashtab.o obj-y += symtab.o obj-y += sidtab.o obj-y += avtab.o obj-y += policydb.o obj-y += services.o obj-y += conditional.o obj-y += mls.o CFLAGS += -I../include xen-4.4.0/xen/xsm/flask/ss/ebitmap.h0000664000175000017500000001002412307313555015360 0ustar smbsmb/* * An extensible bitmap is a bitmap that supports an * arbitrary number of bits. Extensible bitmaps are * used to represent sets of values, such as types, * roles, categories, and classes. * * Each extensible bitmap is implemented as a linked * list of bitmap nodes, where each bitmap node has * an explicitly specified starting bit position within * the total bitmap. 
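* Each node carries EBITMAP_UNIT_NUMS machine words of bits beginning at node->startbit (sized so a node fits a small fixed allocation), so only the populated regions of a sparse bitmap consume memory; ebitmap_for_each_positive_bit() below walks the node list to enumerate set bits in ascending order.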
* * Author : Stephen Smalley, */ #ifndef _SS_EBITMAP_H_ #define _SS_EBITMAP_H_ #include #define EBITMAP_UNIT_NUMS ((32 - sizeof(void *) - sizeof(u32)) \ / sizeof(unsigned long)) #define EBITMAP_UNIT_SIZE BITS_PER_LONG #define EBITMAP_SIZE (EBITMAP_UNIT_NUMS * EBITMAP_UNIT_SIZE) #define EBITMAP_BIT 1ULL #define EBITMAP_SHIFT_UNIT_SIZE(x) \ (((x) >> EBITMAP_UNIT_SIZE / 2) >> EBITMAP_UNIT_SIZE / 2) struct ebitmap_node { struct ebitmap_node *next; unsigned long maps[EBITMAP_UNIT_NUMS]; u32 startbit; }; struct ebitmap { struct ebitmap_node *node; /* first node in the bitmap */ u32 highbit; /* highest position in the total bitmap */ }; #define ebitmap_length(e) ((e)->highbit) #define ebitmap_startbit(e) ((e)->node ? (e)->node->startbit : 0) static inline unsigned int ebitmap_start_positive(struct ebitmap *e, struct ebitmap_node **n) { unsigned int ofs; for ( *n = e->node; *n; *n = (*n)->next ) { ofs = find_first_bit((*n)->maps, EBITMAP_SIZE); if ( ofs < EBITMAP_SIZE ) return (*n)->startbit + ofs; } return ebitmap_length(e); } static inline void ebitmap_init(struct ebitmap *e) { memset(e, 0, sizeof(*e)); } static inline unsigned int ebitmap_next_positive(struct ebitmap *e, struct ebitmap_node **n, unsigned int bit) { unsigned int ofs; ofs = find_next_bit((*n)->maps, EBITMAP_SIZE, bit - (*n)->startbit + 1); if ( ofs < EBITMAP_SIZE ) return ofs + (*n)->startbit; for ( *n = (*n)->next; *n; *n = (*n)->next ) { ofs = find_first_bit((*n)->maps, EBITMAP_SIZE); if ( ofs < EBITMAP_SIZE ) return ofs + (*n)->startbit; } return ebitmap_length(e); } #define EBITMAP_NODE_INDEX(node, bit) \ (((bit) - (node)->startbit) / EBITMAP_UNIT_SIZE) #define EBITMAP_NODE_OFFSET(node, bit) \ (((bit) - (node)->startbit) % EBITMAP_UNIT_SIZE) static inline int ebitmap_node_get_bit(struct ebitmap_node *n, unsigned int bit) { unsigned int index = EBITMAP_NODE_INDEX(n, bit); unsigned int ofs = EBITMAP_NODE_OFFSET(n, bit); BUG_ON( index >= EBITMAP_UNIT_NUMS ); if ( (n->maps[index] & (EBITMAP_BIT << ofs)) ) return 1; return 0; } static inline void ebitmap_node_set_bit(struct ebitmap_node *n, unsigned int bit) { unsigned int index = EBITMAP_NODE_INDEX(n, bit); unsigned int ofs = EBITMAP_NODE_OFFSET(n, bit); BUG_ON(index >= EBITMAP_UNIT_NUMS); n->maps[index] |= (EBITMAP_BIT << ofs); } static inline void ebitmap_node_clr_bit(struct ebitmap_node *n, unsigned int bit) { unsigned int index = EBITMAP_NODE_INDEX(n, bit); unsigned int ofs = EBITMAP_NODE_OFFSET(n, bit); BUG_ON( index >= EBITMAP_UNIT_NUMS ); n->maps[index] &= ~(EBITMAP_BIT << ofs); } #define ebitmap_for_each_positive_bit(e, n, bit) \ for ( bit = ebitmap_start_positive(e, &n); \ bit < ebitmap_length(e); \ bit = ebitmap_next_positive(e, &n, bit) ) \ int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2); int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src); int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2); int ebitmap_get_bit(struct ebitmap *e, unsigned long bit); int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value); void ebitmap_destroy(struct ebitmap *e); int ebitmap_read(struct ebitmap *e, void *fp); #endif /* _SS_EBITMAP_H_ */ xen-4.4.0/xen/xsm/flask/ss/sidtab.c0000664000175000017500000001540412307313555015207 0ustar smbsmb/* * Implementation of the SID table type. 
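* A sidtab maps security identifiers (SIDs) to struct context entries via a fixed-size hash table whose per-bucket chains are kept sorted by SID; sidtab_context_to_sid() allocates new SIDs under the table lock.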
* * Author : Stephen Smalley, */ /* Ported to Xen 3.0, George Coker, */ #include #include #include #include #include "flask.h" #include "security.h" #include "sidtab.h" #define SIDTAB_HASH(sid) (sid & SIDTAB_HASH_MASK) #define INIT_SIDTAB_LOCK(s) spin_lock_init(&s->lock) #define SIDTAB_LOCK(s) spin_lock(&s->lock) #define SIDTAB_UNLOCK(s) spin_unlock(&s->lock) int sidtab_init(struct sidtab *s) { int i; s->htable = xmalloc_array(struct sidtab_node *, SIDTAB_SIZE); if ( !s->htable ) return -ENOMEM; for ( i = 0; i < SIDTAB_SIZE; i++ ) s->htable[i] = NULL; s->nel = 0; s->next_sid = 1; s->shutdown = 0; INIT_SIDTAB_LOCK(s); return 0; } int sidtab_insert(struct sidtab *s, u32 sid, struct context *context) { int hvalue, rc = 0; struct sidtab_node *prev, *cur, *newnode; if ( !s ) { rc = -ENOMEM; goto out; } hvalue = SIDTAB_HASH(sid); prev = NULL; cur = s->htable[hvalue]; while ( cur != NULL && sid > cur->sid ) { prev = cur; cur = cur->next; } if ( cur && sid == cur->sid ) { rc = -EEXIST; goto out; } newnode = xmalloc(struct sidtab_node); if ( newnode == NULL ) { rc = -ENOMEM; goto out; } newnode->sid = sid; if ( context_cpy(&newnode->context, context) ) { xfree(newnode); rc = -ENOMEM; goto out; } if ( prev ) { newnode->next = prev->next; smp_wmb(); prev->next = newnode; } else { newnode->next = s->htable[hvalue]; smp_wmb(); s->htable[hvalue] = newnode; } s->nel++; if ( sid >= s->next_sid ) s->next_sid = sid + 1; out: return rc; } struct context *sidtab_search(struct sidtab *s, u32 sid) { int hvalue; struct sidtab_node *cur; if ( !s ) return NULL; hvalue = SIDTAB_HASH(sid); cur = s->htable[hvalue]; while ( cur != NULL && sid > cur->sid ) cur = cur->next; if ( cur == NULL || sid != cur->sid ) { /* Remap invalid SIDs to the unlabeled SID. */ sid = SECINITSID_UNLABELED; hvalue = SIDTAB_HASH(sid); cur = s->htable[hvalue]; while ( cur != NULL && sid > cur->sid ) cur = cur->next; if ( !cur || sid != cur->sid ) return NULL; } return &cur->context; } int sidtab_map(struct sidtab *s, int (*apply) (u32 sid, struct context *context, void *args), void *args) { int i, rc = 0; struct sidtab_node *cur; if ( !s ) goto out; for ( i = 0; i < SIDTAB_SIZE; i++ ) { cur = s->htable[i]; while ( cur != NULL ) { rc = apply(cur->sid, &cur->context, args); if ( rc ) goto out; cur = cur->next; } } out: return rc; } void sidtab_map_remove_on_error(struct sidtab *s, int (*apply) (u32 sid, struct context *context, void *args), void *args) { int i, ret; struct sidtab_node *last, *cur, *temp; if ( !s ) return; for ( i = 0; i < SIDTAB_SIZE; i++ ) { last = NULL; cur = s->htable[i]; while ( cur != NULL ) { ret = apply(cur->sid, &cur->context, args); if ( ret ) { if ( last ) { last->next = cur->next; } else { s->htable[i] = cur->next; } temp = cur; cur = cur->next; context_destroy(&temp->context); xfree(temp); s->nel--; } else { last = cur; cur = cur->next; } } } return; } static inline u32 sidtab_search_context(struct sidtab *s, struct context *context) { int i; struct sidtab_node *cur; for ( i = 0; i < SIDTAB_SIZE; i++ ) { cur = s->htable[i]; while ( cur != NULL ) { if ( context_cmp(&cur->context, context) ) return cur->sid; cur = cur->next; } } return 0; } int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *out_sid) { u32 sid; int ret = 0; *out_sid = SECSID_NULL; sid = sidtab_search_context(s, context); if ( !sid ) { SIDTAB_LOCK(s); /* Rescan now that we hold the lock. */ sid = sidtab_search_context(s, context); if ( sid ) goto unlock_out; /* No SID exists for the context. Allocate a new one. 
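* The SID counter only moves forward: next_sid is bumped on success and rolled back if the insert fails, and no allocation happens once the counter would overflow or sidtab_shutdown() has been called.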
*/ if ( s->next_sid == UINT_MAX || s->shutdown ) { ret = -ENOMEM; goto unlock_out; } sid = s->next_sid++; ret = sidtab_insert(s, sid, context); if ( ret ) s->next_sid--; unlock_out: SIDTAB_UNLOCK(s); } if ( ret ) return ret; *out_sid = sid; return 0; } void sidtab_hash_eval(struct sidtab *h, char *tag) { int i, chain_len, slots_used, max_chain_len; struct sidtab_node *cur; slots_used = 0; max_chain_len = 0; for ( i = 0; i < SIDTAB_SIZE; i++ ) { cur = h->htable[i]; if ( cur ) { slots_used++; chain_len = 0; while ( cur ) { chain_len++; cur = cur->next; } if ( chain_len > max_chain_len ) max_chain_len = chain_len; } } printk(KERN_INFO "%s: %d entries and %d/%d buckets used, longest " "chain length %d\n", tag, h->nel, slots_used, SIDTAB_SIZE, max_chain_len); } void sidtab_destroy(struct sidtab *s) { int i; struct sidtab_node *cur, *temp; if ( !s ) return; for ( i = 0; i < SIDTAB_SIZE; i++ ) { cur = s->htable[i]; while ( cur != NULL ) { temp = cur; cur = cur->next; context_destroy(&temp->context); xfree(temp); } s->htable[i] = NULL; } xfree(s->htable); s->htable = NULL; s->nel = 0; s->next_sid = 1; } void sidtab_set(struct sidtab *dst, struct sidtab *src) { SIDTAB_LOCK(src); dst->htable = src->htable; dst->nel = src->nel; dst->next_sid = src->next_sid; dst->shutdown = 0; SIDTAB_UNLOCK(src); } void sidtab_shutdown(struct sidtab *s) { SIDTAB_LOCK(s); s->shutdown = 1; SIDTAB_UNLOCK(s); } xen-4.4.0/xen/xsm/flask/ss/policydb.h0000664000175000017500000002045212307313555015552 0ustar smbsmb/* * A policy database (policydb) specifies the * configuration data for the security policy. * * Author : Stephen Smalley, */ /* * Updated: Trusted Computer Solutions, Inc. * * Support for enhanced MLS infrastructure. * * Updated: Frank Mayer and Karl MacMillan * * Added conditional policy language extensions * * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004 Tresys Technology, LLC * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, version 2. */ /* Ported to Xen 3.0, George Coker, */ #ifndef _SS_POLICYDB_H_ #define _SS_POLICYDB_H_ #include "symtab.h" #include "avtab.h" #include "sidtab.h" #include "context.h" #include "constraint.h" /* * A datum type is defined for each kind of symbol * in the configuration data: individual permissions, * common prefixes for access vectors, classes, * users, roles, types, sensitivities, categories, etc. 
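* Each datum records the symbol's internal value (assigned when the policy is parsed) together with any per-kind data, e.g. a class carries its own permission symbol table and constraints while a user carries its authorized roles and MLS range.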
*/ /* Permission attributes */ struct perm_datum { u32 value; /* permission bit + 1 */ }; /* Attributes of a common prefix for access vectors */ struct common_datum { u32 value; /* internal common value */ struct symtab permissions; /* common permissions */ }; /* Class attributes */ struct class_datum { u32 value; /* class value */ char *comkey; /* common name */ struct common_datum *comdatum; /* common datum */ struct symtab permissions; /* class-specific permission symbol table */ struct constraint_node *constraints; /* constraints on class permissions */ struct constraint_node *validatetrans; /* special transition rules */ }; /* Role attributes */ struct role_datum { u32 value; /* internal role value */ u32 bounds; /* boundary of role */ struct ebitmap dominates; /* set of roles dominated by this role */ struct ebitmap types; /* set of authorized types for role */ }; struct role_trans { u32 role; /* current role */ u32 type; /* program executable type */ u32 new_role; /* new role */ struct role_trans *next; }; struct role_allow { u32 role; /* current role */ u32 new_role; /* new role */ struct role_allow *next; }; /* Type attributes */ struct type_datum { u32 value; /* internal type value */ u32 bounds; /* boundary of type */ unsigned char primary; /* primary name? */ unsigned char attribute;/* attribute ?*/ }; /* * type_datum properties * available at the kernel policy version >= POLICYDB_VERSION_BOUNDARY */ #define TYPEDATUM_PROPERTY_PRIMARY 0x0001 #define TYPEDATUM_PROPERTY_ATTRIBUTE 0x0002 /* limitation of boundary depth */ #define POLICYDB_BOUNDS_MAXDEPTH 4 /* User attributes */ struct user_datum { u32 value; /* internal user value */ u32 bounds; /* bounds of user */ struct ebitmap roles; /* set of authorized roles for user */ struct mls_range range; /* MLS range (min - max) for user */ struct mls_level dfltlevel; /* default login MLS level for user */ }; /* Sensitivity attributes */ struct level_datum { struct mls_level *level; /* sensitivity and associated categories */ unsigned char isalias; /* is this sensitivity an alias for another? */ }; /* Category attributes */ struct cat_datum { u32 value; /* internal category bit + 1 */ unsigned char isalias; /* is this category an alias for another? */ }; struct range_trans { u32 source_type; u32 target_type; u32 target_class; struct mls_range target_range; struct range_trans *next; }; /* Boolean data type */ struct cond_bool_datum { __u32 value; /* internal type value */ int state; }; struct cond_node; /* * The configuration data includes security contexts for * initial SIDs, unlabeled file systems, TCP and UDP port numbers, * network interfaces, and nodes. This structure stores the * relevant data for one such entry. Entries of the same kind * (e.g. all initial SIDs) are linked together into a list. 
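* In Xen the object contexts cover initial SIDs, physical IRQs, I/O ports, I/O memory ranges and PCI devices; see the OCON_* indices below for the per-kind lists kept in struct policydb.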
*/ struct ocontext { union { char *name; /* name of initial SID, fs, netif, fstype, path */ u16 pirq; u32 device; struct { u32 low_ioport; u32 high_ioport; } ioport; struct { u32 low_iomem; u32 high_iomem; } iomem; } u; struct context context[2]; /* security context(s) */ u32 sid[2]; /* SID(s) */ struct ocontext *next; }; /* symbol table array indices */ #define SYM_COMMONS 0 #define SYM_CLASSES 1 #define SYM_ROLES 2 #define SYM_TYPES 3 #define SYM_USERS 4 #define SYM_BOOLS 5 #define SYM_LEVELS 6 #define SYM_CATS 7 #define SYM_NUM 8 /* object context array indices */ #define OCON_ISID 0 /* initial SIDs */ #define OCON_PIRQ 1 /* physical irqs */ #define OCON_IOPORT 2 /* io ports */ #define OCON_IOMEM 3 /* io memory */ #define OCON_DEVICE 4 /* pci devices */ #define OCON_NUM 5 #define OCON_NUM_OLD 7 /* The policy database */ struct policydb { /* symbol tables */ struct symtab symtab[SYM_NUM]; #define p_commons symtab[SYM_COMMONS] #define p_classes symtab[SYM_CLASSES] #define p_roles symtab[SYM_ROLES] #define p_types symtab[SYM_TYPES] #define p_users symtab[SYM_USERS] #define p_bools symtab[SYM_BOOLS] #define p_levels symtab[SYM_LEVELS] #define p_cats symtab[SYM_CATS] /* symbol names indexed by (value - 1) */ char **sym_val_to_name[SYM_NUM]; #define p_common_val_to_name sym_val_to_name[SYM_COMMONS] #define p_class_val_to_name sym_val_to_name[SYM_CLASSES] #define p_role_val_to_name sym_val_to_name[SYM_ROLES] #define p_type_val_to_name sym_val_to_name[SYM_TYPES] #define p_user_val_to_name sym_val_to_name[SYM_USERS] #define p_bool_val_to_name sym_val_to_name[SYM_BOOLS] #define p_sens_val_to_name sym_val_to_name[SYM_LEVELS] #define p_cat_val_to_name sym_val_to_name[SYM_CATS] /* class, role, and user attributes indexed by (value - 1) */ struct class_datum **class_val_to_struct; struct role_datum **role_val_to_struct; struct user_datum **user_val_to_struct; struct type_datum **type_val_to_struct; /* type enforcement access vectors and transitions */ struct avtab te_avtab; /* role transitions */ struct role_trans *role_tr; /* bools indexed by (value - 1) */ struct cond_bool_datum **bool_val_to_struct; /* type enforcement conditional access vectors and transitions */ struct avtab te_cond_avtab; /* linked list indexing te_cond_avtab by conditional */ struct cond_node* cond_list; /* role allows */ struct role_allow *role_allow; /* security contexts of initial SIDs, unlabeled file systems, TCP or UDP port numbers, network interfaces and nodes */ struct ocontext *ocontexts[OCON_NUM]; /* range transitions */ struct range_trans *range_tr; /* type -> attribute reverse mapping */ struct ebitmap *type_attr_map; struct ebitmap policycaps; struct ebitmap permissive_map; unsigned int policyvers; u16 target_type; }; extern void policydb_destroy(struct policydb *p); extern int policydb_load_isids(struct policydb *p, struct sidtab *s); extern int policydb_context_isvalid(struct policydb *p, struct context *c); extern int policydb_class_isvalid(struct policydb *p, unsigned int class); extern int policydb_type_isvalid(struct policydb *p, unsigned int type); extern int policydb_role_isvalid(struct policydb *p, unsigned int role); extern int policydb_read(struct policydb *p, void *fp); #define PERM_SYMTAB_SIZE 32 #define POLICYDB_CONFIG_MLS 1 #define OBJECT_R "object_r" #define OBJECT_R_VAL 1 #define POLICYDB_MAGIC FLASK_MAGIC #define POLICYDB_STRING "XenFlask" #define POLICYDB_STRING_OLD "SE Linux" #define TARGET_XEN 1 #define TARGET_XEN_OLD 0 struct policy_file { char *data; size_t len; }; static inline int 
next_entry(void *buf, struct policy_file *fp, size_t bytes) { if ( bytes > fp->len ) return -EINVAL; memcpy(buf, fp->data, bytes); fp->data += bytes; fp->len -= bytes; return 0; } #endif /* _SS_POLICYDB_H_ */ xen-4.4.0/xen/xsm/flask/ss/services.h0000664000175000017500000000040712307313555015566 0ustar smbsmb/* * Implementation of the security services. * * Author : Stephen Smalley, */ #ifndef _SS_SERVICES_H_ #define _SS_SERVICES_H_ #include "policydb.h" #include "sidtab.h" extern struct policydb policydb; #endif /* _SS_SERVICES_H_ */ xen-4.4.0/xen/xsm/flask/ss/mls_types.h0000664000175000017500000000265312307313555015767 0ustar smbsmb/* * Type definitions for the multi-level security (MLS) policy. * * Author : Stephen Smalley, */ /* * Updated: Trusted Computer Solutions, Inc. * * Support for enhanced MLS infrastructure. * * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. */ /* Ported to Xen 3.0, George Coker, */ #ifndef _SS_MLS_TYPES_H_ #define _SS_MLS_TYPES_H_ #include "security.h" struct mls_level { u32 sens; /* sensitivity */ struct ebitmap cat; /* category set */ }; struct mls_range { struct mls_level level[2]; /* low == level[0], high == level[1] */ }; static inline int mls_level_eq(struct mls_level *l1, struct mls_level *l2) { if ( !flask_mls_enabled ) return 1; return ((l1->sens == l2->sens) && ebitmap_cmp(&l1->cat, &l2->cat)); } static inline int mls_level_dom(struct mls_level *l1, struct mls_level *l2) { if ( !flask_mls_enabled ) return 1; return ((l1->sens >= l2->sens) && ebitmap_contains(&l1->cat, &l2->cat)); } #define mls_level_incomp(l1, l2) \ (!mls_level_dom((l1), (l2)) && !mls_level_dom((l2), (l1))) #define mls_level_between(l1, l2, l3) \ (mls_level_dom((l1), (l2)) && mls_level_dom((l3), (l1))) #define mls_range_contains(r1, r2) \ (mls_level_dom(&(r2).level[0], &(r1).level[0]) && \ mls_level_dom(&(r1).level[1], &(r2).level[1])) #endif /* _SS_MLS_TYPES_H_ */ xen-4.4.0/xen/xsm/flask/ss/mls.c0000664000175000017500000003474312307313555014543 0ustar smbsmb/* * Implementation of the multi-level security (MLS) policy. * * Author : Stephen Smalley, */ /* * Updated: Trusted Computer Solutions, Inc. * * Support for enhanced MLS infrastructure. * * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. */ /* Ported to Xen 3.0, George Coker, */ #include #include #include #include #include "sidtab.h" #include "mls.h" #include "policydb.h" #include "services.h" /* * Return the length in bytes for the MLS fields of the * security context string representation of `context'. 
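* The count covers the leading ':' separator, the sensitivity names of both levels and any category names (',' separated, with '.' standing for a contiguous run), mirroring what mls_sid_to_context() below writes, so callers can size the output buffer before formatting the context string.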
*/ int mls_compute_context_len(struct context * context) { int i, l, len, head, prev; char *nm; struct ebitmap *e; struct ebitmap_node *node; if ( !flask_mls_enabled ) return 0; len = 1; /* for the beginning ":" */ for ( l = 0; l < 2; l++ ) { int index_sens = context->range.level[l].sens; len += strlen(policydb.p_sens_val_to_name[index_sens - 1]); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if ( i - prev > 1 ) { /* one or more negative bits are skipped */ if ( head != prev ) { nm = policydb.p_cat_val_to_name[prev]; len += strlen(nm) + 1; } nm = policydb.p_cat_val_to_name[i]; len += strlen(nm) + 1; head = i; } prev = i; } if ( prev != head ) { nm = policydb.p_cat_val_to_name[prev]; len += strlen(nm) + 1; } if ( l == 0 ) { if ( mls_level_eq(&context->range.level[0], &context->range.level[1]) ) break; else len++; } } return len; } /* * Write the security context string representation of * the MLS fields of `context' into the string `*scontext'. * Update `*scontext' to point to the end of the MLS fields. */ void mls_sid_to_context(struct context *context, char **scontext) { char *scontextp, *nm; int i, l, head, prev; struct ebitmap *e; struct ebitmap_node *node; if ( !flask_mls_enabled ) return; scontextp = *scontext; *scontextp = ':'; scontextp++; for ( l = 0; l < 2; l++ ) { memcpy(scontextp, policydb.p_sens_val_to_name[context->range.level[l].sens - 1], strlen(policydb.p_sens_val_to_name[context->range.level[l].sens - 1])+1); scontextp += strlen(scontextp); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if ( i - prev > 1 ) { /* one or more negative bits are skipped */ if ( prev != head ) { if ( prev - head > 1 ) *scontextp++ = '.'; else *scontextp++ = ','; nm = policydb.p_cat_val_to_name[prev]; memcpy(scontextp, nm, strlen(nm)+1); scontextp += strlen(nm); } if ( prev < 0 ) *scontextp++ = ':'; else *scontextp++ = ','; nm = policydb.p_cat_val_to_name[i]; memcpy(scontextp, nm, strlen(nm)+1); scontextp += strlen(nm); head = i; } prev = i; } if ( prev != head ) { if ( prev - head > 1 ) *scontextp++ = '.'; else *scontextp++ = ','; nm = policydb.p_cat_val_to_name[prev]; memcpy(scontextp, nm, strlen(nm)+1); scontextp += strlen(nm); } if ( l == 0 ) { if ( mls_level_eq(&context->range.level[0], &context->range.level[1]) ) break; else { *scontextp = '-'; scontextp++; } } } *scontext = scontextp; return; } int mls_level_isvalid(struct policydb *p, struct mls_level *l) { struct level_datum *levdatum; struct ebitmap_node *node; int i; if ( !l->sens || l->sens > p->p_levels.nprim ) return 0; levdatum = hashtab_search(p->p_levels.table, p->p_sens_val_to_name[l->sens - 1]); if ( !levdatum ) return 0; ebitmap_for_each_positive_bit(&l->cat, node, i) { if ( i > p->p_cats.nprim ) return 0; if ( !ebitmap_get_bit(&levdatum->level->cat, i) ) { /* * Category may not be associated with * sensitivity. */ return 0; } } return 1; } int mls_range_isvalid(struct policydb *p, struct mls_range *r) { return ( mls_level_isvalid(p, &r->level[0]) && mls_level_isvalid(p, &r->level[1]) && mls_level_dom(&r->level[1], &r->level[0])); } /* * Return 1 if the MLS fields in the security context * structure `c' are valid. Return 0 otherwise. 
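* Validity here means both levels exist in the policy, every category in each level is allowed for that sensitivity, the high level dominates the low level and, unless the role is object_r, the user is authorized for the whole range (mls_range_contains()).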
*/ int mls_context_isvalid(struct policydb *p, struct context *c) { struct user_datum *usrdatum; if ( !flask_mls_enabled ) return 1; if ( !mls_range_isvalid(p, &c->range) ) return 0; if ( c->role == OBJECT_R_VAL ) return 1; /* * User must be authorized for the MLS range. */ if ( !c->user || c->user > p->p_users.nprim ) return 0; usrdatum = p->user_val_to_struct[c->user - 1]; if ( !mls_range_contains(usrdatum->range, c->range) ) return 0; /* user may not be associated with range */ return 1; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `*scontext'. Update `*scontext' to * point to the end of the string representation of * the MLS fields. * * This function modifies the string in place, inserting * NULL characters to terminate the MLS fields. * * Policy read-lock must be held for sidtab lookup. * */ int mls_context_to_sid(char oldc, char **scontext, struct context *context, struct sidtab *s) { char delim; char *scontextp, *p, *rngptr; struct level_datum *levdatum; struct cat_datum *catdatum, *rngdatum; int l, rc = -EINVAL; if ( !flask_mls_enabled ) return 0; /* * No MLS component to the security context -> error. */ if ( !oldc ) goto out; /* Extract low sensitivity. */ scontextp = p = *scontext; while ( *p && *p != ':' && *p != '-' ) p++; delim = *p; if ( delim != 0 ) *p++ = 0; for ( l = 0; l < 2; l++ ) { levdatum = hashtab_search(policydb.p_levels.table, scontextp); if ( !levdatum ) { rc = -EINVAL; goto out; } context->range.level[l].sens = levdatum->level->sens; if ( delim == ':' ) { /* Extract category set. */ while ( 1 ) { scontextp = p; while ( *p && *p != ',' && *p != '-' ) p++; delim = *p; if ( delim != 0 ) *p++ = 0; /* Separate into range if exists */ if ( (rngptr = strchr(scontextp, '.')) != NULL ) { /* Remove '.' */ *rngptr++ = 0; } catdatum = hashtab_search(policydb.p_cats.table, scontextp); if ( !catdatum ) { rc = -EINVAL; goto out; } rc = ebitmap_set_bit(&context->range.level[l].cat, catdatum->value - 1, 1); if ( rc ) goto out; /* If range, set all categories in range */ if ( rngptr ) { int i; rngdatum = hashtab_search(policydb.p_cats.table, rngptr); if ( !rngdatum ) { rc = -EINVAL; goto out; } if ( catdatum->value >= rngdatum->value ) { rc = -EINVAL; goto out; } for ( i = catdatum->value; i < rngdatum->value; i++ ) { rc = ebitmap_set_bit(&context->range.level[l].cat, i, 1); if ( rc ) goto out; } } if ( delim != ',' ) break; } } if ( delim == '-' ) { /* Extract high sensitivity. */ scontextp = p; while ( *p && *p != ':' ) p++; delim = *p; if ( delim != 0 ) *p++ = 0; } else break; } if ( l == 0 ) { context->range.level[1].sens = context->range.level[0].sens; rc = ebitmap_cpy(&context->range.level[1].cat, &context->range.level[0].cat); if ( rc ) goto out; } *scontext = ++p; rc = 0; out: return rc; } /* * Copies the MLS range `range' into `context'. 
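* Both the sensitivity and the category bitmap of each level are duplicated (ebitmap_cpy() may fail with -ENOMEM, which is propagated); it is used by mls_compute_sid() below when a range_trans rule supplies the target range.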
*/ static inline int mls_range_set(struct context *context, struct mls_range *range) { int l, rc = 0; /* Copy the MLS range into the context */ for ( l = 0; l < 2; l++ ) { context->range.level[l].sens = range->level[l].sens; rc = ebitmap_cpy(&context->range.level[l].cat, &range->level[l].cat); if ( rc ) break; } return rc; } int mls_setup_user_range(struct context *fromcon, struct user_datum *user, struct context *usercon) { if ( flask_mls_enabled ) { struct mls_level *fromcon_sen = &(fromcon->range.level[0]); struct mls_level *fromcon_clr = &(fromcon->range.level[1]); struct mls_level *user_low = &(user->range.level[0]); struct mls_level *user_clr = &(user->range.level[1]); struct mls_level *user_def = &(user->dfltlevel); struct mls_level *usercon_sen = &(usercon->range.level[0]); struct mls_level *usercon_clr = &(usercon->range.level[1]); /* Honor the user's default level if we can */ if ( mls_level_between(user_def, fromcon_sen, fromcon_clr) ) { *usercon_sen = *user_def; } else if ( mls_level_between(fromcon_sen, user_def, user_clr) ) { *usercon_sen = *fromcon_sen; } else if ( mls_level_between(fromcon_clr, user_low, user_def) ) { *usercon_sen = *user_low; } else return -EINVAL; /* Lower the clearance of available contexts if the clearance of "fromcon" is lower than that of the user's default clearance (but only if the "fromcon" clearance dominates the user's computed sensitivity level) */ if ( mls_level_dom(user_clr, fromcon_clr) ) { *usercon_clr = *fromcon_clr; } else if ( mls_level_dom(fromcon_clr, user_clr) ) { *usercon_clr = *user_clr; } else return -EINVAL; } return 0; } /* * Convert the MLS fields in the security context * structure `c' from the values specified in the * policy `oldp' to the values specified in the policy `newp'. */ int mls_convert_context(struct policydb *oldp, struct policydb *newp, struct context *c) { struct level_datum *levdatum; struct cat_datum *catdatum; struct ebitmap bitmap; struct ebitmap_node *node; int l, i; if ( !flask_mls_enabled ) return 0; for ( l = 0; l < 2; l++ ) { levdatum = hashtab_search(newp->p_levels.table, oldp->p_sens_val_to_name[c->range.level[l].sens - 1]); if ( !levdatum ) return -EINVAL; c->range.level[l].sens = levdatum->level->sens; ebitmap_init(&bitmap); ebitmap_for_each_positive_bit(&c->range.level[l].cat, node, i) { int rc; catdatum = hashtab_search(newp->p_cats.table, oldp->p_cat_val_to_name[i]); if ( !catdatum ) return -EINVAL; rc = ebitmap_set_bit(&bitmap, catdatum->value - 1, 1); if ( rc ) return rc; } ebitmap_destroy(&c->range.level[l].cat); c->range.level[l].cat = bitmap; } return 0; } int mls_compute_sid(struct context *scontext, struct context *tcontext, u16 tclass, u32 specified, struct context *newcontext) { struct range_trans *rtr; if ( !flask_mls_enabled ) return 0; switch ( specified ) { case AVTAB_TRANSITION: /* Look for a range transition rule. */ for (rtr = policydb.range_tr; rtr; rtr = rtr->next) { if (rtr->source_type == scontext->type && rtr->target_type == tcontext->type && rtr->target_class == tclass) { /* Set the range from the rule */ return mls_range_set(newcontext, &rtr->target_range); } } /* Fallthrough */ case AVTAB_CHANGE: if ( tclass == SECCLASS_DOMAIN ) /* Use the process MLS attributes. */ return mls_context_cpy(newcontext, scontext); else /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); case AVTAB_MEMBER: /* Use the process effective MLS attributes. 
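* The "effective" attributes are the low level: mls_context_cpy_low() sets both levels of the new context from the source context's low level.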
*/ return mls_context_cpy_low(newcontext, scontext); default: return -EINVAL; } return -EINVAL; } xen-4.4.0/xen/xsm/flask/ss/mls.h0000664000175000017500000000242412307313555014537 0ustar smbsmb/* * Multi-level security (MLS) policy operations. * * Author : Stephen Smalley, */ /* * Updated: Trusted Computer Solutions, Inc. * * Support for enhanced MLS infrastructure. * * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. */ #ifndef _SS_MLS_H_ #define _SS_MLS_H_ #include "context.h" #include "policydb.h" int mls_compute_context_len(struct context *context); void mls_sid_to_context(struct context *context, char **scontext); int mls_context_isvalid(struct policydb *p, struct context *c); int mls_range_isvalid(struct policydb *p, struct mls_range *r); int mls_level_isvalid(struct policydb *p, struct mls_level *l); int mls_context_to_sid(char oldc, char **scontext, struct context *context, struct sidtab *s); int mls_convert_context(struct policydb *oldp, struct policydb *newp, struct context *context); int mls_compute_sid(struct context *scontext, struct context *tcontext, u16 tclass, u32 specified, struct context *newcontext); int mls_setup_user_range(struct context *fromcon, struct user_datum *user, struct context *usercon); #endif /* _SS_MLS_H */ xen-4.4.0/xen/xsm/flask/ss/constraint.h0000664000175000017500000000434412307313555016133 0ustar smbsmb/* * A constraint is a condition that must be satisfied in * order for one or more permissions to be granted. * Constraints are used to impose additional restrictions * beyond the type-based rules in `te' or the role-based * transition rules in `rbac'. Constraints are typically * used to prevent a process from transitioning to a new user * identity or role unless it is in a privileged type. * Constraints are likewise typically used to prevent a * process from labeling an object with a different user * identity. * * Author : Stephen Smalley, */ #ifndef _SS_CONSTRAINT_H_ #define _SS_CONSTRAINT_H_ #include "ebitmap.h" #define CEXPR_MAXDEPTH 5 struct constraint_expr { #define CEXPR_NOT 1 /* not expr */ #define CEXPR_AND 2 /* expr and expr */ #define CEXPR_OR 3 /* expr or expr */ #define CEXPR_ATTR 4 /* attr op attr */ #define CEXPR_NAMES 5 /* attr op names */ u32 expr_type; /* expression type */ #define CEXPR_USER 1 /* user */ #define CEXPR_ROLE 2 /* role */ #define CEXPR_TYPE 4 /* type */ #define CEXPR_TARGET 8 /* target if set, source otherwise */ #define CEXPR_XTARGET 16 /* special 3rd target for validatetrans rule */ #define CEXPR_L1L2 32 /* low level 1 vs. low level 2 */ #define CEXPR_L1H2 64 /* low level 1 vs. high level 2 */ #define CEXPR_H1L2 128 /* high level 1 vs. low level 2 */ #define CEXPR_H1H2 256 /* high level 1 vs. high level 2 */ #define CEXPR_L1H1 512 /* low level 1 vs. high level 1 */ #define CEXPR_L2H2 1024 /* low level 2 vs. 
high level 2 */ u32 attr; /* attribute */ #define CEXPR_EQ 1 /* == or eq */ #define CEXPR_NEQ 2 /* != */ #define CEXPR_DOM 3 /* dom */ #define CEXPR_DOMBY 4 /* domby */ #define CEXPR_INCOMP 5 /* incomp */ u32 op; /* operator */ struct ebitmap names; /* names */ struct constraint_expr *next; /* next expression */ }; struct constraint_node { u32 permissions; /* constrained permissions */ struct constraint_expr *expr; /* constraint on permissions */ struct constraint_node *next; /* next constraint */ }; #endif /* _SS_CONSTRAINT_H_ */ xen-4.4.0/xen/xsm/flask/ss/context.h0000664000175000017500000000660012307313555015430 0ustar smbsmb/* * A security context is a set of security attributes * associated with each subject and object controlled * by the security policy. Security contexts are * externally represented as variable-length strings * that can be interpreted by a user or application * with an understanding of the security policy. * Internally, the security server uses a simple * structure. This structure is private to the * security server and can be changed without affecting * clients of the security server. * * Author : Stephen Smalley, */ /* Ported to Xen 3.0, George Coker, */ #ifndef _SS_CONTEXT_H_ #define _SS_CONTEXT_H_ #include "ebitmap.h" #include "mls_types.h" #include "security.h" /* * A security context consists of an authenticated user * identity, a role, a type and a MLS range. */ struct context { u32 user; u32 role; u32 type; struct mls_range range; }; static inline void mls_context_init(struct context *c) { memset(&c->range, 0, sizeof(c->range)); } static inline int mls_context_cpy(struct context *dst, struct context *src) { int rc; if ( !flask_mls_enabled ) return 0; dst->range.level[0].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat); if ( rc ) goto out; dst->range.level[1].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat); if ( rc ) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } /* * Sets both levels in the MLS range of 'dst' to the low level of 'src'. 
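* This is what gives a newly computed label the subject's effective sensitivity rather than its full clearance range; if the second ebitmap_cpy() fails, the already-copied low category set is destroyed before returning the error.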
*/ static inline int mls_context_cpy_low(struct context *dst, struct context *src) { int rc; if ( !flask_mls_enabled ) return 0; dst->range.level[0].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat); if ( rc ) goto out; dst->range.level[1].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[0].cat); if ( rc ) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } static inline int mls_context_cmp(struct context *c1, struct context *c2) { if ( !flask_mls_enabled ) return 1; return ((c1->range.level[0].sens == c2->range.level[0].sens) && ebitmap_cmp(&c1->range.level[0].cat,&c2->range.level[0].cat) && (c1->range.level[1].sens == c2->range.level[1].sens) && ebitmap_cmp(&c1->range.level[1].cat,&c2->range.level[1].cat)); } static inline void mls_context_destroy(struct context *c) { if ( !flask_mls_enabled ) return; ebitmap_destroy(&c->range.level[0].cat); ebitmap_destroy(&c->range.level[1].cat); mls_context_init(c); } static inline void context_init(struct context *c) { memset(c, 0, sizeof(*c)); } static inline int context_cpy(struct context *dst, struct context *src) { dst->user = src->user; dst->role = src->role; dst->type = src->type; return mls_context_cpy(dst, src); } static inline void context_destroy(struct context *c) { c->user = c->role = c->type = 0; mls_context_destroy(c); } static inline int context_cmp(struct context *c1, struct context *c2) { return ((c1->user == c2->user) && (c1->role == c2->role) && (c1->type == c2->type) && mls_context_cmp(c1, c2)); } #endif /* _SS_CONTEXT_H_ */ xen-4.4.0/xen/xsm/flask/ss/avtab.c0000664000175000017500000003700112307313555015033 0ustar smbsmb/* * Implementation of the access vector table type. * * Author : Stephen Smalley, */ /* Updated: Frank Mayer and Karl MacMillan * * Added conditional policy language extensions * * Copyright (C) 2003 Tresys Technology, LLC * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, version 2. 
* * Updated: Yuichi Nakamura * Tuned number of hash slots for avtab to reduce memory usage */ /* Ported to Xen 3.0, George Coker, */ #include #include #include #include #include #include "avtab.h" #include "policydb.h" static inline int avtab_hash(struct avtab_key *keyp, u16 mask) { return ((keyp->target_class + (keyp->target_type << 2) + (keyp->source_type << 9)) & mask); } static struct avtab_node* avtab_insert_node(struct avtab *h, int hvalue, struct avtab_node * prev, struct avtab_node * cur, struct avtab_key *key, struct avtab_datum *datum) { struct avtab_node * newnode; newnode = xmalloc(struct avtab_node); if ( newnode == NULL ) return NULL; memset(newnode, 0, sizeof(struct avtab_node)); newnode->key = *key; newnode->datum = *datum; if ( prev ) { newnode->next = prev->next; prev->next = newnode; } else { newnode->next = h->htable[hvalue]; h->htable[hvalue] = newnode; } h->nel++; return newnode; } static int avtab_insert(struct avtab *h, struct avtab_key *key, struct avtab_datum *datum) { int hvalue; struct avtab_node *prev, *cur, *newnode; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); if ( !h || !h->htable ) return -EINVAL; hvalue = avtab_hash(key, h->mask); for ( prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next) { if ( key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class == cur->key.target_class && (specified & cur->key.specified) ) return -EEXIST; if ( key->source_type < cur->key.source_type ) break; if ( key->source_type == cur->key.source_type && key->target_type < cur->key.target_type ) break; if ( key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class < cur->key.target_class ) break; } newnode = avtab_insert_node(h, hvalue, prev, cur, key, datum); if( !newnode ) return -ENOMEM; return 0; } /* Unlike avtab_insert(), this function allow multiple insertions of the same * key/specified mask into the table, as needed by the conditional avtab. * It also returns a pointer to the node inserted. 
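* The conditional table (te_cond_avtab) keeps one node per conditional rule even when several rules share a key, and the returned node pointer is what the cond_av_list entries record.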
*/ struct avtab_node * avtab_insert_nonunique(struct avtab * h, struct avtab_key * key, struct avtab_datum * datum) { int hvalue; struct avtab_node *prev, *cur, *newnode; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); if ( !h || !h->htable ) return NULL; hvalue = avtab_hash(key, h->mask); for ( prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next ) { if ( key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class == cur->key.target_class && (specified & cur->key.specified) ) break; if ( key->source_type < cur->key.source_type ) break; if ( key->source_type == cur->key.source_type && key->target_type < cur->key.target_type ) break; if ( key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class < cur->key.target_class ) break; } newnode = avtab_insert_node(h, hvalue, prev, cur, key, datum); return newnode; } struct avtab_datum *avtab_search(struct avtab *h, struct avtab_key *key) { int hvalue; struct avtab_node *cur; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); if ( !h || !h->htable ) return NULL; hvalue = avtab_hash(key, h->mask); for ( cur = h->htable[hvalue]; cur; cur = cur->next ) { if ( key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class == cur->key.target_class && (specified & cur->key.specified) ) return &cur->datum; if ( key->source_type < cur->key.source_type ) break; if ( key->source_type == cur->key.source_type && key->target_type < cur->key.target_type ) break; if ( key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class < cur->key.target_class ) break; } return NULL; } /* This search function returns a node pointer, and can be used in * conjunction with avtab_search_next_node() */ struct avtab_node* avtab_search_node(struct avtab *h, struct avtab_key *key) { int hvalue; struct avtab_node *cur; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); if ( !h || !h->htable ) return NULL; hvalue = avtab_hash(key, h->mask); for ( cur = h->htable[hvalue]; cur; cur = cur->next ) { if ( key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class == cur->key.target_class && (specified & cur->key.specified) ) return cur; if ( key->source_type < cur->key.source_type ) break; if ( key->source_type == cur->key.source_type && key->target_type < cur->key.target_type ) break; if ( key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class < cur->key.target_class ) break; } return NULL; } struct avtab_node* avtab_search_node_next(struct avtab_node *node, int specified) { struct avtab_node *cur; if ( !node ) return NULL; specified &= ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); for ( cur = node->next; cur; cur = cur->next ) { if ( node->key.source_type == cur->key.source_type && node->key.target_type == cur->key.target_type && node->key.target_class == cur->key.target_class && (specified & cur->key.specified) ) return cur; if ( node->key.source_type < cur->key.source_type ) break; if ( node->key.source_type == cur->key.source_type && node->key.target_type < cur->key.target_type ) break; if ( node->key.source_type == cur->key.source_type && node->key.target_type == cur->key.target_type && node->key.target_class < cur->key.target_class ) break; } return NULL; } void avtab_destroy(struct avtab *h) { int i; struct avtab_node *cur, *temp; if ( !h 
|| !h->htable ) return; for ( i = 0; i < h->nslot; i++ ) { cur = h->htable[i]; while ( cur != NULL ) { temp = cur; cur = cur->next; xfree(temp); } h->htable[i] = NULL; } xfree(h->htable); h->htable = NULL; h->nslot = 0; h->mask = 0; } int avtab_init(struct avtab *h) { h->htable = NULL; h->nel = 0; return 0; } int avtab_alloc(struct avtab *h, u32 nrules) { u16 mask = 0; u32 shift = 0; u32 work = nrules; u32 nslot = 0; int i; if ( nrules == 0 ) goto avtab_alloc_out; while ( work ) { work = work >> 1; shift++; } if ( shift > 2 ) shift = shift - 2; nslot = 1 << shift; if ( nslot > MAX_AVTAB_SIZE ) nslot = MAX_AVTAB_SIZE; mask = nslot - 1; h->htable = xmalloc_array(struct avtab_node *, nslot); if ( !h->htable ) return -ENOMEM; for ( i = 0; i < nslot; i++ ) h->htable[i] = NULL; avtab_alloc_out: h->nel = 0; h->nslot = nslot; h->mask = mask; printk(KERN_DEBUG "Flask: %d avtab hash slots, %d rules.\n", h->nslot, nrules); return 0; } void avtab_hash_eval(struct avtab *h, char *tag) { int i, chain_len, slots_used, max_chain_len; struct avtab_node *cur; slots_used = 0; max_chain_len = 0; for ( i = 0; i < h->nslot; i++ ) { cur = h->htable[i]; if ( cur ) { slots_used++; chain_len = 0; while ( cur ) { chain_len++; cur = cur->next; } if ( chain_len > max_chain_len ) max_chain_len = chain_len; } } printk(KERN_INFO "%s: %d entries and %d/%d buckets used, longest " "chain length %d\n", tag, h->nel, slots_used, h->nslot, max_chain_len); } static uint16_t spec_order[] = { AVTAB_ALLOWED, AVTAB_AUDITDENY, AVTAB_AUDITALLOW, AVTAB_TRANSITION, AVTAB_CHANGE, AVTAB_MEMBER }; int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, int (*insertf)(struct avtab *a, struct avtab_key *k, struct avtab_datum *d, void *p), void *p) { __le16 buf16[4]; u16 enabled; __le32 buf32[7]; u32 items, items2, val, vers = pol->policyvers; struct avtab_key key; struct avtab_datum datum; int i, rc; unsigned set; memset(&key, 0, sizeof(struct avtab_key)); memset(&datum, 0, sizeof(struct avtab_datum)); if ( vers < POLICYDB_VERSION_AVTAB ) { rc = next_entry(buf32, fp, sizeof(u32)); if ( rc < 0 ) { printk(KERN_ERR "Flask: avtab: truncated entry\n"); return -1; } items2 = le32_to_cpu(buf32[0]); if ( items2 > ARRAY_SIZE(buf32) ) { printk(KERN_ERR "Flask: avtab: entry overflow\n"); return -1; } rc = next_entry(buf32, fp, sizeof(u32)*items2); if ( rc < 0 ) { printk(KERN_ERR "Flask: avtab: truncated entry\n"); return -1; } items = 0; val = le32_to_cpu(buf32[items++]); key.source_type = (u16)val; if ( key.source_type != val ) { printk("Flask: avtab: truncated source type\n"); return -1; } val = le32_to_cpu(buf32[items++]); key.target_type = (u16)val; if ( key.target_type != val ) { printk("Flask: avtab: truncated target type\n"); return -1; } val = le32_to_cpu(buf32[items++]); key.target_class = (u16)val; if ( key.target_class != val ) { printk("Flask: avtab: truncated target class\n"); return -1; } val = le32_to_cpu(buf32[items++]); enabled = (val & AVTAB_ENABLED_OLD) ? 
AVTAB_ENABLED : 0; if ( !(val & (AVTAB_AV | AVTAB_TYPE)) ) { printk("Flask: avtab: null entry\n"); return -1; } if ( (val & AVTAB_AV) && (val & AVTAB_TYPE) ) { printk("Flask: avtab: entry has both access vectors and types\n"); return -1; } for ( i = 0; i < sizeof(spec_order)/sizeof(u16); i++ ) { if ( val & spec_order[i] ) { key.specified = spec_order[i] | enabled; datum.data = le32_to_cpu(buf32[items++]); rc = insertf(a, &key, &datum, p); if ( rc ) return rc; } } if ( items != items2 ) { printk("Flask: avtab: entry only had %d items, expected %d\n", items2, items); return -1; } return 0; } rc = next_entry(buf16, fp, sizeof(u16)*4); if ( rc < 0 ) { printk("Flask: avtab: truncated entry\n"); return -1; } items = 0; key.source_type = le16_to_cpu(buf16[items++]); key.target_type = le16_to_cpu(buf16[items++]); key.target_class = le16_to_cpu(buf16[items++]); key.specified = le16_to_cpu(buf16[items++]); if ( !policydb_type_isvalid(pol, key.source_type) || !policydb_type_isvalid(pol, key.target_type) || !policydb_class_isvalid(pol, key.target_class) ) { printk(KERN_ERR "Flask: avtab: invalid type or class\n"); return -1; } set = 0; for ( i = 0; i < ARRAY_SIZE(spec_order); i++ ) { if ( key.specified & spec_order[i] ) set++; } if ( !set || set > 1 ) { printk(KERN_ERR "Flask: avtab: more than one specifier\n"); return -1; } rc = next_entry(buf32, fp, sizeof(u32)); if ( rc < 0 ) { printk("Flask: avtab: truncated entry\n"); return -1; } datum.data = le32_to_cpu(*buf32); if ( (key.specified & AVTAB_TYPE) && !policydb_type_isvalid(pol, datum.data) ) { printk(KERN_ERR "Flask: avtab: invalid type\n"); return -1; } return insertf(a, &key, &datum, p); } static int avtab_insertf(struct avtab *a, struct avtab_key *k, struct avtab_datum *d, void *p) { return avtab_insert(a, k, d); } int avtab_read(struct avtab *a, void *fp, struct policydb *pol) { int rc; __le32 buf[1]; u32 nel, i; rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) { printk(KERN_ERR "Flask: avtab: truncated table\n"); goto bad; } nel = le32_to_cpu(buf[0]); if ( !nel ) { printk(KERN_ERR "Flask: avtab: table is empty\n"); rc = -EINVAL; goto bad; } rc = avtab_alloc(a, nel); if ( rc ) goto bad; for ( i = 0; i < nel; i++ ) { rc = avtab_read_item(a, fp, pol, avtab_insertf, NULL); if ( rc ) { if ( rc == -ENOMEM ) printk(KERN_ERR "Flask: avtab: out of memory\n"); else if ( rc == -EEXIST ) printk(KERN_ERR "Flask: avtab: duplicate entry\n"); else rc = -EINVAL; goto bad; } } rc = 0; out: return rc; bad: avtab_destroy(a); goto out; } xen-4.4.0/xen/xsm/flask/ss/policydb.c0000664000175000017500000014450612307313555015554 0ustar smbsmb/* * Implementation of the policy database. * * Author : Stephen Smalley, */ /* * Updated: Trusted Computer Solutions, Inc. * * Support for enhanced MLS infrastructure. * * Updated: Frank Mayer and Karl MacMillan * * Added conditional policy language extensions * * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004 Tresys Technology, LLC * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, version 2. 
*/ /* Ported to Xen 3.0, George Coker, */ #include #include #include #include #include #include #include "security.h" #include "policydb.h" #include "conditional.h" #include "mls.h" #define _DEBUG_HASHES #ifdef DEBUG_HASHES static char *symtab_name[SYM_NUM] = { "common prefixes", "classes", "roles", "types", "users", "bools", "levels", "categories", }; #endif int flask_mls_enabled = 0; static unsigned int symtab_sizes[SYM_NUM] = { 2, 32, 16, 512, 128, 16, 16, 16, }; struct policydb_compat_info { int version; int sym_num; int ocon_num; int target_type; }; /* These need to be updated if SYM_NUM or OCON_NUM changes */ static struct policydb_compat_info policydb_compat[] = { { .version = POLICYDB_VERSION_BASE, .sym_num = SYM_NUM - 3, .ocon_num = OCON_NUM - 1, .target_type = TARGET_XEN_OLD, }, { .version = POLICYDB_VERSION_BOOL, .sym_num = SYM_NUM - 2, .ocon_num = OCON_NUM - 1, .target_type = TARGET_XEN_OLD, }, { .version = POLICYDB_VERSION_IPV6, .sym_num = SYM_NUM - 2, .ocon_num = OCON_NUM, .target_type = TARGET_XEN_OLD, }, { .version = POLICYDB_VERSION_NLCLASS, .sym_num = SYM_NUM - 2, .ocon_num = OCON_NUM, .target_type = TARGET_XEN_OLD, }, { .version = POLICYDB_VERSION_MLS, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, .target_type = TARGET_XEN_OLD, }, { .version = POLICYDB_VERSION_AVTAB, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, .target_type = TARGET_XEN_OLD, }, { .version = POLICYDB_VERSION_RANGETRANS, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, .target_type = TARGET_XEN_OLD, }, { .version = POLICYDB_VERSION_POLCAP, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, .target_type = TARGET_XEN_OLD, }, { .version = POLICYDB_VERSION_PERMISSIVE, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, .target_type = TARGET_XEN_OLD, }, { .version = POLICYDB_VERSION_BOUNDARY, .sym_num = SYM_NUM, .ocon_num = OCON_NUM_OLD, .target_type = TARGET_XEN_OLD, }, { .version = POLICYDB_VERSION_BOUNDARY, .sym_num = SYM_NUM, .ocon_num = OCON_NUM, .target_type = TARGET_XEN, }, }; static struct policydb_compat_info *policydb_lookup_compat(int version, int target) { int i; struct policydb_compat_info *info = NULL; for ( i = 0; i < sizeof(policydb_compat)/sizeof(*info); i++ ) { if ( policydb_compat[i].version == version && policydb_compat[i].target_type == target ) { info = &policydb_compat[i]; break; } } return info; } /* * Initialize the role table. */ static int roles_init(struct policydb *p) { char *key = NULL; int rc; struct role_datum *role; role = xmalloc(struct role_datum); if ( !role ) { rc = -ENOMEM; goto out; } memset(role, 0, sizeof(*role)); role->value = ++p->p_roles.nprim; if ( role->value != OBJECT_R_VAL ) { rc = -EINVAL; goto out_free_role; } key = xmalloc_array(char, strlen(OBJECT_R)+1); if ( !key ) { rc = -ENOMEM; goto out_free_role; } strlcpy(key, OBJECT_R, strlen(OBJECT_R)+1); rc = hashtab_insert(p->p_roles.table, key, role); if ( rc ) goto out_free_key; out: return rc; out_free_key: xfree(key); out_free_role: xfree(role); goto out; } /* * Initialize a policy database structure. 
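 *
 * Initialization order, as implemented below: symtab_init() for each of the
 * SYM_NUM symbol tables (sized by symtab_sizes[]), avtab_init() for the TE
 * table, roles_init() to seed the object_r role, cond_policydb_init(), and
 * finally ebitmap_init() for the policycaps and permissive maps.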
*/ static int policydb_init(struct policydb *p) { int i, rc; memset(p, 0, sizeof(*p)); for ( i = 0; i < SYM_NUM; i++ ) { rc = symtab_init(&p->symtab[i], symtab_sizes[i]); if ( rc ) goto out_free_symtab; } rc = avtab_init(&p->te_avtab); if ( rc ) goto out_free_symtab; rc = roles_init(p); if ( rc ) goto out_free_avtab; rc = cond_policydb_init(p); if ( rc ) goto out_free_avtab; ebitmap_init(&p->policycaps); ebitmap_init(&p->permissive_map); out: return rc; out_free_avtab: avtab_destroy(&p->te_avtab); out_free_symtab: for ( i = 0; i < SYM_NUM; i++ ) hashtab_destroy(p->symtab[i].table); goto out; } /* * The following *_index functions are used to * define the val_to_name and val_to_struct arrays * in a policy database structure. The val_to_name * arrays are used when converting security context * structures into string representations. The * val_to_struct arrays are used when the attributes * of a class, role, or user are needed. */ static int common_index(void *key, void *datum, void *datap) { return 0; } static int class_index(void *key, void *datum, void *datap) { struct policydb *p; struct class_datum *cladatum; cladatum = datum; p = datap; if ( !cladatum->value || cladatum->value > p->p_classes.nprim ) return -EINVAL; p->p_class_val_to_name[cladatum->value - 1] = key; p->class_val_to_struct[cladatum->value - 1] = cladatum; return 0; } static int role_index(void *key, void *datum, void *datap) { struct policydb *p; struct role_datum *role; role = datum; p = datap; if ( !role->value || role->value > p->p_roles.nprim || role->bounds > p->p_roles.nprim ) return -EINVAL; p->p_role_val_to_name[role->value - 1] = key; p->role_val_to_struct[role->value - 1] = role; return 0; } static int type_index(void *key, void *datum, void *datap) { struct policydb *p; struct type_datum *typdatum; typdatum = datum; p = datap; if ( typdatum->primary ) { if ( !typdatum->value || typdatum->value > p->p_types.nprim || typdatum->bounds > p->p_types.nprim ) return -EINVAL; p->p_type_val_to_name[typdatum->value - 1] = key; p->type_val_to_struct[typdatum->value - 1] = typdatum; } return 0; } static int user_index(void *key, void *datum, void *datap) { struct policydb *p; struct user_datum *usrdatum; usrdatum = datum; p = datap; if ( !usrdatum->value || usrdatum->value > p->p_users.nprim || usrdatum->bounds > p->p_users.nprim ) return -EINVAL; p->p_user_val_to_name[usrdatum->value - 1] = key; p->user_val_to_struct[usrdatum->value - 1] = usrdatum; return 0; } static int sens_index(void *key, void *datum, void *datap) { struct policydb *p; struct level_datum *levdatum; levdatum = datum; p = datap; if ( !levdatum->isalias ) { if ( !levdatum->level->sens || levdatum->level->sens > p->p_levels.nprim ) return -EINVAL; p->p_sens_val_to_name[levdatum->level->sens - 1] = key; } return 0; } static int cat_index(void *key, void *datum, void *datap) { struct policydb *p; struct cat_datum *catdatum; catdatum = datum; p = datap; if ( !catdatum->isalias ) { if ( !catdatum->value || catdatum->value > p->p_cats.nprim ) return -EINVAL; p->p_cat_val_to_name[catdatum->value - 1] = key; } return 0; } static int (*index_f[SYM_NUM]) (void *key, void *datum, void *datap) = { common_index, class_index, role_index, type_index, user_index, cond_index_bool, sens_index, cat_index, }; /* * Define the class val_to_name and val_to_struct arrays in a policy * database structure. * * Caller must clean up upon failure. 
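 *
 * For example, once class_index() has run,
 *     p->p_class_val_to_name[v - 1]
 * gives the name of the class with value v, and
 *     p->class_val_to_struct[v - 1]
 * its class_datum, mirroring the assignments made in class_index() above.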
*/ static int policydb_index_classes(struct policydb *p) { int rc; p->class_val_to_struct = xmalloc_array(struct class_datum *, p->p_classes.nprim); if ( !p->class_val_to_struct ) { rc = -ENOMEM; goto out; } p->p_class_val_to_name = xmalloc_array(char *, p->p_classes.nprim); if ( !p->p_class_val_to_name ) { rc = -ENOMEM; goto out; } rc = hashtab_map(p->p_classes.table, class_index, p); out: return rc; } #ifdef DEBUG_HASHES static void symtab_hash_eval(struct symtab *s) { int i; for ( i = 0; i < SYM_NUM; i++ ) { struct hashtab *h = s[i].table; struct hashtab_info info; hashtab_stat(h, &info); printk(KERN_INFO "%s: %d entries and %d/%d buckets used, " "longest chain length %d\n", symtab_name[i], h->nel, info.slots_used, h->size, info.max_chain_len); } } #endif /* * Define the other val_to_name and val_to_struct arrays * in a policy database structure. * * Caller must clean up on failure. */ static int policydb_index_others(struct policydb *p) { int i, rc = 0; printk(KERN_INFO "Flask: %d users, %d roles, %d types, %d bools", p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim, p->p_bools.nprim); if ( flask_mls_enabled ) printk(", %d sens, %d cats", p->p_levels.nprim, p->p_cats.nprim); printk("\n"); printk(KERN_INFO "Flask: %d classes, %d rules\n", p->p_classes.nprim, p->te_avtab.nel); #ifdef DEBUG_HASHES avtab_hash_eval(&p->te_avtab, "rules"); symtab_hash_eval(p->symtab); #endif p->role_val_to_struct = xmalloc_array(struct role_datum *, p->p_roles.nprim); if ( !p->role_val_to_struct ) { rc = -ENOMEM; goto out; } p->user_val_to_struct = xmalloc_array(struct user_datum *, p->p_users.nprim); if ( !p->user_val_to_struct ) { rc = -ENOMEM; goto out; } p->type_val_to_struct = xmalloc_array(struct type_datum *, p->p_types.nprim); if ( !p->type_val_to_struct ) { rc = -ENOMEM; goto out; } if ( cond_init_bool_indexes(p) ) { rc = -ENOMEM; goto out; } for ( i = SYM_ROLES; i < SYM_NUM; i++ ) { p->sym_val_to_name[i] = xmalloc_array(char *, p->symtab[i].nprim); if ( !p->sym_val_to_name[i] ) { rc = -ENOMEM; goto out; } rc = hashtab_map(p->symtab[i].table, index_f[i], p); if ( rc ) goto out; } out: return rc; } /* * The following *_destroy functions are used to * free any memory allocated for each kind of * symbol data in the policy database. 
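 *
 * Each callback in the destroy_f[] table below is applied by
 * policydb_destroy() via
 *     hashtab_map(p->symtab[i].table, destroy_f[i], NULL);
 * before the hash tables themselves are destroyed.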
*/ static int perm_destroy(void *key, void *datum, void *p) { xfree(key); xfree(datum); return 0; } static int common_destroy(void *key, void *datum, void *p) { struct common_datum *comdatum; xfree(key); comdatum = datum; hashtab_map(comdatum->permissions.table, perm_destroy, NULL); hashtab_destroy(comdatum->permissions.table); xfree(datum); return 0; } static int class_destroy(void *key, void *datum, void *p) { struct class_datum *cladatum; struct constraint_node *constraint, *ctemp; struct constraint_expr *e, *etmp; xfree(key); cladatum = datum; hashtab_map(cladatum->permissions.table, perm_destroy, NULL); hashtab_destroy(cladatum->permissions.table); constraint = cladatum->constraints; while ( constraint ) { e = constraint->expr; while ( e ) { ebitmap_destroy(&e->names); etmp = e; e = e->next; xfree(etmp); } ctemp = constraint; constraint = constraint->next; xfree(ctemp); } constraint = cladatum->validatetrans; while ( constraint ) { e = constraint->expr; while ( e ) { ebitmap_destroy(&e->names); etmp = e; e = e->next; xfree(etmp); } ctemp = constraint; constraint = constraint->next; xfree(ctemp); } xfree(cladatum->comkey); xfree(datum); return 0; } static int role_destroy(void *key, void *datum, void *p) { struct role_datum *role; xfree(key); role = datum; ebitmap_destroy(&role->dominates); ebitmap_destroy(&role->types); xfree(datum); return 0; } static int type_destroy(void *key, void *datum, void *p) { xfree(key); xfree(datum); return 0; } static int user_destroy(void *key, void *datum, void *p) { struct user_datum *usrdatum; xfree(key); usrdatum = datum; ebitmap_destroy(&usrdatum->roles); ebitmap_destroy(&usrdatum->range.level[0].cat); ebitmap_destroy(&usrdatum->range.level[1].cat); ebitmap_destroy(&usrdatum->dfltlevel.cat); xfree(datum); return 0; } static int sens_destroy(void *key, void *datum, void *p) { struct level_datum *levdatum; xfree(key); levdatum = datum; ebitmap_destroy(&levdatum->level->cat); xfree(levdatum->level); xfree(datum); return 0; } static int cat_destroy(void *key, void *datum, void *p) { xfree(key); xfree(datum); return 0; } static int (*destroy_f[SYM_NUM]) (void *key, void *datum, void *datap) = { common_destroy, class_destroy, role_destroy, type_destroy, user_destroy, cond_destroy_bool, sens_destroy, cat_destroy, }; static void ocontext_destroy(struct ocontext *c, int i) { context_destroy(&c->context[0]); context_destroy(&c->context[1]); if ( i == OCON_ISID ) xfree(c->u.name); xfree(c); } /* * Free any memory allocated by a policy database structure. 
*/ void policydb_destroy(struct policydb *p) { struct ocontext *c, *ctmp; int i; struct role_allow *ra, *lra = NULL; struct role_trans *tr, *ltr = NULL; struct range_trans *rt, *lrt = NULL; for ( i = 0; i < SYM_NUM; i++ ) { hashtab_map(p->symtab[i].table, destroy_f[i], NULL); hashtab_destroy(p->symtab[i].table); } for ( i = 0; i < SYM_NUM; i++ ) xfree(p->sym_val_to_name[i]); xfree(p->class_val_to_struct); xfree(p->role_val_to_struct); xfree(p->user_val_to_struct); xfree(p->type_val_to_struct); avtab_destroy(&p->te_avtab); for ( i = 0; i < OCON_NUM; i++ ) { c = p->ocontexts[i]; while ( c ) { ctmp = c; c = c->next; ocontext_destroy(ctmp,i); } p->ocontexts[i] = NULL; } cond_policydb_destroy(p); for ( tr = p->role_tr; tr; tr = tr->next ) { if ( ltr ) xfree(ltr); ltr = tr; } if ( ltr ) xfree(ltr); for ( ra = p->role_allow; ra; ra = ra -> next ) { if ( lra ) xfree(lra); lra = ra; } if ( lra ) xfree(lra); for ( rt = p->range_tr; rt; rt = rt -> next ) { if ( lrt ) { ebitmap_destroy(&lrt->target_range.level[0].cat); ebitmap_destroy(&lrt->target_range.level[1].cat); xfree(lrt); } lrt = rt; } if ( lrt ) { ebitmap_destroy(&lrt->target_range.level[0].cat); ebitmap_destroy(&lrt->target_range.level[1].cat); xfree(lrt); } if ( p->type_attr_map ) for ( i = 0; i < p->p_types.nprim; i++ ) ebitmap_destroy(&p->type_attr_map[i]); xfree(p->type_attr_map); ebitmap_destroy(&p->policycaps); ebitmap_destroy(&p->permissive_map); return; } /* * Load the initial SIDs specified in a policy database * structure into a SID table. */ int policydb_load_isids(struct policydb *p, struct sidtab *s) { struct ocontext *head, *c; int rc; rc = sidtab_init(s); if ( rc ) { printk(KERN_ERR "Flask: out of memory on SID table init\n"); goto out; } head = p->ocontexts[OCON_ISID]; for ( c = head; c; c = c->next ) { if ( !c->context[0].user ) { printk(KERN_ERR "Flask: SID %s was never " "defined.\n", c->u.name); rc = -EINVAL; goto out; } if ( sidtab_insert(s, c->sid[0], &c->context[0]) ) { printk(KERN_ERR "Flask: unable to load initial " "SID %s.\n", c->u.name); rc = -EINVAL; goto out; } } out: return rc; } int policydb_class_isvalid(struct policydb *p, unsigned int class) { if ( !class || class > p->p_classes.nprim ) return 0; return 1; } int policydb_role_isvalid(struct policydb *p, unsigned int role) { if ( !role || role > p->p_roles.nprim ) return 0; return 1; } int policydb_type_isvalid(struct policydb *p, unsigned int type) { if ( !type || type > p->p_types.nprim ) return 0; return 1; } /* * Return 1 if the fields in the security context * structure `c' are valid. Return 0 otherwise. */ int policydb_context_isvalid(struct policydb *p, struct context *c) { struct role_datum *role; struct user_datum *usrdatum; if ( !c->role || c->role > p->p_roles.nprim ) return 0; if ( !c->user || c->user > p->p_users.nprim ) return 0; if ( !c->type || c->type > p->p_types.nprim ) return 0; if ( c->role != OBJECT_R_VAL ) { /* * Role must be authorized for the type. */ role = p->role_val_to_struct[c->role - 1]; if ( !ebitmap_get_bit(&role->types, c->type - 1) ) /* role may not be associated with type */ return 0; /* * User must be authorized for the role. */ usrdatum = p->user_val_to_struct[c->user - 1]; if ( !usrdatum ) return 0; if ( !ebitmap_get_bit(&usrdatum->roles, c->role - 1) ) /* user may not be associated with role */ return 0; } if ( !mls_context_isvalid(p, c) ) return 0; return 1; } /* * Read a MLS range structure from a policydb binary * representation file. 
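 *
 * On-disk layout, as parsed below: a u32 item count (1 or 2), the low
 * sensitivity, an optional high sensitivity, then one or two category
 * ebitmaps; a one-item range duplicates the low level into the high one.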
*/ static int mls_read_range_helper(struct mls_range *r, void *fp) { __le32 buf[2]; u32 items; int rc; rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto out; items = le32_to_cpu(buf[0]); if ( items > ARRAY_SIZE(buf) ) { printk(KERN_ERR "Flask: mls: range overflow\n"); rc = -EINVAL; goto out; } rc = next_entry(buf, fp, sizeof(u32) * items); if ( rc < 0 ) { printk(KERN_ERR "Flask: mls: truncated range\n"); goto out; } r->level[0].sens = le32_to_cpu(buf[0]); if ( items > 1 ) r->level[1].sens = le32_to_cpu(buf[1]); else r->level[1].sens = r->level[0].sens; rc = ebitmap_read(&r->level[0].cat, fp); if ( rc ) { printk(KERN_ERR "Flask: mls: error reading low " "categories\n"); goto out; } if ( items > 1 ) { rc = ebitmap_read(&r->level[1].cat, fp); if ( rc ) { printk(KERN_ERR "Flask: mls: error reading high " "categories\n"); goto bad_high; } } else { rc = ebitmap_cpy(&r->level[1].cat, &r->level[0].cat); if ( rc ) { printk(KERN_ERR "Flask: mls: out of memory\n"); goto bad_high; } } rc = 0; out: return rc; bad_high: ebitmap_destroy(&r->level[0].cat); goto out; } /* * Read and validate a security context structure * from a policydb binary representation file. */ static int context_read_and_validate(struct context *c, struct policydb *p, void *fp) { __le32 buf[3]; int rc; rc = next_entry(buf, fp, sizeof buf); if ( rc < 0 ) { printk(KERN_ERR "Flask: context truncated\n"); goto out; } c->user = le32_to_cpu(buf[0]); c->role = le32_to_cpu(buf[1]); c->type = le32_to_cpu(buf[2]); if ( p->policyvers >= POLICYDB_VERSION_MLS ) { if ( mls_read_range_helper(&c->range, fp) ) { printk(KERN_ERR "Flask: error reading MLS range of " "context\n"); rc = -EINVAL; goto out; } } if ( !policydb_context_isvalid(p, c) ) { printk(KERN_ERR "Flask: invalid security context\n"); context_destroy(c); rc = -EINVAL; } out: return rc; } /* * The following *_read functions are used to * read the symbol data from a policy database * binary representation file. 
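 *
 * For orientation, perm_read() below expects a record of the form
 *     { u32 name_len; u32 value; char name[name_len]; }
 * and inserts the resulting perm_datum into the owning permission table.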
*/ static int perm_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct perm_datum *perdatum; int rc; __le32 buf[2]; u32 len; perdatum = xmalloc(struct perm_datum); if ( !perdatum ) { rc = -ENOMEM; goto out; } memset(perdatum, 0, sizeof(*perdatum)); rc = next_entry(buf, fp, sizeof buf); if ( rc < 0 ) goto bad; len = le32_to_cpu(buf[0]); perdatum->value = le32_to_cpu(buf[1]); key = xmalloc_array(char, len + 1); if ( !key ) { rc = -ENOMEM; goto bad; } rc = next_entry(key, fp, len); if ( rc < 0 ) goto bad; key[len] = 0; rc = hashtab_insert(h, key, perdatum); if ( rc ) goto bad; out: return rc; bad: perm_destroy(key, perdatum, NULL); goto out; } static int common_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct common_datum *comdatum; __le32 buf[4]; u32 len, nel; int i, rc; comdatum = xmalloc(struct common_datum); if ( !comdatum ) { rc = -ENOMEM; goto out; } memset(comdatum, 0, sizeof(*comdatum)); rc = next_entry(buf, fp, sizeof buf); if ( rc < 0 ) goto bad; len = le32_to_cpu(buf[0]); comdatum->value = le32_to_cpu(buf[1]); rc = symtab_init(&comdatum->permissions, PERM_SYMTAB_SIZE); if ( rc ) goto bad; comdatum->permissions.nprim = le32_to_cpu(buf[2]); nel = le32_to_cpu(buf[3]); key = xmalloc_array(char, len + 1); if ( !key ) { rc = -ENOMEM; goto bad; } rc = next_entry(key, fp, len); if ( rc < 0 ) goto bad; key[len] = 0; for ( i = 0; i < nel; i++ ) { rc = perm_read(p, comdatum->permissions.table, fp); if ( rc ) goto bad; } rc = hashtab_insert(h, key, comdatum); if ( rc ) goto bad; out: return rc; bad: common_destroy(key, comdatum, NULL); goto out; } static int read_cons_helper(struct constraint_node **nodep, int ncons, int allowxtarget, void *fp) { struct constraint_node *c, *lc; struct constraint_expr *e, *le; __le32 buf[3]; u32 nexpr; int rc, i, j, depth; lc = NULL; for ( i = 0; i < ncons; i++ ) { c = xmalloc(struct constraint_node); if ( !c ) return -ENOMEM; memset(c, 0, sizeof(*c)); if ( lc ) { lc->next = c; } else { *nodep = c; } rc = next_entry(buf, fp, (sizeof(u32) * 2)); if ( rc < 0 ) return rc; c->permissions = le32_to_cpu(buf[0]); nexpr = le32_to_cpu(buf[1]); le = NULL; depth = -1; for ( j = 0; j < nexpr; j++ ) { e = xmalloc(struct constraint_expr); if ( !e ) return -ENOMEM; memset(e, 0, sizeof(*e)); if ( le ) le->next = e; else c->expr = e; rc = next_entry(buf, fp, (sizeof(u32) * 3)); if ( rc < 0 ) return rc; e->expr_type = le32_to_cpu(buf[0]); e->attr = le32_to_cpu(buf[1]); e->op = le32_to_cpu(buf[2]); switch ( e->expr_type ) { case CEXPR_NOT: if ( depth < 0 ) return -EINVAL; break; case CEXPR_AND: case CEXPR_OR: if ( depth < 1 ) return -EINVAL; depth--; break; case CEXPR_ATTR: if ( depth == (CEXPR_MAXDEPTH - 1) ) return -EINVAL; depth++; break; case CEXPR_NAMES: if ( !allowxtarget && (e->attr & CEXPR_XTARGET) ) return -EINVAL; if ( depth == (CEXPR_MAXDEPTH - 1) ) return -EINVAL; depth++; if ( ebitmap_read(&e->names, fp) ) return -EINVAL; break; default: return -EINVAL; } le = e; } if ( depth != 0 ) return -EINVAL; lc = c; } return 0; } static int class_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct class_datum *cladatum; __le32 buf[6]; u32 len, len2, ncons, nel; int i, rc; cladatum = xmalloc(struct class_datum); if ( !cladatum ) { rc = -ENOMEM; goto out; } memset(cladatum, 0, sizeof(*cladatum)); rc = next_entry(buf, fp, sizeof(u32)*6); if ( rc < 0 ) goto bad; len = le32_to_cpu(buf[0]); len2 = le32_to_cpu(buf[1]); cladatum->value = le32_to_cpu(buf[2]); rc = 
symtab_init(&cladatum->permissions, PERM_SYMTAB_SIZE); if ( rc ) goto bad; cladatum->permissions.nprim = le32_to_cpu(buf[3]); nel = le32_to_cpu(buf[4]); ncons = le32_to_cpu(buf[5]); key = xmalloc_array(char, len + 1); if ( !key ) { rc = -ENOMEM; goto bad; } rc = next_entry(key, fp, len); if ( rc < 0 ) goto bad; key[len] = 0; if ( len2 ) { printk(KERN_ERR "Flask: classes with common prefixes are not supported\n"); rc = -EINVAL; goto bad; } for ( i = 0; i < nel; i++ ) { rc = perm_read(p, cladatum->permissions.table, fp); if ( rc ) goto bad; } rc = read_cons_helper(&cladatum->constraints, ncons, 0, fp); if ( rc ) goto bad; if ( p->policyvers >= POLICYDB_VERSION_VALIDATETRANS ) { /* grab the validatetrans rules */ rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto bad; ncons = le32_to_cpu(buf[0]); rc = read_cons_helper(&cladatum->validatetrans, ncons, 1, fp); if ( rc ) goto bad; } rc = hashtab_insert(h, key, cladatum); if ( rc ) goto bad; rc = 0; out: return rc; bad: class_destroy(key, cladatum, NULL); goto out; } static int role_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct role_datum *role; int rc; __le32 buf[3]; u32 len; role = xmalloc(struct role_datum); if ( !role ) { rc = -ENOMEM; goto out; } memset(role, 0, sizeof(*role)); if ( p->policyvers >= POLICYDB_VERSION_BOUNDARY ) rc = next_entry(buf, fp, sizeof(buf[0]) * 3); else rc = next_entry(buf, fp, sizeof(buf[0]) * 2); if ( rc < 0 ) goto bad; len = le32_to_cpu(buf[0]); role->value = le32_to_cpu(buf[1]); if ( p->policyvers >= POLICYDB_VERSION_BOUNDARY ) role->bounds = le32_to_cpu(buf[2]); key = xmalloc_array(char, len + 1); if ( !key ) { rc = -ENOMEM; goto bad; } rc = next_entry(key, fp, len); if ( rc < 0 ) goto bad; key[len] = 0; rc = ebitmap_read(&role->dominates, fp); if ( rc ) goto bad; rc = ebitmap_read(&role->types, fp); if ( rc ) goto bad; if ( strcmp(key, OBJECT_R) == 0 ) { if ( role->value != OBJECT_R_VAL ) { printk(KERN_ERR "Role %s has wrong value %d\n", OBJECT_R, role->value); rc = -EINVAL; goto bad; } rc = 0; goto bad; } rc = hashtab_insert(h, key, role); if ( rc ) goto bad; out: return rc; bad: role_destroy(key, role, NULL); goto out; } static int type_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct type_datum *typdatum; int rc; __le32 buf[4]; u32 len; typdatum = xmalloc(struct type_datum); if ( !typdatum ) { rc = -ENOMEM; return rc; } memset(typdatum, 0, sizeof(*typdatum)); if ( p->policyvers >= POLICYDB_VERSION_BOUNDARY ) rc = next_entry(buf, fp, sizeof(buf[0]) * 4); else rc = next_entry(buf, fp, sizeof(buf[0]) * 3); if ( rc < 0 ) goto bad; len = le32_to_cpu(buf[0]); typdatum->value = le32_to_cpu(buf[1]); if ( p->policyvers >= POLICYDB_VERSION_BOUNDARY ) { u32 prop = le32_to_cpu(buf[2]); if ( prop & TYPEDATUM_PROPERTY_PRIMARY ) typdatum->primary = 1; if ( prop & TYPEDATUM_PROPERTY_ATTRIBUTE ) typdatum->attribute = 1; typdatum->bounds = le32_to_cpu(buf[3]); } else { typdatum->primary = le32_to_cpu(buf[2]); } key = xmalloc_array(char, len + 1); if ( !key ) { rc = -ENOMEM; goto bad; } rc = next_entry(key, fp, len); if ( rc < 0 ) goto bad; key[len] = 0; rc = hashtab_insert(h, key, typdatum); if ( rc ) goto bad; out: return rc; bad: type_destroy(key, typdatum, NULL); goto out; } /* * Read a MLS level structure from a policydb binary * representation file. 
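 *
 * Layout, per mls_read_level() below: a u32 sensitivity value followed by a
 * single category ebitmap.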
*/ static int mls_read_level(struct mls_level *lp, void *fp) { __le32 buf[1]; int rc; memset(lp, 0, sizeof(*lp)); rc = next_entry(buf, fp, sizeof buf); if ( rc < 0 ) { printk(KERN_ERR "Flask: mls: truncated level\n"); goto bad; } lp->sens = le32_to_cpu(buf[0]); if ( ebitmap_read(&lp->cat, fp) ) { printk(KERN_ERR "Flask: mls: error reading level categories\n"); goto bad; } return 0; bad: return -EINVAL; } static int user_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct user_datum *usrdatum; int rc; __le32 buf[3]; u32 len; usrdatum = xmalloc(struct user_datum); if ( !usrdatum ) { rc = -ENOMEM; goto out; } memset(usrdatum, 0, sizeof(*usrdatum)); if ( p->policyvers >= POLICYDB_VERSION_BOUNDARY ) rc = next_entry(buf, fp, sizeof(buf[0]) * 3); else rc = next_entry(buf, fp, sizeof(buf[0]) * 2); if ( rc < 0 ) goto bad; len = le32_to_cpu(buf[0]); usrdatum->value = le32_to_cpu(buf[1]); if ( p->policyvers >= POLICYDB_VERSION_BOUNDARY ) usrdatum->bounds = le32_to_cpu(buf[2]); key = xmalloc_array(char, len + 1); if ( !key ) { rc = -ENOMEM; goto bad; } rc = next_entry(key, fp, len); if ( rc < 0 ) goto bad; key[len] = 0; rc = ebitmap_read(&usrdatum->roles, fp); if ( rc ) goto bad; if ( p->policyvers >= POLICYDB_VERSION_MLS ) { rc = mls_read_range_helper(&usrdatum->range, fp); if ( rc ) goto bad; rc = mls_read_level(&usrdatum->dfltlevel, fp); if ( rc ) goto bad; } rc = hashtab_insert(h, key, usrdatum); if ( rc ) goto bad; out: return rc; bad: user_destroy(key, usrdatum, NULL); goto out; } static int sens_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct level_datum *levdatum; int rc; __le32 buf[2]; u32 len; levdatum = xmalloc(struct level_datum); if ( !levdatum ) { rc = -ENOMEM; goto out; } memset(levdatum, 0, sizeof(*levdatum)); rc = next_entry(buf, fp, sizeof buf); if ( rc < 0 ) goto bad; len = le32_to_cpu(buf[0]); levdatum->isalias = le32_to_cpu(buf[1]); key = xmalloc_array(char, len + 1); if ( !key ) { rc = -ENOMEM; goto bad; } rc = next_entry(key, fp, len); if ( rc < 0 ) goto bad; key[len] = 0; levdatum->level = xmalloc(struct mls_level); if ( !levdatum->level ) { rc = -ENOMEM; goto bad; } if ( mls_read_level(levdatum->level, fp) ) { rc = -EINVAL; goto bad; } rc = hashtab_insert(h, key, levdatum); if ( rc ) goto bad; out: return rc; bad: sens_destroy(key, levdatum, NULL); goto out; } static int cat_read(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct cat_datum *catdatum; int rc; __le32 buf[3]; u32 len; catdatum = xmalloc(struct cat_datum); if ( !catdatum ) { rc = -ENOMEM; goto out; } memset(catdatum, 0, sizeof(*catdatum)); rc = next_entry(buf, fp, sizeof buf); if ( rc < 0 ) goto bad; len = le32_to_cpu(buf[0]); catdatum->value = le32_to_cpu(buf[1]); catdatum->isalias = le32_to_cpu(buf[2]); key = xmalloc_array(char, len + 1); if ( !key ) { rc = -ENOMEM; goto bad; } rc = next_entry(key, fp, len); if ( rc < 0 ) goto bad; key[len] = 0; rc = hashtab_insert(h, key, catdatum); if ( rc ) goto bad; out: return rc; bad: cat_destroy(key, catdatum, NULL); goto out; } static int (*read_f[SYM_NUM]) (struct policydb *p, struct hashtab *h, void *fp) = { common_read, class_read, role_read, type_read, user_read, cond_read_bool, sens_read, cat_read, }; static int user_bounds_sanity_check(void *key, void *datum, void *datap) { struct user_datum *upper, *user; struct policydb *p = datap; int depth = 0; upper = user = datum; while (upper->bounds) { struct ebitmap_node *node; unsigned long bit; if ( ++depth == POLICYDB_BOUNDS_MAXDEPTH 
) { printk(KERN_ERR "Flask: user %s: " "too deep or looped boundary", (char *) key); return -EINVAL; } upper = p->user_val_to_struct[upper->bounds - 1]; ebitmap_for_each_positive_bit(&user->roles, node, bit) { if ( ebitmap_get_bit(&upper->roles, bit) ) continue; printk(KERN_ERR "Flask: boundary violated policy: " "user=%s role=%s bounds=%s\n", p->p_user_val_to_name[user->value - 1], p->p_role_val_to_name[bit], p->p_user_val_to_name[upper->value - 1]); return -EINVAL; } } return 0; } static int role_bounds_sanity_check(void *key, void *datum, void *datap) { struct role_datum *upper, *role; struct policydb *p = datap; int depth = 0; upper = role = datum; while (upper->bounds) { struct ebitmap_node *node; unsigned long bit; if ( ++depth == POLICYDB_BOUNDS_MAXDEPTH ) { printk(KERN_ERR "Flask: role %s: " "too deep or looped bounds\n", (char *) key); return -EINVAL; } upper = p->role_val_to_struct[upper->bounds - 1]; ebitmap_for_each_positive_bit(&role->types, node, bit) { if ( ebitmap_get_bit(&upper->types, bit) ) continue; printk(KERN_ERR "Flask: boundary violated policy: " "role=%s type=%s bounds=%s\n", p->p_role_val_to_name[role->value - 1], p->p_type_val_to_name[bit], p->p_role_val_to_name[upper->value - 1]); return -EINVAL; } } return 0; } static int type_bounds_sanity_check(void *key, void *datum, void *datap) { struct type_datum *upper, *type; struct policydb *p = datap; int depth = 0; upper = type = datum; while (upper->bounds) { if ( ++depth == POLICYDB_BOUNDS_MAXDEPTH ) { printk(KERN_ERR "Flask: type %s: " "too deep or looped boundary\n", (char *) key); return -EINVAL; } upper = p->type_val_to_struct[upper->bounds - 1]; if ( upper->attribute ) { printk(KERN_ERR "Flask: type %s: " "bounded by attribute %s", (char *) key, p->p_type_val_to_name[upper->value - 1]); return -EINVAL; } } return 0; } static int policydb_bounds_sanity_check(struct policydb *p) { int rc; if ( p->policyvers < POLICYDB_VERSION_BOUNDARY ) return 0; rc = hashtab_map(p->p_users.table, user_bounds_sanity_check, p); if ( rc ) return rc; rc = hashtab_map(p->p_roles.table, role_bounds_sanity_check, p); if ( rc ) return rc; rc = hashtab_map(p->p_types.table, type_bounds_sanity_check, p); if ( rc ) return rc; return 0; } extern int ss_initialized; /* * Read the configuration data from a policy database binary * representation file into a policy database structure. */ int policydb_read(struct policydb *p, void *fp) { struct role_allow *ra, *lra; struct role_trans *tr, *ltr; struct ocontext *l, *c /*, *newc*/; int i, j, rc; __le32 buf[8]; u32 len, /*len2,*/ config, nprim, nel /*, nel2*/; char *policydb_str; struct policydb_compat_info *info; struct range_trans *rt, *lrt; config = 0; rc = policydb_init(p); if ( rc ) goto out; /* Read the magic number and string length. 
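     * The header consumed here and just below is, in order:
     *     { u32 magic; u32 len; char policydb_string[len];
     *       u32 version; u32 config; u32 sym_num; u32 ocon_num; }
     * with the symbol tables, avtab, conditional list and object contexts
     * following.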
*/ rc = next_entry(buf, fp, sizeof(u32)* 2); if ( rc < 0 ) goto bad; if ( le32_to_cpu(buf[0]) != POLICYDB_MAGIC ) { printk(KERN_ERR "Flask: policydb magic number 0x%x does " "not match expected magic number 0x%x\n", le32_to_cpu(buf[0]), POLICYDB_MAGIC); goto bad; } len = le32_to_cpu(buf[1]); if ( len != strlen(POLICYDB_STRING) ) { printk(KERN_ERR "Flask: policydb string length %d does not " "match expected length %zu\n", len, strlen(POLICYDB_STRING)); goto bad; } policydb_str = xmalloc_array(char, len + 1); if ( !policydb_str ) { printk(KERN_ERR "Flask: unable to allocate memory for policydb " "string of length %d\n", len); rc = -ENOMEM; goto bad; } rc = next_entry(policydb_str, fp, len); if ( rc < 0 ) { printk(KERN_ERR "Flask: truncated policydb string identifier\n"); xfree(policydb_str); goto bad; } policydb_str[len] = 0; if ( strcmp(policydb_str, POLICYDB_STRING) == 0 ) p->target_type = TARGET_XEN; else if ( strcmp(policydb_str, POLICYDB_STRING_OLD) == 0 ) p->target_type = TARGET_XEN_OLD; else { printk(KERN_ERR "Flask: %s not a valid policydb string", policydb_str); xfree(policydb_str); goto bad; } /* Done with policydb_str. */ xfree(policydb_str); policydb_str = NULL; /* Read the version, config, and table sizes. */ rc = next_entry(buf, fp, sizeof(u32)*4); if ( rc < 0 ) goto bad; p->policyvers = le32_to_cpu(buf[0]); if ( p->policyvers < POLICYDB_VERSION_MIN || p->policyvers > POLICYDB_VERSION_MAX ) { printk(KERN_ERR "Flask: policydb version %d does not match " "my version range %d-%d\n", le32_to_cpu(buf[0]), POLICYDB_VERSION_MIN, POLICYDB_VERSION_MAX); goto bad; } if ( (le32_to_cpu(buf[1]) & POLICYDB_CONFIG_MLS) ) { if ( ss_initialized && !flask_mls_enabled ) { printk(KERN_ERR "Cannot switch between non-MLS and MLS " "policies\n"); goto bad; } flask_mls_enabled = 1; config |= POLICYDB_CONFIG_MLS; if ( p->policyvers < POLICYDB_VERSION_MLS ) { printk(KERN_ERR "security policydb version %d (MLS) " "not backwards compatible\n", p->policyvers); goto bad; } } else { if ( ss_initialized && flask_mls_enabled ) { printk(KERN_ERR "Cannot switch between MLS and non-MLS " "policies\n"); goto bad; } } if ( p->policyvers >= POLICYDB_VERSION_POLCAP && ebitmap_read(&p->policycaps, fp) != 0 ) goto bad; if ( p->policyvers >= POLICYDB_VERSION_PERMISSIVE && ebitmap_read(&p->permissive_map, fp) != 0 ) goto bad; info = policydb_lookup_compat(p->policyvers, p->target_type); if ( !info ) { printk(KERN_ERR "Flask: unable to find policy compat info " "for version %d target %d\n", p->policyvers, p->target_type); goto bad; } if ( le32_to_cpu(buf[2]) != info->sym_num || le32_to_cpu(buf[3]) != info->ocon_num ) { printk(KERN_ERR "Flask: policydb table sizes (%d,%d) do " "not match mine (%d,%d)\n", le32_to_cpu(buf[2]), le32_to_cpu(buf[3]), info->sym_num, info->ocon_num); goto bad; } for ( i = 0; i < info->sym_num; i++ ) { rc = next_entry(buf, fp, sizeof(u32)*2); if ( rc < 0 ) goto bad; nprim = le32_to_cpu(buf[0]); nel = le32_to_cpu(buf[1]); for ( j = 0; j < nel; j++ ) { rc = read_f[i](p, p->symtab[i].table, fp); if ( rc ) goto bad; } p->symtab[i].nprim = nprim; } rc = avtab_read(&p->te_avtab, fp, p); if ( rc ) goto bad; if ( p->policyvers >= POLICYDB_VERSION_BOOL ) { rc = cond_read_list(p, fp); if ( rc ) goto bad; } rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto bad; nel = le32_to_cpu(buf[0]); ltr = NULL; for ( i = 0; i < nel; i++ ) { tr = xmalloc(struct role_trans); if ( !tr ) { rc = -ENOMEM; goto bad; } memset(tr, 0, sizeof(*tr)); if ( ltr ) ltr->next = tr; else p->role_tr = tr; rc = next_entry(buf, fp, 
sizeof(u32)*3); if ( rc < 0 ) goto bad; tr->role = le32_to_cpu(buf[0]); tr->type = le32_to_cpu(buf[1]); tr->new_role = le32_to_cpu(buf[2]); if ( !policydb_role_isvalid(p, tr->role) || !policydb_type_isvalid(p, tr->type) || !policydb_role_isvalid(p, tr->new_role) ) { rc = -EINVAL; goto bad; } ltr = tr; } rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto bad; nel = le32_to_cpu(buf[0]); lra = NULL; for ( i = 0; i < nel; i++ ) { ra = xmalloc(struct role_allow); if ( !ra ) { rc = -ENOMEM; goto bad; } memset(ra, 0, sizeof(*ra)); if ( lra ) lra->next = ra; else p->role_allow = ra; rc = next_entry(buf, fp, sizeof(u32)*2); if ( rc < 0 ) goto bad; ra->role = le32_to_cpu(buf[0]); ra->new_role = le32_to_cpu(buf[1]); if ( !policydb_role_isvalid(p, ra->role) || !policydb_role_isvalid(p, ra->new_role) ) { rc = -EINVAL; goto bad; } lra = ra; } rc = policydb_index_classes(p); if ( rc ) goto bad; rc = policydb_index_others(p); if ( rc ) goto bad; for ( i = 0; i < info->ocon_num; i++ ) { rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto bad; nel = le32_to_cpu(buf[0]); l = NULL; for ( j = 0; j < nel; j++ ) { c = xmalloc(struct ocontext); if ( !c ) { rc = -ENOMEM; goto bad; } memset(c, 0, sizeof(*c)); if ( l ) l->next = c; else p->ocontexts[i] = c; l = c; rc = -EINVAL; switch ( i ) { case OCON_ISID: rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto bad; c->sid[0] = le32_to_cpu(buf[0]); rc = context_read_and_validate(&c->context[0], p, fp); if ( rc ) goto bad; break; case OCON_PIRQ: if ( p->target_type != TARGET_XEN ) { printk(KERN_ERR "Old xen policy does not support pirqcon"); goto bad; } rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto bad; c->u.pirq = le32_to_cpu(buf[0]); rc = context_read_and_validate(&c->context[0], p, fp); if ( rc ) goto bad; break; case OCON_IOPORT: if ( p->target_type != TARGET_XEN ) { printk(KERN_ERR "Old xen policy does not support ioportcon"); goto bad; } rc = next_entry(buf, fp, sizeof(u32) *2); if ( rc < 0 ) goto bad; c->u.ioport.low_ioport = le32_to_cpu(buf[0]); c->u.ioport.high_ioport = le32_to_cpu(buf[1]); rc = context_read_and_validate(&c->context[0], p, fp); if ( rc ) goto bad; break; case OCON_IOMEM: if ( p->target_type != TARGET_XEN ) { printk(KERN_ERR "Old xen policy does not support iomemcon"); goto bad; } rc = next_entry(buf, fp, sizeof(u32) *2); if ( rc < 0 ) goto bad; c->u.iomem.low_iomem = le32_to_cpu(buf[0]); c->u.iomem.high_iomem = le32_to_cpu(buf[1]); rc = context_read_and_validate(&c->context[0], p, fp); if ( rc ) goto bad; break; case OCON_DEVICE: if ( p->target_type != TARGET_XEN ) { printk(KERN_ERR "Old xen policy does not support pcidevicecon"); goto bad; } rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto bad; c->u.device = le32_to_cpu(buf[0]); rc = context_read_and_validate(&c->context[0], p, fp); if ( rc ) goto bad; break; default: printk(KERN_ERR "Flask: unsupported object context config data\n"); rc = -EINVAL; goto bad; } } } rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto bad; nel = le32_to_cpu(buf[0]); if ( nel ) { printk(KERN_ERR "Flask: unsupported genfs config data\n"); rc = -EINVAL; goto bad; } if ( p->policyvers >= POLICYDB_VERSION_MLS ) { int new_rangetr = p->policyvers >= POLICYDB_VERSION_RANGETRANS; rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto bad; nel = le32_to_cpu(buf[0]); lrt = NULL; for ( i = 0; i < nel; i++ ) { rt = xmalloc(struct range_trans); if ( !rt ) { rc = -ENOMEM; goto bad; } memset(rt, 0, sizeof(*rt)); if ( lrt ) lrt->next = rt; else p->range_tr = rt; rc = 
next_entry(buf, fp, (sizeof(u32) * 2)); if ( rc < 0 ) goto bad; rt->source_type = le32_to_cpu(buf[0]); rt->target_type = le32_to_cpu(buf[1]); if ( new_rangetr ) { rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) goto bad; rt->target_class = le32_to_cpu(buf[0]); } else rt->target_class = SECCLASS_DOMAIN; if ( !policydb_type_isvalid(p, rt->source_type) || !policydb_type_isvalid(p, rt->target_type) || !policydb_class_isvalid(p, rt->target_class) ) { rc = -EINVAL; goto bad; } rc = mls_read_range_helper(&rt->target_range, fp); if ( rc ) goto bad; if ( !mls_range_isvalid(p, &rt->target_range) ) { printk(KERN_WARNING "Flask: rangetrans: invalid range\n"); goto bad; } lrt = rt; } } p->type_attr_map = xmalloc_array(struct ebitmap, p->p_types.nprim); if ( !p->type_attr_map ) goto bad; for ( i = 0; i < p->p_types.nprim; i++ ) { ebitmap_init(&p->type_attr_map[i]); if ( p->policyvers >= POLICYDB_VERSION_AVTAB ) { if ( ebitmap_read(&p->type_attr_map[i], fp) ) goto bad; } /* add the type itself as the degenerate case */ if ( ebitmap_set_bit(&p->type_attr_map[i], i, 1) ) goto bad; } rc = policydb_bounds_sanity_check(p); if ( rc ) goto bad; rc = 0; out: return rc; bad: if ( !rc ) rc = -EINVAL; policydb_destroy(p); goto out; } xen-4.4.0/xen/xsm/flask/ss/avtab.h0000664000175000017500000000575612307313555015054 0ustar smbsmb/* * An access vector table (avtab) is a hash table * of access vectors and transition types indexed * by a type pair and a class. An access vector * table is used to represent the type enforcement * tables. * * Author : Stephen Smalley, */ /* Updated: Frank Mayer and Karl MacMillan * * Added conditional policy language extensions * * Copyright (C) 2003 Tresys Technology, LLC * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, version 2. 
* * Updated: Yuichi Nakamura * Tuned number of hash slots for avtab to reduce memory usage */ /* Ported to Xen 3.0, George Coker, */ #ifndef _SS_AVTAB_H_ #define _SS_AVTAB_H_ struct avtab_key { u16 source_type; /* source type */ u16 target_type; /* target type */ u16 target_class; /* target object class */ #define AVTAB_ALLOWED 1 #define AVTAB_AUDITALLOW 2 #define AVTAB_AUDITDENY 4 #define AVTAB_AV (AVTAB_ALLOWED | AVTAB_AUDITALLOW | AVTAB_AUDITDENY) #define AVTAB_TRANSITION 16 #define AVTAB_MEMBER 32 #define AVTAB_CHANGE 64 #define AVTAB_TYPE (AVTAB_TRANSITION | AVTAB_MEMBER | AVTAB_CHANGE) #define AVTAB_ENABLED_OLD 0x80000000 /* reserved for used in cond_avtab */ #define AVTAB_ENABLED 0x8000 /* reserved for used in cond_avtab */ u16 specified; /* what field is specified */ }; struct avtab_datum { u32 data; /* access vector or type value */ }; struct avtab_node { struct avtab_key key; struct avtab_datum datum; struct avtab_node *next; }; struct avtab { struct avtab_node **htable; u32 nel; /* number of elements */ u32 nslot; /* number of hash slots */ u16 mask; /* mask to compute hash func */ }; int avtab_init(struct avtab *); int avtab_alloc(struct avtab *, u32); struct avtab_datum *avtab_search(struct avtab *h, struct avtab_key *k); void avtab_destroy(struct avtab *h); void avtab_hash_eval(struct avtab *h, char *tag); struct policydb; int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, int (*insert)(struct avtab *a, struct avtab_key *k, struct avtab_datum *d, void *p), void *p); int avtab_read(struct avtab *a, void *fp, struct policydb *pol); struct avtab_node *avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datum *datum); struct avtab_node *avtab_search_node(struct avtab *h, struct avtab_key *key); struct avtab_node *avtab_search_node_next(struct avtab_node *node, int specified); #define MAX_AVTAB_HASH_BITS 13 #define MAX_AVTAB_HASH_BUCKETS (1 << MAX_AVTAB_HASH_BITS) #define MAX_AVTAB_HASH_MASK (MAX_AVTAB_HASH_BUCKETS-1) #define MAX_AVTAB_SIZE MAX_AVTAB_HASH_BUCKETS #endif /* _SS_AVTAB_H_ */ xen-4.4.0/xen/xsm/flask/ss/hashtab.c0000664000175000017500000000725012307313555015353 0ustar smbsmb/* * Implementation of the hash table type. 
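 *
 * Typical use (sketch; the symbol table code in this directory is one
 * caller):
 *     h = hashtab_create(hash_fn, keycmp_fn, size);
 *     rc = hashtab_insert(h, key, datum);
 *     datum = hashtab_search(h, key);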
* * Author : Stephen Smalley, */ /* Ported to Xen 3.0, George Coker, */ #include #include #include #include "hashtab.h" struct hashtab *hashtab_create(u32 (*hash_value)(struct hashtab *h, const void *key), int (*keycmp)(struct hashtab *h, const void *key1, const void *key2), u32 size) { struct hashtab *p; u32 i; p = xmalloc(struct hashtab); if ( p == NULL ) return p; memset(p, 0, sizeof(*p)); p->size = size; p->nel = 0; p->hash_value = hash_value; p->keycmp = keycmp; p->htable = xmalloc_array(struct hashtab_node *, size); if ( p->htable == NULL ) { xfree(p); return NULL; } for ( i = 0; i < size; i++ ) p->htable[i] = NULL; return p; } int hashtab_insert(struct hashtab *h, void *key, void *datum) { u32 hvalue; struct hashtab_node *prev, *cur, *newnode; if ( !h || h->nel == HASHTAB_MAX_NODES ) return -EINVAL; hvalue = h->hash_value(h, key); prev = NULL; cur = h->htable[hvalue]; while ( cur && h->keycmp(h, key, cur->key) > 0 ) { prev = cur; cur = cur->next; } if ( cur && (h->keycmp(h, key, cur->key) == 0) ) return -EEXIST; newnode = xmalloc(struct hashtab_node); if ( newnode == NULL ) return -ENOMEM; memset(newnode, 0, sizeof(*newnode)); newnode->key = key; newnode->datum = datum; if ( prev ) { newnode->next = prev->next; prev->next = newnode; } else { newnode->next = h->htable[hvalue]; h->htable[hvalue] = newnode; } h->nel++; return 0; } void *hashtab_search(struct hashtab *h, const void *key) { u32 hvalue; struct hashtab_node *cur; if ( !h ) return NULL; hvalue = h->hash_value(h, key); cur = h->htable[hvalue]; while ( cur != NULL && h->keycmp(h, key, cur->key) > 0 ) cur = cur->next; if ( cur == NULL || (h->keycmp(h, key, cur->key) != 0) ) return NULL; return cur->datum; } void hashtab_destroy(struct hashtab *h) { u32 i; struct hashtab_node *cur, *temp; if ( !h ) return; for ( i = 0; i < h->size; i++ ) { cur = h->htable[i]; while ( cur != NULL ) { temp = cur; cur = cur->next; xfree(temp); } h->htable[i] = NULL; } xfree(h->htable); h->htable = NULL; xfree(h); } int hashtab_map(struct hashtab *h, int (*apply)(void *k, void *d, void *args), void *args) { u32 i; int ret; struct hashtab_node *cur; if ( !h ) return 0; for ( i = 0; i < h->size; i++ ) { cur = h->htable[i]; while ( cur != NULL ) { ret = apply(cur->key, cur->datum, args); if ( ret ) return ret; cur = cur->next; } } return 0; } void hashtab_stat(struct hashtab *h, struct hashtab_info *info) { u32 i, chain_len, slots_used, max_chain_len; struct hashtab_node *cur; slots_used = 0; max_chain_len = 0; for ( slots_used = max_chain_len = i = 0; i < h->size; i++ ) { cur = h->htable[i]; if ( cur ) { slots_used++; chain_len = 0; while ( cur ) { chain_len++; cur = cur->next; } if ( chain_len > max_chain_len ) max_chain_len = chain_len; } } info->slots_used = slots_used; info->max_chain_len = max_chain_len; } xen-4.4.0/xen/xsm/flask/ss/sidtab.h0000664000175000017500000000305512307313555015213 0ustar smbsmb/* * A security identifier table (sidtab) is a hash table * of security context structures indexed by SID value. 
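 * With SIDTAB_HASH_BITS defined as 7 below, the table has 128 buckets
 * (SIDTAB_SIZE), each holding a singly linked chain of sidtab_node entries.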
* * Author : Stephen Smalley, */ /* Ported to Xen 3.0, George Coker, */ #ifndef _SS_SIDTAB_H_ #define _SS_SIDTAB_H_ #include "context.h" #include struct sidtab_node { u32 sid; /* security identifier */ struct context context; /* security context structure */ struct sidtab_node *next; }; #define SIDTAB_HASH_BITS 7 #define SIDTAB_HASH_BUCKETS (1 << SIDTAB_HASH_BITS) #define SIDTAB_HASH_MASK (SIDTAB_HASH_BUCKETS-1) #define SIDTAB_SIZE SIDTAB_HASH_BUCKETS struct sidtab { struct sidtab_node **htable; unsigned int nel; /* number of elements */ unsigned int next_sid; /* next SID to allocate */ unsigned char shutdown; spinlock_t lock; }; int sidtab_init(struct sidtab *s); int sidtab_insert(struct sidtab *s, u32 sid, struct context *context); struct context *sidtab_search(struct sidtab *s, u32 sid); int sidtab_map(struct sidtab *s, int (*apply) (u32 sid, struct context *context, void *args), void *args); void sidtab_map_remove_on_error(struct sidtab *s, int (*apply) (u32 sid, struct context *context, void *args), void *args); int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *sid); void sidtab_hash_eval(struct sidtab *h, char *tag); void sidtab_destroy(struct sidtab *s); void sidtab_set(struct sidtab *dst, struct sidtab *src); void sidtab_shutdown(struct sidtab *s); #endif /* _SS_SIDTAB_H_ */ xen-4.4.0/xen/xsm/flask/ss/symtab.h0000664000175000017500000000110512307313555015236 0ustar smbsmb/* * A symbol table (symtab) maintains associations between symbol * strings and datum values. The type of the datum values * is arbitrary. The symbol table type is implemented * using the hash table type (hashtab). * * Author : Stephen Smalley, */ #ifndef _SS_SYMTAB_H_ #define _SS_SYMTAB_H_ #include "hashtab.h" struct symtab { struct hashtab *table; /* hash table (keyed on a string) */ u32 nprim; /* number of primary names in table */ }; int symtab_init(struct symtab *s, unsigned int size); #endif /* _SS_SYMTAB_H_ */ xen-4.4.0/xen/xsm/flask/ss/conditional.c0000664000175000017500000003350012307313555016241 0ustar smbsmb/* Authors: Karl MacMillan * Frank Mayer * * Copyright (C) 2003 - 2004 Tresys Technology, LLC * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, version 2. */ /* Ported to Xen 3.0, George Coker, */ #include #include #include #include #include #include #include #include "security.h" #include "conditional.h" /* * cond_evaluate_expr evaluates a conditional expr * in reverse polish notation. It returns true (1), false (0), * or undefined (-1). Undefined occurs when the expression * exceeds the stack depth of COND_EXPR_MAXDEPTH. 
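 *
 * Worked example: for an expression such as "b1 && !b2", stored in postfix
 * order as BOOL(b1), BOOL(b2), NOT, AND, the loop below pushes the state of
 * b1, pushes the state of b2, negates the top of the stack, then ANDs the
 * top two entries, leaving the result in s[0].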
*/ static int cond_evaluate_expr(struct policydb *p, struct cond_expr *expr) { struct cond_expr *cur; int s[COND_EXPR_MAXDEPTH]; int sp = -1; for ( cur = expr; cur != NULL; cur = cur->next ) { switch ( cur->expr_type ) { case COND_BOOL: if ( sp == (COND_EXPR_MAXDEPTH - 1) ) return -1; sp++; s[sp] = p->bool_val_to_struct[cur->bool - 1]->state; break; case COND_NOT: if ( sp < 0 ) return -1; s[sp] = !s[sp]; break; case COND_OR: if ( sp < 1 ) return -1; sp--; s[sp] |= s[sp + 1]; break; case COND_AND: if ( sp < 1 ) return -1; sp--; s[sp] &= s[sp + 1]; break; case COND_XOR: if ( sp < 1 ) return -1; sp--; s[sp] ^= s[sp + 1]; break; case COND_EQ: if ( sp < 1 ) return -1; sp--; s[sp] = (s[sp] == s[sp + 1]); break; case COND_NEQ: if ( sp < 1 ) return -1; sp--; s[sp] = (s[sp] != s[sp + 1]); break; default: return -1; } } return s[0]; } /* * evaluate_cond_node evaluates the conditional stored in * a struct cond_node and if the result is different than the * current state of the node it sets the rules in the true/false * list appropriately. If the result of the expression is undefined * all of the rules are disabled for safety. */ int evaluate_cond_node(struct policydb *p, struct cond_node *node) { int new_state; struct cond_av_list* cur; new_state = cond_evaluate_expr(p, node->expr); if ( new_state != node->cur_state ) { node->cur_state = new_state; if ( new_state == -1 ) printk(KERN_ERR "Flask: expression result was undefined - disabling all rules.\n"); /* turn the rules on or off */ for ( cur = node->true_list; cur != NULL; cur = cur->next ) { if ( new_state <= 0 ) cur->node->key.specified &= ~AVTAB_ENABLED; else cur->node->key.specified |= AVTAB_ENABLED; } for ( cur = node->false_list; cur != NULL; cur = cur->next ) { /* -1 or 1 */ if ( new_state ) cur->node->key.specified &= ~AVTAB_ENABLED; else cur->node->key.specified |= AVTAB_ENABLED; } } return 0; } int cond_policydb_init(struct policydb *p) { p->bool_val_to_struct = NULL; p->cond_list = NULL; if ( avtab_init(&p->te_cond_avtab) ) return -1; return 0; } static void cond_av_list_destroy(struct cond_av_list *list) { struct cond_av_list *cur, *next; for ( cur = list; cur != NULL; cur = next ) { next = cur->next; /* the avtab_ptr_t node is destroy by the avtab */ xfree(cur); } } static void cond_node_destroy(struct cond_node *node) { struct cond_expr *cur_expr, *next_expr; for ( cur_expr = node->expr; cur_expr != NULL; cur_expr = next_expr ) { next_expr = cur_expr->next; xfree(cur_expr); } cond_av_list_destroy(node->true_list); cond_av_list_destroy(node->false_list); xfree(node); } static void cond_list_destroy(struct cond_node *list) { struct cond_node *next, *cur; if ( list == NULL ) return; for ( cur = list; cur != NULL; cur = next ) { next = cur->next; cond_node_destroy(cur); } } void cond_policydb_destroy(struct policydb *p) { xfree(p->bool_val_to_struct); avtab_destroy(&p->te_cond_avtab); cond_list_destroy(p->cond_list); } int cond_init_bool_indexes(struct policydb *p) { xfree(p->bool_val_to_struct); p->bool_val_to_struct = (struct cond_bool_datum**) xmalloc_array(struct cond_bool_datum*, p->p_bools.nprim); if ( !p->bool_val_to_struct ) return -1; return 0; } int cond_destroy_bool(void *key, void *datum, void *p) { xfree(key); xfree(datum); return 0; } int cond_index_bool(void *key, void *datum, void *datap) { struct policydb *p; struct cond_bool_datum *booldatum; booldatum = datum; p = datap; if ( !booldatum->value || booldatum->value > p->p_bools.nprim ) return -EINVAL; p->p_bool_val_to_name[booldatum->value - 1] = key; 
p->bool_val_to_struct[booldatum->value -1] = booldatum; return 0; } static int bool_isvalid(struct cond_bool_datum *b) { if ( !(b->state == 0 || b->state == 1) ) return 0; return 1; } int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp) { char *key = NULL; struct cond_bool_datum *booldatum; __le32 buf[3]; u32 len; int rc; booldatum = xmalloc(struct cond_bool_datum); if ( !booldatum ) return -1; memset(booldatum, 0, sizeof(struct cond_bool_datum)); rc = next_entry(buf, fp, sizeof buf); if ( rc < 0 ) goto err; booldatum->value = le32_to_cpu(buf[0]); booldatum->state = le32_to_cpu(buf[1]); if ( !bool_isvalid(booldatum) ) goto err; len = le32_to_cpu(buf[2]); key = xmalloc_array(char, len + 1); if ( !key ) goto err; rc = next_entry(key, fp, len); if ( rc < 0 ) goto err; key[len] = 0; if ( hashtab_insert(h, key, booldatum) ) goto err; return 0; err: cond_destroy_bool(key, booldatum, NULL); return -1; } struct cond_insertf_data { struct policydb *p; struct cond_av_list *other; struct cond_av_list *head; struct cond_av_list *tail; }; static int cond_insertf(struct avtab *a, struct avtab_key *k, struct avtab_datum *d, void *ptr) { struct cond_insertf_data *data = ptr; struct policydb *p = data->p; struct cond_av_list *other = data->other, *list, *cur; struct avtab_node *node_ptr; u8 found; /* * For type rules we have to make certain there aren't any * conflicting rules by searching the te_avtab and the * cond_te_avtab. */ if ( k->specified & AVTAB_TYPE ) { if ( avtab_search(&p->te_avtab, k) ) { printk("Flask: type rule already exists outside of a " "conditional."); goto err; } /* * If we are reading the false list other will be a pointer to * the true list. We can have duplicate entries if there is only * 1 other entry and it is in our true list. * * If we are reading the true list (other == NULL) there shouldn't * be any other entries. 
*/ if ( other ) { node_ptr = avtab_search_node(&p->te_cond_avtab, k); if ( node_ptr ) { if ( avtab_search_node_next(node_ptr, k->specified) ) { printk("Flask: too many conflicting type rules."); goto err; } found = 0; for ( cur = other; cur != NULL; cur = cur->next ) { if ( cur->node == node_ptr ) { found = 1; break; } } if ( !found ) { printk("Flask: conflicting type rules.\n"); goto err; } } } else { if ( avtab_search(&p->te_cond_avtab, k) ) { printk("Flask: conflicting type rules when adding type rule " "for true.\n"); goto err; } } } node_ptr = avtab_insert_nonunique(&p->te_cond_avtab, k, d); if ( !node_ptr ) { printk("Flask: could not insert rule."); goto err; } list = xmalloc(struct cond_av_list); if ( !list ) goto err; memset(list, 0, sizeof(*list)); list->node = node_ptr; if ( !data->head ) data->head = list; else data->tail->next = list; data->tail = list; return 0; err: cond_av_list_destroy(data->head); data->head = NULL; return -1; } static int cond_read_av_list(struct policydb *p, void *fp, struct cond_av_list **ret_list, struct cond_av_list *other) { int i, rc; __le32 buf[1]; u32 len; struct cond_insertf_data data; *ret_list = NULL; len = 0; rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) return -1; len = le32_to_cpu(buf[0]); if ( len == 0 ) { return 0; } data.p = p; data.other = other; data.head = NULL; data.tail = NULL; for ( i = 0; i < len; i++ ) { rc = avtab_read_item(&p->te_cond_avtab, fp, p, cond_insertf, &data); if ( rc ) return rc; } *ret_list = data.head; return 0; } static int expr_isvalid(struct policydb *p, struct cond_expr *expr) { if ( expr->expr_type <= 0 || expr->expr_type > COND_LAST ) { printk("Flask: conditional expressions uses unknown operator.\n"); return 0; } if ( expr->bool > p->p_bools.nprim ) { printk("Flask: conditional expressions uses unknown bool.\n"); return 0; } return 1; } static int cond_read_node(struct policydb *p, struct cond_node *node, void *fp) { __le32 buf[2]; u32 len, i; int rc; struct cond_expr *expr = NULL, *last = NULL; rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) return -1; node->cur_state = le32_to_cpu(buf[0]); len = 0; rc = next_entry(buf, fp, sizeof(u32)); if ( rc < 0 ) return -1; /* expr */ len = le32_to_cpu(buf[0]); for ( i = 0; i < len; i++ ) { rc = next_entry(buf, fp, sizeof(u32) * 2); if ( rc < 0 ) goto err; expr = xmalloc(struct cond_expr); if ( !expr ) { goto err; } memset(expr, 0, sizeof(struct cond_expr)); expr->expr_type = le32_to_cpu(buf[0]); expr->bool = le32_to_cpu(buf[1]); if ( !expr_isvalid(p, expr) ) { xfree(expr); goto err; } if ( i == 0 ) node->expr = expr; else last->next = expr; last = expr; } if ( cond_read_av_list(p, fp, &node->true_list, NULL) != 0 ) goto err; if ( cond_read_av_list(p, fp, &node->false_list, node->true_list) != 0 ) goto err; return 0; err: cond_node_destroy(node); return -1; } int cond_read_list(struct policydb *p, void *fp) { struct cond_node *node, *last = NULL; __le32 buf[1]; u32 i, len; int rc; rc = next_entry(buf, fp, sizeof buf); if ( rc < 0 ) return -1; len = le32_to_cpu(buf[0]); rc = avtab_alloc(&(p->te_cond_avtab), p->te_avtab.nel); if ( rc ) goto err; for ( i = 0; i < len; i++ ) { node = xmalloc(struct cond_node); if ( !node ) goto err; memset(node, 0, sizeof(struct cond_node)); if ( cond_read_node(p, node, fp) != 0 ) goto err; if ( i == 0 ) p->cond_list = node; else last->next = node; last = node; } return 0; err: cond_list_destroy(p->cond_list); p->cond_list = NULL; return -1; } /* Determine whether additional permissions are granted by the conditional * av table, 
and if so, add them to the result */ void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd) { struct avtab_node *node; if( !ctab || !key || !avd ) return; for( node = avtab_search_node(ctab, key); node != NULL; node = avtab_search_node_next(node, key->specified) ) { if ( (u16) (AVTAB_ALLOWED|AVTAB_ENABLED) == (node->key.specified & (AVTAB_ALLOWED|AVTAB_ENABLED)) ) avd->allowed |= node->datum.data; if ( (u16) (AVTAB_AUDITDENY|AVTAB_ENABLED) == (node->key.specified & (AVTAB_AUDITDENY|AVTAB_ENABLED)) ) /* Since a '0' in an auditdeny mask represents a * permission we do NOT want to audit (dontaudit), we use * the '&' operand to ensure that all '0's in the mask * are retained (much unlike the allow and auditallow cases). */ avd->auditdeny &= node->datum.data; if ( (u16) (AVTAB_AUDITALLOW|AVTAB_ENABLED) == (node->key.specified & (AVTAB_AUDITALLOW|AVTAB_ENABLED)) ) avd->auditallow |= node->datum.data; } return; } xen-4.4.0/xen/xsm/flask/ss/services.c0000664000175000017500000017263612307313555015577 0ustar smbsmb/* * Implementation of the security services. * * Authors : Stephen Smalley, * James Morris * * Updated: Trusted Computer Solutions, Inc. * * Support for enhanced MLS infrastructure. * * Updated: Frank Mayer and Karl MacMillan * * Added conditional policy language extensions * * Updated: Hewlett-Packard * * Added support for the policy capability bitmap * * Updated: Chad Sellers * * Added validation of kernel classes and permissions * * Updated: KaiGai Kohei * * Added support for bounds domain and audit messaged on masked permissions * * Copyright (C) 2008, 2009 NEC Corporation * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC * Copyright (C) 2003 Red Hat, Inc., James Morris * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, version 2. */ /* Ported to Xen 3.0, George Coker, */ #include #include #include #include #include #include "flask.h" #include "avc.h" #include "avc_ss.h" #include "security.h" #include "context.h" #include "policydb.h" #include "sidtab.h" #include "services.h" #include "conditional.h" #include "mls.h" unsigned int policydb_loaded_version; static DEFINE_RWLOCK(policy_rwlock); #define POLICY_RDLOCK read_lock(&policy_rwlock) #define POLICY_WRLOCK write_lock(&policy_rwlock) #define POLICY_RDUNLOCK read_unlock(&policy_rwlock) #define POLICY_WRUNLOCK write_unlock(&policy_rwlock) static DEFINE_SPINLOCK(load_sem); #define LOAD_LOCK spin_lock(&load_sem) #define LOAD_UNLOCK spin_unlock(&load_sem) static struct sidtab sidtab; struct policydb policydb; int ss_initialized = 0; /* * The largest sequence number that has been used when * providing an access decision to the access vector cache. * The sequence number only changes when a policy change * occurs. */ static u32 latest_granting = 0; /* Forward declaration. */ static int context_struct_to_string(struct context *context, char **scontext, u32 *scontext_len); static int context_struct_compute_av(struct context *scontext, struct context *tcontext, u16 tclass, u32 requested, struct av_decision *avd); /* * Return the boolean value of a constraint expression * when it is applied to the specified source and target * security contexts. * * xcontext is a special beast... It is used by the validatetrans rules * only. 
For these rules, scontext is the context before the transition, * tcontext is the context after the transition, and xcontext is the context * of the process performing the transition. All other callers of * constraint_expr_eval should pass in NULL for xcontext. */ static int constraint_expr_eval(struct context *scontext, struct context *tcontext, struct context *xcontext, struct constraint_expr *cexpr) { u32 val1, val2; struct context *c; struct role_datum *r1, *r2; struct mls_level *l1, *l2; struct constraint_expr *e; int s[CEXPR_MAXDEPTH]; int sp = -1; for ( e = cexpr; e; e = e->next ) { switch ( e->expr_type ) { case CEXPR_NOT: BUG_ON(sp < 0); s[sp] = !s[sp]; break; case CEXPR_AND: BUG_ON(sp < 1); sp--; s[sp] &= s[sp+1]; break; case CEXPR_OR: BUG_ON(sp < 1); sp--; s[sp] |= s[sp+1]; break; case CEXPR_ATTR: if ( sp == (CEXPR_MAXDEPTH-1) ) return 0; switch ( e->attr ) { case CEXPR_USER: val1 = scontext->user; val2 = tcontext->user; break; case CEXPR_TYPE: val1 = scontext->type; val2 = tcontext->type; break; case CEXPR_ROLE: val1 = scontext->role; val2 = tcontext->role; r1 = policydb.role_val_to_struct[val1 - 1]; r2 = policydb.role_val_to_struct[val2 - 1]; switch ( e->op ) { case CEXPR_DOM: s[++sp] = ebitmap_get_bit(&r1->dominates, val2 - 1); continue; case CEXPR_DOMBY: s[++sp] = ebitmap_get_bit(&r2->dominates, val1 - 1); continue; case CEXPR_INCOMP: s[++sp] = ( !ebitmap_get_bit(&r1->dominates, val2 - 1) && !ebitmap_get_bit(&r2->dominates, val1 - 1) ); continue; default: break; } break; case CEXPR_L1L2: l1 = &(scontext->range.level[0]); l2 = &(tcontext->range.level[0]); goto mls_ops; case CEXPR_L1H2: l1 = &(scontext->range.level[0]); l2 = &(tcontext->range.level[1]); goto mls_ops; case CEXPR_H1L2: l1 = &(scontext->range.level[1]); l2 = &(tcontext->range.level[0]); goto mls_ops; case CEXPR_H1H2: l1 = &(scontext->range.level[1]); l2 = &(tcontext->range.level[1]); goto mls_ops; case CEXPR_L1H1: l1 = &(scontext->range.level[0]); l2 = &(scontext->range.level[1]); goto mls_ops; case CEXPR_L2H2: l1 = &(tcontext->range.level[0]); l2 = &(tcontext->range.level[1]); goto mls_ops; mls_ops: switch ( e->op ) { case CEXPR_EQ: s[++sp] = mls_level_eq(l1, l2); continue; case CEXPR_NEQ: s[++sp] = !mls_level_eq(l1, l2); continue; case CEXPR_DOM: s[++sp] = mls_level_dom(l1, l2); continue; case CEXPR_DOMBY: s[++sp] = mls_level_dom(l2, l1); continue; case CEXPR_INCOMP: s[++sp] = mls_level_incomp(l2, l1); continue; default: BUG(); return 0; } break; default: BUG(); return 0; } switch ( e->op ) { case CEXPR_EQ: s[++sp] = (val1 == val2); break; case CEXPR_NEQ: s[++sp] = (val1 != val2); break; default: BUG(); return 0; } break; case CEXPR_NAMES: if ( sp == (CEXPR_MAXDEPTH-1) ) return 0; c = scontext; if ( e->attr & CEXPR_TARGET ) c = tcontext; else if ( e->attr & CEXPR_XTARGET ) { c = xcontext; if ( !c ) { BUG(); return 0; } } if ( e->attr & CEXPR_USER ) val1 = c->user; else if ( e->attr & CEXPR_ROLE ) val1 = c->role; else if ( e->attr & CEXPR_TYPE ) val1 = c->type; else { BUG(); return 0; } switch ( e->op ) { case CEXPR_EQ: s[++sp] = ebitmap_get_bit(&e->names, val1 - 1); break; case CEXPR_NEQ: s[++sp] = !ebitmap_get_bit(&e->names, val1 - 1); break; default: BUG(); return 0; } break; default: BUG(); return 0; } } BUG_ON(sp != 0); return s[0]; } /* * security_dump_masked_av - dumps masked permissions during * security_compute_av due to RBAC, MLS/Constraint and Type bounds. 
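 * The resulting audit line has the form (values illustrative only):
 *   Flask: op=security_compute_av reason=bounds scontext=... tcontext=...
 *   tclass=domain perms=transition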
*/ static int dump_masked_av_helper(void *k, void *d, void *args) { struct perm_datum *pdatum = d; char **permission_names = args; BUG_ON(pdatum->value < 1 || pdatum->value > 32); permission_names[pdatum->value - 1] = (char *)k; return 0; } static void security_dump_masked_av(struct context *scontext, struct context *tcontext, u16 tclass, u32 permissions, const char *reason) { struct common_datum *common_dat; struct class_datum *tclass_dat; char *tclass_name; char *scontext_name = NULL; char *tcontext_name = NULL; char *permission_names[32]; int index; u32 length; unsigned char need_comma = 0; if ( !permissions ) return; tclass_name = policydb.p_class_val_to_name[tclass - 1]; tclass_dat = policydb.class_val_to_struct[tclass - 1]; common_dat = tclass_dat->comdatum; /* init permission_names */ if ( common_dat && hashtab_map(common_dat->permissions.table, dump_masked_av_helper, permission_names) < 0 ) goto out; if ( hashtab_map(tclass_dat->permissions.table, dump_masked_av_helper, permission_names) < 0 ) goto out; /* get scontext/tcontext in text form */ if ( context_struct_to_string(scontext, &scontext_name, &length) < 0 ) goto out; if ( context_struct_to_string(tcontext, &tcontext_name, &length) < 0 ) goto out; printk("Flask: op=security_compute_av reason=%s " "scontext=%s tcontext=%s tclass=%s perms=", reason, scontext_name, tcontext_name, tclass_name); for ( index = 0; index < 32; index++ ) { u32 mask = (1 << index); if ( (mask & permissions) == 0 ) continue; printk("%s%s", need_comma ? "," : "", permission_names[index] ? permission_names[index] : "????"); need_comma = 1; } printk("\n"); out: /* release scontext/tcontext */ xfree(tcontext_name); xfree(scontext_name); return; } /* * security_boundary_permission - drops violated permissions * on boundary constraint. */ static void type_attribute_bounds_av(struct context *scontext, struct context *tcontext, u16 tclass, u32 requested, struct av_decision *avd) { struct context lo_scontext; struct context lo_tcontext; struct av_decision lo_avd; struct type_datum *source = policydb.type_val_to_struct[scontext->type - 1]; struct type_datum *target = policydb.type_val_to_struct[tcontext->type - 1]; u32 masked = 0; if ( source->bounds ) { memset(&lo_avd, 0, sizeof(lo_avd)); memcpy(&lo_scontext, scontext, sizeof(lo_scontext)); lo_scontext.type = source->bounds; context_struct_compute_av(&lo_scontext, tcontext, tclass, requested, &lo_avd); if ( (lo_avd.allowed & avd->allowed) == avd->allowed ) return; /* no masked permission */ masked = ~lo_avd.allowed & avd->allowed; } if ( target->bounds ) { memset(&lo_avd, 0, sizeof(lo_avd)); memcpy(&lo_tcontext, tcontext, sizeof(lo_tcontext)); lo_tcontext.type = target->bounds; context_struct_compute_av(scontext, &lo_tcontext, tclass, requested, &lo_avd); if ( (lo_avd.allowed & avd->allowed) == avd->allowed ) return; /* no masked permission */ masked = ~lo_avd.allowed & avd->allowed; } if ( source->bounds && target->bounds ) { memset(&lo_avd, 0, sizeof(lo_avd)); /* * lo_scontext and lo_tcontext are already * set up. */ context_struct_compute_av(&lo_scontext, &lo_tcontext, tclass, requested, &lo_avd); if ( (lo_avd.allowed & avd->allowed) == avd->allowed ) return; /* no masked permission */ masked = ~lo_avd.allowed & avd->allowed; } if ( masked ) { /* mask violated permissions */ avd->allowed &= ~masked; /* audit masked permissions */ security_dump_masked_av(scontext, tcontext, tclass, masked, "bounds"); } } /* * Compute access vectors based on a context structure pair for * the permissions in a particular class. 
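 * The decision is assembled in stages: type-enforcement rules from the
 * main and conditional avtabs, then constraints (including MLS), then
 * the role-allow check for DOMAIN__TRANSITION, and finally type bounds
 * masking via type_attribute_bounds_av().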
*/ static int context_struct_compute_av(struct context *scontext, struct context *tcontext, u16 tclass, u32 requested, struct av_decision *avd) { struct constraint_node *constraint; struct role_allow *ra; struct avtab_key avkey; struct avtab_node *node; struct class_datum *tclass_datum; struct ebitmap *sattr, *tattr; struct ebitmap_node *snode, *tnode; unsigned int i, j; /* * Initialize the access vectors to the default values. */ avd->allowed = 0; avd->auditallow = 0; avd->auditdeny = 0xffffffff; avd->seqno = latest_granting; avd->flags = 0; /* * We do not presently support policydb.handle_unknown == allow in Xen. */ if ( !tclass || tclass > policydb.p_classes.nprim ) return -EINVAL; tclass_datum = policydb.class_val_to_struct[tclass - 1]; /* * If a specific type enforcement rule was defined for * this permission check, then use it. */ avkey.target_class = tclass; avkey.specified = AVTAB_AV; sattr = &policydb.type_attr_map[scontext->type - 1]; tattr = &policydb.type_attr_map[tcontext->type - 1]; ebitmap_for_each_positive_bit(sattr, snode, i) { ebitmap_for_each_positive_bit(tattr, tnode, j) { avkey.source_type = i + 1; avkey.target_type = j + 1; for ( node = avtab_search_node(&policydb.te_avtab, &avkey); node != NULL; node = avtab_search_node_next(node, avkey.specified) ) { if ( node->key.specified == AVTAB_ALLOWED ) avd->allowed |= node->datum.data; else if ( node->key.specified == AVTAB_AUDITALLOW ) avd->auditallow |= node->datum.data; else if ( node->key.specified == AVTAB_AUDITDENY ) avd->auditdeny &= node->datum.data; } /* Check conditional av table for additional permissions */ cond_compute_av(&policydb.te_cond_avtab, &avkey, avd); } } /* * Remove any permissions prohibited by a constraint (this includes * the MLS policy). */ constraint = tclass_datum->constraints; while ( constraint ) { if ( (constraint->permissions & (avd->allowed) ) && !constraint_expr_eval(scontext, tcontext, NULL, constraint->expr)) { avd->allowed &= ~(constraint->permissions); } constraint = constraint->next; } /* * If checking process transition permission and the * role is changing, then check the (current_role, new_role) * pair. */ if ( tclass == SECCLASS_DOMAIN && (avd->allowed & DOMAIN__TRANSITION) && scontext->role != tcontext->role ) { for ( ra = policydb.role_allow; ra; ra = ra->next ) { if ( scontext->role == ra->role && tcontext->role == ra->new_role ) break; } if (!ra) avd->allowed &= ~DOMAIN__TRANSITION; } /* * If the given source and target types have boundary * constraint, lazy checks have to mask any violated * permission and notice it to userspace via audit. 
*/ type_attribute_bounds_av(scontext, tcontext, tclass, requested, avd); return 0; } static int security_validtrans_handle_fail(struct context *ocontext, struct context *ncontext, struct context *tcontext, u16 tclass) { char *o = NULL, *n = NULL, *t = NULL; u32 olen, nlen, tlen; if ( context_struct_to_string(ocontext, &o, &olen) < 0 ) goto out; if ( context_struct_to_string(ncontext, &n, &nlen) < 0 ) goto out; if ( context_struct_to_string(tcontext, &t, &tlen) < 0 ) goto out; printk("security_validate_transition: denied for" " oldcontext=%s newcontext=%s taskcontext=%s tclass=%s", o, n, t, policydb.p_class_val_to_name[tclass-1]); out: xfree(o); xfree(n); xfree(t); if ( !flask_enforcing ) return 0; return -EPERM; } int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass) { struct context *ocontext; struct context *ncontext; struct context *tcontext; struct class_datum *tclass_datum; struct constraint_node *constraint; int rc = 0; if ( !ss_initialized ) return 0; POLICY_RDLOCK; if ( !tclass || tclass > policydb.p_classes.nprim ) { printk(KERN_ERR "security_validate_transition: " "unrecognized class %d\n", tclass); rc = -EINVAL; goto out; } tclass_datum = policydb.class_val_to_struct[tclass - 1]; ocontext = sidtab_search(&sidtab, oldsid); if ( !ocontext ) { printk(KERN_ERR "security_validate_transition: " " unrecognized SID %d\n", oldsid); rc = -EINVAL; goto out; } ncontext = sidtab_search(&sidtab, newsid); if ( !ncontext ) { printk(KERN_ERR "security_validate_transition: " " unrecognized SID %d\n", newsid); rc = -EINVAL; goto out; } tcontext = sidtab_search(&sidtab, tasksid); if ( !tcontext ) { printk(KERN_ERR "security_validate_transition: " " unrecognized SID %d\n", tasksid); rc = -EINVAL; goto out; } constraint = tclass_datum->validatetrans; while ( constraint ) { if ( !constraint_expr_eval(ocontext, ncontext, tcontext, constraint->expr) ) { rc = security_validtrans_handle_fail(ocontext, ncontext, tcontext, tclass); goto out; } constraint = constraint->next; } out: POLICY_RDUNLOCK; return rc; } /** * security_compute_av - Compute access vector decisions. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @requested: requested permissions * @avd: access vector decisions * * Compute a set of access vector decisions based on the * SID pair (@ssid, @tsid) for the permissions in @tclass. * Return -%EINVAL if any of the parameters are invalid or %0 * if the access vector decisions were computed successfully. */ int security_compute_av(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *avd) { struct context *scontext = NULL, *tcontext = NULL; int rc = 0; if ( !ss_initialized ) { avd->allowed = 0xffffffff; avd->auditallow = 0; avd->auditdeny = 0xffffffff; avd->seqno = latest_granting; return 0; } POLICY_RDLOCK; scontext = sidtab_search(&sidtab, ssid); if ( !scontext ) { printk("security_compute_av: unrecognized SID %d\n", ssid); rc = -EINVAL; goto out; } tcontext = sidtab_search(&sidtab, tsid); if ( !tcontext ) { printk("security_compute_av: unrecognized SID %d\n", tsid); rc = -EINVAL; goto out; } rc = context_struct_compute_av(scontext, tcontext, tclass, requested, avd); /* permissive domain? */ if ( ebitmap_get_bit(&policydb.permissive_map, scontext->type) ) avd->flags |= AVD_FLAGS_PERMISSIVE; out: POLICY_RDUNLOCK; return rc; } /* * Write the security context string representation of * the context structure `context' into a dynamically * allocated string of the correct size. 
Set `*scontext' * to point to this string and set `*scontext_len' to * the length of the string. */ static int context_struct_to_string(struct context *context, char **scontext, u32 *scontext_len) { char *scontextp; *scontext = NULL; *scontext_len = 0; /* Compute the size of the context. */ *scontext_len += strlen(policydb.p_user_val_to_name[context->user - 1]) + 1; *scontext_len += strlen(policydb.p_role_val_to_name[context->role - 1]) + 1; *scontext_len += strlen(policydb.p_type_val_to_name[context->type - 1]) + 1; *scontext_len += mls_compute_context_len(context); /* Allocate space for the context; caller must free this space. */ scontextp = xmalloc_array(char, *scontext_len); if ( !scontextp ) return -ENOMEM; *scontext = scontextp; /* * Copy the user name, role name and type name into the context. */ snprintf(scontextp, *scontext_len, "%s:%s:%s", policydb.p_user_val_to_name[context->user - 1], policydb.p_role_val_to_name[context->role - 1], policydb.p_type_val_to_name[context->type - 1]); scontextp += strlen(policydb.p_user_val_to_name[context->user - 1]) + 1 + strlen(policydb.p_role_val_to_name[context->role - 1]) + 1 + strlen(policydb.p_type_val_to_name[context->type - 1]); mls_sid_to_context(context, &scontextp); *scontextp = 0; return 0; } #include "initial_sid_to_string.h" /** * security_sid_to_context - Obtain a context for a given SID. * @sid: security identifier, SID * @scontext: security context * @scontext_len: length in bytes * * Write the string representation of the context associated with @sid * into a dynamically allocated string of the correct size. Set @scontext * to point to this string and set @scontext_len to the length of the string. */ int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len) { struct context *context; int rc = 0; if ( !ss_initialized ) { if ( sid <= SECINITSID_NUM ) { char *scontextp; *scontext_len = strlen(initial_sid_to_string[sid]) + 1; scontextp = xmalloc_array(char, *scontext_len); strlcpy(scontextp, initial_sid_to_string[sid], *scontext_len); *scontext = scontextp; goto out; } printk(KERN_ERR "security_sid_to_context: called before initial " "load_policy on unknown SID %d\n", sid); rc = -EINVAL; goto out; } POLICY_RDLOCK; context = sidtab_search(&sidtab, sid); if ( !context ) { printk(KERN_ERR "security_sid_to_context: unrecognized SID " "%d\n", sid); rc = -EINVAL; goto out_unlock; } rc = context_struct_to_string(context, scontext, scontext_len); out_unlock: POLICY_RDUNLOCK; out: return rc; } /** * security_context_to_sid - Obtain a SID for a given security context. * @scontext: security context * @scontext_len: length in bytes * @sid: security identifier, SID * * Obtains a SID associated with the security context that * has the string representation specified by @scontext. * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient * memory is available, or 0 on success. */ int security_context_to_sid(char *scontext, u32 scontext_len, u32 *sid) { char *scontext2; struct context context; struct role_datum *role; struct type_datum *typdatum; struct user_datum *usrdatum; char *scontextp, *p, oldc; int rc = 0; if ( !ss_initialized ) { int i; for ( i = 1; i < SECINITSID_NUM; i++ ) { if ( !strcmp(initial_sid_to_string[i], scontext) ) { *sid = i; goto out; } } *sid = SECINITSID_XEN; goto out; } *sid = SECSID_NULL; /* Copy the string so that we can modify the copy as we parse it. 
The string should already by null terminated, but we append a null suffix to the copy to avoid problems with the existing attr package, which doesn't view the null terminator as part of the attribute value. */ scontext2 = xmalloc_array(char, scontext_len+1); if ( !scontext2 ) { rc = -ENOMEM; goto out; } memcpy(scontext2, scontext, scontext_len); scontext2[scontext_len] = 0; context_init(&context); *sid = SECSID_NULL; POLICY_RDLOCK; /* Parse the security context. */ rc = -EINVAL; scontextp = (char *) scontext2; /* Extract the user. */ p = scontextp; while ( *p && *p != ':' ) p++; if (*p == 0) goto out_unlock; *p++ = 0; usrdatum = hashtab_search(policydb.p_users.table, scontextp); if ( !usrdatum ) goto out_unlock; context.user = usrdatum->value; /* Extract role. */ scontextp = p; while ( *p && *p != ':' ) p++; if ( *p == 0 ) goto out_unlock; *p++ = 0; role = hashtab_search(policydb.p_roles.table, scontextp); if ( !role ) goto out_unlock; context.role = role->value; /* Extract type. */ scontextp = p; while ( *p && *p != ':' ) p++; oldc = *p; *p++ = 0; typdatum = hashtab_search(policydb.p_types.table, scontextp); if ( !typdatum || typdatum->attribute ) goto out_unlock; context.type = typdatum->value; rc = mls_context_to_sid(oldc, &p, &context, &sidtab); if ( rc ) goto out_unlock; if ( (p - scontext2) < scontext_len ) { rc = -EINVAL; goto out_unlock; } /* Check the validity of the new context. */ if ( !policydb_context_isvalid(&policydb, &context) ) { rc = -EINVAL; goto out_unlock; } /* Obtain the new sid. */ rc = sidtab_context_to_sid(&sidtab, &context, sid); out_unlock: POLICY_RDUNLOCK; context_destroy(&context); xfree(scontext2); out: return rc; } static int compute_sid_handle_invalid_context( struct context *scontext, struct context *tcontext, u16 tclass, struct context *newcontext) { char *s = NULL, *t = NULL, *n = NULL; u32 slen, tlen, nlen; if ( context_struct_to_string(scontext, &s, &slen) < 0 ) goto out; if ( context_struct_to_string(tcontext, &t, &tlen) < 0 ) goto out; if ( context_struct_to_string(newcontext, &n, &nlen) < 0 ) goto out; printk("security_compute_sid: invalid context %s" " for scontext=%s" " tcontext=%s" " tclass=%s", n, s, t, policydb.p_class_val_to_name[tclass-1]); out: xfree(s); xfree(t); xfree(n); if ( !flask_enforcing ) return 0; return -EACCES; } static int security_compute_sid(u32 ssid, u32 tsid, u16 tclass, u32 specified, u32 *out_sid) { struct context *scontext = NULL, *tcontext = NULL, newcontext; struct role_trans *roletr = NULL; struct avtab_key avkey; struct avtab_datum *avdatum; struct avtab_node *node; int rc = 0; if ( !ss_initialized ) { switch ( tclass ) { case SECCLASS_DOMAIN: *out_sid = ssid; break; default: *out_sid = tsid; break; } goto out; } POLICY_RDLOCK; scontext = sidtab_search(&sidtab, ssid); if ( !scontext ) { printk(KERN_ERR "security_compute_sid: unrecognized SID %d\n", ssid); rc = -EINVAL; goto out_unlock; } tcontext = sidtab_search(&sidtab, tsid); if ( !tcontext ) { printk(KERN_ERR "security_compute_sid: unrecognized SID %d\n", tsid); rc = -EINVAL; goto out_unlock; } context_init(&newcontext); /* Set the user identity. */ switch ( specified ) { case AVTAB_TRANSITION: case AVTAB_CHANGE: /* Use the process user identity. */ newcontext.user = scontext->user; break; case AVTAB_MEMBER: /* Use the related object owner. */ newcontext.user = tcontext->user; break; } /* Set the role and type to default values. */ switch ( tclass ) { case SECCLASS_DOMAIN: /* Use the current role and type of process. 
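 * (The type may still be overridden by a type transition/member/change
 * rule below, and the role by a role_transition rule.)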
*/ newcontext.role = scontext->role; newcontext.type = scontext->type; break; default: /* Use the well-defined object role. */ newcontext.role = OBJECT_R_VAL; /* Use the type of the related object. */ newcontext.type = tcontext->type; } /* Look for a type transition/member/change rule. */ avkey.source_type = scontext->type; avkey.target_type = tcontext->type; avkey.target_class = tclass; avkey.specified = specified; avdatum = avtab_search(&policydb.te_avtab, &avkey); /* If no permanent rule, also check for enabled conditional rules */ if ( !avdatum ) { node = avtab_search_node(&policydb.te_cond_avtab, &avkey); for ( ; node != NULL; node = avtab_search_node_next(node, specified) ) { if ( node->key.specified & AVTAB_ENABLED ) { avdatum = &node->datum; break; } } } if ( avdatum ) { /* Use the type from the type transition/member/change rule. */ newcontext.type = avdatum->data; } /* Check for class-specific changes. */ switch ( tclass ) { case SECCLASS_DOMAIN: if ( specified & AVTAB_TRANSITION ) { /* Look for a role transition rule. */ for ( roletr = policydb.role_tr; roletr; roletr = roletr->next ) { if ( roletr->role == scontext->role && roletr->type == tcontext->type ) { /* Use the role transition rule. */ newcontext.role = roletr->new_role; break; } } } break; default: break; } /* Set the MLS attributes. This is done last because it may allocate memory. */ rc = mls_compute_sid(scontext, tcontext, tclass, specified, &newcontext); if ( rc ) goto out_unlock; /* Check the validity of the context. */ if ( !policydb_context_isvalid(&policydb, &newcontext) ) { rc = compute_sid_handle_invalid_context(scontext, tcontext, tclass, &newcontext); if ( rc ) goto out_unlock; } /* Obtain the sid for the context. */ rc = sidtab_context_to_sid(&sidtab, &newcontext, out_sid); out_unlock: POLICY_RDUNLOCK; context_destroy(&newcontext); out: return rc; } /** * security_transition_sid - Compute the SID for a new subject/object. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @out_sid: security identifier for new subject/object * * Compute a SID to use for labeling a new subject or object in the * class @tclass based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the new SID was * computed successfully. */ int security_transition_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_TRANSITION, out_sid); } /** * security_member_sid - Compute the SID for member selection. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @out_sid: security identifier for selected member * * Compute a SID to use when selecting a member of a polyinstantiated * object of class @tclass based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the SID was * computed successfully. */ int security_member_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_MEMBER, out_sid); } /** * security_change_sid - Compute the SID for object relabeling. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @out_sid: security identifier for selected member * * Compute a SID to use for relabeling an object of class @tclass * based on a SID pair (@ssid, @tsid). 
* Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the SID was * computed successfully. */ int security_change_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_CHANGE, out_sid); } /* * Verify that each kernel class that is defined in the * policy is correct */ static int validate_classes(struct policydb *p) { int i; struct class_datum *cladatum; struct perm_datum *perdatum; u32 nprim, perm_val, pol_val; u16 class_val; const struct selinux_class_perm *kdefs = &selinux_class_perm; const char *def_class, *def_perm, *pol_class; struct symtab *perms; for ( i = 1; i < kdefs->cts_len; i++ ) { def_class = kdefs->class_to_string[i]; if ( !def_class ) continue; if ( i > p->p_classes.nprim ) { printk(KERN_INFO "Flask: class %s not defined in policy\n", def_class); return -EINVAL; } pol_class = p->p_class_val_to_name[i-1]; if ( strcmp(pol_class, def_class) ) { printk(KERN_ERR "Flask: class %d is incorrect, found %s but should be %s\n", i, pol_class, def_class); return -EINVAL; } } for ( i = 0; i < kdefs->av_pts_len; i++ ) { class_val = kdefs->av_perm_to_string[i].tclass; perm_val = kdefs->av_perm_to_string[i].value; def_perm = kdefs->av_perm_to_string[i].name; if ( class_val > p->p_classes.nprim ) continue; pol_class = p->p_class_val_to_name[class_val-1]; cladatum = hashtab_search(p->p_classes.table, pol_class); BUG_ON( !cladatum ); perms = &cladatum->permissions; nprim = 1 << (perms->nprim - 1); if ( perm_val > nprim ) { printk(KERN_INFO "Flask: permission %s in class %s not defined in policy\n", def_perm, pol_class); return -EINVAL; } perdatum = hashtab_search(perms->table, def_perm); if ( perdatum == NULL ) { printk(KERN_ERR "Flask: permission %s in class %s not found in policy\n", def_perm, pol_class); return -EINVAL; } pol_val = 1 << (perdatum->value - 1); if ( pol_val != perm_val ) { printk(KERN_ERR "Flask: permission %s in class %s has incorrect value\n", def_perm, pol_class); return -EINVAL; } } return 0; } /* Clone the SID into the new SID table. */ static int clone_sid(u32 sid, struct context *context, void *arg) { struct sidtab *s = arg; return sidtab_insert(s, sid, context); } static inline int convert_context_handle_invalid_context(struct context *context) { int rc = 0; if ( flask_enforcing ) rc = -EINVAL; else { char *s; u32 len; context_struct_to_string(context, &s, &len); printk(KERN_ERR "Flask: context %s is invalid\n", s); xfree(s); } return rc; } struct convert_context_args { struct policydb *oldp; struct policydb *newp; }; /* * Convert the values in the security context * structure `c' from the values specified * in the policy `p->oldp' to the values specified * in the policy `p->newp'. Verify that the * context is valid under the new policy. */ static int convert_context(u32 key, struct context *c, void *p) { struct convert_context_args *args; struct context oldc; struct role_datum *role; struct type_datum *typdatum; struct user_datum *usrdatum; char *s; u32 len; int rc; args = p; rc = context_cpy(&oldc, c); if ( rc ) goto out; rc = -EINVAL; /* Convert the user. */ usrdatum = hashtab_search(args->newp->p_users.table, args->oldp->p_user_val_to_name[c->user - 1]); if ( !usrdatum ) goto bad; c->user = usrdatum->value; /* Convert the role. */ role = hashtab_search(args->newp->p_roles.table, args->oldp->p_role_val_to_name[c->role - 1]); if ( !role ) goto bad; c->role = role->value; /* Convert the type. 
*/ typdatum = hashtab_search(args->newp->p_types.table, args->oldp->p_type_val_to_name[c->type - 1]); if ( !typdatum ) goto bad; c->type = typdatum->value; rc = mls_convert_context(args->oldp, args->newp, c); if ( rc ) goto bad; /* Check the validity of the new context. */ if ( !policydb_context_isvalid(args->newp, c) ) { rc = convert_context_handle_invalid_context(&oldc); if ( rc ) goto bad; } context_destroy(&oldc); out: return rc; bad: context_struct_to_string(&oldc, &s, &len); context_destroy(&oldc); printk(KERN_ERR "Flask: invalidating context %s\n", s); xfree(s); goto out; } static int security_preserve_bools(struct policydb *p); /** * security_load_policy - Load a security policy configuration. * @data: binary policy data * @len: length of data in bytes * * Load a new set of security policy configuration data, * validate it and convert the SID table as necessary. * This function will flush the access vector cache after * loading the new policy. */ int security_load_policy(void *data, size_t len) { struct policydb oldpolicydb, newpolicydb; struct sidtab oldsidtab, newsidtab; struct convert_context_args args; u32 seqno; int rc = 0; struct policy_file file = { data, len }, *fp = &file; LOAD_LOCK; if ( !ss_initialized ) { if ( policydb_read(&policydb, fp) ) { LOAD_UNLOCK; return -EINVAL; } if ( policydb_load_isids(&policydb, &sidtab) ) { LOAD_UNLOCK; policydb_destroy(&policydb); return -EINVAL; } if ( validate_classes(&policydb) ) { LOAD_UNLOCK; printk(KERN_ERR "Flask: the definition of a class is incorrect\n"); sidtab_destroy(&sidtab); policydb_destroy(&policydb); return -EINVAL; } policydb_loaded_version = policydb.policyvers; ss_initialized = 1; seqno = ++latest_granting; LOAD_UNLOCK; avc_ss_reset(seqno); return 0; } #if 0 sidtab_hash_eval(&sidtab, "sids"); #endif if ( policydb_read(&newpolicydb, fp) ) { LOAD_UNLOCK; return -EINVAL; } sidtab_init(&newsidtab); /* Verify that the kernel defined classes are correct. */ if ( validate_classes(&newpolicydb) ) { printk(KERN_ERR "Flask: the definition of a class is incorrect\n"); rc = -EINVAL; goto err; } rc = security_preserve_bools(&newpolicydb); if ( rc ) { printk(KERN_ERR "Flask: unable to preserve booleans\n"); goto err; } /* Clone the SID table. */ sidtab_shutdown(&sidtab); if ( sidtab_map(&sidtab, clone_sid, &newsidtab) ) { rc = -ENOMEM; goto err; } /* Convert the internal representations of contexts in the new SID table and remove invalid SIDs. */ args.oldp = &policydb; args.newp = &newpolicydb; sidtab_map_remove_on_error(&newsidtab, convert_context, &args); /* Save the old policydb and SID table to free later. */ memcpy(&oldpolicydb, &policydb, sizeof policydb); sidtab_set(&oldsidtab, &sidtab); /* Install the new policydb and SID table. */ POLICY_WRLOCK; memcpy(&policydb, &newpolicydb, sizeof policydb); sidtab_set(&sidtab, &newsidtab); seqno = ++latest_granting; policydb_loaded_version = policydb.policyvers; POLICY_WRUNLOCK; LOAD_UNLOCK; /* Free the old policydb and SID table. */ policydb_destroy(&oldpolicydb); sidtab_destroy(&oldsidtab); avc_ss_reset(seqno); return 0; err: LOAD_UNLOCK; sidtab_destroy(&newsidtab); policydb_destroy(&newpolicydb); return rc; } /** * security_irq_sid - Obtain the SID for a physical irq. 
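 * Falls back to SECINITSID_IRQ when no PIRQ ocontext entry matches @pirq.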
* @pirq: physical irq * @out_sid: security identifier */ int security_irq_sid(int pirq, u32 *out_sid) { int rc = 0; struct ocontext *c; POLICY_RDLOCK; c = policydb.ocontexts[OCON_PIRQ]; while ( c ) { if ( c->u.pirq == pirq ) break; c = c->next; } if ( c ) { if ( !c->sid[0] ) { rc = sidtab_context_to_sid(&sidtab, &c->context[0], &c->sid[0]); if ( rc ) goto out; } *out_sid = c->sid[0]; } else { *out_sid = SECINITSID_IRQ; } out: POLICY_RDUNLOCK; return rc; } /** * security_iomem_sid - Obtain the SID for a page of iomem. * @mfn: iomem mfn * @out_sid: security identifier */ int security_iomem_sid(unsigned long mfn, u32 *out_sid) { struct ocontext *c; int rc = 0; POLICY_RDLOCK; c = policydb.ocontexts[OCON_IOMEM]; while ( c ) { if ( c->u.iomem.low_iomem <= mfn && c->u.iomem.high_iomem >= mfn ) break; c = c->next; } if ( c ) { if ( !c->sid[0] ) { rc = sidtab_context_to_sid(&sidtab, &c->context[0], &c->sid[0]); if ( rc ) goto out; } *out_sid = c->sid[0]; } else { *out_sid = SECINITSID_IOMEM; } out: POLICY_RDUNLOCK; return rc; } int security_iterate_iomem_sids(unsigned long start, unsigned long end, security_iterate_fn fn, void *data) { struct ocontext *c; int rc = 0; POLICY_RDLOCK; c = policydb.ocontexts[OCON_IOMEM]; while (c && c->u.iomem.high_iomem < start) c = c->next; while (c && c->u.iomem.low_iomem <= end) { if (!c->sid[0]) { rc = sidtab_context_to_sid(&sidtab, &c->context[0], &c->sid[0]); if ( rc ) goto out; } if (start < c->u.iomem.low_iomem) { /* found a gap */ rc = fn(data, SECINITSID_IOMEM, start, c->u.iomem.low_iomem - 1); if (rc) goto out; start = c->u.iomem.low_iomem; } if (end <= c->u.iomem.high_iomem) { /* iteration ends in the middle of this range */ rc = fn(data, c->sid[0], start, end); goto out; } rc = fn(data, c->sid[0], start, c->u.iomem.high_iomem); if (rc) goto out; start = c->u.iomem.high_iomem + 1; c = c->next; } rc = fn(data, SECINITSID_IOMEM, start, end); out: POLICY_RDUNLOCK; return rc; } /** * security_ioport_sid - Obtain the SID for an ioport. * @ioport: ioport * @out_sid: security identifier */ int security_ioport_sid(u32 ioport, u32 *out_sid) { struct ocontext *c; int rc = 0; POLICY_RDLOCK; c = policydb.ocontexts[OCON_IOPORT]; while ( c ) { if ( c->u.ioport.low_ioport <= ioport && c->u.ioport.high_ioport >= ioport ) break; c = c->next; } if ( c ) { if ( !c->sid[0] ) { rc = sidtab_context_to_sid(&sidtab, &c->context[0], &c->sid[0]); if ( rc ) goto out; } *out_sid = c->sid[0]; } else { *out_sid = SECINITSID_IOPORT; } out: POLICY_RDUNLOCK; return rc; } int security_iterate_ioport_sids(u32 start, u32 end, security_iterate_fn fn, void *data) { struct ocontext *c; int rc = 0; POLICY_RDLOCK; c = policydb.ocontexts[OCON_IOPORT]; while (c && c->u.ioport.high_ioport < start) c = c->next; while (c && c->u.ioport.low_ioport <= end) { if (!c->sid[0]) { rc = sidtab_context_to_sid(&sidtab, &c->context[0], &c->sid[0]); if ( rc ) goto out; } if (start < c->u.ioport.low_ioport) { /* found a gap */ rc = fn(data, SECINITSID_IOPORT, start, c->u.ioport.low_ioport - 1); if (rc) goto out; start = c->u.ioport.low_ioport; } if (end <= c->u.ioport.high_ioport) { /* iteration ends in the middle of this range */ rc = fn(data, c->sid[0], start, end); goto out; } rc = fn(data, c->sid[0], start, c->u.ioport.high_ioport); if (rc) goto out; start = c->u.ioport.high_ioport + 1; c = c->next; } rc = fn(data, SECINITSID_IOPORT, start, end); out: POLICY_RDUNLOCK; return rc; } /** * security_device_sid - Obtain the SID for a PCI device. 
* @ioport: device * @out_sid: security identifier */ int security_device_sid(u32 device, u32 *out_sid) { struct ocontext *c; int rc = 0; POLICY_RDLOCK; c = policydb.ocontexts[OCON_DEVICE]; while ( c ) { if ( c->u.device == device ) break; c = c->next; } if ( c ) { if ( !c->sid[0] ) { rc = sidtab_context_to_sid(&sidtab, &c->context[0], &c->sid[0]); if ( rc ) goto out; } *out_sid = c->sid[0]; } else { *out_sid = SECINITSID_DEVICE; } out: POLICY_RDUNLOCK; return rc; } #define SIDS_NEL 25 /** * security_get_user_sids - Obtain reachable SIDs for a user. * @fromsid: starting SID * @username: username * @sids: array of reachable SIDs for user * @nel: number of elements in @sids * * Generate the set of SIDs for legal security contexts * for a given user that can be reached by @fromsid. * Set *@sids to point to a dynamically allocated * array containing the set of SIDs. Set *@nel to the * number of elements in the array. */ int security_get_user_sids(u32 fromsid, char *username, u32 **sids, u32 *nel) { struct context *fromcon, usercon; u32 *mysids, *mysids2, sid; u32 mynel = 0, maxnel = SIDS_NEL; struct user_datum *user; struct role_datum *role; struct av_decision avd; struct ebitmap_node *rnode, *tnode; int rc = 0, i, j; if ( !ss_initialized ) { *sids = NULL; *nel = 0; goto out; } POLICY_RDLOCK; fromcon = sidtab_search(&sidtab, fromsid); if ( !fromcon ) { rc = -EINVAL; goto out_unlock; } user = hashtab_search(policydb.p_users.table, username); if ( !user ) { rc = -EINVAL; goto out_unlock; } usercon.user = user->value; mysids = xmalloc_array(u32, maxnel); if ( !mysids ) { rc = -ENOMEM; goto out_unlock; } memset(mysids, 0, maxnel*sizeof(*mysids)); ebitmap_for_each_positive_bit(&user->roles, rnode, i) { role = policydb.role_val_to_struct[i]; usercon.role = i+1; ebitmap_for_each_positive_bit(&role->types, tnode, j) { usercon.type = j+1; if ( mls_setup_user_range(fromcon, user, &usercon) ) continue; rc = context_struct_compute_av(fromcon, &usercon, SECCLASS_DOMAIN, DOMAIN__TRANSITION, &avd); if ( rc || !(avd.allowed & DOMAIN__TRANSITION) ) continue; rc = sidtab_context_to_sid(&sidtab, &usercon, &sid); if ( rc ) { xfree(mysids); goto out_unlock; } if ( mynel < maxnel ) { mysids[mynel++] = sid; } else { maxnel += SIDS_NEL; mysids2 = xmalloc_array(u32, maxnel); if ( !mysids2 ) { rc = -ENOMEM; xfree(mysids); goto out_unlock; } memset(mysids2, 0, maxnel*sizeof(*mysids2)); memcpy(mysids2, mysids, mynel * sizeof(*mysids2)); xfree(mysids); mysids = mysids2; mysids[mynel++] = sid; } } } *sids = mysids; *nel = mynel; out_unlock: POLICY_RDUNLOCK; out: return rc; } int security_find_bool(const char *name) { int i, rv = -ENOENT; POLICY_RDLOCK; for ( i = 0; i < policydb.p_bools.nprim; i++ ) { if (!strcmp(name, policydb.p_bool_val_to_name[i])) { rv = i; break; } } POLICY_RDUNLOCK; return rv; } int security_get_bools(int *len, char ***names, int **values, size_t *maxstr) { int i, rc = -ENOMEM; POLICY_RDLOCK; if ( names ) *names = NULL; *values = NULL; if ( maxstr ) *maxstr = 0; *len = policydb.p_bools.nprim; if ( !*len ) { rc = 0; goto out; } if ( names ) { *names = (char**)xmalloc_array(char*, *len); if ( !*names ) goto err; memset(*names, 0, sizeof(char*) * *len); } *values = (int*)xmalloc_array(int, *len); if ( !*values ) goto err; for ( i = 0; i < *len; i++ ) { size_t name_len = strlen(policydb.p_bool_val_to_name[i]); (*values)[i] = policydb.bool_val_to_struct[i]->state; if ( names ) { (*names)[i] = xmalloc_array(char, name_len + 1); if ( !(*names)[i] ) goto err; strlcpy((*names)[i], 
policydb.p_bool_val_to_name[i], name_len + 1); } if ( maxstr && name_len > *maxstr ) *maxstr = name_len; } rc = 0; out: POLICY_RDUNLOCK; return rc; err: if ( names && *names ) { for ( i = 0; i < *len; i++ ) xfree((*names)[i]); xfree(*names); } xfree(*values); goto out; } int security_set_bools(int len, int *values) { int i, rc = 0; int lenp, seqno = 0; struct cond_node *cur; POLICY_WRLOCK; lenp = policydb.p_bools.nprim; if ( len != lenp ) { rc = -EFAULT; goto out; } printk(KERN_INFO "Flask: committed booleans { "); for ( i = 0; i < len; i++ ) { if ( values[i] ) { policydb.bool_val_to_struct[i]->state = 1; } else { policydb.bool_val_to_struct[i]->state = 0; } if ( i != 0 ) printk(", "); printk("%s:%d", policydb.p_bool_val_to_name[i], policydb.bool_val_to_struct[i]->state); } printk(" }\n"); for ( cur = policydb.cond_list; cur != NULL; cur = cur->next ) { rc = evaluate_cond_node(&policydb, cur); if ( rc ) goto out; } seqno = ++latest_granting; out: POLICY_WRUNLOCK; if ( !rc ) { avc_ss_reset(seqno); } return rc; } int security_get_bool_value(unsigned int bool) { int rc = 0; unsigned int len; POLICY_RDLOCK; len = policydb.p_bools.nprim; if ( bool >= len ) { rc = -ENOENT; goto out; } rc = policydb.bool_val_to_struct[bool]->state; out: POLICY_RDUNLOCK; return rc; } char *security_get_bool_name(unsigned int bool) { unsigned int len; char *rv = NULL; POLICY_RDLOCK; len = policydb.p_bools.nprim; if ( bool >= len ) { goto out; } len = strlen(policydb.p_bool_val_to_name[bool]) + 1; rv = xmalloc_array(char, len); if ( !rv ) goto out; memcpy(rv, policydb.p_bool_val_to_name[bool], len); out: POLICY_RDUNLOCK; return rv; } static int security_preserve_bools(struct policydb *p) { int rc, nbools = 0, *bvalues = NULL, i; char **bnames = NULL; struct cond_bool_datum *booldatum; struct cond_node *cur; rc = security_get_bools(&nbools, &bnames, &bvalues, NULL); if ( rc ) return rc; for ( i = 0; i < nbools; i++ ) { booldatum = hashtab_search(p->p_bools.table, bnames[i]); if ( booldatum ) booldatum->state = bvalues[i]; } for ( cur = p->cond_list; cur; cur = cur->next ) { rc = evaluate_cond_node(p, cur); if ( rc ) goto out; } out: if ( bnames ) { for ( i = 0; i < nbools; i++ ) xfree(bnames[i]); } xfree(bnames); xfree(bvalues); return rc; } int determine_ocontext( char *ocontext ) { if ( strcmp(ocontext, "pirq") == 0 ) return OCON_PIRQ; else if ( strcmp(ocontext, "ioport") == 0 ) return OCON_IOPORT; else if ( strcmp(ocontext, "iomem") == 0 ) return OCON_IOMEM; else if ( strcmp(ocontext, "pcidevice") == 0 ) return OCON_DEVICE; else return -1; } int security_ocontext_add( u32 ocon, unsigned long low, unsigned long high ,u32 sid ) { int ret = 0; struct ocontext *c; struct ocontext *prev; struct ocontext *add; if ( (add = xmalloc(struct ocontext)) == NULL ) return -ENOMEM; memset(add, 0, sizeof(struct ocontext)); add->sid[0] = sid; POLICY_WRLOCK; switch( ocon ) { case OCON_PIRQ: add->u.pirq = (u16)low; if ( high != low ) { ret = -EINVAL; break; } c = policydb.ocontexts[OCON_PIRQ]; while ( c ) { if ( c->u.pirq == add->u.pirq ) { if ( c->sid[0] == sid ) break; printk("%s: Duplicate pirq %d\n", __FUNCTION__, add->u.pirq); ret = -EEXIST; break; } c = c->next; } if ( ret == 0 ) { add->next = policydb.ocontexts[OCON_PIRQ]; policydb.ocontexts[OCON_PIRQ] = add; } break; case OCON_IOPORT: add->u.ioport.low_ioport = low; add->u.ioport.high_ioport = high; prev = NULL; c = policydb.ocontexts[OCON_IOPORT]; while ( c && c->u.ioport.high_ioport < low ) { prev = c; c = c->next; } if (c && c->u.ioport.low_ioport <= high) { if 
(c->u.ioport.low_ioport == low && c->u.ioport.high_ioport == high && c->sid[0] == sid) break; printk("%s: IO Port overlap with entry 0x%x - 0x%x\n", __FUNCTION__, c->u.ioport.low_ioport, c->u.ioport.high_ioport); ret = -EEXIST; break; } if (prev) { add->next = prev->next; prev->next = add; } else { add->next = policydb.ocontexts[OCON_IOPORT]; policydb.ocontexts[OCON_IOPORT] = add; } break; case OCON_IOMEM: add->u.iomem.low_iomem = low; add->u.iomem.high_iomem = high; prev = NULL; c = policydb.ocontexts[OCON_IOMEM]; while ( c && c->u.iomem.high_iomem < low ) { prev = c; c = c->next; } if (c && c->u.iomem.low_iomem <= high) { if (c->u.iomem.low_iomem == low && c->u.iomem.high_iomem == high && c->sid[0] == sid) break; printk("%s: IO Memory overlap with entry 0x%x - 0x%x\n", __FUNCTION__, c->u.iomem.low_iomem, c->u.iomem.high_iomem); ret = -EEXIST; break; } if (prev) { add->next = prev->next; prev->next = add; } else { add->next = policydb.ocontexts[OCON_IOMEM]; policydb.ocontexts[OCON_IOMEM] = add; } break; case OCON_DEVICE: add->u.device = low; if ( high != low ) { ret = -EINVAL; break; } c = policydb.ocontexts[OCON_DEVICE]; while ( c ) { if ( c->u.device == add->u.device ) { if ( c->sid[0] == sid ) break; printk("%s: Duplicate PCI Device 0x%x\n", __FUNCTION__, add->u.device); ret = -EEXIST; break; } c = c->next; } if ( ret == 0 ) { add->next = policydb.ocontexts[OCON_DEVICE]; policydb.ocontexts[OCON_DEVICE] = add; } break; default: ret = -EINVAL; } POLICY_WRUNLOCK; if ( ret != 0 ) xfree(add); return ret; } int security_ocontext_del( u32 ocon, unsigned int low, unsigned int high ) { int ret = 0; struct ocontext *c, *before_c; POLICY_WRLOCK; switch( ocon ) { case OCON_PIRQ: for ( before_c = NULL, c = policydb.ocontexts[OCON_PIRQ]; c; before_c = c, c = c->next ) { if ( c->u.pirq == low ) { if ( before_c == NULL ) { policydb.ocontexts[OCON_PIRQ] = c->next; xfree(c); goto out; } else { before_c->next = c->next; xfree(c); goto out; } } } printk("%s: ocontext not found: pirq %d\n", __FUNCTION__, low); ret = -ENOENT; break; case OCON_IOPORT: for ( before_c = NULL, c = policydb.ocontexts[OCON_IOPORT]; c; before_c = c, c = c->next ) { if ( c->u.ioport.low_ioport == low && c->u.ioport.high_ioport == high ) { if ( before_c == NULL ) { policydb.ocontexts[OCON_IOPORT] = c->next; xfree(c); goto out; } else { before_c->next = c->next; xfree(c); goto out; } } } printk("%s: ocontext not found: ioport 0x%x - 0x%x\n", __FUNCTION__, low, high); ret = -ENOENT; break; case OCON_IOMEM: for ( before_c = NULL, c = policydb.ocontexts[OCON_IOMEM]; c; before_c = c, c = c->next ) { if ( c->u.iomem.low_iomem == low && c->u.iomem.high_iomem == high ) { if ( before_c == NULL ) { policydb.ocontexts[OCON_IOMEM] = c->next; xfree(c); goto out; } else { before_c->next = c->next; xfree(c); goto out; } } } printk("%s: ocontext not found: iomem 0x%x - 0x%x\n", __FUNCTION__, low, high); ret = -ENOENT; break; case OCON_DEVICE: for ( before_c = NULL, c = policydb.ocontexts[OCON_DEVICE]; c; before_c = c, c = c->next ) { if ( c->u.device == low ) { if ( before_c == NULL ) { policydb.ocontexts[OCON_DEVICE] = c->next; xfree(c); goto out; } else { before_c->next = c->next; xfree(c); goto out; } } } printk("%s: ocontext not found: pcidevice 0x%x\n", __FUNCTION__, low); ret = -ENOENT; break; default: ret = -EINVAL; } out: POLICY_WRUNLOCK; return ret; } xen-4.4.0/xen/xsm/flask/ss/hashtab.h0000664000175000017500000000507512307313555015363 0ustar smbsmb/* * A hash table (hashtab) maintains associations between * key values and datum values. 
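 *
 * A minimal usage sketch (illustrative only; symhash/symcmp stand in
 * for the caller-supplied callbacks, e.g. the ones in symtab.c, and
 * key/datum are caller-owned pointers):
 *
 *   struct hashtab *h = hashtab_create(symhash, symcmp, 32);
 *   if ( h && hashtab_insert(h, key, datum) == 0 )
 *       datum = hashtab_search(h, key);
 *   hashtab_destroy(h);
 *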
The type of the key values * and the type of the datum values is arbitrary. The * functions for hash computation and key comparison are * provided by the creator of the table. * * Author : Stephen Smalley, */ #ifndef _SS_HASHTAB_H_ #define _SS_HASHTAB_H_ #define HASHTAB_MAX_NODES 0xffffffff struct hashtab_node { void *key; void *datum; struct hashtab_node *next; }; struct hashtab { struct hashtab_node **htable; /* hash table */ u32 size; /* number of slots in hash table */ u32 nel; /* number of elements in hash table */ u32 (*hash_value)(struct hashtab *h, const void *key); /* hash function */ int (*keycmp)(struct hashtab *h, const void *key1, const void *key2); /* key comparison function */ }; struct hashtab_info { u32 slots_used; u32 max_chain_len; }; /* * Creates a new hash table with the specified characteristics. * * Returns NULL if insufficent space is available or * the new hash table otherwise. */ struct hashtab *hashtab_create(u32 (*hash_value)(struct hashtab *h, const void *key), int (*keycmp)(struct hashtab *h, const void *key1, const void *key2), u32 size); /* * Inserts the specified (key, datum) pair into the specified hash table. * * Returns -ENOMEM on memory allocation error, * -EEXIST if there is already an entry with the same key, * -EINVAL for general errors or * 0 otherwise. */ int hashtab_insert(struct hashtab *h, void *k, void *d); /* * Searches for the entry with the specified key in the hash table. * * Returns NULL if no entry has the specified key or * the datum of the entry otherwise. */ void *hashtab_search(struct hashtab *h, const void *k); /* * Destroys the specified hash table. */ void hashtab_destroy(struct hashtab *h); /* * Applies the specified apply function to (key,datum,args) * for each entry in the specified hash table. * * The order in which the function is applied to the entries * is dependent upon the internal structure of the hash table. * * If apply returns a non-zero status, then hashtab_map will cease * iterating through the hash table and will propagate the error * return to its caller. */ int hashtab_map(struct hashtab *h, int (*apply)(void *k, void *d, void *args), void *args); /* Fill info with some hash table statistics */ void hashtab_stat(struct hashtab *h, struct hashtab_info *info); #endif /* _SS_HASHTAB_H */ xen-4.4.0/xen/xsm/flask/ss/symtab.c0000664000175000017500000000173712307313555015244 0ustar smbsmb/* * Implementation of the symbol table type. * * Author : Stephen Smalley, */ /* Ported to Xen 3.0, George Coker, */ #include #include #include #include #include "symtab.h" static unsigned int symhash(struct hashtab *h, const void *key) { const char *p, *keyp; unsigned int size; unsigned int val; val = 0; keyp = key; size = strlen(keyp); for ( p = keyp; (p - keyp) < size; p++ ) val = (val << 4 | (val >> (8*sizeof(unsigned int)-4))) ^ (*p); return val & (h->size - 1); } static int symcmp(struct hashtab *h, const void *key1, const void *key2) { const char *keyp1, *keyp2; keyp1 = key1; keyp2 = key2; return strcmp(keyp1, keyp2); } int symtab_init(struct symtab *s, unsigned int size) { s->table = hashtab_create(symhash, symcmp, size); if ( !s->table ) return -1; s->nprim = 0; return 0; } xen-4.4.0/xen/xsm/flask/ss/ebitmap.c0000664000175000017500000001671512307313555015370 0ustar smbsmb/* * Implementation of the extensible bitmap type. * * Author : Stephen Smalley, */ /* * Updated: KaiGai Kohei * Applied standard bit operations to improve bitmap scanning. 
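 *
 * Representation note: an ebitmap is a sorted singly-linked list of
 * nodes, each covering EBITMAP_SIZE bits starting at node->startbit;
 * e->highbit lies one past the last node's range (rounded up to a node
 * boundary), so bits at or above it are implicitly zero.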
*/ /* Ported to Xen 3.0, George Coker, */ #include #include #include #include #include #include #include "ebitmap.h" #include "policydb.h" int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2) { struct ebitmap_node *n1, *n2; if ( e1->highbit != e2->highbit ) return 0; n1 = e1->node; n2 = e2->node; while ( n1 && n2 && (n1->startbit == n2->startbit) && !memcmp(n1->maps, n2->maps, EBITMAP_SIZE / 8)) { n1 = n1->next; n2 = n2->next; } if ( n1 || n2 ) return 0; return 1; } int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src) { struct ebitmap_node *n, *new, *prev; ebitmap_init(dst); n = src->node; prev = NULL; while ( n ) { new = xmalloc(struct ebitmap_node); if ( !new ) { ebitmap_destroy(dst); return -ENOMEM; } memset(new, 0, sizeof(*new)); new->startbit = n->startbit; memcpy(new->maps, n->maps, EBITMAP_SIZE / 8); new->next = NULL; if ( prev ) prev->next = new; else dst->node = new; prev = new; n = n->next; } dst->highbit = src->highbit; return 0; } int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2) { struct ebitmap_node *n1, *n2; int i; if ( e1->highbit < e2->highbit ) return 0; n1 = e1->node; n2 = e2->node; while ( n1 && n2 && (n1->startbit <= n2->startbit) ) { if ( n1->startbit < n2->startbit ) { n1 = n1->next; continue; } for ( i = 0; i < EBITMAP_UNIT_NUMS; i++ ) { if ( (n1->maps[i] & n2->maps[i]) != n2->maps[i] ) return 0; } n1 = n1->next; n2 = n2->next; } if ( n2 ) return 0; return 1; } int ebitmap_get_bit(struct ebitmap *e, unsigned long bit) { struct ebitmap_node *n; if ( e->highbit < bit ) return 0; n = e->node; while ( n && (n->startbit <= bit) ) { if ( (n->startbit + EBITMAP_SIZE) > bit ) return ebitmap_node_get_bit(n, bit); n = n->next; } return 0; } int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value) { struct ebitmap_node *n, *prev, *new; prev = NULL; n = e->node; while ( n && n->startbit <= bit ) { if ( (n->startbit + EBITMAP_SIZE) > bit ) { if ( value ) { ebitmap_node_set_bit(n, bit); } else { unsigned int s; ebitmap_node_clr_bit(n, bit); s = find_first_bit(n->maps, EBITMAP_SIZE); if ( s < EBITMAP_SIZE ) return 0; /* drop this node from the bitmap */ if ( !n->next ) { /* * this was the highest map * within the bitmap */ if ( prev ) e->highbit = prev->startbit + EBITMAP_SIZE; else e->highbit = 0; } if ( prev ) prev->next = n->next; else e->node = n->next; xfree(n); } return 0; } prev = n; n = n->next; } if ( !value ) return 0; new = xmalloc(struct ebitmap_node); if ( !new ) return -ENOMEM; memset(new, 0, sizeof(*new)); new->startbit = bit - (bit % EBITMAP_SIZE); ebitmap_node_set_bit(new, bit); if ( !n ) /* this node will be the highest map within the bitmap */ e->highbit = new->startbit + EBITMAP_SIZE; if ( prev ) { new->next = prev->next; prev->next = new; } else { new->next = e->node; e->node = new; } return 0; } void ebitmap_destroy(struct ebitmap *e) { struct ebitmap_node *n, *temp; if ( !e ) return; n = e->node; while ( n ) { temp = n; n = n->next; xfree(temp); } e->highbit = 0; e->node = NULL; return; } int ebitmap_read(struct ebitmap *e, void *fp) { struct ebitmap_node *n = NULL; u32 mapunit, count, startbit, index; u64 map; __le32 buf[3]; int rc, i; ebitmap_init(e); rc = next_entry(buf, fp, sizeof buf); if ( rc < 0 ) goto out; mapunit = le32_to_cpu(buf[0]); e->highbit = le32_to_cpu(buf[1]); count = le32_to_cpu(buf[2]); if ( mapunit != sizeof(u64) * 8 ) { printk(KERN_ERR "Flask: ebitmap: map size %u does not " "match my size %Zd (high bit was %d)\n", mapunit, sizeof(u64) * 8, e->highbit); goto bad; } /* round up e->highbit */ e->highbit += 
EBITMAP_SIZE - 1; e->highbit -= (e->highbit % EBITMAP_SIZE); if ( !e->highbit ) { e->node = NULL; goto ok; } for ( i = 0; i < count; i++ ) { rc = next_entry(&startbit, fp, sizeof(u32)); if ( rc < 0 ) { printk(KERN_ERR "Flask: ebitmap: truncated map\n"); goto bad; } startbit = le32_to_cpu(startbit); if ( startbit & (mapunit - 1) ) { printk(KERN_ERR "Flask: ebitmap start bit (%d) is " "not a multiple of the map unit size (%u)\n", startbit, mapunit); goto bad; } if ( startbit > e->highbit - mapunit ) { printk(KERN_ERR "Flask: ebitmap start bit (%d) is " "beyond the end of the bitmap (%u)\n", startbit, (e->highbit - mapunit)); goto bad; } if ( !n || startbit >= n->startbit + EBITMAP_SIZE ) { struct ebitmap_node *tmp; tmp = xmalloc(struct ebitmap_node); if ( !tmp ) { printk(KERN_ERR "Flask: ebitmap: out of memory\n"); rc = -ENOMEM; goto bad; } memset(tmp, 0, sizeof(*tmp)); /* round down */ tmp->startbit = startbit - (startbit % EBITMAP_SIZE); if ( n ) n->next = tmp; else e->node = tmp; n = tmp; } else if ( startbit <= n->startbit ) { printk(KERN_ERR "Flask: ebitmap: start bit %d" " comes after start bit %d\n", startbit, n->startbit); goto bad; } rc = next_entry(&map, fp, sizeof(u64)); if ( rc < 0 ) { printk(KERN_ERR "Flask: ebitmap: truncated map\n"); goto bad; } map = le64_to_cpu(map); index = (startbit - n->startbit) / EBITMAP_UNIT_SIZE; while ( map ) { n->maps[index++] = map & (-1UL); map = EBITMAP_SHIFT_UNIT_SIZE(map); } } ok: rc = 0; out: return rc; bad: if ( !rc ) rc = -EINVAL; ebitmap_destroy(e); goto out; } xen-4.4.0/xen/xsm/flask/hooks.c0000664000175000017500000012361012307313555014436 0ustar smbsmb/* * This file contains the Flask hook function implementations for Xen. * * Author: George Coker, * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. 
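 *
 * Illustrative sketch (not part of the original header): every hook in this
 * file ultimately reduces to an avc_has_perm() query on a (source SID,
 * target SID, class, permission) tuple.  For example, the
 * XEN_DOMCTL_pausedomain case below boils down to roughly
 *
 *     ssid = domain_sid(current->domain);
 *     tsid = domain_target_sid(current->domain, d);
 *     rc   = avc_has_perm(ssid, tsid, SECCLASS_DOMAIN, DOMAIN__PAUSE, &ad);
 *
 * which is exactly what the domain_has_perm()/current_has_perm() helpers
 * below wrap for the individual hooks.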
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct xsm_operations *original_ops = NULL; static u32 domain_sid(struct domain *dom) { struct domain_security_struct *dsec = dom->ssid; return dsec->sid; } static u32 domain_target_sid(struct domain *src, struct domain *dst) { struct domain_security_struct *ssec = src->ssid; struct domain_security_struct *dsec = dst->ssid; if (src == dst) return ssec->self_sid; if (src->target == dst) return ssec->target_sid; return dsec->sid; } static u32 evtchn_sid(const struct evtchn *chn) { struct evtchn_security_struct *esec = chn->ssid; return esec->sid; } static int domain_has_perm(struct domain *dom1, struct domain *dom2, u16 class, u32 perms) { u32 ssid, tsid; struct avc_audit_data ad; AVC_AUDIT_DATA_INIT(&ad, NONE); ad.sdom = dom1; ad.tdom = dom2; ssid = domain_sid(dom1); tsid = domain_target_sid(dom1, dom2); return avc_has_perm(ssid, tsid, class, perms, &ad); } static int avc_current_has_perm(u32 tsid, u16 class, u32 perm, struct avc_audit_data *ad) { u32 csid = domain_sid(current->domain); return avc_has_perm(csid, tsid, class, perm, ad); } static int current_has_perm(struct domain *d, u16 class, u32 perms) { return domain_has_perm(current->domain, d, class, perms); } static int domain_has_evtchn(struct domain *d, struct evtchn *chn, u32 perms) { u32 dsid = domain_sid(d); u32 esid = evtchn_sid(chn); return avc_has_perm(dsid, esid, SECCLASS_EVENT, perms, NULL); } static int domain_has_xen(struct domain *d, u32 perms) { u32 dsid = domain_sid(d); return avc_has_perm(dsid, SECINITSID_XEN, SECCLASS_XEN, perms, NULL); } static int get_irq_sid(int irq, u32 *sid, struct avc_audit_data *ad) { struct irq_desc *desc = irq_to_desc(irq); if ( irq >= nr_irqs || irq < 0 ) return -EINVAL; if ( irq < nr_static_irqs ) { if (ad) { AVC_AUDIT_DATA_INIT(ad, IRQ); ad->irq = irq; } return security_irq_sid(irq, sid); } if ( desc->msi_desc && desc->msi_desc->dev ) { struct pci_dev *dev = desc->msi_desc->dev; u32 sbdf = (dev->seg << 16) | (dev->bus << 8) | dev->devfn; if (ad) { AVC_AUDIT_DATA_INIT(ad, DEV); ad->device = sbdf; } return security_device_sid(sbdf, sid); } if (ad) { AVC_AUDIT_DATA_INIT(ad, IRQ); ad->irq = irq; } /* HPET or IOMMU IRQ, should not be seen by domains */ *sid = SECINITSID_UNLABELED; return 0; } static int flask_domain_alloc_security(struct domain *d) { struct domain_security_struct *dsec; dsec = xmalloc(struct domain_security_struct); if ( !dsec ) return -ENOMEM; memset(dsec, 0, sizeof(struct domain_security_struct)); switch ( d->domain_id ) { case DOMID_IDLE: dsec->sid = SECINITSID_XEN; break; case DOMID_XEN: dsec->sid = SECINITSID_DOMXEN; break; case DOMID_IO: dsec->sid = SECINITSID_DOMIO; break; default: dsec->sid = SECINITSID_UNLABELED; } dsec->self_sid = dsec->sid; d->ssid = dsec; return 0; } static void flask_domain_free_security(struct domain *d) { struct domain_security_struct *dsec = d->ssid; if ( !dsec ) return; d->ssid = NULL; xfree(dsec); } static int flask_evtchn_unbound(struct domain *d1, struct evtchn *chn, domid_t id2) { u32 sid1, sid2, newsid; int rc; struct domain *d2; struct evtchn_security_struct *esec; d2 = rcu_lock_domain_by_any_id(id2); if ( d2 == NULL ) return -EPERM; sid1 = domain_sid(d1); sid2 = domain_target_sid(d1, d2); esec = chn->ssid; rc = security_transition_sid(sid1, sid2, SECCLASS_EVENT, &newsid); if ( rc ) goto out; rc = avc_current_has_perm(newsid, SECCLASS_EVENT, EVENT__CREATE, 
NULL); if ( rc ) goto out; rc = avc_has_perm(newsid, sid2, SECCLASS_EVENT, EVENT__BIND, NULL); if ( rc ) goto out; esec->sid = newsid; out: rcu_unlock_domain(d2); return rc; } static int flask_evtchn_interdomain(struct domain *d1, struct evtchn *chn1, struct domain *d2, struct evtchn *chn2) { u32 sid1, sid2, newsid, reverse_sid; int rc; struct evtchn_security_struct *esec1; struct avc_audit_data ad; AVC_AUDIT_DATA_INIT(&ad, NONE); ad.sdom = d1; ad.tdom = d2; sid1 = domain_sid(d1); sid2 = domain_target_sid(d1, d2); esec1 = chn1->ssid; rc = security_transition_sid(sid1, sid2, SECCLASS_EVENT, &newsid); if ( rc ) { printk("%s: security_transition_sid failed, rc=%d (domain=%d)\n", __FUNCTION__, -rc, d2->domain_id); return rc; } rc = avc_current_has_perm(newsid, SECCLASS_EVENT, EVENT__CREATE, &ad); if ( rc ) return rc; rc = avc_has_perm(newsid, sid2, SECCLASS_EVENT, EVENT__BIND, &ad); if ( rc ) return rc; /* It's possible the target domain has changed (relabel or destroy/create) * since the unbound part was created; re-validate this binding now. */ reverse_sid = evtchn_sid(chn2); sid1 = domain_target_sid(d2, d1); rc = avc_has_perm(reverse_sid, sid1, SECCLASS_EVENT, EVENT__BIND, &ad); if ( rc ) return rc; esec1->sid = newsid; return rc; } static void flask_evtchn_close_post(struct evtchn *chn) { struct evtchn_security_struct *esec; esec = chn->ssid; esec->sid = SECINITSID_UNLABELED; } static int flask_evtchn_send(struct domain *d, struct evtchn *chn) { int rc; switch ( chn->state ) { case ECS_INTERDOMAIN: rc = domain_has_evtchn(d, chn, EVENT__SEND); break; case ECS_IPI: case ECS_UNBOUND: rc = 0; break; default: rc = -EPERM; } return rc; } static int flask_evtchn_status(struct domain *d, struct evtchn *chn) { return domain_has_evtchn(d, chn, EVENT__STATUS); } static int flask_evtchn_reset(struct domain *d1, struct domain *d2) { return domain_has_perm(d1, d2, SECCLASS_EVENT, EVENT__RESET); } static int flask_alloc_security_evtchn(struct evtchn *chn) { struct evtchn_security_struct *esec; esec = xmalloc(struct evtchn_security_struct); if ( !esec ) return -ENOMEM; memset(esec, 0, sizeof(struct evtchn_security_struct)); esec->sid = SECINITSID_UNLABELED; chn->ssid = esec; return 0; } static void flask_free_security_evtchn(struct evtchn *chn) { struct evtchn_security_struct *esec; if ( !chn ) return; esec = chn->ssid; if ( !esec ) return; chn->ssid = NULL; xfree(esec); } static char *flask_show_security_evtchn(struct domain *d, const struct evtchn *chn) { int irq; u32 sid = 0; char *ctx; u32 ctx_len; switch ( chn->state ) { case ECS_UNBOUND: case ECS_INTERDOMAIN: sid = evtchn_sid(chn); break; case ECS_PIRQ: irq = domain_pirq_to_irq(d, chn->u.pirq.irq); if (irq && get_irq_sid(irq, &sid, NULL)) return NULL; break; } if ( !sid ) return NULL; if (security_sid_to_context(sid, &ctx, &ctx_len)) return NULL; return ctx; } static int flask_grant_mapref(struct domain *d1, struct domain *d2, uint32_t flags) { u32 perms = GRANT__MAP_READ; if ( !(flags & GNTMAP_readonly) ) perms |= GRANT__MAP_WRITE; return domain_has_perm(d1, d2, SECCLASS_GRANT, perms); } static int flask_grant_unmapref(struct domain *d1, struct domain *d2) { return domain_has_perm(d1, d2, SECCLASS_GRANT, GRANT__UNMAP); } static int flask_grant_setup(struct domain *d1, struct domain *d2) { return domain_has_perm(d1, d2, SECCLASS_GRANT, GRANT__SETUP); } static int flask_grant_transfer(struct domain *d1, struct domain *d2) { return domain_has_perm(d1, d2, SECCLASS_GRANT, GRANT__TRANSFER); } static int flask_grant_copy(struct domain *d1, struct domain 
*d2) { return domain_has_perm(d1, d2, SECCLASS_GRANT, GRANT__COPY); } static int flask_grant_query_size(struct domain *d1, struct domain *d2) { return domain_has_perm(d1, d2, SECCLASS_GRANT, GRANT__QUERY); } static int flask_get_pod_target(struct domain *d) { return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETPODTARGET); } static int flask_set_pod_target(struct domain *d) { return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETPODTARGET); } static int flask_memory_exchange(struct domain *d) { return current_has_perm(d, SECCLASS_MMU, MMU__EXCHANGE); } static int flask_memory_adjust_reservation(struct domain *d1, struct domain *d2) { return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__ADJUST); } static int flask_memory_stat_reservation(struct domain *d1, struct domain *d2) { return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__STAT); } static int flask_memory_pin_page(struct domain *d1, struct domain *d2, struct page_info *page) { return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__PINPAGE); } static int flask_claim_pages(struct domain *d) { return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__SETCLAIM); } static int flask_console_io(struct domain *d, int cmd) { u32 perm; switch ( cmd ) { case CONSOLEIO_read: perm = XEN__READCONSOLE; break; case CONSOLEIO_write: perm = XEN__WRITECONSOLE; break; default: return -EPERM; } return domain_has_xen(d, perm); } static int flask_profile(struct domain *d, int op) { u32 perm; switch ( op ) { case XENOPROF_init: case XENOPROF_enable_virq: case XENOPROF_disable_virq: case XENOPROF_get_buffer: perm = XEN__NONPRIVPROFILE; break; case XENOPROF_reset_active_list: case XENOPROF_reset_passive_list: case XENOPROF_set_active: case XENOPROF_set_passive: case XENOPROF_reserve_counters: case XENOPROF_counter: case XENOPROF_setup_events: case XENOPROF_start: case XENOPROF_stop: case XENOPROF_release_counters: case XENOPROF_shutdown: perm = XEN__PRIVPROFILE; break; default: return -EPERM; } return domain_has_xen(d, perm); } static int flask_kexec(void) { return domain_has_xen(current->domain, XEN__KEXEC); } static int flask_schedop_shutdown(struct domain *d1, struct domain *d2) { return domain_has_perm(d1, d2, SECCLASS_DOMAIN, DOMAIN__SHUTDOWN); } static void flask_security_domaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info) { info->ssidref = domain_sid(d); } static int flask_domain_create(struct domain *d, u32 ssidref) { int rc; struct domain_security_struct *dsec = d->ssid; static int dom0_created = 0; if ( is_idle_domain(current->domain) && !dom0_created ) { dsec->sid = SECINITSID_DOM0; dom0_created = 1; } else { rc = avc_current_has_perm(ssidref, SECCLASS_DOMAIN, DOMAIN__CREATE, NULL); if ( rc ) return rc; dsec->sid = ssidref; } dsec->self_sid = dsec->sid; rc = security_transition_sid(dsec->sid, dsec->sid, SECCLASS_DOMAIN, &dsec->self_sid); return rc; } static int flask_getdomaininfo(struct domain *d) { return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETDOMAININFO); } static int flask_domctl_scheduler_op(struct domain *d, int op) { switch ( op ) { case XEN_DOMCTL_SCHEDOP_putinfo: return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__SETSCHEDULER); case XEN_DOMCTL_SCHEDOP_getinfo: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETSCHEDULER); default: printk("flask_domctl_scheduler_op: Unknown op %d\n", op); return -EPERM; } } static int flask_sysctl_scheduler_op(int op) { switch ( op ) { case XEN_DOMCTL_SCHEDOP_putinfo: return domain_has_xen(current->domain, XEN__SETSCHEDULER); case XEN_DOMCTL_SCHEDOP_getinfo: return 
domain_has_xen(current->domain, XEN__GETSCHEDULER); default: printk("flask_domctl_scheduler_op: Unknown op %d\n", op); return -EPERM; } } static int flask_set_target(struct domain *d, struct domain *t) { int rc; struct domain_security_struct *dsec, *tsec; dsec = d->ssid; tsec = t->ssid; rc = current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__MAKE_PRIV_FOR); if ( rc ) return rc; rc = current_has_perm(t, SECCLASS_DOMAIN2, DOMAIN2__SET_AS_TARGET); if ( rc ) return rc; /* Use avc_has_perm to avoid resolving target/current SID */ rc = avc_has_perm(dsec->sid, tsec->sid, SECCLASS_DOMAIN, DOMAIN__SET_TARGET, NULL); if ( rc ) return rc; /* (tsec, dsec) defaults the label to tsec, as it should here */ rc = security_transition_sid(tsec->sid, dsec->sid, SECCLASS_DOMAIN, &dsec->target_sid); return rc; } static int flask_domctl(struct domain *d, int cmd) { switch ( cmd ) { /* These have individual XSM hooks (common/domctl.c) */ case XEN_DOMCTL_createdomain: case XEN_DOMCTL_getdomaininfo: case XEN_DOMCTL_scheduler_op: case XEN_DOMCTL_irq_permission: case XEN_DOMCTL_iomem_permission: case XEN_DOMCTL_set_target: #ifdef CONFIG_X86 /* These have individual XSM hooks (arch/x86/domctl.c) */ case XEN_DOMCTL_shadow_op: case XEN_DOMCTL_ioport_permission: case XEN_DOMCTL_bind_pt_irq: case XEN_DOMCTL_unbind_pt_irq: case XEN_DOMCTL_memory_mapping: case XEN_DOMCTL_ioport_mapping: case XEN_DOMCTL_mem_event_op: /* These have individual XSM hooks (drivers/passthrough/iommu.c) */ case XEN_DOMCTL_get_device_group: case XEN_DOMCTL_test_assign_device: case XEN_DOMCTL_assign_device: case XEN_DOMCTL_deassign_device: #endif return 0; case XEN_DOMCTL_destroydomain: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__DESTROY); case XEN_DOMCTL_pausedomain: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__PAUSE); case XEN_DOMCTL_unpausedomain: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__UNPAUSE); case XEN_DOMCTL_setvcpuaffinity: case XEN_DOMCTL_setnodeaffinity: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETAFFINITY); case XEN_DOMCTL_getvcpuaffinity: case XEN_DOMCTL_getnodeaffinity: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETAFFINITY); case XEN_DOMCTL_resumedomain: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__RESUME); case XEN_DOMCTL_max_vcpus: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__MAX_VCPUS); case XEN_DOMCTL_max_mem: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETDOMAINMAXMEM); case XEN_DOMCTL_setdomainhandle: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETDOMAINHANDLE); case XEN_DOMCTL_setvcpucontext: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETVCPUCONTEXT); case XEN_DOMCTL_getvcpucontext: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETVCPUCONTEXT); case XEN_DOMCTL_getvcpuinfo: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETVCPUINFO); case XEN_DOMCTL_settimeoffset: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETTIME); case XEN_DOMCTL_setdebugging: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETDEBUGGING); case XEN_DOMCTL_getpageframeinfo: case XEN_DOMCTL_getpageframeinfo2: case XEN_DOMCTL_getpageframeinfo3: return current_has_perm(d, SECCLASS_MMU, MMU__PAGEINFO); case XEN_DOMCTL_getmemlist: return current_has_perm(d, SECCLASS_MMU, MMU__PAGELIST); case XEN_DOMCTL_hypercall_init: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__HYPERCALL); case XEN_DOMCTL_sethvmcontext: return current_has_perm(d, SECCLASS_HVM, HVM__SETHVMC); case XEN_DOMCTL_gethvmcontext: case XEN_DOMCTL_gethvmcontext_partial: return current_has_perm(d, 
SECCLASS_HVM, HVM__GETHVMC); case XEN_DOMCTL_set_address_size: case XEN_DOMCTL_set_machine_address_size: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETADDRSIZE); case XEN_DOMCTL_get_address_size: case XEN_DOMCTL_get_machine_address_size: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETADDRSIZE); case XEN_DOMCTL_mem_sharing_op: return current_has_perm(d, SECCLASS_HVM, HVM__MEM_SHARING); case XEN_DOMCTL_pin_mem_cacheattr: return current_has_perm(d, SECCLASS_HVM, HVM__CACHEATTR); case XEN_DOMCTL_set_ext_vcpucontext: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETEXTVCPUCONTEXT); case XEN_DOMCTL_get_ext_vcpucontext: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETEXTVCPUCONTEXT); case XEN_DOMCTL_setvcpuextstate: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETVCPUEXTSTATE); case XEN_DOMCTL_getvcpuextstate: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETVCPUEXTSTATE); case XEN_DOMCTL_sendtrigger: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__TRIGGER); case XEN_DOMCTL_set_access_required: return current_has_perm(d, SECCLASS_HVM, HVM__MEM_EVENT); case XEN_DOMCTL_debug_op: case XEN_DOMCTL_gdbsx_guestmemio: case XEN_DOMCTL_gdbsx_pausevcpu: case XEN_DOMCTL_gdbsx_unpausevcpu: case XEN_DOMCTL_gdbsx_domstatus: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETDEBUGGING); case XEN_DOMCTL_subscribe: case XEN_DOMCTL_disable_migrate: case XEN_DOMCTL_suppress_spurious_page_faults: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SET_MISC_INFO); case XEN_DOMCTL_set_virq_handler: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SET_VIRQ_HANDLER); case XEN_DOMCTL_set_cpuid: return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__SET_CPUID); case XEN_DOMCTL_gettscinfo: return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__GETTSC); case XEN_DOMCTL_settscinfo: return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__SETTSC); case XEN_DOMCTL_audit_p2m: return current_has_perm(d, SECCLASS_HVM, HVM__AUDIT_P2M); case XEN_DOMCTL_set_max_evtchn: return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__SET_MAX_EVTCHN); case XEN_DOMCTL_cacheflush: return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__CACHEFLUSH); default: printk("flask_domctl: Unknown op %d\n", cmd); return -EPERM; } } static int flask_sysctl(int cmd) { switch ( cmd ) { /* These have individual XSM hooks */ case XEN_SYSCTL_readconsole: case XEN_SYSCTL_getdomaininfolist: case XEN_SYSCTL_page_offline_op: case XEN_SYSCTL_scheduler_op: #ifdef CONFIG_X86 case XEN_SYSCTL_cpu_hotplug: #endif return 0; case XEN_SYSCTL_tbuf_op: return domain_has_xen(current->domain, XEN__TBUFCONTROL); case XEN_SYSCTL_sched_id: return domain_has_xen(current->domain, XEN__GETSCHEDULER); case XEN_SYSCTL_perfc_op: return domain_has_xen(current->domain, XEN__PERFCONTROL); case XEN_SYSCTL_debug_keys: return domain_has_xen(current->domain, XEN__DEBUG); case XEN_SYSCTL_getcpuinfo: return domain_has_xen(current->domain, XEN__GETCPUINFO); case XEN_SYSCTL_availheap: return domain_has_xen(current->domain, XEN__HEAP); case XEN_SYSCTL_get_pmstat: return domain_has_xen(current->domain, XEN__PM_OP); case XEN_SYSCTL_pm_op: return domain_has_xen(current->domain, XEN__PM_OP); case XEN_SYSCTL_lockprof_op: return domain_has_xen(current->domain, XEN__LOCKPROF); case XEN_SYSCTL_cpupool_op: return domain_has_xen(current->domain, XEN__CPUPOOL_OP); case XEN_SYSCTL_physinfo: case XEN_SYSCTL_topologyinfo: case XEN_SYSCTL_numainfo: return domain_has_xen(current->domain, XEN__PHYSINFO); default: printk("flask_sysctl: Unknown op %d\n", cmd); return -EPERM; 
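/*
 * Illustrative note (not in the original file): wiring a new subop into
 * flask_domctl() or flask_sysctl() follows the pattern of the cases above --
 * pick a class/permission pair from policy/access_vectors and delegate to
 * current_has_perm() or domain_has_xen().  The subop and permission names
 * below are hypothetical:
 *
 *     case XEN_DOMCTL_examplenewop:
 *         return current_has_perm(d, SECCLASS_DOMAIN2, DOMAIN2__EXAMPLE);
 */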
} } static int flask_readconsole(uint32_t clear) { u32 perms = XEN__READCONSOLE; if ( clear ) perms |= XEN__CLEARCONSOLE; return domain_has_xen(current->domain, perms); } static int flask_do_mca(void) { return domain_has_xen(current->domain, XEN__MCA_OP); } static inline u32 resource_to_perm(uint8_t access) { if ( access ) return RESOURCE__ADD; else return RESOURCE__REMOVE; } static char *flask_show_irq_sid (int irq) { u32 sid, ctx_len; char *ctx; int rc = get_irq_sid(irq, &sid, NULL); if ( rc ) return NULL; if (security_sid_to_context(sid, &ctx, &ctx_len)) return NULL; return ctx; } static int flask_map_domain_pirq (struct domain *d) { return current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__ADD); } static int flask_map_domain_irq (struct domain *d, int irq, void *data) { u32 sid, dsid; int rc = -EPERM; struct msi_info *msi = data; struct avc_audit_data ad; if ( irq >= nr_static_irqs && msi ) { u32 machine_bdf = (msi->seg << 16) | (msi->bus << 8) | msi->devfn; AVC_AUDIT_DATA_INIT(&ad, DEV); ad.device = machine_bdf; rc = security_device_sid(machine_bdf, &sid); } else { rc = get_irq_sid(irq, &sid, &ad); } if ( rc ) return rc; dsid = domain_sid(d); rc = avc_current_has_perm(sid, SECCLASS_RESOURCE, RESOURCE__ADD_IRQ, &ad); if ( rc ) return rc; rc = avc_has_perm(dsid, sid, SECCLASS_RESOURCE, RESOURCE__USE, &ad); return rc; } static int flask_unmap_domain_pirq (struct domain *d) { return current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__REMOVE); } static int flask_unmap_domain_irq (struct domain *d, int irq, void *data) { u32 sid; int rc = -EPERM; struct msi_info *msi = data; struct avc_audit_data ad; if ( irq >= nr_static_irqs && msi ) { u32 machine_bdf = (msi->seg << 16) | (msi->bus << 8) | msi->devfn; AVC_AUDIT_DATA_INIT(&ad, DEV); ad.device = machine_bdf; rc = security_device_sid(machine_bdf, &sid); } else { rc = get_irq_sid(irq, &sid, &ad); } if ( rc ) return rc; rc = avc_current_has_perm(sid, SECCLASS_RESOURCE, RESOURCE__REMOVE_IRQ, &ad); return rc; } static int flask_irq_permission (struct domain *d, int pirq, uint8_t access) { /* the PIRQ number is not useful; real IRQ is checked during mapping */ return current_has_perm(d, SECCLASS_RESOURCE, resource_to_perm(access)); } struct iomem_has_perm_data { u32 ssid; u32 dsid; u32 perm; }; static int _iomem_has_perm(void *v, u32 sid, unsigned long start, unsigned long end) { struct iomem_has_perm_data *data = v; struct avc_audit_data ad; int rc = -EPERM; AVC_AUDIT_DATA_INIT(&ad, RANGE); ad.range.start = start; ad.range.end = end; rc = avc_has_perm(data->ssid, sid, SECCLASS_RESOURCE, data->perm, &ad); if ( rc ) return rc; return avc_has_perm(data->dsid, sid, SECCLASS_RESOURCE, RESOURCE__USE, &ad); } static int flask_iomem_permission(struct domain *d, uint64_t start, uint64_t end, uint8_t access) { struct iomem_has_perm_data data; int rc; rc = current_has_perm(d, SECCLASS_RESOURCE, resource_to_perm(access)); if ( rc ) return rc; if ( access ) data.perm = RESOURCE__ADD_IOMEM; else data.perm = RESOURCE__REMOVE_IOMEM; data.ssid = domain_sid(current->domain); data.dsid = domain_sid(d); return security_iterate_iomem_sids(start, end, _iomem_has_perm, &data); } static int flask_iomem_mapping(struct domain *d, uint64_t start, uint64_t end, uint8_t access) { return flask_iomem_permission(d, start, end, access); } static int flask_pci_config_permission(struct domain *d, uint32_t machine_bdf, uint16_t start, uint16_t end, uint8_t access) { u32 dsid, rsid; int rc = -EPERM; struct avc_audit_data ad; u32 perm = RESOURCE__USE; rc = 
security_device_sid(machine_bdf, &rsid); if ( rc ) return rc; /* Writes to the BARs count as setup */ if ( access && (end >= 0x10 && start < 0x28) ) perm = RESOURCE__SETUP; AVC_AUDIT_DATA_INIT(&ad, DEV); ad.device = (unsigned long) machine_bdf; dsid = domain_sid(d); return avc_has_perm(dsid, rsid, SECCLASS_RESOURCE, perm, &ad); } static int flask_resource_plug_core(void) { return avc_current_has_perm(SECINITSID_DOMXEN, SECCLASS_RESOURCE, RESOURCE__PLUG, NULL); } static int flask_resource_unplug_core(void) { return avc_current_has_perm(SECINITSID_DOMXEN, SECCLASS_RESOURCE, RESOURCE__UNPLUG, NULL); } static int flask_resource_use_core(void) { return avc_current_has_perm(SECINITSID_DOMXEN, SECCLASS_RESOURCE, RESOURCE__USE, NULL); } static int flask_resource_plug_pci(uint32_t machine_bdf) { u32 rsid; int rc = -EPERM; struct avc_audit_data ad; rc = security_device_sid(machine_bdf, &rsid); if ( rc ) return rc; AVC_AUDIT_DATA_INIT(&ad, DEV); ad.device = (unsigned long) machine_bdf; return avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__PLUG, &ad); } static int flask_resource_unplug_pci(uint32_t machine_bdf) { u32 rsid; int rc = -EPERM; struct avc_audit_data ad; rc = security_device_sid(machine_bdf, &rsid); if ( rc ) return rc; AVC_AUDIT_DATA_INIT(&ad, DEV); ad.device = (unsigned long) machine_bdf; return avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__UNPLUG, &ad); } static int flask_resource_setup_pci(uint32_t machine_bdf) { u32 rsid; int rc = -EPERM; struct avc_audit_data ad; rc = security_device_sid(machine_bdf, &rsid); if ( rc ) return rc; AVC_AUDIT_DATA_INIT(&ad, DEV); ad.device = (unsigned long) machine_bdf; return avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__SETUP, &ad); } static int flask_resource_setup_gsi(int gsi) { u32 rsid; int rc = -EPERM; struct avc_audit_data ad; rc = get_irq_sid(gsi, &rsid, &ad); if ( rc ) return rc; return avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__SETUP, &ad); } static int flask_resource_setup_misc(void) { return avc_current_has_perm(SECINITSID_XEN, SECCLASS_RESOURCE, RESOURCE__SETUP, NULL); } static inline int flask_page_offline(uint32_t cmd) { switch (cmd) { case sysctl_page_offline: return flask_resource_unplug_core(); case sysctl_page_online: return flask_resource_plug_core(); case sysctl_query_page_offline: return flask_resource_use_core(); default: return -EPERM; } } static inline int flask_tmem_op(void) { return domain_has_xen(current->domain, XEN__TMEM_OP); } static inline int flask_tmem_control(void) { return domain_has_xen(current->domain, XEN__TMEM_CONTROL); } static int flask_add_to_physmap(struct domain *d1, struct domain *d2) { return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__PHYSMAP); } static int flask_remove_from_physmap(struct domain *d1, struct domain *d2) { return domain_has_perm(d1, d2, SECCLASS_MMU, MMU__PHYSMAP); } static int flask_hvm_param(struct domain *d, unsigned long op) { u32 perm; switch ( op ) { case HVMOP_set_param: perm = HVM__SETPARAM; break; case HVMOP_get_param: perm = HVM__GETPARAM; break; case HVMOP_track_dirty_vram: perm = HVM__TRACKDIRTYVRAM; break; default: perm = HVM__HVMCTL; } return current_has_perm(d, SECCLASS_HVM, perm); } static int flask_hvm_param_nested(struct domain *d) { return current_has_perm(d, SECCLASS_HVM, HVM__NESTED); } #ifdef CONFIG_X86 static int flask_shadow_control(struct domain *d, uint32_t op) { u32 perm; switch ( op ) { case XEN_DOMCTL_SHADOW_OP_OFF: perm = SHADOW__DISABLE; break; case XEN_DOMCTL_SHADOW_OP_ENABLE: case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST: case 
XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE: case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: perm = SHADOW__ENABLE; break; case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY: case XEN_DOMCTL_SHADOW_OP_PEEK: case XEN_DOMCTL_SHADOW_OP_CLEAN: perm = SHADOW__LOGDIRTY; break; default: return -EPERM; } return current_has_perm(d, SECCLASS_SHADOW, perm); } struct ioport_has_perm_data { u32 ssid; u32 dsid; u32 perm; }; static int _ioport_has_perm(void *v, u32 sid, unsigned long start, unsigned long end) { struct ioport_has_perm_data *data = v; struct avc_audit_data ad; int rc; AVC_AUDIT_DATA_INIT(&ad, RANGE); ad.range.start = start; ad.range.end = end; rc = avc_has_perm(data->ssid, sid, SECCLASS_RESOURCE, data->perm, &ad); if ( rc ) return rc; return avc_has_perm(data->dsid, sid, SECCLASS_RESOURCE, RESOURCE__USE, &ad); } static int flask_ioport_permission(struct domain *d, uint32_t start, uint32_t end, uint8_t access) { int rc; struct ioport_has_perm_data data; rc = current_has_perm(d, SECCLASS_RESOURCE, resource_to_perm(access)); if ( rc ) return rc; if ( access ) data.perm = RESOURCE__ADD_IOPORT; else data.perm = RESOURCE__REMOVE_IOPORT; data.ssid = domain_sid(current->domain); data.dsid = domain_sid(d); return security_iterate_ioport_sids(start, end, _ioport_has_perm, &data); } static int flask_ioport_mapping(struct domain *d, uint32_t start, uint32_t end, uint8_t access) { return flask_ioport_permission(d, start, end, access); } static int flask_hvm_set_pci_intx_level(struct domain *d) { return current_has_perm(d, SECCLASS_HVM, HVM__PCILEVEL); } static int flask_hvm_set_isa_irq_level(struct domain *d) { return current_has_perm(d, SECCLASS_HVM, HVM__IRQLEVEL); } static int flask_hvm_set_pci_link_route(struct domain *d) { return current_has_perm(d, SECCLASS_HVM, HVM__PCIROUTE); } static int flask_hvm_inject_msi(struct domain *d) { return current_has_perm(d, SECCLASS_HVM, HVM__SEND_IRQ); } static int flask_mem_event_control(struct domain *d, int mode, int op) { return current_has_perm(d, SECCLASS_HVM, HVM__MEM_EVENT); } static int flask_mem_event_op(struct domain *d, int op) { return current_has_perm(d, SECCLASS_HVM, HVM__MEM_EVENT); } static int flask_mem_sharing_op(struct domain *d, struct domain *cd, int op) { int rc = current_has_perm(cd, SECCLASS_HVM, HVM__MEM_SHARING); if ( rc ) return rc; return domain_has_perm(d, cd, SECCLASS_HVM, HVM__SHARE_MEM); } static int flask_apic(struct domain *d, int cmd) { u32 perm; switch ( cmd ) { case PHYSDEVOP_apic_read: case PHYSDEVOP_alloc_irq_vector: perm = XEN__READAPIC; break; case PHYSDEVOP_apic_write: perm = XEN__WRITEAPIC; break; default: return -EPERM; } return domain_has_xen(d, perm); } static int flask_platform_op(uint32_t op) { switch ( op ) { #ifdef CONFIG_X86 /* These operations have their own XSM hooks */ case XENPF_cpu_online: case XENPF_cpu_offline: case XENPF_cpu_hotadd: case XENPF_mem_hotadd: return 0; #endif case XENPF_settime: return domain_has_xen(current->domain, XEN__SETTIME); case XENPF_add_memtype: return domain_has_xen(current->domain, XEN__MTRR_ADD); case XENPF_del_memtype: return domain_has_xen(current->domain, XEN__MTRR_DEL); case XENPF_read_memtype: return domain_has_xen(current->domain, XEN__MTRR_READ); case XENPF_microcode_update: return domain_has_xen(current->domain, XEN__MICROCODE); case XENPF_platform_quirk: return domain_has_xen(current->domain, XEN__QUIRK); case XENPF_firmware_info: return domain_has_xen(current->domain, XEN__FIRMWARE); case XENPF_efi_runtime_call: return 
domain_has_xen(current->domain, XEN__FIRMWARE); case XENPF_enter_acpi_sleep: return domain_has_xen(current->domain, XEN__SLEEP); case XENPF_change_freq: return domain_has_xen(current->domain, XEN__FREQUENCY); case XENPF_getidletime: return domain_has_xen(current->domain, XEN__GETIDLE); case XENPF_set_processor_pminfo: case XENPF_core_parking: return domain_has_xen(current->domain, XEN__PM_OP); case XENPF_get_cpu_version: case XENPF_get_cpuinfo: return domain_has_xen(current->domain, XEN__GETCPUINFO); default: printk("flask_platform_op: Unknown op %d\n", op); return -EPERM; } } static int flask_machine_memory_map(void) { return avc_current_has_perm(SECINITSID_XEN, SECCLASS_MMU, MMU__MEMORYMAP, NULL); } static int flask_domain_memory_map(struct domain *d) { return current_has_perm(d, SECCLASS_MMU, MMU__MEMORYMAP); } static int flask_mmu_update(struct domain *d, struct domain *t, struct domain *f, uint32_t flags) { int rc = 0; u32 map_perms = 0; if ( t && d != t ) rc = domain_has_perm(d, t, SECCLASS_MMU, MMU__REMOTE_REMAP); if ( rc ) return rc; if ( flags & XSM_MMU_UPDATE_READ ) map_perms |= MMU__MAP_READ; if ( flags & XSM_MMU_UPDATE_WRITE ) map_perms |= MMU__MAP_WRITE; if ( flags & XSM_MMU_MACHPHYS_UPDATE ) map_perms |= MMU__UPDATEMP; if ( map_perms ) rc = domain_has_perm(d, f, SECCLASS_MMU, map_perms); return rc; } static int flask_mmuext_op(struct domain *d, struct domain *f) { return domain_has_perm(d, f, SECCLASS_MMU, MMU__MMUEXT_OP); } static int flask_update_va_mapping(struct domain *d, struct domain *f, l1_pgentry_t pte) { u32 map_perms = MMU__MAP_READ; if ( !(l1e_get_flags(pte) & _PAGE_PRESENT) ) return 0; if ( l1e_get_flags(pte) & _PAGE_RW ) map_perms |= MMU__MAP_WRITE; return domain_has_perm(d, f, SECCLASS_MMU, map_perms); } static int flask_priv_mapping(struct domain *d, struct domain *t) { return domain_has_perm(d, t, SECCLASS_MMU, MMU__TARGET_HACK); } static int flask_get_device_group(uint32_t machine_bdf) { u32 rsid; int rc = -EPERM; rc = security_device_sid(machine_bdf, &rsid); if ( rc ) return rc; return avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__STAT_DEVICE, NULL); } static int flask_test_assign_device(uint32_t machine_bdf) { u32 rsid; int rc = -EPERM; rc = security_device_sid(machine_bdf, &rsid); if ( rc ) return rc; return avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__STAT_DEVICE, NULL); } static int flask_assign_device(struct domain *d, uint32_t machine_bdf) { u32 dsid, rsid; int rc = -EPERM; struct avc_audit_data ad; rc = current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__ADD); if ( rc ) return rc; rc = security_device_sid(machine_bdf, &rsid); if ( rc ) return rc; AVC_AUDIT_DATA_INIT(&ad, DEV); ad.device = (unsigned long) machine_bdf; rc = avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__ADD_DEVICE, &ad); if ( rc ) return rc; dsid = domain_sid(d); return avc_has_perm(dsid, rsid, SECCLASS_RESOURCE, RESOURCE__USE, &ad); } static int flask_deassign_device(struct domain *d, uint32_t machine_bdf) { u32 rsid; int rc = -EPERM; rc = current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__REMOVE); if ( rc ) return rc; rc = security_device_sid(machine_bdf, &rsid); if ( rc ) return rc; return avc_current_has_perm(rsid, SECCLASS_RESOURCE, RESOURCE__REMOVE_DEVICE, NULL); } static int flask_bind_pt_irq (struct domain *d, struct xen_domctl_bind_pt_irq *bind) { u32 dsid, rsid; int rc = -EPERM; int irq; struct avc_audit_data ad; rc = current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__ADD); if ( rc ) return rc; irq = domain_pirq_to_irq(d, bind->machine_irq); rc = 
get_irq_sid(irq, &rsid, &ad); if ( rc ) return rc; rc = avc_current_has_perm(rsid, SECCLASS_HVM, HVM__BIND_IRQ, &ad); if ( rc ) return rc; dsid = domain_sid(d); return avc_has_perm(dsid, rsid, SECCLASS_RESOURCE, RESOURCE__USE, &ad); } static int flask_unbind_pt_irq (struct domain *d, struct xen_domctl_bind_pt_irq *bind) { return current_has_perm(d, SECCLASS_RESOURCE, RESOURCE__REMOVE); } #endif /* CONFIG_X86 */ #ifdef CONFIG_ARM static int flask_map_gmfn_foreign(struct domain *d, struct domain *t) { return domain_has_perm(d, t, SECCLASS_MMU, MMU__MAP_READ | MMU__MAP_WRITE); } #endif long do_flask_op(XEN_GUEST_HANDLE_PARAM(xsm_op_t) u_flask_op); static struct xsm_operations flask_ops = { .security_domaininfo = flask_security_domaininfo, .domain_create = flask_domain_create, .getdomaininfo = flask_getdomaininfo, .domctl_scheduler_op = flask_domctl_scheduler_op, .sysctl_scheduler_op = flask_sysctl_scheduler_op, .set_target = flask_set_target, .domctl = flask_domctl, .sysctl = flask_sysctl, .readconsole = flask_readconsole, .do_mca = flask_do_mca, .evtchn_unbound = flask_evtchn_unbound, .evtchn_interdomain = flask_evtchn_interdomain, .evtchn_close_post = flask_evtchn_close_post, .evtchn_send = flask_evtchn_send, .evtchn_status = flask_evtchn_status, .evtchn_reset = flask_evtchn_reset, .grant_mapref = flask_grant_mapref, .grant_unmapref = flask_grant_unmapref, .grant_setup = flask_grant_setup, .grant_transfer = flask_grant_transfer, .grant_copy = flask_grant_copy, .grant_query_size = flask_grant_query_size, .alloc_security_domain = flask_domain_alloc_security, .free_security_domain = flask_domain_free_security, .alloc_security_evtchn = flask_alloc_security_evtchn, .free_security_evtchn = flask_free_security_evtchn, .show_security_evtchn = flask_show_security_evtchn, .get_pod_target = flask_get_pod_target, .set_pod_target = flask_set_pod_target, .memory_exchange = flask_memory_exchange, .memory_adjust_reservation = flask_memory_adjust_reservation, .memory_stat_reservation = flask_memory_stat_reservation, .memory_pin_page = flask_memory_pin_page, .claim_pages = flask_claim_pages, .console_io = flask_console_io, .profile = flask_profile, .kexec = flask_kexec, .schedop_shutdown = flask_schedop_shutdown, .show_irq_sid = flask_show_irq_sid, .map_domain_pirq = flask_map_domain_pirq, .map_domain_irq = flask_map_domain_irq, .unmap_domain_pirq = flask_unmap_domain_pirq, .unmap_domain_irq = flask_unmap_domain_irq, .irq_permission = flask_irq_permission, .iomem_permission = flask_iomem_permission, .iomem_mapping = flask_iomem_mapping, .pci_config_permission = flask_pci_config_permission, .resource_plug_core = flask_resource_plug_core, .resource_unplug_core = flask_resource_unplug_core, .resource_plug_pci = flask_resource_plug_pci, .resource_unplug_pci = flask_resource_unplug_pci, .resource_setup_pci = flask_resource_setup_pci, .resource_setup_gsi = flask_resource_setup_gsi, .resource_setup_misc = flask_resource_setup_misc, .page_offline = flask_page_offline, .tmem_op = flask_tmem_op, .tmem_control = flask_tmem_control, .hvm_param = flask_hvm_param, .hvm_param_nested = flask_hvm_param_nested, .do_xsm_op = do_flask_op, .add_to_physmap = flask_add_to_physmap, .remove_from_physmap = flask_remove_from_physmap, #ifdef CONFIG_X86 .shadow_control = flask_shadow_control, .hvm_set_pci_intx_level = flask_hvm_set_pci_intx_level, .hvm_set_isa_irq_level = flask_hvm_set_isa_irq_level, .hvm_set_pci_link_route = flask_hvm_set_pci_link_route, .hvm_inject_msi = flask_hvm_inject_msi, .mem_event_control = 
flask_mem_event_control, .mem_event_op = flask_mem_event_op, .mem_sharing_op = flask_mem_sharing_op, .apic = flask_apic, .platform_op = flask_platform_op, .machine_memory_map = flask_machine_memory_map, .domain_memory_map = flask_domain_memory_map, .mmu_update = flask_mmu_update, .mmuext_op = flask_mmuext_op, .update_va_mapping = flask_update_va_mapping, .priv_mapping = flask_priv_mapping, .get_device_group = flask_get_device_group, .test_assign_device = flask_test_assign_device, .assign_device = flask_assign_device, .deassign_device = flask_deassign_device, .bind_pt_irq = flask_bind_pt_irq, .unbind_pt_irq = flask_unbind_pt_irq, .ioport_permission = flask_ioport_permission, .ioport_mapping = flask_ioport_mapping, #endif #ifdef CONFIG_ARM .map_gmfn_foreign = flask_map_gmfn_foreign, #endif }; static __init int flask_init(void) { int ret = 0; if ( !flask_enabled ) { printk("Flask: Disabled at boot.\n"); return 0; } printk("Flask: Initializing.\n"); avc_init(); original_ops = xsm_ops; if ( register_xsm(&flask_ops) ) panic("Flask: Unable to register with XSM"); ret = security_load_policy(policy_buffer, policy_size); if ( flask_enforcing ) printk("Flask: Starting in enforcing mode.\n"); else printk("Flask: Starting in permissive mode.\n"); return ret; } xsm_initcall(flask_init); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/xsm/flask/policy/0000775000175000017500000000000012307313555014443 5ustar smbsmbxen-4.4.0/xen/xsm/flask/policy/initial_sids0000664000175000017500000000024512307313555017042 0ustar smbsmb# FLASK # # Define initial security identifiers # sid xen sid dom0 sid domio sid domxen sid unlabeled sid security sid ioport sid iomem sid irq sid device # FLASK xen-4.4.0/xen/xsm/flask/policy/security_classes0000664000175000017500000000040412307313555017750 0ustar smbsmb# FLASK # # Define the security object classes # # Classes marked as userspace are classes # for userspace object managers class xen class domain class domain2 class hvm class mmu class resource class shadow class event class grant class security # FLASK xen-4.4.0/xen/xsm/flask/policy/mkflask.sh0000664000175000017500000000453712307313555016440 0ustar smbsmb#!/bin/sh - # # FLASK set -e awk=$1 shift 1 # output file output_file="include/flask.h" debug_file="include/class_to_string.h" debug_file2="include/initial_sid_to_string.h" cat $* | $awk " BEGIN { outfile = \"$output_file\" debugfile = \"$debug_file\" debugfile2 = \"$debug_file2\" "' nextstate = "CLASS"; printf("/* This file is automatically generated. Do not edit. */\n") > outfile; printf("#ifndef _SELINUX_FLASK_H_\n") > outfile; printf("#define _SELINUX_FLASK_H_\n") > outfile; printf("\n/*\n * Security object class definitions\n */\n") > outfile; printf("/* This file is automatically generated. Do not edit. */\n") > debugfile; printf("/*\n * Security object class definitions\n */\n") > debugfile; printf(" S_(\"null\")\n") > debugfile; printf("/* This file is automatically generated. Do not edit. 
*/\n") > debugfile2; printf("static char *initial_sid_to_string[] =\n{\n") > debugfile2; printf(" \"null\",\n") > debugfile2; } /^[ \t]*#/ { next; } $1 == "class" { if (nextstate != "CLASS") { printf("Parse error: Unexpected class definition on line %d\n", NR); next; } if ($2 in class_found) { printf("Duplicate class definition for %s on line %d.\n", $2, NR); next; } class_found[$2] = 1; class_value++; printf("#define SECCLASS_%s", toupper($2)) > outfile; for (i = 0; i < 40 - length($2); i++) printf(" ") > outfile; printf("%d\n", class_value) > outfile; printf(" S_(\"%s\")\n", $2) > debugfile; } $1 == "sid" { if (nextstate == "CLASS") { nextstate = "SID"; printf("\n/*\n * Security identifier indices for initial entities\n */\n") > outfile; } if ($2 in sid_found) { printf("Duplicate SID definition for %s on line %d.\n", $2, NR); next; } sid_found[$2] = 1; sid_value++; printf("#define SECINITSID_%s", toupper($2)) > outfile; for (i = 0; i < 37 - length($2); i++) printf(" ") > outfile; printf("%d\n", sid_value) > outfile; printf(" \"%s\",\n", $2) > debugfile2; } END { if (nextstate != "SID") printf("Parse error: Unexpected end of file\n"); printf("\n#define SECINITSID_NUM") > outfile; for (i = 0; i < 34; i++) printf(" ") > outfile; printf("%d\n", sid_value) > outfile; printf("\n#endif\n") > outfile; printf("};\n\n") > debugfile2; }' # FLASK xen-4.4.0/xen/xsm/flask/policy/access_vectors0000664000175000017500000003300712307313555017377 0ustar smbsmb# # Define the access vectors. # # class class_name { permission_name ... } # Class xen consists of dom0-only operations dealing with the hypervisor itself. # Unless otherwise specified, the source is the domain executing the hypercall, # and the target is the xen initial sid (type xen_t). class xen { # XENPF_settime settime # XEN_SYSCTL_tbuf_op tbufcontrol # CONSOLEIO_read, XEN_SYSCTL_readconsole readconsole # XEN_SYSCTL_readconsole with clear=1 clearconsole # XEN_SYSCTL_perfc_op perfcontrol # XENPF_add_memtype mtrr_add # XENPF_del_memtype mtrr_del # XENPF_read_memtype mtrr_read # XENPF_microcode_update microcode # XEN_SYSCTL_physinfo, XEN_SYSCTL_topologyinfo, XEN_SYSCTL_numainfo physinfo # XENPF_platform_quirk quirk # CONSOLEIO_write writeconsole # PHYSDEVOP_apic_read, PHYSDEVOP_alloc_irq_vector readapic # PHYSDEVOP_apic_write writeapic # Most XENOPROF_* privprofile # XENOPROF_{init,enable_virq,disable_virq,get_buffer} nonprivprofile # kexec hypercall kexec # XENPF_firmware_info, XENPF_efi_runtime_call firmware # XENPF_enter_acpi_sleep sleep # XENPF_change_freq frequency # XENPF_getidletime getidle # XEN_SYSCTL_debug_keys debug # XEN_SYSCTL_getcpuinfo, XENPF_get_cpu_version, XENPF_get_cpuinfo getcpuinfo # XEN_SYSCTL_availheap heap # XEN_SYSCTL_get_pmstat, XEN_SYSCTL_pm_op, XENPF_set_processor_pminfo, # XENPF_core_parking pm_op # mca hypercall mca_op # XEN_SYSCTL_lockprof_op lockprof # XEN_SYSCTL_cpupool_op cpupool_op # tmem hypercall (any access) tmem_op # TMEM_CONTROL command of tmem hypercall tmem_control # XEN_SYSCTL_scheduler_op with XEN_DOMCTL_SCHEDOP_getinfo, XEN_SYSCTL_sched_id getscheduler # XEN_SYSCTL_scheduler_op with XEN_DOMCTL_SCHEDOP_putinfo setscheduler } # Classes domain and domain2 consist of operations that a domain performs on # another domain or on itself. Unless otherwise specified, the source is the # domain executing the hypercall, and the target is the domain being operated on # (which may result in a _self or _target type). 
# # transitions in class domain are used to produce the _self and _target types; # see docs/misc/xsm-flask.txt and the example XSM policy for details. class domain { # XEN_DOMCTL_setvcpucontext setvcpucontext # XEN_DOMCTL_pausedomain pause # XEN_DOMCTL_unpausedomain unpause # XEN_DOMCTL_resumedomain resume # XEN_DOMCTL_createdomain create # checked in FLASK_RELABEL_DOMAIN for any relabel operation: # source = the old label of the domain # target = the new label of the domain # see also the domain2 relabel{from,to,self} permissions transition # XEN_DOMCTL_max_vcpus max_vcpus # XEN_DOMCTL_destroydomain destroy # XEN_DOMCTL_setvcpuaffinity # XEN_DOMCTL_setnodeaffinity setaffinity # XEN_DOMCTL_getvcpuaffinity # XEN_DOMCTL_getnodeaffinity getaffinity # XEN_DOMCTL_scheduler_op with XEN_DOMCTL_SCHEDOP_getinfo getscheduler # XEN_DOMCTL_getdomaininfo, XEN_SYSCTL_getdomaininfolist getdomaininfo # XEN_DOMCTL_getvcpuinfo getvcpuinfo # XEN_DOMCTL_getvcpucontext getvcpucontext # XEN_DOMCTL_max_mem setdomainmaxmem # XEN_DOMCTL_setdomainhandle setdomainhandle # XEN_DOMCTL_setdebugging setdebugging # XEN_DOMCTL_hypercall_init hypercall # XEN_DOMCTL_settimeoffset settime # checked in XEN_DOMCTL_set_target: # source = the new device model domain # target = the new target domain # see also the domain2 make_priv_for and set_as_target checks set_target # SCHEDOP_remote_shutdown shutdown # XEN_DOMCTL_set{,_machine}_address_size setaddrsize # XEN_DOMCTL_get{,_machine}_address_size getaddrsize # XEN_DOMCTL_sendtrigger trigger # XEN_DOMCTL_get_ext_vcpucontext getextvcpucontext # XEN_DOMCTL_set_ext_vcpucontext setextvcpucontext # XEN_DOMCTL_getvcpuextstate getvcpuextstate # XEN_DOMCTL_setvcpuextstate setvcpuextstate # XENMEM_get_pod_target getpodtarget # XENMEM_set_pod_target setpodtarget # XEN_DOMCTL_subscribe, XEN_DOMCTL_disable_migrate, # XEN_DOMCTL_suppress_spurious_page_faults set_misc_info # XEN_DOMCTL_set_virq_handler set_virq_handler } # This is a continuation of class domain, since only 32 permissions can be # defined per class class domain2 { # checked in FLASK_RELABEL_DOMAIN with non-DOMID_SELF: # source = the domain making the hypercall # target = the old label of the domain being relabeled relabelfrom # checked in FLASK_RELABEL_DOMAIN with non-DOMID_SELF: # source = the domain making the hypercall # target = the new label of the domain being relabeled relabelto # checked in FLASK_RELABEL_DOMAIN, only with DOMID_SELF: # source = the old label of the domain # target = the new label of the domain # see also domain__transition relabelself # checked in XEN_DOMCTL_set_target: # source = the domain making the hypercall # target = the new device model domain make_priv_for # checked in XEN_DOMCTL_set_target: # source = the domain making the hypercall # target = the new target domain set_as_target # XEN_DOMCTL_set_cpuid set_cpuid # XEN_DOMCTL_gettscinfo gettsc # XEN_DOMCTL_settscinfo settsc # XEN_DOMCTL_scheduler_op with XEN_DOMCTL_SCHEDOP_putinfo setscheduler # XENMEM_claim_pages setclaim # XEN_DOMCTL_set_max_evtchn set_max_evtchn # XEN_DOMCTL_cacheflush cacheflush } # Similar to class domain, but primarily contains domctls related to HVM domains class hvm { # XEN_DOMCTL_sethvmcontext sethvmc # XEN_DOMCTL_gethvmcontext, XEN_DOMCTL_gethvmcontext_partial gethvmc # HVMOP_set_param setparam # HVMOP_get_param getparam # HVMOP_set_pci_intx_level (also needs hvmctl) pcilevel # HVMOP_set_isa_irq_level irqlevel # HVMOP_set_pci_link_route pciroute bind_irq # XEN_DOMCTL_pin_mem_cacheattr cacheattr # HVMOP_track_dirty_vram 
trackdirtyvram # HVMOP_modified_memory, HVMOP_get_mem_type, HVMOP_set_mem_type, # HVMOP_set_mem_access, HVMOP_get_mem_access, HVMOP_pagetable_dying, # HVMOP_inject_trap hvmctl # XEN_DOMCTL_set_access_required mem_event # XEN_DOMCTL_mem_sharing_op and XENMEM_sharing_op_{share,add_physmap} with: # source = the domain making the hypercall # target = domain whose memory is being shared mem_sharing # XEN_DOMCTL_audit_p2m audit_p2m # HVMOP_inject_msi send_irq # checked in XENMEM_sharing_op_{share,add_physmap} with: # source = domain whose memory is being shared # target = client domain share_mem # HVMOP_set_param setting HVM_PARAM_NESTEDHVM nested } # Class event describes event channels. Interdomain event channels have their # own security label which is computed using a type transition between the # source and target domains. Each endpoint has its own label, and the # permission checks must pass on both endpoints for an event channel to be # established. class event { # when creating an interdomain event channel endpoint: # source = event channel label # target = remote domain the event channel binds to. This may be a _self or # _target label if the endpoints are related as such. # This permission is checked when creating an unbound event channel and when the # interdomain event channel is established. bind # EVTCHNOP_send: # source = domain sending the event # target = event channel label send # EVTCHNOP_status; same as _send status # when creating an interdomain event channel endpoint: # source = the domain creating the channel (which might not be an endpoint) # target = event channel label create # EVTCHNOP_reset: # source = domain making the hypercall # target = domain whose event channels are being reset reset } # Class grant describes pages shared by grant mappings. Pages use the security # label of their owning domain. class grant { # GNTTABOP_map_grant_ref with any access map_read # GNTTABOP_map_grant_ref with write access map_write # GNTTABOP_unmap_grant_ref unmap # GNTTABOP_transfer transfer # GNTTABOP_setup_table, GNTTABOP_get_status_frames (target is commonly _self) setup # GNTTABOP_copy copy # GNTTABOP_query_size, GNTTABOP_get_version query } # Class mmu describes pages of memory not accessed using grants. Permissions # are checked using the domain ID used to access the page - the most common case # is a domain's own ID (the _self label). Using DOMID_IO in the map command to # restrict the mapping to IO memory will result in the target being domio_t, and # migration uses read-only mappings with a target of DOMID_XEN (domxen_t). 
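#
# Illustrative example (not part of the original file): a guest that maps and
# adjusts its own memory is typically granted something like
#
#   allow domU_t domU_t_self:mmu { map_read map_write adjust physmap pinpage };
#
# (the _self suffix follows the example policy's naming convention; both type
# names here are illustrative only.)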
class mmu { # checked when using mmu_update to map a page readably # source = domain making the hypercall (which might not own the page table) # target = domain whose pages are being mapped map_read # checked when using mmu_update to map a page writably # source = domain making the hypercall # target = domain whose pages are being mapped map_write # XEN_DOMCTL_getpageframeinfo* pageinfo # XEN_DOMCTL_getmemlist pagelist # XENMEM_{increase,decrease}_reservation, XENMEM_populate_physmap adjust # XENMEM_{current,maximum}_reservation, XENMEM_maximum_gpfn stat # mmu_update MMU_MACHPHYS_UPDATE updatemp # XENMEM_add_to_physmap, XENMEM_remove_from_physmap physmap # MMUEXT_PIN_L*_TABLE pinpage # XENMEM_machine_memory_map (with target xen_t) # XENMEM_set_memory_map (with domain target) memorymap # checked when using mmu_update to update the page tables of another domain # source = domain making the hypercall # target = domain whose page tables are being modified remote_remap # the mmuext_op hypercall acting on the target domain mmuext_op # XENMEM_exchange: # source = domain making the hypercall # target = domain whose pages are being exchanged exchange # Allow a privileged domain to install a map of a page it does not own. Used # for stub domain device models with the PV framebuffer. target_hack } # control of the paging_domctl split by subop class shadow { # XEN_DOMCTL_SHADOW_OP_OFF disable # enable, get/set allocation enable # enable, read, and clean log logdirty } # Class resource is used to describe the resources used in hardware device # passthrough. Resources include: hardware IRQs, MMIO regions, x86 I/O ports, # and PCI devices; see docs/misc/xsm-flask.txt for how to label them. # # Access to the legacy PCI configuration space on x86 via port 0xCF8/CFC # requires IS_PRIV, even with FLASK. Writes to the BARs are checked as "setup", # while other reads/writes are "use"; the target is the PCI device whose # configuration space is being modified. Accesses to the MMIO-based PCI express # configuration space described by the ACPI MCFG table are controlled as MMIO # accesses, and cannot special-case BAR writes. 
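#
# Illustrative note (not part of the original file): the resource labels
# referenced here are attached by ocontext statements in the policy; the
# exact statement names and label formats are documented in
# docs/misc/xsm-flask.txt, but they look roughly like
#
#   pirqcon 33 system_u:object_r:nic_dev_t
#   ioportcon 0xc000-0xc0ff system_u:object_r:nic_dev_t
#   iomemcon 0xfebe0-0xfebff system_u:object_r:nic_dev_t
#   pcidevicecon 0xc800 system_u:object_r:nic_dev_t
#
# and are resolved at runtime by security_irq_sid(), security_device_sid()
# and security_iterate_{ioport,iomem}_sids() in the hooks earlier in this
# tree.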
# # The {add,remove}_{irq,ioport,iomem,device} permissions use: # source = domain making the hypercall # target = resource's security label class resource { # checked when adding a resource to a domain: # source = domain making the hypercall # target = domain which will have access to the resource add # checked when removing a resource from a domain: # source = domain making the hypercall # target = domain which will no longer have access to the resource remove # checked when adding a resource to a domain: # source = domain which will have access to the resource # target = resource's security label # also checked when using some core Xen devices (target xen_t) use # PHYSDEVOP_map_pirq and ioapic writes for dom0, when acting on real IRQs # For GSI interrupts, the IRQ's label is indexed by the IRQ number # For MSI interrupts, the label of the PCI device is used add_irq # PHYSDEVOP_unmap_pirq (same as map, and only for real IRQs) remove_irq # XEN_DOMCTL_ioport_permission, XEN_DOMCTL_ioport_mapping add_ioport remove_ioport # XEN_DOMCTL_iomem_permission, XEN_DOMCTL_memory_mapping add_iomem remove_iomem # XEN_DOMCTL_get_device_group, XEN_DOMCTL_test_assign_device: # source = domain making the hypercall # target = PCI device being queried stat_device # XEN_DOMCTL_assign_device add_device # XEN_DOMCTL_deassign_device remove_device # checked for PCI hot and cold-plug hypercalls, with target as the PCI device # checked for CPU and memory hotplug with xen_t as the target plug # checked for PCI hot-unplug hypercalls, with target as the PCI device # checked for CPU offlining with xen_t as the target unplug # checked for PHYSDEVOP_restore_msi* (target PCI device) # checked for PHYSDEVOP_setup_gsi (target IRQ) # checked for PHYSDEVOP_pci_mmcfg_reserved (target xen_t) setup } # Class security describes the FLASK security server itself; these operations # are accessed using the xsm_op hypercall. The source is the domain invoking # the hypercall, and the target is security_t. # # Any domain with access to load_policy or setenforce must be trusted, since it # can bypass the rest of the security policy. class security { # use the security server to compute an access check compute_av # use the security server to compute a type transition compute_create # use the security server to compute member selection compute_member # sid <-> context string conversions check_context # allow loading a new XSM/FLASK policy load_policy # use the security server to compute an object relabel compute_relabel # use the security server to list the SIDs reachable by a given user compute_user # allow switching between enforcing and permissive mode setenforce # allow changing policy booleans setbool # allow changing security server configuration parmeters setsecparam # add ocontext label definitions for resources add_ocontext # remove ocontext label definitions for resources del_ocontext } xen-4.4.0/xen/xsm/flask/policy/mkaccess_vector.sh0000664000175000017500000000622312307313555020155 0ustar smbsmb#!/bin/sh - # # FLASK set -e awk=$1 shift # output files av_permissions="include/av_permissions.h" av_perm_to_string="include/av_perm_to_string.h" cat $* | $awk " BEGIN { outfile = \"$av_permissions\" avpermfile = \"$av_perm_to_string\" "' nextstate = "COMMON_OR_AV"; printf("/* This file is automatically generated. Do not edit. */\n") > outfile; printf("/* This file is automatically generated. Do not edit. 
*/\n") > avpermfile; ; } /^[ \t]*#/ { next; } $1 == "class" { if (nextstate != "COMMON_OR_AV" && nextstate != "CLASS_OR_CLASS-OPENBRACKET") { printf("Parse error: Unexpected class definition on line %d\n", NR); next; } tclass = $2; if (tclass in av_defined) { printf("Duplicate access vector definition for %s on line %d\n", tclass, NR); next; } av_defined[tclass] = 1; permission = 1; nextstate = "INHERITS_OR_CLASS-OPENBRACKET"; next; } $1 == "{" { if (nextstate != "INHERITS_OR_CLASS-OPENBRACKET" && nextstate != "CLASS_OR_CLASS-OPENBRACKET" && nextstate != "COMMON-OPENBRACKET") { printf("Parse error: Unexpected { on line %d\n", NR); next; } if (nextstate == "INHERITS_OR_CLASS-OPENBRACKET") nextstate = "CLASS-CLOSEBRACKET"; if (nextstate == "CLASS_OR_CLASS-OPENBRACKET") nextstate = "CLASS-CLOSEBRACKET"; if (nextstate == "COMMON-OPENBRACKET") nextstate = "COMMON-CLOSEBRACKET"; } /[a-z][a-z_]*/ { if (nextstate != "COMMON-CLOSEBRACKET" && nextstate != "CLASS-CLOSEBRACKET") { printf("Parse error: Unexpected symbol %s on line %d\n", $1, NR); next; } if (nextstate == "COMMON-CLOSEBRACKET") { if ((common_name,$1) in common_perms) { printf("Duplicate permission %s for common %s on line %d.\n", $1, common_name, NR); next; } common_perms[common_name,$1] = permission; printf("#define COMMON_%s__%s", toupper(common_name), toupper($1)) > outfile; printf(" S_(\"%s\")\n", $1) > cpermfile; } else { if ((tclass,$1) in av_perms) { printf("Duplicate permission %s for %s on line %d.\n", $1, tclass, NR); next; } av_perms[tclass,$1] = permission; printf("#define %s__%s", toupper(tclass), toupper($1)) > outfile; printf(" S_(SECCLASS_%s, %s__%s, \"%s\")\n", toupper(tclass), toupper(tclass), toupper($1), $1) > avpermfile; } spaces = 40 - (length($1) + length(tclass)); if (spaces < 1) spaces = 1; for (i = 0; i < spaces; i++) printf(" ") > outfile; printf("0x%08xUL\n", permission) > outfile; permission = permission * 2; } $1 == "}" { if (nextstate != "CLASS-CLOSEBRACKET" && nextstate != "COMMON-CLOSEBRACKET") { printf("Parse error: Unexpected } on line %d\n", NR); next; } if (nextstate == "COMMON-CLOSEBRACKET") { common_base[common_name] = permission; printf("TE_(common_%s_perm_to_string)\n\n", common_name) > cpermfile; } printf("\n") > outfile; nextstate = "COMMON_OR_AV"; } END { if (nextstate != "COMMON_OR_AV" && nextstate != "CLASS_OR_CLASS-OPENBRACKET") printf("Parse error: Unexpected end of file\n"); }' # FLASK xen-4.4.0/xen/xsm/flask/avc.c0000664000175000017500000005704312307313555014072 0ustar smbsmb/* * Implementation of the kernel access vector cache (AVC). * * Authors: Stephen Smalley, * James Morris * * Update: KaiGai, Kohei * Replaced the avc_lock spinlock by RCU. * * Copyright (C) 2003 Red Hat, Inc., James Morris * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. 
*/ /* Ported to Xen 3.0, George Coker, */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "avc.h" #include "avc_ss.h" static const struct av_perm_to_string av_perm_to_string[] = { #define S_(c, v, s) { c, v, s }, #include "av_perm_to_string.h" #undef S_ }; static const char *class_to_string[] = { #define S_(s) s, #include "class_to_string.h" #undef S_ }; const struct selinux_class_perm selinux_class_perm = { .av_perm_to_string = av_perm_to_string, .av_pts_len = ARRAY_SIZE(av_perm_to_string), .class_to_string = class_to_string, .cts_len = ARRAY_SIZE(class_to_string), }; #define AVC_CACHE_SLOTS 512 #define AVC_DEF_CACHE_THRESHOLD 512 #define AVC_CACHE_RECLAIM 16 #ifdef FLASK_AVC_STATS #define avc_cache_stats_incr(field) \ do { \ __get_cpu_var(avc_cache_stats).field++; \ } while (0) #else #define avc_cache_stats_incr(field) do {} while (0) #endif struct avc_entry { u32 ssid; u32 tsid; u16 tclass; struct av_decision avd; }; struct avc_node { struct avc_entry ae; struct hlist_node list; /* anchored in avc_cache->slots[i] */ struct rcu_head rhead; }; struct avc_cache { struct hlist_head slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */ spinlock_t slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */ atomic_t lru_hint; /* LRU hint for reclaim scan */ atomic_t active_nodes; u32 latest_notif; /* latest revocation notification */ }; struct avc_callback_node { int (*callback) (u32 event, u32 ssid, u32 tsid, u16 tclass, u32 perms, u32 *out_retained); u32 events; u32 ssid; u32 tsid; u16 tclass; u32 perms; struct avc_callback_node *next; }; /* Exported via Flask hypercall */ unsigned int avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD; #ifdef FLASK_AVC_STATS DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats); #endif static struct avc_cache avc_cache; static struct avc_callback_node *avc_callbacks; static DEFINE_RCU_READ_LOCK(avc_rcu_lock); static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass) { return (ssid ^ (tsid<<2) ^ (tclass<<4)) & (AVC_CACHE_SLOTS - 1); } /* no use making this larger than the printk buffer */ #define AVC_BUF_SIZE 1024 static DEFINE_SPINLOCK(avc_emerg_lock); static char avc_emerg_buf[AVC_BUF_SIZE]; struct avc_dump_buf { char *start; char *pos; u32 free; }; static void avc_printk(struct avc_dump_buf *buf, const char *fmt, ...) { int i; va_list args; again: va_start(args, fmt); i = vsnprintf(buf->pos, buf->free, fmt, args); va_end(args); if ( i < buf->free ) { buf->pos += i; buf->free -= i; } else if ( buf->free < AVC_BUF_SIZE ) { buf->pos[0] = 0; printk("%s", buf->start); buf->pos = buf->start; buf->free = AVC_BUF_SIZE; goto again; } else { printk("%s", buf->start); printk("\navc_printk: overflow\n"); buf->pos = buf->start; buf->free = AVC_BUF_SIZE; } } /** * avc_dump_av - Display an access vector in human-readable form. 
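 * @buf: dump buffer that accumulates the message; avc_printk() flushes it
 *       to the console when it fills up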
* @tclass: target security class * @av: access vector */ static void avc_dump_av(struct avc_dump_buf *buf, u16 tclass, u32 av) { int i, i2, perm; if ( av == 0 ) { avc_printk(buf, " null"); return; } avc_printk(buf, " {"); i = 0; perm = 1; while ( i < sizeof(av) * 8 ) { if ( perm & av ) { for ( i2 = 0; i2 < ARRAY_SIZE(av_perm_to_string); i2++ ) { if ( (av_perm_to_string[i2].tclass == tclass) && (av_perm_to_string[i2].value == perm) ) break; } if ( i2 < ARRAY_SIZE(av_perm_to_string) ) { avc_printk(buf, " %s", av_perm_to_string[i2].name); av &= ~perm; } } i++; perm <<= 1; } if ( av ) avc_printk(buf, " 0x%x", av); avc_printk(buf, " }"); } /** * avc_dump_query - Display a SID pair and a class in human-readable form. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class */ static void avc_dump_query(struct avc_dump_buf *buf, u32 ssid, u32 tsid, u16 tclass) { int rc; char *scontext; u32 scontext_len; rc = security_sid_to_context(ssid, &scontext, &scontext_len); if ( rc ) avc_printk(buf, "ssid=%d", ssid); else { avc_printk(buf, "scontext=%s", scontext); xfree(scontext); } rc = security_sid_to_context(tsid, &scontext, &scontext_len); if ( rc ) avc_printk(buf, " tsid=%d", tsid); else { avc_printk(buf, " tcontext=%s", scontext); xfree(scontext); } avc_printk(buf, " tclass=%s", class_to_string[tclass]); } /** * avc_init - Initialize the AVC. * * Initialize the access vector cache. */ void __init avc_init(void) { int i; for ( i = 0; i < AVC_CACHE_SLOTS; i++ ) { INIT_HLIST_HEAD(&avc_cache.slots[i]); spin_lock_init(&avc_cache.slots_lock[i]); } atomic_set(&avc_cache.active_nodes, 0); atomic_set(&avc_cache.lru_hint, 0); printk("AVC INITIALIZED\n"); } int avc_get_hash_stats(struct xen_flask_hash_stats *arg) { int i, chain_len, max_chain_len, slots_used; struct avc_node *node; struct hlist_head *head; rcu_read_lock(&avc_rcu_lock); slots_used = 0; max_chain_len = 0; for ( i = 0; i < AVC_CACHE_SLOTS; i++ ) { head = &avc_cache.slots[i]; if ( !hlist_empty(head) ) { struct hlist_node *next; slots_used++; chain_len = 0; hlist_for_each_entry_rcu(node, next, head, list) chain_len++; if ( chain_len > max_chain_len ) max_chain_len = chain_len; } } rcu_read_unlock(&avc_rcu_lock); arg->entries = atomic_read(&avc_cache.active_nodes); arg->buckets_used = slots_used; arg->buckets_total = AVC_CACHE_SLOTS; arg->max_chain_len = max_chain_len; return 0; } static void avc_node_free(struct rcu_head *rhead) { struct avc_node *node = container_of(rhead, struct avc_node, rhead); xfree(node); avc_cache_stats_incr(frees); } static void avc_node_delete(struct avc_node *node) { hlist_del_rcu(&node->list); call_rcu(&node->rhead, avc_node_free); atomic_dec(&avc_cache.active_nodes); } static void avc_node_kill(struct avc_node *node) { xfree(node); avc_cache_stats_incr(frees); atomic_dec(&avc_cache.active_nodes); } static void avc_node_replace(struct avc_node *new, struct avc_node *old) { hlist_replace_rcu(&old->list, &new->list); call_rcu(&old->rhead, avc_node_free); atomic_dec(&avc_cache.active_nodes); } static inline int avc_reclaim_node(void) { struct avc_node *node; int hvalue, try, ecx; unsigned long flags; struct hlist_head *head; struct hlist_node *next; spinlock_t *lock; for ( try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++ ) { atomic_inc(&avc_cache.lru_hint); hvalue = atomic_read(&avc_cache.lru_hint) & (AVC_CACHE_SLOTS - 1); head = &avc_cache.slots[hvalue]; lock = &avc_cache.slots_lock[hvalue]; spin_lock_irqsave(&avc_cache.slots_lock[hvalue], flags); rcu_read_lock(&avc_rcu_lock); 
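        /*
         * Scan this slot and delete entries until AVC_CACHE_RECLAIM nodes
         * have been reclaimed in total; lru_hint makes successive calls
         * rotate through the hash slots instead of always draining slot 0.
         */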
hlist_for_each_entry(node, next, head, list) { avc_node_delete(node); avc_cache_stats_incr(reclaims); ecx++; if ( ecx >= AVC_CACHE_RECLAIM ) { rcu_read_unlock(&avc_rcu_lock); spin_unlock_irqrestore(lock, flags); goto out; } } rcu_read_unlock(&avc_rcu_lock); spin_unlock_irqrestore(lock, flags); } out: return ecx; } static struct avc_node *avc_alloc_node(void) { struct avc_node *node; node = xmalloc(struct avc_node); if (!node) goto out; memset(node, 0, sizeof(*node)); INIT_RCU_HEAD(&node->rhead); INIT_HLIST_NODE(&node->list); avc_cache_stats_incr(allocations); atomic_inc(&avc_cache.active_nodes); if ( atomic_read(&avc_cache.active_nodes) > avc_cache_threshold ) avc_reclaim_node(); out: return node; } static void avc_node_populate(struct avc_node *node, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd) { node->ae.ssid = ssid; node->ae.tsid = tsid; node->ae.tclass = tclass; memcpy(&node->ae.avd, avd, sizeof(node->ae.avd)); } static inline struct avc_node *avc_search_node(u32 ssid, u32 tsid, u16 tclass) { struct avc_node *node, *ret = NULL; int hvalue; struct hlist_head *head; struct hlist_node *next; hvalue = avc_hash(ssid, tsid, tclass); head = &avc_cache.slots[hvalue]; hlist_for_each_entry_rcu(node, next, head, list) { if ( ssid == node->ae.ssid && tclass == node->ae.tclass && tsid == node->ae.tsid ) { ret = node; break; } } return ret; } /** * avc_lookup - Look up an AVC entry. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @requested: requested permissions, interpreted based on @tclass * * Look up an AVC entry that is valid for the * @requested permissions between the SID pair * (@ssid, @tsid), interpreting the permissions * based on @tclass. If a valid AVC entry exists, * then this function return the avc_node. * Otherwise, this function returns NULL. */ static struct avc_node *avc_lookup(u32 ssid, u32 tsid, u16 tclass) { struct avc_node *node; avc_cache_stats_incr(lookups); node = avc_search_node(ssid, tsid, tclass); if ( node ) avc_cache_stats_incr(hits); else avc_cache_stats_incr(misses); return node; } static int avc_latest_notif_update(int seqno, int is_insert) { int ret = 0; static DEFINE_SPINLOCK(notif_lock); unsigned long flag; spin_lock_irqsave(¬if_lock, flag); if ( is_insert ) { if ( seqno < avc_cache.latest_notif ) { printk(KERN_WARNING "avc: seqno %d < latest_notif %d\n", seqno, avc_cache.latest_notif); ret = -EAGAIN; } } else { if ( seqno > avc_cache.latest_notif ) avc_cache.latest_notif = seqno; } spin_unlock_irqrestore(¬if_lock, flag); return ret; } /** * avc_insert - Insert an AVC entry. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @ae: AVC entry * * Insert an AVC entry for the SID pair * (@ssid, @tsid) and class @tclass. * The access vectors and the sequence number are * normally provided by the security server in * response to a security_compute_av() call. If the * sequence number @ae->avd.seqno is not less than the latest * revocation notification, then the function copies * the access vectors into a cache entry, returns * avc_node inserted. Otherwise, this function returns NULL. 
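 *
 * As called from avc_has_perm_noaudit(), the RCU read lock is already held;
 * the per-slot spinlock taken below only serialises concurrent writers.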
*/ static struct avc_node *avc_insert(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd) { struct avc_node *pos, *node = NULL; int hvalue; unsigned long flag; if ( avc_latest_notif_update(avd->seqno, 1) ) goto out; node = avc_alloc_node(); if ( node ) { struct hlist_head *head; struct hlist_node *next; spinlock_t *lock; hvalue = avc_hash(ssid, tsid, tclass); avc_node_populate(node, ssid, tsid, tclass, avd); head = &avc_cache.slots[hvalue]; lock = &avc_cache.slots_lock[hvalue]; spin_lock_irqsave(lock, flag); hlist_for_each_entry(pos, next, head, list) { if ( pos->ae.ssid == ssid && pos->ae.tsid == tsid && pos->ae.tclass == tclass ) { avc_node_replace(node, pos); goto found; } } hlist_add_head_rcu(&node->list, head); found: spin_unlock_irqrestore(lock, flag); } out: return node; } /** * avc_audit - Audit the granting or denial of permissions. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @requested: requested permissions * @avd: access vector decisions * @result: result from avc_has_perm_noaudit * @a: auxiliary audit data * * Audit the granting or denial of permissions in accordance * with the policy. This function is typically called by * avc_has_perm() after a permission check, but can also be * called directly by callers who use avc_has_perm_noaudit() * in order to separate the permission check from the auditing. * For example, this separation is useful when the permission check must * be performed under a lock, to allow the lock to be released * before calling the auditing code. */ void avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *avd, int result, struct avc_audit_data *a) { struct domain *cdom = current->domain; u32 denied, audited; struct avc_dump_buf buf; denied = requested & ~avd->allowed; if ( denied ) { audited = denied; if ( !(audited & avd->auditdeny) ) return; } else if ( result ) { audited = denied = requested; } else { audited = requested; if ( !(audited & avd->auditallow) ) return; } buf.start = xmalloc_bytes(AVC_BUF_SIZE); if ( !buf.start ) { spin_lock(&avc_emerg_lock); buf.start = avc_emerg_buf; } buf.pos = buf.start; buf.free = AVC_BUF_SIZE; avc_printk(&buf, "avc: %s ", denied ? "denied" : "granted"); avc_dump_av(&buf, tclass, audited); avc_printk(&buf, " for "); if ( a && (a->sdom || a->tdom) ) { if ( a->sdom && a->tdom && a->sdom != a->tdom ) avc_printk(&buf, "domid=%d target=%d ", a->sdom->domain_id, a->tdom->domain_id); else if ( a->sdom ) avc_printk(&buf, "domid=%d ", a->sdom->domain_id); else avc_printk(&buf, "target=%d ", a->tdom->domain_id); } else if ( cdom ) avc_printk(&buf, "domid=%d ", cdom->domain_id); switch ( a ? a->type : 0 ) { case AVC_AUDIT_DATA_DEV: avc_printk(&buf, "device=0x%lx ", a->device); break; case AVC_AUDIT_DATA_IRQ: avc_printk(&buf, "irq=%d ", a->irq); break; case AVC_AUDIT_DATA_RANGE: avc_printk(&buf, "range=0x%lx-0x%lx ", a->range.start, a->range.end); break; case AVC_AUDIT_DATA_MEMORY: avc_printk(&buf, "pte=0x%lx mfn=0x%lx ", a->memory.pte, a->memory.mfn); break; } avc_dump_query(&buf, ssid, tsid, tclass); avc_printk(&buf, "\n"); printk("%s", buf.start); if ( buf.start == avc_emerg_buf ) spin_unlock(&avc_emerg_lock); else xfree(buf.start); } /** * avc_add_callback - Register a callback for security events. 
* @callback: callback function * @events: security events * @ssid: source security identifier or %SECSID_WILD * @tsid: target security identifier or %SECSID_WILD * @tclass: target security class * @perms: permissions * * Register a callback function for events in the set @events * related to the SID pair (@ssid, @tsid) and * and the permissions @perms, interpreting * @perms based on @tclass. Returns %0 on success or * -%ENOMEM if insufficient memory exists to add the callback. */ int avc_add_callback(int (*callback)(u32 event, u32 ssid, u32 tsid, u16 tclass, u32 perms, u32 *out_retained), u32 events, u32 ssid, u32 tsid, u16 tclass, u32 perms) { struct avc_callback_node *c; int rc = 0; c = xmalloc(struct avc_callback_node); if ( !c ) { rc = -ENOMEM; goto out; } c->callback = callback; c->events = events; c->ssid = ssid; c->tsid = tsid; c->perms = perms; c->next = avc_callbacks; avc_callbacks = c; out: return rc; } static inline int avc_sidcmp(u32 x, u32 y) { return (x == y || x == SECSID_WILD || y == SECSID_WILD); } /** * avc_update_node Update an AVC entry * @event : Updating event * @perms : Permission mask bits * @ssid,@tsid,@tclass : identifier of an AVC entry * * if a valid AVC entry doesn't exist,this function returns -ENOENT. * if kmalloc() called internal returns NULL, this function returns -ENOMEM. * otherwise, this function update the AVC entry. The original AVC-entry object * will release later by RCU. */ static int avc_update_node(u32 event, u32 perms, u32 ssid, u32 tsid, u16 tclass, u32 seqno) { int hvalue, rc = 0; unsigned long flag; struct avc_node *pos, *node, *orig = NULL; struct hlist_head *head; struct hlist_node *next; spinlock_t *lock; node = avc_alloc_node(); if ( !node ) { rc = -ENOMEM; goto out; } hvalue = avc_hash(ssid, tsid, tclass); head = &avc_cache.slots[hvalue]; lock = &avc_cache.slots_lock[hvalue]; spin_lock_irqsave(lock, flag); hlist_for_each_entry(pos, next, head, list) { if ( ssid == pos->ae.ssid && tsid == pos->ae.tsid && tclass == pos->ae.tclass && seqno == pos->ae.avd.seqno ) { orig = pos; break; } } if ( !orig ) { rc = -ENOENT; avc_node_kill(node); goto out_unlock; } /* * Copy and replace original node. */ avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd); switch ( event ) { case AVC_CALLBACK_GRANT: node->ae.avd.allowed |= perms; break; case AVC_CALLBACK_TRY_REVOKE: case AVC_CALLBACK_REVOKE: node->ae.avd.allowed &= ~perms; break; case AVC_CALLBACK_AUDITALLOW_ENABLE: node->ae.avd.auditallow |= perms; break; case AVC_CALLBACK_AUDITALLOW_DISABLE: node->ae.avd.auditallow &= ~perms; break; case AVC_CALLBACK_AUDITDENY_ENABLE: node->ae.avd.auditdeny |= perms; break; case AVC_CALLBACK_AUDITDENY_DISABLE: node->ae.avd.auditdeny &= ~perms; break; } avc_node_replace(node, orig); out_unlock: spin_unlock_irqrestore(lock, flag); out: return rc; } /** * avc_ss_reset - Flush the cache and revalidate migrated permissions. 
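 * (flask_security_setenforce(), for example, calls avc_ss_reset(0) when
 * enforcing mode is switched on, so that entries granted while the system
 * was permissive are dropped.)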
* @seqno: policy sequence number */ int avc_ss_reset(u32 seqno) { struct avc_callback_node *c; int i, rc = 0, tmprc; unsigned long flag; struct avc_node *node; struct hlist_head *head; struct hlist_node *next; spinlock_t *lock; for ( i = 0; i < AVC_CACHE_SLOTS; i++ ) { head = &avc_cache.slots[i]; lock = &avc_cache.slots_lock[i]; spin_lock_irqsave(lock, flag); rcu_read_lock(&avc_rcu_lock); hlist_for_each_entry(node, next, head, list) avc_node_delete(node); rcu_read_unlock(&avc_rcu_lock); spin_unlock_irqrestore(lock, flag); } for ( c = avc_callbacks; c; c = c->next ) { if ( c->events & AVC_CALLBACK_RESET ) { tmprc = c->callback(AVC_CALLBACK_RESET, 0, 0, 0, 0, NULL); /* save the first error encountered for the return value and continue processing the callbacks */ if ( !rc ) rc = tmprc; } } avc_latest_notif_update(seqno, 0); return rc; } /** * avc_has_perm_noaudit - Check permissions but perform no auditing. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @requested: requested permissions, interpreted based on @tclass * @avd: access vector decisions * * Check the AVC to determine whether the @requested permissions are granted * for the SID pair (@ssid, @tsid), interpreting the permissions * based on @tclass, and call the security server on a cache miss to obtain * a new decision and add it to the cache. Return a copy of the decisions * in @avd. Return %0 if all @requested permissions are granted, * -%EACCES if any permissions are denied, or another -errno upon * other errors. This function is typically called by avc_has_perm(), * but may also be called directly to separate permission checking from * auditing, e.g. in cases where a lock must be held for the check but * should be released for the auditing. */ int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *in_avd) { struct avc_node *node; struct av_decision avd_entry, *avd; int rc = 0; u32 denied; BUG_ON(!requested); rcu_read_lock(&avc_rcu_lock); node = avc_lookup(ssid, tsid, tclass); if ( !node ) { rcu_read_unlock(&avc_rcu_lock); if ( in_avd ) avd = in_avd; else avd = &avd_entry; rc = security_compute_av(ssid,tsid,tclass,requested,avd); if ( rc ) goto out; rcu_read_lock(&avc_rcu_lock); node = avc_insert(ssid,tsid,tclass,avd); } else { if ( in_avd ) memcpy(in_avd, &node->ae.avd, sizeof(*in_avd)); avd = &node->ae.avd; } denied = requested & ~(avd->allowed); if ( denied ) { if ( !flask_enforcing || (avd->flags & AVD_FLAGS_PERMISSIVE) ) avc_update_node(AVC_CALLBACK_GRANT,requested, ssid,tsid,tclass,avd->seqno); else rc = -EACCES; } rcu_read_unlock(&avc_rcu_lock); out: return rc; } /** * avc_has_perm - Check permissions and perform any appropriate auditing. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @requested: requested permissions, interpreted based on @tclass * @auditdata: auxiliary audit data * * Check the AVC to determine whether the @requested permissions are granted * for the SID pair (@ssid, @tsid), interpreting the permissions * based on @tclass, and call the security server on a cache miss to obtain * a new decision and add it to the cache. Audit the granting or denial of * permissions in accordance with the policy. Return %0 if all @requested * permissions are granted, -%EACCES if any permissions are denied, or * another -errno upon other errors. 
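 *
 * Illustrative call sequence (a sketch only: pirq, irq_sid and dsec are
 * hypothetical local names, while the class and permission come from the
 * policy's "resource" class):
 *
 *     struct avc_audit_data ad;
 *     u32 irq_sid;
 *
 *     AVC_AUDIT_DATA_INIT(&ad, IRQ);
 *     ad.irq = pirq;
 *     if ( !security_irq_sid(pirq, &irq_sid) )
 *         rc = avc_has_perm(dsec->sid, irq_sid, SECCLASS_RESOURCE,
 *                           RESOURCE__USE, &ad);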
*/ int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct avc_audit_data *auditdata) { struct av_decision avd; int rc; rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, &avd); avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata); return rc; } xen-4.4.0/xen/xsm/flask/include/0000775000175000017500000000000012307313555014567 5ustar smbsmbxen-4.4.0/xen/xsm/flask/include/conditional.h0000664000175000017500000000137312307313555017247 0ustar smbsmb/* * Interface to booleans in the security server. This is exported * for the selinuxfs. * * Author: Karl MacMillan * * Copyright (C) 2003 - 2004 Tresys Technology, LLC * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, version 2. */ #ifndef _FLASK_CONDITIONAL_H_ #define _FLASK_CONDITIONAL_H_ #include int security_get_bools(int *len, char ***names, int **values, size_t *maxstr); int security_set_bools(int len, int *values); int security_find_bool(const char *name); char *security_get_bool_name(unsigned int bool); int security_get_bool_value(unsigned int bool); #endif xen-4.4.0/xen/xsm/flask/include/objsec.h0000664000175000017500000000160112307313555016203 0ustar smbsmb/* * NSA Security-Enhanced Linux (SELinux) security module * * This file contains the Flask security data structures for xen objects. * * Author(s): George Coker, * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. */ #ifndef _FLASK_OBJSEC_H_ #define _FLASK_OBJSEC_H_ #include #include "flask.h" #include "avc.h" struct domain_security_struct { u32 sid; /* current SID */ u32 self_sid; /* SID for target when operating on DOMID_SELF */ u32 target_sid; /* SID for device model target domain */ }; struct evtchn_security_struct { u32 sid; /* current SID */ }; extern unsigned int selinux_checkreqprot; #endif /* _FLASK_OBJSEC_H_ */ xen-4.4.0/xen/xsm/flask/include/avc_ss.h0000664000175000017500000000105012307313555016212 0ustar smbsmb/* * Access vector cache interface for the security server. * * Author : Stephen Smalley, */ #ifndef _FLASK_AVC_SS_H_ #define _FLASK_AVC_SS_H_ #include "flask.h" int avc_ss_reset(u32 seqno); struct av_perm_to_string { u16 tclass; u32 value; const char *name; }; struct selinux_class_perm { const struct av_perm_to_string *av_perm_to_string; u32 av_pts_len; u32 cts_len; const char **class_to_string; }; extern const struct selinux_class_perm selinux_class_perm; #endif /* _FLASK_AVC_SS_H_ */ xen-4.4.0/xen/xsm/flask/include/avc.h0000664000175000017500000000554712307313555015524 0ustar smbsmb/* * Access vector cache interface for object managers. * * Author : Stephen Smalley, */ /* Ported to Xen 3.0, George Coker, */ #ifndef _FLASK_AVC_H_ #define _FLASK_AVC_H_ #include #include #include #include #include "flask.h" #include "av_permissions.h" #include "security.h" #ifdef FLASK_DEVELOP extern int flask_enforcing; #else #define flask_enforcing 1 #endif /* * An entry in the AVC. */ struct avc_entry; struct task_struct; struct vfsmount; struct dentry; struct inode; struct sock; struct sk_buff; /* Auxiliary data to use in generating the audit record. 
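 * Initialise with AVC_AUDIT_DATA_INIT() and then fill in the union member
 * matching the chosen type (e.g. ad.irq for AVC_AUDIT_DATA_IRQ).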
*/ struct avc_audit_data { char type; #define AVC_AUDIT_DATA_NONE 0 #define AVC_AUDIT_DATA_DEV 1 #define AVC_AUDIT_DATA_IRQ 2 #define AVC_AUDIT_DATA_RANGE 3 #define AVC_AUDIT_DATA_MEMORY 4 struct domain *sdom; struct domain *tdom; union { unsigned long device; int irq; struct { unsigned long start; unsigned long end; } range; struct { unsigned long pte; unsigned long mfn; } memory; }; }; /* Initialize an AVC audit data structure. */ #define AVC_AUDIT_DATA_INIT(_d,_t) \ { memset((_d), 0, sizeof(struct avc_audit_data)); \ (_d)->type = AVC_AUDIT_DATA_##_t; } /* * AVC statistics */ struct avc_cache_stats { unsigned int lookups; unsigned int hits; unsigned int misses; unsigned int allocations; unsigned int reclaims; unsigned int frees; }; /* * AVC operations */ void avc_init(void); void avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *avd, int result, struct avc_audit_data *auditdata); int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *avd); int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct avc_audit_data *auditdata); #define AVC_CALLBACK_GRANT 1 #define AVC_CALLBACK_TRY_REVOKE 2 #define AVC_CALLBACK_REVOKE 4 #define AVC_CALLBACK_RESET 8 #define AVC_CALLBACK_AUDITALLOW_ENABLE 16 #define AVC_CALLBACK_AUDITALLOW_DISABLE 32 #define AVC_CALLBACK_AUDITDENY_ENABLE 64 #define AVC_CALLBACK_AUDITDENY_DISABLE 128 int avc_add_callback(int (*callback)(u32 event, u32 ssid, u32 tsid, u16 tclass, u32 perms, u32 *out_retained), u32 events, u32 ssid, u32 tsid, u16 tclass, u32 perms); /* Exported to selinuxfs */ struct xen_flask_hash_stats; int avc_get_hash_stats(struct xen_flask_hash_stats *arg); extern unsigned int avc_cache_threshold; #ifdef FLASK_AVC_STATS DECLARE_PER_CPU(struct avc_cache_stats, avc_cache_stats); #endif #endif /* _FLASK_AVC_H_ */ xen-4.4.0/xen/xsm/flask/include/security.h0000664000175000017500000000574712307313555016624 0ustar smbsmb/* * Security server interface. 
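 * (sid <-> context conversion, access-vector computation, and the
 * irq/ioport/iomem/device ocontext lookups used by the AVC and flask_op.c)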
* * Author : Stephen Smalley, * */ /* Ported to Xen 3.0, George Coker, */ #ifndef _FLASK_SECURITY_H_ #define _FLASK_SECURITY_H_ #include "flask.h" #define SECSID_NULL 0x00000000 /* unspecified SID */ #define SECSID_WILD 0xffffffff /* wildcard SID */ #define SECCLASS_NULL 0x0000 /* no class */ #define FLASK_MAGIC 0xf97cff8c /* Identify specific policy version changes */ #define POLICYDB_VERSION_BASE 15 #define POLICYDB_VERSION_BOOL 16 #define POLICYDB_VERSION_IPV6 17 #define POLICYDB_VERSION_NLCLASS 18 #define POLICYDB_VERSION_VALIDATETRANS 19 #define POLICYDB_VERSION_MLS 19 #define POLICYDB_VERSION_AVTAB 20 #define POLICYDB_VERSION_RANGETRANS 21 #define POLICYDB_VERSION_POLCAP 22 #define POLICYDB_VERSION_PERMISSIVE 23 #define POLICYDB_VERSION_BOUNDARY 24 /* Range of policy versions we understand*/ #define POLICYDB_VERSION_MIN POLICYDB_VERSION_BASE #define POLICYDB_VERSION_MAX POLICYDB_VERSION_BOUNDARY #ifdef FLASK_BOOTPARAM extern int flask_enabled; #else #define flask_enabled 1 #endif extern int flask_mls_enabled; int security_load_policy(void * data, size_t len); struct av_decision { u32 allowed; u32 auditallow; u32 auditdeny; u32 seqno; u32 flags; }; /* definitions of av_decision.flags */ #define AVD_FLAGS_PERMISSIVE 0x0001 int security_compute_av(u32 ssid, u32 tsid, u16 tclass, u32 requested, struct av_decision *avd); int security_transition_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid); int security_member_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid); int security_change_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid); int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len); int security_context_to_sid(char *scontext, u32 scontext_len, u32 *out_sid); int security_get_user_sids(u32 callsid, char *username, u32 **sids, u32 *nel); int security_irq_sid(int pirq, u32 *out_sid); int security_iomem_sid(unsigned long, u32 *out_sid); int security_ioport_sid(u32 ioport, u32 *out_sid); int security_device_sid(u32 device, u32 *out_sid); int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass); typedef int (*security_iterate_fn)(void *data, u32 sid, unsigned long start, unsigned long end); int security_iterate_iomem_sids(unsigned long start, unsigned long end, security_iterate_fn fn, void *data); int security_iterate_ioport_sids(u32 start, u32 end, security_iterate_fn fn, void *data); int security_ocontext_add(u32 ocontext, unsigned long low, unsigned long high, u32 sid); int security_ocontext_del(u32 ocontext, unsigned int low, unsigned int high); #endif /* _FLASK_SECURITY_H_ */ xen-4.4.0/xen/xsm/flask/flask_op.c0000664000175000017500000003771312307313555015121 0ustar smbsmb/* * This file contains the flask_op hypercall and associated functions. * * Author: George Coker, * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. 
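 *
 * do_flask_op() at the bottom of this file dispatches the FLASK_* commands
 * to the helpers below; most helpers first check the caller's "security"
 * class permission via domain_has_security().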
*/ #include #include #include #include #include #include #include #include #include #ifdef FLASK_DEVELOP int flask_enforcing = 0; integer_param("flask_enforcing", flask_enforcing); #endif #ifdef FLASK_BOOTPARAM int flask_enabled = 1; integer_param("flask_enabled", flask_enabled); #endif #define MAX_POLICY_SIZE 0x4000000 #define FLASK_COPY_OUT \ ( \ 1UL<ssid; if ( !dsec ) return -EACCES; return avc_has_perm(dsec->sid, SECINITSID_SECURITY, SECCLASS_SECURITY, perms, NULL); } static int flask_copyin_string(XEN_GUEST_HANDLE_PARAM(char) u_buf, char **buf, size_t size, size_t max_size) { char *tmp; if ( size > max_size ) return -ENOENT; tmp = xmalloc_array(char, size + 1); if ( !tmp ) return -ENOMEM; if ( copy_from_guest(tmp, u_buf, size) ) { xfree(tmp); return -EFAULT; } tmp[size] = 0; *buf = tmp; return 0; } static int flask_security_user(struct xen_flask_userlist *arg) { char *user; u32 *sids; u32 nsids; int rv; rv = domain_has_security(current->domain, SECURITY__COMPUTE_USER); if ( rv ) return rv; rv = flask_copyin_string(arg->u.user, &user, arg->size, PAGE_SIZE); if ( rv ) return rv; rv = security_get_user_sids(arg->start_sid, user, &sids, &nsids); if ( rv < 0 ) goto out; if ( nsids * sizeof(sids[0]) > arg->size ) nsids = arg->size / sizeof(sids[0]); arg->size = nsids; if ( copy_to_guest(arg->u.sids, sids, nsids) ) rv = -EFAULT; xfree(sids); out: xfree(user); return rv; } static int flask_security_relabel(struct xen_flask_transition *arg) { int rv; rv = domain_has_security(current->domain, SECURITY__COMPUTE_RELABEL); if ( rv ) return rv; rv = security_change_sid(arg->ssid, arg->tsid, arg->tclass, &arg->newsid); return rv; } static int flask_security_create(struct xen_flask_transition *arg) { int rv; rv = domain_has_security(current->domain, SECURITY__COMPUTE_CREATE); if ( rv ) return rv; rv = security_transition_sid(arg->ssid, arg->tsid, arg->tclass, &arg->newsid); return rv; } static int flask_security_access(struct xen_flask_access *arg) { struct av_decision avd; int rv; rv = domain_has_security(current->domain, SECURITY__COMPUTE_AV); if ( rv ) return rv; rv = security_compute_av(arg->ssid, arg->tsid, arg->tclass, arg->req, &avd); if ( rv < 0 ) return rv; arg->allowed = avd.allowed; arg->audit_allow = avd.auditallow; arg->audit_deny = avd.auditdeny; arg->seqno = avd.seqno; return rv; } static int flask_security_member(struct xen_flask_transition *arg) { int rv; rv = domain_has_security(current->domain, SECURITY__COMPUTE_MEMBER); if ( rv ) return rv; rv = security_member_sid(arg->ssid, arg->tsid, arg->tclass, &arg->newsid); return rv; } static int flask_security_setenforce(struct xen_flask_setenforce *arg) { int enforce = !!(arg->enforcing); int rv; if ( enforce == flask_enforcing ) return 0; rv = domain_has_security(current->domain, SECURITY__SETENFORCE); if ( rv ) return rv; flask_enforcing = enforce; if ( flask_enforcing ) avc_ss_reset(0); return 0; } static int flask_security_context(struct xen_flask_sid_context *arg) { int rv; char *buf; rv = domain_has_security(current->domain, SECURITY__CHECK_CONTEXT); if ( rv ) return rv; rv = flask_copyin_string(arg->context, &buf, arg->size, PAGE_SIZE); if ( rv ) return rv; rv = security_context_to_sid(buf, arg->size, &arg->sid); if ( rv < 0 ) goto out; out: xfree(buf); return rv; } static int flask_security_sid(struct xen_flask_sid_context *arg) { int rv; char *context; u32 len; rv = domain_has_security(current->domain, SECURITY__CHECK_CONTEXT); if ( rv ) return rv; rv = security_sid_to_context(arg->sid, &context, &len); if ( rv < 0 ) return rv; 
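    /*
     * Size negotiation: arg->size is always updated to the length the
     * context string needs, and -ERANGE is returned when the guest buffer
     * was too small, so the caller can retry with a larger buffer.
     */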
rv = 0; if ( len > arg->size ) rv = -ERANGE; arg->size = len; if ( !rv && copy_to_guest(arg->context, context, len) ) rv = -EFAULT; xfree(context); return rv; } int flask_disable(void) { static int flask_disabled = 0; if ( ss_initialized ) { /* Not permitted after initial policy load. */ return -EINVAL; } if ( flask_disabled ) { /* Only do this once. */ return -EINVAL; } printk("Flask: Disabled at runtime.\n"); flask_disabled = 1; /* Reset xsm_ops to the original module. */ xsm_ops = original_ops; return 0; } static int flask_security_setavc_threshold(struct xen_flask_setavc_threshold *arg) { int rv = 0; if ( arg->threshold != avc_cache_threshold ) { rv = domain_has_security(current->domain, SECURITY__SETSECPARAM); if ( rv ) goto out; avc_cache_threshold = arg->threshold; } out: return rv; } static int flask_security_resolve_bool(struct xen_flask_boolean *arg) { char *name; int rv; if ( arg->bool_id != -1 ) return 0; rv = flask_copyin_string(arg->name, &name, arg->size, bool_maxstr); if ( rv ) return rv; arg->bool_id = security_find_bool(name); arg->size = 0; xfree(name); return 0; } static int flask_security_set_bool(struct xen_flask_boolean *arg) { int rv; rv = domain_has_security(current->domain, SECURITY__SETBOOL); if ( rv ) return rv; rv = flask_security_resolve_bool(arg); if ( rv ) return rv; spin_lock(&sel_sem); if ( arg->commit ) { int num; int *values; rv = security_get_bools(&num, NULL, &values, NULL); if ( rv != 0 ) goto out; if ( arg->bool_id >= num ) { xfree(values); rv = -ENOENT; goto out; } values[arg->bool_id] = !!(arg->new_value); arg->enforcing = arg->pending = !!(arg->new_value); if ( bool_pending_values ) bool_pending_values[arg->bool_id] = !!(arg->new_value); rv = security_set_bools(num, values); xfree(values); } else { if ( !bool_pending_values ) rv = flask_security_make_bools(); if ( !rv && arg->bool_id >= bool_num ) rv = -ENOENT; if ( rv ) goto out; bool_pending_values[arg->bool_id] = !!(arg->new_value); arg->pending = !!(arg->new_value); arg->enforcing = security_get_bool_value(arg->bool_id); rv = 0; } out: spin_unlock(&sel_sem); return rv; } static int flask_security_commit_bools(void) { int rv; spin_lock(&sel_sem); rv = domain_has_security(current->domain, SECURITY__SETBOOL); if ( rv ) goto out; if ( bool_pending_values ) rv = security_set_bools(bool_num, bool_pending_values); out: spin_unlock(&sel_sem); return rv; } static int flask_security_get_bool(struct xen_flask_boolean *arg) { int rv; rv = flask_security_resolve_bool(arg); if ( rv ) return rv; spin_lock(&sel_sem); rv = security_get_bool_value(arg->bool_id); if ( rv < 0 ) goto out; arg->enforcing = rv; if ( bool_pending_values ) arg->pending = bool_pending_values[arg->bool_id]; else arg->pending = rv; rv = 0; if ( arg->size ) { char *nameout = security_get_bool_name(arg->bool_id); size_t nameout_len = strlen(nameout); if ( nameout_len > arg->size ) rv = -ERANGE; arg->size = nameout_len; if ( !rv && copy_to_guest(arg->name, nameout, nameout_len) ) rv = -EFAULT; xfree(nameout); } out: spin_unlock(&sel_sem); return rv; } static int flask_security_make_bools(void) { int ret = 0; int num; int *values = NULL; xfree(bool_pending_values); ret = security_get_bools(&num, NULL, &values, &bool_maxstr); if ( ret != 0 ) goto out; bool_num = num; bool_pending_values = values; out: return ret; } #ifdef FLASK_AVC_STATS static int flask_security_avc_cachestats(struct xen_flask_cache_stats *arg) { struct avc_cache_stats *st; if ( arg->cpu >= nr_cpu_ids ) return -ENOENT; if ( !cpu_online(arg->cpu) ) return -ENOENT; st = 
&per_cpu(avc_cache_stats, arg->cpu); arg->lookups = st->lookups; arg->hits = st->hits; arg->misses = st->misses; arg->allocations = st->allocations; arg->reclaims = st->reclaims; arg->frees = st->frees; return 0; } #endif static int flask_security_load(struct xen_flask_load *load) { int ret; void *buf = NULL; ret = domain_has_security(current->domain, SECURITY__LOAD_POLICY); if ( ret ) return ret; if ( load->size > MAX_POLICY_SIZE ) return -EINVAL; buf = xmalloc_bytes(load->size); if ( !buf ) return -ENOMEM; if ( copy_from_guest(buf, load->buffer, load->size) ) { ret = -EFAULT; goto out_free; } spin_lock(&sel_sem); ret = security_load_policy(buf, load->size); if ( ret ) goto out; xfree(bool_pending_values); bool_pending_values = NULL; ret = 0; out: spin_unlock(&sel_sem); out_free: xfree(buf); return ret; } static int flask_ocontext_del(struct xen_flask_ocontext *arg) { int rv; if ( arg->low > arg->high ) return -EINVAL; rv = domain_has_security(current->domain, SECURITY__DEL_OCONTEXT); if ( rv ) return rv; return security_ocontext_del(arg->ocon, arg->low, arg->high); } static int flask_ocontext_add(struct xen_flask_ocontext *arg) { int rv; if ( arg->low > arg->high ) return -EINVAL; rv = domain_has_security(current->domain, SECURITY__ADD_OCONTEXT); if ( rv ) return rv; return security_ocontext_add(arg->ocon, arg->low, arg->high, arg->sid); } static int flask_get_peer_sid(struct xen_flask_peersid *arg) { int rv = -EINVAL; struct domain *d = current->domain; struct domain *peer; struct evtchn *chn; struct domain_security_struct *dsec; spin_lock(&d->event_lock); if ( !port_is_valid(d, arg->evtchn) ) goto out; chn = evtchn_from_port(d, arg->evtchn); if ( chn->state != ECS_INTERDOMAIN ) goto out; peer = chn->u.interdomain.remote_dom; if ( !peer ) goto out; dsec = peer->ssid; arg->sid = dsec->sid; rv = 0; out: spin_unlock(&d->event_lock); return rv; } static int flask_relabel_domain(struct xen_flask_relabel *arg) { int rc; struct domain *d; struct domain_security_struct *csec = current->domain->ssid; struct domain_security_struct *dsec; struct avc_audit_data ad; AVC_AUDIT_DATA_INIT(&ad, NONE); d = rcu_lock_domain_by_any_id(arg->domid); if ( d == NULL ) return -ESRCH; ad.sdom = current->domain; ad.tdom = d; dsec = d->ssid; if ( arg->domid == DOMID_SELF ) { rc = avc_has_perm(dsec->sid, arg->sid, SECCLASS_DOMAIN2, DOMAIN2__RELABELSELF, &ad); if ( rc ) goto out; } else { rc = avc_has_perm(csec->sid, dsec->sid, SECCLASS_DOMAIN2, DOMAIN2__RELABELFROM, &ad); if ( rc ) goto out; rc = avc_has_perm(csec->sid, arg->sid, SECCLASS_DOMAIN2, DOMAIN2__RELABELTO, &ad); if ( rc ) goto out; } rc = avc_has_perm(dsec->sid, arg->sid, SECCLASS_DOMAIN, DOMAIN__TRANSITION, &ad); if ( rc ) goto out; dsec->sid = arg->sid; dsec->self_sid = arg->sid; security_transition_sid(dsec->sid, dsec->sid, SECCLASS_DOMAIN, &dsec->self_sid); if ( d->target ) { struct domain_security_struct *tsec = d->target->ssid; security_transition_sid(tsec->sid, dsec->sid, SECCLASS_DOMAIN, &dsec->target_sid); } out: rcu_unlock_domain(d); return rc; } long do_flask_op(XEN_GUEST_HANDLE_PARAM(xsm_op_t) u_flask_op) { xen_flask_op_t op; int rv; if ( copy_from_guest(&op, u_flask_op, 1) ) return -EFAULT; if ( op.interface_version != XEN_FLASK_INTERFACE_VERSION ) return -ENOSYS; switch ( op.cmd ) { case FLASK_LOAD: rv = flask_security_load(&op.u.load); break; case FLASK_GETENFORCE: rv = flask_enforcing; break; case FLASK_SETENFORCE: rv = flask_security_setenforce(&op.u.enforce); break; case FLASK_CONTEXT_TO_SID: rv = 
flask_security_context(&op.u.sid_context); break; case FLASK_SID_TO_CONTEXT: rv = flask_security_sid(&op.u.sid_context); break; case FLASK_ACCESS: rv = flask_security_access(&op.u.access); break; case FLASK_CREATE: rv = flask_security_create(&op.u.transition); break; case FLASK_RELABEL: rv = flask_security_relabel(&op.u.transition); break; case FLASK_USER: rv = flask_security_user(&op.u.userlist); break; case FLASK_POLICYVERS: rv = POLICYDB_VERSION_MAX; break; case FLASK_GETBOOL: rv = flask_security_get_bool(&op.u.boolean); break; case FLASK_SETBOOL: rv = flask_security_set_bool(&op.u.boolean); break; case FLASK_COMMITBOOLS: rv = flask_security_commit_bools(); break; case FLASK_MLS: rv = flask_mls_enabled; break; case FLASK_DISABLE: rv = flask_disable(); break; case FLASK_GETAVC_THRESHOLD: rv = avc_cache_threshold; break; case FLASK_SETAVC_THRESHOLD: rv = flask_security_setavc_threshold(&op.u.setavc_threshold); break; case FLASK_AVC_HASHSTATS: rv = avc_get_hash_stats(&op.u.hash_stats); break; #ifdef FLASK_AVC_STATS case FLASK_AVC_CACHESTATS: rv = flask_security_avc_cachestats(&op.u.cache_stats); break; #endif case FLASK_MEMBER: rv = flask_security_member(&op.u.transition); break; case FLASK_ADD_OCONTEXT: rv = flask_ocontext_add(&op.u.ocontext); break; case FLASK_DEL_OCONTEXT: rv = flask_ocontext_del(&op.u.ocontext); break; case FLASK_GET_PEER_SID: rv = flask_get_peer_sid(&op.u.peersid); break; case FLASK_RELABEL_DOMAIN: rv = flask_relabel_domain(&op.u.relabel); break; default: rv = -ENOSYS; } if ( rv < 0 ) goto out; if ( (FLASK_COPY_OUT&(1UL< * Stefan Berger, * * Contributors: * Michael LeMay, * George Coker, * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. * * * This file contains the XSM policy init functions for Xen. * */ #include #include #include char *__initdata policy_buffer = NULL; u32 __initdata policy_size = 0; int __init xsm_policy_init(unsigned long *module_map, const multiboot_info_t *mbi, void *(*bootstrap_map)(const module_t *)) { int i; module_t *mod = (module_t *)__va(mbi->mods_addr); int rc = 0; u32 *_policy_start; unsigned long _policy_len; /* * Try all modules and see whichever could be the binary policy. * Adjust module_map for the module that is the binary policy. */ for ( i = mbi->mods_count-1; i >= 1; i-- ) { if ( !test_bit(i, module_map) ) continue; _policy_start = bootstrap_map(mod + i); _policy_len = mod[i].mod_end; if ( (xsm_magic_t)(*_policy_start) == XSM_MAGIC ) { policy_buffer = (char *)_policy_start; policy_size = _policy_len; printk("Policy len 0x%lx, start at %p.\n", _policy_len,_policy_start); __clear_bit(i, module_map); break; } bootstrap_map(NULL); } return rc; } xen-4.4.0/xen/xsm/dummy.c0000664000175000017500000001246712307313555013355 0ustar smbsmb/* * This work is based on the LSM implementation in Linux 2.6.13.4. * * Author: George Coker, * * Contributors: Michael LeMay, * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. 
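 *
 * xsm_fixup_ops() below fills any hook an XSM module leaves NULL with the
 * corresponding xsm_<hook> default, so a module only needs to implement
 * the hooks it actually cares about.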
*/ #define XSM_NO_WRAPPERS #define XSM_INLINE /* */ #include struct xsm_operations dummy_xsm_ops; #define set_to_dummy_if_null(ops, function) \ do { \ if ( !ops->function ) \ { \ ops->function = xsm_##function; \ if (ops != &dummy_xsm_ops) \ dprintk(XENLOG_DEBUG, "Had to override the " #function \ " security operation with the dummy one.\n"); \ } \ } while (0) void xsm_fixup_ops (struct xsm_operations *ops) { set_to_dummy_if_null(ops, security_domaininfo); set_to_dummy_if_null(ops, domain_create); set_to_dummy_if_null(ops, getdomaininfo); set_to_dummy_if_null(ops, domctl_scheduler_op); set_to_dummy_if_null(ops, sysctl_scheduler_op); set_to_dummy_if_null(ops, set_target); set_to_dummy_if_null(ops, domctl); set_to_dummy_if_null(ops, sysctl); set_to_dummy_if_null(ops, readconsole); set_to_dummy_if_null(ops, do_mca); set_to_dummy_if_null(ops, evtchn_unbound); set_to_dummy_if_null(ops, evtchn_interdomain); set_to_dummy_if_null(ops, evtchn_close_post); set_to_dummy_if_null(ops, evtchn_send); set_to_dummy_if_null(ops, evtchn_status); set_to_dummy_if_null(ops, evtchn_reset); set_to_dummy_if_null(ops, grant_mapref); set_to_dummy_if_null(ops, grant_unmapref); set_to_dummy_if_null(ops, grant_setup); set_to_dummy_if_null(ops, grant_transfer); set_to_dummy_if_null(ops, grant_copy); set_to_dummy_if_null(ops, grant_query_size); set_to_dummy_if_null(ops, alloc_security_domain); set_to_dummy_if_null(ops, free_security_domain); set_to_dummy_if_null(ops, alloc_security_evtchn); set_to_dummy_if_null(ops, free_security_evtchn); set_to_dummy_if_null(ops, show_security_evtchn); set_to_dummy_if_null(ops, get_pod_target); set_to_dummy_if_null(ops, set_pod_target); set_to_dummy_if_null(ops, memory_exchange); set_to_dummy_if_null(ops, memory_adjust_reservation); set_to_dummy_if_null(ops, memory_stat_reservation); set_to_dummy_if_null(ops, memory_pin_page); set_to_dummy_if_null(ops, claim_pages); set_to_dummy_if_null(ops, console_io); set_to_dummy_if_null(ops, profile); set_to_dummy_if_null(ops, kexec); set_to_dummy_if_null(ops, schedop_shutdown); set_to_dummy_if_null(ops, show_irq_sid); set_to_dummy_if_null(ops, map_domain_pirq); set_to_dummy_if_null(ops, map_domain_irq); set_to_dummy_if_null(ops, unmap_domain_pirq); set_to_dummy_if_null(ops, unmap_domain_irq); set_to_dummy_if_null(ops, irq_permission); set_to_dummy_if_null(ops, iomem_permission); set_to_dummy_if_null(ops, iomem_mapping); set_to_dummy_if_null(ops, pci_config_permission); set_to_dummy_if_null(ops, get_device_group); set_to_dummy_if_null(ops, test_assign_device); set_to_dummy_if_null(ops, assign_device); set_to_dummy_if_null(ops, deassign_device); set_to_dummy_if_null(ops, resource_plug_core); set_to_dummy_if_null(ops, resource_unplug_core); set_to_dummy_if_null(ops, resource_plug_pci); set_to_dummy_if_null(ops, resource_unplug_pci); set_to_dummy_if_null(ops, resource_setup_pci); set_to_dummy_if_null(ops, resource_setup_gsi); set_to_dummy_if_null(ops, resource_setup_misc); set_to_dummy_if_null(ops, page_offline); set_to_dummy_if_null(ops, tmem_op); set_to_dummy_if_null(ops, tmem_control); set_to_dummy_if_null(ops, hvm_param); set_to_dummy_if_null(ops, hvm_param_nested); set_to_dummy_if_null(ops, do_xsm_op); set_to_dummy_if_null(ops, add_to_physmap); set_to_dummy_if_null(ops, remove_from_physmap); #ifdef CONFIG_X86 set_to_dummy_if_null(ops, shadow_control); set_to_dummy_if_null(ops, hvm_set_pci_intx_level); set_to_dummy_if_null(ops, hvm_set_isa_irq_level); set_to_dummy_if_null(ops, hvm_set_pci_link_route); set_to_dummy_if_null(ops, hvm_inject_msi); 
set_to_dummy_if_null(ops, mem_event_control); set_to_dummy_if_null(ops, mem_event_op); set_to_dummy_if_null(ops, mem_sharing_op); set_to_dummy_if_null(ops, apic); set_to_dummy_if_null(ops, platform_op); set_to_dummy_if_null(ops, machine_memory_map); set_to_dummy_if_null(ops, domain_memory_map); set_to_dummy_if_null(ops, mmu_update); set_to_dummy_if_null(ops, mmuext_op); set_to_dummy_if_null(ops, update_va_mapping); set_to_dummy_if_null(ops, priv_mapping); set_to_dummy_if_null(ops, bind_pt_irq); set_to_dummy_if_null(ops, unbind_pt_irq); set_to_dummy_if_null(ops, ioport_permission); set_to_dummy_if_null(ops, ioport_mapping); #endif #ifdef CONFIG_ARM set_to_dummy_if_null(ops, map_gmfn_foreign); #endif } xen-4.4.0/xen/COPYING0000664000175000017500000004536112307313555012301 0ustar smbsmb XEN NOTICE ========== This license does *not* cover guest operating systems that use Xen services via normal hypercalls - this is merely considered normal use of Xen, and does *not* fall under the heading of "derived work". Also note that the GPL below is copyrighted by the Free Software Foundation, but the instance of code that it refers to (the Xen virtual machine monitor) is copyrighted by me and others who actually wrote it. A few files are licensed under both GPL and a weaker BSD-style license. This includes all files within the subdirectory include/public, as described in include/public/COPYING. All such files include the non-GPL license text as a source-code comment. Although the license text refers generically to "the software", the non-GPL license applies *only* to those source files that explicitly include the non-GPL license text. Note that the only valid version of the GPL as far as Xen is concerned is _this_ particular version of the license (i.e., *only* v2, not v2.2 or v3.x or whatever), unless explicitly otherwise stated. -- Keir Fraser (on behalf of the Xen team) ===================================================================== GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. 
For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. 
c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. 
If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License. xen-4.4.0/xen/common/0000775000175000017500000000000012307313555012525 5ustar smbsmbxen-4.4.0/xen/common/domctl.c0000664000175000017500000005762612307313555014173 0ustar smbsmb/****************************************************************************** * domctl.c * * Domain management operations. For use by node control stack. 
* * Copyright (c) 2002-2006, K A Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static DEFINE_SPINLOCK(domctl_lock); DEFINE_SPINLOCK(vcpu_alloc_lock); int bitmap_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_bitmap, const unsigned long *bitmap, unsigned int nbits) { unsigned int guest_bytes, copy_bytes, i; uint8_t zero = 0; int err = 0; uint8_t *bytemap = xmalloc_array(uint8_t, (nbits + 7) / 8); if ( !bytemap ) return -ENOMEM; guest_bytes = (xenctl_bitmap->nr_bits + 7) / 8; copy_bytes = min_t(unsigned int, guest_bytes, (nbits + 7) / 8); bitmap_long_to_byte(bytemap, bitmap, nbits); if ( copy_bytes != 0 ) if ( copy_to_guest(xenctl_bitmap->bitmap, bytemap, copy_bytes) ) err = -EFAULT; for ( i = copy_bytes; !err && i < guest_bytes; i++ ) if ( copy_to_guest_offset(xenctl_bitmap->bitmap, i, &zero, 1) ) err = -EFAULT; xfree(bytemap); return err; } int xenctl_bitmap_to_bitmap(unsigned long *bitmap, const struct xenctl_bitmap *xenctl_bitmap, unsigned int nbits) { unsigned int guest_bytes, copy_bytes; int err = 0; uint8_t *bytemap = xzalloc_array(uint8_t, (nbits + 7) / 8); if ( !bytemap ) return -ENOMEM; guest_bytes = (xenctl_bitmap->nr_bits + 7) / 8; copy_bytes = min_t(unsigned int, guest_bytes, (nbits + 7) / 8); if ( copy_bytes != 0 ) { if ( copy_from_guest(bytemap, xenctl_bitmap->bitmap, copy_bytes) ) err = -EFAULT; if ( (xenctl_bitmap->nr_bits & 7) && (guest_bytes == copy_bytes) ) bytemap[guest_bytes-1] &= ~(0xff << (xenctl_bitmap->nr_bits & 7)); } if ( !err ) bitmap_byte_to_long(bitmap, bytemap, nbits); xfree(bytemap); return err; } int cpumask_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_cpumap, const cpumask_t *cpumask) { return bitmap_to_xenctl_bitmap(xenctl_cpumap, cpumask_bits(cpumask), nr_cpu_ids); } int xenctl_bitmap_to_cpumask(cpumask_var_t *cpumask, const struct xenctl_bitmap *xenctl_cpumap) { int err = 0; if ( alloc_cpumask_var(cpumask) ) { err = xenctl_bitmap_to_bitmap(cpumask_bits(*cpumask), xenctl_cpumap, nr_cpu_ids); /* In case of error, cleanup is up to us, as the caller won't care! 
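 * xenctl_bitmap_to_bitmap() can still fail after the cpumask has been
 * allocated (-ENOMEM for its temporary bytemap, or -EFAULT while copying
 * from the guest), and the caller only ever sees the error code, so the
 * partially filled cpumask is freed here rather than leaked.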
*/ if ( err ) free_cpumask_var(*cpumask); } else err = -ENOMEM; return err; } int nodemask_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_nodemap, const nodemask_t *nodemask) { return bitmap_to_xenctl_bitmap(xenctl_nodemap, nodes_addr(*nodemask), MAX_NUMNODES); } int xenctl_bitmap_to_nodemask(nodemask_t *nodemask, const struct xenctl_bitmap *xenctl_nodemap) { return xenctl_bitmap_to_bitmap(nodes_addr(*nodemask), xenctl_nodemap, MAX_NUMNODES); } static inline int is_free_domid(domid_t dom) { struct domain *d; if ( dom >= DOMID_FIRST_RESERVED ) return 0; if ( (d = rcu_lock_domain_by_id(dom)) == NULL ) return 1; rcu_unlock_domain(d); return 0; } void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info) { struct vcpu *v; u64 cpu_time = 0; int flags = XEN_DOMINF_blocked; struct vcpu_runstate_info runstate; info->domain = d->domain_id; info->nr_online_vcpus = 0; info->ssidref = 0; /* * - domain is marked as blocked only if all its vcpus are blocked * - domain is marked as running if any of its vcpus is running */ for_each_vcpu ( d, v ) { vcpu_runstate_get(v, &runstate); cpu_time += runstate.time[RUNSTATE_running]; info->max_vcpu_id = v->vcpu_id; if ( !test_bit(_VPF_down, &v->pause_flags) ) { if ( !(v->pause_flags & VPF_blocked) ) flags &= ~XEN_DOMINF_blocked; if ( v->is_running ) flags |= XEN_DOMINF_running; info->nr_online_vcpus++; } } info->cpu_time = cpu_time; info->flags = (info->nr_online_vcpus ? flags : 0) | ((d->is_dying == DOMDYING_dead) ? XEN_DOMINF_dying : 0) | (d->is_shut_down ? XEN_DOMINF_shutdown : 0) | (d->is_paused_by_controller ? XEN_DOMINF_paused : 0) | (d->debugger_attached ? XEN_DOMINF_debugged : 0) | d->shutdown_code << XEN_DOMINF_shutdownshift; switch ( d->guest_type ) { case guest_type_hvm: info->flags |= XEN_DOMINF_hvm_guest; break; case guest_type_pvh: info->flags |= XEN_DOMINF_pvh_guest; break; default: break; } xsm_security_domaininfo(d, info); info->tot_pages = d->tot_pages; info->max_pages = d->max_pages; info->outstanding_pages = d->outstanding_pages; info->shr_pages = atomic_read(&d->shr_pages); info->paged_pages = atomic_read(&d->paged_pages); info->shared_info_frame = mfn_to_gmfn(d, virt_to_mfn(d->shared_info)); BUG_ON(SHARED_M2P(info->shared_info_frame)); info->cpupool = d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE; memcpy(info->handle, d->handle, sizeof(xen_domain_handle_t)); } static unsigned int default_vcpu0_location(cpumask_t *online) { struct domain *d; struct vcpu *v; unsigned int i, cpu, nr_cpus, *cnt; cpumask_t cpu_exclude_map; /* Do an initial CPU placement. Pick the least-populated CPU. */ nr_cpus = cpumask_last(&cpu_online_map) + 1; cnt = xzalloc_array(unsigned int, nr_cpus); if ( cnt ) { rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) for_each_vcpu ( d, v ) if ( !test_bit(_VPF_down, &v->pause_flags) && ((cpu = v->processor) < nr_cpus) ) cnt[cpu]++; rcu_read_unlock(&domlist_read_lock); } /* * If we're on a HT system, we only auto-allocate to a non-primary HT. We * favour high numbered CPUs in the event of a tie. 
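 * Concretely: the exclusion map starts with CPU0's sibling mask, primary
 * threads that have siblings are skipped, and every visited CPU's siblings
 * are added to the exclusion map so at most one thread per core is
 * considered. Using "cnt[i] <= cnt[cpu]" (rather than "<") is what makes
 * the higher-numbered CPU win a tie.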
*/ cpumask_copy(&cpu_exclude_map, per_cpu(cpu_sibling_mask, 0)); cpu = cpumask_first(&cpu_exclude_map); i = cpumask_next(cpu, &cpu_exclude_map); if ( i < nr_cpu_ids ) cpu = i; for_each_cpu(i, online) { if ( cpumask_test_cpu(i, &cpu_exclude_map) ) continue; if ( (i == cpumask_first(per_cpu(cpu_sibling_mask, i))) && (cpumask_next(i, per_cpu(cpu_sibling_mask, i)) < nr_cpu_ids) ) continue; cpumask_or(&cpu_exclude_map, &cpu_exclude_map, per_cpu(cpu_sibling_mask, i)); if ( !cnt || cnt[i] <= cnt[cpu] ) cpu = i; } xfree(cnt); return cpu; } bool_t domctl_lock_acquire(void) { /* * Caller may try to pause its own VCPUs. We must prevent deadlock * against other non-domctl routines which try to do the same. */ if ( !spin_trylock(¤t->domain->hypercall_deadlock_mutex) ) return 0; /* * Trylock here is paranoia if we have multiple privileged domains. Then * we could have one domain trying to pause another which is spinning * on domctl_lock -- results in deadlock. */ if ( spin_trylock(&domctl_lock) ) return 1; spin_unlock(¤t->domain->hypercall_deadlock_mutex); return 0; } void domctl_lock_release(void) { spin_unlock(&domctl_lock); spin_unlock(¤t->domain->hypercall_deadlock_mutex); } long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) { long ret = 0; bool_t copyback = 0; struct xen_domctl curop, *op = &curop; struct domain *d; if ( copy_from_guest(op, u_domctl, 1) ) return -EFAULT; if ( op->interface_version != XEN_DOMCTL_INTERFACE_VERSION ) return -EACCES; switch ( op->cmd ) { case XEN_DOMCTL_createdomain: case XEN_DOMCTL_getdomaininfo: case XEN_DOMCTL_test_assign_device: d = NULL; break; default: d = rcu_lock_domain_by_id(op->domain); if ( d == NULL ) return -ESRCH; } ret = xsm_domctl(XSM_OTHER, d, op->cmd); if ( ret ) goto domctl_out_unlock_domonly; if ( !domctl_lock_acquire() ) { if ( d ) rcu_unlock_domain(d); return hypercall_create_continuation( __HYPERVISOR_domctl, "h", u_domctl); } switch ( op->cmd ) { case XEN_DOMCTL_setvcpucontext: { vcpu_guest_context_u c = { .nat = NULL }; unsigned int vcpu = op->u.vcpucontext.vcpu; struct vcpu *v; ret = -EINVAL; if ( (d == current->domain) || /* no domain_pause() */ (vcpu >= d->max_vcpus) || ((v = d->vcpu[vcpu]) == NULL) ) break; if ( guest_handle_is_null(op->u.vcpucontext.ctxt) ) { ret = vcpu_reset(v); if ( ret == -EAGAIN ) ret = hypercall_create_continuation( __HYPERVISOR_domctl, "h", u_domctl); break; } #ifdef CONFIG_COMPAT BUILD_BUG_ON(sizeof(struct vcpu_guest_context) < sizeof(struct compat_vcpu_guest_context)); #endif ret = -ENOMEM; if ( (c.nat = alloc_vcpu_guest_context()) == NULL ) break; #ifdef CONFIG_COMPAT if ( !is_pv_32on64_vcpu(v) ) ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1); else ret = copy_from_guest(c.cmp, guest_handle_cast(op->u.vcpucontext.ctxt, void), 1); #else ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1); #endif ret = ret ? 
-EFAULT : 0; if ( ret == 0 ) { domain_pause(d); ret = arch_set_info_guest(v, c); domain_unpause(d); if ( ret == -EAGAIN ) ret = hypercall_create_continuation( __HYPERVISOR_domctl, "h", u_domctl); } free_vcpu_guest_context(c.nat); } break; case XEN_DOMCTL_pausedomain: { ret = -EINVAL; if ( d != current->domain ) { domain_pause_by_systemcontroller(d); ret = 0; } } break; case XEN_DOMCTL_unpausedomain: { domain_unpause_by_systemcontroller(d); ret = 0; } break; case XEN_DOMCTL_resumedomain: { domain_resume(d); ret = 0; } break; case XEN_DOMCTL_createdomain: { domid_t dom; static domid_t rover = 0; unsigned int domcr_flags; ret = -EINVAL; if ( supervisor_mode_kernel || (op->u.createdomain.flags & ~(XEN_DOMCTL_CDF_hvm_guest | XEN_DOMCTL_CDF_pvh_guest | XEN_DOMCTL_CDF_hap | XEN_DOMCTL_CDF_s3_integrity | XEN_DOMCTL_CDF_oos_off)) ) break; dom = op->domain; if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) ) { ret = -EINVAL; if ( !is_free_domid(dom) ) break; } else { for ( dom = rover + 1; dom != rover; dom++ ) { if ( dom == DOMID_FIRST_RESERVED ) dom = 0; if ( is_free_domid(dom) ) break; } ret = -ENOMEM; if ( dom == rover ) break; rover = dom; } if ( (op->u.createdomain.flags & XEN_DOMCTL_CDF_hvm_guest) && (op->u.createdomain.flags & XEN_DOMCTL_CDF_pvh_guest) ) return -EINVAL; domcr_flags = 0; if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_hvm_guest ) domcr_flags |= DOMCRF_hvm; if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_pvh_guest ) domcr_flags |= DOMCRF_pvh; if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_hap ) domcr_flags |= DOMCRF_hap; if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_s3_integrity ) domcr_flags |= DOMCRF_s3_integrity; if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_oos_off ) domcr_flags |= DOMCRF_oos_off; d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref); if ( IS_ERR(d) ) { ret = PTR_ERR(d); d = NULL; break; } ret = 0; memcpy(d->handle, op->u.createdomain.handle, sizeof(xen_domain_handle_t)); op->domain = d->domain_id; copyback = 1; d = NULL; } break; case XEN_DOMCTL_max_vcpus: { unsigned int i, max = op->u.max_vcpus.max, cpu; cpumask_t *online; ret = -EINVAL; if ( (d == current->domain) || /* no domain_pause() */ (max > MAX_VIRT_CPUS) || (is_hvm_domain(d) && (max > MAX_HVM_VCPUS)) ) break; /* Until Xenoprof can dynamically grow its vcpu-s array... */ if ( d->xenoprof ) { ret = -EAGAIN; break; } /* Needed, for example, to ensure writable p.t. state is synced. */ domain_pause(d); /* * Certain operations (e.g. CPU microcode updates) modify data which is * used during VCPU allocation/initialization */ while ( !spin_trylock(&vcpu_alloc_lock) ) { if ( hypercall_preempt_check() ) { ret = hypercall_create_continuation( __HYPERVISOR_domctl, "h", u_domctl); goto maxvcpu_out_novcpulock; } } /* We cannot reduce maximum VCPUs. */ ret = -EINVAL; if ( (max < d->max_vcpus) && (d->vcpu[max] != NULL) ) goto maxvcpu_out; /* * For now don't allow increasing the vcpu count from a non-zero * value: This code and all readers of d->vcpu would otherwise need * to be converted to use RCU, but at present there's no tools side * code path that would issue such a request. */ ret = -EBUSY; if ( (d->max_vcpus > 0) && (max > d->max_vcpus) ) goto maxvcpu_out; ret = -ENOMEM; online = cpupool_online_cpumask(d->cpupool); if ( max > d->max_vcpus ) { struct vcpu **vcpus; BUG_ON(d->vcpu != NULL); BUG_ON(d->max_vcpus != 0); if ( (vcpus = xzalloc_array(struct vcpu *, max)) == NULL ) goto maxvcpu_out; /* Install vcpu array /then/ update max_vcpus. 
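 * The smp_wmb() between the two stores below ensures that any CPU which
 * observes the updated d->max_vcpus also sees the populated d->vcpu
 * pointer, so readers that bound their index by max_vcpus never
 * dereference a NULL array.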
*/ d->vcpu = vcpus; smp_wmb(); d->max_vcpus = max; } for ( i = 0; i < max; i++ ) { if ( d->vcpu[i] != NULL ) continue; cpu = (i == 0) ? default_vcpu0_location(online) : cpumask_cycle(d->vcpu[i-1]->processor, online); if ( alloc_vcpu(d, i, cpu) == NULL ) goto maxvcpu_out; } ret = 0; maxvcpu_out: spin_unlock(&vcpu_alloc_lock); maxvcpu_out_novcpulock: domain_unpause(d); } break; case XEN_DOMCTL_destroydomain: { ret = domain_kill(d); } break; case XEN_DOMCTL_setnodeaffinity: { nodemask_t new_affinity; ret = xenctl_bitmap_to_nodemask(&new_affinity, &op->u.nodeaffinity.nodemap); if ( !ret ) ret = domain_set_node_affinity(d, &new_affinity); } break; case XEN_DOMCTL_getnodeaffinity: { ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap, &d->node_affinity); } break; case XEN_DOMCTL_setvcpuaffinity: case XEN_DOMCTL_getvcpuaffinity: { struct vcpu *v; ret = -EINVAL; if ( op->u.vcpuaffinity.vcpu >= d->max_vcpus ) break; ret = -ESRCH; if ( (v = d->vcpu[op->u.vcpuaffinity.vcpu]) == NULL ) break; if ( op->cmd == XEN_DOMCTL_setvcpuaffinity ) { cpumask_var_t new_affinity; ret = xenctl_bitmap_to_cpumask( &new_affinity, &op->u.vcpuaffinity.cpumap); if ( !ret ) { ret = vcpu_set_affinity(v, new_affinity); free_cpumask_var(new_affinity); } } else { ret = cpumask_to_xenctl_bitmap( &op->u.vcpuaffinity.cpumap, v->cpu_affinity); } } break; case XEN_DOMCTL_scheduler_op: { ret = sched_adjust(d, &op->u.scheduler_op); copyback = 1; } break; case XEN_DOMCTL_getdomaininfo: { domid_t dom = op->domain; rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) if ( d->domain_id >= dom ) break; if ( d == NULL ) { rcu_read_unlock(&domlist_read_lock); ret = -ESRCH; break; } ret = xsm_getdomaininfo(XSM_HOOK, d); if ( ret ) goto getdomaininfo_out; getdomaininfo(d, &op->u.getdomaininfo); op->domain = op->u.getdomaininfo.domain; copyback = 1; getdomaininfo_out: rcu_read_unlock(&domlist_read_lock); d = NULL; } break; case XEN_DOMCTL_getvcpucontext: { vcpu_guest_context_u c = { .nat = NULL }; struct vcpu *v; ret = -EINVAL; if ( op->u.vcpucontext.vcpu >= d->max_vcpus || (v = d->vcpu[op->u.vcpucontext.vcpu]) == NULL || v == current ) /* no vcpu_pause() */ goto getvcpucontext_out; ret = -ENODATA; if ( !v->is_initialised ) goto getvcpucontext_out; #ifdef CONFIG_COMPAT BUILD_BUG_ON(sizeof(struct vcpu_guest_context) < sizeof(struct compat_vcpu_guest_context)); #endif ret = -ENOMEM; if ( (c.nat = xmalloc(struct vcpu_guest_context)) == NULL ) goto getvcpucontext_out; vcpu_pause(v); arch_get_info_guest(v, c); ret = 0; vcpu_unpause(v); #ifdef CONFIG_COMPAT if ( !is_pv_32on64_vcpu(v) ) ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1); else ret = copy_to_guest(guest_handle_cast(op->u.vcpucontext.ctxt, void), c.cmp, 1); #else ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1); #endif if ( ret ) ret = -EFAULT; copyback = 1; getvcpucontext_out: xfree(c.nat); } break; case XEN_DOMCTL_getvcpuinfo: { struct vcpu *v; struct vcpu_runstate_info runstate; ret = -EINVAL; if ( op->u.getvcpuinfo.vcpu >= d->max_vcpus ) break; ret = -ESRCH; if ( (v = d->vcpu[op->u.getvcpuinfo.vcpu]) == NULL ) break; vcpu_runstate_get(v, &runstate); op->u.getvcpuinfo.online = !test_bit(_VPF_down, &v->pause_flags); op->u.getvcpuinfo.blocked = test_bit(_VPF_blocked, &v->pause_flags); op->u.getvcpuinfo.running = v->is_running; op->u.getvcpuinfo.cpu_time = runstate.time[RUNSTATE_running]; op->u.getvcpuinfo.cpu = v->processor; ret = 0; copyback = 1; } break; case XEN_DOMCTL_max_mem: { unsigned long new_max; ret = -EINVAL; new_max = op->u.max_mem.max_memkb >> 
(PAGE_SHIFT-10); spin_lock(&d->page_alloc_lock); /* * NB. We removed a check that new_max >= current tot_pages; this means * that the domain will now be allowed to "ratchet" down to new_max. In * the meantime, while tot > max, all new allocations are disallowed. */ d->max_pages = new_max; ret = 0; spin_unlock(&d->page_alloc_lock); } break; case XEN_DOMCTL_setdomainhandle: { memcpy(d->handle, op->u.setdomainhandle.handle, sizeof(xen_domain_handle_t)); ret = 0; } break; case XEN_DOMCTL_setdebugging: { ret = -EINVAL; if ( d == current->domain ) /* no domain_pause() */ break; domain_pause(d); d->debugger_attached = !!op->u.setdebugging.enable; domain_unpause(d); /* causes guest to latch new status */ ret = 0; } break; case XEN_DOMCTL_irq_permission: { unsigned int pirq = op->u.irq_permission.pirq; int allow = op->u.irq_permission.allow_access; if ( pirq >= d->nr_pirqs ) ret = -EINVAL; else if ( xsm_irq_permission(XSM_HOOK, d, pirq, allow) ) ret = -EPERM; else if ( allow ) ret = pirq_permit_access(d, pirq); else ret = pirq_deny_access(d, pirq); } break; case XEN_DOMCTL_iomem_permission: { unsigned long mfn = op->u.iomem_permission.first_mfn; unsigned long nr_mfns = op->u.iomem_permission.nr_mfns; int allow = op->u.iomem_permission.allow_access; ret = -EINVAL; if ( (mfn + nr_mfns - 1) < mfn ) /* wrap? */ break; if ( xsm_iomem_permission(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, allow) ) ret = -EPERM; else if ( allow ) ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); else ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); } break; case XEN_DOMCTL_settimeoffset: { domain_set_time_offset(d, op->u.settimeoffset.time_offset_seconds); ret = 0; } break; case XEN_DOMCTL_set_target: { struct domain *e; ret = -ESRCH; e = get_domain_by_id(op->u.set_target.target); if ( e == NULL ) break; ret = -EINVAL; if ( (d == e) || (d->target != NULL) ) { put_domain(e); break; } ret = xsm_set_target(XSM_HOOK, d, e); if ( ret ) { put_domain(e); break; } /* Hold reference on @e until we destroy @d. */ d->target = e; ret = 0; } break; case XEN_DOMCTL_subscribe: { d->suspend_evtchn = op->u.subscribe.port; } break; case XEN_DOMCTL_disable_migrate: { d->disable_migrate = op->u.disable_migrate.disable; } break; case XEN_DOMCTL_set_virq_handler: { uint32_t virq = op->u.set_virq_handler.virq; ret = set_global_virq_handler(d, virq); } break; case XEN_DOMCTL_set_max_evtchn: { d->max_evtchn_port = min_t(unsigned int, op->u.set_max_evtchn.max_port, INT_MAX); } break; default: ret = arch_do_domctl(op, d, u_domctl); break; } domctl_lock_release(); domctl_out_unlock_domonly: if ( d ) rcu_unlock_domain(d); if ( copyback && __copy_to_guest(u_domctl, op, 1) ) ret = -EFAULT; return ret; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/xz/0000775000175000017500000000000012307313555013166 5ustar smbsmbxen-4.4.0/xen/common/xz/dec_bcj.c0000664000175000017500000003317512307313555014714 0ustar smbsmb/* * Branch/Call/Jump (BCJ) filter decoders * * Authors: Lasse Collin * Igor Pavlov * * This file has been put into the public domain. * You can do whatever you want with this file. */ #include "private.h" /* * The rest of the file is inside this ifdef. It makes things a little more * convenient when building without support for any BCJ filters. 
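 * XZ_DEC_BCJ is normally not set directly: private.h derives it whenever at
 * least one of the per-architecture options (XZ_DEC_X86, XZ_DEC_POWERPC,
 * XZ_DEC_IA64, XZ_DEC_ARM, XZ_DEC_ARMTHUMB, XZ_DEC_SPARC) is defined, so
 * with no filters selected this whole translation unit compiles to nothing.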
*/ #ifdef XZ_DEC_BCJ struct xz_dec_bcj { /* Type of the BCJ filter being used */ enum { BCJ_X86 = 4, /* x86 or x86-64 */ BCJ_POWERPC = 5, /* Big endian only */ BCJ_IA64 = 6, /* Big or little endian */ BCJ_ARM = 7, /* Little endian only */ BCJ_ARMTHUMB = 8, /* Little endian only */ BCJ_SPARC = 9 /* Big or little endian */ } type; /* * Return value of the next filter in the chain. We need to preserve * this information across calls, because we must not call the next * filter anymore once it has returned XZ_STREAM_END. */ enum xz_ret ret; /* True if we are operating in single-call mode. */ bool_t single_call; /* * Absolute position relative to the beginning of the uncompressed * data (in a single .xz Block). We care only about the lowest 32 * bits so this doesn't need to be uint64_t even with big files. */ uint32_t pos; /* x86 filter state */ uint32_t x86_prev_mask; /* Temporary space to hold the variables from struct xz_buf */ uint8_t *out; size_t out_pos; size_t out_size; struct { /* Amount of already filtered data in the beginning of buf */ size_t filtered; /* Total amount of data currently stored in buf */ size_t size; /* * Buffer to hold a mix of filtered and unfiltered data. This * needs to be big enough to hold Alignment + 2 * Look-ahead: * * Type Alignment Look-ahead * x86 1 4 * PowerPC 4 0 * IA-64 16 0 * ARM 4 0 * ARM-Thumb 2 2 * SPARC 4 0 */ uint8_t buf[16]; } temp; }; #ifdef XZ_DEC_X86 /* * This is used to test the most significant byte of a memory address * in an x86 instruction. */ static inline int INIT bcj_x86_test_msbyte(uint8_t b) { return b == 0x00 || b == 0xFF; } static size_t INIT bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { static const bool_t mask_to_allowed_status[8] = { true, true, true, false, true, false, false, false }; static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 }; size_t i; size_t prev_pos = (size_t)-1; uint32_t prev_mask = s->x86_prev_mask; uint32_t src; uint32_t dest; uint32_t j; uint8_t b; if (size <= 4) return 0; size -= 4; for (i = 0; i < size; ++i) { if ((buf[i] & 0xFE) != 0xE8) continue; prev_pos = i - prev_pos; if (prev_pos > 3) { prev_mask = 0; } else { prev_mask = (prev_mask << (prev_pos - 1)) & 7; if (prev_mask != 0) { b = buf[i + 4 - mask_to_bit_num[prev_mask]]; if (!mask_to_allowed_status[prev_mask] || bcj_x86_test_msbyte(b)) { prev_pos = i; prev_mask = (prev_mask << 1) | 1; continue; } } } prev_pos = i; if (bcj_x86_test_msbyte(buf[i + 4])) { src = get_unaligned_le32(buf + i + 1); while (true) { dest = src - (s->pos + (uint32_t)i + 5); if (prev_mask == 0) break; j = mask_to_bit_num[prev_mask] * 8; b = (uint8_t)(dest >> (24 - j)); if (!bcj_x86_test_msbyte(b)) break; src = dest ^ (((uint32_t)1 << (32 - j)) - 1); } dest &= 0x01FFFFFF; dest |= (uint32_t)0 - (dest & 0x01000000); put_unaligned_le32(dest, buf + i + 1); i += 4; } else { prev_mask = (prev_mask << 1) | 1; } } prev_pos = i - prev_pos; s->x86_prev_mask = prev_pos > 3 ? 
0 : prev_mask << (prev_pos - 1); return i; } #endif #ifdef XZ_DEC_POWERPC static size_t INIT bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { size_t i; uint32_t instr; for (i = 0; i + 4 <= size; i += 4) { instr = get_unaligned_be32(buf + i); if ((instr & 0xFC000003) == 0x48000001) { instr &= 0x03FFFFFC; instr -= s->pos + (uint32_t)i; instr &= 0x03FFFFFC; instr |= 0x48000001; put_unaligned_be32(instr, buf + i); } } return i; } #endif #ifdef XZ_DEC_IA64 static size_t INIT bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { static const uint8_t branch_table[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 6, 6, 0, 0, 7, 7, 4, 4, 0, 0, 4, 4, 0, 0 }; /* * The local variables take a little bit stack space, but it's less * than what LZMA2 decoder takes, so it doesn't make sense to reduce * stack usage here without doing that for the LZMA2 decoder too. */ /* Loop counters */ size_t i; size_t j; /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */ uint32_t slot; /* Bitwise offset of the instruction indicated by slot */ uint32_t bit_pos; /* bit_pos split into byte and bit parts */ uint32_t byte_pos; uint32_t bit_res; /* Address part of an instruction */ uint32_t addr; /* Mask used to detect which instructions to convert */ uint32_t mask; /* 41-bit instruction stored somewhere in the lowest 48 bits */ uint64_t instr; /* Instruction normalized with bit_res for easier manipulation */ uint64_t norm; for (i = 0; i + 16 <= size; i += 16) { mask = branch_table[buf[i] & 0x1F]; for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) { if (((mask >> slot) & 1) == 0) continue; byte_pos = bit_pos >> 3; bit_res = bit_pos & 7; instr = 0; for (j = 0; j < 6; ++j) instr |= (uint64_t)(buf[i + j + byte_pos]) << (8 * j); norm = instr >> bit_res; if (((norm >> 37) & 0x0F) == 0x05 && ((norm >> 9) & 0x07) == 0) { addr = (norm >> 13) & 0x0FFFFF; addr |= ((uint32_t)(norm >> 36) & 1) << 20; addr <<= 4; addr -= s->pos + (uint32_t)i; addr >>= 4; norm &= ~((uint64_t)0x8FFFFF << 13); norm |= (uint64_t)(addr & 0x0FFFFF) << 13; norm |= (uint64_t)(addr & 0x100000) << (36 - 20); instr &= (1 << bit_res) - 1; instr |= norm << bit_res; for (j = 0; j < 6; j++) buf[i + j + byte_pos] = (uint8_t)(instr >> (8 * j)); } } } return i; } #endif #ifdef XZ_DEC_ARM static size_t INIT bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { size_t i; uint32_t addr; for (i = 0; i + 4 <= size; i += 4) { if (buf[i + 3] == 0xEB) { addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8) | ((uint32_t)buf[i + 2] << 16); addr <<= 2; addr -= s->pos + (uint32_t)i + 8; addr >>= 2; buf[i] = (uint8_t)addr; buf[i + 1] = (uint8_t)(addr >> 8); buf[i + 2] = (uint8_t)(addr >> 16); } } return i; } #endif #ifdef XZ_DEC_ARMTHUMB static size_t INIT bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { size_t i; uint32_t addr; for (i = 0; i + 4 <= size; i += 2) { if ((buf[i + 1] & 0xF8) == 0xF0 && (buf[i + 3] & 0xF8) == 0xF8) { addr = (((uint32_t)buf[i + 1] & 0x07) << 19) | ((uint32_t)buf[i] << 11) | (((uint32_t)buf[i + 3] & 0x07) << 8) | (uint32_t)buf[i + 2]; addr <<= 1; addr -= s->pos + (uint32_t)i + 4; addr >>= 1; buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07)); buf[i] = (uint8_t)(addr >> 11); buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07)); buf[i + 2] = (uint8_t)addr; i += 2; } } return i; } #endif #ifdef XZ_DEC_SPARC static size_t INIT bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) { size_t i; uint32_t instr; for (i = 0; i + 4 <= size; i += 4) { instr = get_unaligned_be32(buf + i); 
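/*
 * The test below matches SPARC CALL instructions (top two bits 01) whose
 * 30-bit word displacement is a sign-extended 22-bit value. As with the
 * other BCJ decoders here, the stored target is presumed to have been made
 * absolute by the encoder; subtracting the current uncompressed position
 * (s->pos + i) turns it back into a PC-relative displacement before the
 * opcode and sign bits are reassembled.
 */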
if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) { instr <<= 2; instr -= s->pos + (uint32_t)i; instr >>= 2; instr = ((uint32_t)0x40000000 - (instr & 0x400000)) | 0x40000000 | (instr & 0x3FFFFF); put_unaligned_be32(instr, buf + i); } } return i; } #endif /* * Apply the selected BCJ filter. Update *pos and s->pos to match the amount * of data that got filtered. * * NOTE: This is implemented as a switch statement to avoid using function * pointers, which could be problematic in the kernel boot code, which must * avoid pointers to static data (at least on x86). */ static void INIT bcj_apply(struct xz_dec_bcj *s, uint8_t *buf, size_t *pos, size_t size) { size_t filtered; buf += *pos; size -= *pos; switch (s->type) { #ifdef XZ_DEC_X86 case BCJ_X86: filtered = bcj_x86(s, buf, size); break; #endif #ifdef XZ_DEC_POWERPC case BCJ_POWERPC: filtered = bcj_powerpc(s, buf, size); break; #endif #ifdef XZ_DEC_IA64 case BCJ_IA64: filtered = bcj_ia64(s, buf, size); break; #endif #ifdef XZ_DEC_ARM case BCJ_ARM: filtered = bcj_arm(s, buf, size); break; #endif #ifdef XZ_DEC_ARMTHUMB case BCJ_ARMTHUMB: filtered = bcj_armthumb(s, buf, size); break; #endif #ifdef XZ_DEC_SPARC case BCJ_SPARC: filtered = bcj_sparc(s, buf, size); break; #endif default: /* Never reached but silence compiler warnings. */ filtered = 0; break; } *pos += filtered; s->pos += filtered; } /* * Flush pending filtered data from temp to the output buffer. * Move the remaining mixture of possibly filtered and unfiltered * data to the beginning of temp. */ static void INIT bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b) { size_t copy_size; copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos); memcpy(b->out + b->out_pos, s->temp.buf, copy_size); b->out_pos += copy_size; s->temp.filtered -= copy_size; s->temp.size -= copy_size; memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size); } /* * The BCJ filter functions are primitive in sense that they process the * data in chunks of 1-16 bytes. To hide this issue, this function does * some buffering. */ XZ_EXTERN enum xz_ret INIT xz_dec_bcj_run(struct xz_dec_bcj *s, struct xz_dec_lzma2 *lzma2, struct xz_buf *b) { size_t out_start; /* * Flush pending already filtered data to the output buffer. Return * immediatelly if we couldn't flush everything, or if the next * filter in the chain had already returned XZ_STREAM_END. */ if (s->temp.filtered > 0) { bcj_flush(s, b); if (s->temp.filtered > 0) return XZ_OK; if (s->ret == XZ_STREAM_END) return XZ_STREAM_END; } /* * If we have more output space than what is currently pending in * temp, copy the unfiltered data from temp to the output buffer * and try to fill the output buffer by decoding more data from the * next filter in the chain. Apply the BCJ filter on the new data * in the output buffer. If everything cannot be filtered, copy it * to temp and rewind the output buffer position accordingly. * * This needs to be always run when temp.size == 0 to handle a special * case where the output buffer is full and the next filter has no * more output coming but hasn't returned XZ_STREAM_END yet. 
*/ if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) { out_start = b->out_pos; memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size); b->out_pos += s->temp.size; s->ret = xz_dec_lzma2_run(lzma2, b); if (s->ret != XZ_STREAM_END && (s->ret != XZ_OK || s->single_call)) return s->ret; bcj_apply(s, b->out, &out_start, b->out_pos); /* * As an exception, if the next filter returned XZ_STREAM_END, * we can do that too, since the last few bytes that remain * unfiltered are meant to remain unfiltered. */ if (s->ret == XZ_STREAM_END) return XZ_STREAM_END; s->temp.size = b->out_pos - out_start; b->out_pos -= s->temp.size; memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size); /* * If there wasn't enough input to the next filter to fill * the output buffer with unfiltered data, there's no point * to try decoding more data to temp. */ if (b->out_pos + s->temp.size < b->out_size) return XZ_OK; } /* * We have unfiltered data in temp. If the output buffer isn't full * yet, try to fill the temp buffer by decoding more data from the * next filter. Apply the BCJ filter on temp. Then we hopefully can * fill the actual output buffer by copying filtered data from temp. * A mix of filtered and unfiltered data may be left in temp; it will * be taken care on the next call to this function. */ if (b->out_pos < b->out_size) { /* Make b->out{,_pos,_size} temporarily point to s->temp. */ s->out = b->out; s->out_pos = b->out_pos; s->out_size = b->out_size; b->out = s->temp.buf; b->out_pos = s->temp.size; b->out_size = sizeof(s->temp.buf); s->ret = xz_dec_lzma2_run(lzma2, b); s->temp.size = b->out_pos; b->out = s->out; b->out_pos = s->out_pos; b->out_size = s->out_size; if (s->ret != XZ_OK && s->ret != XZ_STREAM_END) return s->ret; bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size); /* * If the next filter returned XZ_STREAM_END, we mark that * everything is filtered, since the last unfiltered bytes * of the stream are meant to be left as is. */ if (s->ret == XZ_STREAM_END) s->temp.filtered = s->temp.size; bcj_flush(s, b); if (s->temp.filtered > 0) return XZ_OK; } return s->ret; } XZ_EXTERN struct xz_dec_bcj *INIT xz_dec_bcj_create(bool_t single_call) { struct xz_dec_bcj *s = malloc(sizeof(*s)); if (s != NULL) s->single_call = single_call; return s; } XZ_EXTERN enum xz_ret INIT xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) { switch (id) { #ifdef XZ_DEC_X86 case BCJ_X86: #endif #ifdef XZ_DEC_POWERPC case BCJ_POWERPC: #endif #ifdef XZ_DEC_IA64 case BCJ_IA64: #endif #ifdef XZ_DEC_ARM case BCJ_ARM: #endif #ifdef XZ_DEC_ARMTHUMB case BCJ_ARMTHUMB: #endif #ifdef XZ_DEC_SPARC case BCJ_SPARC: #endif break; default: /* Unsupported Filter ID */ return XZ_OPTIONS_ERROR; } s->type = id; s->ret = XZ_OK; s->pos = 0; s->x86_prev_mask = 0; s->temp.filtered = 0; s->temp.size = 0; return XZ_OK; } #endif xen-4.4.0/xen/common/xz/stream.h0000664000175000017500000000242012307313555014630 0ustar smbsmb/* * Definitions for handling the .xz file format * * Author: Lasse Collin * * This file has been put into the public domain. * You can do whatever you want with this file. */ #ifndef XZ_STREAM_H #define XZ_STREAM_H /* * See the .xz file format specification at * http://tukaani.org/xz/xz-file-format.txt * to understand the container format. */ #define STREAM_HEADER_SIZE 12 #define HEADER_MAGIC "\3757zXZ" #define HEADER_MAGIC_SIZE 6 #define FOOTER_MAGIC "YZ" #define FOOTER_MAGIC_SIZE 2 /* * Variable-length integer can hold a 63-bit unsigned integer or a special * value indicating that the value is unknown. 
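 *
 * The encoding is little-endian base-128: each byte carries seven payload
 * bits and the top bit marks continuation (see dec_vli() in dec_stream.c).
 * For example, the value 300 is stored as the two bytes 0xAC 0x02, since
 * (0xAC & 0x7F) + (0x02 << 7) == 44 + 256 == 300. VLI_BYTES_MAX is
 * therefore sizeof(vli_type) * 8 / 7 == 9 bytes, enough for 63 bits.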
* * Experimental: vli_type can be defined to uint32_t to save a few bytes * in code size (no effect on speed). Doing so limits the uncompressed and * compressed size of the file to less than 256 MiB and may also weaken * error detection slightly. */ typedef uint64_t vli_type; #define VLI_MAX ((vli_type)-1 / 2) #define VLI_UNKNOWN ((vli_type)-1) /* Maximum encoded size of a VLI */ #define VLI_BYTES_MAX (sizeof(vli_type) * 8 / 7) /* Integrity Check types */ enum xz_check { XZ_CHECK_NONE = 0, XZ_CHECK_CRC32 = 1, XZ_CHECK_CRC64 = 4, XZ_CHECK_SHA256 = 10 }; /* Maximum possible Check ID */ #define XZ_CHECK_MAX 15 #endif xen-4.4.0/xen/common/xz/dec_stream.c0000664000175000017500000004650512307313555015452 0ustar smbsmb/* * .xz Stream decoder * * Author: Lasse Collin * * This file has been put into the public domain. * You can do whatever you want with this file. */ #include "private.h" #include "stream.h" /* Hash used to validate the Index field */ struct xz_dec_hash { vli_type unpadded; vli_type uncompressed; uint32_t crc32; }; struct xz_dec { /* Position in dec_main() */ enum { SEQ_STREAM_HEADER, SEQ_BLOCK_START, SEQ_BLOCK_HEADER, SEQ_BLOCK_UNCOMPRESS, SEQ_BLOCK_PADDING, SEQ_BLOCK_CHECK, SEQ_INDEX, SEQ_INDEX_PADDING, SEQ_INDEX_CRC32, SEQ_STREAM_FOOTER } sequence; /* Position in variable-length integers and Check fields */ uint32_t pos; /* Variable-length integer decoded by dec_vli() */ vli_type vli; /* Saved in_pos and out_pos */ size_t in_start; size_t out_start; /* CRC32 value in Block or Index */ uint32_t crc32; /* Type of the integrity check calculated from uncompressed data */ enum xz_check check_type; /* Operation mode */ enum xz_mode mode; /* * True if the next call to xz_dec_run() is allowed to return * XZ_BUF_ERROR. */ bool_t allow_buf_error; /* Information stored in Block Header */ struct { /* * Value stored in the Compressed Size field, or * VLI_UNKNOWN if Compressed Size is not present. */ vli_type compressed; /* * Value stored in the Uncompressed Size field, or * VLI_UNKNOWN if Uncompressed Size is not present. */ vli_type uncompressed; /* Size of the Block Header field */ uint32_t size; } block_header; /* Information collected when decoding Blocks */ struct { /* Observed compressed size of the current Block */ vli_type compressed; /* Observed uncompressed size of the current Block */ vli_type uncompressed; /* Number of Blocks decoded so far */ vli_type count; /* * Hash calculated from the Block sizes. This is used to * validate the Index field. */ struct xz_dec_hash hash; } block; /* Variables needed when verifying the Index field */ struct { /* Position in dec_index() */ enum { SEQ_INDEX_COUNT, SEQ_INDEX_UNPADDED, SEQ_INDEX_UNCOMPRESSED } sequence; /* Size of the Index in bytes */ vli_type size; /* Number of Records (matches block.count in valid files) */ vli_type count; /* * Hash calculated from the Records (matches block.hash in * valid files). */ struct xz_dec_hash hash; } index; /* * Temporary buffer needed to hold Stream Header, Block Header, * and Stream Footer. The Block Header is the biggest (1 KiB) * so we reserve space according to that. buf[] has to be aligned * to a multiple of four bytes; the size_t variables before it * should guarantee this. 
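 * (The 1 KiB figure follows from how dec_main() sizes the header: the
 * encoded size byte yields ((byte + 1) * 4) bytes, so the largest possible
 * Block Header is (0xff + 1) * 4 = 1024 bytes, matching buf[] below.)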
*/ struct { size_t pos; size_t size; uint8_t buf[1024]; } temp; struct xz_dec_lzma2 *lzma2; #ifdef XZ_DEC_BCJ struct xz_dec_bcj *bcj; bool_t bcj_active; #endif }; #ifdef XZ_DEC_ANY_CHECK /* Sizes of the Check field with different Check IDs */ static const uint8_t check_sizes[16] = { 0, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, 64 }; #endif /* * Fill s->temp by copying data starting from b->in[b->in_pos]. Caller * must have set s->temp.pos to indicate how much data we are supposed * to copy into s->temp.buf. Return true once s->temp.pos has reached * s->temp.size. */ static bool_t INIT fill_temp(struct xz_dec *s, struct xz_buf *b) { size_t copy_size = min_t(size_t, b->in_size - b->in_pos, s->temp.size - s->temp.pos); memcpy(s->temp.buf + s->temp.pos, b->in + b->in_pos, copy_size); b->in_pos += copy_size; s->temp.pos += copy_size; if (s->temp.pos == s->temp.size) { s->temp.pos = 0; return true; } return false; } /* Decode a variable-length integer (little-endian base-128 encoding) */ static enum xz_ret INIT dec_vli(struct xz_dec *s, const uint8_t *in, size_t *in_pos, size_t in_size) { uint8_t byte; if (s->pos == 0) s->vli = 0; while (*in_pos < in_size) { byte = in[*in_pos]; ++*in_pos; s->vli |= (vli_type)(byte & 0x7F) << s->pos; if ((byte & 0x80) == 0) { /* Don't allow non-minimal encodings. */ if (byte == 0 && s->pos != 0) return XZ_DATA_ERROR; s->pos = 0; return XZ_STREAM_END; } s->pos += 7; if (s->pos == 7 * VLI_BYTES_MAX) return XZ_DATA_ERROR; } return XZ_OK; } /* * Decode the Compressed Data field from a Block. Update and validate * the observed compressed and uncompressed sizes of the Block so that * they don't exceed the values possibly stored in the Block Header * (validation assumes that no integer overflow occurs, since vli_type * is normally uint64_t). Update the CRC32 if presence of the CRC32 * field was indicated in Stream Header. * * Once the decoding is finished, validate that the observed sizes match * the sizes possibly stored in the Block Header. Update the hash and * Block count, which are later used to validate the Index field. */ static enum xz_ret INIT dec_block(struct xz_dec *s, struct xz_buf *b) { enum xz_ret ret; s->in_start = b->in_pos; s->out_start = b->out_pos; #ifdef XZ_DEC_BCJ if (s->bcj_active) ret = xz_dec_bcj_run(s->bcj, s->lzma2, b); else #endif ret = xz_dec_lzma2_run(s->lzma2, b); s->block.compressed += b->in_pos - s->in_start; s->block.uncompressed += b->out_pos - s->out_start; /* * There is no need to separately check for VLI_UNKNOWN, since * the observed sizes are always smaller than VLI_UNKNOWN. 
*/ if (s->block.compressed > s->block_header.compressed || s->block.uncompressed > s->block_header.uncompressed) return XZ_DATA_ERROR; if (s->check_type == XZ_CHECK_CRC32) s->crc32 = xz_crc32(b->out + s->out_start, b->out_pos - s->out_start, s->crc32); if (ret == XZ_STREAM_END) { if (s->block_header.compressed != VLI_UNKNOWN && s->block_header.compressed != s->block.compressed) return XZ_DATA_ERROR; if (s->block_header.uncompressed != VLI_UNKNOWN && s->block_header.uncompressed != s->block.uncompressed) return XZ_DATA_ERROR; s->block.hash.unpadded += s->block_header.size + s->block.compressed; #ifdef XZ_DEC_ANY_CHECK s->block.hash.unpadded += check_sizes[s->check_type]; #else if (s->check_type == XZ_CHECK_CRC32) s->block.hash.unpadded += 4; #endif s->block.hash.uncompressed += s->block.uncompressed; s->block.hash.crc32 = xz_crc32( (const uint8_t *)&s->block.hash, sizeof(s->block.hash), s->block.hash.crc32); ++s->block.count; } return ret; } /* Update the Index size and the CRC32 value. */ static void INIT index_update(struct xz_dec *s, const struct xz_buf *b) { size_t in_used = b->in_pos - s->in_start; s->index.size += in_used; s->crc32 = xz_crc32(b->in + s->in_start, in_used, s->crc32); } /* * Decode the Number of Records, Unpadded Size, and Uncompressed Size * fields from the Index field. That is, Index Padding and CRC32 are not * decoded by this function. * * This can return XZ_OK (more input needed), XZ_STREAM_END (everything * successfully decoded), or XZ_DATA_ERROR (input is corrupt). */ static enum xz_ret INIT dec_index(struct xz_dec *s, struct xz_buf *b) { enum xz_ret ret; do { ret = dec_vli(s, b->in, &b->in_pos, b->in_size); if (ret != XZ_STREAM_END) { index_update(s, b); return ret; } switch (s->index.sequence) { case SEQ_INDEX_COUNT: s->index.count = s->vli; /* * Validate that the Number of Records field * indicates the same number of Records as * there were Blocks in the Stream. */ if (s->index.count != s->block.count) return XZ_DATA_ERROR; s->index.sequence = SEQ_INDEX_UNPADDED; break; case SEQ_INDEX_UNPADDED: s->index.hash.unpadded += s->vli; s->index.sequence = SEQ_INDEX_UNCOMPRESSED; break; case SEQ_INDEX_UNCOMPRESSED: s->index.hash.uncompressed += s->vli; s->index.hash.crc32 = xz_crc32( (const uint8_t *)&s->index.hash, sizeof(s->index.hash), s->index.hash.crc32); --s->index.count; s->index.sequence = SEQ_INDEX_UNPADDED; break; } } while (s->index.count > 0); return XZ_STREAM_END; } /* * Validate that the next four input bytes match the value of s->crc32. * s->pos must be zero when starting to validate the first byte. */ static enum xz_ret INIT crc32_validate(struct xz_dec *s, struct xz_buf *b) { do { if (b->in_pos == b->in_size) return XZ_OK; if (((s->crc32 >> s->pos) & 0xFF) != b->in[b->in_pos++]) return XZ_DATA_ERROR; s->pos += 8; } while (s->pos < 32); s->crc32 = 0; s->pos = 0; return XZ_STREAM_END; } #ifdef XZ_DEC_ANY_CHECK /* * Skip over the Check field when the Check ID is not supported. * Returns true once the whole Check field has been skipped over. */ static bool_t INIT check_skip(struct xz_dec *s, struct xz_buf *b) { while (s->pos < check_sizes[s->check_type]) { if (b->in_pos == b->in_size) return false; ++b->in_pos; ++s->pos; } s->pos = 0; return true; } #endif /* Decode the Stream Header field (the first 12 bytes of the .xz Stream). 
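 * As checked below, those 12 bytes are: the 6-byte Header Magic
 * (0xFD '7' 'z' 'X' 'Z' 0x00), two bytes of Stream Flags (the first must be
 * zero, the second selects the Check type), and a little-endian CRC32
 * computed over the two flag bytes.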
*/ static enum xz_ret INIT dec_stream_header(struct xz_dec *s) { if (!memeq(s->temp.buf, HEADER_MAGIC, HEADER_MAGIC_SIZE)) return XZ_FORMAT_ERROR; if (xz_crc32(s->temp.buf + HEADER_MAGIC_SIZE, 2, 0) != get_le32(s->temp.buf + HEADER_MAGIC_SIZE + 2)) return XZ_DATA_ERROR; if (s->temp.buf[HEADER_MAGIC_SIZE] != 0) return XZ_OPTIONS_ERROR; /* * Of integrity checks, we support only none (Check ID = 0) and * CRC32 (Check ID = 1). However, if XZ_DEC_ANY_CHECK is defined, * we will accept other check types too, but then the check won't * be verified and a warning (XZ_UNSUPPORTED_CHECK) will be given. */ s->check_type = s->temp.buf[HEADER_MAGIC_SIZE + 1]; #ifdef XZ_DEC_ANY_CHECK if (s->check_type > XZ_CHECK_MAX) return XZ_OPTIONS_ERROR; if (s->check_type > XZ_CHECK_CRC32) return XZ_UNSUPPORTED_CHECK; #else if (s->check_type > XZ_CHECK_CRC32) return XZ_OPTIONS_ERROR; #endif return XZ_OK; } /* Decode the Stream Footer field (the last 12 bytes of the .xz Stream) */ static enum xz_ret INIT dec_stream_footer(struct xz_dec *s) { if (!memeq(s->temp.buf + 10, FOOTER_MAGIC, FOOTER_MAGIC_SIZE)) return XZ_DATA_ERROR; if (xz_crc32(s->temp.buf + 4, 6, 0) != get_le32(s->temp.buf)) return XZ_DATA_ERROR; /* * Validate Backward Size. Note that we never added the size of the * Index CRC32 field to s->index.size, thus we use s->index.size / 4 * instead of s->index.size / 4 - 1. */ if ((s->index.size >> 2) != get_le32(s->temp.buf + 4)) return XZ_DATA_ERROR; if (s->temp.buf[8] != 0 || s->temp.buf[9] != s->check_type) return XZ_DATA_ERROR; /* * Use XZ_STREAM_END instead of XZ_OK to be more convenient * for the caller. */ return XZ_STREAM_END; } /* Decode the Block Header and initialize the filter chain. */ static enum xz_ret INIT dec_block_header(struct xz_dec *s) { enum xz_ret ret; /* * Validate the CRC32. We know that the temp buffer is at least * eight bytes so this is safe. */ s->temp.size -= 4; if (xz_crc32(s->temp.buf, s->temp.size, 0) != get_le32(s->temp.buf + s->temp.size)) return XZ_DATA_ERROR; s->temp.pos = 2; /* * Catch unsupported Block Flags. We support only one or two filters * in the chain, so we catch that with the same test. */ #ifdef XZ_DEC_BCJ if (s->temp.buf[1] & 0x3E) #else if (s->temp.buf[1] & 0x3F) #endif return XZ_OPTIONS_ERROR; /* Compressed Size */ if (s->temp.buf[1] & 0x40) { if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size) != XZ_STREAM_END) return XZ_DATA_ERROR; s->block_header.compressed = s->vli; } else { s->block_header.compressed = VLI_UNKNOWN; } /* Uncompressed Size */ if (s->temp.buf[1] & 0x80) { if (dec_vli(s, s->temp.buf, &s->temp.pos, s->temp.size) != XZ_STREAM_END) return XZ_DATA_ERROR; s->block_header.uncompressed = s->vli; } else { s->block_header.uncompressed = VLI_UNKNOWN; } #ifdef XZ_DEC_BCJ /* If there are two filters, the first one must be a BCJ filter. */ s->bcj_active = s->temp.buf[1] & 0x01; if (s->bcj_active) { if (s->temp.size - s->temp.pos < 2) return XZ_OPTIONS_ERROR; ret = xz_dec_bcj_reset(s->bcj, s->temp.buf[s->temp.pos++]); if (ret != XZ_OK) return ret; /* * We don't support custom start offset, * so Size of Properties must be zero. */ if (s->temp.buf[s->temp.pos++] != 0x00) return XZ_OPTIONS_ERROR; } #endif /* Valid Filter Flags always take at least two bytes. 
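 * For the LZMA2-only case handled below, those two bytes are the Filter ID
 * (0x21) and the Size of Properties (0x01); the single properties byte that
 * follows encodes the dictionary size and is passed to xz_dec_lzma2_reset().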
*/ if (s->temp.size - s->temp.pos < 2) return XZ_DATA_ERROR; /* Filter ID = LZMA2 */ if (s->temp.buf[s->temp.pos++] != 0x21) return XZ_OPTIONS_ERROR; /* Size of Properties = 1-byte Filter Properties */ if (s->temp.buf[s->temp.pos++] != 0x01) return XZ_OPTIONS_ERROR; /* Filter Properties contains LZMA2 dictionary size. */ if (s->temp.size - s->temp.pos < 1) return XZ_DATA_ERROR; ret = xz_dec_lzma2_reset(s->lzma2, s->temp.buf[s->temp.pos++]); if (ret != XZ_OK) return ret; /* The rest must be Header Padding. */ while (s->temp.pos < s->temp.size) if (s->temp.buf[s->temp.pos++] != 0x00) return XZ_OPTIONS_ERROR; s->temp.pos = 0; s->block.compressed = 0; s->block.uncompressed = 0; return XZ_OK; } static enum xz_ret INIT dec_main(struct xz_dec *s, struct xz_buf *b) { enum xz_ret ret; /* * Store the start position for the case when we are in the middle * of the Index field. */ s->in_start = b->in_pos; while (true) { switch (s->sequence) { case SEQ_STREAM_HEADER: /* * Stream Header is copied to s->temp, and then * decoded from there. This way if the caller * gives us only little input at a time, we can * still keep the Stream Header decoding code * simple. Similar approach is used in many places * in this file. */ if (!fill_temp(s, b)) return XZ_OK; /* * If dec_stream_header() returns * XZ_UNSUPPORTED_CHECK, it is still possible * to continue decoding if working in multi-call * mode. Thus, update s->sequence before calling * dec_stream_header(). */ s->sequence = SEQ_BLOCK_START; ret = dec_stream_header(s); if (ret != XZ_OK) return ret; case SEQ_BLOCK_START: /* We need one byte of input to continue. */ if (b->in_pos == b->in_size) return XZ_OK; /* See if this is the beginning of the Index field. */ if (b->in[b->in_pos] == 0) { s->in_start = b->in_pos++; s->sequence = SEQ_INDEX; break; } /* * Calculate the size of the Block Header and * prepare to decode it. */ s->block_header.size = ((uint32_t)b->in[b->in_pos] + 1) * 4; s->temp.size = s->block_header.size; s->temp.pos = 0; s->sequence = SEQ_BLOCK_HEADER; case SEQ_BLOCK_HEADER: if (!fill_temp(s, b)) return XZ_OK; ret = dec_block_header(s); if (ret != XZ_OK) return ret; s->sequence = SEQ_BLOCK_UNCOMPRESS; case SEQ_BLOCK_UNCOMPRESS: ret = dec_block(s, b); if (ret != XZ_STREAM_END) return ret; s->sequence = SEQ_BLOCK_PADDING; case SEQ_BLOCK_PADDING: /* * Size of Compressed Data + Block Padding * must be a multiple of four. We don't need * s->block.compressed for anything else * anymore, so we use it here to test the size * of the Block Padding field. */ while (s->block.compressed & 3) { if (b->in_pos == b->in_size) return XZ_OK; if (b->in[b->in_pos++] != 0) return XZ_DATA_ERROR; ++s->block.compressed; } s->sequence = SEQ_BLOCK_CHECK; case SEQ_BLOCK_CHECK: if (s->check_type == XZ_CHECK_CRC32) { ret = crc32_validate(s, b); if (ret != XZ_STREAM_END) return ret; } #ifdef XZ_DEC_ANY_CHECK else if (!check_skip(s, b)) { return XZ_OK; } #endif s->sequence = SEQ_BLOCK_START; break; case SEQ_INDEX: ret = dec_index(s, b); if (ret != XZ_STREAM_END) return ret; s->sequence = SEQ_INDEX_PADDING; case SEQ_INDEX_PADDING: while ((s->index.size + (b->in_pos - s->in_start)) & 3) { if (b->in_pos == b->in_size) { index_update(s, b); return XZ_OK; } if (b->in[b->in_pos++] != 0) return XZ_DATA_ERROR; } /* Finish the CRC32 value and Index size. */ index_update(s, b); /* Compare the hashes to validate the Index field. 
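 * block.hash was accumulated while the Blocks themselves were decoded and
 * index.hash from the Records just read from the Index; each folds the
 * unpadded and uncompressed sizes into a running CRC32 (struct xz_dec_hash).
 * A mismatch means the Index does not describe the Blocks actually seen.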
*/ if (!memeq(&s->block.hash, &s->index.hash, sizeof(s->block.hash))) return XZ_DATA_ERROR; s->sequence = SEQ_INDEX_CRC32; case SEQ_INDEX_CRC32: ret = crc32_validate(s, b); if (ret != XZ_STREAM_END) return ret; s->temp.size = STREAM_HEADER_SIZE; s->sequence = SEQ_STREAM_FOOTER; case SEQ_STREAM_FOOTER: if (!fill_temp(s, b)) return XZ_OK; return dec_stream_footer(s); } } /* Never reached */ } XZ_EXTERN void INIT xz_dec_reset(struct xz_dec *s) { s->sequence = SEQ_STREAM_HEADER; s->allow_buf_error = false; s->pos = 0; s->crc32 = 0; memzero(&s->block, sizeof(s->block)); memzero(&s->index, sizeof(s->index)); s->temp.pos = 0; s->temp.size = STREAM_HEADER_SIZE; } /* * xz_dec_run() is a wrapper for dec_main() to handle some special cases in * multi-call and single-call decoding. * * In multi-call mode, we must return XZ_BUF_ERROR when it seems clear that we * are not going to make any progress anymore. This is to prevent the caller * from calling us infinitely when the input file is truncated or otherwise * corrupt. Since zlib-style API allows that the caller fills the input buffer * only when the decoder doesn't produce any new output, we have to be careful * to avoid returning XZ_BUF_ERROR too easily: XZ_BUF_ERROR is returned only * after the second consecutive call to xz_dec_run() that makes no progress. * * In single-call mode, if we couldn't decode everything and no error * occurred, either the input is truncated or the output buffer is too small. * Since we know that the last input byte never produces any output, we know * that if all the input was consumed and decoding wasn't finished, the file * must be corrupt. Otherwise the output buffer has to be too small or the * file is corrupt in a way that decoding it produces too big output. * * If single-call decoding fails, we reset b->in_pos and b->out_pos back to * their original values. This is because with some filter chains there won't * be any valid uncompressed data in the output buffer unless the decoding * actually succeeds (that's the price to pay of using the output buffer as * the workspace). */ XZ_EXTERN enum xz_ret INIT xz_dec_run(struct xz_dec *s, struct xz_buf *b) { size_t in_start; size_t out_start; enum xz_ret ret; if (DEC_IS_SINGLE(s->mode)) xz_dec_reset(s); in_start = b->in_pos; out_start = b->out_pos; ret = dec_main(s, b); if (DEC_IS_SINGLE(s->mode)) { if (ret == XZ_OK) ret = b->in_pos == b->in_size ? 
XZ_DATA_ERROR : XZ_BUF_ERROR; if (ret != XZ_STREAM_END) { b->in_pos = in_start; b->out_pos = out_start; } } else if (ret == XZ_OK && in_start == b->in_pos && out_start == b->out_pos) { if (s->allow_buf_error) ret = XZ_BUF_ERROR; s->allow_buf_error = true; } else { s->allow_buf_error = false; } return ret; } XZ_EXTERN struct xz_dec *INIT xz_dec_init(enum xz_mode mode, uint32_t dict_max) { struct xz_dec *s = malloc(sizeof(*s)); if (s == NULL) return NULL; s->mode = mode; #ifdef XZ_DEC_BCJ s->bcj = xz_dec_bcj_create(DEC_IS_SINGLE(mode)); if (s->bcj == NULL) goto error_bcj; #endif s->lzma2 = xz_dec_lzma2_create(mode, dict_max); if (s->lzma2 == NULL) goto error_lzma2; xz_dec_reset(s); return s; error_lzma2: #ifdef XZ_DEC_BCJ xz_dec_bcj_end(s->bcj); error_bcj: #endif free(s); return NULL; } XZ_EXTERN void INIT xz_dec_end(struct xz_dec *s) { if (s != NULL) { xz_dec_lzma2_end(s->lzma2); #ifdef XZ_DEC_BCJ xz_dec_bcj_end(s->bcj); #endif free(s); } } xen-4.4.0/xen/common/xz/private.h0000664000175000017500000002316212307313555015015 0ustar smbsmb/* * Private includes and definitions * * Author: Lasse Collin * * This file has been put into the public domain. * You can do whatever you want with this file. */ #ifndef XZ_PRIVATE_H #define XZ_PRIVATE_H #ifdef __XEN__ #include #include #endif #define get_le32(p) le32_to_cpup((const uint32_t *)(p)) #if 1 /* ndef CONFIG_??? */ static inline u32 INIT get_unaligned_le32(void *p) { return le32_to_cpup(p); } static inline void INIT put_unaligned_le32(u32 val, void *p) { *(__force __le32*)p = cpu_to_le32(val); } #else #include static inline u32 INIT get_unaligned_le32(void *p) { return le32_to_cpu(__get_unaligned(p, 4)); } static inline void INIT put_unaligned_le32(u32 val, void *p) { __put_unaligned(cpu_to_le32(val), p, 4); } #endif #define false 0 #define true 1 /** * enum xz_mode - Operation mode * * @XZ_SINGLE: Single-call mode. This uses less RAM than * than multi-call modes, because the LZMA2 * dictionary doesn't need to be allocated as * part of the decoder state. All required data * structures are allocated at initialization, * so xz_dec_run() cannot return XZ_MEM_ERROR. * @XZ_PREALLOC: Multi-call mode with preallocated LZMA2 * dictionary buffer. All data structures are * allocated at initialization, so xz_dec_run() * cannot return XZ_MEM_ERROR. * @XZ_DYNALLOC: Multi-call mode. The LZMA2 dictionary is * allocated once the required size has been * parsed from the stream headers. If the * allocation fails, xz_dec_run() will return * XZ_MEM_ERROR. * * It is possible to enable support only for a subset of the above * modes at compile time by defining XZ_DEC_SINGLE, XZ_DEC_PREALLOC, * or XZ_DEC_DYNALLOC. The xz_dec kernel module is always compiled * with support for all operation modes, but the preboot code may * be built with fewer features to minimize code size. */ enum xz_mode { XZ_SINGLE, XZ_PREALLOC, XZ_DYNALLOC }; /** * enum xz_ret - Return codes * @XZ_OK: Everything is OK so far. More input or more * output space is required to continue. This * return code is possible only in multi-call mode * (XZ_PREALLOC or XZ_DYNALLOC). * @XZ_STREAM_END: Operation finished successfully. * @XZ_UNSUPPORTED_CHECK: Integrity check type is not supported. Decoding * is still possible in multi-call mode by simply * calling xz_dec_run() again. * Note that this return value is used only if * XZ_DEC_ANY_CHECK was defined at build time, * which is not used in the kernel. Unsupported * check types return XZ_OPTIONS_ERROR if * XZ_DEC_ANY_CHECK was not defined at build time. 
* @XZ_MEM_ERROR: Allocating memory failed. This return code is * possible only if the decoder was initialized * with XZ_DYNALLOC. The amount of memory that was * tried to be allocated was no more than the * dict_max argument given to xz_dec_init(). * @XZ_MEMLIMIT_ERROR: A bigger LZMA2 dictionary would be needed than * allowed by the dict_max argument given to * xz_dec_init(). This return value is possible * only in multi-call mode (XZ_PREALLOC or * XZ_DYNALLOC); the single-call mode (XZ_SINGLE) * ignores the dict_max argument. * @XZ_FORMAT_ERROR: File format was not recognized (wrong magic * bytes). * @XZ_OPTIONS_ERROR: This implementation doesn't support the requested * compression options. In the decoder this means * that the header CRC32 matches, but the header * itself specifies something that we don't support. * @XZ_DATA_ERROR: Compressed data is corrupt. * @XZ_BUF_ERROR: Cannot make any progress. Details are slightly * different between multi-call and single-call * mode; more information below. * * In multi-call mode, XZ_BUF_ERROR is returned when two consecutive calls * to XZ code cannot consume any input and cannot produce any new output. * This happens when there is no new input available, or the output buffer * is full while at least one output byte is still pending. Assuming your * code is not buggy, you can get this error only when decoding a compressed * stream that is truncated or otherwise corrupt. * * In single-call mode, XZ_BUF_ERROR is returned only when the output buffer * is too small or the compressed input is corrupt in a way that makes the * decoder produce more output than the caller expected. When it is * (relatively) clear that the compressed input is truncated, XZ_DATA_ERROR * is used instead of XZ_BUF_ERROR. */ enum xz_ret { XZ_OK, XZ_STREAM_END, XZ_UNSUPPORTED_CHECK, XZ_MEM_ERROR, XZ_MEMLIMIT_ERROR, XZ_FORMAT_ERROR, XZ_OPTIONS_ERROR, XZ_DATA_ERROR, XZ_BUF_ERROR }; /** * struct xz_buf - Passing input and output buffers to XZ code * @in: Beginning of the input buffer. This may be NULL if and only * if in_pos is equal to in_size. * @in_pos: Current position in the input buffer. This must not exceed * in_size. * @in_size: Size of the input buffer * @out: Beginning of the output buffer. This may be NULL if and only * if out_pos is equal to out_size. * @out_pos: Current position in the output buffer. This must not exceed * out_size. * @out_size: Size of the output buffer * * Only the contents of the output buffer from out[out_pos] onward, and * the variables in_pos and out_pos are modified by the XZ code. */ struct xz_buf { const uint8_t *in; size_t in_pos; size_t in_size; uint8_t *out; size_t out_pos; size_t out_size; }; /** * struct xz_dec - Opaque type to hold the XZ decoder state */ struct xz_dec; /* If no specific decoding mode is requested, enable support for all modes. */ #if !defined(XZ_DEC_SINGLE) && !defined(XZ_DEC_PREALLOC) \ && !defined(XZ_DEC_DYNALLOC) # define XZ_DEC_SINGLE # define XZ_DEC_PREALLOC # define XZ_DEC_DYNALLOC #endif /* * The DEC_IS_foo(mode) macros are used in "if" statements. If only some * of the supported modes are enabled, these macros will evaluate to true or * false at compile time and thus allow the compiler to omit unneeded code. 
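 *
 * For example (illustrative of the intended effect): in a build where only
 * XZ_DEC_PREALLOC is defined, DEC_IS_SINGLE(mode) expands to (false) and
 * DEC_IS_MULTI(mode) to (true), so a test such as
 *
 *     if (DEC_IS_SINGLE(s->mode))
 *             xz_dec_reset(s);
 *
 * in xz_dec_run() can be discarded entirely by the compiler.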
*/ #ifdef XZ_DEC_SINGLE # define DEC_IS_SINGLE(mode) ((mode) == XZ_SINGLE) #else # define DEC_IS_SINGLE(mode) (false) #endif #ifdef XZ_DEC_PREALLOC # define DEC_IS_PREALLOC(mode) ((mode) == XZ_PREALLOC) #else # define DEC_IS_PREALLOC(mode) (false) #endif #ifdef XZ_DEC_DYNALLOC # define DEC_IS_DYNALLOC(mode) ((mode) == XZ_DYNALLOC) #else # define DEC_IS_DYNALLOC(mode) (false) #endif #if !defined(XZ_DEC_SINGLE) # define DEC_IS_MULTI(mode) (true) #elif defined(XZ_DEC_PREALLOC) || defined(XZ_DEC_DYNALLOC) # define DEC_IS_MULTI(mode) ((mode) != XZ_SINGLE) #else # define DEC_IS_MULTI(mode) (false) #endif /* * If any of the BCJ filter decoders are wanted, define XZ_DEC_BCJ. * XZ_DEC_BCJ is used to enable generic support for BCJ decoders. */ #ifndef XZ_DEC_BCJ # if defined(XZ_DEC_X86) || defined(XZ_DEC_POWERPC) \ || defined(XZ_DEC_IA64) || defined(XZ_DEC_ARM) \ || defined(XZ_DEC_ARM) || defined(XZ_DEC_ARMTHUMB) \ || defined(XZ_DEC_SPARC) # define XZ_DEC_BCJ # endif #endif /* * Allocate memory for LZMA2 decoder. xz_dec_lzma2_reset() must be used * before calling xz_dec_lzma2_run(). */ XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, uint32_t dict_max); /* * Decode the LZMA2 properties (one byte) and reset the decoder. Return * XZ_OK on success, XZ_MEMLIMIT_ERROR if the preallocated dictionary is not * big enough, and XZ_OPTIONS_ERROR if props indicates something that this * decoder doesn't support. */ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props); /* Decode raw LZMA2 stream from b->in to b->out. */ XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, struct xz_buf *b); /* Free the memory allocated for the LZMA2 decoder. */ XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s); #ifdef XZ_DEC_BCJ /* * Allocate memory for BCJ decoders. xz_dec_bcj_reset() must be used before * calling xz_dec_bcj_run(). */ XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool_t single_call); /* * Decode the Filter ID of a BCJ filter. This implementation doesn't * support custom start offsets, so no decoding of Filter Properties * is needed. Returns XZ_OK if the given Filter ID is supported. * Otherwise XZ_OPTIONS_ERROR is returned. */ XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id); /* * Decode raw BCJ + LZMA2 stream. This must be used only if there actually is * a BCJ filter in the chain. If the chain has only LZMA2, xz_dec_lzma2_run() * must be called directly. */ XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, struct xz_dec_lzma2 *lzma2, struct xz_buf *b); /* Free the memory allocated for the BCJ filters. */ #define xz_dec_bcj_end(s) free(s) #endif #endif xen-4.4.0/xen/common/xz/dec_lzma2.c0000664000175000017500000007101412307313555015175 0ustar smbsmb/* * LZMA2 decoder * * Authors: Lasse Collin * Igor Pavlov * * This file has been put into the public domain. * You can do whatever you want with this file. */ #include "private.h" #include "lzma2.h" /* * Range decoder initialization eats the first five bytes of each LZMA chunk. */ #define RC_INIT_BYTES 5 /* * Minimum number of usable input buffer to safely decode one LZMA symbol. * The worst case is that we decode 22 bits using probabilities and 26 * direct bits. This may decode at maximum of 20 bytes of input. However, * lzma_main() does an extra normalization before returning, thus we * need to put 21 here. 
*/ #define LZMA_IN_REQUIRED 21 /* * Dictionary (history buffer) * * These are always true: * start <= pos <= full <= end * pos <= limit <= end * * In multi-call mode, also these are true: * end == size * size <= size_max * allocated <= size * * Most of these variables are size_t to support single-call mode, * in which the dictionary variables address the actual output * buffer directly. */ struct dictionary { /* Beginning of the history buffer */ uint8_t *buf; /* Old position in buf (before decoding more data) */ size_t start; /* Position in buf */ size_t pos; /* * How full dictionary is. This is used to detect corrupt input that * would read beyond the beginning of the uncompressed stream. */ size_t full; /* Write limit; we don't write to buf[limit] or later bytes. */ size_t limit; /* * End of the dictionary buffer. In multi-call mode, this is * the same as the dictionary size. In single-call mode, this * indicates the size of the output buffer. */ size_t end; /* * Size of the dictionary as specified in Block Header. This is used * together with "full" to detect corrupt input that would make us * read beyond the beginning of the uncompressed stream. */ uint32_t size; /* * Maximum allowed dictionary size in multi-call mode. * This is ignored in single-call mode. */ uint32_t size_max; /* * Amount of memory currently allocated for the dictionary. * This is used only with XZ_DYNALLOC. (With XZ_PREALLOC, * size_max is always the same as the allocated size.) */ uint32_t allocated; /* Operation mode */ enum xz_mode mode; }; /* Range decoder */ struct rc_dec { uint32_t range; uint32_t code; /* * Number of initializing bytes remaining to be read * by rc_read_init(). */ uint32_t init_bytes_left; /* * Buffer from which we read our input. It can be either * temp.buf or the caller-provided input buffer. */ const uint8_t *in; size_t in_pos; size_t in_limit; }; /* Probabilities for a length decoder. */ struct lzma_len_dec { /* Probability of match length being at least 10 */ uint16_t choice; /* Probability of match length being at least 18 */ uint16_t choice2; /* Probabilities for match lengths 2-9 */ uint16_t low[POS_STATES_MAX][LEN_LOW_SYMBOLS]; /* Probabilities for match lengths 10-17 */ uint16_t mid[POS_STATES_MAX][LEN_MID_SYMBOLS]; /* Probabilities for match lengths 18-273 */ uint16_t high[LEN_HIGH_SYMBOLS]; }; struct lzma_dec { /* Distances of latest four matches */ uint32_t rep0; uint32_t rep1; uint32_t rep2; uint32_t rep3; /* Types of the most recently seen LZMA symbols */ enum lzma_state state; /* * Length of a match. This is updated so that dict_repeat can * be called again to finish repeating the whole match. */ uint32_t len; /* * LZMA properties or related bit masks (number of literal * context bits, a mask dervied from the number of literal * position bits, and a mask dervied from the number * position bits) */ uint32_t lc; uint32_t literal_pos_mask; /* (1 << lp) - 1 */ uint32_t pos_mask; /* (1 << pb) - 1 */ /* If 1, it's a match. Otherwise it's a single 8-bit literal. */ uint16_t is_match[STATES][POS_STATES_MAX]; /* If 1, it's a repeated match. The distance is one of rep0 .. rep3. */ uint16_t is_rep[STATES]; /* * If 0, distance of a repeated match is rep0. * Otherwise check is_rep1. */ uint16_t is_rep0[STATES]; /* * If 0, distance of a repeated match is rep1. * Otherwise check is_rep2. */ uint16_t is_rep1[STATES]; /* If 0, distance of a repeated match is rep2. Otherwise it is rep3. */ uint16_t is_rep2[STATES]; /* * If 1, the repeated match has length of one byte. 
Otherwise * the length is decoded from rep_len_decoder. */ uint16_t is_rep0_long[STATES][POS_STATES_MAX]; /* * Probability tree for the highest two bits of the match * distance. There is a separate probability tree for match * lengths of 2 (i.e. MATCH_LEN_MIN), 3, 4, and [5, 273]. */ uint16_t dist_slot[DIST_STATES][DIST_SLOTS]; /* * Probility trees for additional bits for match distance * when the distance is in the range [4, 127]. */ uint16_t dist_special[FULL_DISTANCES - DIST_MODEL_END]; /* * Probability tree for the lowest four bits of a match * distance that is equal to or greater than 128. */ uint16_t dist_align[ALIGN_SIZE]; /* Length of a normal match */ struct lzma_len_dec match_len_dec; /* Length of a repeated match */ struct lzma_len_dec rep_len_dec; /* Probabilities of literals */ uint16_t literal[LITERAL_CODERS_MAX][LITERAL_CODER_SIZE]; }; struct lzma2_dec { /* Position in xz_dec_lzma2_run(). */ enum lzma2_seq { SEQ_CONTROL, SEQ_UNCOMPRESSED_1, SEQ_UNCOMPRESSED_2, SEQ_COMPRESSED_0, SEQ_COMPRESSED_1, SEQ_PROPERTIES, SEQ_LZMA_PREPARE, SEQ_LZMA_RUN, SEQ_COPY } sequence; /* Next position after decoding the compressed size of the chunk. */ enum lzma2_seq next_sequence; /* Uncompressed size of LZMA chunk (2 MiB at maximum) */ uint32_t uncompressed; /* * Compressed size of LZMA chunk or compressed/uncompressed * size of uncompressed chunk (64 KiB at maximum) */ uint32_t compressed; /* * True if dictionary reset is needed. This is false before * the first chunk (LZMA or uncompressed). */ bool_t need_dict_reset; /* * True if new LZMA properties are needed. This is false * before the first LZMA chunk. */ bool_t need_props; }; struct xz_dec_lzma2 { /* * The order below is important on x86 to reduce code size and * it shouldn't hurt on other platforms. Everything up to and * including lzma.pos_mask are in the first 128 bytes on x86-32, * which allows using smaller instructions to access those * variables. On x86-64, fewer variables fit into the first 128 * bytes, but this is still the best order without sacrificing * the readability by splitting the structures. */ struct rc_dec rc; struct dictionary dict; struct lzma2_dec lzma2; struct lzma_dec lzma; /* * Temporary buffer which holds small number of input bytes between * decoder calls. See lzma2_lzma() for details. */ struct { uint32_t size; uint8_t buf[3 * LZMA_IN_REQUIRED]; } temp; }; /************** * Dictionary * **************/ /* * Reset the dictionary state. When in single-call mode, set up the beginning * of the dictionary to point to the actual output buffer. */ static void INIT dict_reset(struct dictionary *dict, struct xz_buf *b) { if (DEC_IS_SINGLE(dict->mode)) { dict->buf = b->out + b->out_pos; dict->end = b->out_size - b->out_pos; } dict->start = 0; dict->pos = 0; dict->limit = 0; dict->full = 0; } /* Set dictionary write limit */ static void INIT dict_limit(struct dictionary *dict, size_t out_max) { if (dict->end - dict->pos <= out_max) dict->limit = dict->end; else dict->limit = dict->pos + out_max; } /* Return true if at least one byte can be written into the dictionary. */ static inline bool_t INIT dict_has_space(const struct dictionary *dict) { return dict->pos < dict->limit; } /* * Get a byte from the dictionary at the given distance. The distance is * assumed to valid, or as a special case, zero when the dictionary is * still empty. This special case is needed for single-call decoding to * avoid writing a '\0' to the end of the destination buffer. 
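 *
 * As a worked example of the circular addressing used below: with
 * dict->pos == 3, dict->end == 8 and dist == 5, the unsigned arithmetic
 * wraps so that offset ends up as 3 - 5 - 1 + 8 = 5, i.e. the byte written
 * six positions ago. With dist == 0 the byte at dict->pos - 1 (the most
 * recently written byte) is returned.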
*/ static inline uint32_t INIT dict_get(const struct dictionary *dict, uint32_t dist) { size_t offset = dict->pos - dist - 1; if (dist >= dict->pos) offset += dict->end; return dict->full > 0 ? dict->buf[offset] : 0; } /* * Put one byte into the dictionary. It is assumed that there is space for it. */ static inline void INIT dict_put(struct dictionary *dict, uint8_t byte) { dict->buf[dict->pos++] = byte; if (dict->full < dict->pos) dict->full = dict->pos; } /* * Repeat given number of bytes from the given distance. If the distance is * invalid, false is returned. On success, true is returned and *len is * updated to indicate how many bytes were left to be repeated. */ static bool_t INIT dict_repeat(struct dictionary *dict, uint32_t *len, uint32_t dist) { size_t back; uint32_t left; if (dist >= dict->full || dist >= dict->size) return false; left = min_t(size_t, dict->limit - dict->pos, *len); *len -= left; back = dict->pos - dist - 1; if (dist >= dict->pos) back += dict->end; do { dict->buf[dict->pos++] = dict->buf[back++]; if (back == dict->end) back = 0; } while (--left > 0); if (dict->full < dict->pos) dict->full = dict->pos; return true; } /* Copy uncompressed data as is from input to dictionary and output buffers. */ static void INIT dict_uncompressed(struct dictionary *dict, struct xz_buf *b, uint32_t *left) { size_t copy_size; while (*left > 0 && b->in_pos < b->in_size && b->out_pos < b->out_size) { copy_size = min(b->in_size - b->in_pos, b->out_size - b->out_pos); if (copy_size > dict->end - dict->pos) copy_size = dict->end - dict->pos; if (copy_size > *left) copy_size = *left; *left -= copy_size; memcpy(dict->buf + dict->pos, b->in + b->in_pos, copy_size); dict->pos += copy_size; if (dict->full < dict->pos) dict->full = dict->pos; if (DEC_IS_MULTI(dict->mode)) { if (dict->pos == dict->end) dict->pos = 0; memcpy(b->out + b->out_pos, b->in + b->in_pos, copy_size); } dict->start = dict->pos; b->out_pos += copy_size; b->in_pos += copy_size; } } /* * Flush pending data from dictionary to b->out. It is assumed that there is * enough space in b->out. This is guaranteed because caller uses dict_limit() * before decoding data into the dictionary. */ static uint32_t INIT dict_flush(struct dictionary *dict, struct xz_buf *b) { size_t copy_size = dict->pos - dict->start; if (DEC_IS_MULTI(dict->mode)) { if (dict->pos == dict->end) dict->pos = 0; memcpy(b->out + b->out_pos, dict->buf + dict->start, copy_size); } dict->start = dict->pos; b->out_pos += copy_size; return copy_size; } /***************** * Range decoder * *****************/ /* Reset the range decoder. */ static void INIT rc_reset(struct rc_dec *rc) { rc->range = (uint32_t)-1; rc->code = 0; rc->init_bytes_left = RC_INIT_BYTES; } /* * Read the first five initial bytes into rc->code if they haven't been * read already. (Yes, the first byte gets completely ignored.) */ static bool_t INIT rc_read_init(struct rc_dec *rc, struct xz_buf *b) { while (rc->init_bytes_left > 0) { if (b->in_pos == b->in_size) return false; rc->code = (rc->code << 8) + b->in[b->in_pos++]; --rc->init_bytes_left; } return true; } /* Return true if there may not be enough input for the next decoding loop. */ static inline bool_t INIT rc_limit_exceeded(const struct rc_dec *rc) { return rc->in_pos > rc->in_limit; } /* * Return true if it is possible (from point of view of range decoder) that * we have reached the end of the LZMA chunk. */ static inline bool_t INIT rc_is_finished(const struct rc_dec *rc) { return rc->code == 0; } /* Read the next input byte if needed. 
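 *
 * The range decoder keeps at least RC_TOP_BITS (24) bits of precision in
 * rc->range: whenever range drops below RC_TOP_VALUE (1 << 24), both range
 * and code are shifted left by RC_SHIFT_BITS (8) and one more input byte is
 * pulled into the low bits of code. At most one byte is consumed per call,
 * which is the property the LZMA_IN_REQUIRED input bound is built on.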
*/ static always_inline void rc_normalize(struct rc_dec *rc) { if (rc->range < RC_TOP_VALUE) { rc->range <<= RC_SHIFT_BITS; rc->code = (rc->code << RC_SHIFT_BITS) + rc->in[rc->in_pos++]; } } /* * Decode one bit. In some versions, this function has been splitted in three * functions so that the compiler is supposed to be able to more easily avoid * an extra branch. In this particular version of the LZMA decoder, this * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3 * on x86). Using a non-splitted version results in nicer looking code too. * * NOTE: This must return an int. Do not make it return a bool or the speed * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care, * and it generates 10-20 % faster code than GCC 3.x from this file anyway.) */ static always_inline int rc_bit(struct rc_dec *rc, uint16_t *prob) { uint32_t bound; int bit; rc_normalize(rc); bound = (rc->range >> RC_BIT_MODEL_TOTAL_BITS) * *prob; if (rc->code < bound) { rc->range = bound; *prob += (RC_BIT_MODEL_TOTAL - *prob) >> RC_MOVE_BITS; bit = 0; } else { rc->range -= bound; rc->code -= bound; *prob -= *prob >> RC_MOVE_BITS; bit = 1; } return bit; } /* Decode a bittree starting from the most significant bit. */ static always_inline uint32_t rc_bittree(struct rc_dec *rc, uint16_t *probs, uint32_t limit) { uint32_t symbol = 1; do { if (rc_bit(rc, &probs[symbol])) symbol = (symbol << 1) + 1; else symbol <<= 1; } while (symbol < limit); return symbol; } /* Decode a bittree starting from the least significant bit. */ static always_inline void rc_bittree_reverse(struct rc_dec *rc, uint16_t *probs, uint32_t *dest, uint32_t limit) { uint32_t symbol = 1; uint32_t i = 0; do { if (rc_bit(rc, &probs[symbol])) { symbol = (symbol << 1) + 1; *dest += 1 << i; } else { symbol <<= 1; } } while (++i < limit); } /* Decode direct bits (fixed fifty-fifty probability) */ static inline void INIT rc_direct(struct rc_dec *rc, uint32_t *dest, uint32_t limit) { uint32_t mask; do { rc_normalize(rc); rc->range >>= 1; rc->code -= rc->range; mask = (uint32_t)0 - (rc->code >> 31); rc->code += rc->range & mask; *dest = (*dest << 1) + (mask + 1); } while (--limit > 0); } /******** * LZMA * ********/ /* Get pointer to literal coder probability array. */ static uint16_t *INIT lzma_literal_probs(struct xz_dec_lzma2 *s) { uint32_t prev_byte = dict_get(&s->dict, 0); uint32_t low = prev_byte >> (8 - s->lzma.lc); uint32_t high = (s->dict.pos & s->lzma.literal_pos_mask) << s->lzma.lc; return s->lzma.literal[low + high]; } /* Decode a literal (one 8-bit byte) */ static void INIT lzma_literal(struct xz_dec_lzma2 *s) { uint16_t *probs; uint32_t symbol; uint32_t match_byte; uint32_t match_bit; uint32_t offset; uint32_t i; probs = lzma_literal_probs(s); if (lzma_state_is_literal(s->lzma.state)) { symbol = rc_bittree(&s->rc, probs, 0x100); } else { symbol = 1; match_byte = dict_get(&s->dict, s->lzma.rep0) << 1; offset = 0x100; do { match_bit = match_byte & offset; match_byte <<= 1; i = offset + match_bit + symbol; if (rc_bit(&s->rc, &probs[i])) { symbol = (symbol << 1) + 1; offset &= match_bit; } else { symbol <<= 1; offset &= ~match_bit; } } while (symbol < 0x100); } dict_put(&s->dict, (uint8_t)symbol); lzma_state_literal(&s->lzma.state); } /* Decode the length of the match into s->lzma.len. 
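 *
 * Following the length encoding described in lzma2.h, as an illustration:
 * if the choice bit decodes to 1 and choice2 to 0, the mid table is used,
 * the base length becomes MATCH_LEN_MIN + LEN_MID_SYMBOLS... rather, the
 * base is MATCH_LEN_MIN + LEN_LOW_SYMBOLS = 10, and the following three
 * bittree bits (say, value 5) give s->lzma.len = 15.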
*/ static void INIT lzma_len(struct xz_dec_lzma2 *s, struct lzma_len_dec *l, uint32_t pos_state) { uint16_t *probs; uint32_t limit; if (!rc_bit(&s->rc, &l->choice)) { probs = l->low[pos_state]; limit = LEN_LOW_SYMBOLS; s->lzma.len = MATCH_LEN_MIN; } else { if (!rc_bit(&s->rc, &l->choice2)) { probs = l->mid[pos_state]; limit = LEN_MID_SYMBOLS; s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS; } else { probs = l->high; limit = LEN_HIGH_SYMBOLS; s->lzma.len = MATCH_LEN_MIN + LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; } } s->lzma.len += rc_bittree(&s->rc, probs, limit) - limit; } /* Decode a match. The distance will be stored in s->lzma.rep0. */ static void INIT lzma_match(struct xz_dec_lzma2 *s, uint32_t pos_state) { uint16_t *probs; uint32_t dist_slot; uint32_t limit; lzma_state_match(&s->lzma.state); s->lzma.rep3 = s->lzma.rep2; s->lzma.rep2 = s->lzma.rep1; s->lzma.rep1 = s->lzma.rep0; lzma_len(s, &s->lzma.match_len_dec, pos_state); probs = s->lzma.dist_slot[lzma_get_dist_state(s->lzma.len)]; dist_slot = rc_bittree(&s->rc, probs, DIST_SLOTS) - DIST_SLOTS; if (dist_slot < DIST_MODEL_START) { s->lzma.rep0 = dist_slot; } else { limit = (dist_slot >> 1) - 1; s->lzma.rep0 = 2 + (dist_slot & 1); if (dist_slot < DIST_MODEL_END) { s->lzma.rep0 <<= limit; probs = s->lzma.dist_special + s->lzma.rep0 - dist_slot - 1; rc_bittree_reverse(&s->rc, probs, &s->lzma.rep0, limit); } else { rc_direct(&s->rc, &s->lzma.rep0, limit - ALIGN_BITS); s->lzma.rep0 <<= ALIGN_BITS; rc_bittree_reverse(&s->rc, s->lzma.dist_align, &s->lzma.rep0, ALIGN_BITS); } } } /* * Decode a repeated match. The distance is one of the four most recently * seen matches. The distance will be stored in s->lzma.rep0. */ static void INIT lzma_rep_match(struct xz_dec_lzma2 *s, uint32_t pos_state) { uint32_t tmp; if (!rc_bit(&s->rc, &s->lzma.is_rep0[s->lzma.state])) { if (!rc_bit(&s->rc, &s->lzma.is_rep0_long[ s->lzma.state][pos_state])) { lzma_state_short_rep(&s->lzma.state); s->lzma.len = 1; return; } } else { if (!rc_bit(&s->rc, &s->lzma.is_rep1[s->lzma.state])) { tmp = s->lzma.rep1; } else { if (!rc_bit(&s->rc, &s->lzma.is_rep2[s->lzma.state])) { tmp = s->lzma.rep2; } else { tmp = s->lzma.rep3; s->lzma.rep3 = s->lzma.rep2; } s->lzma.rep2 = s->lzma.rep1; } s->lzma.rep1 = s->lzma.rep0; s->lzma.rep0 = tmp; } lzma_state_long_rep(&s->lzma.state); lzma_len(s, &s->lzma.rep_len_dec, pos_state); } /* LZMA decoder core */ static bool_t INIT lzma_main(struct xz_dec_lzma2 *s) { uint32_t pos_state; /* * If the dictionary was reached during the previous call, try to * finish the possibly pending repeat in the dictionary. */ if (dict_has_space(&s->dict) && s->lzma.len > 0) dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0); /* * Decode more LZMA symbols. One iteration may consume up to * LZMA_IN_REQUIRED - 1 bytes. */ while (dict_has_space(&s->dict) && !rc_limit_exceeded(&s->rc)) { pos_state = s->dict.pos & s->lzma.pos_mask; if (!rc_bit(&s->rc, &s->lzma.is_match[ s->lzma.state][pos_state])) { lzma_literal(s); } else { if (rc_bit(&s->rc, &s->lzma.is_rep[s->lzma.state])) lzma_rep_match(s, pos_state); else lzma_match(s, pos_state); if (!dict_repeat(&s->dict, &s->lzma.len, s->lzma.rep0)) return false; } } /* * Having the range decoder always normalized when we are outside * this function makes it easier to correctly handle end of the chunk. */ rc_normalize(&s->rc); return true; } /* * Reset the LZMA decoder and range decoder state. Dictionary is nore reset * here, because LZMA state may be reset without resetting the dictionary. 
*/ static void INIT lzma_reset(struct xz_dec_lzma2 *s) { uint16_t *probs; size_t i; s->lzma.state = STATE_LIT_LIT; s->lzma.rep0 = 0; s->lzma.rep1 = 0; s->lzma.rep2 = 0; s->lzma.rep3 = 0; /* * All probabilities are initialized to the same value. This hack * makes the code smaller by avoiding a separate loop for each * probability array. * * This could be optimized so that only that part of literal * probabilities that are actually required. In the common case * we would write 12 KiB less. */ probs = s->lzma.is_match[0]; for (i = 0; i < PROBS_TOTAL; ++i) probs[i] = RC_BIT_MODEL_TOTAL / 2; rc_reset(&s->rc); } /* * Decode and validate LZMA properties (lc/lp/pb) and calculate the bit masks * from the decoded lp and pb values. On success, the LZMA decoder state is * reset and true is returned. */ static bool_t INIT lzma_props(struct xz_dec_lzma2 *s, uint8_t props) { if (props > (4 * 5 + 4) * 9 + 8) return false; s->lzma.pos_mask = 0; while (props >= 9 * 5) { props -= 9 * 5; ++s->lzma.pos_mask; } s->lzma.pos_mask = (1 << s->lzma.pos_mask) - 1; s->lzma.literal_pos_mask = 0; while (props >= 9) { props -= 9; ++s->lzma.literal_pos_mask; } s->lzma.lc = props; if (s->lzma.lc + s->lzma.literal_pos_mask > 4) return false; s->lzma.literal_pos_mask = (1 << s->lzma.literal_pos_mask) - 1; lzma_reset(s); return true; } /********* * LZMA2 * *********/ /* * The LZMA decoder assumes that if the input limit (s->rc.in_limit) hasn't * been exceeded, it is safe to read up to LZMA_IN_REQUIRED bytes. This * wrapper function takes care of making the LZMA decoder's assumption safe. * * As long as there is plenty of input left to be decoded in the current LZMA * chunk, we decode directly from the caller-supplied input buffer until * there's LZMA_IN_REQUIRED bytes left. Those remaining bytes are copied into * s->temp.buf, which (hopefully) gets filled on the next call to this * function. We decode a few bytes from the temporary buffer so that we can * continue decoding from the caller-supplied input buffer again. 
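 *
 * For example, if only 12 bytes of the current chunk are left in b->in
 * (fewer than LZMA_IN_REQUIRED), they are stashed in s->temp.buf and the
 * function returns without decoding them. On the next call, assuming enough
 * new input has arrived, the fresh bytes are appended after those 12,
 * decoding restarts from temp.buf, and then switches back to reading b->in
 * directly.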
*/ static bool_t INIT lzma2_lzma(struct xz_dec_lzma2 *s, struct xz_buf *b) { size_t in_avail; uint32_t tmp; in_avail = b->in_size - b->in_pos; if (s->temp.size > 0 || s->lzma2.compressed == 0) { tmp = 2 * LZMA_IN_REQUIRED - s->temp.size; if (tmp > s->lzma2.compressed - s->temp.size) tmp = s->lzma2.compressed - s->temp.size; if (tmp > in_avail) tmp = in_avail; memcpy(s->temp.buf + s->temp.size, b->in + b->in_pos, tmp); if (s->temp.size + tmp == s->lzma2.compressed) { memzero(s->temp.buf + s->temp.size + tmp, sizeof(s->temp.buf) - s->temp.size - tmp); s->rc.in_limit = s->temp.size + tmp; } else if (s->temp.size + tmp < LZMA_IN_REQUIRED) { s->temp.size += tmp; b->in_pos += tmp; return true; } else { s->rc.in_limit = s->temp.size + tmp - LZMA_IN_REQUIRED; } s->rc.in = s->temp.buf; s->rc.in_pos = 0; if (!lzma_main(s) || s->rc.in_pos > s->temp.size + tmp) return false; s->lzma2.compressed -= s->rc.in_pos; if (s->rc.in_pos < s->temp.size) { s->temp.size -= s->rc.in_pos; memmove(s->temp.buf, s->temp.buf + s->rc.in_pos, s->temp.size); return true; } b->in_pos += s->rc.in_pos - s->temp.size; s->temp.size = 0; } in_avail = b->in_size - b->in_pos; if (in_avail >= LZMA_IN_REQUIRED) { s->rc.in = b->in; s->rc.in_pos = b->in_pos; if (in_avail >= s->lzma2.compressed + LZMA_IN_REQUIRED) s->rc.in_limit = b->in_pos + s->lzma2.compressed; else s->rc.in_limit = b->in_size - LZMA_IN_REQUIRED; if (!lzma_main(s)) return false; in_avail = s->rc.in_pos - b->in_pos; if (in_avail > s->lzma2.compressed) return false; s->lzma2.compressed -= in_avail; b->in_pos = s->rc.in_pos; } in_avail = b->in_size - b->in_pos; if (in_avail < LZMA_IN_REQUIRED) { if (in_avail > s->lzma2.compressed) in_avail = s->lzma2.compressed; memcpy(s->temp.buf, b->in + b->in_pos, in_avail); s->temp.size = in_avail; b->in_pos += in_avail; } return true; } /* * Take care of the LZMA2 control layer, and forward the job of actual LZMA * decoding or copying of uncompressed chunks to other functions. */ XZ_EXTERN enum xz_ret INIT xz_dec_lzma2_run(struct xz_dec_lzma2 *s, struct xz_buf *b) { uint32_t tmp; while (b->in_pos < b->in_size || s->lzma2.sequence == SEQ_LZMA_RUN) { switch (s->lzma2.sequence) { case SEQ_CONTROL: /* * LZMA2 control byte * * Exact values: * 0x00 End marker * 0x01 Dictionary reset followed by * an uncompressed chunk * 0x02 Uncompressed chunk (no dictionary reset) * * Highest three bits (s->control & 0xE0): * 0xE0 Dictionary reset, new properties and state * reset, followed by LZMA compressed chunk * 0xC0 New properties and state reset, followed * by LZMA compressed chunk (no dictionary * reset) * 0xA0 State reset using old properties, * followed by LZMA compressed chunk (no * dictionary reset) * 0x80 LZMA chunk (no dictionary or state reset) * * For LZMA compressed chunks, the lowest five bits * (s->control & 1F) are the highest bits of the * uncompressed size (bits 16-20). * * A new LZMA2 stream must begin with a dictionary * reset. The first LZMA chunk must set new * properties and reset the LZMA state. * * Values that don't match anything described above * are invalid and we return XZ_DATA_ERROR. 
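 *
 * As a worked example (illustrative values): the byte sequence
 * 0x80 0x00 0xFF 0x00 0x7F describes an LZMA chunk with no dictionary or
 * state reset, uncompressed size ((0x80 & 0x1F) << 16) + (0x00 << 8) + 0xFF
 * + 1 = 256 bytes and compressed size (0x00 << 8) + 0x7F + 1 = 128 bytes;
 * such a chunk is only valid after an earlier chunk has already reset the
 * dictionary and set the LZMA properties.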
*/ tmp = b->in[b->in_pos++]; if (tmp == 0x00) return XZ_STREAM_END; if (tmp >= 0xE0 || tmp == 0x01) { s->lzma2.need_props = true; s->lzma2.need_dict_reset = false; dict_reset(&s->dict, b); } else if (s->lzma2.need_dict_reset) { return XZ_DATA_ERROR; } if (tmp >= 0x80) { s->lzma2.uncompressed = (tmp & 0x1F) << 16; s->lzma2.sequence = SEQ_UNCOMPRESSED_1; if (tmp >= 0xC0) { /* * When there are new properties, * state reset is done at * SEQ_PROPERTIES. */ s->lzma2.need_props = false; s->lzma2.next_sequence = SEQ_PROPERTIES; } else if (s->lzma2.need_props) { return XZ_DATA_ERROR; } else { s->lzma2.next_sequence = SEQ_LZMA_PREPARE; if (tmp >= 0xA0) lzma_reset(s); } } else { if (tmp > 0x02) return XZ_DATA_ERROR; s->lzma2.sequence = SEQ_COMPRESSED_0; s->lzma2.next_sequence = SEQ_COPY; } break; case SEQ_UNCOMPRESSED_1: s->lzma2.uncompressed += (uint32_t)b->in[b->in_pos++] << 8; s->lzma2.sequence = SEQ_UNCOMPRESSED_2; break; case SEQ_UNCOMPRESSED_2: s->lzma2.uncompressed += (uint32_t)b->in[b->in_pos++] + 1; s->lzma2.sequence = SEQ_COMPRESSED_0; break; case SEQ_COMPRESSED_0: s->lzma2.compressed = (uint32_t)b->in[b->in_pos++] << 8; s->lzma2.sequence = SEQ_COMPRESSED_1; break; case SEQ_COMPRESSED_1: s->lzma2.compressed += (uint32_t)b->in[b->in_pos++] + 1; s->lzma2.sequence = s->lzma2.next_sequence; break; case SEQ_PROPERTIES: if (!lzma_props(s, b->in[b->in_pos++])) return XZ_DATA_ERROR; s->lzma2.sequence = SEQ_LZMA_PREPARE; case SEQ_LZMA_PREPARE: if (s->lzma2.compressed < RC_INIT_BYTES) return XZ_DATA_ERROR; if (!rc_read_init(&s->rc, b)) return XZ_OK; s->lzma2.compressed -= RC_INIT_BYTES; s->lzma2.sequence = SEQ_LZMA_RUN; case SEQ_LZMA_RUN: /* * Set dictionary limit to indicate how much we want * to be encoded at maximum. Decode new data into the * dictionary. Flush the new data from dictionary to * b->out. Check if we finished decoding this chunk. * In case the dictionary got full but we didn't fill * the output buffer yet, we may run this loop * multiple times without changing s->lzma2.sequence. */ dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos, s->lzma2.uncompressed)); if (!lzma2_lzma(s, b)) return XZ_DATA_ERROR; s->lzma2.uncompressed -= dict_flush(&s->dict, b); if (s->lzma2.uncompressed == 0) { if (s->lzma2.compressed > 0 || s->lzma.len > 0 || !rc_is_finished(&s->rc)) return XZ_DATA_ERROR; rc_reset(&s->rc); s->lzma2.sequence = SEQ_CONTROL; } else if (b->out_pos == b->out_size || (b->in_pos == b->in_size && s->temp.size < s->lzma2.compressed)) { return XZ_OK; } break; case SEQ_COPY: dict_uncompressed(&s->dict, b, &s->lzma2.compressed); if (s->lzma2.compressed > 0) return XZ_OK; s->lzma2.sequence = SEQ_CONTROL; break; } } return XZ_OK; } XZ_EXTERN struct xz_dec_lzma2 *INIT xz_dec_lzma2_create(enum xz_mode mode, uint32_t dict_max) { struct xz_dec_lzma2 *s = malloc(sizeof(*s)); if (s == NULL) return NULL; s->dict.mode = mode; s->dict.size_max = dict_max; if (DEC_IS_PREALLOC(mode)) { s->dict.buf = large_malloc(dict_max); if (s->dict.buf == NULL) { free(s); return NULL; } } else if (DEC_IS_DYNALLOC(mode)) { s->dict.buf = NULL; s->dict.allocated = 0; } return s; } XZ_EXTERN enum xz_ret INIT xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) { /* This limits dictionary size to 3 GiB to keep parsing simpler. 
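 *
 * The dictionary size below is decoded as
 * (2 + (props & 1)) << ((props >> 1) + 11), so for example props == 0
 * gives 4 KiB, props == 22 gives 8 MiB, and props == 39 gives the 3 GiB
 * maximum accepted here.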
*/ if (props > 39) return XZ_OPTIONS_ERROR; s->dict.size = 2 + (props & 1); s->dict.size <<= (props >> 1) + 11; if (DEC_IS_MULTI(s->dict.mode)) { if (s->dict.size > s->dict.size_max) return XZ_MEMLIMIT_ERROR; s->dict.end = s->dict.size; if (DEC_IS_DYNALLOC(s->dict.mode)) { if (s->dict.allocated < s->dict.size) { large_free(s->dict.buf); s->dict.buf = large_malloc(s->dict.size); if (s->dict.buf == NULL) { s->dict.allocated = 0; return XZ_MEM_ERROR; } } } } s->lzma.len = 0; s->lzma2.sequence = SEQ_CONTROL; s->lzma2.need_dict_reset = true; s->temp.size = 0; return XZ_OK; } XZ_EXTERN void INIT xz_dec_lzma2_end(struct xz_dec_lzma2 *s) { if (DEC_IS_MULTI(s->dict.mode)) large_free(s->dict.buf); free(s); } xen-4.4.0/xen/common/xz/crc32.c0000664000175000017500000000207212307313555014247 0ustar smbsmb/* * CRC32 using the polynomial from IEEE-802.3 * * Authors: Lasse Collin * Igor Pavlov * * This file has been put into the public domain. * You can do whatever you want with this file. */ /* * This is not the fastest implementation, but it is pretty compact. * The fastest versions of xz_crc32() on modern CPUs without hardware * accelerated CRC instruction are 3-5 times as fast as this version, * but they are bigger and use more memory for the lookup table. */ #include "private.h" XZ_EXTERN uint32_t INITDATA xz_crc32_table[256]; XZ_EXTERN void INIT xz_crc32_init(void) { const uint32_t poly = 0xEDB88320; uint32_t i; uint32_t j; uint32_t r; for (i = 0; i < 256; ++i) { r = i; for (j = 0; j < 8; ++j) r = (r >> 1) ^ (poly & ~((r & 1) - 1)); xz_crc32_table[i] = r; } return; } XZ_EXTERN uint32_t INIT xz_crc32(const uint8_t *buf, size_t size, uint32_t crc) { crc = ~crc; while (size != 0) { crc = xz_crc32_table[*buf++ ^ (crc & 0xFF)] ^ (crc >> 8); --size; } return ~crc; } xen-4.4.0/xen/common/xz/lzma2.h0000664000175000017500000001400012307313555014357 0ustar smbsmb/* * LZMA2 definitions * * Authors: Lasse Collin * Igor Pavlov * * This file has been put into the public domain. * You can do whatever you want with this file. */ #ifndef XZ_LZMA2_H #define XZ_LZMA2_H /* Range coder constants */ #define RC_SHIFT_BITS 8 #define RC_TOP_BITS 24 #define RC_TOP_VALUE (1 << RC_TOP_BITS) #define RC_BIT_MODEL_TOTAL_BITS 11 #define RC_BIT_MODEL_TOTAL (1 << RC_BIT_MODEL_TOTAL_BITS) #define RC_MOVE_BITS 5 /* * Maximum number of position states. A position state is the lowest pb * number of bits of the current uncompressed offset. In some places there * are different sets of probabilities for different position states. */ #define POS_STATES_MAX (1 << 4) /* * This enum is used to track which LZMA symbols have occurred most recently * and in which order. This information is used to predict the next symbol. * * Symbols: * - Literal: One 8-bit byte * - Match: Repeat a chunk of data at some distance * - Long repeat: Multi-byte match at a recently seen distance * - Short repeat: One-byte repeat at a recently seen distance * * The symbol names are in from STATE_oldest_older_previous. REP means * either short or long repeated match, and NONLIT means any non-literal. */ enum lzma_state { STATE_LIT_LIT, STATE_MATCH_LIT_LIT, STATE_REP_LIT_LIT, STATE_SHORTREP_LIT_LIT, STATE_MATCH_LIT, STATE_REP_LIT, STATE_SHORTREP_LIT, STATE_LIT_MATCH, STATE_LIT_LONGREP, STATE_LIT_SHORTREP, STATE_NONLIT_MATCH, STATE_NONLIT_REP }; /* Total number of states */ #define STATES 12 /* The lowest 7 states indicate that the previous state was a literal. */ #define LIT_STATES 7 /* Indicate that the latest symbol was a literal. 
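 *
 * For example, STATE_LIT_MATCH (a match preceded by a literal) becomes
 * STATE_MATCH_LIT after a literal is decoded, and any of the *_LIT_LIT
 * states collapse to STATE_LIT_LIT.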
*/ static inline void INIT lzma_state_literal(enum lzma_state *state) { if (*state <= STATE_SHORTREP_LIT_LIT) *state = STATE_LIT_LIT; else if (*state <= STATE_LIT_SHORTREP) *state -= 3; else *state -= 6; } /* Indicate that the latest symbol was a match. */ static inline void INIT lzma_state_match(enum lzma_state *state) { *state = *state < LIT_STATES ? STATE_LIT_MATCH : STATE_NONLIT_MATCH; } /* Indicate that the latest state was a long repeated match. */ static inline void INIT lzma_state_long_rep(enum lzma_state *state) { *state = *state < LIT_STATES ? STATE_LIT_LONGREP : STATE_NONLIT_REP; } /* Indicate that the latest symbol was a short match. */ static inline void INIT lzma_state_short_rep(enum lzma_state *state) { *state = *state < LIT_STATES ? STATE_LIT_SHORTREP : STATE_NONLIT_REP; } /* Test if the previous symbol was a literal. */ static inline bool_t INIT lzma_state_is_literal(enum lzma_state state) { return state < LIT_STATES; } /* Each literal coder is divided in three sections: * - 0x001-0x0FF: Without match byte * - 0x101-0x1FF: With match byte; match bit is 0 * - 0x201-0x2FF: With match byte; match bit is 1 * * Match byte is used when the previous LZMA symbol was something else than * a literal (that is, it was some kind of match). */ #define LITERAL_CODER_SIZE 0x300 /* Maximum number of literal coders */ #define LITERAL_CODERS_MAX (1 << 4) /* Minimum length of a match is two bytes. */ #define MATCH_LEN_MIN 2 /* Match length is encoded with 4, 5, or 10 bits. * * Length Bits * 2-9 4 = Choice=0 + 3 bits * 10-17 5 = Choice=1 + Choice2=0 + 3 bits * 18-273 10 = Choice=1 + Choice2=1 + 8 bits */ #define LEN_LOW_BITS 3 #define LEN_LOW_SYMBOLS (1 << LEN_LOW_BITS) #define LEN_MID_BITS 3 #define LEN_MID_SYMBOLS (1 << LEN_MID_BITS) #define LEN_HIGH_BITS 8 #define LEN_HIGH_SYMBOLS (1 << LEN_HIGH_BITS) #define LEN_SYMBOLS (LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS + LEN_HIGH_SYMBOLS) /* * Maximum length of a match is 273 which is a result of the encoding * described above. */ #define MATCH_LEN_MAX (MATCH_LEN_MIN + LEN_SYMBOLS - 1) /* * Different sets of probabilities are used for match distances that have * very short match length: Lengths of 2, 3, and 4 bytes have a separate * set of probabilities for each length. The matches with longer length * use a shared set of probabilities. */ #define DIST_STATES 4 /* * Get the index of the appropriate probability array for decoding * the distance slot. */ static inline uint32_t INIT lzma_get_dist_state(uint32_t len) { return len < DIST_STATES + MATCH_LEN_MIN ? len - MATCH_LEN_MIN : DIST_STATES - 1; } /* * The highest two bits of a 32-bit match distance are encoded using six bits. * This six-bit value is called a distance slot. This way encoding a 32-bit * value takes 6-36 bits, larger values taking more bits. */ #define DIST_SLOT_BITS 6 #define DIST_SLOTS (1 << DIST_SLOT_BITS) /* Match distances up to 127 are fully encoded using probabilities. Since * the highest two bits (distance slot) are always encoded using six bits, * the distances 0-3 don't need any additional bits to encode, since the * distance slot itself is the same as the actual distance. DIST_MODEL_START * indicates the first distance slot where at least one additional bit is * needed. */ #define DIST_MODEL_START 4 /* * Match distances greater than 127 are encoded in three pieces: * - distance slot: the highest two bits * - direct bits: 2-26 bits below the highest two bits * - alignment bits: four lowest bits * * Direct bits don't use any probabilities. 
* * The distance slot value of 14 is for distances 128-191. */ #define DIST_MODEL_END 14 /* Distance slots that indicate a distance <= 127. */ #define FULL_DISTANCES_BITS (DIST_MODEL_END / 2) #define FULL_DISTANCES (1 << FULL_DISTANCES_BITS) /* * For match distances greater than 127, only the highest two bits and the * lowest four bits (alignment) is encoded using probabilities. */ #define ALIGN_BITS 4 #define ALIGN_SIZE (1 << ALIGN_BITS) #define ALIGN_MASK (ALIGN_SIZE - 1) /* Total number of all probability variables */ #define PROBS_TOTAL (1846 + LITERAL_CODERS_MAX * LITERAL_CODER_SIZE) /* * LZMA remembers the four most recent match distances. Reusing these * distances tends to take less space than re-encoding the actual * distance value. */ #define REPS 4 #endif xen-4.4.0/xen/common/Makefile0000664000175000017500000000265012307313555014170 0ustar smbsmbobj-y += bitmap.o obj-y += core_parking.o obj-y += cpu.o obj-y += cpupool.o obj-$(HAS_DEVICE_TREE) += device_tree.o obj-y += domctl.o obj-y += domain.o obj-y += event_2l.o obj-y += event_channel.o obj-y += event_fifo.o obj-y += grant_table.o obj-y += irq.o obj-y += kernel.o obj-y += keyhandler.o obj-$(HAS_KEXEC) += kexec.o obj-$(HAS_KEXEC) += kimage.o obj-y += lib.o obj-y += memory.o obj-y += multicall.o obj-y += notifier.o obj-y += page_alloc.o obj-y += preempt.o obj-y += random.o obj-y += rangeset.o obj-y += sched_credit.o obj-y += sched_credit2.o obj-y += sched_sedf.o obj-y += sched_arinc653.o obj-y += schedule.o obj-y += shutdown.o obj-y += softirq.o obj-y += sort.o obj-y += smp.o obj-y += spinlock.o obj-y += stop_machine.o obj-y += string.o obj-y += symbols.o obj-y += sysctl.o obj-y += tasklet.o obj-y += time.o obj-y += timer.o obj-y += trace.o obj-y += version.o obj-y += vmap.o obj-y += vsprintf.o obj-y += wait.o obj-y += xmalloc_tlsf.o obj-y += rcupdate.o obj-y += tmem.o obj-y += tmem_xen.o obj-y += radix-tree.o obj-y += rbtree.o obj-y += lzo.o obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma unlzo unlz4 earlycpio,$(n).init.o) obj-$(perfc) += perfc.o obj-$(crash_debug) += gdbstub.o obj-$(xenoprof) += xenoprof.o obj-$(CONFIG_XENCOMM) += xencomm.o subdir-$(CONFIG_COMPAT) += compat subdir-$(x86_64) += hvm subdir-$(coverage) += gcov subdir-y += libelf subdir-$(HAS_DEVICE_TREE) += libfdt xen-4.4.0/xen/common/trace.c0000664000175000017500000006001312307313555013767 0ustar smbsmb/****************************************************************************** * common/trace.c * * Xen Trace Buffer * * Copyright (C) 2004 by Intel Research Cambridge * * Authors: Mark Williamson, mark.a.williamson@intel.com * Rob Gardner, rob.gardner@hp.com * Date: October 2005 * * Copyright (C) 2005 Bin Ren * * The trace buffer code is designed to allow debugging traces of Xen to be * generated on UP / SMP machines. Each trace entry is timestamped so that * it's possible to reconstruct a chronological record of trace events. 
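 *
 * In rough outline: each online CPU gets its own set of trace pages, shared
 * with privileged guests and headed by a struct t_buf holding a
 * producer/consumer pair. Xen advances prod as records are written and the
 * consumer (e.g. xentrace in dom0) advances cons; both counters run modulo
 * twice the usable buffer size, and VIRQ_TBUF is raised once the buffer
 * fills past the high water mark.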
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CONFIG_COMPAT #include #define xen_t_buf t_buf CHECK_t_buf; #undef xen_t_buf #else #define compat_t_rec t_rec #endif /* opt_tbuf_size: trace buffer size (in pages) for each cpu */ static unsigned int opt_tbuf_size; static unsigned int opt_tevt_mask; integer_param("tbuf_size", opt_tbuf_size); integer_param("tevt_mask", opt_tevt_mask); /* Pointers to the meta-data objects for all system trace buffers */ static struct t_info *t_info; static unsigned int t_info_pages; static DEFINE_PER_CPU_READ_MOSTLY(struct t_buf *, t_bufs); static DEFINE_PER_CPU_READ_MOSTLY(spinlock_t, t_lock); static u32 data_size __read_mostly; /* High water mark for trace buffers; */ /* Send virtual interrupt when buffer level reaches this point */ static u32 t_buf_highwater; /* Number of records lost due to per-CPU trace buffer being full. */ static DEFINE_PER_CPU(unsigned long, lost_records); static DEFINE_PER_CPU(unsigned long, lost_records_first_tsc); /* a flag recording whether initialization has been done */ /* or more properly, if the tbuf subsystem is enabled right now */ int tb_init_done __read_mostly; /* which CPUs tracing is enabled on */ static cpumask_t tb_cpu_mask; /* which tracing events are enabled */ static u32 tb_event_mask = TRC_ALL; /* Return the number of elements _type necessary to store at least _x bytes of data * i.e., sizeof(_type) * ans >= _x. */ #define fit_to_type(_type, _x) (((_x)+sizeof(_type)-1) / sizeof(_type)) static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; if ( action == CPU_UP_PREPARE ) spin_lock_init(&per_cpu(t_lock, cpu)); return NOTIFY_DONE; } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; static uint32_t calc_tinfo_first_offset(void) { int offset_in_bytes = offsetof(struct t_info, mfn_offset[NR_CPUS]); return fit_to_type(uint32_t, offset_in_bytes); } /** * calculate_tbuf_size - check to make sure that the proposed size will fit * in the currently sized struct t_info and allows prod and cons to * reach double the value without overflow. * The t_info layout is fixed and cant be changed without breaking xentrace. * Initialize t_info_pages based on number of trace pages. */ static int calculate_tbuf_size(unsigned int pages, uint16_t t_info_first_offset) { struct t_buf dummy_size; typeof(dummy_size.prod) max_size; struct t_info dummy_pages; typeof(dummy_pages.tbuf_size) max_pages; typeof(dummy_pages.mfn_offset[0]) max_mfn_offset; unsigned int max_cpus = num_online_cpus(); unsigned int t_info_words; /* force maximum value for an unsigned type */ max_size = -1; max_pages = -1; max_mfn_offset = -1; /* max size holds up to n pages */ max_size /= PAGE_SIZE; if ( max_size < max_pages ) max_pages = max_size; /* * max mfn_offset holds up to n pages per cpu * The array of mfns for the highest cpu can start at the maximum value * mfn_offset can hold. So reduce the number of cpus and also the mfn_offset. 
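 *
 * As an illustration of the layout being bounded here: mfn_offset[cpu] is
 * set later (in alloc_trace_bufs()) to t_info_first_offset + cpu * pages,
 * i.e. the per-cpu MFN lists are laid out back to back after the fixed
 * t_info header, so the last CPU's list must still start at an offset that
 * fits in the mfn_offset field.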
*/ max_mfn_offset -= t_info_first_offset; max_cpus--; if ( max_cpus ) max_mfn_offset /= max_cpus; if ( max_mfn_offset < max_pages ) max_pages = max_mfn_offset; if ( pages > max_pages ) { printk(XENLOG_INFO "xentrace: requested number of %u pages " "reduced to %u\n", pages, max_pages); pages = max_pages; } t_info_words = num_online_cpus() * pages * sizeof(uint32_t); t_info_pages = PFN_UP(t_info_first_offset + t_info_words); printk(XENLOG_INFO "xentrace: requesting %u t_info pages " "for %u trace pages on %u cpus\n", t_info_pages, pages, num_online_cpus()); return pages; } /** * alloc_trace_bufs - performs initialization of the per-cpu trace buffers. * * This function is called at start of day in order to initialize the per-cpu * trace buffers. The trace buffers are then available for debugging use, via * the %TRACE_xD macros exported in . * * This function may also be called later when enabling trace buffers * via the SET_SIZE hypercall. */ static int alloc_trace_bufs(unsigned int pages) { int i, cpu; /* Start after a fixed-size array of NR_CPUS */ uint32_t *t_info_mfn_list; uint16_t t_info_first_offset; uint16_t offset; if ( t_info ) return -EBUSY; if ( pages == 0 ) return -EINVAL; /* Calculate offset in units of u32 of first mfn */ t_info_first_offset = calc_tinfo_first_offset(); pages = calculate_tbuf_size(pages, t_info_first_offset); t_info = alloc_xenheap_pages(get_order_from_pages(t_info_pages), 0); if ( t_info == NULL ) goto out_fail; memset(t_info, 0, t_info_pages*PAGE_SIZE); t_info_mfn_list = (uint32_t *)t_info; t_info->tbuf_size = pages; /* * Allocate buffers for all of the cpus. * If any fails, deallocate what you have so far and exit. */ for_each_online_cpu(cpu) { offset = t_info_first_offset + (cpu * pages); t_info->mfn_offset[cpu] = offset; for ( i = 0; i < pages; i++ ) { void *p = alloc_xenheap_pages(0, MEMF_bits(32 + PAGE_SHIFT)); if ( !p ) { printk(XENLOG_INFO "xentrace: memory allocation failed " "on cpu %d after %d pages\n", cpu, i); t_info_mfn_list[offset + i] = 0; goto out_dealloc; } t_info_mfn_list[offset + i] = virt_to_mfn(p); } } /* * Initialize buffers for all of the cpus. */ for_each_online_cpu(cpu) { struct t_buf *buf; struct page_info *pg; spin_lock_init(&per_cpu(t_lock, cpu)); offset = t_info->mfn_offset[cpu]; /* Initialize the buffer metadata */ per_cpu(t_bufs, cpu) = buf = mfn_to_virt(t_info_mfn_list[offset]); buf->cons = buf->prod = 0; printk(XENLOG_INFO "xentrace: p%d mfn %x offset %u\n", cpu, t_info_mfn_list[offset], offset); /* Now share the trace pages */ for ( i = 0; i < pages; i++ ) { pg = mfn_to_page(t_info_mfn_list[offset + i]); share_xen_page_with_privileged_guests(pg, XENSHARE_writable); } } /* Finally, share the t_info page */ for(i = 0; i < t_info_pages; i++) share_xen_page_with_privileged_guests( virt_to_page(t_info) + i, XENSHARE_readonly); data_size = (pages * PAGE_SIZE - sizeof(struct t_buf)); t_buf_highwater = data_size >> 1; /* 50% high water */ opt_tbuf_size = pages; printk("xentrace: initialised\n"); smp_wmb(); /* above must be visible before tb_init_done flag set */ tb_init_done = 1; return 0; out_dealloc: for_each_online_cpu(cpu) { offset = t_info->mfn_offset[cpu]; if ( !offset ) continue; for ( i = 0; i < pages; i++ ) { uint32_t mfn = t_info_mfn_list[offset + i]; if ( !mfn ) break; ASSERT(!(mfn_to_page(mfn)->count_info & PGC_allocated)); free_xenheap_pages(mfn_to_virt(mfn), 0); } } free_xenheap_pages(t_info, get_order_from_pages(t_info_pages)); t_info = NULL; out_fail: printk(XENLOG_WARNING "xentrace: allocation failed! 
Tracing disabled.\n"); return -ENOMEM; } /** * tb_set_size - handle the logic involved with dynamically allocating tbufs * * This function is called when the SET_SIZE hypercall is done. */ static int tb_set_size(unsigned int pages) { /* * Setting size is a one-shot operation. It can be done either at * boot time or via control tools, but not by both. Once buffers * are created they cannot be destroyed. */ if ( opt_tbuf_size && pages != opt_tbuf_size ) { printk(XENLOG_INFO "xentrace: tb_set_size from %d to %d " "not implemented\n", opt_tbuf_size, pages); return -EINVAL; } return alloc_trace_bufs(pages); } int trace_will_trace_event(u32 event) { if ( !tb_init_done ) return 0; /* * Copied from __trace_var() */ if ( (tb_event_mask & event) == 0 ) return 0; /* match class */ if ( ((tb_event_mask >> TRC_CLS_SHIFT) & (event >> TRC_CLS_SHIFT)) == 0 ) return 0; /* then match subclass */ if ( (((tb_event_mask >> TRC_SUBCLS_SHIFT) & 0xf ) & ((event >> TRC_SUBCLS_SHIFT) & 0xf )) == 0 ) return 0; if ( !cpumask_test_cpu(smp_processor_id(), &tb_cpu_mask) ) return 0; return 1; } /** * init_trace_bufs - performs initialization of the per-cpu trace buffers. * * This function is called at start of day in order to initialize the per-cpu * trace buffers. The trace buffers are then available for debugging use, via * the %TRACE_xD macros exported in . */ void __init init_trace_bufs(void) { cpumask_setall(&tb_cpu_mask); register_cpu_notifier(&cpu_nfb); if ( opt_tbuf_size ) { if ( alloc_trace_bufs(opt_tbuf_size) ) { printk("xentrace: allocation size %d failed, disabling\n", opt_tbuf_size); opt_tbuf_size = 0; } else if ( opt_tevt_mask ) { printk("xentrace: Starting tracing, enabling mask %x\n", opt_tevt_mask); tb_event_mask = opt_tevt_mask; tb_init_done=1; } } } /** * tb_control - sysctl operations on trace buffers. * @tbc: a pointer to a xen_sysctl_tbuf_op_t to be filled out */ int tb_control(xen_sysctl_tbuf_op_t *tbc) { static DEFINE_SPINLOCK(lock); int rc = 0; spin_lock(&lock); switch ( tbc->cmd ) { case XEN_SYSCTL_TBUFOP_get_info: tbc->evt_mask = tb_event_mask; tbc->buffer_mfn = t_info ? virt_to_mfn(t_info) : 0; tbc->size = t_info_pages * PAGE_SIZE; break; case XEN_SYSCTL_TBUFOP_set_cpu_mask: { cpumask_var_t mask; rc = xenctl_bitmap_to_cpumask(&mask, &tbc->cpu_mask); if ( !rc ) { cpumask_copy(&tb_cpu_mask, mask); free_cpumask_var(mask); } } break; case XEN_SYSCTL_TBUFOP_set_evt_mask: tb_event_mask = tbc->evt_mask; break; case XEN_SYSCTL_TBUFOP_set_size: rc = tb_set_size(tbc->size); break; case XEN_SYSCTL_TBUFOP_enable: /* Enable trace buffers. Check buffers are already allocated. */ if ( opt_tbuf_size == 0 ) rc = -EINVAL; else tb_init_done = 1; break; case XEN_SYSCTL_TBUFOP_disable: { /* * Disable trace buffers. Just stops new records from being written, * does not deallocate any memory. */ int i; tb_init_done = 0; smp_wmb(); /* Clear any lost-record info so we don't get phantom lost records next time we * start tracing. Grab the lock to make sure we're not racing anyone. After this * hypercall returns, no more records should be placed into the buffers. 
*/ for_each_online_cpu(i) { unsigned long flags; spin_lock_irqsave(&per_cpu(t_lock, i), flags); per_cpu(lost_records, i)=0; spin_unlock_irqrestore(&per_cpu(t_lock, i), flags); } } break; default: rc = -EINVAL; break; } spin_unlock(&lock); return rc; } static inline unsigned int calc_rec_size(bool_t cycles, unsigned int extra) { unsigned int rec_size = 4; if ( cycles ) rec_size += 8; rec_size += extra; return rec_size; } static inline bool_t bogus(u32 prod, u32 cons) { if ( unlikely(prod & 3) || unlikely(prod >= 2 * data_size) || unlikely(cons & 3) || unlikely(cons >= 2 * data_size) ) { tb_init_done = 0; printk(XENLOG_WARNING "trc#%u: bogus prod (%08x) and/or cons (%08x)\n", smp_processor_id(), prod, cons); return 1; } return 0; } static inline u32 calc_unconsumed_bytes(const struct t_buf *buf) { u32 prod = buf->prod, cons = buf->cons; s32 x; barrier(); /* must read buf->prod and buf->cons only once */ if ( bogus(prod, cons) ) return data_size; x = prod - cons; if ( x < 0 ) x += 2*data_size; ASSERT(x >= 0); ASSERT(x <= data_size); return x; } static inline u32 calc_bytes_to_wrap(const struct t_buf *buf) { u32 prod = buf->prod, cons = buf->cons; s32 x; barrier(); /* must read buf->prod and buf->cons only once */ if ( bogus(prod, cons) ) return 0; x = data_size - prod; if ( x <= 0 ) x += data_size; ASSERT(x > 0); ASSERT(x <= data_size); return x; } static inline u32 calc_bytes_avail(const struct t_buf *buf) { return data_size - calc_unconsumed_bytes(buf); } static unsigned char *next_record(const struct t_buf *buf, uint32_t *next, unsigned char **next_page, uint32_t *offset_in_page) { u32 x = buf->prod, cons = buf->cons; uint16_t per_cpu_mfn_offset; uint32_t per_cpu_mfn_nr; uint32_t *mfn_list; uint32_t mfn; unsigned char *this_page; barrier(); /* must read buf->prod and buf->cons only once */ *next = x; if ( !tb_init_done || bogus(x, cons) ) return NULL; if ( x >= data_size ) x -= data_size; ASSERT(x < data_size); /* add leading header to get total offset of next record */ x += sizeof(struct t_buf); *offset_in_page = x & ~PAGE_MASK; /* offset into array of mfns */ per_cpu_mfn_nr = x >> PAGE_SHIFT; per_cpu_mfn_offset = t_info->mfn_offset[smp_processor_id()]; mfn_list = (uint32_t *)t_info; mfn = mfn_list[per_cpu_mfn_offset + per_cpu_mfn_nr]; this_page = mfn_to_virt(mfn); if (per_cpu_mfn_nr + 1 >= opt_tbuf_size) { /* reached end of buffer? 
*/ *next_page = NULL; } else { mfn = mfn_list[per_cpu_mfn_offset + per_cpu_mfn_nr + 1]; *next_page = mfn_to_virt(mfn); } return this_page; } static inline void __insert_record(struct t_buf *buf, unsigned long event, unsigned int extra, bool_t cycles, unsigned int rec_size, const void *extra_data) { struct t_rec split_rec, *rec; uint32_t *dst; unsigned char *this_page, *next_page; unsigned int extra_word = extra / sizeof(u32); unsigned int local_rec_size = calc_rec_size(cycles, extra); uint32_t next; uint32_t offset; uint32_t remaining; BUG_ON(local_rec_size != rec_size); BUG_ON(extra & 3); this_page = next_record(buf, &next, &next_page, &offset); if ( !this_page ) return; remaining = PAGE_SIZE - offset; if ( unlikely(rec_size > remaining) ) { if ( next_page == NULL ) { /* access beyond end of buffer */ printk(XENLOG_WARNING "%s: size=%08x prod=%08x cons=%08x rec=%u remaining=%u\n", __func__, data_size, next, buf->cons, rec_size, remaining); return; } rec = &split_rec; } else { rec = (struct t_rec*)(this_page + offset); } rec->event = event; rec->extra_u32 = extra_word; dst = rec->u.nocycles.extra_u32; if ( (rec->cycles_included = cycles) != 0 ) { u64 tsc = (u64)get_cycles(); rec->u.cycles.cycles_lo = (uint32_t)tsc; rec->u.cycles.cycles_hi = (uint32_t)(tsc >> 32); dst = rec->u.cycles.extra_u32; } if ( extra_data && extra ) memcpy(dst, extra_data, extra); if ( unlikely(rec_size > remaining) ) { memcpy(this_page + offset, rec, remaining); memcpy(next_page, (char *)rec + remaining, rec_size - remaining); } smp_wmb(); next += rec_size; if ( next >= 2*data_size ) next -= 2*data_size; ASSERT(next < 2*data_size); buf->prod = next; } static inline void insert_wrap_record(struct t_buf *buf, unsigned int size) { u32 space_left = calc_bytes_to_wrap(buf); unsigned int extra_space = space_left - sizeof(u32); bool_t cycles = 0; BUG_ON(space_left > size); /* We may need to add cycles to take up enough space... */ if ( (extra_space/sizeof(u32)) > TRACE_EXTRA_MAX ) { cycles = 1; extra_space -= sizeof(u64); ASSERT((extra_space/sizeof(u32)) <= TRACE_EXTRA_MAX); } __insert_record(buf, TRC_TRACE_WRAP_BUFFER, extra_space, cycles, space_left, NULL); } #define LOST_REC_SIZE (4 + 8 + 16) /* header + tsc + sizeof(struct ed) */ static inline void insert_lost_records(struct t_buf *buf) { struct { u32 lost_records; u32 did:16, vid:16; u64 first_tsc; } __attribute__((packed)) ed; ed.vid = current->vcpu_id; ed.did = current->domain->domain_id; ed.lost_records = this_cpu(lost_records); ed.first_tsc = this_cpu(lost_records_first_tsc); this_cpu(lost_records) = 0; __insert_record(buf, TRC_LOST_RECORDS, sizeof(ed), 1 /* cycles */, LOST_REC_SIZE, &ed); } /* * Notification is performed in qtasklet to avoid deadlocks with contexts * which __trace_var() may be called from (e.g., scheduler critical regions). */ static void trace_notify_dom0(unsigned long unused) { send_global_virq(VIRQ_TBUF); } static DECLARE_SOFTIRQ_TASKLET(trace_notify_dom0_tasklet, trace_notify_dom0, 0); /** * __trace_var - Enters a trace tuple into the trace buffer for the current CPU. * @event: the event type being logged * @cycles: include tsc timestamp into trace record * @extra: size of additional trace data in bytes * @extra_data: pointer to additional trace data * * Logs a trace record into the appropriate buffer. 
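 *
 * A minimal, hypothetical call site (with event being some id in an enabled
 * trace class) would look like:
 *
 *     struct { uint32_t domid, vcpuid; } d = { 0, 1 };
 *     __trace_var(event, 1, sizeof(d), &d);
 *
 * which logs a cycle-stamped record carrying two 32-bit words of data,
 * provided tracing is enabled and the event passes the class masks below.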
*/ void __trace_var(u32 event, bool_t cycles, unsigned int extra, const void *extra_data) { struct t_buf *buf; unsigned long flags; u32 bytes_to_tail, bytes_to_wrap; unsigned int rec_size, total_size; unsigned int extra_word; bool_t started_below_highwater; if( !tb_init_done ) return; /* Convert byte count into word count, rounding up */ extra_word = (extra / sizeof(u32)); if ( (extra % sizeof(u32)) != 0 ) extra_word++; ASSERT(extra_word <= TRACE_EXTRA_MAX); extra_word = min_t(int, extra_word, TRACE_EXTRA_MAX); /* Round size up to nearest word */ extra = extra_word * sizeof(u32); if ( (tb_event_mask & event) == 0 ) return; /* match class */ if ( ((tb_event_mask >> TRC_CLS_SHIFT) & (event >> TRC_CLS_SHIFT)) == 0 ) return; /* then match subclass */ if ( (((tb_event_mask >> TRC_SUBCLS_SHIFT) & 0xf ) & ((event >> TRC_SUBCLS_SHIFT) & 0xf )) == 0 ) return; if ( !cpumask_test_cpu(smp_processor_id(), &tb_cpu_mask) ) return; /* Read tb_init_done /before/ t_bufs. */ smp_rmb(); spin_lock_irqsave(&this_cpu(t_lock), flags); buf = this_cpu(t_bufs); if ( unlikely(!buf) ) { /* Make gcc happy */ started_below_highwater = 0; goto unlock; } started_below_highwater = (calc_unconsumed_bytes(buf) < t_buf_highwater); /* Calculate the record size */ rec_size = calc_rec_size(cycles, extra); /* How many bytes are available in the buffer? */ bytes_to_tail = calc_bytes_avail(buf); /* How many bytes until the next wrap-around? */ bytes_to_wrap = calc_bytes_to_wrap(buf); /* * Calculate expected total size to commit this record by * doing a dry-run. */ total_size = 0; /* First, check to see if we need to include a lost_record. */ if ( this_cpu(lost_records) ) { if ( LOST_REC_SIZE > bytes_to_wrap ) { total_size += bytes_to_wrap; bytes_to_wrap = data_size; } total_size += LOST_REC_SIZE; bytes_to_wrap -= LOST_REC_SIZE; /* LOST_REC might line up perfectly with the buffer wrap */ if ( bytes_to_wrap == 0 ) bytes_to_wrap = data_size; } if ( rec_size > bytes_to_wrap ) { total_size += bytes_to_wrap; } total_size += rec_size; /* Do we have enough space for everything? */ if ( total_size > bytes_to_tail ) { if ( ++this_cpu(lost_records) == 1 ) this_cpu(lost_records_first_tsc)=(u64)get_cycles(); started_below_highwater = 0; goto unlock; } /* * Now, actually write information */ bytes_to_wrap = calc_bytes_to_wrap(buf); if ( this_cpu(lost_records) ) { if ( LOST_REC_SIZE > bytes_to_wrap ) { insert_wrap_record(buf, LOST_REC_SIZE); bytes_to_wrap = data_size; } insert_lost_records(buf); bytes_to_wrap -= LOST_REC_SIZE; /* LOST_REC might line up perfectly with the buffer wrap */ if ( bytes_to_wrap == 0 ) bytes_to_wrap = data_size; } if ( rec_size > bytes_to_wrap ) insert_wrap_record(buf, rec_size); /* Write the original record */ __insert_record(buf, event, extra, cycles, rec_size, extra_data); unlock: spin_unlock_irqrestore(&this_cpu(t_lock), flags); /* Notify trace buffer consumer that we've crossed the high water mark. */ if ( likely(buf!=NULL) && started_below_highwater && (calc_unconsumed_bytes(buf) >= t_buf_highwater) ) tasklet_schedule(&trace_notify_dom0_tasklet); } void __trace_hypercall(uint32_t event, unsigned long op, const unsigned long *args) { struct { uint32_t op; uint32_t args[6]; } __attribute__((packed)) d; uint32_t *a = d.args; #define APPEND_ARG32(i) \ do { \ unsigned i_ = (i); \ *a++ = args[(i_)]; \ d.op |= TRC_PV_HYPERCALL_V2_ARG_32(i_); \ } while( 0 ) /* * This shouldn't happen as @op should be small enough but just in * case, warn if the argument bits in the trace record would * clobber the hypercall op. 
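 *
 * (The argument-presence flags that APPEND_ARG32() ORs into d.op occupy the
 * bits covered by TRC_PV_HYPERCALL_V2_ARG_MASK, which is why an oversized
 * op value would collide with them.  For example, the grant_table_op case
 * below records args[0] and args[2] and so sets two such flag bits.)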
*/ WARN_ON(op & TRC_PV_HYPERCALL_V2_ARG_MASK); d.op = op; switch ( op ) { case __HYPERVISOR_mmu_update: APPEND_ARG32(1); /* count */ break; case __HYPERVISOR_multicall: APPEND_ARG32(1); /* count */ break; case __HYPERVISOR_grant_table_op: APPEND_ARG32(0); /* cmd */ APPEND_ARG32(2); /* count */ break; case __HYPERVISOR_vcpu_op: APPEND_ARG32(0); /* cmd */ APPEND_ARG32(1); /* vcpuid */ break; case __HYPERVISOR_mmuext_op: APPEND_ARG32(1); /* count */ break; case __HYPERVISOR_sched_op: APPEND_ARG32(0); /* cmd */ break; } __trace_var(event, 1, sizeof(uint32_t) * (1 + (a - d.args)), &d); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/sched_arinc653.c0000664000175000017500000005626012307313555015402 0ustar smbsmb/****************************************************************************** * sched_arinc653.c * * An ARINC653-compatible scheduling algorithm for use in Xen. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 2010, DornerWorks, Ltd. */ #include #include #include #include #include #include #include #include #include #include #include /************************************************************************** * Private Macros * **************************************************************************/ /** * Default timeslice for domain 0. */ #define DEFAULT_TIMESLICE MILLISECS(10) /** * Retrieve the idle VCPU for a given physical CPU */ #define IDLETASK(cpu) (idle_vcpu[cpu]) /** * Return a pointer to the ARINC 653-specific scheduler data information * associated with the given VCPU (vc) */ #define AVCPU(vc) ((arinc653_vcpu_t *)(vc)->sched_priv) /** * Return the global scheduler private data given the scheduler ops pointer */ #define SCHED_PRIV(s) ((a653sched_priv_t *)((s)->sched_data)) /************************************************************************** * Private Type Definitions * **************************************************************************/ /** * The arinc653_vcpu_t structure holds ARINC 653-scheduler-specific * information for all non-idle VCPUs */ typedef struct arinc653_vcpu_s { /* vc points to Xen's struct vcpu so we can get to it from an * arinc653_vcpu_t pointer. 
*/ struct vcpu * vc; /* awake holds whether the VCPU has been woken with vcpu_wake() */ bool_t awake; /* list holds the linked list information for the list this VCPU * is stored in */ struct list_head list; } arinc653_vcpu_t; /** * The sched_entry_t structure holds a single entry of the * ARINC 653 schedule. */ typedef struct sched_entry_s { /* dom_handle holds the handle ("UUID") for the domain that this * schedule entry refers to. */ xen_domain_handle_t dom_handle; /* vcpu_id holds the VCPU number for the VCPU that this schedule * entry refers to. */ int vcpu_id; /* runtime holds the number of nanoseconds that the VCPU for this * schedule entry should be allowed to run per major frame. */ s_time_t runtime; /* vc holds a pointer to the Xen VCPU structure */ struct vcpu * vc; } sched_entry_t; /** * This structure defines data that is global to an instance of the scheduler */ typedef struct a653sched_priv_s { /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ spinlock_t lock; /** * This array holds the active ARINC 653 schedule. * * When the system tries to start a new VCPU, this schedule is scanned * to look for a matching (handle, VCPU #) pair. If both the handle (UUID) * and VCPU number match, then the VCPU is allowed to run. Its run time * (per major frame) is given in the third entry of the schedule. */ sched_entry_t schedule[ARINC653_MAX_DOMAINS_PER_SCHEDULE]; /** * This variable holds the number of entries that are valid in * the arinc653_schedule table. * * This is not necessarily the same as the number of domains in the * schedule. A domain could be listed multiple times within the schedule, * or a domain with multiple VCPUs could have a different * schedule entry for each VCPU. */ unsigned int num_schedule_entries; /** * the major frame time for the ARINC 653 schedule. */ s_time_t major_frame; /** * the time that the next major frame starts */ s_time_t next_major_frame; /** * pointers to all Xen VCPU structures for iterating through */ struct list_head vcpu_list; } a653sched_priv_t; /************************************************************************** * Helper functions * **************************************************************************/ /** * This function compares two domain handles. * * @param h1 Pointer to handle 1 * @param h2 Pointer to handle 2 * * @return
 *          <0: handle 1 is less than handle 2
 *           0: handle 1 is equal to handle 2
 *          >0: handle 1 is greater than handle 2
*/ static int dom_handle_cmp(const xen_domain_handle_t h1, const xen_domain_handle_t h2) { return memcmp(h1, h2, sizeof(xen_domain_handle_t)); } /** * This function searches the vcpu list to find a VCPU that matches * the domain handle and VCPU ID specified. * * @param ops Pointer to this instance of the scheduler structure * @param handle Pointer to handler * @param vcpu_id VCPU ID * * @return
 *          Pointer to the matching VCPU if one is found
 *          NULL otherwise
*/ static struct vcpu *find_vcpu( const struct scheduler *ops, xen_domain_handle_t handle, int vcpu_id) { arinc653_vcpu_t *avcpu; /* loop through the vcpu_list looking for the specified VCPU */ list_for_each_entry ( avcpu, &SCHED_PRIV(ops)->vcpu_list, list ) if ( (dom_handle_cmp(avcpu->vc->domain->handle, handle) == 0) && (vcpu_id == avcpu->vc->vcpu_id) ) return avcpu->vc; return NULL; } /** * This function updates the pointer to the Xen VCPU structure for each entry * in the ARINC 653 schedule. * * @param ops Pointer to this instance of the scheduler structure * @return */ static void update_schedule_vcpus(const struct scheduler *ops) { unsigned int i, n_entries = SCHED_PRIV(ops)->num_schedule_entries; for ( i = 0; i < n_entries; i++ ) SCHED_PRIV(ops)->schedule[i].vc = find_vcpu(ops, SCHED_PRIV(ops)->schedule[i].dom_handle, SCHED_PRIV(ops)->schedule[i].vcpu_id); } /** * This function is called by the adjust_global scheduler hook to put * in place a new ARINC653 schedule. * * @param ops Pointer to this instance of the scheduler structure * * @return
 *          0 = success
 *          !0 = error
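 *
 * A minimal sketch of an acceptable argument (values illustrative; the
 * domain handle would normally be that of a real target domain):
 *
 *     struct xen_sysctl_arinc653_schedule s = { 0 };
 *     s.major_frame = MILLISECS(30);
 *     s.num_sched_entries = 1;
 *     s.sched_entries[0].vcpu_id = 0;
 *     s.sched_entries[0].runtime = MILLISECS(10);
 *
 * This passes validation because the summed runtime (10ms) does not exceed
 * the 30ms major frame; three 10ms entries would need at least a 30ms frame.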
*/ static int arinc653_sched_set( const struct scheduler *ops, struct xen_sysctl_arinc653_schedule *schedule) { a653sched_priv_t *sched_priv = SCHED_PRIV(ops); s_time_t total_runtime = 0; unsigned int i; unsigned long flags; int rc = -EINVAL; spin_lock_irqsave(&sched_priv->lock, flags); /* Check for valid major frame and number of schedule entries. */ if ( (schedule->major_frame <= 0) || (schedule->num_sched_entries < 1) || (schedule->num_sched_entries > ARINC653_MAX_DOMAINS_PER_SCHEDULE) ) goto fail; for ( i = 0; i < schedule->num_sched_entries; i++ ) { /* Check for a valid VCPU ID and run time. */ if ( (schedule->sched_entries[i].vcpu_id >= MAX_VIRT_CPUS) || (schedule->sched_entries[i].runtime <= 0) ) goto fail; /* Add this entry's run time to total run time. */ total_runtime += schedule->sched_entries[i].runtime; } /* * Error if the major frame is not large enough to run all entries as * indicated by comparing the total run time to the major frame length. */ if ( total_runtime > schedule->major_frame ) goto fail; /* Copy the new schedule into place. */ sched_priv->num_schedule_entries = schedule->num_sched_entries; sched_priv->major_frame = schedule->major_frame; for ( i = 0; i < schedule->num_sched_entries; i++ ) { memcpy(sched_priv->schedule[i].dom_handle, schedule->sched_entries[i].dom_handle, sizeof(sched_priv->schedule[i].dom_handle)); sched_priv->schedule[i].vcpu_id = schedule->sched_entries[i].vcpu_id; sched_priv->schedule[i].runtime = schedule->sched_entries[i].runtime; } update_schedule_vcpus(ops); /* * The newly-installed schedule takes effect immediately. We do not even * wait for the current major frame to expire. * * Signal a new major frame to begin. The next major frame is set up by * the do_schedule callback function when it is next invoked. */ sched_priv->next_major_frame = NOW(); rc = 0; fail: spin_unlock_irqrestore(&sched_priv->lock, flags); return rc; } /** * This function is called by the adjust_global scheduler hook to read the * current ARINC 653 schedule * * @param ops Pointer to this instance of the scheduler structure * @return
 *          0 = success
 *          !0 = error
*/ static int arinc653_sched_get( const struct scheduler *ops, struct xen_sysctl_arinc653_schedule *schedule) { a653sched_priv_t *sched_priv = SCHED_PRIV(ops); unsigned int i; unsigned long flags; spin_lock_irqsave(&sched_priv->lock, flags); schedule->num_sched_entries = sched_priv->num_schedule_entries; schedule->major_frame = sched_priv->major_frame; for ( i = 0; i < sched_priv->num_schedule_entries; i++ ) { memcpy(schedule->sched_entries[i].dom_handle, sched_priv->schedule[i].dom_handle, sizeof(sched_priv->schedule[i].dom_handle)); schedule->sched_entries[i].vcpu_id = sched_priv->schedule[i].vcpu_id; schedule->sched_entries[i].runtime = sched_priv->schedule[i].runtime; } spin_unlock_irqrestore(&sched_priv->lock, flags); return 0; } /************************************************************************** * Scheduler callback functions * **************************************************************************/ /** * This function performs initialization for an instance of the scheduler. * * @param ops Pointer to this instance of the scheduler structure * * @return
 *          0 = success
 *          !0 = error
*/ static int a653sched_init(struct scheduler *ops) { a653sched_priv_t *prv; prv = xzalloc(a653sched_priv_t); if ( prv == NULL ) return -ENOMEM; ops->sched_data = prv; prv->next_major_frame = 0; spin_lock_init(&prv->lock); INIT_LIST_HEAD(&prv->vcpu_list); return 0; } /** * This function performs deinitialization for an instance of the scheduler * * @param ops Pointer to this instance of the scheduler structure */ static void a653sched_deinit(const struct scheduler *ops) { xfree(SCHED_PRIV(ops)); } /** * This function allocates scheduler-specific data for a VCPU * * @param ops Pointer to this instance of the scheduler structure * * @return Pointer to the allocated data */ static void * a653sched_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) { a653sched_priv_t *sched_priv = SCHED_PRIV(ops); arinc653_vcpu_t *svc; unsigned int entry; unsigned long flags; /* * Allocate memory for the ARINC 653-specific scheduler data information * associated with the given VCPU (vc). */ svc = xmalloc(arinc653_vcpu_t); if ( svc == NULL ) return NULL; spin_lock_irqsave(&sched_priv->lock, flags); /* * Add every one of dom0's vcpus to the schedule, as long as there are * slots available. */ if ( vc->domain->domain_id == 0 ) { entry = sched_priv->num_schedule_entries; if ( entry < ARINC653_MAX_DOMAINS_PER_SCHEDULE ) { sched_priv->schedule[entry].dom_handle[0] = '\0'; sched_priv->schedule[entry].vcpu_id = vc->vcpu_id; sched_priv->schedule[entry].runtime = DEFAULT_TIMESLICE; sched_priv->schedule[entry].vc = vc; sched_priv->major_frame += DEFAULT_TIMESLICE; ++sched_priv->num_schedule_entries; } } /* * Initialize our ARINC 653 scheduler-specific information for the VCPU. * The VCPU starts "asleep." When Xen is ready for the VCPU to run, it * will call the vcpu_wake scheduler callback function and our scheduler * will mark the VCPU awake. 
*/ svc->vc = vc; svc->awake = 0; if ( !is_idle_vcpu(vc) ) list_add(&svc->list, &SCHED_PRIV(ops)->vcpu_list); update_schedule_vcpus(ops); spin_unlock_irqrestore(&sched_priv->lock, flags); return svc; } /** * This function frees scheduler-specific VCPU data * * @param ops Pointer to this instance of the scheduler structure */ static void a653sched_free_vdata(const struct scheduler *ops, void *priv) { arinc653_vcpu_t *av = priv; if (av == NULL) return; if ( !is_idle_vcpu(av->vc) ) list_del(&av->list); xfree(av); update_schedule_vcpus(ops); } /** * This function allocates scheduler-specific data for a physical CPU * * We do not actually make use of any per-CPU data but the hypervisor expects * a non-NULL return value * * @param ops Pointer to this instance of the scheduler structure * * @return Pointer to the allocated data */ static void * a653sched_alloc_pdata(const struct scheduler *ops, int cpu) { /* return a non-NULL value to keep schedule.c happy */ return SCHED_PRIV(ops); } /** * This function frees scheduler-specific data for a physical CPU * * @param ops Pointer to this instance of the scheduler structure */ static void a653sched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) { /* nop */ } /** * This function allocates scheduler-specific data for a domain * * We do not actually make use of any per-domain data but the hypervisor * expects a non-NULL return value * * @param ops Pointer to this instance of the scheduler structure * * @return Pointer to the allocated data */ static void * a653sched_alloc_domdata(const struct scheduler *ops, struct domain *dom) { /* return a non-NULL value to keep schedule.c happy */ return SCHED_PRIV(ops); } /** * This function frees scheduler-specific data for a domain * * @param ops Pointer to this instance of the scheduler structure */ static void a653sched_free_domdata(const struct scheduler *ops, void *data) { /* nop */ } /** * Xen scheduler callback function to sleep a VCPU * * @param ops Pointer to this instance of the scheduler structure * @param vc Pointer to the VCPU structure for the current domain */ static void a653sched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) { if ( AVCPU(vc) != NULL ) AVCPU(vc)->awake = 0; /* * If the VCPU being put to sleep is the same one that is currently * running, raise a softirq to invoke the scheduler to switch domains. */ if ( per_cpu(schedule_data, vc->processor).curr == vc ) cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ); } /** * Xen scheduler callback function to wake up a VCPU * * @param ops Pointer to this instance of the scheduler structure * @param vc Pointer to the VCPU structure for the current domain */ static void a653sched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) { if ( AVCPU(vc) != NULL ) AVCPU(vc)->awake = 1; cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ); } /** * Xen scheduler callback function to select a VCPU to run. * This is the main scheduler routine. 
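 *
 * For intuition (numbers purely illustrative): with a 30ms major frame
 * holding three 10ms entries, each invocation selects the entry whose
 * window covers 'now'; once the entries are exhausted the idle VCPU runs
 * until next_major_frame, at which point sched_index resets to 0 and the
 * cycle repeats.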
* * @param ops Pointer to this instance of the scheduler structure * @param now Current time * * @return Address of the VCPU structure scheduled to be run next * Amount of time to execute the returned VCPU * Flag for whether the VCPU was migrated */ static struct task_slice a653sched_do_schedule( const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled) { struct task_slice ret; /* hold the chosen domain */ struct vcpu * new_task = NULL; static unsigned int sched_index = 0; static s_time_t next_switch_time; a653sched_priv_t *sched_priv = SCHED_PRIV(ops); const unsigned int cpu = smp_processor_id(); unsigned long flags; spin_lock_irqsave(&sched_priv->lock, flags); if ( sched_priv->num_schedule_entries < 1 ) sched_priv->next_major_frame = now + DEFAULT_TIMESLICE; else if ( now >= sched_priv->next_major_frame ) { /* time to enter a new major frame * the first time this function is called, this will be true */ /* start with the first domain in the schedule */ sched_index = 0; sched_priv->next_major_frame = now + sched_priv->major_frame; next_switch_time = now + sched_priv->schedule[0].runtime; } else { while ( (now >= next_switch_time) && (sched_index < sched_priv->num_schedule_entries) ) { /* time to switch to the next domain in this major frame */ sched_index++; next_switch_time += sched_priv->schedule[sched_index].runtime; } } /* * If we exhausted the domains in the schedule and still have time left * in the major frame then switch next at the next major frame. */ if ( sched_index >= sched_priv->num_schedule_entries ) next_switch_time = sched_priv->next_major_frame; /* * If there are more domains to run in the current major frame, set * new_task equal to the address of next domain's VCPU structure. * Otherwise, set new_task equal to the address of the idle task's VCPU * structure. */ new_task = (sched_index < sched_priv->num_schedule_entries) ? sched_priv->schedule[sched_index].vc : IDLETASK(cpu); /* Check to see if the new task can be run (awake & runnable). */ if ( !((new_task != NULL) && (AVCPU(new_task) != NULL) && AVCPU(new_task)->awake && vcpu_runnable(new_task)) ) new_task = IDLETASK(cpu); BUG_ON(new_task == NULL); /* * Check to make sure we did not miss a major frame. * This is a good test for robust partitioning. */ BUG_ON(now >= sched_priv->next_major_frame); spin_unlock_irqrestore(&sched_priv->lock, flags); /* Tasklet work (which runs in idle VCPU context) overrides all else. */ if ( tasklet_work_scheduled ) new_task = IDLETASK(cpu); /* Running this task would result in a migration */ if ( !is_idle_vcpu(new_task) && (new_task->processor != cpu) ) new_task = IDLETASK(cpu); /* * Return the amount of time the next domain has to run and the address * of the selected task's VCPU structure. */ ret.time = next_switch_time - now; ret.task = new_task; ret.migrated = 0; BUG_ON(ret.time <= 0); return ret; } /** * Xen scheduler callback function to select a CPU for the VCPU to run on * * @param ops Pointer to this instance of the scheduler structure * @param v Pointer to the VCPU structure for the current domain * * @return Number of selected physical CPU */ static int a653sched_pick_cpu(const struct scheduler *ops, struct vcpu *vc) { cpumask_t *online; unsigned int cpu; /* * If present, prefer vc's current processor, else * just find the first valid vcpu . 
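 *
 * Concretely: vc->processor is kept whenever it lies in the cpupool's
 * online mask (or that mask is empty); otherwise the first online CPU is
 * returned.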
*/ online = cpupool_scheduler_cpumask(vc->domain->cpupool); cpu = cpumask_first(online); if ( cpumask_test_cpu(vc->processor, online) || (cpu >= nr_cpu_ids) ) cpu = vc->processor; return cpu; } /** * Xen scheduler callback function to perform a global (not domain-specific) * adjustment. It is used by the ARINC 653 scheduler to put in place a new * ARINC 653 schedule or to retrieve the schedule currently in place. * * @param ops Pointer to this instance of the scheduler structure * @param sc Pointer to the scheduler operation specified by Domain 0 */ static int a653sched_adjust_global(const struct scheduler *ops, struct xen_sysctl_scheduler_op *sc) { xen_sysctl_arinc653_schedule_t local_sched; int rc = -EINVAL; switch ( sc->cmd ) { case XEN_SYSCTL_SCHEDOP_putinfo: if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) ) { rc = -EFAULT; break; } rc = arinc653_sched_set(ops, &local_sched); break; case XEN_SYSCTL_SCHEDOP_getinfo: rc = arinc653_sched_get(ops, &local_sched); if ( rc ) break; if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) ) rc = -EFAULT; break; } return rc; } /** * This structure defines our scheduler for Xen. * The entries tell Xen where to find our scheduler-specific * callback functions. * The symbol must be visible to the rest of Xen at link time. */ const struct scheduler sched_arinc653_def = { .name = "ARINC 653 Scheduler", .opt_name = "arinc653", .sched_id = XEN_SCHEDULER_ARINC653, .sched_data = NULL, .init = a653sched_init, .deinit = a653sched_deinit, .free_vdata = a653sched_free_vdata, .alloc_vdata = a653sched_alloc_vdata, .free_pdata = a653sched_free_pdata, .alloc_pdata = a653sched_alloc_pdata, .free_domdata = a653sched_free_domdata, .alloc_domdata = a653sched_alloc_domdata, .init_domain = NULL, .destroy_domain = NULL, .insert_vcpu = NULL, .remove_vcpu = NULL, .sleep = a653sched_vcpu_sleep, .wake = a653sched_vcpu_wake, .yield = NULL, .context_saved = NULL, .do_schedule = a653sched_do_schedule, .pick_cpu = a653sched_pick_cpu, .adjust = NULL, .adjust_global = a653sched_adjust_global, .dump_settings = NULL, .dump_cpu_state = NULL, .tick_suspend = NULL, .tick_resume = NULL, }; /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/spinlock.c0000664000175000017500000003660312307313555014523 0ustar smbsmb#include #include #include #include #include #include #include #include #include #include #include #ifndef NDEBUG static atomic_t spin_debug __read_mostly = ATOMIC_INIT(0); static void check_lock(struct lock_debug *debug) { int irq_safe = !local_irq_is_enabled(); if ( unlikely(atomic_read(&spin_debug) <= 0) ) return; /* A few places take liberties with this. */ /* BUG_ON(in_irq() && !irq_safe); */ /* * We partition locks into IRQ-safe (always held with IRQs disabled) and * IRQ-unsafe (always held with IRQs enabled) types. The convention for * every lock must be consistently observed else we can deadlock in * IRQ-context rendezvous functions (a rendezvous which gets every CPU * into IRQ context before any CPU is released from the rendezvous). * * If we can mix IRQ-disabled and IRQ-enabled callers, the following can * happen: * * Lock is held by CPU A, with IRQs enabled * * CPU B is spinning on same lock, with IRQs disabled * * Rendezvous starts -- CPU A takes interrupt and enters rendezbous spin * * DEADLOCK -- CPU B will never enter rendezvous, CPU A will never exit * the rendezvous, and will hence never release the lock. 
* * To guard against this subtle bug we latch the IRQ safety of every * spinlock in the system, on first use. */ if ( unlikely(debug->irq_safe != irq_safe) ) { int seen = cmpxchg(&debug->irq_safe, -1, irq_safe); BUG_ON(seen == !irq_safe); } } static void check_barrier(struct lock_debug *debug) { if ( unlikely(atomic_read(&spin_debug) <= 0) ) return; /* * For a barrier, we have a relaxed IRQ-safety-consistency check. * * It is always safe to spin at the barrier with IRQs enabled -- that does * not prevent us from entering an IRQ-context rendezvous, and nor are * we preventing anyone else from doing so (since we do not actually * acquire the lock during a barrier operation). * * However, if we spin on an IRQ-unsafe lock with IRQs disabled then that * is clearly wrong, for the same reason outlined in check_lock() above. */ BUG_ON(!local_irq_is_enabled() && (debug->irq_safe == 0)); } void spin_debug_enable(void) { atomic_inc(&spin_debug); } void spin_debug_disable(void) { atomic_dec(&spin_debug); } #else /* defined(NDEBUG) */ #define check_lock(l) ((void)0) #define check_barrier(l) ((void)0) #endif #ifdef LOCK_PROFILE #define LOCK_PROFILE_REL \ if (lock->profile) \ { \ lock->profile->time_hold += NOW() - lock->profile->time_locked; \ lock->profile->lock_cnt++; \ } #define LOCK_PROFILE_VAR s_time_t block = 0 #define LOCK_PROFILE_BLOCK block = block ? : NOW(); #define LOCK_PROFILE_GOT \ if (lock->profile) \ { \ lock->profile->time_locked = NOW(); \ if (block) \ { \ lock->profile->time_block += lock->profile->time_locked - block; \ lock->profile->block_cnt++; \ } \ } #else #define LOCK_PROFILE_REL #define LOCK_PROFILE_VAR #define LOCK_PROFILE_BLOCK #define LOCK_PROFILE_GOT #endif void _spin_lock(spinlock_t *lock) { LOCK_PROFILE_VAR; check_lock(&lock->debug); while ( unlikely(!_raw_spin_trylock(&lock->raw)) ) { LOCK_PROFILE_BLOCK; while ( likely(_raw_spin_is_locked(&lock->raw)) ) cpu_relax(); } LOCK_PROFILE_GOT; preempt_disable(); } void _spin_lock_irq(spinlock_t *lock) { LOCK_PROFILE_VAR; ASSERT(local_irq_is_enabled()); local_irq_disable(); check_lock(&lock->debug); while ( unlikely(!_raw_spin_trylock(&lock->raw)) ) { LOCK_PROFILE_BLOCK; local_irq_enable(); while ( likely(_raw_spin_is_locked(&lock->raw)) ) cpu_relax(); local_irq_disable(); } LOCK_PROFILE_GOT; preempt_disable(); } unsigned long _spin_lock_irqsave(spinlock_t *lock) { unsigned long flags; LOCK_PROFILE_VAR; local_irq_save(flags); check_lock(&lock->debug); while ( unlikely(!_raw_spin_trylock(&lock->raw)) ) { LOCK_PROFILE_BLOCK; local_irq_restore(flags); while ( likely(_raw_spin_is_locked(&lock->raw)) ) cpu_relax(); local_irq_save(flags); } LOCK_PROFILE_GOT; preempt_disable(); return flags; } void _spin_unlock(spinlock_t *lock) { preempt_enable(); LOCK_PROFILE_REL; _raw_spin_unlock(&lock->raw); } void _spin_unlock_irq(spinlock_t *lock) { preempt_enable(); LOCK_PROFILE_REL; _raw_spin_unlock(&lock->raw); local_irq_enable(); } void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { preempt_enable(); LOCK_PROFILE_REL; _raw_spin_unlock(&lock->raw); local_irq_restore(flags); } int _spin_is_locked(spinlock_t *lock) { check_lock(&lock->debug); return _raw_spin_is_locked(&lock->raw); } int _spin_trylock(spinlock_t *lock) { check_lock(&lock->debug); if ( !_raw_spin_trylock(&lock->raw) ) return 0; #ifdef LOCK_PROFILE if (lock->profile) lock->profile->time_locked = NOW(); #endif preempt_disable(); return 1; } void _spin_barrier(spinlock_t *lock) { #ifdef LOCK_PROFILE s_time_t block = NOW(); u64 loop = 0; check_barrier(&lock->debug); 
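    /*
     * Poll, with a full barrier on each iteration, until the lock is seen
     * to be free; if more than one iteration was needed, the wait is
     * charged to this lock's profiling counters below.
     */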
do { smp_mb(); loop++;} while ( _raw_spin_is_locked(&lock->raw) ); if ((loop > 1) && lock->profile) { lock->profile->time_block += NOW() - block; lock->profile->block_cnt++; } #else check_barrier(&lock->debug); do { smp_mb(); } while ( _raw_spin_is_locked(&lock->raw) ); #endif smp_mb(); } int _spin_trylock_recursive(spinlock_t *lock) { int cpu = smp_processor_id(); /* Don't allow overflow of recurse_cpu field. */ BUILD_BUG_ON(NR_CPUS > 0xfffu); check_lock(&lock->debug); if ( likely(lock->recurse_cpu != cpu) ) { if ( !spin_trylock(lock) ) return 0; lock->recurse_cpu = cpu; } /* We support only fairly shallow recursion, else the counter overflows. */ ASSERT(lock->recurse_cnt < 0xfu); lock->recurse_cnt++; return 1; } void _spin_lock_recursive(spinlock_t *lock) { while ( !spin_trylock_recursive(lock) ) cpu_relax(); } void _spin_unlock_recursive(spinlock_t *lock) { if ( likely(--lock->recurse_cnt == 0) ) { lock->recurse_cpu = 0xfffu; spin_unlock(lock); } } void _read_lock(rwlock_t *lock) { check_lock(&lock->debug); while ( unlikely(!_raw_read_trylock(&lock->raw)) ) { while ( likely(_raw_rw_is_write_locked(&lock->raw)) ) cpu_relax(); } preempt_disable(); } void _read_lock_irq(rwlock_t *lock) { ASSERT(local_irq_is_enabled()); local_irq_disable(); check_lock(&lock->debug); while ( unlikely(!_raw_read_trylock(&lock->raw)) ) { local_irq_enable(); while ( likely(_raw_rw_is_write_locked(&lock->raw)) ) cpu_relax(); local_irq_disable(); } preempt_disable(); } unsigned long _read_lock_irqsave(rwlock_t *lock) { unsigned long flags; local_irq_save(flags); check_lock(&lock->debug); while ( unlikely(!_raw_read_trylock(&lock->raw)) ) { local_irq_restore(flags); while ( likely(_raw_rw_is_write_locked(&lock->raw)) ) cpu_relax(); local_irq_save(flags); } preempt_disable(); return flags; } int _read_trylock(rwlock_t *lock) { check_lock(&lock->debug); if ( !_raw_read_trylock(&lock->raw) ) return 0; preempt_disable(); return 1; } void _read_unlock(rwlock_t *lock) { preempt_enable(); _raw_read_unlock(&lock->raw); } void _read_unlock_irq(rwlock_t *lock) { preempt_enable(); _raw_read_unlock(&lock->raw); local_irq_enable(); } void _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { preempt_enable(); _raw_read_unlock(&lock->raw); local_irq_restore(flags); } void _write_lock(rwlock_t *lock) { check_lock(&lock->debug); while ( unlikely(!_raw_write_trylock(&lock->raw)) ) { while ( likely(_raw_rw_is_locked(&lock->raw)) ) cpu_relax(); } preempt_disable(); } void _write_lock_irq(rwlock_t *lock) { ASSERT(local_irq_is_enabled()); local_irq_disable(); check_lock(&lock->debug); while ( unlikely(!_raw_write_trylock(&lock->raw)) ) { local_irq_enable(); while ( likely(_raw_rw_is_locked(&lock->raw)) ) cpu_relax(); local_irq_disable(); } preempt_disable(); } unsigned long _write_lock_irqsave(rwlock_t *lock) { unsigned long flags; local_irq_save(flags); check_lock(&lock->debug); while ( unlikely(!_raw_write_trylock(&lock->raw)) ) { local_irq_restore(flags); while ( likely(_raw_rw_is_locked(&lock->raw)) ) cpu_relax(); local_irq_save(flags); } preempt_disable(); return flags; } int _write_trylock(rwlock_t *lock) { check_lock(&lock->debug); if ( !_raw_write_trylock(&lock->raw) ) return 0; preempt_disable(); return 1; } void _write_unlock(rwlock_t *lock) { preempt_enable(); _raw_write_unlock(&lock->raw); } void _write_unlock_irq(rwlock_t *lock) { preempt_enable(); _raw_write_unlock(&lock->raw); local_irq_enable(); } void _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { preempt_enable(); 
_raw_write_unlock(&lock->raw); local_irq_restore(flags); } int _rw_is_locked(rwlock_t *lock) { check_lock(&lock->debug); return _raw_rw_is_locked(&lock->raw); } int _rw_is_write_locked(rwlock_t *lock) { check_lock(&lock->debug); return _raw_rw_is_write_locked(&lock->raw); } #ifdef LOCK_PROFILE struct lock_profile_anc { struct lock_profile_qhead *head_q; /* first head of this type */ char *name; /* descriptive string for print */ }; typedef void lock_profile_subfunc( struct lock_profile *, int32_t, int32_t, void *); extern struct lock_profile *__lock_profile_start; extern struct lock_profile *__lock_profile_end; static s_time_t lock_profile_start; static struct lock_profile_anc lock_profile_ancs[LOCKPROF_TYPE_N]; static struct lock_profile_qhead lock_profile_glb_q; static spinlock_t lock_profile_lock = SPIN_LOCK_UNLOCKED; static void spinlock_profile_iterate(lock_profile_subfunc *sub, void *par) { int i; struct lock_profile_qhead *hq; struct lock_profile *eq; spin_lock(&lock_profile_lock); for ( i = 0; i < LOCKPROF_TYPE_N; i++ ) for ( hq = lock_profile_ancs[i].head_q; hq; hq = hq->head_q ) for ( eq = hq->elem_q; eq; eq = eq->next ) sub(eq, i, hq->idx, par); spin_unlock(&lock_profile_lock); } static void spinlock_profile_print_elem(struct lock_profile *data, int32_t type, int32_t idx, void *par) { if ( type == LOCKPROF_TYPE_GLOBAL ) printk("%s %s:\n", lock_profile_ancs[type].name, data->name); else printk("%s %d %s:\n", lock_profile_ancs[type].name, idx, data->name); printk(" lock:%12"PRId64"(%08X:%08X), block:%12"PRId64"(%08X:%08X)\n", data->lock_cnt, (u32)(data->time_hold >> 32), (u32)data->time_hold, data->block_cnt, (u32)(data->time_block >> 32), (u32)data->time_block); } void spinlock_profile_printall(unsigned char key) { s_time_t now = NOW(); s_time_t diff; diff = now - lock_profile_start; printk("Xen lock profile info SHOW (now = %08X:%08X, " "total = %08X:%08X)\n", (u32)(now>>32), (u32)now, (u32)(diff>>32), (u32)diff); spinlock_profile_iterate(spinlock_profile_print_elem, NULL); } static void spinlock_profile_reset_elem(struct lock_profile *data, int32_t type, int32_t idx, void *par) { data->lock_cnt = 0; data->block_cnt = 0; data->time_hold = 0; data->time_block = 0; } void spinlock_profile_reset(unsigned char key) { s_time_t now = NOW(); if ( key != '\0' ) printk("Xen lock profile info RESET (now = %08X:%08X)\n", (u32)(now>>32), (u32)now); lock_profile_start = now; spinlock_profile_iterate(spinlock_profile_reset_elem, NULL); } typedef struct { xen_sysctl_lockprof_op_t *pc; int rc; } spinlock_profile_ucopy_t; static void spinlock_profile_ucopy_elem(struct lock_profile *data, int32_t type, int32_t idx, void *par) { spinlock_profile_ucopy_t *p = par; xen_sysctl_lockprof_data_t elem; if ( p->rc ) return; if ( p->pc->nr_elem < p->pc->max_elem ) { safe_strcpy(elem.name, data->name); elem.type = type; elem.idx = idx; elem.lock_cnt = data->lock_cnt; elem.block_cnt = data->block_cnt; elem.lock_time = data->time_hold; elem.block_time = data->time_block; if ( copy_to_guest_offset(p->pc->data, p->pc->nr_elem, &elem, 1) ) p->rc = -EFAULT; } if ( !p->rc ) p->pc->nr_elem++; } /* Dom0 control of lock profiling */ int spinlock_profile_control(xen_sysctl_lockprof_op_t *pc) { int rc = 0; spinlock_profile_ucopy_t par; switch ( pc->cmd ) { case XEN_SYSCTL_LOCKPROF_reset: spinlock_profile_reset('\0'); break; case XEN_SYSCTL_LOCKPROF_query: pc->nr_elem = 0; par.rc = 0; par.pc = pc; spinlock_profile_iterate(spinlock_profile_ucopy_elem, &par); pc->time = NOW() - lock_profile_start; rc = par.rc; break; 
default: rc = -EINVAL; break; } return rc; } void _lock_profile_register_struct( int32_t type, struct lock_profile_qhead *qhead, int32_t idx, char *name) { qhead->idx = idx; spin_lock(&lock_profile_lock); qhead->head_q = lock_profile_ancs[type].head_q; lock_profile_ancs[type].head_q = qhead; lock_profile_ancs[type].name = name; spin_unlock(&lock_profile_lock); } void _lock_profile_deregister_struct( int32_t type, struct lock_profile_qhead *qhead) { struct lock_profile_qhead **q; spin_lock(&lock_profile_lock); for ( q = &lock_profile_ancs[type].head_q; *q; q = &(*q)->head_q ) { if ( *q == qhead ) { *q = qhead->head_q; break; } } spin_unlock(&lock_profile_lock); } static int __init lock_prof_init(void) { struct lock_profile **q; for ( q = &__lock_profile_start; q < &__lock_profile_end; q++ ) { (*q)->next = lock_profile_glb_q.elem_q; lock_profile_glb_q.elem_q = *q; (*q)->lock->profile = *q; } _lock_profile_register_struct( LOCKPROF_TYPE_GLOBAL, &lock_profile_glb_q, 0, "Global lock"); return 0; } __initcall(lock_prof_init); #endif /* LOCK_PROFILE */ xen-4.4.0/xen/common/sort.c0000664000175000017500000000413512307313555013663 0ustar smbsmb/* * A fast, small, non-recursive O(nlog n) sort for the Linux kernel * * Jan 23 2005 Matt Mackall */ #include static void u32_swap(void *a, void *b, int size) { u32 t = *(u32 *)a; *(u32 *)a = *(u32 *)b; *(u32 *)b = t; } static void generic_swap(void *a, void *b, int size) { char t; do { t = *(char *)a; *(char *)a++ = *(char *)b; *(char *)b++ = t; } while ( --size > 0 ); } /* * sort - sort an array of elements * @base: pointer to data to sort * @num: number of elements * @size: size of each element * @cmp: pointer to comparison function * @swap: pointer to swap function or NULL * * This function does a heapsort on the given array. You may provide a * swap function optimized to your element type. * * Sorting time is O(n log n) both on average and worst-case. While * qsort is about 20% faster on average, it suffers from exploitable * O(n*n) worst-case behavior and extra memory requirements that make * it less suitable for kernel use. */ void sort(void *base, size_t num, size_t size, int (*cmp)(const void *, const void *), void (*swap)(void *, void *, int size)) { /* pre-scale counters for performance */ int i = (num/2) * size, n = num * size, c, r; if (!swap) swap = (size == 4 ? u32_swap : generic_swap); /* heapify */ for ( ; i >= 0; i -= size ) { for ( r = i; r * 2 < n; r = c ) { c = r * 2; if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) ) c += size; if ( cmp(base + r, base + c) >= 0 ) break; swap(base + r, base + c, size); } } /* sort */ for ( i = n - size; i >= 0; i -= size ) { swap(base, base + i, size); for ( r = 0; r * 2 < i; r = c ) { c = r * 2; if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) ) c += size; if ( cmp(base + r, base + c) >= 0 ) break; swap(base + r, base + c, size); } } } xen-4.4.0/xen/common/rbtree.c0000664000175000017500000002542312307313555014162 0ustar smbsmb/* Red Black Trees (C) 1999 Andrea Arcangeli (C) 2002 David Woodhouse This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA linux/lib/rbtree.c */ #include #include #include static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) { struct rb_node *right = node->rb_right; struct rb_node *parent = rb_parent(node); if ((node->rb_right = right->rb_left)) rb_set_parent(right->rb_left, node); right->rb_left = node; rb_set_parent(right, parent); if (parent) { if (node == parent->rb_left) parent->rb_left = right; else parent->rb_right = right; } else root->rb_node = right; rb_set_parent(node, right); } static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) { struct rb_node *left = node->rb_left; struct rb_node *parent = rb_parent(node); if ((node->rb_left = left->rb_right)) rb_set_parent(left->rb_right, node); left->rb_right = node; rb_set_parent(left, parent); if (parent) { if (node == parent->rb_right) parent->rb_right = left; else parent->rb_left = left; } else root->rb_node = left; rb_set_parent(node, left); } void rb_insert_color(struct rb_node *node, struct rb_root *root) { struct rb_node *parent, *gparent; while ((parent = rb_parent(node)) && rb_is_red(parent)) { gparent = rb_parent(parent); if (parent == gparent->rb_left) { { register struct rb_node *uncle = gparent->rb_right; if (uncle && rb_is_red(uncle)) { rb_set_black(uncle); rb_set_black(parent); rb_set_red(gparent); node = gparent; continue; } } if (parent->rb_right == node) { register struct rb_node *tmp; __rb_rotate_left(parent, root); tmp = parent; parent = node; node = tmp; } rb_set_black(parent); rb_set_red(gparent); __rb_rotate_right(gparent, root); } else { { register struct rb_node *uncle = gparent->rb_left; if (uncle && rb_is_red(uncle)) { rb_set_black(uncle); rb_set_black(parent); rb_set_red(gparent); node = gparent; continue; } } if (parent->rb_left == node) { register struct rb_node *tmp; __rb_rotate_right(parent, root); tmp = parent; parent = node; node = tmp; } rb_set_black(parent); rb_set_red(gparent); __rb_rotate_left(gparent, root); } } rb_set_black(root->rb_node); } EXPORT_SYMBOL(rb_insert_color); static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, struct rb_root *root) { struct rb_node *other; while ((!node || rb_is_black(node)) && node != root->rb_node) { if (parent->rb_left == node) { other = parent->rb_right; if (rb_is_red(other)) { rb_set_black(other); rb_set_red(parent); __rb_rotate_left(parent, root); other = parent->rb_right; } if ((!other->rb_left || rb_is_black(other->rb_left)) && (!other->rb_right || rb_is_black(other->rb_right))) { rb_set_red(other); node = parent; parent = rb_parent(node); } else { if (!other->rb_right || rb_is_black(other->rb_right)) { struct rb_node *o_left; if ((o_left = other->rb_left)) rb_set_black(o_left); rb_set_red(other); __rb_rotate_right(other, root); other = parent->rb_right; } rb_set_color(other, rb_color(parent)); rb_set_black(parent); if (other->rb_right) rb_set_black(other->rb_right); __rb_rotate_left(parent, root); node = root->rb_node; break; } } else { other = parent->rb_left; if (rb_is_red(other)) { rb_set_black(other); rb_set_red(parent); __rb_rotate_right(parent, root); other = parent->rb_left; } if ((!other->rb_left || rb_is_black(other->rb_left)) && (!other->rb_right || rb_is_black(other->rb_right))) { rb_set_red(other); node = parent; parent = rb_parent(node); } else { if (!other->rb_left || rb_is_black(other->rb_left)) { register 
struct rb_node *o_right; if ((o_right = other->rb_right)) rb_set_black(o_right); rb_set_red(other); __rb_rotate_left(other, root); other = parent->rb_left; } rb_set_color(other, rb_color(parent)); rb_set_black(parent); if (other->rb_left) rb_set_black(other->rb_left); __rb_rotate_right(parent, root); node = root->rb_node; break; } } } if (node) rb_set_black(node); } void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *child, *parent; int color; if (!node->rb_left) child = node->rb_right; else if (!node->rb_right) child = node->rb_left; else { struct rb_node *old = node, *left; node = node->rb_right; while ((left = node->rb_left) != NULL) node = left; child = node->rb_right; parent = rb_parent(node); color = rb_color(node); if (child) rb_set_parent(child, parent); if (parent == old) { parent->rb_right = child; parent = node; } else parent->rb_left = child; node->rb_parent_color = old->rb_parent_color; node->rb_right = old->rb_right; node->rb_left = old->rb_left; if (rb_parent(old)) { if (rb_parent(old)->rb_left == old) rb_parent(old)->rb_left = node; else rb_parent(old)->rb_right = node; } else root->rb_node = node; rb_set_parent(old->rb_left, node); if (old->rb_right) rb_set_parent(old->rb_right, node); goto color; } parent = rb_parent(node); color = rb_color(node); if (child) rb_set_parent(child, parent); if (parent) { if (parent->rb_left == node) parent->rb_left = child; else parent->rb_right = child; } else root->rb_node = child; color: if (color == RB_BLACK) __rb_erase_color(child, parent, root); } EXPORT_SYMBOL(rb_erase); /* * This function returns the first node (in sort order) of the tree. */ struct rb_node *rb_first(struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_left) n = n->rb_left; return n; } EXPORT_SYMBOL(rb_first); struct rb_node *rb_last(struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_right) n = n->rb_right; return n; } EXPORT_SYMBOL(rb_last); struct rb_node *rb_next(struct rb_node *node) { struct rb_node *parent; if (rb_parent(node) == node) return NULL; /* If we have a right-hand child, go down and then left as far as we can. */ if (node->rb_right) { node = node->rb_right; while (node->rb_left) node=node->rb_left; return node; } /* No right-hand children. Everything down and left is smaller than us, so any 'next' node must be in the general direction of our parent. Go up the tree; any time the ancestor is a right-hand child of its parent, keep going up. First time it's a left-hand child of its parent, said parent is our 'next' node. */ while ((parent = rb_parent(node)) && node == parent->rb_right) node = parent; return parent; } EXPORT_SYMBOL(rb_next); struct rb_node *rb_prev(struct rb_node *node) { struct rb_node *parent; if (rb_parent(node) == node) return NULL; /* If we have a left-hand child, go down and then right as far as we can. */ if (node->rb_left) { node = node->rb_left; while (node->rb_right) node=node->rb_right; return node; } /* No left-hand children. 
Go up till we find an ancestor which is a right-hand child of its parent */ while ((parent = rb_parent(node)) && node == parent->rb_left) node = parent; return parent; } EXPORT_SYMBOL(rb_prev); void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root) { struct rb_node *parent = rb_parent(victim); /* Set the surrounding nodes to point to the replacement */ if (parent) { if (victim == parent->rb_left) parent->rb_left = new; else parent->rb_right = new; } else { root->rb_node = new; } if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) rb_set_parent(victim->rb_right, new); /* Copy the pointers/colour from the victim to the replacement */ *new = *victim; } EXPORT_SYMBOL(rb_replace_node); xen-4.4.0/xen/common/hvm/0000775000175000017500000000000012307313555013317 5ustar smbsmbxen-4.4.0/xen/common/hvm/Makefile0000664000175000017500000000002012307313555014747 0ustar smbsmbobj-y += save.o xen-4.4.0/xen/common/hvm/save.c0000664000175000017500000002357112307313555014431 0ustar smbsmb/* * hvm/save.c: Save and restore HVM guest's emulated hardware state. * * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2007, Isaku Yamahata * VA Linux Systems Japan K.K. * split arch generic part * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include /* List of handlers for various HVM save and restore types */ static struct { hvm_save_handler save; hvm_load_handler load; const char *name; size_t size; int kind; } hvm_sr_handlers [HVM_SAVE_CODE_MAX + 1] = {{NULL, NULL, ""},}; /* Init-time function to add entries to that list */ void __init hvm_register_savevm(uint16_t typecode, const char *name, hvm_save_handler save_state, hvm_load_handler load_state, size_t size, int kind) { ASSERT(typecode <= HVM_SAVE_CODE_MAX); ASSERT(hvm_sr_handlers[typecode].save == NULL); ASSERT(hvm_sr_handlers[typecode].load == NULL); hvm_sr_handlers[typecode].save = save_state; hvm_sr_handlers[typecode].load = load_state; hvm_sr_handlers[typecode].name = name; hvm_sr_handlers[typecode].size = size; hvm_sr_handlers[typecode].kind = kind; } size_t hvm_save_size(struct domain *d) { struct vcpu *v; size_t sz; int i; /* Basic overhead for header and footer */ sz = (2 * sizeof (struct hvm_save_descriptor)) + HVM_SAVE_LENGTH(HEADER); /* Plus space for each thing we will be saving */ for ( i = 0; i <= HVM_SAVE_CODE_MAX; i++ ) if ( hvm_sr_handlers[i].kind == HVMSR_PER_VCPU ) for_each_vcpu(d, v) sz += hvm_sr_handlers[i].size; else sz += hvm_sr_handlers[i].size; return sz; } /* Extract a single instance of a save record, by marshalling all * records of that type and copying out the one we need. 
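 *
 * In outline: the type's registered save handler marshals every instance
 * into a temporary context, the descriptor chain in that context is then
 * walked until the requested instance number is found, and only that
 * record's payload is copied out through the guest handle.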
*/ int hvm_save_one(struct domain *d, uint16_t typecode, uint16_t instance, XEN_GUEST_HANDLE_64(uint8) handle) { int rv = 0; size_t sz = 0; struct vcpu *v; hvm_domain_context_t ctxt = { 0, }; if ( d->is_dying || typecode > HVM_SAVE_CODE_MAX || hvm_sr_handlers[typecode].size < sizeof(struct hvm_save_descriptor) || hvm_sr_handlers[typecode].save == NULL ) return -EINVAL; if ( hvm_sr_handlers[typecode].kind == HVMSR_PER_VCPU ) for_each_vcpu(d, v) sz += hvm_sr_handlers[typecode].size; else sz = hvm_sr_handlers[typecode].size; ctxt.size = sz; ctxt.data = xmalloc_bytes(sz); if ( !ctxt.data ) return -ENOMEM; if ( hvm_sr_handlers[typecode].save(d, &ctxt) != 0 ) { printk(XENLOG_G_ERR "HVM%d save: failed to save type %"PRIu16"\n", d->domain_id, typecode); rv = -EFAULT; } else { uint32_t off; const struct hvm_save_descriptor *desc; rv = -EBADSLT; for ( off = 0; off < (ctxt.cur - sizeof(*desc)); off += desc->length ) { desc = (void *)(ctxt.data + off); /* Move past header */ off += sizeof(*desc); if ( instance == desc->instance ) { uint32_t copy_length = desc->length; if ( off + copy_length > ctxt.cur ) copy_length = ctxt.cur - off; rv = 0; if ( copy_to_guest(handle, ctxt.data + off, copy_length) ) rv = -EFAULT; break; } } } xfree(ctxt.data); return rv; } int hvm_save(struct domain *d, hvm_domain_context_t *h) { char *c; struct hvm_save_header hdr; struct hvm_save_end end; hvm_save_handler handler; uint16_t i; if ( d->is_dying ) return -EINVAL; hdr.magic = HVM_FILE_MAGIC; hdr.version = HVM_FILE_VERSION; /* Save xen changeset */ c = strrchr(xen_changeset(), ':'); if ( c ) hdr.changeset = simple_strtoll(c, NULL, 16); else hdr.changeset = -1ULL; /* Unknown */ arch_hvm_save(d, &hdr); if ( hvm_save_entry(HEADER, 0, h, &hdr) != 0 ) { printk(XENLOG_G_ERR "HVM%d save: failed to write header\n", d->domain_id); return -EFAULT; } /* Save all available kinds of state */ for ( i = 0; i <= HVM_SAVE_CODE_MAX; i++ ) { handler = hvm_sr_handlers[i].save; if ( handler != NULL ) { printk(XENLOG_G_INFO "HVM%d save: %s\n", d->domain_id, hvm_sr_handlers[i].name); if ( handler(d, h) != 0 ) { printk(XENLOG_G_ERR "HVM%d save: failed to save type %"PRIu16"\n", d->domain_id, i); return -EFAULT; } } } /* Save an end-of-file marker */ if ( hvm_save_entry(END, 0, h, &end) != 0 ) { /* Run out of data */ printk(XENLOG_G_ERR "HVM%d save: no room for end marker\n", d->domain_id); return -EFAULT; } /* Save macros should not have let us overrun */ ASSERT(h->cur <= h->size); return 0; } int hvm_load(struct domain *d, hvm_domain_context_t *h) { struct hvm_save_header hdr; struct hvm_save_descriptor *desc; hvm_load_handler handler; struct vcpu *v; if ( d->is_dying ) return -EINVAL; /* Read the save header, which must be first */ if ( hvm_load_entry(HEADER, h, &hdr) != 0 ) return -1; if ( arch_hvm_load(d, &hdr) ) return -1; /* Down all the vcpus: we only re-enable the ones that had state saved. 
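 *
 * The stream parsed here is the one hvm_save() produces: a HEADER record,
 * then the records written by each registered handler, terminated by an
 * END record whose typecode of zero is what the loop below stops on.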
*/ for_each_vcpu(d, v) if ( test_and_set_bit(_VPF_down, &v->pause_flags) ) vcpu_sleep_nosync(v); for ( ; ; ) { if ( h->size - h->cur < sizeof(struct hvm_save_descriptor) ) { /* Run out of data */ printk(XENLOG_G_ERR "HVM%d restore: save did not end with a null entry\n", d->domain_id); return -1; } /* Read the typecode of the next entry and check for the end-marker */ desc = (struct hvm_save_descriptor *)(&h->data[h->cur]); if ( desc->typecode == 0 ) return 0; /* Find the handler for this entry */ if ( (desc->typecode > HVM_SAVE_CODE_MAX) || ((handler = hvm_sr_handlers[desc->typecode].load) == NULL) ) { printk(XENLOG_G_ERR "HVM%d restore: unknown entry typecode %u\n", d->domain_id, desc->typecode); return -1; } /* Load the entry */ printk(XENLOG_G_INFO "HVM%d restore: %s %"PRIu16"\n", d->domain_id, hvm_sr_handlers[desc->typecode].name, desc->instance); if ( handler(d, h) != 0 ) { printk(XENLOG_G_ERR "HVM%d restore: failed to load entry %u/%u\n", d->domain_id, desc->typecode, desc->instance); return -1; } } /* Not reached */ } int _hvm_init_entry(struct hvm_domain_context *h, uint16_t tc, uint16_t inst, uint32_t len) { struct hvm_save_descriptor *d = (struct hvm_save_descriptor *)&h->data[h->cur]; if ( h->size - h->cur < len + sizeof (*d) ) { printk(XENLOG_G_WARNING "HVM save: no room for" " %"PRIu32" + %zu bytes for typecode %"PRIu16"\n", len, sizeof(*d), tc); return -1; } d->typecode = tc; d->instance = inst; d->length = len; h->cur += sizeof(*d); return 0; } void _hvm_write_entry(struct hvm_domain_context *h, void *src, uint32_t src_len) { memcpy(&h->data[h->cur], src, src_len); h->cur += src_len; } int _hvm_check_entry(struct hvm_domain_context *h, uint16_t type, uint32_t len, bool_t strict_length) { struct hvm_save_descriptor *d = (struct hvm_save_descriptor *)&h->data[h->cur]; if ( len + sizeof (*d) > h->size - h->cur) { printk(XENLOG_G_WARNING "HVM restore: not enough data left to read %u bytes " "for type %u\n", len, type); return -1; } if ( (type != d->typecode) || (len < d->length) || (strict_length && (len != d->length)) ) { printk(XENLOG_G_WARNING "HVM restore mismatch: expected type %u length %u, " "saw type %u length %u\n", type, len, d->typecode, d->length); return -1; } h->cur += sizeof(*d); return 0; } void _hvm_read_entry(struct hvm_domain_context *h, void *dest, uint32_t dest_len) { struct hvm_save_descriptor *d = (struct hvm_save_descriptor *)&h->data[h->cur - sizeof(*d)]; BUG_ON(d->length > dest_len); memcpy(dest, &h->data[h->cur], d->length); if ( d->length < dest_len ) memset((char *)dest + d->length, 0, dest_len - d->length); h->cur += d->length; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/bitmap.c0000664000175000017500000003440212307313555014150 0ustar smbsmb/* * lib/bitmap.c * Helper functions for bitmap.h. * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ #include #include #include #include #include #include /* * bitmaps provide an array of bits, implemented using an an * array of unsigned longs. The number of valid bits in a * given bitmap does _not_ need to be an exact multiple of * BITS_PER_LONG. * * The possible unused bits in the last, partially used word * of a bitmap are 'don't care'. The implementation makes * no particular effort to keep them zero. It ensures that * their value will not affect the results of any operation. 
* The bitmap operations that return Boolean (bitmap_empty, * for example) or scalar (bitmap_weight, for example) results * carefully filter out these unused bits from impacting their * results. * * These operations actually hold to a slightly stronger rule: * if you don't input any bitmaps to these ops that have some * unused bits set, then they won't output any set unused bits * in output bitmaps. * * The byte ordering of bitmaps is more natural on little * endian architectures. See the big-endian headers * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h * for the best explanations of this ordering. */ /* * If a bitmap has a number of bits which is not a multiple of 8 then * the last few bits of the last byte of the bitmap can be * unexpectedly set which can confuse consumers (e.g. in the tools) * who also round up their loops to 8 bits. Ensure we clear those left * over bits so as to prevent surprises. */ static void clamp_last_byte(uint8_t *bp, unsigned int nbits) { unsigned int remainder = nbits % 8; if (remainder) bp[nbits/8] &= (1U << remainder) - 1; } int __bitmap_empty(const unsigned long *bitmap, int bits) { int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap[k]) return 0; if (bits % BITS_PER_LONG) if (bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) return 0; return 1; } EXPORT_SYMBOL(__bitmap_empty); int __bitmap_full(const unsigned long *bitmap, int bits) { int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (~bitmap[k]) return 0; if (bits % BITS_PER_LONG) if (~bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) return 0; return 1; } EXPORT_SYMBOL(__bitmap_full); int __bitmap_equal(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits) { int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] != bitmap2[k]) return 0; if (bits % BITS_PER_LONG) if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return 0; return 1; } EXPORT_SYMBOL(__bitmap_equal); void __bitmap_complement(unsigned long *dst, const unsigned long *src, int bits) { int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) dst[k] = ~src[k]; if (bits % BITS_PER_LONG) dst[k] = ~src[k] & BITMAP_LAST_WORD_MASK(bits); } EXPORT_SYMBOL(__bitmap_complement); /* * __bitmap_shift_right - logical right shift of the bits in a bitmap * @dst - destination bitmap * @src - source bitmap * @nbits - shift by this many bits * @bits - bitmap size, in bits * * Shifting right (dividing) means moving bits in the MS -> LS bit * direction. Zeros are fed into the vacated MS positions and the * LS bits shifted off the bottom are lost. */ void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, int shift, int bits) { int k, lim = BITS_TO_LONGS(bits), left = bits % BITS_PER_LONG; int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; unsigned long mask = (1UL << left) - 1; for (k = 0; off + k < lim; ++k) { unsigned long upper, lower; /* * If shift is not word aligned, take lower rem bits of * word above and make them the top rem bits of result. 
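 *
 * Worked example (assuming 64-bit longs): a shift of 68 bits gives
 * off = 1 and rem = 4, so dst[k] takes the low 4 bits of src[k + 2] as its
 * top bits and the high 60 bits of src[k + 1] as the rest, after the last,
 * partially-used source word has been masked down to its valid bits.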
*/ if (!rem || off + k + 1 >= lim) upper = 0; else { upper = src[off + k + 1]; if (off + k + 1 == lim - 1 && left) upper &= mask; } lower = src[off + k]; if (left && off + k == lim - 1) lower &= mask; dst[k] = upper << (BITS_PER_LONG - rem) | lower >> rem; if (left && k == lim - 1) dst[k] &= mask; } if (off) memset(&dst[lim - off], 0, off*sizeof(unsigned long)); } EXPORT_SYMBOL(__bitmap_shift_right); /* * __bitmap_shift_left - logical left shift of the bits in a bitmap * @dst - destination bitmap * @src - source bitmap * @nbits - shift by this many bits * @bits - bitmap size, in bits * * Shifting left (multiplying) means moving bits in the LS -> MS * direction. Zeros are fed into the vacated LS bit positions * and those MS bits shifted off the top are lost. */ void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, int shift, int bits) { int k, lim = BITS_TO_LONGS(bits), left = bits % BITS_PER_LONG; int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; for (k = lim - off - 1; k >= 0; --k) { unsigned long upper, lower; /* * If shift is not word aligned, take upper rem bits of * word below and make them the bottom rem bits of result. */ if (rem && k > 0) lower = src[k - 1]; else lower = 0; upper = src[k]; if (left && k == lim - 1) upper &= (1UL << left) - 1; dst[k + off] = lower >> (BITS_PER_LONG - rem) | upper << rem; if (left && k + off == lim - 1) dst[k + off] &= (1UL << left) - 1; } if (off) memset(dst, 0, off*sizeof(unsigned long)); } EXPORT_SYMBOL(__bitmap_shift_left); void __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits) { int k; int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] & bitmap2[k]; } EXPORT_SYMBOL(__bitmap_and); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits) { int k; int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] | bitmap2[k]; } EXPORT_SYMBOL(__bitmap_or); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits) { int k; int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] ^ bitmap2[k]; } EXPORT_SYMBOL(__bitmap_xor); void __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits) { int k; int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] & ~bitmap2[k]; } EXPORT_SYMBOL(__bitmap_andnot); int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits) { int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & bitmap2[k]) return 1; if (bits % BITS_PER_LONG) if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return 1; return 0; } EXPORT_SYMBOL(__bitmap_intersects); int __bitmap_subset(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits) { int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & ~bitmap2[k]) return 0; if (bits % BITS_PER_LONG) if ((bitmap1[k] & ~bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) return 0; return 1; } EXPORT_SYMBOL(__bitmap_subset); #if BITS_PER_LONG == 32 int __bitmap_weight(const unsigned long *bitmap, int bits) { int k, w = 0, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; k++) w += hweight32(bitmap[k]); if (bits % BITS_PER_LONG) w += hweight32(bitmap[k] & BITMAP_LAST_WORD_MASK(bits)); return w; } #else int __bitmap_weight(const unsigned long *bitmap, int bits) { int k, w = 0, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; k++) w += 
hweight64(bitmap[k]); if (bits % BITS_PER_LONG) w += hweight64(bitmap[k] & BITMAP_LAST_WORD_MASK(bits)); return w; } #endif EXPORT_SYMBOL(__bitmap_weight); /* * Bitmap printing & parsing functions: first version by Bill Irwin, * second version by Paul Jackson, third by Joe Korty. */ #define CHUNKSZ 32 #define nbits_to_hold_value(val) fls(val) #define roundup_power2(val,modulus) (((val) + (modulus) - 1) & ~((modulus) - 1)) #define unhex(c) (isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10)) #define BASEDEC 10 /* fancier cpuset lists input in decimal */ /** * bitmap_scnprintf - convert bitmap to an ASCII hex string. * @buf: byte buffer into which string is placed * @buflen: reserved size of @buf, in bytes * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * * Exactly @nmaskbits bits are displayed. Hex digits are grouped into * comma-separated sets of eight digits per set. */ int bitmap_scnprintf(char *buf, unsigned int buflen, const unsigned long *maskp, int nmaskbits) { int i, word, bit, len = 0; unsigned long val; const char *sep = ""; int chunksz; u32 chunkmask; chunksz = nmaskbits & (CHUNKSZ - 1); if (chunksz == 0) chunksz = CHUNKSZ; i = roundup_power2(nmaskbits, CHUNKSZ) - CHUNKSZ; for (; i >= 0; i -= CHUNKSZ) { chunkmask = ((1ULL << chunksz) - 1); word = i / BITS_PER_LONG; bit = i % BITS_PER_LONG; val = (maskp[word] >> bit) & chunkmask; len += scnprintf(buf+len, buflen-len, "%s%0*lx", sep, (chunksz+3)/4, val); chunksz = CHUNKSZ; sep = ","; } return len; } EXPORT_SYMBOL(bitmap_scnprintf); /* * bscnl_emit(buf, buflen, rbot, rtop, bp) * * Helper routine for bitmap_scnlistprintf(). Write decimal number * or range to buf, suppressing output past buf+buflen, with optional * comma-prefix. Return len of what would be written to buf, if it * all fit. */ static inline int bscnl_emit(char *buf, int buflen, int rbot, int rtop, int len) { if (len > 0) len += scnprintf(buf + len, buflen - len, ","); if (rbot == rtop) len += scnprintf(buf + len, buflen - len, "%d", rbot); else len += scnprintf(buf + len, buflen - len, "%d-%d", rbot, rtop); return len; } /** * bitmap_scnlistprintf - convert bitmap to list format ASCII string * @buf: byte buffer into which string is placed * @buflen: reserved size of @buf, in bytes * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * * Output format is a comma-separated list of decimal numbers and * ranges. Consecutively set bits are shown as two hyphen-separated * decimal numbers, the smallest and largest bit numbers set in * the range. Output format is compatible with the format * accepted as input by bitmap_parselist(). * * The return value is the number of characters which were output, * excluding the trailing '\0'. 
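 *
 * For example, a bitmap with bits 0, 1, 2 and 5 set is printed
 * as "0-2,5".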
*/ int bitmap_scnlistprintf(char *buf, unsigned int buflen, const unsigned long *maskp, int nmaskbits) { int len = 0; /* current bit is 'cur', most recently seen range is [rbot, rtop] */ int cur, rbot, rtop; rbot = cur = find_first_bit(maskp, nmaskbits); while (cur < nmaskbits) { rtop = cur; cur = find_next_bit(maskp, nmaskbits, cur+1); if (cur >= nmaskbits || cur > rtop + 1) { len = bscnl_emit(buf, buflen, rbot, rtop, len); rbot = cur; } } if (!len && buflen) *buf = 0; return len; } EXPORT_SYMBOL(bitmap_scnlistprintf); /** * bitmap_find_free_region - find a contiguous aligned mem region * @bitmap: an array of unsigned longs corresponding to the bitmap * @bits: number of bits in the bitmap * @order: region size to find (size is actually 1< BITS_PER_LONG) return -EINVAL; /* make a mask of the order */ mask = (1ul << (pages - 1)); mask += mask - 1; /* run up the bitmap pages bits at a time */ for (i = 0; i < bits; i += pages) { int index = i/BITS_PER_LONG; int offset = i - (index * BITS_PER_LONG); if((bitmap[index] & (mask << offset)) == 0) { /* set region in bimap */ bitmap[index] |= (mask << offset); return i; } } return -ENOMEM; } EXPORT_SYMBOL(bitmap_find_free_region); /** * bitmap_release_region - release allocated bitmap region * @bitmap: a pointer to the bitmap * @pos: the beginning of the region * @order: the order of the bits to release (number is 1< BITS_PER_LONG. The * algorithm would be a simple look for multiple zeros in the * array, but there's no driver today that needs this. If you * trip this BUG(), you get to code it... */ BUG_ON(pages > BITS_PER_LONG); mask += mask - 1; if (bitmap[index] & (mask << offset)) return -EBUSY; bitmap[index] |= (mask << offset); return 0; } EXPORT_SYMBOL(bitmap_allocate_region); #ifdef __BIG_ENDIAN void bitmap_long_to_byte(uint8_t *bp, const unsigned long *lp, int nbits) { unsigned long l; int i, j, b; for (i = 0, b = 0; nbits > 0; i++, b += sizeof(l)) { l = lp[i]; for (j = 0; (j < sizeof(l)) && (nbits > 0); j++) { bp[b+j] = l; l >>= 8; nbits -= 8; } } clamp_last_byte(bp, nbits); } void bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits) { unsigned long l; int i, j, b; for (i = 0, b = 0; nbits > 0; i++, b += sizeof(l)) { l = 0; for (j = 0; (j < sizeof(l)) && (nbits > 0); j++) { l |= (unsigned long)bp[b+j] << (j*8); nbits -= 8; } lp[i] = l; } } #elif defined(__LITTLE_ENDIAN) void bitmap_long_to_byte(uint8_t *bp, const unsigned long *lp, int nbits) { memcpy(bp, lp, (nbits+7)/8); clamp_last_byte(bp, nbits); } void bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits) { /* We may need to pad the final longword with zeroes. */ if (nbits & (BITS_PER_LONG-1)) lp[BITS_TO_LONGS(nbits)-1] = 0; memcpy(lp, bp, (nbits+7)/8); } #endif xen-4.4.0/xen/common/smp.c0000664000175000017500000000461712307313555013500 0ustar smbsmb/* * xen/common/smp.c * * Generic SMP function * * Copyright (c) 2013 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include /* * Structure and data for smp_call_function()/on_selected_cpus(). 
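 *
 * The caller fills in call_data while holding call_lock, sends the
 * call-function IPI to every CPU in 'selected', and then spins until
 * each target has cleared its own bit in 'selected' from
 * smp_call_function_interrupt().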
*/ static DEFINE_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); void *info; int wait; cpumask_t selected; } call_data; void smp_call_function( void (*func) (void *info), void *info, int wait) { cpumask_t allbutself; cpumask_andnot(&allbutself, &cpu_online_map, cpumask_of(smp_processor_id())); on_selected_cpus(&allbutself, func, info, wait); } void on_selected_cpus( const cpumask_t *selected, void (*func) (void *info), void *info, int wait) { unsigned int nr_cpus; ASSERT(local_irq_is_enabled()); spin_lock(&call_lock); cpumask_copy(&call_data.selected, selected); nr_cpus = cpumask_weight(&call_data.selected); if ( nr_cpus == 0 ) goto out; call_data.func = func; call_data.info = info; call_data.wait = wait; smp_send_call_function_mask(&call_data.selected); while ( !cpumask_empty(&call_data.selected) ) cpu_relax(); out: spin_unlock(&call_lock); } void smp_call_function_interrupt(void) { void (*func)(void *info) = call_data.func; void *info = call_data.info; unsigned int cpu = smp_processor_id(); if ( !cpumask_test_cpu(cpu, &call_data.selected) ) return; irq_enter(); if ( call_data.wait ) { (*func)(info); smp_mb(); cpumask_clear_cpu(cpu, &call_data.selected); } else { smp_mb(); cpumask_clear_cpu(cpu, &call_data.selected); (*func)(info); } irq_exit(); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/xencomm.c0000664000175000017500000003676112307313555014354 0ustar smbsmb/****************************************************************************** * xencomm.c * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright (C) IBM Corp. 2006 * * Authors: Hollis Blanchard * Tristan Gingold * Isaku Yamahata multiple page support */ #include #include #include #include #include #include #undef DEBUG #ifdef DEBUG #define xc_dprintk(f, a...) printk("[xencomm]" f , ## a) #else #define xc_dprintk(f, a...) ((void)0) #endif static void * xencomm_vaddr(unsigned long paddr, struct page_info *page) { return (void*)((paddr & ~PAGE_MASK) | (unsigned long)page_to_virt(page)); } /* get_page() to prevent another vcpu freeing the page. */ static int xencomm_get_page(unsigned long paddr, struct page_info **page) { unsigned long maddr = paddr_to_maddr(paddr); if ( maddr == 0 ) return -EFAULT; *page = maddr_to_page(maddr); if ( !get_page(*page, current->domain) ) { /* * This page might be a page granted by another domain, or this page * is freed with decrease reservation hypercall at the same time. */ gdprintk(XENLOG_WARNING, "bad page is passed. 
paddr %#lx maddr %#lx\n", paddr, maddr); return -EFAULT; } return 0; } /* check if struct desc doesn't cross page boundry */ static int xencomm_desc_cross_page_boundary(unsigned long paddr) { unsigned long offset = paddr & ~PAGE_MASK; if ( offset > PAGE_SIZE - sizeof(struct xencomm_desc) ) return 1; return 0; } struct xencomm_ctxt { struct xencomm_desc __user *desc_in_paddr; uint32_t nr_addrs; struct page_info *page; unsigned long *address; }; static uint32_t xencomm_ctxt_nr_addrs(const struct xencomm_ctxt *ctxt) { return ctxt->nr_addrs; } static unsigned long* xencomm_ctxt_address(struct xencomm_ctxt *ctxt) { return ctxt->address; } static int xencomm_ctxt_init(const void *handle, struct xencomm_ctxt *ctxt) { struct page_info *page; struct xencomm_desc *desc; int ret; /* Avoid unaligned access. */ if ( ((unsigned long)handle % __alignof__(*desc)) != 0 ) return -EINVAL; if ( xencomm_desc_cross_page_boundary((unsigned long)handle) ) return -EINVAL; /* First we need to access the descriptor. */ ret = xencomm_get_page((unsigned long)handle, &page); if ( ret ) return ret; desc = xencomm_vaddr((unsigned long)handle, page); if ( desc->magic != XENCOMM_MAGIC ) { printk("%s: error: %p magic was %#x\n", __func__, desc, desc->magic); put_page(page); return -EINVAL; } /* Copy before use: It is possible for a guest to modify concurrently. */ ctxt->nr_addrs = desc->nr_addrs; ctxt->desc_in_paddr = (struct xencomm_desc*)handle; ctxt->page = page; ctxt->address = &desc->address[0]; return 0; } /* * Calculate the vaddr of &ctxt->desc_in_paddr->address[i] and get_page(). * And put the results in ctxt->page and ctxt->address. * If there is the previous page, put_page(). * * A guest domain passes the array, ctxt->desc_in_paddr->address[]. * It is gpaddr-contiguous, but not maddr-contiguous so that * we can't obtain the vaddr by simple offsetting. * We need to convert gpaddr, &ctxt->desc_in_paddr->address[i], * into maddr and then convert it to the xen virtual address in order * to access there. * The conversion can be optimized out by using the last result of * ctxt->address because we access the array sequentially. * The conversion, gpaddr -> maddr -> vaddr, is necessary only when * crossing page boundary. */ static int xencomm_ctxt_next(struct xencomm_ctxt *ctxt, int i) { unsigned long paddr; struct page_info *page; int ret; BUG_ON(i >= ctxt->nr_addrs); /* For i == 0 case we already calculated it in xencomm_ctxt_init(). */ if ( i != 0 ) ctxt->address++; if ( ((unsigned long)ctxt->address & ~PAGE_MASK) != 0 ) return 0; /* Crossing page boundary: machine address must be calculated. 
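 * The guest-physical address of the next array slot is re-translated
 * via xencomm_get_page(); the reference on the previous page is
 * dropped only once the new page is held.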
*/ paddr = (unsigned long)&ctxt->desc_in_paddr->address[i]; ret = xencomm_get_page(paddr, &page); if ( ret ) return ret; put_page(ctxt->page); ctxt->page = page; ctxt->address = xencomm_vaddr(paddr, page); return 0; } static void xencomm_ctxt_done(struct xencomm_ctxt *ctxt) { put_page(ctxt->page); } static int xencomm_copy_chunk_from( unsigned long to, unsigned long paddr, unsigned int len) { struct page_info *page; int res; do { res = xencomm_get_page(paddr, &page); } while ( res == -EAGAIN ); if ( res ) return res; xc_dprintk("%lx[%d] -> %lx\n", (unsigned long)xencomm_vaddr(paddr, page), len, to); memcpy((void *)to, xencomm_vaddr(paddr, page), len); put_page(page); return 0; } static unsigned long xencomm_inline_from_guest( void *to, const void *from, unsigned int n, unsigned int skip) { unsigned long src_paddr = xencomm_inline_addr(from) + skip; while ( n > 0 ) { unsigned int chunksz, bytes; chunksz = PAGE_SIZE - (src_paddr % PAGE_SIZE); bytes = min(chunksz, n); if ( xencomm_copy_chunk_from((unsigned long)to, src_paddr, bytes) ) return n; src_paddr += bytes; to += bytes; n -= bytes; } /* Always successful. */ return 0; } /** * xencomm_copy_from_guest: Copy a block of data from domain space. * @to: Machine address. * @from: Physical address to a xencomm buffer descriptor. * @n: Number of bytes to copy. * @skip: Number of bytes from the start to skip. * * Copy data from domain to hypervisor. * * Returns number of bytes that could not be copied. * On success, this will be zero. */ unsigned long xencomm_copy_from_guest( void *to, const void *from, unsigned int n, unsigned int skip) { struct xencomm_ctxt ctxt; unsigned int from_pos = 0; unsigned int to_pos = 0; unsigned int i = 0; if ( xencomm_is_inline(from) ) return xencomm_inline_from_guest(to, from, n, skip); if ( xencomm_ctxt_init(from, &ctxt) ) return n; /* Iterate through the descriptor, copying up to a page at a time */ while ( (to_pos < n) && (i < xencomm_ctxt_nr_addrs(&ctxt)) ) { unsigned long src_paddr; unsigned int pgoffset, chunksz, chunk_skip; if ( xencomm_ctxt_next(&ctxt, i) ) goto out; src_paddr = *xencomm_ctxt_address(&ctxt); if ( src_paddr == XENCOMM_INVALID ) { i++; continue; } pgoffset = src_paddr % PAGE_SIZE; chunksz = PAGE_SIZE - pgoffset; chunk_skip = min(chunksz, skip); from_pos += chunk_skip; chunksz -= chunk_skip; skip -= chunk_skip; if ( skip == 0 && chunksz > 0 ) { unsigned int bytes = min(chunksz, n - to_pos); if ( xencomm_copy_chunk_from((unsigned long)to + to_pos, src_paddr + chunk_skip, bytes) ) goto out; from_pos += bytes; to_pos += bytes; } i++; } out: xencomm_ctxt_done(&ctxt); return n - to_pos; } static int xencomm_copy_chunk_to( unsigned long paddr, unsigned long from, unsigned int len) { struct page_info *page; int res; do { res = xencomm_get_page(paddr, &page); } while ( res == -EAGAIN ); if ( res ) return res; xc_dprintk("%lx[%d] -> %lx\n", from, len, (unsigned long)xencomm_vaddr(paddr, page)); memcpy(xencomm_vaddr(paddr, page), (void *)from, len); xencomm_mark_dirty((unsigned long)xencomm_vaddr(paddr, page), len); put_page(page); return 0; } static unsigned long xencomm_inline_to_guest( void *to, const void *from, unsigned int n, unsigned int skip) { unsigned long dest_paddr = xencomm_inline_addr(to) + skip; while ( n > 0 ) { unsigned int chunksz, bytes; chunksz = PAGE_SIZE - (dest_paddr % PAGE_SIZE); bytes = min(chunksz, n); if ( xencomm_copy_chunk_to(dest_paddr, (unsigned long)from, bytes) ) return n; dest_paddr += bytes; from += bytes; n -= bytes; } /* Always successful. 
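 * Reaching this point means every chunk was copied; a failing chunk
 * copy above has already bailed out, returning the bytes still left.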
*/ return 0; } /** * xencomm_copy_to_guest: Copy a block of data to domain space. * @to: Physical address to xencomm buffer descriptor. * @from: Machine address. * @n: Number of bytes to copy. * @skip: Number of bytes from the start to skip. * * Copy data from hypervisor to domain. * * Returns number of bytes that could not be copied. * On success, this will be zero. */ unsigned long xencomm_copy_to_guest( void *to, const void *from, unsigned int n, unsigned int skip) { struct xencomm_ctxt ctxt; unsigned int from_pos = 0; unsigned int to_pos = 0; unsigned int i = 0; if ( xencomm_is_inline(to) ) return xencomm_inline_to_guest(to, from, n, skip); if ( xencomm_ctxt_init(to, &ctxt) ) return n; /* Iterate through the descriptor, copying up to a page at a time */ while ( (from_pos < n) && (i < xencomm_ctxt_nr_addrs(&ctxt)) ) { unsigned long dest_paddr; unsigned int pgoffset, chunksz, chunk_skip; if ( xencomm_ctxt_next(&ctxt, i) ) goto out; dest_paddr = *xencomm_ctxt_address(&ctxt); if ( dest_paddr == XENCOMM_INVALID ) { i++; continue; } pgoffset = dest_paddr % PAGE_SIZE; chunksz = PAGE_SIZE - pgoffset; chunk_skip = min(chunksz, skip); to_pos += chunk_skip; chunksz -= chunk_skip; skip -= chunk_skip; if ( skip == 0 && chunksz > 0 ) { unsigned int bytes = min(chunksz, n - from_pos); if ( xencomm_copy_chunk_to(dest_paddr + chunk_skip, (unsigned long)from + from_pos, bytes) ) goto out; from_pos += bytes; to_pos += bytes; } i++; } out: xencomm_ctxt_done(&ctxt); return n - from_pos; } static int xencomm_clear_chunk( unsigned long paddr, unsigned int len) { struct page_info *page; int res; do { res = xencomm_get_page(paddr, &page); } while ( res == -EAGAIN ); if ( res ) return res; memset(xencomm_vaddr(paddr, page), 0x00, len); xencomm_mark_dirty((unsigned long)xencomm_vaddr(paddr, page), len); put_page(page); return 0; } static unsigned long xencomm_inline_clear_guest( void *to, unsigned int n, unsigned int skip) { unsigned long dest_paddr = xencomm_inline_addr(to) + skip; while ( n > 0 ) { unsigned int chunksz, bytes; chunksz = PAGE_SIZE - (dest_paddr % PAGE_SIZE); bytes = min(chunksz, n); if ( xencomm_clear_chunk(dest_paddr, bytes) ) return n; dest_paddr += bytes; n -= bytes; } /* Always successful. */ return 0; } /** * xencomm_clear_guest: Clear a block of data in domain space. * @to: Physical address to xencomm buffer descriptor. * @n: Number of bytes to copy. * @skip: Number of bytes from the start to skip. * * Clear domain data * * Returns number of bytes that could not be cleared * On success, this will be zero. 
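 *
 * A minimal caller sketch (the names 'guest_desc' and 'reply' are
 * illustrative only, not taken from this file):
 *
 *     if ( xencomm_copy_to_guest(guest_desc, &reply, sizeof(reply), 0) != 0 )
 *         return -EFAULT;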
*/ unsigned long xencomm_clear_guest( void *to, unsigned int n, unsigned int skip) { struct xencomm_ctxt ctxt; unsigned int from_pos = 0; unsigned int to_pos = 0; unsigned int i = 0; if ( xencomm_is_inline(to) ) return xencomm_inline_clear_guest(to, n, skip); if ( xencomm_ctxt_init(to, &ctxt) ) return n; /* Iterate through the descriptor, copying up to a page at a time */ while ( (from_pos < n) && (i < xencomm_ctxt_nr_addrs(&ctxt)) ) { unsigned long dest_paddr; unsigned int pgoffset, chunksz, chunk_skip; if ( xencomm_ctxt_next(&ctxt, i) ) goto out; dest_paddr = *xencomm_ctxt_address(&ctxt); if ( dest_paddr == XENCOMM_INVALID ) { i++; continue; } pgoffset = dest_paddr % PAGE_SIZE; chunksz = PAGE_SIZE - pgoffset; chunk_skip = min(chunksz, skip); to_pos += chunk_skip; chunksz -= chunk_skip; skip -= chunk_skip; if ( skip == 0 && chunksz > 0 ) { unsigned int bytes = min(chunksz, n - from_pos); if ( xencomm_clear_chunk(dest_paddr + chunk_skip, bytes) ) goto out; from_pos += bytes; to_pos += bytes; } i++; } out: xencomm_ctxt_done(&ctxt); return n - from_pos; } static int xencomm_inline_add_offset(void **handle, unsigned int bytes) { *handle += bytes; return 0; } /* Offset page addresses in 'handle' to skip 'bytes' bytes. Set completely * exhausted pages to XENCOMM_INVALID. */ int xencomm_add_offset(void **handle, unsigned int bytes) { struct xencomm_ctxt ctxt; int i = 0; int res = 0; if ( xencomm_is_inline(*handle) ) return xencomm_inline_add_offset(handle, bytes); res = xencomm_ctxt_init(handle, &ctxt); if ( res != 0 ) return res; /* Iterate through the descriptor incrementing addresses */ while ( (bytes > 0) && (i < xencomm_ctxt_nr_addrs(&ctxt)) ) { unsigned long *address; unsigned long dest_paddr; unsigned int pgoffset, chunksz, chunk_skip; res = xencomm_ctxt_next(&ctxt, i); if ( res ) goto out; address = xencomm_ctxt_address(&ctxt); dest_paddr = *address; if ( dest_paddr == XENCOMM_INVALID ) { i++; continue; } pgoffset = dest_paddr % PAGE_SIZE; chunksz = PAGE_SIZE - pgoffset; chunk_skip = min(chunksz, bytes); if ( chunk_skip == chunksz ) *address = XENCOMM_INVALID; /* exhausted this page */ else *address += chunk_skip; bytes -= chunk_skip; i++; } out: xencomm_ctxt_done(&ctxt); return res; } int xencomm_handle_is_null(void *handle) { struct xencomm_ctxt ctxt; int i; int res = 1; if ( xencomm_is_inline(handle) ) return xencomm_inline_addr(handle) == 0; if ( xencomm_ctxt_init(handle, &ctxt) ) return 1; for ( i = 0; i < xencomm_ctxt_nr_addrs(&ctxt); i++ ) { if ( xencomm_ctxt_next(&ctxt, i) ) goto out; if ( *xencomm_ctxt_address(&ctxt) != XENCOMM_INVALID ) { res = 0; goto out; } } out: xencomm_ctxt_done(&ctxt); return res; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/xenoprof.c0000664000175000017500000005412312307313555014536 0ustar smbsmb/* * Copyright (C) 2005 Hewlett-Packard Co. * written by Aravind Menon & Jose Renato Santos * (email: xenoprof@groups.hp.com) * * arch generic xenoprof and IA64 support. * dynamic map/unmap xenoprof buffer support. * Copyright (c) 2006 Isaku Yamahata * VA Linux Systems Japan K.K. 
*/ #ifndef COMPAT #include #include #include #include #include #include #include #include /* Limit amount of pages used for shared buffer (per domain) */ #define MAX_OPROF_SHARED_PAGES 32 /* Lock protecting the following global state */ static DEFINE_SPINLOCK(xenoprof_lock); static DEFINE_SPINLOCK(pmu_owner_lock); int pmu_owner = 0; int pmu_hvm_refcount = 0; static struct domain *active_domains[MAX_OPROF_DOMAINS]; static int active_ready[MAX_OPROF_DOMAINS]; static unsigned int adomains; static struct domain *passive_domains[MAX_OPROF_DOMAINS]; static unsigned int pdomains; static unsigned int activated; static struct domain *xenoprof_primary_profiler; static int xenoprof_state = XENOPROF_IDLE; static unsigned long backtrace_depth; static u64 total_samples; static u64 invalid_buffer_samples; static u64 corrupted_buffer_samples; static u64 lost_samples; static u64 active_samples; static u64 passive_samples; static u64 idle_samples; static u64 others_samples; int acquire_pmu_ownership(int pmu_ownship) { spin_lock(&pmu_owner_lock); if ( pmu_owner == PMU_OWNER_NONE ) { pmu_owner = pmu_ownship; goto out; } if ( pmu_owner == pmu_ownship ) goto out; spin_unlock(&pmu_owner_lock); return 0; out: if ( pmu_owner == PMU_OWNER_HVM ) pmu_hvm_refcount++; spin_unlock(&pmu_owner_lock); return 1; } void release_pmu_ownship(int pmu_ownship) { spin_lock(&pmu_owner_lock); if ( pmu_ownship == PMU_OWNER_HVM ) pmu_hvm_refcount--; if ( !pmu_hvm_refcount ) pmu_owner = PMU_OWNER_NONE; spin_unlock(&pmu_owner_lock); } int is_active(struct domain *d) { struct xenoprof *x = d->xenoprof; return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_ACTIVE)); } int is_passive(struct domain *d) { struct xenoprof *x = d->xenoprof; return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_PASSIVE)); } static int is_profiled(struct domain *d) { return (is_active(d) || is_passive(d)); } static void xenoprof_reset_stat(void) { total_samples = 0; invalid_buffer_samples = 0; corrupted_buffer_samples = 0; lost_samples = 0; active_samples = 0; passive_samples = 0; idle_samples = 0; others_samples = 0; } static void xenoprof_reset_buf(struct domain *d) { int j; xenoprof_buf_t *buf; if ( d->xenoprof == NULL ) { printk("xenoprof_reset_buf: ERROR - Unexpected " "Xenoprof NULL pointer \n"); return; } for ( j = 0; j < d->max_vcpus; j++ ) { buf = d->xenoprof->vcpu[j].buffer; if ( buf != NULL ) { xenoprof_buf(d, buf, event_head) = 0; xenoprof_buf(d, buf, event_tail) = 0; } } } static int share_xenoprof_page_with_guest(struct domain *d, unsigned long mfn, int npages) { int i; /* Check if previous page owner has released the page. 
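 * A page still marked PGC_allocated or holding a nonzero reference
 * count has not been released by its previous user, so sharing is
 * refused with -EBUSY instead of re-assigning the page.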
*/ for ( i = 0; i < npages; i++ ) { struct page_info *page = mfn_to_page(mfn + i); if ( (page->count_info & (PGC_allocated|PGC_count_mask)) != 0 ) { printk(XENLOG_G_INFO "dom%d mfn %#lx page->count_info %#lx\n", d->domain_id, mfn + i, page->count_info); return -EBUSY; } page_set_owner(page, NULL); } for ( i = 0; i < npages; i++ ) share_xen_page_with_guest(mfn_to_page(mfn + i), d, XENSHARE_writable); return 0; } static void unshare_xenoprof_page_with_guest(struct xenoprof *x) { int i, npages = x->npages; unsigned long mfn = virt_to_mfn(x->rawbuf); for ( i = 0; i < npages; i++ ) { struct page_info *page = mfn_to_page(mfn + i); BUG_ON(page_get_owner(page) != current->domain); if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); } } static void xenoprof_shared_gmfn_with_guest( struct domain *d, unsigned long maddr, unsigned long gmaddr, int npages) { int i; for ( i = 0; i < npages; i++, maddr += PAGE_SIZE, gmaddr += PAGE_SIZE ) { BUG_ON(page_get_owner(maddr_to_page(maddr)) != d); xenoprof_shared_gmfn(d, gmaddr, maddr); } } static int alloc_xenoprof_struct( struct domain *d, int max_samples, int is_passive) { struct vcpu *v; int nvcpu, npages, bufsize, max_bufsize; unsigned max_max_samples; int i; nvcpu = 0; for_each_vcpu ( d, v ) nvcpu++; if ( !nvcpu ) return -EINVAL; d->xenoprof = xzalloc(struct xenoprof); if ( d->xenoprof == NULL ) { printk("alloc_xenoprof_struct(): memory allocation failed\n"); return -ENOMEM; } d->xenoprof->vcpu = xzalloc_array(struct xenoprof_vcpu, d->max_vcpus); if ( d->xenoprof->vcpu == NULL ) { xfree(d->xenoprof); d->xenoprof = NULL; printk("alloc_xenoprof_struct(): vcpu array allocation failed\n"); return -ENOMEM; } bufsize = sizeof(struct xenoprof_buf); i = sizeof(struct event_log); #ifdef CONFIG_COMPAT d->xenoprof->is_compat = is_pv_32on64_domain(is_passive ? 
dom0 : d); if ( XENOPROF_COMPAT(d->xenoprof) ) { bufsize = sizeof(struct compat_oprof_buf); i = sizeof(struct compat_event_log); } #endif /* reduce max_samples if necessary to limit pages allocated */ max_bufsize = (MAX_OPROF_SHARED_PAGES * PAGE_SIZE) / nvcpu; max_max_samples = ( (max_bufsize - bufsize) / i ) + 1; if ( (unsigned)max_samples > max_max_samples ) max_samples = max_max_samples; bufsize += (max_samples - 1) * i; npages = (nvcpu * bufsize - 1) / PAGE_SIZE + 1; d->xenoprof->rawbuf = alloc_xenheap_pages(get_order_from_pages(npages), 0); if ( d->xenoprof->rawbuf == NULL ) { xfree(d->xenoprof); d->xenoprof = NULL; return -ENOMEM; } d->xenoprof->npages = npages; d->xenoprof->nbuf = nvcpu; d->xenoprof->bufsize = bufsize; d->xenoprof->domain_ready = 0; d->xenoprof->domain_type = XENOPROF_DOMAIN_IGNORED; /* Update buffer pointers for active vcpus */ i = 0; for_each_vcpu ( d, v ) { xenoprof_buf_t *buf = (xenoprof_buf_t *) &d->xenoprof->rawbuf[i * bufsize]; d->xenoprof->vcpu[v->vcpu_id].event_size = max_samples; d->xenoprof->vcpu[v->vcpu_id].buffer = buf; xenoprof_buf(d, buf, event_size) = max_samples; xenoprof_buf(d, buf, vcpu_id) = v->vcpu_id; i++; /* in the unlikely case that the number of active vcpus changes */ if ( i >= nvcpu ) break; } return 0; } void free_xenoprof_pages(struct domain *d) { struct xenoprof *x; int order; x = d->xenoprof; if ( x == NULL ) return; if ( x->rawbuf != NULL ) { order = get_order_from_pages(x->npages); free_xenheap_pages(x->rawbuf, order); } xfree(x); d->xenoprof = NULL; } static int active_index(struct domain *d) { int i; for ( i = 0; i < adomains; i++ ) if ( active_domains[i] == d ) return i; return -1; } static int set_active(struct domain *d) { int ind; struct xenoprof *x; ind = active_index(d); if ( ind < 0 ) return -EPERM; x = d->xenoprof; if ( x == NULL ) return -EPERM; x->domain_ready = 1; x->domain_type = XENOPROF_DOMAIN_ACTIVE; active_ready[ind] = 1; activated++; return 0; } static int reset_active(struct domain *d) { int ind; struct xenoprof *x; ind = active_index(d); if ( ind < 0 ) return -EPERM; x = d->xenoprof; if ( x == NULL ) return -EPERM; x->domain_ready = 0; x->domain_type = XENOPROF_DOMAIN_IGNORED; active_ready[ind] = 0; active_domains[ind] = NULL; activated--; put_domain(d); if ( activated <= 0 ) adomains = 0; return 0; } static void reset_passive(struct domain *d) { struct xenoprof *x; if ( d == NULL ) return; x = d->xenoprof; if ( x == NULL ) return; unshare_xenoprof_page_with_guest(x); x->domain_type = XENOPROF_DOMAIN_IGNORED; } static void reset_active_list(void) { int i; for ( i = 0; i < adomains; i++ ) if ( active_ready[i] ) reset_active(active_domains[i]); adomains = 0; activated = 0; } static void reset_passive_list(void) { int i; for ( i = 0; i < pdomains; i++ ) { reset_passive(passive_domains[i]); put_domain(passive_domains[i]); passive_domains[i] = NULL; } pdomains = 0; } static int add_active_list(domid_t domid) { struct domain *d; if ( adomains >= MAX_OPROF_DOMAINS ) return -E2BIG; d = get_domain_by_id(domid); if ( d == NULL ) return -EINVAL; active_domains[adomains] = d; active_ready[adomains] = 0; adomains++; return 0; } static int add_passive_list(XEN_GUEST_HANDLE_PARAM(void) arg) { struct xenoprof_passive passive; struct domain *d; int ret = 0; if ( pdomains >= MAX_OPROF_DOMAINS ) return -E2BIG; if ( copy_from_guest(&passive, arg, 1) ) return -EFAULT; d = get_domain_by_id(passive.domain_id); if ( d == NULL ) return -EINVAL; if ( d->xenoprof == NULL ) { ret = alloc_xenoprof_struct(d, passive.max_samples, 1); if ( ret 
< 0 ) { put_domain(d); return -ENOMEM; } } ret = share_xenoprof_page_with_guest( current->domain, virt_to_mfn(d->xenoprof->rawbuf), d->xenoprof->npages); if ( ret < 0 ) { put_domain(d); return ret; } d->xenoprof->domain_type = XENOPROF_DOMAIN_PASSIVE; passive.nbuf = d->xenoprof->nbuf; passive.bufsize = d->xenoprof->bufsize; if ( !paging_mode_translate(current->domain) ) passive.buf_gmaddr = __pa(d->xenoprof->rawbuf); else xenoprof_shared_gmfn_with_guest( current->domain, __pa(d->xenoprof->rawbuf), passive.buf_gmaddr, d->xenoprof->npages); if ( __copy_to_guest(arg, &passive, 1) ) { put_domain(d); return -EFAULT; } passive_domains[pdomains] = d; pdomains++; return ret; } /* Get space in the buffer */ static int xenoprof_buf_space(struct domain *d, xenoprof_buf_t * buf, int size) { int head, tail; head = xenoprof_buf(d, buf, event_head); tail = xenoprof_buf(d, buf, event_tail); return ((tail > head) ? 0 : size) + tail - head - 1; } /* Check for space and add a sample. Return 1 if successful, 0 otherwise. */ static int xenoprof_add_sample(struct domain *d, xenoprof_buf_t *buf, uint64_t eip, int mode, int event) { int head, tail, size; head = xenoprof_buf(d, buf, event_head); tail = xenoprof_buf(d, buf, event_tail); size = xenoprof_buf(d, buf, event_size); /* make sure indexes in shared buffer are sane */ if ( (head < 0) || (head >= size) || (tail < 0) || (tail >= size) ) { corrupted_buffer_samples++; return 0; } if ( xenoprof_buf_space(d, buf, size) > 0 ) { xenoprof_buf(d, buf, event_log[head].eip) = eip; xenoprof_buf(d, buf, event_log[head].mode) = mode; xenoprof_buf(d, buf, event_log[head].event) = event; head++; if ( head >= size ) head = 0; xenoprof_buf(d, buf, event_head) = head; } else { xenoprof_buf(d, buf, lost_samples)++; lost_samples++; return 0; } return 1; } int xenoprof_add_trace(struct vcpu *vcpu, uint64_t pc, int mode) { struct domain *d = vcpu->domain; xenoprof_buf_t *buf = d->xenoprof->vcpu[vcpu->vcpu_id].buffer; /* Do not accidentally write an escape code due to a broken frame. */ if ( pc == XENOPROF_ESCAPE_CODE ) { invalid_buffer_samples++; return 0; } return xenoprof_add_sample(d, buf, pc, mode, 0); } void xenoprof_log_event(struct vcpu *vcpu, const struct cpu_user_regs *regs, uint64_t pc, int mode, int event) { struct domain *d = vcpu->domain; struct xenoprof_vcpu *v; xenoprof_buf_t *buf; total_samples++; /* Ignore samples of un-monitored domains. */ if ( !is_profiled(d) ) { others_samples++; return; } v = &d->xenoprof->vcpu[vcpu->vcpu_id]; if ( v->buffer == NULL ) { invalid_buffer_samples++; return; } buf = v->buffer; /* Provide backtrace if requested. 
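 * A backtrace is recorded as an XENOPROF_ESCAPE_CODE sample carrying
 * XENOPROF_TRACE_BEGIN, followed by the ordinary sample, so at least
 * two free slots are required in the shared buffer.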
*/ if ( backtrace_depth > 0 ) { if ( (xenoprof_buf_space(d, buf, v->event_size) < 2) || !xenoprof_add_sample(d, buf, XENOPROF_ESCAPE_CODE, mode, XENOPROF_TRACE_BEGIN) ) { xenoprof_buf(d, buf, lost_samples)++; lost_samples++; return; } } if ( xenoprof_add_sample(d, buf, pc, mode, event) ) { if ( is_active(vcpu->domain) ) active_samples++; else passive_samples++; if ( mode == 0 ) xenoprof_buf(d, buf, user_samples)++; else if ( mode == 1 ) xenoprof_buf(d, buf, kernel_samples)++; else xenoprof_buf(d, buf, xen_samples)++; } if ( backtrace_depth > 0 ) xenoprof_backtrace(vcpu, regs, backtrace_depth, mode); } static int xenoprof_op_init(XEN_GUEST_HANDLE_PARAM(void) arg) { struct domain *d = current->domain; struct xenoprof_init xenoprof_init; int ret; if ( copy_from_guest(&xenoprof_init, arg, 1) ) return -EFAULT; if ( (ret = xenoprof_arch_init(&xenoprof_init.num_events, xenoprof_init.cpu_type)) ) return ret; xenoprof_init.is_primary = ((xenoprof_primary_profiler == d) || ((xenoprof_primary_profiler == NULL) && (d->domain_id == 0))); if ( xenoprof_init.is_primary ) xenoprof_primary_profiler = current->domain; return __copy_to_guest(arg, &xenoprof_init, 1) ? -EFAULT : 0; } #define ret_t long #endif /* !COMPAT */ static int xenoprof_op_get_buffer(XEN_GUEST_HANDLE_PARAM(void) arg) { struct xenoprof_get_buffer xenoprof_get_buffer; struct domain *d = current->domain; int ret; if ( copy_from_guest(&xenoprof_get_buffer, arg, 1) ) return -EFAULT; /* * We allocate xenoprof struct and buffers only at first time * get_buffer is called. Memory is then kept until domain is destroyed. */ if ( d->xenoprof == NULL ) { ret = alloc_xenoprof_struct(d, xenoprof_get_buffer.max_samples, 0); if ( ret < 0 ) return ret; } ret = share_xenoprof_page_with_guest( d, virt_to_mfn(d->xenoprof->rawbuf), d->xenoprof->npages); if ( ret < 0 ) return ret; xenoprof_reset_buf(d); d->xenoprof->domain_type = XENOPROF_DOMAIN_IGNORED; d->xenoprof->domain_ready = 0; d->xenoprof->is_primary = (xenoprof_primary_profiler == current->domain); xenoprof_get_buffer.nbuf = d->xenoprof->nbuf; xenoprof_get_buffer.bufsize = d->xenoprof->bufsize; if ( !paging_mode_translate(d) ) xenoprof_get_buffer.buf_gmaddr = __pa(d->xenoprof->rawbuf); else xenoprof_shared_gmfn_with_guest( d, __pa(d->xenoprof->rawbuf), xenoprof_get_buffer.buf_gmaddr, d->xenoprof->npages); return __copy_to_guest(arg, &xenoprof_get_buffer, 1) ? 
-EFAULT : 0; } #define NONPRIV_OP(op) ( (op == XENOPROF_init) \ || (op == XENOPROF_enable_virq) \ || (op == XENOPROF_disable_virq) \ || (op == XENOPROF_get_buffer)) ret_t do_xenoprof_op(int op, XEN_GUEST_HANDLE_PARAM(void) arg) { int ret = 0; if ( (op < 0) || (op > XENOPROF_last_op) ) { printk("xenoprof: invalid operation %d for domain %d\n", op, current->domain->domain_id); return -EINVAL; } if ( !NONPRIV_OP(op) && (current->domain != xenoprof_primary_profiler) ) { printk("xenoprof: dom %d denied privileged operation %d\n", current->domain->domain_id, op); return -EPERM; } ret = xsm_profile(XSM_HOOK, current->domain, op); if ( ret ) return ret; spin_lock(&xenoprof_lock); switch ( op ) { case XENOPROF_init: ret = xenoprof_op_init(arg); if ( (ret == 0) && (current->domain == xenoprof_primary_profiler) ) xenoprof_state = XENOPROF_INITIALIZED; break; case XENOPROF_get_buffer: if ( !acquire_pmu_ownership(PMU_OWNER_XENOPROF) ) { ret = -EBUSY; break; } ret = xenoprof_op_get_buffer(arg); break; case XENOPROF_reset_active_list: reset_active_list(); ret = 0; break; case XENOPROF_reset_passive_list: reset_passive_list(); ret = 0; break; case XENOPROF_set_active: { domid_t domid; if ( xenoprof_state != XENOPROF_INITIALIZED ) { ret = -EPERM; break; } if ( copy_from_guest(&domid, arg, 1) ) { ret = -EFAULT; break; } ret = add_active_list(domid); break; } case XENOPROF_set_passive: if ( xenoprof_state != XENOPROF_INITIALIZED ) { ret = -EPERM; break; } ret = add_passive_list(arg); break; case XENOPROF_reserve_counters: if ( xenoprof_state != XENOPROF_INITIALIZED ) { ret = -EPERM; break; } ret = xenoprof_arch_reserve_counters(); if ( !ret ) xenoprof_state = XENOPROF_COUNTERS_RESERVED; break; case XENOPROF_counter: if ( (xenoprof_state != XENOPROF_COUNTERS_RESERVED) || (adomains == 0) ) { ret = -EPERM; break; } ret = xenoprof_arch_counter(arg); break; case XENOPROF_setup_events: if ( xenoprof_state != XENOPROF_COUNTERS_RESERVED ) { ret = -EPERM; break; } ret = xenoprof_arch_setup_events(); if ( !ret ) xenoprof_state = XENOPROF_READY; break; case XENOPROF_enable_virq: { int i; if ( current->domain == xenoprof_primary_profiler ) { if ( xenoprof_state != XENOPROF_READY ) { ret = -EPERM; break; } xenoprof_arch_enable_virq(); xenoprof_reset_stat(); for ( i = 0; i < pdomains; i++ ) xenoprof_reset_buf(passive_domains[i]); } xenoprof_reset_buf(current->domain); ret = set_active(current->domain); break; } case XENOPROF_start: ret = -EPERM; if ( (xenoprof_state == XENOPROF_READY) && (activated == adomains) ) ret = xenoprof_arch_start(); if ( ret == 0 ) xenoprof_state = XENOPROF_PROFILING; break; case XENOPROF_stop: { struct domain *d; struct vcpu *v; int i; if ( xenoprof_state != XENOPROF_PROFILING ) { ret = -EPERM; break; } xenoprof_arch_stop(); /* Flush remaining samples. 
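 * Each still-active domain is sent VIRQ_XENOPROF on every vcpu so
 * the guest drains whatever samples remain in its shared buffers.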
*/ for ( i = 0; i < adomains; i++ ) { if ( !active_ready[i] ) continue; d = active_domains[i]; for_each_vcpu(d, v) send_guest_vcpu_virq(v, VIRQ_XENOPROF); } xenoprof_state = XENOPROF_READY; break; } case XENOPROF_disable_virq: { struct xenoprof *x; if ( (xenoprof_state == XENOPROF_PROFILING) && (is_active(current->domain)) ) { ret = -EPERM; break; } if ( (ret = reset_active(current->domain)) != 0 ) break; x = current->domain->xenoprof; unshare_xenoprof_page_with_guest(x); release_pmu_ownship(PMU_OWNER_XENOPROF); break; } case XENOPROF_release_counters: ret = -EPERM; if ( (xenoprof_state == XENOPROF_COUNTERS_RESERVED) || (xenoprof_state == XENOPROF_READY) ) { xenoprof_state = XENOPROF_INITIALIZED; xenoprof_arch_release_counters(); xenoprof_arch_disable_virq(); reset_passive_list(); ret = 0; } break; case XENOPROF_shutdown: ret = -EPERM; if ( xenoprof_state == XENOPROF_INITIALIZED ) { activated = 0; adomains=0; xenoprof_primary_profiler = NULL; backtrace_depth=0; ret = 0; } break; case XENOPROF_set_backtrace: ret = 0; if ( !xenoprof_backtrace_supported() ) ret = -EINVAL; else if ( copy_from_guest(&backtrace_depth, arg, 1) ) ret = -EFAULT; break; case XENOPROF_ibs_counter: if ( (xenoprof_state != XENOPROF_COUNTERS_RESERVED) || (adomains == 0) ) { ret = -EPERM; break; } ret = xenoprof_arch_ibs_counter(arg); break; case XENOPROF_get_ibs_caps: ret = ibs_caps; break; default: ret = -ENOSYS; } spin_unlock(&xenoprof_lock); if ( ret < 0 ) printk("xenoprof: operation %d failed for dom %d (status : %d)\n", op, current->domain->domain_id, ret); return ret; } #if defined(CONFIG_COMPAT) && !defined(COMPAT) #undef ret_t #include "compat/xenoprof.c" #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/perfc.c0000664000175000017500000001740412307313555013776 0ustar smbsmb #include #include #include #include #include #include #include #include #include #include #define PERFCOUNTER( var, name ) { name, TYPE_SINGLE, 0 }, #define PERFCOUNTER_ARRAY( var, name, size ) { name, TYPE_ARRAY, size }, #define PERFSTATUS( var, name ) { name, TYPE_S_SINGLE, 0 }, #define PERFSTATUS_ARRAY( var, name, size ) { name, TYPE_S_ARRAY, size }, static const struct { const char *name; enum { TYPE_SINGLE, TYPE_ARRAY, TYPE_S_SINGLE, TYPE_S_ARRAY } type; unsigned int nr_elements; } perfc_info[] = { #include }; #define NR_PERFCTRS (sizeof(perfc_info) / sizeof(perfc_info[0])) DEFINE_PER_CPU(perfc_t[NUM_PERFCOUNTERS], perfcounters); void perfc_printall(unsigned char key) { unsigned int i, j; s_time_t now = NOW(); printk("Xen performance counters SHOW (now = 0x%08X:%08X)\n", (u32)(now>>32), (u32)now); for ( i = j = 0; i < NR_PERFCTRS; i++ ) { unsigned int k, cpu; unsigned long long sum = 0; printk("%-32s ", perfc_info[i].name); switch ( perfc_info[i].type ) { case TYPE_SINGLE: case TYPE_S_SINGLE: for_each_online_cpu ( cpu ) sum += per_cpu(perfcounters, cpu)[j]; if ( perfc_info[i].type == TYPE_S_SINGLE ) sum = (perfc_t) sum; printk("TOTAL[%12Lu]", sum); if ( sum ) { k = 0; for_each_online_cpu ( cpu ) { if ( k > 0 && (k % 4) == 0 ) printk("\n%46s", ""); printk(" CPU%02u[%10"PRIperfc"u]", cpu, per_cpu(perfcounters, cpu)[j]); ++k; } } ++j; break; case TYPE_ARRAY: case TYPE_S_ARRAY: for_each_online_cpu ( cpu ) { perfc_t *counters = per_cpu(perfcounters, cpu) + j; for ( k = 0; k < perfc_info[i].nr_elements; k++ ) sum += counters[k]; } if ( perfc_info[i].type == TYPE_S_ARRAY ) sum = (perfc_t) sum; printk("TOTAL[%12Lu]", sum); if (sum) { #ifdef 
PERF_ARRAYS for ( k = 0; k < perfc_info[i].nr_elements; k++ ) { sum = 0; for_each_online_cpu ( cpu ) sum += per_cpu(perfcounters, cpu)[j + k]; if ( perfc_info[i].type == TYPE_S_ARRAY ) sum = (perfc_t) sum; if ( (k % 4) == 0 ) printk("\n%16s", ""); printk(" ARR%02u[%10Lu]", k, sum); } #else k = 0; for_each_online_cpu ( cpu ) { perfc_t *counters = per_cpu(perfcounters, cpu) + j; unsigned int n; sum = 0; for ( n = 0; n < perfc_info[i].nr_elements; n++ ) sum += counters[n]; if ( perfc_info[i].type == TYPE_S_ARRAY ) sum = (perfc_t) sum; if ( k > 0 && (k % 4) == 0 ) printk("\n%46s", ""); printk(" CPU%02u[%10Lu]", cpu, sum); ++k; } #endif } j += perfc_info[i].nr_elements; break; } printk("\n"); } } void perfc_reset(unsigned char key) { unsigned int i, j; s_time_t now = NOW(); if ( key != '\0' ) printk("Xen performance counters RESET (now = 0x%08X:%08X)\n", (u32)(now>>32), (u32)now); /* leave STATUS counters alone -- don't reset */ for ( i = j = 0; i < NR_PERFCTRS; i++ ) { unsigned int cpu; switch ( perfc_info[i].type ) { case TYPE_SINGLE: for_each_online_cpu ( cpu ) per_cpu(perfcounters, cpu)[j] = 0; case TYPE_S_SINGLE: ++j; break; case TYPE_ARRAY: for_each_online_cpu ( cpu ) memset(per_cpu(perfcounters, cpu) + j, 0, perfc_info[i].nr_elements * sizeof(perfc_t)); case TYPE_S_ARRAY: j += perfc_info[i].nr_elements; break; } } arch_perfc_reset(); } static xen_sysctl_perfc_desc_t perfc_d[NR_PERFCTRS]; static xen_sysctl_perfc_val_t *perfc_vals; static unsigned int perfc_nbr_vals; static cpumask_t perfc_cpumap; static int perfc_copy_info(XEN_GUEST_HANDLE_64(xen_sysctl_perfc_desc_t) desc, XEN_GUEST_HANDLE_64(xen_sysctl_perfc_val_t) val) { unsigned int i, j, v; /* We only copy the name and array-size information once. */ if ( !cpumask_equal(&cpu_online_map, &perfc_cpumap) ) { unsigned int nr_cpus; perfc_cpumap = cpu_online_map; nr_cpus = cpumask_weight(&perfc_cpumap); perfc_nbr_vals = 0; for ( i = 0; i < NR_PERFCTRS; i++ ) { safe_strcpy(perfc_d[i].name, perfc_info[i].name); switch ( perfc_info[i].type ) { case TYPE_SINGLE: case TYPE_S_SINGLE: perfc_d[i].nr_vals = nr_cpus; break; case TYPE_ARRAY: case TYPE_S_ARRAY: perfc_d[i].nr_vals = perfc_info[i].nr_elements; break; } perfc_nbr_vals += perfc_d[i].nr_vals; } xfree(perfc_vals); perfc_vals = xmalloc_array(xen_sysctl_perfc_val_t, perfc_nbr_vals); } if ( guest_handle_is_null(desc) ) return 0; if ( perfc_vals == NULL ) return -ENOMEM; /* Architecture may fill counters from hardware. */ arch_perfc_gather(); /* We gather the counts together every time. 
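 * Single counters are reported as one value per CPU in the recorded
 * cpumap; array counters are summed across those CPUs into one value
 * per array element.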
*/ for ( i = j = v = 0; i < NR_PERFCTRS; i++ ) { unsigned int cpu; switch ( perfc_info[i].type ) { case TYPE_SINGLE: case TYPE_S_SINGLE: for_each_cpu ( cpu, &perfc_cpumap ) perfc_vals[v++] = per_cpu(perfcounters, cpu)[j]; ++j; break; case TYPE_ARRAY: case TYPE_S_ARRAY: memset(perfc_vals + v, 0, perfc_d[i].nr_vals * sizeof(*perfc_vals)); for_each_cpu ( cpu, &perfc_cpumap ) { perfc_t *counters = per_cpu(perfcounters, cpu) + j; unsigned int k; for ( k = 0; k < perfc_d[i].nr_vals; k++ ) perfc_vals[v + k] += counters[k]; } v += perfc_d[i].nr_vals; j += perfc_info[i].nr_elements; break; } } BUG_ON(v != perfc_nbr_vals); if ( copy_to_guest(desc, perfc_d, NR_PERFCTRS) ) return -EFAULT; if ( copy_to_guest(val, perfc_vals, perfc_nbr_vals) ) return -EFAULT; return 0; } /* Dom0 control of perf counters */ int perfc_control(xen_sysctl_perfc_op_t *pc) { static DEFINE_SPINLOCK(lock); int rc; spin_lock(&lock); switch ( pc->cmd ) { case XEN_SYSCTL_PERFCOP_reset: rc = perfc_copy_info(pc->desc, pc->val); perfc_reset(0); break; case XEN_SYSCTL_PERFCOP_query: rc = perfc_copy_info(pc->desc, pc->val); break; default: rc = -EINVAL; break; } spin_unlock(&lock); pc->nr_counters = NR_PERFCTRS; pc->nr_vals = perfc_nbr_vals; return rc; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/libfdt/0000775000175000017500000000000012307313555013771 5ustar smbsmbxen-4.4.0/xen/common/libfdt/fdt_strerror.c0000664000175000017500000000651112307313555016657 0ustar smbsmb/* * libfdt - Flat Device Tree manipulation * Copyright (C) 2006 David Gibson, IBM Corporation. * * libfdt is dual licensed: you can use it either under the terms of * the GPL, or the BSD license, at your option. * * a) This library is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, * MA 02110-1301 USA * * Alternatively, * * b) Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * 1. Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "libfdt_env.h" #include #include #include "libfdt_internal.h" struct fdt_errtabent { const char *str; }; #define FDT_ERRTABENT(val) \ [(val)] = { .str = #val, } static struct fdt_errtabent fdt_errtable[] = { FDT_ERRTABENT(FDT_ERR_NOTFOUND), FDT_ERRTABENT(FDT_ERR_EXISTS), FDT_ERRTABENT(FDT_ERR_NOSPACE), FDT_ERRTABENT(FDT_ERR_BADOFFSET), FDT_ERRTABENT(FDT_ERR_BADPATH), FDT_ERRTABENT(FDT_ERR_BADSTATE), FDT_ERRTABENT(FDT_ERR_TRUNCATED), FDT_ERRTABENT(FDT_ERR_BADMAGIC), FDT_ERRTABENT(FDT_ERR_BADVERSION), FDT_ERRTABENT(FDT_ERR_BADSTRUCTURE), FDT_ERRTABENT(FDT_ERR_BADLAYOUT), }; #define FDT_ERRTABSIZE (sizeof(fdt_errtable) / sizeof(fdt_errtable[0])) const char *fdt_strerror(int errval) { if (errval > 0) return ""; else if (errval == 0) return ""; else if (errval > -FDT_ERRTABSIZE) { const char *s = fdt_errtable[-errval].str; if (s) return s; } return ""; } xen-4.4.0/xen/common/libfdt/fdt.c0000664000175000017500000001354712307313555014724 0ustar smbsmb/* * libfdt - Flat Device Tree manipulation * Copyright (C) 2006 David Gibson, IBM Corporation. * * libfdt is dual licensed: you can use it either under the terms of * the GPL, or the BSD license, at your option. * * a) This library is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, * MA 02110-1301 USA * * Alternatively, * * b) Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * 1. Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "libfdt_env.h" #include #include #include "libfdt_internal.h" int fdt_check_header(const void *fdt) { if (fdt_magic(fdt) == FDT_MAGIC) { /* Complete tree */ if (fdt_version(fdt) < FDT_FIRST_SUPPORTED_VERSION) return -FDT_ERR_BADVERSION; if (fdt_last_comp_version(fdt) > FDT_LAST_SUPPORTED_VERSION) return -FDT_ERR_BADVERSION; } else if (fdt_magic(fdt) == FDT_SW_MAGIC) { /* Unfinished sequential-write blob */ if (fdt_size_dt_struct(fdt) == 0) return -FDT_ERR_BADSTATE; } else { return -FDT_ERR_BADMAGIC; } return 0; } const void *fdt_offset_ptr(const void *fdt, int offset, unsigned int len) { const char *p; if (fdt_version(fdt) >= 0x11) if (((offset + len) < offset) || ((offset + len) > fdt_size_dt_struct(fdt))) return NULL; p = _fdt_offset_ptr(fdt, offset); if (p + len < p) return NULL; return p; } uint32_t fdt_next_tag(const void *fdt, int startoffset, int *nextoffset) { const uint32_t *tagp, *lenp; uint32_t tag; int offset = startoffset; const char *p; *nextoffset = -FDT_ERR_TRUNCATED; tagp = fdt_offset_ptr(fdt, offset, FDT_TAGSIZE); if (!tagp) return FDT_END; /* premature end */ tag = fdt32_to_cpu(*tagp); offset += FDT_TAGSIZE; *nextoffset = -FDT_ERR_BADSTRUCTURE; switch (tag) { case FDT_BEGIN_NODE: /* skip name */ do { p = fdt_offset_ptr(fdt, offset++, 1); } while (p && (*p != '\0')); if (!p) return FDT_END; /* premature end */ break; case FDT_PROP: lenp = fdt_offset_ptr(fdt, offset, sizeof(*lenp)); if (!lenp) return FDT_END; /* premature end */ /* skip-name offset, length and value */ offset += sizeof(struct fdt_property) - FDT_TAGSIZE + fdt32_to_cpu(*lenp); break; case FDT_END: case FDT_END_NODE: case FDT_NOP: break; default: return FDT_END; } if (!fdt_offset_ptr(fdt, startoffset, offset - startoffset)) return FDT_END; /* premature end */ *nextoffset = FDT_TAGALIGN(offset); return tag; } int _fdt_check_node_offset(const void *fdt, int offset) { if ((offset < 0) || (offset % FDT_TAGSIZE) || (fdt_next_tag(fdt, offset, &offset) != FDT_BEGIN_NODE)) return -FDT_ERR_BADOFFSET; return offset; } int _fdt_check_prop_offset(const void *fdt, int offset) { if ((offset < 0) || (offset % FDT_TAGSIZE) || (fdt_next_tag(fdt, offset, &offset) != FDT_PROP)) return -FDT_ERR_BADOFFSET; return offset; } int fdt_next_node(const void *fdt, int offset, int *depth) { int nextoffset = 0; uint32_t tag; if (offset >= 0) if ((nextoffset = _fdt_check_node_offset(fdt, offset)) < 0) return nextoffset; do { offset = nextoffset; tag = fdt_next_tag(fdt, offset, &nextoffset); switch (tag) { case FDT_PROP: case FDT_NOP: break; case FDT_BEGIN_NODE: if (depth) (*depth)++; break; case FDT_END_NODE: if (depth && ((--(*depth)) < 0)) return nextoffset; break; case FDT_END: if ((nextoffset >= 0) || ((nextoffset == -FDT_ERR_TRUNCATED) && !depth)) return -FDT_ERR_NOTFOUND; else return nextoffset; } } while (tag != FDT_BEGIN_NODE); return offset; } const char *_fdt_find_string(const char *strtab, int tabsize, const char *s) { int len = strlen(s) + 1; const char *last = strtab + tabsize - len; const char 
*p; for (p = strtab; p <= last; p++) if (memcmp(p, s, len) == 0) return p; return NULL; } int fdt_move(const void *fdt, void *buf, int bufsize) { FDT_CHECK_HEADER(fdt); if (fdt_totalsize(fdt) > bufsize) return -FDT_ERR_NOSPACE; memmove(buf, fdt, fdt_totalsize(fdt)); return 0; } xen-4.4.0/xen/common/libfdt/Makefile0000664000175000017500000000013512307313555015430 0ustar smbsmbinclude Makefile.libfdt obj-y += $(LIBFDT_OBJS) CFLAGS += -I$(BASEDIR)/include/xen/libfdt/ xen-4.4.0/xen/common/libfdt/TODO0000664000175000017500000000012512307313555014457 0ustar smbsmb- Tree traversal functions - Graft function - Complete libfdt.h documenting comments xen-4.4.0/xen/common/libfdt/libfdt_internal.h0000664000175000017500000000712412307313555017306 0ustar smbsmb#ifndef _LIBFDT_INTERNAL_H #define _LIBFDT_INTERNAL_H /* * libfdt - Flat Device Tree manipulation * Copyright (C) 2006 David Gibson, IBM Corporation. * * libfdt is dual licensed: you can use it either under the terms of * the GPL, or the BSD license, at your option. * * a) This library is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, * MA 02110-1301 USA * * Alternatively, * * b) Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * 1. Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */
#include <fdt.h>

#define FDT_ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define FDT_TAGALIGN(x)		(FDT_ALIGN((x), FDT_TAGSIZE))

/* Return from the calling function with the error code if the blob fails
 * the basic fdt_check_header() sanity check. */
#define FDT_CHECK_HEADER(fdt) \
	{ \
		int err; \
		if ((err = fdt_check_header(fdt)) != 0) \
			return err; \
	}

int _fdt_check_node_offset(const void *fdt, int offset);
int _fdt_check_prop_offset(const void *fdt, int offset);
const char *_fdt_find_string(const char *strtab, int tabsize, const char *s);
int _fdt_node_end_offset(void *fdt, int nodeoffset);

/* Translate an offset within the structure block into a pointer into the blob. */
static inline const void *_fdt_offset_ptr(const void *fdt, int offset)
{
	return (const char *)fdt + fdt_off_dt_struct(fdt) + offset;
}

static inline void *_fdt_offset_ptr_w(void *fdt, int offset)
{
	return (void *)(uintptr_t)_fdt_offset_ptr(fdt, offset);
}

static inline const struct fdt_reserve_entry *_fdt_mem_rsv(const void *fdt, int n)
{
	const struct fdt_reserve_entry *rsv_table =
		(const struct fdt_reserve_entry *)
		((const char *)fdt + fdt_off_mem_rsvmap(fdt));

	return rsv_table + n;
}

static inline struct fdt_reserve_entry *_fdt_mem_rsv_w(void *fdt, int n)
{
	return (void *)(uintptr_t)_fdt_mem_rsv(fdt, n);
}

/* Magic value marking a blob that is still being built by the
 * sequential-write routines in fdt_sw.c. */
#define FDT_SW_MAGIC		(~FDT_MAGIC)

#endif /* _LIBFDT_INTERNAL_H */
xen-4.4.0/xen/common/libfdt/fdt_rw.c0000664000175000017500000003006412307313555015425 0ustar smbsmb/*
 * libfdt - Flat Device Tree manipulation
 * Copyright (C) 2006 David Gibson, IBM Corporation.
 *
 * libfdt is dual licensed: you can use it either under the terms of
 * the GPL, or the BSD license, at your option.
 *
 * a) This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
 * MA 02110-1301 USA
 *
 * Alternatively,
 *
 * b) Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * 1. Redistributions of source code must retain the above
 * copyright notice, this list of conditions and the following
 * disclaimer.
 * 2. Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following
 * disclaimer in the documentation and/or other materials
 * provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "libfdt_env.h" #include #include #include "libfdt_internal.h" static int _fdt_blocks_misordered(const void *fdt, int mem_rsv_size, int struct_size) { return (fdt_off_mem_rsvmap(fdt) < FDT_ALIGN(sizeof(struct fdt_header), 8)) || (fdt_off_dt_struct(fdt) < (fdt_off_mem_rsvmap(fdt) + mem_rsv_size)) || (fdt_off_dt_strings(fdt) < (fdt_off_dt_struct(fdt) + struct_size)) || (fdt_totalsize(fdt) < (fdt_off_dt_strings(fdt) + fdt_size_dt_strings(fdt))); } static int _fdt_rw_check_header(void *fdt) { FDT_CHECK_HEADER(fdt); if (fdt_version(fdt) < 17) return -FDT_ERR_BADVERSION; if (_fdt_blocks_misordered(fdt, sizeof(struct fdt_reserve_entry), fdt_size_dt_struct(fdt))) return -FDT_ERR_BADLAYOUT; if (fdt_version(fdt) > 17) fdt_set_version(fdt, 17); return 0; } #define FDT_RW_CHECK_HEADER(fdt) \ { \ int err; \ if ((err = _fdt_rw_check_header(fdt)) != 0) \ return err; \ } static inline int _fdt_data_size(void *fdt) { return fdt_off_dt_strings(fdt) + fdt_size_dt_strings(fdt); } static int _fdt_splice(void *fdt, void *splicepoint, int oldlen, int newlen) { char *p = splicepoint; char *end = (char *)fdt + _fdt_data_size(fdt); if (((p + oldlen) < p) || ((p + oldlen) > end)) return -FDT_ERR_BADOFFSET; if ((end - oldlen + newlen) > ((char *)fdt + fdt_totalsize(fdt))) return -FDT_ERR_NOSPACE; memmove(p + newlen, p + oldlen, end - p - oldlen); return 0; } static int _fdt_splice_mem_rsv(void *fdt, struct fdt_reserve_entry *p, int oldn, int newn) { int delta = (newn - oldn) * sizeof(*p); int err; err = _fdt_splice(fdt, p, oldn * sizeof(*p), newn * sizeof(*p)); if (err) return err; fdt_set_off_dt_struct(fdt, fdt_off_dt_struct(fdt) + delta); fdt_set_off_dt_strings(fdt, fdt_off_dt_strings(fdt) + delta); return 0; } static int _fdt_splice_struct(void *fdt, void *p, int oldlen, int newlen) { int delta = newlen - oldlen; int err; if ((err = _fdt_splice(fdt, p, oldlen, newlen))) return err; fdt_set_size_dt_struct(fdt, fdt_size_dt_struct(fdt) + delta); fdt_set_off_dt_strings(fdt, fdt_off_dt_strings(fdt) + delta); return 0; } static int _fdt_splice_string(void *fdt, int newlen) { void *p = (char *)fdt + fdt_off_dt_strings(fdt) + fdt_size_dt_strings(fdt); int err; if ((err = _fdt_splice(fdt, p, 0, newlen))) return err; fdt_set_size_dt_strings(fdt, fdt_size_dt_strings(fdt) + newlen); return 0; } static int _fdt_find_add_string(void *fdt, const char *s) { char *strtab = (char *)fdt + fdt_off_dt_strings(fdt); const char *p; char *new; int len = strlen(s) + 1; int err; p = _fdt_find_string(strtab, fdt_size_dt_strings(fdt), s); if (p) /* found it */ return (p - strtab); new = strtab + fdt_size_dt_strings(fdt); err = _fdt_splice_string(fdt, len); if (err) return err; memcpy(new, s, len); return (new - strtab); } int fdt_add_mem_rsv(void *fdt, uint64_t address, uint64_t size) { struct fdt_reserve_entry *re; int err; FDT_RW_CHECK_HEADER(fdt); re = _fdt_mem_rsv_w(fdt, fdt_num_mem_rsv(fdt)); err = _fdt_splice_mem_rsv(fdt, re, 0, 1); if (err) return err; re->address = cpu_to_fdt64(address); re->size = cpu_to_fdt64(size); return 0; } int fdt_del_mem_rsv(void *fdt, int n) { struct fdt_reserve_entry *re = _fdt_mem_rsv_w(fdt, n); int err; FDT_RW_CHECK_HEADER(fdt); if (n >= fdt_num_mem_rsv(fdt)) return -FDT_ERR_NOTFOUND; err = _fdt_splice_mem_rsv(fdt, re, 1, 0); if (err) return err; return 0; } static int _fdt_resize_property(void *fdt, int nodeoffset, const char *name, int len, struct fdt_property **prop) { int oldlen; int err; *prop = fdt_get_property_w(fdt, nodeoffset, name, &oldlen); if (! 
(*prop)) return oldlen; if ((err = _fdt_splice_struct(fdt, (*prop)->data, FDT_TAGALIGN(oldlen), FDT_TAGALIGN(len)))) return err; (*prop)->len = cpu_to_fdt32(len); return 0; } static int _fdt_add_property(void *fdt, int nodeoffset, const char *name, int len, struct fdt_property **prop) { int proplen; int nextoffset; int namestroff; int err; if ((nextoffset = _fdt_check_node_offset(fdt, nodeoffset)) < 0) return nextoffset; namestroff = _fdt_find_add_string(fdt, name); if (namestroff < 0) return namestroff; *prop = _fdt_offset_ptr_w(fdt, nextoffset); proplen = sizeof(**prop) + FDT_TAGALIGN(len); err = _fdt_splice_struct(fdt, *prop, 0, proplen); if (err) return err; (*prop)->tag = cpu_to_fdt32(FDT_PROP); (*prop)->nameoff = cpu_to_fdt32(namestroff); (*prop)->len = cpu_to_fdt32(len); return 0; } int fdt_set_name(void *fdt, int nodeoffset, const char *name) { char *namep; int oldlen, newlen; int err; FDT_RW_CHECK_HEADER(fdt); namep = (char *)(uintptr_t)fdt_get_name(fdt, nodeoffset, &oldlen); if (!namep) return oldlen; newlen = strlen(name); err = _fdt_splice_struct(fdt, namep, FDT_TAGALIGN(oldlen+1), FDT_TAGALIGN(newlen+1)); if (err) return err; memcpy(namep, name, newlen+1); return 0; } int fdt_setprop(void *fdt, int nodeoffset, const char *name, const void *val, int len) { struct fdt_property *prop; int err; FDT_RW_CHECK_HEADER(fdt); err = _fdt_resize_property(fdt, nodeoffset, name, len, &prop); if (err == -FDT_ERR_NOTFOUND) err = _fdt_add_property(fdt, nodeoffset, name, len, &prop); if (err) return err; memcpy(prop->data, val, len); return 0; } int fdt_delprop(void *fdt, int nodeoffset, const char *name) { struct fdt_property *prop; int len, proplen; FDT_RW_CHECK_HEADER(fdt); prop = fdt_get_property_w(fdt, nodeoffset, name, &len); if (! prop) return len; proplen = sizeof(*prop) + FDT_TAGALIGN(len); return _fdt_splice_struct(fdt, prop, proplen, 0); } int fdt_add_subnode_namelen(void *fdt, int parentoffset, const char *name, int namelen) { struct fdt_node_header *nh; int offset, nextoffset; int nodelen; int err; uint32_t tag; uint32_t *endtag; FDT_RW_CHECK_HEADER(fdt); offset = fdt_subnode_offset_namelen(fdt, parentoffset, name, namelen); if (offset >= 0) return -FDT_ERR_EXISTS; else if (offset != -FDT_ERR_NOTFOUND) return offset; /* Try to place the new node after the parent's properties */ fdt_next_tag(fdt, parentoffset, &nextoffset); /* skip the BEGIN_NODE */ do { offset = nextoffset; tag = fdt_next_tag(fdt, offset, &nextoffset); } while ((tag == FDT_PROP) || (tag == FDT_NOP)); nh = _fdt_offset_ptr_w(fdt, offset); nodelen = sizeof(*nh) + FDT_TAGALIGN(namelen+1) + FDT_TAGSIZE; err = _fdt_splice_struct(fdt, nh, 0, nodelen); if (err) return err; nh->tag = cpu_to_fdt32(FDT_BEGIN_NODE); memset(nh->name, 0, FDT_TAGALIGN(namelen+1)); memcpy(nh->name, name, namelen); endtag = (uint32_t *)((char *)nh + nodelen - FDT_TAGSIZE); *endtag = cpu_to_fdt32(FDT_END_NODE); return offset; } int fdt_add_subnode(void *fdt, int parentoffset, const char *name) { return fdt_add_subnode_namelen(fdt, parentoffset, name, strlen(name)); } int fdt_del_node(void *fdt, int nodeoffset) { int endoffset; FDT_RW_CHECK_HEADER(fdt); endoffset = _fdt_node_end_offset(fdt, nodeoffset); if (endoffset < 0) return endoffset; return _fdt_splice_struct(fdt, _fdt_offset_ptr_w(fdt, nodeoffset), endoffset - nodeoffset, 0); } static void _fdt_packblocks(const char *old, char *new, int mem_rsv_size, int struct_size) { int mem_rsv_off, struct_off, strings_off; mem_rsv_off = FDT_ALIGN(sizeof(struct fdt_header), 8); struct_off = mem_rsv_off + 
mem_rsv_size; strings_off = struct_off + struct_size; memmove(new + mem_rsv_off, old + fdt_off_mem_rsvmap(old), mem_rsv_size); fdt_set_off_mem_rsvmap(new, mem_rsv_off); memmove(new + struct_off, old + fdt_off_dt_struct(old), struct_size); fdt_set_off_dt_struct(new, struct_off); fdt_set_size_dt_struct(new, struct_size); memmove(new + strings_off, old + fdt_off_dt_strings(old), fdt_size_dt_strings(old)); fdt_set_off_dt_strings(new, strings_off); fdt_set_size_dt_strings(new, fdt_size_dt_strings(old)); } int fdt_open_into(const void *fdt, void *buf, int bufsize) { int err; int mem_rsv_size, struct_size; int newsize; const char *fdtstart = fdt; const char *fdtend = fdtstart + fdt_totalsize(fdt); char *tmp; FDT_CHECK_HEADER(fdt); mem_rsv_size = (fdt_num_mem_rsv(fdt)+1) * sizeof(struct fdt_reserve_entry); if (fdt_version(fdt) >= 17) { struct_size = fdt_size_dt_struct(fdt); } else { struct_size = 0; while (fdt_next_tag(fdt, struct_size, &struct_size) != FDT_END) ; if (struct_size < 0) return struct_size; } if (!_fdt_blocks_misordered(fdt, mem_rsv_size, struct_size)) { /* no further work necessary */ err = fdt_move(fdt, buf, bufsize); if (err) return err; fdt_set_version(buf, 17); fdt_set_size_dt_struct(buf, struct_size); fdt_set_totalsize(buf, bufsize); return 0; } /* Need to reorder */ newsize = FDT_ALIGN(sizeof(struct fdt_header), 8) + mem_rsv_size + struct_size + fdt_size_dt_strings(fdt); if (bufsize < newsize) return -FDT_ERR_NOSPACE; /* First attempt to build converted tree at beginning of buffer */ tmp = buf; /* But if that overlaps with the old tree... */ if (((tmp + newsize) > fdtstart) && (tmp < fdtend)) { /* Try right after the old tree instead */ tmp = (char *)(uintptr_t)fdtend; if ((tmp + newsize) > ((char *)buf + bufsize)) return -FDT_ERR_NOSPACE; } _fdt_packblocks(fdt, tmp, mem_rsv_size, struct_size); memmove(buf, tmp, newsize); fdt_set_magic(buf, FDT_MAGIC); fdt_set_totalsize(buf, bufsize); fdt_set_version(buf, 17); fdt_set_last_comp_version(buf, 16); fdt_set_boot_cpuid_phys(buf, fdt_boot_cpuid_phys(fdt)); return 0; } int fdt_pack(void *fdt) { int mem_rsv_size; FDT_RW_CHECK_HEADER(fdt); mem_rsv_size = (fdt_num_mem_rsv(fdt)+1) * sizeof(struct fdt_reserve_entry); _fdt_packblocks(fdt, fdt, mem_rsv_size, fdt_size_dt_struct(fdt)); fdt_set_totalsize(fdt, _fdt_data_size(fdt)); return 0; } xen-4.4.0/xen/common/libfdt/fdt_wip.c0000664000175000017500000000712512307313555015576 0ustar smbsmb/* * libfdt - Flat Device Tree manipulation * Copyright (C) 2006 David Gibson, IBM Corporation. * * libfdt is dual licensed: you can use it either under the terms of * the GPL, or the BSD license, at your option. * * a) This library is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, * MA 02110-1301 USA * * Alternatively, * * b) Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * 1. 
Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "libfdt_env.h" #include #include #include "libfdt_internal.h" int fdt_setprop_inplace(void *fdt, int nodeoffset, const char *name, const void *val, int len) { void *propval; int proplen; propval = fdt_getprop_w(fdt, nodeoffset, name, &proplen); if (! propval) return proplen; if (proplen != len) return -FDT_ERR_NOSPACE; memcpy(propval, val, len); return 0; } static void _fdt_nop_region(void *start, int len) { uint32_t *p; for (p = start; (char *)p < ((char *)start + len); p++) *p = cpu_to_fdt32(FDT_NOP); } int fdt_nop_property(void *fdt, int nodeoffset, const char *name) { struct fdt_property *prop; int len; prop = fdt_get_property_w(fdt, nodeoffset, name, &len); if (! prop) return len; _fdt_nop_region(prop, len + sizeof(*prop)); return 0; } int _fdt_node_end_offset(void *fdt, int offset) { int depth = 0; while ((offset >= 0) && (depth >= 0)) offset = fdt_next_node(fdt, offset, &depth); return offset; } int fdt_nop_node(void *fdt, int nodeoffset) { int endoffset; endoffset = _fdt_node_end_offset(fdt, nodeoffset); if (endoffset < 0) return endoffset; _fdt_nop_region(fdt_offset_ptr_w(fdt, nodeoffset, 0), endoffset - nodeoffset); return 0; } xen-4.4.0/xen/common/libfdt/version.lds0000664000175000017500000000175112307313555016166 0ustar smbsmbLIBFDT_1.2 { global: fdt_next_node; fdt_check_header; fdt_move; fdt_string; fdt_num_mem_rsv; fdt_get_mem_rsv; fdt_subnode_offset_namelen; fdt_subnode_offset; fdt_path_offset; fdt_get_name; fdt_get_property_namelen; fdt_get_property; fdt_getprop_namelen; fdt_getprop; fdt_get_phandle; fdt_get_alias_namelen; fdt_get_alias; fdt_get_path; fdt_supernode_atdepth_offset; fdt_node_depth; fdt_parent_offset; fdt_node_offset_by_prop_value; fdt_node_offset_by_phandle; fdt_node_check_compatible; fdt_node_offset_by_compatible; fdt_setprop_inplace; fdt_nop_property; fdt_nop_node; fdt_create; fdt_add_reservemap_entry; fdt_finish_reservemap; fdt_begin_node; fdt_property; fdt_end_node; fdt_finish; fdt_open_into; fdt_pack; fdt_add_mem_rsv; fdt_del_mem_rsv; fdt_set_name; fdt_setprop; fdt_delprop; fdt_add_subnode_namelen; fdt_add_subnode; fdt_del_node; fdt_strerror; fdt_offset_ptr; fdt_next_tag; local: *; }; xen-4.4.0/xen/common/libfdt/fdt_sw.c0000664000175000017500000001576012307313555015434 0ustar smbsmb/* * libfdt - Flat Device Tree manipulation * Copyright (C) 2006 David Gibson, IBM Corporation. 
* * libfdt is dual licensed: you can use it either under the terms of * the GPL, or the BSD license, at your option. * * a) This library is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, * MA 02110-1301 USA * * Alternatively, * * b) Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * 1. Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "libfdt_env.h" #include #include #include "libfdt_internal.h" static int _fdt_sw_check_header(void *fdt) { if (fdt_magic(fdt) != FDT_SW_MAGIC) return -FDT_ERR_BADMAGIC; /* FIXME: should check more details about the header state */ return 0; } #define FDT_SW_CHECK_HEADER(fdt) \ { \ int err; \ if ((err = _fdt_sw_check_header(fdt)) != 0) \ return err; \ } static void *_fdt_grab_space(void *fdt, size_t len) { int offset = fdt_size_dt_struct(fdt); int spaceleft; spaceleft = fdt_totalsize(fdt) - fdt_off_dt_struct(fdt) - fdt_size_dt_strings(fdt); if ((offset + len < offset) || (offset + len > spaceleft)) return NULL; fdt_set_size_dt_struct(fdt, offset + len); return _fdt_offset_ptr_w(fdt, offset); } int fdt_create(void *buf, int bufsize) { void *fdt = buf; if (bufsize < sizeof(struct fdt_header)) return -FDT_ERR_NOSPACE; memset(buf, 0, bufsize); fdt_set_magic(fdt, FDT_SW_MAGIC); fdt_set_version(fdt, FDT_LAST_SUPPORTED_VERSION); fdt_set_last_comp_version(fdt, FDT_FIRST_SUPPORTED_VERSION); fdt_set_totalsize(fdt, bufsize); fdt_set_off_mem_rsvmap(fdt, FDT_ALIGN(sizeof(struct fdt_header), sizeof(struct fdt_reserve_entry))); fdt_set_off_dt_struct(fdt, fdt_off_mem_rsvmap(fdt)); fdt_set_off_dt_strings(fdt, bufsize); return 0; } int fdt_add_reservemap_entry(void *fdt, uint64_t addr, uint64_t size) { struct fdt_reserve_entry *re; int offset; FDT_SW_CHECK_HEADER(fdt); if (fdt_size_dt_struct(fdt)) return -FDT_ERR_BADSTATE; offset = fdt_off_dt_struct(fdt); if ((offset + sizeof(*re)) > fdt_totalsize(fdt)) return -FDT_ERR_NOSPACE; re = (struct fdt_reserve_entry *)((char *)fdt + offset); re->address = cpu_to_fdt64(addr); re->size = cpu_to_fdt64(size); fdt_set_off_dt_struct(fdt, offset + sizeof(*re)); return 0; } int fdt_finish_reservemap(void *fdt) { return fdt_add_reservemap_entry(fdt, 0, 0); } int fdt_begin_node(void *fdt, const char *name) { struct fdt_node_header *nh; int namelen = strlen(name) + 1; FDT_SW_CHECK_HEADER(fdt); nh = _fdt_grab_space(fdt, sizeof(*nh) + FDT_TAGALIGN(namelen)); if (! nh) return -FDT_ERR_NOSPACE; nh->tag = cpu_to_fdt32(FDT_BEGIN_NODE); memcpy(nh->name, name, namelen); return 0; } int fdt_end_node(void *fdt) { uint32_t *en; FDT_SW_CHECK_HEADER(fdt); en = _fdt_grab_space(fdt, FDT_TAGSIZE); if (! en) return -FDT_ERR_NOSPACE; *en = cpu_to_fdt32(FDT_END_NODE); return 0; } static int _fdt_find_add_string(void *fdt, const char *s) { char *strtab = (char *)fdt + fdt_totalsize(fdt); const char *p; int strtabsize = fdt_size_dt_strings(fdt); int len = strlen(s) + 1; int struct_top, offset; p = _fdt_find_string(strtab - strtabsize, strtabsize, s); if (p) return p - strtab; /* Add it */ offset = -strtabsize - len; struct_top = fdt_off_dt_struct(fdt) + fdt_size_dt_struct(fdt); if (fdt_totalsize(fdt) + offset < struct_top) return 0; /* no more room :( */ memcpy(strtab + offset, s, len); fdt_set_size_dt_strings(fdt, strtabsize + len); return offset; } int fdt_property(void *fdt, const char *name, const void *val, int len) { struct fdt_property *prop; int nameoff; FDT_SW_CHECK_HEADER(fdt); nameoff = _fdt_find_add_string(fdt, name); if (nameoff == 0) return -FDT_ERR_NOSPACE; prop = _fdt_grab_space(fdt, sizeof(*prop) + FDT_TAGALIGN(len)); if (! 
prop) return -FDT_ERR_NOSPACE; prop->tag = cpu_to_fdt32(FDT_PROP); prop->nameoff = cpu_to_fdt32(nameoff); prop->len = cpu_to_fdt32(len); memcpy(prop->data, val, len); return 0; } int fdt_finish(void *fdt) { char *p = (char *)fdt; uint32_t *end; int oldstroffset, newstroffset; uint32_t tag; int offset, nextoffset; FDT_SW_CHECK_HEADER(fdt); /* Add terminator */ end = _fdt_grab_space(fdt, sizeof(*end)); if (! end) return -FDT_ERR_NOSPACE; *end = cpu_to_fdt32(FDT_END); /* Relocate the string table */ oldstroffset = fdt_totalsize(fdt) - fdt_size_dt_strings(fdt); newstroffset = fdt_off_dt_struct(fdt) + fdt_size_dt_struct(fdt); memmove(p + newstroffset, p + oldstroffset, fdt_size_dt_strings(fdt)); fdt_set_off_dt_strings(fdt, newstroffset); /* Walk the structure, correcting string offsets */ offset = 0; while ((tag = fdt_next_tag(fdt, offset, &nextoffset)) != FDT_END) { if (tag == FDT_PROP) { struct fdt_property *prop = _fdt_offset_ptr_w(fdt, offset); int nameoff; nameoff = fdt32_to_cpu(prop->nameoff); nameoff += fdt_size_dt_strings(fdt); prop->nameoff = cpu_to_fdt32(nameoff); } offset = nextoffset; } if (nextoffset < 0) return nextoffset; /* Finally, adjust the header */ fdt_set_totalsize(fdt, newstroffset + fdt_size_dt_strings(fdt)); fdt_set_magic(fdt, FDT_MAGIC); return 0; } xen-4.4.0/xen/common/libfdt/Makefile.libfdt0000664000175000017500000000055412307313555016700 0ustar smbsmb# Makefile.libfdt # # This is not a complete Makefile of itself. Instead, it is designed to # be easily embeddable into other systems of Makefiles. # LIBFDT_soname = libfdt.$(SHAREDLIB_EXT).1 LIBFDT_INCLUDES = fdt.h libfdt.h LIBFDT_VERSION = version.lds LIBFDT_SRCS = fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c LIBFDT_OBJS = $(LIBFDT_SRCS:%.c=%.o) xen-4.4.0/xen/common/libfdt/fdt_ro.c0000664000175000017500000003332512307313555015420 0ustar smbsmb/* * libfdt - Flat Device Tree manipulation * Copyright (C) 2006 David Gibson, IBM Corporation. * * libfdt is dual licensed: you can use it either under the terms of * the GPL, or the BSD license, at your option. * * a) This library is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, * MA 02110-1301 USA * * Alternatively, * * b) Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * 1. Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "libfdt_env.h" #include #include #include "libfdt_internal.h" static int _fdt_nodename_eq(const void *fdt, int offset, const char *s, int len) { const char *p = fdt_offset_ptr(fdt, offset + FDT_TAGSIZE, len+1); if (! p) /* short match */ return 0; if (memcmp(p, s, len) != 0) return 0; if (p[len] == '\0') return 1; else if (!memchr(s, '@', len) && (p[len] == '@')) return 1; else return 0; } const char *fdt_string(const void *fdt, int stroffset) { return (const char *)fdt + fdt_off_dt_strings(fdt) + stroffset; } static int _fdt_string_eq(const void *fdt, int stroffset, const char *s, int len) { const char *p = fdt_string(fdt, stroffset); return (strlen(p) == len) && (memcmp(p, s, len) == 0); } int fdt_get_mem_rsv(const void *fdt, int n, uint64_t *address, uint64_t *size) { FDT_CHECK_HEADER(fdt); *address = fdt64_to_cpu(_fdt_mem_rsv(fdt, n)->address); *size = fdt64_to_cpu(_fdt_mem_rsv(fdt, n)->size); return 0; } int fdt_num_mem_rsv(const void *fdt) { int i = 0; while (fdt64_to_cpu(_fdt_mem_rsv(fdt, i)->size) != 0) i++; return i; } static int _nextprop(const void *fdt, int offset) { uint32_t tag; int nextoffset; do { tag = fdt_next_tag(fdt, offset, &nextoffset); switch (tag) { case FDT_END: if (nextoffset >= 0) return -FDT_ERR_BADSTRUCTURE; else return nextoffset; case FDT_PROP: return offset; } offset = nextoffset; } while (tag == FDT_NOP); return -FDT_ERR_NOTFOUND; } int fdt_subnode_offset_namelen(const void *fdt, int offset, const char *name, int namelen) { int depth; FDT_CHECK_HEADER(fdt); for (depth = 0; (offset >= 0) && (depth >= 0); offset = fdt_next_node(fdt, offset, &depth)) if ((depth == 1) && _fdt_nodename_eq(fdt, offset, name, namelen)) return offset; if (depth < 0) return -FDT_ERR_NOTFOUND; return offset; /* error */ } int fdt_subnode_offset(const void *fdt, int parentoffset, const char *name) { return fdt_subnode_offset_namelen(fdt, parentoffset, name, strlen(name)); } int fdt_path_offset(const void *fdt, const char *path) { const char *end = path + strlen(path); const char *p = path; int offset = 0; FDT_CHECK_HEADER(fdt); /* see if we have an alias */ if (*path != '/') { const char *q = strchr(path, '/'); if (!q) q = end; p = fdt_get_alias_namelen(fdt, p, q - p); if (!p) return -FDT_ERR_BADPATH; offset = fdt_path_offset(fdt, p); p = q; } while (*p) { const char *q; while (*p == '/') p++; if (! *p) return offset; q = strchr(p, '/'); if (! 
q) q = end; offset = fdt_subnode_offset_namelen(fdt, offset, p, q-p); if (offset < 0) return offset; p = q; } return offset; } const char *fdt_get_name(const void *fdt, int nodeoffset, int *len) { const struct fdt_node_header *nh = _fdt_offset_ptr(fdt, nodeoffset); int err; if (((err = fdt_check_header(fdt)) != 0) || ((err = _fdt_check_node_offset(fdt, nodeoffset)) < 0)) goto fail; if (len) *len = strlen(nh->name); return nh->name; fail: if (len) *len = err; return NULL; } int fdt_first_property_offset(const void *fdt, int nodeoffset) { int offset; if ((offset = _fdt_check_node_offset(fdt, nodeoffset)) < 0) return offset; return _nextprop(fdt, offset); } int fdt_next_property_offset(const void *fdt, int offset) { if ((offset = _fdt_check_prop_offset(fdt, offset)) < 0) return offset; return _nextprop(fdt, offset); } const struct fdt_property *fdt_get_property_by_offset(const void *fdt, int offset, int *lenp) { int err; const struct fdt_property *prop; if ((err = _fdt_check_prop_offset(fdt, offset)) < 0) { if (lenp) *lenp = err; return NULL; } prop = _fdt_offset_ptr(fdt, offset); if (lenp) *lenp = fdt32_to_cpu(prop->len); return prop; } const struct fdt_property *fdt_get_property_namelen(const void *fdt, int offset, const char *name, int namelen, int *lenp) { for (offset = fdt_first_property_offset(fdt, offset); (offset >= 0); (offset = fdt_next_property_offset(fdt, offset))) { const struct fdt_property *prop; if (!(prop = fdt_get_property_by_offset(fdt, offset, lenp))) { offset = -FDT_ERR_INTERNAL; break; } if (_fdt_string_eq(fdt, fdt32_to_cpu(prop->nameoff), name, namelen)) return prop; } if (lenp) *lenp = offset; return NULL; } const struct fdt_property *fdt_get_property(const void *fdt, int nodeoffset, const char *name, int *lenp) { return fdt_get_property_namelen(fdt, nodeoffset, name, strlen(name), lenp); } const void *fdt_getprop_namelen(const void *fdt, int nodeoffset, const char *name, int namelen, int *lenp) { const struct fdt_property *prop; prop = fdt_get_property_namelen(fdt, nodeoffset, name, namelen, lenp); if (! prop) return NULL; return prop->data; } const void *fdt_getprop_by_offset(const void *fdt, int offset, const char **namep, int *lenp) { const struct fdt_property *prop; prop = fdt_get_property_by_offset(fdt, offset, lenp); if (!prop) return NULL; if (namep) *namep = fdt_string(fdt, fdt32_to_cpu(prop->nameoff)); return prop->data; } const void *fdt_getprop(const void *fdt, int nodeoffset, const char *name, int *lenp) { return fdt_getprop_namelen(fdt, nodeoffset, name, strlen(name), lenp); } uint32_t fdt_get_phandle(const void *fdt, int nodeoffset) { const uint32_t *php; int len; /* FIXME: This is a bit sub-optimal, since we potentially scan * over all the properties twice. 
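 *
 * The double scan happens when a blob only carries the legacy
 * "linux,phandle" property: the first fdt_getprop() walk over the node's
 * properties finds nothing, so a second full walk is needed for the
 * legacy name.
 *
 * Illustrative caller-side sketch ("fdt" and "node" are hypothetical
 * caller variables, not part of this file) resolving a phandle reference
 * back to a node offset:
 *
 *   const uint32_t *ref = fdt_getprop(fdt, node, "interrupt-parent", NULL);
 *   int parent = ref ? fdt_node_offset_by_phandle(fdt, fdt32_to_cpu(*ref))
 *                    : -FDT_ERR_NOTFOUND;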
*/ php = fdt_getprop(fdt, nodeoffset, "phandle", &len); if (!php || (len != sizeof(*php))) { php = fdt_getprop(fdt, nodeoffset, "linux,phandle", &len); if (!php || (len != sizeof(*php))) return 0; } return fdt32_to_cpu(*php); } const char *fdt_get_alias_namelen(const void *fdt, const char *name, int namelen) { int aliasoffset; aliasoffset = fdt_path_offset(fdt, "/aliases"); if (aliasoffset < 0) return NULL; return fdt_getprop_namelen(fdt, aliasoffset, name, namelen, NULL); } const char *fdt_get_alias(const void *fdt, const char *name) { return fdt_get_alias_namelen(fdt, name, strlen(name)); } int fdt_get_path(const void *fdt, int nodeoffset, char *buf, int buflen) { int pdepth = 0, p = 0; int offset, depth, namelen; const char *name; FDT_CHECK_HEADER(fdt); if (buflen < 2) return -FDT_ERR_NOSPACE; for (offset = 0, depth = 0; (offset >= 0) && (offset <= nodeoffset); offset = fdt_next_node(fdt, offset, &depth)) { while (pdepth > depth) { do { p--; } while (buf[p-1] != '/'); pdepth--; } if (pdepth >= depth) { name = fdt_get_name(fdt, offset, &namelen); if (!name) return namelen; if ((p + namelen + 1) <= buflen) { memcpy(buf + p, name, namelen); p += namelen; buf[p++] = '/'; pdepth++; } } if (offset == nodeoffset) { if (pdepth < (depth + 1)) return -FDT_ERR_NOSPACE; if (p > 1) /* special case so that root path is "/", not "" */ p--; buf[p] = '\0'; return 0; } } if ((offset == -FDT_ERR_NOTFOUND) || (offset >= 0)) return -FDT_ERR_BADOFFSET; else if (offset == -FDT_ERR_BADOFFSET) return -FDT_ERR_BADSTRUCTURE; return offset; /* error from fdt_next_node() */ } int fdt_supernode_atdepth_offset(const void *fdt, int nodeoffset, int supernodedepth, int *nodedepth) { int offset, depth; int supernodeoffset = -FDT_ERR_INTERNAL; FDT_CHECK_HEADER(fdt); if (supernodedepth < 0) return -FDT_ERR_NOTFOUND; for (offset = 0, depth = 0; (offset >= 0) && (offset <= nodeoffset); offset = fdt_next_node(fdt, offset, &depth)) { if (depth == supernodedepth) supernodeoffset = offset; if (offset == nodeoffset) { if (nodedepth) *nodedepth = depth; if (supernodedepth > depth) return -FDT_ERR_NOTFOUND; else return supernodeoffset; } } if ((offset == -FDT_ERR_NOTFOUND) || (offset >= 0)) return -FDT_ERR_BADOFFSET; else if (offset == -FDT_ERR_BADOFFSET) return -FDT_ERR_BADSTRUCTURE; return offset; /* error from fdt_next_node() */ } int fdt_node_depth(const void *fdt, int nodeoffset) { int nodedepth; int err; err = fdt_supernode_atdepth_offset(fdt, nodeoffset, 0, &nodedepth); if (err) return (err < 0) ? err : -FDT_ERR_INTERNAL; return nodedepth; } int fdt_parent_offset(const void *fdt, int nodeoffset) { int nodedepth = fdt_node_depth(fdt, nodeoffset); if (nodedepth < 0) return nodedepth; return fdt_supernode_atdepth_offset(fdt, nodeoffset, nodedepth - 1, NULL); } int fdt_node_offset_by_prop_value(const void *fdt, int startoffset, const char *propname, const void *propval, int proplen) { int offset; const void *val; int len; FDT_CHECK_HEADER(fdt); /* FIXME: The algorithm here is pretty horrible: we scan each * property of a node in fdt_getprop(), then if that didn't * find what we want, we scan over them again making our way * to the next node. Still it's the easiest to implement * approach; performance can come later. 
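 *
 * Typical usage, sketched here for illustration ("fdt" and
 * handle_memory_node() are hypothetical): visit every node whose
 * "device_type" property equals "memory", passing -1 to start at the
 * root and the previous match as the next start offset:
 *
 *   int off = fdt_node_offset_by_prop_value(fdt, -1, "device_type",
 *                                           "memory", sizeof("memory"));
 *   while (off >= 0) {
 *       handle_memory_node(fdt, off);
 *       off = fdt_node_offset_by_prop_value(fdt, off, "device_type",
 *                                           "memory", sizeof("memory"));
 *   }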
*/ for (offset = fdt_next_node(fdt, startoffset, NULL); offset >= 0; offset = fdt_next_node(fdt, offset, NULL)) { val = fdt_getprop(fdt, offset, propname, &len); if (val && (len == proplen) && (memcmp(val, propval, len) == 0)) return offset; } return offset; /* error from fdt_next_node() */ } int fdt_node_offset_by_phandle(const void *fdt, uint32_t phandle) { int offset; if ((phandle == 0) || (phandle == -1)) return -FDT_ERR_BADPHANDLE; FDT_CHECK_HEADER(fdt); /* FIXME: The algorithm here is pretty horrible: we * potentially scan each property of a node in * fdt_get_phandle(), then if that didn't find what * we want, we scan over them again making our way to the next * node. Still it's the easiest to implement approach; * performance can come later. */ for (offset = fdt_next_node(fdt, -1, NULL); offset >= 0; offset = fdt_next_node(fdt, offset, NULL)) { if (fdt_get_phandle(fdt, offset) == phandle) return offset; } return offset; /* error from fdt_next_node() */ } static int _fdt_stringlist_contains(const char *strlist, int listlen, const char *str) { int len = strlen(str); const char *p; while (listlen >= len) { if (memcmp(str, strlist, len+1) == 0) return 1; p = memchr(strlist, '\0', listlen); if (!p) return 0; /* malformed strlist.. */ listlen -= (p-strlist) + 1; strlist = p + 1; } return 0; } int fdt_node_check_compatible(const void *fdt, int nodeoffset, const char *compatible) { const void *prop; int len; prop = fdt_getprop(fdt, nodeoffset, "compatible", &len); if (!prop) return len; if (_fdt_stringlist_contains(prop, len, compatible)) return 0; else return 1; } int fdt_node_offset_by_compatible(const void *fdt, int startoffset, const char *compatible) { int offset, err; FDT_CHECK_HEADER(fdt); /* FIXME: The algorithm here is pretty horrible: we scan each * property of a node in fdt_node_check_compatible(), then if * that didn't find what we want, we scan over them again * making our way to the next node. Still it's the easiest to * implement approach; performance can come later. */ for (offset = fdt_next_node(fdt, startoffset, NULL); offset >= 0; offset = fdt_next_node(fdt, offset, NULL)) { err = fdt_node_check_compatible(fdt, offset, compatible); if ((err < 0) && (err != -FDT_ERR_NOTFOUND)) return err; else if (err == 0) return offset; } return offset; /* error from fdt_next_node() */ } xen-4.4.0/xen/common/irq.c0000664000175000017500000000141312307313555013463 0ustar smbsmb#include #include #include int init_one_irq_desc(struct irq_desc *desc) { int err; if (irq_desc_initialized(desc)) return 0; if ( !alloc_cpumask_var(&desc->affinity) ) return -ENOMEM; desc->status = IRQ_DISABLED; desc->handler = &no_irq_type; spin_lock_init(&desc->lock); cpumask_setall(desc->affinity); INIT_LIST_HEAD(&desc->rl_link); err = arch_init_one_irq_desc(desc); if ( err ) { free_cpumask_var(desc->affinity); desc->handler = NULL; } return err; } void no_action(int cpl, void *dev_id, struct cpu_user_regs *regs) { } void irq_actor_none(struct irq_desc *desc) { } unsigned int irq_startup_none(struct irq_desc *desc) { return 0; } xen-4.4.0/xen/common/unlz4.c0000664000175000017500000000667512307313555013763 0ustar smbsmb/* * Wrapper for decompressing LZ4-compressed kernel, initramfs, and initrd * * Copyright (C) 2013, LG Electronics, Kyungsik Lee * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ #include "decompress.h" #include #include "lz4/decompress.c" /* * Note: Uncompressed chunk size is used in the compressor side * (userspace side for compression). * It is hardcoded because there is not proper way to extract it * from the binary stream which is generated by the preliminary * version of LZ4 tool so far. */ #define LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE (8 << 20) #define ARCHIVE_MAGICNUMBER 0x184C2102 STATIC int INIT unlz4(unsigned char *input, unsigned int in_len, int (*fill)(void *, unsigned int), int (*flush)(void *, unsigned int), unsigned char *output, unsigned int *posp, void (*error)(const char *x)) { int ret = -1; size_t chunksize = 0; size_t uncomp_chunksize = LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE; u8 *inp; u8 *inp_start; u8 *outp; int size = in_len -= 4; #if defined(__XEN__) || defined(__MINIOS__) size_t out_len = get_unaligned_le32(input + in_len); #endif size_t dest_len; if (output) { outp = output; } else if (!flush) { error("NULL output pointer and no flush function provided"); goto exit_0; } else { outp = large_malloc(uncomp_chunksize); if (!outp) { error("Could not allocate output buffer"); goto exit_0; } } if (input && fill) { error("Both input pointer and fill function provided,"); goto exit_1; } else if (input) { inp = input; } else if (!fill) { error("NULL input pointer and missing fill function"); goto exit_1; } else { inp = large_malloc(lz4_compressbound(uncomp_chunksize)); if (!inp) { error("Could not allocate input buffer"); goto exit_1; } } inp_start = inp; if (posp) *posp = 0; if (fill) fill(inp, 4); chunksize = get_unaligned_le32(inp); if (chunksize == ARCHIVE_MAGICNUMBER) { inp += 4; size -= 4; } else { error("invalid header"); goto exit_2; } if (posp) *posp += 4; for (;;) { if (fill) fill(inp, 4); chunksize = get_unaligned_le32(inp); if (chunksize == ARCHIVE_MAGICNUMBER) { inp += 4; size -= 4; if (posp) *posp += 4; continue; } inp += 4; size -= 4; if (posp) *posp += 4; if (fill) { if (chunksize > lz4_compressbound(uncomp_chunksize)) { error("chunk length is longer than allocated"); goto exit_2; } fill(inp, chunksize); } #if defined(__XEN__) || defined(__MINIOS__) if (out_len >= uncomp_chunksize) { dest_len = uncomp_chunksize; out_len -= dest_len; } else dest_len = out_len; ret = lz4_decompress(inp, &chunksize, outp, dest_len); #else dest_len = uncomp_chunksize; ret = lz4_decompress_unknownoutputsize(inp, chunksize, outp, &dest_len); #endif if (ret < 0) { error("Decoding failed"); goto exit_2; } ret = -1; if (flush && flush(outp, dest_len) != dest_len) goto exit_2; if (output) outp += dest_len; if (posp) *posp += chunksize; size -= chunksize; if (size == 0) break; else if (size < 0) { error("data corrupted"); goto exit_2; } inp += chunksize; if (fill) inp = inp_start; } ret = 0; exit_2: if (!input) large_free(inp_start); exit_1: if (!output) large_free(outp); exit_0: return ret; } xen-4.4.0/xen/common/string.c0000664000175000017500000002142212307313555014200 0ustar smbsmb/* * linux/lib/string.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include #include #include #ifndef __HAVE_ARCH_STRNICMP /** * strnicmp - Case insensitive, length-limited string comparison * @s1: One string * @s2: The other string * @len: the maximum number of characters to compare */ int strnicmp(const char *s1, const char *s2, size_t len) { /* Yes, Virginia, it had better be unsigned */ unsigned char c1, c2; c1 = 0; c2 = 0; if (len) { do { c1 = *s1; c2 = *s2; s1++; s2++; if (!c1) break; if (!c2) break; if (c1 == c2) continue; c1 = tolower(c1); c2 = tolower(c2); if (c1 != c2) 
break; } while (--len); } return (int)c1 - (int)c2; } #endif #ifndef __HAVE_ARCH_STRCASECMP int strcasecmp(const char *s1, const char *s2) { int c1, c2; do { c1 = tolower(*s1++); c2 = tolower(*s2++); } while ( c1 == c2 && c1 != 0 ); return c1 - c2; } #endif #ifndef __HAVE_ARCH_STRLCPY /** * strlcpy - Copy a %NUL terminated string into a sized buffer * @dest: Where to copy the string to * @src: Where to copy the string from * @size: size of destination buffer * * Compatible with *BSD: the result is always a valid * NUL-terminated string that fits in the buffer (unless, * of course, the buffer size is zero). It does not pad * out the result like strncpy() does. */ size_t strlcpy(char *dest, const char *src, size_t size) { size_t ret = strlen(src); if (size) { size_t len = (ret >= size) ? size-1 : ret; memcpy(dest, src, len); dest[len] = '\0'; } return ret; } EXPORT_SYMBOL(strlcpy); #endif #ifndef __HAVE_ARCH_STRLCAT /** * strlcat - Append a %NUL terminated string into a sized buffer * @dest: Where to copy the string to * @src: Where to copy the string from * @size: size of destination buffer * * Compatible with *BSD: the result is always a valid * NUL-terminated string that fits in the buffer (unless, * of course, the buffer size is zero). */ size_t strlcat(char *dest, const char *src, size_t size) { size_t slen = strlen(src); size_t dlen = strnlen(dest, size); char *p = dest + dlen; while ((p - dest) < size) if ((*p++ = *src++) == '\0') break; if (dlen < size) *(p-1) = '\0'; return slen + dlen; } EXPORT_SYMBOL(strlcat); #endif #ifndef __HAVE_ARCH_STRCMP /** * strcmp - Compare two strings * @cs: One string * @ct: Another string */ int strcmp(const char * cs,const char * ct) { register signed char __res; while (1) { if ((__res = *cs - *ct++) != 0 || !*cs++) break; } return __res; } #endif #ifndef __HAVE_ARCH_STRNCMP /** * strncmp - Compare two length-limited strings * @cs: One string * @ct: Another string * @count: The maximum number of bytes to compare */ int strncmp(const char * cs,const char * ct,size_t count) { register signed char __res = 0; while (count) { if ((__res = *cs - *ct++) != 0 || !*cs++) break; count--; } return __res; } #endif #ifndef __HAVE_ARCH_STRCHR /** * strchr - Find the first occurrence of a character in a string * @s: The string to be searched * @c: The character to search for */ char * strchr(const char * s, int c) { for(; *s != (char) c; ++s) if (*s == '\0') return NULL; return (char *) s; } #endif #ifndef __HAVE_ARCH_STRRCHR /** * strrchr - Find the last occurrence of a character in a string * @s: The string to be searched * @c: The character to search for */ char * strrchr(const char * s, int c) { const char *p = s + strlen(s); do { if (*p == (char)c) return (char *)p; } while (--p >= s); return NULL; } #endif #ifndef __HAVE_ARCH_STRLEN /** * strlen - Find the length of a string * @s: The string to be sized */ size_t strlen(const char * s) { const char *sc; for (sc = s; *sc != '\0'; ++sc) /* nothing */; return sc - s; } #endif #ifndef __HAVE_ARCH_STRNLEN /** * strnlen - Find the length of a length-limited string * @s: The string to be sized * @count: The maximum number of bytes to search */ size_t strnlen(const char * s, size_t count) { const char *sc; for (sc = s; count-- && *sc != '\0'; ++sc) /* nothing */; return sc - s; } #endif #ifndef __HAVE_ARCH_STRSPN /** * strspn - Calculate the length of the initial substring of @s which only * contain letters in @accept * @s: The string to be searched * @accept: The string to search for */ size_t strspn(const char *s, 
const char *accept) { const char *p; const char *a; size_t count = 0; for (p = s; *p != '\0'; ++p) { for (a = accept; *a != '\0'; ++a) { if (*p == *a) break; } if (*a == '\0') return count; ++count; } return count; } #endif #ifndef __HAVE_ARCH_STRPBRK /** * strpbrk - Find the first occurrence of a set of characters * @cs: The string to be searched * @ct: The characters to search for */ char * strpbrk(const char * cs,const char * ct) { const char *sc1,*sc2; for( sc1 = cs; *sc1 != '\0'; ++sc1) { for( sc2 = ct; *sc2 != '\0'; ++sc2) { if (*sc1 == *sc2) return (char *) sc1; } } return NULL; } #endif #ifndef __HAVE_ARCH_STRSEP /** * strsep - Split a string into tokens * @s: The string to be searched * @ct: The characters to search for * * strsep() updates @s to point after the token, ready for the next call. * * It returns empty tokens, too, behaving exactly like the libc function * of that name. In fact, it was stolen from glibc2 and de-fancy-fied. * Same semantics, slimmer shape. ;) */ char * strsep(char **s, const char *ct) { char *sbegin = *s, *end; if (sbegin == NULL) return NULL; end = strpbrk(sbegin, ct); if (end) *end++ = '\0'; *s = end; return sbegin; } #endif #ifndef __HAVE_ARCH_MEMSET /** * memset - Fill a region of memory with the given value * @s: Pointer to the start of the area. * @c: The byte to fill the area with * @count: The size of the area. * * Do not use memset() to access IO space, use memset_io() instead. */ void * memset(void * s,int c,size_t count) { char *xs = (char *) s; while (count--) *xs++ = c; return s; } #endif #ifndef __HAVE_ARCH_MEMCPY /** * memcpy - Copy one area of memory to another * @dest: Where to copy to * @src: Where to copy from * @count: The size of the area. * * You should not use this function to access IO space, use memcpy_toio() * or memcpy_fromio() instead. */ void * memcpy(void * dest,const void *src,size_t count) { char *tmp = (char *) dest, *s = (char *) src; while (count--) *tmp++ = *s++; return dest; } #endif #ifndef __HAVE_ARCH_MEMMOVE /** * memmove - Copy one area of memory to another * @dest: Where to copy to * @src: Where to copy from * @count: The size of the area. * * Unlike memcpy(), memmove() copes with overlapping areas. */ void * memmove(void * dest,const void *src,size_t count) { char *tmp, *s; if (dest <= src) { tmp = (char *) dest; s = (char *) src; while (count--) *tmp++ = *s++; } else { tmp = (char *) dest + count; s = (char *) src + count; while (count--) *--tmp = *--s; } return dest; } #endif #ifndef __HAVE_ARCH_MEMCMP /** * memcmp - Compare two areas of memory * @cs: One area of memory * @ct: Another area of memory * @count: The size of the area. */ int memcmp(const void * cs,const void * ct,size_t count) { const unsigned char *su1, *su2; int res = 0; for( su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--) if ((res = *su1 - *su2) != 0) break; return res; } #endif #ifndef __HAVE_ARCH_MEMSCAN /** * memscan - Find a character in an area of memory. * @addr: The memory area * @c: The byte to search for * @size: The size of the area. 
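 *
 * For example (illustrative; "buf" and "len" stand for caller variables),
 * memscan(buf, 0, len) locates the first zero byte in a len-byte buffer.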
* * returns the address of the first occurrence of @c, or 1 byte past * the area if @c is not found */ void * memscan(void * addr, int c, size_t size) { unsigned char * p = (unsigned char *) addr; while (size) { if (*p == c) return (void *) p; p++; size--; } return (void *) p; } #endif #ifndef __HAVE_ARCH_STRSTR /** * strstr - Find the first substring in a %NUL terminated string * @s1: The string to be searched * @s2: The string to search for */ char * strstr(const char * s1,const char * s2) { int l1, l2; l2 = strlen(s2); if (!l2) return (char *) s1; l1 = strlen(s1); while (l1 >= l2) { l1--; if (!memcmp(s1,s2,l2)) return (char *) s1; s1++; } return NULL; } #endif #ifndef __HAVE_ARCH_MEMCHR /** * memchr - Find a character in an area of memory. * @s: The memory area * @c: The byte to search for * @n: The size of the area. * * returns the address of the first occurrence of @c, or %NULL * if @c is not found */ void *memchr(const void *s, int c, size_t n) { const unsigned char *p = s; while (n-- != 0) { if ((unsigned char)c == *p++) { return (void *)(p-1); } } return NULL; } #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 8 * tab-width: 8 * indent-tabs-mode: t * End: */ xen-4.4.0/xen/common/cpu.c0000664000175000017500000001206212307313555013461 0ustar smbsmb#include #include #include #include #include #include #include unsigned int __read_mostly nr_cpu_ids = NR_CPUS; #ifndef nr_cpumask_bits unsigned int __read_mostly nr_cpumask_bits = BITS_TO_LONGS(NR_CPUS) * BITS_PER_LONG; #endif /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1< 32 MASK_DECLARE_8(32), MASK_DECLARE_8(40), MASK_DECLARE_8(48), MASK_DECLARE_8(56), #endif }; static DEFINE_SPINLOCK(cpu_add_remove_lock); bool_t get_cpu_maps(void) { return spin_trylock_recursive(&cpu_add_remove_lock); } void put_cpu_maps(void) { spin_unlock_recursive(&cpu_add_remove_lock); } bool_t cpu_hotplug_begin(void) { return get_cpu_maps(); } void cpu_hotplug_done(void) { put_cpu_maps(); } static NOTIFIER_HEAD(cpu_chain); void __init register_cpu_notifier(struct notifier_block *nb) { if ( !spin_trylock(&cpu_add_remove_lock) ) BUG(); /* Should never fail as we are called only during boot. 
*/ notifier_chain_register(&cpu_chain, nb); spin_unlock(&cpu_add_remove_lock); } static int take_cpu_down(void *unused) { void *hcpu = (void *)(long)smp_processor_id(); int notifier_rc = notifier_call_chain(&cpu_chain, CPU_DYING, hcpu, NULL); BUG_ON(notifier_rc != NOTIFY_DONE); __cpu_disable(); return 0; } int cpu_down(unsigned int cpu) { int err, notifier_rc; void *hcpu = (void *)(long)cpu; struct notifier_block *nb = NULL; if ( !cpu_hotplug_begin() ) return -EBUSY; if ( (cpu >= nr_cpu_ids) || (cpu == 0) || !cpu_online(cpu) ) { cpu_hotplug_done(); return -EINVAL; } notifier_rc = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, hcpu, &nb); if ( notifier_rc != NOTIFY_DONE ) { err = notifier_to_errno(notifier_rc); goto fail; } if ( (err = stop_machine_run(take_cpu_down, NULL, cpu)) < 0 ) goto fail; __cpu_die(cpu); BUG_ON(cpu_online(cpu)); notifier_rc = notifier_call_chain(&cpu_chain, CPU_DEAD, hcpu, NULL); BUG_ON(notifier_rc != NOTIFY_DONE); send_global_virq(VIRQ_PCPU_STATE); cpu_hotplug_done(); return 0; fail: notifier_rc = notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, hcpu, &nb); BUG_ON(notifier_rc != NOTIFY_DONE); cpu_hotplug_done(); return err; } int cpu_up(unsigned int cpu) { int notifier_rc, err = 0; void *hcpu = (void *)(long)cpu; struct notifier_block *nb = NULL; if ( !cpu_hotplug_begin() ) return -EBUSY; if ( (cpu >= nr_cpu_ids) || cpu_online(cpu) || !cpu_present(cpu) ) { cpu_hotplug_done(); return -EINVAL; } notifier_rc = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu, &nb); if ( notifier_rc != NOTIFY_DONE ) { err = notifier_to_errno(notifier_rc); goto fail; } err = __cpu_up(cpu); if ( err < 0 ) goto fail; notifier_rc = notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu, NULL); BUG_ON(notifier_rc != NOTIFY_DONE); send_global_virq(VIRQ_PCPU_STATE); cpu_hotplug_done(); return 0; fail: notifier_rc = notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu, &nb); BUG_ON(notifier_rc != NOTIFY_DONE); cpu_hotplug_done(); return err; } void notify_cpu_starting(unsigned int cpu) { void *hcpu = (void *)(long)cpu; int notifier_rc = notifier_call_chain( &cpu_chain, CPU_STARTING, hcpu, NULL); BUG_ON(notifier_rc != NOTIFY_DONE); } static cpumask_t frozen_cpus; int disable_nonboot_cpus(void) { int cpu, error = 0; BUG_ON(smp_processor_id() != 0); cpumask_clear(&frozen_cpus); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu ( cpu ) { if ( cpu == 0 ) continue; if ( (error = cpu_down(cpu)) ) { BUG_ON(error == -EBUSY); printk("Error taking CPU%d down: %d\n", cpu, error); break; } cpumask_set_cpu(cpu, &frozen_cpus); } BUG_ON(!error && (num_online_cpus() != 1)); return error; } void enable_nonboot_cpus(void) { int cpu, error; printk("Enabling non-boot CPUs ...\n"); for_each_cpu ( cpu, &frozen_cpus ) { if ( (error = cpu_up(cpu)) ) { BUG_ON(error == -EBUSY); printk("Error taking CPU%d up: %d\n", cpu, error); } } cpumask_clear(&frozen_cpus); } xen-4.4.0/xen/common/decompress.c0000664000175000017500000000207612307313555015042 0ustar smbsmb#include #include #include #include #include static void __init error(const char *msg) { printk("%s\n", msg); } int __init decompress(void *inbuf, unsigned int len, void *outbuf) { #if 0 /* Not needed here yet. 
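   Gzip input (magic byte pairs 0x1f 0x8b or 0x1f 0x9e, tested by this
   disabled branch) would be handed to gunzip(), but that path is
   compiled out in this build.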
*/ if ( len >= 2 && (!memcmp(inbuf, "\037\213", 2) || !memcmp(inbuf, "\037\236", 2)) ) return gunzip(inbuf, len, NULL, NULL, outbuf, NULL, error); #endif if ( len >= 3 && !memcmp(inbuf, "\x42\x5a\x68", 3) ) return bunzip2(inbuf, len, NULL, NULL, outbuf, NULL, error); if ( len >= 6 && !memcmp(inbuf, "\3757zXZ", 6) ) return unxz(inbuf, len, NULL, NULL, outbuf, NULL, error); if ( len >= 2 && !memcmp(inbuf, "\135\000", 2) ) return unlzma(inbuf, len, NULL, NULL, outbuf, NULL, error); if ( len >= 5 && !memcmp(inbuf, "\x89LZO", 5) ) return unlzo(inbuf, len, NULL, NULL, outbuf, NULL, error); if ( len >= 2 && !memcmp(inbuf, "\x02\x21", 2) ) return unlz4(inbuf, len, NULL, NULL, outbuf, NULL, error); return 1; } xen-4.4.0/xen/common/inflate.c0000664000175000017500000012353412307313555014323 0ustar smbsmb#define DEBG(x) #define DEBG1(x) /* inflate.c -- Not copyrighted 1992 by Mark Adler version c10p1, 10 January 1993 */ /* * Adapted for booting Linux by Hannu Savolainen 1993 * based on gzip-1.0.3 * * Nicolas Pitre , 1999/04/14 : * Little mods for all variable to reside either into rodata or bss segments * by marking constant variables with 'const' and initializing all the others * at run-time only. This allows for the kernel uncompressor to run * directly from Flash or ROM memory on embedded systems. */ /* Inflate deflated (PKZIP's method 8 compressed) data. The compression method searches for as much of the current string of bytes (up to a length of 258) in the previous 32 K bytes. If it doesn't find any matches (of at least length 3), it codes the next byte. Otherwise, it codes the length of the matched string and its distance backwards from the current position. There is a single Huffman code that codes both single bytes (called "literals") and match lengths. A second Huffman code codes the distance information, which follows a length code. Each length or distance code actually represents a base value and a number of "extra" (sometimes zero) bits to get to add to the base value. At the end of each deflated block is a special end-of-block (EOB) literal/ length code. The decoding process is basically: get a literal/length code; if EOB then done; if a literal, emit the decoded byte; if a length then get the distance and emit the referred-to bytes from the sliding window of previously emitted data. There are (currently) three kinds of inflate blocks: stored, fixed, and dynamic. The compressor deals with some chunk of data at a time, and decides which method to use on a chunk-by-chunk basis. A chunk might typically be 32 K or 64 K. If the chunk is incompressible, then the "stored" method is used. In this case, the bytes are simply stored as is, eight bits per byte, with none of the above coding. The bytes are preceded by a count, since there is no longer an EOB code. If the data is compressible, then either the fixed or dynamic methods are used. In the dynamic method, the compressed data is preceded by an encoding of the literal/length and distance Huffman codes that are to be used to decode this block. The representation is itself Huffman coded, and so is preceded by a description of that code. These code descriptions take up a little space, and so for small blocks, there is a predefined set of codes, called the fixed codes. The fixed method is used if the block codes up smaller that way (usually for quite small chunks), otherwise the dynamic method is used. 
In the latter case, the codes are customized to the probabilities in the current block, and so can code it much better than the pre-determined fixed codes. The Huffman codes themselves are decoded using a multi-level table lookup, in order to maximize the speed of decoding plus the speed of building the decoding tables. See the comments below that precede the lbits and dbits tuning parameters. */ /* Notes beyond the 1.93a appnote.txt: 1. Distance pointers never point before the beginning of the output stream. 2. Distance pointers can point back across blocks, up to 32k away. 3. There is an implied maximum of 7 bits for the bit length table and 15 bits for the actual data. 4. If only one code exists, then it is encoded using one bit. (Zero would be more efficient, but perhaps a little confusing.) If two codes exist, they are coded using one bit each (0 and 1). 5. There is no way of sending zero distance codes--a dummy must be sent if there are none. (History: a pre 2.0 version of PKZIP would store blocks with no distance codes, but this was discovered to be too harsh a criterion.) Valid only for 1.93a. 2.04c does allow zero distance codes, which is sent as one code of zero bits in length. 6. There are up to 286 literal/length codes. Code 256 represents the end-of-block. Note however that the static length tree defines 288 codes just to fill out the Huffman codes. Codes 286 and 287 cannot be used though, since there is no length base or extra bits defined for them. Similarly, there are up to 30 distance codes. However, static trees define 32 codes (all 5 bits) to fill out the Huffman codes, but the last two had better not show up in the data. 7. Unzip can check dynamic Huffman blocks for complete code sets. The exception is that a single code would not be complete (see #4). 8. The five bits following the block type is really the number of literal codes sent minus 257. 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits (1+6+6). Therefore, to output three times the length, you output three codes (1+1+1), whereas to output four times the same length, you only need two codes (1+3). Hmm. 10. In the tree reconstruction algorithm, Code = Code + Increment only if BitLength(i) is not zero. (Pretty obvious.) 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19) 12. Note: length code 284 can represent 227-258, but length code 285 really is 258. The last length deserves its own, short code since it gets used a lot in very redundant files. The length 258 is special since 258 - 3 (the min match length) is 255. 13. The literal/length and distance code bit lengths are read as a single stream of lengths. It is possible (and advantageous) for a repeat code (16, 17, or 18) to go across the boundary between the two sets of lengths. */ #ifdef RCSID static char rcsid[] = "#Id: inflate.c,v 0.14 1993/06/10 13:27:04 jloup Exp #"; #endif #ifndef STATIC #if defined(STDC_HEADERS) || defined(HAVE_STDLIB_H) # include # include #endif #include "gzip.h" #define STATIC #endif /* !STATIC */ #ifndef INIT #define INIT #define INITDATA #endif #define slide window /* Huffman code lookup table entry--this entry is four bytes for machines that have 16-bit pointers (e.g. PC's in the small or medium model). Valid extra bits are 0..13. e == 15 is EOB (end of block), e == 16 means that v is a literal, 16 < e < 32 means that v is a pointer to the next table, which codes e - 16 bits, and lastly e == 99 indicates an unused code. If a code with e == 99 is looked up, this implies an error in the data. 
*/ struct huft { uch e; /* number of extra bits or operation */ uch b; /* number of bits in this code or subcode */ union { ush n; /* literal, length base, or distance base */ struct huft *t; /* pointer to next level of table */ } v; }; /* Function prototypes */ STATIC int INIT huft_build OF((unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *)); STATIC int INIT huft_free OF((struct huft *)); STATIC int INIT inflate_codes OF((struct huft *, struct huft *, int, int)); STATIC int INIT inflate_stored OF((void)); STATIC int INIT inflate_fixed OF((void)); STATIC int INIT inflate_dynamic OF((void)); STATIC int INIT inflate_block OF((int *)); STATIC int INIT inflate OF((void)); /* The inflate algorithm uses a sliding 32 K byte window on the uncompressed stream to find repeated byte strings. This is implemented here as a circular buffer. The index is updated simply by incrementing and then ANDing with 0x7fff (32K-1). */ /* It is left to other modules to supply the 32 K area. It is assumed to be usable as if it were declared "uch slide[32768];" or as just "uch *slide;" and then malloc'ed in the latter case. The definition must be in unzip.h, included above. */ /* unsigned wp; current position in slide */ #define wp outcnt #define flush_output(w) (wp=(w),flush_window()) /* Tables for deflate from PKZIP's appnote.txt. */ static const unsigned border[] = { /* Order of the bit length code lengths */ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; /* note: see note #13 above about the 258 in this list. */ static const ush cplext[] = { /* Extra bits for literal codes 257..285 */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */ static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577}; static const ush cpdext[] = { /* Extra bits for distance codes */ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; /* Macros for inflate() bit peeking and grabbing. The usage is: NEEDBITS(j) x = b & mask_bits[j]; DUMPBITS(j) where NEEDBITS makes sure that b has at least j bits in it, and DUMPBITS removes the bits from b. The macros use the variable k for the number of bits in b. Normally, b and k are register variables for speed, and are initialized at the beginning of a routine that uses these macros from a global bit buffer and count. If we assume that EOB will be the longest code, then we will never ask for bits with NEEDBITS that are beyond the end of the stream. So, NEEDBITS should not read any more bytes than are needed to meet the request. Then no bytes need to be "returned" to the buffer at the end of the last block. However, this assumption is not true for fixed blocks--the EOB code is 7 bits, but the other literal/length codes can be 8 or 9 bits. (The EOB code is shorter than other codes because fixed blocks are generally short. So, while a block always has an EOB, many other literal/length codes have a significantly lower probability of showing up at all.) 
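   (Added illustration, not part of the original comment: with these macros a
   fixed-width read, for example the three block-header bits, looks roughly like

       NEEDBITS(1)
       e = (int)b & 1;           last-block flag
       DUMPBITS(1)
       NEEDBITS(2)
       t = (unsigned)b & 3;      block type
       DUMPBITS(2)

   i.e. make sure the bit buffer b holds enough bits, mask off the low bits,
   then shift them out while decrementing the bit count k.  This is the pattern
   used by inflate_block() and throughout the decoders below.)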
However, by making the first table have a lookup of seven bits, the EOB code will be found in that first lookup, and so will not require that too many bits be pulled from the stream. */ STATIC ulg INITDATA bb; /* bit buffer */ STATIC unsigned INITDATA bk; /* bits in bit buffer */ STATIC const ush mask_bits[] = { 0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff }; #define NEXTBYTE() ({ int v = get_byte(); if (v < 0) goto underrun; (uch)v; }) #define NEEDBITS(n) {while(k<(n)){b|=((ulg)NEXTBYTE())<>=(n);k-=(n);} #ifndef NO_INFLATE_MALLOC /* A trivial malloc implementation, adapted from * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 */ static unsigned long INITDATA malloc_ptr; static int INITDATA malloc_count; static void *INIT malloc(int size) { void *p; if (size < 0) error("Malloc error"); if (!malloc_ptr) malloc_ptr = free_mem_ptr; malloc_ptr = (malloc_ptr + 3) & ~3; /* Align */ p = (void *)malloc_ptr; malloc_ptr += size; if (free_mem_end_ptr && malloc_ptr >= free_mem_end_ptr) error("Out of memory"); malloc_count++; return p; } static void INIT free(void *where) { malloc_count--; if (!malloc_count) malloc_ptr = free_mem_ptr; } #else #define malloc(a) kmalloc(a, GFP_KERNEL) #define free(a) kfree(a) #endif /* Huffman code decoding is performed using a multi-level table lookup. The fastest way to decode is to simply build a lookup table whose size is determined by the longest code. However, the time it takes to build this table can also be a factor if the data being decoded is not very long. The most common codes are necessarily the shortest codes, so those codes dominate the decoding time, and hence the speed. The idea is you can have a shorter table that decodes the shorter, more probable codes, and then point to subsidiary tables for the longer codes. The time it costs to decode the longer codes is then traded against the time it takes to make longer tables. This results of this trade are in the variables lbits and dbits below. lbits is the number of bits the first level table for literal/ length codes can decode in one step, and dbits is the same thing for the distance codes. Subsequent tables are also less than or equal to those sizes. These values may be adjusted either when all of the codes are shorter than that, in which case the longest code length in bits is used, or when the shortest code is *longer* than the requested table size, in which case the length of the shortest code in bits is used. There are two different values for the two tables, since they code a different number of possibilities each. The literal/length table codes 286 possible values, or in a flat code, a little over eight bits. The distance table codes 30 possible values, or a little less than five bits, flat. The optimum values for speed end up being about one bit more than those, so lbits is 8+1 and dbits is 5+1. The optimum values may differ though from machine to machine, and possibly even between compilers. Your mileage may vary. */ STATIC const int lbits = 9; /* bits in base literal/length lookup table */ STATIC const int dbits = 6; /* bits in base distance lookup table */ /* If BMAX needs to be larger than 16, then h and x[] should be ulg. 
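   (Added note: in inflate_codes() further below this two-level scheme shows up
   as a first probe of bl bits followed by optional sub-table hops, roughly

       t = tl + ((unsigned)b & mask_bits[bl]);
       while ((e = t->e) > 16) {         entry points at a sub-table
           DUMPBITS(t->b)
           t = t->v.t + ((unsigned)b & mask_bits[e - 16]);
       }

   so the common short codes are resolved with a single lookup while rare long
   codes pay for one or more extra hops.)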
*/ #define BMAX 16 /* maximum bit length of any code (16 for explode) */ #define N_MAX 288 /* maximum number of codes in any set */ STATIC unsigned INITDATA hufts; /* track memory usage */ STATIC int INIT huft_build( unsigned *b, /* code lengths in bits (all assumed <= BMAX) */ unsigned n, /* number of codes (assumed <= N_MAX) */ unsigned s, /* number of simple-valued codes (0..s-1) */ const ush *d, /* list of base values for non-simple codes */ const ush *e, /* list of extra bits for non-simple codes */ struct huft **t, /* result: starting table */ int *m /* maximum lookup bits, returns actual */ ) /* Given a list of code lengths and a maximum table size, make a set of tables to decode that set of codes. Return zero on success, one if the given code set is incomplete (the tables are still built in this case), two if the input is invalid (all zero length codes or an oversubscribed set of lengths), and three if not enough memory. */ { unsigned a; /* counter for codes of length k */ unsigned f; /* i repeats in table every f entries */ int g; /* maximum code length */ int h; /* table level */ register unsigned i; /* counter, current code */ register unsigned j; /* counter */ register int k; /* number of bits in current code */ int l; /* bits per table (returned in m) */ register unsigned *p; /* pointer into c[], b[], or v[] */ register struct huft *q; /* points to current table */ struct huft r; /* table entry for structure assignment */ register int w; /* bits before this table == (l * h) */ unsigned *xp; /* pointer into x */ int y; /* number of dummy codes added */ unsigned z; /* number of entries in current table */ struct { unsigned c[BMAX+1]; /* bit length count table */ struct huft *u[BMAX]; /* table stack */ unsigned v[N_MAX]; /* values in order of bit length */ unsigned x[BMAX+1]; /* bit offsets, then code stack */ } *stk; unsigned *c, *v, *x; struct huft **u; int ret; DEBG("huft1 "); stk = malloc(sizeof(*stk)); if (stk == NULL) return 3; /* out of memory */ c = stk->c; v = stk->v; x = stk->x; u = stk->u; /* Generate counts for each bit length */ memzero(stk->c, sizeof(stk->c)); p = b; i = n; do { Tracecv(*p, (stderr, (n-i >= ' ' && n-i <= '~' ? 
"%c %d\n" : "0x%x %d\n"), n-i, *p)); c[*p]++; /* assume all entries <= BMAX */ p++; /* Can't combine with above line (Solaris bug) */ } while (--i); if (c[0] == n) /* null input--all zero length codes */ { *t = (struct huft *)NULL; *m = 0; ret = 2; goto out; } DEBG("huft2 "); /* Find minimum and maximum length, bound *m by those */ l = *m; for (j = 1; j <= BMAX; j++) if (c[j]) break; k = j; /* minimum code length */ if ((unsigned)l < j) l = j; for (i = BMAX; i; i--) if (c[i]) break; g = i; /* maximum code length */ if ((unsigned)l > i) l = i; *m = l; DEBG("huft3 "); /* Adjust last length count to fill out codes, if needed */ for (y = 1 << j; j < i; j++, y <<= 1) if ((y -= c[j]) < 0) { ret = 2; /* bad input: more codes than bits */ goto out; } if ((y -= c[i]) < 0) { ret = 2; goto out; } c[i] += y; DEBG("huft4 "); /* Generate starting offsets into the value table for each length */ x[1] = j = 0; p = c + 1; xp = x + 2; while (--i) { /* note that i == g from above */ *xp++ = (j += *p++); } DEBG("huft5 "); /* Make a table of values in order of bit lengths */ p = b; i = 0; do { if ((j = *p++) != 0) v[x[j]++] = i; } while (++i < n); n = x[g]; /* set n to length of v */ DEBG("h6 "); /* Generate the Huffman codes and for each, make the table entries */ x[0] = i = 0; /* first Huffman code is zero */ p = v; /* grab values in bit order */ h = -1; /* no tables yet--level -1 */ w = -l; /* bits decoded == (l * h) */ u[0] = (struct huft *)NULL; /* just to keep compilers happy */ q = (struct huft *)NULL; /* ditto */ z = 0; /* ditto */ DEBG("h6a "); /* go through the bit lengths (k already is bits in shortest code) */ for (; k <= g; k++) { DEBG("h6b "); a = c[k]; while (a--) { DEBG("h6b1 "); /* here i is the Huffman code of length k bits for value *p */ /* make tables up to required level */ while (k > w + l) { DEBG1("1 "); h++; w += l; /* previous table always l bits */ /* compute minimum size table less than or equal to l bits */ z = (z = g - w) > (unsigned)l ? l : z; /* upper limit on table size */ if ((f = 1 << (j = k - w)) > a + 1) /* try a k-w bit table */ { /* too few codes for k-w bit table */ DEBG1("2 "); f -= a + 1; /* deduct codes from patterns left */ xp = c + k; if (j < z) while (++j < z) /* try smaller tables up to z bits */ { if ((f <<= 1) <= *++xp) break; /* enough codes to use up j bits */ f -= *xp; /* else deduct codes from patterns */ } } DEBG1("3 "); z = 1 << j; /* table entries for j-bit table */ /* allocate and link in new table */ if ((q = (struct huft *)malloc((z + 1)*sizeof(struct huft))) == (struct huft *)NULL) { if (h) huft_free(u[0]); ret = 3; /* not enough memory */ goto out; } DEBG1("4 "); hufts += z + 1; /* track memory usage */ *t = q + 1; /* link to list for huft_free() */ *(t = &(q->v.t)) = (struct huft *)NULL; u[h] = ++q; /* table starts after link */ DEBG1("5 "); /* connect to last table, if there is one */ if (h) { x[h] = i; /* save pattern for backing up */ r.b = (uch)l; /* bits to dump before this table */ r.e = (uch)(16 + j); /* bits in this table */ r.v.t = q; /* pointer to this table */ j = i >> (w - l); /* (get around Turbo C bug) */ u[h-1][j] = r; /* connect to last table */ } DEBG1("6 "); } DEBG("h6c "); /* set up table entry in r */ r.b = (uch)(k - w); if (p >= v + n) r.e = 99; /* out of values--invalid code */ else if (*p < s) { r.e = (uch)(*p < 256 ? 
16 : 15); /* 256 is end-of-block code */ r.v.n = (ush)(*p); /* simple code is just the value */ p++; /* one compiler does not like *p++ */ } else { r.e = (uch)e[*p - s]; /* non-simple--look up in lists */ r.v.n = d[*p++ - s]; } DEBG("h6d "); /* fill code-like entries with r */ f = 1 << (k - w); for (j = i >> w; j < z; j += f) q[j] = r; /* backwards increment the k-bit code i */ for (j = 1 << (k - 1); i & j; j >>= 1) i ^= j; i ^= j; /* backup over finished tables */ while ((i & ((1 << w) - 1)) != x[h]) { h--; /* don't need to update q */ w -= l; } DEBG("h6e "); } DEBG("h6f "); } DEBG("huft7 "); /* Return true (1) if we were given an incomplete table */ ret = y != 0 && g != 1; out: free(stk); return ret; } STATIC int INIT huft_free( struct huft *t /* table to free */ ) /* Free the malloc'ed tables built by huft_build(), which makes a linked list of the tables it made, with the links in a dummy first entry of each table. */ { register struct huft *p, *q; /* Go through linked list, freeing from the malloced (t[-1]) address. */ p = t; while (p != (struct huft *)NULL) { q = (--p)->v.t; free((char*)p); p = q; } return 0; } STATIC int INIT inflate_codes( struct huft *tl, /* literal/length decoder tables */ struct huft *td, /* distance decoder tables */ int bl, /* number of bits decoded by tl[] */ int bd /* number of bits decoded by td[] */ ) /* inflate (decompress) the codes in a deflated (compressed) block. Return an error code or zero if it all goes ok. */ { register unsigned e; /* table entry flag/number of extra bits */ unsigned n, d; /* length and index for copy */ unsigned w; /* current window position */ struct huft *t; /* pointer to table entry */ unsigned ml, md; /* masks for bl and bd bits */ register ulg b; /* bit buffer */ register unsigned k; /* number of bits in bit buffer */ /* make local copies of globals */ b = bb; /* initialize bit buffer */ k = bk; w = wp; /* initialize window position */ /* inflate the coded data */ ml = mask_bits[bl]; /* precompute masks for speed */ md = mask_bits[bd]; for (;;) /* do until end of block */ { NEEDBITS((unsigned)bl) if ((e = (t = tl + ((unsigned)b & ml))->e) > 16) do { if (e == 99) return 1; DUMPBITS(t->b) e -= 16; NEEDBITS(e) } while ((e = (t = t->v.t + ((unsigned)b & mask_bits[e]))->e) > 16); DUMPBITS(t->b) if (e == 16) /* then it's a literal */ { slide[w++] = (uch)t->v.n; Tracevv((stderr, "%c", slide[w-1])); if (w == WSIZE) { flush_output(w); w = 0; } } else /* it's an EOB or a length */ { /* exit if end of block */ if (e == 15) break; /* get length of block to copy */ NEEDBITS(e) n = t->v.n + ((unsigned)b & mask_bits[e]); DUMPBITS(e); /* decode distance of block to copy */ NEEDBITS((unsigned)bd) if ((e = (t = td + ((unsigned)b & md))->e) > 16) do { if (e == 99) return 1; DUMPBITS(t->b) e -= 16; NEEDBITS(e) } while ((e = (t = t->v.t + ((unsigned)b & mask_bits[e]))->e) > 16); DUMPBITS(t->b) NEEDBITS(e) d = w - t->v.n - ((unsigned)b & mask_bits[e]); DUMPBITS(e) Tracevv((stderr,"\\[%d,%d]", w-d, n)); /* do the copy */ do { n -= (e = (e = WSIZE - ((d &= WSIZE-1) > w ? d : w)) > n ? 
n : e); #if !defined(NOMEMCPY) && !defined(DEBUG) if (w - d >= e) /* (this test assumes unsigned comparison) */ { memcpy(slide + w, slide + d, e); w += e; d += e; } else /* do it slow to avoid memcpy() overlap */ #endif /* !NOMEMCPY */ do { slide[w++] = slide[d++]; Tracevv((stderr, "%c", slide[w-1])); } while (--e); if (w == WSIZE) { flush_output(w); w = 0; } } while (n); } } /* restore the globals from the locals */ wp = w; /* restore global window pointer */ bb = b; /* restore global bit buffer */ bk = k; /* done */ return 0; underrun: return 4; /* Input underrun */ } STATIC int INIT inflate_stored(void) /* "decompress" an inflated type 0 (stored) block. */ { unsigned n; /* number of bytes in block */ unsigned w; /* current window position */ register ulg b; /* bit buffer */ register unsigned k; /* number of bits in bit buffer */ DEBG(""); return 0; underrun: return 4; /* Input underrun */ } /* * We use `noinline' here to prevent gcc-3.5 from using too much stack space */ STATIC int noinline INIT inflate_fixed(void) /* decompress an inflated type 1 (fixed Huffman codes) block. We should either replace this with a custom decoder, or at least precompute the Huffman tables. */ { int i; /* temporary variable */ struct huft *tl; /* literal/length code table */ struct huft *td; /* distance code table */ int bl; /* lookup bits for tl */ int bd; /* lookup bits for td */ unsigned *l; /* length list for huft_build */ DEBG(" 1) { huft_free(tl); free(l); DEBG(">"); return i; } /* decompress until an end-of-block code */ if (inflate_codes(tl, td, bl, bd)) { free(l); return 1; } /* free the decoding tables, return */ free(l); huft_free(tl); huft_free(td); return 0; } /* * We use `noinline' here to prevent gcc-3.5 from using too much stack space */ STATIC int noinline INIT inflate_dynamic(void) /* decompress an inflated type 2 (dynamic Huffman codes) block. 
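   (Added note: the block body begins with three counts read 5 + 5 + 4 bits at
   a time: nl = 257 + HLIT literal/length codes, nd = 1 + HDIST distance codes
   and nb = 4 + HCLEN bit-length codes.  The bit-length-code lengths then
   arrive in the permuted 'border' order defined near the top of this file,
   which is what the first NEEDBITS/DUMPBITS sequence in this function parses
   before the literal/length and distance trees can be built.)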
*/ { int i; /* temporary variables */ unsigned j; unsigned l; /* last length */ unsigned m; /* mask for bit lengths table */ unsigned n; /* number of lengths to get */ struct huft *tl; /* literal/length code table */ struct huft *td; /* distance code table */ int bl; /* lookup bits for tl */ int bd; /* lookup bits for td */ unsigned nb; /* number of bit length codes */ unsigned nl; /* number of literal/length codes */ unsigned nd; /* number of distance codes */ unsigned *ll; /* literal/length and distance code lengths */ register ulg b; /* bit buffer */ register unsigned k; /* number of bits in bit buffer */ int ret; DEBG(" 288 || nd > 32) #else if (nl > 286 || nd > 30) #endif { ret = 1; /* bad lengths */ goto out; } DEBG("dyn1 "); /* read in bit-length-code lengths */ for (j = 0; j < nb; j++) { NEEDBITS(3) ll[border[j]] = (unsigned)b & 7; DUMPBITS(3) } for (; j < 19; j++) ll[border[j]] = 0; DEBG("dyn2 "); /* build decoding table for trees--single level, 7 bit lookup */ bl = 7; if ((i = huft_build(ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) { if (i == 1) huft_free(tl); ret = i; /* incomplete code set */ goto out; } DEBG("dyn3 "); /* read in literal and distance code lengths */ n = nl + nd; m = mask_bits[bl]; i = l = 0; while ((unsigned)i < n) { NEEDBITS((unsigned)bl) j = (td = tl + ((unsigned)b & m))->b; DUMPBITS(j) j = td->v.n; if (j < 16) /* length of code in bits (0..15) */ ll[i++] = l = j; /* save last length in l */ else if (j == 16) /* repeat last length 3 to 6 times */ { NEEDBITS(2) j = 3 + ((unsigned)b & 3); DUMPBITS(2) if ((unsigned)i + j > n) { ret = 1; goto out; } while (j--) ll[i++] = l; } else if (j == 17) /* 3 to 10 zero length codes */ { NEEDBITS(3) j = 3 + ((unsigned)b & 7); DUMPBITS(3) if ((unsigned)i + j > n) { ret = 1; goto out; } while (j--) ll[i++] = 0; l = 0; } else /* j == 18: 11 to 138 zero length codes */ { NEEDBITS(7) j = 11 + ((unsigned)b & 0x7f); DUMPBITS(7) if ((unsigned)i + j > n) { ret = 1; goto out; } while (j--) ll[i++] = 0; l = 0; } } DEBG("dyn4 "); /* free decoding table for trees */ huft_free(tl); DEBG("dyn5 "); /* restore the global bit buffer */ bb = b; bk = k; DEBG("dyn5a "); /* build the decoding tables for literal/length and distance codes */ bl = lbits; if ((i = huft_build(ll, nl, 257, cplens, cplext, &tl, &bl)) != 0) { DEBG("dyn5b "); if (i == 1) { error("incomplete literal tree"); huft_free(tl); } ret = i; /* incomplete code set */ goto out; } DEBG("dyn5c "); bd = dbits; if ((i = huft_build(ll + nl, nd, 0, cpdist, cpdext, &td, &bd)) != 0) { DEBG("dyn5d "); if (i == 1) { error("incomplete distance tree"); #ifdef PKZIP_BUG_WORKAROUND i = 0; } #else huft_free(td); } huft_free(tl); ret = i; /* incomplete code set */ goto out; #endif } DEBG("dyn6 "); /* decompress until an end-of-block code */ if (inflate_codes(tl, td, bl, bd)) { ret = 1; goto out; } DEBG("dyn7 "); /* free the decoding tables, return */ huft_free(tl); huft_free(td); DEBG(">"); ret = 0; out: free(ll); return ret; underrun: ret = 4; /* Input underrun */ goto out; } STATIC int INIT inflate_block( int *e /* last block flag */ ) /* decompress an inflated block */ { unsigned t; /* block type */ register ulg b; /* bit buffer */ register unsigned k; /* number of bits in bit buffer */ DEBG(""); /* bad block type */ return 2; underrun: return 4; /* Input underrun */ } STATIC int INIT inflate(void) /* decompress an inflated entry */ { int e; /* last block flag */ int r; /* result code */ unsigned h; /* maximum struct huft's malloc'ed */ /* initialize window, bit buffer */ wp = 0; bk = 0; bb = 0; /* 
decompress until the last block */ h = 0; do { hufts = 0; #ifdef ARCH_HAS_DECOMP_WDOG arch_decomp_wdog(); #endif r = inflate_block(&e); if (r) return r; if (hufts > h) h = hufts; } while (!e); /* Undo too much lookahead. The next read will be byte aligned so we * can discard unused bits in the last meaningful byte. */ while (bk >= 8) { bk -= 8; inptr--; } /* flush out slide */ flush_output(wp); /* return success */ #ifdef DEBUG fprintf(stderr, "<%u> ", h); #endif /* DEBUG */ return 0; } /********************************************************************** * * The following are support routines for inflate.c * **********************************************************************/ static ulg INITDATA crc_32_tab[256]; static ulg INITDATA crc; /* initialized in makecrc() so it'll reside in bss */ #define CRC_VALUE (crc ^ 0xffffffffUL) /* * Code to compute the CRC-32 table. Borrowed from * gzip-1.0.3/makecrc.c. */ static void INIT makecrc(void) { /* Not copyrighted 1990 Mark Adler */ unsigned long c; /* crc shift register */ unsigned long e; /* polynomial exclusive-or pattern */ int i; /* counter for all possible eight bit values */ int k; /* byte being shifted into crc apparatus */ /* terms of polynomial defining this crc (except x^32): */ static const int p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26}; /* Make exclusive-or pattern from polynomial */ e = 0; for (i = 0; i < sizeof(p)/sizeof(int); i++) e |= 1L << (31 - p[i]); crc_32_tab[0] = 0; for (i = 1; i < 256; i++) { c = 0; for (k = i | 256; k != 1; k >>= 1) { c = c & 1 ? (c >> 1) ^ e : c >> 1; if (k & 1) c ^= e; } crc_32_tab[i] = c; } /* this is initialized here so this code could reside in ROM */ crc = (ulg)0xffffffffUL; /* shift register contents */ } /* gzip flag byte */ #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ #define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ #define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ #define ORIG_NAME 0x08 /* bit 3 set: original file name present */ #define COMMENT 0x10 /* bit 4 set: file comment present */ #define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ #define RESERVED 0xC0 /* bit 6,7: reserved */ /* * Do the uncompression! 
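   (Added note: gunzip() below walks the RFC 1952 member layout in order: two
   magic bytes and a method byte, one flag byte, four modification-time bytes,
   the XFL and OS bytes, the optional EXTRA, NAME and COMMENT fields selected
   by the flag bits defined above, then the deflate stream itself, and finally
   the stored CRC32 and uncompressed length which are checked against the
   values computed during decompression.)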
*/ static int INIT gunzip(void) { uch flags; unsigned char magic[2]; /* magic header */ char method; ulg orig_crc = 0; /* original crc */ ulg orig_len = 0; /* original uncompressed length */ int res; magic[0] = NEXTBYTE(); magic[1] = NEXTBYTE(); method = NEXTBYTE(); if (magic[0] != 037 || ((magic[1] != 0213) && (magic[1] != 0236))) { error("bad gzip magic numbers"); return -1; } /* We only support method #8, DEFLATED */ if (method != 8) { error("internal error, invalid method"); return -1; } flags = (uch)get_byte(); if ((flags & ENCRYPTED) != 0) { error("Input is encrypted"); return -1; } if ((flags & CONTINUATION) != 0) { error("Multi part input"); return -1; } if ((flags & RESERVED) != 0) { error("Input has invalid flags"); return -1; } NEXTBYTE(); /* Get timestamp */ NEXTBYTE(); NEXTBYTE(); NEXTBYTE(); (void)NEXTBYTE(); /* Ignore extra flags for the moment */ (void)NEXTBYTE(); /* Ignore OS type for the moment */ if ((flags & EXTRA_FIELD) != 0) { unsigned len = (unsigned)NEXTBYTE(); len |= ((unsigned)NEXTBYTE())<<8; while (len--) (void)NEXTBYTE(); } /* Get original file name if it was truncated */ if ((flags & ORIG_NAME) != 0) { /* Discard the old name */ while (NEXTBYTE() != 0) /* null */ ; } /* Discard file comment if any */ if ((flags & COMMENT) != 0) { while (NEXTBYTE() != 0) /* null */ ; } /* Decompress */ if ((res = inflate())) { switch (res) { case 0: break; case 1: error("invalid compressed format (err=1)"); break; case 2: error("invalid compressed format (err=2)"); break; case 3: error("out of memory"); break; case 4: error("out of input data"); break; default: error("invalid compressed format (other)"); } return -1; } /* Get the crc and original length */ /* crc32 (see algorithm.doc) * uncompressed input size modulo 2^32 */ orig_crc = (ulg) NEXTBYTE(); orig_crc |= (ulg) NEXTBYTE() << 8; orig_crc |= (ulg) NEXTBYTE() << 16; orig_crc |= (ulg) NEXTBYTE() << 24; orig_len = (ulg) NEXTBYTE(); orig_len |= (ulg) NEXTBYTE() << 8; orig_len |= (ulg) NEXTBYTE() << 16; orig_len |= (ulg) NEXTBYTE() << 24; /* Validate decompression */ if (orig_crc != CRC_VALUE) { error("crc error"); return -1; } if (orig_len != bytes_out) { error("length error"); return -1; } return 0; underrun: /* NEXTBYTE() goto's here if needed */ error("out of input data"); return -1; } xen-4.4.0/xen/common/event_channel.c0000664000175000017500000010546212307313555015512 0ustar smbsmb/****************************************************************************** * event_channel.c * * Event notifications from VIRQs, PIRQs, and other domains. * * Copyright (c) 2003-2006, K A Fraser. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ERROR_EXIT(_errno) \ do { \ gdprintk(XENLOG_WARNING, \ "EVTCHNOP failure: error %d\n", \ (_errno)); \ rc = (_errno); \ goto out; \ } while ( 0 ) #define ERROR_EXIT_DOM(_errno, _dom) \ do { \ gdprintk(XENLOG_WARNING, \ "EVTCHNOP failure: domain %d, error %d\n", \ (_dom)->domain_id, (_errno)); \ rc = (_errno); \ goto out; \ } while ( 0 ) #define consumer_is_xen(e) (!!(e)->xen_consumer) /* * The function alloc_unbound_xen_event_channel() allows an arbitrary * notifier function to be specified. However, very few unique functions * are specified in practice, so to prevent bloating the evtchn structure * with a pointer, we stash them dynamically in a small lookup array which * can be indexed by a small integer. */ static xen_event_channel_notification_t xen_consumers[8]; /* Default notification action: wake up from wait_on_xen_event_channel(). */ static void default_xen_notification_fn(struct vcpu *v, unsigned int port) { /* Consumer needs notification only if blocked. */ if ( test_and_clear_bit(_VPF_blocked_in_xen, &v->pause_flags) ) vcpu_wake(v); } /* * Given a notification function, return the value to stash in * the evtchn->xen_consumer field. */ static uint8_t get_xen_consumer(xen_event_channel_notification_t fn) { unsigned int i; if ( fn == NULL ) fn = default_xen_notification_fn; for ( i = 0; i < ARRAY_SIZE(xen_consumers); i++ ) { if ( xen_consumers[i] == NULL ) xen_consumers[i] = fn; if ( xen_consumers[i] == fn ) break; } BUG_ON(i >= ARRAY_SIZE(xen_consumers)); return i+1; } /* Get the notification function for a given Xen-bound event channel. */ #define xen_notification_fn(e) (xen_consumers[(e)->xen_consumer-1]) static void evtchn_set_pending(struct vcpu *v, int port); static int virq_is_global(uint32_t virq) { int rc; ASSERT(virq < NR_VIRQS); switch ( virq ) { case VIRQ_TIMER: case VIRQ_DEBUG: case VIRQ_XENOPROF: rc = 0; break; case VIRQ_ARCH_0 ... 
VIRQ_ARCH_7: rc = arch_virq_is_global(virq); break; default: rc = 1; break; } return rc; } static struct evtchn *alloc_evtchn_bucket(struct domain *d, unsigned int port) { struct evtchn *chn; unsigned int i; chn = xzalloc_array(struct evtchn, EVTCHNS_PER_BUCKET); if ( !chn ) return NULL; for ( i = 0; i < EVTCHNS_PER_BUCKET; i++ ) { if ( xsm_alloc_security_evtchn(&chn[i]) ) { while ( i-- ) xsm_free_security_evtchn(&chn[i]); xfree(chn); return NULL; } chn[i].port = port + i; } return chn; } static void free_evtchn_bucket(struct domain *d, struct evtchn *bucket) { unsigned int i; if ( !bucket ) return; for ( i = 0; i < EVTCHNS_PER_BUCKET; i++ ) xsm_free_security_evtchn(bucket + i); xfree(bucket); } static int get_free_port(struct domain *d) { struct evtchn *chn; struct evtchn **grp; int port; if ( d->is_dying ) return -EINVAL; for ( port = 0; port_is_valid(d, port); port++ ) { if ( port > d->max_evtchn_port ) return -ENOSPC; if ( evtchn_from_port(d, port)->state == ECS_FREE ) return port; } if ( port == d->max_evtchns || port > d->max_evtchn_port ) return -ENOSPC; if ( !group_from_port(d, port) ) { grp = xzalloc_array(struct evtchn *, BUCKETS_PER_GROUP); if ( !grp ) return -ENOMEM; group_from_port(d, port) = grp; } chn = alloc_evtchn_bucket(d, port); if ( !chn ) return -ENOMEM; bucket_from_port(d, port) = chn; return port; } static long evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc) { struct evtchn *chn; struct domain *d; int port; domid_t dom = alloc->dom; long rc; d = rcu_lock_domain_by_any_id(dom); if ( d == NULL ) return -ESRCH; spin_lock(&d->event_lock); if ( (port = get_free_port(d)) < 0 ) ERROR_EXIT_DOM(port, d); chn = evtchn_from_port(d, port); rc = xsm_evtchn_unbound(XSM_TARGET, d, chn, alloc->remote_dom); if ( rc ) goto out; chn->state = ECS_UNBOUND; if ( (chn->u.unbound.remote_domid = alloc->remote_dom) == DOMID_SELF ) chn->u.unbound.remote_domid = current->domain->domain_id; evtchn_port_init(d, chn); alloc->port = port; out: spin_unlock(&d->event_lock); rcu_unlock_domain(d); return rc; } static long evtchn_bind_interdomain(evtchn_bind_interdomain_t *bind) { struct evtchn *lchn, *rchn; struct domain *ld = current->domain, *rd; int lport, rport = bind->remote_port; domid_t rdom = bind->remote_dom; long rc; if ( rdom == DOMID_SELF ) rdom = current->domain->domain_id; if ( (rd = rcu_lock_domain_by_id(rdom)) == NULL ) return -ESRCH; /* Avoid deadlock by first acquiring lock of domain with smaller id. */ if ( ld < rd ) { spin_lock(&ld->event_lock); spin_lock(&rd->event_lock); } else { if ( ld != rd ) spin_lock(&rd->event_lock); spin_lock(&ld->event_lock); } if ( (lport = get_free_port(ld)) < 0 ) ERROR_EXIT(lport); lchn = evtchn_from_port(ld, lport); if ( !port_is_valid(rd, rport) ) ERROR_EXIT_DOM(-EINVAL, rd); rchn = evtchn_from_port(rd, rport); if ( (rchn->state != ECS_UNBOUND) || (rchn->u.unbound.remote_domid != ld->domain_id) ) ERROR_EXIT_DOM(-EINVAL, rd); rc = xsm_evtchn_interdomain(XSM_HOOK, ld, lchn, rd, rchn); if ( rc ) goto out; lchn->u.interdomain.remote_dom = rd; lchn->u.interdomain.remote_port = (u16)rport; lchn->state = ECS_INTERDOMAIN; evtchn_port_init(ld, lchn); rchn->u.interdomain.remote_dom = ld; rchn->u.interdomain.remote_port = (u16)lport; rchn->state = ECS_INTERDOMAIN; /* * We may have lost notifications on the remote unbound port. Fix that up * here by conservatively always setting a notification on the local port. 
*/ evtchn_set_pending(ld->vcpu[lchn->notify_vcpu_id], lport); bind->local_port = lport; out: spin_unlock(&ld->event_lock); if ( ld != rd ) spin_unlock(&rd->event_lock); rcu_unlock_domain(rd); return rc; } static long evtchn_bind_virq(evtchn_bind_virq_t *bind) { struct evtchn *chn; struct vcpu *v; struct domain *d = current->domain; int port, virq = bind->virq, vcpu = bind->vcpu; long rc = 0; if ( (virq < 0) || (virq >= ARRAY_SIZE(v->virq_to_evtchn)) ) return -EINVAL; if ( virq_is_global(virq) && (vcpu != 0) ) return -EINVAL; if ( (vcpu < 0) || (vcpu >= d->max_vcpus) || ((v = d->vcpu[vcpu]) == NULL) ) return -ENOENT; spin_lock(&d->event_lock); if ( v->virq_to_evtchn[virq] != 0 ) ERROR_EXIT(-EEXIST); if ( (port = get_free_port(d)) < 0 ) ERROR_EXIT(port); chn = evtchn_from_port(d, port); chn->state = ECS_VIRQ; chn->notify_vcpu_id = vcpu; chn->u.virq = virq; evtchn_port_init(d, chn); v->virq_to_evtchn[virq] = bind->port = port; out: spin_unlock(&d->event_lock); return rc; } static long evtchn_bind_ipi(evtchn_bind_ipi_t *bind) { struct evtchn *chn; struct domain *d = current->domain; int port, vcpu = bind->vcpu; long rc = 0; if ( (vcpu < 0) || (vcpu >= d->max_vcpus) || (d->vcpu[vcpu] == NULL) ) return -ENOENT; spin_lock(&d->event_lock); if ( (port = get_free_port(d)) < 0 ) ERROR_EXIT(port); chn = evtchn_from_port(d, port); chn->state = ECS_IPI; chn->notify_vcpu_id = vcpu; evtchn_port_init(d, chn); bind->port = port; out: spin_unlock(&d->event_lock); return rc; } static void link_pirq_port(int port, struct evtchn *chn, struct vcpu *v) { chn->u.pirq.prev_port = 0; chn->u.pirq.next_port = v->pirq_evtchn_head; if ( v->pirq_evtchn_head ) evtchn_from_port(v->domain, v->pirq_evtchn_head) ->u.pirq.prev_port = port; v->pirq_evtchn_head = port; } static void unlink_pirq_port(struct evtchn *chn, struct vcpu *v) { struct domain *d = v->domain; if ( chn->u.pirq.prev_port ) evtchn_from_port(d, chn->u.pirq.prev_port)->u.pirq.next_port = chn->u.pirq.next_port; else v->pirq_evtchn_head = chn->u.pirq.next_port; if ( chn->u.pirq.next_port ) evtchn_from_port(d, chn->u.pirq.next_port)->u.pirq.prev_port = chn->u.pirq.prev_port; } static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind) { struct evtchn *chn; struct domain *d = current->domain; struct vcpu *v = d->vcpu[0]; struct pirq *info; int port, pirq = bind->pirq; long rc; if ( (pirq < 0) || (pirq >= d->nr_pirqs) ) return -EINVAL; if ( !is_hvm_domain(d) && !pirq_access_permitted(d, pirq) ) return -EPERM; spin_lock(&d->event_lock); if ( pirq_to_evtchn(d, pirq) != 0 ) ERROR_EXIT(-EEXIST); if ( (port = get_free_port(d)) < 0 ) ERROR_EXIT(port); chn = evtchn_from_port(d, port); info = pirq_get_info(d, pirq); if ( !info ) ERROR_EXIT(-ENOMEM); info->evtchn = port; rc = (!is_hvm_domain(d) ? pirq_guest_bind(v, info, !!(bind->flags & BIND_PIRQ__WILL_SHARE)) : 0); if ( rc != 0 ) { info->evtchn = 0; pirq_cleanup_check(info, d); goto out; } chn->state = ECS_PIRQ; chn->u.pirq.irq = pirq; link_pirq_port(port, chn, v); evtchn_port_init(d, chn); bind->port = port; #ifdef CONFIG_X86 if ( is_hvm_domain(d) && domain_pirq_to_irq(d, pirq) > 0 ) map_domain_emuirq_pirq(d, pirq, IRQ_PT); #endif out: spin_unlock(&d->event_lock); return rc; } static long __evtchn_close(struct domain *d1, int port1) { struct domain *d2 = NULL; struct vcpu *v; struct evtchn *chn1, *chn2; int port2; long rc = 0; again: spin_lock(&d1->event_lock); if ( !port_is_valid(d1, port1) ) { rc = -EINVAL; goto out; } chn1 = evtchn_from_port(d1, port1); /* Guest cannot close a Xen-attached event channel. 
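   (Added note: the ECS_INTERDOMAIN case further down may need the remote
   domain's event_lock as well.  To avoid an ABBA deadlock the code always
   takes the two locks in ascending domain-pointer order, dropping d1's lock
   and retrying via the 'again' label when d2 sorts first, mirroring the
   ordering used by evtchn_bind_interdomain() above.)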
*/ if ( unlikely(consumer_is_xen(chn1)) ) { rc = -EINVAL; goto out; } switch ( chn1->state ) { case ECS_FREE: case ECS_RESERVED: rc = -EINVAL; goto out; case ECS_UNBOUND: break; case ECS_PIRQ: { struct pirq *pirq = pirq_info(d1, chn1->u.pirq.irq); if ( !pirq ) break; if ( !is_hvm_domain(d1) ) pirq_guest_unbind(d1, pirq); pirq->evtchn = 0; pirq_cleanup_check(pirq, d1); unlink_pirq_port(chn1, d1->vcpu[chn1->notify_vcpu_id]); #ifdef CONFIG_X86 if ( is_hvm_domain(d1) && domain_pirq_to_irq(d1, pirq->pirq) > 0 ) unmap_domain_pirq_emuirq(d1, pirq->pirq); #endif break; } case ECS_VIRQ: for_each_vcpu ( d1, v ) { if ( v->virq_to_evtchn[chn1->u.virq] != port1 ) continue; v->virq_to_evtchn[chn1->u.virq] = 0; spin_barrier(&v->virq_lock); } break; case ECS_IPI: break; case ECS_INTERDOMAIN: if ( d2 == NULL ) { d2 = chn1->u.interdomain.remote_dom; /* If we unlock d1 then we could lose d2. Must get a reference. */ if ( unlikely(!get_domain(d2)) ) BUG(); if ( d1 < d2 ) { spin_lock(&d2->event_lock); } else if ( d1 != d2 ) { spin_unlock(&d1->event_lock); spin_lock(&d2->event_lock); goto again; } } else if ( d2 != chn1->u.interdomain.remote_dom ) { /* * We can only get here if the port was closed and re-bound after * unlocking d1 but before locking d2 above. We could retry but * it is easier to return the same error as if we had seen the * port in ECS_CLOSED. It must have passed through that state for * us to end up here, so it's a valid error to return. */ rc = -EINVAL; goto out; } port2 = chn1->u.interdomain.remote_port; BUG_ON(!port_is_valid(d2, port2)); chn2 = evtchn_from_port(d2, port2); BUG_ON(chn2->state != ECS_INTERDOMAIN); BUG_ON(chn2->u.interdomain.remote_dom != d1); chn2->state = ECS_UNBOUND; chn2->u.unbound.remote_domid = d1->domain_id; break; default: BUG(); } /* Clear pending event to avoid unexpected behavior on re-bind. */ evtchn_port_clear_pending(d1, chn1); /* Reset binding to vcpu0 when the channel is freed. */ chn1->state = ECS_FREE; chn1->notify_vcpu_id = 0; xsm_evtchn_close_post(chn1); out: if ( d2 != NULL ) { if ( d1 != d2 ) spin_unlock(&d2->event_lock); put_domain(d2); } spin_unlock(&d1->event_lock); return rc; } static long evtchn_close(evtchn_close_t *close) { return __evtchn_close(current->domain, close->port); } int evtchn_send(struct domain *d, unsigned int lport) { struct evtchn *lchn, *rchn; struct domain *ld = d, *rd; struct vcpu *rvcpu; int rport, ret = 0; spin_lock(&ld->event_lock); if ( unlikely(!port_is_valid(ld, lport)) ) { spin_unlock(&ld->event_lock); return -EINVAL; } lchn = evtchn_from_port(ld, lport); /* Guest cannot send via a Xen-attached event channel. 
*/ if ( unlikely(consumer_is_xen(lchn)) ) { spin_unlock(&ld->event_lock); return -EINVAL; } ret = xsm_evtchn_send(XSM_HOOK, ld, lchn); if ( ret ) goto out; switch ( lchn->state ) { case ECS_INTERDOMAIN: rd = lchn->u.interdomain.remote_dom; rport = lchn->u.interdomain.remote_port; rchn = evtchn_from_port(rd, rport); rvcpu = rd->vcpu[rchn->notify_vcpu_id]; if ( consumer_is_xen(rchn) ) (*xen_notification_fn(rchn))(rvcpu, rport); else evtchn_set_pending(rvcpu, rport); break; case ECS_IPI: evtchn_set_pending(ld->vcpu[lchn->notify_vcpu_id], lport); break; case ECS_UNBOUND: /* silently drop the notification */ break; default: ret = -EINVAL; } out: spin_unlock(&ld->event_lock); return ret; } static void evtchn_set_pending(struct vcpu *v, int port) { evtchn_port_set_pending(v, evtchn_from_port(v->domain, port)); } int guest_enabled_event(struct vcpu *v, uint32_t virq) { return ((v != NULL) && (v->virq_to_evtchn[virq] != 0)); } void send_guest_vcpu_virq(struct vcpu *v, uint32_t virq) { unsigned long flags; int port; ASSERT(!virq_is_global(virq)); spin_lock_irqsave(&v->virq_lock, flags); port = v->virq_to_evtchn[virq]; if ( unlikely(port == 0) ) goto out; evtchn_set_pending(v, port); out: spin_unlock_irqrestore(&v->virq_lock, flags); } static void send_guest_global_virq(struct domain *d, uint32_t virq) { unsigned long flags; int port; struct vcpu *v; struct evtchn *chn; ASSERT(virq_is_global(virq)); if ( unlikely(d == NULL) || unlikely(d->vcpu == NULL) ) return; v = d->vcpu[0]; if ( unlikely(v == NULL) ) return; spin_lock_irqsave(&v->virq_lock, flags); port = v->virq_to_evtchn[virq]; if ( unlikely(port == 0) ) goto out; chn = evtchn_from_port(d, port); evtchn_set_pending(d->vcpu[chn->notify_vcpu_id], port); out: spin_unlock_irqrestore(&v->virq_lock, flags); } void send_guest_pirq(struct domain *d, const struct pirq *pirq) { int port; struct evtchn *chn; /* * PV guests: It should not be possible to race with __evtchn_close(). The * caller of this function must synchronise with pirq_guest_unbind(). * HVM guests: Port is legitimately zero when the guest disables the * emulated interrupt/evtchn. 
*/ if ( pirq == NULL || (port = pirq->evtchn) == 0 ) { BUG_ON(!is_hvm_domain(d)); return; } chn = evtchn_from_port(d, port); evtchn_set_pending(d->vcpu[chn->notify_vcpu_id], port); } static struct domain *global_virq_handlers[NR_VIRQS] __read_mostly; static DEFINE_SPINLOCK(global_virq_handlers_lock); void send_global_virq(uint32_t virq) { ASSERT(virq < NR_VIRQS); ASSERT(virq_is_global(virq)); send_guest_global_virq(global_virq_handlers[virq] ?: dom0, virq); } int set_global_virq_handler(struct domain *d, uint32_t virq) { struct domain *old; if (virq >= NR_VIRQS) return -EINVAL; if (!virq_is_global(virq)) return -EINVAL; if (global_virq_handlers[virq] == d) return 0; if (unlikely(!get_domain(d))) return -EINVAL; spin_lock(&global_virq_handlers_lock); old = global_virq_handlers[virq]; global_virq_handlers[virq] = d; spin_unlock(&global_virq_handlers_lock); if (old != NULL) put_domain(old); return 0; } static void clear_global_virq_handlers(struct domain *d) { uint32_t virq; int put_count = 0; spin_lock(&global_virq_handlers_lock); for (virq = 0; virq < NR_VIRQS; virq++) { if (global_virq_handlers[virq] == d) { global_virq_handlers[virq] = NULL; put_count++; } } spin_unlock(&global_virq_handlers_lock); while (put_count) { put_domain(d); put_count--; } } static long evtchn_status(evtchn_status_t *status) { struct domain *d; domid_t dom = status->dom; int port = status->port; struct evtchn *chn; long rc = 0; d = rcu_lock_domain_by_any_id(dom); if ( d == NULL ) return -ESRCH; spin_lock(&d->event_lock); if ( !port_is_valid(d, port) ) { rc = -EINVAL; goto out; } chn = evtchn_from_port(d, port); rc = xsm_evtchn_status(XSM_TARGET, d, chn); if ( rc ) goto out; switch ( chn->state ) { case ECS_FREE: case ECS_RESERVED: status->status = EVTCHNSTAT_closed; break; case ECS_UNBOUND: status->status = EVTCHNSTAT_unbound; status->u.unbound.dom = chn->u.unbound.remote_domid; break; case ECS_INTERDOMAIN: status->status = EVTCHNSTAT_interdomain; status->u.interdomain.dom = chn->u.interdomain.remote_dom->domain_id; status->u.interdomain.port = chn->u.interdomain.remote_port; break; case ECS_PIRQ: status->status = EVTCHNSTAT_pirq; status->u.pirq = chn->u.pirq.irq; break; case ECS_VIRQ: status->status = EVTCHNSTAT_virq; status->u.virq = chn->u.virq; break; case ECS_IPI: status->status = EVTCHNSTAT_ipi; break; default: BUG(); } status->vcpu = chn->notify_vcpu_id; out: spin_unlock(&d->event_lock); rcu_unlock_domain(d); return rc; } long evtchn_bind_vcpu(unsigned int port, unsigned int vcpu_id) { struct domain *d = current->domain; struct evtchn *chn; long rc = 0; if ( (vcpu_id >= d->max_vcpus) || (d->vcpu[vcpu_id] == NULL) ) return -ENOENT; spin_lock(&d->event_lock); if ( !port_is_valid(d, port) ) { rc = -EINVAL; goto out; } chn = evtchn_from_port(d, port); /* Guest cannot re-bind a Xen-attached event channel. 
*/ if ( unlikely(consumer_is_xen(chn)) ) { rc = -EINVAL; goto out; } switch ( chn->state ) { case ECS_VIRQ: if ( virq_is_global(chn->u.virq) ) chn->notify_vcpu_id = vcpu_id; else rc = -EINVAL; break; case ECS_UNBOUND: case ECS_INTERDOMAIN: chn->notify_vcpu_id = vcpu_id; break; case ECS_PIRQ: if ( chn->notify_vcpu_id == vcpu_id ) break; unlink_pirq_port(chn, d->vcpu[chn->notify_vcpu_id]); chn->notify_vcpu_id = vcpu_id; pirq_set_affinity(d, chn->u.pirq.irq, cpumask_of(d->vcpu[vcpu_id]->processor)); link_pirq_port(port, chn, d->vcpu[vcpu_id]); break; default: rc = -EINVAL; break; } out: spin_unlock(&d->event_lock); return rc; } int evtchn_unmask(unsigned int port) { struct domain *d = current->domain; struct evtchn *evtchn; ASSERT(spin_is_locked(&d->event_lock)); if ( unlikely(!port_is_valid(d, port)) ) return -EINVAL; evtchn = evtchn_from_port(d, port); evtchn_port_unmask(d, evtchn); return 0; } static long evtchn_reset(evtchn_reset_t *r) { domid_t dom = r->dom; struct domain *d; int i, rc; d = rcu_lock_domain_by_any_id(dom); if ( d == NULL ) return -ESRCH; rc = xsm_evtchn_reset(XSM_TARGET, current->domain, d); if ( rc ) goto out; for ( i = 0; port_is_valid(d, i); i++ ) (void)__evtchn_close(d, i); rc = 0; out: rcu_unlock_domain(d); return rc; } static long evtchn_set_priority(const struct evtchn_set_priority *set_priority) { struct domain *d = current->domain; unsigned int port = set_priority->port; long ret; spin_lock(&d->event_lock); if ( !port_is_valid(d, port) ) { spin_unlock(&d->event_lock); return -EINVAL; } ret = evtchn_port_set_priority(d, evtchn_from_port(d, port), set_priority->priority); spin_unlock(&d->event_lock); return ret; } long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { long rc; switch ( cmd ) { case EVTCHNOP_alloc_unbound: { struct evtchn_alloc_unbound alloc_unbound; if ( copy_from_guest(&alloc_unbound, arg, 1) != 0 ) return -EFAULT; rc = evtchn_alloc_unbound(&alloc_unbound); if ( !rc && __copy_to_guest(arg, &alloc_unbound, 1) ) rc = -EFAULT; /* Cleaning up here would be a mess! */ break; } case EVTCHNOP_bind_interdomain: { struct evtchn_bind_interdomain bind_interdomain; if ( copy_from_guest(&bind_interdomain, arg, 1) != 0 ) return -EFAULT; rc = evtchn_bind_interdomain(&bind_interdomain); if ( !rc && __copy_to_guest(arg, &bind_interdomain, 1) ) rc = -EFAULT; /* Cleaning up here would be a mess! */ break; } case EVTCHNOP_bind_virq: { struct evtchn_bind_virq bind_virq; if ( copy_from_guest(&bind_virq, arg, 1) != 0 ) return -EFAULT; rc = evtchn_bind_virq(&bind_virq); if ( !rc && __copy_to_guest(arg, &bind_virq, 1) ) rc = -EFAULT; /* Cleaning up here would be a mess! */ break; } case EVTCHNOP_bind_ipi: { struct evtchn_bind_ipi bind_ipi; if ( copy_from_guest(&bind_ipi, arg, 1) != 0 ) return -EFAULT; rc = evtchn_bind_ipi(&bind_ipi); if ( !rc && __copy_to_guest(arg, &bind_ipi, 1) ) rc = -EFAULT; /* Cleaning up here would be a mess! */ break; } case EVTCHNOP_bind_pirq: { struct evtchn_bind_pirq bind_pirq; if ( copy_from_guest(&bind_pirq, arg, 1) != 0 ) return -EFAULT; rc = evtchn_bind_pirq(&bind_pirq); if ( !rc && __copy_to_guest(arg, &bind_pirq, 1) ) rc = -EFAULT; /* Cleaning up here would be a mess! 
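   (Added illustration, not part of the original file: guests reach these
   sub-ops through the event_channel_op hypercall.  A frontend wanting an
   unbound port for its backend would issue roughly

       struct evtchn_alloc_unbound op = {
           .dom        = DOMID_SELF,
           .remote_dom = backend_domid,     hypothetical backend domain id
       };
       if ( HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &op) == 0 )
           local_port = op.port;            port allocated by the hypervisor

   where HYPERVISOR_event_channel_op is the guest-side wrapper (not part of
   this file) and backend_domid/local_port are placeholder names; the call
   lands in evtchn_alloc_unbound() earlier in this file.)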
*/ break; } case EVTCHNOP_close: { struct evtchn_close close; if ( copy_from_guest(&close, arg, 1) != 0 ) return -EFAULT; rc = evtchn_close(&close); break; } case EVTCHNOP_send: { struct evtchn_send send; if ( copy_from_guest(&send, arg, 1) != 0 ) return -EFAULT; rc = evtchn_send(current->domain, send.port); break; } case EVTCHNOP_status: { struct evtchn_status status; if ( copy_from_guest(&status, arg, 1) != 0 ) return -EFAULT; rc = evtchn_status(&status); if ( !rc && __copy_to_guest(arg, &status, 1) ) rc = -EFAULT; break; } case EVTCHNOP_bind_vcpu: { struct evtchn_bind_vcpu bind_vcpu; if ( copy_from_guest(&bind_vcpu, arg, 1) != 0 ) return -EFAULT; rc = evtchn_bind_vcpu(bind_vcpu.port, bind_vcpu.vcpu); break; } case EVTCHNOP_unmask: { struct evtchn_unmask unmask; if ( copy_from_guest(&unmask, arg, 1) != 0 ) return -EFAULT; spin_lock(¤t->domain->event_lock); rc = evtchn_unmask(unmask.port); spin_unlock(¤t->domain->event_lock); break; } case EVTCHNOP_reset: { struct evtchn_reset reset; if ( copy_from_guest(&reset, arg, 1) != 0 ) return -EFAULT; rc = evtchn_reset(&reset); break; } case EVTCHNOP_init_control: { struct evtchn_init_control init_control; if ( copy_from_guest(&init_control, arg, 1) != 0 ) return -EFAULT; rc = evtchn_fifo_init_control(&init_control); if ( !rc && __copy_to_guest(arg, &init_control, 1) ) rc = -EFAULT; break; } case EVTCHNOP_expand_array: { struct evtchn_expand_array expand_array; if ( copy_from_guest(&expand_array, arg, 1) != 0 ) return -EFAULT; rc = evtchn_fifo_expand_array(&expand_array); break; } case EVTCHNOP_set_priority: { struct evtchn_set_priority set_priority; if ( copy_from_guest(&set_priority, arg, 1) != 0 ) return -EFAULT; rc = evtchn_set_priority(&set_priority); break; } default: rc = -ENOSYS; break; } return rc; } int alloc_unbound_xen_event_channel( struct vcpu *local_vcpu, domid_t remote_domid, xen_event_channel_notification_t notification_fn) { struct evtchn *chn; struct domain *d = local_vcpu->domain; int port, rc; spin_lock(&d->event_lock); if ( (port = get_free_port(d)) < 0 ) goto out; chn = evtchn_from_port(d, port); rc = xsm_evtchn_unbound(XSM_TARGET, d, chn, remote_domid); chn->state = ECS_UNBOUND; chn->xen_consumer = get_xen_consumer(notification_fn); chn->notify_vcpu_id = local_vcpu->vcpu_id; chn->u.unbound.remote_domid = !rc ? remote_domid : DOMID_INVALID; out: spin_unlock(&d->event_lock); return port; } void free_xen_event_channel( struct vcpu *local_vcpu, int port) { struct evtchn *chn; struct domain *d = local_vcpu->domain; spin_lock(&d->event_lock); if ( unlikely(d->is_dying) ) { spin_unlock(&d->event_lock); return; } BUG_ON(!port_is_valid(d, port)); chn = evtchn_from_port(d, port); BUG_ON(!consumer_is_xen(chn)); chn->xen_consumer = 0; spin_unlock(&d->event_lock); (void)__evtchn_close(d, port); } void notify_via_xen_event_channel(struct domain *ld, int lport) { struct evtchn *lchn, *rchn; struct domain *rd; int rport; spin_lock(&ld->event_lock); if ( unlikely(ld->is_dying) ) { spin_unlock(&ld->event_lock); return; } ASSERT(port_is_valid(ld, lport)); lchn = evtchn_from_port(ld, lport); ASSERT(consumer_is_xen(lchn)); if ( likely(lchn->state == ECS_INTERDOMAIN) ) { rd = lchn->u.interdomain.remote_dom; rport = lchn->u.interdomain.remote_port; rchn = evtchn_from_port(rd, rport); evtchn_set_pending(rd->vcpu[rchn->notify_vcpu_id], rport); } spin_unlock(&ld->event_lock); } void evtchn_check_pollers(struct domain *d, unsigned int port) { struct vcpu *v; unsigned int vcpuid; /* Check if some VCPU might be polling for this event. 
*/ if ( likely(bitmap_empty(d->poll_mask, d->max_vcpus)) ) return; /* Wake any interested (or potentially interested) pollers. */ for ( vcpuid = find_first_bit(d->poll_mask, d->max_vcpus); vcpuid < d->max_vcpus; vcpuid = find_next_bit(d->poll_mask, d->max_vcpus, vcpuid+1) ) { v = d->vcpu[vcpuid]; if ( ((v->poll_evtchn <= 0) || (v->poll_evtchn == port)) && test_and_clear_bit(vcpuid, d->poll_mask) ) { v->poll_evtchn = 0; vcpu_unblock(v); } } } int evtchn_init(struct domain *d) { evtchn_2l_init(d); d->max_evtchn_port = INT_MAX; d->evtchn = alloc_evtchn_bucket(d, 0); if ( !d->evtchn ) return -ENOMEM; spin_lock_init(&d->event_lock); if ( get_free_port(d) != 0 ) { free_evtchn_bucket(d, d->evtchn); return -EINVAL; } evtchn_from_port(d, 0)->state = ECS_RESERVED; #if MAX_VIRT_CPUS > BITS_PER_LONG d->poll_mask = xmalloc_array(unsigned long, BITS_TO_LONGS(MAX_VIRT_CPUS)); if ( !d->poll_mask ) { free_evtchn_bucket(d, d->evtchn); return -ENOMEM; } bitmap_zero(d->poll_mask, MAX_VIRT_CPUS); #endif return 0; } void evtchn_destroy(struct domain *d) { unsigned int i, j; /* After this barrier no new event-channel allocations can occur. */ BUG_ON(!d->is_dying); spin_barrier(&d->event_lock); /* Close all existing event channels. */ for ( i = 0; port_is_valid(d, i); i++ ) { evtchn_from_port(d, i)->xen_consumer = 0; (void)__evtchn_close(d, i); } /* Free all event-channel buckets. */ spin_lock(&d->event_lock); for ( i = 0; i < NR_EVTCHN_GROUPS; i++ ) { if ( !d->evtchn_group[i] ) continue; for ( j = 0; j < BUCKETS_PER_GROUP; j++ ) free_evtchn_bucket(d, d->evtchn_group[i][j]); xfree(d->evtchn_group[i]); d->evtchn_group[i] = NULL; } free_evtchn_bucket(d, d->evtchn); d->evtchn = NULL; spin_unlock(&d->event_lock); clear_global_virq_handlers(d); evtchn_fifo_destroy(d); } void evtchn_destroy_final(struct domain *d) { #if MAX_VIRT_CPUS > BITS_PER_LONG xfree(d->poll_mask); d->poll_mask = NULL; #endif } void evtchn_move_pirqs(struct vcpu *v) { struct domain *d = v->domain; const cpumask_t *mask = cpumask_of(v->processor); unsigned int port; struct evtchn *chn; spin_lock(&d->event_lock); for ( port = v->pirq_evtchn_head; port; port = chn->u.pirq.next_port ) { chn = evtchn_from_port(d, port); pirq_set_affinity(d, chn->u.pirq.irq, mask); } spin_unlock(&d->event_lock); } static void domain_dump_evtchn_info(struct domain *d) { unsigned int port; int irq; bitmap_scnlistprintf(keyhandler_scratch, sizeof(keyhandler_scratch), d->poll_mask, d->max_vcpus); printk("Event channel information for domain %d:\n" "Polling vCPUs: {%s}\n" " port [p/m/s]\n", d->domain_id, keyhandler_scratch); spin_lock(&d->event_lock); for ( port = 1; port < d->max_evtchns; ++port ) { const struct evtchn *chn; char *ssid; if ( !port_is_valid(d, port) ) continue; chn = evtchn_from_port(d, port); if ( chn->state == ECS_FREE ) continue; printk(" %4u [%d/%d/", port, !!evtchn_port_is_pending(d, chn), !!evtchn_port_is_masked(d, chn)); evtchn_port_print_state(d, chn); printk("]: s=%d n=%d x=%d", chn->state, chn->notify_vcpu_id, chn->xen_consumer); switch ( chn->state ) { case ECS_UNBOUND: printk(" d=%d", chn->u.unbound.remote_domid); break; case ECS_INTERDOMAIN: printk(" d=%d p=%d", chn->u.interdomain.remote_dom->domain_id, chn->u.interdomain.remote_port); break; case ECS_PIRQ: irq = domain_pirq_to_irq(d, chn->u.pirq.irq); printk(" p=%d i=%d", chn->u.pirq.irq, irq); break; case ECS_VIRQ: printk(" v=%d", chn->u.virq); break; } ssid = xsm_show_security_evtchn(d, chn); if (ssid) { printk(" Z=%s\n", ssid); xfree(ssid); } else { printk("\n"); } } 
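    /*
     * (Added note: each line printed above has the form
     *  "port [pending/masked/per-ABI state]: s=<channel state> n=<notify vcpu>
     *  x=<xen consumer index>", followed by the state-specific fields from the
     *  switch statement: the remote domain for unbound channels, remote domain
     *  and port for interdomain channels, pirq and irq numbers, or the virq
     *  number.)
     */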
spin_unlock(&d->event_lock); } static void dump_evtchn_info(unsigned char key) { struct domain *d; printk("'%c' pressed -> dumping event-channel info\n", key); rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) domain_dump_evtchn_info(d); rcu_read_unlock(&domlist_read_lock); } static struct keyhandler dump_evtchn_info_keyhandler = { .diagnostic = 1, .u.fn = dump_evtchn_info, .desc = "dump evtchn info" }; static int __init dump_evtchn_info_key_init(void) { register_keyhandler('e', &dump_evtchn_info_keyhandler); return 0; } __initcall(dump_evtchn_info_key_init); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/libelf/0000775000175000017500000000000012307313555013762 5ustar smbsmbxen-4.4.0/xen/common/libelf/Makefile0000664000175000017500000000050312307313555015420 0ustar smbsmbobj-bin-y := libelf.o SECTIONS := text data $(SPECIAL_DATA_SECTIONS) CFLAGS += -Wno-pointer-sign libelf.o: libelf-temp.o Makefile $(OBJCOPY) $(foreach s,$(SECTIONS),--rename-section .$(s)=.init.$(s)) $< $@ libelf-temp.o: libelf-tools.o libelf-loader.o libelf-dominfo.o #libelf-relocate.o $(LD) $(LDFLAGS) -r -o $@ $^ xen-4.4.0/xen/common/libelf/libelf-loader.c0000664000175000017500000003003712307313555016632 0ustar smbsmb/* * parse and load elf binaries * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 2.1 of the License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #ifdef __XEN__ #include #endif #include "libelf-private.h" /* ------------------------------------------------------------------------ */ elf_errorstatus elf_init(struct elf_binary *elf, const char *image_input, size_t size) { ELF_HANDLE_DECL(elf_shdr) shdr; uint64_t i, count, section, offset; if ( !elf_is_elfbinary(image_input, size) ) { elf_err(elf, "%s: not an ELF binary\n", __FUNCTION__); return -1; } elf_memset_unchecked(elf, 0, sizeof(*elf)); elf->image_base = image_input; elf->size = size; elf->ehdr = ELF_MAKE_HANDLE(elf_ehdr, (elf_ptrval)image_input); elf->class = elf_uval_3264(elf, elf->ehdr, e32.e_ident[EI_CLASS]); elf->data = elf_uval_3264(elf, elf->ehdr, e32.e_ident[EI_DATA]); elf->caller_xdest_base = NULL; elf->caller_xdest_size = 0; /* Sanity check phdr. */ offset = elf_uval(elf, elf->ehdr, e_phoff) + elf_uval(elf, elf->ehdr, e_phentsize) * elf_phdr_count(elf); if ( offset > elf->size ) { elf_err(elf, "%s: phdr overflow (off %" PRIx64 " > size %lx)\n", __FUNCTION__, offset, (unsigned long)elf->size); return -1; } /* Sanity check shdr. */ offset = elf_uval(elf, elf->ehdr, e_shoff) + elf_uval(elf, elf->ehdr, e_shentsize) * elf_shdr_count(elf); if ( offset > elf->size ) { elf_err(elf, "%s: shdr overflow (off %" PRIx64 " > size %lx)\n", __FUNCTION__, offset, (unsigned long)elf->size); return -1; } /* Find section string table. 
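   (Added note: e_shstrndx is the index of the section that holds all section
   name strings; keeping a pointer to its start lets later lookups turn a
   header's sh_name offset into a printable name without re-walking the
   section table.)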
*/ section = elf_uval(elf, elf->ehdr, e_shstrndx); shdr = elf_shdr_by_index(elf, section); if ( ELF_HANDLE_VALID(shdr) ) elf->sec_strtab = elf_section_start(elf, shdr); /* Find symbol table and symbol string table. */ count = elf_shdr_count(elf); for ( i = 0; i < count; i++ ) { shdr = elf_shdr_by_index(elf, i); if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(shdr), 1) ) /* input has an insane section header count field */ break; if ( elf_uval(elf, shdr, sh_type) != SHT_SYMTAB ) continue; elf->sym_tab = shdr; shdr = elf_shdr_by_index(elf, elf_uval(elf, shdr, sh_link)); if ( !ELF_HANDLE_VALID(shdr) ) { elf->sym_tab = ELF_INVALID_HANDLE(elf_shdr); continue; } elf->sym_strtab = elf_section_start(elf, shdr); break; } return 0; } #ifndef __XEN__ void elf_call_log_callback(struct elf_binary *elf, bool iserr, const char *fmt,...) { va_list al; if (!elf->log_callback) return; if (!(iserr || elf->verbose)) return; va_start(al,fmt); elf->log_callback(elf, elf->log_caller_data, iserr, fmt, al); va_end(al); } void elf_set_log(struct elf_binary *elf, elf_log_callback *log_callback, void *log_caller_data, bool verbose) { elf->log_callback = log_callback; elf->log_caller_data = log_caller_data; elf->verbose = verbose; } static elf_errorstatus elf_load_image(struct elf_binary *elf, elf_ptrval dst, elf_ptrval src, uint64_t filesz, uint64_t memsz) { elf_memcpy_safe(elf, dst, src, filesz); elf_memset_safe(elf, dst + filesz, 0, memsz - filesz); return 0; } #else void elf_set_verbose(struct elf_binary *elf) { elf->verbose = 1; } static elf_errorstatus elf_load_image(struct elf_binary *elf, elf_ptrval dst, elf_ptrval src, uint64_t filesz, uint64_t memsz) { elf_errorstatus rc; if ( filesz > ULONG_MAX || memsz > ULONG_MAX ) return -1; /* We trust the dom0 kernel image completely, so we don't care * about overruns etc. here. */ rc = raw_copy_to_guest(ELF_UNSAFE_PTR(dst), ELF_UNSAFE_PTR(src), filesz); if ( rc != 0 ) return -1; rc = raw_clear_guest(ELF_UNSAFE_PTR(dst + filesz), memsz - filesz); if ( rc != 0 ) return -1; return 0; } #endif /* Calculate the required additional kernel space for the elf image */ void elf_parse_bsdsyms(struct elf_binary *elf, uint64_t pstart) { uint64_t sz; ELF_HANDLE_DECL(elf_shdr) shdr; unsigned i, type; if ( !ELF_HANDLE_VALID(elf->sym_tab) ) return; pstart = elf_round_up(elf, pstart); /* Space to store the size of the elf image */ sz = sizeof(uint32_t); /* Space for the elf and elf section headers */ sz += (elf_uval(elf, elf->ehdr, e_ehsize) + elf_shdr_count(elf) * elf_uval(elf, elf->ehdr, e_shentsize)); sz = elf_round_up(elf, sz); /* Space for the symbol and string tables. 
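Both elf_load_image() variants above implement the same contract: copy p_filesz bytes of segment payload, then zero the remaining p_memsz - p_filesz bytes (the would-be BSS). A user-space sketch of that contract with a plain destination buffer instead of guest memory; the filesz > memsz guard is an extra precaution for stand-alone use, not something libelf does at this point:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copy 'filesz' bytes of segment payload and clear the rest of the
 * in-memory footprint, as elf_load_image() does for each loadable phdr. */
static int load_segment(uint8_t *dst, const uint8_t *src,
                        uint64_t filesz, uint64_t memsz)
{
    if ( filesz > memsz )
        return -1;                  /* malformed header */
    memcpy(dst, src, filesz);
    memset(dst + filesz, 0, memsz - filesz);
    return 0;
}

int main(void)
{
    const uint8_t file_data[4] = { 1, 2, 3, 4 };
    uint8_t mem[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

    if ( load_segment(mem, file_data, sizeof(file_data), sizeof(mem)) == 0 )
        printf("last byte is %u (zero-filled BSS)\n", mem[7]);
    return 0;
}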
*/ for ( i = 0; i < elf_shdr_count(elf); i++ ) { shdr = elf_shdr_by_index(elf, i); if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(shdr), 1) ) /* input has an insane section header count field */ break; type = elf_uval(elf, shdr, sh_type); if ( (type == SHT_STRTAB) || (type == SHT_SYMTAB) ) sz = elf_round_up(elf, sz + elf_uval(elf, shdr, sh_size)); } elf->bsd_symtab_pstart = pstart; elf->bsd_symtab_pend = pstart + sz; } static void elf_load_bsdsyms(struct elf_binary *elf) { ELF_HANDLE_DECL(elf_ehdr) sym_ehdr; unsigned long sz; elf_ptrval maxva; elf_ptrval symbase; elf_ptrval symtab_addr; ELF_HANDLE_DECL(elf_shdr) shdr; unsigned i, type; if ( !elf->bsd_symtab_pstart ) return; #define elf_hdr_elm(_elf, _hdr, _elm, _val) \ do { \ if ( elf_64bit(_elf) ) \ elf_store_field(_elf, _hdr, e64._elm, _val); \ else \ elf_store_field(_elf, _hdr, e32._elm, _val); \ } while ( 0 ) symbase = elf_get_ptr(elf, elf->bsd_symtab_pstart); symtab_addr = maxva = symbase + sizeof(uint32_t); /* Set up Elf header. */ sym_ehdr = ELF_MAKE_HANDLE(elf_ehdr, symtab_addr); sz = elf_uval(elf, elf->ehdr, e_ehsize); elf_memcpy_safe(elf, ELF_HANDLE_PTRVAL(sym_ehdr), ELF_HANDLE_PTRVAL(elf->ehdr), sz); maxva += sz; /* no round up */ elf_hdr_elm(elf, sym_ehdr, e_phoff, 0); elf_hdr_elm(elf, sym_ehdr, e_shoff, elf_uval(elf, elf->ehdr, e_ehsize)); elf_hdr_elm(elf, sym_ehdr, e_phentsize, 0); elf_hdr_elm(elf, sym_ehdr, e_phnum, 0); /* Copy Elf section headers. */ shdr = ELF_MAKE_HANDLE(elf_shdr, maxva); sz = elf_shdr_count(elf) * elf_uval(elf, elf->ehdr, e_shentsize); elf_memcpy_safe(elf, ELF_HANDLE_PTRVAL(shdr), ELF_IMAGE_BASE(elf) + elf_uval(elf, elf->ehdr, e_shoff), sz); maxva = elf_round_up(elf, (unsigned long)maxva + sz); for ( i = 0; i < elf_shdr_count(elf); i++ ) { elf_ptrval old_shdr_p; elf_ptrval new_shdr_p; type = elf_uval(elf, shdr, sh_type); if ( (type == SHT_STRTAB) || (type == SHT_SYMTAB) ) { elf_msg(elf, "%s: shdr %i at 0x%"ELF_PRPTRVAL" -> 0x%"ELF_PRPTRVAL"\n", __func__, i, elf_section_start(elf, shdr), maxva); sz = elf_uval(elf, shdr, sh_size); elf_memcpy_safe(elf, maxva, elf_section_start(elf, shdr), sz); /* Mangled to be based on ELF header location. */ elf_hdr_elm(elf, shdr, sh_offset, maxva - symtab_addr); maxva = elf_round_up(elf, (unsigned long)maxva + sz); } old_shdr_p = ELF_HANDLE_PTRVAL(shdr); new_shdr_p = old_shdr_p + elf_uval(elf, elf->ehdr, e_shentsize); if ( new_shdr_p <= old_shdr_p ) /* wrapped or stuck */ { elf_mark_broken(elf, "bad section header length"); break; } if ( !elf_access_ok(elf, new_shdr_p, 1) ) /* outside image */ break; shdr = ELF_MAKE_HANDLE(elf_shdr, new_shdr_p); } /* Write down the actual sym size. 
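The sizing arithmetic above relies on elf_round_up(), which (see libelf-tools.c further down) rounds a value up to the ELF word size -- 4 bytes for ELF32, 8 for ELF64 -- using the "(addr + mask) & ~mask" idiom. A short worked example with the alignment passed in explicitly:

#include <stdint.h>
#include <stdio.h>

/* Round 'addr' up to the next multiple of 'align' (align must be a power
 * of two); the same idiom elf_round_up() uses with align = 4 or 8. */
static uint64_t round_up(uint64_t addr, uint64_t align)
{
    uint64_t mask = align - 1;
    return (addr + mask) & ~mask;
}

int main(void)
{
    printf("0x%llx\n", (unsigned long long)round_up(0x1001, 8)); /* 0x1008 */
    printf("0x%llx\n", (unsigned long long)round_up(0x1000, 8)); /* 0x1000 */
    printf("0x%llx\n", (unsigned long long)round_up(0x7,    4)); /* 0x8    */
    return 0;
}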
*/ elf_store_val(elf, uint32_t, symbase, maxva - symtab_addr); #undef elf_ehdr_elm } void elf_parse_binary(struct elf_binary *elf) { ELF_HANDLE_DECL(elf_phdr) phdr; uint64_t low = -1; uint64_t high = 0; uint64_t i, count, paddr, memsz; count = elf_uval(elf, elf->ehdr, e_phnum); for ( i = 0; i < count; i++ ) { phdr = elf_phdr_by_index(elf, i); if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(phdr), 1) ) /* input has an insane program header count field */ break; if ( !elf_phdr_is_loadable(elf, phdr) ) continue; paddr = elf_uval(elf, phdr, p_paddr); memsz = elf_uval(elf, phdr, p_memsz); elf_msg(elf, "%s: phdr: paddr=0x%" PRIx64 " memsz=0x%" PRIx64 "\n", __FUNCTION__, paddr, memsz); if ( low > paddr ) low = paddr; if ( high < paddr + memsz ) high = paddr + memsz; } elf->pstart = low; elf->pend = high; elf_msg(elf, "%s: memory: 0x%" PRIx64 " -> 0x%" PRIx64 "\n", __FUNCTION__, elf->pstart, elf->pend); } elf_errorstatus elf_load_binary(struct elf_binary *elf) { ELF_HANDLE_DECL(elf_phdr) phdr; uint64_t i, count, paddr, offset, filesz, memsz; elf_ptrval dest; /* * Let bizarre ELFs write the output image up to twice; this * calculation is just to ensure our copying loop is no worse than * O(domain_size). */ uint64_t remain_allow_copy = (uint64_t)elf->dest_size * 2; count = elf_uval(elf, elf->ehdr, e_phnum); for ( i = 0; i < count; i++ ) { phdr = elf_phdr_by_index(elf, i); if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(phdr), 1) ) /* input has an insane program header count field */ break; if ( !elf_phdr_is_loadable(elf, phdr) ) continue; paddr = elf_uval(elf, phdr, p_paddr); offset = elf_uval(elf, phdr, p_offset); filesz = elf_uval(elf, phdr, p_filesz); memsz = elf_uval(elf, phdr, p_memsz); dest = elf_get_ptr(elf, paddr); /* * We need to check that the input image doesn't have us copy * the whole image zillions of times, as that could lead to * O(n^2) time behaviour and possible DoS by a malicous ELF. */ if ( remain_allow_copy < memsz ) { elf_mark_broken(elf, "program segments total to more" " than the input image size"); break; } remain_allow_copy -= memsz; elf_msg(elf, "%s: phdr %" PRIu64 " at 0x%"ELF_PRPTRVAL" -> 0x%"ELF_PRPTRVAL"\n", __func__, i, dest, (elf_ptrval)(dest + filesz)); if ( elf_load_image(elf, dest, ELF_IMAGE_BASE(elf) + offset, filesz, memsz) != 0 ) return -1; } elf_load_bsdsyms(elf); return 0; } elf_ptrval elf_get_ptr(struct elf_binary *elf, unsigned long addr) { return ELF_REALPTR2PTRVAL(elf->dest_base) + addr - elf->pstart; } uint64_t elf_lookup_addr(struct elf_binary * elf, const char *symbol) { ELF_HANDLE_DECL(elf_sym) sym; uint64_t value; sym = elf_sym_by_name(elf, symbol); if ( !ELF_HANDLE_VALID(sym) ) { elf_err(elf, "%s: not found: %s\n", __FUNCTION__, symbol); return -1; } value = elf_uval(elf, sym, st_value); elf_msg(elf, "%s: symbol \"%s\" at 0x%" PRIx64 "\n", __FUNCTION__, symbol, value); return value; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/libelf/README0000664000175000017500000000006712307313555014645 0ustar smbsmbTake care, this code is used by both xen and tools ... xen-4.4.0/xen/common/libelf/libelf-private.h0000664000175000017500000000620312307313555017041 0ustar smbsmb/* * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 2.1 of the License. 
* * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __LIBELF_PRIVATE_H__ #define __LIBELF_PRIVATE_H__ #ifdef __XEN__ #include #include #include #include #include #include #include /* we would like to use elf->log_callback but we can't because * there is no vprintk in Xen */ #define elf_msg(elf, fmt, args ... ) \ if (elf->verbose) printk(fmt, ## args ) #define elf_err(elf, fmt, args ... ) \ printk(fmt, ## args ) #define strtoull(str, end, base) simple_strtoull(str, end, base) #define bswap_16(x) swab16(x) #define bswap_32(x) swab32(x) #define bswap_64(x) swab64(x) #else /* !__XEN__ */ #include #include #include #include #include #include #ifdef __sun__ #include #define bswap_16(x) BSWAP_16(x) #define bswap_32(x) BSWAP_32(x) #define bswap_64(x) BSWAP_64(x) #elif defined(__NetBSD__) #include #define bswap_16(x) bswap16(x) #define bswap_32(x) bswap32(x) #define bswap_64(x) bswap64(x) #elif defined(__OpenBSD__) #include #define bswap_16(x) swap16(x) #define bswap_32(x) swap32(x) #define bswap_64(x) swap64(x) #elif defined(__linux__) || defined(__Linux__) || defined(__MINIOS__) #include #else #error Unsupported OS #endif #include #include #include "xenctrl.h" #include "xc_private.h" #define elf_msg(elf, fmt, args ... ) \ elf_call_log_callback(elf, 0, fmt , ## args ); #define elf_err(elf, fmt, args ... ) \ elf_call_log_callback(elf, 1, fmt , ## args ); void elf_call_log_callback(struct elf_binary*, bool iserr, const char *fmt,...); #define safe_strcpy(d,s) \ do { strncpy((d),(s),sizeof((d))-1); \ (d)[sizeof((d))-1] = '\0'; \ } while (0) #endif #undef memcpy #undef memset #undef memmove #undef strcpy #define memcpy MISTAKE_unspecified_memcpy #define memset MISTAKE_unspecified_memset #define memmove MISTAKE_unspecified_memmove #define strcpy MISTAKE_unspecified_strcpy /* This prevents libelf from using these undecorated versions * of memcpy, memset, memmove and strcpy. Every call site * must either use elf_mem*_unchecked, or elf_mem*_safe. */ #endif /* __LIBELF_PRIVATE_H__ */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/libelf/libelf-dominfo.c0000664000175000017500000004645712307313555017034 0ustar smbsmb/* * parse xen-specific informations out of elf kernel binaries. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 2.1 of the License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libelf-private.h" /* ------------------------------------------------------------------------ */ /* xen features */ static const char *const elf_xen_feature_names[] = { [XENFEAT_writable_page_tables] = "writable_page_tables", [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables", [XENFEAT_auto_translated_physmap] = "auto_translated_physmap", [XENFEAT_supervisor_mode_kernel] = "supervisor_mode_kernel", [XENFEAT_pae_pgdir_above_4gb] = "pae_pgdir_above_4gb", [XENFEAT_hvm_callback_vector] = "hvm_callback_vector", [XENFEAT_dom0] = "dom0" }; static const unsigned elf_xen_features = sizeof(elf_xen_feature_names) / sizeof(elf_xen_feature_names[0]); elf_errorstatus elf_xen_parse_features(const char *features, uint32_t *supported, uint32_t *required) { unsigned char feature[64]; unsigned pos, len, i; if ( features == NULL ) return 0; for ( pos = 0; features[pos] != '\0'; pos += len ) { elf_memset_unchecked(feature, 0, sizeof(feature)); for ( len = 0;; len++ ) { if ( len >= sizeof(feature)-1 ) break; if ( features[pos + len] == '\0' ) break; if ( features[pos + len] == '|' ) { len++; break; } feature[len] = features[pos + len]; } for ( i = 0; i < elf_xen_features; i++ ) { if ( !elf_xen_feature_names[i] ) continue; if ( feature[0] == '!' ) { /* required */ if ( !strcmp(feature + 1, elf_xen_feature_names[i]) ) { elf_xen_feature_set(i, supported); if ( required ) elf_xen_feature_set(i, required); break; } } else { /* supported */ if ( !strcmp(feature, elf_xen_feature_names[i]) ) { elf_xen_feature_set(i, supported); break; } } } if ( i == elf_xen_features && required && feature[0] == '!' 
) return -1; } return 0; } /* ------------------------------------------------------------------------ */ /* xen elf notes */ elf_errorstatus elf_xen_parse_note(struct elf_binary *elf, struct elf_dom_parms *parms, ELF_HANDLE_DECL(elf_note) note) { /* *INDENT-OFF* */ static const struct { char *name; bool str; } note_desc[] = { [XEN_ELFNOTE_ENTRY] = { "ENTRY", 0}, [XEN_ELFNOTE_HYPERCALL_PAGE] = { "HYPERCALL_PAGE", 0}, [XEN_ELFNOTE_VIRT_BASE] = { "VIRT_BASE", 0}, [XEN_ELFNOTE_INIT_P2M] = { "INIT_P2M", 0}, [XEN_ELFNOTE_PADDR_OFFSET] = { "PADDR_OFFSET", 0}, [XEN_ELFNOTE_HV_START_LOW] = { "HV_START_LOW", 0}, [XEN_ELFNOTE_XEN_VERSION] = { "XEN_VERSION", 1}, [XEN_ELFNOTE_GUEST_OS] = { "GUEST_OS", 1}, [XEN_ELFNOTE_GUEST_VERSION] = { "GUEST_VERSION", 1}, [XEN_ELFNOTE_LOADER] = { "LOADER", 1}, [XEN_ELFNOTE_PAE_MODE] = { "PAE_MODE", 1}, [XEN_ELFNOTE_FEATURES] = { "FEATURES", 1}, [XEN_ELFNOTE_SUPPORTED_FEATURES] = { "SUPPORTED_FEATURES", 0}, [XEN_ELFNOTE_BSD_SYMTAB] = { "BSD_SYMTAB", 1}, [XEN_ELFNOTE_SUSPEND_CANCEL] = { "SUSPEND_CANCEL", 0 }, [XEN_ELFNOTE_MOD_START_PFN] = { "MOD_START_PFN", 0 }, }; /* *INDENT-ON* */ const char *str = NULL; uint64_t val = 0; unsigned int i; unsigned type = elf_uval(elf, note, type); if ( (type >= sizeof(note_desc) / sizeof(note_desc[0])) || (note_desc[type].name == NULL) ) { elf_msg(elf, "%s: unknown xen elf note (0x%x)\n", __FUNCTION__, type); return 0; } if ( note_desc[type].str ) { str = elf_strval(elf, elf_note_desc(elf, note)); if (str == NULL) /* elf_strval will mark elf broken if it fails so no need to log */ return 0; elf_msg(elf, "%s: %s = \"%s\"\n", __FUNCTION__, note_desc[type].name, str); parms->elf_notes[type].type = XEN_ENT_STR; parms->elf_notes[type].data.str = str; } else { val = elf_note_numeric(elf, note); elf_msg(elf, "%s: %s = 0x%" PRIx64 "\n", __FUNCTION__, note_desc[type].name, val); parms->elf_notes[type].type = XEN_ENT_LONG; parms->elf_notes[type].data.num = val; } parms->elf_notes[type].name = note_desc[type].name; switch ( type ) { case XEN_ELFNOTE_LOADER: safe_strcpy(parms->loader, str); break; case XEN_ELFNOTE_GUEST_OS: safe_strcpy(parms->guest_os, str); break; case XEN_ELFNOTE_GUEST_VERSION: safe_strcpy(parms->guest_ver, str); break; case XEN_ELFNOTE_XEN_VERSION: safe_strcpy(parms->xen_ver, str); break; case XEN_ELFNOTE_PAE_MODE: if ( !strcmp(str, "yes") ) parms->pae = 2 /* extended_cr3 */; if ( strstr(str, "bimodal") ) parms->pae = 3 /* bimodal */; break; case XEN_ELFNOTE_BSD_SYMTAB: if ( !strcmp(str, "yes") ) parms->bsd_symtab = 1; break; case XEN_ELFNOTE_VIRT_BASE: parms->virt_base = val; break; case XEN_ELFNOTE_ENTRY: parms->virt_entry = val; break; case XEN_ELFNOTE_INIT_P2M: parms->p2m_base = val; break; case XEN_ELFNOTE_PADDR_OFFSET: parms->elf_paddr_offset = val; break; case XEN_ELFNOTE_HYPERCALL_PAGE: parms->virt_hypercall = val; break; case XEN_ELFNOTE_HV_START_LOW: parms->virt_hv_start_low = val; break; case XEN_ELFNOTE_FEATURES: if ( elf_xen_parse_features(str, parms->f_supported, parms->f_required) ) return -1; break; case XEN_ELFNOTE_SUPPORTED_FEATURES: for ( i = 0; i < XENFEAT_NR_SUBMAPS; ++i ) parms->f_supported[i] |= elf_note_numeric_array( elf, note, sizeof(*parms->f_supported), i); break; } return 0; } #define ELF_NOTE_INVALID (~0U) static unsigned elf_xen_parse_notes(struct elf_binary *elf, struct elf_dom_parms *parms, elf_ptrval start, elf_ptrval end, unsigned *total_note_count) { unsigned xen_elfnotes = 0; ELF_HANDLE_DECL(elf_note) note; const char *note_name; parms->elf_note_start = start; parms->elf_note_end = end; 
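The note walk below assumes the standard ELF note layout: a header of three 32-bit words (namesz, descsz, type), the name padded to a 4-byte boundary, then the descriptor padded the same way; elf_note_desc() and elf_note_next() in libelf-tools.c apply exactly that "(len + 3) & ~3" padding. A sketch of how a single Xen note, e.g. XEN_ELFNOTE_LOADER = "generic", is laid out (offsets computed below, the type value shown for illustration):

#include <stdint.h>
#include <stdio.h>

struct note_hdr {              /* same layout as Elf32_Nhdr / Elf64_Nhdr */
    uint32_t namesz;
    uint32_t descsz;
    uint32_t type;
};

#define ROUND4(x) (((x) + 3u) & ~3u)

int main(void)
{
    /* "Xen" + NUL = 4 bytes of name, "generic" + NUL = 8 bytes of desc. */
    const char name[] = "Xen", desc[] = "generic";
    struct note_hdr h = {
        .namesz = sizeof(name),   /* 4 */
        .descsz = sizeof(desc),   /* 8 */
        .type   = 8,              /* XEN_ELFNOTE_LOADER in the public headers */
    };
    unsigned int name_off = sizeof(h);
    unsigned int desc_off = name_off + ROUND4(h.namesz);
    unsigned int next_off = desc_off + ROUND4(h.descsz);

    printf("name at +%u, desc at +%u, next note at +%u\n",
           name_off, desc_off, next_off);
    (void)name; (void)desc;
    return 0;
}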
for ( note = ELF_MAKE_HANDLE(elf_note, parms->elf_note_start); ELF_HANDLE_PTRVAL(note) < parms->elf_note_end; note = elf_note_next(elf, note) ) { if ( *total_note_count >= ELF_MAX_TOTAL_NOTE_COUNT ) { elf_mark_broken(elf, "too many ELF notes"); break; } (*total_note_count)++; note_name = elf_note_name(elf, note); if ( note_name == NULL ) continue; if ( strcmp(note_name, "Xen") ) continue; if ( elf_xen_parse_note(elf, parms, note) ) return ELF_NOTE_INVALID; xen_elfnotes++; } return xen_elfnotes; } /* ------------------------------------------------------------------------ */ /* __xen_guest section */ elf_errorstatus elf_xen_parse_guest_info(struct elf_binary *elf, struct elf_dom_parms *parms) { elf_ptrval h; unsigned char name[32], value[128]; unsigned len; h = parms->guest_info; #define STAR(h) (elf_access_unsigned(elf, (h), 0, 1)) while ( STAR(h) ) { elf_memset_unchecked(name, 0, sizeof(name)); elf_memset_unchecked(value, 0, sizeof(value)); for ( len = 0;; len++, h++ ) { if ( len >= sizeof(name)-1 ) break; if ( STAR(h) == '\0' ) break; if ( STAR(h) == ',' ) { h++; break; } if ( STAR(h) == '=' ) { h++; for ( len = 0;; len++, h++ ) { if ( len >= sizeof(value)-1 ) break; if ( STAR(h) == '\0' ) break; if ( STAR(h) == ',' ) { h++; break; } value[len] = STAR(h); } break; } name[len] = STAR(h); } elf_msg(elf, "%s: %s=\"%s\"\n", __FUNCTION__, name, value); /* strings */ if ( !strcmp(name, "LOADER") ) safe_strcpy(parms->loader, value); if ( !strcmp(name, "GUEST_OS") ) safe_strcpy(parms->guest_os, value); if ( !strcmp(name, "GUEST_VER") ) safe_strcpy(parms->guest_ver, value); if ( !strcmp(name, "XEN_VER") ) safe_strcpy(parms->xen_ver, value); if ( !strcmp(name, "PAE") ) { if ( !strcmp(value, "yes[extended-cr3]") ) parms->pae = 2 /* extended_cr3 */; else if ( !strncmp(value, "yes", 3) ) parms->pae = 1 /* yes */; } if ( !strcmp(name, "BSD_SYMTAB") ) parms->bsd_symtab = 1; /* longs */ if ( !strcmp(name, "VIRT_BASE") ) parms->virt_base = strtoull(value, NULL, 0); if ( !strcmp(name, "VIRT_ENTRY") ) parms->virt_entry = strtoull(value, NULL, 0); if ( !strcmp(name, "ELF_PADDR_OFFSET") ) parms->elf_paddr_offset = strtoull(value, NULL, 0); if ( !strcmp(name, "HYPERCALL_PAGE") ) parms->virt_hypercall = (strtoull(value, NULL, 0) << 12) + parms->virt_base; /* other */ if ( !strcmp(name, "FEATURES") ) if ( elf_xen_parse_features(value, parms->f_supported, parms->f_required) ) return -1; } return 0; } /* ------------------------------------------------------------------------ */ /* sanity checks */ static elf_errorstatus elf_xen_note_check(struct elf_binary *elf, struct elf_dom_parms *parms) { if ( (ELF_PTRVAL_INVALID(parms->elf_note_start)) && (ELF_PTRVAL_INVALID(parms->guest_info)) ) { unsigned machine = elf_uval(elf, elf->ehdr, e_machine); if ( (machine == EM_386) || (machine == EM_X86_64) ) { elf_err(elf, "%s: ERROR: Not a Xen-ELF image: " "No ELF notes or '__xen_guest' section found.\n", __FUNCTION__); return -1; } return 0; } if ( elf_uval(elf, elf->ehdr, e_machine) == EM_ARM ) { elf_msg(elf, "%s: Not bothering with notes on ARM\n", __FUNCTION__); return 0; } /* Check the contents of the Xen notes or guest string. 
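elf_xen_parse_guest_info() above handles the legacy __xen_guest section, which is simply a NUL-terminated string of comma-separated name=value pairs such as "LOADER=generic,VIRT_BASE=0xC0000000,PAE=yes". A self-contained sketch of parsing that shape of string with the plain C library (libelf itself must use its bounded accessors instead, and the string below is illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    char info[] = "GUEST_OS=linux,LOADER=generic,VIRT_BASE=0xC0000000,PAE=yes";
    unsigned long long virt_base = 0;

    for ( char *tok = strtok(info, ","); tok; tok = strtok(NULL, ",") )
    {
        char *eq = strchr(tok, '=');
        if ( !eq )
            continue;
        *eq = '\0';
        if ( !strcmp(tok, "VIRT_BASE") )
            virt_base = strtoull(eq + 1, NULL, 0);  /* base 0 accepts 0x... */
        else
            printf("%s = \"%s\"\n", tok, eq + 1);
    }
    printf("VIRT_BASE = 0x%llx\n", virt_base);
    return 0;
}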
*/ if ( ((strlen(parms->loader) == 0) || strncmp(parms->loader, "generic", 7)) && ((strlen(parms->guest_os) == 0) || strncmp(parms->guest_os, "linux", 5)) ) { elf_err(elf, "%s: ERROR: Will only load images built for the generic " "loader or Linux images (Not '%.*s' and '%.*s')\n", __FUNCTION__, (int)sizeof(parms->loader), parms->loader, (int)sizeof(parms->guest_os), parms->guest_os); return -1; } if ( (strlen(parms->xen_ver) == 0) || strncmp(parms->xen_ver, "xen-3.0", 7) ) { elf_err(elf, "%s: ERROR: Xen will only load images built " "for Xen v3.0 (Not '%.*s')\n", __FUNCTION__, (int)sizeof(parms->xen_ver), parms->xen_ver); return -1; } return 0; } static elf_errorstatus elf_xen_addr_calc_check(struct elf_binary *elf, struct elf_dom_parms *parms) { if ( (parms->elf_paddr_offset != UNSET_ADDR) && (parms->virt_base == UNSET_ADDR) ) { elf_err(elf, "%s: ERROR: ELF_PADDR_OFFSET set, VIRT_BASE unset\n", __FUNCTION__); return -1; } /* Initial guess for virt_base is 0 if it is not explicitly defined. */ if ( parms->virt_base == UNSET_ADDR ) { parms->virt_base = 0; elf_msg(elf, "%s: VIRT_BASE unset, using 0x%" PRIx64 "\n", __FUNCTION__, parms->virt_base); } /* * If we are using the legacy __xen_guest section then elf_pa_off * defaults to v_start in order to maintain compatibility with * older hypervisors which set padd in the ELF header to * virt_base. * * If we are using the modern ELF notes interface then the default * is 0. */ if ( parms->elf_paddr_offset == UNSET_ADDR ) { if ( parms->elf_note_start ) parms->elf_paddr_offset = 0; else parms->elf_paddr_offset = parms->virt_base; elf_msg(elf, "%s: ELF_PADDR_OFFSET unset, using 0x%" PRIx64 "\n", __FUNCTION__, parms->elf_paddr_offset); } parms->virt_offset = parms->virt_base - parms->elf_paddr_offset; parms->virt_kstart = elf->pstart + parms->virt_offset; parms->virt_kend = elf->pend + parms->virt_offset; if ( parms->virt_entry == UNSET_ADDR ) parms->virt_entry = elf_uval(elf, elf->ehdr, e_entry); if ( parms->bsd_symtab ) { elf_parse_bsdsyms(elf, parms->virt_kend); if ( elf->bsd_symtab_pend ) parms->virt_kend = elf->bsd_symtab_pend + parms->virt_offset; } elf_msg(elf, "%s: addresses:\n", __FUNCTION__); elf_msg(elf, " virt_base = 0x%" PRIx64 "\n", parms->virt_base); elf_msg(elf, " elf_paddr_offset = 0x%" PRIx64 "\n", parms->elf_paddr_offset); elf_msg(elf, " virt_offset = 0x%" PRIx64 "\n", parms->virt_offset); elf_msg(elf, " virt_kstart = 0x%" PRIx64 "\n", parms->virt_kstart); elf_msg(elf, " virt_kend = 0x%" PRIx64 "\n", parms->virt_kend); elf_msg(elf, " virt_entry = 0x%" PRIx64 "\n", parms->virt_entry); elf_msg(elf, " p2m_base = 0x%" PRIx64 "\n", parms->p2m_base); if ( (parms->virt_kstart > parms->virt_kend) || (parms->virt_entry < parms->virt_kstart) || (parms->virt_entry > parms->virt_kend) || (parms->virt_base > parms->virt_kstart) ) { elf_err(elf, "%s: ERROR: ELF start or entries are out of bounds.\n", __FUNCTION__); return -1; } if ( (parms->p2m_base != UNSET_ADDR) && (parms->p2m_base >= parms->virt_kstart) && (parms->p2m_base < parms->virt_kend) ) { elf_err(elf, "%s: ERROR: P->M table base is out of bounds.\n", __FUNCTION__); return -1; } return 0; } /* ------------------------------------------------------------------------ */ /* glue it all together ... 
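The arithmetic in elf_xen_addr_calc_check() is easier to follow with concrete numbers. A worked sketch, assuming a PV kernel that uses ELF notes (so ELF_PADDR_OFFSET defaults to 0), VIRT_BASE = 0xC0000000, and a physical load range of [0x100000, 0x500000) taken from the program headers -- all illustrative values:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint64_t virt_base        = 0xC0000000;  /* XEN_ELFNOTE_VIRT_BASE     */
    uint64_t elf_paddr_offset = 0;           /* ELF-notes default         */
    uint64_t pstart           = 0x00100000;  /* lowest  p_paddr           */
    uint64_t pend             = 0x00500000;  /* highest p_paddr + p_memsz */

    uint64_t virt_offset = virt_base - elf_paddr_offset;
    uint64_t virt_kstart = pstart + virt_offset;   /* 0xC0100000 */
    uint64_t virt_kend   = pend   + virt_offset;   /* 0xC0500000 */

    printf("virt_offset = 0x%" PRIx64 "\n", virt_offset);
    printf("kernel mapped at [0x%" PRIx64 ", 0x%" PRIx64 ")\n",
           virt_kstart, virt_kend);
    return 0;
}

With the legacy __xen_guest interface the default elf_paddr_offset would instead be virt_base, giving virt_offset = 0, i.e. the p_paddr values are already treated as virtual addresses.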
*/ elf_errorstatus elf_xen_parse(struct elf_binary *elf, struct elf_dom_parms *parms) { ELF_HANDLE_DECL(elf_shdr) shdr; ELF_HANDLE_DECL(elf_phdr) phdr; unsigned xen_elfnotes = 0; unsigned i, count, more_notes; unsigned total_note_count = 0; elf_memset_unchecked(parms, 0, sizeof(*parms)); parms->virt_base = UNSET_ADDR; parms->virt_entry = UNSET_ADDR; parms->virt_hypercall = UNSET_ADDR; parms->virt_hv_start_low = UNSET_ADDR; parms->p2m_base = UNSET_ADDR; parms->elf_paddr_offset = UNSET_ADDR; /* Find and parse elf notes. */ count = elf_phdr_count(elf); for ( i = 0; i < count; i++ ) { phdr = elf_phdr_by_index(elf, i); if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(phdr), 1) ) /* input has an insane program header count field */ break; if ( elf_uval(elf, phdr, p_type) != PT_NOTE ) continue; /* * Some versions of binutils do not correctly set p_offset for * note segments. */ if (elf_uval(elf, phdr, p_offset) == 0) continue; more_notes = elf_xen_parse_notes(elf, parms, elf_segment_start(elf, phdr), elf_segment_end(elf, phdr), &total_note_count); if ( more_notes == ELF_NOTE_INVALID ) return -1; xen_elfnotes += more_notes; } /* * Fall back to any SHT_NOTE sections if no valid note segments * were found. */ if ( xen_elfnotes == 0 ) { count = elf_shdr_count(elf); for ( i = 0; i < count; i++ ) { shdr = elf_shdr_by_index(elf, i); if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(shdr), 1) ) /* input has an insane section header count field */ break; if ( elf_uval(elf, shdr, sh_type) != SHT_NOTE ) continue; more_notes = elf_xen_parse_notes(elf, parms, elf_section_start(elf, shdr), elf_section_end(elf, shdr), &total_note_count); if ( more_notes == ELF_NOTE_INVALID ) return -1; if ( xen_elfnotes == 0 && more_notes > 0 ) elf_msg(elf, "%s: using notes from SHT_NOTE section\n", __FUNCTION__); xen_elfnotes += more_notes; } } /* * Finally fall back to the __xen_guest section. */ if ( xen_elfnotes == 0 ) { shdr = elf_shdr_by_name(elf, "__xen_guest"); if ( ELF_HANDLE_VALID(shdr) ) { parms->guest_info = elf_section_start(elf, shdr); parms->elf_note_start = ELF_INVALID_PTRVAL; parms->elf_note_end = ELF_INVALID_PTRVAL; elf_msg(elf, "%s: __xen_guest: \"%s\"\n", __FUNCTION__, elf_strfmt(elf, parms->guest_info)); elf_xen_parse_guest_info(elf, parms); } } if ( elf_xen_note_check(elf, parms) != 0 ) return -1; if ( elf_xen_addr_calc_check(elf, parms) != 0 ) return -1; return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/libelf/COPYING0000664000175000017500000006045112307313555015023 0ustar smbsmbNote that the only valid version of the LGPL as far as the files in this directory are concerned is _this_ particular version of the license (i.e., *only* v2.1, not v2.2 or v3.x or whatever), unless explicitly otherwise stated. Where clause 3 is invoked in order to relicense under the GPL then this shall be considered to be GPL v2 only for files which have specified LGPL v2.1 only. GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. 
By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. 
We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). 
Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) 
Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. 
Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. 
However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. 
Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS xen-4.4.0/xen/common/libelf/libelf-tools.c0000664000175000017500000002604612307313555016531 0ustar smbsmb/* * various helper functions to access elf structures * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 2.1 of the License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libelf-private.h" /* ------------------------------------------------------------------------ */ void elf_mark_broken(struct elf_binary *elf, const char *msg) { if ( elf->broken == NULL ) elf->broken = msg; } const char *elf_check_broken(const struct elf_binary *elf) { return elf->broken; } static bool elf_ptrval_in_range(elf_ptrval ptrval, uint64_t size, const void *region, uint64_t regionsize) /* * Returns true if the putative memory area [ptrval,ptrval+size> * is completely inside the region [region,region+regionsize>. * * ptrval and size are the untrusted inputs to be checked. * region and regionsize are trusted and must be correct and valid, * although it is OK for region to perhaps be maliciously NULL * (but not some other malicious value). */ { elf_ptrval regionp = (elf_ptrval)region; if ( (region == NULL) || (ptrval < regionp) || /* start is before region */ (ptrval > regionp + regionsize) || /* start is after region */ (size > regionsize - (ptrval - regionp)) ) /* too big */ return 0; return 1; } bool elf_access_ok(struct elf_binary * elf, uint64_t ptrval, size_t size) { if ( elf_ptrval_in_range(ptrval, size, elf->image_base, elf->size) ) return 1; if ( elf_ptrval_in_range(ptrval, size, elf->dest_base, elf->dest_size) ) return 1; if ( elf_ptrval_in_range(ptrval, size, elf->caller_xdest_base, elf->caller_xdest_size) ) return 1; elf_mark_broken(elf, "out of range access"); return 0; } void elf_memcpy_safe(struct elf_binary *elf, elf_ptrval dst, elf_ptrval src, size_t size) { if ( elf_access_ok(elf, dst, size) && elf_access_ok(elf, src, size) ) { /* use memmove because these checks do not prove that the * regions don't overlap and overlapping regions grant * permission for compiler malice */ elf_memmove_unchecked(ELF_UNSAFE_PTR(dst), ELF_UNSAFE_PTR(src), size); } } void elf_memset_safe(struct elf_binary *elf, elf_ptrval dst, int c, size_t size) { if ( elf_access_ok(elf, dst, size) ) { elf_memset_unchecked(ELF_UNSAFE_PTR(dst), c, size); } } uint64_t elf_access_unsigned(struct elf_binary * elf, elf_ptrval base, uint64_t moreoffset, size_t size) { elf_ptrval ptrval = base + moreoffset; bool need_swap = elf_swap(elf); const uint8_t *u8; const uint16_t *u16; const uint32_t *u32; const uint64_t *u64; if ( !elf_access_ok(elf, ptrval, size) ) return 0; switch ( size ) { case 1: u8 = (const void*)ptrval; return *u8; case 2: u16 = (const void*)ptrval; return need_swap ? bswap_16(*u16) : *u16; case 4: u32 = (const void*)ptrval; return need_swap ? bswap_32(*u32) : *u32; case 8: u64 = (const void*)ptrval; return need_swap ? bswap_64(*u64) : *u64; default: return 0; } } uint64_t elf_round_up(struct elf_binary *elf, uint64_t addr) { uint64_t elf_round = (elf_64bit(elf) ? 
8 : 4) - 1; return (addr + elf_round) & ~elf_round; } /* ------------------------------------------------------------------------ */ unsigned elf_shdr_count(struct elf_binary *elf) { unsigned count = elf_uval(elf, elf->ehdr, e_shnum); uint64_t max = elf->size / sizeof(Elf32_Shdr); if (max > ~(unsigned)0) max = ~(unsigned)0; /* Xen doesn't have limits.h :-/ */ if (count > max) { elf_mark_broken(elf, "far too many section headers"); count = max; } return count; } unsigned elf_phdr_count(struct elf_binary *elf) { return elf_uval(elf, elf->ehdr, e_phnum); } ELF_HANDLE_DECL(elf_shdr) elf_shdr_by_name(struct elf_binary *elf, const char *name) { uint64_t count = elf_shdr_count(elf); ELF_HANDLE_DECL(elf_shdr) shdr; const char *sname; unsigned i; for ( i = 0; i < count; i++ ) { shdr = elf_shdr_by_index(elf, i); if ( !elf_access_ok(elf, ELF_HANDLE_PTRVAL(shdr), 1) ) /* input has an insane section header count field */ break; sname = elf_section_name(elf, shdr); if ( sname && !strcmp(sname, name) ) return shdr; } return ELF_INVALID_HANDLE(elf_shdr); } ELF_HANDLE_DECL(elf_shdr) elf_shdr_by_index(struct elf_binary *elf, unsigned index) { uint64_t count = elf_shdr_count(elf); elf_ptrval ptr; if ( index >= count ) return ELF_INVALID_HANDLE(elf_shdr); ptr = (ELF_IMAGE_BASE(elf) + elf_uval(elf, elf->ehdr, e_shoff) + elf_uval(elf, elf->ehdr, e_shentsize) * index); return ELF_MAKE_HANDLE(elf_shdr, ptr); } ELF_HANDLE_DECL(elf_phdr) elf_phdr_by_index(struct elf_binary *elf, unsigned index) { uint64_t count = elf_uval(elf, elf->ehdr, e_phnum); elf_ptrval ptr; if ( index >= count ) return ELF_INVALID_HANDLE(elf_phdr); ptr = (ELF_IMAGE_BASE(elf) + elf_uval(elf, elf->ehdr, e_phoff) + elf_uval(elf, elf->ehdr, e_phentsize) * index); return ELF_MAKE_HANDLE(elf_phdr, ptr); } const char *elf_section_name(struct elf_binary *elf, ELF_HANDLE_DECL(elf_shdr) shdr) { if ( ELF_PTRVAL_INVALID(elf->sec_strtab) ) return "unknown"; return elf_strval(elf, elf->sec_strtab + elf_uval(elf, shdr, sh_name)); } const char *elf_strval(struct elf_binary *elf, elf_ptrval start) { uint64_t length; for ( length = 0; ; length++ ) { if ( !elf_access_ok(elf, start + length, 1) ) return NULL; if ( !elf_access_unsigned(elf, start, length, 1) ) /* ok */ return ELF_UNSAFE_PTR(start); if ( length >= ELF_MAX_STRING_LENGTH ) { elf_mark_broken(elf, "excessively long string"); return NULL; } } } const char *elf_strfmt(struct elf_binary *elf, elf_ptrval start) { const char *str = elf_strval(elf, start); if ( str == NULL ) return "(invalid)"; return str; } elf_ptrval elf_section_start(struct elf_binary *elf, ELF_HANDLE_DECL(elf_shdr) shdr) { return ELF_IMAGE_BASE(elf) + elf_uval(elf, shdr, sh_offset); } elf_ptrval elf_section_end(struct elf_binary *elf, ELF_HANDLE_DECL(elf_shdr) shdr) { return ELF_IMAGE_BASE(elf) + elf_uval(elf, shdr, sh_offset) + elf_uval(elf, shdr, sh_size); } elf_ptrval elf_segment_start(struct elf_binary *elf, ELF_HANDLE_DECL(elf_phdr) phdr) { return ELF_IMAGE_BASE(elf) + elf_uval(elf, phdr, p_offset); } elf_ptrval elf_segment_end(struct elf_binary *elf, ELF_HANDLE_DECL(elf_phdr) phdr) { return ELF_IMAGE_BASE(elf) + elf_uval(elf, phdr, p_offset) + elf_uval(elf, phdr, p_filesz); } ELF_HANDLE_DECL(elf_sym) elf_sym_by_name(struct elf_binary *elf, const char *symbol) { elf_ptrval ptr = elf_section_start(elf, elf->sym_tab); elf_ptrval end = elf_section_end(elf, elf->sym_tab); ELF_HANDLE_DECL(elf_sym) sym; uint64_t info, name; const char *sym_name; for ( ; ptr < end; ptr += elf_size(elf, sym) ) { sym = ELF_MAKE_HANDLE(elf_sym, ptr); info = 
elf_uval(elf, sym, st_info); name = elf_uval(elf, sym, st_name); if ( ELF32_ST_BIND(info) != STB_GLOBAL ) continue; sym_name = elf_strval(elf, elf->sym_strtab + name); if ( sym_name == NULL ) /* out of range, oops */ return ELF_INVALID_HANDLE(elf_sym); if ( strcmp(sym_name, symbol) ) continue; return sym; } return ELF_INVALID_HANDLE(elf_sym); } ELF_HANDLE_DECL(elf_sym) elf_sym_by_index(struct elf_binary *elf, unsigned index) { elf_ptrval ptr = elf_section_start(elf, elf->sym_tab); ELF_HANDLE_DECL(elf_sym) sym; sym = ELF_MAKE_HANDLE(elf_sym, ptr + index * elf_size(elf, sym)); return sym; } const char *elf_note_name(struct elf_binary *elf, ELF_HANDLE_DECL(elf_note) note) { return elf_strval(elf, ELF_HANDLE_PTRVAL(note) + elf_size(elf, note)); } elf_ptrval elf_note_desc(struct elf_binary *elf, ELF_HANDLE_DECL(elf_note) note) { unsigned namesz = (elf_uval(elf, note, namesz) + 3) & ~3; return ELF_HANDLE_PTRVAL(note) + elf_size(elf, note) + namesz; } uint64_t elf_note_numeric(struct elf_binary *elf, ELF_HANDLE_DECL(elf_note) note) { elf_ptrval desc = elf_note_desc(elf, note); unsigned descsz = elf_uval(elf, note, descsz); switch (descsz) { case 1: case 2: case 4: case 8: return elf_access_unsigned(elf, desc, 0, descsz); default: return 0; } } uint64_t elf_note_numeric_array(struct elf_binary *elf, ELF_HANDLE_DECL(elf_note) note, unsigned int unitsz, unsigned int idx) { elf_ptrval desc = elf_note_desc(elf, note); unsigned descsz = elf_uval(elf, note, descsz); if ( descsz % unitsz || idx >= descsz / unitsz ) return 0; switch (unitsz) { case 1: case 2: case 4: case 8: return elf_access_unsigned(elf, desc, idx * unitsz, unitsz); default: return 0; } } ELF_HANDLE_DECL(elf_note) elf_note_next(struct elf_binary *elf, ELF_HANDLE_DECL(elf_note) note) { unsigned namesz = (elf_uval(elf, note, namesz) + 3) & ~3; unsigned descsz = (elf_uval(elf, note, descsz) + 3) & ~3; elf_ptrval ptrval = ELF_HANDLE_PTRVAL(note) + elf_size(elf, note) + namesz + descsz; if ( ( ptrval <= ELF_HANDLE_PTRVAL(note) || /* wrapped or stuck */ !elf_access_ok(elf, ELF_HANDLE_PTRVAL(note), 1) ) ) ptrval = ELF_MAX_PTRVAL; /* terminate caller's loop */ return ELF_MAKE_HANDLE(elf_note, ptrval); } /* ------------------------------------------------------------------------ */ bool elf_is_elfbinary(const void *image_start, size_t image_size) { const Elf32_Ehdr *ehdr = image_start; if ( image_size < sizeof(*ehdr) ) return 0; return IS_ELF(*ehdr); } bool elf_phdr_is_loadable(struct elf_binary *elf, ELF_HANDLE_DECL(elf_phdr) phdr) { uint64_t p_type = elf_uval(elf, phdr, p_type); uint64_t p_flags = elf_uval(elf, phdr, p_flags); return ((p_type == PT_LOAD) && (p_flags & (PF_R | PF_W | PF_X)) != 0); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/rcupdate.c0000664000175000017500000003421112307313555014501 0ustar smbsmb/* * Read-Copy Update mechanism for mutual exclusion * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
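/*
 * Illustrative sketch, not part of the original file: iterating the notes in
 * a region [start, end) with the helpers above.  elf_note_next() clamps the
 * pointer to ELF_MAX_PTRVAL on overflow or an out-of-image header, which is
 * what terminates this loop for malformed input.  The elf_msg() logging
 * helper is assumed available.
 */
static void example_walk_notes(struct elf_binary *elf,
                               elf_ptrval start, elf_ptrval end)
{
    ELF_HANDLE_DECL(elf_note) note;

    for ( note = ELF_MAKE_HANDLE(elf_note, start);
          ELF_HANDLE_PTRVAL(note) < end;
          note = elf_note_next(elf, note) )
    {
        const char *name = elf_note_name(elf, note);

        elf_msg(elf, "note '%s', type %lu, value %#lx\n",
                name ? name : "(invalid)",
                (unsigned long)elf_uval(elf, note, type),
                (unsigned long)elf_note_numeric(elf, note));
    }
}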
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Copyright (C) IBM Corporation, 2001 * * Authors: Dipankar Sarma * Manfred Spraul * * Modifications for Xen: Jose Renato Santos * Copyright (C) Hewlett-Packard, 2006 * * Based on the original work by Paul McKenney * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - * http://lse.sourceforge.net/locking/rcupdate.html */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* Global control variables for rcupdate callback mechanism. */ static struct rcu_ctrlblk { long cur; /* Current batch number. */ long completed; /* Number of the last completed batch */ int next_pending; /* Is the next batch already waiting? */ spinlock_t lock __cacheline_aligned; cpumask_t cpumask; /* CPUs that need to switch in order */ /* for current batch to proceed. */ } __cacheline_aligned rcu_ctrlblk = { .cur = -300, .completed = -300, .lock = SPIN_LOCK_UNLOCKED, }; /* * Per-CPU data for Read-Copy Update. * nxtlist - new callbacks are added here * curlist - current batch for which quiescent cycle started if any */ struct rcu_data { /* 1) quiescent state handling : */ long quiescbatch; /* Batch # for grace period */ int qs_pending; /* core waits for quiesc state */ /* 2) batch handling */ long batch; /* Batch # for current RCU batch */ struct rcu_head *nxtlist; struct rcu_head **nxttail; long qlen; /* # of queued callbacks */ struct rcu_head *curlist; struct rcu_head **curtail; struct rcu_head *donelist; struct rcu_head **donetail; long blimit; /* Upper limit on a processed batch */ int cpu; struct rcu_head barrier; long last_rs_qlen; /* qlen during the last resched */ }; static DEFINE_PER_CPU(struct rcu_data, rcu_data); static int blimit = 10; static int qhimark = 10000; static int qlowmark = 100; static int rsinterval = 1000; struct rcu_barrier_data { struct rcu_head head; atomic_t *cpu_count; }; static void rcu_barrier_callback(struct rcu_head *head) { struct rcu_barrier_data *data = container_of( head, struct rcu_barrier_data, head); atomic_inc(data->cpu_count); } static int rcu_barrier_action(void *_cpu_count) { struct rcu_barrier_data data = { .cpu_count = _cpu_count }; ASSERT(!local_irq_is_enabled()); local_irq_enable(); /* * When callback is executed, all previously-queued RCU work on this CPU * is completed. When all CPUs have executed their callback, data.cpu_count * will have been incremented to include every online CPU. */ call_rcu(&data.head, rcu_barrier_callback); while ( atomic_read(data.cpu_count) != num_online_cpus() ) { process_pending_softirqs(); cpu_relax(); } local_irq_disable(); return 0; } int rcu_barrier(void) { atomic_t cpu_count = ATOMIC_INIT(0); return stop_machine_run(rcu_barrier_action, &cpu_count, NR_CPUS); } /* Is batch a before batch b ? */ static inline int rcu_batch_before(long a, long b) { return (a - b) < 0; } /* Is batch a after batch b ? 
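 * (Both helpers compare via signed subtraction rather than with "<"
 * directly, so the result stays meaningful if the batch counters ever wrap:
 * e.g. rcu_batch_before(-300, -299) is true because (-300) - (-299) == -1,
 * and a batch number taken just before a wrap still compares as "before"
 * one taken just after it.)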
*/ static inline int rcu_batch_after(long a, long b) { return (a - b) > 0; } static void force_quiescent_state(struct rcu_data *rdp, struct rcu_ctrlblk *rcp) { cpumask_t cpumask; raise_softirq(SCHEDULE_SOFTIRQ); if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) { rdp->last_rs_qlen = rdp->qlen; /* * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. */ cpumask_andnot(&cpumask, &rcp->cpumask, cpumask_of(rdp->cpu)); cpumask_raise_softirq(&cpumask, SCHEDULE_SOFTIRQ); } } /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. * @func: actual update function to be invoked after the grace period * * The update function will be invoked some time after a full grace * period elapses, in other words after all currently executing RCU * read-side critical sections have completed. RCU read-side critical * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; struct rcu_data *rdp; head->func = func; head->next = NULL; local_irq_save(flags); rdp = &__get_cpu_var(rcu_data); *rdp->nxttail = head; rdp->nxttail = &head->next; if (unlikely(++rdp->qlen > qhimark)) { rdp->blimit = INT_MAX; force_quiescent_state(rdp, &rcu_ctrlblk); } local_irq_restore(flags); } /* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. */ static void rcu_do_batch(struct rcu_data *rdp) { struct rcu_head *next, *list; int count = 0; list = rdp->donelist; while (list) { next = rdp->donelist = list->next; list->func(list); list = next; rdp->qlen--; if (++count >= rdp->blimit) break; } if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; if (!rdp->donelist) rdp->donetail = &rdp->donelist; else raise_softirq(RCU_SOFTIRQ); } /* * Grace period handling: * The grace period handling consists out of two steps: * - A new grace period is started. * This is done by rcu_start_batch. The start is not broadcasted to * all cpus, they must pick this up by comparing rcp->cur with * rdp->quiescbatch. All cpus are recorded in the * rcu_ctrlblk.cpumask bitmap. * - All cpus must go through a quiescent state. * Since the start of the grace period is not broadcasted, at least two * calls to rcu_check_quiescent_state are required: * The first call just notices that a new grace period is running. The * following calls check if there was a quiescent state since the beginning * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If * the bitmap is empty, then the grace period is completed. * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. * Caller must hold rcu_ctrlblk.lock. */ static void rcu_start_batch(struct rcu_ctrlblk *rcp) { if (rcp->next_pending && rcp->completed == rcp->cur) { rcp->next_pending = 0; /* * next_pending == 0 must be visible in * __rcu_process_callbacks() before it can see new value of cur. */ smp_wmb(); rcp->cur++; cpumask_copy(&rcp->cpumask, &cpu_online_map); } } /* * cpu went through a quiescent state since the beginning of the grace period. * Clear it from the cpu mask and complete the grace period if it was the last * cpu. 
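/*
 * Illustrative usage sketch, not part of the original file: the deferred-free
 * pattern call_rcu() is meant for.  A hypothetical object embeds a
 * struct rcu_head; once it has been unlinked from every reader-visible
 * structure, the callback frees it after a full grace period.
 */
struct example_obj {
    /* ... payload ... */
    struct rcu_head rcu;
};

static void example_obj_free(struct rcu_head *head)
{
    xfree(container_of(head, struct example_obj, rcu));
}

static void example_obj_retire(struct example_obj *obj)
{
    /* Unlink obj from any list/hash readers may still be traversing... */
    call_rcu(&obj->rcu, example_obj_free);  /* ...then free it lazily. */
}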
Start another grace period if someone has further entries pending */ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) { cpumask_clear_cpu(cpu, &rcp->cpumask); if (cpumask_empty(&rcp->cpumask)) { /* batch completed ! */ rcp->completed = rcp->cur; rcu_start_batch(rcp); } } /* * Check if the cpu has gone through a quiescent state (say context * switch). If so and if it already hasn't done so in this RCU * quiescent cycle, then indicate that it has done so. */ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ rdp->qs_pending = 1; rdp->quiescbatch = rcp->cur; return; } /* Grace period already completed for this cpu? * qs_pending is checked instead of the actual bitmap to avoid * cacheline trashing. */ if (!rdp->qs_pending) return; rdp->qs_pending = 0; spin_lock(&rcp->lock); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. */ if (likely(rdp->quiescbatch == rcp->cur)) cpu_quiet(rdp->cpu, rcp); spin_unlock(&rcp->lock); } /* * This does the RCU processing work from softirq context. */ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { *rdp->donetail = rdp->curlist; rdp->donetail = rdp->curtail; rdp->curlist = NULL; rdp->curtail = &rdp->curlist; } local_irq_disable(); if (rdp->nxtlist && !rdp->curlist) { rdp->curlist = rdp->nxtlist; rdp->curtail = rdp->nxttail; rdp->nxtlist = NULL; rdp->nxttail = &rdp->nxtlist; local_irq_enable(); /* * start the next batch of callbacks */ /* determine batch number */ rdp->batch = rcp->cur + 1; /* see the comment and corresponding wmb() in * the rcu_start_batch() */ smp_rmb(); if (!rcp->next_pending) { /* and start it/schedule start if it's a new batch */ spin_lock(&rcp->lock); rcp->next_pending = 1; rcu_start_batch(rcp); spin_unlock(&rcp->lock); } } else { local_irq_enable(); } rcu_check_quiescent_state(rcp, rdp); if (rdp->donelist) rcu_do_batch(rdp); } static void rcu_process_callbacks(void) { __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { /* This cpu has pending rcu entries and the grace period * for them has completed. */ if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) return 1; /* This cpu has no pending entries, but there are new entries */ if (!rdp->curlist && rdp->nxtlist) return 1; /* This cpu has finished callbacks to invoke */ if (rdp->donelist) return 1; /* The rcu core waits for a quiescent state from the cpu */ if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) return 1; /* nothing to do */ return 0; } int rcu_pending(int cpu) { return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)); } /* * Check to see if any future RCU-related work will need to be done * by the current CPU, even if none need be done immediately, returning * 1 if so. This function is part of the RCU implementation; it is -not- * an exported member of the RCU API. 
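/*
 * Illustrative sketch, hypothetical call site: how a periodic per-CPU tick
 * can drive this machinery.  rcu_pending() is the cheap "anything to do?"
 * test; rcu_check_callbacks() merely raises RCU_SOFTIRQ, whose handler is
 * rcu_process_callbacks().
 */
static void example_rcu_tick(void)
{
    int cpu = smp_processor_id();

    if ( rcu_pending(cpu) )
        rcu_check_callbacks(cpu);
}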
*/ int rcu_needs_cpu(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_data, cpu); return (!!rdp->curlist || rcu_pending(cpu)); } void rcu_check_callbacks(int cpu) { raise_softirq(RCU_SOFTIRQ); } static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, struct rcu_head **tail) { local_irq_disable(); *this_rdp->nxttail = list; if (list) this_rdp->nxttail = tail; local_irq_enable(); } static void rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { /* If the cpu going offline owns the grace period we can block * indefinitely waiting for it, so flush it here. */ spin_lock(&rcp->lock); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); spin_unlock(&rcp->lock); rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); local_irq_disable(); this_rdp->qlen += rdp->qlen; local_irq_enable(); } static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { memset(rdp, 0, sizeof(*rdp)); rdp->curtail = &rdp->curlist; rdp->nxttail = &rdp->nxtlist; rdp->donetail = &rdp->donelist; rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; } static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct rcu_data *rdp = &per_cpu(rcu_data, cpu); switch ( action ) { case CPU_UP_PREPARE: rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); break; case CPU_UP_CANCELED: case CPU_DEAD: rcu_offline_cpu(&this_cpu(rcu_data), &rcu_ctrlblk, rdp); break; default: break; } return NOTIFY_DONE; } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; void __init rcu_init(void) { void *cpu = (void *)(long)smp_processor_id(); cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); register_cpu_notifier(&cpu_nfb); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } xen-4.4.0/xen/common/symbols-dummy.c0000664000175000017500000000072412307313555015515 0ustar smbsmb/* * symbols-dummy.c: dummy symbol-table definitions for the inital partial * link of the hypervisor image. 
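/*
 * Illustrative sketch, not part of the original file: a teardown path that
 * must not continue until every callback already queued with call_rcu() has
 * run can use rcu_barrier(), which stop-machines all CPUs and spins until
 * their callback queues have drained.
 */
static void example_teardown(void)
{
    /* ... unlink objects and queue their call_rcu() callbacks ... */

    rcu_barrier();  /* every previously queued callback has now executed */

    /* ... now safe to free shared state those callbacks depended on ... */
}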
*/ #include #include #ifdef SYMBOLS_ORIGIN const unsigned int symbols_offsets[1]; #else const unsigned long symbols_addresses[1]; #endif const unsigned int symbols_num_syms; const u8 symbols_names[1]; const u8 symbols_token_table[1]; const u16 symbols_token_index[1]; const unsigned int symbols_markers[1]; xen-4.4.0/xen/common/compat/0000775000175000017500000000000012307313555014010 5ustar smbsmbxen-4.4.0/xen/common/compat/Makefile0000664000175000017500000000015712307313555015453 0ustar smbsmbobj-y += domain.o obj-y += kernel.o obj-y += memory.o obj-y += multicall.o obj-y += xlat.o obj-y += tmem_xen.o xen-4.4.0/xen/common/compat/xenoprof.c0000664000175000017500000000144212307313555016015 0ustar smbsmb/* * compat/xenoprof.c */ #include #define COMPAT #define ret_t int #define do_xenoprof_op compat_xenoprof_op #define xen_oprof_init xenoprof_init CHECK_oprof_init; #undef xen_oprof_init #define xenoprof_get_buffer compat_oprof_get_buffer #define xenoprof_op_get_buffer compat_oprof_op_get_buffer #define xenoprof_arch_counter compat_oprof_arch_counter #define xen_domid_t domid_t #define compat_domid_t domid_compat_t CHECK_TYPE(domid); #undef compat_domid_t #undef xen_domid_t #define xen_oprof_passive xenoprof_passive CHECK_oprof_passive; #undef xen_oprof_passive #define xenoprof_counter compat_oprof_counter #include "../xenoprof.c" /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/compat/domain.c0000664000175000017500000000564612307313555015436 0ustar smbsmb/****************************************************************************** * domain.c * */ #include #include #include #include #include #include #include #define xen_vcpu_set_periodic_timer vcpu_set_periodic_timer CHECK_vcpu_set_periodic_timer; #undef xen_vcpu_set_periodic_timer #define xen_vcpu_info vcpu_info CHECK_SIZE_(struct, vcpu_info); #undef xen_vcpu_info #define xen_vcpu_register_vcpu_info vcpu_register_vcpu_info CHECK_vcpu_register_vcpu_info; #undef xen_vcpu_register_vcpu_info int compat_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) { struct domain *d = current->domain; struct vcpu *v; int rc = 0; if ( (vcpuid < 0) || (vcpuid >= MAX_VIRT_CPUS) ) return -EINVAL; if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) return -ENOENT; switch ( cmd ) { case VCPUOP_initialise: { struct compat_vcpu_guest_context *cmp_ctxt; if ( (cmp_ctxt = xmalloc(struct compat_vcpu_guest_context)) == NULL ) { rc = -ENOMEM; break; } if ( copy_from_guest(cmp_ctxt, arg, 1) ) { xfree(cmp_ctxt); rc = -EFAULT; break; } domain_lock(d); rc = v->is_initialised ? 
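/*
 * Illustrative sketch, hypothetical file "compat/foo.c": the pattern used by
 * compat/xenoprof.c above (and by compat/schedule.c and compat/kernel.c
 * below).  The entry point and the guest-visible type names are re-#defined
 * to their compat_* variants, the generated CHECK_* macros assert that the
 * 32-bit and native layouts agree, and the native implementation is then
 * #included so only one copy of the logic exists.  "foo", "do_foo_op" and
 * "foo_arg" are placeholders, not real Xen symbols.
 */
#define COMPAT
#define ret_t int
#define do_foo_op compat_foo_op   /* hypothetical hypercall handler */
#define xen_foo_arg foo_arg       /* hypothetical guest ABI structure */
CHECK_foo_arg;                    /* hypothetical generated layout check */
#undef xen_foo_arg
#include "../foo.c"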
-EEXIST : arch_set_info_guest(v, cmp_ctxt); domain_unlock(d); if ( rc == -EAGAIN ) rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", cmd, vcpuid, arg); xfree(cmp_ctxt); break; } case VCPUOP_up: case VCPUOP_down: case VCPUOP_is_up: case VCPUOP_set_periodic_timer: case VCPUOP_stop_periodic_timer: case VCPUOP_stop_singleshot_timer: case VCPUOP_register_vcpu_info: case VCPUOP_send_nmi: rc = do_vcpu_op(cmd, vcpuid, arg); break; case VCPUOP_get_runstate_info: { union { struct vcpu_runstate_info nat; struct compat_vcpu_runstate_info cmp; } runstate; vcpu_runstate_get(v, &runstate.nat); xlat_vcpu_runstate_info(&runstate.nat); if ( copy_to_guest(arg, &runstate.cmp, 1) ) rc = -EFAULT; break; } case VCPUOP_set_singleshot_timer: { struct compat_vcpu_set_singleshot_timer cmp; struct vcpu_set_singleshot_timer *nat; if ( copy_from_guest(&cmp, arg, 1) ) return -EFAULT; nat = COMPAT_ARG_XLAT_VIRT_BASE; XLAT_vcpu_set_singleshot_timer(nat, &cmp); rc = do_vcpu_op(cmd, vcpuid, guest_handle_from_ptr(nat, void)); break; } default: rc = arch_compat_vcpu_op(cmd, v, arg); break; } return rc; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/compat/schedule.c0000664000175000017500000000200112307313555015741 0ustar smbsmb/**************************************************************************** * schedule.c * */ #include #define COMPAT #define ret_t int #define do_sched_op compat_sched_op #define xen_sched_shutdown sched_shutdown CHECK_sched_shutdown; #undef xen_sched_shutdown #define xen_sched_remote_shutdown sched_remote_shutdown CHECK_sched_remote_shutdown; #undef xen_sched_remote_shutdown static int compat_poll(struct compat_sched_poll *compat) { struct sched_poll native; #define XLAT_sched_poll_HNDL_ports(_d_, _s_) \ guest_from_compat_handle((_d_)->ports, (_s_)->ports) XLAT_sched_poll(&native, compat); #undef XLAT_sched_poll_HNDL_ports return do_poll(&native); } #define do_poll compat_poll #define sched_poll compat_sched_poll #include "../schedule.c" int compat_set_timer_op(u32 lo, s32 hi) { return do_set_timer_op(((s64)hi << 32) | lo); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/compat/tmem_xen.c0000664000175000017500000000073312307313555015773 0ustar smbsmb/****************************************************************************** * tmem_xen.c * */ #include #include #include #include #include #include #include #define xen_tmem_op tmem_op /*CHECK_tmem_op;*/ #undef xen_tmem_op /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/compat/kernel.c0000664000175000017500000000257012307313555015440 0ustar smbsmb/****************************************************************************** * kernel.c */ #include #include #include #include #include #include #include #include #include #include #include #include extern xen_commandline_t saved_cmdline; #define xen_extraversion compat_extraversion #define xen_extraversion_t compat_extraversion_t #define xen_compile_info compat_compile_info #define xen_compile_info_t compat_compile_info_t CHECK_TYPE(capabilities_info); #define xen_platform_parameters compat_platform_parameters #define xen_platform_parameters_t compat_platform_parameters_t #undef HYPERVISOR_VIRT_START #define HYPERVISOR_VIRT_START HYPERVISOR_COMPAT_VIRT_START(current->domain) #define xen_changeset_info 
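/*
 * Worked example, illustrative only: compat_set_timer_op() above rebuilds the
 * 64-bit deadline from its two 32-bit arguments.  With lo = 0x89abcdef and
 * hi = 0x01234567:
 *
 *     ((s64)hi << 32) | lo  ==  0x0123456789abcdef
 *
 * lo is unsigned, so it cannot sign-extend into the upper half; the sign of
 * the combined value is carried entirely by hi.
 */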
compat_changeset_info #define xen_changeset_info_t compat_changeset_info_t #define xen_feature_info compat_feature_info #define xen_feature_info_t compat_feature_info_t CHECK_TYPE(domain_handle); #define xennmi_callback compat_nmi_callback #define xennmi_callback_t compat_nmi_callback_t #define DO(fn) int compat_##fn #define COMPAT #include "../kernel.c" /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/compat/xlat.c0000664000175000017500000000364612307313555015135 0ustar smbsmb/****************************************************************************** * xlat.c */ #include #include #include #include #include /* In-place translation functons: */ void xlat_start_info(struct start_info *native, enum XLAT_start_info_console console) { struct compat_start_info *compat = (void *)native; BUILD_BUG_ON(sizeof(*native) < sizeof(*compat)); XLAT_start_info(compat, native); } void xlat_vcpu_runstate_info(struct vcpu_runstate_info *native) { struct compat_vcpu_runstate_info *compat = (void *)native; BUILD_BUG_ON(sizeof(*native) < sizeof(*compat)); XLAT_vcpu_runstate_info(compat, native); } #define xen_dom0_vga_console_info dom0_vga_console_info CHECK_dom0_vga_console_info; #undef dom0_vga_console_info #define xen_evtchn_alloc_unbound evtchn_alloc_unbound #define xen_evtchn_bind_interdomain evtchn_bind_interdomain #define xen_evtchn_bind_ipi evtchn_bind_ipi #define xen_evtchn_bind_pirq evtchn_bind_pirq #define xen_evtchn_bind_vcpu evtchn_bind_vcpu #define xen_evtchn_bind_virq evtchn_bind_virq #define xen_evtchn_close evtchn_close #define xen_evtchn_op evtchn_op #define xen_evtchn_send evtchn_send #define xen_evtchn_status evtchn_status #define xen_evtchn_unmask evtchn_unmask CHECK_evtchn_op; #undef xen_evtchn_alloc_unbound #undef xen_evtchn_bind_interdomain #undef xen_evtchn_bind_ipi #undef xen_evtchn_bind_pirq #undef xen_evtchn_bind_vcpu #undef xen_evtchn_bind_virq #undef xen_evtchn_close #undef xen_evtchn_op #undef xen_evtchn_send #undef xen_evtchn_status #undef xen_evtchn_unmask #define xen_mmu_update mmu_update CHECK_mmu_update; #undef xen_mmu_update #define xen_vcpu_time_info vcpu_time_info CHECK_vcpu_time_info; #undef xen_vcpu_time_info /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/compat/memory.c0000664000175000017500000003446012307313555015473 0ustar smbsmb#include #include #include #include #include #include #include #include #define xen_domid_t domid_t #define compat_domid_t domid_compat_t CHECK_TYPE(domid); #undef compat_domid_t #undef xen_domid_t int compat_memory_op(unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) compat) { int split, op = cmd & MEMOP_CMD_MASK; long rc; unsigned int start_extent = cmd >> MEMOP_EXTENT_SHIFT; do { unsigned int i, end_extent = 0; union { XEN_GUEST_HANDLE_PARAM(void) hnd; struct xen_memory_reservation *rsrv; struct xen_memory_exchange *xchg; struct xen_add_to_physmap *atp; struct xen_add_to_physmap_batch *atpb; struct xen_remove_from_physmap *xrfp; } nat; union { struct compat_memory_reservation rsrv; struct compat_memory_exchange xchg; struct compat_add_to_physmap atp; struct compat_add_to_physmap_batch atpb; } cmp; set_xen_guest_handle(nat.hnd, COMPAT_ARG_XLAT_VIRT_BASE); split = 0; switch ( op ) { xen_pfn_t *space; case XENMEM_increase_reservation: case XENMEM_decrease_reservation: case XENMEM_populate_physmap: if ( copy_from_guest(&cmp.rsrv, compat, 1) ) return 
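/*
 * Illustrative sketch, hypothetical translation helper: the in-place pattern
 * used by xlat_start_info() and xlat_vcpu_runstate_info() above.  It is only
 * safe because the compat layout is never larger than the native one, which
 * the BUILD_BUG_ON turns into a compile-time check.  "example_info" and
 * "XLAT_example_info" are placeholders for a real guest ABI structure and
 * its generated translation macro.
 */
void xlat_example_info(struct example_info *native)
{
    struct compat_example_info *compat = (void *)native;

    BUILD_BUG_ON(sizeof(*native) < sizeof(*compat));
    XLAT_example_info(compat, native);
}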
start_extent; /* Is size too large for us to encode a continuation? */ if ( cmp.rsrv.nr_extents > (UINT_MAX >> MEMOP_EXTENT_SHIFT) ) return start_extent; if ( !compat_handle_is_null(cmp.rsrv.extent_start) && !compat_handle_okay(cmp.rsrv.extent_start, cmp.rsrv.nr_extents) ) return start_extent; end_extent = start_extent + (COMPAT_ARG_XLAT_SIZE - sizeof(*nat.rsrv)) / sizeof(*space); if ( end_extent > cmp.rsrv.nr_extents ) end_extent = cmp.rsrv.nr_extents; space = (xen_pfn_t *)(nat.rsrv + 1); #define XLAT_memory_reservation_HNDL_extent_start(_d_, _s_) \ do \ { \ if ( !compat_handle_is_null((_s_)->extent_start) ) \ { \ set_xen_guest_handle((_d_)->extent_start, space - start_extent); \ if ( op != XENMEM_increase_reservation ) \ { \ for ( i = start_extent; i < end_extent; ++i ) \ { \ compat_pfn_t pfn; \ if ( __copy_from_compat_offset(&pfn, (_s_)->extent_start, i, 1) ) \ { \ end_extent = i; \ split = -1; \ break; \ } \ *space++ = pfn; \ } \ } \ } \ else \ { \ set_xen_guest_handle((_d_)->extent_start, NULL); \ end_extent = cmp.rsrv.nr_extents; \ } \ } while (0) XLAT_memory_reservation(nat.rsrv, &cmp.rsrv); #undef XLAT_memory_reservation_HNDL_extent_start if ( end_extent < cmp.rsrv.nr_extents ) { nat.rsrv->nr_extents = end_extent; ++split; } break; case XENMEM_exchange: { int order_delta; if ( copy_from_guest(&cmp.xchg, compat, 1) ) return -EFAULT; order_delta = cmp.xchg.out.extent_order - cmp.xchg.in.extent_order; /* Various sanity checks. */ if ( (cmp.xchg.nr_exchanged > cmp.xchg.in.nr_extents) || (order_delta > 0 && (cmp.xchg.nr_exchanged & ((1U << order_delta) - 1))) || /* Sizes of input and output lists do not overflow an int? */ ((~0U >> cmp.xchg.in.extent_order) < cmp.xchg.in.nr_extents) || ((~0U >> cmp.xchg.out.extent_order) < cmp.xchg.out.nr_extents) || /* Sizes of input and output lists match? */ ((cmp.xchg.in.nr_extents << cmp.xchg.in.extent_order) != (cmp.xchg.out.nr_extents << cmp.xchg.out.extent_order)) ) return -EINVAL; if ( !compat_handle_okay(cmp.xchg.in.extent_start, cmp.xchg.in.nr_extents) || !compat_handle_okay(cmp.xchg.out.extent_start, cmp.xchg.out.nr_extents) ) return -EFAULT; start_extent = cmp.xchg.nr_exchanged; end_extent = (COMPAT_ARG_XLAT_SIZE - sizeof(*nat.xchg)) / (((1U << ABS(order_delta)) + 1) * sizeof(*space)); if ( end_extent == 0 ) { printk("Cannot translate compatibility mode XENMEM_exchange extents (%u,%u)\n", cmp.xchg.in.extent_order, cmp.xchg.out.extent_order); return -E2BIG; } if ( order_delta > 0 ) end_extent <<= order_delta; end_extent += start_extent; if ( end_extent > cmp.xchg.in.nr_extents ) end_extent = cmp.xchg.in.nr_extents; space = (xen_pfn_t *)(nat.xchg + 1); /* Code below depends upon .in preceding .out. 
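/*
 * Illustrative sketch: how compat_memory_op() folds its continuation state
 * into "cmd".  The low bits keep the sub-operation, the high bits the number
 * of extents already processed, so a preempted hypercall can resume where it
 * stopped:
 *
 *     op           = cmd & MEMOP_CMD_MASK;
 *     start_extent = cmd >> MEMOP_EXTENT_SHIFT;
 */
static inline unsigned int example_encode_memop(unsigned int op,
                                                unsigned int start_extent)
{
    return op | (start_extent << MEMOP_EXTENT_SHIFT);
}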
*/ BUILD_BUG_ON(offsetof(xen_memory_exchange_t, in) > offsetof(xen_memory_exchange_t, out)); #define XLAT_memory_reservation_HNDL_extent_start(_d_, _s_) \ do \ { \ set_xen_guest_handle((_d_)->extent_start, space - start_extent); \ for ( i = start_extent; i < end_extent; ++i ) \ { \ compat_pfn_t pfn; \ if ( __copy_from_compat_offset(&pfn, (_s_)->extent_start, i, 1) ) \ return -EFAULT; \ *space++ = pfn; \ } \ if ( order_delta > 0 ) \ { \ start_extent >>= order_delta; \ end_extent >>= order_delta; \ } \ else \ { \ start_extent <<= -order_delta; \ end_extent <<= -order_delta; \ } \ order_delta = -order_delta; \ } while (0) XLAT_memory_exchange(nat.xchg, &cmp.xchg); #undef XLAT_memory_reservation_HNDL_extent_start if ( end_extent < cmp.xchg.in.nr_extents ) { nat.xchg->in.nr_extents = end_extent; if ( order_delta >= 0 ) nat.xchg->out.nr_extents = end_extent >> order_delta; else nat.xchg->out.nr_extents = end_extent << -order_delta; ++split; } break; } case XENMEM_current_reservation: case XENMEM_maximum_reservation: case XENMEM_maximum_gpfn: case XENMEM_maximum_ram_page: nat.hnd = compat; break; case XENMEM_add_to_physmap: BUILD_BUG_ON((typeof(cmp.atp.size))-1 > (UINT_MAX >> MEMOP_EXTENT_SHIFT)); if ( copy_from_guest(&cmp.atp, compat, 1) ) return -EFAULT; XLAT_add_to_physmap(nat.atp, &cmp.atp); break; case XENMEM_add_to_physmap_batch: { unsigned int limit = (COMPAT_ARG_XLAT_SIZE - sizeof(*nat.atpb)) / (sizeof(nat.atpb->idxs.p) + sizeof(nat.atpb->gpfns.p)); /* Use an intermediate variable to suppress warnings on old gcc: */ unsigned int size = cmp.atpb.size; xen_ulong_t *idxs = (void *)(nat.atpb + 1); xen_pfn_t *gpfns = (void *)(idxs + limit); if ( copy_from_guest(&cmp.atpb, compat, 1) || !compat_handle_okay(cmp.atpb.idxs, size) || !compat_handle_okay(cmp.atpb.gpfns, size) || !compat_handle_okay(cmp.atpb.errs, size) ) return -EFAULT; end_extent = start_extent + limit; if ( end_extent > size ) end_extent = size; idxs -= start_extent; gpfns -= start_extent; for ( i = start_extent; i < end_extent; ++i ) { compat_ulong_t idx; compat_pfn_t gpfn; if ( __copy_from_compat_offset(&idx, cmp.atpb.idxs, i, 1) || __copy_from_compat_offset(&gpfn, cmp.atpb.gpfns, i, 1) ) return -EFAULT; idxs[i] = idx; gpfns[i] = gpfn; } #define XLAT_add_to_physmap_batch_HNDL_idxs(_d_, _s_) \ set_xen_guest_handle((_d_)->idxs, idxs) #define XLAT_add_to_physmap_batch_HNDL_gpfns(_d_, _s_) \ set_xen_guest_handle((_d_)->gpfns, gpfns) #define XLAT_add_to_physmap_batch_HNDL_errs(_d_, _s_) \ guest_from_compat_handle((_d_)->errs, (_s_)->errs) XLAT_add_to_physmap_batch(nat.atpb, &cmp.atpb); #undef XLAT_add_to_physmap_batch_HNDL_errs #undef XLAT_add_to_physmap_batch_HNDL_gpfns #undef XLAT_add_to_physmap_batch_HNDL_idxs if ( end_extent < cmp.atpb.size ) { nat.atpb->size = end_extent; ++split; } break; } case XENMEM_remove_from_physmap: { struct compat_remove_from_physmap cmp; if ( copy_from_guest(&cmp, compat, 1) ) return -EFAULT; XLAT_remove_from_physmap(nat.xrfp, &cmp); break; } default: return compat_arch_memory_op(cmd, compat); } rc = do_memory_op(cmd, nat.hnd); if ( rc < 0 ) break; cmd = 0; if ( hypercall_xlat_continuation(&cmd, 0x02, nat.hnd, compat) ) { BUG_ON(rc != __HYPERVISOR_memory_op); BUG_ON((cmd & MEMOP_CMD_MASK) != op); split = -1; } switch ( op ) { case XENMEM_increase_reservation: case XENMEM_decrease_reservation: case XENMEM_populate_physmap: end_extent = split >= 0 ? 
rc : cmd >> MEMOP_EXTENT_SHIFT; if ( (op != XENMEM_decrease_reservation) && !guest_handle_is_null(nat.rsrv->extent_start) ) { for ( ; start_extent < end_extent; ++start_extent ) { compat_pfn_t pfn = nat.rsrv->extent_start.p[start_extent]; BUG_ON(pfn != nat.rsrv->extent_start.p[start_extent]); if ( __copy_to_compat_offset(cmp.rsrv.extent_start, start_extent, &pfn, 1) ) { if ( split >= 0 ) { rc = start_extent; split = 0; } else /* * Short of being able to cancel the continuation, * force it to restart here; eventually we shall * get out of this state. */ rc = (start_extent << MEMOP_EXTENT_SHIFT) | op; break; } } } else { start_extent = end_extent; } /* Bail if there was an error. */ if ( (split >= 0) && (end_extent != nat.rsrv->nr_extents) ) split = 0; break; case XENMEM_exchange: { DEFINE_XEN_GUEST_HANDLE(compat_memory_exchange_t); int order_delta; BUG_ON(split >= 0 && rc); BUG_ON(end_extent < nat.xchg->nr_exchanged); end_extent = nat.xchg->nr_exchanged; order_delta = cmp.xchg.out.extent_order - cmp.xchg.in.extent_order; if ( order_delta > 0 ) { start_extent >>= order_delta; BUG_ON(end_extent & ((1U << order_delta) - 1)); end_extent >>= order_delta; } else { start_extent <<= -order_delta; end_extent <<= -order_delta; } for ( ; start_extent < end_extent; ++start_extent ) { compat_pfn_t pfn = nat.xchg->out.extent_start.p[start_extent]; BUG_ON(pfn != nat.xchg->out.extent_start.p[start_extent]); if ( __copy_to_compat_offset(cmp.xchg.out.extent_start, start_extent, &pfn, 1) ) { rc = -EFAULT; break; } } cmp.xchg.nr_exchanged = nat.xchg->nr_exchanged; if ( __copy_field_to_guest(guest_handle_cast(compat, compat_memory_exchange_t), &cmp.xchg, nr_exchanged) ) rc = -EFAULT; if ( rc < 0 ) { if ( split < 0 ) /* Cannot cancel the continuation... */ domain_crash(current->domain); return rc; } break; } case XENMEM_add_to_physmap_batch: start_extent = end_extent; break; case XENMEM_maximum_ram_page: case XENMEM_current_reservation: case XENMEM_maximum_reservation: case XENMEM_maximum_gpfn: case XENMEM_add_to_physmap: case XENMEM_remove_from_physmap: break; default: domain_crash(current->domain); split = 0; break; } cmd = op | (start_extent << MEMOP_EXTENT_SHIFT); if ( split > 0 && hypercall_preempt_check() ) return hypercall_create_continuation( __HYPERVISOR_memory_op, "ih", cmd, compat); } while ( split > 0 ); if ( unlikely(rc > INT_MAX) ) return INT_MAX; if ( unlikely(rc < INT_MIN) ) return INT_MIN; return rc; } xen-4.4.0/xen/common/compat/grant_table.c0000664000175000017500000002710712307313555016445 0ustar smbsmb/****************************************************************************** * common/compat/grant_table.c * */ #include #define xen_grant_entry_v1 grant_entry_v1 CHECK_grant_entry_v1; #undef xen_grant_entry_v1 #define xen_grant_entry_header grant_entry_header CHECK_grant_entry_header; #undef xen_grant_entry_header #define xen_grant_entry_v2 grant_entry_v2 CHECK_grant_entry_v2; #undef xen_grant_entry_v2 #define xen_gnttab_map_grant_ref gnttab_map_grant_ref CHECK_gnttab_map_grant_ref; #undef xen_gnttab_map_grant_ref #define xen_gnttab_unmap_grant_ref gnttab_unmap_grant_ref CHECK_gnttab_unmap_grant_ref; #undef xen_gnttab_unmap_grant_ref #define xen_gnttab_unmap_and_replace gnttab_unmap_and_replace CHECK_gnttab_unmap_and_replace; #undef xen_gnttab_unmap_and_replace DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_compat_t); DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_compat_t); DEFINE_XEN_GUEST_HANDLE(gnttab_copy_compat_t); #define xen_gnttab_dump_table gnttab_dump_table CHECK_gnttab_dump_table; #undef 
xen_gnttab_dump_table #define xen_gnttab_set_version gnttab_set_version CHECK_gnttab_set_version; #undef xen_gnttab_set_version DEFINE_XEN_GUEST_HANDLE(gnttab_get_status_frames_compat_t); #define xen_gnttab_get_version gnttab_get_version CHECK_gnttab_get_version; #undef xen_gnttab_get_version #define xen_gnttab_swap_grant_ref gnttab_swap_grant_ref CHECK_gnttab_swap_grant_ref; #undef xen_gnttab_swap_grant_ref int compat_grant_table_op(unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) cmp_uop, unsigned int count) { int rc = 0; unsigned int i; XEN_GUEST_HANDLE_PARAM(void) cnt_uop; set_xen_guest_handle(cnt_uop, NULL); switch ( cmd ) { #define CASE(name) \ case GNTTABOP_##name: \ if ( unlikely(!guest_handle_okay(guest_handle_cast(cmp_uop, \ gnttab_##name##_compat_t), \ count)) ) \ rc = -EFAULT; \ break #ifndef CHECK_gnttab_map_grant_ref CASE(map_grant_ref); #endif #ifndef CHECK_gnttab_unmap_grant_ref CASE(unmap_grant_ref); #endif #ifndef CHECK_gnttab_unmap_and_replace CASE(unmap_and_replace); #endif #ifndef CHECK_gnttab_setup_table CASE(setup_table); #endif #ifndef CHECK_gnttab_transfer CASE(transfer); #endif #ifndef CHECK_gnttab_copy CASE(copy); #endif #ifndef CHECK_gnttab_dump_table CASE(dump_table); #endif #ifndef CHECK_gnttab_get_status_frames CASE(get_status_frames); #endif #ifndef CHECK_gnttab_swap_grant_ref CASE(swap_grant_ref); #endif #undef CASE default: return do_grant_table_op(cmd, cmp_uop, count); } if ( (int)count < 0 ) rc = -EINVAL; for ( i = 0; i < count && rc == 0; ) { unsigned int n; union { XEN_GUEST_HANDLE(void) uop; struct gnttab_setup_table *setup; struct gnttab_transfer *xfer; struct gnttab_copy *copy; struct gnttab_get_status_frames *get_status; } nat; union { struct compat_gnttab_setup_table setup; struct compat_gnttab_transfer xfer; struct compat_gnttab_copy copy; struct compat_gnttab_get_status_frames get_status; } cmp; set_xen_guest_handle(nat.uop, COMPAT_ARG_XLAT_VIRT_BASE); switch ( cmd ) { case GNTTABOP_setup_table: if ( unlikely(count > 1) ) rc = -EINVAL; else if ( unlikely(__copy_from_guest(&cmp.setup, cmp_uop, 1)) ) rc = -EFAULT; else if ( unlikely(!compat_handle_okay(cmp.setup.frame_list, cmp.setup.nr_frames)) ) rc = -EFAULT; else { unsigned int max_frame_list_size_in_page = (COMPAT_ARG_XLAT_SIZE - sizeof(*nat.setup)) / sizeof(*nat.setup->frame_list.p); if ( max_frame_list_size_in_page < max_nr_grant_frames ) { gdprintk(XENLOG_WARNING, "max_nr_grant_frames is too large (%u,%u)\n", max_nr_grant_frames, max_frame_list_size_in_page); rc = -EINVAL; } else { #define XLAT_gnttab_setup_table_HNDL_frame_list(_d_, _s_) \ set_xen_guest_handle((_d_)->frame_list, (unsigned long *)(nat.setup + 1)) XLAT_gnttab_setup_table(nat.setup, &cmp.setup); #undef XLAT_gnttab_setup_table_HNDL_frame_list rc = gnttab_setup_table(guest_handle_cast(nat.uop, gnttab_setup_table_t), 1); } } ASSERT(rc <= 0); if ( rc == 0 ) { #define XLAT_gnttab_setup_table_HNDL_frame_list(_d_, _s_) \ do \ { \ if ( (_s_)->status == GNTST_okay ) \ { \ for ( i = 0; i < (_s_)->nr_frames; ++i ) \ { \ unsigned int frame = (_s_)->frame_list.p[i]; \ if ( __copy_to_compat_offset((_d_)->frame_list, \ i, &frame, 1) ) \ (_s_)->status = GNTST_bad_virt_addr; \ } \ } \ } while (0) XLAT_gnttab_setup_table(&cmp.setup, nat.setup); #undef XLAT_gnttab_setup_table_HNDL_frame_list if ( unlikely(__copy_to_guest(cmp_uop, &cmp.setup, 1)) ) rc = -EFAULT; else i = 1; } break; case GNTTABOP_transfer: for ( n = 0; n < COMPAT_ARG_XLAT_SIZE / sizeof(*nat.xfer) && i < count && rc == 0; ++i, ++n ) { if ( 
unlikely(__copy_from_guest_offset(&cmp.xfer, cmp_uop, i, 1)) ) rc = -EFAULT; else { XLAT_gnttab_transfer(nat.xfer + n, &cmp.xfer); } } if ( rc == 0 ) rc = gnttab_transfer(guest_handle_cast(nat.uop, gnttab_transfer_t), n); if ( rc > 0 ) { ASSERT(rc < n); i -= n - rc; n = rc; } if ( rc >= 0 ) { XEN_GUEST_HANDLE_PARAM(gnttab_transfer_compat_t) xfer; xfer = guest_handle_cast(cmp_uop, gnttab_transfer_compat_t); guest_handle_add_offset(xfer, i); cnt_uop = guest_handle_cast(xfer, void); while ( n-- ) { guest_handle_add_offset(xfer, -1); if ( __copy_field_to_guest(xfer, nat.xfer + n, status) ) rc = -EFAULT; } } break; case GNTTABOP_copy: for ( n = 0; n < COMPAT_ARG_XLAT_SIZE / sizeof(*nat.copy) && i < count && rc == 0; ++i, ++n ) { if ( unlikely(__copy_from_guest_offset(&cmp.copy, cmp_uop, i, 1)) ) rc = -EFAULT; else { enum XLAT_gnttab_copy_source_u source_u; enum XLAT_gnttab_copy_dest_u dest_u; if ( cmp.copy.flags & GNTCOPY_source_gref ) source_u = XLAT_gnttab_copy_source_u_ref; else source_u = XLAT_gnttab_copy_source_u_gmfn; if ( cmp.copy.flags & GNTCOPY_dest_gref ) dest_u = XLAT_gnttab_copy_dest_u_ref; else dest_u = XLAT_gnttab_copy_dest_u_gmfn; XLAT_gnttab_copy(nat.copy + n, &cmp.copy); } } if ( rc == 0 ) rc = gnttab_copy(guest_handle_cast(nat.uop, gnttab_copy_t), n); if ( rc > 0 ) { ASSERT(rc < n); i -= n - rc; n = rc; } if ( rc >= 0 ) { XEN_GUEST_HANDLE_PARAM(gnttab_copy_compat_t) copy; copy = guest_handle_cast(cmp_uop, gnttab_copy_compat_t); guest_handle_add_offset(copy, i); cnt_uop = guest_handle_cast(copy, void); while ( n-- ) { guest_handle_add_offset(copy, -1); if ( __copy_field_to_guest(copy, nat.copy + n, status) ) rc = -EFAULT; } } break; case GNTTABOP_get_status_frames: { unsigned int max_frame_list_size_in_pages = (COMPAT_ARG_XLAT_SIZE - sizeof(*nat.get_status)) / sizeof(*nat.get_status->frame_list.p); if ( count != 1) { rc = -EINVAL; break; } if ( unlikely(__copy_from_guest(&cmp.get_status, cmp_uop, 1) || !compat_handle_okay(cmp.get_status.frame_list, cmp.get_status.nr_frames)) ) { rc = -EFAULT; break; } if ( max_frame_list_size_in_pages < grant_to_status_frames(max_nr_grant_frames) ) { gdprintk(XENLOG_WARNING, "grant_to_status_frames(max_nr_grant_frames) is too large (%u,%u)\n", grant_to_status_frames(max_nr_grant_frames), max_frame_list_size_in_pages); rc = -EINVAL; break; } #define XLAT_gnttab_get_status_frames_HNDL_frame_list(_d_, _s_) \ set_xen_guest_handle((_d_)->frame_list, (uint64_t *)(nat.get_status + 1)) XLAT_gnttab_get_status_frames(nat.get_status, &cmp.get_status); #undef XLAT_gnttab_get_status_frames_HNDL_frame_list rc = gnttab_get_status_frames( guest_handle_cast(nat.uop, gnttab_get_status_frames_t), count); if ( rc >= 0 ) { #define XLAT_gnttab_get_status_frames_HNDL_frame_list(_d_, _s_) \ do \ { \ if ( (_s_)->status == GNTST_okay ) \ { \ for ( i = 0; i < (_s_)->nr_frames; ++i ) \ { \ uint64_t frame = (_s_)->frame_list.p[i]; \ if ( __copy_to_compat_offset((_d_)->frame_list, \ i, &frame, 1) ) \ (_s_)->status = GNTST_bad_virt_addr; \ } \ } \ } while (0) XLAT_gnttab_get_status_frames(&cmp.get_status, nat.get_status); #undef XLAT_gnttab_get_status_frames_HNDL_frame_list if ( unlikely(__copy_to_guest(cmp_uop, &cmp.get_status, 1)) ) rc = -EFAULT; else i = 1; } break; } default: domain_crash(current->domain); break; } } if ( rc > 0 ) { ASSERT(i < count); ASSERT(!guest_handle_is_null(cnt_uop)); rc = hypercall_create_continuation(__HYPERVISOR_grant_table_op, "ihi", cmd, cnt_uop, count - i); } return rc; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 
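/*
 * Illustrative sketch: the chunking rule used by the GNTTABOP_transfer and
 * GNTTABOP_copy paths above.  Each pass translates at most as many guest
 * entries as fit in the per-vCPU translation area, then either loops or
 * hands back a continuation for the untranslated remainder.
 */
static inline unsigned int example_batch_limit(size_t native_entry_size)
{
    return COMPAT_ARG_XLAT_SIZE / native_entry_size;
}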
4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/compat/multicall.c0000664000175000017500000000231512307313555016143 0ustar smbsmb/****************************************************************************** * multicall.c */ #include #include #include #include #define COMPAT typedef int ret_t; #undef do_multicall_call static inline void xlat_multicall_entry(struct mc_state *mcs) { int i; for (i=0; i<6; i++) mcs->compat_call.args[i] = mcs->call.args[i]; } DEFINE_XEN_GUEST_HANDLE(multicall_entry_compat_t); #define multicall_entry compat_multicall_entry #define multicall_entry_t multicall_entry_compat_t #define do_multicall_call compat_multicall_call #define call compat_call #define do_multicall(l, n) compat_multicall(_##l, n) #define _XEN_GUEST_HANDLE(t) XEN_GUEST_HANDLE(t) #define _XEN_GUEST_HANDLE_PARAM(t) XEN_GUEST_HANDLE(t) static void __trace_multicall_call(multicall_entry_t *call) { unsigned long args[6]; int i; for ( i = 0; i < ARRAY_SIZE(args); i++ ) args[i] = call->args[i]; __trace_hypercall(TRC_PV_HYPERCALL_SUBCALL, call->op, args); } #include "../multicall.c" /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/domain.c0000664000175000017500000007530512307313555014152 0ustar smbsmb/****************************************************************************** * domain.c * * Generic domain-handling functions. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Linux config option: propageted to domain0 */ /* xen_processor_pmbits: xen control Cx, Px, ... */ unsigned int xen_processor_pmbits = XEN_PROCESSOR_PM_PX; /* opt_dom0_vcpus_pin: If true, dom0 VCPUs are pinned. */ bool_t opt_dom0_vcpus_pin; boolean_param("dom0_vcpus_pin", opt_dom0_vcpus_pin); /* Protect updates/reads (resp.) of domain_list and domain_hash. 
*/ DEFINE_SPINLOCK(domlist_update_lock); DEFINE_RCU_READ_LOCK(domlist_read_lock); #define DOMAIN_HASH_SIZE 256 #define DOMAIN_HASH(_id) ((int)(_id)&(DOMAIN_HASH_SIZE-1)) static struct domain *domain_hash[DOMAIN_HASH_SIZE]; struct domain *domain_list; struct domain *dom0; struct vcpu *idle_vcpu[NR_CPUS] __read_mostly; vcpu_info_t dummy_vcpu_info; int current_domain_id(void) { return current->domain->domain_id; } static void __domain_finalise_shutdown(struct domain *d) { struct vcpu *v; BUG_ON(!spin_is_locked(&d->shutdown_lock)); if ( d->is_shut_down ) return; for_each_vcpu ( d, v ) if ( !v->paused_for_shutdown ) return; d->is_shut_down = 1; if ( (d->shutdown_code == SHUTDOWN_suspend) && d->suspend_evtchn ) evtchn_send(d, d->suspend_evtchn); else send_global_virq(VIRQ_DOM_EXC); } static void vcpu_check_shutdown(struct vcpu *v) { struct domain *d = v->domain; spin_lock(&d->shutdown_lock); if ( d->is_shutting_down ) { if ( !v->paused_for_shutdown ) vcpu_pause_nosync(v); v->paused_for_shutdown = 1; v->defer_shutdown = 0; __domain_finalise_shutdown(d); } spin_unlock(&d->shutdown_lock); } struct vcpu *alloc_vcpu( struct domain *d, unsigned int vcpu_id, unsigned int cpu_id) { struct vcpu *v; BUG_ON((!is_idle_domain(d) || vcpu_id) && d->vcpu[vcpu_id]); if ( (v = alloc_vcpu_struct()) == NULL ) return NULL; v->domain = d; v->vcpu_id = vcpu_id; spin_lock_init(&v->virq_lock); tasklet_init(&v->continue_hypercall_tasklet, NULL, 0); if ( !zalloc_cpumask_var(&v->cpu_affinity) || !zalloc_cpumask_var(&v->cpu_affinity_tmp) || !zalloc_cpumask_var(&v->cpu_affinity_saved) || !zalloc_cpumask_var(&v->vcpu_dirty_cpumask) ) goto fail_free; if ( is_idle_domain(d) ) { v->runstate.state = RUNSTATE_running; } else { v->runstate.state = RUNSTATE_offline; v->runstate.state_entry_time = NOW(); set_bit(_VPF_down, &v->pause_flags); v->vcpu_info = ((vcpu_id < XEN_LEGACY_MAX_VCPUS) ? (vcpu_info_t *)&shared_info(d, vcpu_info[vcpu_id]) : &dummy_vcpu_info); v->vcpu_info_mfn = INVALID_MFN; init_waitqueue_vcpu(v); } if ( sched_init_vcpu(v, cpu_id) != 0 ) goto fail_wq; if ( vcpu_initialise(v) != 0 ) { sched_destroy_vcpu(v); fail_wq: destroy_waitqueue_vcpu(v); fail_free: free_cpumask_var(v->cpu_affinity); free_cpumask_var(v->cpu_affinity_tmp); free_cpumask_var(v->cpu_affinity_saved); free_cpumask_var(v->vcpu_dirty_cpumask); free_vcpu_struct(v); return NULL; } d->vcpu[vcpu_id] = v; if ( vcpu_id != 0 ) { int prev_id = v->vcpu_id - 1; while ( (prev_id >= 0) && (d->vcpu[prev_id] == NULL) ) prev_id--; BUG_ON(prev_id < 0); v->next_in_list = d->vcpu[prev_id]->next_in_list; d->vcpu[prev_id]->next_in_list = v; } /* Must be called after making new vcpu visible to for_each_vcpu(). 
*/ vcpu_check_shutdown(v); domain_update_node_affinity(d); return v; } static unsigned int __read_mostly extra_dom0_irqs = 256; static unsigned int __read_mostly extra_domU_irqs = 32; static void __init parse_extra_guest_irqs(const char *s) { if ( isdigit(*s) ) extra_domU_irqs = simple_strtoul(s, &s, 0); if ( *s == ',' && isdigit(*++s) ) extra_dom0_irqs = simple_strtoul(s, &s, 0); } custom_param("extra_guest_irqs", parse_extra_guest_irqs); struct domain *domain_create( domid_t domid, unsigned int domcr_flags, uint32_t ssidref) { struct domain *d, **pd; enum { INIT_xsm = 1u<<0, INIT_watchdog = 1u<<1, INIT_rangeset = 1u<<2, INIT_evtchn = 1u<<3, INIT_gnttab = 1u<<4, INIT_arch = 1u<<5 }; int err, init_status = 0; int poolid = CPUPOOLID_NONE; if ( (d = alloc_domain_struct()) == NULL ) return ERR_PTR(-ENOMEM); d->domain_id = domid; lock_profile_register_struct(LOCKPROF_TYPE_PERDOM, d, domid, "Domain"); if ( (err = xsm_alloc_security_domain(d)) != 0 ) goto fail; init_status |= INIT_xsm; watchdog_domain_init(d); init_status |= INIT_watchdog; atomic_set(&d->refcnt, 1); spin_lock_init_prof(d, domain_lock); spin_lock_init_prof(d, page_alloc_lock); spin_lock_init(&d->hypercall_deadlock_mutex); INIT_PAGE_LIST_HEAD(&d->page_list); INIT_PAGE_LIST_HEAD(&d->xenpage_list); spin_lock_init(&d->node_affinity_lock); d->node_affinity = NODE_MASK_ALL; d->auto_node_affinity = 1; spin_lock_init(&d->shutdown_lock); d->shutdown_code = -1; spin_lock_init(&d->pbuf_lock); err = -ENOMEM; if ( !zalloc_cpumask_var(&d->domain_dirty_cpumask) ) goto fail; if ( domcr_flags & DOMCRF_hvm ) d->guest_type = guest_type_hvm; else if ( domcr_flags & DOMCRF_pvh ) d->guest_type = guest_type_pvh; if ( domid == 0 ) { d->is_pinned = opt_dom0_vcpus_pin; d->disable_migrate = 1; } rangeset_domain_initialise(d); init_status |= INIT_rangeset; d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex); d->irq_caps = rangeset_new(d, "Interrupts", 0); if ( (d->iomem_caps == NULL) || (d->irq_caps == NULL) ) goto fail; if ( domcr_flags & DOMCRF_dummy ) return d; if ( !is_idle_domain(d) ) { if ( (err = xsm_domain_create(XSM_HOOK, d, ssidref)) != 0 ) goto fail; d->is_paused_by_controller = 1; atomic_inc(&d->pause_count); if ( domid ) d->nr_pirqs = nr_static_irqs + extra_domU_irqs; else d->nr_pirqs = nr_static_irqs + extra_dom0_irqs; if ( d->nr_pirqs > nr_irqs ) d->nr_pirqs = nr_irqs; radix_tree_init(&d->pirq_tree); if ( (err = evtchn_init(d)) != 0 ) goto fail; init_status |= INIT_evtchn; if ( (err = grant_table_create(d)) != 0 ) goto fail; init_status |= INIT_gnttab; poolid = 0; err = -ENOMEM; d->mem_event = xzalloc(struct mem_event_per_domain); if ( !d->mem_event ) goto fail; d->pbuf = xzalloc_array(char, DOMAIN_PBUF_SIZE); if ( !d->pbuf ) goto fail; } if ( (err = arch_domain_create(d, domcr_flags)) != 0 ) goto fail; init_status |= INIT_arch; if ( (err = cpupool_add_domain(d, poolid)) != 0 ) goto fail; if ( (err = sched_init_domain(d)) != 0 ) goto fail; if ( !is_idle_domain(d) ) { spin_lock(&domlist_update_lock); pd = &domain_list; /* NB. domain_list maintained in order of domid. 
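/*
 * Illustrative sketch, hypothetical caller: alloc_vcpu() above links each new
 * vcpu behind its highest-numbered existing predecessor and BUG()s if no
 * lower-numbered vcpu exists yet, so callers are expected to populate the
 * vcpu array in ascending vcpu_id order.  The pCPU choice below is arbitrary.
 */
static int example_populate_vcpus(struct domain *d, unsigned int nr)
{
    unsigned int i;

    for ( i = 0; i < nr; i++ )
        if ( alloc_vcpu(d, i, i % num_online_cpus()) == NULL )
            return -ENOMEM;

    return 0;
}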
*/ for ( pd = &domain_list; *pd != NULL; pd = &(*pd)->next_in_list ) if ( (*pd)->domain_id > d->domain_id ) break; d->next_in_list = *pd; d->next_in_hashbucket = domain_hash[DOMAIN_HASH(domid)]; rcu_assign_pointer(*pd, d); rcu_assign_pointer(domain_hash[DOMAIN_HASH(domid)], d); spin_unlock(&domlist_update_lock); } return d; fail: d->is_dying = DOMDYING_dead; atomic_set(&d->refcnt, DOMAIN_DESTROYED); xfree(d->mem_event); xfree(d->pbuf); if ( init_status & INIT_arch ) arch_domain_destroy(d); if ( init_status & INIT_gnttab ) grant_table_destroy(d); if ( init_status & INIT_evtchn ) { evtchn_destroy(d); evtchn_destroy_final(d); radix_tree_destroy(&d->pirq_tree, free_pirq_struct); } if ( init_status & INIT_rangeset ) rangeset_domain_destroy(d); if ( init_status & INIT_watchdog ) watchdog_domain_destroy(d); if ( init_status & INIT_xsm ) xsm_free_security_domain(d); free_cpumask_var(d->domain_dirty_cpumask); free_domain_struct(d); return ERR_PTR(err); } void domain_update_node_affinity(struct domain *d) { cpumask_var_t cpumask; cpumask_var_t online_affinity; const cpumask_t *online; struct vcpu *v; unsigned int node; if ( !zalloc_cpumask_var(&cpumask) ) return; if ( !alloc_cpumask_var(&online_affinity) ) { free_cpumask_var(cpumask); return; } online = cpupool_online_cpumask(d->cpupool); spin_lock(&d->node_affinity_lock); for_each_vcpu ( d, v ) { cpumask_and(online_affinity, v->cpu_affinity, online); cpumask_or(cpumask, cpumask, online_affinity); } /* * If d->auto_node_affinity is true, the domain's node-affinity mask * (d->node_affinity) is automaically computed from all the domain's * vcpus' vcpu-affinity masks (the union of which we have just built * above in cpumask). OTOH, if d->auto_node_affinity is false, we * must leave the node-affinity of the domain alone. */ if ( d->auto_node_affinity ) { nodes_clear(d->node_affinity); for_each_online_node ( node ) if ( cpumask_intersects(&node_to_cpumask(node), cpumask) ) node_set(node, d->node_affinity); } sched_set_node_affinity(d, &d->node_affinity); spin_unlock(&d->node_affinity_lock); free_cpumask_var(online_affinity); free_cpumask_var(cpumask); } int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity) { /* Being affine with no nodes is just wrong */ if ( nodes_empty(*affinity) ) return -EINVAL; spin_lock(&d->node_affinity_lock); /* * Being/becoming explicitly affine to all nodes is not particularly * useful. Let's take it as the `reset node affinity` command. 
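/*
 * Illustrative sketch, hypothetical caller: domain_create() above reports
 * failure through the ERR_PTR() convention rather than by returning NULL,
 * so callers are expected to test the result with IS_ERR()/PTR_ERR()
 * (assumed to come from xen/err.h).
 */
static long example_create_domain(domid_t domid, uint32_t ssidref)
{
    struct domain *d = domain_create(domid, 0, ssidref);

    if ( IS_ERR(d) )
        return PTR_ERR(d);

    /* ... further configuration of the new domain ... */
    return 0;
}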
*/ if ( nodes_full(*affinity) ) { d->auto_node_affinity = 1; goto out; } d->auto_node_affinity = 0; d->node_affinity = *affinity; out: spin_unlock(&d->node_affinity_lock); domain_update_node_affinity(d); return 0; } struct domain *get_domain_by_id(domid_t dom) { struct domain *d; rcu_read_lock(&domlist_read_lock); for ( d = rcu_dereference(domain_hash[DOMAIN_HASH(dom)]); d != NULL; d = rcu_dereference(d->next_in_hashbucket) ) { if ( d->domain_id == dom ) { if ( unlikely(!get_domain(d)) ) d = NULL; break; } } rcu_read_unlock(&domlist_read_lock); return d; } struct domain *rcu_lock_domain_by_id(domid_t dom) { struct domain *d = NULL; rcu_read_lock(&domlist_read_lock); for ( d = rcu_dereference(domain_hash[DOMAIN_HASH(dom)]); d != NULL; d = rcu_dereference(d->next_in_hashbucket) ) { if ( d->domain_id == dom ) { rcu_lock_domain(d); break; } } rcu_read_unlock(&domlist_read_lock); return d; } struct domain *rcu_lock_domain_by_any_id(domid_t dom) { if ( dom == DOMID_SELF ) return rcu_lock_current_domain(); return rcu_lock_domain_by_id(dom); } int rcu_lock_remote_domain_by_id(domid_t dom, struct domain **d) { if ( (*d = rcu_lock_domain_by_id(dom)) == NULL ) return -ESRCH; if ( *d == current->domain ) { rcu_unlock_domain(*d); return -EPERM; } return 0; } int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d) { int rv; rv = rcu_lock_remote_domain_by_id(dom, d); if ( rv ) return rv; if ( (*d)->is_dying ) { rcu_unlock_domain(*d); return -EINVAL; } return 0; } int domain_kill(struct domain *d) { int rc = 0; struct vcpu *v; if ( d == current->domain ) return -EINVAL; /* Protected by domctl_lock. */ switch ( d->is_dying ) { case DOMDYING_alive: domain_pause(d); d->is_dying = DOMDYING_dying; spin_barrier(&d->domain_lock); evtchn_destroy(d); gnttab_release_mappings(d); tmem_destroy(d->tmem_client); domain_set_outstanding_pages(d, 0); d->tmem_client = NULL; /* fallthrough */ case DOMDYING_dying: rc = domain_relinquish_resources(d); if ( rc != 0 ) { BUG_ON(rc != -EAGAIN); break; } for_each_vcpu ( d, v ) unmap_vcpu_info(v); d->is_dying = DOMDYING_dead; /* Mem event cleanup has to go here because the rings * have to be put before we call put_domain. */ mem_event_cleanup(d); put_domain(d); send_global_virq(VIRQ_DOM_EXC); /* fallthrough */ case DOMDYING_dead: break; } return rc; } void __domain_crash(struct domain *d) { if ( d->is_shutting_down ) { /* Print nothing: the domain is already shutting down. 
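/*
 * Illustrative sketch: the two lookup styles defined above.
 * get_domain_by_id() takes a reference that must later be dropped with
 * put_domain(); the rcu_lock_domain_by_*() variants keep the domain alive
 * only for the span of the RCU read section and pair with
 * rcu_unlock_domain().
 */
static int example_query_domain(domid_t domid)
{
    struct domain *d = rcu_lock_domain_by_id(domid);

    if ( d == NULL )
        return -ESRCH;

    /* ... read d's fields; it cannot be freed before we unlock ... */

    rcu_unlock_domain(d);
    return 0;
}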
*/ } else if ( d == current->domain ) { printk("Domain %d (vcpu#%d) crashed on cpu#%d:\n", d->domain_id, current->vcpu_id, smp_processor_id()); show_execution_state(guest_cpu_user_regs()); } else { printk("Domain %d reported crashed by domain %d on cpu#%d:\n", d->domain_id, current->domain->domain_id, smp_processor_id()); } domain_shutdown(d, SHUTDOWN_crash); } void __domain_crash_synchronous(void) { __domain_crash(current->domain); vcpu_end_shutdown_deferral(current); for ( ; ; ) do_softirq(); } void domain_shutdown(struct domain *d, u8 reason) { struct vcpu *v; spin_lock(&d->shutdown_lock); if ( d->shutdown_code == -1 ) d->shutdown_code = reason; reason = d->shutdown_code; if ( d->domain_id == 0 ) dom0_shutdown(reason); if ( d->is_shutting_down ) { spin_unlock(&d->shutdown_lock); return; } d->is_shutting_down = 1; smp_mb(); /* set shutdown status /then/ check for per-cpu deferrals */ for_each_vcpu ( d, v ) { if ( reason == SHUTDOWN_crash ) v->defer_shutdown = 0; else if ( v->defer_shutdown ) continue; vcpu_pause_nosync(v); v->paused_for_shutdown = 1; } __domain_finalise_shutdown(d); spin_unlock(&d->shutdown_lock); } void domain_resume(struct domain *d) { struct vcpu *v; /* * Some code paths assume that shutdown status does not get reset under * their feet (e.g., some assertions make this assumption). */ domain_pause(d); spin_lock(&d->shutdown_lock); d->is_shutting_down = d->is_shut_down = 0; d->shutdown_code = -1; for_each_vcpu ( d, v ) { if ( v->paused_for_shutdown ) vcpu_unpause(v); v->paused_for_shutdown = 0; } spin_unlock(&d->shutdown_lock); domain_unpause(d); } int vcpu_start_shutdown_deferral(struct vcpu *v) { if ( v->defer_shutdown ) return 1; v->defer_shutdown = 1; smp_mb(); /* set deferral status /then/ check for shutdown */ if ( unlikely(v->domain->is_shutting_down) ) vcpu_check_shutdown(v); return v->defer_shutdown; } void vcpu_end_shutdown_deferral(struct vcpu *v) { v->defer_shutdown = 0; smp_mb(); /* clear deferral status /then/ check for shutdown */ if ( unlikely(v->domain->is_shutting_down) ) vcpu_check_shutdown(v); } #ifdef HAS_GDBSX void domain_pause_for_debugger(void) { struct domain *d = current->domain; struct vcpu *v; atomic_inc(&d->pause_count); if ( test_and_set_bool(d->is_paused_by_controller) ) domain_unpause(d); /* race-free atomic_dec(&d->pause_count) */ for_each_vcpu ( d, v ) vcpu_sleep_nosync(v); /* if gdbsx active, we just need to pause the domain */ if (current->arch.gdbsx_vcpu_event == 0) send_global_virq(VIRQ_DEBUGGER); } #endif /* Complete domain destroy after RCU readers are not holding old references. */ static void complete_domain_destroy(struct rcu_head *head) { struct domain *d = container_of(head, struct domain, rcu); struct vcpu *v; int i; for ( i = d->max_vcpus - 1; i >= 0; i-- ) { if ( (v = d->vcpu[i]) == NULL ) continue; tasklet_kill(&v->continue_hypercall_tasklet); vcpu_destroy(v); sched_destroy_vcpu(v); destroy_waitqueue_vcpu(v); } grant_table_destroy(d); arch_domain_destroy(d); watchdog_domain_destroy(d); rangeset_domain_destroy(d); sched_destroy_domain(d); cpupool_rm_domain(d); /* Free page used by xen oprofile buffer. 
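/*
 * Illustrative sketch, hypothetical critical section: a vcpu that must finish
 * a guest-visible update before it can be paused for shutdown brackets the
 * work with the deferral helpers above.
 */
static void example_guarded_update(struct vcpu *v)
{
    if ( !vcpu_start_shutdown_deferral(v) )
        return;  /* shutdown already in progress; skip the update */

    /* ... perform the update that must not race a shutdown pause ... */

    vcpu_end_shutdown_deferral(v);
}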
*/ #ifdef CONFIG_XENOPROF free_xenoprof_pages(d); #endif xfree(d->mem_event); xfree(d->pbuf); for ( i = d->max_vcpus - 1; i >= 0; i-- ) if ( (v = d->vcpu[i]) != NULL ) { free_cpumask_var(v->cpu_affinity); free_cpumask_var(v->cpu_affinity_tmp); free_cpumask_var(v->cpu_affinity_saved); free_cpumask_var(v->vcpu_dirty_cpumask); free_vcpu_struct(v); } if ( d->target != NULL ) put_domain(d->target); evtchn_destroy_final(d); radix_tree_destroy(&d->pirq_tree, free_pirq_struct); xsm_free_security_domain(d); free_cpumask_var(d->domain_dirty_cpumask); free_domain_struct(d); send_global_virq(VIRQ_DOM_EXC); } /* Release resources belonging to task @p. */ void domain_destroy(struct domain *d) { struct domain **pd; atomic_t old, new; BUG_ON(!d->is_dying); /* May be already destroyed, or get_domain() can race us. */ _atomic_set(old, 0); _atomic_set(new, DOMAIN_DESTROYED); old = atomic_compareandswap(old, new, &d->refcnt); if ( _atomic_read(old) != 0 ) return; /* Delete from task list and task hashtable. */ TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id); spin_lock(&domlist_update_lock); pd = &domain_list; while ( *pd != d ) pd = &(*pd)->next_in_list; rcu_assign_pointer(*pd, d->next_in_list); pd = &domain_hash[DOMAIN_HASH(d->domain_id)]; while ( *pd != d ) pd = &(*pd)->next_in_hashbucket; rcu_assign_pointer(*pd, d->next_in_hashbucket); spin_unlock(&domlist_update_lock); /* Schedule RCU asynchronous completion of domain destroy. */ call_rcu(&d->rcu, complete_domain_destroy); } void vcpu_pause(struct vcpu *v) { ASSERT(v != current); atomic_inc(&v->pause_count); vcpu_sleep_sync(v); } void vcpu_pause_nosync(struct vcpu *v) { atomic_inc(&v->pause_count); vcpu_sleep_nosync(v); } void vcpu_unpause(struct vcpu *v) { if ( atomic_dec_and_test(&v->pause_count) ) vcpu_wake(v); } void domain_pause(struct domain *d) { struct vcpu *v; ASSERT(d != current->domain); atomic_inc(&d->pause_count); for_each_vcpu( d, v ) vcpu_sleep_sync(v); } void domain_pause_nosync(struct domain *d) { struct vcpu *v; atomic_inc(&d->pause_count); for_each_vcpu( d, v ) vcpu_sleep_nosync(v); } void domain_unpause(struct domain *d) { struct vcpu *v; if ( atomic_dec_and_test(&d->pause_count) ) for_each_vcpu( d, v ) vcpu_wake(v); } void domain_pause_by_systemcontroller(struct domain *d) { domain_pause(d); if ( test_and_set_bool(d->is_paused_by_controller) ) domain_unpause(d); } void domain_unpause_by_systemcontroller(struct domain *d) { if ( test_and_clear_bool(d->is_paused_by_controller) ) domain_unpause(d); } int vcpu_reset(struct vcpu *v) { struct domain *d = v->domain; int rc; vcpu_pause(v); domain_lock(d); set_bit(_VPF_in_reset, &v->pause_flags); rc = arch_vcpu_reset(v); if ( rc ) goto out_unlock; set_bit(_VPF_down, &v->pause_flags); clear_bit(v->vcpu_id, d->poll_mask); v->poll_evtchn = 0; v->fpu_initialised = 0; v->fpu_dirtied = 0; v->is_initialised = 0; #ifdef VCPU_TRAP_LAST v->async_exception_mask = 0; memset(v->async_exception_state, 0, sizeof(v->async_exception_state)); #endif cpumask_clear(v->cpu_affinity_tmp); clear_bit(_VPF_blocked, &v->pause_flags); clear_bit(_VPF_in_reset, &v->pause_flags); out_unlock: domain_unlock(v->domain); vcpu_unpause(v); return rc; } /* * Map a guest page in and point the vcpu_info pointer at it. This * makes sure that the vcpu_info is always pointing at a valid piece * of memory, and it sets a pending event to make sure that a pending * event doesn't get missed. 
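 *
 * (A hedged, guest-side sketch of the registration this implements, using
 *  the public VCPUOP_register_vcpu_info interface and a Linux-style
 *  HYPERVISOR_vcpu_op() wrapper as assumptions:
 *
 *      struct vcpu_register_vcpu_info info = {
 *          .mfn    = virt_to_mfn(page),      [ frame holding the area ]
 *          .offset = off & ~PAGE_MASK,       [ byte offset within it  ]
 *      };
 *      rc = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
 *
 *  On success the old vcpu_info contents are copied across and, as noted
 *  above, every event channel selector bit is marked pending so no event
 *  is lost over the switch.)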
*/ int map_vcpu_info(struct vcpu *v, unsigned long gfn, unsigned offset) { struct domain *d = v->domain; void *mapping; vcpu_info_t *new_info; struct page_info *page; int i; if ( offset > (PAGE_SIZE - sizeof(vcpu_info_t)) ) return -EINVAL; if ( v->vcpu_info_mfn != INVALID_MFN ) return -EINVAL; /* Run this command on yourself or on other offline VCPUS. */ if ( (v != current) && !test_bit(_VPF_down, &v->pause_flags) ) return -EINVAL; page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC); if ( !page ) return -EINVAL; if ( !get_page_type(page, PGT_writable_page) ) { put_page(page); return -EINVAL; } mapping = __map_domain_page_global(page); if ( mapping == NULL ) { put_page_and_type(page); return -ENOMEM; } new_info = (vcpu_info_t *)(mapping + offset); if ( v->vcpu_info == &dummy_vcpu_info ) { memset(new_info, 0, sizeof(*new_info)); #ifdef XEN_HAVE_PV_UPCALL_MASK __vcpu_info(v, new_info, evtchn_upcall_mask) = 1; #endif } else { memcpy(new_info, v->vcpu_info, sizeof(*new_info)); } v->vcpu_info = new_info; v->vcpu_info_mfn = page_to_mfn(page); /* Set new vcpu_info pointer /before/ setting pending flags. */ smp_wmb(); /* * Mark everything as being pending just to make sure nothing gets * lost. The domain will get a spurious event, but it can cope. */ vcpu_info(v, evtchn_upcall_pending) = 1; for ( i = 0; i < BITS_PER_EVTCHN_WORD(d); i++ ) set_bit(i, &vcpu_info(v, evtchn_pending_sel)); return 0; } /* * Unmap the vcpu info page if the guest decided to place it somewhere * else. This is only used from arch_domain_destroy, so there's no * need to do anything clever. */ void unmap_vcpu_info(struct vcpu *v) { unsigned long mfn; if ( v->vcpu_info_mfn == INVALID_MFN ) return; mfn = v->vcpu_info_mfn; unmap_domain_page_global((void *) ((unsigned long)v->vcpu_info & PAGE_MASK)); v->vcpu_info = &dummy_vcpu_info; v->vcpu_info_mfn = INVALID_MFN; put_page_and_type(mfn_to_page(mfn)); } long do_vcpu_op(int cmd, int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg) { struct domain *d = current->domain; struct vcpu *v; struct vcpu_guest_context *ctxt; long rc = 0; if ( (vcpuid < 0) || (vcpuid >= MAX_VIRT_CPUS) ) return -EINVAL; if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) return -ENOENT; switch ( cmd ) { case VCPUOP_initialise: if ( v->vcpu_info == &dummy_vcpu_info ) return -EINVAL; if ( (ctxt = alloc_vcpu_guest_context()) == NULL ) return -ENOMEM; if ( copy_from_guest(ctxt, arg, 1) ) { free_vcpu_guest_context(ctxt); return -EFAULT; } domain_lock(d); rc = v->is_initialised ? 
-EEXIST : arch_set_info_guest(v, ctxt); domain_unlock(d); free_vcpu_guest_context(ctxt); if ( rc == -EAGAIN ) rc = hypercall_create_continuation(__HYPERVISOR_vcpu_op, "iih", cmd, vcpuid, arg); break; case VCPUOP_up: { bool_t wake = 0; domain_lock(d); if ( !v->is_initialised ) rc = -EINVAL; else wake = test_and_clear_bit(_VPF_down, &v->pause_flags); domain_unlock(d); if ( wake ) vcpu_wake(v); break; } case VCPUOP_down: if ( !test_and_set_bit(_VPF_down, &v->pause_flags) ) vcpu_sleep_nosync(v); break; case VCPUOP_is_up: rc = !test_bit(_VPF_down, &v->pause_flags); break; case VCPUOP_get_runstate_info: { struct vcpu_runstate_info runstate; vcpu_runstate_get(v, &runstate); if ( copy_to_guest(arg, &runstate, 1) ) rc = -EFAULT; break; } case VCPUOP_set_periodic_timer: { struct vcpu_set_periodic_timer set; if ( copy_from_guest(&set, arg, 1) ) return -EFAULT; if ( set.period_ns < MILLISECS(1) ) return -EINVAL; if ( set.period_ns > STIME_DELTA_MAX ) return -EINVAL; v->periodic_period = set.period_ns; vcpu_force_reschedule(v); break; } case VCPUOP_stop_periodic_timer: v->periodic_period = 0; vcpu_force_reschedule(v); break; case VCPUOP_set_singleshot_timer: { struct vcpu_set_singleshot_timer set; if ( v != current ) return -EINVAL; if ( copy_from_guest(&set, arg, 1) ) return -EFAULT; if ( (set.flags & VCPU_SSHOTTMR_future) && (set.timeout_abs_ns < NOW()) ) return -ETIME; migrate_timer(&v->singleshot_timer, smp_processor_id()); set_timer(&v->singleshot_timer, set.timeout_abs_ns); break; } case VCPUOP_stop_singleshot_timer: if ( v != current ) return -EINVAL; stop_timer(&v->singleshot_timer); break; case VCPUOP_register_vcpu_info: { struct domain *d = v->domain; struct vcpu_register_vcpu_info info; rc = -EFAULT; if ( copy_from_guest(&info, arg, 1) ) break; domain_lock(d); rc = map_vcpu_info(v, info.mfn, info.offset); domain_unlock(d); break; } case VCPUOP_register_runstate_memory_area: { struct vcpu_register_runstate_memory_area area; struct vcpu_runstate_info runstate; rc = -EFAULT; if ( copy_from_guest(&area, arg, 1) ) break; if ( !guest_handle_okay(area.addr.h, 1) ) break; rc = 0; runstate_guest(v) = area.addr.h; if ( v == current ) { __copy_to_guest(runstate_guest(v), &v->runstate, 1); } else { vcpu_runstate_get(v, &runstate); __copy_to_guest(runstate_guest(v), &runstate, 1); } break; } #ifdef VCPU_TRAP_NMI case VCPUOP_send_nmi: if ( !guest_handle_is_null(arg) ) return -EINVAL; if ( !test_and_set_bool(v->nmi_pending) ) vcpu_kick(v); break; #endif default: rc = arch_do_vcpu_op(cmd, v, arg); break; } return rc; } long vm_assist(struct domain *p, unsigned int cmd, unsigned int type) { if ( type > MAX_VMASST_TYPE ) return -EINVAL; switch ( cmd ) { case VMASST_CMD_enable: set_bit(type, &p->vm_assist); return 0; case VMASST_CMD_disable: clear_bit(type, &p->vm_assist); return 0; } return -ENOSYS; } struct pirq *pirq_get_info(struct domain *d, int pirq) { struct pirq *info = pirq_info(d, pirq); if ( !info && (info = alloc_pirq_struct(d)) != NULL ) { info->pirq = pirq; if ( radix_tree_insert(&d->pirq_tree, pirq, info) ) { free_pirq_struct(info); info = NULL; } } return info; } static void _free_pirq_struct(struct rcu_head *head) { xfree(container_of(head, struct pirq, rcu_head)); } void free_pirq_struct(void *ptr) { struct pirq *pirq = ptr; call_rcu(&pirq->rcu_head, _free_pirq_struct); } struct migrate_info { long (*func)(void *data); void *data; struct vcpu *vcpu; unsigned int cpu; unsigned int nest; }; static DEFINE_PER_CPU(struct migrate_info *, continue_info); static void 
continue_hypercall_tasklet_handler(unsigned long _info) { struct migrate_info *info = (struct migrate_info *)_info; struct vcpu *v = info->vcpu; /* Wait for vcpu to sleep so that we can access its register state. */ vcpu_sleep_sync(v); this_cpu(continue_info) = info; return_reg(v) = (info->cpu == smp_processor_id()) ? info->func(info->data) : -EINVAL; this_cpu(continue_info) = NULL; if ( info->nest-- == 0 ) { xfree(info); vcpu_unpause(v); put_domain(v->domain); } } int continue_hypercall_on_cpu( unsigned int cpu, long (*func)(void *data), void *data) { struct migrate_info *info; if ( (cpu >= nr_cpu_ids) || !cpu_online(cpu) ) return -EINVAL; info = this_cpu(continue_info); if ( info == NULL ) { struct vcpu *curr = current; info = xmalloc(struct migrate_info); if ( info == NULL ) return -ENOMEM; info->vcpu = curr; info->nest = 0; tasklet_kill( &curr->continue_hypercall_tasklet); tasklet_init( &curr->continue_hypercall_tasklet, continue_hypercall_tasklet_handler, (unsigned long)info); get_knownalive_domain(curr->domain); vcpu_pause_nosync(curr); } else { BUG_ON(info->nest != 0); info->nest++; } info->func = func; info->data = data; info->cpu = cpu; tasklet_schedule_on_cpu(&info->vcpu->continue_hypercall_tasklet, cpu); /* Dummy return value will be overwritten by tasklet. */ return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/schedule.c0000664000175000017500000011761612307313555014501 0ustar smbsmb/**************************************************************************** * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge * (C) 2002-2003 University of Cambridge * (C) 2004 - Mark Williamson - Intel Research Cambridge **************************************************************************** * * File: common/schedule.c * Author: Rolf Neugebauer & Keir Fraser * Updated for generic API by Mark Williamson * * Description: Generic CPU scheduling code * implements support functionality for the Xen scheduler API. * */ #ifndef COMPAT #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* opt_sched: scheduler - default to credit */ static char __initdata opt_sched[10] = "credit"; string_param("sched", opt_sched); /* if sched_smt_power_savings is set, * scheduler will give preferrence to partially idle package compared to * the full idle package, when picking pCPU to schedule vCPU. */ bool_t sched_smt_power_savings = 0; boolean_param("sched_smt_power_savings", sched_smt_power_savings); /* Default scheduling rate limit: 1ms * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined * */ int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; integer_param("sched_ratelimit_us", sched_ratelimit_us); /* Various timer handlers. */ static void s_timer_fn(void *unused); static void vcpu_periodic_timer_fn(void *data); static void vcpu_singleshot_timer_fn(void *data); static void poll_timer_fn(void *data); /* This is global for now so that private implementations can reach it */ DEFINE_PER_CPU(struct schedule_data, schedule_data); DEFINE_PER_CPU(struct scheduler *, scheduler); static const struct scheduler *schedulers[] = { &sched_sedf_def, &sched_credit_def, &sched_credit2_def, &sched_arinc653_def, }; static struct scheduler __read_mostly ops; #define SCHED_OP(opsptr, fn, ...) \ (( (opsptr)->fn != NULL ) ? 
(opsptr)->fn(opsptr, ##__VA_ARGS__ ) \ : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 ) #define DOM2OP(_d) (((_d)->cpupool == NULL) ? &ops : ((_d)->cpupool->sched)) #define VCPU2OP(_v) (DOM2OP((_v)->domain)) #define VCPU2ONLINE(_v) cpupool_online_cpumask((_v)->domain->cpupool) static inline void trace_runstate_change(struct vcpu *v, int new_state) { struct { uint32_t vcpu:16, domain:16; } d; uint32_t event; if ( likely(!tb_init_done) ) return; d.vcpu = v->vcpu_id; d.domain = v->domain->domain_id; event = TRC_SCHED_RUNSTATE_CHANGE; event |= ( v->runstate.state & 0x3 ) << 8; event |= ( new_state & 0x3 ) << 4; __trace_var(event, 1/*tsc*/, sizeof(d), &d); } static inline void trace_continue_running(struct vcpu *v) { struct { uint32_t vcpu:16, domain:16; } d; if ( likely(!tb_init_done) ) return; d.vcpu = v->vcpu_id; d.domain = v->domain->domain_id; __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d); } static inline void vcpu_urgent_count_update(struct vcpu *v) { if ( is_idle_vcpu(v) ) return; if ( unlikely(v->is_urgent) ) { if ( !test_bit(_VPF_blocked, &v->pause_flags) || !test_bit(v->vcpu_id, v->domain->poll_mask) ) { v->is_urgent = 0; atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count); } } else { if ( unlikely(test_bit(_VPF_blocked, &v->pause_flags) && test_bit(v->vcpu_id, v->domain->poll_mask)) ) { v->is_urgent = 1; atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count); } } } static inline void vcpu_runstate_change( struct vcpu *v, int new_state, s_time_t new_entry_time) { s_time_t delta; ASSERT(v->runstate.state != new_state); ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock)); vcpu_urgent_count_update(v); trace_runstate_change(v, new_state); delta = new_entry_time - v->runstate.state_entry_time; if ( delta > 0 ) { v->runstate.time[v->runstate.state] += delta; v->runstate.state_entry_time = new_entry_time; } v->runstate.state = new_state; } void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate) { spinlock_t *lock = likely(v == current) ? NULL : vcpu_schedule_lock_irq(v); s_time_t delta; memcpy(runstate, &v->runstate, sizeof(*runstate)); delta = NOW() - runstate->state_entry_time; if ( delta > 0 ) runstate->time[runstate->state] += delta; if ( unlikely(lock != NULL) ) vcpu_schedule_unlock_irq(lock, v); } uint64_t get_cpu_idle_time(unsigned int cpu) { struct vcpu_runstate_info state = { 0 }; struct vcpu *v = idle_vcpu[cpu]; if ( cpu_online(cpu) && v ) vcpu_runstate_get(v, &state); return state.time[RUNSTATE_running]; } int sched_init_vcpu(struct vcpu *v, unsigned int processor) { struct domain *d = v->domain; /* * Initialize processor and affinity settings. The idler, and potentially * domain-0 VCPUs, are pinned onto their respective physical CPUs. */ v->processor = processor; if ( is_idle_domain(d) || d->is_pinned ) cpumask_copy(v->cpu_affinity, cpumask_of(processor)); else cpumask_setall(v->cpu_affinity); /* Initialise the per-vcpu timers. */ init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, v->processor); init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, v->processor); init_timer(&v->poll_timer, poll_timer_fn, v, v->processor); /* Idle VCPUs are scheduled immediately. 
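 *
 * (Aside, a hedged paraphrase of the dispatch used just below and
 *  throughout this file: SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv)
 *  expands to roughly
 *
 *      sched = d->cpupool ? d->cpupool->sched : &ops;   [ DOM2OP(d) ]
 *      sched->alloc_vdata ? sched->alloc_vdata(sched, v, d->sched_priv)
 *                         : NULL;
 *
 *  i.e. every hook call is routed to the scheduler of the domain's cpupool,
 *  and a missing hook degrades to a zero result.)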
*/ if ( is_idle_domain(d) ) { per_cpu(schedule_data, v->processor).curr = v; v->is_running = 1; } TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id); v->sched_priv = SCHED_OP(DOM2OP(d), alloc_vdata, v, d->sched_priv); if ( v->sched_priv == NULL ) return 1; SCHED_OP(DOM2OP(d), insert_vcpu, v); return 0; } int sched_move_domain(struct domain *d, struct cpupool *c) { struct vcpu *v; unsigned int new_p; void **vcpu_priv; void *domdata; void *vcpudata; struct scheduler *old_ops; void *old_domdata; domdata = SCHED_OP(c->sched, alloc_domdata, d); if ( domdata == NULL ) return -ENOMEM; vcpu_priv = xzalloc_array(void *, d->max_vcpus); if ( vcpu_priv == NULL ) { SCHED_OP(c->sched, free_domdata, domdata); return -ENOMEM; } for_each_vcpu ( d, v ) { vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata); if ( vcpu_priv[v->vcpu_id] == NULL ) { for_each_vcpu ( d, v ) { if ( vcpu_priv[v->vcpu_id] != NULL ) xfree(vcpu_priv[v->vcpu_id]); } xfree(vcpu_priv); SCHED_OP(c->sched, free_domdata, domdata); return -ENOMEM; } } domain_pause(d); old_ops = DOM2OP(d); old_domdata = d->sched_priv; for_each_vcpu ( d, v ) { SCHED_OP(old_ops, remove_vcpu, v); } d->cpupool = c; d->sched_priv = domdata; new_p = cpumask_first(c->cpu_valid); for_each_vcpu ( d, v ) { spinlock_t *lock; vcpudata = v->sched_priv; migrate_timer(&v->periodic_timer, new_p); migrate_timer(&v->singleshot_timer, new_p); migrate_timer(&v->poll_timer, new_p); cpumask_setall(v->cpu_affinity); lock = vcpu_schedule_lock_irq(v); v->processor = new_p; /* * With v->processor modified we must not * - make any further changes assuming we hold the scheduler lock, * - use vcpu_schedule_unlock_irq(). */ spin_unlock_irq(lock); v->sched_priv = vcpu_priv[v->vcpu_id]; evtchn_move_pirqs(v); new_p = cpumask_cycle(new_p, c->cpu_valid); SCHED_OP(c->sched, insert_vcpu, v); SCHED_OP(old_ops, free_vdata, vcpudata); } domain_update_node_affinity(d); domain_unpause(d); SCHED_OP(old_ops, free_domdata, old_domdata); xfree(vcpu_priv); return 0; } void sched_destroy_vcpu(struct vcpu *v) { kill_timer(&v->periodic_timer); kill_timer(&v->singleshot_timer); kill_timer(&v->poll_timer); if ( test_and_clear_bool(v->is_urgent) ) atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count); SCHED_OP(VCPU2OP(v), remove_vcpu, v); SCHED_OP(VCPU2OP(v), free_vdata, v->sched_priv); } int sched_init_domain(struct domain *d) { SCHED_STAT_CRANK(dom_init); return SCHED_OP(DOM2OP(d), init_domain, d); } void sched_destroy_domain(struct domain *d) { SCHED_STAT_CRANK(dom_destroy); SCHED_OP(DOM2OP(d), destroy_domain, d); } void vcpu_sleep_nosync(struct vcpu *v) { unsigned long flags; spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags); if ( likely(!vcpu_runnable(v)) ) { if ( v->runstate.state == RUNSTATE_runnable ) vcpu_runstate_change(v, RUNSTATE_offline, NOW()); SCHED_OP(VCPU2OP(v), sleep, v); } vcpu_schedule_unlock_irqrestore(lock, flags, v); TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); } void vcpu_sleep_sync(struct vcpu *v) { vcpu_sleep_nosync(v); while ( !vcpu_runnable(v) && v->is_running ) cpu_relax(); sync_vcpu_execstate(v); } void vcpu_wake(struct vcpu *v) { unsigned long flags; spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags); if ( likely(vcpu_runnable(v)) ) { if ( v->runstate.state >= RUNSTATE_blocked ) vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); SCHED_OP(VCPU2OP(v), wake, v); } else if ( !test_bit(_VPF_blocked, &v->pause_flags) ) { if ( v->runstate.state == RUNSTATE_blocked ) vcpu_runstate_change(v, RUNSTATE_offline, NOW()); } 
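    /*
     * (Aside, a summary of the runstate transitions driven by the helpers
     *  in this file, all made under the per-vCPU schedule lock held here:
     *
     *      blocked/offline --vcpu_wake-->  runnable   when vcpu_runnable()
     *      blocked         --vcpu_wake-->  offline    when still paused
     *      runnable        --vcpu_sleep--> offline
     *      runnable        --schedule-->   running    and back again
     *
     *  RUNSTATE_running itself is entered and left only from schedule().)
     */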
vcpu_schedule_unlock_irqrestore(lock, flags, v); TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); } void vcpu_unblock(struct vcpu *v) { if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) ) return; /* Polling period ends when a VCPU is unblocked. */ if ( unlikely(v->poll_evtchn != 0) ) { v->poll_evtchn = 0; /* * We *must* re-clear _VPF_blocked to avoid racing other wakeups of * this VCPU (and it then going back to sleep on poll_mask). * Test-and-clear is idiomatic and ensures clear_bit not reordered. */ if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) clear_bit(_VPF_blocked, &v->pause_flags); } vcpu_wake(v); } static void vcpu_migrate(struct vcpu *v) { unsigned long flags; unsigned int old_cpu, new_cpu; spinlock_t *old_lock, *new_lock; bool_t pick_called = 0; old_cpu = new_cpu = v->processor; for ( ; ; ) { /* * If per-cpu locks for old and new cpu are different, take the one * with the lower lock address first. This avoids dead- or live-locks * when this code is running on both cpus at the same time. * We need another iteration if the pre-calculated lock addresses * are not correct any longer after evaluating old and new cpu holding * the locks. */ old_lock = per_cpu(schedule_data, old_cpu).schedule_lock; new_lock = per_cpu(schedule_data, new_cpu).schedule_lock; if ( old_lock == new_lock ) { spin_lock_irqsave(old_lock, flags); } else if ( old_lock < new_lock ) { spin_lock_irqsave(old_lock, flags); spin_lock(new_lock); } else { spin_lock_irqsave(new_lock, flags); spin_lock(old_lock); } old_cpu = v->processor; if ( old_lock == per_cpu(schedule_data, old_cpu).schedule_lock ) { /* * If we selected a CPU on the previosu iteration, check if it * remains suitable for running this vCPU. */ if ( pick_called && (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) && cpumask_test_cpu(new_cpu, v->cpu_affinity) && cpumask_test_cpu(new_cpu, v->domain->cpupool->cpu_valid) ) break; /* Select a new CPU. */ new_cpu = SCHED_OP(VCPU2OP(v), pick_cpu, v); if ( (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) && cpumask_test_cpu(new_cpu, v->domain->cpupool->cpu_valid) ) break; pick_called = 1; } else { /* * We do not hold the scheduler lock appropriate for this vCPU. * Thus we cannot select a new CPU on this iteration. Try again. */ pick_called = 0; } if ( old_lock != new_lock ) spin_unlock(new_lock); spin_unlock_irqrestore(old_lock, flags); } /* * NB. Check of v->running happens /after/ setting migration flag * because they both happen in (different) spinlock regions, and those * regions are strictly serialised. */ if ( v->is_running || !test_and_clear_bit(_VPF_migrating, &v->pause_flags) ) { if ( old_lock != new_lock ) spin_unlock(new_lock); spin_unlock_irqrestore(old_lock, flags); return; } /* * Transfer urgency status to new CPU before switching CPUs, as once * the switch occurs, v->is_urgent is no longer protected by the per-CPU * scheduler lock we are holding. */ if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) ) { atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count); atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count); } /* * Switch to new CPU, then unlock new and old CPU. This is safe because * the lock pointer cant' change while the current lock is held. */ if ( VCPU2OP(v)->migrate ) SCHED_OP(VCPU2OP(v), migrate, v, new_cpu); else v->processor = new_cpu; if ( old_lock != new_lock ) spin_unlock(new_lock); spin_unlock_irqrestore(old_lock, flags); if ( old_cpu != new_cpu ) evtchn_move_pirqs(v); /* Wake on new CPU. 
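 *
 * (Aside, restating the protocol visible in this file: callers of
 *  vcpu_migrate() follow the pattern
 *
 *      set_bit(_VPF_migrating, &v->pause_flags);
 *      vcpu_sleep_nosync(v);      [ force the vCPU off its pCPU          ]
 *      vcpu_migrate(v);           [ pick_cpu + move under both locks,    ]
 *                                 [ finishing with the wake just below   ]
 *
 *  and the address-ordered double locking in the loop above is what lets
 *  two CPUs run this concurrently without deadlocking on each other.)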
*/ vcpu_wake(v); } /* * Force a VCPU through a deschedule/reschedule path. * For example, using this when setting the periodic timer period means that * most periodic-timer state need only be touched from within the scheduler * which can thus be done without need for synchronisation. */ void vcpu_force_reschedule(struct vcpu *v) { spinlock_t *lock = vcpu_schedule_lock_irq(v); if ( v->is_running ) set_bit(_VPF_migrating, &v->pause_flags); vcpu_schedule_unlock_irq(lock, v); if ( test_bit(_VPF_migrating, &v->pause_flags) ) { vcpu_sleep_nosync(v); vcpu_migrate(v); } } void restore_vcpu_affinity(struct domain *d) { struct vcpu *v; for_each_vcpu ( d, v ) { spinlock_t *lock = vcpu_schedule_lock_irq(v); if ( v->affinity_broken ) { printk(XENLOG_DEBUG "Restoring affinity for d%dv%d\n", d->domain_id, v->vcpu_id); cpumask_copy(v->cpu_affinity, v->cpu_affinity_saved); v->affinity_broken = 0; } if ( v->processor == smp_processor_id() ) { set_bit(_VPF_migrating, &v->pause_flags); vcpu_schedule_unlock_irq(lock, v); vcpu_sleep_nosync(v); vcpu_migrate(v); } else { vcpu_schedule_unlock_irq(lock, v); } } domain_update_node_affinity(d); } /* * This function is used by cpu_hotplug code from stop_machine context * and from cpupools to switch schedulers on a cpu. */ int cpu_disable_scheduler(unsigned int cpu) { struct domain *d; struct vcpu *v; struct cpupool *c; cpumask_t online_affinity; int ret = 0; c = per_cpu(cpupool, cpu); if ( c == NULL ) return ret; for_each_domain_in_cpupool ( d, c ) { for_each_vcpu ( d, v ) { unsigned long flags; spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags); cpumask_and(&online_affinity, v->cpu_affinity, c->cpu_valid); if ( cpumask_empty(&online_affinity) && cpumask_test_cpu(cpu, v->cpu_affinity) ) { printk(XENLOG_DEBUG "Breaking affinity for d%dv%d\n", d->domain_id, v->vcpu_id); if (system_state == SYS_STATE_suspend) { cpumask_copy(v->cpu_affinity_saved, v->cpu_affinity); v->affinity_broken = 1; } cpumask_setall(v->cpu_affinity); } if ( v->processor == cpu ) { set_bit(_VPF_migrating, &v->pause_flags); vcpu_schedule_unlock_irqrestore(lock, flags, v); vcpu_sleep_nosync(v); vcpu_migrate(v); } else vcpu_schedule_unlock_irqrestore(lock, flags, v); /* * A vcpu active in the hypervisor will not be migratable. * The caller should try again after releasing and reaquiring * all locks. */ if ( v->processor == cpu ) ret = -EAGAIN; } domain_update_node_affinity(d); } return ret; } void sched_set_node_affinity(struct domain *d, nodemask_t *mask) { SCHED_OP(DOM2OP(d), set_node_affinity, d, mask); } int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity) { cpumask_t online_affinity; cpumask_t *online; spinlock_t *lock; if ( v->domain->is_pinned ) return -EINVAL; online = VCPU2ONLINE(v); cpumask_and(&online_affinity, affinity, online); if ( cpumask_empty(&online_affinity) ) return -EINVAL; lock = vcpu_schedule_lock_irq(v); cpumask_copy(v->cpu_affinity, affinity); /* Always ask the scheduler to re-evaluate placement * when changing the affinity */ set_bit(_VPF_migrating, &v->pause_flags); vcpu_schedule_unlock_irq(lock, v); domain_update_node_affinity(v->domain); if ( test_bit(_VPF_migrating, &v->pause_flags) ) { vcpu_sleep_nosync(v); vcpu_migrate(v); } return 0; } /* Block the currently-executing domain until a pertinent event occurs. */ void vcpu_block(void) { struct vcpu *v = current; set_bit(_VPF_blocked, &v->pause_flags); /* Check for events /after/ blocking: avoids wakeup waiting race. 
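 *
 * (Aside: the ordering matters because the waker side in vcpu_unblock()
 *  and vcpu_wake() performs the mirror image,
 *
 *      blocker                         waker
 *      set _VPF_blocked                make an event pending
 *      check pending events            test-and-clear _VPF_blocked, wake
 *
 *  so whichever side runs second is guaranteed to observe the other's
 *  update, and an event arriving in the window cannot leave the vCPU
 *  blocked forever.)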
*/ if ( local_events_need_delivery() ) { clear_bit(_VPF_blocked, &v->pause_flags); } else { TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id); raise_softirq(SCHEDULE_SOFTIRQ); } } static void vcpu_block_enable_events(void) { local_event_delivery_enable(); vcpu_block(); } static long do_poll(struct sched_poll *sched_poll) { struct vcpu *v = current; struct domain *d = v->domain; evtchn_port_t port; long rc; unsigned int i; /* Fairly arbitrary limit. */ if ( sched_poll->nr_ports > 128 ) return -EINVAL; if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) ) return -EFAULT; set_bit(_VPF_blocked, &v->pause_flags); v->poll_evtchn = -1; set_bit(v->vcpu_id, d->poll_mask); #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */ /* Check for events /after/ setting flags: avoids wakeup waiting race. */ smp_mb(); /* * Someone may have seen we are blocked but not that we are polling, or * vice versa. We are certainly being woken, so clean up and bail. Beyond * this point others can be guaranteed to clean up for us if they wake us. */ rc = 0; if ( (v->poll_evtchn == 0) || !test_bit(_VPF_blocked, &v->pause_flags) || !test_bit(v->vcpu_id, d->poll_mask) ) goto out; #endif rc = 0; if ( local_events_need_delivery() ) goto out; for ( i = 0; i < sched_poll->nr_ports; i++ ) { rc = -EFAULT; if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) ) goto out; rc = -EINVAL; if ( port >= d->max_evtchns ) goto out; rc = 0; if ( evtchn_port_is_pending(d, evtchn_from_port(d, port)) ) goto out; } if ( sched_poll->nr_ports == 1 ) v->poll_evtchn = port; if ( sched_poll->timeout != 0 ) set_timer(&v->poll_timer, sched_poll->timeout); TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id); raise_softirq(SCHEDULE_SOFTIRQ); return 0; out: v->poll_evtchn = 0; clear_bit(v->vcpu_id, d->poll_mask); clear_bit(_VPF_blocked, &v->pause_flags); return rc; } /* Voluntarily yield the processor for this allocation. */ static long do_yield(void) { struct vcpu * v=current; spinlock_t *lock = vcpu_schedule_lock_irq(v); SCHED_OP(VCPU2OP(v), yield, v); vcpu_schedule_unlock_irq(lock, v); TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id); raise_softirq(SCHEDULE_SOFTIRQ); return 0; } static void domain_watchdog_timeout(void *data) { struct domain *d = data; if ( d->is_shutting_down || d->is_dying ) return; printk("Watchdog timer fired for domain %u\n", d->domain_id); domain_shutdown(d, SHUTDOWN_watchdog); } static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout) { if ( id > NR_DOMAIN_WATCHDOG_TIMERS ) return -EINVAL; spin_lock(&d->watchdog_lock); if ( id == 0 ) { for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ ) { if ( test_and_set_bit(id, &d->watchdog_inuse_map) ) continue; set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); break; } spin_unlock(&d->watchdog_lock); return id == NR_DOMAIN_WATCHDOG_TIMERS ? 
-ENOSPC : id + 1; } id -= 1; if ( !test_bit(id, &d->watchdog_inuse_map) ) { spin_unlock(&d->watchdog_lock); return -EINVAL; } if ( timeout == 0 ) { stop_timer(&d->watchdog_timer[id]); clear_bit(id, &d->watchdog_inuse_map); } else { set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout)); } spin_unlock(&d->watchdog_lock); return 0; } void watchdog_domain_init(struct domain *d) { unsigned int i; spin_lock_init(&d->watchdog_lock); d->watchdog_inuse_map = 0; for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0); } void watchdog_domain_destroy(struct domain *d) { unsigned int i; for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) kill_timer(&d->watchdog_timer[i]); } long do_sched_op_compat(int cmd, unsigned long arg) { long ret = 0; switch ( cmd ) { case SCHEDOP_yield: { ret = do_yield(); break; } case SCHEDOP_block: { vcpu_block_enable_events(); break; } case SCHEDOP_shutdown: { TRACE_3D(TRC_SCHED_SHUTDOWN, current->domain->domain_id, current->vcpu_id, arg); domain_shutdown(current->domain, (u8)arg); break; } default: ret = -ENOSYS; } return ret; } typedef long ret_t; #endif /* !COMPAT */ ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { ret_t ret = 0; switch ( cmd ) { case SCHEDOP_yield: { ret = do_yield(); break; } case SCHEDOP_block: { vcpu_block_enable_events(); break; } case SCHEDOP_shutdown: { struct sched_shutdown sched_shutdown; ret = -EFAULT; if ( copy_from_guest(&sched_shutdown, arg, 1) ) break; ret = 0; TRACE_3D(TRC_SCHED_SHUTDOWN, current->domain->domain_id, current->vcpu_id, sched_shutdown.reason); domain_shutdown(current->domain, (u8)sched_shutdown.reason); break; } case SCHEDOP_shutdown_code: { struct sched_shutdown sched_shutdown; struct domain *d = current->domain; ret = -EFAULT; if ( copy_from_guest(&sched_shutdown, arg, 1) ) break; TRACE_3D(TRC_SCHED_SHUTDOWN_CODE, d->domain_id, current->vcpu_id, sched_shutdown.reason); spin_lock(&d->shutdown_lock); if ( d->shutdown_code == -1 ) d->shutdown_code = (u8)sched_shutdown.reason; spin_unlock(&d->shutdown_lock); ret = 0; break; } case SCHEDOP_poll: { struct sched_poll sched_poll; ret = -EFAULT; if ( copy_from_guest(&sched_poll, arg, 1) ) break; ret = do_poll(&sched_poll); break; } case SCHEDOP_remote_shutdown: { struct domain *d; struct sched_remote_shutdown sched_remote_shutdown; ret = -EFAULT; if ( copy_from_guest(&sched_remote_shutdown, arg, 1) ) break; ret = -ESRCH; d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id); if ( d == NULL ) break; ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d); if ( ret ) { rcu_unlock_domain(d); return ret; } domain_shutdown(d, (u8)sched_remote_shutdown.reason); rcu_unlock_domain(d); ret = 0; break; } case SCHEDOP_watchdog: { struct sched_watchdog sched_watchdog; ret = -EFAULT; if ( copy_from_guest(&sched_watchdog, arg, 1) ) break; ret = domain_watchdog( current->domain, sched_watchdog.id, sched_watchdog.timeout); break; } default: ret = -ENOSYS; } return ret; } #ifndef COMPAT /* Per-vcpu oneshot-timer hypercall. */ long do_set_timer_op(s_time_t timeout) { struct vcpu *v = current; s_time_t offset = timeout - NOW(); if ( timeout == 0 ) { stop_timer(&v->singleshot_timer); } else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */ unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) ) { /* * Linux workaround: occasionally we will see timeouts a long way in * the future due to wrapping in Linux's jiffy time handling. 
We check * for timeouts wrapped negative, and for positive timeouts more than * about 13 days in the future (2^50ns). The correct fix is to trigger * an interrupt immediately (since Linux in fact has pending work to * do in this situation). However, older guests also set a long timeout * when they have *no* pending timers at all: setting an immediate * timeout in this case can burn a lot of CPU. We therefore go for a * reasonable middleground of triggering a timer event in 100ms. */ gdprintk(XENLOG_INFO, "Warning: huge timeout set by vcpu %d: %"PRIx64"\n", v->vcpu_id, (uint64_t)timeout); set_timer(&v->singleshot_timer, NOW() + MILLISECS(100)); } else { migrate_timer(&v->singleshot_timer, smp_processor_id()); set_timer(&v->singleshot_timer, timeout); } return 0; } /* sched_id - fetch ID of current scheduler */ int sched_id(void) { return ops.sched_id; } /* Adjust scheduling parameter for a given domain. */ long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op) { long ret; ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd); if ( ret ) return ret; if ( (op->sched_id != DOM2OP(d)->sched_id) || ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) && (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) ) return -EINVAL; /* NB: the pluggable scheduler code needs to take care * of locking by itself. */ if ( (ret = SCHED_OP(DOM2OP(d), adjust, d, op)) == 0 ) TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id); return ret; } long sched_adjust_global(struct xen_sysctl_scheduler_op *op) { struct cpupool *pool; int rc; rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd); if ( rc ) return rc; if ( (op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) && (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) ) return -EINVAL; pool = cpupool_get_by_id(op->cpupool_id); if ( pool == NULL ) return -ESRCH; rc = ((op->sched_id == pool->sched->sched_id) ? SCHED_OP(pool->sched, adjust_global, op) : -EINVAL); cpupool_put(pool); return rc; } static void vcpu_periodic_timer_work(struct vcpu *v) { s_time_t now = NOW(); s_time_t periodic_next_event; if ( v->periodic_period == 0 ) return; periodic_next_event = v->periodic_last_event + v->periodic_period; if ( now >= periodic_next_event ) { send_timer_event(v); v->periodic_last_event = now; periodic_next_event = now + v->periodic_period; } migrate_timer(&v->periodic_timer, smp_processor_id()); set_timer(&v->periodic_timer, periodic_next_event); } /* * The main function * - deschedule the current domain (scheduler independent). * - pick a new domain (scheduler dependent). */ static void schedule(void) { struct vcpu *prev = current, *next = NULL; s_time_t now = NOW(); struct scheduler *sched; unsigned long *tasklet_work = &this_cpu(tasklet_work_to_do); bool_t tasklet_work_scheduled = 0; struct schedule_data *sd; spinlock_t *lock; struct task_slice next_slice; int cpu = smp_processor_id(); ASSERT_NOT_IN_ATOMIC(); SCHED_STAT_CRANK(sched_run); sd = &this_cpu(schedule_data); /* Update tasklet scheduling status. */ switch ( *tasklet_work ) { case TASKLET_enqueued: set_bit(_TASKLET_scheduled, tasklet_work); case TASKLET_enqueued|TASKLET_scheduled: tasklet_work_scheduled = 1; break; case TASKLET_scheduled: clear_bit(_TASKLET_scheduled, tasklet_work); case 0: /*tasklet_work_scheduled = 0;*/ break; default: BUG(); } lock = pcpu_schedule_lock_irq(cpu); stop_timer(&sd->s_timer); /* get policy-specific decision on scheduling... 
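 *
 * (Aside, a hedged sketch of the contract with the pluggable scheduler:
 *  the do_schedule() hook answers with a struct task_slice, in this Xen
 *  version essentially
 *
 *      struct task_slice {
 *          struct vcpu *task;      [ vCPU to run next                  ]
 *          s_time_t     time;      [ slice length; negative = no timer ]
 *          bool_t       migrated;  [ vCPU has just moved to this pCPU  ]
 *      };
 *
 *  and the rest of this function is generic bookkeeping around that
 *  answer.)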
*/ sched = this_cpu(scheduler); next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled); next = next_slice.task; sd->curr = next; if ( next_slice.time >= 0 ) /* -ve means no limit */ set_timer(&sd->s_timer, now + next_slice.time); if ( unlikely(prev == next) ) { pcpu_schedule_unlock_irq(lock, cpu); trace_continue_running(next); return continue_running(prev); } TRACE_2D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, now - prev->runstate.state_entry_time); TRACE_3D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, (next->runstate.state == RUNSTATE_runnable) ? (now - next->runstate.state_entry_time) : 0, next_slice.time); ASSERT(prev->runstate.state == RUNSTATE_running); TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->vcpu_id, next->domain->domain_id, next->vcpu_id); vcpu_runstate_change( prev, (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked : (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)), now); prev->last_run_time = now; ASSERT(next->runstate.state != RUNSTATE_running); vcpu_runstate_change(next, RUNSTATE_running, now); /* * NB. Don't add any trace records from here until the actual context * switch, else lost_records resume will not work properly. */ ASSERT(!next->is_running); next->is_running = 1; pcpu_schedule_unlock_irq(lock, cpu); SCHED_STAT_CRANK(sched_ctx); stop_timer(&prev->periodic_timer); if ( next_slice.migrated ) evtchn_move_pirqs(next); vcpu_periodic_timer_work(next); context_switch(prev, next); } void context_saved(struct vcpu *prev) { /* Clear running flag /after/ writing context to memory. */ smp_wmb(); prev->is_running = 0; /* Check for migration request /after/ clearing running flag. */ smp_mb(); SCHED_OP(VCPU2OP(prev), context_saved, prev); if ( unlikely(test_bit(_VPF_migrating, &prev->pause_flags)) ) vcpu_migrate(prev); } /* The scheduler timer: force a run through the scheduler */ static void s_timer_fn(void *unused) { raise_softirq(SCHEDULE_SOFTIRQ); SCHED_STAT_CRANK(sched_irq); } /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */ static void vcpu_periodic_timer_fn(void *data) { struct vcpu *v = data; vcpu_periodic_timer_work(v); } /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */ static void vcpu_singleshot_timer_fn(void *data) { struct vcpu *v = data; send_timer_event(v); } /* SCHEDOP_poll timeout callback. */ static void poll_timer_fn(void *data) { struct vcpu *v = data; if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) vcpu_unblock(v); } static int cpu_schedule_up(unsigned int cpu) { struct schedule_data *sd = &per_cpu(schedule_data, cpu); per_cpu(scheduler, cpu) = &ops; spin_lock_init(&sd->_lock); sd->schedule_lock = &sd->_lock; sd->curr = idle_vcpu[cpu]; init_timer(&sd->s_timer, s_timer_fn, NULL, cpu); atomic_set(&sd->urgent_count, 0); /* Boot CPU is dealt with later in schedule_init(). 
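 *
 * (Aside, summarising the hotplug flow visible around this function: for
 *  secondary CPUs this is reached from the cpu_schedule_nfb notifier
 *  registered below, roughly
 *
 *      cpu_up(cpu)                     -> CPU_UP_PREPARE -> cpu_schedule_up()
 *      cpu_down(cpu) / failed bring-up -> CPU_DEAD or CPU_UP_CANCELED
 *                                      -> cpu_schedule_down()
 *
 *  whereas the boot CPU is already running before any notifier exists,
 *  hence the special-casing in scheduler_init().)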
*/ if ( cpu == 0 ) return 0; if ( idle_vcpu[cpu] == NULL ) alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu); if ( idle_vcpu[cpu] == NULL ) return -ENOMEM; if ( (ops.alloc_pdata != NULL) && ((sd->sched_priv = ops.alloc_pdata(&ops, cpu)) == NULL) ) return -ENOMEM; return 0; } static void cpu_schedule_down(unsigned int cpu) { struct schedule_data *sd = &per_cpu(schedule_data, cpu); if ( sd->sched_priv != NULL ) SCHED_OP(&ops, free_pdata, sd->sched_priv, cpu); kill_timer(&sd->s_timer); } static int cpu_schedule_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; switch ( action ) { case CPU_UP_PREPARE: rc = cpu_schedule_up(cpu); break; case CPU_UP_CANCELED: case CPU_DEAD: cpu_schedule_down(cpu); break; default: break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_schedule_nfb = { .notifier_call = cpu_schedule_callback }; /* Initialise the data structures. */ void __init scheduler_init(void) { struct domain *idle_domain; int i; open_softirq(SCHEDULE_SOFTIRQ, schedule); for ( i = 0; i < ARRAY_SIZE(schedulers); i++ ) { if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 ) schedulers[i] = NULL; else if ( !ops.name && !strcmp(schedulers[i]->opt_name, opt_sched) ) ops = *schedulers[i]; } if ( !ops.name ) { printk("Could not find scheduler: %s\n", opt_sched); for ( i = 0; i < ARRAY_SIZE(schedulers); i++ ) if ( schedulers[i] ) { ops = *schedulers[i]; break; } BUG_ON(!ops.name); printk("Using '%s' (%s)\n", ops.name, ops.opt_name); } if ( cpu_schedule_up(0) ) BUG(); register_cpu_notifier(&cpu_schedule_nfb); printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name); if ( SCHED_OP(&ops, init) ) panic("scheduler returned error on init"); if ( sched_ratelimit_us && (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) ) { printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n" " Resetting to default %u\n", XEN_SYSCTL_SCHED_RATELIMIT_MIN, XEN_SYSCTL_SCHED_RATELIMIT_MAX, SCHED_DEFAULT_RATELIMIT_US); sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US; } idle_domain = domain_create(DOMID_IDLE, 0, 0); BUG_ON(IS_ERR(idle_domain)); idle_domain->vcpu = idle_vcpu; idle_domain->max_vcpus = nr_cpu_ids; if ( alloc_vcpu(idle_domain, 0, 0) == NULL ) BUG(); if ( ops.alloc_pdata && !(this_cpu(schedule_data).sched_priv = ops.alloc_pdata(&ops, 0)) ) BUG(); } int schedule_cpu_switch(unsigned int cpu, struct cpupool *c) { unsigned long flags; struct vcpu *idle; spinlock_t *lock; void *ppriv, *ppriv_old, *vpriv, *vpriv_old; struct scheduler *old_ops = per_cpu(scheduler, cpu); struct scheduler *new_ops = (c == NULL) ? 
&ops : c->sched; if ( old_ops == new_ops ) return 0; idle = idle_vcpu[cpu]; ppriv = SCHED_OP(new_ops, alloc_pdata, cpu); if ( ppriv == NULL ) return -ENOMEM; vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv); if ( vpriv == NULL ) { SCHED_OP(new_ops, free_pdata, ppriv, cpu); return -ENOMEM; } lock = pcpu_schedule_lock_irqsave(cpu, &flags); SCHED_OP(old_ops, tick_suspend, cpu); vpriv_old = idle->sched_priv; idle->sched_priv = vpriv; per_cpu(scheduler, cpu) = new_ops; ppriv_old = per_cpu(schedule_data, cpu).sched_priv; per_cpu(schedule_data, cpu).sched_priv = ppriv; SCHED_OP(new_ops, tick_resume, cpu); SCHED_OP(new_ops, insert_vcpu, idle); pcpu_schedule_unlock_irqrestore(lock, flags, cpu); SCHED_OP(old_ops, free_vdata, vpriv_old); SCHED_OP(old_ops, free_pdata, ppriv_old, cpu); return 0; } struct scheduler *scheduler_get_default(void) { return &ops; } struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr) { int i; struct scheduler *sched; for ( i = 0; i < ARRAY_SIZE(schedulers); i++ ) if ( schedulers[i] && schedulers[i]->sched_id == sched_id ) goto found; *perr = -ENOENT; return NULL; found: *perr = -ENOMEM; if ( (sched = xmalloc(struct scheduler)) == NULL ) return NULL; memcpy(sched, schedulers[i], sizeof(*sched)); if ( (*perr = SCHED_OP(sched, init)) != 0 ) { xfree(sched); sched = NULL; } return sched; } void scheduler_free(struct scheduler *sched) { BUG_ON(sched == &ops); SCHED_OP(sched, deinit); xfree(sched); } void schedule_dump(struct cpupool *c) { int i; struct scheduler *sched; cpumask_t *cpus; sched = (c == NULL) ? &ops : c->sched; cpus = cpupool_scheduler_cpumask(c); printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name); SCHED_OP(sched, dump_settings); for_each_cpu (i, cpus) { spinlock_t *lock = pcpu_schedule_lock(i); printk("CPU[%02d] ", i); SCHED_OP(sched, dump_cpu_state, i); pcpu_schedule_unlock(lock, i); } } void sched_tick_suspend(void) { struct scheduler *sched; unsigned int cpu = smp_processor_id(); sched = per_cpu(scheduler, cpu); SCHED_OP(sched, tick_suspend, cpu); } void sched_tick_resume(void) { struct scheduler *sched; unsigned int cpu = smp_processor_id(); sched = per_cpu(scheduler, cpu); SCHED_OP(sched, tick_resume, cpu); } void wait(void) { schedule(); } #ifdef CONFIG_COMPAT #include "compat/schedule.c" #endif #endif /* !COMPAT */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/tmem.c0000664000175000017500000025573712307313555013656 0ustar smbsmb/****************************************************************************** * tmem.c * * Transcendent memory * * Copyright (c) 2009, Dan Magenheimer, Oracle Corp. */ /* TODO list: 090129 (updated 100318) - any better reclamation policy? - use different tlsf pools for each client (maybe each pool) - test shared access more completely (ocfs2) - add feedback-driven compression (not for persistent pools though!) 
- add data-structure total bytes overhead stats */ #ifdef __XEN__ #include /* host-specific (eg Xen) code goes here */ #endif #include #include #include #include #include #define TMEM_SPEC_VERSION 1 /* global statistics (none need to be locked) */ static unsigned long total_tmem_ops = 0; static unsigned long errored_tmem_ops = 0; static unsigned long total_flush_pool = 0; static unsigned long alloc_failed = 0, alloc_page_failed = 0; static unsigned long evicted_pgs = 0, evict_attempts = 0; static unsigned long relinq_pgs = 0, relinq_attempts = 0; static unsigned long max_evicts_per_relinq = 0; static unsigned long low_on_memory = 0; static unsigned long deduped_puts = 0; static unsigned long tot_good_eph_puts = 0; static int global_obj_count_max = 0; static int global_pgp_count_max = 0; static int global_pcd_count_max = 0; static int global_page_count_max = 0; static int global_rtree_node_count_max = 0; static long global_eph_count_max = 0; static unsigned long failed_copies; static unsigned long pcd_tot_tze_size = 0; static unsigned long pcd_tot_csize = 0; /************ CORE DATA STRUCTURES ************************************/ #define MAX_POOLS_PER_DOMAIN 16 #define MAX_GLOBAL_SHARED_POOLS 16 struct tmem_pool; struct tmem_page_descriptor; struct tmem_page_content_descriptor; struct client { struct list_head client_list; struct tmem_pool *pools[MAX_POOLS_PER_DOMAIN]; struct domain *domain; struct xmem_pool *persistent_pool; struct list_head ephemeral_page_list; long eph_count, eph_count_max; domid_t cli_id; uint32_t weight; uint32_t cap; bool_t compress; bool_t frozen; bool_t shared_auth_required; /* for save/restore/migration */ bool_t live_migrating; bool_t was_frozen; struct list_head persistent_invalidated_list; struct tmem_page_descriptor *cur_pgp; /* statistics collection */ unsigned long compress_poor, compress_nomem; unsigned long compressed_pages; uint64_t compressed_sum_size; uint64_t total_cycles; unsigned long succ_pers_puts, succ_eph_gets, succ_pers_gets; /* shared pool authentication */ uint64_t shared_auth_uuid[MAX_GLOBAL_SHARED_POOLS][2]; }; struct share_list { struct list_head share_list; struct client *client; }; #define OBJ_HASH_BUCKETS 256 /* must be power of two */ #define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1) struct tmem_pool { bool_t shared; bool_t persistent; bool_t is_dying; int pageshift; /* 0 == 2**12 */ struct list_head pool_list; struct client *client; uint64_t uuid[2]; /* 0 for private, non-zero for shared */ uint32_t pool_id; rwlock_t pool_rwlock; struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */ struct list_head share_list; /* valid if shared */ int shared_count; /* valid if shared */ /* for save/restore/migration */ struct list_head persistent_page_list; struct tmem_page_descriptor *cur_pgp; /* statistics collection */ atomic_t pgp_count; int pgp_count_max; long obj_count; /* atomicity depends on pool_rwlock held for write */ long obj_count_max; unsigned long objnode_count, objnode_count_max; uint64_t sum_life_cycles; uint64_t sum_evicted_cycles; unsigned long puts, good_puts, no_mem_puts; unsigned long dup_puts_flushed, dup_puts_replaced; unsigned long gets, found_gets; unsigned long flushs, flushs_found; unsigned long flush_objs, flush_objs_found; }; #define is_persistent(_p) (_p->persistent) #define is_shared(_p) (_p->shared) struct oid { uint64_t oid[3]; }; struct tmem_object_root { struct oid oid; struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */ unsigned long objnode_count; /* atomicity depends on 
obj_spinlock */ long pgp_count; /* atomicity depends on obj_spinlock */ struct radix_tree_root tree_root; /* tree of pages within object */ struct tmem_pool *pool; domid_t last_client; spinlock_t obj_spinlock; }; struct tmem_object_node { struct tmem_object_root *obj; struct radix_tree_node rtn; }; struct tmem_page_descriptor { union { struct list_head global_eph_pages; struct list_head client_inv_pages; }; union { struct { union { struct list_head client_eph_pages; struct list_head pool_pers_pages; }; struct tmem_object_root *obj; } us; struct oid inv_oid; /* used for invalid list only */ }; pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid, else compressed data (cdata) */ uint32_t index; /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */ uint16_t firstbyte; /* NON_SHAREABLE->pfp otherwise->pcd */ bool_t eviction_attempted; /* CHANGE TO lifetimes? (settable) */ struct list_head pcd_siblings; union { struct page_info *pfp; /* page frame pointer */ char *cdata; /* compressed data */ struct tmem_page_content_descriptor *pcd; /* page dedup */ }; union { uint64_t timestamp; uint32_t pool_id; /* used for invalid list only */ }; }; #define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64)) struct tmem_page_content_descriptor { union { struct page_info *pfp; /* page frame pointer */ char *cdata; /* if compression_enabled */ char *tze; /* if !compression_enabled, trailing zeroes eliminated */ }; struct list_head pgp_list; struct rb_node pcd_rb_tree_node; uint32_t pgp_ref_count; pagesize_t size; /* if compression_enabled -> 0 *pfp */ }; struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */ rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */ static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools */ static LIST_HEAD(global_client_list); static LIST_HEAD(global_pool_list); static struct tmem_pool *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 }; static bool_t global_shared_auth = 0; static atomic_t client_weight_total = ATOMIC_INIT(0); static int tmem_initialized = 0; struct xmem_pool *tmem_mempool = 0; unsigned int tmem_mempool_maxalloc = 0; DEFINE_SPINLOCK(tmem_page_list_lock); PAGE_LIST_HEAD(tmem_page_list); unsigned long tmem_page_list_pages = 0; DEFINE_RWLOCK(tmem_rwlock); static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */ static DEFINE_SPINLOCK(pers_lists_spinlock); #define ASSERT_SPINLOCK(_l) ASSERT(spin_is_locked(_l)) #define ASSERT_WRITELOCK(_l) ASSERT(rw_is_write_locked(_l)) /* global counters (should use long_atomic_t access) */ static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */ static atomic_t global_obj_count = ATOMIC_INIT(0); static atomic_t global_pgp_count = ATOMIC_INIT(0); static atomic_t global_pcd_count = ATOMIC_INIT(0); static atomic_t global_page_count = ATOMIC_INIT(0); static atomic_t global_rtree_node_count = ATOMIC_INIT(0); #define atomic_inc_and_max(_c) do { \ atomic_inc(&_c); \ if ( _atomic_read(_c) > _c##_max ) \ _c##_max = _atomic_read(_c); \ } while (0) #define atomic_dec_and_assert(_c) do { \ atomic_dec(&_c); \ ASSERT(_atomic_read(_c) >= 0); \ } while (0) /* * There two types of memory allocation interfaces in tmem. * One is based on xmem_pool and the other is used for allocate a whole page. * Both of them are based on the lowlevel function __tmem_alloc_page/_thispool(). * The call trace of alloc path is like below. 
* Persistant pool: * 1.tmem_malloc() * > xmem_pool_alloc() * > tmem_persistent_pool_page_get() * > __tmem_alloc_page_thispool() * 2.tmem_alloc_page() * > __tmem_alloc_page_thispool() * * Ephemeral pool: * 1.tmem_malloc() * > xmem_pool_alloc() * > tmem_mempool_page_get() * > __tmem_alloc_page() * 2.tmem_alloc_page() * > __tmem_alloc_page() * * The free path is done in the same manner. */ static void *tmem_malloc(size_t size, struct tmem_pool *pool) { void *v = NULL; if ( (pool != NULL) && is_persistent(pool) ) { if ( pool->client->persistent_pool ) v = xmem_pool_alloc(size, pool->client->persistent_pool); } else { ASSERT( size < tmem_mempool_maxalloc ); ASSERT( tmem_mempool != NULL ); v = xmem_pool_alloc(size, tmem_mempool); } if ( v == NULL ) alloc_failed++; return v; } static void tmem_free(void *p, struct tmem_pool *pool) { if ( pool == NULL || !is_persistent(pool) ) { ASSERT( tmem_mempool != NULL ); xmem_pool_free(p, tmem_mempool); } else { ASSERT( pool->client->persistent_pool != NULL ); xmem_pool_free(p, pool->client->persistent_pool); } } static struct page_info *tmem_alloc_page(struct tmem_pool *pool) { struct page_info *pfp = NULL; if ( pool != NULL && is_persistent(pool) ) pfp = __tmem_alloc_page_thispool(pool->client->domain); else pfp = __tmem_alloc_page(); if ( pfp == NULL ) alloc_page_failed++; else atomic_inc_and_max(global_page_count); return pfp; } static void tmem_free_page(struct tmem_pool *pool, struct page_info *pfp) { ASSERT(pfp); if ( pool == NULL || !is_persistent(pool) ) __tmem_free_page(pfp); else __tmem_free_page_thispool(pfp); atomic_dec_and_assert(global_page_count); } static noinline void *tmem_mempool_page_get(unsigned long size) { struct page_info *pi; ASSERT(size == PAGE_SIZE); if ( (pi = __tmem_alloc_page()) == NULL ) return NULL; return page_to_virt(pi); } static void tmem_mempool_page_put(void *page_va) { ASSERT(IS_PAGE_ALIGNED(page_va)); __tmem_free_page(virt_to_page(page_va)); } static int __init tmem_mempool_init(void) { tmem_mempool = xmem_pool_create("tmem", tmem_mempool_page_get, tmem_mempool_page_put, PAGE_SIZE, 0, PAGE_SIZE); if ( tmem_mempool ) tmem_mempool_maxalloc = xmem_pool_maxalloc(tmem_mempool); return tmem_mempool != NULL; } /* persistent pools are per-domain */ static void *tmem_persistent_pool_page_get(unsigned long size) { struct page_info *pi; struct domain *d = current->domain; ASSERT(size == PAGE_SIZE); if ( (pi = __tmem_alloc_page_thispool(d)) == NULL ) return NULL; ASSERT(IS_VALID_PAGE(pi)); return page_to_virt(pi); } static void tmem_persistent_pool_page_put(void *page_va) { struct page_info *pi; ASSERT(IS_PAGE_ALIGNED(page_va)); pi = mfn_to_page(virt_to_mfn(page_va)); ASSERT(IS_VALID_PAGE(pi)); __tmem_free_page_thispool(pi); } /* * Page content descriptor manipulation routines */ #define NOT_SHAREABLE ((uint16_t)-1UL) static int pcd_copy_to_client(xen_pfn_t cmfn, struct tmem_page_descriptor *pgp) { uint8_t firstbyte = pgp->firstbyte; struct tmem_page_content_descriptor *pcd; int ret; ASSERT(tmem_dedup_enabled()); read_lock(&pcd_tree_rwlocks[firstbyte]); pcd = pgp->pcd; if ( pgp->size < PAGE_SIZE && pgp->size != 0 && pcd->size < PAGE_SIZE && pcd->size != 0 ) ret = tmem_decompress_to_client(cmfn, pcd->cdata, pcd->size, tmem_cli_buf_null); else if ( tmem_tze_enabled() && pcd->size < PAGE_SIZE ) ret = tmem_copy_tze_to_client(cmfn, pcd->tze, pcd->size); else ret = tmem_copy_to_client(cmfn, pcd->pfp, tmem_cli_buf_null); read_unlock(&pcd_tree_rwlocks[firstbyte]); return ret; } /* ensure pgp no longer points to pcd, nor vice-versa */ /* 
take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */ static void pcd_disassociate(struct tmem_page_descriptor *pgp, struct tmem_pool *pool, bool_t have_pcd_rwlock) { struct tmem_page_content_descriptor *pcd = pgp->pcd; struct page_info *pfp = pgp->pcd->pfp; uint16_t firstbyte = pgp->firstbyte; char *pcd_tze = pgp->pcd->tze; pagesize_t pcd_size = pcd->size; pagesize_t pgp_size = pgp->size; char *pcd_cdata = pgp->pcd->cdata; pagesize_t pcd_csize = pgp->pcd->size; ASSERT(tmem_dedup_enabled()); ASSERT(firstbyte != NOT_SHAREABLE); ASSERT(firstbyte < 256); if ( have_pcd_rwlock ) ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]); else write_lock(&pcd_tree_rwlocks[firstbyte]); list_del_init(&pgp->pcd_siblings); pgp->pcd = NULL; pgp->firstbyte = NOT_SHAREABLE; pgp->size = -1; if ( --pcd->pgp_ref_count ) { write_unlock(&pcd_tree_rwlocks[firstbyte]); return; } /* no more references to this pcd, recycle it and the physical page */ ASSERT(list_empty(&pcd->pgp_list)); pcd->pfp = NULL; /* remove pcd from rbtree */ rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]); /* reinit the struct for safety for now */ RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); /* now free up the pcd memory */ tmem_free(pcd, NULL); atomic_dec_and_assert(global_pcd_count); if ( pgp_size != 0 && pcd_size < PAGE_SIZE ) { /* compressed data */ tmem_free(pcd_cdata, pool); pcd_tot_csize -= pcd_csize; } else if ( pcd_size != PAGE_SIZE ) { /* trailing zero data */ pcd_tot_tze_size -= pcd_size; if ( pcd_size ) tmem_free(pcd_tze, pool); } else { /* real physical page */ if ( tmem_tze_enabled() ) pcd_tot_tze_size -= PAGE_SIZE; if ( tmem_compression_enabled() ) pcd_tot_csize -= PAGE_SIZE; tmem_free_page(pool,pfp); } write_unlock(&pcd_tree_rwlocks[firstbyte]); } static int pcd_associate(struct tmem_page_descriptor *pgp, char *cdata, pagesize_t csize) { struct rb_node **new, *parent = NULL; struct rb_root *root; struct tmem_page_content_descriptor *pcd; int cmp; pagesize_t pfp_size = 0; uint8_t firstbyte = (cdata == NULL) ? 
tmem_get_first_byte(pgp->pfp) : *cdata; int ret = 0; if ( !tmem_dedup_enabled() ) return 0; ASSERT(pgp->us.obj != NULL); ASSERT(pgp->us.obj->pool != NULL); ASSERT(!pgp->us.obj->pool->persistent); if ( cdata == NULL ) { ASSERT(pgp->pfp != NULL); pfp_size = PAGE_SIZE; if ( tmem_tze_enabled() ) { pfp_size = tmem_tze_pfp_scan(pgp->pfp); if ( pfp_size > PCD_TZE_MAX_SIZE ) pfp_size = PAGE_SIZE; } ASSERT(pfp_size <= PAGE_SIZE); ASSERT(!(pfp_size & (sizeof(uint64_t)-1))); } write_lock(&pcd_tree_rwlocks[firstbyte]); /* look for page match */ root = &pcd_tree_roots[firstbyte]; new = &(root->rb_node); while ( *new ) { pcd = container_of(*new, struct tmem_page_content_descriptor, pcd_rb_tree_node); parent = *new; /* compare new entry and rbtree entry, set cmp accordingly */ if ( cdata != NULL ) { if ( pcd->size < PAGE_SIZE ) /* both new entry and rbtree entry are compressed */ cmp = tmem_pcd_cmp(cdata,csize,pcd->cdata,pcd->size); else /* new entry is compressed, rbtree entry is not */ cmp = -1; } else if ( pcd->size < PAGE_SIZE ) /* rbtree entry is compressed, new entry is not */ cmp = 1; else if ( tmem_tze_enabled() ) { if ( pcd->size < PAGE_SIZE ) /* both new entry and rbtree entry are trailing zero */ cmp = tmem_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->tze,pcd->size); else /* new entry is trailing zero, rbtree entry is not */ cmp = tmem_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->pfp,PAGE_SIZE); } else { /* both new entry and rbtree entry are full physical pages */ ASSERT(pgp->pfp != NULL); ASSERT(pcd->pfp != NULL); cmp = tmem_page_cmp(pgp->pfp,pcd->pfp); } /* walk tree or match depending on cmp */ if ( cmp < 0 ) new = &((*new)->rb_left); else if ( cmp > 0 ) new = &((*new)->rb_right); else { /* match! if not compressed, free the no-longer-needed page */ /* but if compressed, data is assumed static so don't free! 
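       (cdata points at the per-cpu dstmem scratch buffer filled by
       tmem_compress_from_client(), not at memory this function allocated).
       Worked example of a hit (illustrative): if the matching pcd currently
       has pgp_ref_count == 1, then after the 'match:' label below the count
       becomes 2 and the new pgp simply shares the existing pcd; in the
       uncompressed case the page freed here is the memory that deduplication
       actually saves.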
*/ if ( cdata == NULL ) tmem_free_page(pgp->us.obj->pool,pgp->pfp); deduped_puts++; goto match; } } /* exited while loop with no match, so alloc a pcd and put it in the tree */ if ( (pcd = tmem_malloc(sizeof(struct tmem_page_content_descriptor), NULL)) == NULL ) { ret = -ENOMEM; goto unlock; } else if ( cdata != NULL ) { if ( (pcd->cdata = tmem_malloc(csize,pgp->us.obj->pool)) == NULL ) { tmem_free(pcd, NULL); ret = -ENOMEM; goto unlock; } } atomic_inc_and_max(global_pcd_count); RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); /* is this necessary */ INIT_LIST_HEAD(&pcd->pgp_list); /* is this necessary */ pcd->pgp_ref_count = 0; if ( cdata != NULL ) { memcpy(pcd->cdata,cdata,csize); pcd->size = csize; pcd_tot_csize += csize; } else if ( pfp_size == 0 ) { ASSERT(tmem_tze_enabled()); pcd->size = 0; pcd->tze = NULL; } else if ( pfp_size < PAGE_SIZE && ((pcd->tze = tmem_malloc(pfp_size,pgp->us.obj->pool)) != NULL) ) { tmem_tze_copy_from_pfp(pcd->tze,pgp->pfp,pfp_size); pcd->size = pfp_size; pcd_tot_tze_size += pfp_size; tmem_free_page(pgp->us.obj->pool,pgp->pfp); } else { pcd->pfp = pgp->pfp; pcd->size = PAGE_SIZE; if ( tmem_tze_enabled() ) pcd_tot_tze_size += PAGE_SIZE; if ( tmem_compression_enabled() ) pcd_tot_csize += PAGE_SIZE; } rb_link_node(&pcd->pcd_rb_tree_node, parent, new); rb_insert_color(&pcd->pcd_rb_tree_node, root); match: pcd->pgp_ref_count++; list_add(&pgp->pcd_siblings,&pcd->pgp_list); pgp->firstbyte = firstbyte; pgp->eviction_attempted = 0; pgp->pcd = pcd; unlock: write_unlock(&pcd_tree_rwlocks[firstbyte]); return ret; } /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/ /* allocate a struct tmem_page_descriptor and associate it with an object */ static struct tmem_page_descriptor *pgp_alloc(struct tmem_object_root *obj) { struct tmem_page_descriptor *pgp; struct tmem_pool *pool; ASSERT(obj != NULL); ASSERT(obj->pool != NULL); pool = obj->pool; if ( (pgp = tmem_malloc(sizeof(struct tmem_page_descriptor), pool)) == NULL ) return NULL; pgp->us.obj = obj; INIT_LIST_HEAD(&pgp->global_eph_pages); INIT_LIST_HEAD(&pgp->us.client_eph_pages); pgp->pfp = NULL; if ( tmem_dedup_enabled() ) { pgp->firstbyte = NOT_SHAREABLE; pgp->eviction_attempted = 0; INIT_LIST_HEAD(&pgp->pcd_siblings); } pgp->size = -1; pgp->index = -1; pgp->timestamp = get_cycles(); atomic_inc_and_max(global_pgp_count); atomic_inc_and_max(pool->pgp_count); return pgp; } static struct tmem_page_descriptor *pgp_lookup_in_obj(struct tmem_object_root *obj, uint32_t index) { ASSERT(obj != NULL); ASSERT_SPINLOCK(&obj->obj_spinlock); ASSERT(obj->pool != NULL); return radix_tree_lookup(&obj->tree_root, index); } static void pgp_free_data(struct tmem_page_descriptor *pgp, struct tmem_pool *pool) { pagesize_t pgp_size = pgp->size; if ( pgp->pfp == NULL ) return; if ( tmem_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE ) pcd_disassociate(pgp,pool,0); /* pgp->size lost */ else if ( pgp_size ) tmem_free(pgp->cdata, pool); else tmem_free_page(pgp->us.obj->pool,pgp->pfp); if ( pool != NULL && pgp_size ) { pool->client->compressed_pages--; pool->client->compressed_sum_size -= pgp_size; } pgp->pfp = NULL; pgp->size = -1; } static void pgp_free(struct tmem_page_descriptor *pgp, int from_delete) { struct tmem_pool *pool = NULL; ASSERT(pgp->us.obj != NULL); ASSERT(pgp->us.obj->pool->client != NULL); if ( from_delete ) ASSERT(pgp_lookup_in_obj(pgp->us.obj,pgp->index) == NULL); ASSERT(pgp->us.obj->pool != NULL); pool = pgp->us.obj->pool; if ( !is_persistent(pool) ) { ASSERT(list_empty(&pgp->global_eph_pages)); 
ASSERT(list_empty(&pgp->us.client_eph_pages)); } pgp_free_data(pgp, pool); atomic_dec_and_assert(global_pgp_count); atomic_dec_and_assert(pool->pgp_count); pgp->size = -1; if ( is_persistent(pool) && pool->client->live_migrating ) { pgp->inv_oid = pgp->us.obj->oid; pgp->pool_id = pool->pool_id; return; } pgp->us.obj = NULL; pgp->index = -1; tmem_free(pgp, pool); } static void pgp_free_from_inv_list(struct client *client, struct tmem_page_descriptor *pgp) { struct tmem_pool *pool = client->pools[pgp->pool_id]; pgp->us.obj = NULL; pgp->index = -1; tmem_free(pgp, pool); } /* remove the page from appropriate lists but not from parent object */ static void pgp_delist(struct tmem_page_descriptor *pgp, bool_t no_eph_lock) { struct client *client; ASSERT(pgp != NULL); ASSERT(pgp->us.obj != NULL); ASSERT(pgp->us.obj->pool != NULL); client = pgp->us.obj->pool->client; ASSERT(client != NULL); if ( !is_persistent(pgp->us.obj->pool) ) { if ( !no_eph_lock ) spin_lock(&eph_lists_spinlock); if ( !list_empty(&pgp->us.client_eph_pages) ) client->eph_count--; ASSERT(client->eph_count >= 0); list_del_init(&pgp->us.client_eph_pages); if ( !list_empty(&pgp->global_eph_pages) ) global_eph_count--; ASSERT(global_eph_count >= 0); list_del_init(&pgp->global_eph_pages); if ( !no_eph_lock ) spin_unlock(&eph_lists_spinlock); } else { if ( client->live_migrating ) { spin_lock(&pers_lists_spinlock); list_add_tail(&pgp->client_inv_pages, &client->persistent_invalidated_list); if ( pgp != pgp->us.obj->pool->cur_pgp ) list_del_init(&pgp->us.pool_pers_pages); spin_unlock(&pers_lists_spinlock); } else { spin_lock(&pers_lists_spinlock); list_del_init(&pgp->us.pool_pers_pages); spin_unlock(&pers_lists_spinlock); } } } /* remove page from lists (but not from parent object) and free it */ static void pgp_delete(struct tmem_page_descriptor *pgp, bool_t no_eph_lock) { uint64_t life; ASSERT(pgp != NULL); ASSERT(pgp->us.obj != NULL); ASSERT(pgp->us.obj->pool != NULL); life = get_cycles() - pgp->timestamp; pgp->us.obj->pool->sum_life_cycles += life; pgp_delist(pgp, no_eph_lock); pgp_free(pgp,1); } /* called only indirectly by radix_tree_destroy */ static void pgp_destroy(void *v) { struct tmem_page_descriptor *pgp = (struct tmem_page_descriptor *)v; ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock); pgp_delist(pgp,0); ASSERT(pgp->us.obj != NULL); pgp->us.obj->pgp_count--; ASSERT(pgp->us.obj->pgp_count >= 0); pgp_free(pgp,0); } static int pgp_add_to_obj(struct tmem_object_root *obj, uint32_t index, struct tmem_page_descriptor *pgp) { int ret; ASSERT_SPINLOCK(&obj->obj_spinlock); ret = radix_tree_insert(&obj->tree_root, index, pgp); if ( !ret ) obj->pgp_count++; return ret; } static struct tmem_page_descriptor *pgp_delete_from_obj(struct tmem_object_root *obj, uint32_t index) { struct tmem_page_descriptor *pgp; ASSERT(obj != NULL); ASSERT_SPINLOCK(&obj->obj_spinlock); ASSERT(obj->pool != NULL); pgp = radix_tree_delete(&obj->tree_root, index); if ( pgp != NULL ) obj->pgp_count--; ASSERT(obj->pgp_count >= 0); return pgp; } /************ RADIX TREE NODE MANIPULATION ROUTINES *******************/ /* called only indirectly from radix_tree_insert */ static struct radix_tree_node *rtn_alloc(void *arg) { struct tmem_object_node *objnode; struct tmem_object_root *obj = (struct tmem_object_root *)arg; ASSERT(obj->pool != NULL); objnode = tmem_malloc(sizeof(struct tmem_object_node),obj->pool); if (objnode == NULL) return NULL; objnode->obj = obj; memset(&objnode->rtn, 0, sizeof(struct radix_tree_node)); if (++obj->pool->objnode_count > 
obj->pool->objnode_count_max) obj->pool->objnode_count_max = obj->pool->objnode_count; atomic_inc_and_max(global_rtree_node_count); obj->objnode_count++; return &objnode->rtn; } /* called only indirectly from radix_tree_delete/destroy */ static void rtn_free(struct radix_tree_node *rtn, void *arg) { struct tmem_pool *pool; struct tmem_object_node *objnode; ASSERT(rtn != NULL); objnode = container_of(rtn,struct tmem_object_node,rtn); ASSERT(objnode->obj != NULL); ASSERT_SPINLOCK(&objnode->obj->obj_spinlock); pool = objnode->obj->pool; ASSERT(pool != NULL); pool->objnode_count--; objnode->obj->objnode_count--; objnode->obj = NULL; tmem_free(objnode, pool); atomic_dec_and_assert(global_rtree_node_count); } /************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/ int oid_compare(struct oid *left, struct oid *right) { if ( left->oid[2] == right->oid[2] ) { if ( left->oid[1] == right->oid[1] ) { if ( left->oid[0] == right->oid[0] ) return 0; else if ( left->oid[0] < right->oid[0] ) return -1; else return 1; } else if ( left->oid[1] < right->oid[1] ) return -1; else return 1; } else if ( left->oid[2] < right->oid[2] ) return -1; else return 1; } void oid_set_invalid(struct oid *oidp) { oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL; } unsigned oid_hash(struct oid *oidp) { return (tmem_hash(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2], BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK); } /* searches for object==oid in pool, returns locked object if found */ static struct tmem_object_root * obj_find(struct tmem_pool *pool, struct oid *oidp) { struct rb_node *node; struct tmem_object_root *obj; restart_find: read_lock(&pool->pool_rwlock); node = pool->obj_rb_root[oid_hash(oidp)].rb_node; while ( node ) { obj = container_of(node, struct tmem_object_root, rb_tree_node); switch ( oid_compare(&obj->oid, oidp) ) { case 0: /* equal */ if ( !spin_trylock(&obj->obj_spinlock) ) { read_unlock(&pool->pool_rwlock); goto restart_find; } read_unlock(&pool->pool_rwlock); return obj; case -1: node = node->rb_left; break; case 1: node = node->rb_right; } } read_unlock(&pool->pool_rwlock); return NULL; } /* free an object that has no more pgps in it */ static void obj_free(struct tmem_object_root *obj, int no_rebalance) { struct tmem_pool *pool; struct oid old_oid; ASSERT_SPINLOCK(&obj->obj_spinlock); ASSERT(obj != NULL); ASSERT(obj->pgp_count == 0); pool = obj->pool; ASSERT(pool != NULL); ASSERT(pool->client != NULL); ASSERT_WRITELOCK(&pool->pool_rwlock); if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */ radix_tree_destroy(&obj->tree_root, pgp_destroy); ASSERT((long)obj->objnode_count == 0); ASSERT(obj->tree_root.rnode == NULL); pool->obj_count--; ASSERT(pool->obj_count >= 0); obj->pool = NULL; old_oid = obj->oid; oid_set_invalid(&obj->oid); obj->last_client = TMEM_CLI_ID_NULL; atomic_dec_and_assert(global_obj_count); /* use no_rebalance only if all objects are being destroyed anyway */ if ( !no_rebalance ) rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[oid_hash(&old_oid)]); spin_unlock(&obj->obj_spinlock); tmem_free(obj, pool); } static int obj_rb_insert(struct rb_root *root, struct tmem_object_root *obj) { struct rb_node **new, *parent = NULL; struct tmem_object_root *this; new = &(root->rb_node); while ( *new ) { this = container_of(*new, struct tmem_object_root, rb_tree_node); parent = *new; switch ( oid_compare(&this->oid, &obj->oid) ) { case 0: return 0; case -1: new = &((*new)->rb_left); break; case 1: new = &((*new)->rb_right); break; } } rb_link_node(&obj->rb_tree_node, 
parent, new); rb_insert_color(&obj->rb_tree_node, root); return 1; } /* * allocate, initialize, and insert an tmem_object_root * (should be called only if find failed) */ static struct tmem_object_root * obj_new(struct tmem_pool *pool, struct oid *oidp) { struct tmem_object_root *obj; ASSERT(pool != NULL); ASSERT_WRITELOCK(&pool->pool_rwlock); if ( (obj = tmem_malloc(sizeof(struct tmem_object_root), pool)) == NULL ) return NULL; pool->obj_count++; if (pool->obj_count > pool->obj_count_max) pool->obj_count_max = pool->obj_count; atomic_inc_and_max(global_obj_count); radix_tree_init(&obj->tree_root); radix_tree_set_alloc_callbacks(&obj->tree_root, rtn_alloc, rtn_free, obj); spin_lock_init(&obj->obj_spinlock); obj->pool = pool; obj->oid = *oidp; obj->objnode_count = 0; obj->pgp_count = 0; obj->last_client = TMEM_CLI_ID_NULL; spin_lock(&obj->obj_spinlock); obj_rb_insert(&pool->obj_rb_root[oid_hash(oidp)], obj); ASSERT_SPINLOCK(&obj->obj_spinlock); return obj; } /* free an object after destroying any pgps in it */ static void obj_destroy(struct tmem_object_root *obj, int no_rebalance) { ASSERT_WRITELOCK(&obj->pool->pool_rwlock); radix_tree_destroy(&obj->tree_root, pgp_destroy); obj_free(obj,no_rebalance); } /* destroys all objs in a pool, or only if obj->last_client matches cli_id */ static void pool_destroy_objs(struct tmem_pool *pool, bool_t selective, domid_t cli_id) { struct rb_node *node; struct tmem_object_root *obj; int i; write_lock(&pool->pool_rwlock); pool->is_dying = 1; for (i = 0; i < OBJ_HASH_BUCKETS; i++) { node = rb_first(&pool->obj_rb_root[i]); while ( node != NULL ) { obj = container_of(node, struct tmem_object_root, rb_tree_node); spin_lock(&obj->obj_spinlock); node = rb_next(node); if ( !selective ) /* FIXME: should be obj,1 but walking/erasing rbtree is racy */ obj_destroy(obj,0); else if ( obj->last_client == cli_id ) obj_destroy(obj,0); else spin_unlock(&obj->obj_spinlock); } } write_unlock(&pool->pool_rwlock); } /************ POOL MANIPULATION ROUTINES ******************************/ static struct tmem_pool * pool_alloc(void) { struct tmem_pool *pool; int i; if ( (pool = xmalloc(struct tmem_pool)) == NULL ) return NULL; for (i = 0; i < OBJ_HASH_BUCKETS; i++) pool->obj_rb_root[i] = RB_ROOT; INIT_LIST_HEAD(&pool->pool_list); INIT_LIST_HEAD(&pool->persistent_page_list); pool->cur_pgp = NULL; rwlock_init(&pool->pool_rwlock); pool->pgp_count_max = pool->obj_count_max = 0; pool->objnode_count = pool->objnode_count_max = 0; atomic_set(&pool->pgp_count,0); pool->obj_count = 0; pool->shared_count = 0; pool->pageshift = PAGE_SHIFT - 12; pool->good_puts = pool->puts = pool->dup_puts_flushed = 0; pool->dup_puts_replaced = pool->no_mem_puts = 0; pool->found_gets = pool->gets = 0; pool->flushs_found = pool->flushs = 0; pool->flush_objs_found = pool->flush_objs = 0; pool->is_dying = 0; return pool; } static void pool_free(struct tmem_pool *pool) { pool->client = NULL; list_del(&pool->pool_list); xfree(pool); } /* register new_client as a user of this shared pool and return new total number of registered users */ static int shared_pool_join(struct tmem_pool *pool, struct client *new_client) { struct share_list *sl; ASSERT(is_shared(pool)); if ( (sl = tmem_malloc(sizeof(struct share_list), NULL)) == NULL ) return -1; sl->client = new_client; list_add_tail(&sl->share_list, &pool->share_list); if ( new_client->cli_id != pool->client->cli_id ) tmem_client_info("adding new %s %d to shared pool owned by %s %d\n", tmem_client_str, new_client->cli_id, tmem_client_str, pool->client->cli_id); 
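    /*
     * Worked example (illustrative): the owning client joins first, taking
     * shared_count from 0 to 1; a second authorized client joining the same
     * uuid returns 2 here. shared_pool_quit() reverses this one step at a
     * time and, once the count hits 0, clears the pool's slot in
     * global_shared_pools[].
     */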
return ++pool->shared_count; } /* reassign "ownership" of the pool to another client that shares this pool */ static void shared_pool_reassign(struct tmem_pool *pool) { struct share_list *sl; int poolid; struct client *old_client = pool->client, *new_client; ASSERT(is_shared(pool)); if ( list_empty(&pool->share_list) ) { ASSERT(pool->shared_count == 0); return; } old_client->pools[pool->pool_id] = NULL; sl = list_entry(pool->share_list.next, struct share_list, share_list); ASSERT(sl->client != old_client); pool->client = new_client = sl->client; for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++) if (new_client->pools[poolid] == pool) break; ASSERT(poolid != MAX_POOLS_PER_DOMAIN); new_client->eph_count += _atomic_read(pool->pgp_count); old_client->eph_count -= _atomic_read(pool->pgp_count); list_splice_init(&old_client->ephemeral_page_list, &new_client->ephemeral_page_list); tmem_client_info("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n", tmem_cli_id_str, old_client->cli_id, tmem_cli_id_str, new_client->cli_id, poolid); pool->pool_id = poolid; } /* destroy all objects with last_client same as passed cli_id, remove pool's cli_id from list of sharers of this pool */ static int shared_pool_quit(struct tmem_pool *pool, domid_t cli_id) { struct share_list *sl; int s_poolid; ASSERT(is_shared(pool)); ASSERT(pool->client != NULL); ASSERT_WRITELOCK(&tmem_rwlock); pool_destroy_objs(pool,1,cli_id); list_for_each_entry(sl,&pool->share_list, share_list) { if (sl->client->cli_id != cli_id) continue; list_del(&sl->share_list); tmem_free(sl, pool); --pool->shared_count; if (pool->client->cli_id == cli_id) shared_pool_reassign(pool); if (pool->shared_count) return pool->shared_count; for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++) if ( (global_shared_pools[s_poolid]) == pool ) { global_shared_pools[s_poolid] = NULL; break; } return 0; } tmem_client_warn("tmem: no match unsharing pool, %s=%d\n", tmem_cli_id_str,pool->client->cli_id); return -1; } /* flush all data (owned by cli_id) from a pool and, optionally, free it */ static void pool_flush(struct tmem_pool *pool, domid_t cli_id, bool_t destroy) { ASSERT(pool != NULL); if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) ) { tmem_client_warn("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n", tmem_cli_id_str, cli_id, pool->pool_id, tmem_cli_id_str,pool->client->cli_id); return; } tmem_client_info("%s %s-%s tmem pool %s=%d pool_id=%d\n", destroy ? "destroying" : "flushing", is_persistent(pool) ? "persistent" : "ephemeral" , is_shared(pool) ? "shared" : "private", tmem_cli_id_str, pool->client->cli_id, pool->pool_id); if ( pool->client->live_migrating ) { tmem_client_warn("can't %s pool while %s is live-migrating\n", destroy?"destroy":"flush", tmem_client_str); return; } pool_destroy_objs(pool,0,TMEM_CLI_ID_NULL); if ( destroy ) { pool->client->pools[pool->pool_id] = NULL; pool_free(pool); } } /************ CLIENT MANIPULATION OPERATIONS **************************/ static struct client *client_create(domid_t cli_id) { struct client *client = xzalloc(struct client); int i, shift; char name[5]; struct domain *d; tmem_client_info("tmem: initializing tmem capability for %s=%d...", tmem_cli_id_str, cli_id); if ( client == NULL ) { tmem_client_err("failed... 
out of memory\n"); goto fail; } for (i = 0, shift = 12; i < 4; shift -=4, i++) name[i] = (((unsigned short)cli_id >> shift) & 0xf) + '0'; name[4] = '\0'; client->persistent_pool = xmem_pool_create(name, tmem_persistent_pool_page_get, tmem_persistent_pool_page_put, PAGE_SIZE, 0, PAGE_SIZE); if ( client->persistent_pool == NULL ) { tmem_client_err("failed... can't alloc persistent pool\n"); goto fail; } d = rcu_lock_domain_by_id(cli_id); if ( d == NULL ) { tmem_client_err("failed... can't set client\n"); xmem_pool_destroy(client->persistent_pool); goto fail; } if ( !d->is_dying ) { d->tmem_client = client; client->domain = d; } rcu_unlock_domain(d); client->cli_id = cli_id; client->compress = tmem_compression_enabled(); client->shared_auth_required = tmem_shared_auth(); for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++) client->shared_auth_uuid[i][0] = client->shared_auth_uuid[i][1] = -1L; client->frozen = 0; client->live_migrating = 0; client->weight = 0; client->cap = 0; list_add_tail(&client->client_list, &global_client_list); INIT_LIST_HEAD(&client->ephemeral_page_list); INIT_LIST_HEAD(&client->persistent_invalidated_list); client->cur_pgp = NULL; client->eph_count = client->eph_count_max = 0; client->total_cycles = 0; client->succ_pers_puts = 0; client->succ_eph_gets = 0; client->succ_pers_gets = 0; tmem_client_info("ok\n"); return client; fail: xfree(client); return NULL; } static void client_free(struct client *client) { list_del(&client->client_list); xmem_pool_destroy(client->persistent_pool); xfree(client); } /* flush all data from a client and, optionally, free it */ static void client_flush(struct client *client, bool_t destroy) { int i; struct tmem_pool *pool; for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++) { if ( (pool = client->pools[i]) == NULL ) continue; pool_flush(pool,client->cli_id,destroy); if ( destroy ) client->pools[i] = NULL; } if ( destroy ) client_free(client); } static bool_t client_over_quota(struct client *client) { int total = _atomic_read(client_weight_total); ASSERT(client != NULL); if ( (total == 0) || (client->weight == 0) || (client->eph_count == 0) ) return 0; return ( ((global_eph_count*100L) / client->eph_count ) > ((total*100L) / client->weight) ); } static void client_freeze(struct client *client, int freeze) { client->frozen = freeze; } /************ MEMORY REVOCATION ROUTINES *******************************/ static bool_t tmem_try_to_evict_pgp(struct tmem_page_descriptor *pgp, bool_t *hold_pool_rwlock) { struct tmem_object_root *obj = pgp->us.obj; struct tmem_pool *pool = obj->pool; struct client *client = pool->client; uint16_t firstbyte = pgp->firstbyte; if ( pool->is_dying ) return 0; if ( spin_trylock(&obj->obj_spinlock) ) { if ( tmem_dedup_enabled() ) { firstbyte = pgp->firstbyte; if ( firstbyte == NOT_SHAREABLE ) goto obj_unlock; ASSERT(firstbyte < 256); if ( !write_trylock(&pcd_tree_rwlocks[firstbyte]) ) goto obj_unlock; if ( pgp->pcd->pgp_ref_count > 1 && !pgp->eviction_attempted ) { pgp->eviction_attempted++; list_del(&pgp->global_eph_pages); list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list); list_del(&pgp->us.client_eph_pages); list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list); goto pcd_unlock; } } if ( obj->pgp_count > 1 ) return 1; if ( write_trylock(&pool->pool_rwlock) ) { *hold_pool_rwlock = 1; return 1; } pcd_unlock: write_unlock(&pcd_tree_rwlocks[firstbyte]); obj_unlock: spin_unlock(&obj->obj_spinlock); } return 0; } static int tmem_evict(void) { struct client *client = current->domain->tmem_client; struct 
tmem_page_descriptor *pgp = NULL, *pgp2, *pgp_del; struct tmem_object_root *obj; struct tmem_pool *pool; int ret = 0; bool_t hold_pool_rwlock = 0; evict_attempts++; spin_lock(&eph_lists_spinlock); if ( (client != NULL) && client_over_quota(client) && !list_empty(&client->ephemeral_page_list) ) { list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,us.client_eph_pages) if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) ) goto found; } else if ( list_empty(&global_ephemeral_page_list) ) { goto out; } else { list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,global_eph_pages) if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) ) goto found; } ret = 0; goto out; found: ASSERT(pgp != NULL); obj = pgp->us.obj; ASSERT(obj != NULL); ASSERT(obj->pool != NULL); pool = obj->pool; ASSERT_SPINLOCK(&obj->obj_spinlock); pgp_del = pgp_delete_from_obj(obj, pgp->index); ASSERT(pgp_del == pgp); if ( tmem_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE ) { ASSERT(pgp->pcd->pgp_ref_count == 1 || pgp->eviction_attempted); pcd_disassociate(pgp,pool,1); } pgp_delete(pgp,1); if ( obj->pgp_count == 0 ) { ASSERT_WRITELOCK(&pool->pool_rwlock); obj_free(obj,0); } else spin_unlock(&obj->obj_spinlock); if ( hold_pool_rwlock ) write_unlock(&pool->pool_rwlock); evicted_pgs++; ret = 1; out: spin_unlock(&eph_lists_spinlock); return ret; } static unsigned long tmem_flush_npages(unsigned long n) { unsigned long avail_pages = 0; while ( (avail_pages = tmem_page_list_pages) < n ) { if ( !tmem_evict() ) break; } if ( avail_pages ) { spin_lock(&tmem_page_list_lock); while ( !page_list_empty(&tmem_page_list) ) { struct page_info *pg = page_list_remove_head(&tmem_page_list); scrub_one_page(pg); tmem_page_list_pages--; free_domheap_page(pg); } ASSERT(tmem_page_list_pages == 0); INIT_PAGE_LIST_HEAD(&tmem_page_list); spin_unlock(&tmem_page_list_lock); } return avail_pages; } /* * Under certain conditions (e.g. if each client is putting pages for exactly * one object), once locks are held, freeing up memory may * result in livelocks and very long "put" times, so we try to ensure there * is a minimum amount of memory (1MB) available BEFORE any data structure * locks are held. 
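 * Worked example (illustrative): with 4KiB pages PAGE_SHIFT is 12, so the
 * check below shifts (tmem_page_list_pages + total_free_pages()) right by
 * 8 bits; a combined total of at least 256 pages, i.e. 1MiB, lets
 * tmem_ensure_avail_pages() succeed without evicting anything.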
*/ static inline bool_t tmem_ensure_avail_pages(void) { int failed_evict = 10; unsigned long free_mem; do { free_mem = (tmem_page_list_pages + total_free_pages()) >> (20 - PAGE_SHIFT); if ( free_mem ) return 1; if ( !tmem_evict() ) failed_evict--; } while ( failed_evict > 0 ); return 0; } /************ TMEM CORE OPERATIONS ************************************/ static int do_tmem_put_compress(struct tmem_page_descriptor *pgp, xen_pfn_t cmfn, tmem_cli_va_param_t clibuf) { void *dst, *p; size_t size; int ret = 0; ASSERT(pgp != NULL); ASSERT(pgp->us.obj != NULL); ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock); ASSERT(pgp->us.obj->pool != NULL); ASSERT(pgp->us.obj->pool->client != NULL); if ( pgp->pfp != NULL ) pgp_free_data(pgp, pgp->us.obj->pool); ret = tmem_compress_from_client(cmfn, &dst, &size, clibuf); if ( ret <= 0 ) goto out; else if ( (size == 0) || (size >= tmem_mempool_maxalloc) ) { ret = 0; goto out; } else if ( tmem_dedup_enabled() && !is_persistent(pgp->us.obj->pool) ) { if ( (ret = pcd_associate(pgp,dst,size)) == -ENOMEM ) goto out; } else if ( (p = tmem_malloc(size,pgp->us.obj->pool)) == NULL ) { ret = -ENOMEM; goto out; } else { memcpy(p,dst,size); pgp->cdata = p; } pgp->size = size; pgp->us.obj->pool->client->compressed_pages++; pgp->us.obj->pool->client->compressed_sum_size += size; ret = 1; out: return ret; } static int do_tmem_dup_put(struct tmem_page_descriptor *pgp, xen_pfn_t cmfn, tmem_cli_va_param_t clibuf) { struct tmem_pool *pool; struct tmem_object_root *obj; struct client *client; struct tmem_page_descriptor *pgpfound = NULL; int ret; ASSERT(pgp != NULL); ASSERT(pgp->pfp != NULL); ASSERT(pgp->size != -1); obj = pgp->us.obj; ASSERT_SPINLOCK(&obj->obj_spinlock); ASSERT(obj != NULL); pool = obj->pool; ASSERT(pool != NULL); client = pool->client; if ( client->live_migrating ) goto failed_dup; /* no dups allowed when migrating */ /* can we successfully manipulate pgp to change out the data? 
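       (note, illustrative summary: do_tmem_put_compress() returns 1 when the
       data was stored compressed, 0 when it compressed poorly and should be
       copied uncompressed, and a negative errno such as -ENOMEM or -EFAULT on
       a hard failure; the branches below follow that convention)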
*/ if ( client->compress && pgp->size != 0 ) { ret = do_tmem_put_compress(pgp, cmfn, clibuf); if ( ret == 1 ) goto done; else if ( ret == 0 ) goto copy_uncompressed; else if ( ret == -ENOMEM ) goto failed_dup; else if ( ret == -EFAULT ) goto bad_copy; } copy_uncompressed: if ( pgp->pfp ) pgp_free_data(pgp, pool); if ( ( pgp->pfp = tmem_alloc_page(pool) ) == NULL ) goto failed_dup; pgp->size = 0; ret = tmem_copy_from_client(pgp->pfp, cmfn, tmem_cli_buf_null); if ( ret < 0 ) goto bad_copy; if ( tmem_dedup_enabled() && !is_persistent(pool) ) { if ( pcd_associate(pgp,NULL,0) == -ENOMEM ) goto failed_dup; } done: /* successfully replaced data, clean up and return success */ if ( is_shared(pool) ) obj->last_client = client->cli_id; spin_unlock(&obj->obj_spinlock); pool->dup_puts_replaced++; pool->good_puts++; if ( is_persistent(pool) ) client->succ_pers_puts++; return 1; bad_copy: failed_copies++; goto cleanup; failed_dup: /* couldn't change out the data, flush the old data and return * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put */ ret = -ENOSPC; cleanup: pgpfound = pgp_delete_from_obj(obj, pgp->index); ASSERT(pgpfound == pgp); pgp_delete(pgpfound,0); if ( obj->pgp_count == 0 ) { write_lock(&pool->pool_rwlock); obj_free(obj,0); write_unlock(&pool->pool_rwlock); } else { spin_unlock(&obj->obj_spinlock); } pool->dup_puts_flushed++; return ret; } static int do_tmem_put(struct tmem_pool *pool, struct oid *oidp, uint32_t index, xen_pfn_t cmfn, tmem_cli_va_param_t clibuf) { struct tmem_object_root *obj = NULL; struct tmem_page_descriptor *pgp = NULL; struct client *client; int ret, newobj = 0; ASSERT(pool != NULL); client = pool->client; ret = client->frozen ? -EFROZEN : -ENOMEM; pool->puts++; /* does page already exist (dup)? if so, handle specially */ if ( (obj = obj_find(pool,oidp)) != NULL ) { if ((pgp = pgp_lookup_in_obj(obj, index)) != NULL) { return do_tmem_dup_put(pgp, cmfn, clibuf); } else { /* no puts allowed into a frozen pool (except dup puts) */ if ( client->frozen ) goto unlock_obj; } } else { /* no puts allowed into a frozen pool (except dup puts) */ if ( client->frozen ) return ret; write_lock(&pool->pool_rwlock); if ( (obj = obj_new(pool,oidp)) == NULL ) { write_unlock(&pool->pool_rwlock); return -ENOMEM; } newobj = 1; write_unlock(&pool->pool_rwlock); } /* When arrive here, we have a spinlocked obj for use */ ASSERT_SPINLOCK(&obj->obj_spinlock); if ( (pgp = pgp_alloc(obj)) == NULL ) goto unlock_obj; ret = pgp_add_to_obj(obj, index, pgp); if ( ret == -ENOMEM ) /* warning, may result in partially built radix tree ("stump") */ goto free_pgp; pgp->index = index; pgp->size = 0; if ( client->compress ) { ASSERT(pgp->pfp == NULL); ret = do_tmem_put_compress(pgp, cmfn, clibuf); if ( ret == 1 ) goto insert_page; if ( ret == -ENOMEM ) { client->compress_nomem++; goto del_pgp_from_obj; } if ( ret == 0 ) { client->compress_poor++; goto copy_uncompressed; } if ( ret == -EFAULT ) goto bad_copy; } copy_uncompressed: if ( ( pgp->pfp = tmem_alloc_page(pool) ) == NULL ) { ret = -ENOMEM; goto del_pgp_from_obj; } ret = tmem_copy_from_client(pgp->pfp, cmfn, clibuf); if ( ret < 0 ) goto bad_copy; if ( tmem_dedup_enabled() && !is_persistent(pool) ) { if ( pcd_associate(pgp,NULL,0) == -ENOMEM ) goto del_pgp_from_obj; } insert_page: if ( !is_persistent(pool) ) { spin_lock(&eph_lists_spinlock); list_add_tail(&pgp->global_eph_pages, &global_ephemeral_page_list); if (++global_eph_count > global_eph_count_max) global_eph_count_max = global_eph_count; list_add_tail(&pgp->us.client_eph_pages, 
&client->ephemeral_page_list); if (++client->eph_count > client->eph_count_max) client->eph_count_max = client->eph_count; spin_unlock(&eph_lists_spinlock); } else { /* is_persistent */ spin_lock(&pers_lists_spinlock); list_add_tail(&pgp->us.pool_pers_pages, &pool->persistent_page_list); spin_unlock(&pers_lists_spinlock); } if ( is_shared(pool) ) obj->last_client = client->cli_id; /* free the obj spinlock */ spin_unlock(&obj->obj_spinlock); pool->good_puts++; if ( is_persistent(pool) ) client->succ_pers_puts++; else tot_good_eph_puts++; return 1; bad_copy: failed_copies++; del_pgp_from_obj: ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1)); pgp_delete_from_obj(obj, pgp->index); free_pgp: pgp_delete(pgp, 0); unlock_obj: if ( newobj ) { write_lock(&pool->pool_rwlock); obj_free(obj, 0); write_unlock(&pool->pool_rwlock); } else { spin_unlock(&obj->obj_spinlock); } pool->no_mem_puts++; return ret; } static int do_tmem_get(struct tmem_pool *pool, struct oid *oidp, uint32_t index, xen_pfn_t cmfn, tmem_cli_va_param_t clibuf) { struct tmem_object_root *obj; struct tmem_page_descriptor *pgp; struct client *client = pool->client; int rc; if ( !_atomic_read(pool->pgp_count) ) return -EEMPTY; pool->gets++; obj = obj_find(pool,oidp); if ( obj == NULL ) return 0; ASSERT_SPINLOCK(&obj->obj_spinlock); if (is_shared(pool) || is_persistent(pool) ) pgp = pgp_lookup_in_obj(obj, index); else pgp = pgp_delete_from_obj(obj, index); if ( pgp == NULL ) { spin_unlock(&obj->obj_spinlock); return 0; } ASSERT(pgp->size != -1); if ( tmem_dedup_enabled() && !is_persistent(pool) && pgp->firstbyte != NOT_SHAREABLE ) rc = pcd_copy_to_client(cmfn, pgp); else if ( pgp->size != 0 ) { rc = tmem_decompress_to_client(cmfn, pgp->cdata, pgp->size, clibuf); } else rc = tmem_copy_to_client(cmfn, pgp->pfp, clibuf); if ( rc <= 0 ) goto bad_copy; if ( !is_persistent(pool) ) { if ( !is_shared(pool) ) { pgp_delete(pgp,0); if ( obj->pgp_count == 0 ) { write_lock(&pool->pool_rwlock); obj_free(obj,0); obj = NULL; write_unlock(&pool->pool_rwlock); } } else { spin_lock(&eph_lists_spinlock); list_del(&pgp->global_eph_pages); list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list); list_del(&pgp->us.client_eph_pages); list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list); spin_unlock(&eph_lists_spinlock); obj->last_client = current->domain->domain_id; } } if ( obj != NULL ) { spin_unlock(&obj->obj_spinlock); } pool->found_gets++; if ( is_persistent(pool) ) client->succ_pers_gets++; else client->succ_eph_gets++; return 1; bad_copy: spin_unlock(&obj->obj_spinlock); failed_copies++; return rc; } static int do_tmem_flush_page(struct tmem_pool *pool, struct oid *oidp, uint32_t index) { struct tmem_object_root *obj; struct tmem_page_descriptor *pgp; pool->flushs++; obj = obj_find(pool,oidp); if ( obj == NULL ) goto out; pgp = pgp_delete_from_obj(obj, index); if ( pgp == NULL ) { spin_unlock(&obj->obj_spinlock); goto out; } pgp_delete(pgp,0); if ( obj->pgp_count == 0 ) { write_lock(&pool->pool_rwlock); obj_free(obj,0); write_unlock(&pool->pool_rwlock); } else { spin_unlock(&obj->obj_spinlock); } pool->flushs_found++; out: if ( pool->client->frozen ) return -EFROZEN; else return 1; } static int do_tmem_flush_object(struct tmem_pool *pool, struct oid *oidp) { struct tmem_object_root *obj; pool->flush_objs++; obj = obj_find(pool,oidp); if ( obj == NULL ) goto out; write_lock(&pool->pool_rwlock); obj_destroy(obj,0); pool->flush_objs_found++; write_unlock(&pool->pool_rwlock); out: if ( pool->client->frozen ) return 
-EFROZEN; else return 1; } static int do_tmem_destroy_pool(uint32_t pool_id) { struct client *client = current->domain->tmem_client; struct tmem_pool *pool; if ( client->pools == NULL ) return 0; if ( pool_id >= MAX_POOLS_PER_DOMAIN ) return 0; if ( (pool = client->pools[pool_id]) == NULL ) return 0; client->pools[pool_id] = NULL; pool_flush(pool,client->cli_id,1); return 1; } static int do_tmem_new_pool(domid_t this_cli_id, uint32_t d_poolid, uint32_t flags, uint64_t uuid_lo, uint64_t uuid_hi) { struct client *client; domid_t cli_id; int persistent = flags & TMEM_POOL_PERSIST; int shared = flags & TMEM_POOL_SHARED; int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT) & TMEM_POOL_PAGESIZE_MASK; int specversion = (flags >> TMEM_POOL_VERSION_SHIFT) & TMEM_POOL_VERSION_MASK; struct tmem_pool *pool, *shpool; int s_poolid, first_unused_s_poolid; int i; if ( this_cli_id == TMEM_CLI_ID_NULL ) cli_id = current->domain->domain_id; else cli_id = this_cli_id; tmem_client_info("tmem: allocating %s-%s tmem pool for %s=%d...", persistent ? "persistent" : "ephemeral" , shared ? "shared" : "private", tmem_cli_id_str, cli_id); if ( specversion != TMEM_SPEC_VERSION ) { tmem_client_err("failed... unsupported spec version\n"); return -EPERM; } if ( pagebits != (PAGE_SHIFT - 12) ) { tmem_client_err("failed... unsupported pagesize %d\n", 1 << (pagebits + 12)); return -EPERM; } if ( flags & TMEM_POOL_PRECOMPRESSED ) { tmem_client_err("failed... precompression flag set but unsupported\n"); return -EPERM; } if ( flags & TMEM_POOL_RESERVED_BITS ) { tmem_client_err("failed... reserved bits must be zero\n"); return -EPERM; } if ( (pool = pool_alloc()) == NULL ) { tmem_client_err("failed... out of memory\n"); return -ENOMEM; } if ( this_cli_id != TMEM_CLI_ID_NULL ) { if ( (client = tmem_client_from_cli_id(this_cli_id)) == NULL || d_poolid >= MAX_POOLS_PER_DOMAIN || client->pools[d_poolid] != NULL ) goto fail; } else { client = current->domain->tmem_client; ASSERT(client != NULL); for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ ) if ( client->pools[d_poolid] == NULL ) break; if ( d_poolid >= MAX_POOLS_PER_DOMAIN ) { tmem_client_err("failed... no more pool slots available for this %s\n", tmem_client_str); goto fail; } } if ( shared ) { if ( uuid_lo == -1L && uuid_hi == -1L ) shared = 0; if ( client->shared_auth_required && !global_shared_auth ) { for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++) if ( (client->shared_auth_uuid[i][0] == uuid_lo) && (client->shared_auth_uuid[i][1] == uuid_hi) ) break; if ( i == MAX_GLOBAL_SHARED_POOLS ) shared = 0; } } pool->shared = shared; pool->client = client; if ( shared ) { first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS; for ( s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++ ) { if ( (shpool = global_shared_pools[s_poolid]) != NULL ) { if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi ) { tmem_client_info("(matches shared pool uuid=%"PRIx64".%"PRIx64") pool_id=%d\n", uuid_hi, uuid_lo, d_poolid); client->pools[d_poolid] = global_shared_pools[s_poolid]; shared_pool_join(global_shared_pools[s_poolid], client); pool_free(pool); return d_poolid; } } else if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS ) first_unused_s_poolid = s_poolid; } if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS ) { tmem_client_warn("tmem: failed... 
no global shared pool slots available\n"); goto fail; } else { INIT_LIST_HEAD(&pool->share_list); pool->shared_count = 0; global_shared_pools[first_unused_s_poolid] = pool; (void)shared_pool_join(pool,client); } } client->pools[d_poolid] = pool; list_add_tail(&pool->pool_list, &global_pool_list); pool->pool_id = d_poolid; pool->persistent = persistent; pool->uuid[0] = uuid_lo; pool->uuid[1] = uuid_hi; tmem_client_info("pool_id=%d\n", d_poolid); return d_poolid; fail: pool_free(pool); return -EPERM; } /************ TMEM CONTROL OPERATIONS ************************************/ /* freeze/thaw all pools belonging to client cli_id (all domains if -1) */ static int tmemc_freeze_pools(domid_t cli_id, int arg) { struct client *client; bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0; bool_t destroy = (arg == TMEMC_DESTROY) ? 1 : 0; char *s; s = destroy ? "destroyed" : ( freeze ? "frozen" : "thawed" ); if ( cli_id == TMEM_CLI_ID_NULL ) { list_for_each_entry(client,&global_client_list,client_list) client_freeze(client,freeze); tmem_client_info("tmem: all pools %s for all %ss\n", s, tmem_client_str); } else { if ( (client = tmem_client_from_cli_id(cli_id)) == NULL) return -1; client_freeze(client,freeze); tmem_client_info("tmem: all pools %s for %s=%d\n", s, tmem_cli_id_str, cli_id); } return 0; } static int tmemc_flush_mem(domid_t cli_id, uint32_t kb) { uint32_t npages, flushed_pages, flushed_kb; if ( cli_id != TMEM_CLI_ID_NULL ) { tmem_client_warn("tmem: %s-specific flush not supported yet, use --all\n", tmem_client_str); return -1; } /* convert kb to pages, rounding up if necessary */ npages = (kb + ((1 << (PAGE_SHIFT-10))-1)) >> (PAGE_SHIFT-10); flushed_pages = tmem_flush_npages(npages); flushed_kb = flushed_pages << (PAGE_SHIFT-10); return flushed_kb; } /* * These tmemc_list* routines output lots of stats in a format that is * intended to be program-parseable, not human-readable. Further, by * tying each group of stats to a line format indicator (e.g. G= for * global stats) and each individual stat to a two-letter specifier * (e.g. Ec:nnnnn in the G= line says there are nnnnn pages in the * global ephemeral pool), it should allow the stats reported to be * forward and backwards compatible as tmem evolves. */ #define BSIZE 1024 static int tmemc_list_client(struct client *c, tmem_cli_va_param_t buf, int off, uint32_t len, bool_t use_long) { char info[BSIZE]; int i, n = 0, sum = 0; struct tmem_pool *p; bool_t s; n = scnprintf(info,BSIZE,"C=CI:%d,ww:%d,ca:%d,co:%d,fr:%d," "Tc:%"PRIu64",Ge:%ld,Pp:%ld,Gp:%ld%c", c->cli_id, c->weight, c->cap, c->compress, c->frozen, c->total_cycles, c->succ_eph_gets, c->succ_pers_puts, c->succ_pers_gets, use_long ? ',' : '\n'); if (use_long) n += scnprintf(info+n,BSIZE-n, "Ec:%ld,Em:%ld,cp:%ld,cb:%"PRId64",cn:%ld,cm:%ld\n", c->eph_count, c->eph_count_max, c->compressed_pages, c->compressed_sum_size, c->compress_poor, c->compress_nomem); if ( !copy_to_guest_offset(buf, off + sum, info, n + 1) ) sum += n; for ( i = 0; i < MAX_POOLS_PER_DOMAIN; i++ ) { if ( (p = c->pools[i]) == NULL ) continue; s = is_shared(p); n = scnprintf(info,BSIZE,"P=CI:%d,PI:%d," "PT:%c%c,U0:%"PRIx64",U1:%"PRIx64"%c", c->cli_id, p->pool_id, is_persistent(p) ? 'P' : 'E', s ? 'S' : 'P', (uint64_t)(s ? p->uuid[0] : 0), (uint64_t)(s ? p->uuid[1] : 0LL), use_long ? 
',' : '\n'); if (use_long) n += scnprintf(info+n,BSIZE-n, "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu," "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu," "fs:%lu,ft:%lu,os:%lu,ot:%lu\n", _atomic_read(p->pgp_count), p->pgp_count_max, p->obj_count, p->obj_count_max, p->objnode_count, p->objnode_count_max, p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced, p->no_mem_puts, p->found_gets, p->gets, p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs); if ( sum + n >= len ) return sum; if ( !copy_to_guest_offset(buf, off + sum, info, n + 1) ) sum += n; } return sum; } static int tmemc_list_shared(tmem_cli_va_param_t buf, int off, uint32_t len, bool_t use_long) { char info[BSIZE]; int i, n = 0, sum = 0; struct tmem_pool *p; struct share_list *sl; for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ ) { if ( (p = global_shared_pools[i]) == NULL ) continue; n = scnprintf(info+n,BSIZE-n,"S=SI:%d,PT:%c%c,U0:%"PRIx64",U1:%"PRIx64, i, is_persistent(p) ? 'P' : 'E', is_shared(p) ? 'S' : 'P', p->uuid[0], p->uuid[1]); list_for_each_entry(sl,&p->share_list, share_list) n += scnprintf(info+n,BSIZE-n,",SC:%d",sl->client->cli_id); n += scnprintf(info+n,BSIZE-n,"%c", use_long ? ',' : '\n'); if (use_long) n += scnprintf(info+n,BSIZE-n, "Pc:%d,Pm:%d,Oc:%ld,Om:%ld,Nc:%lu,Nm:%lu," "ps:%lu,pt:%lu,pd:%lu,pr:%lu,px:%lu,gs:%lu,gt:%lu," "fs:%lu,ft:%lu,os:%lu,ot:%lu\n", _atomic_read(p->pgp_count), p->pgp_count_max, p->obj_count, p->obj_count_max, p->objnode_count, p->objnode_count_max, p->good_puts, p->puts,p->dup_puts_flushed, p->dup_puts_replaced, p->no_mem_puts, p->found_gets, p->gets, p->flushs_found, p->flushs, p->flush_objs_found, p->flush_objs); if ( sum + n >= len ) return sum; if ( !copy_to_guest_offset(buf, off + sum, info, n + 1) ) sum += n; } return sum; } static int tmemc_list_global_perf(tmem_cli_va_param_t buf, int off, uint32_t len, bool_t use_long) { char info[BSIZE]; int n = 0, sum = 0; n = scnprintf(info+n,BSIZE-n,"T="); n--; /* overwrite trailing comma */ n += scnprintf(info+n,BSIZE-n,"\n"); if ( sum + n >= len ) return sum; if ( !copy_to_guest_offset(buf, off + sum, info, n + 1) ) sum += n; return sum; } static int tmemc_list_global(tmem_cli_va_param_t buf, int off, uint32_t len, bool_t use_long) { char info[BSIZE]; int n = 0, sum = off; n += scnprintf(info,BSIZE,"G=" "Tt:%lu,Te:%lu,Cf:%lu,Af:%lu,Pf:%lu,Ta:%lu," "Lm:%lu,Et:%lu,Ea:%lu,Rt:%lu,Ra:%lu,Rx:%lu,Fp:%lu%c", total_tmem_ops, errored_tmem_ops, failed_copies, alloc_failed, alloc_page_failed, tmem_page_list_pages, low_on_memory, evicted_pgs, evict_attempts, relinq_pgs, relinq_attempts, max_evicts_per_relinq, total_flush_pool, use_long ? 
',' : '\n'); if (use_long) n += scnprintf(info+n,BSIZE-n, "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d," "Fc:%d,Fm:%d,Sc:%d,Sm:%d,Ep:%lu,Gd:%lu,Zt:%lu,Gz:%lu\n", global_eph_count, global_eph_count_max, _atomic_read(global_obj_count), global_obj_count_max, _atomic_read(global_rtree_node_count), global_rtree_node_count_max, _atomic_read(global_pgp_count), global_pgp_count_max, _atomic_read(global_page_count), global_page_count_max, _atomic_read(global_pcd_count), global_pcd_count_max, tot_good_eph_puts,deduped_puts,pcd_tot_tze_size,pcd_tot_csize); if ( sum + n >= len ) return sum; if ( !copy_to_guest_offset(buf, off + sum, info, n + 1) ) sum += n; return sum; } static int tmemc_list(domid_t cli_id, tmem_cli_va_param_t buf, uint32_t len, bool_t use_long) { struct client *client; int off = 0; if ( cli_id == TMEM_CLI_ID_NULL ) { off = tmemc_list_global(buf,0,len,use_long); off += tmemc_list_shared(buf,off,len-off,use_long); list_for_each_entry(client,&global_client_list,client_list) off += tmemc_list_client(client, buf, off, len-off, use_long); off += tmemc_list_global_perf(buf,off,len-off,use_long); } else if ( (client = tmem_client_from_cli_id(cli_id)) == NULL) return -1; else off = tmemc_list_client(client, buf, 0, len, use_long); return 0; } static int tmemc_set_var_one(struct client *client, uint32_t subop, uint32_t arg1) { domid_t cli_id = client->cli_id; uint32_t old_weight; switch (subop) { case TMEMC_SET_WEIGHT: old_weight = client->weight; client->weight = arg1; tmem_client_info("tmem: weight set to %d for %s=%d\n", arg1, tmem_cli_id_str, cli_id); atomic_sub(old_weight,&client_weight_total); atomic_add(client->weight,&client_weight_total); break; case TMEMC_SET_CAP: client->cap = arg1; tmem_client_info("tmem: cap set to %d for %s=%d\n", arg1, tmem_cli_id_str, cli_id); break; case TMEMC_SET_COMPRESS: if ( tmem_dedup_enabled() ) { tmem_client_warn("tmem: compression %s for all %ss, cannot be changed when tmem_dedup is enabled\n", tmem_compression_enabled() ? "enabled" : "disabled", tmem_client_str); return -1; } client->compress = arg1 ? 1 : 0; tmem_client_info("tmem: compression %s for %s=%d\n", arg1 ? 
"enabled" : "disabled",tmem_cli_id_str,cli_id); break; default: tmem_client_warn("tmem: unknown subop %d for tmemc_set_var\n", subop); return -1; } return 0; } static int tmemc_set_var(domid_t cli_id, uint32_t subop, uint32_t arg1) { struct client *client; if ( cli_id == TMEM_CLI_ID_NULL ) list_for_each_entry(client,&global_client_list,client_list) tmemc_set_var_one(client, subop, arg1); else if ( (client = tmem_client_from_cli_id(cli_id)) == NULL) return -1; else tmemc_set_var_one(client, subop, arg1); return 0; } static int tmemc_shared_pool_auth(domid_t cli_id, uint64_t uuid_lo, uint64_t uuid_hi, bool_t auth) { struct client *client; int i, free = -1; if ( cli_id == TMEM_CLI_ID_NULL ) { global_shared_auth = auth; return 1; } client = tmem_client_from_cli_id(cli_id); if ( client == NULL ) return -EINVAL; for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++) { if ( (client->shared_auth_uuid[i][0] == uuid_lo) && (client->shared_auth_uuid[i][1] == uuid_hi) ) { if ( auth == 0 ) client->shared_auth_uuid[i][0] = client->shared_auth_uuid[i][1] = -1L; return 1; } if ( (auth == 1) && (client->shared_auth_uuid[i][0] == -1L) && (client->shared_auth_uuid[i][1] == -1L) && (free == -1) ) free = i; } if ( auth == 0 ) return 0; if ( auth == 1 && free == -1 ) return -ENOMEM; client->shared_auth_uuid[free][0] = uuid_lo; client->shared_auth_uuid[free][1] = uuid_hi; return 1; } static int tmemc_save_subop(int cli_id, uint32_t pool_id, uint32_t subop, tmem_cli_va_param_t buf, uint32_t arg1) { struct client *client = tmem_client_from_cli_id(cli_id); struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN) ? NULL : client->pools[pool_id]; uint32_t p; struct tmem_page_descriptor *pgp, *pgp2; int rc = -1; switch(subop) { case TMEMC_SAVE_BEGIN: if ( client == NULL ) return 0; for (p = 0; p < MAX_POOLS_PER_DOMAIN; p++) if ( client->pools[p] != NULL ) break; if ( p == MAX_POOLS_PER_DOMAIN ) { rc = 0; break; } client->was_frozen = client->frozen; client->frozen = 1; if ( arg1 != 0 ) client->live_migrating = 1; rc = 1; break; case TMEMC_RESTORE_BEGIN: if ( client == NULL && (client = client_create(cli_id)) != NULL ) return 1; break; case TMEMC_SAVE_GET_VERSION: rc = TMEM_SPEC_VERSION; break; case TMEMC_SAVE_GET_MAXPOOLS: rc = MAX_POOLS_PER_DOMAIN; break; case TMEMC_SAVE_GET_CLIENT_WEIGHT: if ( client == NULL ) break; rc = client->weight == -1 ? -2 : client->weight; break; case TMEMC_SAVE_GET_CLIENT_CAP: if ( client == NULL ) break; rc = client->cap == -1 ? -2 : client->cap; break; case TMEMC_SAVE_GET_CLIENT_FLAGS: if ( client == NULL ) break; rc = (client->compress ? TMEM_CLIENT_COMPRESS : 0 ) | (client->was_frozen ? TMEM_CLIENT_FROZEN : 0 ); break; case TMEMC_SAVE_GET_POOL_FLAGS: if ( pool == NULL ) break; rc = (pool->persistent ? TMEM_POOL_PERSIST : 0) | (pool->shared ? 
TMEM_POOL_SHARED : 0) | (pool->pageshift << TMEM_POOL_PAGESIZE_SHIFT) | (TMEM_SPEC_VERSION << TMEM_POOL_VERSION_SHIFT); break; case TMEMC_SAVE_GET_POOL_NPAGES: if ( pool == NULL ) break; rc = _atomic_read(pool->pgp_count); break; case TMEMC_SAVE_GET_POOL_UUID: if ( pool == NULL ) break; rc = 0; if ( copy_to_guest(guest_handle_cast(buf, void), pool->uuid, 2) ) rc = -EFAULT; break; case TMEMC_SAVE_END: if ( client == NULL ) break; client->live_migrating = 0; if ( !list_empty(&client->persistent_invalidated_list) ) list_for_each_entry_safe(pgp,pgp2, &client->persistent_invalidated_list, client_inv_pages) pgp_free_from_inv_list(client,pgp); client->frozen = client->was_frozen; rc = 0; break; } return rc; } static int tmemc_save_get_next_page(int cli_id, uint32_t pool_id, tmem_cli_va_param_t buf, uint32_t bufsize) { struct client *client = tmem_client_from_cli_id(cli_id); struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN) ? NULL : client->pools[pool_id]; struct tmem_page_descriptor *pgp; struct oid oid; int ret = 0; struct tmem_handle h; unsigned int pagesize; if ( pool == NULL || !is_persistent(pool) ) return -1; pagesize = 1 << (pool->pageshift + 12); if ( bufsize < pagesize + sizeof(struct tmem_handle) ) return -ENOMEM; spin_lock(&pers_lists_spinlock); if ( list_empty(&pool->persistent_page_list) ) { ret = -1; goto out; } /* note: pool->cur_pgp is the pgp last returned by get_next_page */ if ( pool->cur_pgp == NULL ) { /* process the first one */ pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next, struct tmem_page_descriptor,us.pool_pers_pages); } else if ( list_is_last(&pool->cur_pgp->us.pool_pers_pages, &pool->persistent_page_list) ) { /* already processed the last one in the list */ ret = -1; goto out; } pgp = list_entry((&pool->cur_pgp->us.pool_pers_pages)->next, struct tmem_page_descriptor,us.pool_pers_pages); pool->cur_pgp = pgp; oid = pgp->us.obj->oid; h.pool_id = pool_id; BUILD_BUG_ON(sizeof(h.oid) != sizeof(oid)); memcpy(h.oid, oid.oid, sizeof(h.oid)); h.index = pgp->index; if ( copy_to_guest(guest_handle_cast(buf, void), &h, 1) ) { ret = -EFAULT; goto out; } guest_handle_add_offset(buf, sizeof(h)); ret = do_tmem_get(pool, &oid, pgp->index, 0, buf); out: spin_unlock(&pers_lists_spinlock); return ret; } static int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_param_t buf, uint32_t bufsize) { struct client *client = tmem_client_from_cli_id(cli_id); struct tmem_page_descriptor *pgp; struct tmem_handle h; int ret = 0; if ( client == NULL ) return 0; if ( bufsize < sizeof(struct tmem_handle) ) return 0; spin_lock(&pers_lists_spinlock); if ( list_empty(&client->persistent_invalidated_list) ) goto out; if ( client->cur_pgp == NULL ) { pgp = list_entry((&client->persistent_invalidated_list)->next, struct tmem_page_descriptor,client_inv_pages); client->cur_pgp = pgp; } else if ( list_is_last(&client->cur_pgp->client_inv_pages, &client->persistent_invalidated_list) ) { client->cur_pgp = NULL; ret = 0; goto out; } else { pgp = list_entry((&client->cur_pgp->client_inv_pages)->next, struct tmem_page_descriptor,client_inv_pages); client->cur_pgp = pgp; } h.pool_id = pgp->pool_id; BUILD_BUG_ON(sizeof(h.oid) != sizeof(pgp->inv_oid)); memcpy(h.oid, pgp->inv_oid.oid, sizeof(h.oid)); h.index = pgp->index; ret = 1; if ( copy_to_guest(guest_handle_cast(buf, void), &h, 1) ) ret = -EFAULT; out: spin_unlock(&pers_lists_spinlock); return ret; } static int tmemc_restore_put_page(int cli_id, uint32_t pool_id, struct oid *oidp, uint32_t index, tmem_cli_va_param_t 
buf, uint32_t bufsize) { struct client *client = tmem_client_from_cli_id(cli_id); struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN) ? NULL : client->pools[pool_id]; if ( pool == NULL ) return -1; if (bufsize != PAGE_SIZE) { tmem_client_err("tmem: %s: invalid parameter bufsize(%d) != (%ld)\n", __func__, bufsize, PAGE_SIZE); return -EINVAL; } return do_tmem_put(pool, oidp, index, 0, buf); } static int tmemc_restore_flush_page(int cli_id, uint32_t pool_id, struct oid *oidp, uint32_t index) { struct client *client = tmem_client_from_cli_id(cli_id); struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN) ? NULL : client->pools[pool_id]; if ( pool == NULL ) return -1; return do_tmem_flush_page(pool,oidp,index); } static int do_tmem_control(struct tmem_op *op) { int ret; uint32_t pool_id = op->pool_id; uint32_t subop = op->u.ctrl.subop; struct oid *oidp = (struct oid *)(&op->u.ctrl.oid[0]); if ( xsm_tmem_control(XSM_PRIV) ) return -EPERM; switch(subop) { case TMEMC_THAW: case TMEMC_FREEZE: case TMEMC_DESTROY: ret = tmemc_freeze_pools(op->u.ctrl.cli_id,subop); break; case TMEMC_FLUSH: ret = tmemc_flush_mem(op->u.ctrl.cli_id,op->u.ctrl.arg1); break; case TMEMC_LIST: ret = tmemc_list(op->u.ctrl.cli_id, guest_handle_cast(op->u.ctrl.buf, char), op->u.ctrl.arg1,op->u.ctrl.arg2); break; case TMEMC_SET_WEIGHT: case TMEMC_SET_CAP: case TMEMC_SET_COMPRESS: ret = tmemc_set_var(op->u.ctrl.cli_id,subop,op->u.ctrl.arg1); break; case TMEMC_QUERY_FREEABLE_MB: ret = tmem_freeable_pages() >> (20 - PAGE_SHIFT); break; case TMEMC_SAVE_BEGIN: case TMEMC_RESTORE_BEGIN: case TMEMC_SAVE_GET_VERSION: case TMEMC_SAVE_GET_MAXPOOLS: case TMEMC_SAVE_GET_CLIENT_WEIGHT: case TMEMC_SAVE_GET_CLIENT_CAP: case TMEMC_SAVE_GET_CLIENT_FLAGS: case TMEMC_SAVE_GET_POOL_FLAGS: case TMEMC_SAVE_GET_POOL_NPAGES: case TMEMC_SAVE_GET_POOL_UUID: case TMEMC_SAVE_END: ret = tmemc_save_subop(op->u.ctrl.cli_id,pool_id,subop, guest_handle_cast(op->u.ctrl.buf, char), op->u.ctrl.arg1); break; case TMEMC_SAVE_GET_NEXT_PAGE: ret = tmemc_save_get_next_page(op->u.ctrl.cli_id, pool_id, guest_handle_cast(op->u.ctrl.buf, char), op->u.ctrl.arg1); break; case TMEMC_SAVE_GET_NEXT_INV: ret = tmemc_save_get_next_inv(op->u.ctrl.cli_id, guest_handle_cast(op->u.ctrl.buf, char), op->u.ctrl.arg1); break; case TMEMC_RESTORE_PUT_PAGE: ret = tmemc_restore_put_page(op->u.ctrl.cli_id,pool_id, oidp, op->u.ctrl.arg2, guest_handle_cast(op->u.ctrl.buf, char), op->u.ctrl.arg1); break; case TMEMC_RESTORE_FLUSH_PAGE: ret = tmemc_restore_flush_page(op->u.ctrl.cli_id,pool_id, oidp, op->u.ctrl.arg2); break; default: ret = -1; } return ret; } /************ EXPORTed FUNCTIONS **************************************/ long do_tmem_op(tmem_cli_op_t uops) { struct tmem_op op; struct client *client = current->domain->tmem_client; struct tmem_pool *pool = NULL; struct oid *oidp; int rc = 0; bool_t succ_get = 0, succ_put = 0; bool_t non_succ_get = 0, non_succ_put = 0; bool_t flush = 0, flush_obj = 0; bool_t write_lock_set = 0, read_lock_set = 0; if ( !tmem_initialized ) return -ENODEV; if ( xsm_tmem_op(XSM_HOOK) ) return -EPERM; total_tmem_ops++; if ( client != NULL && client->domain->is_dying ) { rc = -ENODEV; simple_error: errored_tmem_ops++; return rc; } if ( unlikely(tmem_get_tmemop_from_client(&op, uops) != 0) ) { tmem_client_err("tmem: can't get tmem struct from %s\n", tmem_client_str); rc = -EFAULT; goto simple_error; } if ( op.cmd == TMEM_CONTROL ) { write_lock(&tmem_rwlock); write_lock_set = 1; rc = do_tmem_control(&op); goto out; } 
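    /*
     * (Locking sketch, illustrative: the control-plane commands in this
     * if/else chain, i.e. TMEM_CONTROL, TMEM_AUTH, TMEM_RESTORE_NEW,
     * first-use client creation and pool create/destroy, take tmem_rwlock
     * for writing; the data-path commands in the switch below only take it
     * for reading and then rely on the per-object spinlock returned held by
     * obj_find()/obj_new().)
     */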
else if ( op.cmd == TMEM_AUTH ) { write_lock(&tmem_rwlock); write_lock_set = 1; rc = tmemc_shared_pool_auth(op.u.creat.arg1,op.u.creat.uuid[0], op.u.creat.uuid[1],op.u.creat.flags); goto out; } else if ( op.cmd == TMEM_RESTORE_NEW ) { write_lock(&tmem_rwlock); write_lock_set = 1; rc = do_tmem_new_pool(op.u.creat.arg1, op.pool_id, op.u.creat.flags, op.u.creat.uuid[0], op.u.creat.uuid[1]); goto out; } /* create per-client tmem structure dynamically on first use by client */ if ( client == NULL ) { write_lock(&tmem_rwlock); write_lock_set = 1; if ( (client = client_create(current->domain->domain_id)) == NULL ) { tmem_client_err("tmem: can't create tmem structure for %s\n", tmem_client_str); rc = -ENOMEM; goto out; } } if ( op.cmd == TMEM_NEW_POOL || op.cmd == TMEM_DESTROY_POOL ) { if ( !write_lock_set ) { write_lock(&tmem_rwlock); write_lock_set = 1; } } else { if ( !write_lock_set ) { read_lock(&tmem_rwlock); read_lock_set = 1; } if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) || ((pool = client->pools[op.pool_id]) == NULL) ) { tmem_client_err("tmem: operation requested on uncreated pool\n"); rc = -ENODEV; goto out; } } oidp = (struct oid *)&op.u.gen.oid[0]; switch ( op.cmd ) { case TMEM_NEW_POOL: rc = do_tmem_new_pool(TMEM_CLI_ID_NULL, 0, op.u.creat.flags, op.u.creat.uuid[0], op.u.creat.uuid[1]); break; case TMEM_PUT_PAGE: if (tmem_ensure_avail_pages()) rc = do_tmem_put(pool, oidp, op.u.gen.index, op.u.gen.cmfn, tmem_cli_buf_null); else rc = -ENOMEM; if (rc == 1) succ_put = 1; else non_succ_put = 1; break; case TMEM_GET_PAGE: rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn, tmem_cli_buf_null); if (rc == 1) succ_get = 1; else non_succ_get = 1; break; case TMEM_FLUSH_PAGE: flush = 1; rc = do_tmem_flush_page(pool, oidp, op.u.gen.index); break; case TMEM_FLUSH_OBJECT: rc = do_tmem_flush_object(pool, oidp); flush_obj = 1; break; case TMEM_DESTROY_POOL: flush = 1; rc = do_tmem_destroy_pool(op.pool_id); break; default: tmem_client_warn("tmem: op %d not implemented\n", op.cmd); rc = -ENOSYS; break; } out: if ( rc < 0 ) errored_tmem_ops++; if ( write_lock_set ) write_unlock(&tmem_rwlock); else if ( read_lock_set ) read_unlock(&tmem_rwlock); else ASSERT(0); return rc; } /* this should be called when the host is destroying a client */ void tmem_destroy(void *v) { struct client *client = (struct client *)v; if ( client == NULL ) return; if ( !client->domain->is_dying ) { printk("tmem: tmem_destroy can only destroy dying client\n"); return; } write_lock(&tmem_rwlock); printk("tmem: flushing tmem pools for %s=%d\n", tmem_cli_id_str, client->cli_id); client_flush(client, 1); write_unlock(&tmem_rwlock); } #define MAX_EVICTS 10 /* should be variable or set via TMEMC_ ?? 
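   (note: tmem_relinquish_pages() below still initializes max_evictions from
   the literal 10 rather than from this macro; the routine hands freeable tmem
   pages back to the host allocator and, as checked at its top, only order-0
   requests can ever be satisfied)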
*/ void *tmem_relinquish_pages(unsigned int order, unsigned int memflags) { struct page_info *pfp; unsigned long evicts_per_relinq = 0; int max_evictions = 10; if (!tmem_enabled() || !tmem_freeable_pages()) return NULL; relinq_attempts++; if ( order > 0 ) { #ifndef NDEBUG printk("tmem_relinquish_page: failing order=%d\n", order); #endif return NULL; } if ( memflags & MEMF_tmem ) read_lock(&tmem_rwlock); while ( (pfp = tmem_page_list_get()) == NULL ) { if ( (max_evictions-- <= 0) || !tmem_evict()) break; evicts_per_relinq++; } if ( evicts_per_relinq > max_evicts_per_relinq ) max_evicts_per_relinq = evicts_per_relinq; if ( pfp != NULL ) { if ( !(memflags & MEMF_tmem) ) scrub_one_page(pfp); relinq_pgs++; } if ( memflags & MEMF_tmem ) read_unlock(&tmem_rwlock); return pfp; } unsigned long tmem_freeable_pages(void) { return tmem_page_list_pages + _atomic_read(freeable_page_count); } /* called at hypervisor startup */ static int __init init_tmem(void) { int i; if ( !tmem_enabled() ) return 0; if ( tmem_dedup_enabled() ) for (i = 0; i < 256; i++ ) { pcd_tree_roots[i] = RB_ROOT; rwlock_init(&pcd_tree_rwlocks[i]); } if ( !tmem_mempool_init() ) return 0; if ( tmem_init() ) { printk("tmem: initialized comp=%d dedup=%d tze=%d\n", tmem_compression_enabled(), tmem_dedup_enabled(), tmem_tze_enabled()); if ( tmem_dedup_enabled()&&tmem_compression_enabled()&&tmem_tze_enabled() ) { tmem_tze_disable(); printk("tmem: tze and compression not compatible, disabling tze\n"); } tmem_initialized = 1; } else printk("tmem: initialization FAILED\n"); return 0; } __initcall(init_tmem); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/tmem_xen.c0000664000175000017500000002024612307313555014511 0ustar smbsmb/****************************************************************************** * tmem-xen.c * * Xen-specific Transcendent memory * * Copyright (c) 2009, Dan Magenheimer, Oracle Corp. 
*/ #include #include #include /* compression code */ #include #include #include #include bool_t __read_mostly opt_tmem = 0; boolean_param("tmem", opt_tmem); bool_t __read_mostly opt_tmem_compress = 0; boolean_param("tmem_compress", opt_tmem_compress); bool_t __read_mostly opt_tmem_dedup = 0; boolean_param("tmem_dedup", opt_tmem_dedup); bool_t __read_mostly opt_tmem_tze = 0; boolean_param("tmem_tze", opt_tmem_tze); bool_t __read_mostly opt_tmem_shared_auth = 0; boolean_param("tmem_shared_auth", opt_tmem_shared_auth); atomic_t freeable_page_count = ATOMIC_INIT(0); /* these are a concurrency bottleneck, could be percpu and dynamically * allocated iff opt_tmem_compress */ #define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS #define LZO_DSTMEM_PAGES 2 static DEFINE_PER_CPU_READ_MOSTLY(unsigned char *, workmem); static DEFINE_PER_CPU_READ_MOSTLY(unsigned char *, dstmem); static DEFINE_PER_CPU_READ_MOSTLY(void *, scratch_page); #if defined(CONFIG_ARM) static inline void *cli_get_page(xen_pfn_t cmfn, unsigned long *pcli_mfn, struct page_info **pcli_pfp, bool_t cli_write) { ASSERT(0); return NULL; } static inline void cli_put_page(void *cli_va, struct page_info *cli_pfp, unsigned long cli_mfn, bool_t mark_dirty) { ASSERT(0); } #else #include static inline void *cli_get_page(xen_pfn_t cmfn, unsigned long *pcli_mfn, struct page_info **pcli_pfp, bool_t cli_write) { p2m_type_t t; struct page_info *page; page = get_page_from_gfn(current->domain, cmfn, &t, P2M_ALLOC); if ( !page || t != p2m_ram_rw ) { if ( page ) put_page(page); return NULL; } if ( cli_write && !get_page_type(page, PGT_writable_page) ) { put_page(page); return NULL; } *pcli_mfn = page_to_mfn(page); *pcli_pfp = page; return map_domain_page(*pcli_mfn); } static inline void cli_put_page(void *cli_va, struct page_info *cli_pfp, unsigned long cli_mfn, bool_t mark_dirty) { if ( mark_dirty ) { put_page_and_type(cli_pfp); paging_mark_dirty(current->domain,cli_mfn); } else put_page(cli_pfp); unmap_domain_page(cli_va); } #endif int tmem_copy_from_client(struct page_info *pfp, xen_pfn_t cmfn, tmem_cli_va_param_t clibuf) { unsigned long tmem_mfn, cli_mfn = 0; char *tmem_va, *cli_va = NULL; struct page_info *cli_pfp = NULL; int rc = 1; ASSERT(pfp != NULL); tmem_mfn = page_to_mfn(pfp); tmem_va = map_domain_page(tmem_mfn); if ( guest_handle_is_null(clibuf) ) { cli_va = cli_get_page(cmfn, &cli_mfn, &cli_pfp, 0); if ( cli_va == NULL ) { unmap_domain_page(tmem_va); return -EFAULT; } } smp_mb(); if ( cli_va ) { memcpy(tmem_va, cli_va, PAGE_SIZE); cli_put_page(cli_va, cli_pfp, cli_mfn, 0); } else rc = -EINVAL; unmap_domain_page(tmem_va); return rc; } int tmem_compress_from_client(xen_pfn_t cmfn, void **out_va, size_t *out_len, tmem_cli_va_param_t clibuf) { int ret = 0; unsigned char *dmem = this_cpu(dstmem); unsigned char *wmem = this_cpu(workmem); char *scratch = this_cpu(scratch_page); struct page_info *cli_pfp = NULL; unsigned long cli_mfn = 0; void *cli_va = NULL; if ( dmem == NULL || wmem == NULL ) return 0; /* no buffer, so can't compress */ if ( guest_handle_is_null(clibuf) ) { cli_va = cli_get_page(cmfn, &cli_mfn, &cli_pfp, 0); if ( cli_va == NULL ) return -EFAULT; } else if ( !scratch ) return 0; else if ( copy_from_guest(scratch, clibuf, PAGE_SIZE) ) return -EFAULT; smp_mb(); ret = lzo1x_1_compress(cli_va ?: scratch, PAGE_SIZE, dmem, out_len, wmem); ASSERT(ret == LZO_E_OK); *out_va = dmem; if ( cli_va ) cli_put_page(cli_va, cli_pfp, cli_mfn, 0); return 1; } int tmem_copy_to_client(xen_pfn_t cmfn, struct page_info *pfp, tmem_cli_va_param_t clibuf) { 
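    /*
     * Note on the compression helpers above: tmem_compress_from_client()
     * compresses one guest page into the per-cpu dstmem buffer (2 pages,
     * because worst-case LZO output can be slightly larger than the
     * input) using the per-cpu workmem scratch area.  A minimal sketch of
     * the round trip, assuming only the lzo1x_* prototypes already used
     * in this file:
     *
     *     unsigned char *dst = this_cpu(dstmem), *wrk = this_cpu(workmem);
     *     size_t dlen = LZO_DSTMEM_PAGES * PAGE_SIZE, plen = PAGE_SIZE;
     *
     *     if ( lzo1x_1_compress(src_page, PAGE_SIZE, dst, &dlen, wrk) ==
     *          LZO_E_OK && dlen < PAGE_SIZE )
     *         ...store dst[0..dlen) instead of the raw page...
     *     ...later...
     *     if ( lzo1x_decompress_safe(dst, dlen, out_page, &plen) ==
     *          LZO_E_OK && plen == PAGE_SIZE )
     *         ...the original page has been recovered...
     */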
unsigned long tmem_mfn, cli_mfn = 0; char *tmem_va, *cli_va = NULL; struct page_info *cli_pfp = NULL; int rc = 1; ASSERT(pfp != NULL); if ( guest_handle_is_null(clibuf) ) { cli_va = cli_get_page(cmfn, &cli_mfn, &cli_pfp, 1); if ( cli_va == NULL ) return -EFAULT; } tmem_mfn = page_to_mfn(pfp); tmem_va = map_domain_page(tmem_mfn); if ( cli_va ) { memcpy(cli_va, tmem_va, PAGE_SIZE); cli_put_page(cli_va, cli_pfp, cli_mfn, 1); } else rc = -EINVAL; unmap_domain_page(tmem_va); smp_mb(); return rc; } int tmem_decompress_to_client(xen_pfn_t cmfn, void *tmem_va, size_t size, tmem_cli_va_param_t clibuf) { unsigned long cli_mfn = 0; struct page_info *cli_pfp = NULL; void *cli_va = NULL; char *scratch = this_cpu(scratch_page); size_t out_len = PAGE_SIZE; int ret; if ( guest_handle_is_null(clibuf) ) { cli_va = cli_get_page(cmfn, &cli_mfn, &cli_pfp, 1); if ( cli_va == NULL ) return -EFAULT; } else if ( !scratch ) return 0; ret = lzo1x_decompress_safe(tmem_va, size, cli_va ?: scratch, &out_len); ASSERT(ret == LZO_E_OK); ASSERT(out_len == PAGE_SIZE); if ( cli_va ) cli_put_page(cli_va, cli_pfp, cli_mfn, 1); else if ( copy_to_guest(clibuf, scratch, PAGE_SIZE) ) return -EFAULT; smp_mb(); return 1; } int tmem_copy_tze_to_client(xen_pfn_t cmfn, void *tmem_va, pagesize_t len) { void *cli_va; unsigned long cli_mfn; struct page_info *cli_pfp = NULL; ASSERT(!(len & (sizeof(uint64_t)-1))); ASSERT(len <= PAGE_SIZE); ASSERT(len > 0 || tmem_va == NULL); cli_va = cli_get_page(cmfn, &cli_mfn, &cli_pfp, 1); if ( cli_va == NULL ) return -EFAULT; if ( len > 0 ) memcpy((char *)cli_va,(char *)tmem_va,len); if ( len < PAGE_SIZE ) memset((char *)cli_va+len,0,PAGE_SIZE-len); cli_put_page(cli_va, cli_pfp, cli_mfn, 1); smp_mb(); return 1; } /****************** XEN-SPECIFIC HOST INITIALIZATION ********************/ static int dstmem_order, workmem_order; static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch ( action ) { case CPU_UP_PREPARE: { if ( per_cpu(dstmem, cpu) == NULL ) per_cpu(dstmem, cpu) = alloc_xenheap_pages(dstmem_order, 0); if ( per_cpu(workmem, cpu) == NULL ) per_cpu(workmem, cpu) = alloc_xenheap_pages(workmem_order, 0); if ( per_cpu(scratch_page, cpu) == NULL ) per_cpu(scratch_page, cpu) = alloc_xenheap_page(); break; } case CPU_DEAD: case CPU_UP_CANCELED: { if ( per_cpu(dstmem, cpu) != NULL ) { free_xenheap_pages(per_cpu(dstmem, cpu), dstmem_order); per_cpu(dstmem, cpu) = NULL; } if ( per_cpu(workmem, cpu) != NULL ) { free_xenheap_pages(per_cpu(workmem, cpu), workmem_order); per_cpu(workmem, cpu) = NULL; } if ( per_cpu(scratch_page, cpu) != NULL ) { free_xenheap_page(per_cpu(scratch_page, cpu)); per_cpu(scratch_page, cpu) = NULL; } break; } default: break; } return NOTIFY_DONE; } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; int __init tmem_init(void) { unsigned int cpu; dstmem_order = get_order_from_pages(LZO_DSTMEM_PAGES); workmem_order = get_order_from_bytes(LZO1X_1_MEM_COMPRESS); for_each_online_cpu ( cpu ) { void *hcpu = (void *)(long)cpu; cpu_callback(&cpu_nfb, CPU_UP_PREPARE, hcpu); } register_cpu_notifier(&cpu_nfb); return 1; } xen-4.4.0/xen/common/kexec.c0000664000175000017500000007734712307313555014012 0ustar smbsmb/****************************************************************************** * kexec.c - Achitecture independent kexec code for Xen * * Xen port written by: * - Simon 'Horms' Horman * - Magnus Damm */ #include #include #include #include #include #include #include #include #include 
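/*
 * Aside: tmem_xen.c above and kexec.c below both use the same CPU
 * notifier idiom to manage per-cpu state across hotplug.  A minimal
 * sketch of the pattern (my_buf and my_cb are hypothetical names, shown
 * only as illustration):
 *
 *     static DEFINE_PER_CPU(void *, my_buf);
 *
 *     static int my_cb(struct notifier_block *nfb, unsigned long action,
 *                      void *hcpu)
 *     {
 *         unsigned int cpu = (unsigned long)hcpu;
 *
 *         switch ( action )
 *         {
 *         case CPU_UP_PREPARE:
 *             if ( per_cpu(my_buf, cpu) == NULL )
 *                 per_cpu(my_buf, cpu) = alloc_xenheap_page();
 *             break;
 *         case CPU_DEAD:
 *         case CPU_UP_CANCELED:
 *             if ( per_cpu(my_buf, cpu) != NULL )
 *             {
 *                 free_xenheap_page(per_cpu(my_buf, cpu));
 *                 per_cpu(my_buf, cpu) = NULL;
 *             }
 *             break;
 *         }
 *         return NOTIFY_DONE;
 *     }
 *
 *     static struct notifier_block my_nfb = { .notifier_call = my_cb };
 *     ...at init time: register_cpu_notifier(&my_nfb);...
 */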
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CONFIG_COMPAT #include #endif bool_t kexecing = FALSE; /* Memory regions to store the per cpu register state etc. on a crash. */ typedef struct { Elf_Note * start; size_t size; } crash_note_range_t; static crash_note_range_t * crash_notes; /* Lock to prevent race conditions when allocating the crash note buffers. * It also serves to protect calls to alloc_from_crash_heap when allocating * crash note buffers in lower memory. */ static DEFINE_SPINLOCK(crash_notes_lock); static Elf_Note *xen_crash_note; static cpumask_t crash_saved_cpus; static struct kexec_image *kexec_image[KEXEC_IMAGE_NR]; #define KEXEC_FLAG_DEFAULT_POS (KEXEC_IMAGE_NR + 0) #define KEXEC_FLAG_CRASH_POS (KEXEC_IMAGE_NR + 1) #define KEXEC_FLAG_IN_PROGRESS (KEXEC_IMAGE_NR + 2) static unsigned long kexec_flags = 0; /* the lowest bits are for KEXEC_IMAGE... */ static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; static size_t vmcoreinfo_size = 0; xen_kexec_reserve_t kexec_crash_area; static struct { u64 start, end; unsigned long size; } ranges[16] __initdata; /* Low crashinfo mode. Start as INVALID so serveral codepaths can set up * defaults without needing to know the state of the others. */ enum low_crashinfo low_crashinfo_mode = LOW_CRASHINFO_INVALID; /* This value is only considered if low_crash_mode is set to MIN or ALL, so * setting a default here is safe. Default to 4GB. This is because the current * KEXEC_CMD_get_range compat hypercall trucates 64bit pointers to 32 bits. The * typical usecase for crashinfo_maxaddr will be for 64bit Xen with 32bit dom0 * and 32bit crash kernel. */ static paddr_t __initdata crashinfo_maxaddr = 4ULL << 30; /* = log base 2 of crashinfo_maxaddr after checking for sanity. Default to * larger than the entire physical address space. */ paddr_t crashinfo_maxaddr_bits = 64; /* Pointers to keep track of the crash heap region. 
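 *
 * (For orientation, the crashkernel= option parsed just below accepts
 *  forms such as the following - sizes and addresses are purely
 *  illustrative:
 *
 *      crashkernel=256M@64M
 *          reserve 256MiB for the crash kernel at physical offset 64MiB
 *      crashkernel=512M-2G:64M,2G-:128M
 *          reserve 64MiB on hosts with 512MiB-2GiB of RAM, 128MiB on
 *          hosts with 2GiB or more; an optional trailing @offset also
 *          applies to this ranged form.)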
*/ static void *crash_heap_current = NULL, *crash_heap_end = NULL; /* * Parse command lines in the format * * crashkernel=:[,...][@] * * with being of form * * -[] * * as well as the legacy ones in the format * * crashkernel=[@] */ static void __init parse_crashkernel(const char *str) { const char *cur; if ( strchr(str, ':' ) ) { unsigned int idx = 0; do { if ( idx >= ARRAY_SIZE(ranges) ) { printk(XENLOG_WARNING "crashkernel: too many ranges\n"); cur = NULL; str = strchr(str, '@'); break; } ranges[idx].start = parse_size_and_unit(cur = str + !!idx, &str); if ( cur == str ) break; if ( *str != '-' ) { printk(XENLOG_WARNING "crashkernel: '-' expected\n"); break; } if ( *++str != ':' ) { ranges[idx].end = parse_size_and_unit(cur = str, &str); if ( cur == str ) break; if ( ranges[idx].end <= ranges[idx].start ) { printk(XENLOG_WARNING "crashkernel: end <= start\n"); break; } } else ranges[idx].end = -1; if ( *str != ':' ) { printk(XENLOG_WARNING "crashkernel: ':' expected\n"); break; } ranges[idx].size = parse_size_and_unit(cur = str + 1, &str); if ( cur == str ) break; ++idx; } while ( *str == ',' ); if ( idx < ARRAY_SIZE(ranges) ) ranges[idx].size = 0; } else kexec_crash_area.size = parse_size_and_unit(cur = str, &str); if ( cur != str && *str == '@' ) kexec_crash_area.start = parse_size_and_unit(cur = str + 1, &str); if ( cur == str ) printk(XENLOG_WARNING "crashkernel: memory value expected\n"); } custom_param("crashkernel", parse_crashkernel); /* Parse command lines in the format: * * low_crashinfo=[none,min,all] * * - none disables the low allocation of crash info. * - min will allocate enough low information for the crash kernel to be able * to extract the hypervisor and dom0 message ring buffers. * - all will allocate additional structures such as domain and vcpu structs * low so the crash kernel can perform an extended analysis of state. */ static void __init parse_low_crashinfo(const char * str) { if ( !strlen(str) ) /* default to min if user just specifies "low_crashinfo" */ low_crashinfo_mode = LOW_CRASHINFO_MIN; else if ( !strcmp(str, "none" ) ) low_crashinfo_mode = LOW_CRASHINFO_NONE; else if ( !strcmp(str, "min" ) ) low_crashinfo_mode = LOW_CRASHINFO_MIN; else if ( !strcmp(str, "all" ) ) low_crashinfo_mode = LOW_CRASHINFO_ALL; else { printk("Unknown low_crashinfo parameter '%s'. Defaulting to min.\n", str); low_crashinfo_mode = LOW_CRASHINFO_MIN; } } custom_param("low_crashinfo", parse_low_crashinfo); /* Parse command lines in the format: * * crashinfo_maxaddr= * * will be rounded down to the nearest power of two. Defaults to 64G */ static void __init parse_crashinfo_maxaddr(const char * str) { u64 addr; /* if low_crashinfo_mode is unset, default to min. */ if ( low_crashinfo_mode == LOW_CRASHINFO_INVALID ) low_crashinfo_mode = LOW_CRASHINFO_MIN; if ( (addr = parse_size_and_unit(str, NULL)) ) crashinfo_maxaddr = addr; else printk("Unable to parse crashinfo_maxaddr. Defaulting to %"PRIpaddr"\n", crashinfo_maxaddr); } custom_param("crashinfo_maxaddr", parse_crashinfo_maxaddr); void __init set_kexec_crash_area_size(u64 system_ram) { unsigned int idx; for ( idx = 0; idx < ARRAY_SIZE(ranges) && !kexec_crash_area.size; ++idx ) { if ( !ranges[idx].size ) break; if ( ranges[idx].size >= system_ram ) { printk(XENLOG_WARNING "crashkernel: invalid size\n"); continue; } if ( ranges[idx].start <= system_ram && ranges[idx].end > system_ram ) kexec_crash_area.size = ranges[idx].size; } } /* * Only allow one cpu to continue on the crash path, forcing others to spin. 
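 * (Mechanism in brief, mirroring the body of one_cpu_only() below and
 *  shown here only as an illustration: the first CPU claims the path with
 *  an atomic compare-and-swap and every later CPU parks itself.
 *
 *      static unsigned int crashing_cpu = -1;
 *      if ( cmpxchg(&crashing_cpu, -1, smp_processor_id()) != -1 )
 *          for ( ; ; )
 *              halt();    -- lost the race; wait to be killed
 *  )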
* Racing on the crash path from here will end in misery. If we reenter, * something has very gone wrong and retrying will (almost certainly) be * futile. Return up to our nested panic() to try and reboot. * * This is noinline to make it obvious in stack traces which cpus have lost * the race (as opposed to being somewhere in kexec_common_shutdown()) */ static int noinline one_cpu_only(void) { static unsigned int crashing_cpu = -1; unsigned int cpu = smp_processor_id(); if ( cmpxchg(&crashing_cpu, -1, cpu) != -1 ) { /* Not the first entry into one_cpu_only(). */ if ( crashing_cpu == cpu ) { printk("Reentered the crash path. Something is very broken\n"); return -EBUSY; } /* * Another cpu has beaten us to this point. Wait here patiently for * it to kill us. */ for ( ; ; ) halt(); } set_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags); return 0; } /* Save the registers in the per-cpu crash note buffer. */ void kexec_crash_save_cpu(void) { int cpu = smp_processor_id(); Elf_Note *note; ELF_Prstatus *prstatus; crash_xen_core_t *xencore; BUG_ON ( ! crash_notes ); if ( cpumask_test_and_set_cpu(cpu, &crash_saved_cpus) ) return; note = crash_notes[cpu].start; prstatus = (ELF_Prstatus *)ELFNOTE_DESC(note); note = ELFNOTE_NEXT(note); xencore = (crash_xen_core_t *)ELFNOTE_DESC(note); elf_core_save_regs(&prstatus->pr_reg, xencore); } /* Set up the single Xen-specific-info crash note. */ crash_xen_info_t *kexec_crash_save_info(void) { int cpu = smp_processor_id(); crash_xen_info_t info; crash_xen_info_t *out = (crash_xen_info_t *)ELFNOTE_DESC(xen_crash_note); BUG_ON(!cpumask_test_and_set_cpu(cpu, &crash_saved_cpus)); memset(&info, 0, sizeof(info)); info.xen_major_version = xen_major_version(); info.xen_minor_version = xen_minor_version(); info.xen_extra_version = __pa(xen_extra_version()); info.xen_changeset = __pa(xen_changeset()); info.xen_compiler = __pa(xen_compiler()); info.xen_compile_date = __pa(xen_compile_date()); info.xen_compile_time = __pa(xen_compile_time()); info.tainted = tainted; /* Copy from guaranteed-aligned local copy to possibly-unaligned dest. 
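 * The destination is the descriptor of an Elf_Note, which is only
 * guaranteed 4-byte alignment, while crash_xen_info_t contains 64-bit
 * fields; writing the struct members directly could rely on unaligned
 * stores (and fault on strict-alignment architectures), hence the aligned
 * local copy followed by memcpy().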
*/ memcpy(out, &info, sizeof(info)); return out; } static int kexec_common_shutdown(void) { int ret; ret = one_cpu_only(); if ( ret ) return ret; watchdog_disable(); console_start_sync(); spin_debug_disable(); acpi_dmar_reinstate(); return 0; } void kexec_crash(void) { int pos; pos = (test_bit(KEXEC_FLAG_CRASH_POS, &kexec_flags) != 0); if ( !test_bit(KEXEC_IMAGE_CRASH_BASE + pos, &kexec_flags) ) return; printk("Executing crash image\n"); kexecing = TRUE; if ( kexec_common_shutdown() != 0 ) return; kexec_crash_save_cpu(); machine_crash_shutdown(); machine_kexec(kexec_image[KEXEC_IMAGE_CRASH_BASE + pos]); BUG(); } static long kexec_reboot(void *_image) { struct kexec_image *image = _image; kexecing = TRUE; kexec_common_shutdown(); machine_reboot_kexec(image); BUG(); return 0; } static void do_crashdump_trigger(unsigned char key) { printk("'%c' pressed -> triggering crashdump\n", key); kexec_crash(); printk(" * no crash kernel loaded!\n"); } static struct keyhandler crashdump_trigger_keyhandler = { .u.fn = do_crashdump_trigger, .desc = "trigger a crashdump" }; static void setup_note(Elf_Note *n, const char *name, int type, int descsz) { int l = strlen(name) + 1; strlcpy(ELFNOTE_NAME(n), name, l); n->namesz = l; n->descsz = descsz; n->type = type; } static size_t sizeof_note(const char *name, int descsz) { return (sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(name)+1) + ELFNOTE_ALIGN(descsz)); } static size_t sizeof_cpu_notes(const unsigned long cpu) { /* All CPUs present a PRSTATUS and crash_xen_core note. */ size_t bytes = + sizeof_note("CORE", sizeof(ELF_Prstatus)) + + sizeof_note("Xen", sizeof(crash_xen_core_t)); /* CPU0 also presents the crash_xen_info note. */ if ( ! cpu ) bytes = bytes + sizeof_note("Xen", sizeof(crash_xen_info_t)); return bytes; } /* Allocate size_t bytes of space from the previously allocated * crash heap if the user has requested that crash notes be allocated * in lower memory. There is currently no case where the crash notes * should be free()'d. */ static void * alloc_from_crash_heap(const size_t bytes) { void * ret; if ( crash_heap_current + bytes > crash_heap_end ) return NULL; ret = (void*)crash_heap_current; crash_heap_current += bytes; return ret; } /* Allocate a crash note buffer for a newly onlined cpu. */ static int kexec_init_cpu_notes(const unsigned long cpu) { Elf_Note * note = NULL; int ret = 0; int nr_bytes = 0; BUG_ON( cpu >= nr_cpu_ids || ! crash_notes ); /* If already allocated, nothing to do. */ if ( crash_notes[cpu].start ) return ret; nr_bytes = sizeof_cpu_notes(cpu); /* If we dont care about the position of allocation, malloc. */ if ( low_crashinfo_mode == LOW_CRASHINFO_NONE ) note = xzalloc_bytes(nr_bytes); /* Protect the write into crash_notes[] with a spinlock, as this function * is on a hotplug path and a hypercall path. */ spin_lock(&crash_notes_lock); /* If we are racing with another CPU and it has beaten us, give up * gracefully. */ if ( crash_notes[cpu].start ) { spin_unlock(&crash_notes_lock); /* Always return ok, because whether we successfully allocated or not, * another CPU has successfully allocated. */ if ( note ) xfree(note); } else { /* If we care about memory possition, alloc from the crash heap, * also protected by the crash_notes_lock. */ if ( low_crashinfo_mode > LOW_CRASHINFO_NONE ) note = alloc_from_crash_heap(nr_bytes); crash_notes[cpu].start = note; crash_notes[cpu].size = nr_bytes; spin_unlock(&crash_notes_lock); /* If the allocation failed, and another CPU did not beat us, give * up with ENOMEM. */ if ( ! 
note ) ret = -ENOMEM; /* else all is good so lets set up the notes. */ else { /* Set up CORE note. */ setup_note(note, "CORE", NT_PRSTATUS, sizeof(ELF_Prstatus)); note = ELFNOTE_NEXT(note); /* Set up Xen CORE note. */ setup_note(note, "Xen", XEN_ELFNOTE_CRASH_REGS, sizeof(crash_xen_core_t)); if ( ! cpu ) { /* Set up Xen Crash Info note. */ xen_crash_note = note = ELFNOTE_NEXT(note); setup_note(note, "Xen", XEN_ELFNOTE_CRASH_INFO, sizeof(crash_xen_info_t)); } } } return ret; } static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned long cpu = (unsigned long)hcpu; /* Only hook on CPU_UP_PREPARE because once a crash_note has been reported * to dom0, it must keep it around in case of a crash, as the crash kernel * will be hard coded to the original physical address reported. */ switch ( action ) { case CPU_UP_PREPARE: /* Ignore return value. If this boot time, -ENOMEM will cause all * manner of problems elsewhere very soon, and if it is during runtime, * then failing to allocate crash notes is not a good enough reason to * fail the CPU_UP_PREPARE */ kexec_init_cpu_notes(cpu); break; default: break; } return NOTIFY_DONE; } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; void __init kexec_early_calculations(void) { /* If low_crashinfo_mode is still INVALID, neither "low_crashinfo" nor * "crashinfo_maxaddr" have been specified on the command line, so * explicitly set to NONE. */ if ( low_crashinfo_mode == LOW_CRASHINFO_INVALID ) low_crashinfo_mode = LOW_CRASHINFO_NONE; if ( low_crashinfo_mode > LOW_CRASHINFO_NONE ) crashinfo_maxaddr_bits = fls(crashinfo_maxaddr) - 1; } static int __init kexec_init(void) { void *cpu = (void *)(unsigned long)smp_processor_id(); /* If no crash area, no need to allocate space for notes. */ if ( !kexec_crash_area.size ) return 0; if ( low_crashinfo_mode > LOW_CRASHINFO_NONE ) { size_t crash_heap_size; /* This calculation is safe even if the machine is booted in * uniprocessor mode. */ crash_heap_size = sizeof_cpu_notes(0) + sizeof_cpu_notes(1) * (nr_cpu_ids - 1); crash_heap_size = PAGE_ALIGN(crash_heap_size); crash_heap_current = alloc_xenheap_pages( get_order_from_bytes(crash_heap_size), MEMF_bits(crashinfo_maxaddr_bits) ); if ( ! crash_heap_current ) return -ENOMEM; memset(crash_heap_current, 0, crash_heap_size); crash_heap_end = crash_heap_current + crash_heap_size; } /* crash_notes may be allocated anywhere Xen can reach in memory. Only the individual CPU crash notes themselves must be allocated in lower memory if requested. */ crash_notes = xzalloc_array(crash_note_range_t, nr_cpu_ids); if ( ! crash_notes ) return -ENOMEM; register_keyhandler('C', &crashdump_trigger_keyhandler); cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); register_cpu_notifier(&cpu_nfb); return 0; } /* The reason for this to be a presmp_initcall as opposed to a regular * __initcall is to allow the setup of the cpu hotplug handler before APs are * brought up. */ presmp_initcall(kexec_init); static int kexec_get_reserve(xen_kexec_range_t *range) { if ( kexec_crash_area.size > 0 && kexec_crash_area.start > 0) { range->start = kexec_crash_area.start; range->size = kexec_crash_area.size; } else range->start = range->size = 0; return 0; } static int kexec_get_cpu(xen_kexec_range_t *range) { int nr = range->nr; if ( nr < 0 || nr >= nr_cpu_ids ) return -ERANGE; if ( ! crash_notes ) return -EINVAL; /* Try once again to allocate room for the crash notes. It is just possible * that more space has become available since we last tried. 
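 * (The buffer in question is the per-cpu note block built by
 *  kexec_init_cpu_notes() above; for reference it is a sequence of
 *  Elf_Note records, each header followed by its 4-byte-aligned name and
 *  descriptor:
 *
 *      Elf_Note | "CORE" | ELF_Prstatus      -- NT_PRSTATUS
 *      Elf_Note | "Xen"  | crash_xen_core_t  -- XEN_ELFNOTE_CRASH_REGS
 *      Elf_Note | "Xen"  | crash_xen_info_t  -- XEN_ELFNOTE_CRASH_INFO,
 *                                               CPU0 only
 *
 *  which is exactly what sizeof_cpu_notes() accounts for.)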
If space has * already been allocated, kexec_init_cpu_notes() will return early with 0. */ kexec_init_cpu_notes(nr); /* In the case of still not having enough memory to allocate buffer room, * returning a range of 0,0 is still valid. */ if ( crash_notes[nr].start ) { range->start = __pa(crash_notes[nr].start); range->size = crash_notes[nr].size; } else range->start = range->size = 0; return 0; } static int kexec_get_vmcoreinfo(xen_kexec_range_t *range) { range->start = __pa((unsigned long)vmcoreinfo_data); range->size = VMCOREINFO_BYTES; return 0; } static int kexec_get_range_internal(xen_kexec_range_t *range) { int ret = -EINVAL; switch ( range->range ) { case KEXEC_RANGE_MA_CRASH: ret = kexec_get_reserve(range); break; case KEXEC_RANGE_MA_CPU: ret = kexec_get_cpu(range); break; case KEXEC_RANGE_MA_VMCOREINFO: ret = kexec_get_vmcoreinfo(range); break; default: ret = machine_kexec_get(range); break; } return ret; } static int kexec_get_range(XEN_GUEST_HANDLE_PARAM(void) uarg) { xen_kexec_range_t range; int ret = -EINVAL; if ( unlikely(copy_from_guest(&range, uarg, 1)) ) return -EFAULT; ret = kexec_get_range_internal(&range); if ( ret == 0 && unlikely(copy_to_guest(uarg, &range, 1)) ) return -EFAULT; return ret; } static int kexec_get_range_compat(XEN_GUEST_HANDLE_PARAM(void) uarg) { #ifdef CONFIG_COMPAT xen_kexec_range_t range; compat_kexec_range_t compat_range; int ret = -EINVAL; if ( unlikely(copy_from_guest(&compat_range, uarg, 1)) ) return -EFAULT; XLAT_kexec_range(&range, &compat_range); ret = kexec_get_range_internal(&range); /* Dont silently truncate physical addresses or sizes. */ if ( (range.start | range.size) & ~(unsigned long)(~0u) ) return -ERANGE; if ( ret == 0 ) { XLAT_kexec_range(&compat_range, &range); if ( unlikely(copy_to_guest(uarg, &compat_range, 1)) ) return -EFAULT; } return ret; #else /* CONFIG_COMPAT */ return 0; #endif /* CONFIG_COMPAT */ } static int kexec_load_get_bits(int type, int *base, int *bit) { switch ( type ) { case KEXEC_TYPE_DEFAULT: *base = KEXEC_IMAGE_DEFAULT_BASE; *bit = KEXEC_FLAG_DEFAULT_POS; break; case KEXEC_TYPE_CRASH: *base = KEXEC_IMAGE_CRASH_BASE; *bit = KEXEC_FLAG_CRASH_POS; break; default: return -1; } return 0; } void vmcoreinfo_append_str(const char *fmt, ...) 
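/*
 * vmcoreinfo_append_str() accumulates a flat "KEY=value\n" text blob that
 * crash-analysis tools read out of the VMCOREINFO note.  The VMCOREINFO_*
 * macros used by crash_save_vmcoreinfo() below are assumed to expand to
 * calls like the following (key format taken to mirror the Linux
 * convention; this is a hedged illustration, not the literal macro
 * bodies):
 *
 *     vmcoreinfo_append_str("PAGESIZE=%ld\n", PAGE_SIZE);
 *     vmcoreinfo_append_str("SYMBOL(domain_list)=%lx\n",
 *                           (unsigned long)&domain_list);
 *     vmcoreinfo_append_str("OFFSET(domain.domain_id)=%lu\n",
 *                           (unsigned long)offsetof(struct domain,
 *                                                   domain_id));
 */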
{ va_list args; char buf[0x50]; int r; size_t note_size = sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1); if (vmcoreinfo_size + note_size + sizeof(buf) > VMCOREINFO_BYTES) return; va_start(args, fmt); r = vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); memcpy(&vmcoreinfo_data[note_size + vmcoreinfo_size], buf, r); vmcoreinfo_size += r; } static void crash_save_vmcoreinfo(void) { size_t data_size; if (vmcoreinfo_size > 0) /* already saved */ return; data_size = VMCOREINFO_BYTES - (sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1)); setup_note((Elf_Note *)vmcoreinfo_data, VMCOREINFO_NOTE_NAME, 0, data_size); VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(domain_list); #ifndef frame_table VMCOREINFO_SYMBOL(frame_table); #else { static const void *const _frame_table = frame_table; VMCOREINFO_SYMBOL_ALIAS(frame_table, _frame_table); } #endif VMCOREINFO_SYMBOL(max_page); VMCOREINFO_STRUCT_SIZE(page_info); VMCOREINFO_STRUCT_SIZE(domain); VMCOREINFO_OFFSET(page_info, count_info); VMCOREINFO_OFFSET_SUB(page_info, v.inuse, _domain); VMCOREINFO_OFFSET(domain, domain_id); VMCOREINFO_OFFSET(domain, next_in_list); #ifdef ARCH_CRASH_SAVE_VMCOREINFO arch_crash_save_vmcoreinfo(); #endif } static void kexec_unload_image(struct kexec_image *image) { if ( !image ) return; machine_kexec_unload(image); kimage_free(image); } static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg) { xen_kexec_exec_t exec; struct kexec_image *image; int base, bit, pos, ret = -EINVAL; if ( unlikely(copy_from_guest(&exec, uarg, 1)) ) return -EFAULT; if ( kexec_load_get_bits(exec.type, &base, &bit) ) return -EINVAL; pos = (test_bit(bit, &kexec_flags) != 0); /* Only allow kexec/kdump into loaded images */ if ( !test_bit(base + pos, &kexec_flags) ) return -ENOENT; switch (exec.type) { case KEXEC_TYPE_DEFAULT: image = kexec_image[base + pos]; ret = continue_hypercall_on_cpu(0, kexec_reboot, image); break; case KEXEC_TYPE_CRASH: kexec_crash(); /* Does not return */ break; } return -EINVAL; /* never reached */ } static int kexec_swap_images(int type, struct kexec_image *new, struct kexec_image **old) { static DEFINE_SPINLOCK(kexec_lock); int base, bit, pos; int new_slot, old_slot; *old = NULL; if ( test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) ) return -EBUSY; if ( kexec_load_get_bits(type, &base, &bit) ) return -EINVAL; spin_lock(&kexec_lock); pos = (test_bit(bit, &kexec_flags) != 0); old_slot = base + pos; new_slot = base + !pos; if ( new ) { kexec_image[new_slot] = new; set_bit(new_slot, &kexec_flags); } change_bit(bit, &kexec_flags); clear_bit(old_slot, &kexec_flags); *old = kexec_image[old_slot]; spin_unlock(&kexec_lock); return 0; } static int kexec_load_slot(struct kexec_image *kimage) { struct kexec_image *old_kimage; int ret = -ENOMEM; ret = machine_kexec_load(kimage); if ( ret < 0 ) return ret; crash_save_vmcoreinfo(); ret = kexec_swap_images(kimage->type, kimage, &old_kimage); if ( ret < 0 ) return ret; kexec_unload_image(old_kimage); return 0; } static uint16_t kexec_load_v1_arch(void) { #ifdef CONFIG_X86 return is_pv_32on64_domain(dom0) ? EM_386 : EM_X86_64; #else return EM_NONE; #endif } static int kexec_segments_add_segment( unsigned int *nr_segments, xen_kexec_segment_t *segments, unsigned long mfn) { paddr_t maddr = (paddr_t)mfn << PAGE_SHIFT; unsigned int n = *nr_segments; /* Need a new segment? 
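 * Destination pages are coalesced: a fresh segment is opened only when
 * the incoming page is not physically contiguous with the previous one.
 * Illustrative example: destination mfns 0x100, 0x101, 0x102, 0x200
 * collapse into two segments,
 *     { .dest_maddr = 0x100000, .dest_size grows to 3 * PAGE_SIZE }
 *     { .dest_maddr = 0x200000, .dest_size grows to 1 * PAGE_SIZE }
 * with dest_size accumulated later by the IND_SOURCE handling in
 * kexec_segments_from_ind_page().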
*/ if ( n == 0 || segments[n-1].dest_maddr + segments[n-1].dest_size != maddr ) { n++; if ( n > KEXEC_SEGMENT_MAX ) return -EINVAL; *nr_segments = n; set_xen_guest_handle(segments[n-1].buf.h, NULL); segments[n-1].buf_size = 0; segments[n-1].dest_maddr = maddr; segments[n-1].dest_size = 0; } return 0; } static int kexec_segments_from_ind_page(unsigned long mfn, unsigned int *nr_segments, xen_kexec_segment_t *segments, bool_t compat) { void *page; kimage_entry_t *entry; int ret = 0; page = map_domain_page(mfn); /* * Walk the indirection page list, adding destination pages to the * segments. */ for ( entry = page; ; ) { unsigned long ind; ind = kimage_entry_ind(entry, compat); mfn = kimage_entry_mfn(entry, compat); switch ( ind ) { case IND_DESTINATION: ret = kexec_segments_add_segment(nr_segments, segments, mfn); if ( ret < 0 ) goto done; break; case IND_INDIRECTION: unmap_domain_page(page); entry = page = map_domain_page(mfn); continue; case IND_DONE: goto done; case IND_SOURCE: if ( *nr_segments == 0 ) { ret = -EINVAL; goto done; } segments[*nr_segments-1].dest_size += PAGE_SIZE; break; default: ret = -EINVAL; goto done; } entry = kimage_entry_next(entry, compat); } done: unmap_domain_page(page); return ret; } static int kexec_do_load_v1(xen_kexec_load_v1_t *load, int compat) { struct kexec_image *kimage = NULL; xen_kexec_segment_t *segments; uint16_t arch; unsigned int nr_segments = 0; unsigned long ind_mfn = load->image.indirection_page >> PAGE_SHIFT; int ret; arch = kexec_load_v1_arch(); if ( arch == EM_NONE ) return -ENOSYS; segments = xmalloc_array(xen_kexec_segment_t, KEXEC_SEGMENT_MAX); if ( segments == NULL ) return -ENOMEM; /* * Work out the image segments (destination only) from the * indirection pages. * * This is needed so we don't allocate pages that will overlap * with the destination when building the new set of indirection * pages below. */ ret = kexec_segments_from_ind_page(ind_mfn, &nr_segments, segments, compat); if ( ret < 0 ) goto error; ret = kimage_alloc(&kimage, load->type, arch, load->image.start_address, nr_segments, segments); if ( ret < 0 ) goto error; /* * Build a new set of indirection pages in the native format. * * This walks the guest provided indirection pages a second time. * The guest could have altered then, invalidating the segment * information constructed above. This will only result in the * resulting image being potentially unrelocatable. */ ret = kimage_build_ind(kimage, ind_mfn, compat); if ( ret < 0 ) goto error; ret = kexec_load_slot(kimage); if ( ret < 0 ) goto error; return 0; error: if ( !kimage ) xfree(segments); kimage_free(kimage); return ret; } static int kexec_load_v1(XEN_GUEST_HANDLE_PARAM(void) uarg) { xen_kexec_load_v1_t load; if ( unlikely(copy_from_guest(&load, uarg, 1)) ) return -EFAULT; return kexec_do_load_v1(&load, 0); } static int kexec_load_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg) { #ifdef CONFIG_COMPAT compat_kexec_load_v1_t compat_load; xen_kexec_load_v1_t load; if ( unlikely(copy_from_guest(&compat_load, uarg, 1)) ) return -EFAULT; /* This is a bit dodgy, load.image is inside load, * but XLAT_kexec_load (which is automatically generated) * doesn't translate load.image (correctly) * Just copy load->type, the only other member, manually instead. 
* * XLAT_kexec_load(&load, &compat_load); */ load.type = compat_load.type; XLAT_kexec_image(&load.image, &compat_load.image); return kexec_do_load_v1(&load, 1); #else return 0; #endif } static int kexec_load(XEN_GUEST_HANDLE_PARAM(void) uarg) { xen_kexec_load_t load; xen_kexec_segment_t *segments; struct kexec_image *kimage = NULL; int ret; if ( copy_from_guest(&load, uarg, 1) ) return -EFAULT; if ( load.nr_segments >= KEXEC_SEGMENT_MAX ) return -EINVAL; segments = xmalloc_array(xen_kexec_segment_t, load.nr_segments); if ( segments == NULL ) return -ENOMEM; if ( copy_from_guest(segments, load.segments.h, load.nr_segments) ) { ret = -EFAULT; goto error; } ret = kimage_alloc(&kimage, load.type, load.arch, load.entry_maddr, load.nr_segments, segments); if ( ret < 0 ) goto error; ret = kimage_load_segments(kimage); if ( ret < 0 ) goto error; ret = kexec_load_slot(kimage); if ( ret < 0 ) goto error; return 0; error: if ( ! kimage ) xfree(segments); kimage_free(kimage); return ret; } static int kexec_do_unload(xen_kexec_unload_t *unload) { struct kexec_image *old_kimage; int ret; ret = kexec_swap_images(unload->type, NULL, &old_kimage); if ( ret < 0 ) return ret; kexec_unload_image(old_kimage); return 0; } static int kexec_unload_v1(XEN_GUEST_HANDLE_PARAM(void) uarg) { xen_kexec_load_v1_t load; xen_kexec_unload_t unload; if ( copy_from_guest(&load, uarg, 1) ) return -EFAULT; unload.type = load.type; return kexec_do_unload(&unload); } static int kexec_unload_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg) { #ifdef CONFIG_COMPAT compat_kexec_load_v1_t compat_load; xen_kexec_unload_t unload; if ( copy_from_guest(&compat_load, uarg, 1) ) return -EFAULT; unload.type = compat_load.type; return kexec_do_unload(&unload); #else return 0; #endif } static int kexec_unload(XEN_GUEST_HANDLE_PARAM(void) uarg) { xen_kexec_unload_t unload; if ( unlikely(copy_from_guest(&unload, uarg, 1)) ) return -EFAULT; return kexec_do_unload(&unload); } static int do_kexec_op_internal(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg, bool_t compat) { int ret = -EINVAL; ret = xsm_kexec(XSM_PRIV); if ( ret ) return ret; switch ( op ) { case KEXEC_CMD_kexec_get_range: if (compat) ret = kexec_get_range_compat(uarg); else ret = kexec_get_range(uarg); break; case KEXEC_CMD_kexec_load_v1: if ( compat ) ret = kexec_load_v1_compat(uarg); else ret = kexec_load_v1(uarg); break; case KEXEC_CMD_kexec_unload_v1: if ( compat ) ret = kexec_unload_v1_compat(uarg); else ret = kexec_unload_v1(uarg); break; case KEXEC_CMD_kexec: ret = kexec_exec(uarg); break; case KEXEC_CMD_kexec_load: ret = kexec_load(uarg); break; case KEXEC_CMD_kexec_unload: ret = kexec_unload(uarg); break; } return ret; } long do_kexec_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg) { return do_kexec_op_internal(op, uarg, 0); } #ifdef CONFIG_COMPAT int compat_kexec_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg) { return do_kexec_op_internal(op, uarg, 1); } #endif /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/sched_sedf.c0000664000175000017500000013656612307313555015001 0ustar smbsmb/****************************************************************************** * Simple EDF scheduler for xen * * by Stephan Diestelhorst (C) 2004 Cambridge University * based on code by Mark Williamson (C) 2004 Intel Research Cambridge */ #include #include #include #include #include #include #include #ifndef NDEBUG #define SEDF_STATS #define CHECK(_p) \ do { \ if ( !(_p) ) \ 
printk("Check '%s' failed, line %d, file %s\n", \ #_p , __LINE__, __FILE__); \ } while ( 0 ) #else #define CHECK(_p) ((void)0) #endif #define EXTRA_NONE (0) #define EXTRA_AWARE (1) #define EXTRA_RUN_PEN (2) #define EXTRA_RUN_UTIL (4) #define EXTRA_WANT_PEN_Q (8) #define EXTRA_PEN_Q (0) #define EXTRA_UTIL_Q (1) #define SEDF_ASLEEP (16) #define EXTRA_QUANTUM (MICROSECS(500)) #define WEIGHT_PERIOD (MILLISECS(100)) #define WEIGHT_SAFETY (MILLISECS(5)) #define PERIOD_MAX MILLISECS(10000) /* 10s */ #define PERIOD_MIN (MICROSECS(10)) /* 10us */ #define SLICE_MIN (MICROSECS(5)) /* 5us */ #define IMPLY(a, b) (!(a) || (b)) #define EQ(a, b) ((!!(a)) == (!!(b))) struct sedf_dom_info { struct domain *domain; }; struct sedf_priv_info { /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ spinlock_t lock; }; struct sedf_vcpu_info { struct vcpu *vcpu; struct list_head list; struct list_head extralist[2]; /* Parameters for EDF */ s_time_t period; /* = relative deadline */ s_time_t slice; /* = worst case execution time */ /* Advaced Parameters */ /* Latency Scaling */ s_time_t period_orig; s_time_t slice_orig; s_time_t latency; /* Status of domain */ int status; /* Weights for "Scheduling for beginners/ lazy/ etc." ;) */ short weight; short extraweight; /* Bookkeeping */ s_time_t deadl_abs; s_time_t sched_start_abs; s_time_t cputime; /* Times the domain un-/blocked */ s_time_t block_abs; s_time_t unblock_abs; /* Scores for {util, block penalty}-weighted extratime distribution */ int score[2]; s_time_t short_block_lost_tot; /* Statistics */ s_time_t extra_time_tot; #ifdef SEDF_STATS s_time_t block_time_tot; s_time_t penalty_time_tot; int block_tot; int short_block_tot; int long_block_tot; int pen_extra_blocks; int pen_extra_slices; #endif }; struct sedf_cpu_info { struct list_head runnableq; struct list_head waitq; struct list_head extraq[2]; s_time_t current_slice_expires; }; #define SEDF_PRIV(_ops) \ ((struct sedf_priv_info *)((_ops)->sched_data)) #define EDOM_INFO(d) ((struct sedf_vcpu_info *)((d)->sched_priv)) #define CPU_INFO(cpu) \ ((struct sedf_cpu_info *)per_cpu(schedule_data, cpu).sched_priv) #define LIST(d) (&EDOM_INFO(d)->list) #define EXTRALIST(d,i) (&(EDOM_INFO(d)->extralist[i])) #define RUNQ(cpu) (&CPU_INFO(cpu)->runnableq) #define WAITQ(cpu) (&CPU_INFO(cpu)->waitq) #define EXTRAQ(cpu,i) (&(CPU_INFO(cpu)->extraq[i])) #define IDLETASK(cpu) (idle_vcpu[cpu]) #define PERIOD_BEGIN(inf) ((inf)->deadl_abs - (inf)->period) #define DIV_UP(x,y) (((x) + (y) - 1) / y) #define extra_runs(inf) ((inf->status) & 6) #define extra_get_cur_q(inf) (((inf->status & 6) >> 1)-1) #define sedf_runnable(edom) (!(EDOM_INFO(edom)->status & SEDF_ASLEEP)) static void sedf_dump_cpu_state(const struct scheduler *ops, int i); static inline int extraq_on(struct vcpu *d, int i) { return ((EXTRALIST(d,i)->next != NULL) && (EXTRALIST(d,i)->next != EXTRALIST(d,i))); } static inline void extraq_add_head(struct vcpu *d, int i) { list_add(EXTRALIST(d,i), EXTRAQ(d->processor,i)); ASSERT(extraq_on(d, i)); } static inline void extraq_add_tail(struct vcpu *d, int i) { list_add_tail(EXTRALIST(d,i), EXTRAQ(d->processor,i)); ASSERT(extraq_on(d, i)); } static inline void extraq_del(struct vcpu *d, int i) { struct list_head *list = EXTRALIST(d,i); ASSERT(extraq_on(d,i)); list_del(list); list->next = NULL; ASSERT(!extraq_on(d, i)); } /* * Adds a domain to the queue of processes which are aware of extra time. List * is sorted by score, where a lower score means higher priority for an extra * slice. 
It also updates the score, by simply subtracting a fixed value from * each entry, in order to avoid overflow. The algorithm works by simply * charging each domain that recieved extratime with an inverse of its weight. */ static inline void extraq_add_sort_update(struct vcpu *d, int i, int sub) { struct list_head *cur; struct sedf_vcpu_info *curinf; ASSERT(!extraq_on(d,i)); /* * Iterate through all elements to find our "hole" and on our way * update all the other scores. */ list_for_each ( cur, EXTRAQ(d->processor, i) ) { curinf = list_entry(cur,struct sedf_vcpu_info,extralist[i]); curinf->score[i] -= sub; if ( EDOM_INFO(d)->score[i] < curinf->score[i] ) break; } /* cur now contains the element, before which we'll enqueue */ list_add(EXTRALIST(d,i),cur->prev); /* Continue updating the extraq */ if ( (cur != EXTRAQ(d->processor,i)) && sub ) { for ( cur = cur->next; cur != EXTRAQ(d->processor,i); cur = cur->next ) { curinf = list_entry(cur,struct sedf_vcpu_info, extralist[i]); curinf->score[i] -= sub; } } ASSERT(extraq_on(d,i)); } static inline void extraq_check(struct vcpu *d) { if ( extraq_on(d, EXTRA_UTIL_Q) ) { if ( !(EDOM_INFO(d)->status & EXTRA_AWARE) && !extra_runs(EDOM_INFO(d)) ) extraq_del(d, EXTRA_UTIL_Q); } else { if ( (EDOM_INFO(d)->status & EXTRA_AWARE) && sedf_runnable(d) ) extraq_add_sort_update(d, EXTRA_UTIL_Q, 0); } } static inline void extraq_check_add_unblocked(struct vcpu *d, int priority) { struct sedf_vcpu_info *inf = EDOM_INFO(d); if ( inf->status & EXTRA_AWARE ) /* Put on the weighted extraq without updating any scores */ extraq_add_sort_update(d, EXTRA_UTIL_Q, 0); } static inline int __task_on_queue(struct vcpu *d) { return (((LIST(d))->next != NULL) && (LIST(d)->next != LIST(d))); } static inline void __del_from_queue(struct vcpu *d) { struct list_head *list = LIST(d); ASSERT(__task_on_queue(d)); list_del(list); list->next = NULL; ASSERT(!__task_on_queue(d)); } typedef int(*list_comparer)(struct list_head* el1, struct list_head* el2); static inline void list_insert_sort( struct list_head *list, struct list_head *element, list_comparer comp) { struct list_head *cur; /* Iterate through all elements to find our "hole" */ list_for_each( cur, list ) if ( comp(element, cur) < 0 ) break; /* cur now contains the element, before which we'll enqueue */ list_add(element, cur->prev); } #define DOMAIN_COMPARER(name, field, comp1, comp2) \ static int name##_comp(struct list_head* el1, struct list_head* el2) \ { \ struct sedf_vcpu_info *d1, *d2; \ d1 = list_entry(el1,struct sedf_vcpu_info, field); \ d2 = list_entry(el2,struct sedf_vcpu_info, field); \ if ( (comp1) == (comp2) ) \ return 0; \ if ( (comp1) < (comp2) ) \ return -1; \ else \ return 1; \ } /* * Adds a domain to the queue of processes which wait for the beginning of the * next period; this list is therefore sortet by this time, which is simply * absol. deadline - period. */ DOMAIN_COMPARER(waitq, list, PERIOD_BEGIN(d1), PERIOD_BEGIN(d2)); static inline void __add_to_waitqueue_sort(struct vcpu *v) { ASSERT(!__task_on_queue(v)); list_insert_sort(WAITQ(v->processor), LIST(v), waitq_comp); ASSERT(__task_on_queue(v)); } /* * Adds a domain to the queue of processes which have started their current * period and are runnable (i.e. not blocked, dieing,...). The first element * on this list is running on the processor, if the list is empty the idle * task will run. As we are implementing EDF, this list is sorted by deadlines. 
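 * Illustrative example: vcpus with absolute deadlines 4ms, 7ms and 10ms
 * from now are queued in exactly that order and the 4ms one runs first;
 * a newly woken vcpu with a 5ms deadline is inserted after the 4ms entry
 * but before the 7ms one.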
*/ DOMAIN_COMPARER(runq, list, d1->deadl_abs, d2->deadl_abs); static inline void __add_to_runqueue_sort(struct vcpu *v) { list_insert_sort(RUNQ(v->processor), LIST(v), runq_comp); } static void sedf_insert_vcpu(const struct scheduler *ops, struct vcpu *v) { if ( !is_idle_vcpu(v) ) { extraq_check(v); } else { EDOM_INFO(v)->deadl_abs = 0; EDOM_INFO(v)->status &= ~SEDF_ASLEEP; } } static void *sedf_alloc_vdata(const struct scheduler *ops, struct vcpu *v, void *dd) { struct sedf_vcpu_info *inf; inf = xzalloc(struct sedf_vcpu_info); if ( inf == NULL ) return NULL; inf->vcpu = v; /* Every VCPU gets an equal share of extratime by default */ inf->deadl_abs = 0; inf->latency = 0; inf->status = EXTRA_AWARE | SEDF_ASLEEP; inf->extraweight = 1; /* Upon creation all domain are best-effort */ inf->period = WEIGHT_PERIOD; inf->slice = 0; inf->period_orig = inf->period; inf->slice_orig = inf->slice; INIT_LIST_HEAD(&(inf->list)); INIT_LIST_HEAD(&(inf->extralist[EXTRA_PEN_Q])); INIT_LIST_HEAD(&(inf->extralist[EXTRA_UTIL_Q])); SCHED_STAT_CRANK(vcpu_init); return inf; } static void * sedf_alloc_pdata(const struct scheduler *ops, int cpu) { struct sedf_cpu_info *spc; spc = xzalloc(struct sedf_cpu_info); BUG_ON(spc == NULL); INIT_LIST_HEAD(&spc->waitq); INIT_LIST_HEAD(&spc->runnableq); INIT_LIST_HEAD(&spc->extraq[EXTRA_PEN_Q]); INIT_LIST_HEAD(&spc->extraq[EXTRA_UTIL_Q]); return (void *)spc; } static void sedf_free_pdata(const struct scheduler *ops, void *spc, int cpu) { if ( spc == NULL ) return; xfree(spc); } static void sedf_free_vdata(const struct scheduler *ops, void *priv) { xfree(priv); } static void * sedf_alloc_domdata(const struct scheduler *ops, struct domain *d) { return xzalloc(struct sedf_dom_info); } static int sedf_init_domain(const struct scheduler *ops, struct domain *d) { d->sched_priv = sedf_alloc_domdata(ops, d); if ( d->sched_priv == NULL ) return -ENOMEM; return 0; } static void sedf_free_domdata(const struct scheduler *ops, void *data) { xfree(data); } static void sedf_destroy_domain(const struct scheduler *ops, struct domain *d) { sedf_free_domdata(ops, d->sched_priv); } static int sedf_pick_cpu(const struct scheduler *ops, struct vcpu *v) { cpumask_t online_affinity; cpumask_t *online; online = cpupool_scheduler_cpumask(v->domain->cpupool); cpumask_and(&online_affinity, v->cpu_affinity, online); return cpumask_cycle(v->vcpu_id % cpumask_weight(&online_affinity) - 1, &online_affinity); } /* * Handles the rescheduling & bookkeeping of domains running in their * guaranteed timeslice. */ static void desched_edf_dom(s_time_t now, struct vcpu* d) { struct sedf_vcpu_info* inf = EDOM_INFO(d); /* Current domain is running in real time mode */ ASSERT(__task_on_queue(d)); /* Update the domain's cputime */ inf->cputime += now - inf->sched_start_abs; /* Scheduling decisions which don't remove the running domain from * the runq */ if ( (inf->cputime < inf->slice) && sedf_runnable(d) ) return; __del_from_queue(d); /* * Manage bookkeeping (i.e. calculate next deadline, memorise * overrun-time of slice) of finished domains. 
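 * Illustrative numbers: with period = 10ms and slice = 3ms, a vcpu coming
 * off the cpu with cputime = 3.4ms keeps the 0.4ms overrun charged against
 * its next period (cputime -= slice leaves 0.4ms already consumed) and has
 * deadl_abs advanced by one full period.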
*/ if ( inf->cputime >= inf->slice ) { inf->cputime -= inf->slice; if ( inf->period < inf->period_orig ) { /* This domain runs in latency scaling or burst mode */ inf->period *= 2; inf->slice *= 2; if ( (inf->period > inf->period_orig) || (inf->slice > inf->slice_orig) ) { /* Reset slice and period */ inf->period = inf->period_orig; inf->slice = inf->slice_orig; } } /* Set next deadline */ inf->deadl_abs += inf->period; } /* Add a runnable domain to the waitqueue */ if ( sedf_runnable(d) ) { __add_to_waitqueue_sort(d); } else { /* We have a blocked realtime task -> remove it from exqs too */ if ( extraq_on(d, EXTRA_PEN_Q) ) extraq_del(d, EXTRA_PEN_Q); if ( extraq_on(d, EXTRA_UTIL_Q) ) extraq_del(d, EXTRA_UTIL_Q); } ASSERT(EQ(sedf_runnable(d), __task_on_queue(d))); ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), sedf_runnable(d))); } /* Update all elements on the queues */ static void update_queues( s_time_t now, struct list_head *runq, struct list_head *waitq) { struct list_head *cur, *tmp; struct sedf_vcpu_info *curinf; /* * Check for the first elements of the waitqueue, whether their * next period has already started. */ list_for_each_safe ( cur, tmp, waitq ) { curinf = list_entry(cur, struct sedf_vcpu_info, list); if ( PERIOD_BEGIN(curinf) > now ) break; __del_from_queue(curinf->vcpu); __add_to_runqueue_sort(curinf->vcpu); } /* Process the runq, find domains that are on the runq that shouldn't */ list_for_each_safe ( cur, tmp, runq ) { curinf = list_entry(cur,struct sedf_vcpu_info,list); if ( unlikely(curinf->slice == 0) ) { /* Ignore domains with empty slice */ __del_from_queue(curinf->vcpu); /* Move them to their next period */ curinf->deadl_abs += curinf->period; /* Ensure that the start of the next period is in the future */ if ( unlikely(PERIOD_BEGIN(curinf) < now) ) curinf->deadl_abs += (DIV_UP(now - PERIOD_BEGIN(curinf), curinf->period)) * curinf->period; /* Put them back into the queue */ __add_to_waitqueue_sort(curinf->vcpu); } else if ( unlikely((curinf->deadl_abs < now) || (curinf->cputime > curinf->slice)) ) { /* * We missed the deadline or the slice was already finished. * Might hapen because of dom_adj. */ printk("\tDomain %i.%i exceeded it's deadline/" "slice (%"PRIu64" / %"PRIu64") now: %"PRIu64 " cputime: %"PRIu64"\n", curinf->vcpu->domain->domain_id, curinf->vcpu->vcpu_id, curinf->deadl_abs, curinf->slice, now, curinf->cputime); __del_from_queue(curinf->vcpu); /* Common case: we miss one period */ curinf->deadl_abs += curinf->period; /* * If we are still behind: modulo arithmetic, force deadline * to be in future and aligned to period borders. */ if ( unlikely(curinf->deadl_abs < now) ) curinf->deadl_abs += DIV_UP(now - curinf->deadl_abs, curinf->period) * curinf->period; ASSERT(curinf->deadl_abs >= now); /* Give a fresh slice */ curinf->cputime = 0; if ( PERIOD_BEGIN(curinf) > now ) __add_to_waitqueue_sort(curinf->vcpu); else __add_to_runqueue_sort(curinf->vcpu); } else break; } } /* * removes a domain from the head of the according extraQ and * requeues it at a specified position: * round-robin extratime: end of extraQ * weighted ext.: insert in sorted list by score * if the domain is blocked / has regained its short-block-loss * time it is not put on any queue. 
*/ static void desched_extra_dom(s_time_t now, struct vcpu *d) { struct sedf_vcpu_info *inf = EDOM_INFO(d); int i = extra_get_cur_q(inf); unsigned long oldscore; ASSERT(extraq_on(d, i)); /* Unset all running flags */ inf->status &= ~(EXTRA_RUN_PEN | EXTRA_RUN_UTIL); /* Fresh slice for the next run */ inf->cputime = 0; /* Accumulate total extratime */ inf->extra_time_tot += now - inf->sched_start_abs; /* Remove extradomain from head of the queue. */ extraq_del(d, i); /* Update the score */ oldscore = inf->score[i]; if ( i == EXTRA_PEN_Q ) { /* Domain was running in L0 extraq */ /* reduce block lost, probably more sophistication here!*/ /*inf->short_block_lost_tot -= EXTRA_QUANTUM;*/ inf->short_block_lost_tot -= now - inf->sched_start_abs; #if 0 /* KAF: If we don't exit short-blocking state at this point * domain0 can steal all CPU for up to 10 seconds before * scheduling settles down (when competing against another * CPU-bound domain). Doing this seems to make things behave * nicely. Noone gets starved by default. */ if ( inf->short_block_lost_tot <= 0 ) #endif { /* We have (over-)compensated our block penalty */ inf->short_block_lost_tot = 0; /* We don't want a place on the penalty queue anymore! */ inf->status &= ~EXTRA_WANT_PEN_Q; goto check_extra_queues; } /* * We have to go again for another try in the block-extraq, * the score is not used incremantally here, as this is * already done by recalculating the block_lost */ inf->score[EXTRA_PEN_Q] = (inf->period << 10) / inf->short_block_lost_tot; oldscore = 0; } else { /* * Domain was running in L1 extraq => score is inverse of * utilization and is used somewhat incremental! */ if ( !inf->extraweight ) { /* NB: use fixed point arithmetic with 10 bits */ inf->score[EXTRA_UTIL_Q] = (inf->period << 10) / inf->slice; } else { /* * Conversion between realtime utilisation and extrawieght: * full (ie 100%) utilization is equivalent to 128 extraweight */ inf->score[EXTRA_UTIL_Q] = (1<<17) / inf->extraweight; } } check_extra_queues: /* Adding a runnable domain to the right queue and removing blocked ones */ if ( sedf_runnable(d) ) { /* Add according to score: weighted round robin */ if (((inf->status & EXTRA_AWARE) && (i == EXTRA_UTIL_Q)) || ((inf->status & EXTRA_WANT_PEN_Q) && (i == EXTRA_PEN_Q))) extraq_add_sort_update(d, i, oldscore); } else { /* Remove this blocked domain from the waitq! */ __del_from_queue(d); /* Make sure that we remove a blocked domain from the other * extraq too. */ if ( i == EXTRA_PEN_Q ) { if ( extraq_on(d, EXTRA_UTIL_Q) ) extraq_del(d, EXTRA_UTIL_Q); } else { if ( extraq_on(d, EXTRA_PEN_Q) ) extraq_del(d, EXTRA_PEN_Q); } } ASSERT(EQ(sedf_runnable(d), __task_on_queue(d))); ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), sedf_runnable(d))); } static struct task_slice sedf_do_extra_schedule( s_time_t now, s_time_t end_xt, struct list_head *extraq[], int cpu) { struct task_slice ret = { 0 }; struct sedf_vcpu_info *runinf; ASSERT(end_xt > now); /* Enough time left to use for extratime? */ if ( end_xt - now < EXTRA_QUANTUM ) goto return_idle; if ( !list_empty(extraq[EXTRA_PEN_Q]) ) { /* * We still have elements on the level 0 extraq * => let those run first! 
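 * (Score recap, referring to desched_extra_dom() above, with illustrative
 *  numbers: scores are 10-bit fixed-point inverse utilisations, so a
 *  domain with period 100ms and slice 25ms gets
 *  score[EXTRA_UTIL_Q] = (100 << 10) / 25 = 4096, while a pure
 *  extraweight domain gets (1 << 17) divided by its extraweight, e.g.
 *  extraweight 128 maps to 1024, the equivalent of 100% utilisation.)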
*/ runinf = list_entry(extraq[EXTRA_PEN_Q]->next, struct sedf_vcpu_info, extralist[EXTRA_PEN_Q]); runinf->status |= EXTRA_RUN_PEN; ret.task = runinf->vcpu; ret.time = EXTRA_QUANTUM; #ifdef SEDF_STATS runinf->pen_extra_slices++; #endif } else { if ( !list_empty(extraq[EXTRA_UTIL_Q]) ) { /* Use elements from the normal extraqueue */ runinf = list_entry(extraq[EXTRA_UTIL_Q]->next, struct sedf_vcpu_info, extralist[EXTRA_UTIL_Q]); runinf->status |= EXTRA_RUN_UTIL; ret.task = runinf->vcpu; ret.time = EXTRA_QUANTUM; } else goto return_idle; } ASSERT(ret.time > 0); ASSERT(sedf_runnable(ret.task)); return ret; return_idle: ret.task = IDLETASK(cpu); ret.time = end_xt - now; ASSERT(ret.time > 0); ASSERT(sedf_runnable(ret.task)); return ret; } static int sedf_init(struct scheduler *ops) { struct sedf_priv_info *prv; prv = xzalloc(struct sedf_priv_info); if ( prv == NULL ) return -ENOMEM; ops->sched_data = prv; spin_lock_init(&prv->lock); return 0; } static void sedf_deinit(const struct scheduler *ops) { struct sedf_priv_info *prv; prv = SEDF_PRIV(ops); if ( prv != NULL ) xfree(prv); } /* * Main scheduling function * Reasons for calling this function are: * -timeslice for the current period used up * -domain on waitqueue has started it's period * -and various others ;) in general: determine which domain to run next */ static struct task_slice sedf_do_schedule( const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled) { int cpu = smp_processor_id(); struct list_head *runq = RUNQ(cpu); struct list_head *waitq = WAITQ(cpu); struct sedf_vcpu_info *inf = EDOM_INFO(current); struct list_head *extraq[] = { EXTRAQ(cpu, EXTRA_PEN_Q), EXTRAQ(cpu, EXTRA_UTIL_Q)}; struct sedf_vcpu_info *runinf, *waitinf; struct task_slice ret; SCHED_STAT_CRANK(schedule); /* Idle tasks don't need any of the following stuf */ if ( is_idle_vcpu(current) ) goto check_waitq; /* * Create local state of the status of the domain, in order to avoid * inconsistent state during scheduling decisions, because data for * vcpu_runnable is not protected by the scheduling lock! */ if ( !vcpu_runnable(current) ) inf->status |= SEDF_ASLEEP; if ( inf->status & SEDF_ASLEEP ) inf->block_abs = now; if ( unlikely(extra_runs(inf)) ) { /* Special treatment of domains running in extra time */ desched_extra_dom(now, current); } else { desched_edf_dom(now, current); } check_waitq: update_queues(now, runq, waitq); /* * Now simply pick the first domain from the runqueue, which has the * earliest deadline, because the list is sorted * * Tasklet work (which runs in idle VCPU context) overrides all else. */ if ( tasklet_work_scheduled || (list_empty(runq) && list_empty(waitq)) || unlikely(!cpumask_test_cpu(cpu, cpupool_scheduler_cpumask(per_cpu(cpupool, cpu)))) ) { ret.task = IDLETASK(cpu); ret.time = SECONDS(1); } else if ( !list_empty(runq) ) { runinf = list_entry(runq->next,struct sedf_vcpu_info,list); ret.task = runinf->vcpu; if ( !list_empty(waitq) ) { waitinf = list_entry(waitq->next, struct sedf_vcpu_info,list); /* * Rerun scheduler, when scheduled domain reaches it's * end of slice or the first domain from the waitqueue * gets ready. 
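 * Illustrative numbers: if the running vcpu still has 2ms of slice left
 * but the head of the waitqueue begins a new period in 1.5ms, the MIN()
 * below arms the timer for 1.5ms, so the scheduler runs again at the
 * earlier of the two events.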
*/ ret.time = MIN(now + runinf->slice - runinf->cputime, PERIOD_BEGIN(waitinf)) - now; } else { ret.time = runinf->slice - runinf->cputime; } } else { waitinf = list_entry(waitq->next,struct sedf_vcpu_info, list); /* * We could not find any suitable domain * => look for domains that are aware of extratime */ ret = sedf_do_extra_schedule(now, PERIOD_BEGIN(waitinf), extraq, cpu); } /* * TODO: Do something USEFUL when this happens and find out, why it * still can happen!!! */ if ( ret.time < 0) { printk("Ouch! We are seriously BEHIND schedule! %"PRIi64"\n", ret.time); ret.time = EXTRA_QUANTUM; } ret.migrated = 0; EDOM_INFO(ret.task)->sched_start_abs = now; CHECK(ret.time > 0); ASSERT(sedf_runnable(ret.task)); CPU_INFO(cpu)->current_slice_expires = now + ret.time; return ret; } static void sedf_sleep(const struct scheduler *ops, struct vcpu *d) { if ( is_idle_vcpu(d) ) return; EDOM_INFO(d)->status |= SEDF_ASLEEP; if ( per_cpu(schedule_data, d->processor).curr == d ) { cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ); } else { if ( __task_on_queue(d) ) __del_from_queue(d); if ( extraq_on(d, EXTRA_UTIL_Q) ) extraq_del(d, EXTRA_UTIL_Q); if ( extraq_on(d, EXTRA_PEN_Q) ) extraq_del(d, EXTRA_PEN_Q); } } /* * This function wakes up a domain, i.e. moves them into the waitqueue * things to mention are: admission control is taking place nowhere at * the moment, so we can't be sure, whether it is safe to wake the domain * up at all. Anyway, even if it is safe (total cpu usage <=100%) there are * some considerations on when to allow the domain to wake up and have it's * first deadline... * I detected 3 cases, which could describe the possible behaviour of the * scheduler, * and I'll try to make them more clear: * * 1. Very conservative * -when a blocked domain unblocks, it is allowed to start execution at * the beginning of the next complete period * (D..deadline, R..running, B..blocking/sleeping, U..unblocking/waking up * * DRRB_____D__U_____DRRRRR___D________ ... * * -this causes the domain to miss a period (and a deadlline) * -doesn't disturb the schedule at all * -deadlines keep occuring isochronous * * 2. Conservative Part 1: Short Unblocking * -when a domain unblocks in the same period as it was blocked it * unblocks and may consume the rest of it's original time-slice minus * the time it was blocked * (assume period=9, slice=5) * * DRB_UR___DRRRRR___D... * * -this also doesn't disturb scheduling, but might lead to the fact, that * the domain can't finish it's workload in the period * -in addition to that the domain can be treated prioritised when * extratime is available * -addition: experiments have shown that this may have a HUGE impact on * performance of other domains, becaus it can lead to excessive context * switches * * Part2: Long Unblocking * Part 2a * -it is obvious that such accounting of block time, applied when * unblocking is happening in later periods, works fine aswell * -the domain is treated as if it would have been running since the start * of its new period * * DRB______D___UR___D... * * Part 2b * -if one needs the full slice in the next period, it is necessary to * treat the unblocking time as the start of the new period, i.e. move * the deadline further back (later) * -this doesn't disturb scheduling as well, because for EDF periods can * be treated as minimal inter-release times and scheduling stays * correct, when deadlines are kept relative to the time the process * unblocks * * DRB______D___URRRR___D... 
* (D) <- old deadline was here * -problem: deadlines don't occur isochronous anymore * Part 2c (Improved Atropos design) * -when a domain unblocks it is given a very short period (=latency hint) * and slice length scaled accordingly * -both rise again to the original value (e.g. get doubled every period) * * 3. Unconservative (i.e. incorrect) * -to boost the performance of I/O dependent domains it would be possible * to put the domain into the runnable queue immediately, and let it run * for the remainder of the slice of the current period * (or even worse: allocate a new full slice for the domain) * -either behaviour can lead to missed deadlines in other domains as * opposed to approaches 1,2a,2b */ static void unblock_short_extra_support( struct sedf_vcpu_info* inf, s_time_t now) { /* * This unblocking scheme tries to support the domain, by assigning it * a priority in extratime distribution according to the loss of time * in this slice due to blocking */ s_time_t pen; /* No more realtime execution in this period! */ inf->deadl_abs += inf->period; if ( likely(inf->block_abs) ) { /* Treat blocked time as consumed by the domain */ /*inf->cputime += now - inf->block_abs;*/ /* * Penalty is time the domain would have * had if it continued to run. */ pen = (inf->slice - inf->cputime); if ( pen < 0 ) pen = 0; /* Accumulate all penalties over the periods */ /*inf->short_block_lost_tot += pen;*/ /* Set penalty to the current value */ inf->short_block_lost_tot = pen; /* Not sure which one is better.. but seems to work well... */ if ( inf->short_block_lost_tot ) { inf->score[0] = (inf->period << 10) / inf->short_block_lost_tot; #ifdef SEDF_STATS inf->pen_extra_blocks++; #endif if ( extraq_on(inf->vcpu, EXTRA_PEN_Q) ) /* Remove domain for possible resorting! */ extraq_del(inf->vcpu, EXTRA_PEN_Q); else /* * Remember that we want to be on the penalty q * so that we can continue when we (un-)block * in penalty-extratime */ inf->status |= EXTRA_WANT_PEN_Q; /* (re-)add domain to the penalty extraq */ extraq_add_sort_update(inf->vcpu, EXTRA_PEN_Q, 0); } } /* Give it a fresh slice in the next period! */ inf->cputime = 0; } static void unblock_long_cons_b(struct sedf_vcpu_info* inf,s_time_t now) { /* Conservative 2b */ /* Treat the unblocking time as a start of a new period */ inf->deadl_abs = now + inf->period; inf->cputime = 0; } #define DOMAIN_EDF 1 #define DOMAIN_EXTRA_PEN 2 #define DOMAIN_EXTRA_UTIL 3 #define DOMAIN_IDLE 4 static inline int get_run_type(struct vcpu* d) { struct sedf_vcpu_info* inf = EDOM_INFO(d); if (is_idle_vcpu(d)) return DOMAIN_IDLE; if (inf->status & EXTRA_RUN_PEN) return DOMAIN_EXTRA_PEN; if (inf->status & EXTRA_RUN_UTIL) return DOMAIN_EXTRA_UTIL; return DOMAIN_EDF; } /* * Compares two domains in the relation of whether the one is allowed to * interrupt the others execution. * It returns true (!=0) if a switch to the other domain is good. 
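* (sedf_wake() uses this check to decide whether to raise * SCHEDULE_SOFTIRQ on the target cpu after a vcpu unblocks.)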
* Current Priority scheme is as follows: * EDF > L0 (penalty based) extra-time > * L1 (utilization) extra-time > idle-domain * In the same class priorities are assigned as following: * EDF: early deadline > late deadline * L0 extra-time: lower score > higher score */ static inline int should_switch(struct vcpu *cur, struct vcpu *other, s_time_t now) { struct sedf_vcpu_info *cur_inf, *other_inf; cur_inf = EDOM_INFO(cur); other_inf = EDOM_INFO(other); /* Check whether we need to make an earlier scheduling decision */ if ( PERIOD_BEGIN(other_inf) < CPU_INFO(other->processor)->current_slice_expires ) return 1; /* No timing-based switches need to be taken into account here */ switch ( get_run_type(cur) ) { case DOMAIN_EDF: /* Do not interrupt a running EDF domain */ return 0; case DOMAIN_EXTRA_PEN: /* Check whether we also want the L0 ex-q with lower score */ return ((other_inf->status & EXTRA_WANT_PEN_Q) && (other_inf->score[EXTRA_PEN_Q] < cur_inf->score[EXTRA_PEN_Q])); case DOMAIN_EXTRA_UTIL: /* Check whether we want the L0 extraq. Don't * switch if both domains want L1 extraq. */ return !!(other_inf->status & EXTRA_WANT_PEN_Q); case DOMAIN_IDLE: return 1; } return 1; } static void sedf_wake(const struct scheduler *ops, struct vcpu *d) { s_time_t now = NOW(); struct sedf_vcpu_info* inf = EDOM_INFO(d); if ( unlikely(is_idle_vcpu(d)) ) return; if ( unlikely(__task_on_queue(d)) ) return; ASSERT(!sedf_runnable(d)); inf->status &= ~SEDF_ASLEEP; ASSERT(!extraq_on(d, EXTRA_UTIL_Q)); ASSERT(!extraq_on(d, EXTRA_PEN_Q)); if ( unlikely(inf->deadl_abs == 0) ) { /* Initial setup of the deadline */ inf->deadl_abs = now + inf->slice; } #ifdef SEDF_STATS inf->block_tot++; #endif if ( unlikely(now < PERIOD_BEGIN(inf)) ) { /* Unblocking in extra-time! */ if ( inf->status & EXTRA_WANT_PEN_Q ) { /* * We have a domain that wants compensation * for block penalty and did just block in * its compensation time. Give it another * chance! */ extraq_add_sort_update(d, EXTRA_PEN_Q, 0); } extraq_check_add_unblocked(d, 0); } else { if ( now < inf->deadl_abs ) { /* Short blocking */ #ifdef SEDF_STATS inf->short_block_tot++; #endif unblock_short_extra_support(inf, now); extraq_check_add_unblocked(d, 1); } else { /* Long unblocking */ #ifdef SEDF_STATS inf->long_block_tot++; #endif unblock_long_cons_b(inf, now); extraq_check_add_unblocked(d, 1); } } if ( PERIOD_BEGIN(inf) > now ) __add_to_waitqueue_sort(d); else __add_to_runqueue_sort(d); #ifdef SEDF_STATS /* Do some statistics here... */ if ( inf->block_abs != 0 ) { inf->block_time_tot += now - inf->block_abs; inf->penalty_time_tot += PERIOD_BEGIN(inf) + inf->cputime - inf->block_abs; } #endif /* Sanity check: make sure each extra-aware domain IS on the util-q! */ ASSERT(IMPLY(inf->status & EXTRA_AWARE, extraq_on(d, EXTRA_UTIL_Q))); ASSERT(__task_on_queue(d)); /* * Check whether the awakened task needs to invoke the do_schedule * routine. Try to avoid unnecessary runs but: * Save approximation: Always switch to scheduler! */ ASSERT(d->processor >= 0); ASSERT(d->processor < nr_cpu_ids); ASSERT(per_cpu(schedule_data, d->processor).curr); if ( should_switch(per_cpu(schedule_data, d->processor).curr, d, now) ) cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ); } /* Print a lot of useful information about a domains in the system */ static void sedf_dump_domain(struct vcpu *d) { printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id, d->is_running ? 
'T':'F'); printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu" " sc=%i xtr(%s)=%"PRIu64" ew=%hu", EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs, EDOM_INFO(d)->weight, EDOM_INFO(d)->score[EXTRA_UTIL_Q], (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no", EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight); #ifdef SEDF_STATS if ( EDOM_INFO(d)->block_time_tot != 0 ) printk(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) / EDOM_INFO(d)->block_time_tot); if ( EDOM_INFO(d)->block_tot != 0 ) printk("\n blks=%u sh=%u (%u%%) (shex=%i "\ "shexsl=%i) l=%u (%u%%) avg: b=%"PRIu64" p=%"PRIu64"", EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_block_tot, (EDOM_INFO(d)->short_block_tot * 100) / EDOM_INFO(d)->block_tot, EDOM_INFO(d)->pen_extra_blocks, EDOM_INFO(d)->pen_extra_slices, EDOM_INFO(d)->long_block_tot, (EDOM_INFO(d)->long_block_tot * 100) / EDOM_INFO(d)->block_tot, (EDOM_INFO(d)->block_time_tot) / EDOM_INFO(d)->block_tot, (EDOM_INFO(d)->penalty_time_tot) / EDOM_INFO(d)->block_tot); #endif printk("\n"); } /* Dumps all domains on the specified cpu */ static void sedf_dump_cpu_state(const struct scheduler *ops, int i) { struct list_head *list, *queue, *tmp; struct sedf_vcpu_info *d_inf; struct domain *d; struct vcpu *ed; int loop = 0; printk("now=%"PRIu64"\n",NOW()); queue = RUNQ(i); printk("RUNQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); list_for_each_safe ( list, tmp, queue ) { printk("%3d: ",loop++); d_inf = list_entry(list, struct sedf_vcpu_info, list); sedf_dump_domain(d_inf->vcpu); } queue = WAITQ(i); loop = 0; printk("\nWAITQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); list_for_each_safe ( list, tmp, queue ) { printk("%3d: ",loop++); d_inf = list_entry(list, struct sedf_vcpu_info, list); sedf_dump_domain(d_inf->vcpu); } queue = EXTRAQ(i,EXTRA_PEN_Q); loop = 0; printk("\nEXTRAQ (penalty) rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); list_for_each_safe ( list, tmp, queue ) { d_inf = list_entry(list, struct sedf_vcpu_info, extralist[EXTRA_PEN_Q]); printk("%3d: ",loop++); sedf_dump_domain(d_inf->vcpu); } queue = EXTRAQ(i,EXTRA_UTIL_Q); loop = 0; printk("\nEXTRAQ (utilization) rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); list_for_each_safe ( list, tmp, queue ) { d_inf = list_entry(list, struct sedf_vcpu_info, extralist[EXTRA_UTIL_Q]); printk("%3d: ",loop++); sedf_dump_domain(d_inf->vcpu); } loop = 0; printk("\nnot on Q\n"); rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) { if ( (d->cpupool ? d->cpupool->sched : &sched_sedf_def) != ops ) continue; for_each_vcpu(d, ed) { if ( !__task_on_queue(ed) && (ed->processor == i) ) { printk("%3d: ",loop++); sedf_dump_domain(ed); } } } rcu_read_unlock(&domlist_read_lock); } /* Adjusts periods and slices of the domains accordingly to their weights */ static int sedf_adjust_weights(struct cpupool *c, int nr_cpus, int *sumw, s_time_t *sumt) { struct vcpu *p; struct domain *d; unsigned int cpu; /* * Sum across all weights. Notice that no runq locking is needed * here: the caller holds sedf_priv_info.lock and we're not changing * anything that is accessed during scheduling. 
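* The second loop below, by contrast, does take each vcpu's schedule * lock, because it rewrites period and slice; every weight-driven vcpu * ends up with period = WEIGHT_PERIOD and * slice = weight * (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[cpu]) / sumw[cpu].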
*/ rcu_read_lock(&domlist_read_lock); for_each_domain_in_cpupool( d, c ) { for_each_vcpu( d, p ) { if ( (cpu = p->processor) >= nr_cpus ) continue; if ( EDOM_INFO(p)->weight ) { sumw[cpu] += EDOM_INFO(p)->weight; } else { /* * Don't modify domains who don't have a weight, but sum * up the time they need, projected to a WEIGHT_PERIOD, * so that this time is not given to the weight-driven * domains */ /* Check for overflows */ ASSERT((WEIGHT_PERIOD < ULONG_MAX) && (EDOM_INFO(p)->slice_orig < ULONG_MAX)); sumt[cpu] += (WEIGHT_PERIOD * EDOM_INFO(p)->slice_orig) / EDOM_INFO(p)->period_orig; } } } rcu_read_unlock(&domlist_read_lock); /* * Adjust all slices (and periods) to the new weight. Unlike above, we * need to take thr runq lock for the various VCPUs: we're modyfing * slice and period which are referenced during scheduling. */ rcu_read_lock(&domlist_read_lock); for_each_domain_in_cpupool( d, c ) { for_each_vcpu ( d, p ) { if ( (cpu = p->processor) >= nr_cpus ) continue; if ( EDOM_INFO(p)->weight ) { /* Interrupts already off */ spinlock_t *lock = vcpu_schedule_lock(p); EDOM_INFO(p)->period_orig = EDOM_INFO(p)->period = WEIGHT_PERIOD; EDOM_INFO(p)->slice_orig = EDOM_INFO(p)->slice = (EDOM_INFO(p)->weight * (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[cpu])) / sumw[cpu]; vcpu_schedule_unlock(lock, p); } } } rcu_read_unlock(&domlist_read_lock); return 0; } /* Set or fetch domain scheduling parameters */ static int sedf_adjust(const struct scheduler *ops, struct domain *p, struct xen_domctl_scheduler_op *op) { struct sedf_priv_info *prv = SEDF_PRIV(ops); unsigned long flags; unsigned int nr_cpus = cpumask_last(&cpu_online_map) + 1; int *sumw = xzalloc_array(int, nr_cpus); s_time_t *sumt = xzalloc_array(s_time_t, nr_cpus); struct vcpu *v; int rc = 0; /* * Serialize against the pluggable scheduler lock to protect from * concurrent updates. We need to take the runq lock for the VCPUs * as well, since we are touching extraweight, weight, slice and * period. As in sched_credit2.c, runq locks nest inside the * pluggable scheduler lock. */ spin_lock_irqsave(&prv->lock, flags); if ( op->cmd == XEN_DOMCTL_SCHEDOP_putinfo ) { /* * These are used in sedf_adjust_weights() but have to be allocated in * this function, as we need to avoid nesting xmem_pool_alloc's lock * within our prv->lock. */ if ( !sumw || !sumt ) { /* Check for errors here, the _getinfo branch doesn't care */ rc = -ENOMEM; goto out; } /* Check for sane parameters */ if ( !op->u.sedf.period && !op->u.sedf.weight ) { rc = -EINVAL; goto out; } if ( op->u.sedf.weight ) { if ( (op->u.sedf.extratime & EXTRA_AWARE) && (!op->u.sedf.period) ) { /* Weight-driven domains with extratime only */ for_each_vcpu ( p, v ) { /* (Here and everywhere in the following) IRQs are already off, * hence vcpu_spin_lock() is the one. */ spinlock_t *lock = vcpu_schedule_lock(v); EDOM_INFO(v)->extraweight = op->u.sedf.weight; EDOM_INFO(v)->weight = 0; EDOM_INFO(v)->slice = 0; EDOM_INFO(v)->period = WEIGHT_PERIOD; vcpu_schedule_unlock(lock, v); } } else { /* Weight-driven domains with real-time execution */ for_each_vcpu ( p, v ) { spinlock_t *lock = vcpu_schedule_lock(v); EDOM_INFO(v)->weight = op->u.sedf.weight; vcpu_schedule_unlock(lock, v); } } } else { /* * Sanity checking: note that disabling extra weight requires * that we set a non-zero slice. 
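* The checks below only accept PERIOD_MIN <= period <= PERIOD_MAX and * SLICE_MIN <= slice <= period; anything else is rejected with -EINVAL * before any vcpu state is touched.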
*/ if ( (op->u.sedf.period > PERIOD_MAX) || (op->u.sedf.period < PERIOD_MIN) || (op->u.sedf.slice > op->u.sedf.period) || (op->u.sedf.slice < SLICE_MIN) ) { rc = -EINVAL; goto out; } /* Time-driven domains */ for_each_vcpu ( p, v ) { spinlock_t *lock = vcpu_schedule_lock(v); EDOM_INFO(v)->weight = 0; EDOM_INFO(v)->extraweight = 0; EDOM_INFO(v)->period_orig = EDOM_INFO(v)->period = op->u.sedf.period; EDOM_INFO(v)->slice_orig = EDOM_INFO(v)->slice = op->u.sedf.slice; vcpu_schedule_unlock(lock, v); } } rc = sedf_adjust_weights(p->cpupool, nr_cpus, sumw, sumt); if ( rc ) goto out; for_each_vcpu ( p, v ) { spinlock_t *lock = vcpu_schedule_lock(v); EDOM_INFO(v)->status = (EDOM_INFO(v)->status & ~EXTRA_AWARE) | (op->u.sedf.extratime & EXTRA_AWARE); EDOM_INFO(v)->latency = op->u.sedf.latency; extraq_check(v); vcpu_schedule_unlock(lock, v); } } else if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) { if ( p->vcpu[0] == NULL ) { rc = -EINVAL; goto out; } op->u.sedf.period = EDOM_INFO(p->vcpu[0])->period; op->u.sedf.slice = EDOM_INFO(p->vcpu[0])->slice; op->u.sedf.extratime = EDOM_INFO(p->vcpu[0])->status & EXTRA_AWARE; op->u.sedf.latency = EDOM_INFO(p->vcpu[0])->latency; op->u.sedf.weight = EDOM_INFO(p->vcpu[0])->weight; } out: spin_unlock_irqrestore(&prv->lock, flags); xfree(sumt); xfree(sumw); return rc; } static struct sedf_priv_info _sedf_priv; const struct scheduler sched_sedf_def = { .name = "Simple EDF Scheduler", .opt_name = "sedf", .sched_id = XEN_SCHEDULER_SEDF, .sched_data = &_sedf_priv, .init_domain = sedf_init_domain, .destroy_domain = sedf_destroy_domain, .insert_vcpu = sedf_insert_vcpu, .alloc_vdata = sedf_alloc_vdata, .free_vdata = sedf_free_vdata, .alloc_pdata = sedf_alloc_pdata, .free_pdata = sedf_free_pdata, .alloc_domdata = sedf_alloc_domdata, .free_domdata = sedf_free_domdata, .init = sedf_init, .deinit = sedf_deinit, .do_schedule = sedf_do_schedule, .pick_cpu = sedf_pick_cpu, .dump_cpu_state = sedf_dump_cpu_state, .sleep = sedf_sleep, .wake = sedf_wake, .adjust = sedf_adjust, }; /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/cpupool.c0000664000175000017500000004253612307313555014364 0ustar smbsmb/****************************************************************************** * cpupool.c * * Generic cpupool-handling functions. * * Cpupools are a feature to have configurable scheduling domains. Each * cpupool runs an own scheduler on a dedicated set of physical cpus. * A domain is bound to one cpupool at any time, but it can be moved to * another cpupool. * * (C) 2009, Juergen Gross, Fujitsu Technology Solutions */ #include #include #include #include #include #include #include #define for_each_cpupool(ptr) \ for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next)) struct cpupool *cpupool0; /* Initial cpupool with Dom0 */ cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */ static struct cpupool *cpupool_list; /* linked list, sorted by poolid */ static int cpupool_moving_cpu = -1; static struct cpupool *cpupool_cpu_moving = NULL; static cpumask_t cpupool_locked_cpus; static DEFINE_SPINLOCK(cpupool_lock); DEFINE_PER_CPU(struct cpupool *, cpupool); #define cpupool_dprintk(x...) 
((void)0) static struct cpupool *alloc_cpupool_struct(void) { struct cpupool *c = xzalloc(struct cpupool); if ( !c || !zalloc_cpumask_var(&c->cpu_valid) ) { xfree(c); c = NULL; } else if ( !zalloc_cpumask_var(&c->cpu_suspended) ) { free_cpumask_var(c->cpu_valid); xfree(c); c = NULL; } return c; } static void free_cpupool_struct(struct cpupool *c) { if ( c ) { free_cpumask_var(c->cpu_suspended); free_cpumask_var(c->cpu_valid); } xfree(c); } /* * find a cpupool by it's id. to be called with cpupool lock held * if exact is not specified, the first cpupool with an id larger or equal to * the searched id is returned * returns NULL if not found. */ static struct cpupool *__cpupool_find_by_id(int id, int exact) { struct cpupool **q; ASSERT(spin_is_locked(&cpupool_lock)); for_each_cpupool(q) if ( (*q)->cpupool_id >= id ) break; return (!exact || (*q == NULL) || ((*q)->cpupool_id == id)) ? *q : NULL; } static struct cpupool *cpupool_find_by_id(int poolid) { return __cpupool_find_by_id(poolid, 1); } static struct cpupool *__cpupool_get_by_id(int poolid, int exact) { struct cpupool *c; spin_lock(&cpupool_lock); c = __cpupool_find_by_id(poolid, exact); if ( c != NULL ) atomic_inc(&c->refcnt); spin_unlock(&cpupool_lock); return c; } struct cpupool *cpupool_get_by_id(int poolid) { return __cpupool_get_by_id(poolid, 1); } static struct cpupool *cpupool_get_next_by_id(int poolid) { return __cpupool_get_by_id(poolid, 0); } void cpupool_put(struct cpupool *pool) { if ( !atomic_dec_and_test(&pool->refcnt) ) return; scheduler_free(pool->sched); free_cpupool_struct(pool); } /* * create a new cpupool with specified poolid and scheduler * returns pointer to new cpupool structure if okay, NULL else * possible failures: * - no memory * - poolid already used * - unknown scheduler */ static struct cpupool *cpupool_create( int poolid, unsigned int sched_id, int *perr) { struct cpupool *c; struct cpupool **q; int last = 0; *perr = -ENOMEM; if ( (c = alloc_cpupool_struct()) == NULL ) return NULL; /* One reference for caller, one reference for cpupool_destroy(). */ atomic_set(&c->refcnt, 2); cpupool_dprintk("cpupool_create(pool=%d,sched=%u)\n", poolid, sched_id); spin_lock(&cpupool_lock); for_each_cpupool(q) { last = (*q)->cpupool_id; if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) ) break; } if ( *q != NULL ) { if ( (*q)->cpupool_id == poolid ) { spin_unlock(&cpupool_lock); free_cpupool_struct(c); *perr = -EEXIST; return NULL; } c->next = *q; } c->cpupool_id = (poolid == CPUPOOLID_NONE) ? 
(last + 1) : poolid; if ( poolid == 0 ) { c->sched = scheduler_get_default(); } else { c->sched = scheduler_alloc(sched_id, perr); if ( c->sched == NULL ) { spin_unlock(&cpupool_lock); free_cpupool_struct(c); return NULL; } } *q = c; spin_unlock(&cpupool_lock); cpupool_dprintk("Created cpupool %d with scheduler %s (%s)\n", c->cpupool_id, c->sched->name, c->sched->opt_name); *perr = 0; return c; } /* * destroys the given cpupool * returns 0 on success, 1 else * possible failures: * - pool still in use * - cpus still assigned to pool * - pool not in list */ static int cpupool_destroy(struct cpupool *c) { struct cpupool **q; spin_lock(&cpupool_lock); for_each_cpupool(q) if ( *q == c ) break; if ( *q != c ) { spin_unlock(&cpupool_lock); return -ENOENT; } if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) ) { spin_unlock(&cpupool_lock); return -EBUSY; } *q = c->next; spin_unlock(&cpupool_lock); cpupool_put(c); cpupool_dprintk("cpupool_destroy(pool=%d)\n", c->cpupool_id); return 0; } /* * assign a specific cpu to a cpupool * cpupool_lock must be held */ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) { int ret; struct cpupool *old; struct domain *d; if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) ) return -EBUSY; old = per_cpu(cpupool, cpu); per_cpu(cpupool, cpu) = c; ret = schedule_cpu_switch(cpu, c); if ( ret ) { per_cpu(cpupool, cpu) = old; return ret; } cpumask_clear_cpu(cpu, &cpupool_free_cpus); if (cpupool_moving_cpu == cpu) { cpupool_moving_cpu = -1; cpupool_put(cpupool_cpu_moving); cpupool_cpu_moving = NULL; } cpumask_set_cpu(cpu, c->cpu_valid); rcu_read_lock(&domlist_read_lock); for_each_domain_in_cpupool(d, c) { domain_update_node_affinity(d); } rcu_read_unlock(&domlist_read_lock); return 0; } static long cpupool_unassign_cpu_helper(void *info) { int cpu = cpupool_moving_cpu; long ret; cpupool_dprintk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", cpupool_cpu_moving->cpupool_id, cpu); spin_lock(&cpupool_lock); ret = cpu_disable_scheduler(cpu); cpumask_set_cpu(cpu, &cpupool_free_cpus); if ( !ret ) { ret = schedule_cpu_switch(cpu, NULL); if ( ret ) { cpumask_clear_cpu(cpu, &cpupool_free_cpus); goto out; } per_cpu(cpupool, cpu) = NULL; cpupool_moving_cpu = -1; cpupool_put(cpupool_cpu_moving); cpupool_cpu_moving = NULL; } out: spin_unlock(&cpupool_lock); cpupool_dprintk("cpupool_unassign_cpu ret=%ld\n", ret); return ret; } /* * unassign a specific cpu from a cpupool * we must be sure not to run on the cpu to be unassigned! to achieve this * the main functionality is performed via continue_hypercall_on_cpu on a * specific cpu. * if the cpu to be removed is the last one of the cpupool no active domain * must be bound to the cpupool. dying domains are moved to cpupool0 as they * might be zombies. 
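* (The actual scheduler switch away from the cpu happens in * cpupool_unassign_cpu_helper() above, which is run on another online * cpu via continue_hypercall_on_cpu().)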
* possible failures: * - last cpu and still active domains in cpupool * - cpu just being unplugged */ int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu) { int work_cpu; int ret; struct domain *d; cpupool_dprintk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n", c->cpupool_id, cpu); spin_lock(&cpupool_lock); ret = -EBUSY; if ( (cpupool_moving_cpu != -1) && (cpu != cpupool_moving_cpu) ) goto out; if ( cpumask_test_cpu(cpu, &cpupool_locked_cpus) ) goto out; ret = 0; if ( !cpumask_test_cpu(cpu, c->cpu_valid) && (cpu != cpupool_moving_cpu) ) goto out; if ( (c->n_dom > 0) && (cpumask_weight(c->cpu_valid) == 1) && (cpu != cpupool_moving_cpu) ) { rcu_read_lock(&domlist_read_lock); for_each_domain_in_cpupool(d, c) { if ( !d->is_dying ) { ret = -EBUSY; break; } c->n_dom--; ret = sched_move_domain(d, cpupool0); if ( ret ) { c->n_dom++; break; } cpupool0->n_dom++; } rcu_read_unlock(&domlist_read_lock); if ( ret ) goto out; } cpupool_moving_cpu = cpu; atomic_inc(&c->refcnt); cpupool_cpu_moving = c; cpumask_clear_cpu(cpu, c->cpu_valid); rcu_read_lock(&domlist_read_lock); for_each_domain_in_cpupool(d, c) domain_update_node_affinity(d); rcu_read_unlock(&domlist_read_lock); spin_unlock(&cpupool_lock); work_cpu = smp_processor_id(); if ( work_cpu == cpu ) { work_cpu = cpumask_first(cpupool0->cpu_valid); if ( work_cpu == cpu ) work_cpu = cpumask_next(cpu, cpupool0->cpu_valid); } return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c); out: spin_unlock(&cpupool_lock); cpupool_dprintk("cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n", c->cpupool_id, cpu, ret); return ret; } /* * add a new domain to a cpupool * possible failures: * - pool does not exist * - no cpu assigned to pool */ int cpupool_add_domain(struct domain *d, int poolid) { struct cpupool *c; int rc; int n_dom = 0; if ( poolid == CPUPOOLID_NONE ) return 0; spin_lock(&cpupool_lock); c = cpupool_find_by_id(poolid); if ( c == NULL ) rc = -ESRCH; else if ( !cpumask_weight(c->cpu_valid) ) rc = -ENODEV; else { c->n_dom++; n_dom = c->n_dom; d->cpupool = c; rc = 0; } spin_unlock(&cpupool_lock); cpupool_dprintk("cpupool_add_domain(dom=%d,pool=%d) n_dom %d rc %d\n", d->domain_id, poolid, n_dom, rc); return rc; } /* * remove a domain from a cpupool */ void cpupool_rm_domain(struct domain *d) { int cpupool_id; int n_dom; if ( d->cpupool == NULL ) return; spin_lock(&cpupool_lock); cpupool_id = d->cpupool->cpupool_id; d->cpupool->n_dom--; n_dom = d->cpupool->n_dom; d->cpupool = NULL; spin_unlock(&cpupool_lock); cpupool_dprintk("cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n", d->domain_id, cpupool_id, n_dom); return; } /* * called to add a new cpu to pool admin * we add a hotplugged cpu to the cpupool0 to be able to add it to dom0, * unless we are resuming from S3, in which case we put the cpu back * in the cpupool it was in prior to suspend. 
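* (The cpu_suspended masks consulted below are filled in by * cpu_callback() when the system enters SYS_STATE_suspend.)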
*/ static void cpupool_cpu_add(unsigned int cpu) { spin_lock(&cpupool_lock); cpumask_clear_cpu(cpu, &cpupool_locked_cpus); cpumask_set_cpu(cpu, &cpupool_free_cpus); if ( system_state == SYS_STATE_resume ) { struct cpupool **c; for_each_cpupool(c) { if ( cpumask_test_cpu(cpu, (*c)->cpu_suspended ) ) { cpupool_assign_cpu_locked(*c, cpu); cpumask_clear_cpu(cpu, (*c)->cpu_suspended); } } } if ( cpumask_test_cpu(cpu, &cpupool_free_cpus) ) cpupool_assign_cpu_locked(cpupool0, cpu); spin_unlock(&cpupool_lock); } /* * called to remove a cpu from pool admin * the cpu to be removed is locked to avoid removing it from dom0 * returns failure if not in pool0 */ static int cpupool_cpu_remove(unsigned int cpu) { int ret = 0; spin_lock(&cpupool_lock); if ( !cpumask_test_cpu(cpu, cpupool0->cpu_valid)) ret = -EBUSY; else cpumask_set_cpu(cpu, &cpupool_locked_cpus); spin_unlock(&cpupool_lock); return ret; } /* * do cpupool related sysctl operations */ int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op) { int ret; struct cpupool *c; switch ( op->op ) { case XEN_SYSCTL_CPUPOOL_OP_CREATE: { int poolid; poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ? CPUPOOLID_NONE: op->cpupool_id; c = cpupool_create(poolid, op->sched_id, &ret); if ( c != NULL ) { op->cpupool_id = c->cpupool_id; cpupool_put(c); } } break; case XEN_SYSCTL_CPUPOOL_OP_DESTROY: { c = cpupool_get_by_id(op->cpupool_id); ret = -ENOENT; if ( c == NULL ) break; ret = cpupool_destroy(c); cpupool_put(c); } break; case XEN_SYSCTL_CPUPOOL_OP_INFO: { c = cpupool_get_next_by_id(op->cpupool_id); ret = -ENOENT; if ( c == NULL ) break; op->cpupool_id = c->cpupool_id; op->sched_id = c->sched->sched_id; op->n_dom = c->n_dom; ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid); cpupool_put(c); } break; case XEN_SYSCTL_CPUPOOL_OP_ADDCPU: { unsigned cpu; cpu = op->cpu; cpupool_dprintk("cpupool_assign_cpu(pool=%d,cpu=%d)\n", op->cpupool_id, cpu); spin_lock(&cpupool_lock); if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) cpu = cpumask_first(&cpupool_free_cpus); ret = -EINVAL; if ( cpu >= nr_cpu_ids ) goto addcpu_out; ret = -EBUSY; if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) ) goto addcpu_out; c = cpupool_find_by_id(op->cpupool_id); ret = -ENOENT; if ( c == NULL ) goto addcpu_out; ret = cpupool_assign_cpu_locked(c, cpu); addcpu_out: spin_unlock(&cpupool_lock); cpupool_dprintk("cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n", op->cpupool_id, cpu, ret); } break; case XEN_SYSCTL_CPUPOOL_OP_RMCPU: { unsigned cpu; c = cpupool_get_by_id(op->cpupool_id); ret = -ENOENT; if ( c == NULL ) break; cpu = op->cpu; if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY ) cpu = cpumask_last(c->cpu_valid); ret = (cpu < nr_cpu_ids) ? 
cpupool_unassign_cpu(c, cpu) : -EINVAL; cpupool_put(c); } break; case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN: { struct domain *d; ret = rcu_lock_remote_domain_by_id(op->domid, &d); if ( ret ) break; if ( d->cpupool == NULL ) { ret = -EINVAL; rcu_unlock_domain(d); break; } if ( op->cpupool_id == d->cpupool->cpupool_id ) { ret = 0; rcu_unlock_domain(d); break; } cpupool_dprintk("cpupool move_domain(dom=%d)->pool=%d\n", d->domain_id, op->cpupool_id); ret = -ENOENT; spin_lock(&cpupool_lock); c = cpupool_find_by_id(op->cpupool_id); if ( (c != NULL) && cpumask_weight(c->cpu_valid) ) { d->cpupool->n_dom--; ret = sched_move_domain(d, c); if ( ret ) d->cpupool->n_dom++; else c->n_dom++; } spin_unlock(&cpupool_lock); cpupool_dprintk("cpupool move_domain(dom=%d)->pool=%d ret %d\n", d->domain_id, op->cpupool_id, ret); rcu_unlock_domain(d); } break; case XEN_SYSCTL_CPUPOOL_OP_FREEINFO: { ret = cpumask_to_xenctl_bitmap( &op->cpumap, &cpupool_free_cpus); } break; default: ret = -ENOSYS; break; } return ret; } void dump_runq(unsigned char key) { unsigned long flags; s_time_t now = NOW(); struct cpupool **c; spin_lock(&cpupool_lock); local_irq_save(flags); printk("sched_smt_power_savings: %s\n", sched_smt_power_savings? "enabled":"disabled"); printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now); printk("Idle cpupool:\n"); schedule_dump(NULL); for_each_cpupool(c) { printk("Cpupool %d:\n", (*c)->cpupool_id); schedule_dump(*c); } local_irq_restore(flags); spin_unlock(&cpupool_lock); } static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; if ( system_state == SYS_STATE_suspend ) { struct cpupool **c; for_each_cpupool(c) if ( cpumask_test_cpu(cpu, (*c)->cpu_valid ) ) cpumask_set_cpu(cpu, (*c)->cpu_suspended); } switch ( action ) { case CPU_DOWN_FAILED: case CPU_ONLINE: cpupool_cpu_add(cpu); break; case CPU_DOWN_PREPARE: rc = cpupool_cpu_remove(cpu); break; default: break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; static int __init cpupool_presmp_init(void) { int err; void *cpu = (void *)(long)smp_processor_id(); cpupool0 = cpupool_create(0, 0, &err); BUG_ON(cpupool0 == NULL); cpupool_put(cpupool0); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; } presmp_initcall(cpupool_presmp_init); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/timer.c0000664000175000017500000003713512307313555014022 0ustar smbsmb/****************************************************************************** * timer.c * * Copyright (c) 2002-2003 Rolf Neugebauer * Copyright (c) 2002-2005 K A Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* We program the time hardware this far behind the closest deadline. */ static unsigned int timer_slop __read_mostly = 50000; /* 50 us */ integer_param("timer_slop", timer_slop); struct timers { spinlock_t lock; struct timer **heap; struct timer *list; struct timer *running; struct list_head inactive; } __cacheline_aligned; static DEFINE_PER_CPU(struct timers, timers); /* Protects lock-free access to per-timer cpu field against cpu offlining. 
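* timer_lock() and migrate_timer() take this RCU read lock while * sampling timer->cpu, and re-check the field after acquiring the * per-cpu timers lock; kill_timer() is what moves it to * TIMER_CPU_status_killed.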
*/ static DEFINE_RCU_READ_LOCK(timer_cpu_read_lock); DEFINE_PER_CPU(s_time_t, timer_deadline); /**************************************************************************** * HEAP OPERATIONS. */ #define GET_HEAP_SIZE(_h) ((int)(((u16 *)(_h))[0])) #define SET_HEAP_SIZE(_h,_v) (((u16 *)(_h))[0] = (u16)(_v)) #define GET_HEAP_LIMIT(_h) ((int)(((u16 *)(_h))[1])) #define SET_HEAP_LIMIT(_h,_v) (((u16 *)(_h))[1] = (u16)(_v)) /* Sink down element @pos of @heap. */ static void down_heap(struct timer **heap, int pos) { int sz = GET_HEAP_SIZE(heap), nxt; struct timer *t = heap[pos]; while ( (nxt = (pos << 1)) <= sz ) { if ( ((nxt+1) <= sz) && (heap[nxt+1]->expires < heap[nxt]->expires) ) nxt++; if ( heap[nxt]->expires > t->expires ) break; heap[pos] = heap[nxt]; heap[pos]->heap_offset = pos; pos = nxt; } heap[pos] = t; t->heap_offset = pos; } /* Float element @pos up @heap. */ static void up_heap(struct timer **heap, int pos) { struct timer *t = heap[pos]; while ( (pos > 1) && (t->expires < heap[pos>>1]->expires) ) { heap[pos] = heap[pos>>1]; heap[pos]->heap_offset = pos; pos >>= 1; } heap[pos] = t; t->heap_offset = pos; } /* Delete @t from @heap. Return TRUE if new top of heap. */ static int remove_from_heap(struct timer **heap, struct timer *t) { int sz = GET_HEAP_SIZE(heap); int pos = t->heap_offset; if ( unlikely(pos == sz) ) { SET_HEAP_SIZE(heap, sz-1); goto out; } heap[pos] = heap[sz]; heap[pos]->heap_offset = pos; SET_HEAP_SIZE(heap, --sz); if ( (pos > 1) && (heap[pos]->expires < heap[pos>>1]->expires) ) up_heap(heap, pos); else down_heap(heap, pos); out: return (pos == 1); } /* Add new entry @t to @heap. Return TRUE if new top of heap. */ static int add_to_heap(struct timer **heap, struct timer *t) { int sz = GET_HEAP_SIZE(heap); /* Fail if the heap is full. */ if ( unlikely(sz == GET_HEAP_LIMIT(heap)) ) return 0; SET_HEAP_SIZE(heap, ++sz); heap[sz] = t; t->heap_offset = sz; up_heap(heap, sz); return (t->heap_offset == 1); } /**************************************************************************** * LINKED LIST OPERATIONS. */ static int remove_from_list(struct timer **pprev, struct timer *t) { struct timer *curr, **_pprev = pprev; while ( (curr = *_pprev) != t ) _pprev = &curr->list_next; *_pprev = t->list_next; return (_pprev == pprev); } static int add_to_list(struct timer **pprev, struct timer *t) { struct timer *curr, **_pprev = pprev; while ( ((curr = *_pprev) != NULL) && (curr->expires <= t->expires) ) _pprev = &curr->list_next; t->list_next = curr; *_pprev = t; return (_pprev == pprev); } /**************************************************************************** * TIMER OPERATIONS. */ static int remove_entry(struct timer *t) { struct timers *timers = &per_cpu(timers, t->cpu); int rc; switch ( t->status ) { case TIMER_STATUS_in_heap: rc = remove_from_heap(timers->heap, t); break; case TIMER_STATUS_in_list: rc = remove_from_list(&timers->list, t); break; default: rc = 0; BUG(); } t->status = TIMER_STATUS_invalid; return rc; } static int add_entry(struct timer *t) { struct timers *timers = &per_cpu(timers, t->cpu); int rc; ASSERT(t->status == TIMER_STATUS_invalid); /* Try to add to heap. t->heap_offset indicates whether we succeed. */ t->heap_offset = 0; t->status = TIMER_STATUS_in_heap; rc = add_to_heap(timers->heap, t); if ( t->heap_offset != 0 ) return rc; /* Fall back to adding to the slower linked list. 
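* Entries parked on this overflow list are pulled back into the heap by * timer_softirq_action(), which also tries to grow the heap (new limit * ((old_limit + 1) << 4) - 1) whenever it finds the list non-empty.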
*/ t->status = TIMER_STATUS_in_list; return add_to_list(&timers->list, t); } static inline void activate_timer(struct timer *timer) { ASSERT(timer->status == TIMER_STATUS_inactive); timer->status = TIMER_STATUS_invalid; list_del(&timer->inactive); if ( add_entry(timer) ) cpu_raise_softirq(timer->cpu, TIMER_SOFTIRQ); } static inline void deactivate_timer(struct timer *timer) { if ( remove_entry(timer) ) cpu_raise_softirq(timer->cpu, TIMER_SOFTIRQ); timer->status = TIMER_STATUS_inactive; list_add(&timer->inactive, &per_cpu(timers, timer->cpu).inactive); } static inline bool_t timer_lock(struct timer *timer) { unsigned int cpu; rcu_read_lock(&timer_cpu_read_lock); for ( ; ; ) { cpu = read_atomic(&timer->cpu); if ( unlikely(cpu == TIMER_CPU_status_killed) ) { rcu_read_unlock(&timer_cpu_read_lock); return 0; } spin_lock(&per_cpu(timers, cpu).lock); if ( likely(timer->cpu == cpu) ) break; spin_unlock(&per_cpu(timers, cpu).lock); } rcu_read_unlock(&timer_cpu_read_lock); return 1; } #define timer_lock_irqsave(t, flags) ({ \ bool_t __x; \ local_irq_save(flags); \ if ( !(__x = timer_lock(t)) ) \ local_irq_restore(flags); \ __x; \ }) static inline void timer_unlock(struct timer *timer) { spin_unlock(&per_cpu(timers, timer->cpu).lock); } #define timer_unlock_irqrestore(t, flags) ({ \ timer_unlock(t); \ local_irq_restore(flags); \ }) static bool_t active_timer(struct timer *timer) { ASSERT(timer->status >= TIMER_STATUS_inactive); ASSERT(timer->status <= TIMER_STATUS_in_list); return (timer->status >= TIMER_STATUS_in_heap); } void init_timer( struct timer *timer, void (*function)(void *), void *data, unsigned int cpu) { unsigned long flags; memset(timer, 0, sizeof(*timer)); timer->function = function; timer->data = data; write_atomic(&timer->cpu, cpu); timer->status = TIMER_STATUS_inactive; if ( !timer_lock_irqsave(timer, flags) ) BUG(); list_add(&timer->inactive, &per_cpu(timers, cpu).inactive); timer_unlock_irqrestore(timer, flags); } void set_timer(struct timer *timer, s_time_t expires) { unsigned long flags; if ( !timer_lock_irqsave(timer, flags) ) return; if ( active_timer(timer) ) deactivate_timer(timer); timer->expires = expires; activate_timer(timer); timer_unlock_irqrestore(timer, flags); } void stop_timer(struct timer *timer) { unsigned long flags; if ( !timer_lock_irqsave(timer, flags) ) return; if ( active_timer(timer) ) deactivate_timer(timer); timer_unlock_irqrestore(timer, flags); } void migrate_timer(struct timer *timer, unsigned int new_cpu) { unsigned int old_cpu; bool_t active; unsigned long flags; rcu_read_lock(&timer_cpu_read_lock); for ( ; ; ) { old_cpu = read_atomic(&timer->cpu); if ( (old_cpu == new_cpu) || (old_cpu == TIMER_CPU_status_killed) ) { rcu_read_unlock(&timer_cpu_read_lock); return; } if ( old_cpu < new_cpu ) { spin_lock_irqsave(&per_cpu(timers, old_cpu).lock, flags); spin_lock(&per_cpu(timers, new_cpu).lock); } else { spin_lock_irqsave(&per_cpu(timers, new_cpu).lock, flags); spin_lock(&per_cpu(timers, old_cpu).lock); } if ( likely(timer->cpu == old_cpu) ) break; spin_unlock(&per_cpu(timers, old_cpu).lock); spin_unlock_irqrestore(&per_cpu(timers, new_cpu).lock, flags); } rcu_read_unlock(&timer_cpu_read_lock); active = active_timer(timer); if ( active ) deactivate_timer(timer); list_del(&timer->inactive); write_atomic(&timer->cpu, new_cpu); list_add(&timer->inactive, &per_cpu(timers, new_cpu).inactive); if ( active ) activate_timer(timer); spin_unlock(&per_cpu(timers, old_cpu).lock); spin_unlock_irqrestore(&per_cpu(timers, new_cpu).lock, flags); } void kill_timer(struct 
timer *timer) { unsigned int old_cpu, cpu; unsigned long flags; BUG_ON(this_cpu(timers).running == timer); if ( !timer_lock_irqsave(timer, flags) ) return; if ( active_timer(timer) ) deactivate_timer(timer); list_del(&timer->inactive); timer->status = TIMER_STATUS_killed; old_cpu = timer->cpu; write_atomic(&timer->cpu, TIMER_CPU_status_killed); spin_unlock_irqrestore(&per_cpu(timers, old_cpu).lock, flags); for_each_online_cpu ( cpu ) while ( per_cpu(timers, cpu).running == timer ) cpu_relax(); } static void execute_timer(struct timers *ts, struct timer *t) { void (*fn)(void *) = t->function; void *data = t->data; t->status = TIMER_STATUS_inactive; list_add(&t->inactive, &ts->inactive); ts->running = t; spin_unlock_irq(&ts->lock); (*fn)(data); spin_lock_irq(&ts->lock); ts->running = NULL; } static void timer_softirq_action(void) { struct timer *t, **heap, *next; struct timers *ts; s_time_t now, deadline; ts = &this_cpu(timers); heap = ts->heap; /* If we overflowed the heap, try to allocate a larger heap. */ if ( unlikely(ts->list != NULL) ) { /* old_limit == (2^n)-1; new_limit == (2^(n+4))-1 */ int old_limit = GET_HEAP_LIMIT(heap); int new_limit = ((old_limit + 1) << 4) - 1; struct timer **newheap = xmalloc_array(struct timer *, new_limit + 1); if ( newheap != NULL ) { spin_lock_irq(&ts->lock); memcpy(newheap, heap, (old_limit + 1) * sizeof(*heap)); SET_HEAP_LIMIT(newheap, new_limit); ts->heap = newheap; spin_unlock_irq(&ts->lock); if ( old_limit != 0 ) xfree(heap); heap = newheap; } } spin_lock_irq(&ts->lock); now = NOW(); /* Execute ready heap timers. */ while ( (GET_HEAP_SIZE(heap) != 0) && ((t = heap[1])->expires < now) ) { remove_from_heap(heap, t); execute_timer(ts, t); } /* Execute ready list timers. */ while ( ((t = ts->list) != NULL) && (t->expires < now) ) { ts->list = t->list_next; execute_timer(ts, t); } /* Try to move timers from linked list to more efficient heap. */ next = ts->list; ts->list = NULL; while ( unlikely((t = next) != NULL) ) { next = t->list_next; t->status = TIMER_STATUS_invalid; add_entry(t); } /* Find earliest deadline from head of linked list and top of heap. */ deadline = STIME_MAX; if ( GET_HEAP_SIZE(heap) != 0 ) deadline = heap[1]->expires; if ( (ts->list != NULL) && (ts->list->expires < deadline) ) deadline = ts->list->expires; this_cpu(timer_deadline) = (deadline == STIME_MAX) ? 
0 : deadline + timer_slop; if ( !reprogram_timer(this_cpu(timer_deadline)) ) raise_softirq(TIMER_SOFTIRQ); spin_unlock_irq(&ts->lock); } s_time_t align_timer(s_time_t firsttick, uint64_t period) { if ( !period ) return firsttick; return firsttick + (period - 1) - ((firsttick - 1) % period); } static void dump_timer(struct timer *t, s_time_t now) { printk(" ex=%12"PRId64"us timer=%p cb=%ps(%p)\n", (t->expires - now) / 1000, t, t->function, t->data); } static void dump_timerq(unsigned char key) { struct timer *t; struct timers *ts; unsigned long flags; s_time_t now = NOW(); int i, j; printk("Dumping timer queues:\n"); for_each_online_cpu( i ) { ts = &per_cpu(timers, i); printk("CPU%02d:\n", i); spin_lock_irqsave(&ts->lock, flags); for ( j = 1; j <= GET_HEAP_SIZE(ts->heap); j++ ) dump_timer(ts->heap[j], now); for ( t = ts->list, j = 0; t != NULL; t = t->list_next, j++ ) dump_timer(t, now); spin_unlock_irqrestore(&ts->lock, flags); } } static struct keyhandler dump_timerq_keyhandler = { .diagnostic = 1, .u.fn = dump_timerq, .desc = "dump timer queues" }; static void migrate_timers_from_cpu(unsigned int old_cpu) { unsigned int new_cpu = cpumask_any(&cpu_online_map); struct timers *old_ts, *new_ts; struct timer *t; bool_t notify = 0; ASSERT(!cpu_online(old_cpu) && cpu_online(new_cpu)); old_ts = &per_cpu(timers, old_cpu); new_ts = &per_cpu(timers, new_cpu); if ( old_cpu < new_cpu ) { spin_lock_irq(&old_ts->lock); spin_lock(&new_ts->lock); } else { spin_lock_irq(&new_ts->lock); spin_lock(&old_ts->lock); } while ( (t = GET_HEAP_SIZE(old_ts->heap) ? old_ts->heap[1] : old_ts->list) != NULL ) { remove_entry(t); write_atomic(&t->cpu, new_cpu); notify |= add_entry(t); } while ( !list_empty(&old_ts->inactive) ) { t = list_entry(old_ts->inactive.next, struct timer, inactive); list_del(&t->inactive); write_atomic(&t->cpu, new_cpu); list_add(&t->inactive, &new_ts->inactive); } spin_unlock(&old_ts->lock); spin_unlock_irq(&new_ts->lock); if ( notify ) cpu_raise_softirq(new_cpu, TIMER_SOFTIRQ); } static struct timer *dummy_heap; static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct timers *ts = &per_cpu(timers, cpu); switch ( action ) { case CPU_UP_PREPARE: INIT_LIST_HEAD(&ts->inactive); spin_lock_init(&ts->lock); ts->heap = &dummy_heap; break; case CPU_UP_CANCELED: case CPU_DEAD: migrate_timers_from_cpu(cpu); break; default: break; } return NOTIFY_DONE; } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback, .priority = 99 }; void __init timer_init(void) { void *cpu = (void *)(long)smp_processor_id(); open_softirq(TIMER_SOFTIRQ, timer_softirq_action); /* * All CPUs initially share an empty dummy heap. Only those CPUs that * are brought online will be dynamically allocated their own heap. */ SET_HEAP_SIZE(&dummy_heap, 0); SET_HEAP_LIMIT(&dummy_heap, 0); cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); register_cpu_notifier(&cpu_nfb); register_keyhandler('a', &dump_timerq_keyhandler); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/wait.c0000664000175000017500000001436012307313555013641 0ustar smbsmb/****************************************************************************** * wait.c * * Sleep in hypervisor context for some event to occur. 
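* * A vcpu queues itself with prepare_to_wait(), pauses, and is later * released by one of the wake_up_*() helpers; on x86 the current * hypervisor stack is snapshotted into a per-vcpu page so that the * context can be resumed in check_wakeup_from_wait().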
* * Copyright (c) 2010, Keir Fraser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include struct waitqueue_vcpu { struct list_head list; struct vcpu *vcpu; #ifdef CONFIG_X86 /* * Xen/x86 does not have per-vcpu hypervisor stacks. So we must save the * hypervisor context before sleeping (descheduling), setjmp/longjmp-style. */ void *esp; char *stack; cpumask_t saved_affinity; unsigned int wakeup_cpu; #endif }; int init_waitqueue_vcpu(struct vcpu *v) { struct waitqueue_vcpu *wqv; wqv = xzalloc(struct waitqueue_vcpu); if ( wqv == NULL ) return -ENOMEM; #ifdef CONFIG_X86 wqv->stack = alloc_xenheap_page(); if ( wqv->stack == NULL ) { xfree(wqv); return -ENOMEM; } #endif INIT_LIST_HEAD(&wqv->list); wqv->vcpu = v; v->waitqueue_vcpu = wqv; return 0; } void destroy_waitqueue_vcpu(struct vcpu *v) { struct waitqueue_vcpu *wqv; wqv = v->waitqueue_vcpu; if ( wqv == NULL ) return; BUG_ON(!list_empty(&wqv->list)); #ifdef CONFIG_X86 free_xenheap_page(wqv->stack); #endif xfree(wqv); v->waitqueue_vcpu = NULL; } void init_waitqueue_head(struct waitqueue_head *wq) { spin_lock_init(&wq->lock); INIT_LIST_HEAD(&wq->list); } void destroy_waitqueue_head(struct waitqueue_head *wq) { wake_up_all(wq); } void wake_up_nr(struct waitqueue_head *wq, unsigned int nr) { struct waitqueue_vcpu *wqv; spin_lock(&wq->lock); while ( !list_empty(&wq->list) && nr-- ) { wqv = list_entry(wq->list.next, struct waitqueue_vcpu, list); list_del_init(&wqv->list); vcpu_unpause(wqv->vcpu); put_domain(wqv->vcpu->domain); } spin_unlock(&wq->lock); } void wake_up_one(struct waitqueue_head *wq) { wake_up_nr(wq, 1); } void wake_up_all(struct waitqueue_head *wq) { wake_up_nr(wq, UINT_MAX); } #ifdef CONFIG_X86 static void __prepare_to_wait(struct waitqueue_vcpu *wqv) { struct cpu_info *cpu_info = get_cpu_info(); struct vcpu *curr = current; unsigned long dummy; u32 entry_vector = cpu_info->guest_cpu_user_regs.entry_vector; cpu_info->guest_cpu_user_regs.entry_vector &= ~TRAP_regs_partial; ASSERT(wqv->esp == 0); /* Save current VCPU affinity; force wakeup on *this* CPU only. 
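* check_wakeup_from_wait() insists on resuming on wqv->wakeup_cpu and * re-pins the vcpu if it wakes up elsewhere; __finish_wait() restores * the saved affinity afterwards.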
*/ wqv->wakeup_cpu = smp_processor_id(); cpumask_copy(&wqv->saved_affinity, curr->cpu_affinity); if ( vcpu_set_affinity(curr, cpumask_of(wqv->wakeup_cpu)) ) { gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n"); domain_crash_synchronous(); } asm volatile ( "push %%rax; push %%rbx; push %%rdx; " "push %%rbp; push %%r8; push %%r9; push %%r10; push %%r11; " "push %%r12; push %%r13; push %%r14; push %%r15; call 1f; " "1: addq $2f-1b,(%%rsp); sub %%esp,%%ecx; cmp %3,%%ecx; ja 3f; " "mov %%rsp,%%rsi; 2: rep movsb; mov %%rsp,%%rsi; 3: pop %%rax; " "pop %%r15; pop %%r14; pop %%r13; pop %%r12; " "pop %%r11; pop %%r10; pop %%r9; pop %%r8; " "pop %%rbp; pop %%rdx; pop %%rbx; pop %%rax" : "=&S" (wqv->esp), "=&c" (dummy), "=&D" (dummy) : "i" (PAGE_SIZE), "0" (0), "1" (cpu_info), "2" (wqv->stack) : "memory" ); if ( unlikely(wqv->esp == 0) ) { gdprintk(XENLOG_ERR, "Stack too large in %s\n", __FUNCTION__); domain_crash_synchronous(); } cpu_info->guest_cpu_user_regs.entry_vector = entry_vector; } static void __finish_wait(struct waitqueue_vcpu *wqv) { wqv->esp = NULL; (void)vcpu_set_affinity(current, &wqv->saved_affinity); } void check_wakeup_from_wait(void) { struct waitqueue_vcpu *wqv = current->waitqueue_vcpu; ASSERT(list_empty(&wqv->list)); if ( likely(wqv->esp == NULL) ) return; /* Check if we woke up on the wrong CPU. */ if ( unlikely(smp_processor_id() != wqv->wakeup_cpu) ) { /* Re-set VCPU affinity and re-enter the scheduler. */ struct vcpu *curr = current; cpumask_copy(&wqv->saved_affinity, curr->cpu_affinity); if ( vcpu_set_affinity(curr, cpumask_of(wqv->wakeup_cpu)) ) { gdprintk(XENLOG_ERR, "Unable to set vcpu affinity\n"); domain_crash_synchronous(); } wait(); /* takes us back into the scheduler */ } asm volatile ( "mov %1,%%"__OP"sp; jmp *(%0)" : : "S" (wqv->stack), "D" (wqv->esp), "c" ((char *)get_cpu_info() - (char *)wqv->esp) : "memory" ); } #else /* !CONFIG_X86 */ #define __prepare_to_wait(wqv) ((void)0) #define __finish_wait(wqv) ((void)0) #endif void prepare_to_wait(struct waitqueue_head *wq) { struct vcpu *curr = current; struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu; ASSERT_NOT_IN_ATOMIC(); __prepare_to_wait(wqv); ASSERT(list_empty(&wqv->list)); spin_lock(&wq->lock); list_add_tail(&wqv->list, &wq->list); vcpu_pause_nosync(curr); get_knownalive_domain(curr->domain); spin_unlock(&wq->lock); } void finish_wait(struct waitqueue_head *wq) { struct vcpu *curr = current; struct waitqueue_vcpu *wqv = curr->waitqueue_vcpu; __finish_wait(wqv); if ( list_empty(&wqv->list) ) return; spin_lock(&wq->lock); if ( !list_empty(&wqv->list) ) { list_del_init(&wqv->list); vcpu_unpause(curr); put_domain(curr->domain); } spin_unlock(&wq->lock); } xen-4.4.0/xen/common/gcov/0000775000175000017500000000000012307313555013463 5ustar smbsmbxen-4.4.0/xen/common/gcov/Makefile0000664000175000017500000000002112307313555015114 0ustar smbsmbobj-y += gcov.o xen-4.4.0/xen/common/gcov/gcov.c0000664000175000017500000001247112307313555014572 0ustar smbsmb/* * This code maintains a list of active profiling data structures. * * Copyright IBM Corp. 2009 * Author(s): Peter Oberparleiter * * Uses gcc-internal data definitions. * Based on the gcov-kernel patch by: * Hubertus Franke * Nigel Hinds * Rajan Ravindran * Peter Oberparleiter * Paul Larson */ #include #include #include #include #include #include #include #include #include static struct gcov_info *info_list; /* * __gcov_init is called by gcc-generated constructor code for each object * file compiled with -fprofile-arcs. 
* * Although this function is called only during initialization is called from * a .text section which is still present after initialization so not declare * as __init. */ void __gcov_init(struct gcov_info *info) { /* add new profiling data structure to list */ info->next = info_list; info_list = info; } /* * These functions may be referenced by gcc-generated profiling code but serve * no function for Xen. */ void __gcov_flush(void) { /* Unused. */ } void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) { /* Unused. */ } void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) { /* Unused. */ } void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) { /* Unused. */ } static inline int counter_active(const struct gcov_info *info, unsigned int type) { return (1 << type) & info->ctr_mask; } typedef struct write_iter_t { XEN_GUEST_HANDLE(uint8) ptr; int real; uint32_t write_offset; } write_iter_t; static int write_raw(struct write_iter_t *iter, const void *data, size_t data_len) { if ( iter->real && copy_to_guest_offset(iter->ptr, iter->write_offset, (const unsigned char *) data, data_len) ) return -EFAULT; iter->write_offset += data_len; return 0; } #define chk(v) do { ret=(v); if ( ret ) return ret; } while(0) static inline int write32(write_iter_t *iter, uint32_t val) { return write_raw(iter, &val, sizeof(val)); } static int write_string(write_iter_t *iter, const char *s) { int ret; size_t len = strlen(s); chk(write32(iter, len)); return write_raw(iter, s, len); } static inline int next_type(const struct gcov_info *info, int *type) { while ( ++*type < XENCOV_COUNTERS && !counter_active(info, *type) ) continue; return *type; } static inline void align_iter(write_iter_t *iter) { iter->write_offset = (iter->write_offset + sizeof(uint64_t) - 1) & -sizeof(uint64_t); } static int write_gcov(write_iter_t *iter) { struct gcov_info *info; int ret; /* reset offset */ iter->write_offset = 0; /* dump all files */ for ( info = info_list ; info; info = info->next ) { const struct gcov_ctr_info *ctr; int type; size_t size_fn = sizeof(struct gcov_fn_info); align_iter(iter); chk(write32(iter, XENCOV_TAG_FILE)); chk(write32(iter, info->version)); chk(write32(iter, info->stamp)); chk(write_string(iter, info->filename)); /* dump counters */ ctr = info->counts; for ( type = -1; next_type(info, &type) < XENCOV_COUNTERS; ++ctr ) { align_iter(iter); chk(write32(iter, XENCOV_TAG_COUNTER(type))); chk(write32(iter, ctr->num)); chk(write_raw(iter, ctr->values, ctr->num * sizeof(ctr->values[0]))); size_fn += sizeof(unsigned); } /* dump all functions together */ align_iter(iter); chk(write32(iter, XENCOV_TAG_FUNC)); chk(write32(iter, info->n_functions)); chk(write_raw(iter, info->functions, info->n_functions * size_fn)); } /* stop tag */ align_iter(iter); chk(write32(iter, XENCOV_TAG_END)); return 0; } static int reset_counters(void) { struct gcov_info *info; for ( info = info_list ; info; info = info->next ) { const struct gcov_ctr_info *ctr; int type; /* reset counters */ ctr = info->counts; for ( type = -1; next_type(info, &type) < XENCOV_COUNTERS; ++ctr ) memset(ctr->values, 0, ctr->num * sizeof(ctr->values[0])); } return 0; } int sysctl_coverage_op(xen_sysctl_coverage_op_t *op) { int ret = -EINVAL; write_iter_t iter; switch ( op->cmd ) { case XEN_SYSCTL_COVERAGE_get_total_size: iter.real = 0; write_gcov(&iter); op->u.total_size = iter.write_offset; ret = 0; break; case XEN_SYSCTL_COVERAGE_read_and_reset: case XEN_SYSCTL_COVERAGE_read: iter.ptr = op->u.raw_info; iter.real = 
1; ret = write_gcov(&iter); if ( ret || op->cmd != XEN_SYSCTL_COVERAGE_read_and_reset ) break; /* fall through */ case XEN_SYSCTL_COVERAGE_reset: ret = reset_counters(); break; } return ret; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/unlzma.c0000664000175000017500000003667412307313555014217 0ustar smbsmb/* Lzma decompressor for Linux kernel. Shamelessly snarfed * from busybox 1.1.1 * * Linux kernel adaptation * Copyright (C) 2006 Alain < alain@knaff.lu > * * Based on small lzma deflate implementation/Small range coder * implementation for lzma. * Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org > * * Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/) * Copyright (C) 1999-2005 Igor Pavlov * * Copyrights of the parts, see headers below. * * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "decompress.h" #define MIN(a, b) (((a) < (b)) ? (a) : (b)) static long long INIT read_int(unsigned char *ptr, int size) { int i; long long ret = 0; for (i = 0; i < size; i++) ret = (ret << 8) | ptr[size-i-1]; return ret; } #define ENDIAN_CONVERT(x) \ x = (typeof(x))read_int((unsigned char *)&x, sizeof(x)) /* Small range coder implementation for lzma. 
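* rc_init()/rc_init_code() seed the coder, rc_normalize() refills it one * byte at a time through rc_read(), and rc_get_bit()/rc_bit_tree_decode() * are the primitives the LZMA decoder below is built on.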
* Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org > * * Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/) * Copyright (c) 1999-2005 Igor Pavlov */ #ifdef __XEN__ #include #endif #define LZMA_IOBUF_SIZE 0x10000 struct rc { int (*fill)(void*, unsigned int); uint8_t *ptr; uint8_t *buffer; uint8_t *buffer_end; int buffer_size; uint32_t code; uint32_t range; uint32_t bound; void (*error)(const char *); }; #define RC_TOP_BITS 24 #define RC_MOVE_BITS 5 #define RC_MODEL_TOTAL_BITS 11 static int INIT nofill(void *buffer, unsigned int len) { return -1; } /* Called twice: once at startup and once in rc_normalize() */ static void INIT rc_read(struct rc *rc) { rc->buffer_size = rc->fill((char *)rc->buffer, LZMA_IOBUF_SIZE); if (rc->buffer_size <= 0) rc->error("unexpected EOF"); rc->ptr = rc->buffer; rc->buffer_end = rc->buffer + rc->buffer_size; } /* Called once */ static inline void INIT rc_init(struct rc *rc, int (*fill)(void*, unsigned int), unsigned char *buffer, int buffer_size) { if (fill) rc->fill = fill; else rc->fill = nofill; rc->buffer = (uint8_t *)buffer; rc->buffer_size = buffer_size; rc->buffer_end = rc->buffer + rc->buffer_size; rc->ptr = rc->buffer; rc->code = 0; rc->range = 0xFFFFFFFF; } static inline void INIT rc_init_code(struct rc *rc) { int i; for (i = 0; i < 5; i++) { if (rc->ptr >= rc->buffer_end) rc_read(rc); rc->code = (rc->code << 8) | *rc->ptr++; } } /* Called twice, but one callsite is in inline'd rc_is_bit_0_helper() */ static void INIT rc_do_normalize(struct rc *rc) { if (rc->ptr >= rc->buffer_end) rc_read(rc); rc->range <<= 8; rc->code = (rc->code << 8) | *rc->ptr++; } static inline void INIT rc_normalize(struct rc *rc) { if (rc->range < (1 << RC_TOP_BITS)) rc_do_normalize(rc); } /* Called 9 times */ /* Why rc_is_bit_0_helper exists? *Because we want to always expose (rc->code < rc->bound) to optimizer */ static inline uint32_t INIT rc_is_bit_0_helper(struct rc *rc, uint16_t *p) { rc_normalize(rc); rc->bound = *p * (rc->range >> RC_MODEL_TOTAL_BITS); return rc->bound; } static inline int INIT rc_is_bit_0(struct rc *rc, uint16_t *p) { uint32_t t = rc_is_bit_0_helper(rc, p); return rc->code < t; } /* Called ~10 times, but very small, thus inlined */ static inline void INIT rc_update_bit_0(struct rc *rc, uint16_t *p) { rc->range = rc->bound; *p += ((1 << RC_MODEL_TOTAL_BITS) - *p) >> RC_MOVE_BITS; } static inline void rc_update_bit_1(struct rc *rc, uint16_t *p) { rc->range -= rc->bound; rc->code -= rc->bound; *p -= *p >> RC_MOVE_BITS; } /* Called 4 times in unlzma loop */ static int INIT rc_get_bit(struct rc *rc, uint16_t *p, int *symbol) { if (rc_is_bit_0(rc, p)) { rc_update_bit_0(rc, p); *symbol *= 2; return 0; } else { rc_update_bit_1(rc, p); *symbol = *symbol * 2 + 1; return 1; } } /* Called once */ static inline int INIT rc_direct_bit(struct rc *rc) { rc_normalize(rc); rc->range >>= 1; if (rc->code >= rc->range) { rc->code -= rc->range; return 1; } return 0; } /* Called twice */ static inline void INIT rc_bit_tree_decode(struct rc *rc, uint16_t *p, int num_levels, int *symbol) { int i = num_levels; *symbol = 1; while (i--) rc_get_bit(rc, p + *symbol, symbol); *symbol -= 1 << num_levels; } /* * Small lzma deflate implementation. 
* Copyright (C) 2006 Aurelien Jacobs < aurel@gnuage.org > * * Based on LzmaDecode.c from the LZMA SDK 4.22 (http://www.7-zip.org/) * Copyright (C) 1999-2005 Igor Pavlov */ struct lzma_header { uint8_t pos; uint32_t dict_size; uint64_t dst_size; } __attribute__ ((packed)) ; #define LZMA_BASE_SIZE 1846 #define LZMA_LIT_SIZE 768 #define LZMA_NUM_POS_BITS_MAX 4 #define LZMA_LEN_NUM_LOW_BITS 3 #define LZMA_LEN_NUM_MID_BITS 3 #define LZMA_LEN_NUM_HIGH_BITS 8 #define LZMA_LEN_CHOICE 0 #define LZMA_LEN_CHOICE_2 (LZMA_LEN_CHOICE + 1) #define LZMA_LEN_LOW (LZMA_LEN_CHOICE_2 + 1) #define LZMA_LEN_MID (LZMA_LEN_LOW \ + (1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_LOW_BITS))) #define LZMA_LEN_HIGH (LZMA_LEN_MID \ +(1 << (LZMA_NUM_POS_BITS_MAX + LZMA_LEN_NUM_MID_BITS))) #define LZMA_NUM_LEN_PROBS (LZMA_LEN_HIGH + (1 << LZMA_LEN_NUM_HIGH_BITS)) #define LZMA_NUM_STATES 12 #define LZMA_NUM_LIT_STATES 7 #define LZMA_START_POS_MODEL_INDEX 4 #define LZMA_END_POS_MODEL_INDEX 14 #define LZMA_NUM_FULL_DISTANCES (1 << (LZMA_END_POS_MODEL_INDEX >> 1)) #define LZMA_NUM_POS_SLOT_BITS 6 #define LZMA_NUM_LEN_TO_POS_STATES 4 #define LZMA_NUM_ALIGN_BITS 4 #define LZMA_MATCH_MIN_LEN 2 #define LZMA_IS_MATCH 0 #define LZMA_IS_REP (LZMA_IS_MATCH + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX)) #define LZMA_IS_REP_G0 (LZMA_IS_REP + LZMA_NUM_STATES) #define LZMA_IS_REP_G1 (LZMA_IS_REP_G0 + LZMA_NUM_STATES) #define LZMA_IS_REP_G2 (LZMA_IS_REP_G1 + LZMA_NUM_STATES) #define LZMA_IS_REP_0_LONG (LZMA_IS_REP_G2 + LZMA_NUM_STATES) #define LZMA_POS_SLOT (LZMA_IS_REP_0_LONG \ + (LZMA_NUM_STATES << LZMA_NUM_POS_BITS_MAX)) #define LZMA_SPEC_POS (LZMA_POS_SLOT \ +(LZMA_NUM_LEN_TO_POS_STATES << LZMA_NUM_POS_SLOT_BITS)) #define LZMA_ALIGN (LZMA_SPEC_POS \ + LZMA_NUM_FULL_DISTANCES - LZMA_END_POS_MODEL_INDEX) #define LZMA_LEN_CODER (LZMA_ALIGN + (1 << LZMA_NUM_ALIGN_BITS)) #define LZMA_REP_LEN_CODER (LZMA_LEN_CODER + LZMA_NUM_LEN_PROBS) #define LZMA_LITERAL (LZMA_REP_LEN_CODER + LZMA_NUM_LEN_PROBS) struct writer { uint8_t *buffer; uint8_t previous_byte; size_t buffer_pos; int bufsize; size_t global_pos; int(*flush)(void*, unsigned int); struct lzma_header *header; }; struct cstate { int state; uint32_t rep0, rep1, rep2, rep3; }; static inline size_t INIT get_pos(struct writer *wr) { return wr->global_pos + wr->buffer_pos; } static inline uint8_t INIT peek_old_byte(struct writer *wr, uint32_t offs) { if (!wr->flush) { int32_t pos; while (offs > wr->header->dict_size) offs -= wr->header->dict_size; pos = wr->buffer_pos - offs; return wr->buffer[pos]; } else { uint32_t pos = wr->buffer_pos - offs; while (pos >= wr->header->dict_size) pos += wr->header->dict_size; return wr->buffer[pos]; } } static inline int INIT write_byte(struct writer *wr, uint8_t byte) { wr->buffer[wr->buffer_pos++] = wr->previous_byte = byte; if (wr->flush && wr->buffer_pos == wr->header->dict_size) { wr->buffer_pos = 0; wr->global_pos += wr->header->dict_size; if (wr->flush((char *)wr->buffer, wr->header->dict_size) != wr->header->dict_size) return -1; } return 0; } static inline int INIT copy_byte(struct writer *wr, uint32_t offs) { return write_byte(wr, peek_old_byte(wr, offs)); } static inline int INIT copy_bytes(struct writer *wr, uint32_t rep0, int len) { do { if (copy_byte(wr, rep0)) return -1; len--; } while (len != 0 && wr->buffer_pos < wr->header->dst_size); return len; } static inline int INIT process_bit0(struct writer *wr, struct rc *rc, struct cstate *cst, uint16_t *p, int pos_state, uint16_t *prob, int lc, uint32_t literal_pos_mask) { int mi = 1; 
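	/*
	 * For illustration: "mi" accumulates the literal one bit at a time
	 * (it starts at 1, and the leading marker bit is dropped when the
	 * value is truncated to uint8_t in write_byte()).  After a match
	 * (state >= LZMA_NUM_LIT_STATES) the first loop below also feeds in
	 * bits of the byte rep0 positions back as extra context, and falls
	 * through to the plain literal loop as soon as a decoded bit
	 * disagrees with that match byte.
	 */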
rc_update_bit_0(rc, prob); prob = (p + LZMA_LITERAL + (LZMA_LIT_SIZE * (((get_pos(wr) & literal_pos_mask) << lc) + (wr->previous_byte >> (8 - lc)))) ); if (cst->state >= LZMA_NUM_LIT_STATES) { int match_byte = peek_old_byte(wr, cst->rep0); do { int bit; uint16_t *prob_lit; match_byte <<= 1; bit = match_byte & 0x100; prob_lit = prob + 0x100 + bit + mi; if (rc_get_bit(rc, prob_lit, &mi)) { if (!bit) break; } else { if (bit) break; } } while (mi < 0x100); } while (mi < 0x100) { uint16_t *prob_lit = prob + mi; rc_get_bit(rc, prob_lit, &mi); } if (cst->state < 4) cst->state = 0; else if (cst->state < 10) cst->state -= 3; else cst->state -= 6; return write_byte(wr, mi); } static inline int INIT process_bit1(struct writer *wr, struct rc *rc, struct cstate *cst, uint16_t *p, int pos_state, uint16_t *prob) { int offset; uint16_t *prob_len; int num_bits; int len; rc_update_bit_1(rc, prob); prob = p + LZMA_IS_REP + cst->state; if (rc_is_bit_0(rc, prob)) { rc_update_bit_0(rc, prob); cst->rep3 = cst->rep2; cst->rep2 = cst->rep1; cst->rep1 = cst->rep0; cst->state = cst->state < LZMA_NUM_LIT_STATES ? 0 : 3; prob = p + LZMA_LEN_CODER; } else { rc_update_bit_1(rc, prob); prob = p + LZMA_IS_REP_G0 + cst->state; if (rc_is_bit_0(rc, prob)) { rc_update_bit_0(rc, prob); prob = (p + LZMA_IS_REP_0_LONG + (cst->state << LZMA_NUM_POS_BITS_MAX) + pos_state); if (rc_is_bit_0(rc, prob)) { rc_update_bit_0(rc, prob); cst->state = cst->state < LZMA_NUM_LIT_STATES ? 9 : 11; return copy_byte(wr, cst->rep0); } else { rc_update_bit_1(rc, prob); } } else { uint32_t distance; rc_update_bit_1(rc, prob); prob = p + LZMA_IS_REP_G1 + cst->state; if (rc_is_bit_0(rc, prob)) { rc_update_bit_0(rc, prob); distance = cst->rep1; } else { rc_update_bit_1(rc, prob); prob = p + LZMA_IS_REP_G2 + cst->state; if (rc_is_bit_0(rc, prob)) { rc_update_bit_0(rc, prob); distance = cst->rep2; } else { rc_update_bit_1(rc, prob); distance = cst->rep3; cst->rep3 = cst->rep2; } cst->rep2 = cst->rep1; } cst->rep1 = cst->rep0; cst->rep0 = distance; } cst->state = cst->state < LZMA_NUM_LIT_STATES ? 8 : 11; prob = p + LZMA_REP_LEN_CODER; } prob_len = prob + LZMA_LEN_CHOICE; if (rc_is_bit_0(rc, prob_len)) { rc_update_bit_0(rc, prob_len); prob_len = (prob + LZMA_LEN_LOW + (pos_state << LZMA_LEN_NUM_LOW_BITS)); offset = 0; num_bits = LZMA_LEN_NUM_LOW_BITS; } else { rc_update_bit_1(rc, prob_len); prob_len = prob + LZMA_LEN_CHOICE_2; if (rc_is_bit_0(rc, prob_len)) { rc_update_bit_0(rc, prob_len); prob_len = (prob + LZMA_LEN_MID + (pos_state << LZMA_LEN_NUM_MID_BITS)); offset = 1 << LZMA_LEN_NUM_LOW_BITS; num_bits = LZMA_LEN_NUM_MID_BITS; } else { rc_update_bit_1(rc, prob_len); prob_len = prob + LZMA_LEN_HIGH; offset = ((1 << LZMA_LEN_NUM_LOW_BITS) + (1 << LZMA_LEN_NUM_MID_BITS)); num_bits = LZMA_LEN_NUM_HIGH_BITS; } } rc_bit_tree_decode(rc, prob_len, num_bits, &len); len += offset; if (cst->state < 4) { int pos_slot; cst->state += LZMA_NUM_LIT_STATES; prob = p + LZMA_POS_SLOT + ((len < LZMA_NUM_LEN_TO_POS_STATES ? 
len : LZMA_NUM_LEN_TO_POS_STATES - 1) << LZMA_NUM_POS_SLOT_BITS); rc_bit_tree_decode(rc, prob, LZMA_NUM_POS_SLOT_BITS, &pos_slot); if (pos_slot >= LZMA_START_POS_MODEL_INDEX) { int i, mi; num_bits = (pos_slot >> 1) - 1; cst->rep0 = 2 | (pos_slot & 1); if (pos_slot < LZMA_END_POS_MODEL_INDEX) { cst->rep0 <<= num_bits; prob = p + LZMA_SPEC_POS + cst->rep0 - pos_slot - 1; } else { num_bits -= LZMA_NUM_ALIGN_BITS; while (num_bits--) cst->rep0 = (cst->rep0 << 1) | rc_direct_bit(rc); prob = p + LZMA_ALIGN; cst->rep0 <<= LZMA_NUM_ALIGN_BITS; num_bits = LZMA_NUM_ALIGN_BITS; } i = 1; mi = 1; while (num_bits--) { if (rc_get_bit(rc, prob + mi, &mi)) cst->rep0 |= i; i <<= 1; } } else cst->rep0 = pos_slot; if (++(cst->rep0) == 0) return 0; if (cst->rep0 > wr->header->dict_size || cst->rep0 > get_pos(wr)) return -1; } len += LZMA_MATCH_MIN_LEN; return copy_bytes(wr, cst->rep0, len); } STATIC int INIT unlzma(unsigned char *buf, unsigned int in_len, int(*fill)(void*, unsigned int), int(*flush)(void*, unsigned int), unsigned char *output, unsigned int *posp, void(*error)(const char *x) ) { struct lzma_header header; int lc, pb, lp; uint32_t pos_state_mask; uint32_t literal_pos_mask; uint16_t *p; int num_probs; struct rc rc; int i, mi; struct writer wr; struct cstate cst; unsigned char *inbuf; int ret = -1; rc.error = error; if (buf) inbuf = buf; else inbuf = malloc(LZMA_IOBUF_SIZE); if (!inbuf) { error("Could not allocate input buffer"); goto exit_0; } cst.state = 0; cst.rep0 = cst.rep1 = cst.rep2 = cst.rep3 = 1; wr.header = &header; wr.flush = flush; wr.global_pos = 0; wr.previous_byte = 0; wr.buffer_pos = 0; rc_init(&rc, fill, inbuf, in_len); for (i = 0; i < sizeof(header); i++) { if (rc.ptr >= rc.buffer_end) rc_read(&rc); ((unsigned char *)&header)[i] = *rc.ptr++; } if (header.pos >= (9 * 5 * 5)) { error("bad header"); goto exit_1; } mi = 0; lc = header.pos; while (lc >= 9) { mi++; lc -= 9; } pb = 0; lp = mi; while (lp >= 5) { pb++; lp -= 5; } pos_state_mask = (1 << pb) - 1; literal_pos_mask = (1 << lp) - 1; ENDIAN_CONVERT(header.dict_size); ENDIAN_CONVERT(header.dst_size); if (header.dict_size == 0) header.dict_size = 1; if (output) wr.buffer = output; else { wr.bufsize = MIN(header.dst_size, header.dict_size); wr.buffer = large_malloc(wr.bufsize); } if (wr.buffer == NULL) goto exit_1; num_probs = LZMA_BASE_SIZE + (LZMA_LIT_SIZE << (lc + lp)); p = (uint16_t *) large_malloc(num_probs * sizeof(*p)); if (p == 0) goto exit_2; num_probs = LZMA_LITERAL + (LZMA_LIT_SIZE << (lc + lp)); for (i = 0; i < num_probs; i++) p[i] = (1 << RC_MODEL_TOTAL_BITS) >> 1; rc_init_code(&rc); while (get_pos(&wr) < header.dst_size) { int pos_state = get_pos(&wr) & pos_state_mask; uint16_t *prob = p + LZMA_IS_MATCH + (cst.state << LZMA_NUM_POS_BITS_MAX) + pos_state; if (rc_is_bit_0(&rc, prob)) { if (process_bit0(&wr, &rc, &cst, p, pos_state, prob, lc, literal_pos_mask)) { error("LZMA data is corrupt"); goto exit_3; } } else { if (process_bit1(&wr, &rc, &cst, p, pos_state, prob)) { error("LZMA data is corrupt"); goto exit_3; } if (cst.rep0 == 0) break; } if (rc.buffer_size <= 0) goto exit_3; } if (posp) *posp = rc.ptr-rc.buffer; if (!wr.flush || wr.flush(wr.buffer, wr.buffer_pos) == wr.buffer_pos) ret = 0; exit_3: large_free(p); exit_2: if (!output) large_free(wr.buffer); exit_1: if (!buf) free(inbuf); exit_0: return ret; } xen-4.4.0/xen/common/kernel.c0000664000175000017500000002406412307313555014157 0ustar smbsmb/****************************************************************************** * kernel.c * * Copyright (c) 2002-2005 
K A Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef COMPAT enum system_state system_state = SYS_STATE_early_boot; int tainted; xen_commandline_t saved_cmdline; static void __init assign_integer_param( struct kernel_param *param, uint64_t val) { switch ( param->len ) { case sizeof(uint8_t): *(uint8_t *)param->var = val; break; case sizeof(uint16_t): *(uint16_t *)param->var = val; break; case sizeof(uint32_t): *(uint32_t *)param->var = val; break; case sizeof(uint64_t): *(uint64_t *)param->var = val; break; default: BUG(); } } void __init cmdline_parse(const char *cmdline) { char opt[100], *optval, *optkey, *q; const char *p = cmdline; struct kernel_param *param; int bool_assert; if ( cmdline == NULL ) return; safe_strcpy(saved_cmdline, cmdline); for ( ; ; ) { /* Skip whitespace. */ while ( *p == ' ' ) p++; if ( *p == '\0' ) break; /* Grab the next whitespace-delimited option. */ q = optkey = opt; while ( (*p != ' ') && (*p != '\0') ) { if ( (q-opt) < (sizeof(opt)-1) ) /* avoid overflow */ *q++ = *p; p++; } *q = '\0'; /* Search for value part of a key=value option. */ optval = strchr(opt, '='); if ( optval != NULL ) { *optval++ = '\0'; /* nul-terminate the option value */ q = strpbrk(opt, "([{<"); } else { optval = q; /* default option value is empty string */ q = NULL; } /* Boolean parameters can be inverted with 'no-' prefix. */ bool_assert = !!strncmp("no-", optkey, 3); if ( !bool_assert ) optkey += 3; for ( param = &__setup_start; param < &__setup_end; param++ ) { if ( strcmp(param->name, optkey) ) { if ( param->type == OPT_CUSTOM && q && strlen(param->name) == q + 1 - opt && !strncmp(param->name, opt, q + 1 - opt) ) { optval[-1] = '='; ((void (*)(const char *))param->var)(q); optval[-1] = '\0'; } continue; } switch ( param->type ) { case OPT_STR: strlcpy(param->var, optval, param->len); break; case OPT_UINT: assign_integer_param( param, simple_strtoll(optval, NULL, 0)); break; case OPT_BOOL: case OPT_INVBOOL: if ( !parse_bool(optval) ) bool_assert = !bool_assert; assign_integer_param( param, (param->type == OPT_BOOL) == bool_assert); break; case OPT_SIZE: assign_integer_param( param, parse_size_and_unit(optval, NULL)); break; case OPT_CUSTOM: ((void (*)(const char *))param->var)(optval); break; default: BUG(); break; } } } } int __init parse_bool(const char *s) { if ( !strcmp("no", s) || !strcmp("off", s) || !strcmp("false", s) || !strcmp("disable", s) || !strcmp("0", s) ) return 0; if ( !strcmp("yes", s) || !strcmp("on", s) || !strcmp("true", s) || !strcmp("enable", s) || !strcmp("1", s) ) return 1; return -1; } /** * print_tainted - return a string to represent the kernel taint state. * * 'S' - SMP with CPUs not designed for SMP. * 'M' - Machine had a machine check experience. * 'B' - System has hit bad_page. * * The string is overwritten by the next call to print_taint(). */ char *print_tainted(char *str) { if ( tainted ) { snprintf(str, TAINT_STRING_MAX_LEN, "Tainted: %c%c%c%c", tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', tainted & TAINT_BAD_PAGE ? 'B' : ' ', tainted & TAINT_SYNC_CONSOLE ? 
'C' : ' '); } else { snprintf(str, TAINT_STRING_MAX_LEN, "Not tainted"); } return str; } void add_taint(unsigned flag) { tainted |= flag; } extern const initcall_t __initcall_start[], __presmp_initcall_end[], __initcall_end[]; void __init do_presmp_initcalls(void) { const initcall_t *call; for ( call = __initcall_start; call < __presmp_initcall_end; call++ ) (*call)(); } void __init do_initcalls(void) { const initcall_t *call; for ( call = __presmp_initcall_end; call < __initcall_end; call++ ) (*call)(); } # define DO(fn) long do_##fn #endif /* * Simple hypercalls. */ DO(xen_version)(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { switch ( cmd ) { case XENVER_version: { return (xen_major_version() << 16) | xen_minor_version(); } case XENVER_extraversion: { xen_extraversion_t extraversion; safe_strcpy(extraversion, xen_extra_version()); if ( copy_to_guest(arg, extraversion, ARRAY_SIZE(extraversion)) ) return -EFAULT; return 0; } case XENVER_compile_info: { struct xen_compile_info info; safe_strcpy(info.compiler, xen_compiler()); safe_strcpy(info.compile_by, xen_compile_by()); safe_strcpy(info.compile_domain, xen_compile_domain()); safe_strcpy(info.compile_date, xen_compile_date()); if ( copy_to_guest(arg, &info, 1) ) return -EFAULT; return 0; } case XENVER_capabilities: { xen_capabilities_info_t info; memset(info, 0, sizeof(info)); arch_get_xen_caps(&info); if ( copy_to_guest(arg, info, ARRAY_SIZE(info)) ) return -EFAULT; return 0; } case XENVER_platform_parameters: { xen_platform_parameters_t params = { .virt_start = HYPERVISOR_VIRT_START }; if ( copy_to_guest(arg, ¶ms, 1) ) return -EFAULT; return 0; } case XENVER_changeset: { xen_changeset_info_t chgset; safe_strcpy(chgset, xen_changeset()); if ( copy_to_guest(arg, chgset, ARRAY_SIZE(chgset)) ) return -EFAULT; return 0; } case XENVER_get_features: { xen_feature_info_t fi; struct domain *d = current->domain; if ( copy_from_guest(&fi, arg, 1) ) return -EFAULT; switch ( fi.submap_idx ) { case 0: fi.submap = 0; if ( VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3) ) fi.submap |= (1U << XENFEAT_pae_pgdir_above_4gb); if ( paging_mode_translate(current->domain) ) fi.submap |= (1U << XENFEAT_writable_page_tables) | (1U << XENFEAT_auto_translated_physmap); if ( supervisor_mode_kernel ) fi.submap |= 1U << XENFEAT_supervisor_mode_kernel; if ( current->domain == dom0 ) fi.submap |= 1U << XENFEAT_dom0; #ifdef CONFIG_X86 switch ( d->guest_type ) { case guest_type_pv: fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) | (1U << XENFEAT_highmem_assist) | (1U << XENFEAT_gnttab_map_avail_bits); break; case guest_type_pvh: fi.submap |= (1U << XENFEAT_hvm_safe_pvclock) | (1U << XENFEAT_supervisor_mode_kernel) | (1U << XENFEAT_hvm_callback_vector); break; case guest_type_hvm: fi.submap |= (1U << XENFEAT_hvm_safe_pvclock) | (1U << XENFEAT_hvm_callback_vector) | (1U << XENFEAT_hvm_pirqs); break; } #endif break; default: return -EINVAL; } if ( copy_to_guest(arg, &fi, 1) ) return -EFAULT; return 0; } case XENVER_pagesize: { return (!guest_handle_is_null(arg) ? 
-EINVAL : PAGE_SIZE); } case XENVER_guest_handle: { if ( copy_to_guest(arg, current->domain->handle, ARRAY_SIZE(current->domain->handle)) ) return -EFAULT; return 0; } case XENVER_commandline: { if ( copy_to_guest(arg, saved_cmdline, ARRAY_SIZE(saved_cmdline)) ) return -EFAULT; return 0; } } return -ENOSYS; } DO(nmi_op)(unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { struct xennmi_callback cb; long rc = 0; switch ( cmd ) { case XENNMI_register_callback: rc = -EFAULT; if ( copy_from_guest(&cb, arg, 1) ) break; rc = register_guest_nmi_callback(cb.handler_address); break; case XENNMI_unregister_callback: rc = unregister_guest_nmi_callback(); break; default: rc = -ENOSYS; break; } return rc; } DO(vm_assist)(unsigned int cmd, unsigned int type) { return vm_assist(current->domain, cmd, type); } DO(ni_hypercall)(void) { /* No-op hypercall. */ return -ENOSYS; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/symbols.c0000664000175000017500000001043512307313555014364 0ustar smbsmb/* * symbols.c: in-kernel printing of symbolic oopses and stack traces. * * Copyright 2002 Rusty Russell IBM Corporation * * ChangeLog: * * (25/Aug/2004) Paulo Marques * Changed the compression method from stem compression to "table lookup" * compression (see tools/symbols.c for a more complete description) */ #include #include #include #include #include #include #include #ifdef SYMBOLS_ORIGIN extern const unsigned int symbols_offsets[1]; #define symbols_address(n) (SYMBOLS_ORIGIN + symbols_offsets[n]) #else extern const unsigned long symbols_addresses[]; #define symbols_address(n) symbols_addresses[n] #endif extern const unsigned int symbols_num_syms; extern const u8 symbols_names[]; extern const u8 symbols_token_table[]; extern const u16 symbols_token_index[]; extern const unsigned int symbols_markers[]; /* expand a compressed symbol data into the resulting uncompressed string, given the offset to where the symbol is in the compressed stream */ static unsigned int symbols_expand_symbol(unsigned int off, char *result) { int len, skipped_first = 0; const u8 *tptr, *data; /* get the compressed symbol length from the first symbol byte */ data = &symbols_names[off]; len = *data; data++; /* update the offset to return the offset for the next symbol on * the compressed stream */ off += len + 1; /* for every byte on the compressed symbol data, copy the table entry for that byte */ while(len) { tptr = &symbols_token_table[ symbols_token_index[*data] ]; data++; len--; while (*tptr) { if(skipped_first) { *result = *tptr; result++; } else skipped_first = 1; tptr++; } } *result = '\0'; /* return to offset to the next symbol */ return off; } /* find the offset on the compressed stream given and index in the * symbols array */ static unsigned int get_symbol_offset(unsigned long pos) { const u8 *name; int i; /* use the closest marker we have. We have markers every 256 positions, * so that should be close enough */ name = &symbols_names[ symbols_markers[pos>>8] ]; /* sequentially scan all the symbols up to the point we're searching for. 
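 * (each compressed entry being a one-byte length followed by that many
 * token-index bytes, stepping over one symbol is just "name += *name + 1")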
* Every symbol is stored in a [][ bytes of data] format, so we * just need to add the len to the current pointer for every symbol we * wish to skip */ for(i = 0; i < (pos&0xFF); i++) name = name + (*name) + 1; return name - symbols_names; } bool_t is_active_kernel_text(unsigned long addr) { return (is_kernel_text(addr) || (system_state == SYS_STATE_boot && is_kernel_inittext(addr))); } const char *symbols_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char *namebuf) { unsigned long i, low, high, mid; unsigned long symbol_end = 0; namebuf[KSYM_NAME_LEN] = 0; namebuf[0] = 0; if (!is_active_kernel_text(addr)) return NULL; /* do a binary search on the sorted symbols_addresses array */ low = 0; high = symbols_num_syms; while (high-low > 1) { mid = (low + high) / 2; if (symbols_address(mid) <= addr) low = mid; else high = mid; } /* search for the first aliased symbol. Aliased symbols are symbols with the same address */ while (low && symbols_address(low - 1) == symbols_address(low)) --low; /* Grab name */ symbols_expand_symbol(get_symbol_offset(low), namebuf); /* Search for next non-aliased symbol */ for (i = low + 1; i < symbols_num_syms; i++) { if (symbols_address(i) > symbols_address(low)) { symbol_end = symbols_address(i); break; } } /* if we found no next symbol, we use the end of the section */ if (!symbol_end) symbol_end = is_kernel_inittext(addr) ? (unsigned long)_einittext : (unsigned long)_etext; *symbolsize = symbol_end - symbols_address(low); *offset = addr - symbols_address(low); return namebuf; } xen-4.4.0/xen/common/sched_credit2.c0000664000175000017500000017337712307313555015415 0ustar smbsmb /**************************************************************************** * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd **************************************************************************** * * File: common/csched_credit2.c * Author: George Dunlap * * Description: Credit-based SMP CPU scheduler * Based on an earlier verson by Emmanuel Ackaouy. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define d2printk(x...) //#define d2printk printk /* * Credit2 tracing events ("only" 512 available!). Check * include/public/trace.h for more details. */ #define TRC_CSCHED2_TICK TRC_SCHED_CLASS_EVT(CSCHED2, 1) #define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS_EVT(CSCHED2, 2) #define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS_EVT(CSCHED2, 3) #define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS_EVT(CSCHED2, 4) #define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 5) #define TRC_CSCHED2_TICKLE TRC_SCHED_CLASS_EVT(CSCHED2, 6) #define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS_EVT(CSCHED2, 7) #define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED2, 8) #define TRC_CSCHED2_UPDATE_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 9) #define TRC_CSCHED2_RUNQ_ASSIGN TRC_SCHED_CLASS_EVT(CSCHED2, 10) #define TRC_CSCHED2_UPDATE_VCPU_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 11) #define TRC_CSCHED2_UPDATE_RUNQ_LOAD TRC_SCHED_CLASS_EVT(CSCHED2, 12) /* * WARNING: This is still in an experimental phase. 
Status and work can be found at the * credit2 wiki page: * http://wiki.xen.org/wiki/Credit2_Scheduler_Development * TODO: * + Immediate bug-fixes * - Do per-runqueue, grab proper lock for dump debugkey * + Multiple sockets * - Detect cpu layout and make runqueue map, one per L2 (make_runq_map()) * - Simple load balancer / runqueue assignment * - Runqueue load measurement * - Load-based load balancer * + Hyperthreading * - Look for non-busy core if possible * - "Discount" time run on a thread with busy siblings * + Algorithm: * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g., * a flash animation in the background) can we schedule it with low enough latency * so that audio doesn't skip? * - Cap and reservation: How to implement with the current system? * + Optimizing * - Profiling, making new algorithms, making math more efficient (no long division) */ /* * Design: * * VMs "burn" credits based on their weight; higher weight means * credits burn more slowly. The highest weight vcpu burns credits at * a rate of 1 credit per nanosecond. Others burn proportionally * more. * * vcpus are inserted into the runqueue by credit order. * * Credits are "reset" when the next vcpu in the runqueue is less than * or equal to zero. At that point, everyone's credits are "clipped" * to a small value, and a fixed credit is added to everyone. * * The plan is for all cores that share an L2 will share the same * runqueue. At the moment, there is one global runqueue for all * cores. */ /* * Locking: * - Schedule-lock is per-runqueue * + Protects runqueue data, runqueue insertion, &c * + Also protects updates to private sched vcpu structure * + Must be grabbed using vcpu_schedule_lock_irq() to make sure vcpu->processr * doesn't change under our feet. * - Private data lock * + Protects access to global domain list * + All other private data is written at init and only read afterwards. * Ordering: * - We grab private->schedule when updating domain weight; so we * must never grab private if a schedule lock is held. */ /* * Basic constants */ /* Default weight: How much a new domain starts with */ #define CSCHED_DEFAULT_WEIGHT 256 /* Min timer: Minimum length a timer will be set, to * achieve efficiency */ #define CSCHED_MIN_TIMER MICROSECS(500) /* Amount of credit VMs begin with, and are reset to. * ATM, set so that highest-weight VMs can only run for 10ms * before a reset event. */ #define CSCHED_CREDIT_INIT MILLISECS(10) /* Carryover: How much "extra" credit may be carried over after * a reset. */ #define CSCHED_CARRYOVER_MAX CSCHED_MIN_TIMER /* Stickiness: Cross-L2 migration resistance. Should be less than * MIN_TIMER. */ #define CSCHED_MIGRATE_RESIST ((opt_migrate_resist)*MICROSECS(1)) /* How much to "compensate" a vcpu for L2 migration */ #define CSCHED_MIGRATE_COMPENSATION MICROSECS(50) /* Reset: Value below which credit will be reset. */ #define CSCHED_CREDIT_RESET 0 /* Max timer: Maximum time a guest can be run for. */ #define CSCHED_MAX_TIMER MILLISECS(2) #define CSCHED_IDLE_CREDIT (-(1<<30)) /* * Flags */ /* CSFLAG_scheduled: Is this vcpu either running on, or context-switching off, * a physical cpu? * + Accessed only with runqueue lock held * + Set when chosen as next in csched_schedule(). * + Cleared after context switch has been saved in csched_context_saved() * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should * set CSFLAG_delayed_runq_add * + Checked to be false in runq_insert. 
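 * + In short: the flag covers the window between being picked in
 *   csched_schedule() and the context switch being saved, so a wake-up
 *   arriving in that window is recorded via CSFLAG_delayed_runq_add
 *   rather than putting the vcpu on a runqueue while it is still being
 *   switched out.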
*/ #define __CSFLAG_scheduled 1 #define CSFLAG_scheduled (1<<__CSFLAG_scheduled) /* CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done * being context switched out? * + Set when scheduling out in csched_schedule() if prev is runnable * + Set in csched_vcpu_wake if it finds CSFLAG_scheduled set * + Read in csched_context_saved(). If set, it adds prev to the runqueue and * clears the bit. */ #define __CSFLAG_delayed_runq_add 2 #define CSFLAG_delayed_runq_add (1<<__CSFLAG_delayed_runq_add) /* CSFLAG_runq_migrate_request: This vcpu is being migrated as a result of a * credit2-initiated runq migrate request; migrate it to the runqueue indicated * in the svc struct. */ #define __CSFLAG_runq_migrate_request 3 #define CSFLAG_runq_migrate_request (1<<__CSFLAG_runq_migrate_request) int opt_migrate_resist=500; integer_param("sched_credit2_migrate_resist", opt_migrate_resist); /* * Useful macros */ #define CSCHED_PRIV(_ops) \ ((struct csched_private *)((_ops)->sched_data)) #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv) #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) /* CPU to runq_id macro */ #define c2r(_ops, _cpu) (CSCHED_PRIV(_ops)->runq_map[(_cpu)]) /* CPU to runqueue struct macro */ #define RQD(_ops, _cpu) (&CSCHED_PRIV(_ops)->rqd[c2r(_ops, _cpu)]) /* * Shifts for load average. * - granularity: Reduce granularity of time by a factor of 1000, so we can use 32-bit maths * - window shift: Given granularity shift, make the window about 1 second * - scale shift: Shift up load by this amount rather than using fractions; 128 corresponds * to a load of 1. */ #define LOADAVG_GRANULARITY_SHIFT (10) int opt_load_window_shift=18; #define LOADAVG_WINDOW_SHIFT_MIN 4 integer_param("credit2_load_window_shift", opt_load_window_shift); int opt_underload_balance_tolerance=0; integer_param("credit2_balance_under", opt_underload_balance_tolerance); int opt_overload_balance_tolerance=-3; integer_param("credit2_balance_over", opt_overload_balance_tolerance); /* * Per-runqueue data */ struct csched_runqueue_data { int id; spinlock_t lock; /* Lock for this runqueue. */ cpumask_t active; /* CPUs enabled for this runqueue */ struct list_head runq; /* Ordered list of runnable vms */ struct list_head svc; /* List of all vcpus assigned to this runqueue */ unsigned int max_weight; cpumask_t idle, /* Currently idle */ tickled; /* Another cpu in the queue is already targeted for this one */ int load; /* Instantaneous load: Length of queue + num non-idle threads */ s_time_t load_last_update; /* Last time average was updated */ s_time_t avgload; /* Decaying queue load */ s_time_t b_avgload; /* Decaying queue load modified by balancing */ }; /* * System-wide private data */ struct csched_private { spinlock_t lock; cpumask_t initialized; /* CPU is initialized for this pool */ struct list_head sdom; /* Used mostly for dump keyhandler. 
*/ int runq_map[NR_CPUS]; cpumask_t active_queues; /* Queues which may have active cpus */ struct csched_runqueue_data rqd[NR_CPUS]; int load_window_shift; }; /* * Virtual CPU */ struct csched_vcpu { struct list_head rqd_elem; /* On the runqueue data list */ struct list_head sdom_elem; /* On the domain vcpu list */ struct list_head runq_elem; /* On the runqueue */ struct csched_runqueue_data *rqd; /* Up-pointer to the runqueue */ /* Up-pointers */ struct csched_dom *sdom; struct vcpu *vcpu; unsigned int weight; unsigned int residual; int credit; s_time_t start_time; /* When we were scheduled (used for credit) */ unsigned flags; /* 16 bits doesn't seem to play well with clear_bit() */ /* Individual contribution to load */ s_time_t load_last_update; /* Last time average was updated */ s_time_t avgload; /* Decaying queue load */ struct csched_runqueue_data *migrate_rqd; /* Pre-determined rqd to which to migrate */ }; /* * Domain */ struct csched_dom { struct list_head vcpu; struct list_head sdom_elem; struct domain *dom; uint16_t weight; uint16_t nr_vcpus; }; /* * Time-to-credit, credit-to-time. * * We keep track of the "residual" time to make sure that frequent short * schedules still get accounted for in the end. * * FIXME: Do pre-calculated division? */ static void t2c_update(struct csched_runqueue_data *rqd, s_time_t time, struct csched_vcpu *svc) { uint64_t val = time * rqd->max_weight + svc->residual; svc->residual = do_div(val, svc->weight); svc->credit -= val; } static s_time_t c2t(struct csched_runqueue_data *rqd, s_time_t credit, struct csched_vcpu *svc) { return credit * svc->weight / rqd->max_weight; } /* * Runqueue related code */ static /*inline*/ int __vcpu_on_runq(struct csched_vcpu *svc) { return !list_empty(&svc->runq_elem); } static /*inline*/ struct csched_vcpu * __runq_elem(struct list_head *elem) { return list_entry(elem, struct csched_vcpu, runq_elem); } static void __update_runq_load(const struct scheduler *ops, struct csched_runqueue_data *rqd, int change, s_time_t now) { struct csched_private *prv = CSCHED_PRIV(ops); s_time_t delta=-1; now >>= LOADAVG_GRANULARITY_SHIFT; if ( rqd->load_last_update + (1ULL<load_window_shift) < now ) { rqd->avgload = (unsigned long long)rqd->load << prv->load_window_shift; rqd->b_avgload = (unsigned long long)rqd->load << prv->load_window_shift; } else { delta = now - rqd->load_last_update; rqd->avgload = ( ( delta * ( (unsigned long long)rqd->load << prv->load_window_shift ) ) + ( ((1ULL<load_window_shift) - delta) * rqd->avgload ) ) >> prv->load_window_shift; rqd->b_avgload = ( ( delta * ( (unsigned long long)rqd->load << prv->load_window_shift ) ) + ( ((1ULL<load_window_shift) - delta) * rqd->b_avgload ) ) >> prv->load_window_shift; } rqd->load += change; rqd->load_last_update = now; { struct { unsigned rq_load:4, rq_avgload:28; unsigned rq_id:4, b_avgload:28; } d; d.rq_id=rqd->id; d.rq_load = rqd->load; d.rq_avgload = rqd->avgload; d.b_avgload = rqd->b_avgload; trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1, sizeof(d), (unsigned char *)&d); } } static void __update_svc_load(const struct scheduler *ops, struct csched_vcpu *svc, int change, s_time_t now) { struct csched_private *prv = CSCHED_PRIV(ops); s_time_t delta=-1; int vcpu_load; if ( change == -1 ) vcpu_load = 1; else if ( change == 1 ) vcpu_load = 0; else vcpu_load = vcpu_runnable(svc->vcpu); now >>= LOADAVG_GRANULARITY_SHIFT; if ( svc->load_last_update + (1ULL<load_window_shift) < now ) { svc->avgload = (unsigned long long)vcpu_load << prv->load_window_shift; } else { delta = now 
- svc->load_last_update; svc->avgload = ( ( delta * ( (unsigned long long)vcpu_load << prv->load_window_shift ) ) + ( ((1ULL<load_window_shift) - delta) * svc->avgload ) ) >> prv->load_window_shift; } svc->load_last_update = now; { struct { unsigned dom:16,vcpu:16; unsigned v_avgload:32; } d; d.dom = svc->vcpu->domain->domain_id; d.vcpu = svc->vcpu->vcpu_id; d.v_avgload = svc->avgload; trace_var(TRC_CSCHED2_UPDATE_VCPU_LOAD, 1, sizeof(d), (unsigned char *)&d); } } static void update_load(const struct scheduler *ops, struct csched_runqueue_data *rqd, struct csched_vcpu *svc, int change, s_time_t now) { __update_runq_load(ops, rqd, change, now); if ( svc ) __update_svc_load(ops, svc, change, now); } static int __runq_insert(struct list_head *runq, struct csched_vcpu *svc) { struct list_head *iter; int pos = 0; d2printk("rqi d%dv%d\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id); BUG_ON(&svc->rqd->runq != runq); /* Idle vcpus not allowed on the runqueue anymore */ BUG_ON(is_idle_vcpu(svc->vcpu)); BUG_ON(svc->vcpu->is_running); BUG_ON(test_bit(__CSFLAG_scheduled, &svc->flags)); list_for_each( iter, runq ) { struct csched_vcpu * iter_svc = __runq_elem(iter); if ( svc->credit > iter_svc->credit ) { d2printk(" p%d d%dv%d\n", pos, iter_svc->vcpu->domain->domain_id, iter_svc->vcpu->vcpu_id); break; } pos++; } list_add_tail(&svc->runq_elem, iter); return pos; } static void runq_insert(const struct scheduler *ops, unsigned int cpu, struct csched_vcpu *svc) { struct list_head * runq = &RQD(ops, cpu)->runq; int pos = 0; ASSERT( spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock) ); BUG_ON( __vcpu_on_runq(svc) ); BUG_ON( c2r(ops, cpu) != c2r(ops, svc->vcpu->processor) ); pos = __runq_insert(runq, svc); { struct { unsigned dom:16,vcpu:16; unsigned pos; } d; d.dom = svc->vcpu->domain->domain_id; d.vcpu = svc->vcpu->vcpu_id; d.pos = pos; trace_var(TRC_CSCHED2_RUNQ_POS, 0, sizeof(d), (unsigned char *)&d); } return; } static inline void __runq_remove(struct csched_vcpu *svc) { BUG_ON( !__vcpu_on_runq(svc) ); list_del_init(&svc->runq_elem); } void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *, s_time_t); /* Check to see if the item on the runqueue is higher priority than what's * currently running; if so, wake up the processor */ static /*inline*/ void runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched_vcpu *new, s_time_t now) { int i, ipid=-1; s_time_t lowest=(1<<30); struct csched_runqueue_data *rqd = RQD(ops, cpu); cpumask_t mask; struct csched_vcpu * cur; d2printk("rqt d%dv%d cd%dv%d\n", new->vcpu->domain->domain_id, new->vcpu->vcpu_id, current->domain->domain_id, current->vcpu_id); BUG_ON(new->vcpu->processor != cpu); BUG_ON(new->rqd != rqd); /* Look at the cpu it's running on first */ cur = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr); burn_credits(rqd, cur, now); if ( cur->credit < new->credit ) { ipid = cpu; goto tickle; } /* Get a mask of idle, but not tickled */ cpumask_andnot(&mask, &rqd->idle, &rqd->tickled); /* If it's not empty, choose one */ i = cpumask_cycle(cpu, &mask); if ( i < nr_cpu_ids ) { ipid = i; goto tickle; } /* Otherwise, look for the non-idle cpu with the lowest credit, * skipping cpus which have been tickled but not scheduled yet */ cpumask_andnot(&mask, &rqd->active, &rqd->idle); cpumask_andnot(&mask, &mask, &rqd->tickled); for_each_cpu(i, &mask) { struct csched_vcpu * cur; /* Already looked at this one above */ if ( i == cpu ) continue; cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr); BUG_ON(is_idle_vcpu(cur->vcpu)); /* 
Update credits for current to see if we want to preempt */ burn_credits(rqd, cur, now); if ( cur->credit < lowest ) { ipid = i; lowest = cur->credit; } /* TRACE */ { struct { unsigned dom:16,vcpu:16; unsigned credit; } d; d.dom = cur->vcpu->domain->domain_id; d.vcpu = cur->vcpu->vcpu_id; d.credit = cur->credit; trace_var(TRC_CSCHED2_TICKLE_CHECK, 1, sizeof(d), (unsigned char *)&d); } } /* Only switch to another processor if the credit difference is greater * than the migrate resistance */ if ( ipid == -1 || lowest + CSCHED_MIGRATE_RESIST > new->credit ) goto no_tickle; tickle: BUG_ON(ipid == -1); /* TRACE */ { struct { unsigned cpu:8; } d; d.cpu = ipid; trace_var(TRC_CSCHED2_TICKLE, 0, sizeof(d), (unsigned char *)&d); } cpumask_set_cpu(ipid, &rqd->tickled); cpu_raise_softirq(ipid, SCHEDULE_SOFTIRQ); no_tickle: return; } /* * Credit-related code */ static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now, struct csched_vcpu *snext) { struct csched_runqueue_data *rqd = RQD(ops, cpu); struct list_head *iter; int m; /* * Under normal circumstances, snext->credit should never be less * than -CSCHED_MIN_TIMER. However, under some circumstances, a * vcpu with low credits may be allowed to run long enough that * its credits are actually less than -CSCHED_CREDIT_INIT. * (Instances have been observed, for example, where a vcpu with * 200us of credit was allowed to run for 11ms, giving it -10.8ms * of credit. Thus it was still negative even after the reset.) * * If this is the case for snext, we simply want to keep moving * everyone up until it is in the black again. This fair because * none of the other vcpus want to run at the moment. * * Rather than looping, however, we just calculate a multiplier, * avoiding an integer division and multiplication in the common * case. */ m = 1; if ( snext->credit < -CSCHED_CREDIT_INIT ) m += (-snext->credit) / CSCHED_CREDIT_INIT; list_for_each( iter, &rqd->svc ) { struct csched_vcpu * svc; int start_credit; svc = list_entry(iter, struct csched_vcpu, rqd_elem); BUG_ON( is_idle_vcpu(svc->vcpu) ); BUG_ON( svc->rqd != rqd ); start_credit = svc->credit; /* And add INIT * m, avoiding integer multiplication in the * common case. */ if ( likely(m==1) ) svc->credit += CSCHED_CREDIT_INIT; else svc->credit += m * CSCHED_CREDIT_INIT; /* "Clip" credits to max carryover */ if ( svc->credit > CSCHED_CREDIT_INIT + CSCHED_CARRYOVER_MAX ) svc->credit = CSCHED_CREDIT_INIT + CSCHED_CARRYOVER_MAX; svc->start_time = now; /* TRACE */ { struct { unsigned dom:16,vcpu:16; unsigned credit_start, credit_end; unsigned multiplier; } d; d.dom = svc->vcpu->domain->domain_id; d.vcpu = svc->vcpu->vcpu_id; d.credit_start = start_credit; d.credit_end = svc->credit; d.multiplier = m; trace_var(TRC_CSCHED2_CREDIT_RESET, 1, sizeof(d), (unsigned char *)&d); } } /* No need to resort runqueue, as everyone's order should be the same. */ } void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *svc, s_time_t now) { s_time_t delta; /* Assert svc is current */ ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr)); if ( is_idle_vcpu(svc->vcpu) ) { BUG_ON(svc->credit != CSCHED_IDLE_CREDIT); return; } delta = now - svc->start_time; if ( delta > 0 ) { t2c_update(rqd, delta, svc); svc->start_time = now; d2printk("b d%dv%d c%d\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id, svc->credit); } else { d2printk("%s: Time went backwards? 
now %"PRI_stime" start %"PRI_stime"\n", __func__, now, svc->start_time); } /* TRACE */ { struct { unsigned dom:16,vcpu:16; unsigned credit; int delta; } d; d.dom = svc->vcpu->domain->domain_id; d.vcpu = svc->vcpu->vcpu_id; d.credit = svc->credit; d.delta = delta; trace_var(TRC_CSCHED2_CREDIT_BURN, 1, sizeof(d), (unsigned char *)&d); } } /* Find the domain with the highest weight. */ void update_max_weight(struct csched_runqueue_data *rqd, int new_weight, int old_weight) { /* Try to avoid brute-force search: * - If new_weight is larger, max_weigth <- new_weight * - If old_weight != max_weight, someone else is still max_weight * (No action required) * - If old_weight == max_weight, brute-force search for max weight */ if ( new_weight > rqd->max_weight ) { rqd->max_weight = new_weight; d2printk("%s: Runqueue id %d max weight %d\n", __func__, rqd->id, rqd->max_weight); } else if ( old_weight == rqd->max_weight ) { struct list_head *iter; int max_weight = 1; list_for_each( iter, &rqd->svc ) { struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem); if ( svc->weight > max_weight ) max_weight = svc->weight; } rqd->max_weight = max_weight; d2printk("%s: Runqueue %d max weight %d\n", __func__, rqd->id, rqd->max_weight); } } #ifndef NDEBUG static /*inline*/ void __csched_vcpu_check(struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); struct csched_dom * const sdom = svc->sdom; BUG_ON( svc->vcpu != vc ); BUG_ON( sdom != CSCHED_DOM(vc->domain) ); if ( sdom ) { BUG_ON( is_idle_vcpu(vc) ); BUG_ON( sdom->dom != vc->domain ); } else { BUG_ON( !is_idle_vcpu(vc) ); } } #define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc)) #else #define CSCHED_VCPU_CHECK(_vc) #endif static void * csched_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) { struct csched_vcpu *svc; /* Allocate per-VCPU info */ svc = xzalloc(struct csched_vcpu); if ( svc == NULL ) return NULL; INIT_LIST_HEAD(&svc->rqd_elem); INIT_LIST_HEAD(&svc->sdom_elem); INIT_LIST_HEAD(&svc->runq_elem); svc->sdom = dd; svc->vcpu = vc; svc->flags = 0U; if ( ! 
is_idle_vcpu(vc) ) { BUG_ON( svc->sdom == NULL ); svc->credit = CSCHED_CREDIT_INIT; svc->weight = svc->sdom->weight; /* Starting load of 50% */ svc->avgload = 1ULL << (CSCHED_PRIV(ops)->load_window_shift - 1); svc->load_last_update = NOW(); } else { BUG_ON( svc->sdom != NULL ); svc->credit = CSCHED_IDLE_CREDIT; svc->weight = 0; } SCHED_STAT_CRANK(vcpu_init); return svc; } /* Add and remove from runqueue assignment (not active run queue) */ static void __runq_assign(struct csched_vcpu *svc, struct csched_runqueue_data *rqd) { svc->rqd = rqd; list_add_tail(&svc->rqd_elem, &svc->rqd->svc); update_max_weight(svc->rqd, svc->weight, 0); /* Expected new load based on adding this vcpu */ rqd->b_avgload += svc->avgload; /* TRACE */ { struct { unsigned dom:16,vcpu:16; unsigned rqi:16; } d; d.dom = svc->vcpu->domain->domain_id; d.vcpu = svc->vcpu->vcpu_id; d.rqi=rqd->id; trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1, sizeof(d), (unsigned char *)&d); } } static void runq_assign(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu *svc = vc->sched_priv; BUG_ON(svc->rqd != NULL); __runq_assign(svc, RQD(ops, vc->processor)); } static void __runq_deassign(struct csched_vcpu *svc) { BUG_ON(__vcpu_on_runq(svc)); BUG_ON(test_bit(__CSFLAG_scheduled, &svc->flags)); list_del_init(&svc->rqd_elem); update_max_weight(svc->rqd, 0, svc->weight); /* Expected new load based on removing this vcpu */ svc->rqd->b_avgload -= svc->avgload; svc->rqd = NULL; } static void runq_deassign(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu *svc = vc->sched_priv; BUG_ON(svc->rqd != RQD(ops, vc->processor)); __runq_deassign(svc); } static void csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu *svc = vc->sched_priv; struct domain * const dom = vc->domain; struct csched_dom * const sdom = svc->sdom; printk("%s: Inserting d%dv%d\n", __func__, dom->domain_id, vc->vcpu_id); /* NB: On boot, idle vcpus are inserted before alloc_pdata() has * been called for that cpu. */ if ( ! is_idle_vcpu(vc) ) { spinlock_t *lock; /* FIXME: Do we need the private lock here? */ list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu); /* Add vcpu to runqueue of initial processor */ lock = vcpu_schedule_lock_irq(vc); runq_assign(ops, vc); vcpu_schedule_unlock_irq(lock, vc); sdom->nr_vcpus++; } CSCHED_VCPU_CHECK(vc); } static void csched_free_vdata(const struct scheduler *ops, void *priv) { struct csched_vcpu *svc = priv; xfree(svc); } static void csched_vcpu_remove(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); struct csched_dom * const sdom = svc->sdom; BUG_ON( sdom == NULL ); BUG_ON( !list_empty(&svc->runq_elem) ); if ( ! is_idle_vcpu(vc) ) { spinlock_t *lock; SCHED_STAT_CRANK(vcpu_destroy); /* Remove from runqueue */ lock = vcpu_schedule_lock_irq(vc); runq_deassign(ops, vc); vcpu_schedule_unlock_irq(lock, vc); /* Remove from sdom list. Don't need a lock for this, as it's called * syncronously when nothing else can happen. 
*/ list_del_init(&svc->sdom_elem); svc->sdom->nr_vcpus--; } } static void csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); BUG_ON( is_idle_vcpu(vc) ); if ( per_cpu(schedule_data, vc->processor).curr == vc ) cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ); else if ( __vcpu_on_runq(svc) ) { BUG_ON(svc->rqd != RQD(ops, vc->processor)); update_load(ops, svc->rqd, svc, -1, NOW()); __runq_remove(svc); } else if ( test_bit(__CSFLAG_delayed_runq_add, &svc->flags) ) clear_bit(__CSFLAG_delayed_runq_add, &svc->flags); } static void csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); s_time_t now = 0; /* Schedule lock should be held at this point. */ d2printk("w d%dv%d\n", vc->domain->domain_id, vc->vcpu_id); BUG_ON( is_idle_vcpu(vc) ); /* Make sure svc priority mod happens before runq check */ if ( unlikely(per_cpu(schedule_data, vc->processor).curr == vc) ) { goto out; } if ( unlikely(__vcpu_on_runq(svc)) ) { /* If we've boosted someone that's already on a runqueue, prioritize * it and inform the cpu in question. */ goto out; } /* If the context hasn't been saved for this vcpu yet, we can't put it on * another runqueue. Instead, we set a flag so that it will be put on the runqueue * after the context has been saved. */ if ( unlikely (test_bit(__CSFLAG_scheduled, &svc->flags) ) ) { set_bit(__CSFLAG_delayed_runq_add, &svc->flags); goto out; } /* Add into the new runqueue if necessary */ if ( svc->rqd == NULL ) runq_assign(ops, vc); else BUG_ON(RQD(ops, vc->processor) != svc->rqd ); now = NOW(); update_load(ops, svc->rqd, svc, 1, now); /* Put the VCPU on the runq */ runq_insert(ops, vc->processor, svc); runq_tickle(ops, vc->processor, svc, now); out: d2printk("w-\n"); return; } static void csched_context_saved(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); s_time_t now = NOW(); spinlock_t *lock = vcpu_schedule_lock_irq(vc); BUG_ON( !is_idle_vcpu(vc) && svc->rqd != RQD(ops, vc->processor)); /* This vcpu is now eligible to be put on the runqueue again */ clear_bit(__CSFLAG_scheduled, &svc->flags); /* If someone wants it on the runqueue, put it there. */ /* * NB: We can get rid of CSFLAG_scheduled by checking for * vc->is_running and __vcpu_on_runq(svc) here. However, * since we're accessing the flags cacheline anyway, * it seems a bit pointless; especially as we have plenty of * bits free. */ if ( test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags) && likely(vcpu_runnable(vc)) ) { BUG_ON(__vcpu_on_runq(svc)); runq_insert(ops, vc->processor, svc); runq_tickle(ops, vc->processor, svc, now); } else if ( !is_idle_vcpu(vc) ) update_load(ops, svc->rqd, svc, -1, now); vcpu_schedule_unlock_irq(lock, vc); } #define MAX_LOAD (1ULL<<60); static int choose_cpu(const struct scheduler *ops, struct vcpu *vc) { struct csched_private *prv = CSCHED_PRIV(ops); int i, min_rqi = -1, new_cpu; struct csched_vcpu *svc = CSCHED_VCPU(vc); s_time_t min_avgload; BUG_ON(cpumask_empty(&prv->active_queues)); /* Locking: * - vc->processor is already locked * - Need to grab prv lock to make sure active runqueues don't * change * - Need to grab locks for other runqueues while checking * avgload * Locking constraint is: * - Lock prv before runqueue locks * - Trylock between runqueue locks (no ordering) * * Since one of the runqueue locks is already held, we can't * just grab the prv lock. 
Instead, we'll have to trylock, and * do something else reasonable if we fail. */ if ( !spin_trylock(&prv->lock) ) { if ( test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) ) { d2printk("d%dv%d -\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id); clear_bit(__CSFLAG_runq_migrate_request, &svc->flags); } /* Leave it where it is for now. When we actually pay attention * to affinity we'll have to figure something out... */ return vc->processor; } /* First check to see if we're here because someone else suggested a place * for us to move. */ if ( test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) ) { if ( unlikely(svc->migrate_rqd->id < 0) ) { printk("%s: Runqueue migrate aborted because target runqueue disappeared!\n", __func__); /* Fall-through to normal cpu pick */ } else { d2printk("d%dv%d +\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id); new_cpu = cpumask_cycle(vc->processor, &svc->migrate_rqd->active); goto out_up; } } /* FIXME: Pay attention to cpu affinity */ min_avgload = MAX_LOAD; /* Find the runqueue with the lowest instantaneous load */ for_each_cpu(i, &prv->active_queues) { struct csched_runqueue_data *rqd; s_time_t rqd_avgload; rqd = prv->rqd + i; /* If checking a different runqueue, grab the lock, * read the avg, and then release the lock. * * If on our own runqueue, don't grab or release the lock; * but subtract our own load from the runqueue load to simulate * impartiality */ if ( rqd == svc->rqd ) { rqd_avgload = rqd->b_avgload - svc->avgload; } else if ( spin_trylock(&rqd->lock) ) { rqd_avgload = rqd->b_avgload; spin_unlock(&rqd->lock); } else continue; if ( rqd_avgload < min_avgload ) { min_avgload = rqd_avgload; min_rqi=i; } } /* We didn't find anyone (most likely because of spinlock contention); leave it where it is */ if ( min_rqi == -1 ) new_cpu = vc->processor; else { new_cpu = cpumask_cycle(vc->processor, &prv->rqd[min_rqi].active); BUG_ON(new_cpu >= nr_cpu_ids); } out_up: spin_unlock(&prv->lock); return new_cpu; } /* Working state of the load-balancing algorithm */ typedef struct { /* NB: Modified by consider() */ s_time_t load_delta; struct csched_vcpu * best_push_svc, *best_pull_svc; /* NB: Read by consider() */ struct csched_runqueue_data *lrqd; struct csched_runqueue_data *orqd; } balance_state_t; static void consider(balance_state_t *st, struct csched_vcpu *push_svc, struct csched_vcpu *pull_svc) { s_time_t l_load, o_load, delta; l_load = st->lrqd->b_avgload; o_load = st->orqd->b_avgload; if ( push_svc ) { /* What happens to the load on both if we push? */ l_load -= push_svc->avgload; o_load += push_svc->avgload; } if ( pull_svc ) { /* What happens to the load on both if we pull? */ l_load += pull_svc->avgload; o_load -= pull_svc->avgload; } delta = l_load - o_load; if ( delta < 0 ) delta = -delta; if ( delta < st->load_delta ) { st->load_delta = delta; st->best_push_svc=push_svc; st->best_pull_svc=pull_svc; } } void migrate(const struct scheduler *ops, struct csched_vcpu *svc, struct csched_runqueue_data *trqd, s_time_t now) { if ( test_bit(__CSFLAG_scheduled, &svc->flags) ) { d2printk("d%dv%d %d-%d a\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id, svc->rqd->id, trqd->id); /* It's running; mark it to migrate. 
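     * (The move itself is deferred: _VPF_migrating together with
     * CSFLAG_runq_migrate_request gets the vcpu descheduled, and
     * choose_cpu() then places it on a cpu of migrate_rqd.)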
*/ svc->migrate_rqd = trqd; set_bit(_VPF_migrating, &svc->vcpu->pause_flags); set_bit(__CSFLAG_runq_migrate_request, &svc->flags); } else { int on_runq=0; /* It's not running; just move it */ d2printk("d%dv%d %d-%d i\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id, svc->rqd->id, trqd->id); if ( __vcpu_on_runq(svc) ) { __runq_remove(svc); update_load(ops, svc->rqd, svc, -1, now); on_runq=1; } __runq_deassign(svc); svc->vcpu->processor = cpumask_any(&trqd->active); __runq_assign(svc, trqd); if ( on_runq ) { update_load(ops, svc->rqd, svc, 1, now); runq_insert(ops, svc->vcpu->processor, svc); runq_tickle(ops, svc->vcpu->processor, svc, now); } } } static void balance_load(const struct scheduler *ops, int cpu, s_time_t now) { struct csched_private *prv = CSCHED_PRIV(ops); int i, max_delta_rqi = -1; struct list_head *push_iter, *pull_iter; balance_state_t st = { .best_push_svc = NULL, .best_pull_svc = NULL }; /* * Basic algorithm: Push, pull, or swap. * - Find the runqueue with the furthest load distance * - Find a pair that makes the difference the least (where one * on either side may be empty). */ /* Locking: * - pcpu schedule lock should be already locked */ st.lrqd = RQD(ops, cpu); __update_runq_load(ops, st.lrqd, 0, now); retry: if ( !spin_trylock(&prv->lock) ) return; st.load_delta = 0; for_each_cpu(i, &prv->active_queues) { s_time_t delta; st.orqd = prv->rqd + i; if ( st.orqd == st.lrqd || !spin_trylock(&st.orqd->lock) ) continue; __update_runq_load(ops, st.orqd, 0, now); delta = st.lrqd->b_avgload - st.orqd->b_avgload; if ( delta < 0 ) delta = -delta; if ( delta > st.load_delta ) { st.load_delta = delta; max_delta_rqi = i; } spin_unlock(&st.orqd->lock); } /* Minimize holding the big lock */ spin_unlock(&prv->lock); if ( max_delta_rqi == -1 ) goto out; { s_time_t load_max; int cpus_max; load_max = st.lrqd->b_avgload; if ( st.orqd->b_avgload > load_max ) load_max = st.orqd->b_avgload; cpus_max = cpumask_weight(&st.lrqd->active); i = cpumask_weight(&st.orqd->active); if ( i > cpus_max ) cpus_max = i; /* If we're under 100% capacaty, only shift if load difference * is > 1. otherwise, shift if under 12.5% */ if ( load_max < (1ULL<<(prv->load_window_shift))*cpus_max ) { if ( st.load_delta < (1ULL<<(prv->load_window_shift+opt_underload_balance_tolerance) ) ) goto out; } else if ( st.load_delta < (1ULL<<(prv->load_window_shift+opt_overload_balance_tolerance)) ) goto out; } /* Try to grab the other runqueue lock; if it's been taken in the * meantime, try the process over again. This can't deadlock * because if it doesn't get any other rqd locks, it will simply * give up and return. */ st.orqd = prv->rqd + max_delta_rqi; if ( !spin_trylock(&st.orqd->lock) ) goto retry; /* Make sure the runqueue hasn't been deactivated since we released prv->lock */ if ( unlikely(st.orqd->id < 0) ) goto out_up; /* Look for "swap" which gives the best load average * FIXME: O(n^2)! */ /* Reuse load delta (as we're trying to minimize it) */ list_for_each( push_iter, &st.lrqd->svc ) { int inner_load_updated = 0; struct csched_vcpu * push_svc = list_entry(push_iter, struct csched_vcpu, rqd_elem); __update_svc_load(ops, push_svc, 0, now); /* Skip this one if it's already been flagged to migrate */ if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) ) continue; list_for_each( pull_iter, &st.orqd->svc ) { struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem); if ( ! 
inner_load_updated ) { __update_svc_load(ops, pull_svc, 0, now); } /* Skip this one if it's already been flagged to migrate */ if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) ) continue; consider(&st, push_svc, pull_svc); } inner_load_updated = 1; /* Consider push only */ consider(&st, push_svc, NULL); } list_for_each( pull_iter, &st.orqd->svc ) { struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem); /* Skip this one if it's already been flagged to migrate */ if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) ) continue; /* Consider pull only */ consider(&st, NULL, pull_svc); } /* OK, now we have some candidates; do the moving */ if ( st.best_push_svc ) migrate(ops, st.best_push_svc, st.orqd, now); if ( st.best_pull_svc ) migrate(ops, st.best_pull_svc, st.lrqd, now); out_up: spin_unlock(&st.orqd->lock); out: return; } static int csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc) { int new_cpu; new_cpu = choose_cpu(ops, vc); return new_cpu; } static void csched_vcpu_migrate( const struct scheduler *ops, struct vcpu *vc, unsigned int new_cpu) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); struct csched_runqueue_data *trqd; /* Check if new_cpu is valid */ BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED_PRIV(ops)->initialized)); trqd = RQD(ops, new_cpu); if ( trqd != svc->rqd ) migrate(ops, svc, trqd, NOW()); } static int csched_dom_cntl( const struct scheduler *ops, struct domain *d, struct xen_domctl_scheduler_op *op) { struct csched_dom * const sdom = CSCHED_DOM(d); struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; /* Must hold csched_priv lock to read and update sdom, * runq lock to update csvcs. */ spin_lock_irqsave(&prv->lock, flags); if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) { op->u.credit2.weight = sdom->weight; } else { ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo); if ( op->u.credit2.weight != 0 ) { struct list_head *iter; int old_weight; old_weight = sdom->weight; sdom->weight = op->u.credit2.weight; /* Update weights for vcpus, and max_weight for runqueues on which they reside */ list_for_each ( iter, &sdom->vcpu ) { struct csched_vcpu *svc = list_entry(iter, struct csched_vcpu, sdom_elem); /* NB: Locking order is important here. Because we grab this lock here, we * must never lock csched_priv.lock if we're holding a runqueue lock. * Also, calling vcpu_schedule_lock() is enough, since IRQs have already * been disabled. 
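             * (IRQs were turned off by the spin_lock_irqsave(&prv->lock, flags)
             * taken at the top of this function.)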
*/ spinlock_t *lock = vcpu_schedule_lock(svc->vcpu); BUG_ON(svc->rqd != RQD(ops, svc->vcpu->processor)); svc->weight = sdom->weight; update_max_weight(svc->rqd, svc->weight, old_weight); vcpu_schedule_unlock(lock, svc->vcpu); } } } spin_unlock_irqrestore(&prv->lock, flags); return 0; } static void * csched_alloc_domdata(const struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom; unsigned long flags; sdom = xzalloc(struct csched_dom); if ( sdom == NULL ) return NULL; /* Initialize credit and weight */ INIT_LIST_HEAD(&sdom->vcpu); INIT_LIST_HEAD(&sdom->sdom_elem); sdom->dom = dom; sdom->weight = CSCHED_DEFAULT_WEIGHT; sdom->nr_vcpus = 0; spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags); list_add_tail(&sdom->sdom_elem, &CSCHED_PRIV(ops)->sdom); spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags); return (void *)sdom; } static int csched_dom_init(const struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom; printk("%s: Initializing domain %d\n", __func__, dom->domain_id); if ( is_idle_domain(dom) ) return 0; sdom = csched_alloc_domdata(ops, dom); if ( sdom == NULL ) return -ENOMEM; dom->sched_priv = sdom; return 0; } static void csched_free_domdata(const struct scheduler *ops, void *data) { unsigned long flags; struct csched_dom *sdom = data; spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags); list_del_init(&sdom->sdom_elem); spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags); xfree(data); } static void csched_dom_destroy(const struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom = CSCHED_DOM(dom); BUG_ON(!list_empty(&sdom->vcpu)); csched_free_domdata(ops, CSCHED_DOM(dom)); } /* How long should we let this vcpu run for? */ static s_time_t csched_runtime(const struct scheduler *ops, int cpu, struct csched_vcpu *snext) { s_time_t time; int rt_credit; /* Proposed runtime measured in credits */ struct csched_runqueue_data *rqd = RQD(ops, cpu); struct list_head *runq = &rqd->runq; if ( is_idle_vcpu(snext->vcpu) ) return CSCHED_MAX_TIMER; /* General algorithm: * 1) Run until snext's credit will be 0 * 2) But if someone is waiting, run until snext's credit is equal * to his * 3) But never run longer than MAX_TIMER or shorter than MIN_TIMER. */ /* 1) Basic time: Run until credit is 0. */ rt_credit = snext->credit; /* 2) If there's someone waiting whose credit is positive, * run until your credit ~= his */ if ( ! list_empty(runq) ) { struct csched_vcpu *swait = __runq_elem(runq->next); if ( ! is_idle_vcpu(swait->vcpu) && swait->credit > 0 ) { rt_credit = snext->credit - swait->credit; } } /* The next guy may actually have a higher credit, if we've tried to * avoid migrating him from a different cpu. DTRT. */ if ( rt_credit <= 0 ) time = CSCHED_MIN_TIMER; else { /* FIXME: See if we can eliminate this conversion if we know time * will be outside (MIN,MAX). Probably requires pre-calculating * credit values of MIN,MAX per vcpu, since each vcpu burns credit * at a different rate. */ time = c2t(rqd, rt_credit, snext); /* Check limits */ if ( time < CSCHED_MIN_TIMER ) time = CSCHED_MIN_TIMER; else if ( time > CSCHED_MAX_TIMER ) time = CSCHED_MAX_TIMER; } return time; } void __dump_execstate(void *unused); /* * Find a candidate. 
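 *
 * Descriptive note (summarising the code below): the candidate defaults to
 * scurr if it is still runnable, or to this cpu's idle vcpu otherwise.  We
 * then look down the credit-ordered runqueue: an entry whose vcpu is
 * assigned to a different pcpu is skipped unless its credit beats the
 * current choice by at least CSCHED_MIGRATE_RESIST; the first entry that
 * is not skipped replaces the default only if it has more credit, and the
 * scan stops there.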
*/ static struct csched_vcpu * runq_candidate(struct csched_runqueue_data *rqd, struct csched_vcpu *scurr, int cpu, s_time_t now) { struct list_head *iter; struct csched_vcpu *snext = NULL; /* Default to current if runnable, idle otherwise */ if ( vcpu_runnable(scurr->vcpu) ) snext = scurr; else snext = CSCHED_VCPU(idle_vcpu[cpu]); list_for_each( iter, &rqd->runq ) { struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, runq_elem); /* If this is on a different processor, don't pull it unless * its credit is at least CSCHED_MIGRATE_RESIST higher. */ if ( svc->vcpu->processor != cpu && snext->credit + CSCHED_MIGRATE_RESIST > svc->credit ) continue; /* If the next one on the list has more credit than current * (or idle, if current is not runnable), choose it. */ if ( svc->credit > snext->credit ) snext = svc; /* In any case, if we got this far, break. */ break; } return snext; } /* * This function is in the critical path. It is designed to be simple and * fast for the common case. */ static struct task_slice csched_schedule( const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled) { const int cpu = smp_processor_id(); struct csched_runqueue_data *rqd; struct csched_vcpu * const scurr = CSCHED_VCPU(current); struct csched_vcpu *snext = NULL; struct task_slice ret; SCHED_STAT_CRANK(schedule); CSCHED_VCPU_CHECK(current); d2printk("sc p%d c d%dv%d now %"PRI_stime"\n", cpu, scurr->vcpu->domain->domain_id, scurr->vcpu->vcpu_id, now); BUG_ON(!cpumask_test_cpu(cpu, &CSCHED_PRIV(ops)->initialized)); rqd = RQD(ops, cpu); BUG_ON(!cpumask_test_cpu(cpu, &rqd->active)); /* Protected by runqueue lock */ /* DEBUG */ if ( !is_idle_vcpu(scurr->vcpu) && scurr->rqd != rqd) { int other_rqi = -1, this_rqi = c2r(ops, cpu); if ( scurr->rqd ) { int rq; other_rqi = -2; for_each_cpu ( rq, &CSCHED_PRIV(ops)->active_queues ) { if ( scurr->rqd == &CSCHED_PRIV(ops)->rqd[rq] ) { other_rqi = rq; break; } } } printk("%s: pcpu %d rq %d, but scurr d%dv%d assigned to " "pcpu %d rq %d!\n", __func__, cpu, this_rqi, scurr->vcpu->domain->domain_id, scurr->vcpu->vcpu_id, scurr->vcpu->processor, other_rqi); } BUG_ON(!is_idle_vcpu(scurr->vcpu) && scurr->rqd != rqd); /* Clear "tickled" bit now that we've been scheduled */ if ( cpumask_test_cpu(cpu, &rqd->tickled) ) cpumask_clear_cpu(cpu, &rqd->tickled); /* Update credits */ burn_credits(rqd, scurr, now); /* * Select next runnable local VCPU (ie top of local runq). * * If the current vcpu is runnable, and has higher credit than * the next guy on the queue (or there is noone else), we want to * run him again. * * If there's tasklet work to do, we want to chose the idle vcpu * for this processor, and mark the current for delayed runqueue * add. * * If the current vcpu is runnable, and there's another runnable * candidate, we want to mark current for delayed runqueue add, * and remove the next guy from the queue. * * If the current vcpu is not runnable, we want to chose the idle * vcpu for this processor. */ if ( tasklet_work_scheduled ) { trace_var(TRC_CSCHED2_SCHED_TASKLET, 0, 0, NULL); snext = CSCHED_VCPU(idle_vcpu[cpu]); } else snext=runq_candidate(rqd, scurr, cpu, now); /* If switching from a non-idle runnable vcpu, put it * back on the runqueue. 
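 *
 * (Descriptive note: only __CSFLAG_delayed_runq_add is set here; the
 * actual runqueue insertion is expected to happen later, once the vcpu's
 * context has been saved, via the context_saved hook listed in
 * sched_credit2_def.)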
*/ if ( snext != scurr && !is_idle_vcpu(scurr->vcpu) && vcpu_runnable(current) ) set_bit(__CSFLAG_delayed_runq_add, &scurr->flags); ret.migrated = 0; /* Accounting for non-idle tasks */ if ( !is_idle_vcpu(snext->vcpu) ) { /* If switching, remove this from the runqueue and mark it scheduled */ if ( snext != scurr ) { BUG_ON(snext->rqd != rqd); __runq_remove(snext); if ( snext->vcpu->is_running ) { printk("p%d: snext d%dv%d running on p%d! scurr d%dv%d\n", cpu, snext->vcpu->domain->domain_id, snext->vcpu->vcpu_id, snext->vcpu->processor, scurr->vcpu->domain->domain_id, scurr->vcpu->vcpu_id); BUG(); } set_bit(__CSFLAG_scheduled, &snext->flags); } /* Check for the reset condition */ if ( snext->credit <= CSCHED_CREDIT_RESET ) { reset_credit(ops, cpu, now, snext); balance_load(ops, cpu, now); } /* Clear the idle mask if necessary */ if ( cpumask_test_cpu(cpu, &rqd->idle) ) cpumask_clear_cpu(cpu, &rqd->idle); snext->start_time = now; /* Safe because lock for old processor is held */ if ( snext->vcpu->processor != cpu ) { snext->credit += CSCHED_MIGRATE_COMPENSATION; snext->vcpu->processor = cpu; ret.migrated = 1; } } else { /* Update the idle mask if necessary */ if ( !cpumask_test_cpu(cpu, &rqd->idle) ) cpumask_set_cpu(cpu, &rqd->idle); /* Make sure avgload gets updated periodically even * if there's no activity */ update_load(ops, rqd, NULL, 0, now); } /* * Return task to run next... */ ret.time = csched_runtime(ops, cpu, snext); ret.task = snext->vcpu; CSCHED_VCPU_CHECK(ret.task); return ret; } static void csched_dump_vcpu(struct csched_vcpu *svc) { printk("[%i.%i] flags=%x cpu=%i", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id, svc->flags, svc->vcpu->processor); printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight); printk("\n"); } static void csched_dump_pcpu(const struct scheduler *ops, int cpu) { struct list_head *runq, *iter; struct csched_vcpu *svc; int loop; char cpustr[100]; /* FIXME: Do locking properly for access to runqueue structures */ runq = &RQD(ops, cpu)->runq; cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu)); printk(" sibling=%s, ", cpustr); cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu)); printk("core=%s\n", cpustr); /* current VCPU */ svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr); if ( svc ) { printk("\trun: "); csched_dump_vcpu(svc); } loop = 0; list_for_each( iter, runq ) { svc = __runq_elem(iter); if ( svc ) { printk("\t%3d: ", ++loop); csched_dump_vcpu(svc); } } } static void csched_dump(const struct scheduler *ops) { struct list_head *iter_sdom, *iter_svc; struct csched_private *prv = CSCHED_PRIV(ops); int i, loop; printk("Active queues: %d\n" "\tdefault-weight = %d\n", cpumask_weight(&prv->active_queues), CSCHED_DEFAULT_WEIGHT); for_each_cpu(i, &prv->active_queues) { s_time_t fraction; fraction = prv->rqd[i].avgload * 100 / (1ULL<load_window_shift); printk("Runqueue %d:\n" "\tncpus = %u\n" "\tmax_weight = %d\n" "\tinstload = %d\n" "\taveload = %3"PRI_stime"\n", i, cpumask_weight(&prv->rqd[i].active), prv->rqd[i].max_weight, prv->rqd[i].load, fraction); } /* FIXME: Locking! 
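 *
 * (As far as this function goes, the walk of prv->sdom and of each
 * domain's vcpu list below is done without taking prv->lock or any
 * runqueue lock, which appears to be what this FIXME refers to.)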
*/ printk("Domain info:\n"); loop = 0; list_for_each( iter_sdom, &prv->sdom ) { struct csched_dom *sdom; sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem); printk("\tDomain: %d w %d v %d\n\t", sdom->dom->domain_id, sdom->weight, sdom->nr_vcpus); list_for_each( iter_svc, &sdom->vcpu ) { struct csched_vcpu *svc; svc = list_entry(iter_svc, struct csched_vcpu, sdom_elem); printk("\t%3d: ", ++loop); csched_dump_vcpu(svc); } } } static void activate_runqueue(struct csched_private *prv, int rqi) { struct csched_runqueue_data *rqd; rqd = prv->rqd + rqi; BUG_ON(!cpumask_empty(&rqd->active)); rqd->max_weight = 1; rqd->id = rqi; INIT_LIST_HEAD(&rqd->svc); INIT_LIST_HEAD(&rqd->runq); spin_lock_init(&rqd->lock); cpumask_set_cpu(rqi, &prv->active_queues); } static void deactivate_runqueue(struct csched_private *prv, int rqi) { struct csched_runqueue_data *rqd; rqd = prv->rqd + rqi; BUG_ON(!cpumask_empty(&rqd->active)); rqd->id = -1; cpumask_clear_cpu(rqi, &prv->active_queues); } static void init_pcpu(const struct scheduler *ops, int cpu) { int rqi; unsigned long flags; struct csched_private *prv = CSCHED_PRIV(ops); struct csched_runqueue_data *rqd; spinlock_t *old_lock; spin_lock_irqsave(&prv->lock, flags); if ( cpumask_test_cpu(cpu, &prv->initialized) ) { printk("%s: Strange, cpu %d already initialized!\n", __func__, cpu); spin_unlock_irqrestore(&prv->lock, flags); return; } /* Figure out which runqueue to put it in */ rqi = 0; /* Figure out which runqueue to put it in */ /* NB: cpu 0 doesn't get a STARTING callback, so we hard-code it to runqueue 0. */ if ( cpu == 0 ) rqi = 0; else rqi = cpu_to_socket(cpu); if ( rqi < 0 ) { printk("%s: cpu_to_socket(%d) returned %d!\n", __func__, cpu, rqi); BUG(); } rqd=prv->rqd + rqi; printk("Adding cpu %d to runqueue %d\n", cpu, rqi); if ( ! cpumask_test_cpu(rqi, &prv->active_queues) ) { printk(" First cpu on runqueue, activating\n"); activate_runqueue(prv, rqi); } /* IRQs already disabled */ old_lock=pcpu_schedule_lock(cpu); /* Move spinlock to new runq lock. */ per_cpu(schedule_data, cpu).schedule_lock = &rqd->lock; /* Set the runqueue map */ prv->runq_map[cpu]=rqi; cpumask_set_cpu(cpu, &rqd->idle); cpumask_set_cpu(cpu, &rqd->active); /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! 
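 *
 * (pcpu_schedule_unlock() would operate on the lock now pointed to by
 * per_cpu(schedule_data, cpu).schedule_lock, i.e. the new runqueue lock,
 * rather than the lock actually held; hence the pointer saved in old_lock
 * is unlocked directly.)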
*/ spin_unlock(old_lock); cpumask_set_cpu(cpu, &prv->initialized); spin_unlock_irqrestore(&prv->lock, flags); return; } static void * csched_alloc_pdata(const struct scheduler *ops, int cpu) { /* Check to see if the cpu is online yet */ /* Note: cpu 0 doesn't get a STARTING callback */ if ( cpu == 0 || cpu_to_socket(cpu) >= 0 ) init_pcpu(ops, cpu); else printk("%s: cpu %d not online yet, deferring initializatgion\n", __func__, cpu); return (void *)1; } static void csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) { unsigned long flags; struct csched_private *prv = CSCHED_PRIV(ops); struct csched_runqueue_data *rqd; struct schedule_data *sd = &per_cpu(schedule_data, cpu); int rqi; spin_lock_irqsave(&prv->lock, flags); BUG_ON(!cpumask_test_cpu(cpu, &prv->initialized)); /* Find the old runqueue and remove this cpu from it */ rqi = prv->runq_map[cpu]; rqd = prv->rqd + rqi; /* No need to save IRQs here, they're already disabled */ spin_lock(&rqd->lock); BUG_ON(!cpumask_test_cpu(cpu, &rqd->idle)); printk("Removing cpu %d from runqueue %d\n", cpu, rqi); cpumask_clear_cpu(cpu, &rqd->idle); cpumask_clear_cpu(cpu, &rqd->active); if ( cpumask_empty(&rqd->active) ) { printk(" No cpus left on runqueue, disabling\n"); deactivate_runqueue(prv, rqi); } /* Move spinlock to the original lock. */ ASSERT(sd->schedule_lock == &rqd->lock); ASSERT(!spin_is_locked(&sd->_lock)); sd->schedule_lock = &sd->_lock; spin_unlock(&rqd->lock); cpumask_clear_cpu(cpu, &prv->initialized); spin_unlock_irqrestore(&prv->lock, flags); return; } static int csched_cpu_starting(int cpu) { struct scheduler *ops; /* Hope this is safe from cpupools switching things around. :-) */ ops = per_cpu(scheduler, cpu); if ( ops->alloc_pdata == csched_alloc_pdata ) init_pcpu(ops, cpu); return NOTIFY_DONE; } static int cpu_credit2_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; switch ( action ) { case CPU_STARTING: csched_cpu_starting(cpu); break; default: break; } return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_credit2_nfb = { .notifier_call = cpu_credit2_callback }; static int csched_global_init(void) { register_cpu_notifier(&cpu_credit2_nfb); return 0; } static int csched_init(struct scheduler *ops) { int i; struct csched_private *prv; printk("Initializing Credit2 scheduler\n" \ " WARNING: This is experimental software in development.\n" \ " Use at your own risk.\n"); printk(" load_window_shift: %d\n", opt_load_window_shift); printk(" underload_balance_tolerance: %d\n", opt_underload_balance_tolerance); printk(" overload_balance_tolerance: %d\n", opt_overload_balance_tolerance); if ( opt_load_window_shift < LOADAVG_WINDOW_SHIFT_MIN ) { printk("%s: opt_load_window_shift %d below min %d, resetting\n", __func__, opt_load_window_shift, LOADAVG_WINDOW_SHIFT_MIN); opt_load_window_shift = LOADAVG_WINDOW_SHIFT_MIN; } /* Basically no CPU information is available at this point; just * set up basic structures, and a callback when the CPU info is * available. 
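 *
 * (Per-cpu setup happens later: csched_alloc_pdata() calls init_pcpu()
 * for cpus whose socket is already known (and for cpu 0), and the
 * CPU_STARTING notifier registered by csched_global_init() takes care of
 * the rest.)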
*/ prv = xzalloc(struct csched_private); if ( prv == NULL ) return -ENOMEM; ops->sched_data = prv; spin_lock_init(&prv->lock); INIT_LIST_HEAD(&prv->sdom); /* But un-initialize all runqueues */ for ( i = 0; i < nr_cpu_ids; i++ ) { prv->runq_map[i] = -1; prv->rqd[i].id = -1; } prv->load_window_shift = opt_load_window_shift; return 0; } static void csched_deinit(const struct scheduler *ops) { struct csched_private *prv; prv = CSCHED_PRIV(ops); if ( prv != NULL ) xfree(prv); } static struct csched_private _csched_priv; const struct scheduler sched_credit2_def = { .name = "SMP Credit Scheduler rev2", .opt_name = "credit2", .sched_id = XEN_SCHEDULER_CREDIT2, .sched_data = &_csched_priv, .init_domain = csched_dom_init, .destroy_domain = csched_dom_destroy, .insert_vcpu = csched_vcpu_insert, .remove_vcpu = csched_vcpu_remove, .sleep = csched_vcpu_sleep, .wake = csched_vcpu_wake, .adjust = csched_dom_cntl, .pick_cpu = csched_cpu_pick, .migrate = csched_vcpu_migrate, .do_schedule = csched_schedule, .context_saved = csched_context_saved, .dump_cpu_state = csched_dump_pcpu, .dump_settings = csched_dump, .global_init = csched_global_init, .init = csched_init, .deinit = csched_deinit, .alloc_vdata = csched_alloc_vdata, .free_vdata = csched_free_vdata, .alloc_pdata = csched_alloc_pdata, .free_pdata = csched_free_pdata, .alloc_domdata = csched_alloc_domdata, .free_domdata = csched_free_domdata, }; xen-4.4.0/xen/common/radix-tree.c0000664000175000017500000004723112307313555014744 0ustar smbsmb/* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include #include #include struct radix_tree_path { struct radix_tree_node *node; int offset; }; #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) /* * The height_to_maxindex array needs to be one deeper than the maximum * path as height 0 holds only 1 entry. */ static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; static inline void *ptr_to_indirect(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); } static inline void *indirect_to_ptr(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); } struct rcu_node { struct radix_tree_node node; struct rcu_head rcu_head; }; static struct radix_tree_node *rcu_node_alloc(void *arg) { struct rcu_node *rcu_node = xmalloc(struct rcu_node); return rcu_node ? 
&rcu_node->node : NULL; } static void _rcu_node_free(struct rcu_head *head) { struct rcu_node *rcu_node = container_of(head, struct rcu_node, rcu_head); xfree(rcu_node); } static void rcu_node_free(struct radix_tree_node *node, void *arg) { struct rcu_node *rcu_node = container_of(node, struct rcu_node, node); call_rcu(&rcu_node->rcu_head, _rcu_node_free); } static struct radix_tree_node *radix_tree_node_alloc( struct radix_tree_root *root) { struct radix_tree_node *ret; ret = root->node_alloc(root->node_alloc_free_arg); if (ret) memset(ret, 0, sizeof(*ret)); return ret; } static void radix_tree_node_free( struct radix_tree_root *root, struct radix_tree_node *node) { root->node_free(node, root->node_alloc_free_arg); } /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. */ static inline unsigned long radix_tree_maxindex(unsigned int height) { return height_to_maxindex[height]; } /* * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) { struct radix_tree_node *node; unsigned int height; /* Figure out what the height should be. */ height = root->height + 1; while (index > radix_tree_maxindex(height)) height++; if (root->rnode == NULL) { root->height = height; goto out; } do { unsigned int newheight; if (!(node = radix_tree_node_alloc(root))) return -ENOMEM; /* Increase the height. */ node->slots[0] = indirect_to_ptr(root->rnode); newheight = root->height+1; node->height = newheight; node->count = 1; node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); root->height = newheight; } while (height > root->height); out: return 0; } /** * radix_tree_insert - insert into a radix tree * @root: radix tree root * @index: index key * @item: item to insert * * Insert an item into the radix tree at position @index. */ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node = NULL, *slot; unsigned int height, shift; int offset; int error; BUG_ON(radix_tree_is_indirect_ptr(item)); /* Make sure the tree is high enough. */ if (index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); if (error) return error; } slot = indirect_to_ptr(root->rnode); height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT; offset = 0; /* uninitialised var warning */ while (height > 0) { if (slot == NULL) { /* Have to add a child node. */ if (!(slot = radix_tree_node_alloc(root))) return -ENOMEM; slot->height = height; if (node) { rcu_assign_pointer(node->slots[offset], slot); node->count++; } else rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); } /* Go a level down */ offset = (index >> shift) & RADIX_TREE_MAP_MASK; node = slot; slot = node->slots[offset]; shift -= RADIX_TREE_MAP_SHIFT; height--; } if (slot != NULL) return -EEXIST; if (node) { node->count++; rcu_assign_pointer(node->slots[offset], item); } else { rcu_assign_pointer(root->rnode, item); } return 0; } EXPORT_SYMBOL(radix_tree_insert); /* * is_slot == 1 : search for the slot. * is_slot == 0 : search for the node. */ static void *radix_tree_lookup_element(struct radix_tree_root *root, unsigned long index, int is_slot) { unsigned int height, shift; struct radix_tree_node *node, **slot; node = rcu_dereference(root->rnode); if (node == NULL) return NULL; if (!radix_tree_is_indirect_ptr(node)) { if (index > 0) return NULL; return is_slot ? 
(void *)&root->rnode : node; } node = indirect_to_ptr(node); height = node->height; if (index > radix_tree_maxindex(height)) return NULL; shift = (height-1) * RADIX_TREE_MAP_SHIFT; do { slot = (struct radix_tree_node **) (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); node = rcu_dereference(*slot); if (node == NULL) return NULL; shift -= RADIX_TREE_MAP_SHIFT; height--; } while (height > 0); return is_slot ? (void *)slot : indirect_to_ptr(node); } /** * radix_tree_lookup_slot - lookup a slot in a radix tree * @root: radix tree root * @index: index key * * Returns: the slot corresponding to the position @index in the * radix tree @root. This is useful for update-if-exists operations. * * This function can be called under rcu_read_lock iff the slot is not * modified by radix_tree_replace_slot, otherwise it must be called * exclusive from other writers. Any dereference of the slot must be done * using radix_tree_deref_slot. */ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) { return (void **)radix_tree_lookup_element(root, index, 1); } EXPORT_SYMBOL(radix_tree_lookup_slot); /** * radix_tree_lookup - perform lookup operation on a radix tree * @root: radix tree root * @index: index key * * Lookup the item at the position @index in the radix tree @root. * * This function can be called under rcu_read_lock, however the caller * must manage lifetimes of leaf nodes (eg. RCU may also be used to free * them safely). No RCU barriers are required to access or modify the * returned item, however. */ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) { return radix_tree_lookup_element(root, index, 0); } EXPORT_SYMBOL(radix_tree_lookup); /** * radix_tree_next_hole - find the next hole (not-present entry) * @root: tree root * @index: index key * @max_scan: maximum range to search * * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest * indexed hole. * * Returns: the index of the hole if found, otherwise returns an index * outside of the set specified (in which case 'return - index >= max_scan' * will be true). In rare cases of index wrap-around, 0 will be returned. * * radix_tree_next_hole may be called under rcu_read_lock. However, like * radix_tree_gang_lookup, this will not atomically search a snapshot of * the tree at a single point in time. For example, if a hole is created * at index 5, then subsequently a hole is created at index 10, * radix_tree_next_hole covering both indexes may return 10 if called * under rcu_read_lock. */ unsigned long radix_tree_next_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan) { unsigned long i; for (i = 0; i < max_scan; i++) { if (!radix_tree_lookup(root, index)) break; index++; if (index == 0) break; } return index; } EXPORT_SYMBOL(radix_tree_next_hole); /** * radix_tree_prev_hole - find the prev hole (not-present entry) * @root: tree root * @index: index key * @max_scan: maximum range to search * * Search backwards in the range [max(index-max_scan+1, 0), index] * for the first hole. * * Returns: the index of the hole if found, otherwise returns an index * outside of the set specified (in which case 'index - return >= max_scan' * will be true). In rare cases of wrap-around, ULONG_MAX will be returned. * * radix_tree_next_hole may be called under rcu_read_lock. However, like * radix_tree_gang_lookup, this will not atomically search a snapshot of * the tree at a single point in time. 
For example, if a hole is created * at index 10, then subsequently a hole is created at index 5, * radix_tree_prev_hole covering both indexes may return 5 if called under * rcu_read_lock. */ unsigned long radix_tree_prev_hole(struct radix_tree_root *root, unsigned long index, unsigned long max_scan) { unsigned long i; for (i = 0; i < max_scan; i++) { if (!radix_tree_lookup(root, index)) break; index--; if (index == ULONG_MAX) break; } return index; } EXPORT_SYMBOL(radix_tree_prev_hole); static unsigned int __lookup(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index) { unsigned int nr_found = 0; unsigned int shift, height; unsigned long i; height = slot->height; if (height == 0) goto out; shift = (height-1) * RADIX_TREE_MAP_SHIFT; for ( ; height > 1; height--) { i = (index >> shift) & RADIX_TREE_MAP_MASK; for (;;) { if (slot->slots[i] != NULL) break; index &= ~((1UL << shift) - 1); index += 1UL << shift; if (index == 0) goto out; /* 32-bit wraparound */ i++; if (i == RADIX_TREE_MAP_SIZE) goto out; } shift -= RADIX_TREE_MAP_SHIFT; slot = rcu_dereference(slot->slots[i]); if (slot == NULL) goto out; } /* Bottom level: grab some items */ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { index++; if (slot->slots[i]) { results[nr_found++] = &(slot->slots[i]); if (nr_found == max_items) goto out; } } out: *next_index = index; return nr_found; } /** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * * Performs an index-ascending scan of the tree for present items. Places * them at *@results and returns the number of items which were placed at * *@results. * * The implementation is naive. * * Like radix_tree_lookup, radix_tree_gang_lookup may be called under * rcu_read_lock. In this case, rather than the returned results being * an atomic snapshot of the tree at a single point in time, the semantics * of an RCU protected gang lookup are as though multiple radix_tree_lookups * have been issued in individual locks, and results stored in 'results'. 
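 *
 * Minimal usage sketch (illustrative only: 'my_tree' is a hypothetical
 * radix_tree_root populated elsewhere, and the batch size of 16 is
 * arbitrary):
 *
 *	void *items[16];
 *	unsigned int n;
 *
 *	n = radix_tree_gang_lookup(&my_tree, items, 0, 16);
 *
 * Up to 16 present items with index >= 0 are then held in items[0..n-1],
 * in ascending index order.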
*/ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { unsigned long max_index; struct radix_tree_node *node; unsigned long cur_index = first_index; unsigned int ret; node = rcu_dereference(root->rnode); if (!node) return 0; if (!radix_tree_is_indirect_ptr(node)) { if (first_index > 0) return 0; results[0] = node; return 1; } node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); ret = 0; while (ret < max_items) { unsigned int nr_found, slots_found, i; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; slots_found = __lookup(node, (void ***)results + ret, cur_index, max_items - ret, &next_index); nr_found = 0; for (i = 0; i < slots_found; i++) { struct radix_tree_node *slot; slot = *(((void ***)results)[ret + i]); if (!slot) continue; results[ret + nr_found] = indirect_to_ptr(rcu_dereference(slot)); nr_found++; } ret += nr_found; if (next_index == 0) break; cur_index = next_index; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); /** * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * * Performs an index-ascending scan of the tree for present items. Places * their slots at *@results and returns the number of items which were * placed at *@results. * * The implementation is naive. * * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must * be dereferenced with radix_tree_deref_slot, and if using only RCU * protection, radix_tree_deref_slot may fail requiring a retry. */ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long first_index, unsigned int max_items) { unsigned long max_index; struct radix_tree_node *node; unsigned long cur_index = first_index; unsigned int ret; node = rcu_dereference(root->rnode); if (!node) return 0; if (!radix_tree_is_indirect_ptr(node)) { if (first_index > 0) return 0; results[0] = (void **)&root->rnode; return 1; } node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); ret = 0; while (ret < max_items) { unsigned int slots_found; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; slots_found = __lookup(node, results + ret, cur_index, max_items - ret, &next_index); ret += slots_found; if (next_index == 0) break; cur_index = next_index; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_slot); /** * radix_tree_shrink - shrink height of a radix tree to minimal * @root radix tree root */ static inline void radix_tree_shrink(struct radix_tree_root *root) { /* try to shrink tree height */ while (root->height > 0) { struct radix_tree_node *to_free = root->rnode; void *newptr; BUG_ON(!radix_tree_is_indirect_ptr(to_free)); to_free = indirect_to_ptr(to_free); /* * The candidate node has more than one child, or its child * is not at the leftmost slot, we cannot shrink. */ if (to_free->count != 1) break; if (!to_free->slots[0]) break; /* * We don't need rcu_assign_pointer(), since we are simply * moving the node from one part of the tree to another: if it * was safe to dereference the old pointer to it * (to_free->slots[0]), it will be safe to dereference the new * one (root->rnode) as far as dependent read barriers go. 
*/ newptr = to_free->slots[0]; if (root->height > 1) newptr = ptr_to_indirect(newptr); root->rnode = newptr; root->height--; /* * We have a dilemma here. The node's slot[0] must not be * NULLed in case there are concurrent lookups expecting to * find the item. However if this was a bottom-level node, * then it may be subject to the slot pointer being visible * to callers dereferencing it. If item corresponding to * slot[0] is subsequently deleted, these callers would expect * their slot to become empty sooner or later. * * For example, lockless pagecache will look up a slot, deref * the page pointer, and if the page is 0 refcount it means it * was concurrently deleted from pagecache so try the deref * again. Fortunately there is already a requirement for logic * to retry the entire slot lookup -- the indirect pointer * problem (replacing direct root node with an indirect pointer * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ if (root->height == 0) *((unsigned long *)&to_free->slots[0]) |= RADIX_TREE_INDIRECT_PTR; radix_tree_node_free(root, to_free); } } /** * radix_tree_delete - delete an item from a radix tree * @root: radix tree root * @index: index key * * Remove the item at @index from the radix tree rooted at @root. * * Returns the address of the deleted item, or NULL if it was not present. */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { /* * The radix tree path needs to be one longer than the maximum path * since the "list" is null terminated. */ struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path; struct radix_tree_node *slot = NULL; struct radix_tree_node *to_free; unsigned int height, shift; int offset; height = root->height; if (index > radix_tree_maxindex(height)) goto out; slot = root->rnode; if (height == 0) { root->rnode = NULL; goto out; } slot = indirect_to_ptr(slot); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; do { if (slot == NULL) goto out; pathp++; offset = (index >> shift) & RADIX_TREE_MAP_MASK; pathp->offset = offset; pathp->node = slot; slot = slot->slots[offset]; shift -= RADIX_TREE_MAP_SHIFT; height--; } while (height > 0); if (slot == NULL) goto out; to_free = NULL; /* Now free the nodes we do not need anymore */ while (pathp->node) { pathp->node->slots[pathp->offset] = NULL; pathp->node->count--; /* * Queue the node for deferred freeing after the * last reference to it disappears (set NULL, above). 
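 *
 * (With the default callbacks installed by radix_tree_init(), the free is
 * itself deferred through call_rcu() in rcu_node_free(), so lockless
 * readers that still hold a pointer into the node can finish first.)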
*/ if (to_free) radix_tree_node_free(root, to_free); if (pathp->node->count) { if (pathp->node == indirect_to_ptr(root->rnode)) radix_tree_shrink(root); goto out; } /* Node with zero slots in use so free it */ to_free = pathp->node; pathp--; } root->height = 0; root->rnode = NULL; if (to_free) radix_tree_node_free(root, to_free); out: return slot; } EXPORT_SYMBOL(radix_tree_delete); static void radix_tree_node_destroy( struct radix_tree_root *root, struct radix_tree_node *node, void (*slot_free)(void *)) { int i; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { struct radix_tree_node *slot = node->slots[i]; BUG_ON(radix_tree_is_indirect_ptr(slot)); if (slot == NULL) continue; if (node->height == 1) { if (slot_free) slot_free(slot); } else { radix_tree_node_destroy(root, slot, slot_free); } } radix_tree_node_free(root, node); } void radix_tree_destroy( struct radix_tree_root *root, void (*slot_free)(void *)) { struct radix_tree_node *node = root->rnode; if (node == NULL) return; if (!radix_tree_is_indirect_ptr(node)) { if (slot_free) slot_free(node); } else { node = indirect_to_ptr(node); radix_tree_node_destroy(root, node, slot_free); } radix_tree_init(root); } void radix_tree_init(struct radix_tree_root *root) { memset(root, 0, sizeof(*root)); root->node_alloc = rcu_node_alloc; root->node_free = rcu_node_free; } void radix_tree_set_alloc_callbacks( struct radix_tree_root *root, radix_tree_alloc_fn_t *node_alloc, radix_tree_free_fn_t *node_free, void *node_alloc_free_arg) { root->node_alloc = node_alloc; root->node_free = node_free; root->node_alloc_free_arg = node_alloc_free_arg; } static __init unsigned long __maxindex(unsigned int height) { unsigned int width = height * RADIX_TREE_MAP_SHIFT; int shift = RADIX_TREE_INDEX_BITS - width; if (shift < 0) return ~0UL; if (shift >= BITS_PER_LONG) return 0UL; return ~0UL >> shift; } static __init int radix_tree_init_maxindex(void) { unsigned int i; for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) height_to_maxindex[i] = __maxindex(i); return 0; } /* pre-SMP just so it runs before 'normal' initcalls */ presmp_initcall(radix_tree_init_maxindex); xen-4.4.0/xen/common/keyhandler.c0000664000175000017500000004062612307313555015027 0ustar smbsmb/****************************************************************************** * keyhandler.c */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct keyhandler *key_table[256]; static unsigned char keypress_key; static bool_t alt_key_handling; char keyhandler_scratch[1024]; static void keypress_action(unsigned long unused) { handle_keypress(keypress_key, NULL); } static DECLARE_TASKLET(keypress_tasklet, keypress_action, 0); void handle_keypress(unsigned char key, struct cpu_user_regs *regs) { struct keyhandler *h; if ( (h = key_table[key]) == NULL ) return; if ( !in_irq() || h->irq_callback ) { console_start_log_everything(); h->irq_callback ? (*h->u.irq_fn)(key, regs) : (*h->u.fn)(key); console_end_log_everything(); } else { keypress_key = key; tasklet_schedule(&keypress_tasklet); } } void register_keyhandler(unsigned char key, struct keyhandler *handler) { ASSERT(key_table[key] == NULL); key_table[key] = handler; } static void show_handlers(unsigned char key) { int i; printk("'%c' pressed -> showing installed handlers\n", key); for ( i = 0; i < ARRAY_SIZE(key_table); i++ ) if ( key_table[i] != NULL ) printk(" key '%c' (ascii '%02x') => %s\n", isprint(i) ? 
i : ' ', i, key_table[i]->desc); } static struct keyhandler show_handlers_keyhandler = { .u.fn = show_handlers, .desc = "show this message" }; static cpumask_t dump_execstate_mask; void dump_execstate(struct cpu_user_regs *regs) { unsigned int cpu = smp_processor_id(); if ( !guest_mode(regs) ) { printk("*** Dumping CPU%u host state: ***\n", cpu); show_execution_state(regs); } if ( !is_idle_vcpu(current) ) { printk("*** Dumping CPU%u guest state (d%d:v%d): ***\n", smp_processor_id(), current->domain->domain_id, current->vcpu_id); show_execution_state(guest_cpu_user_regs()); printk("\n"); } cpumask_clear_cpu(cpu, &dump_execstate_mask); if ( !alt_key_handling ) return; cpu = cpumask_cycle(cpu, &dump_execstate_mask); if ( cpu < nr_cpu_ids ) { smp_send_state_dump(cpu); return; } console_end_sync(); watchdog_enable(); } static void dump_registers(unsigned char key, struct cpu_user_regs *regs) { unsigned int cpu; /* We want to get everything out that we possibly can. */ watchdog_disable(); console_start_sync(); printk("'%c' pressed -> dumping registers\n\n", key); cpumask_copy(&dump_execstate_mask, &cpu_online_map); /* Get local execution state out immediately, in case we get stuck. */ dump_execstate(regs); /* Alt. handling: remaining CPUs are dumped asynchronously one-by-one. */ if ( alt_key_handling ) return; /* Normal handling: synchronously dump the remaining CPUs' states. */ for_each_cpu ( cpu, &dump_execstate_mask ) { smp_send_state_dump(cpu); while ( cpumask_test_cpu(cpu, &dump_execstate_mask) ) cpu_relax(); } console_end_sync(); watchdog_enable(); } static struct keyhandler dump_registers_keyhandler = { .irq_callback = 1, .diagnostic = 1, .u.irq_fn = dump_registers, .desc = "dump registers" }; static DECLARE_TASKLET(dump_dom0_tasklet, NULL, 0); static void dump_dom0_action(unsigned long arg) { struct vcpu *v = (void *)arg; for ( ; ; ) { vcpu_show_execution_state(v); if ( (v = v->next_in_list) == NULL ) break; if ( softirq_pending(smp_processor_id()) ) { dump_dom0_tasklet.data = (unsigned long)v; tasklet_schedule_on_cpu(&dump_dom0_tasklet, v->processor); break; } } } static void dump_dom0_registers(unsigned char key) { struct vcpu *v; if ( dom0 == NULL ) return; printk("'%c' pressed -> dumping Dom0's registers\n", key); for_each_vcpu ( dom0, v ) { if ( alt_key_handling && softirq_pending(smp_processor_id()) ) { tasklet_kill(&dump_dom0_tasklet); tasklet_init(&dump_dom0_tasklet, dump_dom0_action, (unsigned long)v); tasklet_schedule_on_cpu(&dump_dom0_tasklet, v->processor); return; } vcpu_show_execution_state(v); } } static struct keyhandler dump_dom0_registers_keyhandler = { .diagnostic = 1, .u.fn = dump_dom0_registers, .desc = "dump Dom0 registers" }; static void reboot_machine(unsigned char key, struct cpu_user_regs *regs) { printk("'%c' pressed -> rebooting machine\n", key); machine_restart(0); } static struct keyhandler reboot_machine_keyhandler = { .irq_callback = 1, .u.irq_fn = reboot_machine, .desc = "reboot machine" }; static void cpuset_print(char *set, int size, const cpumask_t *mask) { *set++ = '{'; set += cpulist_scnprintf(set, size-2, mask); *set++ = '}'; *set++ = '\0'; } static void nodeset_print(char *set, int size, const nodemask_t *mask) { *set++ = '['; set += nodelist_scnprintf(set, size-2, mask); *set++ = ']'; *set++ = '\0'; } static void periodic_timer_print(char *str, int size, uint64_t period) { if ( period == 0 ) { strlcpy(str, "No periodic timer", size); return; } snprintf(str, size, "%u Hz periodic timer (period %u ms)", 1000000000/(int)period, (int)period/1000000); 
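    /*
     * Note: 'period' is expressed in nanoseconds, hence the conversions
     * above: 1000000000/period yields the frequency in Hz and
     * period/1000000 the period in milliseconds.
     */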
} static void dump_domains(unsigned char key) { struct domain *d; struct vcpu *v; s_time_t now = NOW(); #define tmpstr keyhandler_scratch printk("'%c' pressed -> dumping domain info (now=0x%X:%08X)\n", key, (u32)(now>>32), (u32)now); rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) { unsigned int i; printk("General information for domain %u:\n", d->domain_id); cpuset_print(tmpstr, sizeof(tmpstr), d->domain_dirty_cpumask); printk(" refcnt=%d dying=%d pause_count=%d\n", atomic_read(&d->refcnt), d->is_dying, atomic_read(&d->pause_count)); printk(" nr_pages=%d xenheap_pages=%d shared_pages=%u paged_pages=%u " "dirty_cpus=%s max_pages=%u\n", d->tot_pages, d->xenheap_pages, atomic_read(&d->shr_pages), atomic_read(&d->paged_pages), tmpstr, d->max_pages); printk(" handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-" "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n", d->handle[ 0], d->handle[ 1], d->handle[ 2], d->handle[ 3], d->handle[ 4], d->handle[ 5], d->handle[ 6], d->handle[ 7], d->handle[ 8], d->handle[ 9], d->handle[10], d->handle[11], d->handle[12], d->handle[13], d->handle[14], d->handle[15], d->vm_assist); for ( i = 0 ; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ ) if ( test_bit(i, &d->watchdog_inuse_map) ) printk(" watchdog %d expires in %d seconds\n", i, (u32)((d->watchdog_timer[i].expires - NOW()) >> 30)); arch_dump_domain_info(d); rangeset_domain_printk(d); dump_pageframe_info(d); nodeset_print(tmpstr, sizeof(tmpstr), &d->node_affinity); printk("NODE affinity for domain %d: %s\n", d->domain_id, tmpstr); printk("VCPU information and callbacks for domain %u:\n", d->domain_id); for_each_vcpu ( d, v ) { printk(" VCPU%d: CPU%d [has=%c] poll=%d " "upcall_pend = %02x, upcall_mask = %02x ", v->vcpu_id, v->processor, v->is_running ? 'T':'F', v->poll_evtchn, vcpu_info(v, evtchn_upcall_pending), !vcpu_event_delivery_is_enabled(v)); cpuset_print(tmpstr, sizeof(tmpstr), v->vcpu_dirty_cpumask); printk("dirty_cpus=%s ", tmpstr); cpuset_print(tmpstr, sizeof(tmpstr), v->cpu_affinity); printk("cpu_affinity=%s\n", tmpstr); printk(" pause_count=%d pause_flags=%lx\n", atomic_read(&v->pause_count), v->pause_flags); arch_dump_vcpu_info(v); periodic_timer_print(tmpstr, sizeof(tmpstr), v->periodic_period); printk(" %s\n", tmpstr); } } for_each_domain ( d ) { for_each_vcpu ( d, v ) { printk("Notifying guest %d:%d (virq %d, port %d)\n", d->domain_id, v->vcpu_id, VIRQ_DEBUG, v->virq_to_evtchn[VIRQ_DEBUG]); send_guest_vcpu_virq(v, VIRQ_DEBUG); } } arch_dump_shared_mem_info(); rcu_read_unlock(&domlist_read_lock); #undef tmpstr } static struct keyhandler dump_domains_keyhandler = { .diagnostic = 1, .u.fn = dump_domains, .desc = "dump domain (and guest debug) info" }; static cpumask_t read_clocks_cpumask; static DEFINE_PER_CPU(s_time_t, read_clocks_time); static DEFINE_PER_CPU(u64, read_cycles_time); static void read_clocks_slave(void *unused) { unsigned int cpu = smp_processor_id(); local_irq_disable(); while ( !cpumask_test_cpu(cpu, &read_clocks_cpumask) ) cpu_relax(); per_cpu(read_clocks_time, cpu) = NOW(); per_cpu(read_cycles_time, cpu) = get_cycles(); cpumask_clear_cpu(cpu, &read_clocks_cpumask); local_irq_enable(); } static void read_clocks(unsigned char key) { unsigned int cpu = smp_processor_id(), min_stime_cpu, max_stime_cpu; unsigned int min_cycles_cpu, max_cycles_cpu; u64 min_stime, max_stime, dif_stime; u64 min_cycles, max_cycles, dif_cycles; static u64 sumdif_stime = 0, maxdif_stime = 0; static u64 sumdif_cycles = 0, maxdif_cycles = 0; static u32 count = 0; static DEFINE_SPINLOCK(lock); spin_lock(&lock); 
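    /*
     * Descriptive note: the static 'lock' serialises invocations of this
     * handler.  read_clocks_slave() spins, with IRQs disabled, until this
     * CPU publishes read_clocks_cpumask below; every CPU then samples
     * NOW() and get_cycles() at roughly the same moment and clears its own
     * bit, after which this function computes the min/max skew statistics.
     */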
smp_call_function(read_clocks_slave, NULL, 0); local_irq_disable(); cpumask_andnot(&read_clocks_cpumask, &cpu_online_map, cpumask_of(cpu)); per_cpu(read_clocks_time, cpu) = NOW(); per_cpu(read_cycles_time, cpu) = get_cycles(); local_irq_enable(); while ( !cpumask_empty(&read_clocks_cpumask) ) cpu_relax(); min_stime_cpu = max_stime_cpu = min_cycles_cpu = max_cycles_cpu = cpu; for_each_online_cpu ( cpu ) { if ( per_cpu(read_clocks_time, cpu) < per_cpu(read_clocks_time, min_stime_cpu) ) min_stime_cpu = cpu; if ( per_cpu(read_clocks_time, cpu) > per_cpu(read_clocks_time, max_stime_cpu) ) max_stime_cpu = cpu; if ( per_cpu(read_cycles_time, cpu) < per_cpu(read_cycles_time, min_cycles_cpu) ) min_cycles_cpu = cpu; if ( per_cpu(read_cycles_time, cpu) > per_cpu(read_cycles_time, max_cycles_cpu) ) max_cycles_cpu = cpu; } min_stime = per_cpu(read_clocks_time, min_stime_cpu); max_stime = per_cpu(read_clocks_time, max_stime_cpu); min_cycles = per_cpu(read_cycles_time, min_cycles_cpu); max_cycles = per_cpu(read_cycles_time, max_cycles_cpu); spin_unlock(&lock); dif_stime = max_stime - min_stime; if ( dif_stime > maxdif_stime ) maxdif_stime = dif_stime; sumdif_stime += dif_stime; dif_cycles = max_cycles - min_cycles; if ( dif_cycles > maxdif_cycles ) maxdif_cycles = dif_cycles; sumdif_cycles += dif_cycles; count++; printk("Synced stime skew: max=%"PRIu64"ns avg=%"PRIu64"ns " "samples=%"PRIu32" current=%"PRIu64"ns\n", maxdif_stime, sumdif_stime/count, count, dif_stime); printk("Synced cycles skew: max=%"PRIu64" avg=%"PRIu64" " "samples=%"PRIu32" current=%"PRIu64"\n", maxdif_cycles, sumdif_cycles/count, count, dif_cycles); } static struct keyhandler read_clocks_keyhandler = { .diagnostic = 1, .u.fn = read_clocks, .desc = "display multi-cpu clock info" }; static struct keyhandler dump_runq_keyhandler = { .diagnostic = 1, .u.fn = dump_runq, .desc = "dump run queues" }; #ifdef PERF_COUNTERS static struct keyhandler perfc_printall_keyhandler = { .diagnostic = 1, .u.fn = perfc_printall, .desc = "print performance counters" }; static struct keyhandler perfc_reset_keyhandler = { .u.fn = perfc_reset, .desc = "reset performance counters" }; #endif #ifdef LOCK_PROFILE static struct keyhandler spinlock_printall_keyhandler = { .diagnostic = 1, .u.fn = spinlock_profile_printall, .desc = "print lock profile info" }; static struct keyhandler spinlock_reset_keyhandler = { .u.fn = spinlock_profile_reset, .desc = "reset lock profile info" }; #endif static void run_all_nonirq_keyhandlers(unsigned long unused) { /* Fire all the non-IRQ-context diagnostic keyhandlers */ struct keyhandler *h; int k; console_start_log_everything(); for ( k = 0; k < ARRAY_SIZE(key_table); k++ ) { process_pending_softirqs(); h = key_table[k]; if ( (h == NULL) || !h->diagnostic || h->irq_callback ) continue; printk("[%c: %s]\n", k, h->desc); (*h->u.fn)(k); } console_end_log_everything(); } static DECLARE_TASKLET(run_all_keyhandlers_tasklet, run_all_nonirq_keyhandlers, 0); static void run_all_keyhandlers(unsigned char key, struct cpu_user_regs *regs) { struct keyhandler *h; int k; watchdog_disable(); printk("'%c' pressed -> firing all diagnostic keyhandlers\n", key); /* Fire all the IRQ-context diangostic keyhandlers now */ for ( k = 0; k < ARRAY_SIZE(key_table); k++ ) { h = key_table[k]; if ( (h == NULL) || !h->diagnostic || !h->irq_callback ) continue; printk("[%c: %s]\n", k, h->desc); (*h->u.irq_fn)(k, regs); } watchdog_enable(); /* Trigger the others from a tasklet in non-IRQ context */ tasklet_schedule(&run_all_keyhandlers_tasklet); } static 
struct keyhandler run_all_keyhandlers_keyhandler = { .irq_callback = 1, .u.irq_fn = run_all_keyhandlers, .desc = "print all diagnostics" }; static void do_debug_key(unsigned char key, struct cpu_user_regs *regs) { printk("'%c' pressed -> trapping into debugger\n", key); (void)debugger_trap_fatal(0xf001, regs); nop(); /* Prevent the compiler doing tail call optimisation, as that confuses xendbg a bit. */ } static struct keyhandler do_debug_key_keyhandler = { .irq_callback = 1, .u.irq_fn = do_debug_key, .desc = "trap to xendbg" }; static void do_toggle_alt_key(unsigned char key, struct cpu_user_regs *regs) { alt_key_handling = !alt_key_handling; printk("'%c' pressed -> using %s key handling\n", key, alt_key_handling ? "alternative" : "normal"); } static struct keyhandler toggle_alt_keyhandler = { .irq_callback = 1, .u.irq_fn = do_toggle_alt_key, .desc = "toggle alternative key handling" }; void __init initialize_keytable(void) { if ( num_present_cpus() > 16 ) { alt_key_handling = 1; printk(XENLOG_INFO "Defaulting to alternative key handling; " "send 'A' to switch to normal mode.\n"); } register_keyhandler('A', &toggle_alt_keyhandler); register_keyhandler('d', &dump_registers_keyhandler); register_keyhandler('h', &show_handlers_keyhandler); register_keyhandler('q', &dump_domains_keyhandler); register_keyhandler('r', &dump_runq_keyhandler); register_keyhandler('R', &reboot_machine_keyhandler); register_keyhandler('t', &read_clocks_keyhandler); register_keyhandler('0', &dump_dom0_registers_keyhandler); register_keyhandler('%', &do_debug_key_keyhandler); register_keyhandler('*', &run_all_keyhandlers_keyhandler); #ifdef PERF_COUNTERS register_keyhandler('p', &perfc_printall_keyhandler); register_keyhandler('P', &perfc_reset_keyhandler); #endif #ifdef LOCK_PROFILE register_keyhandler('l', &spinlock_printall_keyhandler); register_keyhandler('L', &spinlock_reset_keyhandler); #endif } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/tasklet.c0000664000175000017500000001432612307313555014346 0ustar smbsmb/****************************************************************************** * tasklet.c * * Tasklets are dynamically-allocatable tasks run in either VCPU context * (specifically, the idle VCPU's context) or in softirq context, on at most * one CPU at a time. Softirq versus VCPU context execution is specified * during per-tasklet initialisation. * * Copyright (c) 2010, Citrix Systems, Inc. * Copyright (c) 1992, Linus Torvalds * * Authors: * Keir Fraser */ #include #include #include #include #include #include /* Some subsystems call into us before we are initialised. We ignore them. */ static bool_t tasklets_initialised; DEFINE_PER_CPU(unsigned long, tasklet_work_to_do); static DEFINE_PER_CPU(struct list_head, tasklet_list); static DEFINE_PER_CPU(struct list_head, softirq_tasklet_list); /* Protects all lists and tasklet structures. 
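 *
 * (Note that the lock is dropped around the call to t->func() in
 * do_tasklet_work(), so a tasklet handler runs without it; the is_running
 * and scheduled_on fields cover that window, e.g. tasklet_kill() spins
 * until is_running clears.)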
*/ static DEFINE_SPINLOCK(tasklet_lock); static void tasklet_enqueue(struct tasklet *t) { unsigned int cpu = t->scheduled_on; if ( t->is_softirq ) { struct list_head *list = &per_cpu(softirq_tasklet_list, cpu); bool_t was_empty = list_empty(list); list_add_tail(&t->list, list); if ( was_empty ) cpu_raise_softirq(cpu, TASKLET_SOFTIRQ); } else { unsigned long *work_to_do = &per_cpu(tasklet_work_to_do, cpu); list_add_tail(&t->list, &per_cpu(tasklet_list, cpu)); if ( !test_and_set_bit(_TASKLET_enqueued, work_to_do) ) cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); } } void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu) { unsigned long flags; spin_lock_irqsave(&tasklet_lock, flags); if ( tasklets_initialised && !t->is_dead ) { t->scheduled_on = cpu; if ( !t->is_running ) { list_del(&t->list); tasklet_enqueue(t); } } spin_unlock_irqrestore(&tasklet_lock, flags); } void tasklet_schedule(struct tasklet *t) { tasklet_schedule_on_cpu(t, smp_processor_id()); } static void do_tasklet_work(unsigned int cpu, struct list_head *list) { struct tasklet *t; if ( unlikely(list_empty(list) || cpu_is_offline(cpu)) ) return; t = list_entry(list->next, struct tasklet, list); list_del_init(&t->list); BUG_ON(t->is_dead || t->is_running || (t->scheduled_on != cpu)); t->scheduled_on = -1; t->is_running = 1; spin_unlock_irq(&tasklet_lock); sync_local_execstate(); t->func(t->data); spin_lock_irq(&tasklet_lock); t->is_running = 0; if ( t->scheduled_on >= 0 ) { BUG_ON(t->is_dead || !list_empty(&t->list)); tasklet_enqueue(t); } } /* VCPU context work */ void do_tasklet(void) { unsigned int cpu = smp_processor_id(); unsigned long *work_to_do = &per_cpu(tasklet_work_to_do, cpu); struct list_head *list = &per_cpu(tasklet_list, cpu); /* * Work must be enqueued *and* scheduled. Otherwise there is no work to * do, and/or scheduler needs to run to update idle vcpu priority. 
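 *
 * (_TASKLET_enqueued is set by tasklet_enqueue() when work is added to
 * the per-cpu list and cleared below once the list drains; the
 * TASKLET_scheduled half of the pair is managed by the scheduler proper
 * rather than in this file.)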
*/ if ( likely(*work_to_do != (TASKLET_enqueued|TASKLET_scheduled)) ) return; spin_lock_irq(&tasklet_lock); do_tasklet_work(cpu, list); if ( list_empty(list) ) { clear_bit(_TASKLET_enqueued, work_to_do); raise_softirq(SCHEDULE_SOFTIRQ); } spin_unlock_irq(&tasklet_lock); } /* Softirq context work */ static void tasklet_softirq_action(void) { unsigned int cpu = smp_processor_id(); struct list_head *list = &per_cpu(softirq_tasklet_list, cpu); spin_lock_irq(&tasklet_lock); do_tasklet_work(cpu, list); if ( !list_empty(list) && !cpu_is_offline(cpu) ) raise_softirq(TASKLET_SOFTIRQ); spin_unlock_irq(&tasklet_lock); } void tasklet_kill(struct tasklet *t) { unsigned long flags; spin_lock_irqsave(&tasklet_lock, flags); if ( !list_empty(&t->list) ) { BUG_ON(t->is_dead || t->is_running || (t->scheduled_on < 0)); list_del_init(&t->list); } t->scheduled_on = -1; t->is_dead = 1; while ( t->is_running ) { spin_unlock_irqrestore(&tasklet_lock, flags); cpu_relax(); spin_lock_irqsave(&tasklet_lock, flags); } spin_unlock_irqrestore(&tasklet_lock, flags); } static void migrate_tasklets_from_cpu(unsigned int cpu, struct list_head *list) { unsigned long flags; struct tasklet *t; spin_lock_irqsave(&tasklet_lock, flags); while ( !list_empty(list) ) { t = list_entry(list->next, struct tasklet, list); BUG_ON(t->scheduled_on != cpu); t->scheduled_on = smp_processor_id(); list_del(&t->list); tasklet_enqueue(t); } spin_unlock_irqrestore(&tasklet_lock, flags); } void tasklet_init( struct tasklet *t, void (*func)(unsigned long), unsigned long data) { memset(t, 0, sizeof(*t)); INIT_LIST_HEAD(&t->list); t->scheduled_on = -1; t->func = func; t->data = data; } void softirq_tasklet_init( struct tasklet *t, void (*func)(unsigned long), unsigned long data) { tasklet_init(t, func, data); t->is_softirq = 1; } static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch ( action ) { case CPU_UP_PREPARE: INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu)); INIT_LIST_HEAD(&per_cpu(softirq_tasklet_list, cpu)); break; case CPU_UP_CANCELED: case CPU_DEAD: migrate_tasklets_from_cpu(cpu, &per_cpu(tasklet_list, cpu)); migrate_tasklets_from_cpu(cpu, &per_cpu(softirq_tasklet_list, cpu)); break; default: break; } return NOTIFY_DONE; } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback, .priority = 99 }; void __init tasklet_subsys_init(void) { void *hcpu = (void *)(long)smp_processor_id(); cpu_callback(&cpu_nfb, CPU_UP_PREPARE, hcpu); register_cpu_notifier(&cpu_nfb); open_softirq(TASKLET_SOFTIRQ, tasklet_softirq_action); tasklets_initialised = 1; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/shutdown.c0000664000175000017500000000326412307313555014551 0ustar smbsmb#include #include #include #include #include #include #include #include #include #ifdef CONFIG_KEXEC #include #endif #include #include /* opt_noreboot: If true, machine will need manual reset on error. 
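 *
 * (Set with the 'noreboot' hypervisor command-line option registered via
 * boolean_param() below, and consulted by maybe_reboot().)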
*/ bool_t __read_mostly opt_noreboot; boolean_param("noreboot", opt_noreboot); static void maybe_reboot(void) { if ( opt_noreboot ) { printk("'noreboot' set - not rebooting.\n"); machine_halt(); } else { printk("rebooting machine in 5 seconds.\n"); watchdog_disable(); machine_restart(5000); } } void dom0_shutdown(u8 reason) { switch ( reason ) { case SHUTDOWN_poweroff: { printk("Domain 0 halted: halting machine.\n"); machine_halt(); break; /* not reached */ } case SHUTDOWN_crash: { debugger_trap_immediate(); printk("Domain 0 crashed: "); #ifdef CONFIG_KEXEC kexec_crash(); #endif maybe_reboot(); break; /* not reached */ } case SHUTDOWN_reboot: { printk("Domain 0 shutdown: rebooting machine.\n"); machine_restart(0); break; /* not reached */ } case SHUTDOWN_watchdog: { printk("Domain 0 shutdown: watchdog rebooting machine.\n"); #ifdef CONFIG_KEXEC kexec_crash(); #endif machine_restart(0); break; /* not reached */ } default: { printk("Domain 0 shutdown (unknown reason %u): ", reason); maybe_reboot(); break; /* not reached */ } } } xen-4.4.0/xen/common/lzo.c0000664000175000017500000003335412307313555013505 0ustar smbsmb/* * lzo.c -- LZO1X Compressor from MiniLZO * * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer * * The full LZO package can be found at: * http://www.oberhumer.com/opensource/lzo/ * * Adapted for Xen (files combined and syntactic/header changes) by: * Dan Magenheimer * */ /* * lzodefs.h -- architecture, OS and compiler specific defines * * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer * * The full LZO package can be found at: * http://www.oberhumer.com/opensource/lzo/ * * Changed for kernel use by: * Nitin Gupta * Richard Purdie */ #define LZO_VERSION 0x2020 #define LZO_VERSION_STRING "2.02" #define LZO_VERSION_DATE "Oct 17 2005" #define M1_MAX_OFFSET 0x0400 #define M2_MAX_OFFSET 0x0800 #define M3_MAX_OFFSET 0x4000 #define M4_MAX_OFFSET 0xbfff #define M1_MIN_LEN 2 #define M1_MAX_LEN 2 #define M2_MIN_LEN 3 #define M2_MAX_LEN 8 #define M3_MIN_LEN 3 #define M3_MAX_LEN 33 #define M4_MIN_LEN 3 #define M4_MAX_LEN 9 #define M1_MARKER 0 #define M2_MARKER 64 #define M3_MARKER 32 #define M4_MARKER 16 #define D_BITS 14 #define D_MASK ((1u << D_BITS) - 1) #define D_HIGH ((D_MASK >> 1) + 1) #define DX2(p, s1, s2) (((((size_t)((p)[2]) << (s2)) ^ (p)[1]) \ << (s1)) ^ (p)[0]) #define DX3(p, s1, s2, s3) ((DX2((p)+1, s2, s3) << (s1)) ^ (p)[0]) /* * LZO1X Compressor from MiniLZO * * Copyright (C) 1996-2005 Markus F.X.J. 
Oberhumer * * The full LZO package can be found at: * http://www.oberhumer.com/opensource/lzo/ * * Changed for kernel use by: * Nitin Gupta * Richard Purdie */ #ifdef __XEN__ #include #endif #include #define get_unaligned(_p) (*(_p)) #define put_unaligned(_val,_p) (*(_p)=_val) #define get_unaligned_le16(_p) (*(u16 *)(_p)) static noinline size_t _lzo1x_1_do_compress(const unsigned char *in, size_t in_len, unsigned char *out, size_t *out_len, void *wrkmem) { const unsigned char * const in_end = in + in_len; const unsigned char * const ip_end = in + in_len - M2_MAX_LEN - 5; const unsigned char ** const dict = wrkmem; const unsigned char *ip = in, *ii = ip; const unsigned char *end, *m, *m_pos; size_t m_off, m_len, dindex; unsigned char *op = out; ip += 4; for (;;) { dindex = ((size_t)(0x21 * DX3(ip, 5, 5, 6)) >> 5) & D_MASK; m_pos = dict[dindex]; if (m_pos < in) goto literal; if (ip == m_pos || ((size_t)(ip - m_pos) > M4_MAX_OFFSET)) goto literal; m_off = ip - m_pos; if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3]) goto try_match; dindex = (dindex & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f); m_pos = dict[dindex]; if (m_pos < in) goto literal; if (ip == m_pos || ((size_t)(ip - m_pos) > M4_MAX_OFFSET)) goto literal; m_off = ip - m_pos; if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3]) goto try_match; goto literal; try_match: if (get_unaligned((const unsigned short *)m_pos) == get_unaligned((const unsigned short *)ip)) { if (likely(m_pos[2] == ip[2])) goto match; } literal: dict[dindex] = ip; ++ip; if (unlikely(ip >= ip_end)) break; continue; match: dict[dindex] = ip; if (ip != ii) { size_t t = ip - ii; if (t <= 3) { op[-2] |= t; } else if (t <= 18) { *op++ = (t - 3); } else { size_t tt = t - 18; *op++ = 0; while (tt > 255) { tt -= 255; *op++ = 0; } *op++ = tt; } do { *op++ = *ii++; } while (--t > 0); } ip += 3; if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++ || m_pos[6] != *ip++ || m_pos[7] != *ip++ || m_pos[8] != *ip++) { --ip; m_len = ip - ii; if (m_off <= M2_MAX_OFFSET) { m_off -= 1; *op++ = (((m_len - 1) << 5) | ((m_off & 7) << 2)); *op++ = (m_off >> 3); } else if (m_off <= M3_MAX_OFFSET) { m_off -= 1; *op++ = (M3_MARKER | (m_len - 2)); goto m3_m4_offset; } else { m_off -= 0x4000; *op++ = (M4_MARKER | ((m_off & 0x4000) >> 11) | (m_len - 2)); goto m3_m4_offset; } } else { end = in_end; m = m_pos + M2_MAX_LEN + 1; while (ip < end && *m == *ip) { m++; ip++; } m_len = ip - ii; if (m_off <= M3_MAX_OFFSET) { m_off -= 1; if (m_len <= 33) { *op++ = (M3_MARKER | (m_len - 2)); } else { m_len -= 33; *op++ = M3_MARKER | 0; goto m3_m4_len; } } else { m_off -= 0x4000; if (m_len <= M4_MAX_LEN) { *op++ = (M4_MARKER | ((m_off & 0x4000) >> 11) | (m_len - 2)); } else { m_len -= M4_MAX_LEN; *op++ = (M4_MARKER | ((m_off & 0x4000) >> 11)); m3_m4_len: while (m_len > 255) { m_len -= 255; *op++ = 0; } *op++ = (m_len); } } m3_m4_offset: *op++ = ((m_off & 63) << 2); *op++ = (m_off >> 6); } ii = ip; if (unlikely(ip >= ip_end)) break; } *out_len = op - out; return in_end - ii; } int lzo1x_1_compress(const unsigned char *in, size_t in_len, unsigned char *out, size_t *out_len, void *wrkmem) { const unsigned char *ii; unsigned char *op = out; size_t t; if (unlikely(in_len <= M2_MAX_LEN + 5)) { t = in_len; } else { t = _lzo1x_1_do_compress(in, in_len, op, out_len, wrkmem); op += *out_len; } if (t > 0) { ii = in + in_len - t; if (op == out && t <= 238) { *op++ = (17 + t); } else if (t <= 3) { op[-2] |= t; } else if (t <= 18) { *op++ = (t - 3); } else { size_t tt = t - 18; *op++ = 0; while (tt > 255) { tt -= 255; 
*op++ = 0; } *op++ = tt; } do { *op++ = *ii++; } while (--t > 0); } *op++ = M4_MARKER | 1; *op++ = 0; *op++ = 0; *out_len = op - out; return LZO_E_OK; } /* * LZO1X Decompressor from MiniLZO * * Copyright (C) 1996-2005 Markus F.X.J. Oberhumer * * The full LZO package can be found at: * http://www.oberhumer.com/opensource/lzo/ * * Changed for kernel use by: * Nitin Gupta * Richard Purdie */ #define HAVE_IP(x, ip_end, ip) ((size_t)(ip_end - ip) < (x)) #define HAVE_OP(x, op_end, op) ((size_t)(op_end - op) < (x)) #define HAVE_LB(m_pos, out, op) (m_pos < out || m_pos >= op) #define COPY4(dst, src) \ put_unaligned(get_unaligned((const u32 *)(src)), (u32 *)(dst)) int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, unsigned char *out, size_t *out_len) { const unsigned char * const ip_end = in + in_len; unsigned char * const op_end = out + *out_len; const unsigned char *ip = in, *m_pos; unsigned char *op = out; size_t t; *out_len = 0; if (*ip > 17) { t = *ip++ - 17; if (t < 4) goto match_next; if (HAVE_OP(t, op_end, op)) goto output_overrun; if (HAVE_IP(t + 1, ip_end, ip)) goto input_overrun; do { *op++ = *ip++; } while (--t > 0); goto first_literal_run; } while ((ip < ip_end)) { t = *ip++; if (t >= 16) goto match; if (t == 0) { if (HAVE_IP(1, ip_end, ip)) goto input_overrun; while (*ip == 0) { t += 255; ip++; if (HAVE_IP(1, ip_end, ip)) goto input_overrun; } t += 15 + *ip++; } if (HAVE_OP(t + 3, op_end, op)) goto output_overrun; if (HAVE_IP(t + 4, ip_end, ip)) goto input_overrun; COPY4(op, ip); op += 4; ip += 4; if (--t > 0) { if (t >= 4) { do { COPY4(op, ip); op += 4; ip += 4; t -= 4; } while (t >= 4); if (t > 0) { do { *op++ = *ip++; } while (--t > 0); } } else { do { *op++ = *ip++; } while (--t > 0); } } first_literal_run: t = *ip++; if (t >= 16) goto match; m_pos = op - (1 + M2_MAX_OFFSET); m_pos -= t >> 2; m_pos -= *ip++ << 2; if (HAVE_LB(m_pos, out, op)) goto lookbehind_overrun; if (HAVE_OP(3, op_end, op)) goto output_overrun; *op++ = *m_pos++; *op++ = *m_pos++; *op++ = *m_pos; goto match_done; do { match: if (t >= 64) { m_pos = op - 1; m_pos -= (t >> 2) & 7; m_pos -= *ip++ << 3; t = (t >> 5) - 1; if (HAVE_LB(m_pos, out, op)) goto lookbehind_overrun; if (HAVE_OP(t + 3 - 1, op_end, op)) goto output_overrun; goto copy_match; } else if (t >= 32) { t &= 31; if (t == 0) { if (HAVE_IP(1, ip_end, ip)) goto input_overrun; while (*ip == 0) { t += 255; ip++; if (HAVE_IP(1, ip_end, ip)) goto input_overrun; } t += 31 + *ip++; } m_pos = op - 1; m_pos -= get_unaligned_le16(ip) >> 2; ip += 2; } else if (t >= 16) { m_pos = op; m_pos -= (t & 8) << 11; t &= 7; if (t == 0) { if (HAVE_IP(1, ip_end, ip)) goto input_overrun; while (*ip == 0) { t += 255; ip++; if (HAVE_IP(1, ip_end, ip)) goto input_overrun; } t += 7 + *ip++; } m_pos -= get_unaligned_le16(ip) >> 2; ip += 2; if (m_pos == op) goto eof_found; m_pos -= 0x4000; } else { m_pos = op - 1; m_pos -= t >> 2; m_pos -= *ip++ << 2; if (HAVE_LB(m_pos, out, op)) goto lookbehind_overrun; if (HAVE_OP(2, op_end, op)) goto output_overrun; *op++ = *m_pos++; *op++ = *m_pos; goto match_done; } if (HAVE_LB(m_pos, out, op)) goto lookbehind_overrun; if (HAVE_OP(t + 3 - 1, op_end, op)) goto output_overrun; if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) { COPY4(op, m_pos); op += 4; m_pos += 4; t -= 4 - (3 - 1); do { COPY4(op, m_pos); op += 4; m_pos += 4; t -= 4; } while (t >= 4); if (t > 0) do { *op++ = *m_pos++; } while (--t > 0); } else { copy_match: *op++ = *m_pos++; *op++ = *m_pos++; do { *op++ = *m_pos++; } while (--t > 0); } match_done: t = ip[-2] & 3; if 
(t == 0) break; match_next: if (HAVE_OP(t, op_end, op)) goto output_overrun; if (HAVE_IP(t + 1, ip_end, ip)) goto input_overrun; *op++ = *ip++; if (t > 1) { *op++ = *ip++; if (t > 2) *op++ = *ip++; } t = *ip++; } while (ip < ip_end); } *out_len = op - out; return LZO_E_EOF_NOT_FOUND; eof_found: *out_len = op - out; return (ip == ip_end ? LZO_E_OK : (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN)); input_overrun: *out_len = op - out; return LZO_E_INPUT_OVERRUN; output_overrun: *out_len = op - out; return LZO_E_OUTPUT_OVERRUN; lookbehind_overrun: *out_len = op - out; return LZO_E_LOOKBEHIND_OVERRUN; } xen-4.4.0/xen/common/version.c0000664000175000017500000000141512307313555014357 0ustar smbsmb#include #include const char *xen_compile_date(void) { return XEN_COMPILE_DATE; } const char *xen_compile_time(void) { return XEN_COMPILE_TIME; } const char *xen_compile_by(void) { return XEN_COMPILE_BY; } const char *xen_compile_domain(void) { return XEN_COMPILE_DOMAIN; } const char *xen_compile_host(void) { return XEN_COMPILE_HOST; } const char *xen_compiler(void) { return XEN_COMPILER; } unsigned int xen_major_version(void) { return XEN_VERSION; } unsigned int xen_minor_version(void) { return XEN_SUBVERSION; } const char *xen_extra_version(void) { return XEN_EXTRAVERSION; } const char *xen_changeset(void) { return XEN_CHANGESET; } const char *xen_banner(void) { return XEN_BANNER; } xen-4.4.0/xen/common/notifier.c0000664000175000017500000000556112307313555014517 0ustar smbsmb/****************************************************************************** * common/notifier.c * * Routines to manage notifier chains for passing status changes to any * interested routines. * * Original code from Linux kernel 2.6.27 (Alan Cox ) */ #include #include #include /** * notifier_chain_register - Add notifier to a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: New entry in notifier chain * * Adds a notifier to a raw notifier chain. * All locking must be provided by the caller. */ void __init notifier_chain_register( struct notifier_head *nh, struct notifier_block *n) { struct list_head *chain = &nh->head.chain; struct notifier_block *nb; while ( chain->next != &nh->head.chain ) { nb = list_entry(chain->next, struct notifier_block, chain); if ( n->priority > nb->priority ) break; chain = chain->next; } list_add(&n->chain, chain); } /** * notifier_chain_unregister - Remove notifier from a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a raw notifier chain. * All locking must be provided by the caller. */ void __init notifier_chain_unregister( struct notifier_head *nh, struct notifier_block *n) { list_del(&n->chain); } /** * notifier_call_chain - Informs the registered notifiers about an event. * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @pcursor: If non-NULL, position in chain to start from. Also updated on * return to indicate how far notifications got before stopping. * * Calls each function in a notifier chain in turn. The functions run in an * undefined context. All locking must be provided by the caller. * * If the return value of the notifier can be and'ed with %NOTIFY_STOP_MASK * then notifier_call_chain() will return immediately, with teh return value of * the notifier function which halted execution. 
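 * (Hypothetical illustration, with "nh" and "hcpu" standing for a caller's
 * chain head and payload:
 *
 *     ret = notifier_call_chain(&nh, CPU_UP_PREPARE, hcpu, NULL);
 *
 * walks the whole chain and stops at the first callback whose return value
 * has NOTIFY_STOP_MASK set; ret is then that callback's return value.)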
Otherwise the return value is * the return value of the last notifier function called. */ int notifier_call_chain( struct notifier_head *nh, unsigned long val, void *v, struct notifier_block **pcursor) { int ret = NOTIFY_DONE; struct list_head *cursor; struct notifier_block *nb; bool_t reverse = !!(val & NOTIFY_REVERSE); cursor = &(pcursor && *pcursor ? *pcursor : &nh->head)->chain; do { cursor = reverse ? cursor->prev : cursor->next; nb = list_entry(cursor, struct notifier_block, chain); if ( cursor == &nh->head.chain ) break; ret = nb->notifier_call(nb, val, v); } while ( !(ret & NOTIFY_STOP_MASK) ); if ( pcursor ) *pcursor = nb; return ret; } xen-4.4.0/xen/common/unlzo.c0000664000175000017500000001415612307313555014047 0ustar smbsmb/* * LZO decompressor for the Linux kernel. Code borrowed from the lzo * implementation by Markus Franz Xaver Johannes Oberhumer. * * Linux kernel adaptation: * Copyright (C) 2009 * Albin Tonnerre, Free Electrons * * Original code: * Copyright (C) 1996-2005 Markus Franz Xaver Johannes Oberhumer * All Rights Reserved. * * lzop and the LZO library are free software; you can redistribute them * and/or modify them under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; see the file COPYING. * If not, write to the Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * Markus F.X.J. Oberhumer * * http://www.oberhumer.com/opensource/lzop/ */ #include "decompress.h" #include #ifdef __XEN__ #include #endif #if 1 /* ndef CONFIG_??? */ static inline u16 INIT get_unaligned_be16(void *p) { return be16_to_cpup(p); } static inline u32 INIT get_unaligned_be32(void *p) { return be32_to_cpup(p); } #else #include static inline u16 INIT get_unaligned_be16(void *p) { return be16_to_cpu(__get_unaligned(p, 2)); } static inline u32 INIT get_unaligned_be32(void *p) { return be32_to_cpu(__get_unaligned(p, 4)); } #endif static const unsigned char lzop_magic[] = { 0x89, 0x4c, 0x5a, 0x4f, 0x00, 0x0d, 0x0a, 0x1a, 0x0a }; #define LZO_BLOCK_SIZE (256*1024l) #define HEADER_HAS_FILTER 0x00000800L #define HEADER_SIZE_MIN (9 + 7 + 4 + 8 + 1 + 4) #define HEADER_SIZE_MAX (9 + 7 + 1 + 8 + 8 + 4 + 1 + 255 + 4) static int INIT parse_header(u8 *input, int *skip, int in_len) { int l; u8 *parse = input; u8 *end = input + in_len; u8 level = 0; u16 version; /* * Check that there's enough input to possibly have a valid header. * Then it is possible to parse several fields until the minimum * size may have been used. */ if (in_len < HEADER_SIZE_MIN) return 0; /* read magic: 9 first bits */ for (l = 0; l < 9; l++) { if (*parse++ != lzop_magic[l]) return 0; } /* get version (2bytes), skip library version (2), * 'need to be extracted' version (2) and * method (1) */ version = get_unaligned_be16(parse); parse += 7; if (version >= 0x0940) level = *parse++; if (get_unaligned_be32(parse) & HEADER_HAS_FILTER) parse += 8; /* flags + filter info */ else parse += 4; /* flags */ /* * At least mode, mtime_low, filename length, and checksum must * be left to be parsed. 
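 * (Worked out: 8 bytes of mode plus mtime_low, 1 filename-length byte and a
 * 4-byte checksum give the 8 + 1 + 4 == 13 bytes demanded by the check just
 * below.)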
If also mtime_high is present, it's OK * because the next input buffer check is after reading the * filename length. */ if (end - parse < 8 + 1 + 4) return 0; /* skip mode and mtime_low */ parse += 8; if (version >= 0x0940) parse += 4; /* skip mtime_high */ l = *parse++; /* don't care about the file name, and skip checksum */ if (end - parse < l + 4) return 0; parse += l + 4; *skip = parse - input; return 1; } STATIC int INIT unlzo(u8 *input, unsigned int in_len, int (*fill) (void *, unsigned int), int (*flush) (void *, unsigned int), u8 *output, unsigned int *posp, void (*error) (const char *x)) { u8 r = 0; int skip = 0; u32 src_len, dst_len; size_t tmp; u8 *in_buf, *in_buf_save, *out_buf; int ret = -1; if (output) { out_buf = output; } else if (!flush) { error("NULL output pointer and no flush function provided"); goto exit; } else { out_buf = malloc(LZO_BLOCK_SIZE); if (!out_buf) { error("Could not allocate output buffer"); goto exit; } } if (input && fill) { error("Both input pointer and fill function provided, don't know what to do"); goto exit_1; } else if (input) { in_buf = input; } else if (!fill || !posp) { error("NULL input pointer and missing position pointer or fill function"); goto exit_1; } else { in_buf = malloc(lzo1x_worst_compress(LZO_BLOCK_SIZE)); if (!in_buf) { error("Could not allocate input buffer"); goto exit_1; } } in_buf_save = in_buf; if (posp) *posp = 0; if (fill) fill(in_buf, lzo1x_worst_compress(LZO_BLOCK_SIZE)); if (!parse_header(input, &skip, in_len)) { error("invalid header"); goto exit_2; } in_buf += skip; in_len -= skip; if (posp) *posp = skip; for (;;) { /* read uncompressed block size */ if (in_len < 4) { error("file corrupted"); goto exit_2; } dst_len = get_unaligned_be32(in_buf); in_buf += 4; in_len -= 4; /* exit if last block */ if (dst_len == 0) { if (posp) *posp += 4; break; } if (dst_len > LZO_BLOCK_SIZE) { error("dest len longer than block size"); goto exit_2; } /* read compressed block size, and skip block checksum info */ if (in_len < 8) { error("file corrupted"); goto exit_2; } src_len = get_unaligned_be32(in_buf); in_buf += 8; in_len -= 8; if (src_len <= 0 || src_len > dst_len || src_len > in_len) { error("file corrupted"); goto exit_2; } /* decompress */ tmp = dst_len; /* When the input data is not compressed at all, * lzo1x_decompress_safe will fail, so call memcpy() * instead */ if (unlikely(dst_len == src_len)) memcpy(out_buf, in_buf, src_len); else { r = lzo1x_decompress_safe(in_buf, src_len, out_buf, &tmp); if (r != LZO_E_OK || dst_len != tmp) { error("Compressed data violation"); goto exit_2; } } if (flush && flush(out_buf, dst_len) != dst_len) goto exit_2; if (output) out_buf += dst_len; if (posp) *posp += src_len + 12; if (fill) { in_buf = in_buf_save; fill(in_buf, lzo1x_worst_compress(LZO_BLOCK_SIZE)); } else { in_buf += src_len; in_len -= src_len; } } ret = 0; exit_2: if (!input) free(in_buf_save); exit_1: if (!output) free(out_buf); exit: return ret; } xen-4.4.0/xen/common/gdbstub.c0000664000175000017500000004261312307313555014331 0ustar smbsmb/* * Copyright (C) 2005 Jimi Xenidis , IBM Corporation * Copyright (C) 2006 Isaku Yamahata * VA Linux Systems Japan. K.K. * * gdbstub arch neutral part * Based on x86 cdb (xen/arch/x86/cdb.c) and ppc gdbstub(xen/common/gdbstub.c) * But extensively modified. 
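 *
 * Illustrative use (the serial handle and host device names are assumptions
 * for the example, not mandated here): boot the hypervisor with a "gdb="
 * option naming a serial port, e.g. "gdb=com1", then from the remote end:
 *
 *     (gdb) file xen-syms
 *     (gdb) target remote /dev/ttyS0
 *
 * The option string itself is parsed by string_param("gdb", opt_gdb) and
 * serial_parse_handle() further down in this file.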
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * gdbstub: implements the architecture independant parts of the * gdb remote protocol. */ /* We try to avoid assuming much about what the rest of the system is doing. In particular, dynamic memory allocation is out of the question. */ /* Resuming after we've stopped used to work, but more through luck than any actual intention. It doesn't at the moment. */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* Printk isn't particularly safe just after we've trapped to the debugger. so avoid it. */ #define dbg_printk(...) /*#define dbg_printk(...) printk(__VA_ARGS__)*/ #define GDB_RETRY_MAX 10 struct gdb_cpu_info { atomic_t paused; atomic_t ack; }; static struct gdb_cpu_info gdb_cpu[NR_CPUS]; static atomic_t gdb_smp_paused_count; static void gdb_smp_pause(void); static void gdb_smp_resume(void); static char __initdata opt_gdb[30]; string_param("gdb", opt_gdb); static void gdbstub_console_puts(const char *str); /* value <-> char (de)serialzers */ static char hex2char(unsigned long x) { const char array[] = "0123456789abcdef"; return array[x & 15]; } static unsigned int char2hex(unsigned char c) { if ( (c >= '0') && (c <= '9') ) return c - '0'; else if ( (c >= 'a') && (c <= 'f') ) return c - 'a' + 10; else if ( (c >= 'A') && (c <= 'F') ) return c - 'A' + 10; else BUG(); return -1; } static unsigned char str2hex(const char *str) { return (char2hex(str[0]) << 4) | char2hex(str[1]); } static unsigned long str2ulong(const char *str, unsigned long bytes) { unsigned long x = 0; unsigned long i = 0; while ( *str && (i < (bytes * 2)) ) { x <<= 4; x += char2hex(*str); ++str; ++i; } return x; } static unsigned long str_to_native_ulong(const char *str) { unsigned long x = 0, i = 0; while ( *str && (i < BYTES_PER_LONG) ) { #ifdef __BIG_ENDIAN x <<= 8; x += str2hex(str); #elif defined(__LITTLE_ENDIAN) x += (unsigned long)str2hex(str) << (i*8); #else # error unknown endian #endif str += 2; i++; } return x; } /* gdb io wrappers */ static signed long gdb_io_write(const char *buf, unsigned long len, struct gdb_context *ctx) { int i; for ( i = 0; i < len; i++ ) serial_putc(ctx->serhnd, buf[i]); return i; } static int gdb_io_write_char(u8 data, struct gdb_context *ctx) { return gdb_io_write((char*)&data, 1, ctx); } static unsigned char gdb_io_read(struct gdb_context *ctx) { return serial_getc(ctx->serhnd); } /* Receive a command. Returns -1 on csum error, 0 otherwise. */ /* Does not acknowledge. 
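 * Packet framing, as implemented below: a packet travels as
 * "$<payload>#<checksum>", the checksum being the low 8 bits of the sum of
 * the payload bytes, sent as two hex digits. For instance the
 * read-registers request has payload "g", and 'g' == 0x67, so the wire form
 * is "$g#67"; the receiver then answers '+' (checksum good) or '-' (resend).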
*/ static int attempt_receive_packet(struct gdb_context *ctx) { u8 csum; u8 received_csum; u8 ch; /* Skip over everything up to the first '$' */ while ( (ch = gdb_io_read(ctx)) != '$' ) continue; csum = 0; for ( ctx->in_bytes = 0; ctx->in_bytes < sizeof(ctx->in_buf); ctx->in_bytes++ ) { ch = gdb_io_read(ctx); if ( ch == '#' ) break; ctx->in_buf[ctx->in_bytes] = ch; csum += ch; } if ( ctx->in_bytes == sizeof(ctx->in_buf) ) { dbg_printk("WARNING: GDB sent a stupidly big packet.\n"); return -1; } ctx->in_buf[ctx->in_bytes] = '\0'; received_csum = char2hex(gdb_io_read(ctx)) * 16 + char2hex(gdb_io_read(ctx)); return (received_csum == csum) ? 0 : -1; } /* Receive a command, discarding up to ten packets with csum * errors. Acknowledges all received packets. */ static int receive_command(struct gdb_context *ctx) { int r, count = 0; count = 0; do { r = attempt_receive_packet(ctx); gdb_io_write_char((r < 0) ? '-' : '+', ctx); count++; } while ( (r < 0) && (count < GDB_RETRY_MAX) ); return r; } /* routines to send reply packets */ static void gdb_start_packet(struct gdb_context *ctx) { ctx->out_buf[0] = '$'; ctx->out_offset = 1; ctx->out_csum = 0; } static void gdb_write_to_packet_char(u8 data, struct gdb_context *ctx) { ctx->out_csum += data; ctx->out_buf[ctx->out_offset] = data; ctx->out_offset++; } void gdb_write_to_packet(const char *buf, int count, struct gdb_context *ctx) { int x; for ( x = 0; x < count; x++ ) gdb_write_to_packet_char(buf[x], ctx); } void gdb_write_to_packet_str(const char *buf, struct gdb_context *ctx) { gdb_write_to_packet(buf, strlen(buf), ctx); } void gdb_write_to_packet_hex(unsigned long x, int int_size, struct gdb_context *ctx) { char buf[sizeof(unsigned long) * 2 + 1]; int i, width = int_size * 2; buf[sizeof(unsigned long) * 2] = 0; switch ( int_size ) { case sizeof(u8): case sizeof(u16): case sizeof(u32): case sizeof(u64): break; default: dbg_printk("WARNING: %s x: %#lx int_size: %d\n", __func__, x, int_size); break; } #ifdef __BIG_ENDIAN i = sizeof(unsigned long) * 2 do { buf[--i] = hex2char(x & 15); x >>= 4; } while ( x ); while ( (i + width) > (sizeof(unsigned long) * 2) ) buf[--i] = '0'; gdb_write_to_packet(&buf[i], width, ctx); #elif defined(__LITTLE_ENDIAN) i = 0; while ( i < width ) { buf[i++] = hex2char(x>>4); buf[i++] = hex2char(x); x >>= 8; } gdb_write_to_packet(buf, width, ctx); #else # error unknown endian #endif } static int gdb_check_ack(struct gdb_context *ctx) { u8 c = gdb_io_read(ctx); switch ( c ) { case '+': return 1; case '-': return 0; default: printk("Bad ack: %c\n", c); return 0; } } /* Return 0 if the reply was successfully received, !0 otherwise. 
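 * (Reply example, using gdb_send_reply() below: gdb_send_reply("OK", ctx)
 * puts "$OK#9a" on the wire, since 'O' + 'K' == 0x4f + 0x4b == 0x9a.)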
*/ void gdb_send_packet(struct gdb_context *ctx) { char buf[3]; int count; snprintf(buf, sizeof(buf), "%.02x\n", ctx->out_csum); gdb_write_to_packet_char('#', ctx); gdb_write_to_packet(buf, 2, ctx); count = 0; do { gdb_io_write(ctx->out_buf, ctx->out_offset, ctx); } while ( !gdb_check_ack(ctx) && (count++ < GDB_RETRY_MAX) ); if ( count == GDB_RETRY_MAX ) dbg_printk("WARNING: %s reached max retry %d\n", __func__, GDB_RETRY_MAX); } void gdb_send_reply(const char *buf, struct gdb_context *ctx) { gdb_start_packet(ctx); gdb_write_to_packet_str(buf, ctx); gdb_send_packet(ctx); } /* arch neutral command handlers */ static void gdb_cmd_signum(struct gdb_context *ctx) { gdb_write_to_packet_char('S', ctx); gdb_write_to_packet_hex(ctx->signum, sizeof(ctx->signum), ctx); gdb_send_packet(ctx); } static void gdb_cmd_read_mem(unsigned long addr, unsigned long length, struct gdb_context *ctx) { int x, r; unsigned char val; dbg_printk("Memory read starting at %lx, length %lx.\n", addr, length); for ( x = 0; x < length; x++ ) { r = gdb_arch_copy_from_user(&val, (void *)(addr + x), 1); if ( r != 0 ) { dbg_printk("Error reading from %lx.\n", addr + x); break; } gdb_write_to_packet_hex(val, sizeof(val), ctx); } if ( x == 0 ) gdb_write_to_packet_str("E05", ctx); dbg_printk("Read done.\n"); gdb_send_packet(ctx); } static void gdb_cmd_write_mem(unsigned long addr, unsigned long length, const char *buf, struct gdb_context *ctx) { int x, r; unsigned char val; dbg_printk("Memory write starting at %lx, length %lx.\n", addr, length); for ( x = 0; x < length; x++, addr++, buf += 2 ) { val = str2ulong(buf, sizeof(val)); r = gdb_arch_copy_to_user((void*)addr, (void*)&val, 1); if ( r != 0 ) { dbg_printk("Error writing to %lx.\n", addr); break; } } if (x == length) gdb_write_to_packet_str("OK", ctx); else gdb_write_to_packet_str("E11", ctx); dbg_printk("Write done.\n"); gdb_send_packet(ctx); } static void gdbstub_attach(struct gdb_context *ctx) { if ( ctx->currently_attached ) return; ctx->currently_attached = 1; ctx->console_steal_id = console_steal(ctx->serhnd, gdbstub_console_puts); } static void gdbstub_detach(struct gdb_context *ctx) { if ( !ctx->currently_attached ) return; ctx->currently_attached = 0; console_giveback(ctx->console_steal_id); } /* command dispatcher */ static int process_command(struct cpu_user_regs *regs, struct gdb_context *ctx) { const char *ptr; unsigned long addr, length, val; int resume = 0; unsigned long type = GDB_CONTINUE; /* XXX check ctx->in_bytes >= 2 or similar. 
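 * Example payloads handled by the switch below (addresses invented for
 * illustration): "g" dumps all registers; "mffff830000001000,10" asks for
 * 0x10 bytes at 0xffff830000001000 and is answered with hex data, or "E05"
 * if nothing could be read; "c" resumes, optionally at an address given as
 * "c<addr>".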
*/ gdb_start_packet(ctx); switch ( ctx->in_buf[0] ) { case '?': /* query signal number */ gdb_cmd_signum(ctx); break; case 'H': /* thread operations */ gdb_send_reply("OK", ctx); break; case 'g': /* Read registers */ gdb_arch_read_reg_array(regs, ctx); break; case 'G': /* Write registers */ gdb_arch_write_reg_array(regs, ctx->in_buf + 1, ctx); break; case 'm': /* Read memory */ addr = simple_strtoul(ctx->in_buf + 1, &ptr, 16); if ( (ptr == (ctx->in_buf + 1)) || (ptr[0] != ',') ) { gdb_send_reply("E03", ctx); return 0; } length = simple_strtoul(ptr + 1, &ptr, 16); if ( ptr[0] != 0 ) { gdb_send_reply("E04", ctx); return 0; } gdb_cmd_read_mem(addr, length, ctx); break; case 'M': /* Write memory */ addr = simple_strtoul(ctx->in_buf + 1, &ptr, 16); if ( (ptr == (ctx->in_buf + 1)) || (ptr[0] != ',') ) { gdb_send_reply("E03", ctx); return 0; } length = simple_strtoul(ptr + 1, &ptr, 16); if ( ptr[0] != ':') { gdb_send_reply("E04", ctx); return 0; } gdb_cmd_write_mem(addr, length, ptr + 1, ctx); break; case 'p': /* read register */ addr = simple_strtoul(ctx->in_buf + 1, &ptr, 16); if ( ptr == (ctx->in_buf + 1) ) { gdb_send_reply("E03", ctx); return 0; } if ( ptr[0] != 0 ) { gdb_send_reply("E04", ctx); return 0; } gdb_arch_read_reg(addr, regs, ctx); break; case 'P': /* write register */ addr = simple_strtoul(ctx->in_buf + 1, &ptr, 16); if ( ptr == (ctx->in_buf + 1) ) { gdb_send_reply("E03", ctx); return 0; } if ( ptr[0] != '=' ) { gdb_send_reply("E04", ctx); return 0; } ptr++; val = str_to_native_ulong(ptr); gdb_arch_write_reg(addr, val, regs, ctx); break; case 'D': case 'k': gdbstub_detach(ctx); gdb_send_reply("OK", ctx); ctx->connected = 0; resume = 1; break; case 's': /* Single step */ type = GDB_STEP; case 'c': /* Resume at current address */ addr = ~((unsigned long)0); if ( ctx->in_buf[1] ) addr = str2ulong(&ctx->in_buf[1], sizeof(unsigned long)); gdbstub_attach(ctx); resume = 1; gdb_arch_resume(regs, addr, type, ctx); break; default: gdb_send_reply("", ctx); break; } return resume; } static struct gdb_context __gdb_ctx = { .serhnd = -1, .running = ATOMIC_INIT(1), .signum = 1 }; static struct gdb_context *gdb_ctx = &__gdb_ctx; static void gdbstub_console_puts(const char *str) { const char *p; gdb_start_packet(gdb_ctx); gdb_write_to_packet_char('O', gdb_ctx); for ( p = str; *p != '\0'; p++ ) { gdb_write_to_packet_char(hex2char((*p>>4) & 0x0f), gdb_ctx ); gdb_write_to_packet_char(hex2char((*p) & 0x0f), gdb_ctx ); } gdb_send_packet(gdb_ctx); } /* trap handler: main entry point */ int __trap_to_gdb(struct cpu_user_regs *regs, unsigned long cookie) { int rc = 0; unsigned long flags; if ( gdb_ctx->serhnd < 0 ) { printk("Debugging connection not set up.\n"); return -EBUSY; } /* We rely on our caller to ensure we're only on one processor * at a time... We should probably panic here, but given that * we're a debugger we should probably be a little tolerant of * things going wrong. */ /* We don't want to use a spin lock here, because we're doing two distinct things: 1 -- we don't want to run on more than one processor at a time, and 2 -- we want to do something sensible if we re-enter ourselves. Spin locks are good for 1, but useless for 2. 
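 * (Concretely: .running starts at 1, so the first trap's
 * atomic_dec_and_test() takes it from 1 to 0 and succeeds; a nested or
 * concurrent trap takes it from 0 to -1, fails the test, undoes its
 * decrement with atomic_inc() and backs out with -EBUSY.)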
*/ if ( !atomic_dec_and_test(&gdb_ctx->running) ) { printk("WARNING WARNING WARNING: Avoiding recursive gdb.\n"); atomic_inc(&gdb_ctx->running); return -EBUSY; } if ( !gdb_ctx->connected ) { printk("GDB connection activated.\n"); gdb_arch_print_state(regs); gdb_ctx->connected = 1; } gdb_smp_pause(); local_irq_save(flags); watchdog_disable(); console_start_sync(); gdb_arch_enter(regs); gdb_ctx->signum = gdb_arch_signal_num(regs, cookie); /* If gdb is already attached, tell it we've stopped again. */ if ( gdb_ctx->currently_attached ) { gdb_start_packet(gdb_ctx); gdb_cmd_signum(gdb_ctx); } do { if ( receive_command(gdb_ctx) < 0 ) { dbg_printk("Error in GDB session...\n"); rc = -EIO; break; } } while ( process_command(regs, gdb_ctx) == 0 ); gdb_smp_resume(); gdb_arch_exit(regs); console_end_sync(); watchdog_enable(); atomic_inc(&gdb_ctx->running); local_irq_restore(flags); return rc; } static int __init initialise_gdb(void) { if ( *opt_gdb == '\0' ) return 0; gdb_ctx->serhnd = serial_parse_handle(opt_gdb); if ( gdb_ctx->serhnd == -1 ) { printk("Bad gdb= option '%s'\n", opt_gdb); return 0; } serial_start_sync(gdb_ctx->serhnd); printk("GDB stub initialised.\n"); return 0; } presmp_initcall(initialise_gdb); static void gdb_pause_this_cpu(void *unused) { unsigned long flags; local_irq_save(flags); atomic_set(&gdb_cpu[smp_processor_id()].ack, 1); atomic_inc(&gdb_smp_paused_count); while ( atomic_read(&gdb_cpu[smp_processor_id()].paused) ) mdelay(1); atomic_dec(&gdb_smp_paused_count); atomic_set(&gdb_cpu[smp_processor_id()].ack, 0); /* Restore interrupts */ local_irq_restore(flags); } static void gdb_smp_pause(void) { int timeout = 100; int cpu; for_each_online_cpu(cpu) { atomic_set(&gdb_cpu[cpu].ack, 0); atomic_set(&gdb_cpu[cpu].paused, 1); } atomic_set(&gdb_smp_paused_count, 0); smp_call_function(gdb_pause_this_cpu, NULL, /* dont wait! */0); /* Wait 100ms for all other CPUs to enter pause loop */ while ( (atomic_read(&gdb_smp_paused_count) < (num_online_cpus() - 1)) && (timeout-- > 0) ) mdelay(1); if ( atomic_read(&gdb_smp_paused_count) < (num_online_cpus() - 1) ) { printk("GDB: Not all CPUs have paused, missing CPUs "); for_each_online_cpu(cpu) { if ( (cpu != smp_processor_id()) && !atomic_read(&gdb_cpu[cpu].ack) ) printk("%d ", cpu); } printk("\n"); } } static void gdb_smp_resume(void) { int cpu; int timeout = 100; for_each_online_cpu(cpu) atomic_set(&gdb_cpu[cpu].paused, 0); /* Make sure all CPUs resume */ while ( (atomic_read(&gdb_smp_paused_count) > 0) && (timeout-- > 0) ) mdelay(1); if ( atomic_read(&gdb_smp_paused_count) > 0 ) { printk("GDB: Not all CPUs have resumed execution, missing CPUs "); for_each_online_cpu(cpu) { if ( (cpu != smp_processor_id()) && atomic_read(&gdb_cpu[cpu].ack) ) printk("%d ", cpu); } printk("\n"); } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * End: */ xen-4.4.0/xen/common/time.c0000664000175000017500000000507212307313555013633 0ustar smbsmb/****************************************************************************** * time.c * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include /* Nonzero if YEAR is a leap year (every 4 years, except every 100th isn't, and every 400th is). */ #define __isleap(year) \ ((year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0)) /* How many days are in each month. */ const unsigned short int __mon_lengths[2][12] = { /* Normal years. */ {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, /* Leap years. */ {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31} }; #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) struct tm gmtime(unsigned long t) { struct tm tbuf; long days, rem; int y; const unsigned short int *ip; y = 1970; #if BITS_PER_LONG >= 64 /* Allow the concept of time before 1970. 64-bit only; for 32-bit * time after 2038 seems more important than time before 1970. */ while ( t & (1UL<<39) ) { y -= 400; t += ((unsigned long)(365 * 303 + 366 * 97)) * SECS_PER_DAY; } t &= (1UL << 40) - 1; #endif days = t / SECS_PER_DAY; rem = t % SECS_PER_DAY; tbuf.tm_hour = rem / SECS_PER_HOUR; rem %= SECS_PER_HOUR; tbuf.tm_min = rem / 60; tbuf.tm_sec = rem % 60; /* January 1, 1970 was a Thursday. */ tbuf.tm_wday = (4 + days) % 7; if ( tbuf.tm_wday < 0 ) tbuf.tm_wday += 7; while ( days >= (rem = __isleap(y) ? 366 : 365) ) { ++y; days -= rem; } while ( days < 0 ) { --y; days += __isleap(y) ? 366 : 365; } tbuf.tm_year = y - 1900; tbuf.tm_yday = days; ip = (const unsigned short int *)__mon_lengths[__isleap(y)]; for ( y = 0; days >= ip[y]; ++y ) days -= ip[y]; tbuf.tm_mon = y; tbuf.tm_mday = days + 1; tbuf.tm_isdst = -1; return tbuf; } xen-4.4.0/xen/common/lib.c0000664000175000017500000003527312307313555013451 0ustar smbsmb #include #include #include #include #include /* for ctype.h */ const unsigned char _ctype[] = { _C,_C,_C,_C,_C,_C,_C,_C, /* 0-7 */ _C,_C|_S,_C|_S,_C|_S,_C|_S,_C|_S,_C,_C, /* 8-15 */ _C,_C,_C,_C,_C,_C,_C,_C, /* 16-23 */ _C,_C,_C,_C,_C,_C,_C,_C, /* 24-31 */ _S|_SP,_P,_P,_P,_P,_P,_P,_P, /* 32-39 */ _P,_P,_P,_P,_P,_P,_P,_P, /* 40-47 */ _D,_D,_D,_D,_D,_D,_D,_D, /* 48-55 */ _D,_D,_P,_P,_P,_P,_P,_P, /* 56-63 */ _P,_U|_X,_U|_X,_U|_X,_U|_X,_U|_X,_U|_X,_U, /* 64-71 */ _U,_U,_U,_U,_U,_U,_U,_U, /* 72-79 */ _U,_U,_U,_U,_U,_U,_U,_U, /* 80-87 */ _U,_U,_U,_P,_P,_P,_P,_P, /* 88-95 */ _P,_L|_X,_L|_X,_L|_X,_L|_X,_L|_X,_L|_X,_L, /* 96-103 */ _L,_L,_L,_L,_L,_L,_L,_L, /* 104-111 */ _L,_L,_L,_L,_L,_L,_L,_L, /* 112-119 */ _L,_L,_L,_P,_P,_P,_P,_C, /* 120-127 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 128-143 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 144-159 */ _S|_SP,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P, /* 160-175 */ _P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P,_P, /* 176-191 */ _U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U,_U, /* 192-207 */ _U,_U,_U,_U,_U,_U,_U,_P,_U,_U,_U,_U,_U,_U,_U,_L, /* 208-223 */ _L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L,_L, /* 224-239 */ _L,_L,_L,_L,_L,_L,_L,_P,_L,_L,_L,_L,_L,_L,_L,_L}; /* 240-255 */ /* * A couple of 64 bit operations ported from FreeBSD. * The code within the '#if BITS_PER_LONG == 32' block below, and no other * code in this file, is distributed under the following licensing terms * This is the modified '3-clause' BSD license with the obnoxious * advertising clause removed, as permitted by University of California. * * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. 
* * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if BITS_PER_LONG == 32 /* * Depending on the desired operation, we view a `long long' (aka quad_t) in * one or more of the following formats. */ union uu { s64 q; /* as a (signed) quad */ s64 uq; /* as an unsigned quad */ long sl[2]; /* as two signed longs */ unsigned long ul[2]; /* as two unsigned longs */ }; #ifdef __BIG_ENDIAN #define _QUAD_HIGHWORD 0 #define _QUAD_LOWWORD 1 #else /* __LITTLE_ENDIAN */ #define _QUAD_HIGHWORD 1 #define _QUAD_LOWWORD 0 #endif /* * Define high and low longwords. */ #define H _QUAD_HIGHWORD #define L _QUAD_LOWWORD /* * Total number of bits in a quad_t and in the pieces that make it up. * These are used for shifting, and also below for halfword extraction * and assembly. */ #define CHAR_BIT 8 /* number of bits in a char */ #define QUAD_BITS (sizeof(s64) * CHAR_BIT) #define LONG_BITS (sizeof(long) * CHAR_BIT) #define HALF_BITS (sizeof(long) * CHAR_BIT / 2) /* * Extract high and low shortwords from longword, and move low shortword of * longword to upper half of long, i.e., produce the upper longword of * ((quad_t)(x) << (number_of_bits_in_long/2)). (`x' must actually be u_long.) * * These are used in the multiply code, to split a longword into upper * and lower halves, and to reassemble a product as a quad_t, shifted left * (sizeof(long)*CHAR_BIT/2). */ #define HHALF(x) ((x) >> HALF_BITS) #define LHALF(x) ((x) & ((1 << HALF_BITS) - 1)) #define LHUP(x) ((x) << HALF_BITS) /* * Multiprecision divide. This algorithm is from Knuth vol. 2 (2nd ed), * section 4.3.1, pp. 257--259. */ #define B (1 << HALF_BITS) /* digit base */ /* Combine two `digits' to make a single two-digit number. */ #define COMBINE(a, b) (((u_long)(a) << HALF_BITS) | (b)) /* select a type for digits in base B */ typedef u_long digit; /* * Shift p[0]..p[len] left `sh' bits, ignoring any bits that * `fall out' the left (there never will be any such anyway). * We may assume len >= 0. 
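 * (Digit representation used here and in __qdivrem, worked through for the
 * 32-bit case where HALF_BITS == 16: with x == 0x12345678, HHALF(x) is
 * 0x1234, LHALF(x) is 0x5678, and COMBINE(0x1234, 0x5678) rebuilds
 * 0x12345678; each u_long holds one digit in base B == 1 << 16.)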
NOTE THAT THIS WRITES len+1 DIGITS. */ static void shl(register digit *p, register int len, register int sh) { register int i; for (i = 0; i < len; i++) p[i] = LHALF(p[i] << sh) | (p[i + 1] >> (HALF_BITS - sh)); p[i] = LHALF(p[i] << sh); } /* * __qdivrem(u, v, rem) returns u/v and, optionally, sets *rem to u%v. * * We do this in base 2-sup-HALF_BITS, so that all intermediate products * fit within u_long. As a consequence, the maximum length dividend and * divisor are 4 `digits' in this base (they are shorter if they have * leading zeros). */ u64 __qdivrem(u64 uq, u64 vq, u64 *arq) { union uu tmp; digit *u, *v, *q; register digit v1, v2; u_long qhat, rhat, t; int m, n, d, j, i; digit uspace[5], vspace[5], qspace[5]; /* * Take care of special cases: divide by zero, and u < v. */ if (vq == 0) { /* divide by zero. */ static volatile const unsigned int zero = 0; tmp.ul[H] = tmp.ul[L] = 1 / zero; if (arq) *arq = uq; return (tmp.q); } if (uq < vq) { if (arq) *arq = uq; return (0); } u = &uspace[0]; v = &vspace[0]; q = &qspace[0]; /* * Break dividend and divisor into digits in base B, then * count leading zeros to determine m and n. When done, we * will have: * u = (u[1]u[2]...u[m+n]) sub B * v = (v[1]v[2]...v[n]) sub B * v[1] != 0 * 1 < n <= 4 (if n = 1, we use a different division algorithm) * m >= 0 (otherwise u < v, which we already checked) * m + n = 4 * and thus * m = 4 - n <= 2 */ tmp.uq = uq; u[0] = 0; u[1] = HHALF(tmp.ul[H]); u[2] = LHALF(tmp.ul[H]); u[3] = HHALF(tmp.ul[L]); u[4] = LHALF(tmp.ul[L]); tmp.uq = vq; v[1] = HHALF(tmp.ul[H]); v[2] = LHALF(tmp.ul[H]); v[3] = HHALF(tmp.ul[L]); v[4] = LHALF(tmp.ul[L]); for (n = 4; v[1] == 0; v++) { if (--n == 1) { u_long rbj; /* r*B+u[j] (not root boy jim) */ digit q1, q2, q3, q4; /* * Change of plan, per exercise 16. * r = 0; * for j = 1..4: * q[j] = floor((r*B + u[j]) / v), * r = (r*B + u[j]) % v; * We unroll this completely here. */ t = v[2]; /* nonzero, by definition */ q1 = u[1] / t; rbj = COMBINE(u[1] % t, u[2]); q2 = rbj / t; rbj = COMBINE(rbj % t, u[3]); q3 = rbj / t; rbj = COMBINE(rbj % t, u[4]); q4 = rbj / t; if (arq) *arq = rbj % t; tmp.ul[H] = COMBINE(q1, q2); tmp.ul[L] = COMBINE(q3, q4); return (tmp.q); } } /* * By adjusting q once we determine m, we can guarantee that * there is a complete four-digit quotient at &qspace[1] when * we finally stop. */ for (m = 4 - n; u[1] == 0; u++) m--; for (i = 4 - m; --i >= 0;) q[i] = 0; q += 4 - m; /* * Here we run Program D, translated from MIX to C and acquiring * a few minor changes. * * D1: choose multiplier 1 << d to ensure v[1] >= B/2. */ d = 0; for (t = v[1]; t < B / 2; t <<= 1) d++; if (d > 0) { shl(&u[0], m + n, d); /* u <<= d */ shl(&v[1], n - 1, d); /* v <<= d */ } /* * D2: j = 0. */ j = 0; v1 = v[1]; /* for D3 -- note that v[1..n] are constant */ v2 = v[2]; /* for D3 */ do { register digit uj0, uj1, uj2; /* * D3: Calculate qhat (\^q, in TeX notation). * Let qhat = min((u[j]*B + u[j+1])/v[1], B-1), and * let rhat = (u[j]*B + u[j+1]) mod v[1]. * While rhat < B and v[2]*qhat > rhat*B+u[j+2], * decrement qhat and increase rhat correspondingly. * Note that if rhat >= B, v[2]*qhat < rhat*B. */ uj0 = u[j + 0]; /* for D3 only -- note that u[j+...] change */ uj1 = u[j + 1]; /* for D3 only */ uj2 = u[j + 2]; /* for D3 only */ if (uj0 == v1) { qhat = B; rhat = uj1; goto qhat_too_big; } else { u_long nn = COMBINE(uj0, uj1); qhat = nn / v1; rhat = nn % v1; } while (v2 * qhat > COMBINE(rhat, uj2)) { qhat_too_big: qhat--; if ((rhat += v1) >= B) break; } /* * D4: Multiply and subtract. 
* The variable `t' holds any borrows across the loop. * We split this up so that we do not require v[0] = 0, * and to eliminate a final special case. */ for (t = 0, i = n; i > 0; i--) { t = u[i + j] - v[i] * qhat - t; u[i + j] = LHALF(t); t = (B - HHALF(t)) & (B - 1); } t = u[j] - t; u[j] = LHALF(t); /* * D5: test remainder. * There is a borrow if and only if HHALF(t) is nonzero; * in that (rare) case, qhat was too large (by exactly 1). * Fix it by adding v[1..n] to u[j..j+n]. */ if (HHALF(t)) { qhat--; for (t = 0, i = n; i > 0; i--) { /* D6: add back. */ t += u[i + j] + v[i]; u[i + j] = LHALF(t); t = HHALF(t); } u[j] = LHALF(u[j] + t); } q[j] = qhat; } while (++j <= m); /* D7: loop on j. */ /* * If caller wants the remainder, we have to calculate it as * u[m..m+n] >> d (this is at most n digits and thus fits in * u[m+1..m+n], but we may need more source digits). */ if (arq) { if (d) { for (i = m + n; i > m; --i) u[i] = (u[i] >> d) | LHALF(u[i - 1] << (HALF_BITS - d)); u[i] = 0; } tmp.ul[H] = COMBINE(uspace[1], uspace[2]); tmp.ul[L] = COMBINE(uspace[3], uspace[4]); *arq = tmp.q; } tmp.ul[H] = COMBINE(qspace[1], qspace[2]); tmp.ul[L] = COMBINE(qspace[3], qspace[4]); return (tmp.q); } /* * Divide two signed quads. * Truncates towards zero, as required by C99. */ s64 __divdi3(s64 a, s64 b) { u64 ua, ub, uq; int neg = (a < 0) ^ (b < 0); ua = (a < 0) ? -(u64)a : a; ub = (b < 0) ? -(u64)b : b; uq = __qdivrem(ua, ub, (u64 *)0); return (neg ? -uq : uq); } /* * Divide two unsigned quads. */ u64 __udivdi3(u64 a, u64 b) { return __qdivrem(a, b, (u64 *)0); } /* * Remainder of unsigned quad division */ u64 __umoddi3(u64 a, u64 b) { u64 rem; __qdivrem(a, b, &rem); return rem; } /* * Remainder of signed quad division. * Truncates towards zero, as required by C99: * 11 % 5 = 1 * -11 % 5 = -1 * 11 % -5 = 1 * -11 % -5 = 1 */ s64 __moddi3(s64 a, s64 b) { u64 ua, ub, urem; int neg = (a < 0); ua = neg ? -(u64)a : a; ub = (b < 0) ? -(u64)b : b; __qdivrem(ua, ub, &urem); return (neg ? 
-urem : urem); } /* * Quotient and remainder of unsigned long long division */ s64 __ldivmod_helper(s64 a, s64 b, s64 *r) { u64 ua, ub, rem, quot; ua = ABS(a); ub = ABS(b); quot = __qdivrem(ua, ub, &rem); if ( a < 0 ) *r = -rem; else *r = rem; if ( (a < 0) ^ (b < 0) ) return -quot; else return quot; } #endif /* BITS_PER_LONG == 32 */ /* Compute with 96 bit intermediate result: (a*b)/c */ uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) { #ifdef CONFIG_X86 asm ( "mul %%rdx; div %%rcx" : "=a" (a) : "0" (a), "d" (b), "c" (c) ); return a; #else union { uint64_t ll; struct { #ifdef WORDS_BIGENDIAN uint32_t high, low; #else uint32_t low, high; #endif } l; } u, res; uint64_t rl, rh; u.ll = a; rl = (uint64_t)u.l.low * (uint64_t)b; rh = (uint64_t)u.l.high * (uint64_t)b; rh += (rl >> 32); res.l.high = rh / c; res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c; return res.ll; #endif } unsigned long long parse_size_and_unit(const char *s, const char **ps) { unsigned long long ret; const char *s1; ret = simple_strtoull(s, &s1, 0); switch ( *s1 ) { case 'G': case 'g': ret <<= 10; case 'M': case 'm': ret <<= 10; case 'K': case 'k': ret <<= 10; case 'B': case 'b': s1++; break; default: ret <<= 10; /* default to kB */ break; } if ( ps != NULL ) *ps = s1; return ret; } typedef void (*ctor_func_t)(void); extern const ctor_func_t __ctors_start[], __ctors_end[]; void __init init_constructors(void) { const ctor_func_t *f; for ( f = __ctors_start; f < __ctors_end; ++f ) (*f)(); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/event_2l.c0000664000175000017500000000616412307313555014416 0ustar smbsmb/* * Event channel port operations. * * Copyright (c) 2003-2006, K A Fraser. * * This source code is licensed under the GNU General Public License, * Version 2 or later. See the file COPYING for more details. */ #include #include #include #include #include #include static void evtchn_2l_set_pending(struct vcpu *v, struct evtchn *evtchn) { struct domain *d = v->domain; unsigned int port = evtchn->port; /* * The following bit operations must happen in strict order. * NB. On x86, the atomic bit operations also act as memory barriers. * There is therefore sufficiently strict ordering for this architecture -- * others may require explicit memory barriers. */ if ( test_and_set_bit(port, &shared_info(d, evtchn_pending)) ) return; if ( !test_bit (port, &shared_info(d, evtchn_mask)) && !test_and_set_bit(port / BITS_PER_EVTCHN_WORD(d), &vcpu_info(v, evtchn_pending_sel)) ) { vcpu_mark_events_pending(v); } evtchn_check_pollers(d, port); } static void evtchn_2l_clear_pending(struct domain *d, struct evtchn *evtchn) { clear_bit(evtchn->port, &shared_info(d, evtchn_pending)); } static void evtchn_2l_unmask(struct domain *d, struct evtchn *evtchn) { struct vcpu *v = d->vcpu[evtchn->notify_vcpu_id]; unsigned int port = evtchn->port; /* * These operations must happen in strict order. Based on * evtchn_2l_set_pending() above. 
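 * (Bit layout, for orientation: every port owns one bit in the shared_info
 * evtchn_pending and evtchn_mask arrays plus a per-vCPU selector bit chosen
 * by port / BITS_PER_EVTCHN_WORD(d). With 64-bit event words, port 130 for
 * example lives in word 2, so unmasking a pending port 130 sets selector
 * bit 2 in evtchn_pending_sel.)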
*/ if ( test_and_clear_bit(port, &shared_info(d, evtchn_mask)) && test_bit (port, &shared_info(d, evtchn_pending)) && !test_and_set_bit (port / BITS_PER_EVTCHN_WORD(d), &vcpu_info(v, evtchn_pending_sel)) ) { vcpu_mark_events_pending(v); } } static bool_t evtchn_2l_is_pending(struct domain *d, const struct evtchn *evtchn) { return test_bit(evtchn->port, &shared_info(d, evtchn_pending)); } static bool_t evtchn_2l_is_masked(struct domain *d, const struct evtchn *evtchn) { return test_bit(evtchn->port, &shared_info(d, evtchn_mask)); } static void evtchn_2l_print_state(struct domain *d, const struct evtchn *evtchn) { struct vcpu *v = d->vcpu[evtchn->notify_vcpu_id]; printk("%d", !!test_bit(evtchn->port / BITS_PER_EVTCHN_WORD(d), &vcpu_info(v, evtchn_pending_sel))); } static const struct evtchn_port_ops evtchn_port_ops_2l = { .set_pending = evtchn_2l_set_pending, .clear_pending = evtchn_2l_clear_pending, .unmask = evtchn_2l_unmask, .is_pending = evtchn_2l_is_pending, .is_masked = evtchn_2l_is_masked, .print_state = evtchn_2l_print_state, }; void evtchn_2l_init(struct domain *d) { d->evtchn_port_ops = &evtchn_port_ops_2l; d->max_evtchns = BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/core_parking.c0000664000175000017500000001450612307313555015342 0ustar smbsmb/* * core_parking.c - implement core parking according to dom0 requirement * * Copyright (C) 2012, Intel Corporation. * Author: Liu, Jinsong * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ #include #include #include #include #include #include #define CORE_PARKING_INCREMENT 1 #define CORE_PARKING_DECREMENT 2 static unsigned int core_parking_power(unsigned int event); static unsigned int core_parking_performance(unsigned int event); static uint32_t cur_idle_nums; static unsigned int core_parking_cpunum[NR_CPUS] = {[0 ... 
NR_CPUS-1] = -1}; static struct core_parking_policy { char name[30]; unsigned int (*next)(unsigned int event); } *core_parking_policy; static enum core_parking_controller { POWER_FIRST, PERFORMANCE_FIRST } core_parking_controller = POWER_FIRST; static void __init setup_core_parking_option(char *str) { if ( !strcmp(str, "power") ) core_parking_controller = POWER_FIRST; else if ( !strcmp(str, "performance") ) core_parking_controller = PERFORMANCE_FIRST; else return; } custom_param("core_parking", setup_core_parking_option); static unsigned int core_parking_performance(unsigned int event) { unsigned int cpu = -1; switch ( event ) { case CORE_PARKING_INCREMENT: { int core_tmp, core_weight = -1; int sibling_tmp, sibling_weight = -1; cpumask_t core_candidate_map, sibling_candidate_map; cpumask_clear(&core_candidate_map); cpumask_clear(&sibling_candidate_map); for_each_cpu(cpu, &cpu_online_map) { if ( cpu == 0 ) continue; core_tmp = cpumask_weight(per_cpu(cpu_core_mask, cpu)); if ( core_weight < core_tmp ) { core_weight = core_tmp; cpumask_clear(&core_candidate_map); cpumask_set_cpu(cpu, &core_candidate_map); } else if ( core_weight == core_tmp ) cpumask_set_cpu(cpu, &core_candidate_map); } for_each_cpu(cpu, &core_candidate_map) { sibling_tmp = cpumask_weight(per_cpu(cpu_sibling_mask, cpu)); if ( sibling_weight < sibling_tmp ) { sibling_weight = sibling_tmp; cpumask_clear(&sibling_candidate_map); cpumask_set_cpu(cpu, &sibling_candidate_map); } else if ( sibling_weight == sibling_tmp ) cpumask_set_cpu(cpu, &sibling_candidate_map); } cpu = cpumask_first(&sibling_candidate_map); } break; case CORE_PARKING_DECREMENT: { cpu = core_parking_cpunum[cur_idle_nums -1]; } break; default: break; } return cpu; } static unsigned int core_parking_power(unsigned int event) { unsigned int cpu = -1; switch ( event ) { case CORE_PARKING_INCREMENT: { int core_tmp, core_weight = NR_CPUS + 1; int sibling_tmp, sibling_weight = NR_CPUS + 1; cpumask_t core_candidate_map, sibling_candidate_map; cpumask_clear(&core_candidate_map); cpumask_clear(&sibling_candidate_map); for_each_cpu(cpu, &cpu_online_map) { if ( cpu == 0 ) continue; core_tmp = cpumask_weight(per_cpu(cpu_core_mask, cpu)); if ( core_weight > core_tmp ) { core_weight = core_tmp; cpumask_clear(&core_candidate_map); cpumask_set_cpu(cpu, &core_candidate_map); } else if ( core_weight == core_tmp ) cpumask_set_cpu(cpu, &core_candidate_map); } for_each_cpu(cpu, &core_candidate_map) { sibling_tmp = cpumask_weight(per_cpu(cpu_sibling_mask, cpu)); if ( sibling_weight > sibling_tmp ) { sibling_weight = sibling_tmp; cpumask_clear(&sibling_candidate_map); cpumask_set_cpu(cpu, &sibling_candidate_map); } else if ( sibling_weight == sibling_tmp ) cpumask_set_cpu(cpu, &sibling_candidate_map); } cpu = cpumask_first(&sibling_candidate_map); } break; case CORE_PARKING_DECREMENT: { cpu = core_parking_cpunum[cur_idle_nums -1]; } break; default: break; } return cpu; } long core_parking_helper(void *data) { uint32_t idle_nums = (unsigned long)data; unsigned int cpu; int ret = 0; if ( !core_parking_policy ) return -EINVAL; while ( cur_idle_nums < idle_nums ) { cpu = core_parking_policy->next(CORE_PARKING_INCREMENT); ret = cpu_down(cpu); if ( ret ) return ret; core_parking_cpunum[cur_idle_nums++] = cpu; } while ( cur_idle_nums > idle_nums ) { cpu = core_parking_policy->next(CORE_PARKING_DECREMENT); ret = cpu_up(cpu); if ( ret ) return ret; core_parking_cpunum[--cur_idle_nums] = -1; } return ret; } uint32_t get_cur_idle_nums(void) { return cur_idle_nums; } static struct 
core_parking_policy power_first = { .name = "power", .next = core_parking_power, }; static struct core_parking_policy performance_first = { .name = "performance", .next = core_parking_performance, }; static int register_core_parking_policy(struct core_parking_policy *policy) { if ( !policy || !policy->next ) return -EINVAL; core_parking_policy = policy; return 0; } static int __init core_parking_init(void) { int ret = 0; if ( core_parking_controller == PERFORMANCE_FIRST ) ret = register_core_parking_policy(&performance_first); else ret = register_core_parking_policy(&power_first); return ret; } __initcall(core_parking_init); xen-4.4.0/xen/common/preempt.c0000664000175000017500000000242712307313555014352 0ustar smbsmb/****************************************************************************** * preempt.c * * Track atomic regions in the hypervisor which disallow sleeping. * * Copyright (c) 2010, Keir Fraser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include DEFINE_PER_CPU(unsigned int, __preempt_count); bool_t in_atomic(void) { return preempt_count() || in_irq() || !local_irq_is_enabled(); } #ifndef NDEBUG void ASSERT_NOT_IN_ATOMIC(void) { ASSERT(!preempt_count()); ASSERT(!in_irq()); ASSERT(local_irq_is_enabled()); } #endif xen-4.4.0/xen/common/vmap.c0000664000175000017500000001241312307313555013635 0ustar smbsmb#ifdef VMAP_VIRT_START #include #include #include #include #include #include #include #include #include static DEFINE_SPINLOCK(vm_lock); static void *__read_mostly vm_base; #define vm_bitmap ((unsigned long *)vm_base) /* highest allocated bit in the bitmap */ static unsigned int __read_mostly vm_top; /* total number of bits in the bitmap */ static unsigned int __read_mostly vm_end; /* lowest known clear bit in the bitmap */ static unsigned int vm_low; void __init vm_init(void) { unsigned int i, nr; unsigned long va; vm_base = (void *)VMAP_VIRT_START; vm_end = PFN_DOWN(arch_vmap_virt_end() - vm_base); vm_low = PFN_UP((vm_end + 7) / 8); nr = PFN_UP((vm_low + 7) / 8); vm_top = nr * PAGE_SIZE * 8; for ( i = 0, va = (unsigned long)vm_bitmap; i < nr; ++i, va += PAGE_SIZE ) { struct page_info *pg = alloc_domheap_page(NULL, 0); map_pages_to_xen(va, page_to_mfn(pg), 1, PAGE_HYPERVISOR); clear_page((void *)va); } bitmap_fill(vm_bitmap, vm_low); /* Populate page tables for the bitmap if necessary. 
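 * (Everything set up here serves vm_alloc() and vmap() below; a hypothetical
 * caller would do
 *     void *va = vmap(mfns, 2);
 *     ...
 *     vunmap(va);
 * with mfns an array of two machine frame numbers. The bit skipped at the
 * start of each vm_alloc() range leaves an unmapped guard page in front of
 * the returned area.)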
*/ map_pages_to_xen(va, 0, vm_low - nr, MAP_SMALL_PAGES); } void *vm_alloc(unsigned int nr, unsigned int align) { unsigned int start, bit; if ( !align ) align = 1; else if ( align & (align - 1) ) align &= -align; spin_lock(&vm_lock); for ( ; ; ) { struct page_info *pg; ASSERT(vm_low == vm_top || !test_bit(vm_low, vm_bitmap)); for ( start = vm_low; start < vm_top; ) { bit = find_next_bit(vm_bitmap, vm_top, start + 1); if ( bit > vm_top ) bit = vm_top; /* * Note that this skips the first bit, making the * corresponding page a guard one. */ start = (start + align) & ~(align - 1); if ( bit < vm_top ) { if ( start + nr < bit ) break; start = find_next_zero_bit(vm_bitmap, vm_top, bit + 1); } else { if ( start + nr <= bit ) break; start = bit; } } if ( start < vm_top ) break; spin_unlock(&vm_lock); if ( vm_top >= vm_end ) return NULL; pg = alloc_domheap_page(NULL, 0); if ( !pg ) return NULL; spin_lock(&vm_lock); if ( start >= vm_top ) { unsigned long va = (unsigned long)vm_bitmap + vm_top / 8; if ( !map_pages_to_xen(va, page_to_mfn(pg), 1, PAGE_HYPERVISOR) ) { clear_page((void *)va); vm_top += PAGE_SIZE * 8; if ( vm_top > vm_end ) vm_top = vm_end; continue; } } free_domheap_page(pg); if ( start >= vm_top ) { spin_unlock(&vm_lock); return NULL; } } for ( bit = start; bit < start + nr; ++bit ) __set_bit(bit, vm_bitmap); if ( bit < vm_top ) ASSERT(!test_bit(bit, vm_bitmap)); else ASSERT(bit == vm_top); if ( start <= vm_low + 2 ) vm_low = bit; spin_unlock(&vm_lock); return vm_base + start * PAGE_SIZE; } static unsigned int vm_index(const void *va) { unsigned long addr = (unsigned long)va & ~(PAGE_SIZE - 1); unsigned int idx; if ( addr < VMAP_VIRT_START + (vm_end / 8) || addr >= VMAP_VIRT_START + vm_top * PAGE_SIZE ) return 0; idx = PFN_DOWN(va - vm_base); return !test_bit(idx - 1, vm_bitmap) && test_bit(idx, vm_bitmap) ? idx : 0; } static unsigned int vm_size(const void *va) { unsigned int start = vm_index(va), end; if ( !start ) return 0; end = find_next_zero_bit(vm_bitmap, vm_top, start + 1); return min(end, vm_top) - start; } void vm_free(const void *va) { unsigned int bit = vm_index(va); if ( !bit ) { WARN_ON(va != NULL); return; } spin_lock(&vm_lock); if ( bit < vm_low ) { vm_low = bit - 1; while ( !test_bit(vm_low - 1, vm_bitmap) ) --vm_low; } while ( __test_and_clear_bit(bit, vm_bitmap) ) if ( ++bit == vm_top ) break; spin_unlock(&vm_lock); } void *__vmap(const unsigned long *mfn, unsigned int granularity, unsigned int nr, unsigned int align, unsigned int flags) { void *va = vm_alloc(nr * granularity, align); unsigned long cur = (unsigned long)va; for ( ; va && nr--; ++mfn, cur += PAGE_SIZE * granularity ) { if ( map_pages_to_xen(cur, *mfn, granularity, flags) ) { vunmap(va); va = NULL; } } return va; } void *vmap(const unsigned long *mfn, unsigned int nr) { return __vmap(mfn, 1, nr, 1, PAGE_HYPERVISOR); } void vunmap(const void *va) { #ifndef _PAGE_NONE unsigned long addr = (unsigned long)va; destroy_xen_mappings(addr, addr + PAGE_SIZE * vm_size(va)); #else /* Avoid tearing down intermediate page tables. */ map_pages_to_xen((unsigned long)va, 0, vm_size(va), _PAGE_NONE); #endif vm_free(va); } #endif xen-4.4.0/xen/common/kimage.c0000664000175000017500000006512212307313555014134 0ustar smbsmb/* * Kexec Image * * Copyright (C) 2013 Citrix Systems R&D Ltd. * * Derived from kernel/kexec.c from Linux: * * Copyright (C) 2002-2004 Eric Biederman * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. 
*/ #include #include #include #include #include #include #include #include #include #include #include /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors * where you can disable the MMU this is trivial, and easy. For * others it is still a simple predictable page table to setup. * * The code for the transition from the current kernel to the the new * kernel is placed in the page-size control_code_buffer. This memory * must be identity mapped in the transition from virtual to physical * addresses. * * The assembly stub in the control code buffer is passed a linked list * of descriptor pages detailing the source pages of the new kernel, * and the destination addresses of those source pages. As this data * structure is not used in the context of the current OS, it must * be self-contained. * * The code has been made to work with highmem pages and will use a * destination page in its final resting place (if it happens * to allocate it). The end product of this is that most of the * physical address space, and most of RAM can be used. * * Future directions include: * - allocating a page table with the control code buffer identity * mapped, to simplify machine_kexec and make kexec_on_panic more * reliable. */ /* * KIMAGE_NO_DEST is an impossible destination address..., for * allocating pages whose destination address we do not care about. */ #define KIMAGE_NO_DEST (-1UL) /* * Offset of the last entry in an indirection page. */ #define KIMAGE_LAST_ENTRY (PAGE_SIZE/sizeof(kimage_entry_t) - 1) static int kimage_is_destination_range(struct kexec_image *image, paddr_t start, paddr_t end); static struct page_info *kimage_alloc_page(struct kexec_image *image, paddr_t dest); static struct page_info *kimage_alloc_zeroed_page(unsigned memflags) { struct page_info *page; page = alloc_domheap_page(NULL, memflags); if ( !page ) return NULL; clear_domain_page(page_to_mfn(page)); return page; } static int do_kimage_alloc(struct kexec_image **rimage, paddr_t entry, unsigned long nr_segments, xen_kexec_segment_t *segments, uint8_t type) { struct kexec_image *image; unsigned long i; int result; /* Allocate a controlling structure */ result = -ENOMEM; image = xzalloc(typeof(*image)); if ( !image ) goto out; image->entry_maddr = entry; image->type = type; image->nr_segments = nr_segments; image->segments = segments; image->next_crash_page = kexec_crash_area.start; INIT_PAGE_LIST_HEAD(&image->control_pages); INIT_PAGE_LIST_HEAD(&image->dest_pages); INIT_PAGE_LIST_HEAD(&image->unusable_pages); /* * Verify we have good destination addresses. The caller is * responsible for making certain we don't attempt to load the new * image into invalid or reserved areas of RAM. This just * verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure the * destination addresses are page aligned. Too many special cases * crop of when we don't do this. The most insidious is getting * overlapping destination addresses simply because addresses are * changed to page size granularity. */ result = -EADDRNOTAVAIL; for ( i = 0; i < nr_segments; i++ ) { paddr_t mstart, mend; mstart = image->segments[i].dest_maddr; mend = mstart + image->segments[i].dest_size; if ( (mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK) ) goto out; } /* * Verify our destination addresses do not overlap. If we allowed * overlapping destination addresses through very weird things can * happen with no easy explanation as one segment stops on * another. 
*/ result = -EINVAL; for ( i = 0; i < nr_segments; i++ ) { paddr_t mstart, mend; unsigned long j; mstart = image->segments[i].dest_maddr; mend = mstart + image->segments[i].dest_size; for (j = 0; j < i; j++ ) { paddr_t pstart, pend; pstart = image->segments[j].dest_maddr; pend = pstart + image->segments[j].dest_size; /* Do the segments overlap? */ if ( (mend > pstart) && (mstart < pend) ) goto out; } } /* * Ensure our buffer sizes are strictly less than our memory * sizes. This should always be the case, and it is easier to * check up front than to be surprised later on. */ result = -EINVAL; for ( i = 0; i < nr_segments; i++ ) { if ( image->segments[i].buf_size > image->segments[i].dest_size ) goto out; } /* * Page for the relocation code must still be accessible after the * processor has switched to 32-bit mode. */ result = -ENOMEM; image->control_code_page = kimage_alloc_control_page(image, MEMF_bits(32)); if ( !image->control_code_page ) goto out; result = machine_kexec_add_page(image, page_to_maddr(image->control_code_page), page_to_maddr(image->control_code_page)); if ( result < 0 ) goto out; /* Add an empty indirection page. */ image->entry_page = kimage_alloc_control_page(image, 0); if ( !image->entry_page ) goto out; result = machine_kexec_add_page(image, page_to_maddr(image->entry_page), page_to_maddr(image->entry_page)); if ( result < 0 ) goto out; image->head = page_to_maddr(image->entry_page); result = 0; out: if ( result == 0 ) *rimage = image; else if ( image ) { image->segments = NULL; /* caller frees segments after an error */ kimage_free(image); } return result; } static int kimage_normal_alloc(struct kexec_image **rimage, paddr_t entry, unsigned long nr_segments, xen_kexec_segment_t *segments) { return do_kimage_alloc(rimage, entry, nr_segments, segments, KEXEC_TYPE_DEFAULT); } static int kimage_crash_alloc(struct kexec_image **rimage, paddr_t entry, unsigned long nr_segments, xen_kexec_segment_t *segments) { unsigned long i; /* Verify we have a valid entry point */ if ( (entry < kexec_crash_area.start) || (entry > kexec_crash_area.start + kexec_crash_area.size)) return -EADDRNOTAVAIL; /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't * attempt to load the new image into invalid or reserved * areas of RAM. But crash kernels are preloaded into a * reserved area of ram. We must ensure the addresses * are in the reserved area otherwise preloading the * kernel could corrupt things. */ for ( i = 0; i < nr_segments; i++ ) { paddr_t mstart, mend; if ( guest_handle_is_null(segments[i].buf.h) ) continue; mstart = segments[i].dest_maddr; mend = mstart + segments[i].dest_size; /* Ensure we are within the crash kernel limits. */ if ( (mstart < kexec_crash_area.start ) || (mend > kexec_crash_area.start + kexec_crash_area.size)) return -EADDRNOTAVAIL; } /* Allocate and initialize a controlling structure. 
*/ return do_kimage_alloc(rimage, entry, nr_segments, segments, KEXEC_TYPE_CRASH); } static int kimage_is_destination_range(struct kexec_image *image, paddr_t start, paddr_t end) { unsigned long i; for ( i = 0; i < image->nr_segments; i++ ) { paddr_t mstart, mend; mstart = image->segments[i].dest_maddr; mend = mstart + image->segments[i].dest_size; if ( (end > mstart) && (start < mend) ) return 1; } return 0; } static void kimage_free_page_list(struct page_list_head *list) { struct page_info *page, *next; page_list_for_each_safe(page, next, list) { page_list_del(page, list); free_domheap_page(page); } } static struct page_info *kimage_alloc_normal_control_page( struct kexec_image *image, unsigned memflags) { /* * Control pages are special, they are the intermediaries that are * needed while we copy the rest of the pages to their final * resting place. As such they must not conflict with either the * destination addresses or memory the kernel is already using. * * The only case where we really need more than one of these are * for architectures where we cannot disable the MMU and must * instead generate an identity mapped page table for all of the * memory. * * At worst this runs in O(N) of the image size. */ struct page_list_head extra_pages; struct page_info *page = NULL; INIT_PAGE_LIST_HEAD(&extra_pages); /* * Loop while I can allocate a page and the page allocated is a * destination page. */ do { unsigned long mfn, emfn; paddr_t addr, eaddr; page = kimage_alloc_zeroed_page(memflags); if ( !page ) break; mfn = page_to_mfn(page); emfn = mfn + 1; addr = page_to_maddr(page); eaddr = addr + PAGE_SIZE; if ( kimage_is_destination_range(image, addr, eaddr) ) { page_list_add(page, &extra_pages); page = NULL; } } while ( !page ); if ( page ) { /* Remember the allocated page... */ page_list_add(page, &image->control_pages); /* * Because the page is already in it's destination location we * will never allocate another page at that address. * Therefore kimage_alloc_page will not return it (again) and * we don't need to give it an entry in image->segments[]. */ } /* * Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single page * allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ kimage_free_page_list(&extra_pages); return page; } static struct page_info *kimage_alloc_crash_control_page(struct kexec_image *image) { /* * Control pages are special, they are the intermediaries that are * needed while we copy the rest of the pages to their final * resting place. As such they must not conflict with either the * destination addresses or memory the kernel is already using. * * Control pages are also the only pags we must allocate when * loading a crash kernel. All of the other pages are specified * by the segments and we just memcpy into them directly. * * The only case where we really need more than one of these are * for architectures where we cannot disable the MMU and must * instead generate an identity mapped page table for all of the * memory. * * Given the low demand this implements a very simple allocator * that finds the first hole of the appropriate size in the * reserved memory region, and allocates all of the memory up to * and including the hole. 
*/ paddr_t hole_start, hole_end; struct page_info *page = NULL; hole_start = PAGE_ALIGN(image->next_crash_page); hole_end = hole_start + PAGE_SIZE; while ( hole_end <= kexec_crash_area.start + kexec_crash_area.size ) { unsigned long i; /* See if I overlap any of the segments. */ for ( i = 0; i < image->nr_segments; i++ ) { paddr_t mstart, mend; mstart = image->segments[i].dest_maddr; mend = mstart + image->segments[i].dest_size; if ( (hole_end > mstart) && (hole_start < mend) ) { /* Advance the hole to the end of the segment. */ hole_start = PAGE_ALIGN(mend); hole_end = hole_start + PAGE_SIZE; break; } } /* If I don't overlap any segments I have found my hole! */ if ( i == image->nr_segments ) { page = maddr_to_page(hole_start); break; } } if ( page ) { image->next_crash_page = hole_end; clear_domain_page(page_to_mfn(page)); } return page; } struct page_info *kimage_alloc_control_page(struct kexec_image *image, unsigned memflags) { struct page_info *pages = NULL; switch ( image->type ) { case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_page(image, memflags); break; case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_page(image); break; } return pages; } static int kimage_add_entry(struct kexec_image *image, kimage_entry_t entry) { kimage_entry_t *entries; if ( image->next_entry == KIMAGE_LAST_ENTRY ) { struct page_info *page; page = kimage_alloc_page(image, KIMAGE_NO_DEST); if ( !page ) return -ENOMEM; entries = __map_domain_page(image->entry_page); entries[image->next_entry] = page_to_maddr(page) | IND_INDIRECTION; unmap_domain_page(entries); image->entry_page = page; image->next_entry = 0; } entries = __map_domain_page(image->entry_page); entries[image->next_entry] = entry; image->next_entry++; unmap_domain_page(entries); return 0; } static int kimage_set_destination(struct kexec_image *image, paddr_t destination) { return kimage_add_entry(image, (destination & PAGE_MASK) | IND_DESTINATION); } static int kimage_add_page(struct kexec_image *image, paddr_t maddr) { return kimage_add_entry(image, (maddr & PAGE_MASK) | IND_SOURCE); } static void kimage_free_extra_pages(struct kexec_image *image) { kimage_free_page_list(&image->dest_pages); kimage_free_page_list(&image->unusable_pages); } static void kimage_terminate(struct kexec_image *image) { kimage_entry_t *entries; entries = __map_domain_page(image->entry_page); entries[image->next_entry] = IND_DONE; unmap_domain_page(entries); } /* * Iterate over all the entries in the indirection pages. * * Call unmap_domain_page(ptr) after the loop exits. */ #define for_each_kimage_entry(image, ptr, entry) \ for ( ptr = map_domain_page(image->head >> PAGE_SHIFT); \ (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? \ (unmap_domain_page(ptr), map_domain_page(entry >> PAGE_SHIFT)) \ : ptr + 1 ) static void kimage_free_entry(kimage_entry_t entry) { struct page_info *page; page = mfn_to_page(entry >> PAGE_SHIFT); free_domheap_page(page); } static void kimage_free_all_entries(struct kexec_image *image) { kimage_entry_t *ptr, entry; kimage_entry_t ind = 0; if ( !image->head ) return; for_each_kimage_entry(image, ptr, entry) { if ( entry & IND_INDIRECTION ) { /* Free the previous indirection page */ if ( ind & IND_INDIRECTION ) kimage_free_entry(ind); /* Save this indirection page until we are done with it. */ ind = entry; } else if ( entry & IND_SOURCE ) kimage_free_entry(entry); } unmap_domain_page(ptr); /* Free the final indirection page. 
*/ if ( ind & IND_INDIRECTION ) kimage_free_entry(ind); } void kimage_free(struct kexec_image *image) { if ( !image ) return; kimage_free_extra_pages(image); kimage_free_all_entries(image); kimage_free_page_list(&image->control_pages); xfree(image->segments); xfree(image); } static kimage_entry_t *kimage_dst_used(struct kexec_image *image, paddr_t maddr) { kimage_entry_t *ptr, entry; unsigned long destination = 0; for_each_kimage_entry(image, ptr, entry) { if ( entry & IND_DESTINATION ) destination = entry & PAGE_MASK; else if ( entry & IND_SOURCE ) { if ( maddr == destination ) return ptr; destination += PAGE_SIZE; } } unmap_domain_page(ptr); return NULL; } static struct page_info *kimage_alloc_page(struct kexec_image *image, paddr_t destination) { /* * Here we implement safeguards to ensure that a source page is * not copied to its destination page before the data on the * destination page is no longer useful. * * To do this we maintain the invariant that a source page is * either its own destination page, or it is not a destination * page at all. * * That is slightly stronger than required, but the proof that no * problems will not occur is trivial, and the implementation is * simply to verify. * * When allocating all pages normally this algorithm will run in * O(N) time, but in the worst case it will run in O(N^2) time. * If the runtime is a problem the data structures can be fixed. */ struct page_info *page; paddr_t addr; int ret; /* * Walk through the list of destination pages, and see if I have a * match. */ page_list_for_each(page, &image->dest_pages) { addr = page_to_maddr(page); if ( addr == destination ) { page_list_del(page, &image->dest_pages); goto found; } } page = NULL; for (;;) { kimage_entry_t *old; /* Allocate a page, if we run out of memory give up. */ page = kimage_alloc_zeroed_page(0); if ( !page ) return NULL; addr = page_to_maddr(page); /* If it is the destination page we want use it. */ if ( addr == destination ) break; /* If the page is not a destination page use it. */ if ( !kimage_is_destination_range(image, addr, addr + PAGE_SIZE) ) break; /* * I know that the page is someones destination page. See if * there is already a source page for this destination page. * And if so swap the source pages. */ old = kimage_dst_used(image, addr); if ( old ) { /* If so move it. */ unsigned long old_mfn = *old >> PAGE_SHIFT; unsigned long mfn = addr >> PAGE_SHIFT; copy_domain_page(mfn, old_mfn); clear_domain_page(old_mfn); *old = (addr & ~PAGE_MASK) | IND_SOURCE; unmap_domain_page(old); page = mfn_to_page(old_mfn); break; } else { /* * Place the page on the destination list; I will use it * later. 
*/ page_list_add(page, &image->dest_pages); } } found: ret = machine_kexec_add_page(image, page_to_maddr(page), page_to_maddr(page)); if ( ret < 0 ) { free_domheap_page(page); return NULL; } return page; } static int kimage_load_normal_segment(struct kexec_image *image, xen_kexec_segment_t *segment) { unsigned long to_copy; unsigned long src_offset; paddr_t dest, end; int ret; to_copy = segment->buf_size; src_offset = 0; dest = segment->dest_maddr; ret = kimage_set_destination(image, dest); if ( ret < 0 ) return ret; while ( to_copy ) { unsigned long dest_mfn; struct page_info *page; void *dest_va; size_t size; dest_mfn = dest >> PAGE_SHIFT; size = min_t(unsigned long, PAGE_SIZE, to_copy); page = kimage_alloc_page(image, dest); if ( !page ) return -ENOMEM; ret = kimage_add_page(image, page_to_maddr(page)); if ( ret < 0 ) return ret; dest_va = __map_domain_page(page); ret = copy_from_guest_offset(dest_va, segment->buf.h, src_offset, size); unmap_domain_page(dest_va); if ( ret ) return -EFAULT; to_copy -= size; src_offset += size; dest += PAGE_SIZE; } /* Remainder of the destination should be zeroed. */ end = segment->dest_maddr + segment->dest_size; for ( ; dest < end; dest += PAGE_SIZE ) kimage_add_entry(image, IND_ZERO); return 0; } static int kimage_load_crash_segment(struct kexec_image *image, xen_kexec_segment_t *segment) { /* * For crash dumps kernels we simply copy the data from user space * to it's destination. */ paddr_t dest; unsigned long sbytes, dbytes; int ret = 0; unsigned long src_offset = 0; sbytes = segment->buf_size; dbytes = segment->dest_size; dest = segment->dest_maddr; while ( dbytes ) { unsigned long dest_mfn; void *dest_va; size_t schunk, dchunk; dest_mfn = dest >> PAGE_SHIFT; dchunk = PAGE_SIZE; schunk = min(dchunk, sbytes); dest_va = map_domain_page(dest_mfn); if ( !dest_va ) return -EINVAL; ret = copy_from_guest_offset(dest_va, segment->buf.h, src_offset, schunk); memset(dest_va + schunk, 0, dchunk - schunk); unmap_domain_page(dest_va); if ( ret ) return -EFAULT; dbytes -= dchunk; sbytes -= schunk; dest += dchunk; src_offset += schunk; } return 0; } static int kimage_load_segment(struct kexec_image *image, xen_kexec_segment_t *segment) { int result = -ENOMEM; paddr_t addr; if ( !guest_handle_is_null(segment->buf.h) ) { switch ( image->type ) { case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, segment); break; } } for ( addr = segment->dest_maddr & PAGE_MASK; addr < segment->dest_maddr + segment->dest_size; addr += PAGE_SIZE ) { result = machine_kexec_add_page(image, addr, addr); if ( result < 0 ) break; } return result; } int kimage_alloc(struct kexec_image **rimage, uint8_t type, uint16_t arch, uint64_t entry_maddr, uint32_t nr_segments, xen_kexec_segment_t *segment) { int result; switch( type ) { case KEXEC_TYPE_DEFAULT: result = kimage_normal_alloc(rimage, entry_maddr, nr_segments, segment); break; case KEXEC_TYPE_CRASH: result = kimage_crash_alloc(rimage, entry_maddr, nr_segments, segment); break; default: result = -EINVAL; break; } if ( result < 0 ) return result; (*rimage)->arch = arch; return result; } int kimage_load_segments(struct kexec_image *image) { int s; int result; for ( s = 0; s < image->nr_segments; s++ ) { result = kimage_load_segment(image, &image->segments[s]); if ( result < 0 ) return result; } kimage_terminate(image); return 0; } kimage_entry_t *kimage_entry_next(kimage_entry_t *entry, bool_t compat) { if ( compat ) return (kimage_entry_t 
*)((uint32_t *)entry + 1); return entry + 1; } unsigned long kimage_entry_mfn(kimage_entry_t *entry, bool_t compat) { if ( compat ) return *(uint32_t *)entry >> PAGE_SHIFT; return *entry >> PAGE_SHIFT; } unsigned long kimage_entry_ind(kimage_entry_t *entry, bool_t compat) { if ( compat ) return *(uint32_t *)entry & 0xf; return *entry & 0xf; } int kimage_build_ind(struct kexec_image *image, unsigned long ind_mfn, bool_t compat) { void *page; kimage_entry_t *entry; int ret = 0; paddr_t dest = KIMAGE_NO_DEST; page = map_domain_page(ind_mfn); if ( !page ) return -ENOMEM; /* * Walk the guest-supplied indirection pages, adding entries to * the image's indirection pages. */ for ( entry = page; ; ) { unsigned long ind; unsigned long mfn; ind = kimage_entry_ind(entry, compat); mfn = kimage_entry_mfn(entry, compat); switch ( ind ) { case IND_DESTINATION: dest = (paddr_t)mfn << PAGE_SHIFT; ret = kimage_set_destination(image, dest); if ( ret < 0 ) goto done; break; case IND_INDIRECTION: unmap_domain_page(page); page = map_domain_page(mfn); entry = page; continue; case IND_DONE: kimage_terminate(image); goto done; case IND_SOURCE: { struct page_info *guest_page, *xen_page; guest_page = mfn_to_page(mfn); if ( !get_page(guest_page, current->domain) ) { ret = -EFAULT; goto done; } xen_page = kimage_alloc_page(image, dest); if ( !xen_page ) { put_page(guest_page); ret = -ENOMEM; goto done; } copy_domain_page(page_to_mfn(xen_page), mfn); put_page(guest_page); ret = kimage_add_page(image, page_to_maddr(xen_page)); if ( ret < 0 ) goto done; dest += PAGE_SIZE; break; } default: ret = -EINVAL; goto done; } entry = kimage_entry_next(entry, compat); } done: unmap_domain_page(page); return ret; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/lz4/0000775000175000017500000000000012307313555013236 5ustar smbsmbxen-4.4.0/xen/common/lz4/decompress.c0000664000175000017500000001700412307313555015550 0ustar smbsmb/* * LZ4 Decompressor for Linux kernel * * Copyright (C) 2013, LG Electronics, Kyungsik Lee * * Based on LZ4 implementation by Yann Collet. * * LZ4 - Fast LZ compression algorithm * Copyright (C) 2011-2012, Yann Collet. * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * You can contact the author at : * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html * - LZ4 source repository : http://code.google.com/p/lz4/ */ #include "defs.h" #if defined(__XEN__) || defined(__MINIOS__) static int INIT lz4_uncompress(const unsigned char *source, unsigned char *dest, int osize) { const BYTE *ip = (const BYTE *) source; const BYTE *ref; BYTE *op = (BYTE *) dest; BYTE * const oend = op + osize; BYTE *cpy; unsigned token; size_t length; size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; #if LZ4_ARCH64 size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; #endif while (1) { /* get runlength */ token = *ip++; length = (token >> ML_BITS); if (length == RUN_MASK) { size_t len; len = *ip++; for (; len == 255; length += 255) len = *ip++; length += len; } /* copy literals */ cpy = op + length; if (unlikely(cpy > oend - COPYLENGTH)) { /* * Error: not enough place for another match * (min 4) + 5 literals */ if (cpy != oend) goto _output_error; memcpy(op, ip, length); ip += length; break; /* EOF */ } LZ4_WILDCOPY(ip, op, cpy); ip -= (op - cpy); op = cpy; /* get offset */ LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); ip += 2; /* Error: offset create reference outside destination buffer */ if (unlikely(ref < (BYTE *const) dest)) goto _output_error; /* get matchlength */ length = token & ML_MASK; if (length == ML_MASK) { for (; *ip == 255; length += 255) ip++; length += *ip++; } /* copy repeated sequence */ if (unlikely((op - ref) < STEPSIZE)) { #if LZ4_ARCH64 size_t dec64 = dec64table[op - ref]; #else const int dec64 = 0; #endif op[0] = ref[0]; op[1] = ref[1]; op[2] = ref[2]; op[3] = ref[3]; op += 4; ref += 4; ref -= dec32table[op-ref]; PUT4(ref, op); op += STEPSIZE - 4; ref -= dec64; } else { LZ4_COPYSTEP(ref, op); } cpy = op + length - (STEPSIZE - 4); if (cpy > (oend - COPYLENGTH)) { /* Error: request to write beyond destination buffer */ if (cpy > oend) goto _output_error; LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); while (op < cpy) *op++ = *ref++; op = cpy; /* * Check EOF (should never happen, since last 5 bytes * are supposed to be literals) */ if (op == oend) goto _output_error; continue; } LZ4_SECURECOPY(ref, op, cpy); op = cpy; /* correction */ } /* end of decoding */ return (int) (ip - source); /* write overflow error detected */ _output_error: return (int) (-(ip - source)); } #else /* defined(__XEN__) || defined(__MINIOS__) */ static int lz4_uncompress_unknownoutputsize(const unsigned char *source, unsigned char *dest, int isize, size_t maxoutputsize) { const BYTE *ip = (const BYTE *) source; const BYTE *const iend = ip + isize; const BYTE *ref; BYTE *op = (BYTE *) dest; BYTE * const oend = op + maxoutputsize; BYTE *cpy; size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; #if LZ4_ARCH64 size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; #endif /* Main Loop */ while (ip < iend) { unsigned token; size_t length; /* get runlength */ token = *ip++; length = (token >> ML_BITS); if (length == RUN_MASK) { int s = 255; while ((ip < iend) && (s == 255)) { s = *ip++; 
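/* bytes of 255 keep extending the literal run length; the first byte below 255 ends the extension */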
length += s; } } /* copy literals */ cpy = op + length; if ((cpy > oend - COPYLENGTH) || (ip + length > iend - COPYLENGTH)) { if (cpy > oend) goto _output_error;/* writes beyond buffer */ if (ip + length != iend) goto _output_error;/* * Error: LZ4 format requires * to consume all input * at this stage */ memcpy(op, ip, length); op += length; break;/* Necessarily EOF, due to parsing restrictions */ } LZ4_WILDCOPY(ip, op, cpy); ip -= (op - cpy); op = cpy; /* get offset */ LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); ip += 2; if (ref < (BYTE * const) dest) goto _output_error; /* * Error : offset creates reference * outside of destination buffer */ /* get matchlength */ length = (token & ML_MASK); if (length == ML_MASK) { while (ip < iend) { int s = *ip++; length += s; if (s == 255) continue; break; } } /* copy repeated sequence */ if (unlikely((op - ref) < STEPSIZE)) { #if LZ4_ARCH64 size_t dec64 = dec64table[op - ref]; #else const int dec64 = 0; #endif op[0] = ref[0]; op[1] = ref[1]; op[2] = ref[2]; op[3] = ref[3]; op += 4; ref += 4; ref -= dec32table[op - ref]; PUT4(ref, op); op += STEPSIZE - 4; ref -= dec64; } else { LZ4_COPYSTEP(ref, op); } cpy = op + length - (STEPSIZE-4); if (cpy > oend - COPYLENGTH) { if (cpy > oend) goto _output_error; /* write outside of buf */ LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); while (op < cpy) *op++ = *ref++; op = cpy; /* * Check EOF (should never happen, since last 5 bytes * are supposed to be literals) */ if (op == oend) goto _output_error; continue; } LZ4_SECURECOPY(ref, op, cpy); op = cpy; /* correction */ } /* end of decoding */ return (int) (op - dest); /* write overflow error detected */ _output_error: return (int) (-(ip - source)); } #endif #if defined(__XEN__) || defined(__MINIOS__) int INIT lz4_decompress(const unsigned char *src, size_t *src_len, unsigned char *dest, size_t actual_dest_len) { int ret = -1; int input_len = 0; input_len = lz4_uncompress(src, dest, actual_dest_len); if (input_len < 0) goto exit_0; *src_len = input_len; return 0; exit_0: return ret; } #else /* defined(__XEN__) || defined(__MINIOS__) */ int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, unsigned char *dest, size_t *dest_len) { int ret = -1; int out_len = 0; out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, *dest_len); if (out_len < 0) goto exit_0; *dest_len = out_len; return 0; exit_0: return ret; } #endif xen-4.4.0/xen/common/lz4/defs.h0000664000175000017500000001034212307313555014330 0ustar smbsmb/* * lz4defs.h -- architecture specific defines * * Copyright (C) 2013, LG Electronics, Kyungsik Lee * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ #ifdef __XEN__ #include #endif #ifdef __LITTLE_ENDIAN static inline u16 INIT get_unaligned_le16(const void *p) { return le16_to_cpup(p); } static inline u32 INIT get_unaligned_le32(const void *p) { return le32_to_cpup(p); } #else #include static inline u16 INIT get_unaligned_le16(const void *p) { return le16_to_cpu(__get_unaligned(p, 2)); } static inline u32 INIT get_unaligned_le32(void *p) { return le32_to_cpu(__get_unaligned(p, 4)); } #endif /* * Detects 64 bits mode */ #if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \ || defined(__ppc64__) || defined(__LP64__)) #define LZ4_ARCH64 1 #else #define LZ4_ARCH64 0 #endif /* * Architecture-specific macros */ #define BYTE u8 typedef struct _U16_S { u16 v; } U16_S; typedef struct _U32_S { u32 v; } U32_S; typedef struct _U64_S { u64 v; } U64_S; #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \ && defined(ARM_EFFICIENT_UNALIGNED_ACCESS) #define A16(x) (((U16_S *)(x))->v) #define A32(x) (((U32_S *)(x))->v) #define A64(x) (((U64_S *)(x))->v) #define PUT4(s, d) (A32(d) = A32(s)) #define PUT8(s, d) (A64(d) = A64(s)) #define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ do { \ A16(p) = v; \ p += 2; \ } while (0) #else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ #define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v)) #define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v)) #define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v)) #define PUT4(s, d) \ put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) #define PUT8(s, d) \ put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) #define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ do { \ put_unaligned(v, (u16 *)(p)); \ p += 2; \ } while (0) #endif #define COPYLENGTH 8 #define ML_BITS 4 #define ML_MASK ((1U << ML_BITS) - 1) #define RUN_BITS (8 - ML_BITS) #define RUN_MASK ((1U << RUN_BITS) - 1) #define MEMORY_USAGE 14 #define MINMATCH 4 #define SKIPSTRENGTH 6 #define LASTLITERALS 5 #define MFLIMIT (COPYLENGTH + MINMATCH) #define MINLENGTH (MFLIMIT + 1) #define MAXD_LOG 16 #define MAXD (1 << MAXD_LOG) #define MAXD_MASK (u32)(MAXD - 1) #define MAX_DISTANCE (MAXD - 1) #define HASH_LOG (MAXD_LOG - 1) #define HASHTABLESIZE (1 << HASH_LOG) #define MAX_NB_ATTEMPTS 256 #define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) #define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) #define HASHLOG64K ((MEMORY_USAGE - 2) + 1) #define HASH64KTABLESIZE (1U << HASHLOG64K) #define LZ4_HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ ((MINMATCH * 8) - (MEMORY_USAGE-2))) #define LZ4_HASH64K_VALUE(p) (((A32(p)) * 2654435761U) >> \ ((MINMATCH * 8) - HASHLOG64K)) #define HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ ((MINMATCH * 8) - HASH_LOG)) #if LZ4_ARCH64/* 64-bit */ #define STEPSIZE 8 #define LZ4_COPYSTEP(s, d) \ do { \ PUT8(s, d); \ d += 8; \ s += 8; \ } while (0) #define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) #define LZ4_SECURECOPY(s, d, e) \ do { \ if (d < e) { \ LZ4_WILDCOPY(s, d, e); \ } \ } while (0) #define HTYPE u32 #ifdef __BIG_ENDIAN #define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) #else #define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3) #endif #else /* 32-bit */ #define STEPSIZE 4 #define LZ4_COPYSTEP(s, d) \ do { \ PUT4(s, d); \ d += 4; \ s += 4; \ } while (0) #define LZ4_COPYPACKET(s, d) \ do { \ LZ4_COPYSTEP(s, d); \ LZ4_COPYSTEP(s, d); \ } while (0) #define LZ4_SECURECOPY LZ4_WILDCOPY #define HTYPE const u8* #ifdef __BIG_ENDIAN #define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) #else #define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) 
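/*
 * LZ4_NBCOMMONBYTES() is applied to the XOR of two words: counting the
 * trailing zero bits and shifting right by 3 turns "matching low-order
 * bits" into the number of initial bytes the words share on little-endian.
 */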
#endif #endif #define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ (d = s - get_unaligned_le16(p)) #define LZ4_WILDCOPY(s, d, e) \ do { \ LZ4_COPYPACKET(s, d); \ } while (d < e) #define LZ4_BLINDCOPY(s, d, l) \ do { \ u8 *e = (d) + l; \ LZ4_WILDCOPY(s, d, e); \ d = e; \ } while (0) xen-4.4.0/xen/common/stop_machine.c0000664000175000017500000001213112307313555015340 0ustar smbsmb/****************************************************************************** * common/stop_machine.c * * Facilities to put whole machine in a safe 'stop' state * * Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation * Copyright 2008 Kevin Tian , Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include #include #include enum stopmachine_state { STOPMACHINE_START, STOPMACHINE_PREPARE, STOPMACHINE_DISABLE_IRQ, STOPMACHINE_INVOKE, STOPMACHINE_EXIT }; struct stopmachine_data { unsigned int nr_cpus; enum stopmachine_state state; atomic_t done; unsigned int fn_cpu; int fn_result; int (*fn)(void *); void *fn_data; }; static DEFINE_PER_CPU(struct tasklet, stopmachine_tasklet); static struct stopmachine_data stopmachine_data; static DEFINE_SPINLOCK(stopmachine_lock); static void stopmachine_set_state(enum stopmachine_state state) { atomic_set(&stopmachine_data.done, 0); smp_wmb(); stopmachine_data.state = state; } static void stopmachine_wait_state(void) { while ( atomic_read(&stopmachine_data.done) != stopmachine_data.nr_cpus ) cpu_relax(); } int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { cpumask_t allbutself; unsigned int i, nr_cpus; int ret; BUG_ON(!local_irq_is_enabled()); /* cpu_online_map must not change. */ if ( !get_cpu_maps() ) return -EBUSY; cpumask_andnot(&allbutself, &cpu_online_map, cpumask_of(smp_processor_id())); nr_cpus = cpumask_weight(&allbutself); /* Must not spin here as the holder will expect us to be descheduled. 
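 * (the lock holder is waiting for every other CPU, including this one, to
 * run its stopmachine tasklet, which cannot happen while we busy-wait here,
 * so a plain spin_lock() could deadlock)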
*/ if ( !spin_trylock(&stopmachine_lock) ) { put_cpu_maps(); return -EBUSY; } stopmachine_data.fn = fn; stopmachine_data.fn_data = data; stopmachine_data.nr_cpus = nr_cpus; stopmachine_data.fn_cpu = cpu; atomic_set(&stopmachine_data.done, 0); stopmachine_data.state = STOPMACHINE_START; smp_wmb(); for_each_cpu ( i, &allbutself ) tasklet_schedule_on_cpu(&per_cpu(stopmachine_tasklet, i), i); stopmachine_set_state(STOPMACHINE_PREPARE); stopmachine_wait_state(); local_irq_disable(); stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); stopmachine_wait_state(); spin_debug_disable(); stopmachine_set_state(STOPMACHINE_INVOKE); if ( (cpu == smp_processor_id()) || (cpu == NR_CPUS) ) stopmachine_data.fn_result = (*fn)(data); stopmachine_wait_state(); ret = stopmachine_data.fn_result; spin_debug_enable(); stopmachine_set_state(STOPMACHINE_EXIT); stopmachine_wait_state(); local_irq_enable(); spin_unlock(&stopmachine_lock); put_cpu_maps(); return ret; } static void stopmachine_action(unsigned long cpu) { enum stopmachine_state state = STOPMACHINE_START; BUG_ON(cpu != smp_processor_id()); smp_mb(); while ( state != STOPMACHINE_EXIT ) { while ( stopmachine_data.state == state ) cpu_relax(); state = stopmachine_data.state; switch ( state ) { case STOPMACHINE_DISABLE_IRQ: local_irq_disable(); break; case STOPMACHINE_INVOKE: if ( (stopmachine_data.fn_cpu == smp_processor_id()) || (stopmachine_data.fn_cpu == NR_CPUS) ) stopmachine_data.fn_result = stopmachine_data.fn(stopmachine_data.fn_data); break; default: break; } smp_mb(); atomic_inc(&stopmachine_data.done); } local_irq_enable(); } static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; if ( action == CPU_UP_PREPARE ) tasklet_init(&per_cpu(stopmachine_tasklet, cpu), stopmachine_action, cpu); return NOTIFY_DONE; } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; static int __init cpu_stopmachine_init(void) { unsigned int cpu; for_each_online_cpu ( cpu ) { void *hcpu = (void *)(long)cpu; cpu_callback(&cpu_nfb, CPU_UP_PREPARE, hcpu); } register_cpu_notifier(&cpu_nfb); return 0; } __initcall(cpu_stopmachine_init); xen-4.4.0/xen/common/decompress.h0000664000175000017500000000074012307313555015043 0ustar smbsmb#ifdef __XEN__ #include #include #include #include #include #include #include #define STATIC #define INIT __init #define INITDATA __initdata #define malloc xmalloc_bytes #define free xfree #define large_malloc xmalloc_bytes #define large_free xfree #else #define STATIC static #define INIT #define INITDATA #define large_malloc malloc #define large_free free #endif xen-4.4.0/xen/common/sysctl.c0000664000175000017500000002540412307313555014217 0ustar smbsmb/****************************************************************************** * sysctl.c * * System management operations. For use by node control stack. 
* * Copyright (c) 2002-2006, K Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) u_sysctl) { long ret = 0; int copyback = -1; struct xen_sysctl curop, *op = &curop; static DEFINE_SPINLOCK(sysctl_lock); if ( copy_from_guest(op, u_sysctl, 1) ) return -EFAULT; if ( op->interface_version != XEN_SYSCTL_INTERFACE_VERSION ) return -EACCES; ret = xsm_sysctl(XSM_PRIV, op->cmd); if ( ret ) return ret; /* * Trylock here avoids deadlock with an existing sysctl critical section * which might (for some current or future reason) want to synchronise * with this vcpu. */ while ( !spin_trylock(&sysctl_lock) ) if ( hypercall_preempt_check() ) return hypercall_create_continuation( __HYPERVISOR_sysctl, "h", u_sysctl); switch ( op->cmd ) { case XEN_SYSCTL_readconsole: ret = xsm_readconsole(XSM_HOOK, op->u.readconsole.clear); if ( ret ) break; ret = read_console_ring(&op->u.readconsole); break; case XEN_SYSCTL_tbuf_op: ret = tb_control(&op->u.tbuf_op); break; case XEN_SYSCTL_sched_id: op->u.sched_id.sched_id = sched_id(); break; case XEN_SYSCTL_getdomaininfolist: { struct domain *d; struct xen_domctl_getdomaininfo info; u32 num_domains = 0; rcu_read_lock(&domlist_read_lock); for_each_domain ( d ) { if ( d->domain_id < op->u.getdomaininfolist.first_domain ) continue; if ( num_domains == op->u.getdomaininfolist.max_domains ) break; ret = xsm_getdomaininfo(XSM_HOOK, d); if ( ret ) continue; getdomaininfo(d, &info); if ( copy_to_guest_offset(op->u.getdomaininfolist.buffer, num_domains, &info, 1) ) { ret = -EFAULT; break; } num_domains++; } rcu_read_unlock(&domlist_read_lock); if ( ret != 0 ) break; op->u.getdomaininfolist.num_domains = num_domains; } break; #ifdef PERF_COUNTERS case XEN_SYSCTL_perfc_op: ret = perfc_control(&op->u.perfc_op); break; #endif #ifdef LOCK_PROFILE case XEN_SYSCTL_lockprof_op: ret = spinlock_profile_control(&op->u.lockprof_op); break; #endif case XEN_SYSCTL_debug_keys: { char c; uint32_t i; ret = -EFAULT; for ( i = 0; i < op->u.debug_keys.nr_keys; i++ ) { if ( copy_from_guest_offset(&c, op->u.debug_keys.keys, i, 1) ) goto out; handle_keypress(c, guest_cpu_user_regs()); } ret = 0; copyback = 0; } break; case XEN_SYSCTL_getcpuinfo: { uint32_t i, nr_cpus; struct xen_sysctl_cpuinfo cpuinfo; nr_cpus = min(op->u.getcpuinfo.max_cpus, nr_cpu_ids); ret = -EFAULT; for ( i = 0; i < nr_cpus; i++ ) { cpuinfo.idletime = get_cpu_idle_time(i); if ( copy_to_guest_offset(op->u.getcpuinfo.info, i, &cpuinfo, 1) ) goto out; } op->u.getcpuinfo.nr_cpus = i; ret = 0; } break; case XEN_SYSCTL_availheap: op->u.availheap.avail_bytes = avail_domheap_pages_region( op->u.availheap.node, op->u.availheap.min_bitwidth, op->u.availheap.max_bitwidth); op->u.availheap.avail_bytes <<= PAGE_SHIFT; break; #ifdef HAS_ACPI case XEN_SYSCTL_get_pmstat: ret = do_get_pm_info(&op->u.get_pmstat); break; case XEN_SYSCTL_pm_op: ret = do_pm_op(&op->u.pm_op); if ( ret == -EAGAIN ) copyback = 1; break; #endif case XEN_SYSCTL_page_offline_op: { uint32_t *status, *ptr; unsigned long pfn; ret = xsm_page_offline(XSM_HOOK, op->u.page_offline.cmd); if ( ret ) break; ptr = status = xmalloc_bytes( sizeof(uint32_t) * (op->u.page_offline.end - op->u.page_offline.start + 1)); if ( !status ) { dprintk(XENLOG_WARNING, "Out of memory for page offline op\n"); ret = -ENOMEM; break; } memset(status, PG_OFFLINE_INVALID, sizeof(uint32_t) * (op->u.page_offline.end 
- op->u.page_offline.start + 1)); for ( pfn = op->u.page_offline.start; pfn <= op->u.page_offline.end; pfn ++ ) { switch ( op->u.page_offline.cmd ) { /* Shall revert her if failed, or leave caller do it? */ case sysctl_page_offline: ret = offline_page(pfn, 0, ptr++); break; case sysctl_page_online: ret = online_page(pfn, ptr++); break; case sysctl_query_page_offline: ret = query_page_offline(pfn, ptr++); break; default: ret = -EINVAL; break; } if (ret) break; } if ( copy_to_guest( op->u.page_offline.status, status, op->u.page_offline.end - op->u.page_offline.start + 1) ) ret = -EFAULT; xfree(status); copyback = 0; } break; case XEN_SYSCTL_cpupool_op: ret = cpupool_do_sysctl(&op->u.cpupool_op); break; case XEN_SYSCTL_scheduler_op: ret = sched_adjust_global(&op->u.scheduler_op); break; case XEN_SYSCTL_physinfo: { xen_sysctl_physinfo_t *pi = &op->u.physinfo; memset(pi, 0, sizeof(*pi)); pi->threads_per_core = cpumask_weight(per_cpu(cpu_sibling_mask, 0)); pi->cores_per_socket = cpumask_weight(per_cpu(cpu_core_mask, 0)) / pi->threads_per_core; pi->nr_cpus = num_online_cpus(); pi->nr_nodes = num_online_nodes(); pi->max_node_id = MAX_NUMNODES-1; pi->max_cpu_id = nr_cpu_ids - 1; pi->total_pages = total_pages; /* Protected by lock */ get_outstanding_claims(&pi->free_pages, &pi->outstanding_pages); pi->scrub_pages = 0; pi->cpu_khz = cpu_khz; arch_do_physinfo(pi); if ( copy_to_guest(u_sysctl, op, 1) ) ret = -EFAULT; } break; case XEN_SYSCTL_numainfo: { uint32_t i, j, max_node_index, last_online_node; xen_sysctl_numainfo_t *ni = &op->u.numainfo; last_online_node = last_node(node_online_map); max_node_index = min_t(uint32_t, ni->max_node_index, last_online_node); ni->max_node_index = last_online_node; for ( i = 0; i <= max_node_index; i++ ) { if ( !guest_handle_is_null(ni->node_to_memsize) ) { uint64_t memsize = node_online(i) ? node_spanned_pages(i) << PAGE_SHIFT : 0ul; if ( copy_to_guest_offset(ni->node_to_memsize, i, &memsize, 1) ) break; } if ( !guest_handle_is_null(ni->node_to_memfree) ) { uint64_t memfree = node_online(i) ? avail_node_heap_pages(i) << PAGE_SHIFT : 0ul; if ( copy_to_guest_offset(ni->node_to_memfree, i, &memfree, 1) ) break; } if ( !guest_handle_is_null(ni->node_to_node_distance) ) { for ( j = 0; j <= max_node_index; j++) { uint32_t distance = ~0u; if ( node_online(i) && node_online(j) ) distance = __node_distance(i, j); if ( copy_to_guest_offset( ni->node_to_node_distance, i*(max_node_index+1) + j, &distance, 1) ) break; } if ( j <= max_node_index ) break; } } ret = ((i <= max_node_index) || copy_to_guest(u_sysctl, op, 1)) ? -EFAULT : 0; } break; case XEN_SYSCTL_topologyinfo: { uint32_t i, max_cpu_index, last_online_cpu; xen_sysctl_topologyinfo_t *ti = &op->u.topologyinfo; last_online_cpu = cpumask_last(&cpu_online_map); max_cpu_index = min_t(uint32_t, ti->max_cpu_index, last_online_cpu); ti->max_cpu_index = last_online_cpu; for ( i = 0; i <= max_cpu_index; i++ ) { if ( !guest_handle_is_null(ti->cpu_to_core) ) { uint32_t core = cpu_online(i) ? cpu_to_core(i) : ~0u; if ( copy_to_guest_offset(ti->cpu_to_core, i, &core, 1) ) break; } if ( !guest_handle_is_null(ti->cpu_to_socket) ) { uint32_t socket = cpu_online(i) ? cpu_to_socket(i) : ~0u; if ( copy_to_guest_offset(ti->cpu_to_socket, i, &socket, 1) ) break; } if ( !guest_handle_is_null(ti->cpu_to_node) ) { uint32_t node = cpu_online(i) ? cpu_to_node(i) : ~0u; if ( copy_to_guest_offset(ti->cpu_to_node, i, &node, 1) ) break; } } ret = ((i <= max_cpu_index) || copy_to_guest(u_sysctl, op, 1)) ? 
-EFAULT : 0; } break; #ifdef TEST_COVERAGE case XEN_SYSCTL_coverage_op: ret = sysctl_coverage_op(&op->u.coverage_op); break; #endif default: ret = arch_do_sysctl(op, u_sysctl); copyback = 0; break; } out: spin_unlock(&sysctl_lock); if ( copyback && (!ret || copyback > 0) && __copy_to_guest(u_sysctl, op, 1) ) ret = -EFAULT; return ret; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/xmalloc_tlsf.c0000664000175000017500000004141512307313555015365 0ustar smbsmb/* * Two Levels Segregate Fit memory allocator (TLSF) * Version 2.3.2 * * Written by Miguel Masmano Tello * * Thanks to Ismael Ripoll for his suggestions and reviews * * Copyright (C) 2007, 2006, 2005, 2004 * * This code is released using a dual license strategy: GPL/LGPL * You can choose the licence that better fits your requirements. * * Released under the terms of the GNU General Public License Version 2.0 * Released under the terms of the GNU Lesser General Public License * Version 2.1 * * This is kernel port of TLSF allocator. * Original code can be found at: http://rtportal.upv.es/rtmalloc/ * Adapted for Linux by Nitin Gupta (nitingupta910@gmail.com) * (http://code.google.com/p/compcache/source/browse/trunk/sub-projects * /allocators/tlsf-kmod r229 dated Aug 27, 2008 * Adapted for Xen by Dan Magenheimer (dan.magenheimer@oracle.com) */ #include #include #include #include #include #define MAX_POOL_NAME_LEN 16 /* Some IMPORTANT TLSF parameters */ #define MEM_ALIGN (sizeof(void *) * 2) #define MEM_ALIGN_MASK (~(MEM_ALIGN - 1)) #define MAX_FLI (30) #define MAX_LOG2_SLI (5) #define MAX_SLI (1 << MAX_LOG2_SLI) #define FLI_OFFSET (6) /* tlsf structure just will manage blocks bigger than 128 bytes */ #define SMALL_BLOCK (128) #define REAL_FLI (MAX_FLI - FLI_OFFSET) #define MIN_BLOCK_SIZE (sizeof(struct free_ptr)) #define BHDR_OVERHEAD (sizeof(struct bhdr) - MIN_BLOCK_SIZE) #define PTR_MASK (sizeof(void *) - 1) #define BLOCK_SIZE_MASK (0xFFFFFFFF - PTR_MASK) #define GET_NEXT_BLOCK(addr, r) ((struct bhdr *) \ ((char *)(addr) + (r))) #define ROUNDUP_SIZE(r) (((r) + MEM_ALIGN - 1) & MEM_ALIGN_MASK) #define ROUNDDOWN_SIZE(r) ((r) & MEM_ALIGN_MASK) #define ROUNDUP_PAGE(r) (((r) + PAGE_SIZE - 1) & PAGE_MASK) #define BLOCK_STATE (0x1) #define PREV_STATE (0x2) /* bit 0 of the block size */ #define FREE_BLOCK (0x1) #define USED_BLOCK (0x0) /* bit 1 of the block size */ #define PREV_FREE (0x2) #define PREV_USED (0x0) static spinlock_t pool_list_lock; static struct list_head pool_list_head; struct free_ptr { struct bhdr *prev; struct bhdr *next; }; struct bhdr { /* All blocks in a region are linked in order of physical address */ struct bhdr *prev_hdr; /* * The size is stored in bytes * bit 0: block is free, if set * bit 1: previous block is free, if set */ u32 size; /* Free blocks in individual freelists are linked */ union { struct free_ptr free_ptr; u8 buffer[sizeof(struct free_ptr)]; } ptr; }; struct xmem_pool { /* First level bitmap (REAL_FLI bits) */ u32 fl_bitmap; /* Second level bitmap */ u32 sl_bitmap[REAL_FLI]; /* Free lists */ struct bhdr *matrix[REAL_FLI][MAX_SLI]; spinlock_t lock; unsigned long init_size; unsigned long max_size; unsigned long grow_size; /* Basic stats */ unsigned long used_size; unsigned long num_regions; /* User provided functions for expanding/shrinking pool */ xmem_pool_get_memory *get_mem; xmem_pool_put_memory *put_mem; struct list_head list; void *init_region; char name[MAX_POOL_NAME_LEN]; }; /* * Helping functions */ 
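/*
 * Illustrative sketch only (not part of the original allocator, and not
 * referenced by it): how a request size is mapped to a (first level,
 * second level) index pair by the helpers below.  For a 1000-byte request:
 *   first level : fls(1000) - 1 = 9 (the 512..1023 byte range),
 *                 rebased by FLI_OFFSET to fl = 3;
 *   second level: sl = (1000 >> (9 - MAX_LOG2_SLI)) - MAX_SLI
 *                    = 62 - 32 = 30.
 */
static inline void tlsf_mapping_sketch(unsigned long r, int *fl, int *sl)
{
    /*
     * Large-block branch only (r >= SMALL_BLOCK), mirroring MAPPING_INSERT();
     * sizes below SMALL_BLOCK use fl = 0 with a linear second-level mapping.
     */
    *fl = fls(r) - 1;                            /* power-of-two size range      */
    *sl = (r >> (*fl - MAX_LOG2_SLI)) - MAX_SLI; /* subdivision within the range */
    *fl -= FLI_OFFSET;                           /* rebase so fl 0 stays free    */
}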
/** * Returns indexes (fl, sl) of the list used to serve request of size r */ static inline void MAPPING_SEARCH(unsigned long *r, int *fl, int *sl) { int t; if ( *r < SMALL_BLOCK ) { *fl = 0; *sl = *r / (SMALL_BLOCK / MAX_SLI); } else { t = (1 << (fls(*r) - 1 - MAX_LOG2_SLI)) - 1; *r = *r + t; *fl = fls(*r) - 1; *sl = (*r >> (*fl - MAX_LOG2_SLI)) - MAX_SLI; *fl -= FLI_OFFSET; /*if ((*fl -= FLI_OFFSET) < 0) // FL will be always >0! *fl = *sl = 0; */ *r &= ~t; } } /** * Returns indexes (fl, sl) which is used as starting point to search * for a block of size r. It also rounds up requested size(r) to the * next list. */ static inline void MAPPING_INSERT(unsigned long r, int *fl, int *sl) { if ( r < SMALL_BLOCK ) { *fl = 0; *sl = r / (SMALL_BLOCK / MAX_SLI); } else { *fl = fls(r) - 1; *sl = (r >> (*fl - MAX_LOG2_SLI)) - MAX_SLI; *fl -= FLI_OFFSET; } } /** * Returns first block from a list that hold blocks larger than or * equal to the one pointed by the indexes (fl, sl) */ static inline struct bhdr *FIND_SUITABLE_BLOCK(struct xmem_pool *p, int *fl, int *sl) { u32 tmp = p->sl_bitmap[*fl] & (~0 << *sl); struct bhdr *b = NULL; if ( tmp ) { *sl = ffs(tmp) - 1; b = p->matrix[*fl][*sl]; } else { *fl = ffs(p->fl_bitmap & (~0 << (*fl + 1))) - 1; if ( likely(*fl > 0) ) { *sl = ffs(p->sl_bitmap[*fl]) - 1; b = p->matrix[*fl][*sl]; } } return b; } /** * Remove first free block(b) from free list with indexes (fl, sl). */ static inline void EXTRACT_BLOCK_HDR(struct bhdr *b, struct xmem_pool *p, int fl, int sl) { p->matrix[fl][sl] = b->ptr.free_ptr.next; if ( p->matrix[fl][sl] ) { p->matrix[fl][sl]->ptr.free_ptr.prev = NULL; } else { clear_bit(sl, &p->sl_bitmap[fl]); if ( !p->sl_bitmap[fl] ) clear_bit(fl, &p->fl_bitmap); } b->ptr.free_ptr = (struct free_ptr) {NULL, NULL}; } /** * Removes block(b) from free list with indexes (fl, sl) */ static inline void EXTRACT_BLOCK(struct bhdr *b, struct xmem_pool *p, int fl, int sl) { if ( b->ptr.free_ptr.next ) b->ptr.free_ptr.next->ptr.free_ptr.prev = b->ptr.free_ptr.prev; if ( b->ptr.free_ptr.prev ) b->ptr.free_ptr.prev->ptr.free_ptr.next = b->ptr.free_ptr.next; if ( p->matrix[fl][sl] == b ) { p->matrix[fl][sl] = b->ptr.free_ptr.next; if ( !p->matrix[fl][sl] ) { clear_bit(sl, &p->sl_bitmap[fl]); if ( !p->sl_bitmap[fl] ) clear_bit (fl, &p->fl_bitmap); } } b->ptr.free_ptr = (struct free_ptr) {NULL, NULL}; } /** * Insert block(b) in free list with indexes (fl, sl) */ static inline void INSERT_BLOCK(struct bhdr *b, struct xmem_pool *p, int fl, int sl) { b->ptr.free_ptr = (struct free_ptr) {NULL, p->matrix[fl][sl]}; if ( p->matrix[fl][sl] ) p->matrix[fl][sl]->ptr.free_ptr.prev = b; p->matrix[fl][sl] = b; set_bit(sl, &p->sl_bitmap[fl]); set_bit(fl, &p->fl_bitmap); } /** * Region is a virtually contiguous memory region and Pool is * collection of such regions */ static inline void ADD_REGION(void *region, unsigned long region_size, struct xmem_pool *pool) { int fl, sl; struct bhdr *b, *lb; b = (struct bhdr *)(region); b->prev_hdr = NULL; b->size = ROUNDDOWN_SIZE(region_size - 2 * BHDR_OVERHEAD) | FREE_BLOCK | PREV_USED; MAPPING_INSERT(b->size & BLOCK_SIZE_MASK, &fl, &sl); INSERT_BLOCK(b, pool, fl, sl); /* The sentinel block: allows us to know when we're in the last block */ lb = GET_NEXT_BLOCK(b->ptr.buffer, b->size & BLOCK_SIZE_MASK); lb->prev_hdr = b; lb->size = 0 | USED_BLOCK | PREV_FREE; pool->used_size += BHDR_OVERHEAD; /* only sentinel block is "used" */ pool->num_regions++; } /* * TLSF pool-based allocator start. 
*/ struct xmem_pool *xmem_pool_create( const char *name, xmem_pool_get_memory get_mem, xmem_pool_put_memory put_mem, unsigned long init_size, unsigned long max_size, unsigned long grow_size) { struct xmem_pool *pool; int pool_bytes, pool_order; BUG_ON(max_size && (max_size < init_size)); pool_bytes = ROUNDUP_SIZE(sizeof(*pool)); pool_order = get_order_from_bytes(pool_bytes); pool = (void *)alloc_xenheap_pages(pool_order, 0); if ( pool == NULL ) return NULL; memset(pool, 0, pool_bytes); /* Round to next page boundary */ init_size = ROUNDUP_PAGE(init_size); max_size = ROUNDUP_PAGE(max_size); grow_size = ROUNDUP_PAGE(grow_size); /* pool global overhead not included in used size */ pool->used_size = 0; pool->init_size = init_size; pool->max_size = max_size; pool->grow_size = grow_size; pool->get_mem = get_mem; pool->put_mem = put_mem; strlcpy(pool->name, name, sizeof(pool->name)); /* always obtain init_region lazily now to ensure it is get_mem'd * in the same "context" as all other regions */ spin_lock_init(&pool->lock); spin_lock(&pool_list_lock); list_add_tail(&pool->list, &pool_list_head); spin_unlock(&pool_list_lock); return pool; } unsigned long xmem_pool_get_used_size(struct xmem_pool *pool) { return pool->used_size; } unsigned long xmem_pool_get_total_size(struct xmem_pool *pool) { unsigned long total; total = ROUNDUP_SIZE(sizeof(*pool)) + pool->init_size + (pool->num_regions - 1) * pool->grow_size; return total; } void xmem_pool_destroy(struct xmem_pool *pool) { int pool_bytes, pool_order; if ( pool == NULL ) return; /* User is destroying without ever allocating from this pool */ if ( xmem_pool_get_used_size(pool) == BHDR_OVERHEAD ) { ASSERT(!pool->init_region); pool->used_size -= BHDR_OVERHEAD; } /* Check for memory leaks in this pool */ if ( xmem_pool_get_used_size(pool) ) printk("memory leak in pool: %s (%p). " "%lu bytes still in use.\n", pool->name, pool, xmem_pool_get_used_size(pool)); spin_lock(&pool_list_lock); list_del_init(&pool->list); spin_unlock(&pool_list_lock); pool_bytes = ROUNDUP_SIZE(sizeof(*pool)); pool_order = get_order_from_bytes(pool_bytes); free_xenheap_pages(pool,pool_order); } void *xmem_pool_alloc(unsigned long size, struct xmem_pool *pool) { struct bhdr *b, *b2, *next_b, *region; int fl, sl; unsigned long tmp_size; if ( pool->init_region == NULL ) { if ( (region = pool->get_mem(pool->init_size)) == NULL ) goto out; ADD_REGION(region, pool->init_size, pool); pool->init_region = region; } size = (size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : ROUNDUP_SIZE(size); /* Rounding up the requested size and calculating fl and sl */ spin_lock(&pool->lock); retry_find: MAPPING_SEARCH(&size, &fl, &sl); /* Searching a free block */ if ( !(b = FIND_SUITABLE_BLOCK(pool, &fl, &sl)) ) { /* Not found */ if ( size > (pool->grow_size - 2 * BHDR_OVERHEAD) ) goto out_locked; if ( pool->max_size && (pool->init_size + pool->num_regions * pool->grow_size > pool->max_size) ) goto out_locked; spin_unlock(&pool->lock); if ( (region = pool->get_mem(pool->grow_size)) == NULL ) goto out; spin_lock(&pool->lock); ADD_REGION(region, pool->grow_size, pool); goto retry_find; } EXTRACT_BLOCK_HDR(b, pool, fl, sl); /*-- found: */ next_b = GET_NEXT_BLOCK(b->ptr.buffer, b->size & BLOCK_SIZE_MASK); /* Should the block be split? 
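 * (only if the remainder is large enough to hold a block header plus a
 * minimum-sized free block, i.e. at least sizeof(struct bhdr) bytes)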
*/ tmp_size = (b->size & BLOCK_SIZE_MASK) - size; if ( tmp_size >= sizeof(struct bhdr) ) { tmp_size -= BHDR_OVERHEAD; b2 = GET_NEXT_BLOCK(b->ptr.buffer, size); b2->size = tmp_size | FREE_BLOCK | PREV_USED; b2->prev_hdr = b; next_b->prev_hdr = b2; MAPPING_INSERT(tmp_size, &fl, &sl); INSERT_BLOCK(b2, pool, fl, sl); b->size = size | (b->size & PREV_STATE); } else { next_b->size &= (~PREV_FREE); b->size &= (~FREE_BLOCK); /* Now it's used */ } pool->used_size += (b->size & BLOCK_SIZE_MASK) + BHDR_OVERHEAD; spin_unlock(&pool->lock); return (void *)b->ptr.buffer; /* Failed alloc */ out_locked: spin_unlock(&pool->lock); out: return NULL; } void xmem_pool_free(void *ptr, struct xmem_pool *pool) { struct bhdr *b, *tmp_b; int fl = 0, sl = 0; if ( unlikely(ptr == NULL) ) return; b = (struct bhdr *)((char *) ptr - BHDR_OVERHEAD); spin_lock(&pool->lock); b->size |= FREE_BLOCK; pool->used_size -= (b->size & BLOCK_SIZE_MASK) + BHDR_OVERHEAD; b->ptr.free_ptr = (struct free_ptr) { NULL, NULL}; tmp_b = GET_NEXT_BLOCK(b->ptr.buffer, b->size & BLOCK_SIZE_MASK); if ( tmp_b->size & FREE_BLOCK ) { MAPPING_INSERT(tmp_b->size & BLOCK_SIZE_MASK, &fl, &sl); EXTRACT_BLOCK(tmp_b, pool, fl, sl); b->size += (tmp_b->size & BLOCK_SIZE_MASK) + BHDR_OVERHEAD; } if ( b->size & PREV_FREE ) { tmp_b = b->prev_hdr; MAPPING_INSERT(tmp_b->size & BLOCK_SIZE_MASK, &fl, &sl); EXTRACT_BLOCK(tmp_b, pool, fl, sl); tmp_b->size += (b->size & BLOCK_SIZE_MASK) + BHDR_OVERHEAD; b = tmp_b; } tmp_b = GET_NEXT_BLOCK(b->ptr.buffer, b->size & BLOCK_SIZE_MASK); tmp_b->prev_hdr = b; MAPPING_INSERT(b->size & BLOCK_SIZE_MASK, &fl, &sl); if ( (b->prev_hdr == NULL) && ((tmp_b->size & BLOCK_SIZE_MASK) == 0) ) { pool->put_mem(b); pool->num_regions--; pool->used_size -= BHDR_OVERHEAD; /* sentinel block header */ goto out; } INSERT_BLOCK(b, pool, fl, sl); tmp_b->size |= PREV_FREE; tmp_b->prev_hdr = b; out: spin_unlock(&pool->lock); } int xmem_pool_maxalloc(struct xmem_pool *pool) { return pool->grow_size - (2 * BHDR_OVERHEAD); } /* * Glue for xmalloc(). */ static struct xmem_pool *xenpool; static void *xmalloc_pool_get(unsigned long size) { ASSERT(size == PAGE_SIZE); return alloc_xenheap_page(); } static void xmalloc_pool_put(void *p) { free_xenheap_page(p); } static void *xmalloc_whole_pages(unsigned long size, unsigned long align) { unsigned int i, order = get_order_from_bytes(size); void *res, *p; if ( align > size ) get_order_from_bytes(align); res = alloc_xenheap_pages(order, 0); if ( res == NULL ) return NULL; for ( p = res + PAGE_ALIGN(size), i = 0; i < order; ++i ) if ( (unsigned long)p & (PAGE_SIZE << i) ) { free_xenheap_pages(p, i); p += PAGE_SIZE << i; } PFN_ORDER(virt_to_page(res)) = PFN_UP(size); /* Check that there was no truncation: */ ASSERT(PFN_ORDER(virt_to_page(res)) == PFN_UP(size)); return res; } static void tlsf_init(void) { INIT_LIST_HEAD(&pool_list_head); spin_lock_init(&pool_list_lock); xenpool = xmem_pool_create( "xmalloc", xmalloc_pool_get, xmalloc_pool_put, PAGE_SIZE, 0, PAGE_SIZE); BUG_ON(!xenpool); } /* * xmalloc() */ #ifndef ZERO_BLOCK_PTR /* Return value for zero-size allocation, distinguished from NULL. 
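 *
 * The _xmalloc()/xfree() pair below also hides alignment padding behind a
 * dummy block header.  A sketch with illustrative numbers: for align = 64,
 * if the pool hands back p with p % 64 == 16, then pad = 48, the caller gets
 * q = p + 48, and a header at q - BHDR_OVERHEAD records 48 | 1; xfree()
 * spots the odd size, steps back those 48 bytes and frees the original
 * block.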
*/ #define ZERO_BLOCK_PTR ((void *)-1L) #endif void *_xmalloc(unsigned long size, unsigned long align) { void *p = NULL; u32 pad; ASSERT(!in_irq()); if ( !size ) return ZERO_BLOCK_PTR; ASSERT((align & (align - 1)) == 0); if ( align < MEM_ALIGN ) align = MEM_ALIGN; size += align - MEM_ALIGN; if ( !xenpool ) tlsf_init(); if ( size < PAGE_SIZE ) p = xmem_pool_alloc(size, xenpool); if ( p == NULL ) return xmalloc_whole_pages(size - align + MEM_ALIGN, align); /* Add alignment padding. */ if ( (pad = -(long)p & (align - 1)) != 0 ) { char *q = (char *)p + pad; struct bhdr *b = (struct bhdr *)(q - BHDR_OVERHEAD); ASSERT(q > (char *)p); b->size = pad | 1; p = q; } ASSERT(((unsigned long)p & (align - 1)) == 0); return p; } void *_xzalloc(unsigned long size, unsigned long align) { void *p = _xmalloc(size, align); return p ? memset(p, 0, size) : p; } void xfree(void *p) { struct bhdr *b; if ( p == NULL || p == ZERO_BLOCK_PTR ) return; ASSERT(!in_irq()); if ( !((unsigned long)p & (PAGE_SIZE - 1)) ) { unsigned long size = PFN_ORDER(virt_to_page(p)); unsigned int i, order = get_order_from_pages(size); BUG_ON((unsigned long)p & ((PAGE_SIZE << order) - 1)); PFN_ORDER(virt_to_page(p)) = 0; for ( i = 0; ; ++i ) { if ( !(size & (1 << i)) ) continue; size -= 1 << i; free_xenheap_pages(p + (size << PAGE_SHIFT), i); if ( i + 1 >= order ) return; } } /* Strip alignment padding. */ b = (struct bhdr *)((char *) p - BHDR_OVERHEAD); if ( b->size & 1 ) { p = (char *)p - (b->size & ~1u); b = (struct bhdr *)((char *)p - BHDR_OVERHEAD); ASSERT(!(b->size & 1)); } xmem_pool_free(p, xenpool); } xen-4.4.0/xen/common/bunzip2.c0000664000175000017500000005572412307313555014277 0ustar smbsmb/* vi: set sw = 4 ts = 4: */ /* Small bzip2 deflate implementation, by Rob Landley (rob@landley.net). Based on bzip2 decompression code by Julian R Seward (jseward@acm.org), which also acknowledges contributions by Mike Burrows, David Wheeler, Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten, Robert Sedgewick, and Jon L. Bentley. This code is licensed under the LGPLv2: LGPL (http://www.gnu.org/copyleft/lgpl.html */ /* Size and speed optimizations by Manuel Novoa III (mjn3@codepoet.org). More efficient reading of Huffman codes, a streamlined read_bunzip() function, and various other tweaks. In (limited) tests, approximately 20% faster than bzcat on x86 and about 10% faster on arm. Note that about 2/3 of the time is spent in read_unzip() reversing the Burrows-Wheeler transformation. Much of that time is delay resulting from cache misses. I would ask that anyone benefiting from this work, especially those using it in commercial products, consider making a donation to my local non-profit hospice organization in the name of the woman I loved, who passed away Feb. 12, 2003. In memory of Toni W. Hagan Hospice of Acadiana, Inc. 
2600 Johnston St., Suite 200 Lafayette, LA 70503-3240 Phone (337) 232-1234 or 1-800-738-2226 Fax (337) 232-1297 http://www.hospiceacadiana.com/ Manuel */ /* Made it fit for running in Linux Kernel by Alain Knaff (alain@knaff.lu) */ #include "decompress.h" #ifndef INT_MAX #define INT_MAX 0x7fffffff #endif /* Constants for Huffman coding */ #define MAX_GROUPS 6 #define GROUP_SIZE 50 /* 64 would have been more efficient */ #define MAX_HUFCODE_BITS 20 /* Longest Huffman code allowed */ #define MAX_SYMBOLS 258 /* 256 literals + RUNA + RUNB */ #define SYMBOL_RUNA 0 #define SYMBOL_RUNB 1 /* Status return values */ #define RETVAL_OK 0 #define RETVAL_LAST_BLOCK (-1) #define RETVAL_NOT_BZIP_DATA (-2) #define RETVAL_UNEXPECTED_INPUT_EOF (-3) #define RETVAL_UNEXPECTED_OUTPUT_EOF (-4) #define RETVAL_DATA_ERROR (-5) #define RETVAL_OUT_OF_MEMORY (-6) #define RETVAL_OBSOLETE_INPUT (-7) /* Other housekeeping constants */ #define BZIP2_IOBUF_SIZE 4096 /* This is what we know about each Huffman coding group */ struct group_data { /* We have an extra slot at the end of limit[] for a sentinal value. */ int limit[MAX_HUFCODE_BITS+1]; int base[MAX_HUFCODE_BITS]; int permute[MAX_SYMBOLS]; int minLen, maxLen; }; /* Structure holding all the housekeeping data, including IO buffers and memory that persists between calls to bunzip */ struct bunzip_data { /* State for interrupting output loop */ int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent; /* I/O tracking data (file handles, buffers, positions, etc.) */ int (*fill)(void*, unsigned int); int inbufCount, inbufPos /*, outbufPos*/; unsigned char *inbuf /*,*outbuf*/; unsigned int inbufBitCount, inbufBits; /* The CRC values stored in the block header and calculated from the data */ unsigned int crc32Table[256], headerCRC, totalCRC, writeCRC; /* Intermediate buffer and its size (in bytes) */ unsigned int *dbuf, dbufSize; /* These things are a bit too big to go on the stack */ unsigned char selectors[32768]; /* nSelectors = 15 bits */ struct group_data groups[MAX_GROUPS]; /* Huffman coding tables */ int io_error; /* non-zero if we have IO error */ }; /* Return the next nnn bits of input. All reads from the compressed input are done through this function. All reads are big endian */ static unsigned int INIT get_bits(struct bunzip_data *bd, char bits_wanted) { unsigned int bits = 0; /* If we need to get more data from the byte buffer, do so. (Loop getting one byte at a time to enforce endianness and avoid unaligned access.) */ while (bd->inbufBitCount < bits_wanted) { /* If we need to read more data from file into byte buffer, do so */ if (bd->inbufPos == bd->inbufCount) { if (bd->io_error) return 0; bd->inbufCount = bd->fill(bd->inbuf, BZIP2_IOBUF_SIZE); if (bd->inbufCount <= 0) { bd->io_error = RETVAL_UNEXPECTED_INPUT_EOF; return 0; } bd->inbufPos = 0; } /* Avoid 32-bit overflow (dump bit buffer to top of output) */ if (bd->inbufBitCount >= 24) { bits = bd->inbufBits&((1 << bd->inbufBitCount)-1); bits_wanted -= bd->inbufBitCount; bits <<= bits_wanted; bd->inbufBitCount = 0; } /* Grab next 8 bits of input from buffer. */ bd->inbufBits = (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++]; bd->inbufBitCount += 8; } /* Calculate result */ bd->inbufBitCount -= bits_wanted; bits |= (bd->inbufBits >> bd->inbufBitCount)&((1 << bits_wanted)-1); return bits; } /* Unpacks the next block and sets up for the inverse burrows-wheeler step. 
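 *
 * (The 48-bit block magics tested below are BCD renderings of pi and
 * sqrt(pi): 0x314159265359 starts an ordinary data block, while
 * 0x177245385090 marks the end-of-stream block, whose "CRC" field is the
 * combined CRC of the whole file.)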
*/ static int INIT get_next_block(struct bunzip_data *bd) { struct group_data *hufGroup = NULL; int *base = NULL; int *limit = NULL; int dbufCount, nextSym, dbufSize, groupCount, selector, i, j, k, t, runPos, symCount, symTotal, nSelectors, byteCount[256]; unsigned char uc, symToByte[256], mtfSymbol[256], *selectors; unsigned int *dbuf, origPtr; dbuf = bd->dbuf; dbufSize = bd->dbufSize; selectors = bd->selectors; /* Read in header signature and CRC, then validate signature. (last block signature means CRC is for whole file, return now) */ i = get_bits(bd, 24); j = get_bits(bd, 24); bd->headerCRC = get_bits(bd, 32); if ((i == 0x177245) && (j == 0x385090)) return RETVAL_LAST_BLOCK; if ((i != 0x314159) || (j != 0x265359)) return RETVAL_NOT_BZIP_DATA; /* We can add support for blockRandomised if anybody complains. There was some code for this in busybox 1.0.0-pre3, but nobody ever noticed that it didn't actually work. */ if (get_bits(bd, 1)) return RETVAL_OBSOLETE_INPUT; origPtr = get_bits(bd, 24); if (origPtr > dbufSize) return RETVAL_DATA_ERROR; /* mapping table: if some byte values are never used (encoding things like ascii text), the compression code removes the gaps to have fewer symbols to deal with, and writes a sparse bitfield indicating which values were present. We make a translation table to convert the symbols back to the corresponding bytes. */ t = get_bits(bd, 16); symTotal = 0; for (i = 0; i < 16; i++) { if (t&(1 << (15-i))) { k = get_bits(bd, 16); for (j = 0; j < 16; j++) if (k&(1 << (15-j))) symToByte[symTotal++] = (16*i)+j; } } /* How many different Huffman coding groups does this block use? */ groupCount = get_bits(bd, 3); if (groupCount < 2 || groupCount > MAX_GROUPS) return RETVAL_DATA_ERROR; /* nSelectors: Every GROUP_SIZE many symbols we select a new Huffman coding group. Read in the group selector list, which is stored as MTF encoded bit runs. (MTF = Move To Front, as each value is used it's moved to the start of the list.) */ nSelectors = get_bits(bd, 15); if (!nSelectors) return RETVAL_DATA_ERROR; for (i = 0; i < groupCount; i++) mtfSymbol[i] = i; for (i = 0; i < nSelectors; i++) { /* Get next value */ for (j = 0; get_bits(bd, 1); j++) if (j >= groupCount) return RETVAL_DATA_ERROR; /* Decode MTF to get the next selector */ uc = mtfSymbol[j]; for (; j; j--) mtfSymbol[j] = mtfSymbol[j-1]; mtfSymbol[0] = selectors[i] = uc; } /* Read the Huffman coding tables for each group, which code for symTotal literal symbols, plus two run symbols (RUNA, RUNB) */ symCount = symTotal+2; for (j = 0; j < groupCount; j++) { unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1]; int minLen, maxLen, pp; /* Read Huffman code lengths for each symbol. They're stored in a way similar to mtf; record a starting value for the first symbol, and an offset from the previous value for everys symbol after that. (Subtracting 1 before the loop and then adding it back at the end is an optimization that makes the test inside the loop simpler: symbol length 0 becomes negative, so an unsigned inequality catches it.) */ t = get_bits(bd, 5)-1; for (i = 0; i < symCount; i++) { for (;;) { if (((unsigned)t) > (MAX_HUFCODE_BITS-1)) return RETVAL_DATA_ERROR; /* If first bit is 0, stop. Else second bit indicates whether to increment or decrement the value. Optimization: grab 2 bits and unget the second if the first was 0. */ k = get_bits(bd, 2); if (k < 2) { bd->inbufBitCount++; break; } /* Add one if second bit 1, else * subtract 1. 
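 * (For example, k == 2 gives t += ((3) & 2) - 1 = +1, while k == 3 gives
 * t += ((4) & 2) - 1 = -1.)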
Avoids if/else */ t += (((k+1)&2)-1); } /* Correct for the initial -1, to get the * final symbol length */ length[i] = t+1; } /* Find largest and smallest lengths in this group */ minLen = maxLen = length[0]; for (i = 1; i < symCount; i++) { if (length[i] > maxLen) maxLen = length[i]; else if (length[i] < minLen) minLen = length[i]; } /* Calculate permute[], base[], and limit[] tables from * length[]. * * permute[] is the lookup table for converting * Huffman coded symbols into decoded symbols. base[] * is the amount to subtract from the value of a * Huffman symbol of a given length when using * permute[]. * * limit[] indicates the largest numerical value a * symbol with a given number of bits can have. This * is how the Huffman codes can vary in length: each * code with a value > limit[length] needs another * bit. */ hufGroup = bd->groups+j; hufGroup->minLen = minLen; hufGroup->maxLen = maxLen; /* Note that minLen can't be smaller than 1, so we adjust the base and limit array pointers so we're not always wasting the first entry. We do this again when using them (during symbol decoding).*/ base = hufGroup->base-1; limit = hufGroup->limit-1; /* Calculate permute[]. Concurently, initialize * temp[] and limit[]. */ pp = 0; for (i = minLen; i <= maxLen; i++) { temp[i] = limit[i] = 0; for (t = 0; t < symCount; t++) if (length[t] == i) hufGroup->permute[pp++] = t; } /* Count symbols coded for at each bit length */ for (i = 0; i < symCount; i++) temp[length[i]]++; /* Calculate limit[] (the largest symbol-coding value *at each bit length, which is (previous limit << *1)+symbols at this level), and base[] (number of *symbols to ignore at each bit length, which is limit *minus the cumulative count of symbols coded for *already). */ pp = t = 0; for (i = minLen; i < maxLen; i++) { pp += temp[i]; /* We read the largest possible symbol size and then unget bits after determining how many we need, and those extra bits could be set to anything. (They're noise from future symbols.) At each level we're really only interested in the first few bits, so here we set all the trailing to-be-ignored bits to 1 so they don't affect the value > limit[length] comparison. */ limit[i] = (pp << (maxLen - i)) - 1; pp <<= 1; base[i+1] = pp-(t += temp[i]); } limit[maxLen+1] = INT_MAX; /* Sentinal value for * reading next sym. */ limit[maxLen] = pp+temp[maxLen]-1; base[minLen] = 0; } /* We've finished reading and digesting the block header. Now read this block's Huffman coded symbols from the file and undo the Huffman coding and run length encoding, saving the result into dbuf[dbufCount++] = uc */ /* Initialize symbol occurrence counters and symbol Move To * Front table */ for (i = 0; i < 256; i++) { byteCount[i] = 0; mtfSymbol[i] = (unsigned char)i; } /* Loop through compressed symbols. */ runPos = dbufCount = symCount = selector = 0; for (;;) { /* Determine which Huffman coding group to use. */ if (!(symCount--)) { symCount = GROUP_SIZE-1; if (selector >= nSelectors) return RETVAL_DATA_ERROR; hufGroup = bd->groups+selectors[selector++]; base = hufGroup->base-1; limit = hufGroup->limit-1; } /* Read next Huffman-coded symbol. */ /* Note: It is far cheaper to read maxLen bits and back up than it is to read minLen bits and then an additional bit at a time, testing as we go. Because there is a trailing last block (with file CRC), there is no danger of the overread causing an unexpected EOF for a valid compressed file. As a further optimization, we do the read inline (falling back to a call to get_bits if the buffer runs dry). 
The following (up to got_huff_bits:) is equivalent to j = get_bits(bd, hufGroup->maxLen); */ while (bd->inbufBitCount < hufGroup->maxLen) { if (bd->inbufPos == bd->inbufCount) { j = get_bits(bd, hufGroup->maxLen); goto got_huff_bits; } bd->inbufBits = (bd->inbufBits << 8)|bd->inbuf[bd->inbufPos++]; bd->inbufBitCount += 8; }; bd->inbufBitCount -= hufGroup->maxLen; j = (bd->inbufBits >> bd->inbufBitCount)& ((1 << hufGroup->maxLen)-1); got_huff_bits: /* Figure how how many bits are in next symbol and * unget extras */ i = hufGroup->minLen; while (j > limit[i]) ++i; bd->inbufBitCount += (hufGroup->maxLen - i); /* Huffman decode value to get nextSym (with bounds checking) */ if ((i > hufGroup->maxLen) || (((unsigned)(j = (j>>(hufGroup->maxLen-i))-base[i])) >= MAX_SYMBOLS)) return RETVAL_DATA_ERROR; nextSym = hufGroup->permute[j]; /* We have now decoded the symbol, which indicates either a new literal byte, or a repeated run of the most recent literal byte. First, check if nextSym indicates a repeated run, and if so loop collecting how many times to repeat the last literal. */ if (((unsigned)nextSym) <= SYMBOL_RUNB) { /* RUNA or RUNB */ /* If this is the start of a new run, zero out * counter */ if (!runPos) { runPos = 1; t = 0; } /* Neat trick that saves 1 symbol: instead of or-ing 0 or 1 at each bit position, add 1 or 2 instead. For example, 1011 is 1 << 0 + 1 << 1 + 2 << 2. 1010 is 2 << 0 + 2 << 1 + 1 << 2. You can make any bit pattern that way using 1 less symbol than the basic or 0/1 method (except all bits 0, which would use no symbols, but a run of length 0 doesn't mean anything in this context). Thus space is saved. */ t += (runPos << nextSym); /* +runPos if RUNA; +2*runPos if RUNB */ runPos <<= 1; continue; } /* When we hit the first non-run symbol after a run, we now know how many times to repeat the last literal, so append that many copies to our buffer of decoded symbols (dbuf) now. (The last literal used is the one at the head of the mtfSymbol array.) */ if (runPos) { runPos = 0; if (dbufCount+t >= dbufSize) return RETVAL_DATA_ERROR; uc = symToByte[mtfSymbol[0]]; byteCount[uc] += t; while (t--) dbuf[dbufCount++] = uc; } /* Is this the terminating symbol? */ if (nextSym > symTotal) break; /* At this point, nextSym indicates a new literal character. Subtract one to get the position in the MTF array at which this literal is currently to be found. (Note that the result can't be -1 or 0, because 0 and 1 are RUNA and RUNB. But another instance of the first symbol in the mtf array, position 0, would have been handled as part of a run above. Therefore 1 unused mtf position minus 2 non-literal nextSym values equals -1.) */ if (dbufCount >= dbufSize) return RETVAL_DATA_ERROR; i = nextSym - 1; uc = mtfSymbol[i]; /* Adjust the MTF array. Since we typically expect to *move only a small number of symbols, and are bound *by 256 in any case, using memmove here would *typically be bigger and slower due to function call *overhead and other assorted setup costs. */ do { mtfSymbol[i] = mtfSymbol[i-1]; } while (--i); mtfSymbol[0] = uc; uc = symToByte[uc]; /* We have our literal byte. Save it into dbuf. */ byteCount[uc]++; dbuf[dbufCount++] = (unsigned int)uc; } /* At this point, we've read all the Huffman-coded symbols (and repeated runs) for this block from the input stream, and decoded them into the intermediate buffer. There are dbufCount many decoded bytes in dbuf[]. Now undo the Burrows-Wheeler transform on dbuf. 
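 * In effect each dbuf[] entry will carry the decoded byte in its low 8 bits
 * and, in the upper 24 bits, the index of the entry to visit next;
 * read_bunzip() then starts at dbuf[origPtr] and walks that chain, emitting
 * one output byte per step.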
See http://dogma.net/markn/articles/bwt/bwt.htm */ /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */ j = 0; for (i = 0; i < 256; i++) { k = j+byteCount[i]; byteCount[i] = j; j = k; } /* Figure out what order dbuf would be in if we sorted it. */ for (i = 0; i < dbufCount; i++) { uc = (unsigned char)(dbuf[i] & 0xff); dbuf[byteCount[uc]] |= (i << 8); byteCount[uc]++; } /* Decode first byte by hand to initialize "previous" byte. Note that it doesn't get output, and if the first three characters are identical it doesn't qualify as a run (hence writeRunCountdown = 5). */ if (dbufCount) { if (origPtr >= dbufCount) return RETVAL_DATA_ERROR; bd->writePos = dbuf[origPtr]; bd->writeCurrent = (unsigned char)(bd->writePos&0xff); bd->writePos >>= 8; bd->writeRunCountdown = 5; } bd->writeCount = dbufCount; return RETVAL_OK; } /* Undo burrows-wheeler transform on intermediate buffer to produce output. If start_bunzip was initialized with out_fd =-1, then up to len bytes of data are written to outbuf. Return value is number of bytes written or error (all errors are negative numbers). If out_fd!=-1, outbuf and len are ignored, data is written to out_fd and return is RETVAL_OK or error. */ static int INIT read_bunzip(struct bunzip_data *bd, unsigned char *outbuf, int len) { const unsigned int *dbuf; int pos, xcurrent, previous, gotcount; /* If last read was short due to end of file, return last block now */ if (bd->writeCount < 0) return bd->writeCount; gotcount = 0; dbuf = bd->dbuf; pos = bd->writePos; xcurrent = bd->writeCurrent; /* We will always have pending decoded data to write into the output buffer unless this is the very first call (in which case we haven't Huffman-decoded a block into the intermediate buffer yet). */ if (bd->writeCopies) { /* Inside the loop, writeCopies means extra copies (beyond 1) */ --bd->writeCopies; /* Loop outputting bytes */ for (;;) { /* If the output buffer is full, snapshot * state and return */ if (gotcount >= len) { bd->writePos = pos; bd->writeCurrent = xcurrent; bd->writeCopies++; return len; } /* Write next byte into output buffer, updating CRC */ outbuf[gotcount++] = xcurrent; bd->writeCRC = (((bd->writeCRC) << 8) ^bd->crc32Table[((bd->writeCRC) >> 24) ^xcurrent]); /* Loop now if we're outputting multiple * copies of this byte */ if (bd->writeCopies) { --bd->writeCopies; continue; } decode_next_byte: if (!bd->writeCount--) break; /* Follow sequence vector to undo * Burrows-Wheeler transform */ previous = xcurrent; pos = dbuf[pos]; xcurrent = pos&0xff; pos >>= 8; /* After 3 consecutive copies of the same byte, the 4th is a repeat count. We count down from 4 instead *of counting up because testing for non-zero is faster */ if (--bd->writeRunCountdown) { if (xcurrent != previous) bd->writeRunCountdown = 4; } else { /* We have a repeated run, this byte * indicates the count */ bd->writeCopies = xcurrent; xcurrent = previous; bd->writeRunCountdown = 5; /* Sometimes there are just 3 bytes * (run length 0) */ if (!bd->writeCopies) goto decode_next_byte; /* Subtract the 1 copy we'd output * anyway to get extras */ --bd->writeCopies; } } /* Decompression of this block completed successfully */ bd->writeCRC = ~bd->writeCRC; bd->totalCRC = ((bd->totalCRC << 1) | (bd->totalCRC >> 31)) ^ bd->writeCRC; /* If this block had a CRC error, force file level CRC error. 
*/ if (bd->writeCRC != bd->headerCRC) { bd->totalCRC = bd->headerCRC+1; return RETVAL_LAST_BLOCK; } } /* Refill the intermediate buffer by Huffman-decoding next * block of input */ /* (previous is just a convenient unused temp variable here) */ previous = get_next_block(bd); if (previous) { bd->writeCount = previous; return (previous != RETVAL_LAST_BLOCK) ? previous : gotcount; } bd->writeCRC = 0xffffffffUL; pos = bd->writePos; xcurrent = bd->writeCurrent; goto decode_next_byte; } static int INIT nofill(void *buf, unsigned int len) { return -1; } /* Allocate the structure, read file header. If in_fd ==-1, inbuf must contain a complete bunzip file (len bytes long). If in_fd!=-1, inbuf and len are ignored, and data is read from file handle into temporary buffer. */ static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len, int (*fill)(void*, unsigned int)) { struct bunzip_data *bd; unsigned int i, j, c; const unsigned int BZh0 = (((unsigned int)'B') << 24)+(((unsigned int)'Z') << 16) +(((unsigned int)'h') << 8)+(unsigned int)'0'; /* Figure out how much data to allocate */ i = sizeof(struct bunzip_data); /* Allocate bunzip_data. Most fields initialize to zero. */ bd = *bdp = malloc(i); if (!bd) return RETVAL_OUT_OF_MEMORY; memset(bd, 0, sizeof(struct bunzip_data)); /* Setup input buffer */ bd->inbuf = inbuf; bd->inbufCount = len; if (fill != NULL) bd->fill = fill; else bd->fill = nofill; /* Init the CRC32 table (big endian) */ for (i = 0; i < 256; i++) { c = i << 24; for (j = 8; j; j--) c = c&0x80000000 ? (c << 1)^0x04c11db7 : (c << 1); bd->crc32Table[i] = c; } /* Ensure that file starts with "BZh['1'-'9']." */ i = get_bits(bd, 32); if (((unsigned int)(i-BZh0-1)) >= 9) return RETVAL_NOT_BZIP_DATA; /* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of uncompressed data. Allocate intermediate buffer for block. */ bd->dbufSize = 100000*(i-BZh0); bd->dbuf = large_malloc(bd->dbufSize * sizeof(int)); if (!bd->dbuf) return RETVAL_OUT_OF_MEMORY; return RETVAL_OK; } /* Example usage: decompress src_fd to dst_fd. (Stops at end of bzip2 data, not end of file.) 
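 *
 * A minimal in-memory call might look like the sketch below (the buffer
 * names and my_error() callback are hypothetical, not part of this file):
 *
 *   unsigned int consumed;
 *   int rc = bunzip2(src, src_len, NULL, NULL, dst, &consumed, my_error);
 *
 * With fill == NULL the whole compressed image must already sit in src, and
 * with flush == NULL the output is written straight into dst, which must be
 * large enough for the fully decompressed data.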
*/ STATIC int INIT bunzip2(unsigned char *buf, unsigned int len, int(*fill)(void*, unsigned int), int(*flush)(void*, unsigned int), unsigned char *outbuf, unsigned int *pos, void(*error)(const char *x)) { struct bunzip_data *bd; int i = -1; unsigned char *inbuf; if (flush) outbuf = malloc(BZIP2_IOBUF_SIZE); if (!outbuf) { error("Could not allocate output buffer"); return RETVAL_OUT_OF_MEMORY; } if (buf) inbuf = buf; else inbuf = malloc(BZIP2_IOBUF_SIZE); if (!inbuf) { error("Could not allocate input buffer"); i = RETVAL_OUT_OF_MEMORY; goto exit_0; } i = start_bunzip(&bd, inbuf, len, fill); if (!i) { for (;;) { i = read_bunzip(bd, outbuf, BZIP2_IOBUF_SIZE); if (i <= 0) break; if (!flush) outbuf += i; else if (i != flush(outbuf, i)) { i = RETVAL_UNEXPECTED_OUTPUT_EOF; break; } } } /* Check CRC and release memory */ if (i == RETVAL_LAST_BLOCK) { if (bd->headerCRC != bd->totalCRC) error("Data integrity error when decompressing."); else i = RETVAL_OK; } else if (i == RETVAL_UNEXPECTED_OUTPUT_EOF) { error("Compressed file ends unexpectedly"); } if (!bd) goto exit_1; if (bd->dbuf) large_free(bd->dbuf); if (pos) *pos = bd->inbufPos; free(bd); exit_1: if (!buf) free(inbuf); exit_0: if (flush) free(outbuf); return i; } xen-4.4.0/xen/common/page_alloc.c0000664000175000017500000013657412307313555014777 0ustar smbsmb/****************************************************************************** * page_alloc.c * * Simple buddy heap allocator for Xen. * * Copyright (c) 2002-2004 K A Fraser * Copyright (c) 2006 IBM Ryan Harper * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CONFIG_X86 #include #include /* for highmem_start only */ #else #define p2m_pod_offline_or_broken_hit(pg) 0 #define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL) #endif /* * Comma-separated list of hexadecimal page numbers containing bad bytes. * e.g. 'badpage=0x3f45,0x8a321'. */ static char __initdata opt_badpage[100] = ""; string_param("badpage", opt_badpage); /* * no-bootscrub -> Free pages are not zeroed during boot. */ static bool_t opt_bootscrub __initdata = 1; boolean_param("bootscrub", opt_bootscrub); /* * Bit width of the DMA heap -- used to override NUMA-node-first. * allocation strategy, which can otherwise exhaust low memory. */ static unsigned int dma_bitsize; integer_param("dma_bits", dma_bitsize); #define round_pgdown(_p) ((_p)&PAGE_MASK) #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) /* Offlined page list, protected by heap_lock. */ PAGE_LIST_HEAD(page_offlined_list); /* Broken page list, protected by heap_lock. 
*/ PAGE_LIST_HEAD(page_broken_list); /************************* * BOOT-TIME ALLOCATOR */ static unsigned long __initdata first_valid_mfn = ~0UL; static struct bootmem_region { unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */ } *__initdata bootmem_region_list; static unsigned int __initdata nr_bootmem_regions; static void __init boot_bug(int line) { panic("Boot BUG at %s:%d", __FILE__, line); } #define BOOT_BUG_ON(p) if ( p ) boot_bug(__LINE__); static void __init bootmem_region_add(unsigned long s, unsigned long e) { unsigned int i; if ( (bootmem_region_list == NULL) && (s < e) ) bootmem_region_list = mfn_to_virt(s++); if ( s >= e ) return; for ( i = 0; i < nr_bootmem_regions; i++ ) if ( s < bootmem_region_list[i].e ) break; BOOT_BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s)); BOOT_BUG_ON(nr_bootmem_regions == (PAGE_SIZE / sizeof(struct bootmem_region))); memmove(&bootmem_region_list[i+1], &bootmem_region_list[i], (nr_bootmem_regions - i) * sizeof(*bootmem_region_list)); bootmem_region_list[i] = (struct bootmem_region) { s, e }; nr_bootmem_regions++; } static void __init bootmem_region_zap(unsigned long s, unsigned long e) { unsigned int i; for ( i = 0; i < nr_bootmem_regions; i++ ) { struct bootmem_region *r = &bootmem_region_list[i]; if ( e <= r->s ) break; if ( s >= r->e ) continue; if ( s <= r->s ) { r->s = min(e, r->e); } else if ( e >= r->e ) { r->e = s; } else { unsigned long _e = r->e; r->e = s; bootmem_region_add(e, _e); } } } void __init init_boot_pages(paddr_t ps, paddr_t pe) { unsigned long bad_spfn, bad_epfn; const char *p; #ifdef CONFIG_X86 const unsigned long *badpage = NULL; unsigned int i, array_size; #endif ps = round_pgup(ps); pe = round_pgdown(pe); if ( pe <= ps ) return; first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn); bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT); #ifdef CONFIG_X86 /* * Here we put platform-specific memory range workarounds, i.e. * memory known to be corrupt or otherwise in need to be reserved on * specific platforms. * We get these certain pages and remove them from memory region list. */ badpage = get_platform_badpages(&array_size); if ( badpage ) { for ( i = 0; i < array_size; i++ ) { bootmem_region_zap(*badpage >> PAGE_SHIFT, (*badpage >> PAGE_SHIFT) + 1); badpage++; } } #endif /* Check new pages against the bad-page list. */ p = opt_badpage; while ( *p != '\0' ) { bad_spfn = simple_strtoul(p, &p, 0); bad_epfn = bad_spfn; if ( *p == '-' ) { p++; bad_epfn = simple_strtoul(p, &p, 0); if ( bad_epfn < bad_spfn ) bad_epfn = bad_spfn; } if ( *p == ',' ) p++; else if ( *p != '\0' ) break; bootmem_region_zap(bad_spfn, bad_epfn+1); } } unsigned long __init alloc_boot_pages( unsigned long nr_pfns, unsigned long pfn_align) { unsigned long pg, _e; int i; for ( i = nr_bootmem_regions - 1; i >= 0; i-- ) { struct bootmem_region *r = &bootmem_region_list[i]; pg = (r->e - nr_pfns) & ~(pfn_align - 1); if ( pg < r->s ) continue; #if defined(CONFIG_X86) && !defined(NDEBUG) /* * Filtering pfn_align == 1 since the only allocations using a bigger * alignment are the ones used for setting up the frame table chunks. * Those allocations get remapped anyway, i.e. them not having 1:1 * mappings always accessible is not a problem. 
*/ if ( highmem_start && pfn_align == 1 && r->e > PFN_DOWN(highmem_start) ) { pg = r->s; if ( pg + nr_pfns > PFN_DOWN(highmem_start) ) continue; r->s = pg + nr_pfns; return pg; } #endif _e = r->e; r->e = pg; bootmem_region_add(pg + nr_pfns, _e); return pg; } BOOT_BUG_ON(1); return 0; } /************************* * BINARY BUDDY ALLOCATOR */ #define MEMZONE_XEN 0 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT + 1) #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT)) #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \ (fls(page_to_mfn(pg)) ? : 1)) typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1]; static heap_by_zone_and_order_t *_heap[MAX_NUMNODES]; #define heap(node, zone, order) ((*_heap[node])[zone][order]) static unsigned long *avail[MAX_NUMNODES]; static long total_avail_pages; /* TMEM: Reserve a fraction of memory for mid-size (0page_alloc_lock)); d->tot_pages += pages; /* * can test d->claimed_pages race-free because it can only change * if d->page_alloc_lock and heap_lock are both held, see also * domain_set_outstanding_pages below */ if ( !d->outstanding_pages ) goto out; spin_lock(&heap_lock); /* adjust domain outstanding pages; may not go negative */ dom_before = d->outstanding_pages; dom_after = dom_before - pages; BUG_ON(dom_before < 0); dom_claimed = dom_after < 0 ? 0 : dom_after; d->outstanding_pages = dom_claimed; /* flag accounting bug if system outstanding_claims would go negative */ sys_before = outstanding_claims; sys_after = sys_before - (dom_before - dom_claimed); BUG_ON(sys_after < 0); outstanding_claims = sys_after; spin_unlock(&heap_lock); out: return d->tot_pages; } int domain_set_outstanding_pages(struct domain *d, unsigned long pages) { int ret = -ENOMEM; unsigned long claim, avail_pages; /* * take the domain's page_alloc_lock, else all d->tot_page adjustments * must always take the global heap_lock rather than only in the much * rarer case that d->outstanding_pages is non-zero */ spin_lock(&d->page_alloc_lock); spin_lock(&heap_lock); /* pages==0 means "unset" the claim. */ if ( pages == 0 ) { outstanding_claims -= d->outstanding_pages; d->outstanding_pages = 0; ret = 0; goto out; } /* only one active claim per domain please */ if ( d->outstanding_pages ) { ret = -EINVAL; goto out; } /* disallow a claim not exceeding current tot_pages or above max_pages */ if ( (pages <= d->tot_pages) || (pages > d->max_pages) ) { ret = -EINVAL; goto out; } /* how much memory is available? */ avail_pages = total_avail_pages; /* Note: The usage of claim means that allocation from a guest *might* * have to come from freeable memory. Using free memory is always better, if * it is available, than using freeable memory. * * But that is OK as once the claim has been made, it still can take minutes * before the claim is fully satisfied. Tmem can make use of the unclaimed * pages during this time (to store ephemeral/freeable pages only, * not persistent pages). */ avail_pages += tmem_freeable_pages(); avail_pages -= outstanding_claims; /* * Note, if domain has already allocated memory before making a claim * then the claim must take tot_pages into account */ claim = pages - d->tot_pages; if ( claim > avail_pages ) goto out; /* yay, claim fits in available memory, stake the claim, success! 
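 * (For instance, a domain that already holds tot_pages = 0x1000 and asks to
 * claim pages = 0x5000 stakes claim = 0x4000 additional pages here, which
 * are also added to the global outstanding_claims total.)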
*/ d->outstanding_pages = claim; outstanding_claims += d->outstanding_pages; ret = 0; out: spin_unlock(&heap_lock); spin_unlock(&d->page_alloc_lock); return ret; } void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages) { spin_lock(&heap_lock); *outstanding_pages = outstanding_claims; *free_pages = avail_domheap_pages(); spin_unlock(&heap_lock); } static unsigned long init_node_heap(int node, unsigned long mfn, unsigned long nr, bool_t *use_tail) { /* First node to be discovered has its heap metadata statically alloced. */ static heap_by_zone_and_order_t _heap_static; static unsigned long avail_static[NR_ZONES]; static int first_node_initialised; unsigned long needed = (sizeof(**_heap) + sizeof(**avail) * NR_ZONES + PAGE_SIZE - 1) >> PAGE_SHIFT; #ifdef DIRECTMAP_VIRT_END unsigned long eva = min(DIRECTMAP_VIRT_END, HYPERVISOR_VIRT_END); #endif int i, j; if ( !first_node_initialised ) { _heap[node] = &_heap_static; avail[node] = avail_static; first_node_initialised = 1; needed = 0; } #ifdef DIRECTMAP_VIRT_END else if ( *use_tail && nr >= needed && (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) ) { _heap[node] = mfn_to_virt(mfn + nr - needed); avail[node] = mfn_to_virt(mfn + nr - 1) + PAGE_SIZE - sizeof(**avail) * NR_ZONES; } else if ( nr >= needed && (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) ) { _heap[node] = mfn_to_virt(mfn); avail[node] = mfn_to_virt(mfn + needed - 1) + PAGE_SIZE - sizeof(**avail) * NR_ZONES; *use_tail = 0; } #endif else if ( get_order_from_bytes(sizeof(**_heap)) == get_order_from_pages(needed) ) { _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0); BUG_ON(!_heap[node]); avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) - sizeof(**avail) * NR_ZONES; needed = 0; } else { _heap[node] = xmalloc(heap_by_zone_and_order_t); avail[node] = xmalloc_array(unsigned long, NR_ZONES); BUG_ON(!_heap[node] || !avail[node]); needed = 0; } memset(avail[node], 0, NR_ZONES * sizeof(long)); for ( i = 0; i < NR_ZONES; i++ ) for ( j = 0; j <= MAX_ORDER; j++ ) INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]); return needed; } /* Default to 64 MiB */ #define DEFAULT_LOW_MEM_VIRQ (((paddr_t) 64) << 20) #define MAX_LOW_MEM_VIRQ (((paddr_t) 1024) << 20) static paddr_t __read_mostly opt_low_mem_virq = ((paddr_t) -1); size_param("low_mem_virq_limit", opt_low_mem_virq); /* Thresholds to control hysteresis. In pages */ /* When memory grows above this threshold, reset hysteresis. * -1 initially to not reset until at least one virq issued. */ static unsigned long low_mem_virq_high = -1UL; /* Threshold at which we issue virq */ static unsigned long low_mem_virq_th = 0; /* Original threshold after all checks completed */ static unsigned long low_mem_virq_orig = 0; /* Order for current threshold */ static unsigned int low_mem_virq_th_order = 0; /* Perform bootstrapping checks and set bounds */ static void __init setup_low_mem_virq(void) { unsigned int order; paddr_t threshold; bool_t halve; /* If the user specifies zero, then he/she doesn't want this virq * to ever trigger. */ if ( opt_low_mem_virq == 0 ) { low_mem_virq_th = -1UL; return; } /* If the user did not specify a knob, remember that */ halve = (opt_low_mem_virq == ((paddr_t) -1)); threshold = halve ? DEFAULT_LOW_MEM_VIRQ : opt_low_mem_virq; /* Dom0 has already been allocated by now. So check we won't be * complaining immediately with whatever's left of the heap. 
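 * (With 4 KiB pages the default 64 MiB value converts to 0x4000 pages below;
 * that is already a power of two, so the later order rounding leaves it
 * unchanged.)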
*/ threshold = min(threshold, ((paddr_t) total_avail_pages) << PAGE_SHIFT); /* Then, cap to some predefined maximum */ threshold = min(threshold, MAX_LOW_MEM_VIRQ); /* If the user specified no knob, and we are at the current available * level, halve the threshold. */ if ( halve && (threshold == (((paddr_t) total_avail_pages) << PAGE_SHIFT)) ) threshold >>= 1; /* Zero? Have to fire immediately */ threshold = max(threshold, (paddr_t) PAGE_SIZE); /* Threshold bytes -> pages */ low_mem_virq_th = threshold >> PAGE_SHIFT; /* Next, round the threshold down to the next order */ order = get_order_from_pages(low_mem_virq_th); if ( (1UL << order) > low_mem_virq_th ) order--; /* Set bounds, ready to go */ low_mem_virq_th = low_mem_virq_orig = 1UL << order; low_mem_virq_th_order = order; printk("Initial low memory virq threshold set at %#lx pages.\n", low_mem_virq_th); } static void check_low_mem_virq(void) { unsigned long avail_pages = total_avail_pages + (opt_tmem ? tmem_freeable_pages() : 0) - outstanding_claims; if ( unlikely(avail_pages <= low_mem_virq_th) ) { send_global_virq(VIRQ_ENOMEM); /* Update thresholds. Next warning will be when we drop below * next order. However, we wait until we grow beyond one * order above us to complain again at the current order */ low_mem_virq_high = 1UL << (low_mem_virq_th_order + 1); if ( low_mem_virq_th_order > 0 ) low_mem_virq_th_order--; low_mem_virq_th = 1UL << low_mem_virq_th_order; return; } if ( unlikely(avail_pages >= low_mem_virq_high) ) { /* Reset hysteresis. Bring threshold up one order. * If we are back where originally set, set high * threshold to -1 to avoid further growth of * virq threshold. */ low_mem_virq_th_order++; low_mem_virq_th = 1UL << low_mem_virq_th_order; if ( low_mem_virq_th == low_mem_virq_orig ) low_mem_virq_high = -1UL; else low_mem_virq_high = 1UL << (low_mem_virq_th_order + 2); } } /* Allocate 2^@order contiguous pages. */ static struct page_info *alloc_heap_pages( unsigned int zone_lo, unsigned int zone_hi, unsigned int order, unsigned int memflags, struct domain *d) { unsigned int first_node, i, j, zone = 0, nodemask_retry = 0; unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1); unsigned long request = 1UL << order; struct page_info *pg; nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map; bool_t need_tlbflush = 0; uint32_t tlbflush_timestamp = 0; if ( node == NUMA_NO_NODE ) { memflags &= ~MEMF_exact_node; if ( d != NULL ) { node = next_node(d->last_alloc_node, nodemask); if ( node >= MAX_NUMNODES ) node = first_node(nodemask); } if ( node >= MAX_NUMNODES ) node = cpu_to_node(smp_processor_id()); } first_node = node; ASSERT(node >= 0); ASSERT(zone_lo <= zone_hi); ASSERT(zone_hi < NR_ZONES); if ( unlikely(order > MAX_ORDER) ) return NULL; spin_lock(&heap_lock); /* * Claimed memory is considered unavailable unless the request * is made by a domain with sufficient unclaimed pages. */ if ( (outstanding_claims + request > total_avail_pages + tmem_freeable_pages()) && (d == NULL || d->outstanding_pages < request) ) goto not_found; /* * TMEM: When available memory is scarce due to tmem absorbing it, allow * only mid-size allocations to avoid worst of fragmentation issues. * Others try tmem pools then fail. This is a workaround until all * post-dom0-creation-multi-page allocations can be eliminated. 
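 * In practice this means order-0 (single page) and order >= 9 requests
 * (2 MiB and up with 4 KiB pages) are pointed at tmem first once free memory
 * has dropped to the reserved mid-size zone, while mid-size orders keep
 * using the buddy lists.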
*/ if ( opt_tmem && ((order == 0) || (order >= 9)) && (total_avail_pages <= midsize_alloc_zone_pages) && tmem_freeable_pages() ) goto try_tmem; /* * Start with requested node, but exhaust all node memory in requested * zone before failing, only calc new node value if we fail to find memory * in target node, this avoids needless computation on fast-path. */ for ( ; ; ) { zone = zone_hi; do { /* Check if target node can support the allocation. */ if ( !avail[node] || (avail[node][zone] < request) ) continue; /* Find smallest order which can satisfy the request. */ for ( j = order; j <= MAX_ORDER; j++ ) if ( (pg = page_list_remove_head(&heap(node, zone, j))) ) goto found; } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */ if ( memflags & MEMF_exact_node ) goto not_found; /* Pick next node. */ if ( !node_isset(node, nodemask) ) { /* Very first node may be caller-specified and outside nodemask. */ ASSERT(!nodemask_retry); first_node = node = first_node(nodemask); if ( node < MAX_NUMNODES ) continue; } else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES ) node = first_node(nodemask); if ( node == first_node ) { /* When we have tried all in nodemask, we fall back to others. */ if ( nodemask_retry++ ) goto not_found; nodes_andnot(nodemask, node_online_map, nodemask); first_node = node = first_node(nodemask); if ( node >= MAX_NUMNODES ) goto not_found; } } try_tmem: /* Try to free memory from tmem */ if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL ) { /* reassigning an already allocated anonymous heap page */ spin_unlock(&heap_lock); return pg; } not_found: /* No suitable memory blocks. Fail the request. */ spin_unlock(&heap_lock); return NULL; found: /* We may have to halve the chunk a number of times. */ while ( j != order ) { PFN_ORDER(pg) = --j; page_list_add_tail(pg, &heap(node, zone, j)); pg += 1 << j; } ASSERT(avail[node][zone] >= request); avail[node][zone] -= request; total_avail_pages -= request; ASSERT(total_avail_pages >= 0); check_low_mem_virq(); if ( d != NULL ) d->last_alloc_node = node; for ( i = 0; i < (1 << order); i++ ) { /* Reference count must continuously be zero for free pages. */ BUG_ON(pg[i].count_info != PGC_state_free); pg[i].count_info = PGC_state_inuse; if ( pg[i].u.free.need_tlbflush && (pg[i].tlbflush_timestamp <= tlbflush_current_time()) && (!need_tlbflush || (pg[i].tlbflush_timestamp > tlbflush_timestamp)) ) { need_tlbflush = 1; tlbflush_timestamp = pg[i].tlbflush_timestamp; } /* Initialise fields which have other uses for free pages. */ pg[i].u.inuse.type_info = 0; page_set_owner(&pg[i], NULL); /* Ensure cache and RAM are consistent for platforms where the * guest can control its own visibility of/through the cache. */ flush_page_to_ram(page_to_mfn(&pg[i])); } spin_unlock(&heap_lock); if ( need_tlbflush ) { cpumask_t mask = cpu_online_map; tlbflush_filter(mask, tlbflush_timestamp); if ( !cpumask_empty(&mask) ) { perfc_incr(need_flush_tlb_flush); flush_tlb_mask(&mask); } } return pg; } /* Remove any offlined page in the buddy pointed to by head. 
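 * Roughly: the buddy is taken off its free list, the still-usable pages on
 * either side of the offlined ones are re-queued as smaller buddies, and
 * each offlined page is moved to page_offlined_list (or page_broken_list if
 * PGC_broken is set), with the number of such pages returned.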
*/ static int reserve_offlined_page(struct page_info *head) { unsigned int node = phys_to_nid(page_to_maddr(head)); int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0; struct page_info *cur_head; int cur_order; ASSERT(spin_is_locked(&heap_lock)); cur_head = head; page_list_del(head, &heap(node, zone, head_order)); while ( cur_head < (head + (1 << head_order)) ) { struct page_info *pg; int next_order; if ( page_state_is(cur_head, offlined) ) { cur_head++; continue; } next_order = cur_order = 0; while ( cur_order < head_order ) { next_order = cur_order + 1; if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) ) goto merge; for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order ); i < (1 << next_order); i++, pg++ ) if ( page_state_is(pg, offlined) ) break; if ( i == ( 1 << next_order) ) { cur_order = next_order; continue; } else { merge: /* We don't consider merging outside the head_order. */ page_list_add_tail(cur_head, &heap(node, zone, cur_order)); PFN_ORDER(cur_head) = cur_order; cur_head += (1 << cur_order); break; } } } for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ ) { if ( !page_state_is(cur_head, offlined) ) continue; avail[node][zone]--; total_avail_pages--; ASSERT(total_avail_pages >= 0); page_list_add_tail(cur_head, test_bit(_PGC_broken, &cur_head->count_info) ? &page_broken_list : &page_offlined_list); count++; } return count; } /* Free 2^@order set of pages. */ static void free_heap_pages( struct page_info *pg, unsigned int order) { unsigned long mask, mfn = page_to_mfn(pg); unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0; unsigned int zone = page_to_zone(pg); ASSERT(order <= MAX_ORDER); ASSERT(node >= 0); spin_lock(&heap_lock); for ( i = 0; i < (1 << order); i++ ) { /* * Cannot assume that count_info == 0, as there are some corner cases * where it isn't the case and yet it isn't a bug: * 1. page_get_owner() is NULL * 2. page_get_owner() is a domain that was never accessible by * its domid (e.g., failed to fully construct the domain). * 3. page was never addressable by the guest (e.g., it's an * auto-translate-physmap guest and the page was never included * in its pseudophysical address space). * In all the above cases there can be no guest mappings of this page. */ ASSERT(!page_state_is(&pg[i], offlined)); pg[i].count_info = ((pg[i].count_info & PGC_broken) | (page_state_is(&pg[i], offlining) ? PGC_state_offlined : PGC_state_free)); if ( page_state_is(&pg[i], offlined) ) tainted = 1; /* If a page has no owner it will need no safety TLB flush. */ pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL); if ( pg[i].u.free.need_tlbflush ) pg[i].tlbflush_timestamp = tlbflush_current_time(); /* This page is not a guest frame any more. */ page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */ set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY); } avail[node][zone] += 1 << order; total_avail_pages += 1 << order; if ( opt_tmem ) midsize_alloc_zone_pages = max( midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC); /* Merge chunks as far as possible. */ while ( order < MAX_ORDER ) { mask = 1UL << order; if ( (page_to_mfn(pg) & mask) ) { /* Merge with predecessor block? */ if ( !mfn_valid(page_to_mfn(pg-mask)) || !page_state_is(pg-mask, free) || (PFN_ORDER(pg-mask) != order) || (phys_to_nid(page_to_maddr(pg-mask)) != node) ) break; pg -= mask; page_list_del(pg, &heap(node, zone, order)); } else { /* Merge with successor block? 
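 *
 * The mfn parity test above picks which neighbour to probe: when freeing an
 * order-0 page at MFN 0x1001, bit 0 is set, so the candidate partner is the
 * predecessor at MFN 0x1000; if that page is a free order-0 block on the
 * same node, the two coalesce into an order-1 block at 0x1000 and the loop
 * retries at the next order.  When the bit is clear, the partner is instead
 * the successor at pg + mask, handled here.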
*/ if ( !mfn_valid(page_to_mfn(pg+mask)) || !page_state_is(pg+mask, free) || (PFN_ORDER(pg+mask) != order) || (phys_to_nid(page_to_maddr(pg+mask)) != node) ) break; page_list_del(pg + mask, &heap(node, zone, order)); } order++; } PFN_ORDER(pg) = order; page_list_add_tail(pg, &heap(node, zone, order)); if ( tainted ) reserve_offlined_page(pg); spin_unlock(&heap_lock); } /* * Following rules applied for page offline: * Once a page is broken, it can't be assigned anymore * A page will be offlined only if it is free * return original count_info */ static unsigned long mark_page_offline(struct page_info *pg, int broken) { unsigned long nx, x, y = pg->count_info; ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL)); ASSERT(spin_is_locked(&heap_lock)); do { nx = x = y; if ( ((x & PGC_state) != PGC_state_offlined) && ((x & PGC_state) != PGC_state_offlining) ) { nx &= ~PGC_state; nx |= (((x & PGC_state) == PGC_state_free) ? PGC_state_offlined : PGC_state_offlining); } if ( broken ) nx |= PGC_broken; if ( x == nx ) break; } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x ); return y; } static int reserve_heap_page(struct page_info *pg) { struct page_info *head = NULL; unsigned int i, node = phys_to_nid(page_to_maddr(pg)); unsigned int zone = page_to_zone(pg); for ( i = 0; i <= MAX_ORDER; i++ ) { struct page_info *tmp; if ( page_list_empty(&heap(node, zone, i)) ) continue; page_list_for_each_safe ( head, tmp, &heap(node, zone, i) ) { if ( (head <= pg) && (head + (1UL << i) > pg) ) return reserve_offlined_page(head); } } return -EINVAL; } int offline_page(unsigned long mfn, int broken, uint32_t *status) { unsigned long old_info = 0; struct domain *owner; struct page_info *pg; if ( !mfn_valid(mfn) ) { dprintk(XENLOG_WARNING, "try to offline page out of range %lx\n", mfn); return -EINVAL; } *status = 0; pg = mfn_to_page(mfn); if ( is_xen_fixed_mfn(mfn) ) { *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED | (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT); return -EPERM; } /* * N.B. xen's txt in x86_64 is marked reserved and handled already. * Also kexec range is reserved. */ if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) ) { *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM; return -EINVAL; } /* * NB. When broken page belong to guest, usually hypervisor will * notify the guest to handle the broken page. However, hypervisor * need to prevent malicious guest access the broken page again. * Under such case, hypervisor shutdown guest, preventing recursive mce. */ if ( (pg->count_info & PGC_broken) && (owner = page_get_owner(pg)) ) { *status = PG_OFFLINE_AGAIN; domain_shutdown(owner, SHUTDOWN_crash); return 0; } spin_lock(&heap_lock); old_info = mark_page_offline(pg, broken); if ( page_state_is(pg, offlined) ) { reserve_heap_page(pg); spin_unlock(&heap_lock); *status = broken ? 
PG_OFFLINE_OFFLINED | PG_OFFLINE_BROKEN : PG_OFFLINE_OFFLINED; return 0; } spin_unlock(&heap_lock); if ( (owner = page_get_owner_and_reference(pg)) ) { if ( p2m_pod_offline_or_broken_hit(pg) ) { put_page(pg); p2m_pod_offline_or_broken_replace(pg); *status = PG_OFFLINE_OFFLINED; } else { *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING | (owner->domain_id << PG_OFFLINE_OWNER_SHIFT); /* Release the reference since it will not be allocated anymore */ put_page(pg); } } else if ( old_info & PGC_xen_heap ) { *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING | (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT); } else { /* * assign_pages does not hold heap_lock, so small window that the owner * may be set later, but please notice owner will only change from * NULL to be set, not verse, since page is offlining now. * No windows If called from #MC handler, since all CPU are in softirq * If called from user space like CE handling, tools can wait some time * before call again. */ *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED | (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT ); } if ( broken ) *status |= PG_OFFLINE_BROKEN; return 0; } /* * Online the memory. * The caller should make sure end_pfn <= max_page, * if not, expand_pages() should be called prior to online_page(). */ unsigned int online_page(unsigned long mfn, uint32_t *status) { unsigned long x, nx, y; struct page_info *pg; int ret; if ( !mfn_valid(mfn) ) { dprintk(XENLOG_WARNING, "call expand_pages() first\n"); return -EINVAL; } pg = mfn_to_page(mfn); spin_lock(&heap_lock); y = pg->count_info; do { ret = *status = 0; if ( y & PGC_broken ) { ret = -EINVAL; *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN; break; } if ( (y & PGC_state) == PGC_state_offlined ) { page_list_del(pg, &page_offlined_list); *status = PG_ONLINE_ONLINED; } else if ( (y & PGC_state) == PGC_state_offlining ) { *status = PG_ONLINE_ONLINED; } else { break; } x = y; nx = (x & ~PGC_state) | PGC_state_inuse; } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x ); spin_unlock(&heap_lock); if ( (y & PGC_state) == PGC_state_offlined ) free_heap_pages(pg, 0); return ret; } int query_page_offline(unsigned long mfn, uint32_t *status) { struct page_info *pg; if ( !mfn_valid(mfn) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) ) { dprintk(XENLOG_WARNING, "call expand_pages() first\n"); return -EINVAL; } *status = 0; spin_lock(&heap_lock); pg = mfn_to_page(mfn); if ( page_state_is(pg, offlining) ) *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING; if ( pg->count_info & PGC_broken ) *status |= PG_OFFLINE_STATUS_BROKEN; if ( page_state_is(pg, offlined) ) *status |= PG_OFFLINE_STATUS_OFFLINED; spin_unlock(&heap_lock); return 0; } /* * Hand the specified arbitrary page range to the specified heap zone * checking the node_id of the previous page. If they differ and the * latter is not on a MAX_ORDER boundary, then we reserve the page by * not freeing it to the buddy allocator. 
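 *
 * Note also that when a range touches a node whose heap metadata has not
 * been set up yet, init_node_heap() below may take the first (or last) few
 * pages of that range for the per-node bookkeeping arrays; those pages are
 * skipped here rather than handed to the buddy allocator.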
*/ static void init_heap_pages( struct page_info *pg, unsigned long nr_pages) { unsigned long i; for ( i = 0; i < nr_pages; i++ ) { unsigned int nid = phys_to_nid(page_to_maddr(pg+i)); if ( unlikely(!avail[nid]) ) { unsigned long s = page_to_mfn(pg + i); unsigned long e = page_to_mfn(pg + nr_pages - 1) + 1; bool_t use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) && !(s & ((1UL << MAX_ORDER) - 1)) && (find_first_set_bit(e) <= find_first_set_bit(s)); unsigned long n; n = init_node_heap(nid, page_to_mfn(pg+i), nr_pages - i, &use_tail); BUG_ON(i + n > nr_pages); if ( n && !use_tail ) { i += n - 1; continue; } if ( i + n == nr_pages ) break; nr_pages -= n; } free_heap_pages(pg+i, 0); } } static unsigned long avail_heap_pages( unsigned int zone_lo, unsigned int zone_hi, unsigned int node) { unsigned int i, zone; unsigned long free_pages = 0; if ( zone_hi >= NR_ZONES ) zone_hi = NR_ZONES - 1; for_each_online_node(i) { if ( !avail[i] ) continue; for ( zone = zone_lo; zone <= zone_hi; zone++ ) if ( (node == -1) || (node == i) ) free_pages += avail[i][zone]; } return free_pages; } unsigned long total_free_pages(void) { return total_avail_pages - midsize_alloc_zone_pages; } void __init end_boot_allocator(void) { unsigned int i; /* Pages that are free now go to the domain sub-allocator. */ for ( i = 0; i < nr_bootmem_regions; i++ ) { struct bootmem_region *r = &bootmem_region_list[i]; if ( (r->s < r->e) && (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) ) { init_heap_pages(mfn_to_page(r->s), r->e - r->s); r->e = r->s; break; } } for ( i = nr_bootmem_regions; i-- > 0; ) { struct bootmem_region *r = &bootmem_region_list[i]; if ( r->s < r->e ) init_heap_pages(mfn_to_page(r->s), r->e - r->s); } init_heap_pages(virt_to_page(bootmem_region_list), 1); if ( !dma_bitsize && (num_online_nodes() > 1) ) { #ifdef CONFIG_X86 dma_bitsize = min_t(unsigned int, fls(NODE_DATA(0)->node_spanned_pages) - 1 + PAGE_SHIFT - 2, 32); #else dma_bitsize = 32; #endif } printk("Domain heap initialised"); if ( dma_bitsize ) printk(" DMA width %u bits", dma_bitsize); printk("\n"); } /* * Scrub all unallocated pages in all heap zones. This function is more * convoluted than appears necessary because we do not want to continuously * hold the lock while scrubbing very large memory areas. */ void __init scrub_heap_pages(void) { unsigned long mfn; struct page_info *pg; if ( !opt_bootscrub ) return; printk("Scrubbing Free RAM: "); for ( mfn = first_valid_mfn; mfn < max_page; mfn++ ) { process_pending_softirqs(); pg = mfn_to_page(mfn); /* Quick lock-free check. */ if ( !mfn_valid(mfn) || !page_state_is(pg, free) ) continue; /* Every 100MB, print a progress dot. */ if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 ) printk("."); spin_lock(&heap_lock); /* Re-check page status with lock held. */ if ( page_state_is(pg, free) ) scrub_one_page(pg); spin_unlock(&heap_lock); } printk("done.\n"); /* Now that the heap is initialized, run checks and set bounds * for the low mem virq algorithm. */ setup_low_mem_virq(); } /************************* * XEN-HEAP SUB-ALLOCATOR */ #if defined(CONFIG_SEPARATE_XENHEAP) void init_xenheap_pages(paddr_t ps, paddr_t pe) { ps = round_pgup(ps); pe = round_pgdown(pe); if ( pe <= ps ) return; /* * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to * prevent merging of power-of-two blocks across the zone boundary. 
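 * (free_heap_pages() only checks that a neighbouring buddy is valid, free,
 * of matching order and on the same node before coalescing, so without this
 * gap a xenheap block could be merged with an adjacent domheap buddy.)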
*/ if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) ) ps += PAGE_SIZE; if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) ) pe -= PAGE_SIZE; memguard_guard_range(maddr_to_virt(ps), pe - ps); init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT); } void *alloc_xenheap_pages(unsigned int order, unsigned int memflags) { struct page_info *pg; ASSERT(!in_irq()); pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN, order, memflags, NULL); if ( unlikely(pg == NULL) ) return NULL; memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT)); return page_to_virt(pg); } void free_xenheap_pages(void *v, unsigned int order) { ASSERT(!in_irq()); if ( v == NULL ) return; memguard_guard_range(v, 1 << (order + PAGE_SHIFT)); free_heap_pages(virt_to_page(v), order); } #else static unsigned int __read_mostly xenheap_bits; void __init xenheap_max_mfn(unsigned long mfn) { xenheap_bits = fls(mfn) + PAGE_SHIFT; } void init_xenheap_pages(paddr_t ps, paddr_t pe) { init_domheap_pages(ps, pe); } void *alloc_xenheap_pages(unsigned int order, unsigned int memflags) { struct page_info *pg; unsigned int i; ASSERT(!in_irq()); if ( xenheap_bits && (memflags >> _MEMF_bits) > xenheap_bits ) memflags &= ~MEMF_bits(~0); if ( !(memflags >> _MEMF_bits) ) memflags |= MEMF_bits(xenheap_bits); pg = alloc_domheap_pages(NULL, order, memflags); if ( unlikely(pg == NULL) ) return NULL; for ( i = 0; i < (1u << order); i++ ) pg[i].count_info |= PGC_xen_heap; return page_to_virt(pg); } void free_xenheap_pages(void *v, unsigned int order) { struct page_info *pg; unsigned int i; ASSERT(!in_irq()); if ( v == NULL ) return; pg = virt_to_page(v); for ( i = 0; i < (1u << order); i++ ) pg[i].count_info &= ~PGC_xen_heap; free_heap_pages(pg, order); } #endif /************************* * DOMAIN-HEAP SUB-ALLOCATOR */ void init_domheap_pages(paddr_t ps, paddr_t pe) { unsigned long smfn, emfn; ASSERT(!in_irq()); smfn = round_pgup(ps) >> PAGE_SHIFT; emfn = round_pgdown(pe) >> PAGE_SHIFT; if ( emfn <= smfn ) return; init_heap_pages(mfn_to_page(smfn), emfn - smfn); } int assign_pages( struct domain *d, struct page_info *pg, unsigned int order, unsigned int memflags) { unsigned long i; spin_lock(&d->page_alloc_lock); if ( unlikely(d->is_dying) ) { gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n", d->domain_id); goto fail; } if ( !(memflags & MEMF_no_refcount) ) { if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) ) { if ( !opt_tmem || order != 0 || d->tot_pages != d->max_pages ) gdprintk(XENLOG_INFO, "Over-allocation for domain %u: " "%u > %u\n", d->domain_id, d->tot_pages + (1 << order), d->max_pages); goto fail; } if ( unlikely(d->tot_pages == 0) ) get_knownalive_domain(d); domain_adjust_tot_pages(d, 1 << order); } for ( i = 0; i < (1 << order); i++ ) { ASSERT(page_get_owner(&pg[i]) == NULL); ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0); page_set_owner(&pg[i], d); smp_wmb(); /* Domain pointer must be visible before updating refcnt. */ pg[i].count_info = PGC_allocated | 1; page_list_add_tail(&pg[i], &d->page_list); } spin_unlock(&d->page_alloc_lock); return 0; fail: spin_unlock(&d->page_alloc_lock); return -1; } struct page_info *alloc_domheap_pages( struct domain *d, unsigned int order, unsigned int memflags) { struct page_info *pg = NULL; unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1; unsigned int dma_zone; ASSERT(!in_irq()); bits = domain_clamp_alloc_bitsize(d, bits ? 
: (BITS_PER_LONG+PAGE_SHIFT)); if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 ) return NULL; if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) ) pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d); if ( (pg == NULL) && ((memflags & MEMF_no_dma) || ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order, memflags, d)) == NULL)) ) return NULL; if ( (d != NULL) && assign_pages(d, pg, order, memflags) ) { free_heap_pages(pg, order); return NULL; } return pg; } void free_domheap_pages(struct page_info *pg, unsigned int order) { struct domain *d = page_get_owner(pg); unsigned int i; bool_t drop_dom_ref; ASSERT(!in_irq()); if ( unlikely(is_xen_heap_page(pg)) ) { /* NB. May recursively lock from relinquish_memory(). */ spin_lock_recursive(&d->page_alloc_lock); for ( i = 0; i < (1 << order); i++ ) page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list); d->xenheap_pages -= 1 << order; drop_dom_ref = (d->xenheap_pages == 0); spin_unlock_recursive(&d->page_alloc_lock); } else if ( likely(d != NULL) && likely(d != dom_cow) ) { /* NB. May recursively lock from relinquish_memory(). */ spin_lock_recursive(&d->page_alloc_lock); for ( i = 0; i < (1 << order); i++ ) { BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0); page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list); } drop_dom_ref = !domain_adjust_tot_pages(d, -(1 << order)); spin_unlock_recursive(&d->page_alloc_lock); /* * Normally we expect a domain to clear pages before freeing them, if * it cares about the secrecy of their contents. However, after a * domain has died we assume responsibility for erasure. */ if ( unlikely(d->is_dying) ) for ( i = 0; i < (1 << order); i++ ) scrub_one_page(&pg[i]); free_heap_pages(pg, order); } else if ( unlikely(d == dom_cow) ) { ASSERT(order == 0); scrub_one_page(pg); free_heap_pages(pg, 0); drop_dom_ref = 0; } else { /* Freeing anonymous domain-heap pages. */ free_heap_pages(pg, order); drop_dom_ref = 0; } if ( drop_dom_ref ) put_domain(d); } unsigned long avail_domheap_pages_region( unsigned int node, unsigned int min_width, unsigned int max_width) { int zone_lo, zone_hi; zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1); zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo)); zone_hi = max_width ? 
bits_to_zone(max_width) : (NR_ZONES - 1); zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi)); return avail_heap_pages(zone_lo, zone_hi, node); } unsigned long avail_domheap_pages(void) { return avail_heap_pages(MEMZONE_XEN + 1, NR_ZONES - 1, -1); } unsigned long avail_node_heap_pages(unsigned int nodeid) { return avail_heap_pages(MEMZONE_XEN, NR_ZONES -1, nodeid); } static void pagealloc_info(unsigned char key) { unsigned int zone = MEMZONE_XEN; unsigned long n, total = 0; printk("Physical memory information:\n"); printk(" Xen heap: %lukB free\n", avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10)); while ( ++zone < NR_ZONES ) { if ( (zone + PAGE_SHIFT) == dma_bitsize ) { printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10)); total = 0; } if ( (n = avail_heap_pages(zone, zone, -1)) != 0 ) { total += n; printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10)); } } printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10)); } static struct keyhandler pagealloc_info_keyhandler = { .diagnostic = 1, .u.fn = pagealloc_info, .desc = "memory info" }; static __init int pagealloc_keyhandler_init(void) { register_keyhandler('m', &pagealloc_info_keyhandler); return 0; } __initcall(pagealloc_keyhandler_init); void scrub_one_page(struct page_info *pg) { void *p; if ( unlikely(pg->count_info & PGC_broken) ) return; p = __map_domain_page(pg); #ifndef NDEBUG /* Avoid callers relying on allocations returning zeroed pages. */ memset(p, 0xc2, PAGE_SIZE); #else /* For a production build, clear_page() is the fastest way to scrub. */ clear_page(p); #endif unmap_domain_page(p); } static void dump_heap(unsigned char key) { s_time_t now = NOW(); int i, j; printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key, (u32)(now>>32), (u32)now); for ( i = 0; i < MAX_NUMNODES; i++ ) { if ( !avail[i] ) continue; for ( j = 0; j < NR_ZONES; j++ ) printk("heap[node=%d][zone=%d] -> %lu pages\n", i, j, avail[i][j]); } } static struct keyhandler dump_heap_keyhandler = { .diagnostic = 1, .u.fn = dump_heap, .desc = "dump heap info" }; static __init int register_heap_trigger(void) { register_keyhandler('H', &dump_heap_keyhandler); return 0; } __initcall(register_heap_trigger); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/memory.c0000664000175000017500000006752112307313555014214 0ustar smbsmb/****************************************************************************** * memory.c * * Code to handle memory-related requests. * * Copyright (c) 2003-2004, B Dragovic * Copyright (c) 2003-2005, K A Fraser */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef is_domain_direct_mapped # define is_domain_direct_mapped(d) ((void)(d), 0) #endif struct memop_args { /* INPUT */ struct domain *domain; /* Domain to be affected. */ XEN_GUEST_HANDLE(xen_pfn_t) extent_list; /* List of extent base addrs. */ unsigned int nr_extents; /* Number of extents to allocate or free. */ unsigned int extent_order; /* Size of each extent. */ unsigned int memflags; /* Allocation flags. */ /* INPUT/OUTPUT */ unsigned int nr_done; /* Number of extents processed so far. */ int preempted; /* Was the hypercall preempted? 
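 *
 * When this flag is set, do_memory_op (later in this file) packs the
 * progress recorded in nr_done into the hypercall command word and
 * returns a continuation.  A hedged sketch of that encoding, assuming
 * MEMOP_EXTENT_SHIFT is 6 as in the public memory interface:
 *
 *   #define MEMOP_EXTENT_SHIFT 6                          // assumed value
 *   #define MEMOP_CMD_MASK ((1UL << MEMOP_EXTENT_SHIFT) - 1)
 *
 *   static unsigned long encode_continuation(unsigned int op,
 *                                            unsigned long nr_done)
 *   {
 *       return op | (nr_done << MEMOP_EXTENT_SHIFT);
 *   }
 *
 *   static unsigned int op_of(unsigned long cmd)
 *   {
 *       return cmd & MEMOP_CMD_MASK;
 *   }
 *
 *   static unsigned long start_extent_of(unsigned long cmd)
 *   {
 *       return cmd >> MEMOP_EXTENT_SHIFT;   // where to resume the loop
 *   }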
*/ }; static void increase_reservation(struct memop_args *a) { struct page_info *page; unsigned long i; xen_pfn_t mfn; struct domain *d = a->domain; if ( !guest_handle_is_null(a->extent_list) && !guest_handle_subrange_okay(a->extent_list, a->nr_done, a->nr_extents-1) ) return; if ( !multipage_allocation_permitted(current->domain, a->extent_order) ) return; for ( i = a->nr_done; i < a->nr_extents; i++ ) { if ( hypercall_preempt_check() ) { a->preempted = 1; goto out; } page = alloc_domheap_pages(d, a->extent_order, a->memflags); if ( unlikely(page == NULL) ) { gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: " "id=%d memflags=%x (%ld of %d)\n", a->extent_order, d->domain_id, a->memflags, i, a->nr_extents); goto out; } /* Inform the domain of the new page's machine address. */ if ( !guest_handle_is_null(a->extent_list) ) { mfn = page_to_mfn(page); if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) ) goto out; } } out: a->nr_done = i; } static void populate_physmap(struct memop_args *a) { struct page_info *page; unsigned long i, j; xen_pfn_t gpfn, mfn; struct domain *d = a->domain; if ( !guest_handle_subrange_okay(a->extent_list, a->nr_done, a->nr_extents-1) ) return; if ( a->memflags & MEMF_populate_on_demand ? a->extent_order > MAX_ORDER : !multipage_allocation_permitted(current->domain, a->extent_order) ) return; for ( i = a->nr_done; i < a->nr_extents; i++ ) { if ( hypercall_preempt_check() ) { a->preempted = 1; goto out; } if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) ) goto out; if ( a->memflags & MEMF_populate_on_demand ) { if ( guest_physmap_mark_populate_on_demand(d, gpfn, a->extent_order) < 0 ) goto out; } else { if ( is_domain_direct_mapped(d) ) { mfn = gpfn; if ( !mfn_valid(mfn) ) { gdprintk(XENLOG_INFO, "Invalid mfn %#"PRI_xen_pfn"\n", mfn); goto out; } page = mfn_to_page(mfn); if ( !get_page(page, d) ) { gdprintk(XENLOG_INFO, "mfn %#"PRI_xen_pfn" doesn't belong to the" " domain\n", mfn); goto out; } put_page(page); } else page = alloc_domheap_pages(d, a->extent_order, a->memflags); if ( unlikely(page == NULL) ) { if ( !opt_tmem || (a->extent_order != 0) ) gdprintk(XENLOG_INFO, "Could not allocate order=%d extent:" " id=%d memflags=%x (%ld of %d)\n", a->extent_order, d->domain_id, a->memflags, i, a->nr_extents); goto out; } mfn = page_to_mfn(page); guest_physmap_add_page(d, gpfn, mfn, a->extent_order); if ( !paging_mode_translate(d) ) { for ( j = 0; j < (1 << a->extent_order); j++ ) set_gpfn_from_mfn(mfn + j, gpfn + j); /* Inform the domain of the new page's machine address. */ if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) ) goto out; } } } out: a->nr_done = i; } int guest_remove_page(struct domain *d, unsigned long gmfn) { struct page_info *page; #ifdef CONFIG_X86 p2m_type_t p2mt; #endif unsigned long mfn; #ifdef CONFIG_X86 mfn = mfn_x(get_gfn_query(d, gmfn, &p2mt)); if ( unlikely(p2m_is_paging(p2mt)) ) { guest_physmap_remove_page(d, gmfn, mfn, 0); put_gfn(d, gmfn); /* If the page hasn't yet been paged out, there is an * actual page that needs to be released. 
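 *
 * "Released" here means dropping the allocation reference taken when the
 * page was assigned to the domain.  The test-and-clear of _PGC_allocated
 * just below is the idiom used throughout this file so that, even if two
 * paths race to free the same gfn, only the one that actually clears the
 * bit performs the matching put_page().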
*/ if ( p2mt == p2m_ram_paging_out ) { ASSERT(mfn_valid(mfn)); page = mfn_to_page(mfn); if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); } p2m_mem_paging_drop_page(d, gmfn, p2mt); return 1; } #else mfn = gmfn_to_mfn(d, gmfn); #endif if ( unlikely(!mfn_valid(mfn)) ) { put_gfn(d, gmfn); gdprintk(XENLOG_INFO, "Domain %u page number %lx invalid\n", d->domain_id, gmfn); return 0; } #ifdef CONFIG_X86 if ( p2m_is_shared(p2mt) ) { /* Unshare the page, bail out on error. We unshare because * we might be the only one using this shared page, and we * need to trigger proper cleanup. Once done, this is * like any other page. */ if ( mem_sharing_unshare_page(d, gmfn, 0) ) { put_gfn(d, gmfn); (void)mem_sharing_notify_enomem(d, gmfn, 0); return 0; } /* Maybe the mfn changed */ mfn = mfn_x(get_gfn_query_unlocked(d, gmfn, &p2mt)); ASSERT(!p2m_is_shared(p2mt)); } #endif /* CONFIG_X86 */ page = mfn_to_page(mfn); if ( unlikely(!get_page(page, d)) ) { put_gfn(d, gmfn); gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id); return 0; } if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) put_page_and_type(page); if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); guest_physmap_remove_page(d, gmfn, mfn, 0); put_page(page); put_gfn(d, gmfn); return 1; } static void decrease_reservation(struct memop_args *a) { unsigned long i, j; xen_pfn_t gmfn; if ( !guest_handle_subrange_okay(a->extent_list, a->nr_done, a->nr_extents-1) || a->extent_order > MAX_ORDER ) return; for ( i = a->nr_done; i < a->nr_extents; i++ ) { if ( hypercall_preempt_check() ) { a->preempted = 1; goto out; } if ( unlikely(__copy_from_guest_offset(&gmfn, a->extent_list, i, 1)) ) goto out; if ( tb_init_done ) { struct { u64 gfn; int d:16,order:16; } t; t.gfn = gmfn; t.d = a->domain->domain_id; t.order = a->extent_order; __trace_var(TRC_MEM_DECREASE_RESERVATION, 0, sizeof(t), &t); } /* See if populate-on-demand wants to handle this */ if ( is_hvm_domain(a->domain) && p2m_pod_decrease_reservation(a->domain, gmfn, a->extent_order) ) continue; /* With the lack for iommu on some ARM platform, domain with DMA-capable * device must retrieve the same pfn when the hypercall * populate_physmap is called. */ if ( is_domain_direct_mapped(a->domain) ) continue; for ( j = 0; j < (1 << a->extent_order); j++ ) if ( !guest_remove_page(a->domain, gmfn + j) ) goto out; } out: a->nr_done = i; } static long memory_exchange(XEN_GUEST_HANDLE_PARAM(xen_memory_exchange_t) arg) { struct xen_memory_exchange exch; PAGE_LIST_HEAD(in_chunk_list); PAGE_LIST_HEAD(out_chunk_list); unsigned long in_chunk_order, out_chunk_order; xen_pfn_t gpfn, gmfn, mfn; unsigned long i, j, k = 0; /* gcc ... */ unsigned int memflags = 0; long rc = 0; struct domain *d; struct page_info *page; if ( copy_from_guest(&exch, arg, 1) ) return -EFAULT; /* Various sanity checks. */ if ( (exch.nr_exchanged > exch.in.nr_extents) || /* Input and output domain identifiers match? */ (exch.in.domid != exch.out.domid) || /* Extent orders are sensible? */ (exch.in.extent_order > MAX_ORDER) || (exch.out.extent_order > MAX_ORDER) || /* Sizes of input and output lists do not overflow a long? */ ((~0UL >> exch.in.extent_order) < exch.in.nr_extents) || ((~0UL >> exch.out.extent_order) < exch.out.nr_extents) || /* Sizes of input and output lists match? 
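 *
 * Worked example of this check and of the chunk orders derived just
 * below (the numbers are purely illustrative): exchanging 512 order-0
 * input extents for one order-9 output extent gives 512 << 0 == 1 << 9
 * pages on each side, so the request passes with in_chunk_order = 9 and
 * out_chunk_order = 0, i.e. every iteration steals 512 input pages and
 * allocates one 512-page (2MiB with 4KiB pages) output extent.
 *
 *   // The same size test as a tiny self-contained helper:
 *   static int sizes_match(unsigned long in_nr, unsigned int in_order,
 *                          unsigned long out_nr, unsigned int out_order)
 *   {
 *       return (in_nr << in_order) == (out_nr << out_order);
 *   }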
*/ ((exch.in.nr_extents << exch.in.extent_order) != (exch.out.nr_extents << exch.out.extent_order)) ) { rc = -EINVAL; goto fail_early; } if ( !guest_handle_okay(exch.in.extent_start, exch.in.nr_extents) || !guest_handle_okay(exch.out.extent_start, exch.out.nr_extents) ) { rc = -EFAULT; goto fail_early; } /* Only privileged guests can allocate multi-page contiguous extents. */ if ( !multipage_allocation_permitted(current->domain, exch.in.extent_order) || !multipage_allocation_permitted(current->domain, exch.out.extent_order) ) { rc = -EPERM; goto fail_early; } if ( exch.in.extent_order <= exch.out.extent_order ) { in_chunk_order = exch.out.extent_order - exch.in.extent_order; out_chunk_order = 0; } else { in_chunk_order = 0; out_chunk_order = exch.in.extent_order - exch.out.extent_order; } d = rcu_lock_domain_by_any_id(exch.in.domid); if ( d == NULL ) { rc = -ESRCH; goto fail_early; } rc = xsm_memory_exchange(XSM_TARGET, d); if ( rc ) { rcu_unlock_domain(d); goto fail_early; } memflags |= MEMF_bits(domain_clamp_alloc_bitsize( d, XENMEMF_get_address_bits(exch.out.mem_flags) ? : (BITS_PER_LONG+PAGE_SHIFT))); memflags |= MEMF_node(XENMEMF_get_node(exch.out.mem_flags)); for ( i = (exch.nr_exchanged >> in_chunk_order); i < (exch.in.nr_extents >> in_chunk_order); i++ ) { if ( hypercall_preempt_check() ) { exch.nr_exchanged = i << in_chunk_order; rcu_unlock_domain(d); if ( __copy_field_to_guest(arg, &exch, nr_exchanged) ) return -EFAULT; return hypercall_create_continuation( __HYPERVISOR_memory_op, "lh", XENMEM_exchange, arg); } /* Steal a chunk's worth of input pages from the domain. */ for ( j = 0; j < (1UL << in_chunk_order); j++ ) { if ( unlikely(__copy_from_guest_offset( &gmfn, exch.in.extent_start, (i<count_info) ) BUG(); mfn = page_to_mfn(page); gfn = mfn_to_gmfn(d, mfn); /* Pages were unshared above */ BUG_ON(SHARED_M2P(gfn)); guest_physmap_remove_page(d, gfn, mfn, 0); put_page(page); } /* Assign each output page to the domain. */ for ( j = 0; (page = page_list_remove_head(&out_chunk_list)); ++j ) { if ( assign_pages(d, page, exch.out.extent_order, MEMF_no_refcount) ) { unsigned long dec_count; bool_t drop_dom_ref; /* * Pages in in_chunk_list is stolen without * decreasing the tot_pages. If the domain is dying when * assign pages, we need decrease the count. For those pages * that has been assigned, it should be covered by * domain_relinquish_resources(). */ dec_count = (((1UL << exch.in.extent_order) * (1UL << in_chunk_order)) - (j * (1UL << exch.out.extent_order))); spin_lock(&d->page_alloc_lock); drop_dom_ref = (dec_count && !domain_adjust_tot_pages(d, -dec_count)); spin_unlock(&d->page_alloc_lock); if ( drop_dom_ref ) put_domain(d); free_domheap_pages(page, exch.out.extent_order); goto dying; } if ( __copy_from_guest_offset(&gpfn, exch.out.extent_start, (i << out_chunk_order) + j, 1) ) { rc = -EFAULT; continue; } mfn = page_to_mfn(page); guest_physmap_add_page(d, gpfn, mfn, exch.out.extent_order); if ( !paging_mode_translate(d) ) { for ( k = 0; k < (1UL << exch.out.extent_order); k++ ) set_gpfn_from_mfn(mfn + k, gpfn + k); if ( __copy_to_guest_offset(exch.out.extent_start, (i << out_chunk_order) + j, &mfn, 1) ) rc = -EFAULT; } } BUG_ON( !(d->is_dying) && (j != (1UL << out_chunk_order)) ); } exch.nr_exchanged = exch.in.nr_extents; if ( __copy_field_to_guest(arg, &exch, nr_exchanged) ) rc = -EFAULT; rcu_unlock_domain(d); return rc; /* * Failed a chunk! Free any partial chunk work. Tell caller how many * chunks succeeded. */ fail: /* Reassign any input pages we managed to steal. 
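 *
 * Progress is reported back in whole chunks: the assignment
 * exch.nr_exchanged = i << in_chunk_order further down converts the index
 * of the first failed chunk back into a count of input extents, so the
 * caller learns exactly how many extents were fully exchanged before the
 * failure.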
*/ while ( (page = page_list_remove_head(&in_chunk_list)) ) { put_gfn(d, gmfn + k--); if ( assign_pages(d, page, 0, MEMF_no_refcount) ) BUG(); } dying: rcu_unlock_domain(d); /* Free any output pages we managed to allocate. */ while ( (page = page_list_remove_head(&out_chunk_list)) ) free_domheap_pages(page, exch.out.extent_order); exch.nr_exchanged = i << in_chunk_order; fail_early: if ( __copy_field_to_guest(arg, &exch, nr_exchanged) ) rc = -EFAULT; return rc; } static int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp, unsigned int start) { unsigned int done = 0; long rc = 0; if ( xatp->space != XENMAPSPACE_gmfn_range ) return xenmem_add_to_physmap_one(d, xatp->space, DOMID_INVALID, xatp->idx, xatp->gpfn); if ( xatp->size < start ) return -EILSEQ; xatp->idx += start; xatp->gpfn += start; xatp->size -= start; #ifdef HAS_PASSTHROUGH if ( need_iommu(d) ) this_cpu(iommu_dont_flush_iotlb) = 1; #endif while ( xatp->size > done ) { rc = xenmem_add_to_physmap_one(d, xatp->space, DOMID_INVALID, xatp->idx, xatp->gpfn); if ( rc < 0 ) break; xatp->idx++; xatp->gpfn++; /* Check for continuation if it's not the last iteration. */ if ( xatp->size > ++done && hypercall_preempt_check() ) { rc = start + done; break; } } #ifdef HAS_PASSTHROUGH if ( need_iommu(d) ) { this_cpu(iommu_dont_flush_iotlb) = 0; iommu_iotlb_flush(d, xatp->idx - done, done); iommu_iotlb_flush(d, xatp->gpfn - done, done); } #endif return rc; } static int xenmem_add_to_physmap_batch(struct domain *d, struct xen_add_to_physmap_batch *xatpb, unsigned int start) { unsigned int done = 0; int rc; if ( xatpb->size < start ) return -EILSEQ; guest_handle_add_offset(xatpb->idxs, start); guest_handle_add_offset(xatpb->gpfns, start); guest_handle_add_offset(xatpb->errs, start); xatpb->size -= start; while ( xatpb->size > done ) { xen_ulong_t idx; xen_pfn_t gpfn; if ( unlikely(__copy_from_guest_offset(&idx, xatpb->idxs, 0, 1)) ) { rc = -EFAULT; goto out; } if ( unlikely(__copy_from_guest_offset(&gpfn, xatpb->gpfns, 0, 1)) ) { rc = -EFAULT; goto out; } rc = xenmem_add_to_physmap_one(d, xatpb->space, xatpb->foreign_domid, idx, gpfn); if ( unlikely(__copy_to_guest_offset(xatpb->errs, 0, &rc, 1)) ) { rc = -EFAULT; goto out; } guest_handle_add_offset(xatpb->idxs, 1); guest_handle_add_offset(xatpb->gpfns, 1); guest_handle_add_offset(xatpb->errs, 1); /* Check for continuation if it's not the last iteration. */ if ( xatpb->size > ++done && hypercall_preempt_check() ) { rc = start + done; goto out; } } rc = 0; out: return rc; } long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) { struct domain *d; long rc; unsigned int address_bits; struct xen_memory_reservation reservation; struct memop_args args; domid_t domid; unsigned long start_extent = cmd >> MEMOP_EXTENT_SHIFT; int op = cmd & MEMOP_CMD_MASK; switch ( op ) { case XENMEM_increase_reservation: case XENMEM_decrease_reservation: case XENMEM_populate_physmap: if ( copy_from_guest(&reservation, arg, 1) ) return start_extent; /* Is size too large for us to encode a continuation? 
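 *
 * Worked numbers, assuming MEMOP_EXTENT_SHIFT is 6: the resume point is
 * carried in the upper bits of the command word, so at most
 * UINT_MAX >> 6 == 67,108,863 extents can be encoded; a larger request
 * could not be resumed after preemption and is therefore returned
 * unprocessed.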
*/ if ( reservation.nr_extents > (UINT_MAX >> MEMOP_EXTENT_SHIFT) ) return start_extent; if ( unlikely(start_extent >= reservation.nr_extents) ) return start_extent; args.extent_list = reservation.extent_start; args.nr_extents = reservation.nr_extents; args.extent_order = reservation.extent_order; args.nr_done = start_extent; args.preempted = 0; args.memflags = 0; address_bits = XENMEMF_get_address_bits(reservation.mem_flags); if ( (address_bits != 0) && (address_bits < (get_order_from_pages(max_page) + PAGE_SHIFT)) ) { if ( address_bits <= PAGE_SHIFT ) return start_extent; args.memflags = MEMF_bits(address_bits); } args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags)); if ( reservation.mem_flags & XENMEMF_exact_node_request ) args.memflags |= MEMF_exact_node; if ( op == XENMEM_populate_physmap && (reservation.mem_flags & XENMEMF_populate_on_demand) ) args.memflags |= MEMF_populate_on_demand; d = rcu_lock_domain_by_any_id(reservation.domid); if ( d == NULL ) return start_extent; args.domain = d; rc = xsm_memory_adjust_reservation(XSM_TARGET, current->domain, d); if ( rc ) { rcu_unlock_domain(d); return rc; } switch ( op ) { case XENMEM_increase_reservation: increase_reservation(&args); break; case XENMEM_decrease_reservation: decrease_reservation(&args); break; default: /* XENMEM_populate_physmap */ populate_physmap(&args); break; } rcu_unlock_domain(d); rc = args.nr_done; if ( args.preempted ) return hypercall_create_continuation( __HYPERVISOR_memory_op, "lh", op | (rc << MEMOP_EXTENT_SHIFT), arg); break; case XENMEM_exchange: rc = memory_exchange(guest_handle_cast(arg, xen_memory_exchange_t)); break; case XENMEM_maximum_ram_page: rc = max_page; break; case XENMEM_current_reservation: case XENMEM_maximum_reservation: case XENMEM_maximum_gpfn: if ( copy_from_guest(&domid, arg, 1) ) return -EFAULT; d = rcu_lock_domain_by_any_id(domid); if ( d == NULL ) return -ESRCH; rc = xsm_memory_stat_reservation(XSM_TARGET, current->domain, d); if ( rc ) { rcu_unlock_domain(d); return rc; } switch ( op ) { case XENMEM_current_reservation: rc = d->tot_pages; break; case XENMEM_maximum_reservation: rc = d->max_pages; break; default: ASSERT(op == XENMEM_maximum_gpfn); rc = domain_get_maximum_gpfn(d); break; } rcu_unlock_domain(d); break; case XENMEM_add_to_physmap: { struct xen_add_to_physmap xatp; BUILD_BUG_ON((typeof(xatp.size))-1 > (UINT_MAX >> MEMOP_EXTENT_SHIFT)); /* Check for malicious or buggy input. */ if ( start_extent != (typeof(xatp.size))start_extent ) return -EDOM; if ( copy_from_guest(&xatp, arg, 1) ) return -EFAULT; /* Foreign mapping is only possible via add_to_physmap_batch. */ if ( xatp.space == XENMAPSPACE_gmfn_foreign ) return -ENOSYS; d = rcu_lock_domain_by_any_id(xatp.domid); if ( d == NULL ) return -ESRCH; rc = xsm_add_to_physmap(XSM_TARGET, current->domain, d); if ( rc ) { rcu_unlock_domain(d); return rc; } rc = xenmem_add_to_physmap(d, &xatp, start_extent); rcu_unlock_domain(d); if ( xatp.space == XENMAPSPACE_gmfn_range && rc > 0 ) rc = hypercall_create_continuation( __HYPERVISOR_memory_op, "lh", op | (rc << MEMOP_EXTENT_SHIFT), arg); return rc; } case XENMEM_add_to_physmap_batch: { struct xen_add_to_physmap_batch xatpb; struct domain *d; BUILD_BUG_ON((typeof(xatpb.size))-1 > (UINT_MAX >> MEMOP_EXTENT_SHIFT)); /* Check for malicious or buggy input. 
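 *
 * The comparison below is a narrowing check: start_extent, carried in the
 * command word, must survive a round trip through the type of xatpb.size,
 * otherwise high bits would be silently dropped.  A generic, hedged
 * sketch of the same pattern using an illustrative 16-bit field width:
 *
 *   #include <stdbool.h>
 *   #include <stdint.h>
 *
 *   static bool fits_in_u16(unsigned long v)
 *   {
 *       return v == (uint16_t)v;   // false once v needs more than 16 bits
 *   }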
*/ if ( start_extent != (typeof(xatpb.size))start_extent ) return -EDOM; if ( copy_from_guest(&xatpb, arg, 1) || !guest_handle_okay(xatpb.idxs, xatpb.size) || !guest_handle_okay(xatpb.gpfns, xatpb.size) || !guest_handle_okay(xatpb.errs, xatpb.size) ) return -EFAULT; /* This mapspace is unsupported for this hypercall. */ if ( xatpb.space == XENMAPSPACE_gmfn_range ) return -EOPNOTSUPP; d = rcu_lock_domain_by_any_id(xatpb.domid); if ( d == NULL ) return -ESRCH; rc = xsm_add_to_physmap(XSM_TARGET, current->domain, d); if ( rc ) { rcu_unlock_domain(d); return rc; } rc = xenmem_add_to_physmap_batch(d, &xatpb, start_extent); rcu_unlock_domain(d); if ( rc > 0 ) rc = hypercall_create_continuation( __HYPERVISOR_memory_op, "lh", op | (rc << MEMOP_EXTENT_SHIFT), arg); return rc; } case XENMEM_remove_from_physmap: { struct xen_remove_from_physmap xrfp; struct page_info *page; struct domain *d; if ( copy_from_guest(&xrfp, arg, 1) ) return -EFAULT; d = rcu_lock_domain_by_any_id(xrfp.domid); if ( d == NULL ) return -ESRCH; rc = xsm_remove_from_physmap(XSM_TARGET, current->domain, d); if ( rc ) { rcu_unlock_domain(d); return rc; } page = get_page_from_gfn(d, xrfp.gpfn, NULL, P2M_ALLOC); if ( page ) { guest_physmap_remove_page(d, xrfp.gpfn, page_to_mfn(page), 0); put_page(page); } else rc = -ENOENT; rcu_unlock_domain(d); break; } case XENMEM_claim_pages: if ( copy_from_guest(&reservation, arg, 1) ) return -EFAULT; if ( !guest_handle_is_null(reservation.extent_start) ) return -EINVAL; if ( reservation.extent_order != 0 ) return -EINVAL; if ( reservation.mem_flags != 0 ) return -EINVAL; d = rcu_lock_domain_by_id(reservation.domid); if ( d == NULL ) return -EINVAL; rc = xsm_claim_pages(XSM_PRIV, d); if ( !rc ) rc = domain_set_outstanding_pages(d, reservation.nr_extents); rcu_unlock_domain(d); break; default: rc = arch_memory_op(op, arg); break; } return rc; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/grant_table.c0000664000175000017500000024667112307313555015173 0ustar smbsmb/****************************************************************************** * common/grant_table.c * * Mechanism for granting foreign access to page frames, and receiving * page-ownership transfers. * * Copyright (c) 2005-2006 Christopher Clark * Copyright (c) 2004 K A Fraser * Copyright (c) 2005 Andrew Warfield * Modifications by Geoffrey Lefebvre are (c) Intel Research Cambridge * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef max_nr_grant_frames unsigned int max_nr_grant_frames = DEFAULT_MAX_NR_GRANT_FRAMES; integer_param("gnttab_max_nr_frames", max_nr_grant_frames); #endif /* The maximum number of grant mappings is defined as a multiplier of the * maximum number of grant table entries. This defines the multiplier used. * Pretty arbitrary. [POLICY] */ #define MAX_MAPTRACK_TO_GRANTS_RATIO 8 /* * The first two members of a grant entry are updated as a combined pair. * The following union allows that to happen in an endian-neutral fashion. */ union grant_combo { uint32_t word; struct { uint16_t flags; domid_t domid; } shorts; }; /* Used to share code between unmap_grant_ref and unmap_and_replace. */ struct gnttab_unmap_common { /* Input */ uint64_t host_addr; uint64_t dev_bus_addr; uint64_t new_addr; grant_handle_t handle; /* Return */ int16_t status; /* Shared state beteen *_unmap and *_unmap_complete */ u16 flags; unsigned long frame; struct grant_mapping *map; struct domain *rd; }; /* Number of unmap operations that are done between each tlb flush */ #define GNTTAB_UNMAP_BATCH_SIZE 32 #define PIN_FAIL(_lbl, _rc, _f, _a...) \ do { \ gdprintk(XENLOG_WARNING, _f, ## _a ); \ rc = (_rc); \ goto _lbl; \ } while ( 0 ) #define MAPTRACK_PER_PAGE (PAGE_SIZE / sizeof(struct grant_mapping)) #define maptrack_entry(t, e) \ ((t)->maptrack[(e)/MAPTRACK_PER_PAGE][(e)%MAPTRACK_PER_PAGE]) static inline unsigned int nr_maptrack_frames(struct grant_table *t) { return t->maptrack_limit / MAPTRACK_PER_PAGE; } static unsigned inline int max_nr_maptrack_frames(void) { return (max_nr_grant_frames * MAX_MAPTRACK_TO_GRANTS_RATIO); } #define MAPTRACK_TAIL (~0u) #define SHGNT_PER_PAGE_V1 (PAGE_SIZE / sizeof(grant_entry_v1_t)) #define shared_entry_v1(t, e) \ ((t)->shared_v1[(e)/SHGNT_PER_PAGE_V1][(e)%SHGNT_PER_PAGE_V1]) #define SHGNT_PER_PAGE_V2 (PAGE_SIZE / sizeof(grant_entry_v2_t)) #define shared_entry_v2(t, e) \ ((t)->shared_v2[(e)/SHGNT_PER_PAGE_V2][(e)%SHGNT_PER_PAGE_V2]) #define STGNT_PER_PAGE (PAGE_SIZE / sizeof(grant_status_t)) #define status_entry(t, e) \ ((t)->status[(e)/STGNT_PER_PAGE][(e)%STGNT_PER_PAGE]) static grant_entry_header_t * shared_entry_header(struct grant_table *t, grant_ref_t ref) { ASSERT(t->gt_version != 0); if (t->gt_version == 1) return (grant_entry_header_t*)&shared_entry_v1(t, ref); else return &shared_entry_v2(t, ref).hdr; } /* Active grant entry - used for shadowing GTF_permit_access grants. */ struct active_grant_entry { u32 pin; /* Reference count information. */ domid_t domid; /* Domain being granted access. */ struct domain *trans_domain; uint32_t trans_gref; unsigned long frame; /* Frame being granted. */ unsigned long gfn; /* Guest's idea of the frame being granted. */ unsigned is_sub_page:1; /* True if this is a sub-page grant. */ unsigned start:15; /* For sub-page grants, the start offset in the page. */ unsigned length:16; /* For sub-page grants, the length of the grant. 
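 *
 * Layout note: the three bit-fields above pack into a single 32-bit word
 * (1 + 15 + 16 bits), which is enough because a sub-page grant never
 * describes an offset or length beyond a 4KiB page.  A hedged sketch of
 * the same packing; the assertion reflects the usual GCC layout and is an
 * assumption, not something guaranteed by the C standard:
 *
 *   struct subpage_demo {
 *       unsigned is_sub_page:1;
 *       unsigned start:15;
 *       unsigned length:16;
 *   };
 *   _Static_assert(sizeof(struct subpage_demo) == sizeof(unsigned),
 *                  "three bit-fields share one storage unit");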
*/ }; #define ACGNT_PER_PAGE (PAGE_SIZE / sizeof(struct active_grant_entry)) #define active_entry(t, e) \ ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE]) static inline unsigned int num_act_frames_from_sha_frames(const unsigned int num) { /* How many frames are needed for the active grant table, * given the size of the shared grant table? */ unsigned int sha_per_page = PAGE_SIZE / sizeof(grant_entry_v1_t); unsigned int num_sha_entries = num * sha_per_page; return (num_sha_entries + (ACGNT_PER_PAGE - 1)) / ACGNT_PER_PAGE; } #define max_nr_active_grant_frames \ num_act_frames_from_sha_frames(max_nr_grant_frames) static inline unsigned int nr_active_grant_frames(struct grant_table *gt) { return num_act_frames_from_sha_frames(nr_grant_frames(gt)); } /* Check if the page has been paged out, or needs unsharing. If rc == GNTST_okay, *page contains the page struct with a ref taken. Caller must do put_page(*page). If any error, *page = NULL, *frame = INVALID_MFN, no ref taken. */ static int __get_paged_frame(unsigned long gfn, unsigned long *frame, struct page_info **page, int readonly, struct domain *rd) { int rc = GNTST_okay; #if defined(P2M_PAGED_TYPES) || defined(P2M_SHARED_TYPES) p2m_type_t p2mt; *page = get_page_from_gfn(rd, gfn, &p2mt, (readonly) ? P2M_ALLOC : P2M_UNSHARE); if ( !(*page) ) { *frame = INVALID_MFN; if ( p2m_is_shared(p2mt) ) return GNTST_eagain; if ( p2m_is_paging(p2mt) ) { p2m_mem_paging_populate(rd, gfn); return GNTST_eagain; } return GNTST_bad_page; } *frame = page_to_mfn(*page); #else *frame = gmfn_to_mfn(rd, gfn); *page = mfn_valid(*frame) ? mfn_to_page(*frame) : NULL; if ( (!(*page)) || (!get_page(*page, rd)) ) { *frame = INVALID_MFN; *page = NULL; rc = GNTST_bad_page; } #endif return rc; } static inline void double_gt_lock(struct grant_table *lgt, struct grant_table *rgt) { if ( lgt < rgt ) { spin_lock(&lgt->lock); spin_lock(&rgt->lock); } else { if ( lgt != rgt ) spin_lock(&rgt->lock); spin_lock(&lgt->lock); } } static inline void double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt) { spin_unlock(&lgt->lock); if ( lgt != rgt ) spin_unlock(&rgt->lock); } static inline int __get_maptrack_handle( struct grant_table *t) { unsigned int h; if ( unlikely((h = t->maptrack_head) == MAPTRACK_TAIL) ) return -1; t->maptrack_head = maptrack_entry(t, h).ref; return h; } static inline void put_maptrack_handle( struct grant_table *t, int handle) { spin_lock(&t->lock); maptrack_entry(t, handle).ref = t->maptrack_head; t->maptrack_head = handle; spin_unlock(&t->lock); } static inline int get_maptrack_handle( struct grant_table *lgt) { int i; grant_handle_t handle; struct grant_mapping *new_mt; unsigned int new_mt_limit, nr_frames; spin_lock(&lgt->lock); while ( unlikely((handle = __get_maptrack_handle(lgt)) == -1) ) { nr_frames = nr_maptrack_frames(lgt); if ( nr_frames >= max_nr_maptrack_frames() ) break; new_mt = alloc_xenheap_page(); if ( !new_mt ) break; clear_page(new_mt); new_mt_limit = lgt->maptrack_limit + MAPTRACK_PER_PAGE; for ( i = 1; i < MAPTRACK_PER_PAGE; i++ ) new_mt[i - 1].ref = lgt->maptrack_limit + i; new_mt[i - 1].ref = lgt->maptrack_head; lgt->maptrack_head = lgt->maptrack_limit; lgt->maptrack[nr_frames] = new_mt; smp_wmb(); lgt->maptrack_limit = new_mt_limit; gdprintk(XENLOG_INFO, "Increased maptrack size to %u frames\n", nr_frames + 1); } spin_unlock(&lgt->lock); return handle; } /* Number of grant table entries. Caller must hold d's grant table lock. 
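 *
 * Worked numbers for the computation below (4KiB frames assumed): a v1
 * entry is 8 bytes and a v2 entry is 16 bytes, so each shared frame holds
 * 512 v1 entries or 256 v2 entries.  Assuming the default of 32 grant
 * frames (see gnttab_max_nr_frames above), that is 16,384 v1 or 8,192 v2
 * grant references per domain:
 *
 *   // (32 << 12) / 8 == 16384    and    (32 << 12) / 16 == 8192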
*/ static unsigned int nr_grant_entries(struct grant_table *gt) { ASSERT(gt->gt_version != 0); if (gt->gt_version == 1) return (nr_grant_frames(gt) << PAGE_SHIFT) / sizeof(grant_entry_v1_t); else return (nr_grant_frames(gt) << PAGE_SHIFT) / sizeof(grant_entry_v2_t); } static int _set_status_v1(domid_t domid, int readonly, int mapflag, grant_entry_header_t *shah, struct active_grant_entry *act) { int rc = GNTST_okay; union grant_combo scombo, prev_scombo, new_scombo; uint16_t mask = GTF_type_mask; /* * We bound the number of times we retry CMPXCHG on memory locations that * we share with a guest OS. The reason is that the guest can modify that * location at a higher rate than we can read-modify-CMPXCHG, so the guest * could cause us to livelock. There are a few cases where it is valid for * the guest to race our updates (e.g., to change the GTF_readonly flag), * so we allow a few retries before failing. */ int retries = 0; /* if this is a grant mapping operation we should ensure GTF_sub_page is not set */ if (mapflag) mask |= GTF_sub_page; scombo.word = *(u32 *)shah; /* * This loop attempts to set the access (reading/writing) flags * in the grant table entry. It tries a cmpxchg on the field * up to five times, and then fails under the assumption that * the guest is misbehaving. */ for ( ; ; ) { /* If not already pinned, check the grant domid and type. */ if ( !act->pin && (((scombo.shorts.flags & mask) != GTF_permit_access) || (scombo.shorts.domid != domid)) ) PIN_FAIL(done, GNTST_general_error, "Bad flags (%x) or dom (%d). (expected dom %d)\n", scombo.shorts.flags, scombo.shorts.domid, domid); new_scombo = scombo; new_scombo.shorts.flags |= GTF_reading; if ( !readonly ) { new_scombo.shorts.flags |= GTF_writing; if ( unlikely(scombo.shorts.flags & GTF_readonly) ) PIN_FAIL(done, GNTST_general_error, "Attempt to write-pin a r/o grant entry.\n"); } prev_scombo.word = cmpxchg((u32 *)shah, scombo.word, new_scombo.word); if ( likely(prev_scombo.word == scombo.word) ) break; if ( retries++ == 4 ) PIN_FAIL(done, GNTST_general_error, "Shared grant entry is unstable.\n"); scombo = prev_scombo; } done: return rc; } static int _set_status_v2(domid_t domid, int readonly, int mapflag, grant_entry_header_t *shah, struct active_grant_entry *act, grant_status_t *status) { int rc = GNTST_okay; union grant_combo scombo; uint16_t flags = shah->flags; domid_t id = shah->domid; uint16_t mask = GTF_type_mask; /* we read flags and domid in a single memory access. this avoids the need for another memory barrier to ensure access to these fields are not reordered */ scombo.word = *(u32 *)shah; barrier(); /* but we still need to stop the compiler from turning it back into two reads */ flags = scombo.shorts.flags; id = scombo.shorts.domid; /* if this is a grant mapping operation we should ensure GTF_sub_page is not set */ if (mapflag) mask |= GTF_sub_page; /* If not already pinned, check the grant domid and type. */ if ( !act->pin && ( (((flags & mask) != GTF_permit_access) && ((flags & mask) != GTF_transitive)) || (id != domid)) ) PIN_FAIL(done, GNTST_general_error, "Bad flags (%x) or dom (%d). 
(expected dom %d, flags %x)\n", flags, id, domid, mask); if ( readonly ) { *status |= GTF_reading; } else { if ( unlikely(flags & GTF_readonly) ) PIN_FAIL(done, GNTST_general_error, "Attempt to write-pin a r/o grant entry.\n"); *status |= GTF_reading | GTF_writing; } /* Make sure guest sees status update before checking if flags are still valid */ smp_mb(); scombo.word = *(u32 *)shah; barrier(); flags = scombo.shorts.flags; id = scombo.shorts.domid; if ( !act->pin ) { if ( (((flags & mask) != GTF_permit_access) && ((flags & mask) != GTF_transitive)) || (id != domid) || (!readonly && (flags & GTF_readonly)) ) { gnttab_clear_flag(_GTF_writing, status); gnttab_clear_flag(_GTF_reading, status); PIN_FAIL(done, GNTST_general_error, "Unstable flags (%x) or dom (%d). (expected dom %d) " "(r/w: %d)\n", flags, id, domid, !readonly); } } else { if ( unlikely(flags & GTF_readonly) ) { gnttab_clear_flag(_GTF_writing, status); PIN_FAIL(done, GNTST_general_error, "Unstable grant readonly flag\n"); } } done: return rc; } static int _set_status(unsigned gt_version, domid_t domid, int readonly, int mapflag, grant_entry_header_t *shah, struct active_grant_entry *act, grant_status_t *status) { if (gt_version == 1) return _set_status_v1(domid, readonly, mapflag, shah, act); else return _set_status_v2(domid, readonly, mapflag, shah, act, status); } static void mapcount( struct grant_table *lgt, struct domain *rd, unsigned long mfn, unsigned int *wrc, unsigned int *rdc) { struct grant_mapping *map; grant_handle_t handle; *wrc = *rdc = 0; for ( handle = 0; handle < lgt->maptrack_limit; handle++ ) { map = &maptrack_entry(lgt, handle); if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) || map->domid != rd->domain_id ) continue; if ( active_entry(rd->grant_table, map->ref).frame == mfn ) (map->flags & GNTMAP_readonly) ? (*rdc)++ : (*wrc)++; } } /* * Returns 0 if TLB flush / invalidate required by caller. * va will indicate the address to be invalidated. * * addr is _either_ a host virtual address, or the address of the pte to * update, as indicated by the GNTMAP_contains_pte flag. 
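 *
 * Note on the pin accounting used below: act->pin packs four 8-bit
 * reference counts (host/device mappings crossed with read-only/writable),
 * which is why a new mapping is refused while (act->pin & 0x80808080U) is
 * non-zero: once any of the four counters reaches 128, a further
 * increment could carry into its neighbour.  A hedged sketch of the
 * packing; the shift positions follow the conventional layout and are an
 * assumption here, not copied from the headers:
 *
 *   #define PIN_HSTW_INC (1u << 0)    // host mapping, writable
 *   #define PIN_HSTR_INC (1u << 8)    // host mapping, read-only
 *   #define PIN_DEVW_INC (1u << 16)   // device mapping, writable
 *   #define PIN_DEVR_INC (1u << 24)   // device mapping, read-only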
*/ static void __gnttab_map_grant_ref( struct gnttab_map_grant_ref *op) { struct domain *ld, *rd, *owner = NULL; struct grant_table *lgt, *rgt; struct vcpu *led; int handle; unsigned long frame = 0, nr_gets = 0; struct page_info *pg = NULL; int rc = GNTST_okay; u32 old_pin; u32 act_pin; unsigned int cache_flags; struct active_grant_entry *act = NULL; struct grant_mapping *mt; grant_entry_v1_t *sha1; grant_entry_v2_t *sha2; grant_entry_header_t *shah; uint16_t *status; led = current; ld = led->domain; if ( unlikely((op->flags & (GNTMAP_device_map|GNTMAP_host_map)) == 0) ) { gdprintk(XENLOG_INFO, "Bad flags in grant map op (%x).\n", op->flags); op->status = GNTST_bad_gntref; return; } if ( unlikely(paging_mode_external(ld) && (op->flags & (GNTMAP_device_map|GNTMAP_application_map| GNTMAP_contains_pte))) ) { gdprintk(XENLOG_INFO, "No device mapping in HVM domain.\n"); op->status = GNTST_general_error; return; } if ( unlikely((rd = rcu_lock_domain_by_id(op->dom)) == NULL) ) { gdprintk(XENLOG_INFO, "Could not find domain %d\n", op->dom); op->status = GNTST_bad_domain; return; } rc = xsm_grant_mapref(XSM_HOOK, ld, rd, op->flags); if ( rc ) { rcu_unlock_domain(rd); op->status = GNTST_permission_denied; return; } lgt = ld->grant_table; if ( unlikely((handle = get_maptrack_handle(lgt)) == -1) ) { rcu_unlock_domain(rd); gdprintk(XENLOG_INFO, "Failed to obtain maptrack handle.\n"); op->status = GNTST_no_device_space; return; } rgt = rd->grant_table; spin_lock(&rgt->lock); if ( rgt->gt_version == 0 ) PIN_FAIL(unlock_out, GNTST_general_error, "remote grant table not yet set up"); /* Bounds check on the grant ref */ if ( unlikely(op->ref >= nr_grant_entries(rgt))) PIN_FAIL(unlock_out, GNTST_bad_gntref, "Bad ref (%d).\n", op->ref); act = &active_entry(rgt, op->ref); shah = shared_entry_header(rgt, op->ref); if (rgt->gt_version == 1) { sha1 = &shared_entry_v1(rgt, op->ref); sha2 = NULL; status = &shah->flags; } else { sha2 = &shared_entry_v2(rgt, op->ref); sha1 = NULL; status = &status_entry(rgt, op->ref); } /* If already pinned, check the active domid and avoid refcnt overflow. */ if ( act->pin && ((act->domid != ld->domain_id) || (act->pin & 0x80808080U) != 0 || (act->is_sub_page)) ) PIN_FAIL(unlock_out, GNTST_general_error, "Bad domain (%d != %d), or risk of counter overflow %08x, or subpage %d\n", act->domid, ld->domain_id, act->pin, act->is_sub_page); if ( !act->pin || (!(op->flags & GNTMAP_readonly) && !(act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask))) ) { if ( (rc = _set_status(rgt->gt_version, ld->domain_id, op->flags & GNTMAP_readonly, 1, shah, act, status) ) != GNTST_okay ) goto unlock_out; if ( !act->pin ) { unsigned long frame; unsigned long gfn = sha1 ? sha1->frame : sha2->full_page.frame; rc = __get_paged_frame(gfn, &frame, &pg, !!(op->flags & GNTMAP_readonly), rd); if ( rc != GNTST_okay ) goto unlock_out_clear; act->gfn = gfn; act->domid = ld->domain_id; act->frame = frame; act->start = 0; act->length = PAGE_SIZE; act->is_sub_page = 0; act->trans_domain = rd; act->trans_gref = op->ref; } } old_pin = act->pin; if ( op->flags & GNTMAP_device_map ) act->pin += (op->flags & GNTMAP_readonly) ? GNTPIN_devr_inc : GNTPIN_devw_inc; if ( op->flags & GNTMAP_host_map ) act->pin += (op->flags & GNTMAP_readonly) ? GNTPIN_hstr_inc : GNTPIN_hstw_inc; frame = act->frame; act_pin = act->pin; cache_flags = (shah->flags & (GTF_PAT | GTF_PWT | GTF_PCD) ); spin_unlock(&rgt->lock); /* pg may be set, with a refcount included, from __get_paged_frame */ if ( !pg ) { pg = mfn_valid(frame) ? 
mfn_to_page(frame) : NULL; if ( pg ) owner = page_get_owner_and_reference(pg); } else owner = page_get_owner(pg); if ( !pg || (owner == dom_io) ) { /* Only needed the reference to confirm dom_io ownership. */ if ( pg ) put_page(pg); if ( paging_mode_external(ld) ) { gdprintk(XENLOG_WARNING, "HVM guests can't grant map iomem\n"); rc = GNTST_general_error; goto undo_out; } if ( !iomem_access_permitted(rd, frame, frame) ) { gdprintk(XENLOG_WARNING, "Iomem mapping not permitted %lx (domain %d)\n", frame, rd->domain_id); rc = GNTST_general_error; goto undo_out; } rc = create_grant_host_mapping( op->host_addr, frame, op->flags, cache_flags); if ( rc != GNTST_okay ) goto undo_out; } else if ( owner == rd || owner == dom_cow ) { if ( gnttab_host_mapping_get_page_type(op, ld, rd) ) { if ( (owner == dom_cow) || !get_page_type(pg, PGT_writable_page) ) goto could_not_pin; } nr_gets++; if ( op->flags & GNTMAP_host_map ) { rc = create_grant_host_mapping(op->host_addr, frame, op->flags, 0); if ( rc != GNTST_okay ) goto undo_out; if ( op->flags & GNTMAP_device_map ) { nr_gets++; (void)get_page(pg, rd); if ( !(op->flags & GNTMAP_readonly) ) get_page_type(pg, PGT_writable_page); } } } else { could_not_pin: if ( !rd->is_dying ) gdprintk(XENLOG_WARNING, "Could not pin grant frame %lx\n", frame); if ( owner != NULL ) put_page(pg); rc = GNTST_general_error; goto undo_out; } double_gt_lock(lgt, rgt); if ( is_pv_domain(ld) && need_iommu(ld) ) { unsigned int wrc, rdc; int err = 0; /* Shouldn't happen, because you can't use iommu in a HVM domain. */ BUG_ON(paging_mode_translate(ld)); /* We're not translated, so we know that gmfns and mfns are the same things, so the IOMMU entry is always 1-to-1. */ mapcount(lgt, rd, frame, &wrc, &rdc); if ( (act_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) && !(old_pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) ) { if ( wrc == 0 ) err = iommu_map_page(ld, frame, frame, IOMMUF_readable|IOMMUF_writable); } else if ( act_pin && !old_pin ) { if ( (wrc + rdc) == 0 ) err = iommu_map_page(ld, frame, frame, IOMMUF_readable); } if ( err ) { double_gt_unlock(lgt, rgt); rc = GNTST_general_error; goto undo_out; } } TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom); mt = &maptrack_entry(lgt, handle); mt->domid = op->dom; mt->ref = op->ref; mt->flags = op->flags; double_gt_unlock(lgt, rgt); op->dev_bus_addr = (u64)frame << PAGE_SHIFT; op->handle = handle; op->status = GNTST_okay; rcu_unlock_domain(rd); return; undo_out: if ( nr_gets > 1 ) { if ( !(op->flags & GNTMAP_readonly) ) put_page_type(pg); put_page(pg); } if ( nr_gets > 0 ) { if ( gnttab_host_mapping_get_page_type(op, ld, rd) ) put_page_type(pg); put_page(pg); } spin_lock(&rgt->lock); act = &active_entry(rgt, op->ref); if ( op->flags & GNTMAP_device_map ) act->pin -= (op->flags & GNTMAP_readonly) ? GNTPIN_devr_inc : GNTPIN_devw_inc; if ( op->flags & GNTMAP_host_map ) act->pin -= (op->flags & GNTMAP_readonly) ? 
GNTPIN_hstr_inc : GNTPIN_hstw_inc; unlock_out_clear: if ( !(op->flags & GNTMAP_readonly) && !(act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) ) gnttab_clear_flag(_GTF_writing, status); if ( !act->pin ) gnttab_clear_flag(_GTF_reading, status); unlock_out: spin_unlock(&rgt->lock); op->status = rc; put_maptrack_handle(lgt, handle); rcu_unlock_domain(rd); } static long gnttab_map_grant_ref( XEN_GUEST_HANDLE_PARAM(gnttab_map_grant_ref_t) uop, unsigned int count) { int i; struct gnttab_map_grant_ref op; for ( i = 0; i < count; i++ ) { if (i && hypercall_preempt_check()) return i; if ( unlikely(__copy_from_guest_offset(&op, uop, i, 1)) ) return -EFAULT; __gnttab_map_grant_ref(&op); if ( unlikely(__copy_to_guest_offset(uop, i, &op, 1)) ) return -EFAULT; } return 0; } static void __gnttab_unmap_common( struct gnttab_unmap_common *op) { domid_t dom; struct domain *ld, *rd; struct grant_table *lgt, *rgt; struct active_grant_entry *act; s16 rc = 0; ld = current->domain; lgt = ld->grant_table; op->frame = (unsigned long)(op->dev_bus_addr >> PAGE_SHIFT); if ( unlikely(op->handle >= lgt->maptrack_limit) ) { gdprintk(XENLOG_INFO, "Bad handle (%d).\n", op->handle); op->status = GNTST_bad_handle; return; } op->map = &maptrack_entry(lgt, op->handle); spin_lock(&lgt->lock); if ( unlikely(!op->map->flags) ) { spin_unlock(&lgt->lock); gdprintk(XENLOG_INFO, "Zero flags for handle (%d).\n", op->handle); op->status = GNTST_bad_handle; return; } dom = op->map->domid; spin_unlock(&lgt->lock); if ( unlikely((rd = rcu_lock_domain_by_id(dom)) == NULL) ) { /* This can happen when a grant is implicitly unmapped. */ gdprintk(XENLOG_INFO, "Could not find domain %d\n", dom); domain_crash(ld); /* naughty... */ return; } rc = xsm_grant_unmapref(XSM_HOOK, ld, rd); if ( rc ) { rcu_unlock_domain(rd); op->status = GNTST_permission_denied; return; } TRACE_1D(TRC_MEM_PAGE_GRANT_UNMAP, dom); rgt = rd->grant_table; double_gt_lock(lgt, rgt); op->flags = op->map->flags; if ( unlikely(!op->flags) || unlikely(op->map->domid != dom) ) { gdprintk(XENLOG_WARNING, "Unstable handle %u\n", op->handle); rc = GNTST_bad_handle; goto unmap_out; } op->rd = rd; act = &active_entry(rgt, op->map->ref); if ( op->frame == 0 ) { op->frame = act->frame; } else { if ( unlikely(op->frame != act->frame) ) PIN_FAIL(unmap_out, GNTST_general_error, "Bad frame number doesn't match gntref. 
(%lx != %lx)\n", op->frame, act->frame); if ( op->flags & GNTMAP_device_map ) { ASSERT(act->pin & (GNTPIN_devw_mask | GNTPIN_devr_mask)); op->map->flags &= ~GNTMAP_device_map; if ( op->flags & GNTMAP_readonly ) act->pin -= GNTPIN_devr_inc; else act->pin -= GNTPIN_devw_inc; } } if ( (op->host_addr != 0) && (op->flags & GNTMAP_host_map) ) { if ( (rc = replace_grant_host_mapping(op->host_addr, op->frame, op->new_addr, op->flags)) < 0 ) goto unmap_out; ASSERT(act->pin & (GNTPIN_hstw_mask | GNTPIN_hstr_mask)); op->map->flags &= ~GNTMAP_host_map; if ( op->flags & GNTMAP_readonly ) act->pin -= GNTPIN_hstr_inc; else act->pin -= GNTPIN_hstw_inc; } if ( is_pv_domain(ld) && need_iommu(ld) ) { unsigned int wrc, rdc; int err = 0; BUG_ON(paging_mode_translate(ld)); mapcount(lgt, rd, op->frame, &wrc, &rdc); if ( (wrc + rdc) == 0 ) err = iommu_unmap_page(ld, op->frame); else if ( wrc == 0 ) err = iommu_map_page(ld, op->frame, op->frame, IOMMUF_readable); if ( err ) { rc = GNTST_general_error; goto unmap_out; } } /* If just unmapped a writable mapping, mark as dirtied */ if ( !(op->flags & GNTMAP_readonly) ) gnttab_mark_dirty(rd, op->frame); unmap_out: double_gt_unlock(lgt, rgt); op->status = rc; rcu_unlock_domain(rd); } static void __gnttab_unmap_common_complete(struct gnttab_unmap_common *op) { struct domain *ld, *rd = op->rd; struct grant_table *rgt; struct active_grant_entry *act; grant_entry_header_t *sha; struct page_info *pg; uint16_t *status; bool_t put_handle = 0; if ( rd == NULL ) { /* * Suggests that __gntab_unmap_common failed in * rcu_lock_domain_by_id() or earlier, and so we have nothing * to complete */ return; } ld = current->domain; rcu_lock_domain(rd); rgt = rd->grant_table; spin_lock(&rgt->lock); if ( rgt->gt_version == 0 ) goto unmap_out; act = &active_entry(rgt, op->map->ref); sha = shared_entry_header(rgt, op->map->ref); if ( rgt->gt_version == 1 ) status = &sha->flags; else status = &status_entry(rgt, op->map->ref); if ( unlikely(op->frame != act->frame) ) { /* * Suggests that __gntab_unmap_common failed early and so * nothing further to do */ goto unmap_out; } pg = mfn_to_page(op->frame); if ( op->flags & GNTMAP_device_map ) { if ( !is_iomem_page(act->frame) ) { if ( op->flags & GNTMAP_readonly ) put_page(pg); else put_page_and_type(pg); } } if ( (op->host_addr != 0) && (op->flags & GNTMAP_host_map) ) { if ( op->status != 0 ) { /* * Suggests that __gntab_unmap_common failed in * replace_grant_host_mapping() so nothing further to do */ goto unmap_out; } if ( !is_iomem_page(op->frame) ) { if ( gnttab_host_mapping_get_page_type(op, ld, rd) ) put_page_type(pg); put_page(pg); } } if ( (op->map->flags & (GNTMAP_device_map|GNTMAP_host_map)) == 0 ) put_handle = 1; if ( ((act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0) && !(op->flags & GNTMAP_readonly) ) gnttab_clear_flag(_GTF_writing, status); if ( act->pin == 0 ) gnttab_clear_flag(_GTF_reading, status); unmap_out: spin_unlock(&rgt->lock); if ( put_handle ) { op->map->flags = 0; put_maptrack_handle(ld->grant_table, op->handle); } rcu_unlock_domain(rd); } static void __gnttab_unmap_grant_ref( struct gnttab_unmap_grant_ref *op, struct gnttab_unmap_common *common) { common->host_addr = op->host_addr; common->dev_bus_addr = op->dev_bus_addr; common->handle = op->handle; /* Intialise these in case common contains old state */ common->new_addr = 0; common->rd = NULL; __gnttab_unmap_common(common); op->status = common->status; } static long gnttab_unmap_grant_ref( XEN_GUEST_HANDLE_PARAM(gnttab_unmap_grant_ref_t) uop, unsigned int count) { 
int i, c, partial_done, done = 0; struct gnttab_unmap_grant_ref op; struct gnttab_unmap_common common[GNTTAB_UNMAP_BATCH_SIZE]; while ( count != 0 ) { c = min(count, (unsigned int)GNTTAB_UNMAP_BATCH_SIZE); partial_done = 0; for ( i = 0; i < c; i++ ) { if ( unlikely(__copy_from_guest(&op, uop, 1)) ) goto fault; __gnttab_unmap_grant_ref(&op, &(common[i])); ++partial_done; if ( unlikely(__copy_field_to_guest(uop, &op, status)) ) goto fault; guest_handle_add_offset(uop, 1); } flush_tlb_mask(current->domain->domain_dirty_cpumask); for ( i = 0; i < partial_done; i++ ) __gnttab_unmap_common_complete(&(common[i])); count -= c; done += c; if (count && hypercall_preempt_check()) return done; } return 0; fault: flush_tlb_mask(current->domain->domain_dirty_cpumask); for ( i = 0; i < partial_done; i++ ) __gnttab_unmap_common_complete(&(common[i])); return -EFAULT; } static void __gnttab_unmap_and_replace( struct gnttab_unmap_and_replace *op, struct gnttab_unmap_common *common) { common->host_addr = op->host_addr; common->new_addr = op->new_addr; common->handle = op->handle; /* Intialise these in case common contains old state */ common->dev_bus_addr = 0; common->rd = NULL; __gnttab_unmap_common(common); op->status = common->status; } static long gnttab_unmap_and_replace( XEN_GUEST_HANDLE_PARAM(gnttab_unmap_and_replace_t) uop, unsigned int count) { int i, c, partial_done, done = 0; struct gnttab_unmap_and_replace op; struct gnttab_unmap_common common[GNTTAB_UNMAP_BATCH_SIZE]; while ( count != 0 ) { c = min(count, (unsigned int)GNTTAB_UNMAP_BATCH_SIZE); partial_done = 0; for ( i = 0; i < c; i++ ) { if ( unlikely(__copy_from_guest(&op, uop, 1)) ) goto fault; __gnttab_unmap_and_replace(&op, &(common[i])); ++partial_done; if ( unlikely(__copy_field_to_guest(uop, &op, status)) ) goto fault; guest_handle_add_offset(uop, 1); } flush_tlb_mask(current->domain->domain_dirty_cpumask); for ( i = 0; i < partial_done; i++ ) __gnttab_unmap_common_complete(&(common[i])); count -= c; done += c; if (count && hypercall_preempt_check()) return done; } return 0; fault: flush_tlb_mask(current->domain->domain_dirty_cpumask); for ( i = 0; i < partial_done; i++ ) __gnttab_unmap_common_complete(&(common[i])); return -EFAULT; } static int gnttab_populate_status_frames(struct domain *d, struct grant_table *gt, unsigned int req_nr_frames) { unsigned i; unsigned req_status_frames; req_status_frames = grant_to_status_frames(req_nr_frames); for ( i = nr_status_frames(gt); i < req_status_frames; i++ ) { if ( (gt->status[i] = alloc_xenheap_page()) == NULL ) goto status_alloc_failed; clear_page(gt->status[i]); } /* Share the new status frames with the recipient domain */ for ( i = nr_status_frames(gt); i < req_status_frames; i++ ) gnttab_create_status_page(d, gt, i); gt->nr_status_frames = req_status_frames; return 0; status_alloc_failed: for ( i = nr_status_frames(gt); i < req_status_frames; i++ ) { free_xenheap_page(gt->status[i]); gt->status[i] = NULL; } return -ENOMEM; } static void gnttab_unpopulate_status_frames(struct domain *d, struct grant_table *gt) { int i; for ( i = 0; i < nr_status_frames(gt); i++ ) { struct page_info *pg = virt_to_page(gt->status[i]); BUG_ON(page_get_owner(pg) != d); if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) ) put_page(pg); BUG_ON(pg->count_info & ~PGC_xen_heap); free_xenheap_page(gt->status[i]); gt->status[i] = NULL; } gt->nr_status_frames = 0; } int gnttab_grow_table(struct domain *d, unsigned int req_nr_frames) { /* d's grant table lock must be held by the caller */ struct grant_table *gt 
= d->grant_table; unsigned int i; ASSERT(req_nr_frames <= max_nr_grant_frames); gdprintk(XENLOG_INFO, "Expanding dom (%d) grant table from (%d) to (%d) frames.\n", d->domain_id, nr_grant_frames(gt), req_nr_frames); /* Active */ for ( i = nr_active_grant_frames(gt); i < num_act_frames_from_sha_frames(req_nr_frames); i++ ) { if ( (gt->active[i] = alloc_xenheap_page()) == NULL ) goto active_alloc_failed; clear_page(gt->active[i]); } /* Shared */ for ( i = nr_grant_frames(gt); i < req_nr_frames; i++ ) { if ( (gt->shared_raw[i] = alloc_xenheap_page()) == NULL ) goto shared_alloc_failed; clear_page(gt->shared_raw[i]); } /* Status pages - version 2 */ if (gt->gt_version > 1) { if ( gnttab_populate_status_frames(d, gt, req_nr_frames) ) goto shared_alloc_failed; } /* Share the new shared frames with the recipient domain */ for ( i = nr_grant_frames(gt); i < req_nr_frames; i++ ) gnttab_create_shared_page(d, gt, i); gt->nr_grant_frames = req_nr_frames; return 1; shared_alloc_failed: for ( i = nr_grant_frames(gt); i < req_nr_frames; i++ ) { free_xenheap_page(gt->shared_raw[i]); gt->shared_raw[i] = NULL; } active_alloc_failed: for ( i = nr_active_grant_frames(gt); i < num_act_frames_from_sha_frames(req_nr_frames); i++ ) { free_xenheap_page(gt->active[i]); gt->active[i] = NULL; } gdprintk(XENLOG_INFO, "Allocation failure when expanding grant table.\n"); return 0; } static long gnttab_setup_table( XEN_GUEST_HANDLE_PARAM(gnttab_setup_table_t) uop, unsigned int count) { struct gnttab_setup_table op; struct domain *d; struct grant_table *gt; int i; xen_pfn_t gmfn; if ( count != 1 ) return -EINVAL; if ( unlikely(copy_from_guest(&op, uop, 1) != 0) ) { gdprintk(XENLOG_INFO, "Fault while reading gnttab_setup_table_t.\n"); return -EFAULT; } if ( unlikely(op.nr_frames > max_nr_grant_frames) ) { gdprintk(XENLOG_INFO, "Xen only supports up to %d grant-table frames" " per domain.\n", max_nr_grant_frames); op.status = GNTST_general_error; goto out1; } if ( !guest_handle_okay(op.frame_list, op.nr_frames) ) return -EFAULT; d = rcu_lock_domain_by_any_id(op.dom); if ( d == NULL ) { gdprintk(XENLOG_INFO, "Bad domid %d.\n", op.dom); op.status = GNTST_bad_domain; goto out2; } if ( xsm_grant_setup(XSM_TARGET, current->domain, d) ) { op.status = GNTST_permission_denied; goto out2; } gt = d->grant_table; spin_lock(>->lock); if ( gt->gt_version == 0 ) gt->gt_version = 1; if ( (op.nr_frames > nr_grant_frames(gt) || ((gt->gt_version > 1) && (grant_to_status_frames(op.nr_frames) > nr_status_frames(gt)))) && !gnttab_grow_table(d, op.nr_frames) ) { gdprintk(XENLOG_INFO, "Expand grant table to %u failed. 
Current: %u Max: %u\n", op.nr_frames, nr_grant_frames(gt), max_nr_grant_frames); op.status = GNTST_general_error; goto out3; } op.status = GNTST_okay; for ( i = 0; i < op.nr_frames; i++ ) { gmfn = gnttab_shared_gmfn(d, gt, i); /* Grant tables cannot be shared */ BUG_ON(SHARED_M2P(gmfn)); if ( __copy_to_guest_offset(op.frame_list, i, &gmfn, 1) ) op.status = GNTST_bad_virt_addr; } out3: spin_unlock(>->lock); out2: rcu_unlock_domain(d); out1: if ( unlikely(__copy_field_to_guest(uop, &op, status)) ) return -EFAULT; return 0; } static long gnttab_query_size( XEN_GUEST_HANDLE_PARAM(gnttab_query_size_t) uop, unsigned int count) { struct gnttab_query_size op; struct domain *d; int rc; if ( count != 1 ) return -EINVAL; if ( unlikely(copy_from_guest(&op, uop, 1) != 0) ) { gdprintk(XENLOG_INFO, "Fault while reading gnttab_query_size_t.\n"); return -EFAULT; } d = rcu_lock_domain_by_any_id(op.dom); if ( d == NULL ) { gdprintk(XENLOG_INFO, "Bad domid %d.\n", op.dom); op.status = GNTST_bad_domain; goto query_out; } rc = xsm_grant_query_size(XSM_TARGET, current->domain, d); if ( rc ) { op.status = GNTST_permission_denied; goto query_out_unlock; } spin_lock(&d->grant_table->lock); op.nr_frames = nr_grant_frames(d->grant_table); op.max_nr_frames = max_nr_grant_frames; op.status = GNTST_okay; spin_unlock(&d->grant_table->lock); query_out_unlock: rcu_unlock_domain(d); query_out: if ( unlikely(__copy_to_guest(uop, &op, 1)) ) return -EFAULT; return 0; } /* * Check that the given grant reference (rd,ref) allows 'ld' to transfer * ownership of a page frame. If so, lock down the grant entry. */ static int gnttab_prepare_for_transfer( struct domain *rd, struct domain *ld, grant_ref_t ref) { struct grant_table *rgt = rd->grant_table; grant_entry_header_t *sha; union grant_combo scombo, prev_scombo, new_scombo; int retries = 0; spin_lock(&rgt->lock); if ( rgt->gt_version == 0 ) { gdprintk(XENLOG_INFO, "Grant table not ready for transfer to domain(%d).\n", rd->domain_id); goto fail; } if ( unlikely(ref >= nr_grant_entries(rgt)) ) { gdprintk(XENLOG_INFO, "Bad grant reference (%d) for transfer to domain(%d).\n", ref, rd->domain_id); goto fail; } sha = shared_entry_header(rgt, ref); scombo.word = *(u32 *)&sha->flags; for ( ; ; ) { if ( unlikely(scombo.shorts.flags != GTF_accept_transfer) || unlikely(scombo.shorts.domid != ld->domain_id) ) { gdprintk(XENLOG_INFO, "Bad flags (%x) or dom (%d). " "(NB. expected dom %d)\n", scombo.shorts.flags, scombo.shorts.domid, ld->domain_id); goto fail; } new_scombo = scombo; new_scombo.shorts.flags |= GTF_transfer_committed; prev_scombo.word = cmpxchg((u32 *)&sha->flags, scombo.word, new_scombo.word); if ( likely(prev_scombo.word == scombo.word) ) break; if ( retries++ == 4 ) { gdprintk(XENLOG_WARNING, "Shared grant entry is unstable.\n"); goto fail; } scombo = prev_scombo; } spin_unlock(&rgt->lock); return 1; fail: spin_unlock(&rgt->lock); return 0; } static long gnttab_transfer( XEN_GUEST_HANDLE_PARAM(gnttab_transfer_t) uop, unsigned int count) { struct domain *d = current->domain; struct domain *e; struct page_info *page; int i; struct gnttab_transfer gop; unsigned long mfn; unsigned int max_bitsize; for ( i = 0; i < count; i++ ) { bool_t okay; if (i && hypercall_preempt_check()) return i; /* Read from caller address space. 
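 *
 * (Overview of the handshake implemented below, as far as it is visible
 *  in this function and in gnttab_prepare_for_transfer(): the receiving
 *  domain publishes a grant entry carrying GTF_accept_transfer and the
 *  sender's domid; gnttab_prepare_for_transfer() cmpxchg()es
 *  GTF_transfer_committed into those flags to lock the entry; once the
 *  page has been assigned to the receiver, the new frame number is
 *  written into the shared entry and GTF_transfer_completed is set so
 *  the receiver can pick the page up.)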
*/ if ( unlikely(__copy_from_guest(&gop, uop, 1)) ) { gdprintk(XENLOG_INFO, "gnttab_transfer: error reading req %d/%d\n", i, count); return -EFAULT; } #ifdef CONFIG_X86 { p2m_type_t __p2mt; mfn = mfn_x(get_gfn_unshare(d, gop.mfn, &__p2mt)); if ( p2m_is_shared(__p2mt) || !p2m_is_valid(__p2mt) ) mfn = INVALID_MFN; } #else mfn = gmfn_to_mfn(d, gop.mfn); #endif /* Check the passed page frame for basic validity. */ if ( unlikely(!mfn_valid(mfn)) ) { put_gfn(d, gop.mfn); gdprintk(XENLOG_INFO, "gnttab_transfer: out-of-range %lx\n", (unsigned long)gop.mfn); gop.status = GNTST_bad_page; goto copyback; } page = mfn_to_page(mfn); if ( unlikely(is_xen_heap_page(page)) ) { put_gfn(d, gop.mfn); gdprintk(XENLOG_INFO, "gnttab_transfer: xen frame %lx\n", (unsigned long)gop.mfn); gop.status = GNTST_bad_page; goto copyback; } if ( steal_page(d, page, 0) < 0 ) { put_gfn(d, gop.mfn); gop.status = GNTST_bad_page; goto copyback; } guest_physmap_remove_page(d, gop.mfn, mfn, 0); flush_tlb_mask(d->domain_dirty_cpumask); /* Find the target domain. */ if ( unlikely((e = rcu_lock_domain_by_id(gop.domid)) == NULL) ) { put_gfn(d, gop.mfn); gdprintk(XENLOG_INFO, "gnttab_transfer: can't find domain %d\n", gop.domid); page->count_info &= ~(PGC_count_mask|PGC_allocated); free_domheap_page(page); gop.status = GNTST_bad_domain; goto copyback; } if ( xsm_grant_transfer(XSM_HOOK, d, e) ) { put_gfn(d, gop.mfn); gop.status = GNTST_permission_denied; unlock_and_copyback: rcu_unlock_domain(e); page->count_info &= ~(PGC_count_mask|PGC_allocated); free_domheap_page(page); goto copyback; } max_bitsize = domain_clamp_alloc_bitsize( e, BITS_PER_LONG+PAGE_SHIFT-1); if ( (1UL << (max_bitsize - PAGE_SHIFT)) <= mfn ) { struct page_info *new_page; void *sp, *dp; new_page = alloc_domheap_page(NULL, MEMF_bits(max_bitsize)); if ( new_page == NULL ) { gop.status = GNTST_address_too_big; goto unlock_and_copyback; } sp = map_domain_page(mfn); dp = __map_domain_page(new_page); memcpy(dp, sp, PAGE_SIZE); unmap_domain_page(dp); unmap_domain_page(sp); page->count_info &= ~(PGC_count_mask|PGC_allocated); free_domheap_page(page); page = new_page; } spin_lock(&e->page_alloc_lock); /* * Check that 'e' will accept the page and has reservation * headroom. Also, a domain mustn't have PGC_allocated * pages when it is dying. */ if ( unlikely(e->is_dying) || unlikely(e->tot_pages >= e->max_pages) ) { spin_unlock(&e->page_alloc_lock); if ( e->is_dying ) gdprintk(XENLOG_INFO, "gnttab_transfer: " "Transferee (d%d) is dying\n", e->domain_id); else gdprintk(XENLOG_INFO, "gnttab_transfer: " "Transferee (d%d) has no headroom (tot %u, max %u)\n", e->domain_id, e->tot_pages, e->max_pages); rcu_unlock_domain(e); put_gfn(d, gop.mfn); page->count_info &= ~(PGC_count_mask|PGC_allocated); free_domheap_page(page); gop.status = GNTST_general_error; goto copyback; } /* Okay, add the page to 'e'. */ if ( unlikely(domain_adjust_tot_pages(e, 1) == 1) ) get_knownalive_domain(e); /* * We must drop the lock to avoid a possible deadlock in * gnttab_prepare_for_transfer. We have reserved a page in e so can * safely drop the lock and re-aquire it later to add page to the * pagelist. */ spin_unlock(&e->page_alloc_lock); okay = gnttab_prepare_for_transfer(e, d, gop.ref); spin_lock(&e->page_alloc_lock); if ( unlikely(!okay) || unlikely(e->is_dying) ) { bool_t drop_dom_ref = !domain_adjust_tot_pages(e, -1); spin_unlock(&e->page_alloc_lock); if ( okay /* i.e. 
e->is_dying due to the surrounding if() */ ) gdprintk(XENLOG_INFO, "gnttab_transfer: " "Transferee (d%d) is now dying\n", e->domain_id); if ( drop_dom_ref ) put_domain(e); rcu_unlock_domain(e); put_gfn(d, gop.mfn); page->count_info &= ~(PGC_count_mask|PGC_allocated); free_domheap_page(page); gop.status = GNTST_general_error; goto copyback; } page_list_add_tail(page, &e->page_list); page_set_owner(page, e); spin_unlock(&e->page_alloc_lock); put_gfn(d, gop.mfn); TRACE_1D(TRC_MEM_PAGE_GRANT_TRANSFER, e->domain_id); /* Tell the guest about its new page frame. */ spin_lock(&e->grant_table->lock); if ( e->grant_table->gt_version == 1 ) { grant_entry_v1_t *sha = &shared_entry_v1(e->grant_table, gop.ref); guest_physmap_add_page(e, sha->frame, mfn, 0); sha->frame = mfn; } else { grant_entry_v2_t *sha = &shared_entry_v2(e->grant_table, gop.ref); guest_physmap_add_page(e, sha->full_page.frame, mfn, 0); sha->full_page.frame = mfn; } smp_wmb(); shared_entry_header(e->grant_table, gop.ref)->flags |= GTF_transfer_completed; spin_unlock(&e->grant_table->lock); rcu_unlock_domain(e); gop.status = GNTST_okay; copyback: if ( unlikely(__copy_field_to_guest(uop, &gop, status)) ) { gdprintk(XENLOG_INFO, "gnttab_transfer: error writing resp " "%d/%d\n", i, count); return -EFAULT; } guest_handle_add_offset(uop, 1); } return 0; } /* Undo __acquire_grant_for_copy. Again, this has no effect on page type and reference counts. */ static void __release_grant_for_copy( struct domain *rd, unsigned long gref, int readonly) { struct grant_table *rgt = rd->grant_table; grant_entry_header_t *sha; struct active_grant_entry *act; unsigned long r_frame; uint16_t *status; grant_ref_t trans_gref; int released_read; int released_write; struct domain *td; released_read = 0; released_write = 0; spin_lock(&rgt->lock); act = &active_entry(rgt, gref); sha = shared_entry_header(rgt, gref); r_frame = act->frame; if (rgt->gt_version == 1) { status = &sha->flags; td = rd; trans_gref = gref; } else { status = &status_entry(rgt, gref); td = act->trans_domain; trans_gref = act->trans_gref; } if ( readonly ) { act->pin -= GNTPIN_hstr_inc; } else { gnttab_mark_dirty(rd, r_frame); act->pin -= GNTPIN_hstw_inc; if ( !(act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) ) { released_write = 1; gnttab_clear_flag(_GTF_writing, status); } } if ( !act->pin ) { gnttab_clear_flag(_GTF_reading, status); released_read = 1; } spin_unlock(&rgt->lock); if ( td != rd ) { /* Recursive calls, but they're tail calls, so it's okay. */ if ( released_write ) __release_grant_for_copy(td, trans_gref, 0); else if ( released_read ) __release_grant_for_copy(td, trans_gref, 1); rcu_unlock_domain(td); } } /* The status for a grant indicates that we're taking more access than the pin requires. Fix up the status to match the pin. Called under the domain's grant table lock. */ /* Only safe on transitive grants. Even then, note that we don't attempt to drop any pin on the referent grant. */ static void __fixup_status_for_copy_pin(const struct active_grant_entry *act, uint16_t *status) { if ( !(act->pin & GNTPIN_hstw_mask) ) gnttab_clear_flag(_GTF_writing, status); if ( !(act->pin & GNTPIN_hstr_mask) ) gnttab_clear_flag(_GTF_reading, status); } /* Grab a frame number from a grant entry and update the flags and pin count as appropriate. If rc == GNTST_okay, note that this *does* take one ref count on the target page, stored in *page. If there is any error, *page = NULL, no ref taken. 
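 *
 * Illustrative call sequence (a sketch modelled on the __gnttab_copy()
 * caller below, not a verbatim quote): a successful call of the form
 *     __acquire_grant_for_copy(rd, gref, ldom, readonly,
 *                              &frame, &pg, &off, &len, 1);
 * must eventually be balanced by
 *     put_page(pg);
 *     __release_grant_for_copy(rd, gref, readonly);
 * which drops the page reference taken here and undoes the pin.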
*/ static int __acquire_grant_for_copy( struct domain *rd, unsigned long gref, domid_t ldom, int readonly, unsigned long *frame, struct page_info **page, unsigned *page_off, unsigned *length, unsigned allow_transitive) { struct grant_table *rgt = rd->grant_table; grant_entry_v1_t *sha1; grant_entry_v2_t *sha2; grant_entry_header_t *shah; struct active_grant_entry *act; grant_status_t *status; uint32_t old_pin; domid_t trans_domid; grant_ref_t trans_gref; struct domain *td; unsigned long grant_frame; unsigned trans_page_off; unsigned trans_length; int is_sub_page; s16 rc = GNTST_okay; *page = NULL; spin_lock(&rgt->lock); if ( rgt->gt_version == 0 ) PIN_FAIL(unlock_out, GNTST_general_error, "remote grant table not ready\n"); if ( unlikely(gref >= nr_grant_entries(rgt)) ) PIN_FAIL(unlock_out, GNTST_bad_gntref, "Bad grant reference %ld\n", gref); act = &active_entry(rgt, gref); shah = shared_entry_header(rgt, gref); if ( rgt->gt_version == 1 ) { sha1 = &shared_entry_v1(rgt, gref); sha2 = NULL; status = &shah->flags; } else { sha1 = NULL; sha2 = &shared_entry_v2(rgt, gref); status = &status_entry(rgt, gref); } /* If already pinned, check the active domid and avoid refcnt overflow. */ if ( act->pin && ((act->domid != ldom) || (act->pin & 0x80808080U) != 0) ) PIN_FAIL(unlock_out, GNTST_general_error, "Bad domain (%d != %d), or risk of counter overflow %08x\n", act->domid, ldom, act->pin); old_pin = act->pin; if ( !act->pin || (!readonly && !(act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask))) ) { if ( (rc = _set_status(rgt->gt_version, ldom, readonly, 0, shah, act, status) ) != GNTST_okay ) goto unlock_out; td = rd; trans_gref = gref; if ( sha2 && (shah->flags & GTF_type_mask) == GTF_transitive ) { if ( !allow_transitive ) PIN_FAIL(unlock_out_clear, GNTST_general_error, "transitive grant when transitivity not allowed\n"); trans_domid = sha2->transitive.trans_domid; trans_gref = sha2->transitive.gref; barrier(); /* Stop the compiler from re-loading trans_domid from shared memory */ if ( trans_domid == rd->domain_id ) PIN_FAIL(unlock_out_clear, GNTST_general_error, "transitive grants cannot be self-referential\n"); /* We allow the trans_domid == ldom case, which corresponds to a grant being issued by one domain, sent to another one, and then transitively granted back to the original domain. Allowing it is easy, and means that you don't need to go out of your way to avoid it in the guest. */ /* We need to leave the rrd locked during the grant copy */ td = rcu_lock_domain_by_id(trans_domid); if ( td == NULL ) PIN_FAIL(unlock_out_clear, GNTST_general_error, "transitive grant referenced bad domain %d\n", trans_domid); spin_unlock(&rgt->lock); rc = __acquire_grant_for_copy(td, trans_gref, rd->domain_id, readonly, &grant_frame, page, &trans_page_off, &trans_length, 0); spin_lock(&rgt->lock); if ( rc != GNTST_okay ) { __fixup_status_for_copy_pin(act, status); rcu_unlock_domain(td); spin_unlock(&rgt->lock); return rc; } /* We dropped the lock, so we have to check that nobody else tried to pin (or, for that matter, unpin) the reference in *this* domain. If they did, just give up and try again. */ if ( act->pin != old_pin ) { __fixup_status_for_copy_pin(act, status); rcu_unlock_domain(td); spin_unlock(&rgt->lock); put_page(*page); return __acquire_grant_for_copy(rd, gref, ldom, readonly, frame, page, page_off, length, allow_transitive); } /* The actual remote remote grant may or may not be a sub-page, but we always treat it as one because that blocks mappings of transitive grants. 
*/ is_sub_page = 1; act->gfn = -1ul; } else if ( sha1 ) { rc = __get_paged_frame(sha1->frame, &grant_frame, page, readonly, rd); if ( rc != GNTST_okay ) goto unlock_out_clear; act->gfn = sha1->frame; is_sub_page = 0; trans_page_off = 0; trans_length = PAGE_SIZE; } else if ( !(sha2->hdr.flags & GTF_sub_page) ) { rc = __get_paged_frame(sha2->full_page.frame, &grant_frame, page, readonly, rd); if ( rc != GNTST_okay ) goto unlock_out_clear; act->gfn = sha2->full_page.frame; is_sub_page = 0; trans_page_off = 0; trans_length = PAGE_SIZE; } else { rc = __get_paged_frame(sha2->sub_page.frame, &grant_frame, page, readonly, rd); if ( rc != GNTST_okay ) goto unlock_out_clear; act->gfn = sha2->sub_page.frame; is_sub_page = 1; trans_page_off = sha2->sub_page.page_off; trans_length = sha2->sub_page.length; } if ( !act->pin ) { act->domid = ldom; act->is_sub_page = is_sub_page; act->start = trans_page_off; act->length = trans_length; act->trans_domain = td; act->trans_gref = trans_gref; act->frame = grant_frame; } } else { ASSERT(mfn_valid(act->frame)); *page = mfn_to_page(act->frame); (void)page_get_owner_and_reference(*page); } act->pin += readonly ? GNTPIN_hstr_inc : GNTPIN_hstw_inc; *page_off = act->start; *length = act->length; *frame = act->frame; spin_unlock(&rgt->lock); return rc; unlock_out_clear: if ( !(readonly) && !(act->pin & GNTPIN_hstw_mask) ) gnttab_clear_flag(_GTF_writing, status); if ( !act->pin ) gnttab_clear_flag(_GTF_reading, status); unlock_out: spin_unlock(&rgt->lock); return rc; } static void __gnttab_copy( struct gnttab_copy *op) { struct domain *sd = NULL, *dd = NULL; unsigned long s_frame, d_frame; struct page_info *s_pg = NULL, *d_pg = NULL; char *sp, *dp; s16 rc = GNTST_okay; int have_d_grant = 0, have_s_grant = 0; int src_is_gref, dest_is_gref; if ( ((op->source.offset + op->len) > PAGE_SIZE) || ((op->dest.offset + op->len) > PAGE_SIZE) ) PIN_FAIL(error_out, GNTST_bad_copy_arg, "copy beyond page area.\n"); src_is_gref = op->flags & GNTCOPY_source_gref; dest_is_gref = op->flags & GNTCOPY_dest_gref; if ( (op->source.domid != DOMID_SELF && !src_is_gref ) || (op->dest.domid != DOMID_SELF && !dest_is_gref) ) PIN_FAIL(error_out, GNTST_permission_denied, "only allow copy-by-mfn for DOMID_SELF.\n"); if ( op->source.domid == DOMID_SELF ) sd = rcu_lock_current_domain(); else if ( (sd = rcu_lock_domain_by_id(op->source.domid)) == NULL ) PIN_FAIL(error_out, GNTST_bad_domain, "couldn't find %d\n", op->source.domid); if ( op->dest.domid == DOMID_SELF ) dd = rcu_lock_current_domain(); else if ( (dd = rcu_lock_domain_by_id(op->dest.domid)) == NULL ) PIN_FAIL(error_out, GNTST_bad_domain, "couldn't find %d\n", op->dest.domid); rc = xsm_grant_copy(XSM_HOOK, sd, dd); if ( rc ) { rc = GNTST_permission_denied; goto error_out; } if ( src_is_gref ) { unsigned source_off, source_len; rc = __acquire_grant_for_copy(sd, op->source.u.ref, current->domain->domain_id, 1, &s_frame, &s_pg, &source_off, &source_len, 1); if ( rc != GNTST_okay ) goto error_out; have_s_grant = 1; if ( op->source.offset < source_off || op->len > source_len ) PIN_FAIL(error_out, GNTST_general_error, "copy source out of bounds: %d < %d || %d > %d\n", op->source.offset, source_off, op->len, source_len); } else { rc = __get_paged_frame(op->source.u.gmfn, &s_frame, &s_pg, 1, sd); if ( rc != GNTST_okay ) PIN_FAIL(error_out, rc, "source frame %lx invalid.\n", s_frame); } if ( dest_is_gref ) { unsigned dest_off, dest_len; rc = __acquire_grant_for_copy(dd, op->dest.u.ref, current->domain->domain_id, 0, &d_frame, &d_pg, &dest_off, 
&dest_len, 1); if ( rc != GNTST_okay ) goto error_out; have_d_grant = 1; if ( op->dest.offset < dest_off || op->len > dest_len ) PIN_FAIL(error_out, GNTST_general_error, "copy dest out of bounds: %d < %d || %d > %d\n", op->dest.offset, dest_off, op->len, dest_len); } else { rc = __get_paged_frame(op->dest.u.gmfn, &d_frame, &d_pg, 0, dd); if ( rc != GNTST_okay ) PIN_FAIL(error_out, rc, "destination frame %lx invalid.\n", d_frame); } if ( !get_page_type(d_pg, PGT_writable_page) ) { if ( !dd->is_dying ) gdprintk(XENLOG_WARNING, "Could not get dst frame %lx\n", d_frame); rc = GNTST_general_error; goto error_out; } sp = map_domain_page(s_frame); dp = map_domain_page(d_frame); memcpy(dp + op->dest.offset, sp + op->source.offset, op->len); unmap_domain_page(dp); unmap_domain_page(sp); gnttab_mark_dirty(dd, d_frame); put_page_type(d_pg); error_out: if ( d_pg ) put_page(d_pg); if ( s_pg ) put_page(s_pg); if ( have_s_grant ) __release_grant_for_copy(sd, op->source.u.ref, 1); if ( have_d_grant ) __release_grant_for_copy(dd, op->dest.u.ref, 0); if ( sd ) rcu_unlock_domain(sd); if ( dd ) rcu_unlock_domain(dd); op->status = rc; } static long gnttab_copy( XEN_GUEST_HANDLE_PARAM(gnttab_copy_t) uop, unsigned int count) { int i; struct gnttab_copy op; for ( i = 0; i < count; i++ ) { if (i && hypercall_preempt_check()) return i; if ( unlikely(__copy_from_guest(&op, uop, 1)) ) return -EFAULT; __gnttab_copy(&op); if ( unlikely(__copy_field_to_guest(uop, &op, status)) ) return -EFAULT; guest_handle_add_offset(uop, 1); } return 0; } static long gnttab_set_version(XEN_GUEST_HANDLE_PARAM(gnttab_set_version_t) uop) { gnttab_set_version_t op; struct domain *d = current->domain; struct grant_table *gt = d->grant_table; struct active_grant_entry *act; grant_entry_v1_t reserved_entries[GNTTAB_NR_RESERVED_ENTRIES]; long res; int i; if (copy_from_guest(&op, uop, 1)) return -EFAULT; res = -EINVAL; if (op.version != 1 && op.version != 2) goto out; res = 0; if ( gt->gt_version == op.version ) goto out; spin_lock(>->lock); /* Make sure that the grant table isn't currently in use when we change the version number, except for the first 8 entries which are allowed to be in use (xenstore/xenconsole keeps them mapped). (You need to change the version number for e.g. kexec.) */ if ( gt->gt_version != 0 ) { for ( i = GNTTAB_NR_RESERVED_ENTRIES; i < nr_grant_entries(gt); i++ ) { act = &active_entry(gt, i); if ( act->pin != 0 ) { gdprintk(XENLOG_WARNING, "tried to change grant table version from %d to %d, but some grant entries still in use\n", gt->gt_version, op.version); res = -EBUSY; goto out_unlock; } } } /* XXX: If we're going to version 2, we could maybe shrink the active grant table here. 
*/ if ( op.version == 2 && gt->gt_version < 2 ) { res = gnttab_populate_status_frames(d, gt, nr_grant_frames(gt)); if ( res < 0) goto out_unlock; } /* Preserve the first 8 entries (toolstack reserved grants) */ if ( gt->gt_version == 1 ) { memcpy(reserved_entries, &shared_entry_v1(gt, 0), sizeof(reserved_entries)); } else if ( gt->gt_version == 2 ) { for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES && i < nr_grant_entries(gt); i++ ) { int flags = status_entry(gt, i); flags |= shared_entry_v2(gt, i).hdr.flags; if ((flags & GTF_type_mask) == GTF_permit_access) { reserved_entries[i].flags = flags; reserved_entries[i].domid = shared_entry_v2(gt, i).hdr.domid; reserved_entries[i].frame = shared_entry_v2(gt, i).full_page.frame; } else { if ((flags & GTF_type_mask) != GTF_invalid) gdprintk(XENLOG_INFO, "d%d: bad flags %x in grant %d when switching grant version\n", d->domain_id, flags, i); memset(&reserved_entries[i], 0, sizeof(reserved_entries[i])); } } } if ( op.version < 2 && gt->gt_version == 2 ) gnttab_unpopulate_status_frames(d, gt); /* Make sure there's no crud left over in the table from the old version. */ for ( i = 0; i < nr_grant_frames(gt); i++ ) memset(gt->shared_raw[i], 0, PAGE_SIZE); /* Restore the first 8 entries (toolstack reserved grants) */ if ( gt->gt_version != 0 && op.version == 1 ) { memcpy(&shared_entry_v1(gt, 0), reserved_entries, sizeof(reserved_entries)); } else if ( gt->gt_version != 0 && op.version == 2 ) { for ( i = 0; i < GNTTAB_NR_RESERVED_ENTRIES; i++ ) { status_entry(gt, i) = reserved_entries[i].flags & (GTF_reading|GTF_writing); shared_entry_v2(gt, i).hdr.flags = reserved_entries[i].flags & ~(GTF_reading|GTF_writing); shared_entry_v2(gt, i).hdr.domid = reserved_entries[i].domid; shared_entry_v2(gt, i).full_page.frame = reserved_entries[i].frame; } } gt->gt_version = op.version; out_unlock: spin_unlock(>->lock); out: op.version = gt->gt_version; if (__copy_to_guest(uop, &op, 1)) res = -EFAULT; return res; } static long gnttab_get_status_frames(XEN_GUEST_HANDLE_PARAM(gnttab_get_status_frames_t) uop, int count) { gnttab_get_status_frames_t op; struct domain *d; struct grant_table *gt; uint64_t gmfn; int i; int rc; if ( count != 1 ) return -EINVAL; if ( unlikely(copy_from_guest(&op, uop, 1) != 0) ) { gdprintk(XENLOG_INFO, "Fault while reading gnttab_get_status_frames_t.\n"); return -EFAULT; } d = rcu_lock_domain_by_any_id(op.dom); if ( d == NULL ) { op.status = GNTST_bad_domain; goto out1; } rc = xsm_grant_setup(XSM_TARGET, current->domain, d); if ( rc ) { op.status = GNTST_permission_denied; goto out2; } gt = d->grant_table; if ( unlikely(op.nr_frames > nr_status_frames(gt)) ) { gdprintk(XENLOG_INFO, "Guest requested addresses for %d grant status " "frames, but only %d are available.\n", op.nr_frames, nr_status_frames(gt)); op.status = GNTST_general_error; goto out2; } op.status = GNTST_okay; spin_lock(>->lock); for ( i = 0; i < op.nr_frames; i++ ) { gmfn = gnttab_status_gmfn(d, gt, i); if (copy_to_guest_offset(op.frame_list, i, &gmfn, 1)) op.status = GNTST_bad_virt_addr; } spin_unlock(>->lock); out2: rcu_unlock_domain(d); out1: if ( unlikely(__copy_field_to_guest(uop, &op, status)) ) return -EFAULT; return 0; } static long gnttab_get_version(XEN_GUEST_HANDLE_PARAM(gnttab_get_version_t) uop) { gnttab_get_version_t op; struct domain *d; int rc; if ( copy_from_guest(&op, uop, 1) ) return -EFAULT; d = rcu_lock_domain_by_any_id(op.dom); if ( d == NULL ) return -ESRCH; rc = xsm_grant_query_size(XSM_TARGET, current->domain, d); if ( rc ) { rcu_unlock_domain(d); return rc; } 
op.version = d->grant_table->gt_version; rcu_unlock_domain(d); if ( __copy_field_to_guest(uop, &op, version) ) return -EFAULT; return 0; } static s16 __gnttab_swap_grant_ref(grant_ref_t ref_a, grant_ref_t ref_b) { struct domain *d = rcu_lock_current_domain(); struct grant_table *gt = d->grant_table; struct active_grant_entry *act; s16 rc = GNTST_okay; spin_lock(>->lock); /* Bounds check on the grant refs */ if ( unlikely(ref_a >= nr_grant_entries(d->grant_table))) PIN_FAIL(out, GNTST_bad_gntref, "Bad ref-a (%d).\n", ref_a); if ( unlikely(ref_b >= nr_grant_entries(d->grant_table))) PIN_FAIL(out, GNTST_bad_gntref, "Bad ref-b (%d).\n", ref_b); act = &active_entry(gt, ref_a); if ( act->pin ) PIN_FAIL(out, GNTST_eagain, "ref a %ld busy\n", (long)ref_a); act = &active_entry(gt, ref_b); if ( act->pin ) PIN_FAIL(out, GNTST_eagain, "ref b %ld busy\n", (long)ref_b); if ( gt->gt_version == 1 ) { grant_entry_v1_t shared; shared = shared_entry_v1(gt, ref_a); shared_entry_v1(gt, ref_a) = shared_entry_v1(gt, ref_b); shared_entry_v1(gt, ref_b) = shared; } else { grant_entry_v2_t shared; grant_status_t status; shared = shared_entry_v2(gt, ref_a); status = status_entry(gt, ref_a); shared_entry_v2(gt, ref_a) = shared_entry_v2(gt, ref_b); status_entry(gt, ref_a) = status_entry(gt, ref_b); shared_entry_v2(gt, ref_b) = shared; status_entry(gt, ref_b) = status; } out: spin_unlock(>->lock); rcu_unlock_domain(d); return rc; } static long gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop, unsigned int count) { int i; gnttab_swap_grant_ref_t op; for ( i = 0; i < count; i++ ) { if ( i && hypercall_preempt_check() ) return i; if ( unlikely(__copy_from_guest(&op, uop, 1)) ) return -EFAULT; op.status = __gnttab_swap_grant_ref(op.ref_a, op.ref_b); if ( unlikely(__copy_field_to_guest(uop, &op, status)) ) return -EFAULT; guest_handle_add_offset(uop, 1); } return 0; } long do_grant_table_op( unsigned int cmd, XEN_GUEST_HANDLE_PARAM(void) uop, unsigned int count) { long rc; if ( (int)count < 0 ) return -EINVAL; rc = -EFAULT; switch ( cmd ) { case GNTTABOP_map_grant_ref: { XEN_GUEST_HANDLE_PARAM(gnttab_map_grant_ref_t) map = guest_handle_cast(uop, gnttab_map_grant_ref_t); if ( unlikely(!guest_handle_okay(map, count)) ) goto out; rc = gnttab_map_grant_ref(map, count); if ( rc > 0 ) { guest_handle_add_offset(map, rc); uop = guest_handle_cast(map, void); } break; } case GNTTABOP_unmap_grant_ref: { XEN_GUEST_HANDLE_PARAM(gnttab_unmap_grant_ref_t) unmap = guest_handle_cast(uop, gnttab_unmap_grant_ref_t); if ( unlikely(!guest_handle_okay(unmap, count)) ) goto out; rc = gnttab_unmap_grant_ref(unmap, count); if ( rc > 0 ) { guest_handle_add_offset(unmap, rc); uop = guest_handle_cast(unmap, void); } break; } case GNTTABOP_unmap_and_replace: { XEN_GUEST_HANDLE_PARAM(gnttab_unmap_and_replace_t) unmap = guest_handle_cast(uop, gnttab_unmap_and_replace_t); if ( unlikely(!guest_handle_okay(unmap, count)) ) goto out; rc = -ENOSYS; if ( unlikely(!replace_grant_supported()) ) goto out; rc = gnttab_unmap_and_replace(unmap, count); if ( rc > 0 ) { guest_handle_add_offset(unmap, rc); uop = guest_handle_cast(unmap, void); } break; } case GNTTABOP_setup_table: { rc = gnttab_setup_table( guest_handle_cast(uop, gnttab_setup_table_t), count); ASSERT(rc <= 0); break; } case GNTTABOP_transfer: { XEN_GUEST_HANDLE_PARAM(gnttab_transfer_t) transfer = guest_handle_cast(uop, gnttab_transfer_t); if ( unlikely(!guest_handle_okay(transfer, count)) ) goto out; rc = gnttab_transfer(transfer, count); if ( rc > 0 ) { 
guest_handle_add_offset(transfer, rc); uop = guest_handle_cast(transfer, void); } break; } case GNTTABOP_copy: { XEN_GUEST_HANDLE_PARAM(gnttab_copy_t) copy = guest_handle_cast(uop, gnttab_copy_t); if ( unlikely(!guest_handle_okay(copy, count)) ) goto out; rc = gnttab_copy(copy, count); if ( rc > 0 ) { guest_handle_add_offset(copy, rc); uop = guest_handle_cast(copy, void); } break; } case GNTTABOP_query_size: { rc = gnttab_query_size( guest_handle_cast(uop, gnttab_query_size_t), count); ASSERT(rc <= 0); break; } case GNTTABOP_set_version: { rc = gnttab_set_version(guest_handle_cast(uop, gnttab_set_version_t)); break; } case GNTTABOP_get_status_frames: { rc = gnttab_get_status_frames( guest_handle_cast(uop, gnttab_get_status_frames_t), count); break; } case GNTTABOP_get_version: { rc = gnttab_get_version(guest_handle_cast(uop, gnttab_get_version_t)); break; } case GNTTABOP_swap_grant_ref: { XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) swap = guest_handle_cast(uop, gnttab_swap_grant_ref_t); if ( unlikely(!guest_handle_okay(swap, count)) ) goto out; rc = gnttab_swap_grant_ref(swap, count); if ( rc > 0 ) { guest_handle_add_offset(swap, rc); uop = guest_handle_cast(swap, void); } break; } default: rc = -ENOSYS; break; } out: if ( rc > 0 ) { ASSERT(rc < count); rc = hypercall_create_continuation(__HYPERVISOR_grant_table_op, "ihi", cmd, uop, count - rc); } return rc; } #ifdef CONFIG_COMPAT #include "compat/grant_table.c" #endif int grant_table_create( struct domain *d) { struct grant_table *t; int i; if ( (t = xzalloc(struct grant_table)) == NULL ) goto no_mem_0; /* Simple stuff. */ spin_lock_init(&t->lock); t->nr_grant_frames = INITIAL_NR_GRANT_FRAMES; /* Active grant table. */ if ( (t->active = xzalloc_array(struct active_grant_entry *, max_nr_active_grant_frames)) == NULL ) goto no_mem_1; for ( i = 0; i < num_act_frames_from_sha_frames(INITIAL_NR_GRANT_FRAMES); i++ ) { if ( (t->active[i] = alloc_xenheap_page()) == NULL ) goto no_mem_2; clear_page(t->active[i]); } /* Tracking of mapped foreign frames table */ if ( (t->maptrack = xzalloc_array(struct grant_mapping *, max_nr_maptrack_frames())) == NULL ) goto no_mem_2; if ( (t->maptrack[0] = alloc_xenheap_page()) == NULL ) goto no_mem_3; clear_page(t->maptrack[0]); t->maptrack_limit = MAPTRACK_PER_PAGE; for ( i = 1; i < MAPTRACK_PER_PAGE; i++ ) t->maptrack[0][i - 1].ref = i; t->maptrack[0][i - 1].ref = MAPTRACK_TAIL; /* Shared grant table. */ if ( (t->shared_raw = xzalloc_array(void *, max_nr_grant_frames)) == NULL ) goto no_mem_3; for ( i = 0; i < INITIAL_NR_GRANT_FRAMES; i++ ) { if ( (t->shared_raw[i] = alloc_xenheap_page()) == NULL ) goto no_mem_4; clear_page(t->shared_raw[i]); } /* Status pages for grant table - for version 2 */ t->status = xzalloc_array(grant_status_t *, grant_to_status_frames(max_nr_grant_frames)); if ( t->status == NULL ) goto no_mem_4; for ( i = 0; i < INITIAL_NR_GRANT_FRAMES; i++ ) gnttab_create_shared_page(d, t, i); t->nr_status_frames = 0; /* Okay, install the structure. 
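 *
 * (Note on the maptrack page initialised above: entry i stores i+1 in
 *  its 'ref' field and the final entry stores MAPTRACK_TAIL, i.e. the
 *  unused slots are threaded into a simple free list which the map and
 *  unmap paths can presumably pop from and push back onto.)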
*/ d->grant_table = t; return 0; no_mem_4: for ( i = 0; i < INITIAL_NR_GRANT_FRAMES; i++ ) free_xenheap_page(t->shared_raw[i]); xfree(t->shared_raw); no_mem_3: free_xenheap_page(t->maptrack[0]); xfree(t->maptrack); no_mem_2: for ( i = 0; i < num_act_frames_from_sha_frames(INITIAL_NR_GRANT_FRAMES); i++ ) free_xenheap_page(t->active[i]); xfree(t->active); no_mem_1: xfree(t); no_mem_0: return -ENOMEM; } void gnttab_release_mappings( struct domain *d) { struct grant_table *gt = d->grant_table, *rgt; struct grant_mapping *map; grant_ref_t ref; grant_handle_t handle; struct domain *rd; struct active_grant_entry *act; grant_entry_header_t *sha; uint16_t *status; struct page_info *pg; BUG_ON(!d->is_dying); for ( handle = 0; handle < gt->maptrack_limit; handle++ ) { map = &maptrack_entry(gt, handle); if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) ) continue; ref = map->ref; gdprintk(XENLOG_INFO, "Grant release (%hu) ref:(%hu) " "flags:(%x) dom:(%hu)\n", handle, ref, map->flags, map->domid); rd = rcu_lock_domain_by_id(map->domid); if ( rd == NULL ) { /* Nothing to clear up... */ map->flags = 0; continue; } rgt = rd->grant_table; spin_lock(&rgt->lock); act = &active_entry(rgt, ref); sha = shared_entry_header(rgt, ref); if (rgt->gt_version == 1) status = &sha->flags; else status = &status_entry(rgt, ref); pg = mfn_to_page(act->frame); if ( map->flags & GNTMAP_readonly ) { if ( map->flags & GNTMAP_device_map ) { BUG_ON(!(act->pin & GNTPIN_devr_mask)); act->pin -= GNTPIN_devr_inc; if ( !is_iomem_page(act->frame) ) put_page(pg); } if ( map->flags & GNTMAP_host_map ) { BUG_ON(!(act->pin & GNTPIN_hstr_mask)); act->pin -= GNTPIN_hstr_inc; if ( gnttab_release_host_mappings(d) && !is_iomem_page(act->frame) ) put_page(pg); } } else { if ( map->flags & GNTMAP_device_map ) { BUG_ON(!(act->pin & GNTPIN_devw_mask)); act->pin -= GNTPIN_devw_inc; if ( !is_iomem_page(act->frame) ) put_page_and_type(pg); } if ( map->flags & GNTMAP_host_map ) { BUG_ON(!(act->pin & GNTPIN_hstw_mask)); act->pin -= GNTPIN_hstw_inc; if ( gnttab_release_host_mappings(d) && !is_iomem_page(act->frame) ) { if ( gnttab_host_mapping_get_page_type(map, d, rd) ) put_page_type(pg); put_page(pg); } } if ( (act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0 ) gnttab_clear_flag(_GTF_writing, status); } if ( act->pin == 0 ) gnttab_clear_flag(_GTF_reading, status); spin_unlock(&rgt->lock); rcu_unlock_domain(rd); map->flags = 0; } } void grant_table_destroy( struct domain *d) { struct grant_table *t = d->grant_table; int i; if ( t == NULL ) return; for ( i = 0; i < nr_grant_frames(t); i++ ) free_xenheap_page(t->shared_raw[i]); xfree(t->shared_raw); for ( i = 0; i < nr_maptrack_frames(t); i++ ) free_xenheap_page(t->maptrack[i]); xfree(t->maptrack); for ( i = 0; i < nr_active_grant_frames(t); i++ ) free_xenheap_page(t->active[i]); xfree(t->active); for ( i = 0; i < nr_status_frames(t); i++ ) free_xenheap_page(t->status[i]); xfree(t->status); xfree(t); d->grant_table = NULL; } static void gnttab_usage_print(struct domain *rd) { int first = 1; grant_ref_t ref; struct grant_table *gt = rd->grant_table; printk(" -------- active -------- -------- shared --------\n"); printk("[ref] localdom mfn pin localdom gmfn flags\n"); spin_lock(>->lock); if ( gt->gt_version == 0 ) goto out; for ( ref = 0; ref != nr_grant_entries(gt); ref++ ) { struct active_grant_entry *act; struct grant_entry_header *sha; grant_entry_v1_t *sha1; grant_entry_v2_t *sha2; uint16_t status; uint64_t frame; act = &active_entry(gt, ref); if ( !act->pin ) continue; sha = 
shared_entry_header(gt, ref); if ( gt->gt_version == 1 ) { sha1 = &shared_entry_v1(gt, ref); sha2 = NULL; status = sha->flags; frame = sha1->frame; } else { sha2 = &shared_entry_v2(gt, ref); sha1 = NULL; frame = sha2->full_page.frame; status = status_entry(gt, ref); } if ( first ) { printk("grant-table for remote domain:%5d (v%d)\n", rd->domain_id, gt->gt_version); first = 0; } /* [ddd] ddddd 0xXXXXXX 0xXXXXXXXX ddddd 0xXXXXXX 0xXX */ printk("[%3d] %5d 0x%06lx 0x%08x %5d 0x%06"PRIx64" 0x%02x\n", ref, act->domid, act->frame, act->pin, sha->domid, frame, status); } out: spin_unlock(>->lock); if ( first ) printk("grant-table for remote domain:%5d ... " "no active grant table entries\n", rd->domain_id); } static void gnttab_usage_print_all(unsigned char key) { struct domain *d; printk("%s [ key '%c' pressed\n", __FUNCTION__, key); for_each_domain ( d ) gnttab_usage_print(d); printk("%s ] done\n", __FUNCTION__); } static struct keyhandler gnttab_usage_print_all_keyhandler = { .diagnostic = 1, .u.fn = gnttab_usage_print_all, .desc = "print grant table usage" }; static int __init gnttab_usage_init(void) { register_keyhandler('g', &gnttab_usage_print_all_keyhandler); return 0; } __initcall(gnttab_usage_init); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/earlycpio.c0000664000175000017500000000775712307313555014700 0ustar smbsmb/* ----------------------------------------------------------------------- * * * Copyright 2012 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available * under the terms of the GNU General Public License version 2, as * published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * ----------------------------------------------------------------------- */ /* * earlycpio.c * * Find a specific cpio member; must precede any compressed content. * This is used to locate data items in the initramfs used by the * kernel itself during early boot (before the main initramfs is * decompressed.) It is the responsibility of the initramfs creator * to ensure that these items are uncompressed at the head of the * blob. Depending on the boot loader or package tool that may be a * separate file or part of the same file. */ #include #include #include #include #include #define ALIGN(x, a) ((x + (a) - 1) & ~((a) - 1)) #define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) enum cpio_fields { C_MAGIC, C_INO, C_MODE, C_UID, C_GID, C_NLINK, C_MTIME, C_FILESIZE, C_MAJ, C_MIN, C_RMAJ, C_RMIN, C_NAMESIZE, C_CHKSUM, C_NFIELDS }; /** * cpio_data find_cpio_data - Search for files in an uncompressed cpio * @path: The directory to search for, including a slash at the end * @data: Pointer to the the cpio archive or a header inside * @len: Remaining length of the cpio based on data pointer * @offset: When a matching file is found, this is the offset to the * beginning of the cpio. It can be used to iterate through * the cpio to find all files inside of a directory path * * @return: struct cpio_data containing the address, length and * filename (with the directory path cut off) of the found file. 
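 *
 * Hypothetical usage sketch (buffer names are illustrative only, not
 * taken from this tree):
 *     long offset;
 *     struct cpio_data cd = find_cpio_data("kernel/x86/ucode/",
 *                                          initrd_ptr, initrd_len,
 *                                          &offset);
 *     if ( cd.data )
 *         ...consume cd.size bytes of payload starting at cd.data...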
* If you search for a filename and not for files in a directory, * pass the absolute path of the filename in the cpio and make sure * the match returned an empty filename string. */ struct cpio_data __init find_cpio_data(const char *path, void *data, size_t len, long *offset) { const size_t cpio_header_len = 8*C_NFIELDS - 2; struct cpio_data cd = { NULL, 0 }; const char *p, *dptr, *nptr; unsigned int ch[C_NFIELDS], *chp, v; unsigned char c, x; size_t mypathsize = strlen(path); int i, j; p = data; while (len > cpio_header_len) { if (!*p) { /* All cpio headers need to be 4-byte aligned */ p += 4; len -= 4; continue; } j = 6; /* The magic field is only 6 characters */ chp = ch; for (i = C_NFIELDS; i; i--) { v = 0; while (j--) { v <<= 4; c = *p++; x = c - '0'; if (x < 10) { v += x; continue; } x = (c | 0x20) - 'a'; if (x < 6) { v += x + 10; continue; } goto quit; /* Invalid hexadecimal */ } *chp++ = v; j = 8; /* All other fields are 8 characters */ } if ((ch[C_MAGIC] - 0x070701) > 1) goto quit; /* Invalid magic */ len -= cpio_header_len; dptr = PTR_ALIGN(p + ch[C_NAMESIZE], 4); nptr = PTR_ALIGN(dptr + ch[C_FILESIZE], 4); if (nptr > p + len || dptr < p || nptr < dptr) goto quit; /* Buffer overrun */ if ((ch[C_MODE] & 0170000) == 0100000 && ch[C_NAMESIZE] >= mypathsize && !memcmp(p, path, mypathsize)) { *offset = (long)nptr - (long)data; if (ch[C_NAMESIZE] - mypathsize >= MAX_CPIO_FILE_NAME) { printk( "File %s exceeding MAX_CPIO_FILE_NAME [%d]\n", p, MAX_CPIO_FILE_NAME); } if (ch[C_NAMESIZE] - 1 /* includes \0 */ == mypathsize) { cd.data = (void *)dptr; cd.size = ch[C_FILESIZE]; return cd; /* Found it! */ } } len -= (nptr - p); p = nptr; } quit: return cd; } xen-4.4.0/xen/common/unxz.c0000664000175000017500000002112112307313555013672 0ustar smbsmb/* * Wrapper for decompressing XZ-compressed kernel, initramfs, and initrd * * Author: Lasse Collin * * This file has been put into the public domain. * You can do whatever you want with this file. */ /* * Important notes about in-place decompression * * At least on x86, the kernel is decompressed in place: the compressed data * is placed to the end of the output buffer, and the decompressor overwrites * most of the compressed data. There must be enough safety margin to * guarantee that the write position is always behind the read position. * * The safety margin for XZ with LZMA2 or BCJ+LZMA2 is calculated below. * Note that the margin with XZ is bigger than with Deflate (gzip)! * * The worst case for in-place decompression is that the beginning of * the file is compressed extremely well, and the rest of the file is * uncompressible. Thus, we must look for worst-case expansion when the * compressor is encoding uncompressible data. * * The structure of the .xz file in case of a compresed kernel is as follows. * Sizes (as bytes) of the fields are in parenthesis. * * Stream Header (12) * Block Header: * Block Header (8-12) * Compressed Data (N) * Block Padding (0-3) * CRC32 (4) * Index (8-20) * Stream Footer (12) * * Normally there is exactly one Block, but let's assume that there are * 2-4 Blocks just in case. Because Stream Header and also Block Header * of the first Block don't make the decompressor produce any uncompressed * data, we can ignore them from our calculations. Block Headers of possible * additional Blocks have to be taken into account still. With these * assumptions, it is safe to assume that the total header overhead is * less than 128 bytes. * * Compressed Data contains LZMA2 or BCJ+LZMA2 encoded data. 
Since BCJ * doesn't change the size of the data, it is enough to calculate the * safety margin for LZMA2. * * LZMA2 stores the data in chunks. Each chunk has a header whose size is * a maximum of 6 bytes, but to get round 2^n numbers, let's assume that * the maximum chunk header size is 8 bytes. After the chunk header, there * may be up to 64 KiB of actual payload in the chunk. Often the payload is * quite a bit smaller though; to be safe, let's assume that an average * chunk has only 32 KiB of payload. * * The maximum uncompressed size of the payload is 2 MiB. The minimum * uncompressed size of the payload is in practice never less than the * payload size itself. The LZMA2 format would allow uncompressed size * to be less than the payload size, but no sane compressor creates such * files. LZMA2 supports storing uncompressible data in uncompressed form, * so there's never a need to create payloads whose uncompressed size is * smaller than the compressed size. * * The assumption, that the uncompressed size of the payload is never * smaller than the payload itself, is valid only when talking about * the payload as a whole. It is possible that the payload has parts where * the decompressor consumes more input than it produces output. Calculating * the worst case for this would be tricky. Instead of trying to do that, * let's simply make sure that the decompressor never overwrites any bytes * of the payload which it is currently reading. * * Now we have enough information to calculate the safety margin. We need * - 128 bytes for the .xz file format headers; * - 8 bytes per every 32 KiB of uncompressed size (one LZMA2 chunk header * per chunk, each chunk having average payload size of 32 KiB); and * - 64 KiB (biggest possible LZMA2 chunk payload size) to make sure that * the decompressor never overwrites anything from the LZMA2 chunk * payload it is currently reading. * * We get the following formula: * * safety_margin = 128 + uncompressed_size * 8 / 32768 + 65536 * = 128 + (uncompressed_size >> 12) + 65536 * * For comparision, according to arch/x86/boot/compressed/misc.c, the * equivalent formula for Deflate is this: * * safety_margin = 18 + (uncompressed_size >> 12) + 32768 * * Thus, when updating Deflate-only in-place kernel decompressor to * support XZ, the fixed overhead has to be increased from 18+32768 bytes * to 128+65536 bytes. */ #include "decompress.h" #define XZ_EXTERN STATIC /* * For boot time use, we enable only the BCJ filter of the current * architecture or none if no BCJ filter is available for the architecture. */ #ifdef CONFIG_X86 # define XZ_DEC_X86 #endif #ifdef CONFIG_PPC # define XZ_DEC_POWERPC #endif #ifdef CONFIG_ARM # define XZ_DEC_ARM #endif #ifdef CONFIG_IA64 # define XZ_DEC_IA64 #endif #ifdef CONFIG_SPARC # define XZ_DEC_SPARC #endif /* * This will get the basic headers so that memeq() and others * can be defined. */ #include "xz/private.h" /* * memeq and memzero are not used much and any remotely sane implementation * is fast enough. memcpy/memmove speed matters in multi-call mode, but * the kernel image is decompressed in single-call mode, in which only * memcpy speed can matter and only if there is a lot of uncompressible data * (LZMA2 stores uncompressible chunks in uncompressed form). Thus, the * functions below should just be kept small; it's probably not worth * optimizing for speed. 
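 *
 * (Worked example for the safety-margin formula derived further up: for
 *  an uncompressed size of 16 MiB,
 *      safety_margin = 128 + (16777216 >> 12) + 65536
 *                    = 128 + 4096 + 65536
 *                    = 69760 bytes,
 *  i.e. roughly 68 KiB of slack on top of the output buffer.)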
*/ #ifndef memeq #define memeq(p1, p2, sz) (memcmp(p1, p2, sz) == 0) #endif #ifndef memzero #define memzero(p, sz) memset(p, 0, sz) #endif #include "xz/crc32.c" #include "xz/dec_stream.c" #include "xz/dec_lzma2.c" #include "xz/dec_bcj.c" /* Size of the input and output buffers in multi-call mode */ #define XZ_IOBUF_SIZE 4096 /* * This function implements the API defined in . * * This wrapper will automatically choose single-call or multi-call mode * of the native XZ decoder API. The single-call mode can be used only when * both input and output buffers are available as a single chunk, i.e. when * fill() and flush() won't be used. */ STATIC int INIT unxz(unsigned char *in, unsigned int in_size, int (*fill)(void *dest, unsigned int size), int (*flush)(void *src, unsigned int size), unsigned char *out, unsigned int *in_used, void (*error)(const char *x)) { struct xz_buf b; struct xz_dec *s; enum xz_ret ret; bool_t must_free_in = false; xz_crc32_init(); if (in_used != NULL) *in_used = 0; if (fill == NULL && flush == NULL) s = xz_dec_init(XZ_SINGLE, 0); else s = xz_dec_init(XZ_DYNALLOC, (uint32_t)-1); if (s == NULL) goto error_alloc_state; if (flush == NULL) { b.out = out; b.out_size = (size_t)-1; } else { b.out_size = XZ_IOBUF_SIZE; b.out = malloc(XZ_IOBUF_SIZE); if (b.out == NULL) goto error_alloc_out; } if (in == NULL) { must_free_in = true; in = malloc(XZ_IOBUF_SIZE); if (in == NULL) goto error_alloc_in; } b.in = in; b.in_pos = 0; b.in_size = in_size; b.out_pos = 0; if (fill == NULL && flush == NULL) { ret = xz_dec_run(s, &b); } else { do { if (b.in_pos == b.in_size && fill != NULL) { if (in_used != NULL) *in_used += b.in_pos; b.in_pos = 0; in_size = fill(in, XZ_IOBUF_SIZE); if ((int) in_size < 0) { /* * This isn't an optimal error code * but it probably isn't worth making * a new one either. */ ret = XZ_BUF_ERROR; break; } b.in_size = in_size; } ret = xz_dec_run(s, &b); if (flush != NULL && (b.out_pos == b.out_size || (ret != XZ_OK && b.out_pos > 0))) { /* * Setting ret here may hide an error * returned by xz_dec_run(), but probably * it's not too bad. */ if (flush(b.out, b.out_pos) != (int)b.out_pos) ret = XZ_BUF_ERROR; b.out_pos = 0; } } while (ret == XZ_OK); if (must_free_in) free(in); if (flush != NULL) free(b.out); } if (in_used != NULL) *in_used += b.in_pos; xz_dec_end(s); switch (ret) { case XZ_STREAM_END: return 0; case XZ_MEM_ERROR: /* This can occur only in multi-call mode. */ error("XZ decompressor ran out of memory"); break; case XZ_FORMAT_ERROR: error("Input is not in the XZ format (wrong magic bytes)"); break; case XZ_OPTIONS_ERROR: error("Input was encoded with settings that are not " "supported by this XZ decoder"); break; case XZ_DATA_ERROR: case XZ_BUF_ERROR: error("XZ-compressed data is corrupt"); break; default: error("Bug in the XZ decompressor"); break; } return -1; error_alloc_in: if (flush != NULL) free(b.out); error_alloc_out: xz_dec_end(s); error_alloc_state: error("XZ decompressor ran out of memory"); return -1; } /* * This macro is used by architecture-specific files to decompress * the kernel image. */ #define decompress unxz xen-4.4.0/xen/common/vsprintf.c0000664000175000017500000004105412307313555014550 0ustar smbsmb/* * linux/lib/vsprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* vsprintf.c -- Lars Wirzenius & Linus Torvalds. 
*/ /* * Wirzenius wrote this portably, Torvalds fucked it up :-) */ /* * Fri Jul 13 2001 Crutcher Dunnavant * - changed to provide snprintf and vsnprintf functions * So Feb 1 16:51:32 CET 2004 Juergen Quade * - scnprintf and vscnprintf */ #include #include #include #include #include /** * simple_strtoul - convert a string to an unsigned long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use */ unsigned long simple_strtoul( const char *cp, const char **endp, unsigned int base) { unsigned long result = 0,value; if (!base) { base = 10; if (*cp == '0') { base = 8; cp++; if ((toupper(*cp) == 'X') && isxdigit(cp[1])) { cp++; base = 16; } } } else if (base == 16) { if (cp[0] == '0' && toupper(cp[1]) == 'X') cp += 2; } while (isxdigit(*cp) && (value = isdigit(*cp) ? *cp-'0' : toupper(*cp)-'A'+10) < base) { result = result*base + value; cp++; } if (endp) *endp = cp; return result; } EXPORT_SYMBOL(simple_strtoul); /** * simple_strtol - convert a string to a signed long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use */ long simple_strtol(const char *cp, const char **endp, unsigned int base) { if(*cp=='-') return -simple_strtoul(cp+1,endp,base); return simple_strtoul(cp,endp,base); } EXPORT_SYMBOL(simple_strtol); /** * simple_strtoull - convert a string to an unsigned long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use */ unsigned long long simple_strtoull( const char *cp, const char **endp, unsigned int base) { unsigned long long result = 0,value; if (!base) { base = 10; if (*cp == '0') { base = 8; cp++; if ((toupper(*cp) == 'X') && isxdigit(cp[1])) { cp++; base = 16; } } } else if (base == 16) { if (cp[0] == '0' && toupper(cp[1]) == 'X') cp += 2; } while (isxdigit(*cp) && (value = isdigit(*cp) ? *cp-'0' : (islower(*cp) ? toupper(*cp) : *cp)-'A'+10) < base) { result = result*base + value; cp++; } if (endp) *endp = cp; return result; } EXPORT_SYMBOL(simple_strtoull); /** * simple_strtoll - convert a string to a signed long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use */ long long simple_strtoll(const char *cp,const char **endp,unsigned int base) { if(*cp=='-') return -simple_strtoull(cp+1,endp,base); return simple_strtoull(cp,endp,base); } static int skip_atoi(const char **s) { int i=0; while (isdigit(**s)) i = i*10 + *((*s)++) - '0'; return i; } #define ZEROPAD 1 /* pad with zero */ #define SIGN 2 /* unsigned/signed long */ #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define LEFT 16 /* left justified */ #define SPECIAL 32 /* 0x */ #define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ static char *number( char *buf, char *end, unsigned long long num, int base, int size, int precision, int type) { char c,sign,tmp[66]; const char *digits; static const char small_digits[] = "0123456789abcdefghijklmnopqrstuvwxyz"; static const char large_digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; int i; digits = (type & LARGE) ? large_digits : small_digits; if (type & LEFT) type &= ~ZEROPAD; if (base < 2 || base > 36) return NULL; c = (type & ZEROPAD) ? 
'0' : ' '; sign = 0; if (type & SIGN) { if ((signed long long) num < 0) { sign = '-'; num = - (signed long long) num; size--; } else if (type & PLUS) { sign = '+'; size--; } else if (type & SPACE) { sign = ' '; size--; } } if (type & SPECIAL) { if (num == 0) type &= ~SPECIAL; else if (base == 16) size -= 2; else if (base == 8) size--; else type &= ~SPECIAL; } i = 0; if (num == 0) tmp[i++]='0'; else while (num != 0) tmp[i++] = digits[do_div(num,base)]; if (i > precision) precision = i; size -= precision; if (!(type&(ZEROPAD+LEFT))) { while(size-->0) { if (buf <= end) *buf = ' '; ++buf; } } if (sign) { if (buf <= end) *buf = sign; ++buf; } if (type & SPECIAL) { if (buf <= end) *buf = '0'; ++buf; if (base == 16) { if (buf <= end) *buf = digits[33]; ++buf; } } if (!(type & LEFT)) { while (size-- > 0) { if (buf <= end) *buf = c; ++buf; } } while (i < precision--) { if (buf <= end) *buf = '0'; ++buf; } while (i-- > 0) { if (buf <= end) *buf = tmp[i]; ++buf; } while (size-- > 0) { if (buf <= end) *buf = ' '; ++buf; } return buf; } static char *string(char *str, char *end, const char *s, int field_width, int precision, int flags) { int i, len = strnlen(s, precision); if (!(flags & LEFT)) { while (len < field_width--) { if (str <= end) *str = ' '; ++str; } } for (i = 0; i < len; ++i) { if (str <= end) *str = *s; ++str; ++s; } while (len < field_width--) { if (str <= end) *str = ' '; ++str; } return str; } static char *pointer(char *str, char *end, const char **fmt_ptr, const void *arg, int field_width, int precision, int flags) { const char *fmt = *fmt_ptr, *s; /* Custom %p suffixes. See XEN_ROOT/docs/misc/printk-formats.txt */ switch ( fmt[1] ) { case 's': /* Symbol name with offset and size (iff offset != 0) */ case 'S': /* Symbol name unconditionally with offset and size */ { unsigned long sym_size, sym_offset; char namebuf[KSYM_NAME_LEN+1]; /* Advance parents fmt string, as we have consumed 's' or 'S' */ ++*fmt_ptr; s = symbols_lookup((unsigned long)arg, &sym_size, &sym_offset, namebuf); /* If the symbol is not found, fall back to printing the address */ if ( !s ) break; /* Print symbol name */ str = string(str, end, s, -1, -1, 0); if ( fmt[1] == 'S' || sym_offset != 0 ) { /* Print '+/' */ str = number(str, end, sym_offset, 16, -1, -1, SPECIAL|SIGN|PLUS); if ( str <= end ) *str = '/'; ++str; str = number(str, end, sym_size, 16, -1, -1, SPECIAL); } return str; } } if ( field_width == -1 ) { field_width = 2 * sizeof(void *); flags |= ZEROPAD; } return number(str, end, (unsigned long)arg, 16, field_width, precision, flags); } /** * vsnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @args: Arguments for the format string * * The return value is the number of characters which would * be generated for the given input, excluding the trailing * '\0', as per ISO C99. If you want to have the exact * number of characters written into @buf as return value * (not including the trailing '\0'), use vscnprintf. If the * return is greater than or equal to @size, the resulting * string is truncated. * * Call this function if you are already dealing with a va_list. * You probably want snprintf instead. */ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) { unsigned long long num; int base; char *str, *end, c; const char *s; int flags; /* flags to number() */ int field_width; /* width of output field */ int precision; /* min. 
# of digits for integers; max number of chars for from string */ int qualifier; /* 'h', 'l', or 'L' for integer fields */ /* 'z' support added 23/7/1999 S.H. */ /* 'z' changed to 'Z' --davidm 1/25/99 */ /* Reject out-of-range values early */ BUG_ON(((int)size < 0) || ((unsigned int)size != size)); str = buf; end = buf + size - 1; if (end < buf - 1) { end = ((void *) -1); size = end - buf + 1; } for (; *fmt ; ++fmt) { if (*fmt != '%') { if (str <= end) *str = *fmt; ++str; continue; } /* process flags */ flags = 0; repeat: ++fmt; /* this also skips first '%' */ switch (*fmt) { case '-': flags |= LEFT; goto repeat; case '+': flags |= PLUS; goto repeat; case ' ': flags |= SPACE; goto repeat; case '#': flags |= SPECIAL; goto repeat; case '0': flags |= ZEROPAD; goto repeat; } /* get field width */ field_width = -1; if (isdigit(*fmt)) field_width = skip_atoi(&fmt); else if (*fmt == '*') { ++fmt; /* it's the next argument */ field_width = va_arg(args, int); if (field_width < 0) { field_width = -field_width; flags |= LEFT; } } /* get the precision */ precision = -1; if (*fmt == '.') { ++fmt; if (isdigit(*fmt)) precision = skip_atoi(&fmt); else if (*fmt == '*') { ++fmt; /* it's the next argument */ precision = va_arg(args, int); } if (precision < 0) precision = 0; } /* get the conversion qualifier */ qualifier = -1; if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || *fmt =='Z' || *fmt == 'z') { qualifier = *fmt; ++fmt; if (qualifier == 'l' && *fmt == 'l') { qualifier = 'L'; ++fmt; } } /* default base */ base = 10; switch (*fmt) { case 'c': if (!(flags & LEFT)) { while (--field_width > 0) { if (str <= end) *str = ' '; ++str; } } c = (unsigned char) va_arg(args, int); if (str <= end) *str = c; ++str; while (--field_width > 0) { if (str <= end) *str = ' '; ++str; } continue; case 's': s = va_arg(args, char *); if ((unsigned long)s < PAGE_SIZE) s = ""; str = string(str, end, s, field_width, precision, flags); continue; case 'p': /* pointer() might advance fmt (%pS for example) */ str = pointer(str, end, &fmt, va_arg(args, const void *), field_width, precision, flags); continue; case 'n': if (qualifier == 'l') { long * ip = va_arg(args, long *); *ip = (str - buf); } else if (qualifier == 'Z' || qualifier == 'z') { size_t * ip = va_arg(args, size_t *); *ip = (str - buf); } else { int * ip = va_arg(args, int *); *ip = (str - buf); } continue; case '%': if (str <= end) *str = '%'; ++str; continue; /* integer number formats - set up the flags and "break" */ case 'o': base = 8; break; case 'X': flags |= LARGE; case 'x': base = 16; break; case 'd': case 'i': flags |= SIGN; case 'u': break; default: if (str <= end) *str = '%'; ++str; if (*fmt) { if (str <= end) *str = *fmt; ++str; } else { --fmt; } continue; } if (qualifier == 'L') num = va_arg(args, long long); else if (qualifier == 'l') { num = va_arg(args, unsigned long); if (flags & SIGN) num = (signed long) num; } else if (qualifier == 'Z' || qualifier == 'z') { num = va_arg(args, size_t); } else if (qualifier == 'h') { num = (unsigned short) va_arg(args, int); if (flags & SIGN) num = (signed short) num; } else { num = va_arg(args, unsigned int); if (flags & SIGN) num = (signed int) num; } str = number(str, end, num, base, field_width, precision, flags); } if (str <= end) *str = '\0'; else if (size > 0) /* don't write out a null byte if the buf size is zero */ *end = '\0'; /* the trailing null byte doesn't count towards the total * ++str; */ return str-buf; } EXPORT_SYMBOL(vsnprintf); /** * vscnprintf - Format a string and place it in a buffer * @buf: The 
buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @args: Arguments for the format string * * The return value is the number of characters which have been written into * the @buf not including the trailing '\0'. If @size is <= 0 the function * returns 0. * * Call this function if you are already dealing with a va_list. * You probably want scnprintf instead. */ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { int i; i = vsnprintf(buf,size,fmt,args); if (i >= size) i = size - 1; return (i > 0) ? i : 0; } EXPORT_SYMBOL(vscnprintf); /** * snprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters which would be * generated for the given input, excluding the trailing null, * as per ISO C99. If the return is greater than or equal to * @size, the resulting string is truncated. */ int snprintf(char * buf, size_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i=vsnprintf(buf,size,fmt,args); va_end(args); return i; } EXPORT_SYMBOL(snprintf); /** * scnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters written into @buf not including * the trailing '\0'. If @size is <= 0 the function returns 0. If the return is * greater than or equal to @size, the resulting string is truncated. */ int scnprintf(char * buf, size_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsnprintf(buf, size, fmt, args); va_end(args); if (i >= size) i = size - 1; return (i > 0) ? i : 0; } EXPORT_SYMBOL(scnprintf); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/multicall.c0000664000175000017500000000551412307313555014664 0ustar smbsmb/****************************************************************************** * multicall.c */ #include #include #include #include #include #include #include #include #include #include #include #include #ifndef COMPAT typedef long ret_t; #define xlat_multicall_entry(mcs) static void __trace_multicall_call(multicall_entry_t *call) { __trace_hypercall(TRC_PV_HYPERCALL_SUBCALL, call->op, call->args); } #endif static void trace_multicall_call(multicall_entry_t *call) { if ( !tb_init_done ) return; __trace_multicall_call(call); } ret_t do_multicall( XEN_GUEST_HANDLE_PARAM(multicall_entry_t) call_list, unsigned int nr_calls) { struct mc_state *mcs = ¤t->mc_state; unsigned int i; int rc = 0; if ( unlikely(__test_and_set_bit(_MCSF_in_multicall, &mcs->flags)) ) { gdprintk(XENLOG_INFO, "Multicall reentry is disallowed.\n"); return -EINVAL; } if ( unlikely(!guest_handle_okay(call_list, nr_calls)) ) rc = -EFAULT; for ( i = 0; !rc && i < nr_calls; i++ ) { if ( hypercall_preempt_check() ) goto preempted; if ( unlikely(__copy_from_guest(&mcs->call, call_list, 1)) ) { rc = -EFAULT; break; } trace_multicall_call(&mcs->call); do_multicall_call(&mcs->call); #ifndef NDEBUG { /* * Deliberately corrupt the contents of the multicall structure. * The caller must depend only on the 'result' field on return. 
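 * An illustrative guest-side sketch of what that implies (hypothetical
 * code, assuming the public multicall_entry layout of op/result/args[6]
 * and a HYPERVISOR_multicall() wrapper as provided by Linux/mini-OS
 * style guests):
 *
 *     multicall_entry_t batch[2] = {
 *         { .op = __HYPERVISOR_fpu_taskswitch, .args = { 1 } },
 *         { .op = __HYPERVISOR_fpu_taskswitch, .args = { 0 } },
 *     };
 *     HYPERVISOR_multicall(batch, 2);
 *     // Afterwards only batch[i].result is meaningful; on debug
 *     // hypervisors every other field reads back as 0xAA bytes.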
*/ struct multicall_entry corrupt; memset(&corrupt, 0xAA, sizeof(corrupt)); (void)__copy_to_guest(call_list, &corrupt, 1); } #endif if ( unlikely(__copy_field_to_guest(call_list, &mcs->call, result)) ) rc = -EFAULT; else if ( test_bit(_MCSF_call_preempted, &mcs->flags) ) { /* Translate sub-call continuation to guest layout */ xlat_multicall_entry(mcs); /* Copy the sub-call continuation. */ if ( likely(!__copy_to_guest(call_list, &mcs->call, 1)) ) goto preempted; rc = -EFAULT; } else guest_handle_add_offset(call_list, 1); } perfc_incr(calls_to_multicall); perfc_add(calls_from_multicall, i); mcs->flags = 0; return rc; preempted: perfc_add(calls_from_multicall, i); mcs->flags = 0; return hypercall_create_continuation( __HYPERVISOR_multicall, "hi", call_list, nr_calls-i); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/sched_credit.c0000664000175000017500000016747312307313555015333 0ustar smbsmb/**************************************************************************** * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc. **************************************************************************** * * File: common/csched_credit.c * Author: Emmanuel Ackaouy * * Description: Credit-based SMP CPU scheduler */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Basic constants */ #define CSCHED_DEFAULT_WEIGHT 256 #define CSCHED_TICKS_PER_TSLICE 3 /* Default timeslice: 30ms */ #define CSCHED_DEFAULT_TSLICE_MS 30 #define CSCHED_CREDITS_PER_MSEC 10 /* * Priorities */ #define CSCHED_PRI_TS_BOOST 0 /* time-share waking up */ #define CSCHED_PRI_TS_UNDER -1 /* time-share w/ credits */ #define CSCHED_PRI_TS_OVER -2 /* time-share w/o credits */ #define CSCHED_PRI_IDLE -64 /* idle */ /* * Flags */ #define CSCHED_FLAG_VCPU_PARKED 0x0 /* VCPU over capped credits */ #define CSCHED_FLAG_VCPU_YIELD 0x1 /* VCPU yielding */ /* * Useful macros */ #define CSCHED_PRIV(_ops) \ ((struct csched_private *)((_ops)->sched_data)) #define CSCHED_PCPU(_c) \ ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv) #define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv) #define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv) #define RUNQ(_cpu) (&(CSCHED_PCPU(_cpu)->runq)) /* Is the first element of _cpu's runq its idle vcpu? */ #define IS_RUNQ_IDLE(_cpu) (list_empty(RUNQ(_cpu)) || \ is_idle_vcpu(__runq_elem(RUNQ(_cpu)->next)->vcpu)) /* * CSCHED_STATS * * Manage very basic per-vCPU counters and stats. * * Useful for debugging live systems. The stats are displayed * with runq dumps ('r' on the Xen console). */ #ifdef SCHED_STATS #define CSCHED_STATS #define SCHED_VCPU_STATS_RESET(_V) \ do \ { \ memset(&(_V)->stats, 0, sizeof((_V)->stats)); \ } while ( 0 ) #define SCHED_VCPU_STAT_CRANK(_V, _X) (((_V)->stats._X)++) #define SCHED_VCPU_STAT_SET(_V, _X, _Y) (((_V)->stats._X) = (_Y)) #else /* !SCHED_STATS */ #undef CSCHED_STATS #define SCHED_VCPU_STATS_RESET(_V) do {} while ( 0 ) #define SCHED_VCPU_STAT_CRANK(_V, _X) do {} while ( 0 ) #define SCHED_VCPU_STAT_SET(_V, _X, _Y) do {} while ( 0 ) #endif /* SCHED_STATS */ /* * Credit tracing events ("only" 512 available!). Check * include/public/trace.h for more details. 
*/ #define TRC_CSCHED_SCHED_TASKLET TRC_SCHED_CLASS_EVT(CSCHED, 1) #define TRC_CSCHED_ACCOUNT_START TRC_SCHED_CLASS_EVT(CSCHED, 2) #define TRC_CSCHED_ACCOUNT_STOP TRC_SCHED_CLASS_EVT(CSCHED, 3) #define TRC_CSCHED_STOLEN_VCPU TRC_SCHED_CLASS_EVT(CSCHED, 4) #define TRC_CSCHED_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED, 5) #define TRC_CSCHED_TICKLE TRC_SCHED_CLASS_EVT(CSCHED, 6) /* * Node Balancing */ #define CSCHED_BALANCE_NODE_AFFINITY 0 #define CSCHED_BALANCE_CPU_AFFINITY 1 /* * Boot parameters */ static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; integer_param("sched_credit_tslice_ms", sched_credit_tslice_ms); /* * Physical CPU */ struct csched_pcpu { struct list_head runq; uint32_t runq_sort_last; struct timer ticker; unsigned int tick; unsigned int idle_bias; /* Store this here to avoid having too many cpumask_var_t-s on stack */ cpumask_var_t balance_mask; }; /* * Convenience macro for accessing the per-PCPU cpumask we need for * implementing the two steps (vcpu and node affinity) balancing logic. * It is stored in csched_pcpu so that serialization is not an issue, * as there is a csched_pcpu for each PCPU and we always hold the * runqueue spin-lock when using this. */ #define csched_balance_mask (CSCHED_PCPU(smp_processor_id())->balance_mask) /* * Virtual CPU */ struct csched_vcpu { struct list_head runq_elem; struct list_head active_vcpu_elem; struct csched_dom *sdom; struct vcpu *vcpu; atomic_t credit; unsigned int residual; s_time_t start_time; /* When we were scheduled (used for credit) */ unsigned flags; int16_t pri; #ifdef CSCHED_STATS struct { int credit_last; uint32_t credit_incr; uint32_t state_active; uint32_t state_idle; uint32_t migrate_q; uint32_t migrate_r; uint32_t kicked_away; } stats; #endif }; /* * Domain */ struct csched_dom { struct list_head active_vcpu; struct list_head active_sdom_elem; struct domain *dom; /* cpumask translated from the domain's node-affinity. * Basically, the CPUs we prefer to be scheduled on. */ cpumask_var_t node_affinity_cpumask; uint16_t active_vcpu_count; uint16_t weight; uint16_t cap; }; /* * System-wide private data */ struct csched_private { /* lock for the whole pluggable scheduler, nests inside cpupool_lock */ spinlock_t lock; struct list_head active_sdom; uint32_t ncpus; struct timer master_ticker; unsigned int master; cpumask_var_t idlers; cpumask_var_t cpus; uint32_t weight; uint32_t credit; int credit_balance; uint32_t runq_sort; unsigned ratelimit_us; /* Period of master and tick in milliseconds */ unsigned tslice_ms, tick_period_us, ticks_per_tslice; unsigned credits_per_tslice; }; static void csched_tick(void *_cpu); static void csched_acct(void *dummy); static inline int __vcpu_on_runq(struct csched_vcpu *svc) { return !list_empty(&svc->runq_elem); } static inline struct csched_vcpu * __runq_elem(struct list_head *elem) { return list_entry(elem, struct csched_vcpu, runq_elem); } static inline void __runq_insert(unsigned int cpu, struct csched_vcpu *svc) { const struct list_head * const runq = RUNQ(cpu); struct list_head *iter; BUG_ON( __vcpu_on_runq(svc) ); BUG_ON( cpu != svc->vcpu->processor ); list_for_each( iter, runq ) { const struct csched_vcpu * const iter_svc = __runq_elem(iter); if ( svc->pri > iter_svc->pri ) break; } /* If the vcpu yielded, try to put it behind one lower-priority * runnable vcpu if we can. The next runq_sort will bring it forward * within 30ms if the queue too long. 
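 * Illustrative example for a newly queued UNDER vcpu: with a runq of
 * [ UNDER, OVER, IDLE ] the non-yielding case inserts it ahead of the
 * OVER entry, giving [ UNDER, new, OVER, IDLE ]; with
 * CSCHED_FLAG_VCPU_YIELD set it is placed one slot further back,
 * [ UNDER, OVER, new, IDLE ], but never behind the idle vcpu.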
*/ if ( test_bit(CSCHED_FLAG_VCPU_YIELD, &svc->flags) && __runq_elem(iter)->pri > CSCHED_PRI_IDLE ) { iter=iter->next; /* Some sanity checks */ BUG_ON(iter == runq); } list_add_tail(&svc->runq_elem, iter); } static inline void __runq_remove(struct csched_vcpu *svc) { BUG_ON( !__vcpu_on_runq(svc) ); list_del_init(&svc->runq_elem); } /* * Translates node-affinity mask into a cpumask, so that we can use it during * actual scheduling. That of course will contain all the cpus from all the * set nodes in the original node-affinity mask. * * Note that any serialization needed to access mask safely is complete * responsibility of the caller of this function/hook. */ static void csched_set_node_affinity( const struct scheduler *ops, struct domain *d, nodemask_t *mask) { struct csched_dom *sdom; int node; /* Skip idle domain since it doesn't even have a node_affinity_cpumask */ if ( unlikely(is_idle_domain(d)) ) return; sdom = CSCHED_DOM(d); cpumask_clear(sdom->node_affinity_cpumask); for_each_node_mask( node, *mask ) cpumask_or(sdom->node_affinity_cpumask, sdom->node_affinity_cpumask, &node_to_cpumask(node)); } #define for_each_csched_balance_step(step) \ for ( (step) = 0; (step) <= CSCHED_BALANCE_CPU_AFFINITY; (step)++ ) /* * vcpu-affinity balancing is always necessary and must never be skipped. * OTOH, if a domain's node-affinity is said to be automatically computed * (or if it just spans all the nodes), we can safely avoid dealing with * node-affinity entirely. * * Node-affinity is also deemed meaningless in case it has empty * intersection with mask, to cover the cases where using the node-affinity * mask seems legit, but would instead led to trying to schedule the vcpu * on _no_ pcpu! Typical use cases are for mask to be equal to the vcpu's * vcpu-affinity, or to the && of vcpu-affinity and the set of online cpus * in the domain's cpupool. */ static inline int __vcpu_has_node_affinity(const struct vcpu *vc, const cpumask_t *mask) { const struct domain *d = vc->domain; const struct csched_dom *sdom = CSCHED_DOM(d); if ( d->auto_node_affinity || cpumask_full(sdom->node_affinity_cpumask) || !cpumask_intersects(sdom->node_affinity_cpumask, mask) ) return 0; return 1; } /* * Each csched-balance step uses its own cpumask. This function determines * which one (given the step) and copies it in mask. For the node-affinity * balancing step, the pcpus that are not part of vc's vcpu-affinity are * filtered out from the result, to avoid running a vcpu where it would * like, but is not allowed to! 
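 * Illustrative example: a vcpu whose vcpu-affinity is {4-7} in a domain
 * whose node-affinity maps to pcpus {0-3} yields an empty intersection
 * for the node-affinity step; in that case the code simply falls back
 * to the plain vcpu-affinity mask, so this step can never produce a
 * mask the vcpu is not allowed to run on.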
*/ static void csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask) { if ( step == CSCHED_BALANCE_NODE_AFFINITY ) { cpumask_and(mask, CSCHED_DOM(vc->domain)->node_affinity_cpumask, vc->cpu_affinity); if ( unlikely(cpumask_empty(mask)) ) cpumask_copy(mask, vc->cpu_affinity); } else /* step == CSCHED_BALANCE_CPU_AFFINITY */ cpumask_copy(mask, vc->cpu_affinity); } static void burn_credits(struct csched_vcpu *svc, s_time_t now) { s_time_t delta; uint64_t val; unsigned int credits; /* Assert svc is current */ ASSERT( svc == CSCHED_VCPU(curr_on_cpu(svc->vcpu->processor)) ); if ( (delta = now - svc->start_time) <= 0 ) return; val = delta * CSCHED_CREDITS_PER_MSEC + svc->residual; svc->residual = do_div(val, MILLISECS(1)); credits = val; ASSERT(credits == val); /* make sure we haven't truncated val */ atomic_sub(credits, &svc->credit); svc->start_time += (credits * MILLISECS(1)) / CSCHED_CREDITS_PER_MSEC; } static bool_t __read_mostly opt_tickle_one_idle = 1; boolean_param("tickle_one_idle_cpu", opt_tickle_one_idle); DEFINE_PER_CPU(unsigned int, last_tickle_cpu); static inline void __runq_tickle(unsigned int cpu, struct csched_vcpu *new) { struct csched_vcpu * const cur = CSCHED_VCPU(curr_on_cpu(cpu)); struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu)); cpumask_t mask, idle_mask; int balance_step, idlers_empty; ASSERT(cur); cpumask_clear(&mask); idlers_empty = cpumask_empty(prv->idlers); /* * If the pcpu is idle, or there are no idlers and the new * vcpu is a higher priority than the old vcpu, run it here. * * If there are idle cpus, first try to find one suitable to run * new, so we can avoid preempting cur. If we cannot find a * suitable idler on which to run new, run it here, but try to * find a suitable idler on which to run cur instead. */ if ( cur->pri == CSCHED_PRI_IDLE || (idlers_empty && new->pri > cur->pri) ) { if ( cur->pri != CSCHED_PRI_IDLE ) SCHED_STAT_CRANK(tickle_idlers_none); cpumask_set_cpu(cpu, &mask); } else if ( !idlers_empty ) { /* * Node and vcpu-affinity balancing loop. For vcpus without * a useful node-affinity, consider vcpu-affinity only. */ for_each_csched_balance_step( balance_step ) { int new_idlers_empty; if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY && !__vcpu_has_node_affinity(new->vcpu, new->vcpu->cpu_affinity) ) continue; /* Are there idlers suitable for new (for this balance step)? */ csched_balance_cpumask(new->vcpu, balance_step, csched_balance_mask); cpumask_and(&idle_mask, prv->idlers, csched_balance_mask); new_idlers_empty = cpumask_empty(&idle_mask); /* * Let's not be too harsh! If there aren't idlers suitable * for new in its node-affinity mask, make sure we check its * vcpu-affinity as well, before taking final decisions. */ if ( new_idlers_empty && balance_step == CSCHED_BALANCE_NODE_AFFINITY ) continue; /* * If there are no suitable idlers for new, and it's higher * priority than cur, ask the scheduler to migrate cur away. * We have to act like this (instead of just waking some of * the idlers suitable for cur) because cur is running. * * If there are suitable idlers for new, no matter priorities, * leave cur alone (as it is running and is, likely, cache-hot) * and wake some of them (which is waking up and so is, likely, * cache cold anyway). 
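 * In short (illustrative summary of the branches below):
 *   - suitable idlers exist            -> tickle one of them (or all of
 *     them if booted with tickle_one_idle_cpu=0) and leave cur running;
 *   - no suitable idlers, new > cur    -> mark cur for migration and
 *     tickle this pcpu so new can preempt it here;
 *   - no suitable idlers, new <= cur   -> retry with the wider
 *     vcpu-affinity step, otherwise nobody gets tickled.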
*/ if ( new_idlers_empty && new->pri > cur->pri ) { SCHED_STAT_CRANK(tickle_idlers_none); SCHED_VCPU_STAT_CRANK(cur, kicked_away); SCHED_VCPU_STAT_CRANK(cur, migrate_r); SCHED_STAT_CRANK(migrate_kicked_away); set_bit(_VPF_migrating, &cur->vcpu->pause_flags); cpumask_set_cpu(cpu, &mask); } else if ( !new_idlers_empty ) { /* Which of the idlers suitable for new shall we wake up? */ SCHED_STAT_CRANK(tickle_idlers_some); if ( opt_tickle_one_idle ) { this_cpu(last_tickle_cpu) = cpumask_cycle(this_cpu(last_tickle_cpu), &idle_mask); cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); } else cpumask_or(&mask, &mask, &idle_mask); } /* Did we find anyone? */ if ( !cpumask_empty(&mask) ) break; } } if ( !cpumask_empty(&mask) ) { if ( unlikely(tb_init_done) ) { /* Avoid TRACE_*: saves checking !tb_init_done each step */ for_each_cpu(cpu, &mask) __trace_var(TRC_CSCHED_TICKLE, 0, sizeof(cpu), &cpu); } /* Send scheduler interrupts to designated CPUs */ cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ); } } static void csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu) { struct csched_private *prv = CSCHED_PRIV(ops); struct csched_pcpu *spc = pcpu; unsigned long flags; if ( spc == NULL ) return; spin_lock_irqsave(&prv->lock, flags); prv->credit -= prv->credits_per_tslice; prv->ncpus--; cpumask_clear_cpu(cpu, prv->idlers); cpumask_clear_cpu(cpu, prv->cpus); if ( (prv->master == cpu) && (prv->ncpus > 0) ) { prv->master = cpumask_first(prv->cpus); migrate_timer(&prv->master_ticker, prv->master); } kill_timer(&spc->ticker); if ( prv->ncpus == 0 ) kill_timer(&prv->master_ticker); spin_unlock_irqrestore(&prv->lock, flags); free_cpumask_var(spc->balance_mask); xfree(spc); } static void * csched_alloc_pdata(const struct scheduler *ops, int cpu) { struct csched_pcpu *spc; struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; /* Allocate per-PCPU info */ spc = xzalloc(struct csched_pcpu); if ( spc == NULL ) return NULL; if ( !alloc_cpumask_var(&spc->balance_mask) ) { xfree(spc); return NULL; } spin_lock_irqsave(&prv->lock, flags); /* Initialize/update system-wide config */ prv->credit += prv->credits_per_tslice; prv->ncpus++; cpumask_set_cpu(cpu, prv->cpus); if ( prv->ncpus == 1 ) { prv->master = cpu; init_timer(&prv->master_ticker, csched_acct, prv, cpu); set_timer(&prv->master_ticker, NOW() + MILLISECS(prv->tslice_ms)); } init_timer(&spc->ticker, csched_tick, (void *)(unsigned long)cpu, cpu); set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); INIT_LIST_HEAD(&spc->runq); spc->runq_sort_last = prv->runq_sort; spc->idle_bias = nr_cpu_ids - 1; if ( per_cpu(schedule_data, cpu).sched_priv == NULL ) per_cpu(schedule_data, cpu).sched_priv = spc; /* Start off idling... */ BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu))); cpumask_set_cpu(cpu, prv->idlers); spin_unlock_irqrestore(&prv->lock, flags); return spc; } #ifndef NDEBUG static inline void __csched_vcpu_check(struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); struct csched_dom * const sdom = svc->sdom; BUG_ON( svc->vcpu != vc ); BUG_ON( sdom != CSCHED_DOM(vc->domain) ); if ( sdom ) { BUG_ON( is_idle_vcpu(vc) ); BUG_ON( sdom->dom != vc->domain ); } else { BUG_ON( !is_idle_vcpu(vc) ); } SCHED_STAT_CRANK(vcpu_check); } #define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc)) #else #define CSCHED_VCPU_CHECK(_vc) #endif /* * Delay, in microseconds, between migrations of a VCPU between PCPUs. * This prevents rapid fluttering of a VCPU between CPUs, and reduces the * implicit overheads such as cache-warming. 
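 * For instance, booting Xen with "vcpu_migration_delay=1000" makes
 * __csched_vcpu_is_cache_hot() treat any vcpu that last ran within the
 * previous 1000us as cache hot, so csched_runq_steal() will skip it
 * rather than pull it onto another pcpu.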
1ms (1000) has been measured * as a good value. */ static unsigned int vcpu_migration_delay; integer_param("vcpu_migration_delay", vcpu_migration_delay); void set_vcpu_migration_delay(unsigned int delay) { vcpu_migration_delay = delay; } unsigned int get_vcpu_migration_delay(void) { return vcpu_migration_delay; } static inline int __csched_vcpu_is_cache_hot(struct vcpu *v) { int hot = ((NOW() - v->last_run_time) < ((uint64_t)vcpu_migration_delay * 1000u)); if ( hot ) SCHED_STAT_CRANK(vcpu_hot); return hot; } static inline int __csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu, cpumask_t *mask) { /* * Don't pick up work that's in the peer's scheduling tail or hot on * peer PCPU. Only pick up work that prefers and/or is allowed to run * on our CPU. */ return !vc->is_running && !__csched_vcpu_is_cache_hot(vc) && cpumask_test_cpu(dest_cpu, mask); } static int _csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit) { cpumask_t cpus; cpumask_t idlers; cpumask_t *online; struct csched_pcpu *spc = NULL; int cpu = vc->processor; int balance_step; /* Store in cpus the mask of online cpus on which the domain can run */ online = cpupool_scheduler_cpumask(vc->domain->cpupool); cpumask_and(&cpus, vc->cpu_affinity, online); for_each_csched_balance_step( balance_step ) { /* * We want to pick up a pcpu among the ones that are online and * can accommodate vc, which is basically what we computed above * and stored in cpus. As far as vcpu-affinity is concerned, * there always will be at least one of these pcpus, hence cpus * is never empty and the calls to cpumask_cycle() and * cpumask_test_cpu() below are ok. * * On the other hand, when considering node-affinity too, it * is possible for the mask to become empty (for instance, if the * domain has been put in a cpupool that does not contain any of the * nodes in its node-affinity), which would result in the ASSERT()-s * inside cpumask_*() operations triggering (in debug builds). * * Therefore, in this case, we filter the node-affinity mask against * cpus and, if the result is empty, we just skip the node-affinity * balancing step all together. */ if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY && !__vcpu_has_node_affinity(vc, &cpus) ) continue; /* Pick an online CPU from the proper affinity mask */ csched_balance_cpumask(vc, balance_step, &cpus); cpumask_and(&cpus, &cpus, online); /* If present, prefer vc's current processor */ cpu = cpumask_test_cpu(vc->processor, &cpus) ? vc->processor : cpumask_cycle(vc->processor, &cpus); ASSERT(cpumask_test_cpu(cpu, &cpus)); /* * Try to find an idle processor within the above constraints. * * In multi-core and multi-threaded CPUs, not all idle execution * vehicles are equal! * * We give preference to the idle execution vehicle with the most * idling neighbours in its grouping. This distributes work across * distinct cores first and guarantees we don't do something stupid * like run two VCPUs on co-hyperthreads while there are idle cores * or sockets. * * Notice that, when computing the "idleness" of cpu, we may want to * discount vc. That is, iff vc is the currently running and the only * runnable vcpu on cpu, we add cpu to the idlers. 
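 * The loop below then compares "idleness" pair-wise.  Illustrative
 * numbers, with smt_power_savings disabled: if this core has 1 idle
 * sibling and another core on the same socket has 2, we prefer the
 * other core (1 * 1 < 2); across sockets the other side must be more
 * than twice as idle, so 1 idle thread here vs 2 on a remote socket
 * stays put (1 * 2 < 2 is false).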
*/ cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) ) cpumask_set_cpu(cpu, &idlers); cpumask_and(&cpus, &cpus, &idlers); /* * It is important that cpu points to an idle processor, if a suitable * one exists (and we can use cpus to check and, possibly, choose a new * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and * cpu points to a busy thread with an idle sibling, both the threads * will be considered the same, from the "idleness" calculation point * of view", preventing vcpu from being moved to the thread that is * actually idle. * * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so * we check for it first. */ if ( !cpumask_test_cpu(cpu, &cpus) && !cpumask_empty(&cpus) ) cpu = cpumask_cycle(cpu, &cpus); cpumask_clear_cpu(cpu, &cpus); while ( !cpumask_empty(&cpus) ) { cpumask_t cpu_idlers; cpumask_t nxt_idlers; int nxt, weight_cpu, weight_nxt; int migrate_factor; nxt = cpumask_cycle(cpu, &cpus); if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) { /* We're on the same socket, so check the busy-ness of threads. * Migrate if # of idlers is less at all */ ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); migrate_factor = 1; cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, cpu)); cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, nxt)); } else { /* We're on different sockets, so check the busy-ness of cores. * Migrate only if the other core is twice as idle */ ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); migrate_factor = 2; cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); } weight_cpu = cpumask_weight(&cpu_idlers); weight_nxt = cpumask_weight(&nxt_idlers); /* smt_power_savings: consolidate work rather than spreading it */ if ( sched_smt_power_savings ? 
weight_cpu > weight_nxt : weight_cpu * migrate_factor < weight_nxt ) { cpumask_and(&nxt_idlers, &cpus, &nxt_idlers); spc = CSCHED_PCPU(nxt); cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); cpumask_andnot(&cpus, &cpus, per_cpu(cpu_sibling_mask, cpu)); } else { cpumask_andnot(&cpus, &cpus, &nxt_idlers); } } /* Stop if cpu is idle */ if ( cpumask_test_cpu(cpu, &idlers) ) break; } if ( commit && spc ) spc->idle_bias = cpu; TRACE_3D(TRC_CSCHED_PICKED_CPU, vc->domain->domain_id, vc->vcpu_id, cpu); return cpu; } static int csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc) { return _csched_cpu_pick(ops, vc, 1); } static inline void __csched_vcpu_acct_start(struct csched_private *prv, struct csched_vcpu *svc) { struct csched_dom * const sdom = svc->sdom; unsigned long flags; spin_lock_irqsave(&prv->lock, flags); if ( list_empty(&svc->active_vcpu_elem) ) { SCHED_VCPU_STAT_CRANK(svc, state_active); SCHED_STAT_CRANK(acct_vcpu_active); sdom->active_vcpu_count++; list_add(&svc->active_vcpu_elem, &sdom->active_vcpu); /* Make weight per-vcpu */ prv->weight += sdom->weight; if ( list_empty(&sdom->active_sdom_elem) ) { list_add(&sdom->active_sdom_elem, &prv->active_sdom); } } TRACE_3D(TRC_CSCHED_ACCOUNT_START, sdom->dom->domain_id, svc->vcpu->vcpu_id, sdom->active_vcpu_count); spin_unlock_irqrestore(&prv->lock, flags); } static inline void __csched_vcpu_acct_stop_locked(struct csched_private *prv, struct csched_vcpu *svc) { struct csched_dom * const sdom = svc->sdom; BUG_ON( list_empty(&svc->active_vcpu_elem) ); SCHED_VCPU_STAT_CRANK(svc, state_idle); SCHED_STAT_CRANK(acct_vcpu_idle); BUG_ON( prv->weight < sdom->weight ); sdom->active_vcpu_count--; list_del_init(&svc->active_vcpu_elem); prv->weight -= sdom->weight; if ( list_empty(&sdom->active_vcpu) ) { list_del_init(&sdom->active_sdom_elem); } TRACE_3D(TRC_CSCHED_ACCOUNT_STOP, sdom->dom->domain_id, svc->vcpu->vcpu_id, sdom->active_vcpu_count); } static void csched_vcpu_acct(struct csched_private *prv, unsigned int cpu) { struct csched_vcpu * const svc = CSCHED_VCPU(current); const struct scheduler *ops = per_cpu(scheduler, cpu); ASSERT( current->processor == cpu ); ASSERT( svc->sdom != NULL ); /* * If this VCPU's priority was boosted when it last awoke, reset it. * If the VCPU is found here, then it's consuming a non-negligeable * amount of CPU resources and should no longer be boosted. */ if ( svc->pri == CSCHED_PRI_TS_BOOST ) svc->pri = CSCHED_PRI_TS_UNDER; /* * Update credits */ if ( !is_idle_vcpu(svc->vcpu) ) burn_credits(svc, NOW()); /* * Put this VCPU and domain back on the active list if it was * idling. * * If it's been active a while, check if we'd be better off * migrating it to run elsewhere (see multi-core and multi-thread * support in csched_cpu_pick()). */ if ( list_empty(&svc->active_vcpu_elem) ) { __csched_vcpu_acct_start(prv, svc); } else if ( _csched_cpu_pick(ops, current, 0) != cpu ) { SCHED_VCPU_STAT_CRANK(svc, migrate_r); SCHED_STAT_CRANK(migrate_running); set_bit(_VPF_migrating, ¤t->pause_flags); cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); } } static void * csched_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) { struct csched_vcpu *svc; /* Allocate per-VCPU info */ svc = xzalloc(struct csched_vcpu); if ( svc == NULL ) return NULL; INIT_LIST_HEAD(&svc->runq_elem); INIT_LIST_HEAD(&svc->active_vcpu_elem); svc->sdom = dd; svc->vcpu = vc; svc->pri = is_idle_domain(vc->domain) ? 
CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER; SCHED_VCPU_STATS_RESET(svc); SCHED_STAT_CRANK(vcpu_init); return svc; } static void csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu *svc = vc->sched_priv; if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running ) __runq_insert(vc->processor, svc); } static void csched_free_vdata(const struct scheduler *ops, void *priv) { struct csched_vcpu *svc = priv; BUG_ON( !list_empty(&svc->runq_elem) ); xfree(svc); } static void csched_vcpu_remove(const struct scheduler *ops, struct vcpu *vc) { struct csched_private *prv = CSCHED_PRIV(ops); struct csched_vcpu * const svc = CSCHED_VCPU(vc); struct csched_dom * const sdom = svc->sdom; unsigned long flags; SCHED_STAT_CRANK(vcpu_destroy); if ( test_and_clear_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) { SCHED_STAT_CRANK(vcpu_unpark); vcpu_unpause(svc->vcpu); } if ( __vcpu_on_runq(svc) ) __runq_remove(svc); spin_lock_irqsave(&(prv->lock), flags); if ( !list_empty(&svc->active_vcpu_elem) ) __csched_vcpu_acct_stop_locked(prv, svc); spin_unlock_irqrestore(&(prv->lock), flags); BUG_ON( sdom == NULL ); BUG_ON( !list_empty(&svc->runq_elem) ); } static void csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); SCHED_STAT_CRANK(vcpu_sleep); BUG_ON( is_idle_vcpu(vc) ); if ( curr_on_cpu(vc->processor) == vc ) cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ); else if ( __vcpu_on_runq(svc) ) __runq_remove(svc); } static void csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); const unsigned int cpu = vc->processor; BUG_ON( is_idle_vcpu(vc) ); if ( unlikely(curr_on_cpu(cpu) == vc) ) { SCHED_STAT_CRANK(vcpu_wake_running); return; } if ( unlikely(__vcpu_on_runq(svc)) ) { SCHED_STAT_CRANK(vcpu_wake_onrunq); return; } if ( likely(vcpu_runnable(vc)) ) SCHED_STAT_CRANK(vcpu_wake_runnable); else SCHED_STAT_CRANK(vcpu_wake_not_runnable); /* * We temporarly boost the priority of awaking VCPUs! * * If this VCPU consumes a non negligeable amount of CPU, it * will eventually find itself in the credit accounting code * path where its priority will be reset to normal. * * If on the other hand the VCPU consumes little CPU and is * blocking and awoken a lot (doing I/O for example), its * priority will remain boosted, optimizing it's wake-to-run * latencies. * * This allows wake-to-run latency sensitive VCPUs to preempt * more CPU resource intensive VCPUs without impacting overall * system fairness. * * The one exception is for VCPUs of capped domains unpausing * after earning credits they had overspent. We don't boost * those. */ if ( svc->pri == CSCHED_PRI_TS_UNDER && !test_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) { svc->pri = CSCHED_PRI_TS_BOOST; } /* Put the VCPU on the runq and tickle CPUs */ __runq_insert(cpu, svc); __runq_tickle(cpu, svc); } static void csched_vcpu_yield(const struct scheduler *ops, struct vcpu *vc) { struct csched_vcpu * const svc = CSCHED_VCPU(vc); /* Let the scheduler know that this vcpu is trying to yield */ set_bit(CSCHED_FLAG_VCPU_YIELD, &svc->flags); } static int csched_dom_cntl( const struct scheduler *ops, struct domain *d, struct xen_domctl_scheduler_op *op) { struct csched_dom * const sdom = CSCHED_DOM(d); struct csched_private *prv = CSCHED_PRIV(ops); unsigned long flags; /* Protect both get and put branches with the pluggable scheduler * lock. Runq lock not needed anywhere in here. 
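 * This is the path taken when the toolstack adjusts weight/cap, e.g.
 * (illustrative) "xl sched-credit -d mydom -w 512 -c 50" arrives here
 * as an XEN_DOMCTL_SCHEDOP_putinfo with weight 512 and a cap of 50%
 * of one pcpu.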
*/ spin_lock_irqsave(&prv->lock, flags); if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo ) { op->u.credit.weight = sdom->weight; op->u.credit.cap = sdom->cap; } else { ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo); if ( op->u.credit.weight != 0 ) { if ( !list_empty(&sdom->active_sdom_elem) ) { prv->weight -= sdom->weight * sdom->active_vcpu_count; prv->weight += op->u.credit.weight * sdom->active_vcpu_count; } sdom->weight = op->u.credit.weight; } if ( op->u.credit.cap != (uint16_t)~0U ) sdom->cap = op->u.credit.cap; } spin_unlock_irqrestore(&prv->lock, flags); return 0; } static inline void __csched_set_tslice(struct csched_private *prv, unsigned timeslice) { prv->tslice_ms = timeslice; prv->ticks_per_tslice = CSCHED_TICKS_PER_TSLICE; if ( prv->tslice_ms < prv->ticks_per_tslice ) prv->ticks_per_tslice = 1; prv->tick_period_us = prv->tslice_ms * 1000 / prv->ticks_per_tslice; prv->credits_per_tslice = CSCHED_CREDITS_PER_MSEC * prv->tslice_ms; } static int csched_sys_cntl(const struct scheduler *ops, struct xen_sysctl_scheduler_op *sc) { int rc = -EINVAL; xen_sysctl_credit_schedule_t *params = &sc->u.sched_credit; struct csched_private *prv = CSCHED_PRIV(ops); switch ( sc->cmd ) { case XEN_SYSCTL_SCHEDOP_putinfo: if (params->tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX || params->tslice_ms < XEN_SYSCTL_CSCHED_TSLICE_MIN || (params->ratelimit_us && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN)) || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms) ) goto out; __csched_set_tslice(prv, params->tslice_ms); prv->ratelimit_us = params->ratelimit_us; /* FALLTHRU */ case XEN_SYSCTL_SCHEDOP_getinfo: params->tslice_ms = prv->tslice_ms; params->ratelimit_us = prv->ratelimit_us; rc = 0; break; } out: return rc; } static void * csched_alloc_domdata(const struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom; sdom = xzalloc(struct csched_dom); if ( sdom == NULL ) return NULL; if ( !alloc_cpumask_var(&sdom->node_affinity_cpumask) ) { xfree(sdom); return NULL; } cpumask_setall(sdom->node_affinity_cpumask); /* Initialize credit and weight */ INIT_LIST_HEAD(&sdom->active_vcpu); INIT_LIST_HEAD(&sdom->active_sdom_elem); sdom->dom = dom; sdom->weight = CSCHED_DEFAULT_WEIGHT; return (void *)sdom; } static int csched_dom_init(const struct scheduler *ops, struct domain *dom) { struct csched_dom *sdom; if ( is_idle_domain(dom) ) return 0; sdom = csched_alloc_domdata(ops, dom); if ( sdom == NULL ) return -ENOMEM; dom->sched_priv = sdom; return 0; } static void csched_free_domdata(const struct scheduler *ops, void *data) { struct csched_dom *sdom = data; free_cpumask_var(sdom->node_affinity_cpumask); xfree(data); } static void csched_dom_destroy(const struct scheduler *ops, struct domain *dom) { csched_free_domdata(ops, CSCHED_DOM(dom)); } /* * This is a O(n) optimized sort of the runq. * * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk * through the runq and move up any UNDERs that are preceded by OVERS. We * remember the last UNDER to make the move up operation O(1). 
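 * Worked example: a runq of [ O1, U1, O2, U2 ] (U = UNDER, O = OVER)
 * is rewritten to [ U1, U2, O1, O2 ]; entries keep their relative
 * order within each priority class, so this is a stable partition
 * rather than a full sort.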
*/ static void csched_runq_sort(struct csched_private *prv, unsigned int cpu) { struct csched_pcpu * const spc = CSCHED_PCPU(cpu); struct list_head *runq, *elem, *next, *last_under; struct csched_vcpu *svc_elem; spinlock_t *lock; unsigned long flags; int sort_epoch; sort_epoch = prv->runq_sort; if ( sort_epoch == spc->runq_sort_last ) return; spc->runq_sort_last = sort_epoch; lock = pcpu_schedule_lock_irqsave(cpu, &flags); runq = &spc->runq; elem = runq->next; last_under = runq; while ( elem != runq ) { next = elem->next; svc_elem = __runq_elem(elem); if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER ) { /* does elem need to move up the runq? */ if ( elem->prev != last_under ) { list_del(elem); list_add(elem, last_under); } last_under = elem; } elem = next; } pcpu_schedule_unlock_irqrestore(lock, flags, cpu); } static void csched_acct(void* dummy) { struct csched_private *prv = dummy; unsigned long flags; struct list_head *iter_vcpu, *next_vcpu; struct list_head *iter_sdom, *next_sdom; struct csched_vcpu *svc; struct csched_dom *sdom; uint32_t credit_total; uint32_t weight_total; uint32_t weight_left; uint32_t credit_fair; uint32_t credit_peak; uint32_t credit_cap; int credit_balance; int credit_xtra; int credit; spin_lock_irqsave(&prv->lock, flags); weight_total = prv->weight; credit_total = prv->credit; /* Converge balance towards 0 when it drops negative */ if ( prv->credit_balance < 0 ) { credit_total -= prv->credit_balance; SCHED_STAT_CRANK(acct_balance); } if ( unlikely(weight_total == 0) ) { prv->credit_balance = 0; spin_unlock_irqrestore(&prv->lock, flags); SCHED_STAT_CRANK(acct_no_work); goto out; } SCHED_STAT_CRANK(acct_run); weight_left = weight_total; credit_balance = 0; credit_xtra = 0; credit_cap = 0U; list_for_each_safe( iter_sdom, next_sdom, &prv->active_sdom ) { sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); BUG_ON( is_idle_domain(sdom->dom) ); BUG_ON( sdom->active_vcpu_count == 0 ); BUG_ON( sdom->weight == 0 ); BUG_ON( (sdom->weight * sdom->active_vcpu_count) > weight_left ); weight_left -= ( sdom->weight * sdom->active_vcpu_count ); /* * A domain's fair share is computed using its weight in competition * with that of all other active domains. * * At most, a domain can use credits to run all its active VCPUs * for one full accounting period. We allow a domain to earn more * only when the system-wide credit balance is negative. */ credit_peak = sdom->active_vcpu_count * prv->credits_per_tslice; if ( prv->credit_balance < 0 ) { credit_peak += ( ( -prv->credit_balance * sdom->weight * sdom->active_vcpu_count) + (weight_total - 1) ) / weight_total; } if ( sdom->cap != 0U ) { credit_cap = ((sdom->cap * prv->credits_per_tslice) + 99) / 100; if ( credit_cap < credit_peak ) credit_peak = credit_cap; /* FIXME -- set cap per-vcpu as well...? */ credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 ) ) / sdom->active_vcpu_count; } credit_fair = ( ( credit_total * sdom->weight * sdom->active_vcpu_count ) + (weight_total - 1) ) / weight_total; if ( credit_fair < credit_peak ) { credit_xtra = 1; } else { if ( weight_left != 0U ) { /* Give other domains a chance at unused credits */ credit_total += ( ( ( credit_fair - credit_peak ) * weight_total ) + ( weight_left - 1 ) ) / weight_left; } if ( credit_xtra ) { /* * Lazily keep domains with extra credits at the head of * the queue to give others a chance at them in future * accounting periods. 
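 * For a sense of the magnitudes being juggled above (illustrative,
 * default parameters: 30ms timeslice, 10 credits per ms, i.e. 300
 * credits per accounting period): a domain with two active vcpus has
 * credit_peak = 600; if it also has cap = 50 then
 * credit_cap = (50 * 300 + 99) / 100 = 150, which lowers credit_peak
 * to 150 and leaves a per-vcpu cap of (150 + 1) / 2 = 75 credits.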
*/ SCHED_STAT_CRANK(acct_reorder); list_del(&sdom->active_sdom_elem); list_add(&sdom->active_sdom_elem, &prv->active_sdom); } credit_fair = credit_peak; } /* Compute fair share per VCPU */ credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 ) ) / sdom->active_vcpu_count; list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu ) { svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem); BUG_ON( sdom != svc->sdom ); /* Increment credit */ atomic_add(credit_fair, &svc->credit); credit = atomic_read(&svc->credit); /* * Recompute priority or, if VCPU is idling, remove it from * the active list. */ if ( credit < 0 ) { svc->pri = CSCHED_PRI_TS_OVER; /* Park running VCPUs of capped-out domains */ if ( sdom->cap != 0U && credit < -credit_cap && !test_and_set_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) { SCHED_STAT_CRANK(vcpu_park); vcpu_pause_nosync(svc->vcpu); } /* Lower bound on credits */ if ( credit < -prv->credits_per_tslice ) { SCHED_STAT_CRANK(acct_min_credit); credit = -prv->credits_per_tslice; atomic_set(&svc->credit, credit); } } else { svc->pri = CSCHED_PRI_TS_UNDER; /* Unpark any capped domains whose credits go positive */ if ( test_and_clear_bit(CSCHED_FLAG_VCPU_PARKED, &svc->flags) ) { /* * It's important to unset the flag AFTER the unpause() * call to make sure the VCPU's priority is not boosted * if it is woken up here. */ SCHED_STAT_CRANK(vcpu_unpark); vcpu_unpause(svc->vcpu); } /* Upper bound on credits means VCPU stops earning */ if ( credit > prv->credits_per_tslice ) { __csched_vcpu_acct_stop_locked(prv, svc); /* Divide credits in half, so that when it starts * accounting again, it starts a little bit "ahead" */ credit /= 2; atomic_set(&svc->credit, credit); } } SCHED_VCPU_STAT_SET(svc, credit_last, credit); SCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair); credit_balance += credit; } } prv->credit_balance = credit_balance; spin_unlock_irqrestore(&prv->lock, flags); /* Inform each CPU that its runq needs to be sorted */ prv->runq_sort++; out: set_timer( &prv->master_ticker, NOW() + MILLISECS(prv->tslice_ms)); } static void csched_tick(void *_cpu) { unsigned int cpu = (unsigned long)_cpu; struct csched_pcpu *spc = CSCHED_PCPU(cpu); struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu)); spc->tick++; /* * Accounting for running VCPU */ if ( !is_idle_vcpu(current) ) csched_vcpu_acct(prv, cpu); /* * Check if runq needs to be sorted * * Every physical CPU resorts the runq after the accounting master has * modified priorities. This is a special O(n) sort and runs at most * once per accounting period (currently 30 milliseconds). */ csched_runq_sort(prv, cpu); set_timer(&spc->ticker, NOW() + MICROSECS(prv->tick_period_us) ); } static struct csched_vcpu * csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step) { const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); const struct vcpu * const peer_vcpu = curr_on_cpu(peer_cpu); struct csched_vcpu *speer; struct list_head *iter; struct vcpu *vc; /* * Don't steal from an idle CPU's runq because it's about to * pick up work from it itself. */ if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) ) { list_for_each( iter, &peer_pcpu->runq ) { speer = __runq_elem(iter); /* * If next available VCPU here is not of strictly higher * priority than ours, this PCPU is useless to us. */ if ( speer->pri <= pri ) break; /* Is this VCPU runnable on our PCPU? */ vc = speer->vcpu; BUG_ON( is_idle_vcpu(vc) ); /* * If the vcpu has no useful node-affinity, skip this vcpu. 
* In fact, what we want is to check if we have any node-affine * work to steal, before starting to look at vcpu-affine work. * * Notice that, if not even one vCPU on this runq has a useful * node-affinity, we could have avoid considering this runq for * a node balancing step in the first place. This, for instance, * can be implemented by taking note of on what runq there are * vCPUs with useful node-affinities in some sort of bitmap * or counter. */ if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY && !__vcpu_has_node_affinity(vc, vc->cpu_affinity) ) continue; csched_balance_cpumask(vc, balance_step, csched_balance_mask); if ( __csched_vcpu_is_migrateable(vc, cpu, csched_balance_mask) ) { /* We got a candidate. Grab it! */ TRACE_3D(TRC_CSCHED_STOLEN_VCPU, peer_cpu, vc->domain->domain_id, vc->vcpu_id); SCHED_VCPU_STAT_CRANK(speer, migrate_q); SCHED_STAT_CRANK(migrate_queued); WARN_ON(vc->is_urgent); __runq_remove(speer); vc->processor = cpu; return speer; } } } SCHED_STAT_CRANK(steal_peer_idle); return NULL; } static struct csched_vcpu * csched_load_balance(struct csched_private *prv, int cpu, struct csched_vcpu *snext, bool_t *stolen) { struct csched_vcpu *speer; cpumask_t workers; cpumask_t *online; int peer_cpu, peer_node, bstep; int node = cpu_to_node(cpu); BUG_ON( cpu != snext->vcpu->processor ); online = cpupool_scheduler_cpumask(per_cpu(cpupool, cpu)); /* If this CPU is going offline we shouldn't steal work. */ if ( unlikely(!cpumask_test_cpu(cpu, online)) ) goto out; if ( snext->pri == CSCHED_PRI_IDLE ) SCHED_STAT_CRANK(load_balance_idle); else if ( snext->pri == CSCHED_PRI_TS_OVER ) SCHED_STAT_CRANK(load_balance_over); else SCHED_STAT_CRANK(load_balance_other); /* * Let's look around for work to steal, taking both vcpu-affinity * and node-affinity into account. More specifically, we check all * the non-idle CPUs' runq, looking for: * 1. any node-affine work to steal first, * 2. if not finding anything, any vcpu-affine work to steal. */ for_each_csched_balance_step( bstep ) { /* * We peek at the non-idling CPUs in a node-wise fashion. In fact, * it is more likely that we find some node-affine work on our same * node, not to mention that migrating vcpus within the same node * could well expected to be cheaper than across-nodes (memory * stays local, there might be some node-wide cache[s], etc.). */ peer_node = node; do { /* Find out what the !idle are in this node */ cpumask_andnot(&workers, online, prv->idlers); cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); cpumask_clear_cpu(cpu, &workers); peer_cpu = cpumask_first(&workers); if ( peer_cpu >= nr_cpu_ids ) goto next_node; do { /* * Get ahold of the scheduler lock for this peer CPU. * * Note: We don't spin on this lock but simply try it. Spinning * could cause a deadlock if the peer CPU is also load * balancing and trying to lock this CPU. */ spinlock_t *lock = pcpu_schedule_trylock(peer_cpu); if ( !lock ) { SCHED_STAT_CRANK(steal_trylock_failed); peer_cpu = cpumask_cycle(peer_cpu, &workers); continue; } /* Any work over there to steal? */ speer = cpumask_test_cpu(peer_cpu, online) ? 
csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL; pcpu_schedule_unlock(lock, peer_cpu); /* As soon as one vcpu is found, balancing ends */ if ( speer != NULL ) { *stolen = 1; return speer; } peer_cpu = cpumask_cycle(peer_cpu, &workers); } while( peer_cpu != cpumask_first(&workers) ); next_node: peer_node = cycle_node(peer_node, node_online_map); } while( peer_node != node ); } out: /* Failed to find more important work elsewhere... */ __runq_remove(snext); return snext; } /* * This function is in the critical path. It is designed to be simple and * fast for the common case. */ static struct task_slice csched_schedule( const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled) { const int cpu = smp_processor_id(); struct list_head * const runq = RUNQ(cpu); struct csched_vcpu * const scurr = CSCHED_VCPU(current); struct csched_private *prv = CSCHED_PRIV(ops); struct csched_vcpu *snext; struct task_slice ret; s_time_t runtime, tslice; SCHED_STAT_CRANK(schedule); CSCHED_VCPU_CHECK(current); runtime = now - current->runstate.state_entry_time; if ( runtime < 0 ) /* Does this ever happen? */ runtime = 0; if ( !is_idle_vcpu(scurr->vcpu) ) { /* Update credits of a non-idle VCPU. */ burn_credits(scurr, now); scurr->start_time -= now; } else { /* Re-instate a boosted idle VCPU as normal-idle. */ scurr->pri = CSCHED_PRI_IDLE; } /* Choices, choices: * - If we have a tasklet, we need to run the idle vcpu no matter what. * - If sched rate limiting is in effect, and the current vcpu has * run for less than that amount of time, continue the current one, * but with a shorter timeslice and return it immediately * - Otherwise, chose the one with the highest priority (which may * be the one currently running) * - If the currently running one is TS_OVER, see if there * is a higher priority one waiting on the runqueue of another * cpu and steal it. */ /* If we have schedule rate limiting enabled, check to see * how long we've run for. */ if ( !tasklet_work_scheduled && prv->ratelimit_us && vcpu_runnable(current) && !is_idle_vcpu(current) && runtime < MICROSECS(prv->ratelimit_us) ) { snext = scurr; snext->start_time += now; perfc_incr(delay_ms); tslice = MICROSECS(prv->ratelimit_us); ret.migrated = 0; goto out; } tslice = MILLISECS(prv->tslice_ms); /* * Select next runnable local VCPU (ie top of local runq) */ if ( vcpu_runnable(current) ) __runq_insert(cpu, scurr); else BUG_ON( is_idle_vcpu(current) || list_empty(runq) ); snext = __runq_elem(runq->next); ret.migrated = 0; /* Tasklet work (which runs in idle VCPU context) overrides all else. */ if ( tasklet_work_scheduled ) { TRACE_0D(TRC_CSCHED_SCHED_TASKLET); snext = CSCHED_VCPU(idle_vcpu[cpu]); snext->pri = CSCHED_PRI_TS_BOOST; } /* * Clear YIELD flag before scheduling out */ clear_bit(CSCHED_FLAG_VCPU_YIELD, &scurr->flags); /* * SMP Load balance: * * If the next highest priority local runnable VCPU has already eaten * through its credits, look on other PCPUs to see if we have more * urgent work... If not, csched_load_balance() will return snext, but * already removed from the runq. */ if ( snext->pri > CSCHED_PRI_TS_OVER ) __runq_remove(snext); else snext = csched_load_balance(prv, cpu, snext, &ret.migrated); /* * Update idlers mask if necessary. When we're idling, other CPUs * will tickle us when they get extra work. 
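 * (So a pcpu that picks its idle vcpu advertises itself in prv->idlers
 * and relies on __runq_tickle() from another pcpu to wake it, rather
 * than polling for work itself.)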
*/ if ( snext->pri == CSCHED_PRI_IDLE ) { if ( !cpumask_test_cpu(cpu, prv->idlers) ) cpumask_set_cpu(cpu, prv->idlers); } else if ( cpumask_test_cpu(cpu, prv->idlers) ) { cpumask_clear_cpu(cpu, prv->idlers); } if ( !is_idle_vcpu(snext->vcpu) ) snext->start_time += now; out: /* * Return task to run next... */ ret.time = (is_idle_vcpu(snext->vcpu) ? -1 : tslice); ret.task = snext->vcpu; CSCHED_VCPU_CHECK(ret.task); return ret; } static void csched_dump_vcpu(struct csched_vcpu *svc) { struct csched_dom * const sdom = svc->sdom; printk("[%i.%i] pri=%i flags=%x cpu=%i", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id, svc->pri, svc->flags, svc->vcpu->processor); if ( sdom ) { printk(" credit=%i [w=%u,cap=%u]", atomic_read(&svc->credit), sdom->weight, sdom->cap); #ifdef CSCHED_STATS printk(" (%d+%u) {a/i=%u/%u m=%u+%u (k=%u)}", svc->stats.credit_last, svc->stats.credit_incr, svc->stats.state_active, svc->stats.state_idle, svc->stats.migrate_q, svc->stats.migrate_r, svc->stats.kicked_away); #endif } printk("\n"); } static void csched_dump_pcpu(const struct scheduler *ops, int cpu) { struct list_head *runq, *iter; struct csched_pcpu *spc; struct csched_vcpu *svc; int loop; #define cpustr keyhandler_scratch spc = CSCHED_PCPU(cpu); runq = &spc->runq; cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu)); printk(" sort=%d, sibling=%s, ", spc->runq_sort_last, cpustr); cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu)); printk("core=%s\n", cpustr); /* current VCPU */ svc = CSCHED_VCPU(curr_on_cpu(cpu)); if ( svc ) { printk("\trun: "); csched_dump_vcpu(svc); } loop = 0; list_for_each( iter, runq ) { svc = __runq_elem(iter); if ( svc ) { printk("\t%3d: ", ++loop); csched_dump_vcpu(svc); } } #undef cpustr } static void csched_dump(const struct scheduler *ops) { struct list_head *iter_sdom, *iter_svc; struct csched_private *prv = CSCHED_PRIV(ops); int loop; unsigned long flags; spin_lock_irqsave(&(prv->lock), flags); #define idlers_buf keyhandler_scratch printk("info:\n" "\tncpus = %u\n" "\tmaster = %u\n" "\tcredit = %u\n" "\tcredit balance = %d\n" "\tweight = %u\n" "\trunq_sort = %u\n" "\tdefault-weight = %d\n" "\ttslice = %dms\n" "\tratelimit = %dus\n" "\tcredits per msec = %d\n" "\tticks per tslice = %d\n" "\tmigration delay = %uus\n", prv->ncpus, prv->master, prv->credit, prv->credit_balance, prv->weight, prv->runq_sort, CSCHED_DEFAULT_WEIGHT, prv->tslice_ms, prv->ratelimit_us, CSCHED_CREDITS_PER_MSEC, prv->ticks_per_tslice, vcpu_migration_delay); cpumask_scnprintf(idlers_buf, sizeof(idlers_buf), prv->idlers); printk("idlers: %s\n", idlers_buf); printk("active vcpus:\n"); loop = 0; list_for_each( iter_sdom, &prv->active_sdom ) { struct csched_dom *sdom; sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem); list_for_each( iter_svc, &sdom->active_vcpu ) { struct csched_vcpu *svc; svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem); printk("\t%3d: ", ++loop); csched_dump_vcpu(svc); } } #undef idlers_buf spin_unlock_irqrestore(&(prv->lock), flags); } static int csched_init(struct scheduler *ops) { struct csched_private *prv; prv = xzalloc(struct csched_private); if ( prv == NULL ) return -ENOMEM; if ( !zalloc_cpumask_var(&prv->cpus) || !zalloc_cpumask_var(&prv->idlers) ) { free_cpumask_var(prv->cpus); xfree(prv); return -ENOMEM; } ops->sched_data = prv; spin_lock_init(&prv->lock); INIT_LIST_HEAD(&prv->active_sdom); prv->master = UINT_MAX; if ( sched_credit_tslice_ms > XEN_SYSCTL_CSCHED_TSLICE_MAX || sched_credit_tslice_ms < 
XEN_SYSCTL_CSCHED_TSLICE_MIN ) { printk("WARNING: sched_credit_tslice_ms outside of valid range [%d,%d].\n" " Resetting to default %u\n", XEN_SYSCTL_CSCHED_TSLICE_MIN, XEN_SYSCTL_CSCHED_TSLICE_MAX, CSCHED_DEFAULT_TSLICE_MS); sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; } __csched_set_tslice(prv, sched_credit_tslice_ms); if ( MICROSECS(sched_ratelimit_us) > MILLISECS(sched_credit_tslice_ms) ) { printk("WARNING: sched_ratelimit_us >" "sched_credit_tslice_ms is undefined\n" "Setting ratelimit_us to 1000 * tslice_ms\n"); prv->ratelimit_us = 1000 * prv->tslice_ms; } else prv->ratelimit_us = sched_ratelimit_us; return 0; } static void csched_deinit(const struct scheduler *ops) { struct csched_private *prv; prv = CSCHED_PRIV(ops); if ( prv != NULL ) { free_cpumask_var(prv->cpus); free_cpumask_var(prv->idlers); xfree(prv); } } static void csched_tick_suspend(const struct scheduler *ops, unsigned int cpu) { struct csched_pcpu *spc; spc = CSCHED_PCPU(cpu); stop_timer(&spc->ticker); } static void csched_tick_resume(const struct scheduler *ops, unsigned int cpu) { struct csched_private *prv; struct csched_pcpu *spc; uint64_t now = NOW(); spc = CSCHED_PCPU(cpu); prv = CSCHED_PRIV(ops); set_timer(&spc->ticker, now + MICROSECS(prv->tick_period_us) - now % MICROSECS(prv->tick_period_us) ); } static struct csched_private _csched_priv; const struct scheduler sched_credit_def = { .name = "SMP Credit Scheduler", .opt_name = "credit", .sched_id = XEN_SCHEDULER_CREDIT, .sched_data = &_csched_priv, .init_domain = csched_dom_init, .destroy_domain = csched_dom_destroy, .insert_vcpu = csched_vcpu_insert, .remove_vcpu = csched_vcpu_remove, .sleep = csched_vcpu_sleep, .wake = csched_vcpu_wake, .yield = csched_vcpu_yield, .adjust = csched_dom_cntl, .adjust_global = csched_sys_cntl, .set_node_affinity = csched_set_node_affinity, .pick_cpu = csched_cpu_pick, .do_schedule = csched_schedule, .dump_cpu_state = csched_dump_pcpu, .dump_settings = csched_dump, .init = csched_init, .deinit = csched_deinit, .alloc_vdata = csched_alloc_vdata, .free_vdata = csched_free_vdata, .alloc_pdata = csched_alloc_pdata, .free_pdata = csched_free_pdata, .alloc_domdata = csched_alloc_domdata, .free_domdata = csched_free_domdata, .tick_suspend = csched_tick_suspend, .tick_resume = csched_tick_resume, }; xen-4.4.0/xen/common/event_fifo.c0000664000175000017500000003562312307313555015026 0ustar smbsmb/* * FIFO event channel management. * * Copyright (C) 2013 Citrix Systems R&D Ltd. * * This source code is licensed under the GNU General Public License, * Version 2 or later. See the file COPYING for more details. */ #include #include #include #include #include #include #include #include #include #include #include static inline event_word_t *evtchn_fifo_word_from_port(struct domain *d, unsigned int port) { unsigned int p, w; if ( unlikely(port >= d->evtchn_fifo->num_evtchns) ) return NULL; p = port / EVTCHN_FIFO_EVENT_WORDS_PER_PAGE; w = port % EVTCHN_FIFO_EVENT_WORDS_PER_PAGE; return d->evtchn_fifo->event_array[p] + w; } static void evtchn_fifo_init(struct domain *d, struct evtchn *evtchn) { event_word_t *word; evtchn->priority = EVTCHN_FIFO_PRIORITY_DEFAULT; /* * If this event is still linked, the first event may be delivered * on the wrong VCPU or with an unexpected priority. 
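 * For reference (illustrative, per the public FIFO event channel ABI):
 * each event word packs the PENDING, MASKED, LINKED and BUSY flag bits
 * together with a LINK field of EVTCHN_FIFO_LINK_BITS bits naming the
 * next port on a per-priority queue, which is why a stale LINKED bit
 * here means the event is still threaded onto some older queue.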
*/ word = evtchn_fifo_word_from_port(d, evtchn->port); if ( word && test_bit(EVTCHN_FIFO_LINKED, word) ) gdprintk(XENLOG_WARNING, "domain %d, port %d already on a queue\n", d->domain_id, evtchn->port); } static struct evtchn_fifo_queue *lock_old_queue(const struct domain *d, struct evtchn *evtchn, unsigned long *flags) { struct vcpu *v; struct evtchn_fifo_queue *q, *old_q; unsigned int try; for ( try = 0; try < 3; try++ ) { v = d->vcpu[evtchn->last_vcpu_id]; old_q = &v->evtchn_fifo->queue[evtchn->last_priority]; spin_lock_irqsave(&old_q->lock, *flags); v = d->vcpu[evtchn->last_vcpu_id]; q = &v->evtchn_fifo->queue[evtchn->last_priority]; if ( old_q == q ) return old_q; spin_unlock_irqrestore(&old_q->lock, *flags); } gdprintk(XENLOG_WARNING, "domain %d, port %d lost event (too many queue changes)\n", d->domain_id, evtchn->port); return NULL; } static int try_set_link(event_word_t *word, event_word_t *w, uint32_t link) { event_word_t new, old; if ( !(*w & (1 << EVTCHN_FIFO_LINKED)) ) return 0; old = *w; new = (old & ~((1 << EVTCHN_FIFO_BUSY) | EVTCHN_FIFO_LINK_MASK)) | link; *w = cmpxchg(word, old, new); if ( *w == old ) return 1; return -EAGAIN; } /* * Atomically set the LINK field iff it is still LINKED. * * The guest is only permitted to make the following changes to a * LINKED event. * * - set MASKED * - clear MASKED * - clear PENDING * - clear LINKED (and LINK) * * We block unmasking by the guest by marking the tail word as BUSY, * therefore, the cmpxchg() may fail at most 4 times. */ static bool_t evtchn_fifo_set_link(const struct domain *d, event_word_t *word, uint32_t link) { event_word_t w; unsigned int try; int ret; w = read_atomic(word); ret = try_set_link(word, &w, link); if ( ret >= 0 ) return ret; /* Lock the word to prevent guest unmasking. */ set_bit(EVTCHN_FIFO_BUSY, word); w = read_atomic(word); for ( try = 0; try < 4; try++ ) { ret = try_set_link(word, &w, link); if ( ret >= 0 ) { if ( ret == 0 ) clear_bit(EVTCHN_FIFO_BUSY, word); return ret; } } gdprintk(XENLOG_WARNING, "domain %d, port %d not linked\n", d->domain_id, link); clear_bit(EVTCHN_FIFO_BUSY, word); return 1; } static void evtchn_fifo_set_pending(struct vcpu *v, struct evtchn *evtchn) { struct domain *d = v->domain; unsigned int port; event_word_t *word; unsigned long flags; bool_t was_pending; port = evtchn->port; word = evtchn_fifo_word_from_port(d, port); /* * Event array page may not exist yet, save the pending state for * when the page is added. */ if ( unlikely(!word) ) { evtchn->pending = 1; return; } was_pending = test_and_set_bit(EVTCHN_FIFO_PENDING, word); /* * Link the event if it unmasked and not already linked. */ if ( !test_bit(EVTCHN_FIFO_MASKED, word) && !test_bit(EVTCHN_FIFO_LINKED, word) ) { struct evtchn_fifo_queue *q, *old_q; event_word_t *tail_word; bool_t linked = 0; /* * No locking around getting the queue. This may race with * changing the priority but we are allowed to signal the * event once on the old priority. */ q = &v->evtchn_fifo->queue[evtchn->priority]; old_q = lock_old_queue(d, evtchn, &flags); if ( !old_q ) goto done; if ( test_and_set_bit(EVTCHN_FIFO_LINKED, word) ) { spin_unlock_irqrestore(&old_q->lock, flags); goto done; } /* * If this event was a tail, the old queue is now empty and * its tail must be invalidated to prevent adding an event to * the old queue from corrupting the new queue. */ if ( old_q->tail == port ) old_q->tail = 0; /* Moved to a different queue? 
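 * (i.e. the event is changing vcpu and/or priority: record the queue it
 * is now being linked onto as the "last" one, drop the old queue's lock
 * and take the new queue's lock before linking.)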
*/ if ( old_q != q ) { evtchn->last_vcpu_id = evtchn->notify_vcpu_id; evtchn->last_priority = evtchn->priority; spin_unlock_irqrestore(&old_q->lock, flags); spin_lock_irqsave(&q->lock, flags); } /* * Atomically link the tail to port iff the tail is linked. * If the tail is unlinked the queue is empty. * * If port is the same as tail, the queue is empty but q->tail * will appear linked as we just set LINKED above. * * If the queue is empty (i.e., we haven't linked to the new * event), head must be updated. */ if ( q->tail ) { tail_word = evtchn_fifo_word_from_port(d, q->tail); linked = evtchn_fifo_set_link(d, tail_word, port); } if ( !linked ) write_atomic(q->head, port); q->tail = port; spin_unlock_irqrestore(&q->lock, flags); if ( !linked && !test_and_set_bit(q->priority, &v->evtchn_fifo->control_block->ready) ) vcpu_mark_events_pending(v); } done: if ( !was_pending ) evtchn_check_pollers(d, port); } static void evtchn_fifo_clear_pending(struct domain *d, struct evtchn *evtchn) { event_word_t *word; word = evtchn_fifo_word_from_port(d, evtchn->port); if ( unlikely(!word) ) return; /* * Just clear the P bit. * * No need to unlink as the guest will unlink and ignore * non-pending events. */ clear_bit(EVTCHN_FIFO_PENDING, word); } static void evtchn_fifo_unmask(struct domain *d, struct evtchn *evtchn) { struct vcpu *v = d->vcpu[evtchn->notify_vcpu_id]; event_word_t *word; word = evtchn_fifo_word_from_port(d, evtchn->port); if ( unlikely(!word) ) return; clear_bit(EVTCHN_FIFO_MASKED, word); /* Relink if pending. */ if ( test_bit(EVTCHN_FIFO_PENDING, word) ) evtchn_fifo_set_pending(v, evtchn); } static bool_t evtchn_fifo_is_pending(struct domain *d, const struct evtchn *evtchn) { event_word_t *word; word = evtchn_fifo_word_from_port(d, evtchn->port); if ( unlikely(!word) ) return 0; return test_bit(EVTCHN_FIFO_PENDING, word); } static bool_t evtchn_fifo_is_masked(struct domain *d, const struct evtchn *evtchn) { event_word_t *word; word = evtchn_fifo_word_from_port(d, evtchn->port); if ( unlikely(!word) ) return 1; return test_bit(EVTCHN_FIFO_MASKED, word); } static int evtchn_fifo_set_priority(struct domain *d, struct evtchn *evtchn, unsigned int priority) { if ( priority > EVTCHN_FIFO_PRIORITY_MIN ) return -EINVAL; /* * Only need to switch to the new queue for future events. If the * event is already pending or in the process of being linked it * will be on the old queue -- this is fine. */ evtchn->priority = priority; return 0; } static void evtchn_fifo_print_state(struct domain *d, const struct evtchn *evtchn) { event_word_t *word; word = evtchn_fifo_word_from_port(d, evtchn->port); if ( !word ) printk("? "); else if ( test_bit(EVTCHN_FIFO_LINKED, word) ) printk("%c %-4u", test_bit(EVTCHN_FIFO_BUSY, word) ? 'B' : ' ', *word & EVTCHN_FIFO_LINK_MASK); else printk("%c - ", test_bit(EVTCHN_FIFO_BUSY, word) ? 
'B' : ' '); } static const struct evtchn_port_ops evtchn_port_ops_fifo = { .init = evtchn_fifo_init, .set_pending = evtchn_fifo_set_pending, .clear_pending = evtchn_fifo_clear_pending, .unmask = evtchn_fifo_unmask, .is_pending = evtchn_fifo_is_pending, .is_masked = evtchn_fifo_is_masked, .set_priority = evtchn_fifo_set_priority, .print_state = evtchn_fifo_print_state, }; static int map_guest_page(struct domain *d, uint64_t gfn, void **virt) { struct page_info *p; p = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC); if ( !p ) return -EINVAL; if ( !get_page_type(p, PGT_writable_page) ) { put_page(p); return -EINVAL; } *virt = __map_domain_page_global(p); if ( !*virt ) { put_page_and_type(p); return -ENOMEM; } return 0; } static void unmap_guest_page(void *virt) { struct page_info *page; if ( !virt ) return; virt = (void *)((unsigned long)virt & PAGE_MASK); page = mfn_to_page(domain_page_map_to_mfn(virt)); unmap_domain_page_global(virt); put_page_and_type(page); } static void init_queue(struct vcpu *v, struct evtchn_fifo_queue *q, unsigned int i) { spin_lock_init(&q->lock); q->priority = i; q->head = &v->evtchn_fifo->control_block->head[i]; } static int setup_control_block(struct vcpu *v, uint64_t gfn, uint32_t offset) { struct domain *d = v->domain; struct evtchn_fifo_vcpu *efv; void *virt; unsigned int i; int rc; if ( v->evtchn_fifo ) return -EINVAL; efv = xzalloc(struct evtchn_fifo_vcpu); if ( !efv ) return -ENOMEM; rc = map_guest_page(d, gfn, &virt); if ( rc < 0 ) { xfree(efv); return rc; } v->evtchn_fifo = efv; v->evtchn_fifo->control_block = virt + offset; for ( i = 0; i <= EVTCHN_FIFO_PRIORITY_MIN; i++ ) init_queue(v, &v->evtchn_fifo->queue[i], i); return 0; } static void cleanup_control_block(struct vcpu *v) { if ( !v->evtchn_fifo ) return; unmap_guest_page(v->evtchn_fifo->control_block); xfree(v->evtchn_fifo); v->evtchn_fifo = NULL; } /* * Setup an event array with no pages. */ static int setup_event_array(struct domain *d) { d->evtchn_fifo = xzalloc(struct evtchn_fifo_domain); if ( !d->evtchn_fifo ) return -ENOMEM; return 0; } static void cleanup_event_array(struct domain *d) { unsigned int i; if ( !d->evtchn_fifo ) return; for ( i = 0; i < EVTCHN_FIFO_MAX_EVENT_ARRAY_PAGES; i++ ) unmap_guest_page(d->evtchn_fifo->event_array[i]); xfree(d->evtchn_fifo); } static void setup_ports(struct domain *d) { unsigned int port; /* * For each port that is already bound: * * - save its pending state. * - set default priority. */ for ( port = 1; port < d->max_evtchns; port++ ) { struct evtchn *evtchn; if ( !port_is_valid(d, port) ) break; evtchn = evtchn_from_port(d, port); if ( test_bit(port, &shared_info(d, evtchn_pending)) ) evtchn->pending = 1; evtchn_fifo_set_priority(d, evtchn, EVTCHN_FIFO_PRIORITY_DEFAULT); } } int evtchn_fifo_init_control(struct evtchn_init_control *init_control) { struct domain *d = current->domain; uint32_t vcpu_id; uint64_t gfn; uint32_t offset; struct vcpu *v; int rc; init_control->link_bits = EVTCHN_FIFO_LINK_BITS; vcpu_id = init_control->vcpu; gfn = init_control->control_gfn; offset = init_control->offset; if ( vcpu_id >= d->max_vcpus || !d->vcpu[vcpu_id] ) return -ENOENT; v = d->vcpu[vcpu_id]; /* Must not cross page boundary. */ if ( offset > (PAGE_SIZE - sizeof(evtchn_fifo_control_block_t)) ) return -EINVAL; /* Must be 8-bytes aligned. */ if ( offset & (8 - 1) ) return -EINVAL; spin_lock(&d->event_lock); rc = setup_control_block(v, gfn, offset); /* * If this is the first control block, setup an empty event array * and switch to the fifo port ops. 
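 * Later calls for other VCPUs find d->evtchn_fifo already set up and skip
 * this step.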
*/ if ( rc == 0 && !d->evtchn_fifo ) { rc = setup_event_array(d); if ( rc < 0 ) cleanup_control_block(v); else { d->evtchn_port_ops = &evtchn_port_ops_fifo; d->max_evtchns = EVTCHN_FIFO_NR_CHANNELS; setup_ports(d); } } spin_unlock(&d->event_lock); return rc; } static int add_page_to_event_array(struct domain *d, unsigned long gfn) { void *virt; unsigned int slot; unsigned int port = d->evtchn_fifo->num_evtchns; int rc; slot = d->evtchn_fifo->num_evtchns / EVTCHN_FIFO_EVENT_WORDS_PER_PAGE; if ( slot >= EVTCHN_FIFO_MAX_EVENT_ARRAY_PAGES ) return -ENOSPC; rc = map_guest_page(d, gfn, &virt); if ( rc < 0 ) return rc; d->evtchn_fifo->event_array[slot] = virt; d->evtchn_fifo->num_evtchns += EVTCHN_FIFO_EVENT_WORDS_PER_PAGE; /* * Re-raise any events that were pending while this array page was * missing. */ for ( ; port < d->evtchn_fifo->num_evtchns; port++ ) { struct evtchn *evtchn; if ( !port_is_valid(d, port) ) break; evtchn = evtchn_from_port(d, port); if ( evtchn->pending ) evtchn_fifo_set_pending(d->vcpu[evtchn->notify_vcpu_id], evtchn); } return 0; } int evtchn_fifo_expand_array(const struct evtchn_expand_array *expand_array) { struct domain *d = current->domain; int rc; if ( !d->evtchn_fifo ) return -ENOSYS; spin_lock(&d->event_lock); rc = add_page_to_event_array(d, expand_array->array_gfn); spin_unlock(&d->event_lock); return rc; } void evtchn_fifo_destroy(struct domain *d) { struct vcpu *v; for_each_vcpu( d, v ) cleanup_control_block(v); cleanup_event_array(d); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/common/random.c0000664000175000017500000000106012307313555014146 0ustar smbsmb#include #include #include #include static DEFINE_PER_CPU(unsigned int, seed); unsigned int get_random(void) { unsigned int next = this_cpu(seed), val = arch_get_random(); if ( unlikely(!next) ) next = val ?: NOW(); if ( !val ) { unsigned int i; for ( i = 0; i < sizeof(val) * 8; i += 11 ) { next = next * 1103515245 + 12345; val |= ((next >> 16) & 0x7ff) << i; } } this_cpu(seed) = next; return val; } xen-4.4.0/xen/common/rangeset.c0000664000175000017500000002137012307313555014504 0ustar smbsmb/****************************************************************************** * rangeset.c * * Creation, maintenance and automatic destruction of per-domain sets of * numeric ranges. * * Copyright (c) 2005, K A Fraser */ #include #include #include #include /* An inclusive range [s,e] and pointer to next range in ascending order. */ struct range { struct list_head list; unsigned long s, e; }; struct rangeset { /* Owning domain and threaded list of rangesets. */ struct list_head rangeset_list; struct domain *domain; /* Ordered list of ranges contained in this set, and protecting lock. */ struct list_head range_list; spinlock_t lock; /* Pretty-printing name. */ char name[32]; /* RANGESETF flags. */ unsigned int flags; }; /***************************** * Private range functions hide the underlying linked-list implemnetation. */ /* Find highest range lower than or containing s. NULL if no such range. */ static struct range *find_range( struct rangeset *r, unsigned long s) { struct range *x = NULL, *y; list_for_each_entry ( y, &r->range_list, list ) { if ( y->s > s ) break; x = y; } return x; } /* Return the lowest range in the set r, or NULL if r is empty. 
*/ static struct range *first_range( struct rangeset *r) { if ( list_empty(&r->range_list) ) return NULL; return list_entry(r->range_list.next, struct range, list); } /* Return range following x in ascending order, or NULL if x is the highest. */ static struct range *next_range( struct rangeset *r, struct range *x) { if ( x->list.next == &r->range_list ) return NULL; return list_entry(x->list.next, struct range, list); } /* Insert range y after range x in r. Insert as first range if x is NULL. */ static void insert_range( struct rangeset *r, struct range *x, struct range *y) { list_add(&y->list, (x != NULL) ? &x->list : &r->range_list); } /* Remove a range from its list and free it. */ static void destroy_range( struct range *x) { list_del(&x->list); xfree(x); } /***************************** * Core public functions */ int rangeset_add_range( struct rangeset *r, unsigned long s, unsigned long e) { struct range *x, *y; int rc = 0; ASSERT(s <= e); spin_lock(&r->lock); x = find_range(r, s); y = find_range(r, e); if ( x == y ) { if ( (x == NULL) || ((x->e < s) && ((x->e + 1) != s)) ) { x = xmalloc(struct range); if ( x == NULL ) { rc = -ENOMEM; goto out; } x->s = s; x->e = e; insert_range(r, y, x); } else if ( x->e < e ) x->e = e; } else { if ( x == NULL ) { x = first_range(r); x->s = s; } else if ( (x->e < s) && ((x->e + 1) != s) ) { x = next_range(r, x); x->s = s; } x->e = (y->e > e) ? y->e : e; for ( ; ; ) { y = next_range(r, x); if ( (y == NULL) || (y->e > x->e) ) break; destroy_range(y); } } y = next_range(r, x); if ( (y != NULL) && ((x->e + 1) == y->s) ) { x->e = y->e; destroy_range(y); } out: spin_unlock(&r->lock); return rc; } int rangeset_remove_range( struct rangeset *r, unsigned long s, unsigned long e) { struct range *x, *y, *t; int rc = 0; ASSERT(s <= e); spin_lock(&r->lock); x = find_range(r, s); y = find_range(r, e); if ( x == y ) { if ( (x == NULL) || (x->e < s) ) goto out; if ( (x->s < s) && (x->e > e) ) { y = xmalloc(struct range); if ( y == NULL ) { rc = -ENOMEM; goto out; } y->s = e + 1; y->e = x->e; x->e = s - 1; insert_range(r, x, y); } else if ( (x->s == s) && (x->e <= e) ) destroy_range(x); else if ( x->s == s ) x->s = e + 1; else if ( x->e <= e ) x->e = s - 1; } else { if ( x == NULL ) x = first_range(r); if ( x->s < s ) { x->e = s - 1; x = next_range(r, x); } while ( x != y ) { t = x; x = next_range(r, x); destroy_range(t); } x->s = e + 1; if ( x->s > x->e ) destroy_range(x); } out: spin_unlock(&r->lock); return rc; } int rangeset_contains_range( struct rangeset *r, unsigned long s, unsigned long e) { struct range *x; int contains; ASSERT(s <= e); spin_lock(&r->lock); x = find_range(r, s); contains = (x && (x->e >= e)); spin_unlock(&r->lock); return contains; } int rangeset_overlaps_range( struct rangeset *r, unsigned long s, unsigned long e) { struct range *x; int overlaps; ASSERT(s <= e); spin_lock(&r->lock); x = find_range(r, e); overlaps = (x && (s <= x->e)); spin_unlock(&r->lock); return overlaps; } int rangeset_report_ranges( struct rangeset *r, unsigned long s, unsigned long e, int (*cb)(unsigned long s, unsigned long e, void *), void *ctxt) { struct range *x; int rc = 0; spin_lock(&r->lock); for ( x = find_range(r, s); x && (x->s <= e) && !rc; x = next_range(r, x) ) if ( x->e >= s ) rc = cb(max(x->s, s), min(x->e, e), ctxt); spin_unlock(&r->lock); return rc; } int rangeset_add_singleton( struct rangeset *r, unsigned long s) { return rangeset_add_range(r, s, s); } int rangeset_remove_singleton( struct rangeset *r, unsigned long s) { return 
rangeset_remove_range(r, s, s); } int rangeset_contains_singleton( struct rangeset *r, unsigned long s) { return rangeset_contains_range(r, s, s); } int rangeset_is_empty( struct rangeset *r) { return ((r == NULL) || list_empty(&r->range_list)); } struct rangeset *rangeset_new( struct domain *d, char *name, unsigned int flags) { struct rangeset *r; r = xmalloc(struct rangeset); if ( r == NULL ) return NULL; spin_lock_init(&r->lock); INIT_LIST_HEAD(&r->range_list); BUG_ON(flags & ~RANGESETF_prettyprint_hex); r->flags = flags; if ( name != NULL ) { safe_strcpy(r->name, name); } else { snprintf(r->name, sizeof(r->name), "(no name)"); } if ( (r->domain = d) != NULL ) { spin_lock(&d->rangesets_lock); list_add(&r->rangeset_list, &d->rangesets); spin_unlock(&d->rangesets_lock); } return r; } void rangeset_destroy( struct rangeset *r) { struct range *x; if ( r == NULL ) return; if ( r->domain != NULL ) { spin_lock(&r->domain->rangesets_lock); list_del(&r->rangeset_list); spin_unlock(&r->domain->rangesets_lock); } while ( (x = first_range(r)) != NULL ) destroy_range(x); xfree(r); } void rangeset_domain_initialise( struct domain *d) { INIT_LIST_HEAD(&d->rangesets); spin_lock_init(&d->rangesets_lock); } void rangeset_domain_destroy( struct domain *d) { struct rangeset *r; while ( !list_empty(&d->rangesets) ) { r = list_entry(d->rangesets.next, struct rangeset, rangeset_list); BUG_ON(r->domain != d); r->domain = NULL; list_del(&r->rangeset_list); rangeset_destroy(r); } } /***************************** * Pretty-printing functions */ static void print_limit(struct rangeset *r, unsigned long s) { printk((r->flags & RANGESETF_prettyprint_hex) ? "%lx" : "%lu", s); } void rangeset_printk( struct rangeset *r) { int nr_printed = 0; struct range *x; spin_lock(&r->lock); printk("%-10s {", r->name); for ( x = first_range(r); x != NULL; x = next_range(r, x) ) { if ( nr_printed++ ) printk(","); printk(" "); print_limit(r, x->s); if ( x->s != x->e ) { printk("-"); print_limit(r, x->e); } } printk(" }"); spin_unlock(&r->lock); } void rangeset_domain_printk( struct domain *d) { struct rangeset *r; printk("Rangesets belonging to domain %u:\n", d->domain_id); spin_lock(&d->rangesets_lock); if ( list_empty(&d->rangesets) ) printk(" None\n"); list_for_each_entry ( r, &d->rangesets, rangeset_list ) { printk(" "); rangeset_printk(r); printk("\n"); } spin_unlock(&d->rangesets_lock); } xen-4.4.0/xen/common/softirq.c0000664000175000017500000000464412307313555014370 0ustar smbsmb/****************************************************************************** * common/softirq.c * * Softirqs in Xen are only executed in an outermost activation (e.g., never * within an interrupt activation). This simplifies some things and generally * seems a good thing. * * Copyright (c) 2003, K A Fraser * Copyright (c) 1992, Linus Torvalds */ #include #include #include #include #include #include #include #ifndef __ARCH_IRQ_STAT irq_cpustat_t irq_stat[NR_CPUS]; #endif static softirq_handler softirq_handlers[NR_SOFTIRQS]; static void __do_softirq(unsigned long ignore_mask) { unsigned int i, cpu; unsigned long pending; for ( ; ; ) { /* * Initialise @cpu on every iteration: SCHEDULE_SOFTIRQ may move * us to another processor. 
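 * The pending-softirq mask and the offline check below must therefore use
 * the CPU id re-read on this iteration.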
*/ cpu = smp_processor_id(); if ( rcu_pending(cpu) ) rcu_check_callbacks(cpu); if ( ((pending = (softirq_pending(cpu) & ~ignore_mask)) == 0) || cpu_is_offline(cpu) ) break; i = find_first_set_bit(pending); clear_bit(i, &softirq_pending(cpu)); (*softirq_handlers[i])(); } } void process_pending_softirqs(void) { ASSERT(!in_irq() && local_irq_is_enabled()); /* Do not enter scheduler as it can preempt the calling context. */ __do_softirq(1ul< #include #include #include #include #include #include #include #include #include #include #include #include #include struct dt_early_info __initdata early_info; const void *device_tree_flattened; dt_irq_xlate_func dt_irq_xlate; /* Host device tree */ struct dt_device_node *dt_host; /* Interrupt controller node*/ const struct dt_device_node *dt_interrupt_controller; /** * struct dt_alias_prop - Alias property in 'aliases' node * @link: List node to link the structure in aliases_lookup list * @alias: Alias property name * @np: Pointer to device_node that the alias stands for * @id: Index value from end of alias name * @stem: Alias string without the index * * The structure represents one alias property of 'aliases' node as * an entry in aliases_lookup list. */ struct dt_alias_prop { struct list_head link; const char *alias; struct dt_device_node *np; int id; char stem[0]; }; static LIST_HEAD(aliases_lookup); /* Some device tree functions may be called both before and after the console is initialized. */ #define dt_printk(fmt, ...) \ do \ { \ if ( system_state == SYS_STATE_early_boot ) \ early_printk(fmt, ## __VA_ARGS__); \ else \ printk(fmt, ## __VA_ARGS__); \ } while (0) // #define DEBUG_DT #ifdef DEBUG_DT # define dt_dprintk(fmt, args...) dt_printk(XENLOG_DEBUG fmt, ##args) static void dt_dump_addr(const char *s, const __be32 *addr, int na) { dt_dprintk("%s", s); while ( na-- ) dt_dprintk(" %08x", be32_to_cpu(*(addr++))); dt_dprintk("\n"); } #else # define dt_dprintk(fmt, args...) do {} while ( 0 ) static void dt_dump_addr(const char *s, const __be32 *addr, int na) { } #endif #define DT_BAD_ADDR ((u64)-1) /* Max address size we deal with */ #define DT_MAX_ADDR_CELLS 4 #define DT_CHECK_ADDR_COUNT(na) ((na) > 0 && (na) <= DT_MAX_ADDR_CELLS) #define DT_CHECK_COUNTS(na, ns) (DT_CHECK_ADDR_COUNT(na) && (ns) > 0) /* Callbacks for bus specific translators */ struct dt_bus { const char *name; const char *addresses; bool_t (*match)(const struct dt_device_node *node); void (*count_cells)(const struct dt_device_node *child, int *addrc, int *sizec); u64 (*map)(__be32 *addr, const __be32 *range, int na, int ns, int pna); int (*translate)(__be32 *addr, u64 offset, int na); unsigned int (*get_flags)(const __be32 *addr); }; static bool_t __init device_tree_node_matches(const void *fdt, int node, const char *match) { const char *name; size_t match_len; name = fdt_get_name(fdt, node, NULL); match_len = strlen(match); /* Match both "match" and "match@..." patterns but not "match-foo". 
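 * e.g. "memory" matches the nodes "memory" and "memory@0", but not
 * "memory-controller".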
*/ return strncmp(name, match, match_len) == 0 && (name[match_len] == '@' || name[match_len] == '\0'); } static bool_t __init device_tree_node_compatible(const void *fdt, int node, const char *match) { int len, l; int mlen; const void *prop; mlen = strlen(match); prop = fdt_getprop(fdt, node, "compatible", &len); if ( prop == NULL ) return 0; while ( len > 0 ) { if ( !dt_compat_cmp(prop, match) ) return 1; l = strlen(prop) + 1; prop += l; len -= l; } return 0; } static void __init device_tree_get_reg(const __be32 **cell, u32 address_cells, u32 size_cells, u64 *start, u64 *size) { *start = dt_next_cell(address_cells, cell); *size = dt_next_cell(size_cells, cell); } void dt_get_range(const __be32 **cell, const struct dt_device_node *np, u64 *address, u64 *size) { *address = dt_next_cell(dt_n_addr_cells(np), cell); *size = dt_next_cell(dt_n_size_cells(np), cell); } void dt_set_cell(__be32 **cellp, int size, u64 val) { int cells = size; while ( size-- ) { (*cellp)[size] = cpu_to_fdt32(val); val >>= 32; } (*cellp) += cells; } void dt_set_range(__be32 **cellp, const struct dt_device_node *np, u64 address, u64 size) { dt_set_cell(cellp, dt_n_addr_cells(np), address); dt_set_cell(cellp, dt_n_size_cells(np), size); } static u32 __init device_tree_get_u32(const void *fdt, int node, const char *prop_name, u32 dflt) { const struct fdt_property *prop; prop = fdt_get_property(fdt, node, prop_name, NULL); if ( !prop || prop->len < sizeof(u32) ) return dflt; return fdt32_to_cpu(*(uint32_t*)prop->data); } /** * device_tree_for_each_node - iterate over all device tree nodes * @fdt: flat device tree. * @func: function to call for each node. * @data: data to pass to @func. * * Any nodes nested at DEVICE_TREE_MAX_DEPTH or deeper are ignored. * * Returns 0 if all nodes were iterated over successfully. If @func * returns a value different from 0, that value is returned immediately. */ static int __init device_tree_for_each_node(const void *fdt, device_tree_node_func func, void *data) { int node; int depth; u32 address_cells[DEVICE_TREE_MAX_DEPTH]; u32 size_cells[DEVICE_TREE_MAX_DEPTH]; int ret; for ( node = 0, depth = 0; node >=0 && depth >= 0; node = fdt_next_node(fdt, node, &depth) ) { const char *name = fdt_get_name(fdt, node, NULL); if ( depth >= DEVICE_TREE_MAX_DEPTH ) { dt_printk("Warning: device tree node `%s' is nested too deep\n", name); continue; } address_cells[depth] = device_tree_get_u32(fdt, node, "#address-cells", depth > 0 ? address_cells[depth-1] : 0); size_cells[depth] = device_tree_get_u32(fdt, node, "#size-cells", depth > 0 ? size_cells[depth-1] : 0); ret = func(fdt, node, name, depth, address_cells[depth-1], size_cells[depth-1], data); if ( ret != 0 ) return ret; } return 0; } /** * device_tree_bootargs - return the bootargs (the Xen command line) * @fdt flat device tree. 
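 * Prefers the "xen,xen-bootargs" property; failing that, the plain
 * "bootargs" property is used, but only when dom0 gets its own command
 * line elsewhere ("xen,dom0-bootargs" or a kernel module with a non-empty
 * command line).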
*/ const char *device_tree_bootargs(const void *fdt) { int node; const struct fdt_property *prop; node = fdt_path_offset(fdt, "/chosen"); if ( node < 0 ) return NULL; prop = fdt_get_property(fdt, node, "xen,xen-bootargs", NULL); if ( prop == NULL ) { struct dt_mb_module *dom0_mod = NULL; if ( early_info.modules.nr_mods >= MOD_KERNEL ) dom0_mod = &early_info.modules.module[MOD_KERNEL]; if (fdt_get_property(fdt, node, "xen,dom0-bootargs", NULL) || ( dom0_mod && dom0_mod->cmdline[0] ) ) prop = fdt_get_property(fdt, node, "bootargs", NULL); } if ( prop == NULL ) return NULL; return prop->data; } static int dump_node(const void *fdt, int node, const char *name, int depth, u32 address_cells, u32 size_cells, void *data) { char prefix[2*DEVICE_TREE_MAX_DEPTH + 1] = ""; int i; int prop; for ( i = 0; i < depth; i++ ) safe_strcat(prefix, " "); if ( name[0] == '\0' ) name = "/"; dt_printk("%s%s:\n", prefix, name); for ( prop = fdt_first_property_offset(fdt, node); prop >= 0; prop = fdt_next_property_offset(fdt, prop) ) { const struct fdt_property *p; p = fdt_get_property_by_offset(fdt, prop, NULL); dt_printk("%s %s\n", prefix, fdt_string(fdt, fdt32_to_cpu(p->nameoff))); } return 0; } /** * device_tree_dump - print a text representation of a device tree * @fdt: flat device tree to print */ void __init device_tree_dump(const void *fdt) { device_tree_for_each_node(fdt, dump_node, NULL); } static void __init process_memory_node(const void *fdt, int node, const char *name, u32 address_cells, u32 size_cells) { const struct fdt_property *prop; int i; int banks; const __be32 *cell; paddr_t start, size; u32 reg_cells = address_cells + size_cells; if ( address_cells < 1 || size_cells < 1 ) { early_printk("fdt: node `%s': invalid #address-cells or #size-cells", name); return; } prop = fdt_get_property(fdt, node, "reg", NULL); if ( !prop ) { early_printk("fdt: node `%s': missing `reg' property\n", name); return; } cell = (const __be32 *)prop->data; banks = fdt32_to_cpu(prop->len) / (reg_cells * sizeof (u32)); for ( i = 0; i < banks && early_info.mem.nr_banks < NR_MEM_BANKS; i++ ) { device_tree_get_reg(&cell, address_cells, size_cells, &start, &size); early_info.mem.bank[early_info.mem.nr_banks].start = start; early_info.mem.bank[early_info.mem.nr_banks].size = size; early_info.mem.nr_banks++; } } static void __init process_multiboot_node(const void *fdt, int node, const char *name, u32 address_cells, u32 size_cells) { const struct fdt_property *prop; const __be32 *cell; int nr; struct dt_mb_module *mod; int len; if ( fdt_node_check_compatible(fdt, node, "xen,linux-zimage") == 0 ) nr = MOD_KERNEL; else if ( fdt_node_check_compatible(fdt, node, "xen,linux-initrd") == 0) nr = MOD_INITRD; else early_panic("%s not a known xen multiboot type\n", name); mod = &early_info.modules.module[nr]; prop = fdt_get_property(fdt, node, "reg", &len); if ( !prop ) early_panic("node %s missing `reg' property\n", name); if ( len < dt_cells_to_size(address_cells + size_cells) ) early_panic("fdt: node `%s': `reg` property length is too short\n", name); cell = (const __be32 *)prop->data; device_tree_get_reg(&cell, address_cells, size_cells, &mod->start, &mod->size); prop = fdt_get_property(fdt, node, "bootargs", &len); if ( prop ) { if ( len > sizeof(mod->cmdline) ) early_panic("module %d command line too long\n", nr); safe_strcpy(mod->cmdline, prop->data); } else mod->cmdline[0] = 0; if ( nr > early_info.modules.nr_mods ) early_info.modules.nr_mods = nr; } static void __init process_chosen_node(const void *fdt, int node, const char 
*name, u32 address_cells, u32 size_cells) { const struct fdt_property *prop; struct dt_mb_module *mod = &early_info.modules.module[MOD_INITRD]; paddr_t start, end; int len; dt_printk("Checking for initrd in /chosen\n"); prop = fdt_get_property(fdt, node, "linux,initrd-start", &len); if ( !prop ) /* No initrd present. */ return; if ( len != sizeof(u32) && len != sizeof(u64) ) { dt_printk("linux,initrd-start property has invalid length %d\n", len); return; } start = dt_read_number((void *)&prop->data, dt_size_to_cells(len)); prop = fdt_get_property(fdt, node, "linux,initrd-end", &len); if ( !prop ) { dt_printk("linux,initrd-end not present but -start was\n"); return; } if ( len != sizeof(u32) && len != sizeof(u64) ) { dt_printk("linux,initrd-end property has invalid length %d\n", len); return; } end = dt_read_number((void *)&prop->data, dt_size_to_cells(len)); if ( start >= end ) { dt_printk("linux,initrd limits invalid: %"PRIpaddr" >= %"PRIpaddr"\n", start, end); return; } dt_printk("Initrd %"PRIpaddr"-%"PRIpaddr"\n", start, end); mod->start = start; mod->size = end - start; early_info.modules.nr_mods = MAX(MOD_INITRD, early_info.modules.nr_mods); } static int __init early_scan_node(const void *fdt, int node, const char *name, int depth, u32 address_cells, u32 size_cells, void *data) { if ( device_tree_node_matches(fdt, node, "memory") ) process_memory_node(fdt, node, name, address_cells, size_cells); else if ( device_tree_node_compatible(fdt, node, "xen,multiboot-module" ) ) process_multiboot_node(fdt, node, name, address_cells, size_cells); else if ( depth == 1 && device_tree_node_matches(fdt, node, "chosen") ) process_chosen_node(fdt, node, name, address_cells, size_cells); return 0; } static void __init early_print_info(void) { struct dt_mem_info *mi = &early_info.mem; struct dt_module_info *mods = &early_info.modules; int i, nr_rsvd; for ( i = 0; i < mi->nr_banks; i++ ) early_printk("RAM: %"PRIpaddr" - %"PRIpaddr"\n", mi->bank[i].start, mi->bank[i].start + mi->bank[i].size - 1); early_printk("\n"); for ( i = 1 ; i < mods->nr_mods + 1; i++ ) early_printk("MODULE[%d]: %"PRIpaddr" - %"PRIpaddr" %s\n", i, mods->module[i].start, mods->module[i].start + mods->module[i].size, mods->module[i].cmdline); nr_rsvd = fdt_num_mem_rsv(device_tree_flattened); for ( i = 0; i < nr_rsvd; i++ ) { paddr_t s, e; if ( fdt_get_mem_rsv(device_tree_flattened, i, &s, &e) < 0 ) continue; /* fdt_get_mem_rsv returns length */ e += s; early_printk(" RESVD[%d]: %"PRIpaddr" - %"PRIpaddr"\n", i, s, e); } early_printk("\n"); } /** * device_tree_early_init - initialize early info from a DTB * @fdt: flattened device tree binary * * Returns the size of the DTB. */ size_t __init device_tree_early_init(const void *fdt, paddr_t paddr) { struct dt_mb_module *mod; int ret; ret = fdt_check_header(fdt); if ( ret < 0 ) early_panic("No valid device tree\n"); mod = &early_info.modules.module[MOD_FDT]; mod->start = paddr; mod->size = fdt_totalsize(fdt); early_info.modules.nr_mods = max(MOD_FDT, early_info.modules.nr_mods); device_tree_for_each_node((void *)fdt, early_scan_node, NULL); early_print_info(); return fdt_totalsize(fdt); } static void __init *unflatten_dt_alloc(unsigned long *mem, unsigned long size, unsigned long align) { void *res; *mem = ROUNDUP(*mem, align); res = (void *)*mem; *mem += size; return res; } /* Find a property with a given name for a given node and return it. 
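 * Returns NULL if @np is NULL or no such property exists; when a match is
 * found and @lenp is non-NULL, the property length is stored in *lenp.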
*/ static const struct dt_property * dt_find_property(const struct dt_device_node *np, const char *name, u32 *lenp) { const struct dt_property *pp; if ( !np ) return NULL; for ( pp = np->properties; pp; pp = pp->next ) { if ( dt_prop_cmp(pp->name, name) == 0 ) { if ( lenp ) *lenp = pp->length; break; } } return pp; } const void *dt_get_property(const struct dt_device_node *np, const char *name, u32 *lenp) { const struct dt_property *pp = dt_find_property(np, name, lenp); return pp ? pp->value : NULL; } bool_t dt_property_read_u32(const struct dt_device_node *np, const char *name, u32 *out_value) { u32 len; const __be32 *val; val = dt_get_property(np, name, &len); if ( !val || len < sizeof(*out_value) ) return 0; *out_value = be32_to_cpup(val); return 1; } bool_t dt_property_read_u64(const struct dt_device_node *np, const char *name, u64 *out_value) { u32 len; const __be32 *val; val = dt_get_property(np, name, &len); if ( !val || len < sizeof(*out_value) ) return 0; *out_value = dt_read_number(val, 2); return 1; } int dt_property_read_string(const struct dt_device_node *np, const char *propname, const char **out_string) { const struct dt_property *pp = dt_find_property(np, propname, NULL); if ( !pp ) return -EINVAL; if ( !pp->value ) return -ENODATA; if ( strnlen(pp->value, pp->length) >= pp->length ) return -EILSEQ; *out_string = pp->value; return 0; } bool_t dt_device_is_compatible(const struct dt_device_node *device, const char *compat) { const char* cp; u32 cplen, l; cp = dt_get_property(device, "compatible", &cplen); if ( cp == NULL ) return 0; while ( cplen > 0 ) { if ( dt_compat_cmp(cp, compat) == 0 ) return 1; l = strlen(cp) + 1; cp += l; cplen -= l; } return 0; } bool_t dt_machine_is_compatible(const char *compat) { const struct dt_device_node *root; bool_t rc = 0; root = dt_find_node_by_path("/"); if ( root ) { rc = dt_device_is_compatible(root, compat); } return rc; } struct dt_device_node *dt_find_node_by_name(struct dt_device_node *from, const char *name) { struct dt_device_node *np; struct dt_device_node *dt; dt = from ? from->allnext : dt_host; dt_for_each_device_node(dt, np) if ( np->name && (dt_node_cmp(np->name, name) == 0) ) break; return np; } struct dt_device_node *dt_find_node_by_type(struct dt_device_node *from, const char *type) { struct dt_device_node *np; struct dt_device_node *dt; dt = from ? 
from->allnext : dt_host; dt_for_each_device_node(dt, np) if ( np->type && (dt_node_cmp(np->type, type) == 0) ) break; return np; } struct dt_device_node *dt_find_node_by_path(const char *path) { struct dt_device_node *np; dt_for_each_device_node(dt_host, np) if ( np->full_name && (dt_node_cmp(np->full_name, path) == 0) ) break; return np; } struct dt_device_node *dt_find_node_by_alias(const char *alias) { const struct dt_alias_prop *app; list_for_each_entry( app, &aliases_lookup, link ) { if ( !strcmp(app->alias, alias) ) return app->np; } return NULL; } bool_t dt_match_node(const struct dt_device_match *matches, const struct dt_device_node *node) { if ( !matches ) return 0; while ( matches->path || matches->type || matches->compatible ) { bool_t match = 1; if ( matches->path ) match &= dt_node_path_is_equal(node, matches->path); if ( matches->type ) match &= dt_device_type_is_equal(node, matches->type); if ( matches->compatible ) match &= dt_device_is_compatible(node, matches->compatible); if ( match ) return match; matches++; } return 0; } const struct dt_device_node *dt_get_parent(const struct dt_device_node *node) { if ( !node ) return NULL; return node->parent; } struct dt_device_node * dt_find_compatible_node(struct dt_device_node *from, const char *type, const char *compatible) { struct dt_device_node *np; struct dt_device_node *dt; dt = from ? from->allnext : dt_host; dt_for_each_device_node(dt, np) { if ( type && !(np->type && (dt_node_cmp(np->type, type) == 0)) ) continue; if ( dt_device_is_compatible(np, compatible) ) break; } return np; } struct dt_device_node * dt_find_matching_node(struct dt_device_node *from, const struct dt_device_match *matches) { struct dt_device_node *np; struct dt_device_node *dt; dt = from ? from->allnext : dt_host; dt_for_each_device_node(dt, np) { if ( dt_match_node(matches, np) ) return np; } return NULL; } int dt_n_addr_cells(const struct dt_device_node *np) { const __be32 *ip; do { if ( np->parent ) np = np->parent; ip = dt_get_property(np, "#address-cells", NULL); if ( ip ) return be32_to_cpup(ip); } while ( np->parent ); /* No #address-cells property for the root node */ return DT_ROOT_NODE_ADDR_CELLS_DEFAULT; } int dt_n_size_cells(const struct dt_device_node *np) { const __be32 *ip; do { if ( np->parent ) np = np->parent; ip = dt_get_property(np, "#size-cells", NULL); if ( ip ) return be32_to_cpup(ip); } while ( np->parent ); /* No #address-cells property for the root node */ return DT_ROOT_NODE_SIZE_CELLS_DEFAULT; } /* * Default translator (generic bus) */ static bool_t dt_bus_default_match(const struct dt_device_node *node) { /* Root node doesn't have "ranges" property */ if ( node->parent == NULL ) return 1; /* The default bus is only used when the "ranges" property exists. * Otherwise we can't translate the address */ return (dt_get_property(node, "ranges", NULL) != NULL); } static void dt_bus_default_count_cells(const struct dt_device_node *dev, int *addrc, int *sizec) { if ( addrc ) *addrc = dt_n_addr_cells(dev); if ( sizec ) *sizec = dt_n_size_cells(dev); } static u64 dt_bus_default_map(__be32 *addr, const __be32 *range, int na, int ns, int pna) { u64 cp, s, da; cp = dt_read_number(range, na); s = dt_read_number(range + na + pna, ns); da = dt_read_number(addr, na); dt_dprintk("DT: default map, cp=%llx, s=%llx, da=%llx\n", (unsigned long long)cp, (unsigned long long)s, (unsigned long long)da); /* * If the number of address cells is larger than 2 we assume the * mapping doesn't specify a physical address. 
Rather, the address * specifies an identifier that must match exactly. */ if ( na > 2 && memcmp(range, addr, na * 4) != 0 ) return DT_BAD_ADDR; if ( da < cp || da >= (cp + s) ) return DT_BAD_ADDR; return da - cp; } static int dt_bus_default_translate(__be32 *addr, u64 offset, int na) { u64 a = dt_read_number(addr, na); memset(addr, 0, na * 4); a += offset; if ( na > 1 ) addr[na - 2] = cpu_to_be32(a >> 32); addr[na - 1] = cpu_to_be32(a & 0xffffffffu); return 0; } static unsigned int dt_bus_default_get_flags(const __be32 *addr) { /* TODO: Return the type of memory (device, ...) for caching * attribute during mapping */ return 0; } /* * Array of bus specific translators */ static const struct dt_bus dt_busses[] = { /* Default */ { .name = "default", .addresses = "reg", .match = dt_bus_default_match, .count_cells = dt_bus_default_count_cells, .map = dt_bus_default_map, .translate = dt_bus_default_translate, .get_flags = dt_bus_default_get_flags, }, }; static const struct dt_bus *dt_match_bus(const struct dt_device_node *np) { int i; for ( i = 0; i < ARRAY_SIZE(dt_busses); i++ ) if ( !dt_busses[i].match || dt_busses[i].match(np) ) return &dt_busses[i]; return NULL; } static const __be32 *dt_get_address(const struct dt_device_node *dev, int index, u64 *size, unsigned int *flags) { const __be32 *prop; u32 psize; const struct dt_device_node *parent; const struct dt_bus *bus; int onesize, i, na, ns; /* Get parent & match bus type */ parent = dt_get_parent(dev); if ( parent == NULL ) return NULL; bus = dt_match_bus(parent); if ( !bus ) return NULL; bus->count_cells(dev, &na, &ns); if ( !DT_CHECK_ADDR_COUNT(na) ) return NULL; /* Get "reg" or "assigned-addresses" property */ prop = dt_get_property(dev, bus->addresses, &psize); if ( prop == NULL ) return NULL; psize /= 4; onesize = na + ns; for ( i = 0; psize >= onesize; psize -= onesize, prop += onesize, i++ ) { if ( i == index ) { if ( size ) *size = dt_read_number(prop + na, ns); if ( flags ) *flags = bus->get_flags(prop); return prop; } } return NULL; } static int dt_translate_one(const struct dt_device_node *parent, const struct dt_bus *bus, const struct dt_bus *pbus, __be32 *addr, int na, int ns, int pna, const char *rprop) { const __be32 *ranges; unsigned int rlen; int rone; u64 offset = DT_BAD_ADDR; ranges = dt_get_property(parent, rprop, &rlen); if ( ranges == NULL ) { dt_printk(XENLOG_ERR "DT: no ranges; cannot translate\n"); return 1; } if ( rlen == 0 ) { offset = dt_read_number(addr, na); memset(addr, 0, pna * 4); dt_dprintk("DT: empty ranges; 1:1 translation\n"); goto finish; } dt_dprintk("DT: walking ranges...\n"); /* Now walk through the ranges */ rlen /= 4; rone = na + pna + ns; for ( ; rlen >= rone; rlen -= rone, ranges += rone ) { offset = bus->map(addr, ranges, na, ns, pna); if ( offset != DT_BAD_ADDR ) break; } if ( offset == DT_BAD_ADDR ) { dt_dprintk("DT: not found !\n"); return 1; } memcpy(addr, ranges + na, 4 * pna); finish: dt_dump_addr("DT: parent translation for:", addr, pna); dt_dprintk("DT: with offset: %llx\n", (unsigned long long)offset); /* Translate it into parent bus space */ return pbus->translate(addr, offset, pna); } /* * Translate an address from the device-tree into a CPU physical address, * this walks up the tree and applies the various bus mappings on the * way. * * Note: We consider that crossing any level with #size-cells == 0 to mean * that translation is impossible (that is we are not dealing with a value * that can be mapped to a cpu physical address). 
This is not really specified * that way, but this is traditionally the way IBM at least do things */ static u64 __dt_translate_address(const struct dt_device_node *dev, const __be32 *in_addr, const char *rprop) { const struct dt_device_node *parent = NULL; const struct dt_bus *bus, *pbus; __be32 addr[DT_MAX_ADDR_CELLS]; int na, ns, pna, pns; u64 result = DT_BAD_ADDR; dt_dprintk("DT: ** translation for device %s **\n", dev->full_name); /* Get parent & match bus type */ parent = dt_get_parent(dev); if ( parent == NULL ) goto bail; bus = dt_match_bus(parent); if ( !bus ) goto bail; /* Count address cells & copy address locally */ bus->count_cells(dev, &na, &ns); if ( !DT_CHECK_COUNTS(na, ns) ) { dt_printk(XENLOG_ERR "dt_parse: Bad cell count for device %s\n", dev->full_name); goto bail; } memcpy(addr, in_addr, na * 4); dt_dprintk("DT: bus is %s (na=%d, ns=%d) on %s\n", bus->name, na, ns, parent->full_name); dt_dump_addr("DT: translating address:", addr, na); /* Translate */ for ( ;; ) { /* Switch to parent bus */ dev = parent; parent = dt_get_parent(dev); /* If root, we have finished */ if ( parent == NULL ) { dt_dprintk("DT: reached root node\n"); result = dt_read_number(addr, na); break; } /* Get new parent bus and counts */ pbus = dt_match_bus(parent); if ( pbus == NULL ) { dt_printk("DT: %s is not a valid bus\n", parent->full_name); break; } pbus->count_cells(dev, &pna, &pns); if ( !DT_CHECK_COUNTS(pna, pns) ) { printk(XENLOG_ERR "dt_parse: Bad cell count for parent %s\n", dev->full_name); break; } dt_dprintk("DT: parent bus is %s (na=%d, ns=%d) on %s\n", pbus->name, pna, pns, parent->full_name); /* Apply bus translation */ if ( dt_translate_one(dev, bus, pbus, addr, na, ns, pna, rprop) ) break; /* Complete the move up one level */ na = pna; ns = pns; bus = pbus; dt_dump_addr("DT: one level translation:", addr, na); } bail: return result; } /* dt_device_address - Translate device tree address and return it */ int dt_device_get_address(const struct dt_device_node *dev, int index, u64 *addr, u64 *size) { const __be32 *addrp; unsigned int flags; addrp = dt_get_address(dev, index, size, &flags); if ( addrp == NULL ) return -EINVAL; if ( !addr ) return -EINVAL; *addr = __dt_translate_address(dev, addrp, "ranges"); if ( *addr == DT_BAD_ADDR ) return -EINVAL; return 0; } /** * dt_find_node_by_phandle - Find a node given a phandle * @handle: phandle of the node to find * * Returns a node pointer. */ static const struct dt_device_node *dt_find_node_by_phandle(dt_phandle handle) { const struct dt_device_node *np; dt_for_each_device_node(dt_host, np) if ( np->phandle == handle ) break; return np; } /** * dt_irq_find_parent - Given a device node, find its interrupt parent node * @child: pointer to device node * * Returns a pointer to the interrupt parent node, or NULL if the interrupt * parent could not be determined. 
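 * The lookup follows explicit "interrupt-parent" phandles where present,
 * otherwise the node's natural parent, until a node carrying
 * "#interrupt-cells" is reached.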
*/ static const struct dt_device_node * dt_irq_find_parent(const struct dt_device_node *child) { const struct dt_device_node *p; const __be32 *parp; do { parp = dt_get_property(child, "interrupt-parent", NULL); if ( parp == NULL ) p = dt_get_parent(child); else p = dt_find_node_by_phandle(be32_to_cpup(parp)); child = p; } while ( p && dt_get_property(p, "#interrupt-cells", NULL) == NULL ); return p; } unsigned int dt_number_of_irq(const struct dt_device_node *device) { const struct dt_device_node *p; const __be32 *intspec, *tmp; u32 intsize, intlen; dt_dprintk("dt_irq_number: dev=%s\n", device->full_name); /* Get the interrupts property */ intspec = dt_get_property(device, "interrupts", &intlen); if ( intspec == NULL ) return 0; intlen /= sizeof(*intspec); dt_dprintk(" intspec=%d intlen=%d\n", be32_to_cpup(intspec), intlen); /* Look for the interrupt parent. */ p = dt_irq_find_parent(device); if ( p == NULL ) return 0; /* Get size of interrupt specifier */ tmp = dt_get_property(p, "#interrupt-cells", NULL); if ( tmp == NULL ) return 0; intsize = be32_to_cpu(*tmp); dt_dprintk(" intsize=%d intlen=%d\n", intsize, intlen); return (intlen / intsize); } unsigned int dt_number_of_address(const struct dt_device_node *dev) { const __be32 *prop; u32 psize; const struct dt_device_node *parent; const struct dt_bus *bus; int onesize, na, ns; /* Get parent & match bus type */ parent = dt_get_parent(dev); if ( parent == NULL ) return 0; bus = dt_match_bus(parent); if ( !bus ) return 0; bus->count_cells(dev, &na, &ns); if ( !DT_CHECK_COUNTS(na, ns) ) return 0; /* Get "reg" or "assigned-addresses" property */ prop = dt_get_property(dev, bus->addresses, &psize); if ( prop == NULL ) return 0; psize /= 4; onesize = na + ns; return (psize / onesize); } /** * dt_irq_map_raw - Low level interrupt tree parsing * @parent: the device interrupt parent * @intspec: interrupt specifier ("interrupts" property of the device) * @ointsize: size of the passed in interrupt specifier * @addr: address specifier (start of "reg" property of the device) * @oirq: structure dt_raw_irq filled by this function * * Returns 0 on success and a negative number on error * * This function is a low-level interrupt tree walking function. It * can be used to do a partial walk with synthesized reg and interrupts * properties, for example when resolving PCI interrupts when no device * node exist for the parent. */ static int dt_irq_map_raw(const struct dt_device_node *parent, const __be32 *intspec, u32 ointsize, const __be32 *addr, struct dt_raw_irq *oirq) { const struct dt_device_node *ipar, *tnode, *old = NULL, *newpar = NULL; const __be32 *tmp, *imap, *imask; u32 intsize = 1, addrsize, newintsize = 0, newaddrsize = 0; u32 imaplen; int match, i; dt_dprintk("dt_irq_map_raw: par=%s,intspec=[0x%08x 0x%08x...],ointsize=%d\n", parent->full_name, be32_to_cpup(intspec), be32_to_cpup(intspec + 1), ointsize); ipar = parent; /* First get the #interrupt-cells property of the current cursor * that tells us how to interpret the passed-in intspec. If there * is none, we are nice and just walk up the tree */ do { tmp = dt_get_property(ipar, "#interrupt-cells", NULL); if ( tmp != NULL ) { intsize = be32_to_cpu(*tmp); break; } tnode = ipar; ipar = dt_irq_find_parent(ipar); } while ( ipar ); if ( ipar == NULL ) { dt_dprintk(" -> no parent found !\n"); goto fail; } dt_dprintk("dt_irq_map_raw: ipar=%s, size=%d\n", ipar->full_name, intsize); if ( ointsize != intsize ) return -EINVAL; /* Look for this #address-cells. 
We have to implement the old linux * trick of looking for the parent here as some device-trees rely on it */ old = ipar; do { tmp = dt_get_property(old, "#address-cells", NULL); tnode = dt_get_parent(old); old = tnode; } while ( old && tmp == NULL ); old = NULL; addrsize = (tmp == NULL) ? 2 : be32_to_cpu(*tmp); dt_dprintk(" -> addrsize=%d\n", addrsize); /* Now start the actual "proper" walk of the interrupt tree */ while ( ipar != NULL ) { /* Now check if cursor is an interrupt-controller and if it is * then we are done */ if ( dt_get_property(ipar, "interrupt-controller", NULL) != NULL ) { dt_dprintk(" -> got it !\n"); if ( intsize > DT_MAX_IRQ_SPEC ) { dt_dprintk(" -> intsize(%u) greater than DT_MAX_IRQ_SPEC(%u)\n", intsize, DT_MAX_IRQ_SPEC); goto fail; } for ( i = 0; i < intsize; i++ ) oirq->specifier[i] = dt_read_number(intspec + i, 1); oirq->size = intsize; oirq->controller = ipar; return 0; } /* Now look for an interrupt-map */ imap = dt_get_property(ipar, "interrupt-map", &imaplen); /* No interrupt map, check for an interrupt parent */ if ( imap == NULL ) { dt_dprintk(" -> no map, getting parent\n"); newpar = dt_irq_find_parent(ipar); goto skiplevel; } imaplen /= sizeof(u32); /* Look for a mask */ imask = dt_get_property(ipar, "interrupt-map-mask", NULL); /* If we were passed no "reg" property and we attempt to parse * an interrupt-map, then #address-cells must be 0. * Fail if it's not. */ if ( addr == NULL && addrsize != 0 ) { dt_dprintk(" -> no reg passed in when needed !\n"); goto fail; } /* Parse interrupt-map */ match = 0; while ( imaplen > (addrsize + intsize + 1) && !match ) { /* Compare specifiers */ match = 1; for ( i = 0; i < addrsize && match; ++i ) { __be32 mask = imask ? imask[i] : cpu_to_be32(0xffffffffu); match = ((addr[i] ^ imap[i]) & mask) == 0; } for ( ; i < (addrsize + intsize) && match; ++i ) { __be32 mask = imask ? imask[i] : cpu_to_be32(0xffffffffu); match = ((intspec[i-addrsize] ^ imap[i]) & mask) == 0; } imap += addrsize + intsize; imaplen -= addrsize + intsize; dt_dprintk(" -> match=%d (imaplen=%d)\n", match, imaplen); /* Get the interrupt parent */ newpar = dt_find_node_by_phandle(be32_to_cpup(imap)); imap++; --imaplen; /* Check if not found */ if ( newpar == NULL ) { dt_dprintk(" -> imap parent not found !\n"); goto fail; } /* Get #interrupt-cells and #address-cells of new * parent */ tmp = dt_get_property(newpar, "#interrupt-cells", NULL); if ( tmp == NULL ) { dt_dprintk(" -> parent lacks #interrupt-cells!\n"); goto fail; } newintsize = be32_to_cpu(*tmp); tmp = dt_get_property(newpar, "#address-cells", NULL); newaddrsize = (tmp == NULL) ? 
0 : be32_to_cpu(*tmp); dt_dprintk(" -> newintsize=%d, newaddrsize=%d\n", newintsize, newaddrsize); /* Check for malformed properties */ if ( imaplen < (newaddrsize + newintsize) ) goto fail; imap += newaddrsize + newintsize; imaplen -= newaddrsize + newintsize; dt_dprintk(" -> imaplen=%d\n", imaplen); } if ( !match ) goto fail; old = newpar; addrsize = newaddrsize; intsize = newintsize; intspec = imap - intsize; addr = intspec - addrsize; skiplevel: /* Iterate again with new parent */ dt_dprintk(" -> new parent: %s\n", dt_node_full_name(newpar)); ipar = newpar; newpar = NULL; } fail: return -EINVAL; } int dt_device_get_raw_irq(const struct dt_device_node *device, int index, struct dt_raw_irq *out_irq) { const struct dt_device_node *p; const __be32 *intspec, *tmp, *addr; u32 intsize, intlen; int res = -EINVAL; dt_dprintk("dt_device_get_raw_irq: dev=%s, index=%d\n", device->full_name, index); /* Get the interrupts property */ intspec = dt_get_property(device, "interrupts", &intlen); if ( intspec == NULL ) return -EINVAL; intlen /= sizeof(*intspec); dt_dprintk(" intspec=%d intlen=%d\n", be32_to_cpup(intspec), intlen); /* Get the reg property (if any) */ addr = dt_get_property(device, "reg", NULL); /* Look for the interrupt parent. */ p = dt_irq_find_parent(device); if ( p == NULL ) return -EINVAL; /* Get size of interrupt specifier */ tmp = dt_get_property(p, "#interrupt-cells", NULL); if ( tmp == NULL ) goto out; intsize = be32_to_cpu(*tmp); dt_dprintk(" intsize=%d intlen=%d\n", intsize, intlen); /* Check index */ if ( (index + 1) * intsize > intlen ) goto out; /* Get new specifier and map it */ res = dt_irq_map_raw(p, intspec + index * intsize, intsize, addr, out_irq); if ( res ) goto out; out: return res; } int dt_irq_translate(const struct dt_raw_irq *raw, struct dt_irq *out_irq) { ASSERT(dt_irq_xlate != NULL); /* TODO: Retrieve the right irq_xlate. This is only work for the gic */ return dt_irq_xlate(raw->specifier, raw->size, &out_irq->irq, &out_irq->type); } int dt_device_get_irq(const struct dt_device_node *device, int index, struct dt_irq *out_irq) { struct dt_raw_irq raw; int res; res = dt_device_get_raw_irq(device, index, &raw); if ( res ) return res; return dt_irq_translate(&raw, out_irq); } bool_t dt_device_is_available(const struct dt_device_node *device) { const char *status; u32 statlen; status = dt_get_property(device, "status", &statlen); if ( status == NULL ) return 1; if ( statlen > 0 ) { if ( !strcmp(status, "okay") || !strcmp(status, "ok") ) return 1; } return 0; } /** * unflatten_dt_node - Alloc and populate a device_node from the flat tree * @fdt: The parent device tree blob * @mem: Memory chunk to use for allocating device nodes and properties * @p: pointer to node in flat tree * @dad: Parent struct device_node * @allnextpp: pointer to ->allnext from last allocated device_node * @fpsize: Size of the node path up at the current depth. 
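 * Returns the updated memory cursor. When @allnextpp is NULL nothing is
 * written: the pass only computes how much memory the expanded node needs.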
*/ static unsigned long __init unflatten_dt_node(const void *fdt, unsigned long mem, unsigned long *p, struct dt_device_node *dad, struct dt_device_node ***allnextpp, unsigned long fpsize) { struct dt_device_node *np; struct dt_property *pp, **prev_pp = NULL; char *pathp; u32 tag; unsigned int l, allocl; int has_name = 0; int new_format = 0; tag = be32_to_cpup((__be32 *)(*p)); if ( tag != FDT_BEGIN_NODE ) { dt_printk(XENLOG_WARNING "Weird tag at start of node: %x\n", tag); return mem; } *p += 4; pathp = (char *)*p; l = allocl = strlen(pathp) + 1; *p = ROUNDUP(*p + l, 4); /* version 0x10 has a more compact unit name here instead of the full * path. we accumulate the full path size using "fpsize", we'll rebuild * it later. We detect this because the first character of the name is * not '/'. */ if ( (*pathp) != '/' ) { new_format = 1; if ( fpsize == 0 ) { /* root node: special case. fpsize accounts for path * plus terminating zero. root node only has '/', so * fpsize should be 2, but we want to avoid the first * level nodes to have two '/' so we use fpsize 1 here */ fpsize = 1; allocl = 2; } else { /* account for '/' and path size minus terminal 0 * already in 'l' */ fpsize += l; allocl = fpsize; } } np = unflatten_dt_alloc(&mem, sizeof(struct dt_device_node) + allocl, __alignof__(struct dt_device_node)); if ( allnextpp ) { memset(np, 0, sizeof(*np)); np->full_name = ((char *)np) + sizeof(struct dt_device_node); /* By default dom0 owns the device */ np->used_by = 0; if ( new_format ) { char *fn = np->full_name; /* rebuild full path for new format */ if ( dad && dad->parent ) { strlcpy(fn, dad->full_name, allocl); #ifdef DEBUG_DT if ( (strlen(fn) + l + 1) != allocl ) { dt_dprintk("%s: p: %d, l: %d, a: %d\n", pathp, (int)strlen(fn), l, allocl); } #endif fn += strlen(fn); } *(fn++) = '/'; memcpy(fn, pathp, l); } else memcpy(np->full_name, pathp, l); prev_pp = &np->properties; **allnextpp = np; *allnextpp = &np->allnext; if ( dad != NULL ) { np->parent = dad; /* we temporarily use the next field as `last_child'*/ if ( dad->next == NULL ) dad->child = np; else dad->next->sibling = np; dad->next = np; } } /* process properties */ while ( 1 ) { u32 sz, noff; const char *pname; tag = be32_to_cpup((__be32 *)(*p)); if ( tag == FDT_NOP ) { *p += 4; continue; } if ( tag != FDT_PROP ) break; *p += 4; sz = be32_to_cpup((__be32 *)(*p)); noff = be32_to_cpup((__be32 *)((*p) + 4)); *p += 8; if ( fdt_version(fdt) < 0x10 ) *p = ROUNDUP(*p, sz >= 8 ? 8 : 4); pname = fdt_string(fdt, noff); if ( pname == NULL ) { dt_dprintk("Can't find property name in list!\n"); break; } if ( strcmp(pname, "name") == 0 ) has_name = 1; l = strlen(pname) + 1; pp = unflatten_dt_alloc(&mem, sizeof(struct dt_property), __alignof__(struct dt_property)); if ( allnextpp ) { /* We accept flattened tree phandles either in * ePAPR-style "phandle" properties, or the * legacy "linux,phandle" properties. If both * appear and have different values, things * will get weird. Don't do that. 
*/ if ( (strcmp(pname, "phandle") == 0) || (strcmp(pname, "linux,phandle") == 0) ) { if ( np->phandle == 0 ) np->phandle = be32_to_cpup((__be32*)*p); } /* And we process the "ibm,phandle" property * used in pSeries dynamic device tree * stuff */ if ( strcmp(pname, "ibm,phandle") == 0 ) np->phandle = be32_to_cpup((__be32 *)*p); pp->name = pname; pp->length = sz; pp->value = (void *)*p; *prev_pp = pp; prev_pp = &pp->next; } *p = ROUNDUP((*p) + sz, 4); } /* with version 0x10 we may not have the name property, recreate * it here from the unit name if absent */ if ( !has_name ) { char *p1 = pathp, *ps = pathp, *pa = NULL; int sz; while ( *p1 ) { if ( (*p1) == '@' ) pa = p1; if ( (*p1) == '/' ) ps = p1 + 1; p1++; } if ( pa < ps ) pa = p1; sz = (pa - ps) + 1; pp = unflatten_dt_alloc(&mem, sizeof(struct dt_property) + sz, __alignof__(struct dt_property)); if ( allnextpp ) { pp->name = "name"; pp->length = sz; pp->value = pp + 1; /* * The device tree creation code assume that the property * "name" is not a fake. * To avoid a big divergence with Linux code, only remove * property link. In this case we will lose a bit of memory */ #if 0 *prev_pp = pp; prev_pp = &pp->next; #endif np->name = pp->value; memcpy(pp->value, ps, sz - 1); ((char *)pp->value)[sz - 1] = 0; dt_dprintk("fixed up name for %s -> %s\n", pathp, (char *)pp->value); } } if ( allnextpp ) { *prev_pp = NULL; np->name = (np->name) ? : dt_get_property(np, "name", NULL); np->type = dt_get_property(np, "device_type", NULL); if ( !np->name ) np->name = ""; if ( !np->type ) np->type = ""; } while ( tag == FDT_BEGIN_NODE || tag == FDT_NOP ) { if ( tag == FDT_NOP ) *p += 4; else mem = unflatten_dt_node(fdt, mem, p, np, allnextpp, fpsize); tag = be32_to_cpup((__be32 *)(*p)); } if ( tag != FDT_END_NODE ) { dt_printk(XENLOG_WARNING "Weird tag at end of node: %x\n", tag); return mem; } *p += 4; return mem; } /** * __unflatten_device_tree - create tree of device_nodes from flat blob * * unflattens a device-tree, creating the * tree of struct device_node. It also fills the "name" and "type" * pointers of the nodes so the normal device-tree walking functions * can be used. 
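 * The expanded tree lives in a single allocation whose size is computed by
 * a first, non-populating scan pass.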
* @fdt: The fdt to expand * @mynodes: The device_node tree created by the call */ static void __init __unflatten_device_tree(const void *fdt, struct dt_device_node **mynodes) { unsigned long start, mem, size; struct dt_device_node **allnextp = mynodes; dt_dprintk(" -> unflatten_device_tree()\n"); dt_dprintk("Unflattening device tree:\n"); dt_dprintk("magic: %#08x\n", fdt_magic(fdt)); dt_dprintk("size: %#08x\n", fdt_totalsize(fdt)); dt_dprintk("version: %#08x\n", fdt_version(fdt)); /* First pass, scan for size */ start = ((unsigned long)fdt) + fdt_off_dt_struct(fdt); size = unflatten_dt_node(fdt, 0, &start, NULL, NULL, 0); size = (size | 3) + 1; dt_dprintk(" size is %#lx allocating...\n", size); /* Allocate memory for the expanded device tree */ mem = (unsigned long)_xmalloc (size + 4, __alignof__(struct dt_device_node)); ((__be32 *)mem)[size / 4] = cpu_to_be32(0xdeadbeef); dt_dprintk(" unflattening %lx...\n", mem); /* Second pass, do actual unflattening */ start = ((unsigned long)fdt) + fdt_off_dt_struct(fdt); unflatten_dt_node(fdt, mem, &start, NULL, &allnextp, 0); if ( be32_to_cpup((__be32 *)start) != FDT_END ) dt_printk(XENLOG_WARNING "Weird tag at end of tree: %08x\n", *((u32 *)start)); if ( be32_to_cpu(((__be32 *)mem)[size / 4]) != 0xdeadbeef ) dt_printk(XENLOG_WARNING "End of tree marker overwritten: %08x\n", be32_to_cpu(((__be32 *)mem)[size / 4])); *allnextp = NULL; dt_dprintk(" <- unflatten_device_tree()\n"); } static void dt_alias_add(struct dt_alias_prop *ap, struct dt_device_node *np, int id, const char *stem, int stem_len) { ap->np = np; ap->id = id; strlcpy(ap->stem, stem, stem_len + 1); list_add_tail(&ap->link, &aliases_lookup); dt_dprintk("adding DT alias:%s: stem=%s id=%d node=%s\n", ap->alias, ap->stem, ap->id, dt_node_full_name(np)); } /** * dt_alias_scan - Scan all properties of 'aliases' node * * The function scans all the properties of 'aliases' node and populate * the the global lookup table with the properties. It returns the * number of alias_prop found, or error code in error case. 
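 * (In this implementation the function actually returns void; alias
 * entries that cannot be allocated are simply skipped.)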
*/ static void __init dt_alias_scan(void) { const struct dt_property *pp; const struct dt_device_node *aliases; aliases = dt_find_node_by_path("/aliases"); if ( !aliases ) return; dt_for_each_property_node( aliases, pp ) { const char *start = pp->name; const char *end = start + strlen(start); struct dt_device_node *np; struct dt_alias_prop *ap; int id, len; /* Skip those we do not want to proceed */ if ( !strcmp(pp->name, "name") || !strcmp(pp->name, "phandle") || !strcmp(pp->name, "linux,phandle") ) continue; np = dt_find_node_by_path(pp->value); if ( !np ) continue; /* walk the alias backwards to extract the id and work out * the 'stem' string */ while ( isdigit(*(end-1)) && end > start ) end--; len = end - start; id = simple_strtoll(end, NULL, 10); /* Allocate an alias_prop with enough space for the stem */ ap = _xmalloc(sizeof(*ap) + len + 1, 4); if ( !ap ) continue; ap->alias = start; dt_alias_add(ap, np, id, start, len); } } struct dt_device_node * __init dt_find_interrupt_controller(const struct dt_device_match *matches) { struct dt_device_node *np = NULL; while ( (np = dt_find_matching_node(np, matches)) ) { if ( !dt_find_property(np, "interrupt-controller", NULL) ) continue; if ( dt_get_parent(np) ) break; } return np; } void __init dt_unflatten_host_device_tree(void) { __unflatten_device_tree(device_tree_flattened, &dt_host); dt_alias_scan(); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/0000775000175000017500000000000012307313555012713 5ustar smbsmbxen-4.4.0/xen/drivers/Makefile0000664000175000017500000000025412307313555014354 0ustar smbsmbsubdir-y += char subdir-$(HAS_CPUFREQ) += cpufreq subdir-$(HAS_PCI) += pci subdir-$(HAS_PASSTHROUGH) += passthrough subdir-$(HAS_ACPI) += acpi subdir-$(HAS_VIDEO) += video xen-4.4.0/xen/drivers/video/0000775000175000017500000000000012307313555014021 5ustar smbsmbxen-4.4.0/xen/drivers/video/Makefile0000664000175000017500000000031612307313555015461 0ustar smbsmbobj-$(HAS_VGA) := vga.o obj-$(HAS_VIDEO) += font_8x14.o obj-$(HAS_VIDEO) += font_8x16.o obj-$(HAS_VIDEO) += font_8x8.o obj-$(HAS_VIDEO) += lfb.o obj-$(HAS_VGA) += vesa.o obj-$(HAS_ARM_HDLCD) += arm_hdlcd.o xen-4.4.0/xen/drivers/video/vesa.c0000664000175000017500000001343512307313555015131 0ustar smbsmb/****************************************************************************** * vesa.c * * VESA linear frame buffer handling. */ #include #include #include #include #include #include #include #include #include "font.h" #include "lfb.h" #define vlfb_info vga_console_info.u.vesa_lfb static void lfb_flush(void); static unsigned char *lfb; static const struct font_desc *font; static bool_t vga_compat; static unsigned int vram_total; integer_param("vesa-ram", vram_total); static unsigned int vram_remap; integer_param("vesa-map", vram_remap); static int font_height; static void __init parse_font_height(const char *s) { if ( simple_strtoul(s, &s, 10) == 8 && (*s++ == 'x') ) font_height = simple_strtoul(s, &s, 10); if ( *s != '\0' ) font_height = 0; } custom_param("font", parse_font_height); void __init vesa_early_init(void) { unsigned int vram_vmode; vga_compat = !(vga_console_info.u.vesa_lfb.gbl_caps & 2); if ( (vlfb_info.bits_per_pixel < 8) || (vlfb_info.bits_per_pixel > 32) ) return; if ( font_height == 0 ) /* choose a sensible default */ font = ((vlfb_info.height <= 600) ? &font_vga_8x8 : (vlfb_info.height <= 768) ? 
&font_vga_8x14 : &font_vga_8x16); else if ( font_height <= 8 ) font = &font_vga_8x8; else if ( font_height <= 14 ) font = &font_vga_8x14; else font = &font_vga_8x16; /* vram_vmode -- that is the amount of memory needed for the * used video mode, i.e. the minimum amount of * memory we need. */ vram_vmode = vlfb_info.height * vlfb_info.bytes_per_line; /* vram_total -- all video memory we have. Used for mtrr * entries. */ vram_total = vram_total ? (vram_total << 20) : (vlfb_info.lfb_size << 16); vram_total = max_t(unsigned int, vram_total, vram_vmode); /* vram_remap -- the amount of video memory we are going to * use for vesafb. With modern cards it is no * option to simply use vram_total as that * wastes plenty of kernel address space. */ vram_remap = (vram_remap ? (vram_remap << 20) : ((vram_vmode + (1 << L2_PAGETABLE_SHIFT) - 1) & ~((1 << L2_PAGETABLE_SHIFT) - 1))); vram_remap = max_t(unsigned int, vram_remap, vram_vmode); vram_remap = min_t(unsigned int, vram_remap, vram_total); } void __init vesa_init(void) { struct lfb_prop lfbp; if ( !font ) return; lfbp.font = font; lfbp.bits_per_pixel = vlfb_info.bits_per_pixel; lfbp.bytes_per_line = vlfb_info.bytes_per_line; lfbp.width = vlfb_info.width; lfbp.height = vlfb_info.height; lfbp.flush = lfb_flush; lfbp.text_columns = vlfb_info.width / font->width; lfbp.text_rows = vlfb_info.height / font->height; lfbp.lfb = lfb = ioremap(vlfb_info.lfb_base, vram_remap); if ( !lfb ) return; memset(lfb, 0, vram_remap); printk(XENLOG_INFO "vesafb: framebuffer at %#x, mapped to 0x%p, " "using %uk, total %uk\n", vlfb_info.lfb_base, lfb, vram_remap >> 10, vram_total >> 10); printk(XENLOG_INFO "vesafb: mode is %dx%dx%u, linelength=%d, font %ux%u\n", vlfb_info.width, vlfb_info.height, vlfb_info.bits_per_pixel, vlfb_info.bytes_per_line, font->width, font->height); printk(XENLOG_INFO "vesafb: %scolor: size=%d:%d:%d:%d, " "shift=%d:%d:%d:%d\n", vlfb_info.bits_per_pixel > 8 ? "True" : vga_compat ? "Pseudo" : "Static Pseudo", vlfb_info.rsvd_size, vlfb_info.red_size, vlfb_info.green_size, vlfb_info.blue_size, vlfb_info.rsvd_pos, vlfb_info.red_pos, vlfb_info.green_pos, vlfb_info.blue_pos); if ( vlfb_info.bits_per_pixel > 8 ) { /* Light grey in truecolor. */ unsigned int grey = 0xaaaaaaaa; lfbp.pixel_on = ((grey >> (32 - vlfb_info. red_size)) << vlfb_info. red_pos) | ((grey >> (32 - vlfb_info.green_size)) << vlfb_info.green_pos) | ((grey >> (32 - vlfb_info. blue_size)) << vlfb_info. blue_pos); } else { /* White(ish) in default pseudocolor palette. 
*/
        lfbp.pixel_on = 7;
    }

    if ( lfb_init(&lfbp) < 0 )
        return;
    video_puts = lfb_redraw_puts;
}

#include

static unsigned int vesa_mtrr;
integer_param("vesa-mtrr", vesa_mtrr);

void __init vesa_mtrr_init(void)
{
    static const int mtrr_types[] = {
        0, MTRR_TYPE_UNCACHABLE, MTRR_TYPE_WRBACK,
        MTRR_TYPE_WRCOMB, MTRR_TYPE_WRTHROUGH };
    unsigned int size_total;
    int rc, type;

    if ( !lfb || (vesa_mtrr == 0) || (vesa_mtrr >= ARRAY_SIZE(mtrr_types)) )
        return;

    type = mtrr_types[vesa_mtrr];
    if ( !type )
        return;

    /* Find the largest power-of-two */
    size_total = vram_total;
    while ( size_total & (size_total - 1) )
        size_total &= size_total - 1;

    /* Try and find a power of two to add */
    do {
        rc = mtrr_add(vlfb_info.lfb_base, size_total, type, 1);
        size_total >>= 1;
    } while ( (size_total >= PAGE_SIZE) && (rc == -EINVAL) );
}

static void lfb_flush(void)
{
    if ( vesa_mtrr == 3 )
        __asm__ __volatile__ ("sfence" : : : "memory");
}

void __init vesa_endboot(bool_t keep)
{
    if ( keep )
    {
        video_puts = lfb_scroll_puts;
        lfb_carriage_return();
    }
    else
    {
        unsigned int i, bpp = (vlfb_info.bits_per_pixel + 7) >> 3;
        for ( i = 0; i < vlfb_info.height; i++ )
            memset(lfb + i * vlfb_info.bytes_per_line,
                   0, vlfb_info.width * bpp);
        lfb_flush();
        lfb_free();
    }
}
xen-4.4.0/xen/drivers/video/vga.c0000664000175000017500000001376412307313555014755 0ustar smbsmb/******************************************************************************
 * vga.c
 *
 * VGA support routines.
 */

#include #include #include #include #include #include #include

/* Filled in by arch boot code. */
struct xen_vga_console_info vga_console_info;

static int vgacon_keep;
static unsigned int xpos, ypos;
static unsigned char *video;

static void vga_text_puts(const char *s);
static void vga_noop_puts(const char *s) {}
void (*video_puts)(const char *) = vga_noop_puts;

/*
 * 'vga=<mode>[,keep]' where <mode> is one of:
 *
 *   'vga=ask':
 *      display a vga menu of available modes
 *
 *   'vga=current':
 *      use the current vga mode without modification
 *
 *   'vga=text-80x<rows>':
 *      text mode, where <rows> is one of {25,28,30,34,43,50,60}
 *
 *   'vga=gfx-<width>x<height>x<depth>':
 *      graphics mode, e.g., vga=gfx-1024x768x16
 *
 *   'vga=mode-<mode>':
 *      specifies a mode as specified in 'vga=ask' menu
 *      (NB. menu modes are displayed in hex, so mode numbers here must
 *      be prefixed with '0x' (e.g., 'vga=mode-0x0318'))
 *
 * The option 'keep' causes Xen to continue to print to the VGA console even
 * after domain 0 starts to boot. The default behaviour is to relinquish
 * control of the console to domain 0.
 */
static char __initdata opt_vga[30] = "";
string_param("vga", opt_vga);

/* VGA text-mode definitions. */
static unsigned int columns, lines;

#define ATTRIBUTE 7

#ifdef CONFIG_X86
void vesa_early_init(void);
void vesa_endboot(bool_t keep);
#else
#define vesa_early_init() ((void)0)
#define vesa_endboot(x) ((void)0)
#endif

void __init video_init(void)
{
    char *p;

    /* Look for 'keep' in comma-separated options.
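       For example (an illustrative value using the syntax documented above),
       "vga=gfx-1024x768x16,keep" selects a graphics mode and, via the loop
       below, sets vgacon_keep so the console is kept after dom0 boots.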
*/ for ( p = opt_vga; p != NULL; p = strchr(p, ',') ) { if ( *p == ',' ) p++; if ( strncmp(p, "keep", 4) == 0 ) vgacon_keep = 1; } switch ( vga_console_info.video_type ) { case XEN_VGATYPE_TEXT_MODE_3: if ( page_is_ram_type(paddr_to_pfn(0xB8000), RAM_TYPE_CONVENTIONAL) || ((video = ioremap(0xB8000, 0x8000)) == NULL) ) return; outw(0x200a, 0x3d4); /* disable cursor */ columns = vga_console_info.u.text_mode_3.columns; lines = vga_console_info.u.text_mode_3.rows; memset(video, 0, columns * lines * 2); video_puts = vga_text_puts; break; case XEN_VGATYPE_VESA_LFB: case XEN_VGATYPE_EFI_LFB: vesa_early_init(); break; default: memset(&vga_console_info, 0, sizeof(vga_console_info)); break; } } void __init video_endboot(void) { if ( video_puts == vga_noop_puts ) return; printk("Xen is %s VGA console.\n", vgacon_keep ? "keeping" : "relinquishing"); if ( !vgacon_keep ) video_puts = vga_noop_puts; else { int bus, devfn; for ( bus = 0; bus < 256; ++bus ) for ( devfn = 0; devfn < 256; ++devfn ) { const struct pci_dev *pdev; u8 b = bus, df = devfn, sb; spin_lock(&pcidevs_lock); pdev = pci_get_pdev(0, bus, devfn); spin_unlock(&pcidevs_lock); if ( !pdev || pci_conf_read16(0, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), PCI_CLASS_DEVICE) != 0x0300 || !(pci_conf_read16(0, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), PCI_COMMAND) & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) ) continue; while ( b ) { switch ( find_upstream_bridge(0, &b, &df, &sb) ) { case 0: b = 0; break; case 1: switch ( pci_conf_read8(0, b, PCI_SLOT(df), PCI_FUNC(df), PCI_HEADER_TYPE) ) { case PCI_HEADER_TYPE_BRIDGE: case PCI_HEADER_TYPE_CARDBUS: if ( pci_conf_read16(0, b, PCI_SLOT(df), PCI_FUNC(df), PCI_BRIDGE_CONTROL) & PCI_BRIDGE_CTL_VGA ) continue; break; } break; } break; } if ( !b ) { printk(XENLOG_INFO "Boot video device %02x:%02x.%u\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); pci_hide_device(bus, devfn); } } } switch ( vga_console_info.video_type ) { case XEN_VGATYPE_TEXT_MODE_3: if ( !vgacon_keep ) memset(video, 0, columns * lines * 2); break; case XEN_VGATYPE_VESA_LFB: case XEN_VGATYPE_EFI_LFB: vesa_endboot(vgacon_keep); break; default: BUG(); } } static void vga_text_puts(const char *s) { char c; while ( (c = *s++) != '\0' ) { if ( (c == '\n') || (xpos >= columns) ) { if ( ++ypos >= lines ) { ypos = lines - 1; memmove(video, video + 2 * columns, ypos * 2 * columns); memset(video + ypos * 2 * columns, 0, 2 * xpos); } xpos = 0; } if ( c != '\n' ) { video[(xpos + ypos * columns) * 2] = c; video[(xpos + ypos * columns) * 2 + 1] = ATTRIBUTE; xpos++; } } } int __init fill_console_start_info(struct dom0_vga_console_info *ci) { memcpy(ci, &vga_console_info, sizeof(*ci)); return 1; } xen-4.4.0/xen/drivers/video/font_8x14.c0000664000175000017500000027365312307313555015737 0ustar smbsmb/**********************************************/ /* */ /* Font file generated by cpi2fnt */ /* */ /**********************************************/ #include #include "font.h" #define FONTDATAMAX (256*14) static const unsigned char fontdata_8x14[FONTDATAMAX] = { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 1 0x01 '^A' */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x81, /* 10000001 */ 0xa5, /* 10100101 */ 0x81, /* 10000001 */ 0x81, /* 10000001 */ 0xbd, /* 10111101 */ 0x99, /* 10011001 */ 0x81, /* 10000001 */ 0x81, 
/* 10000001 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 2 0x02 '^B' */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0xff, /* 11111111 */ 0xdb, /* 11011011 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xc3, /* 11000011 */ 0xe7, /* 11100111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 3 0x03 '^C' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 4 0x04 '^D' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x7c, /* 01111100 */ 0xfe, /* 11111110 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 5 0x05 '^E' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0xe7, /* 11100111 */ 0xe7, /* 11100111 */ 0xe7, /* 11100111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 6 0x06 '^F' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 7 0x07 '^G' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 8 0x08 '^H' */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xe7, /* 11100111 */ 0xc3, /* 11000011 */ 0xc3, /* 11000011 */ 0xe7, /* 11100111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 9 0x09 '^I' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x42, /* 01000010 */ 0x42, /* 01000010 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 10 0x0a '^J' */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xc3, /* 11000011 */ 0x99, /* 10011001 */ 0xbd, /* 10111101 */ 0xbd, /* 10111101 */ 0x99, /* 10011001 */ 0xc3, /* 11000011 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 11 0x0b '^K' */ 0x00, /* 00000000 */ 0x1e, /* 00011110 */ 0x0e, /* 00001110 */ 0x1a, /* 00011010 */ 0x32, /* 00110010 */ 0x78, /* 01111000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 12 0x0c '^L' */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 13 0x0d '^M' */ 
0x00, /* 00000000 */ 0x3f, /* 00111111 */ 0x33, /* 00110011 */ 0x3f, /* 00111111 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x70, /* 01110000 */ 0xf0, /* 11110000 */ 0xe0, /* 11100000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 14 0x0e '^N' */ 0x00, /* 00000000 */ 0x7f, /* 01111111 */ 0x63, /* 01100011 */ 0x7f, /* 01111111 */ 0x63, /* 01100011 */ 0x63, /* 01100011 */ 0x63, /* 01100011 */ 0x63, /* 01100011 */ 0x67, /* 01100111 */ 0xe7, /* 11100111 */ 0xe6, /* 11100110 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 15 0x0f '^O' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xdb, /* 11011011 */ 0x3c, /* 00111100 */ 0xe7, /* 11100111 */ 0x3c, /* 00111100 */ 0xdb, /* 11011011 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 16 0x10 '^P' */ 0x80, /* 10000000 */ 0xc0, /* 11000000 */ 0xe0, /* 11100000 */ 0xf0, /* 11110000 */ 0xf8, /* 11111000 */ 0xfe, /* 11111110 */ 0xf8, /* 11111000 */ 0xf0, /* 11110000 */ 0xe0, /* 11100000 */ 0xc0, /* 11000000 */ 0x80, /* 10000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 17 0x11 '^Q' */ 0x02, /* 00000010 */ 0x06, /* 00000110 */ 0x0e, /* 00001110 */ 0x1e, /* 00011110 */ 0x3e, /* 00111110 */ 0xfe, /* 11111110 */ 0x3e, /* 00111110 */ 0x1e, /* 00011110 */ 0x0e, /* 00001110 */ 0x06, /* 00000110 */ 0x02, /* 00000010 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 18 0x12 '^R' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 19 0x13 '^S' */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 20 0x14 '^T' */ 0x00, /* 00000000 */ 0x7f, /* 01111111 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0x7b, /* 01111011 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 21 0x15 '^U' */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x60, /* 01100000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x0c, /* 00001100 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 22 0x16 '^V' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 23 0x17 '^W' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 24 0x18 '^X' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, 
/* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 25 0x19 '^Y' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 26 0x1a '^Z' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0xfe, /* 11111110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 27 0x1b '^[' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xfe, /* 11111110 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 28 0x1c '^\' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 29 0x1d '^]' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x28, /* 00101000 */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x28, /* 00101000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 30 0x1e '^^' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x38, /* 00111000 */ 0x7c, /* 01111100 */ 0x7c, /* 01111100 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 31 0x1f '^_' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0x7c, /* 01111100 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 32 0x20 ' ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 33 0x21 '!' 
*/ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 34 0x22 '"' */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x24, /* 00100100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 35 0x23 '#' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 36 0x24 '$' */ 0x18, /* 00011000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc2, /* 11000010 */ 0xc0, /* 11000000 */ 0x7c, /* 01111100 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x86, /* 10000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 37 0x25 '%' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc2, /* 11000010 */ 0xc6, /* 11000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc6, /* 11000110 */ 0x86, /* 10000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 38 0x26 '&' */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 39 0x27 ''' */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 40 0x28 '(' */ 0x00, /* 00000000 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 41 0x29 ')' */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 42 0x2a '*' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0xff, /* 11111111 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 43 0x2b '+' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 44 0x2c ',' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 45 0x2d '-' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 46 0x2e '.' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 47 0x2f '/' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x02, /* 00000010 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0x80, /* 10000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 48 0x30 '0' */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 49 0x31 '1' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x38, /* 00111000 */ 0x78, /* 01111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 50 0x32 '2' */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 51 0x33 '3' */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x3c, /* 00111100 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 52 0x34 '4' */ 0x00, /* 00000000 */ 0x0c, /* 00001100 */ 0x1c, /* 00011100 */ 0x3c, /* 00111100 */ 0x6c, /* 01101100 */ 0xcc, /* 11001100 */ 0xfe, /* 11111110 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x1e, /* 00011110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 53 0x35 '5' */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xfc, /* 11111100 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 54 0x36 '6' */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xfc, /* 11111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 55 0x37 '7' */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x00, 
/* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 56 0x38 '8' */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 57 0x39 '9' */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7e, /* 01111110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 58 0x3a ':' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 59 0x3b ';' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 60 0x3c '<' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 61 0x3d '=' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 62 0x3e '>' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 63 0x3f '?' 
*/ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 64 0x40 '@' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xde, /* 11011110 */ 0xde, /* 11011110 */ 0xde, /* 11011110 */ 0xdc, /* 11011100 */ 0xc0, /* 11000000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 65 0x41 'A' */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 66 0x42 'B' */ 0x00, /* 00000000 */ 0xfc, /* 11111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xfc, /* 11111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 67 0x43 'C' */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0xc2, /* 11000010 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc2, /* 11000010 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 68 0x44 'D' */ 0x00, /* 00000000 */ 0xf8, /* 11111000 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0xf8, /* 11111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 69 0x45 'E' */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x66, /* 01100110 */ 0x62, /* 01100010 */ 0x68, /* 01101000 */ 0x78, /* 01111000 */ 0x68, /* 01101000 */ 0x60, /* 01100000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 70 0x46 'F' */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x66, /* 01100110 */ 0x62, /* 01100010 */ 0x68, /* 01101000 */ 0x78, /* 01111000 */ 0x68, /* 01101000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 71 0x47 'G' */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0xc2, /* 11000010 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xde, /* 11011110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x66, /* 01100110 */ 0x3a, /* 00111010 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 72 0x48 'H' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 73 0x49 'I' */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 74 0x4a 'J' */ 0x00, /* 00000000 */ 0x1e, /* 00011110 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 
00001100 */ 0x0c, /* 00001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 75 0x4b 'K' */ 0x00, /* 00000000 */ 0xe6, /* 11100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0x78, /* 01111000 */ 0x78, /* 01111000 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 76 0x4c 'L' */ 0x00, /* 00000000 */ 0xf0, /* 11110000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 77 0x4d 'M' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xee, /* 11101110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xd6, /* 11010110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 78 0x4e 'N' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xe6, /* 11100110 */ 0xf6, /* 11110110 */ 0xfe, /* 11111110 */ 0xde, /* 11011110 */ 0xce, /* 11001110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 79 0x4f 'O' */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 80 0x50 'P' */ 0x00, /* 00000000 */ 0xfc, /* 11111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 81 0x51 'Q' */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xde, /* 11011110 */ 0x7c, /* 01111100 */ 0x0c, /* 00001100 */ 0x0e, /* 00001110 */ 0x00, /* 00000000 */ /* 82 0x52 'R' */ 0x00, /* 00000000 */ 0xfc, /* 11111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 83 0x53 'S' */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x60, /* 01100000 */ 0x38, /* 00111000 */ 0x0c, /* 00001100 */ 0x06, /* 00000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 84 0x54 'T' */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x5a, /* 01011010 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 85 0x55 'U' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, 
/* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 86 0x56 'V' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 87 0x57 'W' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xfe, /* 11111110 */ 0xee, /* 11101110 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 88 0x58 'X' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0x38, /* 00111000 */ 0x7c, /* 01111100 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 89 0x59 'Y' */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 90 0x5a 'Z' */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0x86, /* 10000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc2, /* 11000010 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 91 0x5b '[' */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 92 0x5c '\' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x80, /* 10000000 */ 0xc0, /* 11000000 */ 0xe0, /* 11100000 */ 0x70, /* 01110000 */ 0x38, /* 00111000 */ 0x1c, /* 00011100 */ 0x0e, /* 00001110 */ 0x06, /* 00000110 */ 0x02, /* 00000010 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 93 0x5d ']' */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 94 0x5e '^' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 95 0x5f '_' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ /* 96 0x60 '`' */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 97 0x61 'a' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, 
/* 00000000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 98 0x62 'b' */ 0x00, /* 00000000 */ 0xe0, /* 11100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x78, /* 01111000 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 99 0x63 'c' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 100 0x64 'd' */ 0x00, /* 00000000 */ 0x1c, /* 00011100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x3c, /* 00111100 */ 0x6c, /* 01101100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 101 0x65 'e' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 102 0x66 'f' */ 0x00, /* 00000000 */ 0x1c, /* 00011100 */ 0x36, /* 00110110 */ 0x32, /* 00110010 */ 0x30, /* 00110000 */ 0x78, /* 01111000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 103 0x67 'g' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x7c, /* 01111100 */ 0x0c, /* 00001100 */ 0xcc, /* 11001100 */ 0x78, /* 01111000 */ /* 104 0x68 'h' */ 0x00, /* 00000000 */ 0xe0, /* 11100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x6c, /* 01101100 */ 0x76, /* 01110110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 105 0x69 'i' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 106 0x6a 'j' */ 0x00, /* 00000000 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ 0x0e, /* 00001110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ /* 107 0x6b 'k' */ 0x00, /* 00000000 */ 0xe0, /* 11100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0x78, /* 01111000 */ 0x78, /* 01111000 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 108 0x6c 'l' */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 
00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 109 0x6d 'm' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xec, /* 11101100 */ 0xfe, /* 11111110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 110 0x6e 'n' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 111 0x6f 'o' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 112 0x70 'p' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ /* 113 0x71 'q' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x7c, /* 01111100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x1e, /* 00011110 */ /* 114 0x72 'r' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x76, /* 01110110 */ 0x66, /* 01100110 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 115 0x73 's' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x60, /* 01100000 */ 0x38, /* 00111000 */ 0x0c, /* 00001100 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 116 0x74 't' */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0xfc, /* 11111100 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x36, /* 00110110 */ 0x1c, /* 00011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 117 0x75 'u' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 118 0x76 'v' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 119 0x77 'w' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x00, /* 00000000 */ /* 120 0x78 'x' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x38, /* 00111000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 121 0x79 'y' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7e, /* 01111110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0xf8, /* 11111000 */ /* 122 0x7a 'z' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xcc, /* 11001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 123 0x7b '{' */ 0x00, /* 00000000 */ 0x0e, /* 00001110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x70, /* 01110000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x0e, /* 00001110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 124 0x7c '|' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 125 0x7d '}' */ 0x00, /* 00000000 */ 0x70, /* 01110000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x0e, /* 00001110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 126 0x7e '~' */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 127 0x7f '' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 128 0x80 'Ÿˆ */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0xc2, /* 11000010 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc2, /* 11000010 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ /* 129 0x81 '¡ˆ */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 130 0x82 '£ˆ */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 131 0x83 '¥ˆ */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 
*/ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 132 0x84 '§ˆ */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 133 0x85 '©ˆ */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 134 0x86 '«ˆ */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 135 0x87 '­ˆ */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x18, /* 00011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ /* 136 0x88 '¯ˆ */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 137 0x89 '±ˆ */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 138 0x8a '³ˆ */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 139 0x8b 'µˆ */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 140 0x8c '·ˆ */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 141 0x8d '¹ˆ */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 142 0x8e '»ˆ */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 
0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 143 0x8f '½ˆ */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 144 0x90 '¿ˆ */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x66, /* 01100110 */ 0x62, /* 01100010 */ 0x68, /* 01101000 */ 0x78, /* 01111000 */ 0x68, /* 01101000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 145 0x91 'Áˆ */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xec, /* 11101100 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x7e, /* 01111110 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0x6e, /* 01101110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 146 0x92 'È */ 0x00, /* 00000000 */ 0x3e, /* 00111110 */ 0x6c, /* 01101100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xfe, /* 11111110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xce, /* 11001110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 147 0x93 'ň */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 148 0x94 'Lj */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 149 0x95 'Ɉ */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 150 0x96 'ˈ */ 0x30, /* 00110000 */ 0x78, /* 01111000 */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 151 0x97 '͈ */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 152 0x98 'ψ */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7e, /* 01111110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x78, /* 01111000 */ /* 153 0x99 'ш */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ /* 154 0x9a 'Óˆ */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 155 0x9b 'Õˆ */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 156 0x9c '׈ */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x64, /* 01100100 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xe6, /* 11100110 */ 0xfc, /* 11111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 157 0x9d 'Ùˆ */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 158 0x9e 'Ûˆ */ 0xf8, /* 11111000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xf8, /* 11111000 */ 0xc4, /* 11000100 */ 0xcc, /* 11001100 */ 0xde, /* 11011110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 159 0x9f '݈ */ 0x0e, /* 00001110 */ 0x1b, /* 00011011 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xd8, /* 11011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 160 0xa0 ' ' */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 161 0xa1 '¡' */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 162 0xa2 '¢' */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 163 0xa3 '£' */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 164 0xa4 '¤' */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 165 0xa5 '¥' */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xe6, /* 11100110 
*/ 0xf6, /* 11110110 */ 0xfe, /* 11111110 */ 0xde, /* 11011110 */ 0xce, /* 11001110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 166 0xa6 '¦' */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x3e, /* 00111110 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 167 0xa7 '§' */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 168 0xa8 '¨' */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 169 0xa9 '©' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 170 0xaa 'ª' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 171 0xab '«' */ 0x60, /* 01100000 */ 0xe0, /* 11100000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xdc, /* 11011100 */ 0x86, /* 10000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x3e, /* 00111110 */ 0x00, /* 00000000 */ /* 172 0xac '¬' */ 0x60, /* 01100000 */ 0xe0, /* 11100000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x66, /* 01100110 */ 0xce, /* 11001110 */ 0x9a, /* 10011010 */ 0x3f, /* 00111111 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ /* 173 0xad '­' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 174 0xae '®' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x36, /* 00110110 */ 0x6c, /* 01101100 */ 0xd8, /* 11011000 */ 0x6c, /* 01101100 */ 0x36, /* 00110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 175 0xaf '¯' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xd8, /* 11011000 */ 0x6c, /* 01101100 */ 0x36, /* 00110110 */ 0x6c, /* 01101100 */ 0xd8, /* 11011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 176 0xb0 '°' */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 
0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ /* 177 0xb1 '±' */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ /* 178 0xb2 '²' */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ /* 179 0xb3 '³' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 180 0xb4 '´' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 181 0xb5 'µ' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 182 0xb6 '¶' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf6, /* 11110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 183 0xb7 '·' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 184 0xb8 '¸' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 185 0xb9 '¹' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf6, /* 11110110 */ 0x06, /* 00000110 */ 0xf6, /* 11110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 186 0xba 'º' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 187 0xbb '»' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x06, /* 00000110 */ 0xf6, /* 11110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 188 
0xbc '¼' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf6, /* 11110110 */ 0x06, /* 00000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 189 0xbd '½' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 190 0xbe '¾' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 191 0xbf '¿' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 192 0xc0 'À' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 193 0xc1 'Á' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 194 0xc2 'Â' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 195 0xc3 'Ã' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 196 0xc4 'Ä' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 197 0xc5 'Å' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 198 0xc6 'Æ' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 199 0xc7 'Ç' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 
00110110 */ 0x36, /* 00110110 */ 0x37, /* 00110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 200 0xc8 'È' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x37, /* 00110111 */ 0x30, /* 00110000 */ 0x3f, /* 00111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 201 0xc9 'É' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3f, /* 00111111 */ 0x30, /* 00110000 */ 0x37, /* 00110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 202 0xca 'Ê' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf7, /* 11110111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 203 0xcb 'Ë' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xf7, /* 11110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 204 0xcc 'Ì' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x37, /* 00110111 */ 0x30, /* 00110000 */ 0x37, /* 00110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 205 0xcd 'Í' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 206 0xce 'Î' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf7, /* 11110111 */ 0x00, /* 00000000 */ 0xf7, /* 11110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 207 0xcf 'Ï' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 208 0xd0 'Ð' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 209 0xd1 'Ñ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 210 0xd2 'Ò' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 
00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 211 0xd3 'Ó' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x3f, /* 00111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 212 0xd4 'Ô' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 213 0xd5 'Õ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 214 0xd6 'Ö' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3f, /* 00111111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 215 0xd7 '×' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xff, /* 11111111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 216 0xd8 'Ø' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 217 0xd9 'Ù' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 218 0xda 'Ú' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 219 0xdb 'Û' */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 220 0xdc 'Ü' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 221 0xdd 'Ý' */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ /* 222 0xde 'Þ' 
*/ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ /* 223 0xdf 'ß' */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 224 0xe0 '߈ */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xdc, /* 11011100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 225 0xe1 'ሠ*/ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xd8, /* 11011000 */ 0xcc, /* 11001100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 226 0xe2 '㈠*/ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 227 0xe3 'åˆ */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 228 0xe4 'çˆ */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 229 0xe5 'éˆ */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 230 0xe6 'ëˆ */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ /* 231 0xe7 'íˆ */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 232 0xe8 'ïˆ */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 233 0xe9 'ñˆ */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 
11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 234 0xea 'óˆ */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0xee, /* 11101110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 235 0xeb 'õˆ */ 0x00, /* 00000000 */ 0x1e, /* 00011110 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x3e, /* 00111110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 236 0xec '÷ˆ */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 237 0xed 'ùˆ */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x03, /* 00000011 */ 0x06, /* 00000110 */ 0x7e, /* 01111110 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0xf3, /* 11110011 */ 0x7e, /* 01111110 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 238 0xee 'ûˆ */ 0x00, /* 00000000 */ 0x1c, /* 00011100 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x7c, /* 01111100 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x1c, /* 00011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 239 0xef 'ýˆ */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 240 0xf0 'ð' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 241 0xf1 'ñ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 242 0xf2 'ò' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 243 0xf3 'ó' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 244 0xf4 'ô' */ 0x00, /* 00000000 */ 0x0e, /* 00001110 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 
00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 245 0xf5 'õ' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 246 0xf6 'ö' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 247 0xf7 '÷' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 248 0xf8 'ø' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 249 0xf9 'ù' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 250 0xfa 'ú' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 251 0xfb 'û' */ 0x0f, /* 00001111 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0xec, /* 11101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x3c, /* 00111100 */ 0x1c, /* 00011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 252 0xfc 'ü' */ 0x6c, /* 01101100 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 253 0xfd 'ý' */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x32, /* 00110010 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 254 0xfe 'þ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 255 0xff 'ÿ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ }; const struct font_desc 
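/*
 * Added note, not part of the original source: each glyph in fontdata_8x14
 * above is stored as 14 consecutive bytes, one byte per scan line with
 * bit 7 as the leftmost pixel, so the descriptor below derives its glyph
 * count as sizeof(fontdata_8x14) / 14.
 */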
font_vga_8x14 = { "VGA8x14", 8, 14, sizeof(fontdata_8x14) / 14, fontdata_8x14 }; xen-4.4.0/xen/drivers/video/lfb.h0000664000175000017500000000231012307313555014731 0ustar smbsmb/* * xen/drivers/video/lfb.h * * Cross-platform framebuffer library * * Stefano Stabellini * Copyright (c) 2013 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #ifndef _XEN_LFB_H #define _XEN_LFB_H #include struct lfb_prop { const struct font_desc *font; unsigned char *lfb; unsigned int pixel_on; uint16_t width, height; uint16_t bytes_per_line; uint16_t bits_per_pixel; void (*flush)(void); unsigned int text_columns; unsigned int text_rows; }; void lfb_redraw_puts(const char *s); void lfb_scroll_puts(const char *s); void lfb_carriage_return(void); void lfb_free(void); /* initialize the framebuffer */ int lfb_init(struct lfb_prop *lfbp); #endif xen-4.4.0/xen/drivers/video/arm_hdlcd.c0000664000175000017500000002044412307313555016106 0ustar smbsmb/* * xen/drivers/video/arm_hdlcd.c * * Driver for ARM HDLCD Controller * * Stefano Stabellini * Copyright (c) 2013 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
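 *
 * Added note, not part of the original header: video_init() below looks up
 * the device tree node compatible with "arm,hdlcd", maps the framebuffer
 * described by its "framebuffer" property, parses an optional mode string
 * such as "1280x1024-32@60" (if the "-bpp" part is missing, 32 bpp is
 * assumed), and programs the HDLCD timing and colour-channel registers
 * before handing the surface over to the generic lfb code.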
*/ #include #include #include #include #include #include #include #include #include #include "font.h" #include "lfb.h" #include "modelines.h" #define HDLCD ((volatile uint32_t *) FIXMAP_ADDR(FIXMAP_MISC)) #define HDLCD_INTMASK (0x18/4) #define HDLCD_FBBASE (0x100/4) #define HDLCD_LINELENGTH (0x104/4) #define HDLCD_LINECOUNT (0x108/4) #define HDLCD_LINEPITCH (0x10C/4) #define HDLCD_BUS (0x110/4) #define HDLCD_VSYNC (0x200/4) #define HDLCD_VBACK (0x204/4) #define HDLCD_VDATA (0x208/4) #define HDLCD_VFRONT (0x20C/4) #define HDLCD_HSYNC (0x210/4) #define HDLCD_HBACK (0x214/4) #define HDLCD_HDATA (0x218/4) #define HDLCD_HFRONT (0x21C/4) #define HDLCD_POLARITIES (0x220/4) #define HDLCD_COMMAND (0x230/4) #define HDLCD_PF (0x240/4) #define HDLCD_RED (0x244/4) #define HDLCD_GREEN (0x248/4) #define HDLCD_BLUE (0x24C/4) struct color_masks { int red_shift; int red_size; int green_shift; int green_size; int blue_shift; int blue_size; }; struct pixel_colors { const char* bpp; struct color_masks colors; }; struct pixel_colors __initdata colors[] = { { "16", { 0, 5, 11, 5, 6, 5 } }, { "24", { 0, 8, 16, 8, 8, 8 } }, { "32", { 0, 8, 16, 8, 8, 8 } }, }; static void vga_noop_puts(const char *s) {} void (*video_puts)(const char *) = vga_noop_puts; static void hdlcd_flush(void) { dsb(); } static int __init get_color_masks(const char* bpp, struct color_masks **masks) { int i; for ( i = 0; i < ARRAY_SIZE(colors); i++ ) { if ( !strncmp(colors[i].bpp, bpp, 2) ) { *masks = &colors[i].colors; return 0; } } return -1; } static void __init set_pixclock(uint32_t pixclock) { if ( dt_find_compatible_node(NULL, NULL, "arm,vexpress") ) vexpress_syscfg(1, V2M_SYS_CFG_OSC_FUNC, V2M_SYS_CFG_OSC5, &pixclock); } void __init video_init(void) { struct lfb_prop lfbp; unsigned char *lfb; paddr_t hdlcd_start, hdlcd_size; paddr_t framebuffer_start, framebuffer_size; const char *mode_string; char _mode_string[16]; int bytes_per_pixel = 4; struct color_masks *c = NULL; struct modeline *videomode = NULL; int i; const struct dt_device_node *dev; const __be32 *cells; u32 lenp; int res; dev = dt_find_compatible_node(NULL, NULL, "arm,hdlcd"); if ( !dev ) { early_printk("HDLCD: Cannot find node compatible with \"arm,hdcld\"\n"); return; } res = dt_device_get_address(dev, 0, &hdlcd_start, &hdlcd_size); if ( !res ) { early_printk("HDLCD: Unable to retrieve MMIO base address\n"); return; } cells = dt_get_property(dev, "framebuffer", &lenp); if ( !cells ) { early_printk("HDLCD: Unable to retrieve framebuffer property\n"); return; } framebuffer_start = dt_next_cell(dt_n_addr_cells(dev), &cells); framebuffer_size = dt_next_cell(dt_n_size_cells(dev), &cells); if ( !hdlcd_start ) { early_printk(KERN_ERR "HDLCD: address missing from device tree, disabling driver\n"); return; } if ( !framebuffer_start ) { early_printk(KERN_ERR "HDLCD: framebuffer address missing from device tree, disabling driver\n"); return; } res = dt_property_read_string(dev, "mode", &mode_string); if ( res ) { get_color_masks("32", &c); memcpy(_mode_string, "1280x1024@60", strlen("1280x1024@60") + 1); bytes_per_pixel = 4; } else if ( strlen(mode_string) < strlen("800x600@60") || strlen(mode_string) > sizeof(_mode_string) - 1 ) { early_printk(KERN_ERR "HDLCD: invalid modeline=%s\n", mode_string); return; } else { char *s = strchr(mode_string, '-'); if ( !s ) { early_printk(KERN_INFO "HDLCD: bpp not found in modeline %s, assume 32 bpp\n", mode_string); get_color_masks("32", &c); memcpy(_mode_string, mode_string, strlen(mode_string) + 1); bytes_per_pixel = 4; } else { if ( strlen(s) < 
6 ) { early_printk(KERN_ERR "HDLCD: invalid mode %s\n", mode_string); return; } s++; if ( get_color_masks(s, &c) < 0 ) { early_printk(KERN_WARNING "HDLCD: unsupported bpp %s\n", s); return; } bytes_per_pixel = simple_strtoll(s, NULL, 10) / 8; } i = s - mode_string - 1; memcpy(_mode_string, mode_string, i); memcpy(_mode_string + i, mode_string + i + 3, 4); } for ( i = 0; i < ARRAY_SIZE(videomodes); i++ ) { if ( !strcmp(_mode_string, videomodes[i].mode) ) { videomode = &videomodes[i]; break; } } if ( !videomode ) { early_printk(KERN_WARNING "HDLCD: unsupported videomode %s\n", _mode_string); return; } if ( framebuffer_size < bytes_per_pixel * videomode->xres * videomode->yres ) { early_printk(KERN_ERR "HDLCD: the framebuffer is too small, disabling the HDLCD driver\n"); return; } early_printk(KERN_INFO "Initializing HDLCD driver\n"); lfb = ioremap_wc(framebuffer_start, framebuffer_size); if ( !lfb ) { early_printk(KERN_ERR "Couldn't map the framebuffer\n"); return; } memset(lfb, 0x00, bytes_per_pixel * videomode->xres * videomode->yres); /* uses FIXMAP_MISC */ set_pixclock(videomode->pixclock); set_fixmap(FIXMAP_MISC, hdlcd_start >> PAGE_SHIFT, DEV_SHARED); HDLCD[HDLCD_COMMAND] = 0; HDLCD[HDLCD_LINELENGTH] = videomode->xres * bytes_per_pixel; HDLCD[HDLCD_LINECOUNT] = videomode->yres - 1; HDLCD[HDLCD_LINEPITCH] = videomode->xres * bytes_per_pixel; HDLCD[HDLCD_PF] = ((bytes_per_pixel - 1) << 3); HDLCD[HDLCD_INTMASK] = 0; HDLCD[HDLCD_FBBASE] = framebuffer_start; HDLCD[HDLCD_BUS] = 0xf00 | (1 << 4); HDLCD[HDLCD_VBACK] = videomode->vback - 1; HDLCD[HDLCD_VSYNC] = videomode->vsync - 1; HDLCD[HDLCD_VDATA] = videomode->yres - 1; HDLCD[HDLCD_VFRONT] = videomode->vfront - 1; HDLCD[HDLCD_HBACK] = videomode->hback - 1; HDLCD[HDLCD_HSYNC] = videomode->hsync - 1; HDLCD[HDLCD_HDATA] = videomode->xres - 1; HDLCD[HDLCD_HFRONT] = videomode->hfront - 1; HDLCD[HDLCD_POLARITIES] = (1 << 2) | (1 << 3); HDLCD[HDLCD_RED] = (c->red_size << 8) | c->red_shift; HDLCD[HDLCD_GREEN] = (c->green_size << 8) | c->green_shift; HDLCD[HDLCD_BLUE] = (c->blue_size << 8) | c->blue_shift; HDLCD[HDLCD_COMMAND] = 1; clear_fixmap(FIXMAP_MISC); lfbp.pixel_on = (((1 << c->red_size) - 1) << c->red_shift) | (((1 << c->green_size) - 1) << c->green_shift) | (((1 << c->blue_size) - 1) << c->blue_shift); lfbp.lfb = lfb; lfbp.font = &font_vga_8x16; lfbp.bits_per_pixel = bytes_per_pixel*8; lfbp.bytes_per_line = bytes_per_pixel*videomode->xres; lfbp.width = videomode->xres; lfbp.height = videomode->yres; lfbp.flush = hdlcd_flush; lfbp.text_columns = videomode->xres / 8; lfbp.text_rows = videomode->yres / 16; if ( lfb_init(&lfbp) < 0 ) return; video_puts = lfb_scroll_puts; } void __init video_endboot(void) { } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/video/modelines.h0000664000175000017500000001016212307313555016151 0ustar smbsmb/* * xen/drivers/video/modelines.h * * Timings for many popular monitor resolutions * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial 
portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * Copyright (c) 1999 by The XFree86 Project, Inc. * Copyright (c) 2013 Citrix Systems */ #ifndef _XEN_MODLINES_H #define _XEN_MODLINES_H struct modeline { const char* mode; /* in the form 1280x1024@60 */ uint32_t pixclock; /* Khz */ uint32_t xres; uint32_t hfront; /* horizontal front porch in pixels */ uint32_t hsync; /* horizontal sync pulse in pixels */ uint32_t hback; /* horizontal back porch in pixels */ uint32_t yres; uint32_t vfront; /* vertical front porch in lines */ uint32_t vsync; /* vertical sync pulse in lines */ uint32_t vback; /* vertical back porch in lines */ }; struct modeline __initdata videomodes[] = { { "640x480@60", 25175, 640, 16, 96, 48, 480, 11, 2, 31 }, { "640x480@72", 31500, 640, 24, 40, 128, 480, 9, 3, 28 }, { "640x480@75", 31500, 640, 16, 96, 48, 480, 11, 2, 32 }, { "640x480@85", 36000, 640, 32, 48, 112, 480, 1, 3, 25 }, { "800x600@56", 38100, 800, 32, 128, 128, 600, 1, 4, 14 }, { "800x600@60", 40000, 800, 40, 128, 88 , 600, 1, 4, 23 }, { "800x600@72", 50000, 800, 56, 120, 64 , 600, 37, 6, 23 }, { "800x600@75", 49500, 800, 16, 80, 160, 600, 1, 2, 21 }, { "800x600@85", 56250, 800, 32, 64, 152, 600, 1, 3, 27 }, { "1024x768@60", 65000, 1024, 24, 136, 160, 768, 3, 6, 29 }, { "1024x768@70", 75000, 1024, 24, 136, 144, 768, 3, 6, 29 }, { "1024x768@75", 78750, 1024, 16, 96, 176, 768, 1, 3, 28 }, { "1024x768@85", 94500, 1024, 48, 96, 208, 768, 1, 3, 36 }, { "1280x1024@60", 108000, 1280, 48, 112, 248, 1024, 1, 3, 38 }, { "1280x1024@75", 135000, 1280, 16, 144, 248, 1024, 1, 3, 38 }, { "1280x1024@85", 157500, 1280, 64, 160, 224, 1024, 1, 3, 44 }, { "1400x1050@60", 122610, 1400, 88, 152, 240, 1050, 1, 3, 33 }, { "1400x1050@75", 155850, 1400, 96, 152, 248, 1050, 1, 3, 42 }, { "1600x1200@60", 162000, 1600, 64, 192, 304, 1200, 1, 3, 46 }, { "1600x1200@65", 175500, 1600, 64, 192, 304, 1200, 1, 3, 46 }, { "1600x1200@70", 189000, 1600, 64, 192, 304, 1200, 1, 3, 46 }, { "1600x1200@75", 202500, 1600, 64, 192, 304, 1200, 1, 3, 46 }, { "1600x1200@85", 229500, 1600, 64, 192, 304, 1200, 1, 3, 46 }, { "1792x1344@60", 204800, 1792, 128, 200, 328, 1344, 1, 3, 46 }, { "1792x1344@75", 261000, 1792, 96, 216, 352, 1344, 1, 3, 69 }, { "1856x1392@60", 218300, 1856, 96, 224, 352, 1392, 1, 3, 43 }, { "1856x1392@75", 288000, 1856, 128, 224, 352, 1392, 1, 3, 104 }, { "1920x1200@75", 193160, 1920, 128, 208, 336, 1200, 1, 3, 38 }, { "1920x1440@60", 234000, 1920, 128, 208, 344, 1440, 1, 3, 56 }, { "1920x1440@75", 297000, 1920, 144, 224, 352, 1440, 1, 3, 56 }, }; #endif xen-4.4.0/xen/drivers/video/lfb.c0000664000175000017500000001176012307313555014735 0ustar smbsmb/****************************************************************************** * lfb.c * * linear frame buffer handling. 
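 *
 * Added note, not part of the original header: two output paths are
 * implemented below.  lfb_redraw_puts() keeps a text_columns x text_rows
 * shadow text buffer and re-renders only the rows that changed, while
 * lfb_scroll_puts() scrolls the framebuffer itself by one text line
 * (font height in pixels) per newline, which interacts better with dom0.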
*/ #include #include #include #include #include "lfb.h" #include "font.h" #define MAX_XRES 1900 #define MAX_YRES 1200 #define MAX_BPP 4 #define MAX_FONT_W 8 #define MAX_FONT_H 16 struct lfb_status { struct lfb_prop lfbp; unsigned char *lbuf, *text_buf; unsigned int *line_len; unsigned int xpos, ypos; }; static struct lfb_status lfb; static void lfb_show_line( const unsigned char *text_line, unsigned char *video_line, unsigned int nr_chars, unsigned int nr_cells) { unsigned int i, j, b, bpp, pixel; bpp = (lfb.lfbp.bits_per_pixel + 7) >> 3; for ( i = 0; i < lfb.lfbp.font->height; i++ ) { unsigned char *ptr = lfb.lbuf; for ( j = 0; j < nr_chars; j++ ) { const unsigned char *bits = lfb.lfbp.font->data; bits += ((text_line[j] * lfb.lfbp.font->height + i) * ((lfb.lfbp.font->width + 7) >> 3)); for ( b = lfb.lfbp.font->width; b--; ) { pixel = (*bits & (1u<width) * bpp); memcpy(video_line, lfb.lbuf, nr_cells * lfb.lfbp.font->width * bpp); video_line += lfb.lfbp.bytes_per_line; } } /* Fast mode which redraws all modified parts of a 2D text buffer. */ void lfb_redraw_puts(const char *s) { unsigned int i, min_redraw_y = lfb.ypos; char c; /* Paste characters into text buffer. */ while ( (c = *s++) != '\0' ) { if ( (c == '\n') || (lfb.xpos >= lfb.lfbp.text_columns) ) { if ( ++lfb.ypos >= lfb.lfbp.text_rows ) { min_redraw_y = 0; lfb.ypos = lfb.lfbp.text_rows - 1; memmove(lfb.text_buf, lfb.text_buf + lfb.lfbp.text_columns, lfb.ypos * lfb.lfbp.text_columns); memset(lfb.text_buf + lfb.ypos * lfb.lfbp.text_columns, 0, lfb.xpos); } lfb.xpos = 0; } if ( c != '\n' ) lfb.text_buf[lfb.xpos++ + lfb.ypos * lfb.lfbp.text_columns] = c; } /* Render modified section of text buffer to VESA linear framebuffer. */ for ( i = min_redraw_y; i <= lfb.ypos; i++ ) { const unsigned char *line = lfb.text_buf + i * lfb.lfbp.text_columns; unsigned int width; for ( width = lfb.lfbp.text_columns; width; --width ) if ( line[width - 1] ) break; lfb_show_line(line, lfb.lfbp.lfb + i * lfb.lfbp.font->height * lfb.lfbp.bytes_per_line, width, max(lfb.line_len[i], width)); lfb.line_len[i] = width; } lfb.lfbp.flush(); } /* Slower line-based scroll mode which interacts better with dom0. */ void lfb_scroll_puts(const char *s) { unsigned int i; char c; while ( (c = *s++) != '\0' ) { if ( (c == '\n') || (lfb.xpos >= lfb.lfbp.text_columns) ) { unsigned int bytes = (lfb.lfbp.width * ((lfb.lfbp.bits_per_pixel + 7) >> 3)); unsigned char *src = lfb.lfbp.lfb + lfb.lfbp.font->height * lfb.lfbp.bytes_per_line; unsigned char *dst = lfb.lfbp.lfb; /* New line: scroll all previous rows up one line. */ for ( i = lfb.lfbp.font->height; i < lfb.lfbp.height; i++ ) { memcpy(dst, src, bytes); src += lfb.lfbp.bytes_per_line; dst += lfb.lfbp.bytes_per_line; } /* Render new line. 
*/ lfb_show_line( lfb.text_buf, lfb.lfbp.lfb + (lfb.lfbp.text_rows-1) * lfb.lfbp.font->height * lfb.lfbp.bytes_per_line, lfb.xpos, lfb.lfbp.text_columns); lfb.xpos = 0; } if ( c != '\n' ) lfb.text_buf[lfb.xpos++] = c; } lfb.lfbp.flush(); } void lfb_carriage_return(void) { lfb.xpos = 0; } int __init lfb_init(struct lfb_prop *lfbp) { if ( lfbp->width > MAX_XRES || lfbp->height > MAX_YRES ) { printk(XENLOG_WARNING "Couldn't initialize a %ux%u framebuffer early.\n", lfbp->width, lfbp->height); return -EINVAL; } lfb.lfbp = *lfbp; lfb.lbuf = xmalloc_bytes(lfb.lfbp.bytes_per_line); lfb.text_buf = xzalloc_bytes(lfb.lfbp.text_columns * lfb.lfbp.text_rows); lfb.line_len = xzalloc_array(unsigned int, lfb.lfbp.text_columns); if ( !lfb.lbuf || !lfb.text_buf || !lfb.line_len ) goto fail; return 0; fail: printk(XENLOG_ERR "Couldn't allocate enough memory to drive the framebuffer\n"); lfb_free(); return -ENOMEM; } void lfb_free(void) { xfree(lfb.lbuf); xfree(lfb.text_buf); xfree(lfb.line_len); } xen-4.4.0/xen/drivers/video/font_8x16.c0000664000175000017500000032465312307313555015736 0ustar smbsmb/**********************************************/ /* */ /* Font file generated by cpi2fnt */ /* */ /**********************************************/ #include #include "font.h" #define FONTDATAMAX (256*16) static const unsigned char fontdata_8x16[FONTDATAMAX] = { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 1 0x01 '^A' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x81, /* 10000001 */ 0xa5, /* 10100101 */ 0x81, /* 10000001 */ 0x81, /* 10000001 */ 0xbd, /* 10111101 */ 0x99, /* 10011001 */ 0x81, /* 10000001 */ 0x81, /* 10000001 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 2 0x02 '^B' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0xff, /* 11111111 */ 0xdb, /* 11011011 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xc3, /* 11000011 */ 0xe7, /* 11100111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 3 0x03 '^C' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 4 0x04 '^D' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x7c, /* 01111100 */ 0xfe, /* 11111110 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 5 0x05 '^E' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0xe7, /* 11100111 */ 0xe7, /* 11100111 */ 0xe7, /* 11100111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 6 0x06 '^F' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 7 0x07 '^G' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 8 0x08 '^H' */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xe7, /* 11100111 */ 0xc3, /* 11000011 */ 0xc3, /* 11000011 */ 0xe7, /* 11100111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 9 0x09 '^I' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x42, /* 01000010 */ 0x42, /* 01000010 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 10 0x0a '^J' */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xc3, /* 11000011 */ 0x99, /* 10011001 */ 0xbd, /* 10111101 */ 0xbd, /* 10111101 */ 0x99, /* 10011001 */ 0xc3, /* 11000011 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 11 0x0b '^K' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1e, /* 00011110 */ 0x0e, /* 00001110 */ 0x1a, /* 00011010 */ 0x32, /* 00110010 */ 0x78, /* 01111000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 12 0x0c '^L' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 13 0x0d '^M' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3f, /* 00111111 */ 0x33, /* 00110011 */ 0x3f, /* 00111111 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x70, /* 01110000 */ 0xf0, /* 11110000 */ 0xe0, /* 11100000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 14 0x0e '^N' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7f, /* 01111111 */ 0x63, /* 01100011 */ 0x7f, /* 01111111 */ 0x63, /* 01100011 */ 0x63, /* 01100011 */ 0x63, /* 01100011 */ 0x63, /* 01100011 */ 0x67, /* 01100111 */ 0xe7, /* 11100111 */ 0xe6, /* 11100110 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 15 0x0f '^O' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xdb, /* 11011011 */ 0x3c, /* 00111100 */ 0xe7, /* 11100111 */ 0x3c, /* 00111100 */ 0xdb, /* 11011011 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 16 0x10 '^P' */ 0x00, /* 00000000 */ 0x80, /* 10000000 */ 0xc0, /* 
11000000 */ 0xe0, /* 11100000 */ 0xf0, /* 11110000 */ 0xf8, /* 11111000 */ 0xfe, /* 11111110 */ 0xf8, /* 11111000 */ 0xf0, /* 11110000 */ 0xe0, /* 11100000 */ 0xc0, /* 11000000 */ 0x80, /* 10000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 17 0x11 '^Q' */ 0x00, /* 00000000 */ 0x02, /* 00000010 */ 0x06, /* 00000110 */ 0x0e, /* 00001110 */ 0x1e, /* 00011110 */ 0x3e, /* 00111110 */ 0xfe, /* 11111110 */ 0x3e, /* 00111110 */ 0x1e, /* 00011110 */ 0x0e, /* 00001110 */ 0x06, /* 00000110 */ 0x02, /* 00000010 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 18 0x12 '^R' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 19 0x13 '^S' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 20 0x14 '^T' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7f, /* 01111111 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0x7b, /* 01111011 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 21 0x15 '^U' */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x60, /* 01100000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x0c, /* 00001100 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 22 0x16 '^V' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 23 0x17 '^W' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 24 0x18 '^X' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 25 0x19 '^Y' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 26 0x1a '^Z' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0xfe, /* 11111110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 27 0x1b '^[' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xfe, /* 11111110 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 28 0x1c '^\' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 29 0x1d '^]' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x28, /* 00101000 */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x28, /* 00101000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 30 0x1e '^^' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x38, /* 00111000 */ 0x7c, /* 01111100 */ 0x7c, /* 01111100 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 31 0x1f '^_' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0x7c, /* 01111100 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 32 0x20 ' ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 33 0x21 '!' 
*/ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 34 0x22 '"' */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x24, /* 00100100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 35 0x23 '#' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 36 0x24 '$' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc2, /* 11000010 */ 0xc0, /* 11000000 */ 0x7c, /* 01111100 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x86, /* 10000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 37 0x25 '%' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc2, /* 11000010 */ 0xc6, /* 11000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc6, /* 11000110 */ 0x86, /* 10000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 38 0x26 '&' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 39 0x27 ''' */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 40 0x28 '(' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 41 0x29 ')' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 42 0x2a '*' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0xff, /* 11111111 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 43 0x2b '+' */ 0x00, /* 
00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 44 0x2c ',' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 45 0x2d '-' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 46 0x2e '.' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 47 0x2f '/' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x02, /* 00000010 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0x80, /* 10000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 48 0x30 '0' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 49 0x31 '1' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x38, /* 00111000 */ 0x78, /* 01111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 50 0x32 '2' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 51 0x33 '3' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x3c, /* 00111100 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 52 0x34 '4' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x0c, /* 00001100 */ 0x1c, /* 00011100 */ 0x3c, /* 00111100 */ 0x6c, /* 01101100 */ 0xcc, /* 11001100 */ 0xfe, /* 11111110 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x1e, /* 00011110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 53 0x35 '5' */ 0x00, /* 00000000 */ 
0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xfc, /* 11111100 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 54 0x36 '6' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xfc, /* 11111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 55 0x37 '7' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 56 0x38 '8' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 57 0x39 '9' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7e, /* 01111110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 58 0x3a ':' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 59 0x3b ';' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 60 0x3c '<' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 61 0x3d '=' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 62 0x3e '>' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 63 0x3f '?' 
*/ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 64 0x40 '@' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xde, /* 11011110 */ 0xde, /* 11011110 */ 0xde, /* 11011110 */ 0xdc, /* 11011100 */ 0xc0, /* 11000000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 65 0x41 'A' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 66 0x42 'B' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfc, /* 11111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xfc, /* 11111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 67 0x43 'C' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0xc2, /* 11000010 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc2, /* 11000010 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 68 0x44 'D' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xf8, /* 11111000 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0xf8, /* 11111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 69 0x45 'E' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x66, /* 01100110 */ 0x62, /* 01100010 */ 0x68, /* 01101000 */ 0x78, /* 01111000 */ 0x68, /* 01101000 */ 0x60, /* 01100000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 70 0x46 'F' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x66, /* 01100110 */ 0x62, /* 01100010 */ 0x68, /* 01101000 */ 0x78, /* 01111000 */ 0x68, /* 01101000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 71 0x47 'G' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0xc2, /* 11000010 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xde, /* 11011110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x66, /* 01100110 */ 0x3a, /* 00111010 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 72 0x48 'H' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 73 0x49 'I' */ 0x00, /* 
00000000 */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 74 0x4a 'J' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1e, /* 00011110 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 75 0x4b 'K' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xe6, /* 11100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0x78, /* 01111000 */ 0x78, /* 01111000 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 76 0x4c 'L' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xf0, /* 11110000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 77 0x4d 'M' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xee, /* 11101110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xd6, /* 11010110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 78 0x4e 'N' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xe6, /* 11100110 */ 0xf6, /* 11110110 */ 0xfe, /* 11111110 */ 0xde, /* 11011110 */ 0xce, /* 11001110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 79 0x4f 'O' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 80 0x50 'P' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfc, /* 11111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 81 0x51 'Q' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xde, /* 11011110 */ 0x7c, /* 01111100 */ 0x0c, /* 00001100 */ 0x0e, /* 00001110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 82 0x52 'R' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfc, /* 11111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 83 0x53 'S' */ 0x00, /* 00000000 */ 
0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x60, /* 01100000 */ 0x38, /* 00111000 */ 0x0c, /* 00001100 */ 0x06, /* 00000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 84 0x54 'T' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x5a, /* 01011010 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 85 0x55 'U' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 86 0x56 'V' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 87 0x57 'W' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xfe, /* 11111110 */ 0xee, /* 11101110 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 88 0x58 'X' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0x38, /* 00111000 */ 0x7c, /* 01111100 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 89 0x59 'Y' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 90 0x5a 'Z' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0x86, /* 10000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc2, /* 11000010 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 91 0x5b '[' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 92 0x5c '\' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x80, /* 10000000 */ 0xc0, /* 11000000 */ 0xe0, /* 11100000 */ 0x70, /* 01110000 */ 0x38, /* 00111000 */ 0x1c, /* 00011100 */ 0x0e, /* 00001110 */ 0x06, /* 00000110 */ 0x02, /* 00000010 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 93 0x5d ']' */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x3c, /* 00111100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 94 0x5e '^' */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 95 0x5f '_' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 96 0x60 '`' */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 97 0x61 'a' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 98 0x62 'b' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xe0, /* 11100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x78, /* 01111000 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 99 0x63 'c' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 100 0x64 'd' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1c, /* 00011100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x3c, /* 00111100 */ 0x6c, /* 01101100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 101 0x65 'e' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 102 0x66 'f' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1c, /* 00011100 */ 0x36, /* 00110110 */ 0x32, /* 00110010 */ 0x30, /* 00110000 */ 0x78, /* 01111000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 103 0x67 'g' */ 0x00, /* 00000000 */ 0x00, /* 00000000 
*/ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x7c, /* 01111100 */ 0x0c, /* 00001100 */ 0xcc, /* 11001100 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ /* 104 0x68 'h' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xe0, /* 11100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x6c, /* 01101100 */ 0x76, /* 01110110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 105 0x69 'i' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 106 0x6a 'j' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ 0x0e, /* 00001110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 107 0x6b 'k' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xe0, /* 11100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0x78, /* 01111000 */ 0x78, /* 01111000 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 108 0x6c 'l' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 109 0x6d 'm' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xec, /* 11101100 */ 0xfe, /* 11111110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 110 0x6e 'n' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 111 0x6f 'o' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 112 0x70 'p' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ /* 113 0x71 'q' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 
0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x7c, /* 01111100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x1e, /* 00011110 */ 0x00, /* 00000000 */ /* 114 0x72 'r' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x76, /* 01110110 */ 0x66, /* 01100110 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 115 0x73 's' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x60, /* 01100000 */ 0x38, /* 00111000 */ 0x0c, /* 00001100 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 116 0x74 't' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0xfc, /* 11111100 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x36, /* 00110110 */ 0x1c, /* 00011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 117 0x75 'u' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 118 0x76 'v' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 119 0x77 'w' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 120 0x78 'x' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x38, /* 00111000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 121 0x79 'y' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7e, /* 01111110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0xf8, /* 11111000 */ 0x00, /* 00000000 */ /* 122 0x7a 'z' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xcc, /* 11001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 123 0x7b '{' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 
0x0e, /* 00001110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x70, /* 01110000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x0e, /* 00001110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 124 0x7c '|' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 125 0x7d '}' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x70, /* 01110000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x0e, /* 00001110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 126 0x7e '~' */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 127 0x7f '' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 128 0x80 '€' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0xc2, /* 11000010 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc2, /* 11000010 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 129 0x81 '' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 130 0x82 '‚' */ 0x00, /* 00000000 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 131 0x83 'ƒ' */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 132 0x84 '„' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 133 0x85 '…' */ 0x00, /* 00000000 */ 0x60, /* 01100000 */ 0x30, 
/* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 134 0x86 '†' */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 135 0x87 '‡' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x18, /* 00011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 136 0x88 'ˆ' */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 137 0x89 '‰' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 138 0x8a 'Š' */ 0x00, /* 00000000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 139 0x8b '‹' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 140 0x8c 'Œ' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 141 0x8d '' */ 0x00, /* 00000000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 142 0x8e 'Ž' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 143 0x8f '' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x38, /* 
00111000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 144 0x90 '' */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x66, /* 01100110 */ 0x62, /* 01100010 */ 0x68, /* 01101000 */ 0x78, /* 01111000 */ 0x68, /* 01101000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 145 0x91 '‘' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xec, /* 11101100 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x7e, /* 01111110 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0x6e, /* 01101110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 146 0x92 '’' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3e, /* 00111110 */ 0x6c, /* 01101100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xfe, /* 11111110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xce, /* 11001110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 147 0x93 '“' */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 148 0x94 '”' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 149 0x95 '•' */ 0x00, /* 00000000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 150 0x96 '–' */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x78, /* 01111000 */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 151 0x97 '—' */ 0x00, /* 00000000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 152 0x98 '˜' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7e, /* 01111110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ /* 153 0x99 '™' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x00, /* 
00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 154 0x9a 'š' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 155 0x9b '›' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 156 0x9c 'œ' */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x64, /* 01100100 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xe6, /* 11100110 */ 0xfc, /* 11111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 157 0x9d '' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 158 0x9e 'ž' */ 0x00, /* 00000000 */ 0xf8, /* 11111000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xf8, /* 11111000 */ 0xc4, /* 11000100 */ 0xcc, /* 11001100 */ 0xde, /* 11011110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 159 0x9f 'Ÿ' */ 0x00, /* 00000000 */ 0x0e, /* 00001110 */ 0x1b, /* 00011011 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xd8, /* 11011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 160 0xa0 ' ' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 161 0xa1 '¡' */ 0x00, /* 00000000 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 162 0xa2 '¢' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 163 0xa3 '£' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x30, /* 
00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 164 0xa4 '¤' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 165 0xa5 '¥' */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xe6, /* 11100110 */ 0xf6, /* 11110110 */ 0xfe, /* 11111110 */ 0xde, /* 11011110 */ 0xce, /* 11001110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 166 0xa6 '¦' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x3e, /* 00111110 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 167 0xa7 '§' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 168 0xa8 '¨' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 169 0xa9 '©' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 170 0xaa 'ª' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 171 0xab '«' */ 0x00, /* 00000000 */ 0x60, /* 01100000 */ 0xe0, /* 11100000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xdc, /* 11011100 */ 0x86, /* 10000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x3e, /* 00111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 172 0xac '¬' */ 0x00, /* 00000000 */ 0x60, /* 01100000 */ 0xe0, /* 11100000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x66, /* 01100110 */ 0xce, /* 11001110 */ 0x9a, /* 10011010 */ 0x3f, /* 00111111 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 173 0xad '­' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 
00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 174 0xae '®' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x36, /* 00110110 */ 0x6c, /* 01101100 */ 0xd8, /* 11011000 */ 0x6c, /* 01101100 */ 0x36, /* 00110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 175 0xaf '¯' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xd8, /* 11011000 */ 0x6c, /* 01101100 */ 0x36, /* 00110110 */ 0x6c, /* 01101100 */ 0xd8, /* 11011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 176 0xb0 '°' */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ 0x11, /* 00010001 */ 0x44, /* 01000100 */ /* 177 0xb1 '±' */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ /* 178 0xb2 '²' */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ /* 179 0xb3 '³' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 180 0xb4 '´' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 181 0xb5 'µ' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 182 0xb6 '¶' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf6, /* 11110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 183 0xb7 '·' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 184 0xb8 '¸' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 185 0xb9 '¹' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf6, /* 11110110 */ 0x06, /* 00000110 */ 0xf6, /* 11110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 186 0xba 'º' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 187 0xbb '»' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x06, /* 00000110 */ 0xf6, /* 11110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 188 0xbc '¼' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf6, /* 11110110 */ 0x06, /* 00000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 189 0xbd '½' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 190 0xbe '¾' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 191 0xbf '¿' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 192 0xc0 'À' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 193 0xc1 'Á' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 
00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 194 0xc2 'Â' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 195 0xc3 'Ã' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 196 0xc4 'Ä' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 197 0xc5 'Å' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 198 0xc6 'Æ' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 199 0xc7 'Ç' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x37, /* 00110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 200 0xc8 'È' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x37, /* 00110111 */ 0x30, /* 00110000 */ 0x3f, /* 00111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 201 0xc9 'É' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3f, /* 00111111 */ 0x30, /* 00110000 */ 0x37, /* 00110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 202 0xca 'Ê' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf7, /* 11110111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 203 0xcb 'Ë' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xf7, /* 11110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 204 0xcc 'Ì' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x37, /* 00110111 */ 0x30, /* 00110000 */ 0x37, /* 00110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 205 0xcd 'Í' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 206 0xce 'Î' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf7, /* 11110111 */ 0x00, /* 00000000 */ 0xf7, /* 11110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 207 0xcf 'Ï' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 208 0xd0 'Ð' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 209 0xd1 'Ñ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 210 0xd2 'Ò' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 211 0xd3 'Ó' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x3f, /* 00111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 212 0xd4 'Ô' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 213 0xd5 'Õ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 214 0xd6 'Ö' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3f, /* 00111111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 215 0xd7 '×' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xff, /* 11111111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 216 0xd8 'Ø' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 217 0xd9 'Ù' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 218 0xda 'Ú' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 219 0xdb 'Û' */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 220 0xdc 'Ü' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 221 0xdd 'Ý' */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ /* 222 0xde 'Þ' */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ /* 223 0xdf 'ß' */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 
11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 224 0xe0 'à' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xdc, /* 11011100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 225 0xe1 'á' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xd8, /* 11011000 */ 0xcc, /* 11001100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 226 0xe2 'â' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 227 0xe3 'ã' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 228 0xe4 'ä' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 229 0xe5 'å' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 230 0xe6 'æ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ /* 231 0xe7 'ç' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 232 0xe8 'è' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 233 0xe9 'é' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x38, /* 
00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 234 0xea 'ê' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0xee, /* 11101110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 235 0xeb 'ë' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1e, /* 00011110 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x3e, /* 00111110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 236 0xec 'ì' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 237 0xed 'í' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x03, /* 00000011 */ 0x06, /* 00000110 */ 0x7e, /* 01111110 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0xf3, /* 11110011 */ 0x7e, /* 01111110 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 238 0xee 'î' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1c, /* 00011100 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x7c, /* 01111100 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x1c, /* 00011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 239 0xef 'ï' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 240 0xf0 'ð' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 241 0xf1 'ñ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 242 0xf2 'ò' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 243 0xf3 'ó' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 244 0xf4 'ô' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x0e, /* 00001110 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 245 0xf5 'õ' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 246 0xf6 'ö' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 247 0xf7 '÷' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 248 0xf8 'ø' */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 249 0xf9 'ù' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 250 0xfa 'ú' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 251 0xfb 'û' */ 0x00, /* 00000000 */ 0x0f, /* 00001111 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0xec, /* 11101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x3c, /* 00111100 */ 0x1c, /* 00011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 252 0xfc 'ü' */ 0x00, /* 00000000 */ 0x6c, /* 01101100 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 253 0xfd 'ý' */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 
01100110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x32, /* 00110010 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 254 0xfe 'þ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 255 0xff 'ÿ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ }; const struct font_desc font_vga_8x16 = { "VGA8x16", 8, 16, sizeof(fontdata_8x16) / 16, fontdata_8x16 }; xen-4.4.0/xen/drivers/video/font.h0000664000175000017500000000077412307313555015150 0ustar smbsmb/* * font.h -- `Soft' font definitions * * Created 1995 by Geert Uytterhoeven * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. */ #ifndef _XEN_FONT_H #define _XEN_FONT_H struct font_desc { const char *name; unsigned width, height, count; const void *data; }; extern const struct font_desc font_vga_8x8, font_vga_8x14, font_vga_8x16; #endif /* _XEN_FONT_H */ xen-4.4.0/xen/drivers/video/font_8x8.c0000664000175000017500000016064312307313555015654 0ustar smbsmb/**********************************************/ /* */ /* Font file generated by cpi2fnt */ /* */ /**********************************************/ #include #include "font.h" #define FONTDATAMAX (256*8) static const unsigned char fontdata_8x8[FONTDATAMAX] = { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 1 0x01 '^A' */ 0x7e, /* 01111110 */ 0x81, /* 10000001 */ 0xa5, /* 10100101 */ 0x81, /* 10000001 */ 0xbd, /* 10111101 */ 0x99, /* 10011001 */ 0x81, /* 10000001 */ 0x7e, /* 01111110 */ /* 2 0x02 '^B' */ 0x7e, /* 01111110 */ 0xff, /* 11111111 */ 0xdb, /* 11011011 */ 0xff, /* 11111111 */ 0xc3, /* 11000011 */ 0xe7, /* 11100111 */ 0xff, /* 11111111 */ 0x7e, /* 01111110 */ /* 3 0x03 '^C' */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x00, /* 00000000 */ /* 4 0x04 '^D' */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x7c, /* 01111100 */ 0xfe, /* 11111110 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0x10, /* 00010000 */ 0x00, /* 00000000 */ /* 5 0x05 '^E' */ 0x38, /* 00111000 */ 0x7c, /* 01111100 */ 0x38, /* 00111000 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xd6, /* 11010110 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ /* 6 0x06 '^F' */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x7c, /* 01111100 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0x7c, /* 01111100 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ /* 7 0x07 '^G' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 8 0x08 '^H' */ 0xff, /* 
11111111 */ 0xff, /* 11111111 */ 0xe7, /* 11100111 */ 0xc3, /* 11000011 */ 0xc3, /* 11000011 */ 0xe7, /* 11100111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 9 0x09 '^I' */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x42, /* 01000010 */ 0x42, /* 01000010 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 10 0x0a '^J' */ 0xff, /* 11111111 */ 0xc3, /* 11000011 */ 0x99, /* 10011001 */ 0xbd, /* 10111101 */ 0xbd, /* 10111101 */ 0x99, /* 10011001 */ 0xc3, /* 11000011 */ 0xff, /* 11111111 */ /* 11 0x0b '^K' */ 0x0f, /* 00001111 */ 0x07, /* 00000111 */ 0x0f, /* 00001111 */ 0x7d, /* 01111101 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x78, /* 01111000 */ /* 12 0x0c '^L' */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ /* 13 0x0d '^M' */ 0x3f, /* 00111111 */ 0x33, /* 00110011 */ 0x3f, /* 00111111 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x70, /* 01110000 */ 0xf0, /* 11110000 */ 0xe0, /* 11100000 */ /* 14 0x0e '^N' */ 0x7f, /* 01111111 */ 0x63, /* 01100011 */ 0x7f, /* 01111111 */ 0x63, /* 01100011 */ 0x63, /* 01100011 */ 0x67, /* 01100111 */ 0xe6, /* 11100110 */ 0xc0, /* 11000000 */ /* 15 0x0f '^O' */ 0x18, /* 00011000 */ 0xdb, /* 11011011 */ 0x3c, /* 00111100 */ 0xe7, /* 11100111 */ 0xe7, /* 11100111 */ 0x3c, /* 00111100 */ 0xdb, /* 11011011 */ 0x18, /* 00011000 */ /* 16 0x10 '^P' */ 0x80, /* 10000000 */ 0xe0, /* 11100000 */ 0xf8, /* 11111000 */ 0xfe, /* 11111110 */ 0xf8, /* 11111000 */ 0xe0, /* 11100000 */ 0x80, /* 10000000 */ 0x00, /* 00000000 */ /* 17 0x11 '^Q' */ 0x02, /* 00000010 */ 0x0e, /* 00001110 */ 0x3e, /* 00111110 */ 0xfe, /* 11111110 */ 0x3e, /* 00111110 */ 0x0e, /* 00001110 */ 0x02, /* 00000010 */ 0x00, /* 00000000 */ /* 18 0x12 '^R' */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ /* 19 0x13 '^S' */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ /* 20 0x14 '^T' */ 0x7f, /* 01111111 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0x7b, /* 01111011 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x00, /* 00000000 */ /* 21 0x15 '^U' */ 0x3e, /* 00111110 */ 0x61, /* 01100001 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x86, /* 10000110 */ 0x7c, /* 01111100 */ /* 22 0x16 '^V' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ /* 23 0x17 '^W' */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ /* 24 0x18 '^X' */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 25 0x19 '^Y' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 26 0x1a '^Z' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0xfe, /* 11111110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 27 0x1b '^[' */ 0x00, /* 
00000000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xfe, /* 11111110 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 28 0x1c '^\' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 29 0x1d '^]' */ 0x00, /* 00000000 */ 0x24, /* 00100100 */ 0x66, /* 01100110 */ 0xff, /* 11111111 */ 0x66, /* 01100110 */ 0x24, /* 00100100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 30 0x1e '^^' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 31 0x1f '^_' */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0x7e, /* 01111110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 32 0x20 ' ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 33 0x21 '!' */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 34 0x22 '"' */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x24, /* 00100100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 35 0x23 '#' */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ /* 36 0x24 '$' */ 0x18, /* 00011000 */ 0x3e, /* 00111110 */ 0x60, /* 01100000 */ 0x3c, /* 00111100 */ 0x06, /* 00000110 */ 0x7c, /* 01111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 37 0x25 '%' */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xcc, /* 11001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x66, /* 01100110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ /* 38 0x26 '&' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 39 0x27 ''' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 40 0x28 '(' */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x00, /* 00000000 */ /* 41 0x29 ')' */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ /* 42 0x2a '*' */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0xff, /* 11111111 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 43 0x2b '+' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 44 0x2c ',' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ /* 45 0x2d '-' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 46 0x2e '.' 
*/ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 47 0x2f '/' */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0x80, /* 10000000 */ 0x00, /* 00000000 */ /* 48 0x30 '0' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ /* 49 0x31 '1' */ 0x18, /* 00011000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ /* 50 0x32 '2' */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x06, /* 00000110 */ 0x1c, /* 00011100 */ 0x30, /* 00110000 */ 0x66, /* 01100110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ /* 51 0x33 '3' */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x06, /* 00000110 */ 0x3c, /* 00111100 */ 0x06, /* 00000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 52 0x34 '4' */ 0x1c, /* 00011100 */ 0x3c, /* 00111100 */ 0x6c, /* 01101100 */ 0xcc, /* 11001100 */ 0xfe, /* 11111110 */ 0x0c, /* 00001100 */ 0x1e, /* 00011110 */ 0x00, /* 00000000 */ /* 53 0x35 '5' */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xfc, /* 11111100 */ 0x06, /* 00000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 54 0x36 '6' */ 0x38, /* 00111000 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ 0xfc, /* 11111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 55 0x37 '7' */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ /* 56 0x38 '8' */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 57 0x39 '9' */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7e, /* 01111110 */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ /* 58 0x3a ':' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 59 0x3b ';' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ /* 60 0x3c '<' */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ /* 61 0x3d '=' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 62 0x3e '>' */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x00, /* 00000000 */ /* 63 0x3f '?' 
*/ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 64 0x40 '@' */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xde, /* 11011110 */ 0xde, /* 11011110 */ 0xde, /* 11011110 */ 0xc0, /* 11000000 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ /* 65 0x41 'A' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ /* 66 0x42 'B' */ 0xfc, /* 11111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xfc, /* 11111100 */ 0x00, /* 00000000 */ /* 67 0x43 'C' */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 68 0x44 'D' */ 0xf8, /* 11111000 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0xf8, /* 11111000 */ 0x00, /* 00000000 */ /* 69 0x45 'E' */ 0xfe, /* 11111110 */ 0x62, /* 01100010 */ 0x68, /* 01101000 */ 0x78, /* 01111000 */ 0x68, /* 01101000 */ 0x62, /* 01100010 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ /* 70 0x46 'F' */ 0xfe, /* 11111110 */ 0x62, /* 01100010 */ 0x68, /* 01101000 */ 0x78, /* 01111000 */ 0x68, /* 01101000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ /* 71 0x47 'G' */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xce, /* 11001110 */ 0x66, /* 01100110 */ 0x3a, /* 00111010 */ 0x00, /* 00000000 */ /* 72 0x48 'H' */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ /* 73 0x49 'I' */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 74 0x4a 'J' */ 0x1e, /* 00011110 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x78, /* 01111000 */ 0x00, /* 00000000 */ /* 75 0x4b 'K' */ 0xe6, /* 11100110 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0x78, /* 01111000 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ /* 76 0x4c 'L' */ 0xf0, /* 11110000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0x62, /* 01100010 */ 0x66, /* 01100110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ /* 77 0x4d 'M' */ 0xc6, /* 11000110 */ 0xee, /* 11101110 */ 0xfe, /* 11111110 */ 0xfe, /* 11111110 */ 0xd6, /* 11010110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ /* 78 0x4e 'N' */ 0xc6, /* 11000110 */ 0xe6, /* 11100110 */ 0xf6, /* 11110110 */ 0xde, /* 11011110 */ 0xce, /* 11001110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ /* 79 0x4f 'O' */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 80 0x50 'P' */ 0xfc, /* 11111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ /* 81 0x51 'Q' */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xce, /* 11001110 */ 0x7c, /* 01111100 */ 0x0e, /* 00001110 */ /* 82 0x52 'R' */ 0xfc, /* 11111100 
*/ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x6c, /* 01101100 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ /* 83 0x53 'S' */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 84 0x54 'T' */ 0x7e, /* 01111110 */ 0x7e, /* 01111110 */ 0x5a, /* 01011010 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 85 0x55 'U' */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 86 0x56 'V' */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ /* 87 0x57 'W' */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ /* 88 0x58 'X' */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ /* 89 0x59 'Y' */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 90 0x5a 'Z' */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0x8c, /* 10001100 */ 0x18, /* 00011000 */ 0x32, /* 00110010 */ 0x66, /* 01100110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ /* 91 0x5b '[' */ 0x3c, /* 00111100 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 92 0x5c '\' */ 0xc0, /* 11000000 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x06, /* 00000110 */ 0x02, /* 00000010 */ 0x00, /* 00000000 */ /* 93 0x5d ']' */ 0x3c, /* 00111100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 94 0x5e '^' */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 95 0x5f '_' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ /* 96 0x60 '`' */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 97 0x61 'a' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 98 0x62 'b' */ 0xe0, /* 11100000 */ 0x60, /* 01100000 */ 0x7c, /* 01111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ /* 99 0x63 'c' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 100 0x64 'd' */ 0x1c, /* 00011100 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 101 0x65 'e' */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 102 0x66 'f' */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x60, /* 01100000 */ 0xf8, /* 11111000 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ /* 103 0x67 'g' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x7c, /* 01111100 */ 0x0c, /* 00001100 */ 0xf8, /* 11111000 */ /* 104 0x68 'h' */ 0xe0, /* 11100000 */ 0x60, /* 01100000 */ 0x6c, /* 01101100 */ 0x76, /* 01110110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ /* 105 0x69 'i' */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 106 0x6a 'j' */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ /* 107 0x6b 'k' */ 0xe0, /* 11100000 */ 0x60, /* 01100000 */ 0x66, /* 01100110 */ 0x6c, /* 01101100 */ 0x78, /* 01111000 */ 0x6c, /* 01101100 */ 0xe6, /* 11100110 */ 0x00, /* 00000000 */ /* 108 0x6c 'l' */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 109 0x6d 'm' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xec, /* 11101100 */ 0xfe, /* 11111110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0x00, /* 00000000 */ /* 110 0x6e 'n' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ /* 111 0x6f 'o' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 112 0x70 'p' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ /* 113 0x71 'q' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x7c, /* 01111100 */ 0x0c, /* 00001100 */ 0x1e, /* 00011110 */ /* 114 0x72 'r' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x76, /* 01110110 */ 0x60, /* 01100000 */ 0x60, /* 01100000 */ 0xf0, /* 11110000 */ 0x00, /* 00000000 */ /* 115 0x73 's' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0xc0, /* 11000000 */ 0x7c, /* 01111100 */ 0x06, /* 00000110 */ 0xfc, /* 11111100 */ 0x00, /* 00000000 */ /* 116 0x74 't' */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0xfc, /* 11111100 */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x36, /* 00110110 */ 0x1c, /* 00011100 */ 0x00, /* 00000000 */ /* 117 0x75 'u' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 118 0x76 'v' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ /* 119 0x77 'w' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xd6, /* 11010110 */ 0xd6, /* 11010110 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ /* 120 0x78 'x' */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ /* 121 0x79 'y' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7e, /* 01111110 */ 0x06, /* 00000110 */ 0xfc, /* 11111100 */ /* 122 0x7a 'z' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x4c, /* 01001100 */ 0x18, /* 00011000 */ 0x32, /* 00110010 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ /* 123 0x7b '{' */ 0x0e, /* 00001110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x70, /* 01110000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x0e, /* 00001110 */ 0x00, /* 00000000 */ /* 124 0x7c '|' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 125 0x7d '}' */ 0x70, /* 01110000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x0e, /* 00001110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ /* 126 0x7e '~' */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 127 0x7f '' */ 0x00, /* 00000000 */ 0x10, /* 00010000 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ /* 128 0x80 '€' */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x0c, /* 00001100 */ 0x78, /* 01111000 */ /* 129 0x81 '' */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 130 0x82 '‚' */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 131 0x83 'ƒ' */ 0x7c, /* 01111100 */ 0x82, /* 10000010 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 132 0x84 '„' */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 133 0x85 '…' */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 134 0x86 '†' */ 0x30, /* 00110000 */ 0x30, /* 00110000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 135 0x87 '‡' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0x7e, /* 01111110 */ 0x0c, /* 00001100 */ 0x38, /* 00111000 */ /* 136 0x88 'ˆ' */ 0x7c, /* 01111100 */ 0x82, /* 10000010 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 137 0x89 '‰' */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 138 0x8a 'Š' */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 139 0x8b '‹' */ 0x66, /* 01100110 */ 0x00, /* 
00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 140 0x8c 'Œ' */ 0x7c, /* 01111100 */ 0x82, /* 10000010 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 141 0x8d '' */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 142 0x8e 'Ž' */ 0xc6, /* 11000110 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ /* 143 0x8f '' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ /* 144 0x90 '' */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xf8, /* 11111000 */ 0xc0, /* 11000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ /* 145 0x91 '‘' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0xd8, /* 11011000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ /* 146 0x92 '’' */ 0x3e, /* 00111110 */ 0x6c, /* 01101100 */ 0xcc, /* 11001100 */ 0xfe, /* 11111110 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xce, /* 11001110 */ 0x00, /* 00000000 */ /* 147 0x93 '“' */ 0x7c, /* 01111100 */ 0x82, /* 10000010 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 148 0x94 '”' */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 149 0x95 '•' */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 150 0x96 '–' */ 0x78, /* 01111000 */ 0x84, /* 10000100 */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 151 0x97 '—' */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 152 0x98 '˜' */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7e, /* 01111110 */ 0x06, /* 00000110 */ 0xfc, /* 11111100 */ /* 153 0x99 '™' */ 0xc6, /* 11000110 */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ /* 154 0x9a 'š' */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 155 0x9b '›' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 156 0x9c 'œ' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x64, /* 01100100 */ 0xf0, /* 11110000 */ 0x60, /* 01100000 */ 0x66, /* 01100110 */ 0xfc, /* 11111100 */ 0x00, /* 00000000 */ /* 157 0x9d '' */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 158 0x9e 'ž' */ 0xf8, /* 11111000 */ 0xcc, /* 
11001100 */ 0xcc, /* 11001100 */ 0xfa, /* 11111010 */ 0xc6, /* 11000110 */ 0xcf, /* 11001111 */ 0xc6, /* 11000110 */ 0xc7, /* 11000111 */ /* 159 0x9f 'Ÿ' */ 0x0e, /* 00001110 */ 0x1b, /* 00011011 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0xd8, /* 11011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ /* 160 0xa0 ' ' */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x78, /* 01111000 */ 0x0c, /* 00001100 */ 0x7c, /* 01111100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 161 0xa1 '¡' */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x38, /* 00111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 162 0xa2 '¢' */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ /* 163 0xa3 '£' */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 164 0xa4 '¤' */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0xdc, /* 11011100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x00, /* 00000000 */ /* 165 0xa5 '¥' */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0xe6, /* 11100110 */ 0xf6, /* 11110110 */ 0xde, /* 11011110 */ 0xce, /* 11001110 */ 0x00, /* 00000000 */ /* 166 0xa6 '¦' */ 0x3c, /* 00111100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x3e, /* 00111110 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 167 0xa7 '§' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 168 0xa8 '¨' */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x63, /* 01100011 */ 0x3e, /* 00111110 */ 0x00, /* 00000000 */ /* 169 0xa9 '©' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 170 0xaa 'ª' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x06, /* 00000110 */ 0x06, /* 00000110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 171 0xab '«' */ 0x63, /* 01100011 */ 0xe6, /* 11100110 */ 0x6c, /* 01101100 */ 0x7e, /* 01111110 */ 0x33, /* 00110011 */ 0x66, /* 01100110 */ 0xcc, /* 11001100 */ 0x0f, /* 00001111 */ /* 172 0xac '¬' */ 0x63, /* 01100011 */ 0xe6, /* 11100110 */ 0x6c, /* 01101100 */ 0x7a, /* 01111010 */ 0x36, /* 00110110 */ 0x6a, /* 01101010 */ 0xdf, /* 11011111 */ 0x06, /* 00000110 */ /* 173 0xad '­' */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 174 0xae '®' */ 0x00, /* 00000000 */ 0x33, /* 00110011 */ 0x66, /* 01100110 */ 0xcc, /* 11001100 */ 0x66, /* 01100110 */ 0x33, /* 00110011 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 175 0xaf '¯' */ 0x00, /* 00000000 */ 0xcc, /* 11001100 */ 0x66, /* 01100110 */ 0x33, /* 00110011 */ 0x66, /* 01100110 */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 176 0xb0 '°' */ 0x22, /* 00100010 */ 0x88, /* 10001000 */ 0x22, /* 00100010 */ 0x88, /* 10001000 */ 0x22, /* 00100010 */ 0x88, /* 10001000 */ 0x22, /* 00100010 */ 0x88, /* 10001000 */ /* 177 0xb1 '±' */ 0x55, /* 01010101 */ 0xaa, /* 
10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ 0x55, /* 01010101 */ 0xaa, /* 10101010 */ /* 178 0xb2 '²' */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ 0x77, /* 01110111 */ 0xdd, /* 11011101 */ /* 179 0xb3 '³' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 180 0xb4 '´' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 181 0xb5 'µ' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 182 0xb6 '¶' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf6, /* 11110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 183 0xb7 '·' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 184 0xb8 '¸' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 185 0xb9 '¹' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf6, /* 11110110 */ 0x06, /* 00000110 */ 0xf6, /* 11110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 186 0xba 'º' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 187 0xbb '»' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x06, /* 00000110 */ 0xf6, /* 11110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 188 0xbc '¼' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf6, /* 11110110 */ 0x06, /* 00000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 189 0xbd '½' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 190 0xbe '¾' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 191 0xbf '¿' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xf8, /* 11111000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 192 0xc0 'À' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 193 0xc1 'Á' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 194 0xc2 'Â' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 195 0xc3 'Ã' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 196 0xc4 'Ä' */ 0x00, /* 00000000 */ 0x00, /* 
00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 197 0xc5 'Å' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 198 0xc6 'Æ' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 199 0xc7 'Ç' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x37, /* 00110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 200 0xc8 'È' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x37, /* 00110111 */ 0x30, /* 00110000 */ 0x3f, /* 00111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 201 0xc9 'É' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3f, /* 00111111 */ 0x30, /* 00110000 */ 0x37, /* 00110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 202 0xca 'Ê' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf7, /* 11110111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 203 0xcb 'Ë' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xf7, /* 11110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 204 0xcc 'Ì' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x37, /* 00110111 */ 0x30, /* 00110000 */ 0x37, /* 00110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 205 0xcd 'Í' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 206 0xce 'Î' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xf7, /* 11110111 */ 0x00, /* 00000000 */ 0xf7, /* 11110111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 207 0xcf 'Ï' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 208 0xd0 'Ð' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 209 0xd1 'Ñ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 210 0xd2 'Ò' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 211 0xd3 'Ó' */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x3f, /* 00111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 212 0xd4 'Ô' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 213 0xd5 'Õ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 214 0xd6 'Ö' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3f, /* 00111111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 215 0xd7 '×' */ 0x36, /* 00110110 */ 0x36, /* 
00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0xff, /* 11111111 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ /* 216 0xd8 'Ø' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0xff, /* 11111111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 217 0xd9 'Ù' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xf8, /* 11111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 218 0xda 'Ú' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x1f, /* 00011111 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 219 0xdb 'Û' */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 220 0xdc 'Ü' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ /* 221 0xdd 'Ý' */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ 0xf0, /* 11110000 */ /* 222 0xde 'Þ' */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ 0x0f, /* 00001111 */ /* 223 0xdf 'ß' */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0xff, /* 11111111 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 224 0xe0 'à' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0xc8, /* 11001000 */ 0xdc, /* 11011100 */ 0x76, /* 01110110 */ 0x00, /* 00000000 */ /* 225 0xe1 'á' */ 0x78, /* 01111000 */ 0xcc, /* 11001100 */ 0xcc, /* 11001100 */ 0xd8, /* 11011000 */ 0xcc, /* 11001100 */ 0xc6, /* 11000110 */ 0xcc, /* 11001100 */ 0x00, /* 00000000 */ /* 226 0xe2 'â' */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0xc0, /* 11000000 */ 0x00, /* 00000000 */ /* 227 0xe3 'ã' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x00, /* 00000000 */ /* 228 0xe4 'ä' */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ /* 229 0xe5 'å' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0x70, /* 01110000 */ 0x00, /* 00000000 */ /* 230 0xe6 'æ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x7c, /* 01111100 */ 0xc0, /* 11000000 */ /* 231 0xe7 'ç' */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ /* 232 0xe8 'è' */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x3c, /* 00111100 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ /* 233 0xe9 'é' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0xc6, /* 11000110 */ 0xfe, /* 11111110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ /* 234 0xea 'ê' */ 0x38, /* 00111000 */ 0x6c, /* 
01101100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0xee, /* 11101110 */ 0x00, /* 00000000 */ /* 235 0xeb 'ë' */ 0x0e, /* 00001110 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x3e, /* 00111110 */ 0x66, /* 01100110 */ 0x66, /* 01100110 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ /* 236 0xec 'ì' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 237 0xed 'í' */ 0x06, /* 00000110 */ 0x0c, /* 00001100 */ 0x7e, /* 01111110 */ 0xdb, /* 11011011 */ 0xdb, /* 11011011 */ 0x7e, /* 01111110 */ 0x60, /* 01100000 */ 0xc0, /* 11000000 */ /* 238 0xee 'î' */ 0x1e, /* 00011110 */ 0x30, /* 00110000 */ 0x60, /* 01100000 */ 0x7e, /* 01111110 */ 0x60, /* 01100000 */ 0x30, /* 00110000 */ 0x1e, /* 00011110 */ 0x00, /* 00000000 */ /* 239 0xef 'ï' */ 0x00, /* 00000000 */ 0x7c, /* 01111100 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0xc6, /* 11000110 */ 0x00, /* 00000000 */ /* 240 0xf0 'ð' */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0xfe, /* 11111110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 241 0xf1 'ñ' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x7e, /* 01111110 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ /* 242 0xf2 'ò' */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ /* 243 0xf3 'ó' */ 0x0c, /* 00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x18, /* 00011000 */ 0x0c, /* 00001100 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ /* 244 0xf4 'ô' */ 0x0e, /* 00001110 */ 0x1b, /* 00011011 */ 0x1b, /* 00011011 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ /* 245 0xf5 'õ' */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0xd8, /* 11011000 */ 0xd8, /* 11011000 */ 0x70, /* 01110000 */ /* 246 0xf6 'ö' */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x7e, /* 01111110 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 247 0xf7 '÷' */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0x76, /* 01110110 */ 0xdc, /* 11011100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 248 0xf8 'ø' */ 0x38, /* 00111000 */ 0x6c, /* 01101100 */ 0x6c, /* 01101100 */ 0x38, /* 00111000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 249 0xf9 'ù' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 250 0xfa 'ú' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x18, /* 00011000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 251 0xfb 'û' */ 0x0f, /* 00001111 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0x0c, /* 00001100 */ 0xec, /* 11101100 */ 0x6c, /* 01101100 */ 0x3c, /* 00111100 */ 0x1c, /* 00011100 */ /* 252 0xfc 'ü' */ 0x6c, /* 01101100 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x36, /* 00110110 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 253 0xfd 'ý' */ 0x78, /* 01111000 */ 0x0c, /* 
00001100 */ 0x18, /* 00011000 */ 0x30, /* 00110000 */ 0x7c, /* 01111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 254 0xfe 'þ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x3c, /* 00111100 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ /* 255 0xff 'ÿ' */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ 0x00, /* 00000000 */ }; const struct font_desc font_vga_8x8 = { "VGA8x8", 8, 8, sizeof(fontdata_8x8) / 8, fontdata_8x8 }; xen-4.4.0/xen/drivers/pci/0000775000175000017500000000000012307313555013466 5ustar smbsmbxen-4.4.0/xen/drivers/pci/Makefile0000664000175000017500000000001712307313555015124 0ustar smbsmbobj-y += pci.o xen-4.4.0/xen/drivers/pci/pci.c0000664000175000017500000000711112307313555014405 0ustar smbsmb/****************************************************************************** * pci.c * * Architecture-independent PCI access functions. */ #include #include #include int pci_find_cap_offset(u16 seg, u8 bus, u8 dev, u8 func, u8 cap) { u8 id; int max_cap = 48; u8 pos = PCI_CAPABILITY_LIST; u16 status; status = pci_conf_read16(seg, bus, dev, func, PCI_STATUS); if ( (status & PCI_STATUS_CAP_LIST) == 0 ) return 0; while ( max_cap-- ) { pos = pci_conf_read8(seg, bus, dev, func, pos); if ( pos < 0x40 ) break; pos &= ~3; id = pci_conf_read8(seg, bus, dev, func, pos + PCI_CAP_LIST_ID); if ( id == 0xff ) break; else if ( id == cap ) return pos; pos += PCI_CAP_LIST_NEXT; } return 0; } int pci_find_next_cap(u16 seg, u8 bus, unsigned int devfn, u8 pos, int cap) { u8 id; int ttl = 48; while ( ttl-- ) { pos = pci_conf_read8(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos); if ( pos < 0x40 ) break; pos &= ~3; id = pci_conf_read8(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos + PCI_CAP_LIST_ID); if ( id == 0xff ) break; if ( id == cap ) return pos; pos += PCI_CAP_LIST_NEXT; } return 0; } /** * pci_find_ext_capability - Find an extended capability * @dev: PCI device to query * @cap: capability code * * Returns the address of the requested extended capability structure * within the device's PCI configuration space or 0 if the device does * not support it. Possible values for @cap: * * %PCI_EXT_CAP_ID_ERR Advanced Error Reporting * %PCI_EXT_CAP_ID_VC Virtual Channel * %PCI_EXT_CAP_ID_DSN Device Serial Number * %PCI_EXT_CAP_ID_PWR Power Budgeting */ int pci_find_ext_capability(int seg, int bus, int devfn, int cap) { u32 header; int ttl = 480; /* 3840 bytes, minimum 8 bytes per capability */ int pos = 0x100; header = pci_conf_read32(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos); /* * If we have no capabilities, this is indicated by cap ID, * cap version and next pointer all being 0. */ if ( (header == 0) || (header == -1) ) return 0; while ( ttl-- > 0 ) { if ( PCI_EXT_CAP_ID(header) == cap ) return pos; pos = PCI_EXT_CAP_NEXT(header); if ( pos < 0x100 ) break; header = pci_conf_read32(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos); } return 0; } const char *__init parse_pci(const char *s, unsigned int *seg_p, unsigned int *bus_p, unsigned int *dev_p, unsigned int *func_p) { unsigned long seg = simple_strtoul(s, &s, 16), bus, dev, func; if ( *s != ':' ) return NULL; bus = simple_strtoul(s + 1, &s, 16); if ( *s == ':' ) dev = simple_strtoul(s + 1, &s, 16); else { dev = bus; bus = seg; seg = 0; } if ( func_p ) { if ( *s != '.' ) return NULL; func = simple_strtoul(s + 1, &s, 0); } else func = 0; if ( seg != (seg_p ? 
(u16)seg : 0) || bus != PCI_BUS(PCI_BDF2(bus, 0)) || dev != PCI_SLOT(PCI_DEVFN(dev, 0)) || func != PCI_FUNC(PCI_DEVFN(0, func)) ) return NULL; if ( seg_p ) *seg_p = seg; *bus_p = bus; *dev_p = dev; if ( func_p ) *func_p = func; return s; } xen-4.4.0/xen/drivers/passthrough/0000775000175000017500000000000012307313555015262 5ustar smbsmbxen-4.4.0/xen/drivers/passthrough/vtd/0000775000175000017500000000000012307313555016057 5ustar smbsmbxen-4.4.0/xen/drivers/passthrough/vtd/Makefile0000664000175000017500000000020012307313555017507 0ustar smbsmbsubdir-$(x86) += x86 obj-y += iommu.o obj-y += dmar.o obj-y += utils.o obj-y += qinval.o obj-y += intremap.o obj-y += quirks.o xen-4.4.0/xen/drivers/passthrough/vtd/iommu.c0000664000175000017500000021200012307313555017344 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Copyright (C) Ashok Raj * Copyright (C) Shaohua Li * Copyright (C) Allen Kay - adapted to xen */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iommu.h" #include "dmar.h" #include "extern.h" #include "vtd.h" #include "../ats.h" /* Possible unfiltered LAPIC/MSI messages from untrusted sources? 
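* Set by reassign_device_ownership() below when a device is handed to a domU while interrupt remapping is disabled.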
*/ bool_t __read_mostly untrusted_msi; int nr_iommus; static struct tasklet vtd_fault_tasklet; static int setup_dom0_device(u8 devfn, struct pci_dev *); static void setup_dom0_rmrr(struct domain *d); static int domain_iommu_domid(struct domain *d, struct iommu *iommu) { unsigned long nr_dom, i; nr_dom = cap_ndoms(iommu->cap); i = find_first_bit(iommu->domid_bitmap, nr_dom); while ( i < nr_dom ) { if ( iommu->domid_map[i] == d->domain_id ) return i; i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1); } dprintk(XENLOG_ERR VTDPREFIX, "Cannot get valid iommu domid: domid=%d iommu->index=%d\n", d->domain_id, iommu->index); return -1; } #define DID_FIELD_WIDTH 16 #define DID_HIGH_OFFSET 8 static int context_set_domain_id(struct context_entry *context, struct domain *d, struct iommu *iommu) { unsigned long nr_dom, i; int found = 0; ASSERT(spin_is_locked(&iommu->lock)); nr_dom = cap_ndoms(iommu->cap); i = find_first_bit(iommu->domid_bitmap, nr_dom); while ( i < nr_dom ) { if ( iommu->domid_map[i] == d->domain_id ) { found = 1; break; } i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1); } if ( found == 0 ) { i = find_first_zero_bit(iommu->domid_bitmap, nr_dom); if ( i >= nr_dom ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n"); return -EFAULT; } iommu->domid_map[i] = d->domain_id; } set_bit(i, iommu->domid_bitmap); context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET; return 0; } static int context_get_domain_id(struct context_entry *context, struct iommu *iommu) { unsigned long dom_index, nr_dom; int domid = -1; if (iommu && context) { nr_dom = cap_ndoms(iommu->cap); dom_index = context_domain_id(*context); if ( dom_index < nr_dom && iommu->domid_map) domid = iommu->domid_map[dom_index]; else dprintk(XENLOG_DEBUG VTDPREFIX, "%s: dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n", __func__, dom_index, nr_dom); } return domid; } static struct intel_iommu *__init alloc_intel_iommu(void) { struct intel_iommu *intel; intel = xzalloc(struct intel_iommu); if ( intel == NULL ) return NULL; spin_lock_init(&intel->qi_ctrl.qinval_lock); spin_lock_init(&intel->ir_ctrl.iremap_lock); return intel; } static void __init free_intel_iommu(struct intel_iommu *intel) { xfree(intel); } static int iommus_incoherent; static void __iommu_flush_cache(void *addr, unsigned int size) { int i; static unsigned int clflush_size = 0; if ( !iommus_incoherent ) return; if ( clflush_size == 0 ) clflush_size = get_cache_line_size(); for ( i = 0; i < size; i += clflush_size ) cacheline_flush((char *)addr + i); } void iommu_flush_cache_entry(void *addr, unsigned int size) { __iommu_flush_cache(addr, size); } void iommu_flush_cache_page(void *addr, unsigned long npages) { __iommu_flush_cache(addr, PAGE_SIZE * npages); } /* Allocate page table, return its machine address */ u64 alloc_pgtable_maddr(struct acpi_drhd_unit *drhd, unsigned long npages) { struct acpi_rhsa_unit *rhsa; struct page_info *pg, *cur_pg; u64 *vaddr; int node = -1, i; rhsa = drhd_to_rhsa(drhd); if ( rhsa ) node = pxm_to_node(rhsa->proximity_domain); pg = alloc_domheap_pages(NULL, get_order_from_pages(npages), (node == -1 ) ? 
0 : MEMF_node(node)); if ( !pg ) return 0; cur_pg = pg; for ( i = 0; i < npages; i++ ) { vaddr = __map_domain_page(cur_pg); memset(vaddr, 0, PAGE_SIZE); iommu_flush_cache_page(vaddr, 1); unmap_domain_page(vaddr); cur_pg++; } return page_to_maddr(pg); } void free_pgtable_maddr(u64 maddr) { if ( maddr != 0 ) free_domheap_page(maddr_to_page(maddr)); } /* context entry handling */ static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus) { struct acpi_drhd_unit *drhd; struct root_entry *root, *root_entries; u64 maddr; ASSERT(spin_is_locked(&iommu->lock)); root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr); root = &root_entries[bus]; if ( !root_present(*root) ) { drhd = iommu_to_drhd(iommu); maddr = alloc_pgtable_maddr(drhd, 1); if ( maddr == 0 ) { unmap_vtd_domain_page(root_entries); return 0; } set_root_value(*root, maddr); set_root_present(*root); iommu_flush_cache_entry(root, sizeof(struct root_entry)); } maddr = (u64) get_context_addr(*root); unmap_vtd_domain_page(root_entries); return maddr; } static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc) { struct acpi_drhd_unit *drhd; struct pci_dev *pdev; struct hvm_iommu *hd = domain_hvm_iommu(domain); int addr_width = agaw_to_width(hd->agaw); struct dma_pte *parent, *pte = NULL; int level = agaw_to_level(hd->agaw); int offset; u64 pte_maddr = 0, maddr; u64 *vaddr = NULL; addr &= (((u64)1) << addr_width) - 1; ASSERT(spin_is_locked(&hd->mapping_lock)); if ( hd->pgd_maddr == 0 ) { /* * just get any passthrough device in the domainr - assume user * assigns only devices from same node to a given guest. */ pdev = pci_get_pdev_by_domain(domain, -1, -1, -1); drhd = acpi_find_matched_drhd_unit(pdev); if ( !alloc || ((hd->pgd_maddr = alloc_pgtable_maddr(drhd, 1)) == 0) ) goto out; } parent = (struct dma_pte *)map_vtd_domain_page(hd->pgd_maddr); while ( level > 1 ) { offset = address_level_offset(addr, level); pte = &parent[offset]; if ( dma_pte_addr(*pte) == 0 ) { if ( !alloc ) break; pdev = pci_get_pdev_by_domain(domain, -1, -1, -1); drhd = acpi_find_matched_drhd_unit(pdev); maddr = alloc_pgtable_maddr(drhd, 1); if ( !maddr ) break; dma_set_pte_addr(*pte, maddr); vaddr = map_vtd_domain_page(maddr); /* * high level table always sets r/w, last level * page table control read/write */ dma_set_pte_readable(*pte); dma_set_pte_writable(*pte); iommu_flush_cache_entry(pte, sizeof(struct dma_pte)); } else { vaddr = map_vtd_domain_page(pte->val); } if ( level == 2 ) { pte_maddr = pte->val & PAGE_MASK_4K; unmap_vtd_domain_page(vaddr); break; } unmap_vtd_domain_page(parent); parent = (struct dma_pte *)vaddr; vaddr = NULL; level--; } unmap_vtd_domain_page(parent); out: return pte_maddr; } static void iommu_flush_write_buffer(struct iommu *iommu) { u32 val; unsigned long flags; if ( !rwbf_quirk && !cap_rwbf(iommu->cap) ) return; spin_lock_irqsave(&iommu->register_lock, flags); val = dmar_readl(iommu->reg, DMAR_GSTS_REG); dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, !(val & DMA_GSTS_WBFS), val); spin_unlock_irqrestore(&iommu->register_lock, flags); } /* return value determine if we need a write buffer flush */ static int flush_context_reg( void *_iommu, u16 did, u16 source_id, u8 function_mask, u64 type, int flush_non_present_entry) { struct iommu *iommu = (struct iommu *) _iommu; u64 val = 0; unsigned long flags; /* * In the non-present entry flush case, if hardware doesn't cache * non-present entry we do nothing and 
if hardware cache non-present * entry, we flush entries of domain 0 (the domain id is used to cache * any non-present entries) */ if ( flush_non_present_entry ) { if ( !cap_caching_mode(iommu->cap) ) return 1; else did = 0; } /* use register invalidation */ switch ( type ) { case DMA_CCMD_GLOBAL_INVL: val = DMA_CCMD_GLOBAL_INVL; break; case DMA_CCMD_DOMAIN_INVL: val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did); break; case DMA_CCMD_DEVICE_INVL: val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did) |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask); break; default: BUG(); } val |= DMA_CCMD_ICC; spin_lock_irqsave(&iommu->register_lock, flags); dmar_writeq(iommu->reg, DMAR_CCMD_REG, val); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq, !(val & DMA_CCMD_ICC), val); spin_unlock_irqrestore(&iommu->register_lock, flags); /* flush context entry will implicitly flush write buffer */ return 0; } static int iommu_flush_context_global( struct iommu *iommu, int flush_non_present_entry) { struct iommu_flush *flush = iommu_get_flush(iommu); return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL, flush_non_present_entry); } static int iommu_flush_context_device( struct iommu *iommu, u16 did, u16 source_id, u8 function_mask, int flush_non_present_entry) { struct iommu_flush *flush = iommu_get_flush(iommu); return flush->context(iommu, did, source_id, function_mask, DMA_CCMD_DEVICE_INVL, flush_non_present_entry); } /* return value determine if we need a write buffer flush */ static int flush_iotlb_reg(void *_iommu, u16 did, u64 addr, unsigned int size_order, u64 type, int flush_non_present_entry, int flush_dev_iotlb) { struct iommu *iommu = (struct iommu *) _iommu; int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; unsigned long flags; /* * In the non-present entry flush case, if hardware doesn't cache * non-present entry we do nothing and if hardware cache non-present * entry, we flush entries of domain 0 (the domain id is used to cache * any non-present entries) */ if ( flush_non_present_entry ) { if ( !cap_caching_mode(iommu->cap) ) return 1; else did = 0; } /* use register invalidation */ switch ( type ) { case DMA_TLB_GLOBAL_FLUSH: /* global flush doesn't need set IVA_REG */ val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT; break; case DMA_TLB_DSI_FLUSH: val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); break; case DMA_TLB_PSI_FLUSH: val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did); /* Note: always flush non-leaf currently */ val_iva = size_order | addr; break; default: BUG(); } /* Note: set drain read/write */ if ( cap_read_drain(iommu->cap) ) val |= DMA_TLB_READ_DRAIN; if ( cap_write_drain(iommu->cap) ) val |= DMA_TLB_WRITE_DRAIN; spin_lock_irqsave(&iommu->register_lock, flags); /* Note: Only uses first TLB reg currently */ if ( val_iva ) dmar_writeq(iommu->reg, tlb_offset, val_iva); dmar_writeq(iommu->reg, tlb_offset + 8, val); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq, !(val & DMA_TLB_IVT), val); spin_unlock_irqrestore(&iommu->register_lock, flags); /* check IOTLB invalidation granularity */ if ( DMA_TLB_IAIG(val) == 0 ) dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n"); /* flush iotlb entry will implicitly flush write buffer */ return 0; } static int iommu_flush_iotlb_global(struct iommu *iommu, int flush_non_present_entry, int flush_dev_iotlb) { struct iommu_flush *flush = iommu_get_flush(iommu); int status; /* apply platform specific errata workarounds */ vtd_ops_preamble_quirk(iommu); 
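/* Global invalidation: the domain id, address and order arguments are ignored for DMA_TLB_GLOBAL_FLUSH, so zeroes are passed. */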
status = flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, flush_non_present_entry, flush_dev_iotlb); /* undo platform specific errata workarounds */ vtd_ops_postamble_quirk(iommu); return status; } static int iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did, int flush_non_present_entry, int flush_dev_iotlb) { struct iommu_flush *flush = iommu_get_flush(iommu); int status; /* apply platform specific errata workarounds */ vtd_ops_preamble_quirk(iommu); status = flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, flush_non_present_entry, flush_dev_iotlb); /* undo platform specific errata workarounds */ vtd_ops_postamble_quirk(iommu); return status; } static int iommu_flush_iotlb_psi( struct iommu *iommu, u16 did, u64 addr, unsigned int order, int flush_non_present_entry, int flush_dev_iotlb) { struct iommu_flush *flush = iommu_get_flush(iommu); int status; ASSERT(!(addr & (~PAGE_MASK_4K))); /* Fallback to domain selective flush if no PSI support */ if ( !cap_pgsel_inv(iommu->cap) ) return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb); /* Fallback to domain selective flush if size is too big */ if ( order > cap_max_amask_val(iommu->cap) ) return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb); addr >>= PAGE_SHIFT_4K + order; addr <<= PAGE_SHIFT_4K + order; /* apply platform specific errata workarounds */ vtd_ops_preamble_quirk(iommu); status = flush->iotlb(iommu, did, addr, order, DMA_TLB_PSI_FLUSH, flush_non_present_entry, flush_dev_iotlb); /* undo platform specific errata workarounds */ vtd_ops_postamble_quirk(iommu); return status; } static void iommu_flush_all(void) { struct acpi_drhd_unit *drhd; struct iommu *iommu; int flush_dev_iotlb; flush_all_cache(); for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; iommu_flush_context_global(iommu, 0); flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0; iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb); } } static void __intel_iommu_iotlb_flush(struct domain *d, unsigned long gfn, int dma_old_pte_present, unsigned int page_count) { struct hvm_iommu *hd = domain_hvm_iommu(d); struct acpi_drhd_unit *drhd; struct iommu *iommu; int flush_dev_iotlb; int iommu_domid; /* * No need pcideves_lock here because we have flush * when assign/deassign device */ for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; if ( !test_bit(iommu->index, &hd->iommu_bitmap) ) continue; flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 
1 : 0; iommu_domid= domain_iommu_domid(d, iommu); if ( iommu_domid == -1 ) continue; if ( page_count > 1 || gfn == -1 ) { if ( iommu_flush_iotlb_dsi(iommu, iommu_domid, 0, flush_dev_iotlb) ) iommu_flush_write_buffer(iommu); } else { if ( iommu_flush_iotlb_psi(iommu, iommu_domid, (paddr_t)gfn << PAGE_SHIFT_4K, 0, !dma_old_pte_present, flush_dev_iotlb) ) iommu_flush_write_buffer(iommu); } } } static void intel_iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int page_count) { __intel_iommu_iotlb_flush(d, gfn, 1, page_count); } static void intel_iommu_iotlb_flush_all(struct domain *d) { __intel_iommu_iotlb_flush(d, 0, 0, 0); } /* clear one page's page table */ static void dma_pte_clear_one(struct domain *domain, u64 addr) { struct hvm_iommu *hd = domain_hvm_iommu(domain); struct dma_pte *page = NULL, *pte = NULL; u64 pg_maddr; struct mapped_rmrr *mrmrr; spin_lock(&hd->mapping_lock); /* get last level pte */ pg_maddr = addr_to_dma_page_maddr(domain, addr, 0); if ( pg_maddr == 0 ) { spin_unlock(&hd->mapping_lock); return; } page = (struct dma_pte *)map_vtd_domain_page(pg_maddr); pte = page + address_level_offset(addr, 1); if ( !dma_pte_present(*pte) ) { spin_unlock(&hd->mapping_lock); unmap_vtd_domain_page(page); return; } dma_clear_pte(*pte); spin_unlock(&hd->mapping_lock); iommu_flush_cache_entry(pte, sizeof(struct dma_pte)); if ( !this_cpu(iommu_dont_flush_iotlb) ) __intel_iommu_iotlb_flush(domain, addr >> PAGE_SHIFT_4K, 1, 1); unmap_vtd_domain_page(page); /* if the cleared address is between mapped RMRR region, * remove the mapped RMRR */ spin_lock(&hd->mapping_lock); list_for_each_entry ( mrmrr, &hd->mapped_rmrrs, list ) { if ( addr >= mrmrr->base && addr <= mrmrr->end ) { list_del(&mrmrr->list); xfree(mrmrr); break; } } spin_unlock(&hd->mapping_lock); } static void iommu_free_pagetable(u64 pt_maddr, int level) { struct page_info *pg = maddr_to_page(pt_maddr); if ( pt_maddr == 0 ) return; PFN_ORDER(pg) = level; spin_lock(&iommu_pt_cleanup_lock); page_list_add_tail(pg, &iommu_pt_cleanup_list); spin_unlock(&iommu_pt_cleanup_lock); } static void iommu_free_page_table(struct page_info *pg) { unsigned int i, next_level = PFN_ORDER(pg) - 1; u64 pt_maddr = page_to_maddr(pg); struct dma_pte *pt_vaddr, *pte; PFN_ORDER(pg) = 0; pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr); for ( i = 0; i < PTE_NUM; i++ ) { pte = &pt_vaddr[i]; if ( !dma_pte_present(*pte) ) continue; if ( next_level >= 1 ) iommu_free_pagetable(dma_pte_addr(*pte), next_level); dma_clear_pte(*pte); iommu_flush_cache_entry(pte, sizeof(struct dma_pte)); } unmap_vtd_domain_page(pt_vaddr); free_pgtable_maddr(pt_maddr); } static int iommu_set_root_entry(struct iommu *iommu) { u32 sts; unsigned long flags; spin_lock_irqsave(&iommu->register_lock, flags); dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr); sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_SRTP); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, (sts & DMA_GSTS_RTPS), sts); spin_unlock_irqrestore(&iommu->register_lock, flags); return 0; } static void iommu_enable_translation(struct acpi_drhd_unit *drhd) { u32 sts; unsigned long flags; struct iommu *iommu = drhd->iommu; if ( is_igd_drhd(drhd) && !is_igd_vt_enabled_quirk() ) { if ( force_iommu ) panic("BIOS did not enable IGD for VT properly, crash Xen for security purpose"); else { dprintk(XENLOG_WARNING VTDPREFIX, "BIOS did not enable IGD for VT properly. 
Disabling IGD VT-d engine.\n"); return; } } /* apply platform specific errata workarounds */ vtd_ops_preamble_quirk(iommu); if ( iommu_verbose ) dprintk(VTDPREFIX, "iommu_enable_translation: iommu->reg = %p\n", iommu->reg); spin_lock_irqsave(&iommu->register_lock, flags); sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_TE); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, (sts & DMA_GSTS_TES), sts); spin_unlock_irqrestore(&iommu->register_lock, flags); /* undo platform specific errata workarounds */ vtd_ops_postamble_quirk(iommu); /* Disable PMRs when VT-d engine takes effect per spec definition */ disable_pmr(iommu); } static void iommu_disable_translation(struct iommu *iommu) { u32 sts; unsigned long flags; /* apply platform specific errata workarounds */ vtd_ops_preamble_quirk(iommu); spin_lock_irqsave(&iommu->register_lock, flags); sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_TE)); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, !(sts & DMA_GSTS_TES), sts); spin_unlock_irqrestore(&iommu->register_lock, flags); /* undo platform specific errata workarounds */ vtd_ops_postamble_quirk(iommu); } enum faulttype { DMA_REMAP, INTR_REMAP, UNKNOWN, }; static const char *dma_remap_fault_reasons[] = { "Software", "Present bit in root entry is clear", "Present bit in context entry is clear", "Invalid context entry", "Access beyond MGAW", "PTE Write access is not set", "PTE Read access is not set", "Next page table ptr is invalid", "Root table address invalid", "Context table ptr is invalid", "non-zero reserved fields in RTP", "non-zero reserved fields in CTP", "non-zero reserved fields in PTE", "Blocked a DMA translation request", }; static const char *intr_remap_fault_reasons[] = { "Detected reserved fields in the decoded interrupt-remapped request", "Interrupt index exceeded the interrupt-remapping table size", "Present field in the IRTE entry is clear", "Error accessing interrupt-remapping table pointed by IRTA_REG", "Detected reserved fields in the IRTE entry", "Blocked a compatibility format interrupt request", "Blocked an interrupt request due to source-id verification failure", }; static const char *iommu_get_fault_reason(u8 fault_reason, int *fault_type) { if ( fault_reason >= 0x20 && ( fault_reason < 0x20 + ARRAY_SIZE(intr_remap_fault_reasons)) ) { *fault_type = INTR_REMAP; return intr_remap_fault_reasons[fault_reason - 0x20]; } else if ( fault_reason < ARRAY_SIZE(dma_remap_fault_reasons) ) { *fault_type = DMA_REMAP; return dma_remap_fault_reasons[fault_reason]; } else { *fault_type = UNKNOWN; return "Unknown"; } } static int iommu_page_fault_do_one(struct iommu *iommu, int type, u8 fault_reason, u16 source_id, u64 addr) { const char *reason; int fault_type; u16 seg = iommu->intel->drhd->segment; reason = iommu_get_fault_reason(fault_reason, &fault_type); if ( fault_type == DMA_REMAP ) { INTEL_IOMMU_DEBUG( "DMAR:[%s] Request device [%04x:%02x:%02x.%u] " "fault addr %"PRIx64", iommu reg = %p\n" "DMAR:[fault reason %02xh] %s\n", (type ? 
"DMA Read" : "DMA Write"), seg, (source_id >> 8), PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr, iommu->reg, fault_reason, reason); if (iommu_debug) print_vtd_entries(iommu, (source_id >> 8), (source_id & 0xff), (addr >> PAGE_SHIFT)); } else INTEL_IOMMU_DEBUG( "INTR-REMAP: Request device [%04x:%02x:%02x.%u] " "fault index %"PRIx64", iommu reg = %p\n" "INTR-REMAP:[fault reason %02xh] %s\n", seg, (source_id >> 8), PCI_SLOT(source_id & 0xFF), PCI_FUNC(source_id & 0xFF), addr >> 48, iommu->reg, fault_reason, reason); return 0; } static void iommu_fault_status(u32 fault_status) { if ( fault_status & DMA_FSTS_PFO ) INTEL_IOMMU_DEBUG("iommu_fault_status: Fault Overflow\n"); if ( fault_status & DMA_FSTS_PPF ) INTEL_IOMMU_DEBUG("iommu_fault_status: Primary Pending Fault\n"); if ( fault_status & DMA_FSTS_AFO ) INTEL_IOMMU_DEBUG("iommu_fault_status: Advanced Fault Overflow\n"); if ( fault_status & DMA_FSTS_APF ) INTEL_IOMMU_DEBUG("iommu_fault_status: Advanced Pending Fault\n"); if ( fault_status & DMA_FSTS_IQE ) INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Queue Error\n"); if ( fault_status & DMA_FSTS_ICE ) INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Completion Error\n"); if ( fault_status & DMA_FSTS_ITE ) INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Time-out Error\n"); } #define PRIMARY_FAULT_REG_LEN (16) static void __do_iommu_page_fault(struct iommu *iommu) { int reg, fault_index; u32 fault_status; unsigned long flags; fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG); iommu_fault_status(fault_status); /* FIXME: ignore advanced fault log */ if ( !(fault_status & DMA_FSTS_PPF) ) goto clear_overflow; fault_index = dma_fsts_fault_record_index(fault_status); reg = cap_fault_reg_offset(iommu->cap); while (1) { u8 fault_reason; u16 source_id; u32 data; u64 guest_addr; int type; /* highest 32 bits */ spin_lock_irqsave(&iommu->register_lock, flags); data = dmar_readl(iommu->reg, reg + fault_index * PRIMARY_FAULT_REG_LEN + 12); if ( !(data & DMA_FRCD_F) ) { spin_unlock_irqrestore(&iommu->register_lock, flags); break; } fault_reason = dma_frcd_fault_reason(data); type = dma_frcd_type(data); data = dmar_readl(iommu->reg, reg + fault_index * PRIMARY_FAULT_REG_LEN + 8); source_id = dma_frcd_source_id(data); guest_addr = dmar_readq(iommu->reg, reg + fault_index * PRIMARY_FAULT_REG_LEN); guest_addr = dma_frcd_page_addr(guest_addr); /* clear the fault */ dmar_writel(iommu->reg, reg + fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F); spin_unlock_irqrestore(&iommu->register_lock, flags); iommu_page_fault_do_one(iommu, type, fault_reason, source_id, guest_addr); pci_check_disable_device(iommu->intel->drhd->segment, PCI_BUS(source_id), PCI_DEVFN2(source_id)); fault_index++; if ( fault_index > cap_num_fault_regs(iommu->cap) ) fault_index = 0; } clear_overflow: /* clear primary fault overflow */ fault_status = readl(iommu->reg + DMAR_FSTS_REG); if ( fault_status & DMA_FSTS_PFO ) { spin_lock_irqsave(&iommu->register_lock, flags); dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO); spin_unlock_irqrestore(&iommu->register_lock, flags); } } static void do_iommu_page_fault(unsigned long data) { struct acpi_drhd_unit *drhd; if ( list_empty(&acpi_drhd_units) ) { INTEL_IOMMU_DEBUG("no device found, something must be very wrong!\n"); return; } /* * No matter from whom the interrupt came from, check all the * IOMMUs present in the system. 
This allows for having just one * tasklet (instead of one per each IOMMUs) and should be more than * fine, considering how rare the event of a fault should be. */ for_each_drhd_unit ( drhd ) __do_iommu_page_fault(drhd->iommu); } static void iommu_page_fault(int irq, void *dev_id, struct cpu_user_regs *regs) { /* * Just flag the tasklet as runnable. This is fine, according to VT-d * specs since a new interrupt won't be generated until we clear all * the faults that caused this one to happen. */ tasklet_schedule(&vtd_fault_tasklet); } static void dma_msi_unmask(struct irq_desc *desc) { struct iommu *iommu = desc->action->dev_id; unsigned long flags; /* unmask it */ spin_lock_irqsave(&iommu->register_lock, flags); dmar_writel(iommu->reg, DMAR_FECTL_REG, 0); spin_unlock_irqrestore(&iommu->register_lock, flags); iommu->msi.msi_attrib.masked = 0; } static void dma_msi_mask(struct irq_desc *desc) { unsigned long flags; struct iommu *iommu = desc->action->dev_id; /* mask it */ spin_lock_irqsave(&iommu->register_lock, flags); dmar_writel(iommu->reg, DMAR_FECTL_REG, DMA_FECTL_IM); spin_unlock_irqrestore(&iommu->register_lock, flags); iommu->msi.msi_attrib.masked = 1; } static unsigned int dma_msi_startup(struct irq_desc *desc) { dma_msi_unmask(desc); return 0; } static void dma_msi_ack(struct irq_desc *desc) { irq_complete_move(desc); dma_msi_mask(desc); move_masked_irq(desc); } static void dma_msi_end(struct irq_desc *desc, u8 vector) { dma_msi_unmask(desc); ack_APIC_irq(); } static void dma_msi_set_affinity(struct irq_desc *desc, const cpumask_t *mask) { struct msi_msg msg; unsigned int dest; unsigned long flags; struct iommu *iommu = desc->action->dev_id; dest = set_desc_affinity(desc, mask); if (dest == BAD_APICID){ dprintk(XENLOG_ERR VTDPREFIX, "Set iommu interrupt affinity error!\n"); return; } msi_compose_msg(desc->arch.vector, desc->arch.cpu_mask, &msg); /* Are these overrides really needed? */ if (x2apic_enabled) msg.address_hi = dest & 0xFFFFFF00; msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest & 0xff); iommu->msi.msg = msg; spin_lock_irqsave(&iommu->register_lock, flags); dmar_writel(iommu->reg, DMAR_FEDATA_REG, msg.data); dmar_writel(iommu->reg, DMAR_FEADDR_REG, msg.address_lo); dmar_writel(iommu->reg, DMAR_FEUADDR_REG, msg.address_hi); spin_unlock_irqrestore(&iommu->register_lock, flags); } static hw_irq_controller dma_msi_type = { .typename = "DMA_MSI", .startup = dma_msi_startup, .shutdown = dma_msi_mask, .enable = dma_msi_unmask, .disable = dma_msi_mask, .ack = dma_msi_ack, .end = dma_msi_end, .set_affinity = dma_msi_set_affinity, }; static int __init iommu_set_interrupt(struct acpi_drhd_unit *drhd) { int irq, ret; struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd); struct iommu *iommu = drhd->iommu; struct irq_desc *desc; irq = create_irq(rhsa ? 
pxm_to_node(rhsa->proximity_domain) : NUMA_NO_NODE); if ( irq <= 0 ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no irq available!\n"); return -EINVAL; } desc = irq_to_desc(irq); desc->handler = &dma_msi_type; ret = request_irq(irq, iommu_page_fault, "dmar", iommu); if ( ret ) { desc->handler = &no_irq_type; destroy_irq(irq); dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n"); return ret; } iommu->msi.irq = irq; iommu->msi.msi_attrib.pos = MSI_TYPE_IOMMU; iommu->msi.msi_attrib.maskbit = 1; iommu->msi.msi_attrib.is_64 = 1; desc->msi_desc = &iommu->msi; return 0; } int __init iommu_alloc(struct acpi_drhd_unit *drhd) { struct iommu *iommu; unsigned long sagaw, nr_dom; int agaw; if ( nr_iommus > MAX_IOMMUS ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus); return -ENOMEM; } iommu = xzalloc(struct iommu); if ( iommu == NULL ) return -ENOMEM; iommu->msi.irq = -1; /* No irq assigned yet. */ iommu->intel = alloc_intel_iommu(); if ( iommu->intel == NULL ) { xfree(iommu); return -ENOMEM; } iommu->intel->drhd = drhd; drhd->iommu = iommu; if ( !(iommu->root_maddr = alloc_pgtable_maddr(drhd, 1)) ) return -ENOMEM; iommu->reg = ioremap(drhd->address, PAGE_SIZE); if ( !iommu->reg ) return -ENOMEM; iommu->index = nr_iommus++; iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG); iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG); if ( iommu_verbose ) { dprintk(VTDPREFIX, "drhd->address = %"PRIx64" iommu->reg = %p\n", drhd->address, iommu->reg); dprintk(VTDPREFIX, "cap = %"PRIx64" ecap = %"PRIx64"\n", iommu->cap, iommu->ecap); } if ( !(iommu->cap + 1) || !(iommu->ecap + 1) ) return -ENODEV; if ( cap_fault_reg_offset(iommu->cap) + cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE || ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: unsupported\n"); print_iommu_regs(drhd); return -ENODEV; } /* Calculate number of pagetable levels: between 2 and 4. */ sagaw = cap_sagaw(iommu->cap); for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- ) if ( test_bit(agaw, &sagaw) ) break; if ( agaw < 0 ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: unsupported sagaw %lx\n", sagaw); print_iommu_regs(drhd); return -ENODEV; } iommu->nr_pt_levels = agaw_to_level(agaw); if ( !ecap_coherent(iommu->ecap) ) iommus_incoherent = 1; /* allocate domain id bitmap */ nr_dom = cap_ndoms(iommu->cap); iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom)); if ( !iommu->domid_bitmap ) return -ENOMEM ; /* * if Caching mode is set, then invalid translations are tagged with * domain id 0, Hence reserve bit 0 for it */ if ( cap_caching_mode(iommu->cap) ) set_bit(0, iommu->domid_bitmap); iommu->domid_map = xzalloc_array(u16, nr_dom); if ( !iommu->domid_map ) return -ENOMEM ; spin_lock_init(&iommu->lock); spin_lock_init(&iommu->register_lock); return 0; } void __init iommu_free(struct acpi_drhd_unit *drhd) { struct iommu *iommu = drhd->iommu; if ( iommu == NULL ) return; drhd->iommu = NULL; if ( iommu->root_maddr != 0 ) { free_pgtable_maddr(iommu->root_maddr); iommu->root_maddr = 0; } if ( iommu->reg ) iounmap(iommu->reg); xfree(iommu->domid_bitmap); xfree(iommu->domid_map); free_intel_iommu(iommu->intel); if ( iommu->msi.irq >= 0 ) destroy_irq(iommu->msi.irq); xfree(iommu); } #define guestwidth_to_adjustwidth(gaw) ({ \ int agaw, r = (gaw - 12) % 9; \ agaw = (r == 0) ? 
gaw : (gaw + 9 - r); \ if ( agaw > 64 ) \ agaw = 64; \ agaw; }) static int intel_iommu_domain_init(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); hd->agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); return 0; } static void __init intel_iommu_dom0_init(struct domain *d) { struct acpi_drhd_unit *drhd; if ( !iommu_passthrough && !need_iommu(d) ) { /* Set up 1:1 page table for dom0 */ iommu_set_dom0_mapping(d); } setup_dom0_pci_devices(d, setup_dom0_device); setup_dom0_rmrr(d); iommu_flush_all(); for_each_drhd_unit ( drhd ) { iommu_enable_translation(drhd); } } int domain_context_mapping_one( struct domain *domain, struct iommu *iommu, u8 bus, u8 devfn, const struct pci_dev *pdev) { struct hvm_iommu *hd = domain_hvm_iommu(domain); struct context_entry *context, *context_entries; u64 maddr, pgd_maddr; u16 seg = iommu->intel->drhd->segment; int agaw; ASSERT(spin_is_locked(&pcidevs_lock)); spin_lock(&iommu->lock); maddr = bus_to_context_maddr(iommu, bus); context_entries = (struct context_entry *)map_vtd_domain_page(maddr); context = &context_entries[devfn]; if ( context_present(*context) ) { int res = 0; /* Try to get domain ownership from device structure. If that's * not available, try to read it from the context itself. */ if ( pdev ) { if ( pdev->domain != domain ) { printk(XENLOG_G_INFO VTDPREFIX "d%d: %04x:%02x:%02x.%u owned by d%d!", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pdev->domain ? pdev->domain->domain_id : -1); res = -EINVAL; } } else { int cdomain; cdomain = context_get_domain_id(context, iommu); if ( cdomain < 0 ) { printk(XENLOG_G_WARNING VTDPREFIX "d%d: %04x:%02x:%02x.%u mapped, but can't find owner!\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); res = -EINVAL; } else if ( cdomain != domain->domain_id ) { printk(XENLOG_G_INFO VTDPREFIX "d%d: %04x:%02x:%02x.%u already mapped to d%d!", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), cdomain); res = -EINVAL; } } unmap_vtd_domain_page(context_entries); spin_unlock(&iommu->lock); return res; } if ( iommu_passthrough && (domain->domain_id == 0) ) { context_set_translation_type(*context, CONTEXT_TT_PASS_THRU); agaw = level_to_agaw(iommu->nr_pt_levels); } else { spin_lock(&hd->mapping_lock); /* Ensure we have pagetables allocated down to leaf PTE. */ if ( hd->pgd_maddr == 0 ) { addr_to_dma_page_maddr(domain, 0, 1); if ( hd->pgd_maddr == 0 ) { nomem: spin_unlock(&hd->mapping_lock); spin_unlock(&iommu->lock); unmap_vtd_domain_page(context_entries); return -ENOMEM; } } /* Skip top levels of page tables for 2- and 3-level DRHDs. */ pgd_maddr = hd->pgd_maddr; for ( agaw = level_to_agaw(4); agaw != level_to_agaw(iommu->nr_pt_levels); agaw-- ) { struct dma_pte *p = map_vtd_domain_page(pgd_maddr); pgd_maddr = dma_pte_addr(*p); unmap_vtd_domain_page(p); if ( pgd_maddr == 0 ) goto nomem; } context_set_address_root(*context, pgd_maddr); if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) ) context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB); else context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL); spin_unlock(&hd->mapping_lock); } if ( context_set_domain_id(context, domain, iommu) ) { spin_unlock(&iommu->lock); unmap_vtd_domain_page(context_entries); return -EFAULT; } context_set_address_width(*context, agaw); context_set_fault_enable(*context); context_set_present(*context); iommu_flush_cache_entry(context, sizeof(struct context_entry)); spin_unlock(&iommu->lock); /* Context entry was previously non-present (with domid 0). 
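* Flush with domain id 0: caching-mode hardware caches non-present entries under that id (see flush_context_reg()).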
*/ if ( iommu_flush_context_device(iommu, 0, (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1) ) iommu_flush_write_buffer(iommu); else { int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0; iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb); } set_bit(iommu->index, &hd->iommu_bitmap); unmap_vtd_domain_page(context_entries); if ( !seg ) me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC); return 0; } static int domain_context_mapping( struct domain *domain, u8 devfn, const struct pci_dev *pdev) { struct acpi_drhd_unit *drhd; int ret = 0; u8 seg = pdev->seg, bus = pdev->bus, secbus; drhd = acpi_find_matched_drhd_unit(pdev); if ( !drhd ) return -ENODEV; ASSERT(spin_is_locked(&pcidevs_lock)); switch ( pdev->type ) { case DEV_TYPE_PCI_HOST_BRIDGE: if ( iommu_verbose ) dprintk(VTDPREFIX, "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); if ( !is_hardware_domain(domain) ) return -EPERM; break; case DEV_TYPE_PCIe_BRIDGE: case DEV_TYPE_PCIe2PCI_BRIDGE: case DEV_TYPE_LEGACY_PCI_BRIDGE: break; case DEV_TYPE_PCIe_ENDPOINT: if ( iommu_verbose ) dprintk(VTDPREFIX, "d%d:PCIe: map %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev); if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) enable_ats_device(seg, bus, devfn); break; case DEV_TYPE_PCI: if ( iommu_verbose ) dprintk(VTDPREFIX, "d%d:PCI: map %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev); if ( ret ) break; if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 1 ) break; ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pci_get_pdev(seg, bus, devfn)); /* * Devices behind PCIe-to-PCI/PCIx bridge may generate different * requester-id. It may originate from devfn=0 on the secondary bus * behind the bridge. Map that id as well if we didn't already. */ if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && (secbus != pdev->bus || pdev->devfn != 0) ) ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, pci_get_pdev(seg, secbus, 0)); break; default: dprintk(XENLOG_ERR VTDPREFIX, "d%d:unknown(%u): %04x:%02x:%02x.%u\n", domain->domain_id, pdev->type, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = -EINVAL; break; } if ( iommu_verbose ) process_pending_softirqs(); return ret; } int domain_context_unmap_one( struct domain *domain, struct iommu *iommu, u8 bus, u8 devfn) { struct context_entry *context, *context_entries; u64 maddr; int iommu_domid; ASSERT(spin_is_locked(&pcidevs_lock)); spin_lock(&iommu->lock); maddr = bus_to_context_maddr(iommu, bus); context_entries = (struct context_entry *)map_vtd_domain_page(maddr); context = &context_entries[devfn]; if ( !context_present(*context) ) { spin_unlock(&iommu->lock); unmap_vtd_domain_page(context_entries); return 0; } context_clear_present(*context); context_clear_entry(*context); iommu_flush_cache_entry(context, sizeof(struct context_entry)); iommu_domid= domain_iommu_domid(domain, iommu); if ( iommu_domid == -1 ) { spin_unlock(&iommu->lock); unmap_vtd_domain_page(context_entries); return -EINVAL; } if ( iommu_flush_context_device(iommu, iommu_domid, (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 0) ) iommu_flush_write_buffer(iommu); else { int flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 
1 : 0; iommu_flush_iotlb_dsi(iommu, iommu_domid, 0, flush_dev_iotlb); } spin_unlock(&iommu->lock); unmap_vtd_domain_page(context_entries); if ( !iommu->intel->drhd->segment ) me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC); return 0; } static int domain_context_unmap( struct domain *domain, u8 devfn, const struct pci_dev *pdev) { struct acpi_drhd_unit *drhd; struct iommu *iommu; int ret = 0; u8 seg = pdev->seg, bus = pdev->bus, tmp_bus, tmp_devfn, secbus; int found = 0; drhd = acpi_find_matched_drhd_unit(pdev); if ( !drhd ) return -ENODEV; iommu = drhd->iommu; switch ( pdev->type ) { case DEV_TYPE_PCI_HOST_BRIDGE: if ( iommu_verbose ) dprintk(VTDPREFIX, "d%d:Hostbridge: skip %04x:%02x:%02x.%u unmap\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); if ( !is_hardware_domain(domain) ) return -EPERM; goto out; case DEV_TYPE_PCIe_BRIDGE: case DEV_TYPE_PCIe2PCI_BRIDGE: case DEV_TYPE_LEGACY_PCI_BRIDGE: goto out; case DEV_TYPE_PCIe_ENDPOINT: if ( iommu_verbose ) dprintk(VTDPREFIX, "d%d:PCIe: unmap %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_unmap_one(domain, iommu, bus, devfn); if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) disable_ats_device(seg, bus, devfn); break; case DEV_TYPE_PCI: if ( iommu_verbose ) dprintk(VTDPREFIX, "d%d:PCI: unmap %04x:%02x:%02x.%u\n", domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_unmap_one(domain, iommu, bus, devfn); if ( ret ) break; tmp_bus = bus; tmp_devfn = devfn; if ( find_upstream_bridge(seg, &tmp_bus, &tmp_devfn, &secbus) < 1 ) break; /* PCIe to PCI/PCIx bridge */ if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) { ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); if ( ret ) return ret; ret = domain_context_unmap_one(domain, iommu, secbus, 0); } else /* Legacy PCI bridge */ ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); break; default: dprintk(XENLOG_ERR VTDPREFIX, "d%d:unknown(%u): %04x:%02x:%02x.%u\n", domain->domain_id, pdev->type, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = -EINVAL; goto out; } /* * if no other devices under the same iommu owned by this domain, * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp */ for_each_pdev ( domain, pdev ) { if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn ) continue; drhd = acpi_find_matched_drhd_unit(pdev); if ( drhd && drhd->iommu == iommu ) { found = 1; break; } } if ( found == 0 ) { struct hvm_iommu *hd = domain_hvm_iommu(domain); int iommu_domid; clear_bit(iommu->index, &hd->iommu_bitmap); iommu_domid = domain_iommu_domid(domain, iommu); if ( iommu_domid == -1 ) { ret = -EINVAL; goto out; } clear_bit(iommu_domid, iommu->domid_bitmap); iommu->domid_map[iommu_domid] = 0; } out: return ret; } static int reassign_device_ownership( struct domain *source, struct domain *target, u8 devfn, struct pci_dev *pdev) { int ret; /* * Devices assigned to untrusted domains (here assumed to be any domU) * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected * by the root complex unless interrupt remapping is enabled. 
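* Record that possibility in the global untrusted_msi flag so the interrupt handling code can account for it.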
*/ if ( (target != dom0) && !iommu_intremap ) untrusted_msi = 1; ret = domain_context_unmap(source, devfn, pdev); if ( ret ) return ret; ret = domain_context_mapping(target, devfn, pdev); if ( ret ) return ret; if ( devfn == pdev->devfn ) { list_move(&pdev->domain_list, &target->arch.pdev_list); pdev->domain = target; } return ret; } void iommu_domain_teardown(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); if ( list_empty(&acpi_drhd_units) ) return; if ( iommu_use_hap_pt(d) ) return; spin_lock(&hd->mapping_lock); iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw)); hd->pgd_maddr = 0; spin_unlock(&hd->mapping_lock); } static int intel_iommu_map_page( struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int flags) { struct hvm_iommu *hd = domain_hvm_iommu(d); struct dma_pte *page = NULL, *pte = NULL, old, new = { 0 }; u64 pg_maddr; /* Do nothing if VT-d shares EPT page table */ if ( iommu_use_hap_pt(d) ) return 0; /* do nothing if dom0 and iommu supports pass thru */ if ( iommu_passthrough && (d->domain_id == 0) ) return 0; spin_lock(&hd->mapping_lock); pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1); if ( pg_maddr == 0 ) { spin_unlock(&hd->mapping_lock); return -ENOMEM; } page = (struct dma_pte *)map_vtd_domain_page(pg_maddr); pte = page + (gfn & LEVEL_MASK); old = *pte; dma_set_pte_addr(new, (paddr_t)mfn << PAGE_SHIFT_4K); dma_set_pte_prot(new, ((flags & IOMMUF_readable) ? DMA_PTE_READ : 0) | ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0)); /* Set the SNP on leaf page table if Snoop Control available */ if ( iommu_snoop ) dma_set_pte_snp(new); if ( old.val == new.val ) { spin_unlock(&hd->mapping_lock); unmap_vtd_domain_page(page); return 0; } *pte = new; iommu_flush_cache_entry(pte, sizeof(struct dma_pte)); spin_unlock(&hd->mapping_lock); unmap_vtd_domain_page(page); if ( !this_cpu(iommu_dont_flush_iotlb) ) __intel_iommu_iotlb_flush(d, gfn, dma_pte_present(old), 1); return 0; } static int intel_iommu_unmap_page(struct domain *d, unsigned long gfn) { /* Do nothing if dom0 and iommu supports pass thru. */ if ( iommu_passthrough && (d->domain_id == 0) ) return 0; dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K); return 0; } void iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, int order, int present) { struct acpi_drhd_unit *drhd; struct iommu *iommu = NULL; struct hvm_iommu *hd = domain_hvm_iommu(d); int flush_dev_iotlb; int iommu_domid; iommu_flush_cache_entry(pte, sizeof(struct dma_pte)); for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; if ( !test_bit(iommu->index, &hd->iommu_bitmap) ) continue; flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 
1 : 0; iommu_domid= domain_iommu_domid(d, iommu); if ( iommu_domid == -1 ) continue; if ( iommu_flush_iotlb_psi(iommu, iommu_domid, (paddr_t)gfn << PAGE_SHIFT_4K, order, !present, flush_dev_iotlb) ) iommu_flush_write_buffer(iommu); } } static int vtd_ept_page_compatible(struct iommu *iommu) { u64 ept_cap, vtd_cap = iommu->cap; /* EPT is not initialised yet, so we must check the capability in * the MSR explicitly rather than use cpu_has_vmx_ept_*() */ if ( rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ept_cap) != 0 ) return 0; return ( ept_has_2mb(ept_cap) == cap_sps_2mb(vtd_cap) && ept_has_1gb(ept_cap) == cap_sps_1gb(vtd_cap) ); } /* * set VT-d page table directory to EPT table if allowed */ void iommu_set_pgd(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); mfn_t pgd_mfn; ASSERT( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled ); if ( !iommu_use_hap_pt(d) ) return; pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d))); hd->pgd_maddr = pagetable_get_paddr(pagetable_from_mfn(pgd_mfn)); } static int rmrr_identity_mapping(struct domain *d, struct acpi_rmrr_unit *rmrr) { u64 base, end; unsigned long base_pfn, end_pfn; struct mapped_rmrr *mrmrr; struct hvm_iommu *hd = domain_hvm_iommu(d); ASSERT(spin_is_locked(&pcidevs_lock)); ASSERT(rmrr->base_address < rmrr->end_address); /* * No need to acquire hd->mapping_lock, as the only theoretical race is * with the insertion below (impossible due to holding pcidevs_lock). */ list_for_each_entry( mrmrr, &hd->mapped_rmrrs, list ) { if ( mrmrr->base == rmrr->base_address && mrmrr->end == rmrr->end_address ) return 0; } base = rmrr->base_address & PAGE_MASK_4K; base_pfn = base >> PAGE_SHIFT_4K; end = PAGE_ALIGN_4K(rmrr->end_address); end_pfn = end >> PAGE_SHIFT_4K; while ( base_pfn < end_pfn ) { if ( intel_iommu_map_page(d, base_pfn, base_pfn, IOMMUF_readable|IOMMUF_writable) ) return -1; base_pfn++; } mrmrr = xmalloc(struct mapped_rmrr); if ( !mrmrr ) return -ENOMEM; mrmrr->base = rmrr->base_address; mrmrr->end = rmrr->end_address; spin_lock(&hd->mapping_lock); list_add_tail(&mrmrr->list, &hd->mapped_rmrrs); spin_unlock(&hd->mapping_lock); return 0; } static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev) { struct acpi_rmrr_unit *rmrr; u16 bdf; int ret, i; ASSERT(spin_is_locked(&pcidevs_lock)); if ( !pdev->domain ) return -EINVAL; ret = domain_context_mapping(pdev->domain, devfn, pdev); if ( ret ) { dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n", pdev->domain->domain_id); return ret; } for_each_rmrr_device ( rmrr, bdf, i ) { if ( rmrr->segment == pdev->seg && PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == devfn ) { ret = rmrr_identity_mapping(pdev->domain, rmrr); if ( ret ) dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n", pdev->domain->domain_id); } } return ret; } static int intel_iommu_enable_device(struct pci_dev *pdev) { struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); int ret = drhd ? ats_device(pdev, drhd) : -ENODEV; if ( ret <= 0 ) return ret; ret = enable_ats_device(pdev->seg, pdev->bus, pdev->devfn); return ret >= 0 ? 0 : ret; } static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev) { struct acpi_rmrr_unit *rmrr; u16 bdf; int i; if ( !pdev->domain ) return -EINVAL; /* If the device belongs to dom0, and it has RMRR, don't remove it * from dom0, because BIOS may use RMRR at booting time. 
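 * (An RMRR describes a memory range that firmware expects to remain
 * identity-mapped for the device, e.g. for USB legacy keyboard emulation,
 * so tearing down the context mapping here could break such accesses.)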
*/ if ( pdev->domain->domain_id == 0 ) { for_each_rmrr_device ( rmrr, bdf, i ) { if ( rmrr->segment == pdev->seg && PCI_BUS(bdf) == pdev->bus && PCI_DEVFN2(bdf) == devfn ) return 0; } } return domain_context_unmap(pdev->domain, devfn, pdev); } static int __init setup_dom0_device(u8 devfn, struct pci_dev *pdev) { int err; err = domain_context_mapping(pdev->domain, devfn, pdev); if ( !err && devfn == pdev->devfn ) pci_vtd_quirk(pdev); return err; } void clear_fault_bits(struct iommu *iommu) { u64 val; unsigned long flags; spin_lock_irqsave(&iommu->register_lock, flags); val = dmar_readq(iommu->reg, cap_fault_reg_offset(iommu->cap) + 8); dmar_writeq(iommu->reg, cap_fault_reg_offset(iommu->cap) + 8, val); dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS); spin_unlock_irqrestore(&iommu->register_lock, flags); } static void adjust_irq_affinity(struct acpi_drhd_unit *drhd) { const struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd); unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain) : NUMA_NO_NODE; const cpumask_t *cpumask = &cpu_online_map; if ( node < MAX_NUMNODES && node_online(node) && cpumask_intersects(&node_to_cpumask(node), cpumask) ) cpumask = &node_to_cpumask(node); dma_msi_set_affinity(irq_to_desc(drhd->iommu->msi.irq), cpumask); } int adjust_vtd_irq_affinities(void) { struct acpi_drhd_unit *drhd; if ( !iommu_enabled ) return 0; for_each_drhd_unit ( drhd ) adjust_irq_affinity(drhd); return 0; } __initcall(adjust_vtd_irq_affinities); static int init_vtd_hw(void) { struct acpi_drhd_unit *drhd; struct iommu *iommu; struct iommu_flush *flush = NULL; int ret; unsigned long flags; /* * Basic VT-d HW init: set VT-d interrupt, clear VT-d faults. */ for_each_drhd_unit ( drhd ) { adjust_irq_affinity(drhd); iommu = drhd->iommu; clear_fault_bits(iommu); spin_lock_irqsave(&iommu->register_lock, flags); dmar_writel(iommu->reg, DMAR_FECTL_REG, 0); spin_unlock_irqrestore(&iommu->register_lock, flags); } /* * Enable queue invalidation */ for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; /* * If queued invalidation not enabled, use regiser based * invalidation */ if ( enable_qinval(iommu) != 0 ) { flush = iommu_get_flush(iommu); flush->context = flush_context_reg; flush->iotlb = flush_iotlb_reg; } } /* * Enable interrupt remapping */ if ( iommu_intremap ) { int apic; for ( apic = 0; apic < nr_ioapics; apic++ ) { if ( ioapic_to_iommu(IO_APIC_ID(apic)) == NULL ) { iommu_intremap = 0; dprintk(XENLOG_ERR VTDPREFIX, "ioapic_to_iommu: ioapic %#x (id: %#x) is NULL! " "Will not try to enable Interrupt Remapping.\n", apic, IO_APIC_ID(apic)); break; } } } if ( iommu_intremap ) { for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; if ( enable_intremap(iommu, 0) != 0 ) { iommu_intremap = 0; dprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping not enabled\n"); break; } } if ( !iommu_intremap ) for_each_drhd_unit ( drhd ) disable_intremap(drhd->iommu); } /* * Set root entries for each VT-d engine. 
After set root entry, * must globally invalidate context cache, and then globally * invalidate IOTLB */ for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; ret = iommu_set_root_entry(iommu); if ( ret ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n"); return -EIO; } } iommu_flush_all(); return 0; } static void __init setup_dom0_rmrr(struct domain *d) { struct acpi_rmrr_unit *rmrr; u16 bdf; int ret, i; spin_lock(&pcidevs_lock); for_each_rmrr_device ( rmrr, bdf, i ) { ret = rmrr_identity_mapping(d, rmrr); if ( ret ) dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: mapping reserved region failed\n"); } spin_unlock(&pcidevs_lock); } int __init intel_vtd_setup(void) { struct acpi_drhd_unit *drhd; struct iommu *iommu; int ret; if ( list_empty(&acpi_drhd_units) ) { ret = -ENODEV; goto error; } if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) ) { ret = -EPERM; goto error; } platform_quirks_init(); if ( !iommu_enable ) { ret = -ENODEV; goto error; } /* We enable the following features only if they are supported by all VT-d * engines: Snoop Control, DMA passthrough, Queued Invalidation and * Interrupt Remapping. */ for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; printk("Intel VT-d iommu %"PRIu32" supported page sizes: 4kB", iommu->index); if (cap_sps_2mb(iommu->cap)) printk(", 2MB"); if (cap_sps_1gb(iommu->cap)) printk(", 1GB"); printk(".\n"); if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) ) iommu_snoop = 0; if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) ) iommu_passthrough = 0; if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) ) iommu_qinval = 0; if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) ) iommu_intremap = 0; if ( !vtd_ept_page_compatible(iommu) ) iommu_hap_pt_share = 0; ret = iommu_set_interrupt(drhd); if ( ret ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n"); goto error; } } softirq_tasklet_init(&vtd_fault_tasklet, do_iommu_page_fault, 0); if ( !iommu_qinval && iommu_intremap ) { iommu_intremap = 0; dprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled " "since Queued Invalidation isn't supported or enabled.\n"); } #define P(p,s) printk("Intel VT-d %s %senabled.\n", s, (p)? "" : "not ") P(iommu_snoop, "Snoop Control"); P(iommu_passthrough, "Dom0 DMA Passthrough"); P(iommu_qinval, "Queued Invalidation"); P(iommu_intremap, "Interrupt Remapping"); P(iommu_hap_pt_share, "Shared EPT tables"); #undef P scan_pci_devices(); ret = init_vtd_hw(); if ( ret ) goto error; register_keyhandler('V', &dump_iommu_info_keyhandler); return 0; error: iommu_enabled = 0; iommu_snoop = 0; iommu_passthrough = 0; iommu_qinval = 0; iommu_intremap = 0; return ret; } static int intel_iommu_assign_device( struct domain *d, u8 devfn, struct pci_dev *pdev) { struct acpi_rmrr_unit *rmrr; int ret = 0, i; u16 bdf, seg; u8 bus; if ( list_empty(&acpi_drhd_units) ) return -ENODEV; ret = reassign_device_ownership(dom0, d, devfn, pdev); if ( ret ) goto done; /* FIXME: Because USB RMRR conflicts with guest bios region, * ignore USB RMRR temporarily. 
*/ seg = pdev->seg; bus = pdev->bus; if ( is_usb_device(seg, bus, pdev->devfn) ) { ret = 0; goto done; } /* Setup rmrr identity mapping */ for_each_rmrr_device( rmrr, bdf, i ) { if ( rmrr->segment == seg && PCI_BUS(bdf) == bus && PCI_DEVFN2(bdf) == devfn ) { ret = rmrr_identity_mapping(d, rmrr); if ( ret ) { dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: mapping reserved region failed\n"); goto done; } } } done: return ret; } static int intel_iommu_group_id(u16 seg, u8 bus, u8 devfn) { u8 secbus; if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 0 ) return -1; else return PCI_BDF2(bus, devfn); } static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS]; static void vtd_suspend(void) { struct acpi_drhd_unit *drhd; struct iommu *iommu; u32 i; if ( !iommu_enabled ) return; iommu_flush_all(); for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; i = iommu->index; iommu_state[i][DMAR_FECTL_REG] = (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG); iommu_state[i][DMAR_FEDATA_REG] = (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG); iommu_state[i][DMAR_FEADDR_REG] = (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG); iommu_state[i][DMAR_FEUADDR_REG] = (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG); /* don't disable VT-d engine when force_iommu is set. */ if ( force_iommu ) continue; iommu_disable_translation(iommu); /* If interrupt remapping is enabled, queued invalidation * will be disabled following interupt remapping disabling * in local apic suspend */ if ( !iommu_intremap && iommu_qinval ) disable_qinval(iommu); } } static void vtd_crash_shutdown(void) { struct acpi_drhd_unit *drhd; struct iommu *iommu; if ( !iommu_enabled ) return; iommu_flush_all(); for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; iommu_disable_translation(iommu); disable_intremap(drhd->iommu); disable_qinval(drhd->iommu); } } static void vtd_resume(void) { struct acpi_drhd_unit *drhd; struct iommu *iommu; u32 i; unsigned long flags; if ( !iommu_enabled ) return; if ( init_vtd_hw() != 0 && force_iommu ) panic("IOMMU setup failed, crash Xen for security purpose"); for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; i = iommu->index; spin_lock_irqsave(&iommu->register_lock, flags); dmar_writel(iommu->reg, DMAR_FECTL_REG, (u32) iommu_state[i][DMAR_FECTL_REG]); dmar_writel(iommu->reg, DMAR_FEDATA_REG, (u32) iommu_state[i][DMAR_FEDATA_REG]); dmar_writel(iommu->reg, DMAR_FEADDR_REG, (u32) iommu_state[i][DMAR_FEADDR_REG]); dmar_writel(iommu->reg, DMAR_FEUADDR_REG, (u32) iommu_state[i][DMAR_FEUADDR_REG]); spin_unlock_irqrestore(&iommu->register_lock, flags); iommu_enable_translation(drhd); } } static void vtd_dump_p2m_table_level(paddr_t pt_maddr, int level, paddr_t gpa, int indent) { paddr_t address; int i; struct dma_pte *pt_vaddr, *pte; int next_level; if ( level < 1 ) return; pt_vaddr = map_vtd_domain_page(pt_maddr); if ( pt_vaddr == NULL ) { printk("Failed to map VT-D domain page %"PRIpaddr"\n", pt_maddr); return; } next_level = level - 1; for ( i = 0; i < PTE_NUM; i++ ) { if ( !(i % 2) ) process_pending_softirqs(); pte = &pt_vaddr[i]; if ( !dma_pte_present(*pte) ) continue; address = gpa + offset_level_address(i, level); if ( next_level >= 1 ) vtd_dump_p2m_table_level(dma_pte_addr(*pte), next_level, address, indent + 1); else printk("%*sgfn: %08lx mfn: %08lx\n", indent, "", (unsigned long)(address >> PAGE_SHIFT_4K), (unsigned long)(pte->val >> PAGE_SHIFT_4K)); } unmap_vtd_domain_page(pt_vaddr); } static void vtd_dump_p2m_table(struct domain *d) { struct hvm_iommu *hd; if ( list_empty(&acpi_drhd_units) ) return; hd = domain_hvm_iommu(d); printk("p2m 
table has %d levels\n", agaw_to_level(hd->agaw)); vtd_dump_p2m_table_level(hd->pgd_maddr, agaw_to_level(hd->agaw), 0, 0); } const struct iommu_ops intel_iommu_ops = { .init = intel_iommu_domain_init, .dom0_init = intel_iommu_dom0_init, .add_device = intel_iommu_add_device, .enable_device = intel_iommu_enable_device, .remove_device = intel_iommu_remove_device, .assign_device = intel_iommu_assign_device, .teardown = iommu_domain_teardown, .map_page = intel_iommu_map_page, .unmap_page = intel_iommu_unmap_page, .free_page_table = iommu_free_page_table, .reassign_device = reassign_device_ownership, .get_device_group_id = intel_iommu_group_id, .update_ire_from_apic = io_apic_write_remap_rte, .update_ire_from_msi = msi_msg_write_remap_rte, .read_apic_from_ire = io_apic_read_remap_rte, .read_msi_from_ire = msi_msg_read_remap_rte, .setup_hpet_msi = intel_setup_hpet_msi, .suspend = vtd_suspend, .resume = vtd_resume, .share_p2m = iommu_set_pgd, .crash_shutdown = vtd_crash_shutdown, .iotlb_flush = intel_iommu_iotlb_flush, .iotlb_flush_all = intel_iommu_iotlb_flush_all, .dump_p2m_table = vtd_dump_p2m_table, }; /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/passthrough/vtd/dmar.h0000664000175000017500000001001412307313555017147 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
* * Copyright (C) Ashok Raj * Copyright (C) Shaohua Li */ #ifndef _DMAR_H_ #define _DMAR_H_ #include #include #include /* This one is for interrupt remapping */ struct acpi_ioapic_unit { struct list_head list; int apic_id; union { u16 info; struct { u16 func: 3, dev: 5, bus: 8; }bdf; }ioapic; }; struct acpi_hpet_unit { struct list_head list; unsigned int id; union { u16 bdf; struct { u16 func: 3, dev: 5, bus: 8; }; }; }; struct dmar_scope { DECLARE_BITMAP(buses, 256); /* buses owned by this unit */ u16 *devices; /* devices owned by this unit */ int devices_cnt; }; struct acpi_drhd_unit { struct dmar_scope scope; struct list_head list; u64 address; /* register base address of the unit */ u16 segment; u8 include_all:1; struct iommu *iommu; struct list_head ioapic_list; struct list_head hpet_list; }; struct acpi_rmrr_unit { struct dmar_scope scope; struct list_head list; u64 base_address; u64 end_address; u16 segment; u8 allow_all:1; }; struct acpi_atsr_unit { struct dmar_scope scope; struct list_head list; u16 segment; u8 all_ports:1; }; struct acpi_rhsa_unit { struct list_head list; u64 address; u32 proximity_domain; }; #define for_each_drhd_unit(drhd) \ list_for_each_entry(drhd, &acpi_drhd_units, list) #define for_each_rmrr_device(rmrr, bdf, idx) \ list_for_each_entry(rmrr, &acpi_rmrr_units, list) \ /* assume there never is a bdf == 0 */ \ for (idx = 0; (bdf = rmrr->scope.devices[idx]) && \ idx < rmrr->scope.devices_cnt; idx++) struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *); struct acpi_atsr_unit *acpi_find_matched_atsr_unit(const struct pci_dev *); #define DMAR_TYPE 1 #define RMRR_TYPE 2 #define ATSR_TYPE 3 #define DMAR_OPERATION_TIMEOUT MILLISECS(1000) #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \ do { \ s_time_t start_time = NOW(); \ while (1) { \ sts = op(iommu->reg, offset); \ if ( cond ) \ break; \ if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) { \ if ( !kexecing ) \ panic("%s:%d:%s: DMAR hardware is malfunctional",\ __FILE__, __LINE__, __func__); \ else \ break; \ } \ cpu_relax(); \ } \ } while (0) int vtd_hw_check(void); void disable_pmr(struct iommu *iommu); int is_usb_device(u16 seg, u8 bus, u8 devfn); int is_igd_drhd(struct acpi_drhd_unit *drhd); #endif /* _DMAR_H_ */ xen-4.4.0/xen/drivers/passthrough/vtd/intremap.c0000664000175000017500000006674012307313555020057 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
* * Copyright (C) Allen Kay * Copyright (C) Xiaohui Xin */ #include #include #include #include #include #include #include #include #include "iommu.h" #include "dmar.h" #include "vtd.h" #include "extern.h" #include #include #define nr_ioapic_entries(i) nr_ioapic_entries[i] /* * source validation type (SVT) */ #define SVT_NO_VERIFY 0x0 /* no verification is required */ #define SVT_VERIFY_SID_SQ 0x1 /* verify using SID and SQ fiels */ #define SVT_VERIFY_BUS 0x2 /* verify bus of request-id */ /* * source-id qualifier (SQ) */ #define SQ_ALL_16 0x0 /* verify all 16 bits of request-id */ #define SQ_13_IGNORE_1 0x1 /* verify most significant 13 bits, ignore * the third least significant bit */ #define SQ_13_IGNORE_2 0x2 /* verify most significant 13 bits, ignore * the second and third least significant bits */ #define SQ_13_IGNORE_3 0x3 /* verify most significant 13 bits, ignore * the least three significant bits */ /* apic_pin_2_ir_idx[apicid][pin] = interrupt remapping table index */ static int **apic_pin_2_ir_idx; static int init_apic_pin_2_ir_idx(void) { int *_apic_pin_2_ir_idx; unsigned int nr_pins, i; /* Here we shouldn't need to re-init when resuming from S3. */ if ( apic_pin_2_ir_idx != NULL ) return 0; nr_pins = 0; for ( i = 0; i < nr_ioapics; i++ ) nr_pins += nr_ioapic_entries(i); _apic_pin_2_ir_idx = xmalloc_array(int, nr_pins); apic_pin_2_ir_idx = xmalloc_array(int *, nr_ioapics); if ( (_apic_pin_2_ir_idx == NULL) || (apic_pin_2_ir_idx == NULL) ) { xfree(_apic_pin_2_ir_idx); xfree(apic_pin_2_ir_idx); return -ENOMEM; } for ( i = 0; i < nr_pins; i++ ) _apic_pin_2_ir_idx[i] = -1; nr_pins = 0; for ( i = 0; i < nr_ioapics; i++ ) { apic_pin_2_ir_idx[i] = &_apic_pin_2_ir_idx[nr_pins]; nr_pins += nr_ioapic_entries(i); } return 0; } static u16 apicid_to_bdf(int apic_id) { struct acpi_drhd_unit *drhd = ioapic_to_drhd(apic_id); struct acpi_ioapic_unit *acpi_ioapic_unit; list_for_each_entry ( acpi_ioapic_unit, &drhd->ioapic_list, list ) if ( acpi_ioapic_unit->apic_id == apic_id ) return acpi_ioapic_unit->ioapic.info; dprintk(XENLOG_ERR VTDPREFIX, "Didn't find the bdf for the apic_id!\n"); return 0; } static u16 hpetid_to_bdf(unsigned int hpet_id) { struct acpi_drhd_unit *drhd = hpet_to_drhd(hpet_id); struct acpi_hpet_unit *acpi_hpet_unit; list_for_each_entry ( acpi_hpet_unit, &drhd->hpet_list, list ) if ( acpi_hpet_unit->id == hpet_id ) return acpi_hpet_unit->bdf; dprintk(XENLOG_ERR VTDPREFIX, "Didn't find the bdf for HPET %u!\n", hpet_id); return 0; } static void set_ire_sid(struct iremap_entry *ire, unsigned int svt, unsigned int sq, unsigned int sid) { ire->hi.svt = svt; ire->hi.sq = sq; ire->hi.sid = sid; } static void set_ioapic_source_id(int apic_id, struct iremap_entry *ire) { set_ire_sid(ire, SVT_VERIFY_SID_SQ, SQ_ALL_16, apicid_to_bdf(apic_id)); } static void set_hpet_source_id(unsigned int id, struct iremap_entry *ire) { /* * Should really use SQ_ALL_16. Some platforms are broken. * While we figure out the right quirks for these broken platforms, use * SQ_13_IGNORE_3 for now. */ set_ire_sid(ire, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, hpetid_to_bdf(id)); } int iommu_supports_eim(void) { struct acpi_drhd_unit *drhd; int apic; if ( !iommu_qinval || !iommu_intremap || list_empty(&acpi_drhd_units) ) return 0; /* We MUST have a DRHD unit for each IOAPIC. 
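 * (x2APIC mode requires that every IOAPIC's interrupts can be remapped,
 * i.e. that each IOAPIC is covered by some VT-d engine; a single uncovered
 * IOAPIC therefore rules out enabling EIM.)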
*/ for ( apic = 0; apic < nr_ioapics; apic++ ) if ( !ioapic_to_drhd(IO_APIC_ID(apic)) ) { dprintk(XENLOG_WARNING VTDPREFIX, "There is not a DRHD for IOAPIC %#x (id: %#x)!\n", apic, IO_APIC_ID(apic)); return 0; } for_each_drhd_unit ( drhd ) if ( !ecap_queued_inval(drhd->iommu->ecap) || !ecap_intr_remap(drhd->iommu->ecap) || !ecap_eim(drhd->iommu->ecap) ) return 0; return 1; } /* Mark specified intr remap entry as free */ static void free_remap_entry(struct iommu *iommu, int index) { struct iremap_entry *iremap_entry = NULL, *iremap_entries; struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); if ( index < 0 || index > IREMAP_ENTRY_NR - 1 ) return; ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) ); GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, iremap_entries, iremap_entry); memset(iremap_entry, 0, sizeof(struct iremap_entry)); iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry)); iommu_flush_iec_index(iommu, 0, index); unmap_vtd_domain_page(iremap_entries); ir_ctrl->iremap_num--; } /* * Look for a free intr remap entry (or a contiguous set thereof). * Need hold iremap_lock, and setup returned entry before releasing lock. */ static unsigned int alloc_remap_entry(struct iommu *iommu, unsigned int nr) { struct iremap_entry *iremap_entries = NULL; struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); unsigned int i, found; ASSERT( spin_is_locked(&ir_ctrl->iremap_lock) ); for ( found = i = 0; i < IREMAP_ENTRY_NR; i++ ) { struct iremap_entry *p; if ( i % (1 << IREMAP_ENTRY_ORDER) == 0 ) { /* This entry across page boundry */ if ( iremap_entries ) unmap_vtd_domain_page(iremap_entries); GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, i, iremap_entries, p); } else p = &iremap_entries[i % (1 << IREMAP_ENTRY_ORDER)]; if ( p->lo_val || p->hi_val ) /* not a free entry */ found = 0; else if ( ++found == nr ) break; } if ( iremap_entries ) unmap_vtd_domain_page(iremap_entries); if ( i < IREMAP_ENTRY_NR ) ir_ctrl->iremap_num += nr; return i; } static int remap_entry_to_ioapic_rte( struct iommu *iommu, int index, struct IO_xAPIC_route_entry *old_rte) { struct iremap_entry *iremap_entry = NULL, *iremap_entries; unsigned long flags; struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); if ( index < 0 || index > IREMAP_ENTRY_NR - 1 ) { dprintk(XENLOG_ERR VTDPREFIX, "%s: index (%d) for remap table is invalid !\n", __func__, index); return -EFAULT; } spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, iremap_entries, iremap_entry); if ( iremap_entry->hi_val == 0 && iremap_entry->lo_val == 0 ) { dprintk(XENLOG_ERR VTDPREFIX, "%s: index (%d) get an empty entry!\n", __func__, index); unmap_vtd_domain_page(iremap_entries); spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return -EFAULT; } old_rte->vector = iremap_entry->lo.vector; old_rte->delivery_mode = iremap_entry->lo.dlm; old_rte->dest_mode = iremap_entry->lo.dm; old_rte->trigger = iremap_entry->lo.tm; old_rte->__reserved_2 = 0; old_rte->dest.logical.__reserved_1 = 0; old_rte->dest.logical.logical_dest = iremap_entry->lo.dst >> 8; unmap_vtd_domain_page(iremap_entries); spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return 0; } static int ioapic_rte_to_remap_entry(struct iommu *iommu, int apic, unsigned int ioapic_pin, struct IO_xAPIC_route_entry *old_rte, unsigned int rte_upper, unsigned int value) { struct iremap_entry *iremap_entry = NULL, *iremap_entries; struct iremap_entry new_ire; struct IO_APIC_route_remap_entry *remap_rte; struct IO_xAPIC_route_entry new_rte; int index; unsigned long flags; struct ir_ctrl 
*ir_ctrl = iommu_ir_ctrl(iommu); remap_rte = (struct IO_APIC_route_remap_entry *) old_rte; spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); index = apic_pin_2_ir_idx[apic][ioapic_pin]; if ( index < 0 ) { index = alloc_remap_entry(iommu, 1); if ( index < IREMAP_ENTRY_NR ) apic_pin_2_ir_idx[apic][ioapic_pin] = index; } if ( index > IREMAP_ENTRY_NR - 1 ) { dprintk(XENLOG_ERR VTDPREFIX, "%s: intremap index (%d) is larger than" " the maximum index (%d)!\n", __func__, index, IREMAP_ENTRY_NR - 1); spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return -EFAULT; } GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, iremap_entries, iremap_entry); memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry)); if ( rte_upper ) { if ( x2apic_enabled ) new_ire.lo.dst = value; else new_ire.lo.dst = (value >> 24) << 8; } else { *(((u32 *)&new_rte) + 0) = value; new_ire.lo.fpd = 0; new_ire.lo.dm = new_rte.dest_mode; new_ire.lo.tm = new_rte.trigger; new_ire.lo.dlm = new_rte.delivery_mode; /* Hardware require RH = 1 for LPR delivery mode */ new_ire.lo.rh = (new_ire.lo.dlm == dest_LowestPrio); new_ire.lo.avail = 0; new_ire.lo.res_1 = 0; new_ire.lo.vector = new_rte.vector; new_ire.lo.res_2 = 0; set_ioapic_source_id(IO_APIC_ID(apic), &new_ire); new_ire.hi.res_1 = 0; new_ire.lo.p = 1; /* finally, set present bit */ /* now construct new ioapic rte entry */ remap_rte->vector = new_rte.vector; remap_rte->delivery_mode = 0; /* has to be 0 for remap format */ remap_rte->index_15 = (index >> 15) & 0x1; remap_rte->index_0_14 = index & 0x7fff; remap_rte->delivery_status = new_rte.delivery_status; remap_rte->polarity = new_rte.polarity; remap_rte->irr = new_rte.irr; remap_rte->trigger = new_rte.trigger; remap_rte->mask = new_rte.mask; remap_rte->reserved = 0; remap_rte->format = 1; /* indicate remap format */ } memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry)); iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry)); iommu_flush_iec_index(iommu, 0, index); invalidate_sync(iommu); unmap_vtd_domain_page(iremap_entries); spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return 0; } unsigned int io_apic_read_remap_rte( unsigned int apic, unsigned int reg) { unsigned int ioapic_pin = (reg - 0x10) / 2; int index; struct IO_xAPIC_route_entry old_rte = { 0 }; int rte_upper = (reg & 1) ? 1 : 0; struct iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic)); struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); if ( !ir_ctrl->iremap_num || ( (index = apic_pin_2_ir_idx[apic][ioapic_pin]) < 0 ) ) return __io_apic_read(apic, reg); old_rte = __ioapic_read_entry(apic, ioapic_pin, 1); if ( remap_entry_to_ioapic_rte(iommu, index, &old_rte) ) return __io_apic_read(apic, reg); if ( rte_upper ) return (*(((u32 *)&old_rte) + 1)); else return (*(((u32 *)&old_rte) + 0)); } void io_apic_write_remap_rte( unsigned int apic, unsigned int reg, unsigned int value) { unsigned int ioapic_pin = (reg - 0x10) / 2; struct IO_xAPIC_route_entry old_rte = { 0 }; struct IO_APIC_route_remap_entry *remap_rte; unsigned int rte_upper = (reg & 1) ? 
1 : 0; struct iommu *iommu = ioapic_to_iommu(IO_APIC_ID(apic)); int saved_mask; old_rte = __ioapic_read_entry(apic, ioapic_pin, 1); remap_rte = (struct IO_APIC_route_remap_entry *) &old_rte; /* mask the interrupt while we change the intremap table */ saved_mask = remap_rte->mask; remap_rte->mask = 1; __io_apic_write(apic, reg & ~1, *(u32 *)&old_rte); remap_rte->mask = saved_mask; if ( ioapic_rte_to_remap_entry(iommu, apic, ioapic_pin, &old_rte, rte_upper, value) ) { __io_apic_write(apic, reg, value); /* Recover the original value of 'mask' bit */ if ( rte_upper ) __io_apic_write(apic, reg & ~1, *(u32 *)&old_rte); } else __ioapic_write_entry(apic, ioapic_pin, 1, old_rte); } static void set_msi_source_id(struct pci_dev *pdev, struct iremap_entry *ire) { u16 seg; u8 bus, devfn, secbus; int ret; if ( !pdev || !ire ) return; seg = pdev->seg; bus = pdev->bus; devfn = pdev->devfn; switch ( pdev->type ) { unsigned int sq; case DEV_TYPE_PCIe_ENDPOINT: case DEV_TYPE_PCIe_BRIDGE: case DEV_TYPE_PCIe2PCI_BRIDGE: case DEV_TYPE_PCI_HOST_BRIDGE: switch ( pdev->phantom_stride ) { case 1: sq = SQ_13_IGNORE_3; break; case 2: sq = SQ_13_IGNORE_2; break; case 4: sq = SQ_13_IGNORE_1; break; default: sq = SQ_ALL_16; break; } set_ire_sid(ire, SVT_VERIFY_SID_SQ, sq, PCI_BDF2(bus, devfn)); break; case DEV_TYPE_PCI: case DEV_TYPE_LEGACY_PCI_BRIDGE: case DEV_TYPE_PCI2PCIe_BRIDGE: ret = find_upstream_bridge(seg, &bus, &devfn, &secbus); if ( ret == 0 ) /* integrated PCI device */ { set_ire_sid(ire, SVT_VERIFY_SID_SQ, SQ_ALL_16, PCI_BDF2(bus, devfn)); } else if ( ret == 1 ) /* find upstream bridge */ { if ( pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) set_ire_sid(ire, SVT_VERIFY_BUS, SQ_ALL_16, (bus << 8) | pdev->bus); else set_ire_sid(ire, SVT_VERIFY_SID_SQ, SQ_ALL_16, PCI_BDF2(bus, devfn)); } else dprintk(XENLOG_WARNING VTDPREFIX, "d%d: no upstream bridge for %04x:%02x:%02x.%u\n", pdev->domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); break; default: dprintk(XENLOG_WARNING VTDPREFIX, "d%d: unknown(%u): %04x:%02x:%02x.%u\n", pdev->domain->domain_id, pdev->type, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); break; } } static int remap_entry_to_msi_msg( struct iommu *iommu, struct msi_msg *msg, unsigned int index) { struct iremap_entry *iremap_entry = NULL, *iremap_entries; struct msi_msg_remap_entry *remap_rte; unsigned long flags; struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); remap_rte = (struct msi_msg_remap_entry *) msg; index += (remap_rte->address_lo.index_15 << 15) | remap_rte->address_lo.index_0_14; if ( index >= IREMAP_ENTRY_NR ) { dprintk(XENLOG_ERR VTDPREFIX, "%s: index (%d) for remap table is invalid !\n", __func__, index); return -EFAULT; } spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, iremap_entries, iremap_entry); if ( iremap_entry->hi_val == 0 && iremap_entry->lo_val == 0 ) { dprintk(XENLOG_ERR VTDPREFIX, "%s: index (%d) get an empty entry!\n", __func__, index); unmap_vtd_domain_page(iremap_entries); spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return -EFAULT; } msg->address_hi = MSI_ADDR_BASE_HI; msg->address_lo = MSI_ADDR_BASE_LO | ((iremap_entry->lo.dm == 0) ? MSI_ADDR_DESTMODE_PHYS: MSI_ADDR_DESTMODE_LOGIC) | ((iremap_entry->lo.dlm != dest_LowestPrio) ? 
MSI_ADDR_REDIRECTION_CPU: MSI_ADDR_REDIRECTION_LOWPRI); if ( x2apic_enabled ) msg->dest32 = iremap_entry->lo.dst; else msg->dest32 = (iremap_entry->lo.dst >> 8) & 0xff; msg->address_lo |= (msg->dest32 & 0xff) << MSI_ADDR_DEST_ID_SHIFT; msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | ((iremap_entry->lo.dlm != dest_LowestPrio) ? MSI_DATA_DELIVERY_FIXED: MSI_DATA_DELIVERY_LOWPRI) | iremap_entry->lo.vector; unmap_vtd_domain_page(iremap_entries); spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return 0; } static int msi_msg_to_remap_entry( struct iommu *iommu, struct pci_dev *pdev, struct msi_desc *msi_desc, struct msi_msg *msg) { struct iremap_entry *iremap_entry = NULL, *iremap_entries; struct iremap_entry new_ire; struct msi_msg_remap_entry *remap_rte; unsigned int index, i, nr = 1; unsigned long flags; struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) nr = msi_desc->msi.nvec; spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); if ( msg == NULL ) { /* Free specified unused IRTEs */ for ( i = 0; i < nr; ++i ) free_remap_entry(iommu, msi_desc->remap_index + i); spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return 0; } if ( msi_desc->remap_index < 0 ) { index = alloc_remap_entry(iommu, nr); for ( i = 0; i < nr; ++i ) msi_desc[i].remap_index = index + i; } else index = msi_desc->remap_index; if ( index > IREMAP_ENTRY_NR - 1 ) { dprintk(XENLOG_ERR VTDPREFIX, "%s: intremap index (%d) is larger than" " the maximum index (%d)!\n", __func__, index, IREMAP_ENTRY_NR - 1); for ( i = 0; i < nr; ++i ) msi_desc[i].remap_index = -1; spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return -EFAULT; } GET_IREMAP_ENTRY(ir_ctrl->iremap_maddr, index, iremap_entries, iremap_entry); memcpy(&new_ire, iremap_entry, sizeof(struct iremap_entry)); /* Set interrupt remapping table entry */ new_ire.lo.fpd = 0; new_ire.lo.dm = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1; new_ire.lo.tm = (msg->data >> MSI_DATA_TRIGGER_SHIFT) & 0x1; new_ire.lo.dlm = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1; /* Hardware require RH = 1 for LPR delivery mode */ new_ire.lo.rh = (new_ire.lo.dlm == dest_LowestPrio); new_ire.lo.avail = 0; new_ire.lo.res_1 = 0; new_ire.lo.vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK; new_ire.lo.res_2 = 0; if ( x2apic_enabled ) new_ire.lo.dst = msg->dest32; else new_ire.lo.dst = ((msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff) << 8; if ( pdev ) set_msi_source_id(pdev, &new_ire); else set_hpet_source_id(msi_desc->hpet_id, &new_ire); new_ire.hi.res_1 = 0; new_ire.lo.p = 1; /* finally, set present bit */ /* now construct new MSI/MSI-X rte entry */ remap_rte = (struct msi_msg_remap_entry *)msg; remap_rte->address_lo.dontcare = 0; i = index; if ( !nr ) i -= msi_desc->msi_attrib.entry_nr; remap_rte->address_lo.index_15 = (i >> 15) & 0x1; remap_rte->address_lo.index_0_14 = i & 0x7fff; remap_rte->address_lo.SHV = 1; remap_rte->address_lo.format = 1; remap_rte->address_hi = 0; remap_rte->data = index - i; memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry)); iommu_flush_cache_entry(iremap_entry, sizeof(struct iremap_entry)); iommu_flush_iec_index(iommu, 0, index); invalidate_sync(iommu); unmap_vtd_domain_page(iremap_entries); spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return 0; } void msi_msg_read_remap_rte( struct msi_desc *msi_desc, struct msi_msg *msg) { struct pci_dev *pdev = msi_desc->dev; struct acpi_drhd_unit *drhd = NULL; drhd = pdev ? 
acpi_find_matched_drhd_unit(pdev) : hpet_to_drhd(msi_desc->hpet_id); if ( drhd ) remap_entry_to_msi_msg(drhd->iommu, msg, msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ? msi_desc->msi_attrib.entry_nr : 0); } int msi_msg_write_remap_rte( struct msi_desc *msi_desc, struct msi_msg *msg) { struct pci_dev *pdev = msi_desc->dev; struct acpi_drhd_unit *drhd = NULL; drhd = pdev ? acpi_find_matched_drhd_unit(pdev) : hpet_to_drhd(msi_desc->hpet_id); return drhd ? msi_msg_to_remap_entry(drhd->iommu, pdev, msi_desc, msg) : -EINVAL; } int __init intel_setup_hpet_msi(struct msi_desc *msi_desc) { struct iommu *iommu = hpet_to_iommu(msi_desc->hpet_id); struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); unsigned long flags; int rc = 0; if ( !ir_ctrl || !ir_ctrl->iremap_maddr ) return 0; spin_lock_irqsave(&ir_ctrl->iremap_lock, flags); msi_desc->remap_index = alloc_remap_entry(iommu, 1); if ( msi_desc->remap_index >= IREMAP_ENTRY_NR ) { dprintk(XENLOG_ERR VTDPREFIX, "%s: intremap index (%d) is larger than" " the maximum index (%d)!\n", __func__, msi_desc->remap_index, IREMAP_ENTRY_NR - 1); msi_desc->remap_index = -1; rc = -ENXIO; } spin_unlock_irqrestore(&ir_ctrl->iremap_lock, flags); return rc; } int enable_intremap(struct iommu *iommu, int eim) { struct acpi_drhd_unit *drhd; struct ir_ctrl *ir_ctrl; u32 sts, gcmd; unsigned long flags; ASSERT(ecap_intr_remap(iommu->ecap) && iommu_intremap); if ( !platform_supports_intremap() ) { printk(XENLOG_ERR VTDPREFIX " Platform firmware does not support interrupt remapping\n"); return -EINVAL; } ir_ctrl = iommu_ir_ctrl(iommu); sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); /* Return if already enabled by Xen */ if ( (sts & DMA_GSTS_IRES) && ir_ctrl->iremap_maddr ) return 0; if ( !(sts & DMA_GSTS_QIES) ) { printk(XENLOG_ERR VTDPREFIX " Queued invalidation is not enabled on IOMMU #%u:" " Should not enable interrupt remapping\n", iommu->index); return -EINVAL; } if ( !eim && (sts & DMA_GSTS_CFIS) ) printk(XENLOG_WARNING VTDPREFIX " Compatibility Format Interrupts permitted on IOMMU #%u:" " Device pass-through will be insecure\n", iommu->index); if ( ir_ctrl->iremap_maddr == 0 ) { drhd = iommu_to_drhd(iommu); ir_ctrl->iremap_maddr = alloc_pgtable_maddr(drhd, IREMAP_ARCH_PAGE_NR); if ( ir_ctrl->iremap_maddr == 0 ) { dprintk(XENLOG_WARNING VTDPREFIX, "Cannot allocate memory for ir_ctrl->iremap_maddr\n"); return -ENOMEM; } ir_ctrl->iremap_num = 0; } /* set extended interrupt mode bit */ ir_ctrl->iremap_maddr |= eim ? 
IRTA_EIME : 0; spin_lock_irqsave(&iommu->register_lock, flags); /* set size of the interrupt remapping table */ ir_ctrl->iremap_maddr |= IRTA_REG_TABLE_SIZE; dmar_writeq(iommu->reg, DMAR_IRTA_REG, ir_ctrl->iremap_maddr); /* set SIRTP */ gcmd = dmar_readl(iommu->reg, DMAR_GSTS_REG); gcmd |= DMA_GCMD_SIRTP; dmar_writel(iommu->reg, DMAR_GCMD_REG, gcmd); IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, (sts & DMA_GSTS_SIRTPS), sts); spin_unlock_irqrestore(&iommu->register_lock, flags); /* After set SIRTP, must globally invalidate the interrupt entry cache */ iommu_flush_iec_global(iommu); spin_lock_irqsave(&iommu->register_lock, flags); /* enable interrupt remapping hardware */ gcmd |= DMA_GCMD_IRE; dmar_writel(iommu->reg, DMAR_GCMD_REG, gcmd); IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, (sts & DMA_GSTS_IRES), sts); spin_unlock_irqrestore(&iommu->register_lock, flags); return init_apic_pin_2_ir_idx(); } void disable_intremap(struct iommu *iommu) { u32 sts; u64 irta; unsigned long flags; if ( !ecap_intr_remap(iommu->ecap) ) return; spin_lock_irqsave(&iommu->register_lock, flags); sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); if ( !(sts & DMA_GSTS_IRES) ) goto out; dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_IRE)); IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, !(sts & DMA_GSTS_IRES), sts); /* If we are disabling Interrupt Remapping, make sure we dont stay in * Extended Interrupt Mode, as this is unaffected by the Interrupt * Remapping flag in each DMAR Global Control Register. * Specifically, local apics in xapic mode do not like interrupts delivered * in x2apic mode. Any code turning interrupt remapping back on will set * EIME back correctly. */ if ( !ecap_eim(iommu->ecap) ) goto out; /* Can't read the register unless we ecaps says we can */ irta = dmar_readl(iommu->reg, DMAR_IRTA_REG); if ( !(irta & IRTA_EIME) ) goto out; dmar_writel(iommu->reg, DMAR_IRTA_REG, irta & ~IRTA_EIME); IOMMU_WAIT_OP(iommu, DMAR_IRTA_REG, dmar_readl, !(irta & IRTA_EIME), irta); out: spin_unlock_irqrestore(&iommu->register_lock, flags); } /* * This function is used to enable Interrupt remapping when * enable x2apic */ int iommu_enable_x2apic_IR(void) { struct acpi_drhd_unit *drhd; struct iommu *iommu; if ( !iommu_supports_eim() ) return -1; if ( !platform_supports_x2apic() ) return -1; for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; /* Clear previous faults */ clear_fault_bits(iommu); /* * Disable interrupt remapping and queued invalidation if * already enabled by BIOS */ disable_intremap(iommu); disable_qinval(iommu); } /* Enable queue invalidation */ for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; if ( enable_qinval(iommu) != 0 ) { dprintk(XENLOG_INFO VTDPREFIX, "Failed to enable Queued Invalidation!\n"); return -1; } } /* Enable interrupt remapping */ for_each_drhd_unit ( drhd ) { iommu = drhd->iommu; if ( enable_intremap(iommu, 1) ) { dprintk(XENLOG_INFO VTDPREFIX, "Failed to enable Interrupt Remapping!\n"); return -1; } } return 0; } /* * This function is used to disable Interrutp remapping when * suspend local apic */ void iommu_disable_x2apic_IR(void) { struct acpi_drhd_unit *drhd; if ( !iommu_supports_eim() ) return; for_each_drhd_unit ( drhd ) disable_intremap(drhd->iommu); for_each_drhd_unit ( drhd ) disable_qinval(drhd->iommu); } xen-4.4.0/xen/drivers/passthrough/vtd/extern.h0000664000175000017500000001003212307313555017531 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Copyright (C) Allen Kay * Copyright (C) Weidong Han */ #ifndef _VTD_EXTERN_H_ #define _VTD_EXTERN_H_ #include "dmar.h" #include #define VTDPREFIX "[VT-D]" extern bool_t rwbf_quirk; void print_iommu_regs(struct acpi_drhd_unit *drhd); void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn); extern struct keyhandler dump_iommu_info_keyhandler; int enable_qinval(struct iommu *iommu); void disable_qinval(struct iommu *iommu); int enable_intremap(struct iommu *iommu, int eim); void disable_intremap(struct iommu *iommu); void iommu_flush_cache_entry(void *addr, unsigned int size); void iommu_flush_cache_page(void *addr, unsigned long npages); int iommu_alloc(struct acpi_drhd_unit *drhd); void iommu_free(struct acpi_drhd_unit *drhd); int queue_invalidate_context(struct iommu *iommu, u16 did, u16 source_id, u8 function_mask, u8 granu); int queue_invalidate_iotlb(struct iommu *iommu, u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr); int queue_invalidate_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx); int invalidate_sync(struct iommu *iommu); int iommu_flush_iec_global(struct iommu *iommu); int iommu_flush_iec_index(struct iommu *iommu, u8 im, u16 iidx); void clear_fault_bits(struct iommu *iommu); struct iommu * ioapic_to_iommu(unsigned int apic_id); struct iommu * hpet_to_iommu(unsigned int hpet_id); struct acpi_drhd_unit * ioapic_to_drhd(unsigned int apic_id); struct acpi_drhd_unit * hpet_to_drhd(unsigned int hpet_id); struct acpi_drhd_unit * iommu_to_drhd(struct iommu *iommu); struct acpi_rhsa_unit * drhd_to_rhsa(struct acpi_drhd_unit *drhd); struct acpi_drhd_unit * find_ats_dev_drhd(struct iommu *iommu); int ats_device(const struct pci_dev *, const struct acpi_drhd_unit *); int dev_invalidate_iotlb(struct iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type); int qinval_device_iotlb(struct iommu *iommu, u32 max_invs_pend, u16 sid, u16 size, u64 addr); unsigned int get_cache_line_size(void); void cacheline_flush(char *); void flush_all_cache(void); u64 alloc_pgtable_maddr(struct acpi_drhd_unit *drhd, unsigned long npages); void free_pgtable_maddr(u64 maddr); void *map_vtd_domain_page(u64 maddr); void unmap_vtd_domain_page(void *va); int domain_context_mapping_one(struct domain *domain, struct iommu *iommu, u8 bus, u8 devfn, const struct pci_dev *); int domain_context_unmap_one(struct domain *domain, struct iommu *iommu, u8 bus, u8 devfn); unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg); void io_apic_write_remap_rte(unsigned int apic, unsigned int reg, unsigned int value); struct msi_desc; struct msi_msg; void msi_msg_read_remap_rte(struct msi_desc *, struct msi_msg *); int msi_msg_write_remap_rte(struct msi_desc *, struct msi_msg *); int intel_setup_hpet_msi(struct msi_desc *); int is_igd_vt_enabled_quirk(void); void platform_quirks_init(void); void vtd_ops_preamble_quirk(struct iommu* 
iommu); void vtd_ops_postamble_quirk(struct iommu* iommu); void me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map); void pci_vtd_quirk(struct pci_dev *pdev); int platform_supports_intremap(void); int platform_supports_x2apic(void); #endif // _VTD_EXTERN_H_ xen-4.4.0/xen/drivers/passthrough/vtd/iommu.h0000664000175000017500000004456412307313555017373 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Copyright (C) Ashok Raj */ #ifndef _INTEL_IOMMU_H_ #define _INTEL_IOMMU_H_ #include #include /* * Intel IOMMU register specification per version 1.0 public spec. */ #define DMAR_VER_REG 0x0 /* Arch version supported by this IOMMU */ #define DMAR_CAP_REG 0x8 /* Hardware supported capabilities */ #define DMAR_ECAP_REG 0x10 /* Extended capabilities supported */ #define DMAR_GCMD_REG 0x18 /* Global command register */ #define DMAR_GSTS_REG 0x1c /* Global status register */ #define DMAR_RTADDR_REG 0x20 /* Root entry table */ #define DMAR_CCMD_REG 0x28 /* Context command reg */ #define DMAR_FSTS_REG 0x34 /* Fault Status register */ #define DMAR_FECTL_REG 0x38 /* Fault control register */ #define DMAR_FEDATA_REG 0x3c /* Fault event interrupt data register */ #define DMAR_FEADDR_REG 0x40 /* Fault event interrupt addr register */ #define DMAR_FEUADDR_REG 0x44 /* Upper address register */ #define DMAR_AFLOG_REG 0x58 /* Advanced Fault control */ #define DMAR_PMEN_REG 0x64 /* Enable Protected Memory Region */ #define DMAR_PLMBASE_REG 0x68 /* PMRR Low addr */ #define DMAR_PLMLIMIT_REG 0x6c /* PMRR low limit */ #define DMAR_PHMBASE_REG 0x70 /* pmrr high base addr */ #define DMAR_PHMLIMIT_REG 0x78 /* pmrr high limit */ #define DMAR_IQH_REG 0x80 /* invalidation queue head */ #define DMAR_IQT_REG 0x88 /* invalidation queue tail */ #define DMAR_IQA_REG 0x90 /* invalidation queue addr */ #define DMAR_IRTA_REG 0xB8 /* intr remap */ #define OFFSET_STRIDE (9) #define dmar_readl(dmar, reg) readl(dmar + reg) #define dmar_writel(dmar, reg, val) writel(val, dmar + reg) #define dmar_readq(dmar, reg) ({ \ u32 lo, hi; \ lo = dmar_readl(dmar, reg); \ hi = dmar_readl(dmar, reg + 4); \ (((u64) hi) << 32) + lo; }) #define dmar_writeq(dmar, reg, val) do {\ dmar_writel(dmar, reg, (u32)val); \ dmar_writel(dmar, reg + 4, (u32)((u64) val >> 32)); \ } while (0) #define VER_MAJOR(v) (((v) & 0xf0) >> 4) #define VER_MINOR(v) ((v) & 0x0f) /* * Decoding Capability Register */ #define cap_read_drain(c) (((c) >> 55) & 1) #define cap_write_drain(c) (((c) >> 54) & 1) #define cap_max_amask_val(c) (((c) >> 48) & 0x3f) #define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1) #define cap_pgsel_inv(c) (((c) >> 39) & 1) #define cap_super_page_val(c) (((c) >> 34) & 0xf) #define cap_super_offset(c) (((find_first_bit(&cap_super_page_val(c), 4)) \ * OFFSET_STRIDE) + 21) #define cap_sps_2mb(c) ((c >> 34) & 1) #define cap_sps_1gb(c) ((c >> 35) & 1) #define cap_sps_512gb(c) ((c >> 36) & 1) 
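/*
 * Bits 34-37 of the capability register advertise which super-page sizes
 * the IOMMU can map with a single leaf PTE; only the 2MB and 1GB bits are
 * consulted here (see vtd_ept_page_compatible() and intel_vtd_setup()).
 */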
#define cap_sps_1tb(c) ((c >> 37) & 1) #define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16) #define cap_isoch(c) (((c) >> 23) & 1) #define cap_qos(c) (((c) >> 22) & 1) #define cap_mgaw(c) ((((c) >> 16) & 0x3f) + 1) #define cap_sagaw(c) (((c) >> 8) & 0x1f) #define cap_caching_mode(c) (((c) >> 7) & 1) #define cap_phmr(c) (((c) >> 6) & 1) #define cap_plmr(c) (((c) >> 5) & 1) #define cap_rwbf(c) (((c) >> 4) & 1) #define cap_afl(c) (((c) >> 3) & 1) #define cap_ndoms(c) (1 << (4 + 2 * ((c) & 0x7))) /* * Extended Capability Register */ #define ecap_niotlb_iunits(e) ((((e) >> 24) & 0xff) + 1) #define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16) #define ecap_coherent(e) ((e >> 0) & 0x1) #define ecap_queued_inval(e) ((e >> 1) & 0x1) #define ecap_dev_iotlb(e) ((e >> 2) & 0x1) #define ecap_intr_remap(e) ((e >> 3) & 0x1) #define ecap_eim(e) ((e >> 4) & 0x1) #define ecap_cache_hints(e) ((e >> 5) & 0x1) #define ecap_pass_thru(e) ((e >> 6) & 0x1) #define ecap_snp_ctl(e) ((e >> 7) & 0x1) /* IOTLB_REG */ #define DMA_TLB_FLUSH_GRANU_OFFSET 60 #define DMA_TLB_GLOBAL_FLUSH (((u64)1) << 60) #define DMA_TLB_DSI_FLUSH (((u64)2) << 60) #define DMA_TLB_PSI_FLUSH (((u64)3) << 60) #define DMA_TLB_IIRG(x) (((x) >> 60) & 7) #define DMA_TLB_IAIG(val) (((val) >> 57) & 7) #define DMA_TLB_DID(x) (((u64)(x & 0xffff)) << 32) #define DMA_TLB_READ_DRAIN (((u64)1) << 49) #define DMA_TLB_WRITE_DRAIN (((u64)1) << 48) #define DMA_TLB_IVT (((u64)1) << 63) #define DMA_TLB_IVA_ADDR(x) ((((u64)x) >> 12) << 12) #define DMA_TLB_IVA_HINT(x) ((((u64)x) & 1) << 6) /* GCMD_REG */ #define DMA_GCMD_TE (((u64)1) << 31) #define DMA_GCMD_SRTP (((u64)1) << 30) #define DMA_GCMD_SFL (((u64)1) << 29) #define DMA_GCMD_EAFL (((u64)1) << 28) #define DMA_GCMD_WBF (((u64)1) << 27) #define DMA_GCMD_QIE (((u64)1) << 26) #define DMA_GCMD_IRE (((u64)1) << 25) #define DMA_GCMD_SIRTP (((u64)1) << 24) #define DMA_GCMD_CFI (((u64)1) << 23) /* GSTS_REG */ #define DMA_GSTS_TES (((u64)1) << 31) #define DMA_GSTS_RTPS (((u64)1) << 30) #define DMA_GSTS_FLS (((u64)1) << 29) #define DMA_GSTS_AFLS (((u64)1) << 28) #define DMA_GSTS_WBFS (((u64)1) << 27) #define DMA_GSTS_QIES (((u64)1) <<26) #define DMA_GSTS_IRES (((u64)1) <<25) #define DMA_GSTS_SIRTPS (((u64)1) << 24) #define DMA_GSTS_CFIS (((u64)1) <<23) /* PMEN_REG */ #define DMA_PMEN_EPM (((u32)1) << 31) #define DMA_PMEN_PRS (((u32)1) << 0) /* CCMD_REG */ #define DMA_CCMD_INVL_GRANU_OFFSET 61 #define DMA_CCMD_ICC (((u64)1) << 63) #define DMA_CCMD_GLOBAL_INVL (((u64)1) << 61) #define DMA_CCMD_DOMAIN_INVL (((u64)2) << 61) #define DMA_CCMD_DEVICE_INVL (((u64)3) << 61) #define DMA_CCMD_FM(m) (((u64)((m) & 0x3)) << 32) #define DMA_CCMD_CIRG(x) ((((u64)3) << 61) & x) #define DMA_CCMD_MASK_NOBIT 0 #define DMA_CCMD_MASK_1BIT 1 #define DMA_CCMD_MASK_2BIT 2 #define DMA_CCMD_MASK_3BIT 3 #define DMA_CCMD_SID(s) (((u64)((s) & 0xffff)) << 16) #define DMA_CCMD_DID(d) ((u64)((d) & 0xffff)) #define DMA_CCMD_CAIG_MASK(x) (((u64)x) & ((u64) 0x3 << 59)) /* FECTL_REG */ #define DMA_FECTL_IM (((u64)1) << 31) /* FSTS_REG */ #define DMA_FSTS_PFO ((u64)1 << 0) #define DMA_FSTS_PPF ((u64)1 << 1) #define DMA_FSTS_AFO ((u64)1 << 2) #define DMA_FSTS_APF ((u64)1 << 3) #define DMA_FSTS_IQE ((u64)1 << 4) #define DMA_FSTS_ICE ((u64)1 << 5) #define DMA_FSTS_ITE ((u64)1 << 6) #define DMA_FSTS_FAULTS DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_AFO | DMA_FSTS_APF | DMA_FSTS_IQE | DMA_FSTS_ICE | DMA_FSTS_ITE #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff) /* FRCD_REG, 32 bits access */ #define DMA_FRCD_F (((u64)1) << 31) #define 
dma_frcd_type(d) ((d >> 30) & 1) #define dma_frcd_fault_reason(c) (c & 0xff) #define dma_frcd_source_id(c) (c & 0xffff) #define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */ /* * 0: Present * 1-11: Reserved * 12-63: Context Ptr (12 - (haw-1)) * 64-127: Reserved */ struct root_entry { u64 val; u64 rsvd1; }; #define root_present(root) ((root).val & 1) #define set_root_present(root) do {(root).val |= 1;} while(0) #define get_context_addr(root) ((root).val & PAGE_MASK_4K) #define set_root_value(root, value) \ do {(root).val |= ((value) & PAGE_MASK_4K);} while(0) struct context_entry { u64 lo; u64 hi; }; #define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) #define context_present(c) ((c).lo & 1) #define context_fault_disable(c) (((c).lo >> 1) & 1) #define context_translation_type(c) (((c).lo >> 2) & 3) #define context_address_root(c) ((c).lo & PAGE_MASK_4K) #define context_address_width(c) ((c).hi & 7) #define context_domain_id(c) (((c).hi >> 8) & ((1 << 16) - 1)) #define context_set_present(c) do {(c).lo |= 1;} while(0) #define context_clear_present(c) do {(c).lo &= ~1;} while(0) #define context_set_fault_enable(c) \ do {(c).lo &= (((u64)-1) << 2) | 1;} while(0) #define context_set_translation_type(c, val) do { \ (c).lo &= (((u64)-1) << 4) | 3; \ (c).lo |= (val & 3) << 2; \ } while(0) #define CONTEXT_TT_MULTI_LEVEL 0 #define CONTEXT_TT_DEV_IOTLB 1 #define CONTEXT_TT_PASS_THRU 2 #define context_set_address_root(c, val) \ do {(c).lo &= 0xfff; (c).lo |= (val) & PAGE_MASK_4K ;} while(0) #define context_set_address_width(c, val) \ do {(c).hi &= 0xfffffff8; (c).hi |= (val) & 7;} while(0) #define context_clear_entry(c) do {(c).lo = 0; (c).hi = 0;} while(0) /* page table handling */ #define LEVEL_STRIDE (9) #define LEVEL_MASK ((1 << LEVEL_STRIDE) - 1) #define PTE_NUM (1 << LEVEL_STRIDE) #define level_to_agaw(val) ((val) - 2) #define agaw_to_level(val) ((val) + 2) #define agaw_to_width(val) (30 + val * LEVEL_STRIDE) #define width_to_agaw(w) ((w - 30)/LEVEL_STRIDE) #define level_to_offset_bits(l) (12 + (l - 1) * LEVEL_STRIDE) #define address_level_offset(addr, level) \ ((addr >> level_to_offset_bits(level)) & LEVEL_MASK) #define offset_level_address(offset, level) \ ((u64)(offset) << level_to_offset_bits(level)) #define level_mask(l) (((u64)(-1)) << level_to_offset_bits(l)) #define level_size(l) (1 << level_to_offset_bits(l)) #define align_to_level(addr, l) ((addr + level_size(l) - 1) & level_mask(l)) /* * 0: readable * 1: writable * 2-6: reserved * 7: super page * 8-11: available * 12-63: Host physcial address */ struct dma_pte { u64 val; }; #define DMA_PTE_READ (1) #define DMA_PTE_WRITE (2) #define DMA_PTE_SNP (1 << 11) #define dma_clear_pte(p) do {(p).val = 0;} while(0) #define dma_set_pte_readable(p) do {(p).val |= DMA_PTE_READ;} while(0) #define dma_set_pte_writable(p) do {(p).val |= DMA_PTE_WRITE;} while(0) #define dma_set_pte_superpage(p) do {(p).val |= (1 << 7);} while(0) #define dma_set_pte_snp(p) do {(p).val |= DMA_PTE_SNP;} while(0) #define dma_set_pte_prot(p, prot) \ do {(p).val = ((p).val & ~3) | ((prot) & 3); } while (0) #define dma_pte_addr(p) ((p).val & PAGE_MASK_4K) #define dma_set_pte_addr(p, addr) do {\ (p).val |= ((addr) & PAGE_MASK_4K); } while (0) #define dma_pte_present(p) (((p).val & 3) != 0) /* interrupt remap entry */ struct iremap_entry { union { u64 lo_val; struct { u64 p : 1, fpd : 1, dm : 1, rh : 1, tm : 1, dlm : 3, avail : 4, res_1 : 4, vector : 8, res_2 : 8, dst : 32; }lo; }; union { u64 hi_val; struct { u64 sid : 16, sq : 2, svt : 2, res_1 : 
44; }hi; }; }; /* Max intr remapping table page order is 8, as max number of IRTEs is 64K */ #define IREMAP_PAGE_ORDER 8 /* * VTd engine handles 4K page, while CPU may have different page size on * different arch. E.g. 16K on IPF. */ #define IREMAP_ARCH_PAGE_ORDER (IREMAP_PAGE_ORDER + PAGE_SHIFT_4K - PAGE_SHIFT) #define IREMAP_ARCH_PAGE_NR ( IREMAP_ARCH_PAGE_ORDER < 0 ? \ 1 : \ 1 << IREMAP_ARCH_PAGE_ORDER ) /* Each entry is 16 bytes, so 2^8 entries per 4K page */ #define IREMAP_ENTRY_ORDER ( PAGE_SHIFT - 4 ) #define IREMAP_ENTRY_NR ( 1 << ( IREMAP_PAGE_ORDER + 8 ) ) #define iremap_present(v) ((v).lo & 1) #define iremap_fault_disable(v) (((v).lo >> 1) & 1) #define iremap_set_present(v) do {(v).lo |= 1;} while(0) #define iremap_clear_present(v) do {(v).lo &= ~1;} while(0) /* * Get the intr remap entry: * maddr - machine addr of the table * index - index of the entry * entries - return addr of the page holding this entry, need unmap it * entry - return required entry */ #define GET_IREMAP_ENTRY(maddr, index, entries, entry) \ do { \ entries = (struct iremap_entry *)map_vtd_domain_page( \ (maddr) + (( (index) >> IREMAP_ENTRY_ORDER ) << PAGE_SHIFT ) ); \ entry = &entries[(index) % (1 << IREMAP_ENTRY_ORDER)]; \ } while(0) /* queue invalidation entry */ struct qinval_entry { union { struct { u64 lo; u64 hi; }val; struct { struct { u64 type : 4, granu : 2, res_1 : 10, did : 16, sid : 16, fm : 2, res_2 : 14; }lo; struct { u64 res; }hi; }cc_inv_dsc; struct { struct { u64 type : 4, granu : 2, dw : 1, dr : 1, res_1 : 8, did : 16, res_2 : 32; }lo; struct { u64 am : 6, ih : 1, res_1 : 5, addr : 52; }hi; }iotlb_inv_dsc; struct { struct { u64 type : 4, res_1 : 12, max_invs_pend: 5, res_2 : 11, sid : 16, res_3 : 16; }lo; struct { u64 size : 1, res_1 : 11, addr : 52; }hi; }dev_iotlb_inv_dsc; struct { struct { u64 type : 4, granu : 1, res_1 : 22, im : 5, iidx : 16, res_2 : 16; }lo; struct { u64 res; }hi; }iec_inv_dsc; struct { struct { u64 type : 4, iflag : 1, sw : 1, fn : 1, res_1 : 25, sdata : 32; }lo; struct { u64 res_1 : 2, saddr : 62; }hi; }inv_wait_dsc; }q; }; /* Order of queue invalidation pages(max is 8) */ #define QINVAL_PAGE_ORDER 2 #define QINVAL_ARCH_PAGE_ORDER (QINVAL_PAGE_ORDER + PAGE_SHIFT_4K - PAGE_SHIFT) #define QINVAL_ARCH_PAGE_NR ( QINVAL_ARCH_PAGE_ORDER < 0 ? 
\ 1 : \ 1 << QINVAL_ARCH_PAGE_ORDER ) /* Each entry is 16 bytes, so 2^8 entries per page */ #define QINVAL_ENTRY_ORDER ( PAGE_SHIFT - 4 ) #define QINVAL_ENTRY_NR (1 << (QINVAL_PAGE_ORDER + 8)) /* Status data flag */ #define QINVAL_STAT_INIT 0 #define QINVAL_STAT_DONE 1 /* Queue invalidation head/tail shift */ #define QINVAL_INDEX_SHIFT 4 #define qinval_present(v) ((v).lo & 1) #define qinval_fault_disable(v) (((v).lo >> 1) & 1) #define qinval_set_present(v) do {(v).lo |= 1;} while(0) #define qinval_clear_present(v) do {(v).lo &= ~1;} while(0) #define RESERVED_VAL 0 #define TYPE_INVAL_CONTEXT 0x1 #define TYPE_INVAL_IOTLB 0x2 #define TYPE_INVAL_DEVICE_IOTLB 0x3 #define TYPE_INVAL_IEC 0x4 #define TYPE_INVAL_WAIT 0x5 #define NOTIFY_TYPE_POLL 1 #define NOTIFY_TYPE_INTR 1 #define INTERRUTP_FLAG 1 #define STATUS_WRITE 1 #define FENCE_FLAG 1 #define IEC_GLOBAL_INVL 0 #define IEC_INDEX_INVL 1 #define IRTA_EIME (((u64)1) << 11) /* 2^(IRTA_REG_TABLE_SIZE + 1) = IREMAP_ENTRY_NR */ #define IRTA_REG_TABLE_SIZE ( IREMAP_PAGE_ORDER + 7 ) #define VTD_PAGE_TABLE_LEVEL_3 3 #define VTD_PAGE_TABLE_LEVEL_4 4 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 #define MAX_IOMMU_REGS 0xc0 extern struct list_head acpi_drhd_units; extern struct list_head acpi_rmrr_units; extern struct list_head acpi_ioapic_units; struct qi_ctrl { u64 qinval_maddr; /* queue invalidation page machine address */ int qinval_index; /* queue invalidation index */ spinlock_t qinval_lock; /* lock for queue invalidation page */ }; struct ir_ctrl { u64 iremap_maddr; /* interrupt remap table machine address */ int iremap_num; /* total num of used interrupt remap entry */ spinlock_t iremap_lock; /* lock for irq remappping table */ }; struct iommu_flush { int (*context)(void *iommu, u16 did, u16 source_id, u8 function_mask, u64 type, int non_present_entry_flush); int (*iotlb)(void *iommu, u16 did, u64 addr, unsigned int size_order, u64 type, int flush_non_present_entry, int flush_dev_iotlb); }; struct intel_iommu { struct qi_ctrl qi_ctrl; struct ir_ctrl ir_ctrl; struct iommu_flush flush; struct acpi_drhd_unit *drhd; }; struct iommu { struct list_head list; void __iomem *reg; /* Pointer to hardware regs, virtual addr */ u32 index; /* Sequence number of iommu */ u32 nr_pt_levels; u64 cap; u64 ecap; spinlock_t lock; /* protect context, domain ids */ spinlock_t register_lock; /* protect iommu register handling */ u64 root_maddr; /* root entry machine address */ struct msi_desc msi; struct intel_iommu *intel; unsigned long *domid_bitmap; /* domain id bitmap */ u16 *domid_map; /* domain id mapping array */ }; static inline struct qi_ctrl *iommu_qi_ctrl(struct iommu *iommu) { return iommu ? &iommu->intel->qi_ctrl : NULL; } static inline struct ir_ctrl *iommu_ir_ctrl(struct iommu *iommu) { return iommu ? &iommu->intel->ir_ctrl : NULL; } static inline struct iommu_flush *iommu_get_flush(struct iommu *iommu) { return iommu ? &iommu->intel->flush : NULL; } #define INTEL_IOMMU_DEBUG(fmt, args...) \ do \ { \ if ( iommu_debug ) \ dprintk(XENLOG_WARNING VTDPREFIX, fmt, ## args); \ } while(0) #endif xen-4.4.0/xen/drivers/passthrough/vtd/vtd.h0000664000175000017500000000362512307313555017033 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. 
* * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Copyright (C) Allen Kay * Copyright (C) Weidong Han */ #ifndef _VTD_H_ #define _VTD_H_ #include #define MAP_ME_PHANTOM_FUNC 1 #define UNMAP_ME_PHANTOM_FUNC 0 /* Allow for both IOAPIC and IOSAPIC. */ #define IO_xAPIC_route_entry IO_APIC_route_entry struct IO_APIC_route_remap_entry { union { u64 val; struct { u64 vector:8, delivery_mode:3, index_15:1, delivery_status:1, polarity:1, irr:1, trigger:1, mask:1, reserved:31, format:1, index_0_14:15; }; }; }; struct msi_msg_remap_entry { union { u32 val; struct { u32 dontcare:2, index_15:1, SHV:1, format:1, index_0_14:15, addr_id_val:12; /* Interrupt address identifier value, must be 0FEEh */ }; } address_lo; /* low 32 bits of msi message address */ u32 address_hi; /* high 32 bits of msi message address */ u32 data; /* msi message data */ }; #endif // _VTD_H_ xen-4.4.0/xen/drivers/passthrough/vtd/dmar.c0000664000175000017500000006217612307313555017162 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Copyright (C) Ashok Raj * Copyright (C) Shaohua Li * Copyright (C) Allen Kay - adapted to xen */ #include #include #include #include #include #include #include #include #include #include #include "dmar.h" #include "iommu.h" #include "extern.h" #include "vtd.h" #undef PREFIX #define PREFIX VTDPREFIX "ACPI DMAR:" #define DEBUG #define MIN_SCOPE_LEN (sizeof(struct acpi_dmar_device_scope) + \ sizeof(struct acpi_dmar_pci_path)) LIST_HEAD_READ_MOSTLY(acpi_drhd_units); LIST_HEAD_READ_MOSTLY(acpi_rmrr_units); static LIST_HEAD_READ_MOSTLY(acpi_atsr_units); static LIST_HEAD_READ_MOSTLY(acpi_rhsa_units); static struct acpi_table_header *__read_mostly dmar_table; static int __read_mostly dmar_flags; static u64 __read_mostly igd_drhd_address; static void __init dmar_scope_add_buses(struct dmar_scope *scope, u16 sec_bus, u16 sub_bus) { sub_bus &= 0xff; if (sec_bus > sub_bus) return; while ( sec_bus <= sub_bus ) set_bit(sec_bus++, scope->buses); } static int __init acpi_register_drhd_unit(struct acpi_drhd_unit *drhd) { /* * add INCLUDE_ALL at the tail, so scan the list will find it at * the very end. 
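 *
 * Keeping the catch-all unit at the tail means list walks meet every DRHD
 * with an explicit device scope first; acpi_find_matched_drhd_unit()
 * further below returns an INCLUDE_ALL unit only when no scope-specific
 * DRHD matched the device.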
*/ if ( drhd->include_all ) list_add_tail(&drhd->list, &acpi_drhd_units); else list_add(&drhd->list, &acpi_drhd_units); return 0; } static int __init acpi_register_rmrr_unit(struct acpi_rmrr_unit *rmrr) { list_add(&rmrr->list, &acpi_rmrr_units); return 0; } static void __init disable_all_dmar_units(void) { struct acpi_drhd_unit *drhd, *_drhd; struct acpi_rmrr_unit *rmrr, *_rmrr; struct acpi_atsr_unit *atsr, *_atsr; list_for_each_entry_safe ( drhd, _drhd, &acpi_drhd_units, list ) { list_del(&drhd->list); xfree(drhd); } list_for_each_entry_safe ( rmrr, _rmrr, &acpi_rmrr_units, list ) { list_del(&rmrr->list); xfree(rmrr); } list_for_each_entry_safe ( atsr, _atsr, &acpi_atsr_units, list ) { list_del(&atsr->list); xfree(atsr); } } static int acpi_ioapic_device_match( struct list_head *ioapic_list, unsigned int apic_id) { struct acpi_ioapic_unit *ioapic; list_for_each_entry( ioapic, ioapic_list, list ) { if (ioapic->apic_id == apic_id) return 1; } return 0; } struct acpi_drhd_unit * ioapic_to_drhd(unsigned int apic_id) { struct acpi_drhd_unit *drhd; list_for_each_entry( drhd, &acpi_drhd_units, list ) if ( acpi_ioapic_device_match(&drhd->ioapic_list, apic_id) ) return drhd; return NULL; } struct acpi_drhd_unit * iommu_to_drhd(struct iommu *iommu) { struct acpi_drhd_unit *drhd; if ( iommu == NULL ) return NULL; list_for_each_entry( drhd, &acpi_drhd_units, list ) if ( drhd->iommu == iommu ) return drhd; return NULL; } struct iommu * ioapic_to_iommu(unsigned int apic_id) { struct acpi_drhd_unit *drhd; list_for_each_entry( drhd, &acpi_drhd_units, list ) if ( acpi_ioapic_device_match(&drhd->ioapic_list, apic_id) ) return drhd->iommu; return NULL; } static bool_t acpi_hpet_device_match( struct list_head *list, unsigned int hpet_id) { struct acpi_hpet_unit *hpet; list_for_each_entry( hpet, list, list ) if (hpet->id == hpet_id) return 1; return 0; } struct acpi_drhd_unit *hpet_to_drhd(unsigned int hpet_id) { struct acpi_drhd_unit *drhd; list_for_each_entry( drhd, &acpi_drhd_units, list ) if ( acpi_hpet_device_match(&drhd->hpet_list, hpet_id) ) return drhd; return NULL; } struct iommu *hpet_to_iommu(unsigned int hpet_id) { struct acpi_drhd_unit *drhd = hpet_to_drhd(hpet_id); return drhd ? drhd->iommu : NULL; } static int __init acpi_register_atsr_unit(struct acpi_atsr_unit *atsr) { /* * add ALL_PORTS at the tail, so scan the list will find it at * the very end. */ if ( atsr->all_ports ) list_add_tail(&atsr->list, &acpi_atsr_units); else list_add(&atsr->list, &acpi_atsr_units); return 0; } struct acpi_drhd_unit *acpi_find_matched_drhd_unit(const struct pci_dev *pdev) { u8 bus, devfn; struct acpi_drhd_unit *drhd; struct acpi_drhd_unit *include_all = NULL; int i; if ( pdev == NULL ) return NULL; if ( pdev->info.is_extfn ) { bus = pdev->bus; devfn = 0; } else if ( pdev->info.is_virtfn ) { bus = pdev->info.physfn.bus; devfn = PCI_SLOT(pdev->info.physfn.devfn) ? 
0 : pdev->info.physfn.devfn; } else { bus = pdev->bus; devfn = pdev->devfn; } list_for_each_entry ( drhd, &acpi_drhd_units, list ) { if ( drhd->segment != pdev->seg ) continue; for (i = 0; i < drhd->scope.devices_cnt; i++) if ( drhd->scope.devices[i] == PCI_BDF2(bus, devfn) ) return drhd; if ( test_bit(bus, drhd->scope.buses) ) return drhd; if ( drhd->include_all ) include_all = drhd; } return include_all; } struct acpi_atsr_unit *acpi_find_matched_atsr_unit(const struct pci_dev *pdev) { struct acpi_atsr_unit *atsr; struct acpi_atsr_unit *all_ports = NULL; list_for_each_entry ( atsr, &acpi_atsr_units, list ) { if ( atsr->segment != pdev->seg ) continue; if ( test_bit(pdev->bus, atsr->scope.buses) ) return atsr; if ( atsr->all_ports ) all_ports = atsr; } return all_ports; } struct acpi_rhsa_unit * drhd_to_rhsa(struct acpi_drhd_unit *drhd) { struct acpi_rhsa_unit *rhsa; if ( drhd == NULL ) return NULL; list_for_each_entry ( rhsa, &acpi_rhsa_units, list ) { if ( rhsa->address == drhd->address ) return rhsa; } return NULL; } int is_igd_drhd(struct acpi_drhd_unit *drhd) { return drhd && (drhd->address == igd_drhd_address); } /* * Count number of devices in device scope. Do not include PCI sub * hierarchies. */ static int __init scope_device_count(const void *start, const void *end) { const struct acpi_dmar_device_scope *scope; int count = 0; while ( start < end ) { scope = start; if ( scope->length < MIN_SCOPE_LEN ) { dprintk(XENLOG_WARNING VTDPREFIX, "Invalid device scope.\n"); return -EINVAL; } if ( scope->entry_type == ACPI_DMAR_SCOPE_TYPE_BRIDGE || scope->entry_type == ACPI_DMAR_SCOPE_TYPE_ENDPOINT || scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC || scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET ) count++; start += scope->length; } return count; } static int __init acpi_parse_dev_scope( const void *start, const void *end, struct dmar_scope *scope, int type, u16 seg) { struct acpi_ioapic_unit *acpi_ioapic_unit; const struct acpi_dmar_device_scope *acpi_scope; u16 bus, sub_bus, sec_bus; const struct acpi_dmar_pci_path *path; struct acpi_drhd_unit *drhd = type == DMAR_TYPE ? 
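/* Only a DRHD scope is embedded in an acpi_drhd_unit; recovering the
 * container here lets the IOAPIC/HPET/endpoint cases below hang their
 * sub-entries (or record the IGD address) off that unit, while RMRR and
 * ATSR scopes simply leave drhd == NULL. */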
container_of(scope, struct acpi_drhd_unit, scope) : NULL; int depth, cnt, didx = 0, ret; if ( (cnt = scope_device_count(start, end)) < 0 ) return cnt; scope->devices_cnt = cnt; if ( cnt > 0 ) { scope->devices = xzalloc_array(u16, cnt); if ( !scope->devices ) return -ENOMEM; } while ( start < end ) { acpi_scope = start; path = (const void *)(acpi_scope + 1); depth = (acpi_scope->length - sizeof(*acpi_scope)) / sizeof(*path); bus = acpi_scope->bus; while ( --depth > 0 ) { bus = pci_conf_read8(seg, bus, path->dev, path->fn, PCI_SECONDARY_BUS); path++; } switch ( acpi_scope->entry_type ) { case ACPI_DMAR_SCOPE_TYPE_BRIDGE: sec_bus = pci_conf_read8(seg, bus, path->dev, path->fn, PCI_SECONDARY_BUS); sub_bus = pci_conf_read8(seg, bus, path->dev, path->fn, PCI_SUBORDINATE_BUS); if ( iommu_verbose ) dprintk(VTDPREFIX, " bridge: %04x:%02x:%02x.%u start=%x sec=%x sub=%x\n", seg, bus, path->dev, path->fn, acpi_scope->bus, sec_bus, sub_bus); dmar_scope_add_buses(scope, sec_bus, sub_bus); break; case ACPI_DMAR_SCOPE_TYPE_HPET: if ( iommu_verbose ) dprintk(VTDPREFIX, " MSI HPET: %04x:%02x:%02x.%u\n", seg, bus, path->dev, path->fn); if ( drhd ) { struct acpi_hpet_unit *acpi_hpet_unit; ret = -ENOMEM; acpi_hpet_unit = xmalloc(struct acpi_hpet_unit); if ( !acpi_hpet_unit ) goto out; acpi_hpet_unit->id = acpi_scope->enumeration_id; acpi_hpet_unit->bus = bus; acpi_hpet_unit->dev = path->dev; acpi_hpet_unit->func = path->fn; list_add(&acpi_hpet_unit->list, &drhd->hpet_list); } break; case ACPI_DMAR_SCOPE_TYPE_ENDPOINT: if ( iommu_verbose ) dprintk(VTDPREFIX, " endpoint: %04x:%02x:%02x.%u\n", seg, bus, path->dev, path->fn); if ( drhd ) { if ( (seg == 0) && (bus == 0) && (path->dev == 2) && (path->fn == 0) ) igd_drhd_address = drhd->address; } break; case ACPI_DMAR_SCOPE_TYPE_IOAPIC: if ( iommu_verbose ) dprintk(VTDPREFIX, " IOAPIC: %04x:%02x:%02x.%u\n", seg, bus, path->dev, path->fn); if ( drhd ) { ret = -ENOMEM; acpi_ioapic_unit = xmalloc(struct acpi_ioapic_unit); if ( !acpi_ioapic_unit ) goto out; acpi_ioapic_unit->apic_id = acpi_scope->enumeration_id; acpi_ioapic_unit->ioapic.bdf.bus = bus; acpi_ioapic_unit->ioapic.bdf.dev = path->dev; acpi_ioapic_unit->ioapic.bdf.func = path->fn; list_add(&acpi_ioapic_unit->list, &drhd->ioapic_list); } break; default: if ( iommu_verbose ) printk(XENLOG_WARNING VTDPREFIX "Unknown scope type %#x\n", acpi_scope->entry_type); start += acpi_scope->length; continue; } scope->devices[didx++] = PCI_BDF(bus, path->dev, path->fn); start += acpi_scope->length; } ret = 0; out: if ( ret ) xfree(scope->devices); return ret; } static int __init acpi_dmar_check_length( const struct acpi_dmar_header *h, unsigned int min_len) { if ( h->length >= min_len ) return 0; dprintk(XENLOG_ERR VTDPREFIX, "Invalid ACPI DMAR entry length: %#x\n", h->length); return -EINVAL; } static int __init acpi_parse_one_drhd(struct acpi_dmar_header *header) { struct acpi_dmar_hardware_unit *drhd = container_of(header, struct acpi_dmar_hardware_unit, header); void *dev_scope_start, *dev_scope_end; struct acpi_drhd_unit *dmaru; int ret; static int include_all = 0; if ( (ret = acpi_dmar_check_length(header, sizeof(*drhd))) != 0 ) return ret; if ( !drhd->address || !(drhd->address + 1) ) return -ENODEV; dmaru = xzalloc(struct acpi_drhd_unit); if ( !dmaru ) return -ENOMEM; dmaru->address = drhd->address; dmaru->segment = drhd->segment; dmaru->include_all = drhd->flags & ACPI_DMAR_INCLUDE_ALL; INIT_LIST_HEAD(&dmaru->ioapic_list); INIT_LIST_HEAD(&dmaru->hpet_list); if ( iommu_verbose ) dprintk(VTDPREFIX, " dmaru->address = 
%"PRIx64"\n", dmaru->address); ret = iommu_alloc(dmaru); if ( ret ) goto out; dev_scope_start = (void *)(drhd + 1); dev_scope_end = ((void *)drhd) + header->length; ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end, &dmaru->scope, DMAR_TYPE, drhd->segment); if ( dmaru->include_all ) { if ( iommu_verbose ) dprintk(VTDPREFIX, " flags: INCLUDE_ALL\n"); /* Only allow one INCLUDE_ALL */ if ( drhd->segment == 0 && include_all ) { dprintk(XENLOG_WARNING VTDPREFIX, "Only one INCLUDE_ALL device scope is allowed\n"); ret = -EINVAL; } if ( drhd->segment == 0 ) include_all = 1; } if ( ret ) goto out; else if ( force_iommu || dmaru->include_all ) acpi_register_drhd_unit(dmaru); else { u8 b, d, f; unsigned int i = 0, invalid_cnt = 0; union { const void *raw; const struct acpi_dmar_device_scope *scope; } p; /* Skip checking if segment is not accessible yet. */ if ( !pci_known_segment(drhd->segment) ) i = UINT_MAX; for ( p.raw = dev_scope_start; i < dmaru->scope.devices_cnt; i++, p.raw += p.scope->length ) { if ( p.scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC || p.scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET ) continue; b = PCI_BUS(dmaru->scope.devices[i]); d = PCI_SLOT(dmaru->scope.devices[i]); f = PCI_FUNC(dmaru->scope.devices[i]); if ( pci_device_detect(drhd->segment, b, d, f) == 0 ) { dprintk(XENLOG_WARNING VTDPREFIX, " Non-existent device (%04x:%02x:%02x.%u) is reported" " in this DRHD's scope!\n", drhd->segment, b, d, f); invalid_cnt++; } } if ( invalid_cnt ) { if ( iommu_workaround_bios_bug && invalid_cnt == dmaru->scope.devices_cnt ) { dprintk(XENLOG_WARNING VTDPREFIX, " Workaround BIOS bug: ignore the DRHD due to all " "devices under its scope are not PCI discoverable!\n"); iommu_free(dmaru); xfree(dmaru); } else { dprintk(XENLOG_WARNING VTDPREFIX, " The DRHD is invalid due to there are devices under " "its scope are not PCI discoverable! Pls try option " "iommu=force or iommu=workaround_bios_bug if you " "really want VT-d\n"); ret = -EINVAL; } } else acpi_register_drhd_unit(dmaru); } out: if ( ret ) { iommu_free(dmaru); xfree(dmaru); } return ret; } static int __init acpi_parse_one_rmrr(struct acpi_dmar_header *header) { struct acpi_dmar_reserved_memory *rmrr = container_of(header, struct acpi_dmar_reserved_memory, header); struct acpi_rmrr_unit *rmrru; void *dev_scope_start, *dev_scope_end; u64 base_addr = rmrr->base_address, end_addr = rmrr->end_address; int ret; if ( (ret = acpi_dmar_check_length(header, sizeof(*rmrr))) != 0 ) return ret; /* This check is here simply to detect when RMRR values are * not properly represented in the system memory map and * inform the user */ if ( (!page_is_ram_type(paddr_to_pfn(base_addr), RAM_TYPE_RESERVED)) || (!page_is_ram_type(paddr_to_pfn(end_addr), RAM_TYPE_RESERVED)) ) { dprintk(XENLOG_WARNING VTDPREFIX, " RMRR address range not in reserved memory " "base = %"PRIx64" end = %"PRIx64"; " "iommu_inclusive_mapping=1 parameter may be needed.\n", base_addr, end_addr); } rmrru = xzalloc(struct acpi_rmrr_unit); if ( !rmrru ) return -ENOMEM; rmrru->base_address = base_addr; rmrru->end_address = end_addr; rmrru->segment = rmrr->segment; dev_scope_start = (void *)(rmrr + 1); dev_scope_end = ((void *)rmrr) + header->length; ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end, &rmrru->scope, RMRR_TYPE, rmrr->segment); if ( ret || (rmrru->scope.devices_cnt == 0) ) xfree(rmrru); else { u8 b, d, f; bool_t ignore = 0; unsigned int i = 0; /* Skip checking if segment is not accessible yet. 
*/ if ( !pci_known_segment(rmrr->segment) ) i = UINT_MAX; for ( ; i < rmrru->scope.devices_cnt; i++ ) { b = PCI_BUS(rmrru->scope.devices[i]); d = PCI_SLOT(rmrru->scope.devices[i]); f = PCI_FUNC(rmrru->scope.devices[i]); if ( pci_device_detect(rmrr->segment, b, d, f) == 0 ) { dprintk(XENLOG_WARNING VTDPREFIX, " Non-existent device (%04x:%02x:%02x.%u) is reported" " in RMRR (%"PRIx64", %"PRIx64")'s scope!\n", rmrr->segment, b, d, f, rmrru->base_address, rmrru->end_address); ignore = 1; } else { ignore = 0; break; } } if ( ignore ) { dprintk(XENLOG_WARNING VTDPREFIX, " Ignore the RMRR (%"PRIx64", %"PRIx64") due to " "devices under its scope are not PCI discoverable!\n", rmrru->base_address, rmrru->end_address); xfree(rmrru); } else if ( base_addr > end_addr ) { dprintk(XENLOG_WARNING VTDPREFIX, " The RMRR (%"PRIx64", %"PRIx64") is incorrect!\n", rmrru->base_address, rmrru->end_address); xfree(rmrru); ret = -EFAULT; } else { if ( iommu_verbose ) dprintk(VTDPREFIX, " RMRR region: base_addr %"PRIx64 " end_address %"PRIx64"\n", rmrru->base_address, rmrru->end_address); acpi_register_rmrr_unit(rmrru); } } return ret; } static int __init acpi_parse_one_atsr(struct acpi_dmar_header *header) { struct acpi_dmar_atsr *atsr = container_of(header, struct acpi_dmar_atsr, header); struct acpi_atsr_unit *atsru; int ret; static int all_ports; void *dev_scope_start, *dev_scope_end; if ( (ret = acpi_dmar_check_length(header, sizeof(*atsr))) != 0 ) return ret; atsru = xzalloc(struct acpi_atsr_unit); if ( !atsru ) return -ENOMEM; atsru->segment = atsr->segment; atsru->all_ports = atsr->flags & ACPI_DMAR_ALL_PORTS; if ( iommu_verbose ) dprintk(VTDPREFIX, " atsru->all_ports: %x\n", atsru->all_ports); if ( !atsru->all_ports ) { dev_scope_start = (void *)(atsr + 1); dev_scope_end = ((void *)atsr) + header->length; ret = acpi_parse_dev_scope(dev_scope_start, dev_scope_end, &atsru->scope, ATSR_TYPE, atsr->segment); } else { if ( iommu_verbose ) dprintk(VTDPREFIX, " flags: ALL_PORTS\n"); /* Only allow one ALL_PORTS */ if ( atsr->segment == 0 && all_ports ) { dprintk(XENLOG_WARNING VTDPREFIX, "Only one ALL_PORTS device scope is allowed\n"); ret = -EINVAL; } if ( atsr->segment == 0 ) all_ports = 1; } if ( ret ) xfree(atsru); else acpi_register_atsr_unit(atsru); return ret; } static int __init acpi_parse_one_rhsa(struct acpi_dmar_header *header) { struct acpi_dmar_rhsa *rhsa = container_of(header, struct acpi_dmar_rhsa, header); struct acpi_rhsa_unit *rhsau; int ret; if ( (ret = acpi_dmar_check_length(header, sizeof(*rhsa))) != 0 ) return ret; rhsau = xzalloc(struct acpi_rhsa_unit); if ( !rhsau ) return -ENOMEM; rhsau->address = rhsa->base_address; rhsau->proximity_domain = rhsa->proximity_domain; list_add_tail(&rhsau->list, &acpi_rhsa_units); if ( iommu_verbose ) dprintk(VTDPREFIX, " rhsau->address: %"PRIx64 " rhsau->proximity_domain: %"PRIx32"\n", rhsau->address, rhsau->proximity_domain); return ret; } static int __init acpi_parse_dmar(struct acpi_table_header *table) { struct acpi_table_dmar *dmar; struct acpi_dmar_header *entry_header; u8 dmar_host_address_width; int ret = 0; dmar = (struct acpi_table_dmar *)table; dmar_flags = dmar->flags; if ( !iommu_enable && !iommu_intremap ) { ret = -EINVAL; goto out; } if ( !dmar->width ) { dprintk(XENLOG_WARNING VTDPREFIX, "Zero: Invalid DMAR width\n"); ret = -EINVAL; goto out; } dmar_host_address_width = dmar->width + 1; if ( iommu_verbose ) dprintk(VTDPREFIX, "Host address width %d\n", dmar_host_address_width); entry_header = (void *)(dmar + 1); while ( ((unsigned 
long)entry_header) < (((unsigned long)dmar) + table->length) ) { ret = acpi_dmar_check_length(entry_header, sizeof(*entry_header)); if ( ret ) break; switch ( entry_header->type ) { case ACPI_DMAR_TYPE_HARDWARE_UNIT: if ( iommu_verbose ) dprintk(VTDPREFIX, "found ACPI_DMAR_DRHD:\n"); ret = acpi_parse_one_drhd(entry_header); break; case ACPI_DMAR_TYPE_RESERVED_MEMORY: if ( iommu_verbose ) dprintk(VTDPREFIX, "found ACPI_DMAR_RMRR:\n"); ret = acpi_parse_one_rmrr(entry_header); break; case ACPI_DMAR_TYPE_ATSR: if ( iommu_verbose ) dprintk(VTDPREFIX, "found ACPI_DMAR_ATSR:\n"); ret = acpi_parse_one_atsr(entry_header); break; case ACPI_DMAR_HARDWARE_AFFINITY: if ( iommu_verbose ) dprintk(VTDPREFIX, "found ACPI_DMAR_RHSA:\n"); ret = acpi_parse_one_rhsa(entry_header); break; default: dprintk(XENLOG_WARNING VTDPREFIX, "Ignore unknown DMAR structure type (%#x)\n", entry_header->type); break; } if ( ret ) break; entry_header = ((void *)entry_header + entry_header->length); } if ( ret ) { printk(XENLOG_WARNING "Failed to parse ACPI DMAR. Disabling VT-d.\n"); disable_all_dmar_units(); } out: /* Zap ACPI DMAR signature to prevent dom0 using vt-d HW. */ dmar->header.signature[0] = 'X'; dmar->header.checksum -= 'X'-'D'; return ret; } #include /* ACPI tables may not be DMA protected by tboot, so use DMAR copy */ /* SINIT saved in SinitMleData in TXT heap (which is DMA protected) */ #define parse_dmar_table(h) tboot_parse_dmar_table(h) int __init acpi_dmar_init(void) { acpi_physical_address dmar_addr; acpi_native_uint dmar_len; if ( ACPI_SUCCESS(acpi_get_table_phys(ACPI_SIG_DMAR, 0, &dmar_addr, &dmar_len)) ) { map_pages_to_xen((unsigned long)__va(dmar_addr), PFN_DOWN(dmar_addr), PFN_UP(dmar_addr + dmar_len) - PFN_DOWN(dmar_addr), PAGE_HYPERVISOR); dmar_table = __va(dmar_addr); } return parse_dmar_table(acpi_parse_dmar); } void acpi_dmar_reinstate(void) { if ( dmar_table == NULL ) return; dmar_table->signature[0] = 'D'; dmar_table->checksum += 'X'-'D'; } void acpi_dmar_zap(void) { if ( dmar_table == NULL ) return; dmar_table->signature[0] = 'X'; dmar_table->checksum -= 'X'-'D'; } int platform_supports_intremap(void) { unsigned int mask = ACPI_DMAR_INTR_REMAP; return (dmar_flags & mask) == ACPI_DMAR_INTR_REMAP; } int platform_supports_x2apic(void) { unsigned int mask = ACPI_DMAR_INTR_REMAP | ACPI_DMAR_X2APIC_OPT_OUT; return cpu_has_x2apic && ((dmar_flags & mask) == ACPI_DMAR_INTR_REMAP); } xen-4.4.0/xen/drivers/passthrough/vtd/utils.c0000664000175000017500000002434412307313555017372 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
* * Copyright (C) Allen Kay */ #include #include #include #include #include #include #include "iommu.h" #include "dmar.h" #include "vtd.h" #include "extern.h" #include int is_usb_device(u16 seg, u8 bus, u8 devfn) { u16 class = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), PCI_CLASS_DEVICE); return (class == 0xc03); } /* Disable vt-d protected memory registers. */ void disable_pmr(struct iommu *iommu) { u32 val; unsigned long flags; val = dmar_readl(iommu->reg, DMAR_PMEN_REG); if ( !(val & DMA_PMEN_PRS) ) return; spin_lock_irqsave(&iommu->register_lock, flags); dmar_writel(iommu->reg, DMAR_PMEN_REG, val & ~DMA_PMEN_EPM); IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG, dmar_readl, !(val & DMA_PMEN_PRS), val); spin_unlock_irqrestore(&iommu->register_lock, flags); dprintk(XENLOG_INFO VTDPREFIX, "Disabled protected memory registers\n"); } void print_iommu_regs(struct acpi_drhd_unit *drhd) { struct iommu *iommu = drhd->iommu; u64 cap; printk("---- print_iommu_regs ----\n"); printk(" drhd->address = %"PRIx64"\n", drhd->address); printk(" VER = %x\n", dmar_readl(iommu->reg, DMAR_VER_REG)); printk(" CAP = %"PRIx64"\n", cap = dmar_readq(iommu->reg, DMAR_CAP_REG)); printk(" n_fault_reg = %"PRIx64"\n", cap_num_fault_regs(cap)); printk(" fault_recording_offset = %"PRIx64"\n", cap_fault_reg_offset(cap)); if ( cap_fault_reg_offset(cap) < PAGE_SIZE ) { printk(" fault_recording_reg_l = %"PRIx64"\n", dmar_readq(iommu->reg, cap_fault_reg_offset(cap))); printk(" fault_recording_reg_h = %"PRIx64"\n", dmar_readq(iommu->reg, cap_fault_reg_offset(cap) + 8)); } printk(" ECAP = %"PRIx64"\n", dmar_readq(iommu->reg, DMAR_ECAP_REG)); printk(" GCMD = %x\n", dmar_readl(iommu->reg, DMAR_GCMD_REG)); printk(" GSTS = %x\n", dmar_readl(iommu->reg, DMAR_GSTS_REG)); printk(" RTADDR = %"PRIx64"\n", dmar_readq(iommu->reg,DMAR_RTADDR_REG)); printk(" CCMD = %"PRIx64"\n", dmar_readq(iommu->reg, DMAR_CCMD_REG)); printk(" FSTS = %x\n", dmar_readl(iommu->reg, DMAR_FSTS_REG)); printk(" FECTL = %x\n", dmar_readl(iommu->reg, DMAR_FECTL_REG)); printk(" FEDATA = %x\n", dmar_readl(iommu->reg, DMAR_FEDATA_REG)); printk(" FEADDR = %x\n", dmar_readl(iommu->reg, DMAR_FEADDR_REG)); printk(" FEUADDR = %x\n", dmar_readl(iommu->reg, DMAR_FEUADDR_REG)); } static u32 get_level_index(unsigned long gmfn, int level) { while ( --level ) gmfn = gmfn >> LEVEL_STRIDE; return gmfn & LEVEL_MASK; } void print_vtd_entries(struct iommu *iommu, int bus, int devfn, u64 gmfn) { struct context_entry *ctxt_entry; struct root_entry *root_entry; struct dma_pte pte; u64 *l, val; u32 l_index, level; printk("print_vtd_entries: iommu %p dev %04x:%02x:%02x.%u gmfn %"PRIx64"\n", iommu, iommu->intel->drhd->segment, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), gmfn); if ( iommu->root_maddr == 0 ) { printk(" iommu->root_maddr = 0\n"); return; } root_entry = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr); if ( root_entry == NULL ) { printk(" root_entry == NULL\n"); return; } printk(" root_entry = %p\n", root_entry); printk(" root_entry[%x] = %"PRIx64"\n", bus, root_entry[bus].val); if ( !root_present(root_entry[bus]) ) { unmap_vtd_domain_page(root_entry); printk(" root_entry[%x] not present\n", bus); return; } val = root_entry[bus].val; unmap_vtd_domain_page(root_entry); ctxt_entry = map_vtd_domain_page(val); if ( ctxt_entry == NULL ) { printk(" ctxt_entry == NULL\n"); return; } printk(" context = %p\n", ctxt_entry); val = ctxt_entry[devfn].lo; printk(" context[%x] = %"PRIx64"_%"PRIx64"\n", devfn, ctxt_entry[devfn].hi, val); if ( !context_present(ctxt_entry[devfn]) ) { 
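/* No translation installed for this devfn: release the context-table
 * mapping and stop the dump here. */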
unmap_vtd_domain_page(ctxt_entry); printk(" ctxt_entry[%x] not present\n", devfn); return; } level = agaw_to_level(context_address_width(ctxt_entry[devfn])); unmap_vtd_domain_page(ctxt_entry); if ( level != VTD_PAGE_TABLE_LEVEL_3 && level != VTD_PAGE_TABLE_LEVEL_4) { printk("Unsupported VTD page table level (%d)!\n", level); return; } do { l = map_vtd_domain_page(val); printk(" l%d = %p\n", level, l); if ( l == NULL ) { printk(" l%d == NULL\n", level); break; } l_index = get_level_index(gmfn, level); printk(" l%d_index = %x\n", level, l_index); pte.val = val = l[l_index]; unmap_vtd_domain_page(l); printk(" l%d[%x] = %"PRIx64"\n", level, l_index, val); pte.val = val; if ( !dma_pte_present(pte) ) { printk(" l%d[%x] not present\n", level, l_index); break; } } while ( --level ); } static void dump_iommu_info(unsigned char key) { struct acpi_drhd_unit *drhd; struct iommu *iommu; int i; for_each_drhd_unit ( drhd ) { u32 status = 0; iommu = drhd->iommu; printk("\niommu %x: nr_pt_levels = %x.\n", iommu->index, iommu->nr_pt_levels); if ( ecap_queued_inval(iommu->ecap) || ecap_intr_remap(iommu->ecap) ) status = dmar_readl(iommu->reg, DMAR_GSTS_REG); printk(" Queued Invalidation: %ssupported%s.\n", ecap_queued_inval(iommu->ecap) ? "" : "not ", (status & DMA_GSTS_QIES) ? " and enabled" : "" ); printk(" Interrupt Remapping: %ssupported%s.\n", ecap_intr_remap(iommu->ecap) ? "" : "not ", (status & DMA_GSTS_IRES) ? " and enabled" : "" ); if ( status & DMA_GSTS_IRES ) { /* Dump interrupt remapping table. */ u64 iremap_maddr = dmar_readq(iommu->reg, DMAR_IRTA_REG); int nr_entry = 1 << ((iremap_maddr & 0xF) + 1); struct iremap_entry *iremap_entries = NULL; int print_cnt = 0; printk(" Interrupt remapping table (nr_entry=%#x. " "Only dump P=1 entries here):\n", nr_entry); printk(" SVT SQ SID DST V AVL DLM TM RH DM " "FPD P\n"); for ( i = 0; i < nr_entry; i++ ) { struct iremap_entry *p; if ( i % (1 << IREMAP_ENTRY_ORDER) == 0 ) { /* This entry across page boundry */ if ( iremap_entries ) unmap_vtd_domain_page(iremap_entries); GET_IREMAP_ENTRY(iremap_maddr, i, iremap_entries, p); } else p = &iremap_entries[i % (1 << IREMAP_ENTRY_ORDER)]; if ( !p->lo.p ) continue; printk(" %04x: %x %x %04x %08x %02x %x %x %x %x %x" " %x %x\n", i, (u32)p->hi.svt, (u32)p->hi.sq, (u32)p->hi.sid, (u32)p->lo.dst, (u32)p->lo.vector, (u32)p->lo.avail, (u32)p->lo.dlm, (u32)p->lo.tm, (u32)p->lo.rh, (u32)p->lo.dm, (u32)p->lo.fpd, (u32)p->lo.p); print_cnt++; } if ( iremap_entries ) unmap_vtd_domain_page(iremap_entries); if ( iommu_ir_ctrl(iommu)->iremap_num != print_cnt ) printk("Warning: Print %d IRTE (actually have %d)!\n", print_cnt, iommu_ir_ctrl(iommu)->iremap_num); } } /* Dump the I/O xAPIC redirection table(s). */ if ( iommu_enabled ) { int apic; union IO_APIC_reg_01 reg_01; struct IO_APIC_route_remap_entry *remap; struct ir_ctrl *ir_ctrl; for ( apic = 0; apic < nr_ioapics; apic++ ) { iommu = ioapic_to_iommu(mp_ioapics[apic].mpc_apicid); ir_ctrl = iommu_ir_ctrl(iommu); if ( !ir_ctrl || !ir_ctrl->iremap_maddr || !ir_ctrl->iremap_num ) continue; printk( "\nRedirection table of IOAPIC %x:\n", apic); /* IO xAPIC Version Register. 
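 * (register index 1 of the IOAPIC; its bits.entries field holds the
 * highest valid redirection-table index, which is why the loop below
 * uses "<=" as its bound).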
*/ reg_01.raw = __io_apic_read(apic, 1); printk(" #entry IDX FMT MASK TRIG IRR POL STAT DELI VECTOR\n"); for ( i = 0; i <= reg_01.bits.entries; i++ ) { struct IO_APIC_route_entry rte = __ioapic_read_entry(apic, i, TRUE); remap = (struct IO_APIC_route_remap_entry *) &rte; if ( !remap->format ) continue; printk(" %02x: %04x %x %x %x %x %x %x" " %x %02x\n", i, (u32)remap->index_0_14 | ((u32)remap->index_15 << 15), (u32)remap->format, (u32)remap->mask, (u32)remap->trigger, (u32)remap->irr, (u32)remap->polarity, (u32)remap->delivery_status, (u32)remap->delivery_mode, (u32)remap->vector); } } } } struct keyhandler dump_iommu_info_keyhandler = { .diagnostic = 1, .u.fn = dump_iommu_info, .desc = "dump iommu info" }; /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/passthrough/vtd/quirks.c0000664000175000017500000002626612307313555017555 0ustar smbsmb/* * Copyright (c) 2010, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Author: Allen Kay */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iommu.h" #include "dmar.h" #include "extern.h" #include "vtd.h" #define IOH_DEV 0 #define IGD_DEV 2 #define IGD_BAR_MASK 0xFFFFFFFFFFFF0000 #define GGC 0x52 #define GGC_MEMORY_VT_ENABLED (0x8 << 8) #define IS_CTG(id) (id == 0x2a408086) #define IS_ILK(id) (id == 0x00408086 || id == 0x00448086 || id== 0x00628086 || id == 0x006A8086) #define IS_CPT(id) (id == 0x01008086 || id == 0x01048086) static u32 __read_mostly ioh_id; static u32 __initdata igd_id; bool_t __read_mostly rwbf_quirk; static bool_t __read_mostly is_cantiga_b3; static bool_t __read_mostly is_snb_gfx; static u8 *__read_mostly igd_reg_va; static spinlock_t igd_lock; /* * QUIRK to workaround Xen boot issue on Calpella/Ironlake OEM BIOS * not enabling VT-d properly in IGD. The workaround is to not enabling * IGD VT-d translation if VT is not enabled in IGD. */ int is_igd_vt_enabled_quirk(void) { u16 ggc; if ( !IS_ILK(ioh_id) ) return 1; /* integrated graphics on Intel platforms is located at 0:2.0 */ ggc = pci_conf_read16(0, 0, IGD_DEV, 0, GGC); return ( ggc & GGC_MEMORY_VT_ENABLED ? 1 : 0 ); } /* * QUIRK to workaround cantiga VT-d buffer flush issue. * The workaround is to force write buffer flush even if * VT-d capability indicates it is not required. */ static void __init cantiga_b3_errata_init(void) { u16 vid; u8 did_hi, rid; vid = pci_conf_read16(0, 0, IGD_DEV, 0, 0); if ( vid != 0x8086 ) return; did_hi = pci_conf_read8(0, 0, IGD_DEV, 0, 3); rid = pci_conf_read8(0, 0, IGD_DEV, 0, 8); if ( (did_hi == 0x2A) && (rid == 0x7) ) is_cantiga_b3 = 1; } /* check for Sandybridge IGD device ID's */ static void __init snb_errata_init(void) { is_snb_gfx = IS_SNB_GFX(igd_id); spin_lock_init(&igd_lock); } /* * QUIRK to workaround Cantiga IGD VT-d low power errata. 
* This errata impacts IGD assignment on Cantiga systems * and can potentially cause VT-d operations to hang. * The workaround is to access an IGD PCI config register * to get IGD out of low power state before VT-d translation * enable/disable and IOTLB flushes. */ /* * map IGD MMIO+0x2000 page to allow Xen access to IGD 3D register. */ static void __init map_igd_reg(void) { u64 igd_mmio, igd_reg; if ( !is_cantiga_b3 && !is_snb_gfx ) return; if ( igd_reg_va ) return; /* get IGD mmio address in PCI BAR */ igd_mmio = ((u64)pci_conf_read32(0, 0, IGD_DEV, 0, 0x14) << 32) + pci_conf_read32(0, 0, IGD_DEV, 0, 0x10); /* offset of IGD regster we want to access is in 0x2000 range */ igd_reg = (igd_mmio & IGD_BAR_MASK) + 0x2000; /* ioremap this physical page */ set_fixmap_nocache(FIX_IGD_MMIO, igd_reg); igd_reg_va = (u8 *)fix_to_virt(FIX_IGD_MMIO); } /* * force IGD to exit low power mode by accessing a IGD 3D regsiter. */ static int cantiga_vtd_ops_preamble(struct iommu* iommu) { struct intel_iommu *intel = iommu->intel; struct acpi_drhd_unit *drhd = intel ? intel->drhd : NULL; if ( !is_igd_drhd(drhd) || !is_cantiga_b3 ) return 0; if ( !igd_reg_va ) return 0; /* * read IGD register at IGD MMIO + 0x20A4 to force IGD * to exit low power state. Since map_igd_reg() * already mapped page starting 0x2000, we just need to * add page offset 0x0A4 to virtual address base. */ return ( *((volatile int *)(igd_reg_va + 0x0A4)) ); } /* * Sandybridge RC6 power management inhibit state erratum. * This can cause power high power consumption. * Workaround is to prevent graphics get into RC6 * state when doing VT-d IOTLB operations, do the VT-d * IOTLB operation, and then re-enable RC6 state. */ static void snb_vtd_ops_preamble(struct iommu* iommu) { struct intel_iommu *intel = iommu->intel; struct acpi_drhd_unit *drhd = intel ? intel->drhd : NULL; s_time_t start_time; if ( !is_igd_drhd(drhd) || !is_snb_gfx ) return; if ( !igd_reg_va ) return; *((volatile u32 *)(igd_reg_va + 0x54)) = 0x000FFFFF; *((volatile u32 *)(igd_reg_va + 0x700)) = 0; start_time = NOW(); while ( (*((volatile u32 *)(igd_reg_va + 0x2AC)) & 0xF) != 0 ) { if ( NOW() > start_time + DMAR_OPERATION_TIMEOUT ) { dprintk(XENLOG_INFO VTDPREFIX, "snb_vtd_ops_preamble: failed to disable idle handshake\n"); break; } cpu_relax(); } *((volatile u32*)(igd_reg_va + 0x50)) = 0x10001; } static void snb_vtd_ops_postamble(struct iommu* iommu) { struct intel_iommu *intel = iommu->intel; struct acpi_drhd_unit *drhd = intel ? intel->drhd : NULL; if ( !is_igd_drhd(drhd) || !is_snb_gfx ) return; if ( !igd_reg_va ) return; *((volatile u32 *)(igd_reg_va + 0x54)) = 0xA; *((volatile u32 *)(igd_reg_va + 0x50)) = 0x10000; } /* * call before VT-d translation enable and IOTLB flush operations. */ static int snb_igd_quirk; boolean_param("snb_igd_quirk", snb_igd_quirk); void vtd_ops_preamble_quirk(struct iommu* iommu) { cantiga_vtd_ops_preamble(iommu); if ( snb_igd_quirk ) { spin_lock(&igd_lock); /* match unlock in postamble */ snb_vtd_ops_preamble(iommu); } } /* * call after VT-d translation enable and IOTLB flush operations. */ void vtd_ops_postamble_quirk(struct iommu* iommu) { if ( snb_igd_quirk ) { snb_vtd_ops_postamble(iommu); /* match the lock in preamble */ spin_unlock(&igd_lock); } } /* 5500/5520/X58 Chipset Interrupt remapping errata, for stepping B-3. * Fixed in stepping C-2. 
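 *
 * Match details for the check below: pci_conf_read32(..., PCI_VENDOR_ID)
 * returns the device ID in the upper 16 bits and the vendor ID in the
 * lower 16 bits, so 0x342e8086 is device 0x342e from vendor 0x8086
 * (Intel); the quirk additionally requires revision 0x13, the affected
 * stepping, before it disables the IOMMU.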
*/ static void __init tylersburg_intremap_quirk(void) { uint32_t bus, device; uint8_t rev; for ( bus = 0; bus < 0x100; bus++ ) { /* Match on System Management Registers on Device 20 Function 0 */ device = pci_conf_read32(0, bus, 20, 0, PCI_VENDOR_ID); rev = pci_conf_read8(0, bus, 20, 0, PCI_REVISION_ID); if ( rev == 0x13 && device == 0x342e8086 ) { printk(XENLOG_WARNING VTDPREFIX "Disabling IOMMU due to Intel 5500/5520/X58 Chipset errata #47, #53\n"); iommu_enable = 0; break; } } } /* initialize platform identification flags */ void __init platform_quirks_init(void) { ioh_id = pci_conf_read32(0, 0, IOH_DEV, 0, 0); igd_id = pci_conf_read32(0, 0, IGD_DEV, 0, 0); /* Mobile 4 Series Chipset neglects to set RWBF capability. */ if ( ioh_id == 0x2a408086 ) { dprintk(XENLOG_INFO VTDPREFIX, "DMAR: Forcing write-buffer flush\n"); rwbf_quirk = 1; } /* initialize cantiga B3 identification */ cantiga_b3_errata_init(); snb_errata_init(); /* ioremap IGD MMIO+0x2000 page */ map_igd_reg(); /* Tylersburg interrupt remap quirk */ if ( iommu_intremap ) tylersburg_intremap_quirk(); } /* * QUIRK to workaround wifi direct assignment issue. This issue * impacts only cases where Intel integrated wifi device is directly * is directly assigned to a guest. * * The workaround is to map ME phantom device 0:3.7 or 0:22.7 * to the ME vt-d engine if detect the user is trying to directly * assigning Intel integrated wifi device to a guest. */ static void map_me_phantom_function(struct domain *domain, u32 dev, int map) { struct acpi_drhd_unit *drhd; struct pci_dev *pdev; /* find ME VT-d engine base on a real ME device */ pdev = pci_get_pdev(0, 0, PCI_DEVFN(dev, 0)); drhd = acpi_find_matched_drhd_unit(pdev); /* map or unmap ME phantom function */ if ( map ) domain_context_mapping_one(domain, drhd->iommu, 0, PCI_DEVFN(dev, 7), NULL); else domain_context_unmap_one(domain, drhd->iommu, 0, PCI_DEVFN(dev, 7)); } void me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map) { u32 id; id = pci_conf_read32(0, 0, 0, 0, 0); if ( IS_CTG(id) ) { /* quit if ME does not exist */ if ( pci_conf_read32(0, 0, 3, 0, 0) == 0xffffffff ) return; /* if device is WLAN device, map ME phantom device 0:3.7 */ id = pci_conf_read32(0, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), 0); switch (id) { case 0x42328086: case 0x42358086: case 0x42368086: case 0x42378086: case 0x423a8086: case 0x423b8086: case 0x423c8086: case 0x423d8086: map_me_phantom_function(domain, 3, map); break; default: break; } } else if ( IS_ILK(id) || IS_CPT(id) ) { /* quit if ME does not exist */ if ( pci_conf_read32(0, 0, 22, 0, 0) == 0xffffffff ) return; /* if device is WLAN device, map ME phantom device 0:22.7 */ id = pci_conf_read32(0, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), 0); switch (id) { case 0x00878086: /* Kilmer Peak */ case 0x00898086: case 0x00828086: /* Taylor Peak */ case 0x00858086: case 0x008F8086: /* Rainbow Peak */ case 0x00908086: case 0x00918086: case 0x42388086: /* Puma Peak */ case 0x422b8086: case 0x422c8086: map_me_phantom_function(domain, 22, map); break; default: break; } } } /* * Mask reporting Intel VT-d faults to IOH core logic: * - Some platform escalates VT-d faults to platform errors * - This can cause system failure upon non-fatal VT-d faults * - Potential security issue if malicious guest trigger VT-d faults */ void __init pci_vtd_quirk(struct pci_dev *pdev) { int seg = pdev->seg; int bus = pdev->bus; int dev = PCI_SLOT(pdev->devfn); int func = PCI_FUNC(pdev->devfn); int id, val; id = pci_conf_read32(seg, bus, dev, func, 0); if ( id == 0x342e8086 || 
id == 0x3c288086 ) { val = pci_conf_read32(seg, bus, dev, func, 0x1AC); pci_conf_write32(seg, bus, dev, func, 0x1AC, val | (1 << 31)); } } xen-4.4.0/xen/drivers/passthrough/vtd/x86/0000775000175000017500000000000012307313555016504 5ustar smbsmbxen-4.4.0/xen/drivers/passthrough/vtd/x86/ats.c0000664000175000017500000001110512307313555017435 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Author: Allen Kay */ #include #include #include #include #include #include #include "../iommu.h" #include "../dmar.h" #include "../vtd.h" #include "../extern.h" #include "../../ats.h" static LIST_HEAD(ats_dev_drhd_units); struct acpi_drhd_unit * find_ats_dev_drhd(struct iommu *iommu) { struct acpi_drhd_unit *drhd; list_for_each_entry ( drhd, &ats_dev_drhd_units, list ) { if ( drhd->iommu == iommu ) return drhd; } return NULL; } int ats_device(const struct pci_dev *pdev, const struct acpi_drhd_unit *drhd) { struct acpi_drhd_unit *ats_drhd; int pos; if ( !ats_enabled || !iommu_qinval ) return 0; if ( !ecap_queued_inval(drhd->iommu->ecap) || !ecap_dev_iotlb(drhd->iommu->ecap) ) return 0; if ( !acpi_find_matched_atsr_unit(pdev) ) return 0; ats_drhd = find_ats_dev_drhd(drhd->iommu); pos = pci_find_ext_capability(pdev->seg, pdev->bus, pdev->devfn, PCI_EXT_CAP_ID_ATS); if ( pos && (ats_drhd == NULL) ) { ats_drhd = xmalloc(struct acpi_drhd_unit); if ( !ats_drhd ) return -ENOMEM; *ats_drhd = *drhd; list_add_tail(&ats_drhd->list, &ats_dev_drhd_units); } return pos; } static int device_in_domain(struct iommu *iommu, struct pci_ats_dev *pdev, u16 did) { struct root_entry *root_entry = NULL; struct context_entry *ctxt_entry = NULL; int tt, found = 0; root_entry = (struct root_entry *) map_vtd_domain_page(iommu->root_maddr); if ( !root_entry || !root_present(root_entry[pdev->bus]) ) goto out; ctxt_entry = (struct context_entry *) map_vtd_domain_page(root_entry[pdev->bus].val); if ( ctxt_entry == NULL ) goto out; if ( context_domain_id(ctxt_entry[pdev->devfn]) != did ) goto out; tt = context_translation_type(ctxt_entry[pdev->devfn]); if ( tt != CONTEXT_TT_DEV_IOTLB ) goto out; found = 1; out: if ( root_entry ) unmap_vtd_domain_page(root_entry); if ( ctxt_entry ) unmap_vtd_domain_page(ctxt_entry); return found; } int dev_invalidate_iotlb(struct iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type) { struct pci_ats_dev *pdev; int sbit, ret = 0; u16 sid; if ( !ecap_dev_iotlb(iommu->ecap) ) return ret; list_for_each_entry( pdev, &ats_devices, list ) { sid = (pdev->bus << 8) | pdev->devfn; switch ( type ) { case DMA_TLB_DSI_FLUSH: if ( !device_in_domain(iommu, pdev, did) ) break; /* fall through if DSI condition met */ case DMA_TLB_GLOBAL_FLUSH: /* invalidate all translations: sbit=1,bit_63=0,bit[62:12]=1 */ sbit = 1; addr = (~0 << PAGE_SHIFT_4K) & 0x7FFFFFFFFFFFFFFF; ret |= qinval_device_iotlb(iommu, pdev->ats_queue_depth, sid, sbit, addr); break; case 
DMA_TLB_PSI_FLUSH: if ( !device_in_domain(iommu, pdev, did) ) break; addr &= ~0 << (PAGE_SHIFT + size_order); /* if size <= 4K, set sbit = 0, else set sbit = 1 */ sbit = size_order ? 1 : 0; /* clear lower bits */ addr &= (~0 << (PAGE_SHIFT + size_order)); /* if sbit == 1, zero out size_order bit and set lower bits to 1 */ if ( sbit ) addr &= (~0 & ~(1 << (PAGE_SHIFT + size_order))); ret |= qinval_device_iotlb(iommu, pdev->ats_queue_depth, sid, sbit, addr); break; default: dprintk(XENLOG_WARNING VTDPREFIX, "invalid vt-d flush type\n"); break; } } return ret; } xen-4.4.0/xen/drivers/passthrough/vtd/x86/Makefile0000664000175000017500000000003612307313555020143 0ustar smbsmbobj-y += vtd.o obj-y += ats.o xen-4.4.0/xen/drivers/passthrough/vtd/x86/vtd.c0000664000175000017500000001011512307313555017443 0ustar smbsmb/* * Copyright (c) 2008, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Copyright (C) Allen Kay * Copyright (C) Weidong Han */ #include #include #include #include #include #include #include #include #include #include "../iommu.h" #include "../dmar.h" #include "../vtd.h" #include "../extern.h" /* * iommu_inclusive_mapping: when set, all memory below 4GB is included in dom0 * 1:1 iommu mappings except xen and unusable regions. */ static bool_t __initdata iommu_inclusive_mapping = 1; boolean_param("iommu_inclusive_mapping", iommu_inclusive_mapping); void *map_vtd_domain_page(u64 maddr) { return map_domain_page(maddr >> PAGE_SHIFT_4K); } void unmap_vtd_domain_page(void *va) { unmap_domain_page(va); } unsigned int get_cache_line_size(void) { return ((cpuid_ebx(1) >> 8) & 0xff) * 8; } void cacheline_flush(char * addr) { clflush(addr); } void flush_all_cache() { wbinvd(); } static int _hvm_dpci_isairq_eoi(struct domain *d, struct hvm_pirq_dpci *pirq_dpci, void *arg) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; unsigned int isairq = (long)arg; struct dev_intx_gsi_link *digl, *tmp; list_for_each_entry_safe ( digl, tmp, &pirq_dpci->digl_list, list ) { if ( hvm_irq->pci_link.route[digl->link] == isairq ) { hvm_pci_intx_deassert(d, digl->device, digl->intx); if ( --pirq_dpci->pending == 0 ) { stop_timer(&pirq_dpci->timer); pirq_guest_eoi(dpci_pirq(pirq_dpci)); } } } return 0; } void hvm_dpci_isairq_eoi(struct domain *d, unsigned int isairq) { struct hvm_irq_dpci *dpci = NULL; ASSERT(isairq < NR_ISAIRQS); if ( !iommu_enabled) return; spin_lock(&d->event_lock); dpci = domain_get_irq_dpci(d); if ( dpci && test_bit(isairq, dpci->isairq_map) ) { /* Multiple mirq may be mapped to one isa irq */ pt_pirq_iterate(d, _hvm_dpci_isairq_eoi, (void *)(long)isairq); } spin_unlock(&d->event_lock); } void __init iommu_set_dom0_mapping(struct domain *d) { unsigned long i, j, tmp, top; BUG_ON(d->domain_id != 0); top = max(max_pdx, pfn_to_pdx(0xffffffffUL >> PAGE_SHIFT) + 1); for ( i = 0; i < top; i++ ) { /* * Set up 1:1 mapping for dom0. 
Default to use only conventional RAM * areas and let RMRRs include needed reserved regions. When set, the * inclusive mapping maps in everything below 4GB except unusable * ranges. */ unsigned long pfn = pdx_to_pfn(i); if ( pfn > (0xffffffffUL >> PAGE_SHIFT) ? (!mfn_valid(pfn) || !page_is_ram_type(pfn, RAM_TYPE_CONVENTIONAL)) : iommu_inclusive_mapping ? page_is_ram_type(pfn, RAM_TYPE_UNUSABLE) : !page_is_ram_type(pfn, RAM_TYPE_CONVENTIONAL) ) continue; /* Exclude Xen bits */ if ( xen_in_range(pfn) ) continue; tmp = 1 << (PAGE_SHIFT - PAGE_SHIFT_4K); for ( j = 0; j < tmp; j++ ) iommu_map_page(d, pfn * tmp + j, pfn * tmp + j, IOMMUF_readable|IOMMUF_writable); if (!(i & (0xfffff >> (PAGE_SHIFT - PAGE_SHIFT_4K)))) process_pending_softirqs(); } } xen-4.4.0/xen/drivers/passthrough/vtd/qinval.c0000664000175000017500000004000212307313555017511 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * * Copyright (C) Allen Kay * Copyright (C) Xiaohui Xin */ #include #include #include #include #include #include "iommu.h" #include "dmar.h" #include "vtd.h" #include "extern.h" static void print_qi_regs(struct iommu *iommu) { u64 val; val = dmar_readq(iommu->reg, DMAR_IQA_REG); printk("DMAR_IQA_REG = %"PRIx64"\n", val); val = dmar_readq(iommu->reg, DMAR_IQH_REG); printk("DMAR_IQH_REG = %"PRIx64"\n", val); val = dmar_readq(iommu->reg, DMAR_IQT_REG); printk("DMAR_IQT_REG = %"PRIx64"\n", val); } static int qinval_next_index(struct iommu *iommu) { u64 tail; tail = dmar_readq(iommu->reg, DMAR_IQT_REG); tail >>= QINVAL_INDEX_SHIFT; /* (tail+1 == head) indicates a full queue, wait for HW */ while ( ( tail + 1 ) % QINVAL_ENTRY_NR == ( dmar_readq(iommu->reg, DMAR_IQH_REG) >> QINVAL_INDEX_SHIFT ) ) cpu_relax(); return tail; } static int qinval_update_qtail(struct iommu *iommu, int index) { u64 val; /* Need hold register lock when update tail */ ASSERT( spin_is_locked(&iommu->register_lock) ); val = (index + 1) % QINVAL_ENTRY_NR; dmar_writeq(iommu->reg, DMAR_IQT_REG, (val << QINVAL_INDEX_SHIFT)); return 0; } static int gen_cc_inv_dsc(struct iommu *iommu, int index, u16 did, u16 source_id, u8 function_mask, u8 granu) { unsigned long flags; struct qinval_entry *qinval_entry = NULL, *qinval_entries; struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); u64 entry_base = qi_ctrl->qinval_maddr + (( index >> QINVAL_ENTRY_ORDER ) << PAGE_SHIFT ); spin_lock_irqsave(&qi_ctrl->qinval_lock, flags); qinval_entries = (struct qinval_entry *)map_vtd_domain_page(entry_base); qinval_entry = &qinval_entries[index % (1 << QINVAL_ENTRY_ORDER)]; qinval_entry->q.cc_inv_dsc.lo.type = TYPE_INVAL_CONTEXT; qinval_entry->q.cc_inv_dsc.lo.granu = granu; qinval_entry->q.cc_inv_dsc.lo.res_1 = 0; qinval_entry->q.cc_inv_dsc.lo.did = did; qinval_entry->q.cc_inv_dsc.lo.sid = source_id; qinval_entry->q.cc_inv_dsc.lo.fm = function_mask; qinval_entry->q.cc_inv_dsc.lo.res_2 = 0; qinval_entry->q.cc_inv_dsc.hi.res = 0; 
unmap_vtd_domain_page(qinval_entries); spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags); return 0; } int queue_invalidate_context(struct iommu *iommu, u16 did, u16 source_id, u8 function_mask, u8 granu) { int ret = -1; unsigned long flags; int index = -1; spin_lock_irqsave(&iommu->register_lock, flags); index = qinval_next_index(iommu); if ( index == -1 ) return -EBUSY; ret = gen_cc_inv_dsc(iommu, index, did, source_id, function_mask, granu); ret |= qinval_update_qtail(iommu, index); spin_unlock_irqrestore(&iommu->register_lock, flags); return ret; } static int gen_iotlb_inv_dsc(struct iommu *iommu, int index, u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr) { unsigned long flags; struct qinval_entry *qinval_entry = NULL, *qinval_entries; struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); u64 entry_base = qi_ctrl->qinval_maddr + (( index >> QINVAL_ENTRY_ORDER ) << PAGE_SHIFT ); spin_lock_irqsave(&qi_ctrl->qinval_lock, flags); qinval_entries = (struct qinval_entry *)map_vtd_domain_page(entry_base); qinval_entry = &qinval_entries[index % (1 << QINVAL_ENTRY_ORDER)]; qinval_entry->q.iotlb_inv_dsc.lo.type = TYPE_INVAL_IOTLB; qinval_entry->q.iotlb_inv_dsc.lo.granu = granu; qinval_entry->q.iotlb_inv_dsc.lo.dr = dr; qinval_entry->q.iotlb_inv_dsc.lo.dw = dw; qinval_entry->q.iotlb_inv_dsc.lo.res_1 = 0; qinval_entry->q.iotlb_inv_dsc.lo.did = did; qinval_entry->q.iotlb_inv_dsc.lo.res_2 = 0; qinval_entry->q.iotlb_inv_dsc.hi.am = am; qinval_entry->q.iotlb_inv_dsc.hi.ih = ih; qinval_entry->q.iotlb_inv_dsc.hi.res_1 = 0; qinval_entry->q.iotlb_inv_dsc.hi.addr = addr >> PAGE_SHIFT_4K; unmap_vtd_domain_page(qinval_entries); spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags); return 0; } int queue_invalidate_iotlb(struct iommu *iommu, u8 granu, u8 dr, u8 dw, u16 did, u8 am, u8 ih, u64 addr) { int ret = -1; unsigned long flags; int index = -1; spin_lock_irqsave(&iommu->register_lock, flags); index = qinval_next_index(iommu); if ( index == -1 ) return -EBUSY; ret = gen_iotlb_inv_dsc(iommu, index, granu, dr, dw, did, am, ih, addr); ret |= qinval_update_qtail(iommu, index); spin_unlock_irqrestore(&iommu->register_lock, flags); return ret; } static int gen_wait_dsc(struct iommu *iommu, int index, u8 iflag, u8 sw, u8 fn, u32 sdata, volatile u32 *saddr) { unsigned long flags; struct qinval_entry *qinval_entry = NULL, *qinval_entries; struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); u64 entry_base = qi_ctrl->qinval_maddr + (( index >> QINVAL_ENTRY_ORDER ) << PAGE_SHIFT ); spin_lock_irqsave(&qi_ctrl->qinval_lock, flags); qinval_entries = (struct qinval_entry *)map_vtd_domain_page(entry_base); qinval_entry = &qinval_entries[index % (1 << QINVAL_ENTRY_ORDER)]; qinval_entry->q.inv_wait_dsc.lo.type = TYPE_INVAL_WAIT; qinval_entry->q.inv_wait_dsc.lo.iflag = iflag; qinval_entry->q.inv_wait_dsc.lo.sw = sw; qinval_entry->q.inv_wait_dsc.lo.fn = fn; qinval_entry->q.inv_wait_dsc.lo.res_1 = 0; qinval_entry->q.inv_wait_dsc.lo.sdata = sdata; qinval_entry->q.inv_wait_dsc.hi.res_1 = 0; qinval_entry->q.inv_wait_dsc.hi.saddr = virt_to_maddr(saddr) >> 2; unmap_vtd_domain_page(qinval_entries); spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags); return 0; } static int queue_invalidate_wait(struct iommu *iommu, u8 iflag, u8 sw, u8 fn) { s_time_t start_time; u32 poll_slot = QINVAL_STAT_INIT; int index = -1; int ret = -1; unsigned long flags; spin_lock_irqsave(&iommu->register_lock, flags); index = qinval_next_index(iommu); if ( index == -1 ) return -EBUSY; ret = gen_wait_dsc(iommu, index, iflag, sw, fn, QINVAL_STAT_DONE, 
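/* sdata/saddr pair: with the status-write (sw) flag set, hardware
 * stores QINVAL_STAT_DONE into poll_slot once this wait descriptor
 * completes, which is what the polling loop below waits for */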
&poll_slot); ret |= qinval_update_qtail(iommu, index); spin_unlock_irqrestore(&iommu->register_lock, flags); /* Now we don't support interrupt method */ if ( sw ) { /* In case all wait descriptor writes to same addr with same data */ start_time = NOW(); while ( poll_slot != QINVAL_STAT_DONE ) { if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) ) { print_qi_regs(iommu); panic("queue invalidate wait descriptor was not executed"); } cpu_relax(); } } return ret; } int invalidate_sync(struct iommu *iommu) { int ret = -1; struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); if ( qi_ctrl->qinval_maddr != 0 ) { ret = queue_invalidate_wait(iommu, 0, 1, 1); return ret; } return 0; } static int gen_dev_iotlb_inv_dsc(struct iommu *iommu, int index, u32 max_invs_pend, u16 sid, u16 size, u64 addr) { unsigned long flags; struct qinval_entry *qinval_entry = NULL, *qinval_entries; struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); u64 entry_base = qi_ctrl->qinval_maddr + (( index >> QINVAL_ENTRY_ORDER ) << PAGE_SHIFT ); spin_lock_irqsave(&qi_ctrl->qinval_lock, flags); qinval_entries = (struct qinval_entry *)map_vtd_domain_page(entry_base); qinval_entry = &qinval_entries[index % (1 << QINVAL_ENTRY_ORDER)]; qinval_entry->q.dev_iotlb_inv_dsc.lo.type = TYPE_INVAL_DEVICE_IOTLB; qinval_entry->q.dev_iotlb_inv_dsc.lo.res_1 = 0; qinval_entry->q.dev_iotlb_inv_dsc.lo.max_invs_pend = max_invs_pend; qinval_entry->q.dev_iotlb_inv_dsc.lo.res_2 = 0; qinval_entry->q.dev_iotlb_inv_dsc.lo.sid = sid; qinval_entry->q.dev_iotlb_inv_dsc.lo.res_3 = 0; qinval_entry->q.dev_iotlb_inv_dsc.hi.size = size; qinval_entry->q.dev_iotlb_inv_dsc.hi.res_1 = 0; qinval_entry->q.dev_iotlb_inv_dsc.hi.addr = addr >> PAGE_SHIFT_4K; unmap_vtd_domain_page(qinval_entries); spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags); return 0; } int qinval_device_iotlb(struct iommu *iommu, u32 max_invs_pend, u16 sid, u16 size, u64 addr) { int ret = -1; unsigned long flags; int index = -1; spin_lock_irqsave(&iommu->register_lock, flags); index = qinval_next_index(iommu); if ( index == -1 ) return -EBUSY; ret = gen_dev_iotlb_inv_dsc(iommu, index, max_invs_pend, sid, size, addr); ret |= qinval_update_qtail(iommu, index); spin_unlock_irqrestore(&iommu->register_lock, flags); return ret; } static int gen_iec_inv_dsc(struct iommu *iommu, int index, u8 granu, u8 im, u16 iidx) { unsigned long flags; struct qinval_entry *qinval_entry = NULL, *qinval_entries; struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); u64 entry_base = qi_ctrl->qinval_maddr + (( index >> QINVAL_ENTRY_ORDER ) << PAGE_SHIFT ); spin_lock_irqsave(&qi_ctrl->qinval_lock, flags); qinval_entries = (struct qinval_entry *)map_vtd_domain_page(entry_base); qinval_entry = &qinval_entries[index % (1 << QINVAL_ENTRY_ORDER)]; qinval_entry->q.iec_inv_dsc.lo.type = TYPE_INVAL_IEC; qinval_entry->q.iec_inv_dsc.lo.granu = granu; qinval_entry->q.iec_inv_dsc.lo.res_1 = 0; qinval_entry->q.iec_inv_dsc.lo.im = im; qinval_entry->q.iec_inv_dsc.lo.iidx = iidx; qinval_entry->q.iec_inv_dsc.lo.res_2 = 0; qinval_entry->q.iec_inv_dsc.hi.res = 0; unmap_vtd_domain_page(qinval_entries); spin_unlock_irqrestore(&qi_ctrl->qinval_lock, flags); return 0; } int queue_invalidate_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx) { int ret; unsigned long flags; int index = -1; spin_lock_irqsave(&iommu->register_lock, flags); index = qinval_next_index(iommu); if ( index == -1 ) return -EBUSY; ret = gen_iec_inv_dsc(iommu, index, granu, im, iidx); ret |= qinval_update_qtail(iommu, index); spin_unlock_irqrestore(&iommu->register_lock, flags); return 
ret; } static int __iommu_flush_iec(struct iommu *iommu, u8 granu, u8 im, u16 iidx) { int ret; ret = queue_invalidate_iec(iommu, granu, im, iidx); ret |= invalidate_sync(iommu); /* * reading vt-d architecture register will ensure * draining happens in implementation independent way. */ (void)dmar_readq(iommu->reg, DMAR_CAP_REG); return ret; } int iommu_flush_iec_global(struct iommu *iommu) { return __iommu_flush_iec(iommu, IEC_GLOBAL_INVL, 0, 0); } int iommu_flush_iec_index(struct iommu *iommu, u8 im, u16 iidx) { return __iommu_flush_iec(iommu, IEC_INDEX_INVL, im, iidx); } static int flush_context_qi( void *_iommu, u16 did, u16 sid, u8 fm, u64 type, int flush_non_present_entry) { int ret = 0; struct iommu *iommu = (struct iommu *)_iommu; struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); /* * In the non-present entry flush case, if hardware doesn't cache * non-present entry we do nothing and if hardware cache non-present * entry, we flush entries of domain 0 (the domain id is used to cache * any non-present entries) */ if ( flush_non_present_entry ) { if ( !cap_caching_mode(iommu->cap) ) return 1; else did = 0; } if ( qi_ctrl->qinval_maddr != 0 ) { ret = queue_invalidate_context(iommu, did, sid, fm, type >> DMA_CCMD_INVL_GRANU_OFFSET); ret |= invalidate_sync(iommu); } return ret; } static int flush_iotlb_qi( void *_iommu, u16 did, u64 addr, unsigned int size_order, u64 type, int flush_non_present_entry, int flush_dev_iotlb) { u8 dr = 0, dw = 0; int ret = 0; struct iommu *iommu = (struct iommu *)_iommu; struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); /* * In the non-present entry flush case, if hardware doesn't cache * non-present entry we do nothing and if hardware cache non-present * entry, we flush entries of domain 0 (the domain id is used to cache * any non-present entries) */ if ( flush_non_present_entry ) { if ( !cap_caching_mode(iommu->cap) ) return 1; else did = 0; } if ( qi_ctrl->qinval_maddr != 0 ) { /* use queued invalidation */ if (cap_write_drain(iommu->cap)) dw = 1; if (cap_read_drain(iommu->cap)) dr = 1; /* Need to conside the ih bit later */ ret = queue_invalidate_iotlb(iommu, (type >> DMA_TLB_FLUSH_GRANU_OFFSET), dr, dw, did, (u8)size_order, 0, addr); if ( flush_dev_iotlb ) ret |= dev_invalidate_iotlb(iommu, did, addr, size_order, type); ret |= invalidate_sync(iommu); } return ret; } int enable_qinval(struct iommu *iommu) { struct acpi_drhd_unit *drhd; struct qi_ctrl *qi_ctrl; struct iommu_flush *flush; u32 sts; unsigned long flags; if ( !ecap_queued_inval(iommu->ecap) || !iommu_qinval ) return -ENOENT; qi_ctrl = iommu_qi_ctrl(iommu); flush = iommu_get_flush(iommu); /* Return if already enabled by Xen */ sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); if ( (sts & DMA_GSTS_QIES) && qi_ctrl->qinval_maddr ) return 0; if ( qi_ctrl->qinval_maddr == 0 ) { drhd = iommu_to_drhd(iommu); qi_ctrl->qinval_maddr = alloc_pgtable_maddr(drhd, QINVAL_ARCH_PAGE_NR); if ( qi_ctrl->qinval_maddr == 0 ) { dprintk(XENLOG_WARNING VTDPREFIX, "Cannot allocate memory for qi_ctrl->qinval_maddr\n"); return -ENOMEM; } } flush->context = flush_context_qi; flush->iotlb = flush_iotlb_qi; /* Setup Invalidation Queue Address(IQA) register with the * address of the page we just allocated. QS field at * bits[2:0] to indicate size of queue is one 4KB page. * That's 256 entries. Queued Head (IQH) and Queue Tail (IQT) * registers are automatically reset to 0 with write * to IQA register. 
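 *
 * Per the VT-d spec the queue base is 4KB-aligned, leaving the low bits
 * of IQA free: bits[2:0] carry the QS value, which selects 2^QS
 * contiguous 4KB pages of 16-byte descriptors.  ORing QINVAL_PAGE_ORDER
 * into the page-aligned qinval_maddr below therefore builds the combined
 * base-address/queue-size value that is written to DMAR_IQA_REG.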
*/ qi_ctrl->qinval_maddr |= QINVAL_PAGE_ORDER; spin_lock_irqsave(&iommu->register_lock, flags); dmar_writeq(iommu->reg, DMAR_IQA_REG, qi_ctrl->qinval_maddr); dmar_writeq(iommu->reg, DMAR_IQT_REG, 0); /* enable queued invalidation hardware */ sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_QIE); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, (sts & DMA_GSTS_QIES), sts); spin_unlock_irqrestore(&iommu->register_lock, flags); return 0; } void disable_qinval(struct iommu *iommu) { u32 sts; unsigned long flags; if ( !ecap_queued_inval(iommu->ecap) ) return; spin_lock_irqsave(&iommu->register_lock, flags); sts = dmar_readl(iommu->reg, DMAR_GSTS_REG); if ( !(sts & DMA_GSTS_QIES) ) goto out; dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_QIE)); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl, !(sts & DMA_GSTS_QIES), sts); out: spin_unlock_irqrestore(&iommu->register_lock, flags); } xen-4.4.0/xen/drivers/passthrough/Makefile0000664000175000017500000000016112307313555016720 0ustar smbsmbsubdir-$(x86) += vtd subdir-$(x86) += amd subdir-$(x86_64) += x86 obj-y += iommu.o obj-y += io.o obj-y += pci.o xen-4.4.0/xen/drivers/passthrough/iommu.c0000664000175000017500000005637612307313555016575 0ustar smbsmb/* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include static void parse_iommu_param(char *s); static int iommu_populate_page_table(struct domain *d); static void iommu_dump_p2m_table(unsigned char key); /* * The 'iommu' parameter enables the IOMMU. 
Optional comma separated * value may contain: * * off|no|false|disable Disable IOMMU (default) * force|required Don't boot unless IOMMU is enabled * workaround_bios_bug Workaround some bios issue to still enable VT-d, don't guarantee security * dom0-passthrough No DMA translation at all for Dom0 * dom0-strict No 1:1 memory mapping for Dom0 * no-snoop Disable VT-d Snoop Control * no-qinval Disable VT-d Queued Invalidation * no-intremap Disable VT-d Interrupt Remapping */ custom_param("iommu", parse_iommu_param); bool_t __initdata iommu_enable = 1; bool_t __read_mostly iommu_enabled; bool_t __read_mostly force_iommu; bool_t __initdata iommu_dom0_strict; bool_t __read_mostly iommu_verbose; bool_t __read_mostly iommu_workaround_bios_bug; bool_t __read_mostly iommu_passthrough; bool_t __read_mostly iommu_snoop = 1; bool_t __read_mostly iommu_qinval = 1; bool_t __read_mostly iommu_intremap = 1; bool_t __read_mostly iommu_hap_pt_share = 1; bool_t __read_mostly iommu_debug; bool_t __read_mostly amd_iommu_perdev_intremap = 1; DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb); DEFINE_SPINLOCK(iommu_pt_cleanup_lock); PAGE_LIST_HEAD(iommu_pt_cleanup_list); static struct tasklet iommu_pt_cleanup_tasklet; static struct keyhandler iommu_p2m_table = { .diagnostic = 0, .u.fn = iommu_dump_p2m_table, .desc = "dump iommu p2m table" }; static void __init parse_iommu_param(char *s) { char *ss; int val; do { val = !!strncmp(s, "no-", 3); if ( !val ) s += 3; ss = strchr(s, ','); if ( ss ) *ss = '\0'; if ( !parse_bool(s) ) iommu_enable = 0; else if ( !strcmp(s, "force") || !strcmp(s, "required") ) force_iommu = val; else if ( !strcmp(s, "workaround_bios_bug") ) iommu_workaround_bios_bug = val; else if ( !strcmp(s, "verbose") ) iommu_verbose = val; else if ( !strcmp(s, "snoop") ) iommu_snoop = val; else if ( !strcmp(s, "qinval") ) iommu_qinval = val; else if ( !strcmp(s, "intremap") ) iommu_intremap = val; else if ( !strcmp(s, "debug") ) { iommu_debug = val; if ( val ) iommu_verbose = 1; } else if ( !strcmp(s, "amd-iommu-perdev-intremap") ) amd_iommu_perdev_intremap = val; else if ( !strcmp(s, "dom0-passthrough") ) iommu_passthrough = val; else if ( !strcmp(s, "dom0-strict") ) iommu_dom0_strict = val; else if ( !strcmp(s, "sharept") ) iommu_hap_pt_share = val; s = ss + 1; } while ( ss ); } int iommu_domain_init(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); spin_lock_init(&hd->mapping_lock); INIT_LIST_HEAD(&hd->g2m_ioport_list); INIT_LIST_HEAD(&hd->mapped_rmrrs); if ( !iommu_enabled ) return 0; hd->platform_ops = iommu_get_ops(); return hd->platform_ops->init(d); } static __init void check_dom0_pvh_reqs(struct domain *d) { if ( !iommu_enabled ) panic("Presently, iommu must be enabled for pvh dom0\n"); if ( iommu_passthrough ) panic("For pvh dom0, dom0-passthrough must not be enabled\n"); iommu_dom0_strict = 1; } void __init iommu_dom0_init(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); if ( is_pvh_domain(d) ) check_dom0_pvh_reqs(d); if ( !iommu_enabled ) return; register_keyhandler('o', &iommu_p2m_table); d->need_iommu = !!iommu_dom0_strict; if ( need_iommu(d) ) { struct page_info *page; unsigned int i = 0; page_list_for_each ( page, &d->page_list ) { unsigned long mfn = page_to_mfn(page); unsigned long gfn = mfn_to_gmfn(d, mfn); unsigned int mapping = IOMMUF_readable; if ( ((page->u.inuse.type_info & PGT_count_mask) == 0) || ((page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page) ) mapping |= IOMMUF_writable; hd->platform_ops->map_page(d, gfn, mfn, mapping); if ( !(i++ & 
0xfffff) ) process_pending_softirqs(); } } return hd->platform_ops->dom0_init(d); } int iommu_add_device(struct pci_dev *pdev) { struct hvm_iommu *hd; int rc; u8 devfn; if ( !pdev->domain ) return -EINVAL; ASSERT(spin_is_locked(&pcidevs_lock)); hd = domain_hvm_iommu(pdev->domain); if ( !iommu_enabled || !hd->platform_ops ) return 0; rc = hd->platform_ops->add_device(pdev->devfn, pdev); if ( rc || !pdev->phantom_stride ) return rc; for ( devfn = pdev->devfn ; ; ) { devfn += pdev->phantom_stride; if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) return 0; rc = hd->platform_ops->add_device(devfn, pdev); if ( rc ) printk(XENLOG_WARNING "IOMMU: add %04x:%02x:%02x.%u failed (%d)\n", pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), rc); } } int iommu_enable_device(struct pci_dev *pdev) { struct hvm_iommu *hd; if ( !pdev->domain ) return -EINVAL; ASSERT(spin_is_locked(&pcidevs_lock)); hd = domain_hvm_iommu(pdev->domain); if ( !iommu_enabled || !hd->platform_ops || !hd->platform_ops->enable_device ) return 0; return hd->platform_ops->enable_device(pdev); } int iommu_remove_device(struct pci_dev *pdev) { struct hvm_iommu *hd; u8 devfn; if ( !pdev->domain ) return -EINVAL; hd = domain_hvm_iommu(pdev->domain); if ( !iommu_enabled || !hd->platform_ops ) return 0; for ( devfn = pdev->devfn ; pdev->phantom_stride; ) { int rc; devfn += pdev->phantom_stride; if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) break; rc = hd->platform_ops->remove_device(devfn, pdev); if ( !rc ) continue; printk(XENLOG_ERR "IOMMU: remove %04x:%02x:%02x.%u failed (%d)\n", pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), rc); return rc; } return hd->platform_ops->remove_device(pdev->devfn, pdev); } static void iommu_teardown(struct domain *d) { const struct hvm_iommu *hd = domain_hvm_iommu(d); d->need_iommu = 0; hd->platform_ops->teardown(d); tasklet_schedule(&iommu_pt_cleanup_tasklet); } /* * If the device isn't owned by dom0, it means it already * has been assigned to other domain, or it doesn't exist. */ static int device_assigned(u16 seg, u8 bus, u8 devfn) { struct pci_dev *pdev; spin_lock(&pcidevs_lock); pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn); spin_unlock(&pcidevs_lock); return pdev ? 0 : -EBUSY; } static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) { struct hvm_iommu *hd = domain_hvm_iommu(d); struct pci_dev *pdev; int rc = 0; if ( !iommu_enabled || !hd->platform_ops ) return 0; /* Prevent device assign if mem paging or mem sharing have been * enabled for this domain */ if ( unlikely(!need_iommu(d) && (d->arch.hvm_domain.mem_sharing_enabled || d->mem_event->paging.ring_page)) ) return -EXDEV; if ( !spin_trylock(&pcidevs_lock) ) return -ERESTART; if ( need_iommu(d) <= 0 ) { if ( !iommu_use_hap_pt(d) ) { rc = iommu_populate_page_table(d); if ( rc ) { spin_unlock(&pcidevs_lock); return rc; } } d->need_iommu = 1; } pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn); if ( !pdev ) { rc = pci_get_pdev(seg, bus, devfn) ? 
-EBUSY : -ENODEV; goto done; } pdev->fault.count = 0; if ( (rc = hd->platform_ops->assign_device(d, devfn, pdev)) ) goto done; for ( ; pdev->phantom_stride; rc = 0 ) { devfn += pdev->phantom_stride; if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) break; rc = hd->platform_ops->assign_device(d, devfn, pdev); if ( rc ) printk(XENLOG_G_WARNING "d%d: assign %04x:%02x:%02x.%u failed (%d)\n", d->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), rc); } done: if ( !has_arch_pdevs(d) && need_iommu(d) ) iommu_teardown(d); spin_unlock(&pcidevs_lock); return rc; } static int iommu_populate_page_table(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); struct page_info *page; int rc = 0, n = 0; d->need_iommu = -1; this_cpu(iommu_dont_flush_iotlb) = 1; spin_lock(&d->page_alloc_lock); if ( unlikely(d->is_dying) ) rc = -ESRCH; while ( !rc && (page = page_list_remove_head(&d->page_list)) ) { if ( is_hvm_domain(d) || (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page ) { BUG_ON(SHARED_M2P(mfn_to_gmfn(d, page_to_mfn(page)))); rc = hd->platform_ops->map_page( d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page), IOMMUF_readable|IOMMUF_writable); if ( rc ) { page_list_add(page, &d->page_list); break; } } page_list_add_tail(page, &d->arch.relmem_list); if ( !(++n & 0xff) && !page_list_empty(&d->page_list) && hypercall_preempt_check() ) rc = -ERESTART; } if ( !rc ) { /* * The expectation here is that generally there are many normal pages * on relmem_list (the ones we put there) and only few being in an * offline/broken state. The latter ones are always at the head of the * list. Hence we first move the whole list, and then move back the * first few entries. */ page_list_move(&d->page_list, &d->arch.relmem_list); while ( (page = page_list_first(&d->page_list)) != NULL && (page->count_info & (PGC_state|PGC_broken)) ) { page_list_del(page, &d->page_list); page_list_add_tail(page, &d->arch.relmem_list); } } spin_unlock(&d->page_alloc_lock); this_cpu(iommu_dont_flush_iotlb) = 0; if ( !rc ) iommu_iotlb_flush_all(d); else if ( rc != -ERESTART ) iommu_teardown(d); return rc; } void iommu_domain_destroy(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); struct list_head *ioport_list, *rmrr_list, *tmp; struct g2m_ioport *ioport; struct mapped_rmrr *mrmrr; if ( !iommu_enabled || !hd->platform_ops ) return; if ( need_iommu(d) ) iommu_teardown(d); list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list ) { ioport = list_entry(ioport_list, struct g2m_ioport, list); list_del(&ioport->list); xfree(ioport); } list_for_each_safe ( rmrr_list, tmp, &hd->mapped_rmrrs ) { mrmrr = list_entry(rmrr_list, struct mapped_rmrr, list); list_del(&mrmrr->list); xfree(mrmrr); } } int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int flags) { struct hvm_iommu *hd = domain_hvm_iommu(d); if ( !iommu_enabled || !hd->platform_ops ) return 0; return hd->platform_ops->map_page(d, gfn, mfn, flags); } int iommu_unmap_page(struct domain *d, unsigned long gfn) { struct hvm_iommu *hd = domain_hvm_iommu(d); if ( !iommu_enabled || !hd->platform_ops ) return 0; return hd->platform_ops->unmap_page(d, gfn); } static void iommu_free_pagetables(unsigned long unused) { do { struct page_info *pg; spin_lock(&iommu_pt_cleanup_lock); pg = page_list_remove_head(&iommu_pt_cleanup_list); spin_unlock(&iommu_pt_cleanup_lock); if ( !pg ) return; iommu_get_ops()->free_page_table(pg); } while ( !softirq_pending(smp_processor_id()) ); tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet, 
cpumask_cycle(smp_processor_id(), &cpu_online_map)); } void iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int page_count) { struct hvm_iommu *hd = domain_hvm_iommu(d); if ( !iommu_enabled || !hd->platform_ops || !hd->platform_ops->iotlb_flush ) return; hd->platform_ops->iotlb_flush(d, gfn, page_count); } void iommu_iotlb_flush_all(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); if ( !iommu_enabled || !hd->platform_ops || !hd->platform_ops->iotlb_flush_all ) return; hd->platform_ops->iotlb_flush_all(d); } /* caller should hold the pcidevs_lock */ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) { struct hvm_iommu *hd = domain_hvm_iommu(d); struct pci_dev *pdev = NULL; int ret = 0; if ( !iommu_enabled || !hd->platform_ops ) return -EINVAL; ASSERT(spin_is_locked(&pcidevs_lock)); pdev = pci_get_pdev_by_domain(d, seg, bus, devfn); if ( !pdev ) return -ENODEV; while ( pdev->phantom_stride ) { devfn += pdev->phantom_stride; if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) break; ret = hd->platform_ops->reassign_device(d, dom0, devfn, pdev); if ( !ret ) continue; printk(XENLOG_G_ERR "d%d: deassign %04x:%02x:%02x.%u failed (%d)\n", d->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), ret); return ret; } devfn = pdev->devfn; ret = hd->platform_ops->reassign_device(d, dom0, devfn, pdev); if ( ret ) { dprintk(XENLOG_G_ERR, "d%d: deassign device (%04x:%02x:%02x.%u) failed\n", d->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); return ret; } pdev->fault.count = 0; if ( !has_arch_pdevs(d) && need_iommu(d) ) iommu_teardown(d); return ret; } int __init iommu_setup(void) { int rc = -ENODEV; bool_t force_intremap = force_iommu && iommu_intremap; if ( iommu_dom0_strict ) iommu_passthrough = 0; if ( iommu_enable ) { rc = iommu_hardware_setup(); iommu_enabled = (rc == 0); } if ( !iommu_enabled ) iommu_intremap = 0; if ( (force_iommu && !iommu_enabled) || (force_intremap && !iommu_intremap) ) panic("Couldn't enable %s and iommu=required/force", !iommu_enabled ? "IOMMU" : "Interrupt Remapping"); if ( !iommu_enabled ) { iommu_snoop = 0; iommu_passthrough = 0; iommu_dom0_strict = 0; } printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis"); if ( iommu_enabled ) { printk(" - Dom0 mode: %s\n", iommu_passthrough ? "Passthrough" : iommu_dom0_strict ? "Strict" : "Relaxed"); printk("Interrupt remapping %sabled\n", iommu_intremap ? 
"en" : "dis"); tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0); } return rc; } static int iommu_get_device_group( struct domain *d, u16 seg, u8 bus, u8 devfn, XEN_GUEST_HANDLE_64(uint32) buf, int max_sdevs) { struct hvm_iommu *hd = domain_hvm_iommu(d); struct pci_dev *pdev; int group_id, sdev_id; u32 bdf; int i = 0; const struct iommu_ops *ops = hd->platform_ops; if ( !iommu_enabled || !ops || !ops->get_device_group_id ) return 0; group_id = ops->get_device_group_id(seg, bus, devfn); spin_lock(&pcidevs_lock); for_each_pdev( d, pdev ) { if ( (pdev->seg != seg) || ((pdev->bus == bus) && (pdev->devfn == devfn)) ) continue; if ( xsm_get_device_group(XSM_HOOK, (seg << 16) | (pdev->bus << 8) | pdev->devfn) ) continue; sdev_id = ops->get_device_group_id(seg, pdev->bus, pdev->devfn); if ( (sdev_id == group_id) && (i < max_sdevs) ) { bdf = 0; bdf |= (pdev->bus & 0xff) << 16; bdf |= (pdev->devfn & 0xff) << 8; if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) ) { spin_unlock(&pcidevs_lock); return -1; } i++; } } spin_unlock(&pcidevs_lock); return i; } void iommu_update_ire_from_apic( unsigned int apic, unsigned int reg, unsigned int value) { const struct iommu_ops *ops = iommu_get_ops(); ops->update_ire_from_apic(apic, reg, value); } int iommu_update_ire_from_msi( struct msi_desc *msi_desc, struct msi_msg *msg) { const struct iommu_ops *ops = iommu_get_ops(); return iommu_intremap ? ops->update_ire_from_msi(msi_desc, msg) : 0; } void iommu_read_msi_from_ire( struct msi_desc *msi_desc, struct msi_msg *msg) { const struct iommu_ops *ops = iommu_get_ops(); if ( iommu_intremap ) ops->read_msi_from_ire(msi_desc, msg); } unsigned int iommu_read_apic_from_ire(unsigned int apic, unsigned int reg) { const struct iommu_ops *ops = iommu_get_ops(); return ops->read_apic_from_ire(apic, reg); } int __init iommu_setup_hpet_msi(struct msi_desc *msi) { const struct iommu_ops *ops = iommu_get_ops(); return ops->setup_hpet_msi ? 
ops->setup_hpet_msi(msi) : -ENODEV; } void iommu_resume() { const struct iommu_ops *ops = iommu_get_ops(); if ( iommu_enabled ) ops->resume(); } void iommu_suspend() { const struct iommu_ops *ops = iommu_get_ops(); if ( iommu_enabled ) ops->suspend(); } void iommu_share_p2m_table(struct domain* d) { const struct iommu_ops *ops = iommu_get_ops(); if ( iommu_enabled && is_hvm_domain(d) ) ops->share_p2m(d); } void iommu_crash_shutdown(void) { const struct iommu_ops *ops = iommu_get_ops(); if ( iommu_enabled ) ops->crash_shutdown(); iommu_enabled = iommu_intremap = 0; } int iommu_do_domctl( struct xen_domctl *domctl, struct domain *d, XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) { u16 seg; u8 bus, devfn; int ret = 0; if ( !iommu_enabled ) return -ENOSYS; switch ( domctl->cmd ) { case XEN_DOMCTL_get_device_group: { u32 max_sdevs; XEN_GUEST_HANDLE_64(uint32) sdevs; ret = xsm_get_device_group(XSM_HOOK, domctl->u.get_device_group.machine_sbdf); if ( ret ) break; seg = domctl->u.get_device_group.machine_sbdf >> 16; bus = (domctl->u.get_device_group.machine_sbdf >> 8) & 0xff; devfn = domctl->u.get_device_group.machine_sbdf & 0xff; max_sdevs = domctl->u.get_device_group.max_sdevs; sdevs = domctl->u.get_device_group.sdev_array; ret = iommu_get_device_group(d, seg, bus, devfn, sdevs, max_sdevs); if ( ret < 0 ) { dprintk(XENLOG_ERR, "iommu_get_device_group() failed!\n"); ret = -EFAULT; domctl->u.get_device_group.num_sdevs = 0; } else { domctl->u.get_device_group.num_sdevs = ret; ret = 0; } if ( __copy_field_to_guest(u_domctl, domctl, u.get_device_group) ) ret = -EFAULT; } break; case XEN_DOMCTL_test_assign_device: ret = xsm_test_assign_device(XSM_HOOK, domctl->u.assign_device.machine_sbdf); if ( ret ) break; seg = domctl->u.assign_device.machine_sbdf >> 16; bus = (domctl->u.assign_device.machine_sbdf >> 8) & 0xff; devfn = domctl->u.assign_device.machine_sbdf & 0xff; if ( device_assigned(seg, bus, devfn) ) { printk(XENLOG_G_INFO "%04x:%02x:%02x.%u already assigned, or non-existent\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = -EINVAL; } break; case XEN_DOMCTL_assign_device: if ( unlikely(d->is_dying) ) { ret = -EINVAL; break; } ret = xsm_assign_device(XSM_HOOK, d, domctl->u.assign_device.machine_sbdf); if ( ret ) break; seg = domctl->u.assign_device.machine_sbdf >> 16; bus = (domctl->u.assign_device.machine_sbdf >> 8) & 0xff; devfn = domctl->u.assign_device.machine_sbdf & 0xff; ret = device_assigned(seg, bus, devfn) ?: assign_device(d, seg, bus, devfn); if ( ret == -ERESTART ) ret = hypercall_create_continuation(__HYPERVISOR_domctl, "h", u_domctl); else if ( ret ) printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: " "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d->domain_id, ret); break; case XEN_DOMCTL_deassign_device: ret = xsm_deassign_device(XSM_HOOK, d, domctl->u.assign_device.machine_sbdf); if ( ret ) break; seg = domctl->u.assign_device.machine_sbdf >> 16; bus = (domctl->u.assign_device.machine_sbdf >> 8) & 0xff; devfn = domctl->u.assign_device.machine_sbdf & 0xff; spin_lock(&pcidevs_lock); ret = deassign_device(d, seg, bus, devfn); spin_unlock(&pcidevs_lock); if ( ret ) printk(XENLOG_G_ERR "deassign %04x:%02x:%02x.%u from dom%d failed (%d)\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d->domain_id, ret); break; default: ret = -ENOSYS; break; } return ret; } static void iommu_dump_p2m_table(unsigned char key) { struct domain *d; const struct iommu_ops *ops; if ( !iommu_enabled ) { printk("IOMMU not enabled!\n"); return; } ops = 
iommu_get_ops(); for_each_domain(d) { if ( !d->domain_id ) continue; if ( iommu_use_hap_pt(d) ) { printk("\ndomain%d IOMMU p2m table shared with MMU: \n", d->domain_id); continue; } printk("\ndomain%d IOMMU p2m table: \n", d->domain_id); ops->dump_p2m_table(d); } } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/passthrough/ats.h0000664000175000017500000000341012307313555016220 0ustar smbsmb/* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ #ifndef _ATS_H_ #define _ATS_H_ #include struct pci_ats_dev { struct list_head list; u16 seg; u8 bus; u8 devfn; u16 ats_queue_depth; /* ATS device invalidation queue depth */ }; #define ATS_REG_CAP 4 #define ATS_REG_CTL 6 #define ATS_QUEUE_DEPTH_MASK 0x1f #define ATS_ENABLE (1<<15) extern struct list_head ats_devices; extern bool_t ats_enabled; int enable_ats_device(int seg, int bus, int devfn); void disable_ats_device(int seg, int bus, int devfn); struct pci_ats_dev *get_ats_device(int seg, int bus, int devfn); static inline int pci_ats_enabled(int seg, int bus, int devfn) { u32 value; int pos; pos = pci_find_ext_capability(seg, bus, devfn, PCI_EXT_CAP_ID_ATS); BUG_ON(!pos); value = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos + ATS_REG_CTL); return value & ATS_ENABLE; } static inline int pci_ats_device(int seg, int bus, int devfn) { if ( !ats_enabled ) return 0; return pci_find_ext_capability(seg, bus, devfn, PCI_EXT_CAP_ID_ATS); } #endif /* _ATS_H_ */ xen-4.4.0/xen/drivers/passthrough/io.c0000664000175000017500000004220512307313555016040 0ustar smbsmb/* * Copyright (c) 2006, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
* * Copyright (C) Allen Kay * Copyright (C) Xiaohui Xin */ #include #include #include #include #include #include #include #include static void hvm_dirq_assist(unsigned long _d); bool_t pt_irq_need_timer(uint32_t flags) { return !(flags & (HVM_IRQ_DPCI_GUEST_MSI | HVM_IRQ_DPCI_TRANSLATE)); } static int pt_irq_guest_eoi(struct domain *d, struct hvm_pirq_dpci *pirq_dpci, void *arg) { if ( __test_and_clear_bit(_HVM_IRQ_DPCI_EOI_LATCH_SHIFT, &pirq_dpci->flags) ) { pirq_dpci->masked = 0; pirq_dpci->pending = 0; pirq_guest_eoi(dpci_pirq(pirq_dpci)); } return 0; } static void pt_irq_time_out(void *data) { struct hvm_pirq_dpci *irq_map = data; unsigned int guest_gsi; struct hvm_irq_dpci *dpci = NULL; struct dev_intx_gsi_link *digl; struct hvm_girq_dpci_mapping *girq; uint32_t device, intx; spin_lock(&irq_map->dom->event_lock); dpci = domain_get_irq_dpci(irq_map->dom); ASSERT(dpci); list_for_each_entry ( digl, &irq_map->digl_list, list ) { guest_gsi = digl->gsi; list_for_each_entry ( girq, &dpci->girq[guest_gsi], list ) { struct pirq *pirq = pirq_info(irq_map->dom, girq->machine_gsi); pirq_dpci(pirq)->flags |= HVM_IRQ_DPCI_EOI_LATCH; } device = digl->device; intx = digl->intx; hvm_pci_intx_deassert(irq_map->dom, device, intx); } pt_pirq_iterate(irq_map->dom, pt_irq_guest_eoi, NULL); spin_unlock(&irq_map->dom->event_lock); } struct hvm_irq_dpci *domain_get_irq_dpci(const struct domain *d) { if ( !d || !is_hvm_domain(d) ) return NULL; return d->arch.hvm_domain.irq.dpci; } void free_hvm_irq_dpci(struct hvm_irq_dpci *dpci) { xfree(dpci); } int pt_irq_create_bind( struct domain *d, xen_domctl_bind_pt_irq_t *pt_irq_bind) { struct hvm_irq_dpci *hvm_irq_dpci = NULL; struct hvm_pirq_dpci *pirq_dpci; struct pirq *info; uint32_t guest_gsi; uint32_t device, intx, link; struct dev_intx_gsi_link *digl; struct hvm_girq_dpci_mapping *girq; int rc, pirq = pt_irq_bind->machine_irq; if ( pirq < 0 || pirq >= d->nr_pirqs ) return -EINVAL; spin_lock(&d->event_lock); hvm_irq_dpci = domain_get_irq_dpci(d); if ( hvm_irq_dpci == NULL ) { hvm_irq_dpci = xzalloc(struct hvm_irq_dpci); if ( hvm_irq_dpci == NULL ) { spin_unlock(&d->event_lock); return -ENOMEM; } softirq_tasklet_init( &hvm_irq_dpci->dirq_tasklet, hvm_dirq_assist, (unsigned long)d); for ( int i = 0; i < NR_HVM_IRQS; i++ ) INIT_LIST_HEAD(&hvm_irq_dpci->girq[i]); d->arch.hvm_domain.irq.dpci = hvm_irq_dpci; } info = pirq_get_info(d, pirq); if ( !info ) { spin_unlock(&d->event_lock); return -ENOMEM; } pirq_dpci = pirq_dpci(info); if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI ) { uint8_t dest, dest_mode; int dest_vcpu_id; if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) ) { pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED | HVM_IRQ_DPCI_MACH_MSI | HVM_IRQ_DPCI_GUEST_MSI; pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec; pirq_dpci->gmsi.gflags = pt_irq_bind->u.msi.gflags; /* bind after hvm_irq_dpci is setup to avoid race with irq handler*/ rc = pirq_guest_bind(d->vcpu[0], info, 0); if ( rc == 0 && pt_irq_bind->u.msi.gtable ) { rc = msixtbl_pt_register(d, info, pt_irq_bind->u.msi.gtable); if ( unlikely(rc) ) pirq_guest_unbind(d, info); } if ( unlikely(rc) ) { pirq_dpci->gmsi.gflags = 0; pirq_dpci->gmsi.gvec = 0; pirq_dpci->flags = 0; pirq_cleanup_check(info, d); spin_unlock(&d->event_lock); return rc; } } else { uint32_t mask = HVM_IRQ_DPCI_MACH_MSI | HVM_IRQ_DPCI_GUEST_MSI; if ( (pirq_dpci->flags & mask) != mask) { spin_unlock(&d->event_lock); return -EBUSY; } /* if pirq is already mapped as vmsi, update the guest data/addr */ if ( pirq_dpci->gmsi.gvec != pt_irq_bind->u.msi.gvec || 
pirq_dpci->gmsi.gflags != pt_irq_bind->u.msi.gflags) { /* Directly clear pending EOIs before enabling new MSI info. */ pirq_guest_eoi(info); pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec; pirq_dpci->gmsi.gflags = pt_irq_bind->u.msi.gflags; } } /* Caculate dest_vcpu_id for MSI-type pirq migration */ dest = pirq_dpci->gmsi.gflags & VMSI_DEST_ID_MASK; dest_mode = !!(pirq_dpci->gmsi.gflags & VMSI_DM_MASK); dest_vcpu_id = hvm_girq_dest_2_vcpu_id(d, dest, dest_mode); pirq_dpci->gmsi.dest_vcpu_id = dest_vcpu_id; spin_unlock(&d->event_lock); if ( dest_vcpu_id >= 0 ) hvm_migrate_pirqs(d->vcpu[dest_vcpu_id]); } else { device = pt_irq_bind->u.pci.device; intx = pt_irq_bind->u.pci.intx; guest_gsi = hvm_pci_intx_gsi(device, intx); link = hvm_pci_intx_link(device, intx); hvm_irq_dpci->link_cnt[link]++; digl = xmalloc(struct dev_intx_gsi_link); if ( !digl ) { spin_unlock(&d->event_lock); return -ENOMEM; } girq = xmalloc(struct hvm_girq_dpci_mapping); if ( !girq ) { xfree(digl); spin_unlock(&d->event_lock); return -ENOMEM; } digl->device = device; digl->intx = intx; digl->gsi = guest_gsi; digl->link = link; list_add_tail(&digl->list, &pirq_dpci->digl_list); girq->device = device; girq->intx = intx; girq->machine_gsi = pirq; list_add_tail(&girq->list, &hvm_irq_dpci->girq[guest_gsi]); /* Bind the same mirq once in the same domain */ if ( !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) ) { unsigned int share; pirq_dpci->dom = d; if ( pt_irq_bind->irq_type == PT_IRQ_TYPE_MSI_TRANSLATE ) { pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED | HVM_IRQ_DPCI_MACH_MSI | HVM_IRQ_DPCI_GUEST_PCI | HVM_IRQ_DPCI_TRANSLATE; share = 0; } else /* PT_IRQ_TYPE_PCI */ { pirq_dpci->flags = HVM_IRQ_DPCI_MAPPED | HVM_IRQ_DPCI_MACH_PCI | HVM_IRQ_DPCI_GUEST_PCI; share = BIND_PIRQ__WILL_SHARE; } /* Init timer before binding */ if ( pt_irq_need_timer(pirq_dpci->flags) ) init_timer(&pirq_dpci->timer, pt_irq_time_out, pirq_dpci, 0); /* Deal with gsi for legacy devices */ rc = pirq_guest_bind(d->vcpu[0], info, share); if ( unlikely(rc) ) { if ( pt_irq_need_timer(pirq_dpci->flags) ) kill_timer(&pirq_dpci->timer); pirq_dpci->dom = NULL; list_del(&girq->list); xfree(girq); list_del(&digl->list); hvm_irq_dpci->link_cnt[link]--; pirq_dpci->flags = 0; pirq_cleanup_check(info, d); spin_unlock(&d->event_lock); xfree(digl); return rc; } } spin_unlock(&d->event_lock); if ( iommu_verbose ) dprintk(XENLOG_G_INFO, "d%d: bind: m_gsi=%u g_gsi=%u device=%u intx=%u\n", d->domain_id, pirq, guest_gsi, device, intx); } return 0; } int pt_irq_destroy_bind( struct domain *d, xen_domctl_bind_pt_irq_t *pt_irq_bind) { struct hvm_irq_dpci *hvm_irq_dpci = NULL; struct hvm_pirq_dpci *pirq_dpci; uint32_t machine_gsi, guest_gsi; uint32_t device, intx, link; struct dev_intx_gsi_link *digl, *tmp; struct hvm_girq_dpci_mapping *girq; struct pirq *pirq; machine_gsi = pt_irq_bind->machine_irq; device = pt_irq_bind->u.pci.device; intx = pt_irq_bind->u.pci.intx; guest_gsi = hvm_pci_intx_gsi(device, intx); link = hvm_pci_intx_link(device, intx); if ( iommu_verbose ) dprintk(XENLOG_G_INFO, "d%d: unbind: m_gsi=%u g_gsi=%u device=%u intx=%u\n", d->domain_id, machine_gsi, guest_gsi, device, intx); spin_lock(&d->event_lock); hvm_irq_dpci = domain_get_irq_dpci(d); if ( hvm_irq_dpci == NULL ) { spin_unlock(&d->event_lock); return -EINVAL; } hvm_irq_dpci->link_cnt[link]--; list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list ) { if ( girq->machine_gsi == machine_gsi ) { list_del(&girq->list); xfree(girq); break; } } pirq = pirq_info(d, machine_gsi); pirq_dpci = pirq_dpci(pirq); /* clear 
the mirq info */ if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) ) { list_for_each_entry_safe ( digl, tmp, &pirq_dpci->digl_list, list ) { if ( digl->device == device && digl->intx == intx && digl->link == link && digl->gsi == guest_gsi ) { list_del(&digl->list); xfree(digl); } } if ( list_empty(&pirq_dpci->digl_list) ) { pirq_guest_unbind(d, pirq); msixtbl_pt_unregister(d, pirq); if ( pt_irq_need_timer(pirq_dpci->flags) ) kill_timer(&pirq_dpci->timer); pirq_dpci->dom = NULL; pirq_dpci->flags = 0; pirq_cleanup_check(pirq, d); } } spin_unlock(&d->event_lock); if ( iommu_verbose ) dprintk(XENLOG_G_INFO, "d%d unmap: m_irq=%u device=%u intx=%u\n", d->domain_id, machine_gsi, device, intx); return 0; } void pt_pirq_init(struct domain *d, struct hvm_pirq_dpci *dpci) { INIT_LIST_HEAD(&dpci->digl_list); dpci->gmsi.dest_vcpu_id = -1; } bool_t pt_pirq_cleanup_check(struct hvm_pirq_dpci *dpci) { return !dpci->flags; } int pt_pirq_iterate(struct domain *d, int (*cb)(struct domain *, struct hvm_pirq_dpci *, void *), void *arg) { int rc = 0; unsigned int pirq = 0, n, i; struct pirq *pirqs[8]; ASSERT(spin_is_locked(&d->event_lock)); do { n = radix_tree_gang_lookup(&d->pirq_tree, (void **)pirqs, pirq, ARRAY_SIZE(pirqs)); for ( i = 0; i < n; ++i ) { struct hvm_pirq_dpci *pirq_dpci = pirq_dpci(pirqs[i]); pirq = pirqs[i]->pirq; if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) ) rc = cb(d, pirq_dpci, arg); } } while ( !rc && ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) ); return rc; } int hvm_do_IRQ_dpci(struct domain *d, struct pirq *pirq) { struct hvm_irq_dpci *dpci = domain_get_irq_dpci(d); struct hvm_pirq_dpci *pirq_dpci = pirq_dpci(pirq); if ( !iommu_enabled || !dpci || !pirq_dpci || !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) ) return 0; pirq_dpci->masked = 1; tasklet_schedule(&dpci->dirq_tasklet); return 1; } /* called with d->event_lock held */ static void __msi_pirq_eoi(struct hvm_pirq_dpci *pirq_dpci) { irq_desc_t *desc; if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) && (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) ) { struct pirq *pirq = dpci_pirq(pirq_dpci); BUG_ON(!local_irq_is_enabled()); desc = pirq_spin_lock_irq_desc(pirq, NULL); if ( !desc ) return; desc_guest_eoi(desc, pirq); } } static int _hvm_dpci_msi_eoi(struct domain *d, struct hvm_pirq_dpci *pirq_dpci, void *arg) { int vector = (long)arg; if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) && (pirq_dpci->gmsi.gvec == vector) ) { int dest = pirq_dpci->gmsi.gflags & VMSI_DEST_ID_MASK; int dest_mode = !!(pirq_dpci->gmsi.gflags & VMSI_DM_MASK); if ( vlapic_match_dest(vcpu_vlapic(current), NULL, 0, dest, dest_mode) ) { __msi_pirq_eoi(pirq_dpci); return 1; } } return 0; } void hvm_dpci_msi_eoi(struct domain *d, int vector) { if ( !iommu_enabled || !d->arch.hvm_domain.irq.dpci ) return; spin_lock(&d->event_lock); pt_pirq_iterate(d, _hvm_dpci_msi_eoi, (void *)(long)vector); spin_unlock(&d->event_lock); } static void hvm_pci_msi_assert( struct domain *d, struct hvm_pirq_dpci *pirq_dpci) { struct pirq *pirq = dpci_pirq(pirq_dpci); if ( hvm_domain_use_pirq(d, pirq) ) send_guest_pirq(d, pirq); else vmsi_deliver_pirq(d, pirq_dpci); } static int _hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci *pirq_dpci, void *arg) { uint32_t device, intx; struct dev_intx_gsi_link *digl; if ( test_and_clear_bool(pirq_dpci->masked) ) { if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI ) { hvm_pci_msi_assert(d, pirq_dpci); return 0; } list_for_each_entry ( digl, &pirq_dpci->digl_list, list ) { struct pirq *info = dpci_pirq(pirq_dpci); device = digl->device; intx = 
digl->intx; if ( hvm_domain_use_pirq(d, info) ) send_guest_pirq(d, info); else hvm_pci_intx_assert(d, device, intx); pirq_dpci->pending++; if ( pirq_dpci->flags & HVM_IRQ_DPCI_TRANSLATE ) { /* for translated MSI to INTx interrupt, eoi as early as possible */ __msi_pirq_eoi(pirq_dpci); } } /* * Set a timer to see if the guest can finish the interrupt or not. For * example, the guest OS may unmask the PIC during boot, before the * guest driver is loaded. hvm_pci_intx_assert() may succeed, but the * guest will never deal with the irq, then the physical interrupt line * will never be deasserted. */ if ( pt_irq_need_timer(pirq_dpci->flags) ) set_timer(&pirq_dpci->timer, NOW() + PT_IRQ_TIME_OUT); } return 0; } static void hvm_dirq_assist(unsigned long _d) { struct domain *d = (struct domain *)_d; ASSERT(d->arch.hvm_domain.irq.dpci); spin_lock(&d->event_lock); pt_pirq_iterate(d, _hvm_dirq_assist, NULL); spin_unlock(&d->event_lock); } static void __hvm_dpci_eoi(struct domain *d, struct hvm_girq_dpci_mapping *girq, union vioapic_redir_entry *ent) { uint32_t device, intx; struct pirq *pirq; struct hvm_pirq_dpci *pirq_dpci; device = girq->device; intx = girq->intx; hvm_pci_intx_deassert(d, device, intx); pirq = pirq_info(d, girq->machine_gsi); pirq_dpci = pirq_dpci(pirq); /* * No need to get vector lock for timer * since interrupt is still not EOIed */ if ( --pirq_dpci->pending || ( ent && ent->fields.mask ) || ! pt_irq_need_timer(pirq_dpci->flags) ) return; stop_timer(&pirq_dpci->timer); pirq_guest_eoi(pirq); } void hvm_dpci_eoi(struct domain *d, unsigned int guest_gsi, union vioapic_redir_entry *ent) { struct hvm_irq_dpci *hvm_irq_dpci; struct hvm_girq_dpci_mapping *girq; if ( !iommu_enabled ) return; if ( guest_gsi < NR_ISAIRQS ) { hvm_dpci_isairq_eoi(d, guest_gsi); return; } spin_lock(&d->event_lock); hvm_irq_dpci = domain_get_irq_dpci(d); if ( !hvm_irq_dpci ) goto unlock; list_for_each_entry ( girq, &hvm_irq_dpci->girq[guest_gsi], list ) __hvm_dpci_eoi(d, girq, ent); unlock: spin_unlock(&d->event_lock); } xen-4.4.0/xen/drivers/passthrough/x86/0000775000175000017500000000000012307313555015707 5ustar smbsmbxen-4.4.0/xen/drivers/passthrough/x86/ats.c0000664000175000017500000000724212307313555016647 0ustar smbsmb/* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
*/ #include #include #include #include "../ats.h" LIST_HEAD(ats_devices); bool_t __read_mostly ats_enabled = 1; boolean_param("ats", ats_enabled); int enable_ats_device(int seg, int bus, int devfn) { struct pci_ats_dev *pdev = NULL; u32 value; int pos; pos = pci_find_ext_capability(seg, bus, devfn, PCI_EXT_CAP_ID_ATS); BUG_ON(!pos); if ( iommu_verbose ) dprintk(XENLOG_INFO, "%04x:%02x:%02x.%u: ATS capability found\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); value = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos + ATS_REG_CTL); if ( value & ATS_ENABLE ) { list_for_each_entry ( pdev, &ats_devices, list ) { if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn ) { pos = 0; break; } } } if ( pos ) pdev = xmalloc(struct pci_ats_dev); if ( !pdev ) return -ENOMEM; if ( !(value & ATS_ENABLE) ) { value |= ATS_ENABLE; pci_conf_write16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos + ATS_REG_CTL, value); } if ( pos ) { pdev->seg = seg; pdev->bus = bus; pdev->devfn = devfn; value = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos + ATS_REG_CAP); pdev->ats_queue_depth = value & ATS_QUEUE_DEPTH_MASK ?: ATS_QUEUE_DEPTH_MASK + 1; list_add(&pdev->list, &ats_devices); } if ( iommu_verbose ) dprintk(XENLOG_INFO, "%04x:%02x:%02x.%u: ATS %s enabled\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos ? "is" : "was"); return pos; } void disable_ats_device(int seg, int bus, int devfn) { struct pci_ats_dev *pdev; u32 value; int pos; pos = pci_find_ext_capability(seg, bus, devfn, PCI_EXT_CAP_ID_ATS); BUG_ON(!pos); value = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos + ATS_REG_CTL); value &= ~ATS_ENABLE; pci_conf_write16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos + ATS_REG_CTL, value); list_for_each_entry ( pdev, &ats_devices, list ) { if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn ) { list_del(&pdev->list); xfree(pdev); break; } } if ( iommu_verbose ) dprintk(XENLOG_INFO, "%04x:%02x:%02x.%u: ATS is disabled\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); } struct pci_ats_dev *get_ats_device(int seg, int bus, int devfn) { struct pci_ats_dev *pdev; if ( !pci_ats_device(seg, bus, devfn) ) return NULL; list_for_each_entry ( pdev, &ats_devices, list ) { if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn ) return pdev; } return NULL; } xen-4.4.0/xen/drivers/passthrough/x86/Makefile0000664000175000017500000000001712307313555017345 0ustar smbsmbobj-y += ats.o xen-4.4.0/xen/drivers/passthrough/pci.c0000664000175000017500000006517512307313555016217 0ustar smbsmb/* * Copyright (C) 2008, Netronome Systems, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct pci_seg { struct list_head alldevs_list; u16 nr; unsigned long *ro_map; /* bus2bridge_lock protects bus2bridge array */ spinlock_t bus2bridge_lock; #define MAX_BUSES 256 struct { u8 map; u8 bus; u8 devfn; } bus2bridge[MAX_BUSES]; }; spinlock_t pcidevs_lock = SPIN_LOCK_UNLOCKED; static struct radix_tree_root pci_segments; static inline struct pci_seg *get_pseg(u16 seg) { return radix_tree_lookup(&pci_segments, seg); } bool_t pci_known_segment(u16 seg) { return get_pseg(seg) != NULL; } static struct pci_seg *alloc_pseg(u16 seg) { struct pci_seg *pseg = get_pseg(seg); if ( pseg ) return pseg; pseg = xzalloc(struct pci_seg); if ( !pseg ) return NULL; pseg->nr = seg; INIT_LIST_HEAD(&pseg->alldevs_list); spin_lock_init(&pseg->bus2bridge_lock); if ( radix_tree_insert(&pci_segments, seg, pseg) ) { xfree(pseg); pseg = NULL; } return pseg; } static int pci_segments_iterate( int (*handler)(struct pci_seg *, void *), void *arg) { u16 seg = 0; int rc = 0; do { struct pci_seg *pseg; if ( !radix_tree_gang_lookup(&pci_segments, (void **)&pseg, seg, 1) ) break; rc = handler(pseg, arg); seg = pseg->nr + 1; } while (!rc && seg); return rc; } void __init pt_pci_init(void) { radix_tree_init(&pci_segments); if ( !alloc_pseg(0) ) panic("Could not initialize PCI segment 0"); } int __init pci_add_segment(u16 seg) { return alloc_pseg(seg) ? 0 : -ENOMEM; } const unsigned long *pci_get_ro_map(u16 seg) { struct pci_seg *pseg = get_pseg(seg); return pseg ? pseg->ro_map : NULL; } static struct phantom_dev { u16 seg; u8 bus, slot, stride; } phantom_devs[8]; static unsigned int nr_phantom_devs; static void __init parse_phantom_dev(char *str) { const char *s = str; unsigned int seg, bus, slot; struct phantom_dev phantom; if ( !s || !*s || nr_phantom_devs >= ARRAY_SIZE(phantom_devs) ) return; s = parse_pci(s, &seg, &bus, &slot, NULL); if ( !s || *s != ',' ) return; phantom.seg = seg; phantom.bus = bus; phantom.slot = slot; switch ( phantom.stride = simple_strtol(s + 1, &s, 0) ) { case 1: case 2: case 4: if ( *s ) default: return; } phantom_devs[nr_phantom_devs++] = phantom; } custom_param("pci-phantom", parse_phantom_dev); static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn) { struct pci_dev *pdev; list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) if ( pdev->bus == bus && pdev->devfn == devfn ) return pdev; pdev = xzalloc(struct pci_dev); if ( !pdev ) return NULL; *(u16*) &pdev->seg = pseg->nr; *((u8*) &pdev->bus) = bus; *((u8*) &pdev->devfn) = devfn; pdev->domain = NULL; INIT_LIST_HEAD(&pdev->msi_list); if ( pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), PCI_CAP_ID_MSIX) ) { struct arch_msix *msix = xzalloc(struct arch_msix); if ( !msix ) { xfree(pdev); return NULL; } spin_lock_init(&msix->table_lock); pdev->msix = msix; } list_add(&pdev->alldevs_list, &pseg->alldevs_list); /* update bus2bridge */ switch ( pdev->type = pdev_type(pseg->nr, bus, devfn) ) { int pos; u16 cap; u8 sec_bus, sub_bus; case DEV_TYPE_PCIe2PCI_BRIDGE: case DEV_TYPE_LEGACY_PCI_BRIDGE: sec_bus = pci_conf_read8(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), PCI_SECONDARY_BUS); sub_bus = pci_conf_read8(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), PCI_SUBORDINATE_BUS); spin_lock(&pseg->bus2bridge_lock); for ( ; sec_bus <= sub_bus; sec_bus++ ) { pseg->bus2bridge[sec_bus].map = 1; pseg->bus2bridge[sec_bus].bus = bus; pseg->bus2bridge[sec_bus].devfn = 
devfn; } spin_unlock(&pseg->bus2bridge_lock); break; case DEV_TYPE_PCIe_ENDPOINT: pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), PCI_CAP_ID_EXP); BUG_ON(!pos); cap = pci_conf_read16(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pos + PCI_EXP_DEVCAP); if ( cap & PCI_EXP_DEVCAP_PHANTOM ) { pdev->phantom_stride = 8 >> MASK_EXTR(cap, PCI_EXP_DEVCAP_PHANTOM); if ( PCI_FUNC(devfn) >= pdev->phantom_stride ) pdev->phantom_stride = 0; } else { unsigned int i; for ( i = 0; i < nr_phantom_devs; ++i ) if ( phantom_devs[i].seg == pseg->nr && phantom_devs[i].bus == bus && phantom_devs[i].slot == PCI_SLOT(devfn) && phantom_devs[i].stride > PCI_FUNC(devfn) ) { pdev->phantom_stride = phantom_devs[i].stride; break; } } break; case DEV_TYPE_PCI: case DEV_TYPE_PCIe_BRIDGE: case DEV_TYPE_PCI_HOST_BRIDGE: break; default: printk(XENLOG_WARNING "%s: unknown type: %04x:%02x:%02x.%u\n", __func__, pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); break; } return pdev; } static void free_pdev(struct pci_seg *pseg, struct pci_dev *pdev) { /* update bus2bridge */ switch ( pdev->type ) { u8 dev, func, sec_bus, sub_bus; case DEV_TYPE_PCIe2PCI_BRIDGE: case DEV_TYPE_LEGACY_PCI_BRIDGE: dev = PCI_SLOT(pdev->devfn); func = PCI_FUNC(pdev->devfn); sec_bus = pci_conf_read8(pseg->nr, pdev->bus, dev, func, PCI_SECONDARY_BUS); sub_bus = pci_conf_read8(pseg->nr, pdev->bus, dev, func, PCI_SUBORDINATE_BUS); spin_lock(&pseg->bus2bridge_lock); for ( ; sec_bus <= sub_bus; sec_bus++ ) pseg->bus2bridge[sec_bus] = pseg->bus2bridge[pdev->bus]; spin_unlock(&pseg->bus2bridge_lock); break; default: break; } list_del(&pdev->alldevs_list); xfree(pdev->msix); xfree(pdev); } static void _pci_hide_device(struct pci_dev *pdev) { if ( pdev->domain ) return; pdev->domain = dom_xen; list_add(&pdev->domain_list, &dom_xen->arch.pdev_list); } int __init pci_hide_device(int bus, int devfn) { struct pci_dev *pdev; int rc = -ENOMEM; spin_lock(&pcidevs_lock); pdev = alloc_pdev(get_pseg(0), bus, devfn); if ( pdev ) { _pci_hide_device(pdev); rc = 0; } spin_unlock(&pcidevs_lock); return rc; } int __init pci_ro_device(int seg, int bus, int devfn) { struct pci_seg *pseg = alloc_pseg(seg); struct pci_dev *pdev; if ( !pseg ) return -ENOMEM; pdev = alloc_pdev(pseg, bus, devfn); if ( !pdev ) return -ENOMEM; if ( !pseg->ro_map ) { size_t sz = BITS_TO_LONGS(PCI_BDF(-1, -1, -1) + 1) * sizeof(long); pseg->ro_map = alloc_xenheap_pages(get_order_from_bytes(sz), 0); if ( !pseg->ro_map ) return -ENOMEM; memset(pseg->ro_map, 0, sz); } __set_bit(PCI_BDF2(bus, devfn), pseg->ro_map); arch_pci_ro_device(seg, PCI_BDF2(bus, devfn)); _pci_hide_device(pdev); return 0; } struct pci_dev *pci_get_pdev(int seg, int bus, int devfn) { struct pci_seg *pseg = get_pseg(seg); struct pci_dev *pdev = NULL; ASSERT(spin_is_locked(&pcidevs_lock)); ASSERT(seg != -1 || bus == -1); ASSERT(bus != -1 || devfn == -1); if ( !pseg ) { if ( seg == -1 ) radix_tree_gang_lookup(&pci_segments, (void **)&pseg, 0, 1); if ( !pseg ) return NULL; } do { list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) if ( (pdev->bus == bus || bus == -1) && (pdev->devfn == devfn || devfn == -1) ) return pdev; } while ( radix_tree_gang_lookup(&pci_segments, (void **)&pseg, pseg->nr + 1, 1) ); return NULL; } struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn) { struct pci_dev *pdev; int stride; if ( seg < 0 || bus < 0 || devfn < 0 ) return NULL; for ( pdev = pci_get_pdev(seg, bus, devfn), stride = 4; !pdev && stride; stride >>= 1 ) { if ( !(devfn & (8 - stride)) ) continue; pdev = 
pci_get_pdev(seg, bus, devfn & ~(8 - stride)); if ( pdev && stride != pdev->phantom_stride ) pdev = NULL; } return pdev; } struct pci_dev *pci_get_pdev_by_domain( struct domain *d, int seg, int bus, int devfn) { struct pci_seg *pseg = get_pseg(seg); struct pci_dev *pdev = NULL; ASSERT(seg != -1 || bus == -1); ASSERT(bus != -1 || devfn == -1); if ( !pseg ) { if ( seg == -1 ) radix_tree_gang_lookup(&pci_segments, (void **)&pseg, 0, 1); if ( !pseg ) return NULL; } do { list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) if ( (pdev->bus == bus || bus == -1) && (pdev->devfn == devfn || devfn == -1) && (pdev->domain == d) ) return pdev; } while ( radix_tree_gang_lookup(&pci_segments, (void **)&pseg, pseg->nr + 1, 1) ); return NULL; } /** * pci_enable_acs - enable ACS if hardware support it * @dev: the PCI device */ static void pci_enable_acs(struct pci_dev *pdev) { int pos; u16 cap, ctrl, seg = pdev->seg; u8 bus = pdev->bus; u8 dev = PCI_SLOT(pdev->devfn); u8 func = PCI_FUNC(pdev->devfn); if ( !iommu_enabled ) return; pos = pci_find_ext_capability(seg, bus, pdev->devfn, PCI_EXT_CAP_ID_ACS); if (!pos) return; cap = pci_conf_read16(seg, bus, dev, func, pos + PCI_ACS_CAP); ctrl = pci_conf_read16(seg, bus, dev, func, pos + PCI_ACS_CTRL); /* Source Validation */ ctrl |= (cap & PCI_ACS_SV); /* P2P Request Redirect */ ctrl |= (cap & PCI_ACS_RR); /* P2P Completion Redirect */ ctrl |= (cap & PCI_ACS_CR); /* Upstream Forwarding */ ctrl |= (cap & PCI_ACS_UF); pci_conf_write16(seg, bus, dev, func, pos + PCI_ACS_CTRL, ctrl); } int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *info) { struct pci_seg *pseg; struct pci_dev *pdev; unsigned int slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn); const char *pdev_type; int ret; if (!info) pdev_type = "device"; else if (info->is_extfn) pdev_type = "extended function"; else if (info->is_virtfn) { spin_lock(&pcidevs_lock); pdev = pci_get_pdev(seg, info->physfn.bus, info->physfn.devfn); spin_unlock(&pcidevs_lock); if ( !pdev ) pci_add_device(seg, info->physfn.bus, info->physfn.devfn, NULL); pdev_type = "virtual function"; } else { info = NULL; pdev_type = "device"; } ret = xsm_resource_plug_pci(XSM_PRIV, (seg << 16) | (bus << 8) | devfn); if ( ret ) return ret; ret = -ENOMEM; spin_lock(&pcidevs_lock); pseg = alloc_pseg(seg); if ( !pseg ) goto out; pdev = alloc_pdev(pseg, bus, devfn); if ( !pdev ) goto out; if ( info ) pdev->info = *info; else if ( !pdev->vf_rlen[0] ) { unsigned int pos = pci_find_ext_capability(seg, bus, devfn, PCI_EXT_CAP_ID_SRIOV); u16 ctrl = pci_conf_read16(seg, bus, slot, func, pos + PCI_SRIOV_CTRL); if ( !pos ) /* Nothing */; else if ( !(ctrl & (PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE)) ) { unsigned int i; BUILD_BUG_ON(ARRAY_SIZE(pdev->vf_rlen) != PCI_SRIOV_NUM_BARS); for ( i = 0; i < PCI_SRIOV_NUM_BARS; ++i ) { unsigned int idx = pos + PCI_SRIOV_BAR + i * 4; u32 bar = pci_conf_read32(seg, bus, slot, func, idx); u32 hi = 0; if ( (bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO ) { printk(XENLOG_WARNING "SR-IOV device %04x:%02x:%02x.%u with vf BAR%u" " in IO space\n", seg, bus, slot, func, i); continue; } pci_conf_write32(seg, bus, slot, func, idx, ~0); if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64 ) { if ( i >= PCI_SRIOV_NUM_BARS ) { printk(XENLOG_WARNING "SR-IOV device %04x:%02x:%02x.%u with 64-bit" " vf BAR in last slot\n", seg, bus, slot, func); break; } hi = pci_conf_read32(seg, bus, slot, func, idx + 4); pci_conf_write32(seg, bus, slot, func, idx + 4, ~0); } 
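/*
 * Classic PCI BAR sizing: all-ones has just been written to the VF BAR
 * (and to its upper half for a 64-bit BAR), so reading it back yields a
 * value whose writable address bits are set and whose hard-wired low
 * bits are clear.  Masking off the type bits and, once the original BAR
 * contents have been restored below, negating the result gives the
 * length of the VF memory region recorded in vf_rlen[].
 */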
pdev->vf_rlen[i] = pci_conf_read32(seg, bus, slot, func, idx) & PCI_BASE_ADDRESS_MEM_MASK; if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64 ) { pdev->vf_rlen[i] |= (u64)pci_conf_read32(seg, bus, slot, func, idx + 4) << 32; pci_conf_write32(seg, bus, slot, func, idx + 4, hi); } else if ( pdev->vf_rlen[i] ) pdev->vf_rlen[i] |= (u64)~0 << 32; pci_conf_write32(seg, bus, slot, func, idx, bar); pdev->vf_rlen[i] = -pdev->vf_rlen[i]; if ( (bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64 ) ++i; } } else printk(XENLOG_WARNING "SR-IOV device %04x:%02x:%02x.%u has its virtual" " functions already enabled (%04x)\n", seg, bus, slot, func, ctrl); } ret = 0; if ( !pdev->domain ) { pdev->domain = dom0; ret = iommu_add_device(pdev); if ( ret ) { pdev->domain = NULL; goto out; } list_add(&pdev->domain_list, &dom0->arch.pdev_list); } else iommu_enable_device(pdev); pci_enable_acs(pdev); out: spin_unlock(&pcidevs_lock); if ( !ret ) { printk(XENLOG_DEBUG "PCI add %s %04x:%02x:%02x.%u\n", pdev_type, seg, bus, slot, func); while ( pdev->phantom_stride ) { func += pdev->phantom_stride; if ( PCI_SLOT(func) ) break; printk(XENLOG_DEBUG "PCI phantom %04x:%02x:%02x.%u\n", seg, bus, slot, func); } } return ret; } int pci_remove_device(u16 seg, u8 bus, u8 devfn) { struct pci_seg *pseg = get_pseg(seg); struct pci_dev *pdev; int ret; ret = xsm_resource_unplug_pci(XSM_PRIV, (seg << 16) | (bus << 8) | devfn); if ( ret ) return ret; ret = -ENODEV; if ( !pseg ) return -ENODEV; spin_lock(&pcidevs_lock); list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) if ( pdev->bus == bus && pdev->devfn == devfn ) { ret = iommu_remove_device(pdev); if ( pdev->domain ) list_del(&pdev->domain_list); pci_cleanup_msi(pdev); free_pdev(pseg, pdev); printk(XENLOG_DEBUG "PCI remove device %04x:%02x:%02x.%u\n", seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); break; } spin_unlock(&pcidevs_lock); return ret; } static int pci_clean_dpci_irq(struct domain *d, struct hvm_pirq_dpci *pirq_dpci, void *arg) { struct dev_intx_gsi_link *digl, *tmp; pirq_guest_unbind(d, dpci_pirq(pirq_dpci)); if ( pt_irq_need_timer(pirq_dpci->flags) ) kill_timer(&pirq_dpci->timer); list_for_each_entry_safe ( digl, tmp, &pirq_dpci->digl_list, list ) { list_del(&digl->list); xfree(digl); } return 0; } static void pci_clean_dpci_irqs(struct domain *d) { struct hvm_irq_dpci *hvm_irq_dpci = NULL; if ( !iommu_enabled ) return; if ( !is_hvm_domain(d) ) return; spin_lock(&d->event_lock); hvm_irq_dpci = domain_get_irq_dpci(d); if ( hvm_irq_dpci != NULL ) { tasklet_kill(&hvm_irq_dpci->dirq_tasklet); pt_pirq_iterate(d, pci_clean_dpci_irq, NULL); d->arch.hvm_domain.irq.dpci = NULL; free_hvm_irq_dpci(hvm_irq_dpci); } spin_unlock(&d->event_lock); } void pci_release_devices(struct domain *d) { struct pci_dev *pdev; u8 bus, devfn; spin_lock(&pcidevs_lock); pci_clean_dpci_irqs(d); while ( (pdev = pci_get_pdev_by_domain(d, -1, -1, -1)) ) { bus = pdev->bus; devfn = pdev->devfn; if ( deassign_device(d, pdev->seg, bus, devfn) ) printk("domain %d: deassign device (%04x:%02x:%02x.%u) failed!\n", d->domain_id, pdev->seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); } spin_unlock(&pcidevs_lock); } #define PCI_CLASS_BRIDGE_HOST 0x0600 #define PCI_CLASS_BRIDGE_PCI 0x0604 enum pdev_type pdev_type(u16 seg, u8 bus, u8 devfn) { u16 class_device, creg; u8 d = PCI_SLOT(devfn), f = PCI_FUNC(devfn); int pos = pci_find_cap_offset(seg, bus, d, f, PCI_CAP_ID_EXP); class_device = pci_conf_read16(seg, bus, d, f, PCI_CLASS_DEVICE); switch ( class_device ) { case 
PCI_CLASS_BRIDGE_PCI: if ( !pos ) return DEV_TYPE_LEGACY_PCI_BRIDGE; creg = pci_conf_read16(seg, bus, d, f, pos + PCI_EXP_FLAGS); switch ( (creg & PCI_EXP_FLAGS_TYPE) >> 4 ) { case PCI_EXP_TYPE_PCI_BRIDGE: return DEV_TYPE_PCIe2PCI_BRIDGE; case PCI_EXP_TYPE_PCIE_BRIDGE: return DEV_TYPE_PCI2PCIe_BRIDGE; } return DEV_TYPE_PCIe_BRIDGE; case PCI_CLASS_BRIDGE_HOST: return DEV_TYPE_PCI_HOST_BRIDGE; case 0x0000: case 0xffff: return DEV_TYPE_PCI_UNKNOWN; } return pos ? DEV_TYPE_PCIe_ENDPOINT : DEV_TYPE_PCI; } /* * find the upstream PCIe-to-PCI/PCIX bridge or PCI legacy bridge * return 0: the device is an integrated PCI device or a PCIe device * return 1: found a PCIe-to-PCI/PCIX bridge or a PCI legacy bridge * return -1: fail */ int find_upstream_bridge(u16 seg, u8 *bus, u8 *devfn, u8 *secbus) { struct pci_seg *pseg = get_pseg(seg); int ret = 0; int cnt = 0; if ( *bus == 0 ) return 0; if ( !pseg ) return -1; if ( !pseg->bus2bridge[*bus].map ) return 0; ret = 1; spin_lock(&pseg->bus2bridge_lock); while ( pseg->bus2bridge[*bus].map ) { *secbus = *bus; *devfn = pseg->bus2bridge[*bus].devfn; *bus = pseg->bus2bridge[*bus].bus; if ( cnt++ >= MAX_BUSES ) { ret = -1; goto out; } } out: spin_unlock(&pseg->bus2bridge_lock); return ret; } /* * detect pci device, return 1 if it exists, or return 0 if it does not */ int __init pci_device_detect(u16 seg, u8 bus, u8 dev, u8 func) { u32 vendor; vendor = pci_conf_read32(seg, bus, dev, func, PCI_VENDOR_ID); /* some broken boards return 0 or ~0 if a slot is empty: */ if ( (vendor == 0xffffffff) || (vendor == 0x00000000) || (vendor == 0x0000ffff) || (vendor == 0xffff0000) ) return 0; return 1; } void pci_check_disable_device(u16 seg, u8 bus, u8 devfn) { struct pci_dev *pdev; s_time_t now = NOW(); u16 cword; spin_lock(&pcidevs_lock); pdev = pci_get_real_pdev(seg, bus, devfn); if ( pdev ) { if ( now < pdev->fault.time || now - pdev->fault.time > MILLISECS(10) ) pdev->fault.count >>= 1; pdev->fault.time = now; if ( ++pdev->fault.count < PT_FAULT_THRESHOLD ) pdev = NULL; } spin_unlock(&pcidevs_lock); if ( !pdev ) return; /* Tell the device to stop DMAing; we can't rely on the guest to * control it for us. */ devfn = pdev->devfn; cword = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), PCI_COMMAND); pci_conf_write16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), PCI_COMMAND, cword & ~PCI_COMMAND_MASTER); } /* * scan pci devices to add all existing PCI devices to alldevs_list, * and setup pci hierarchy in array bus2bridge.
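 * Functions 1-7 of a slot are only scanned when function 0 reports a
 * multi-function header (bit 7 of PCI_HEADER_TYPE set).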
*/ static int __init _scan_pci_devices(struct pci_seg *pseg, void *arg) { struct pci_dev *pdev; int bus, dev, func; for ( bus = 0; bus < 256; bus++ ) { for ( dev = 0; dev < 32; dev++ ) { for ( func = 0; func < 8; func++ ) { if ( pci_device_detect(pseg->nr, bus, dev, func) == 0 ) { if ( !func ) break; continue; } pdev = alloc_pdev(pseg, bus, PCI_DEVFN(dev, func)); if ( !pdev ) { printk("%s: alloc_pdev failed.\n", __func__); return -ENOMEM; } if ( !func && !(pci_conf_read8(pseg->nr, bus, dev, func, PCI_HEADER_TYPE) & 0x80) ) break; } } } return 0; } int __init scan_pci_devices(void) { int ret; spin_lock(&pcidevs_lock); ret = pci_segments_iterate(_scan_pci_devices, NULL); spin_unlock(&pcidevs_lock); return ret; } struct setup_dom0 { struct domain *d; int (*handler)(u8 devfn, struct pci_dev *); }; static void setup_one_dom0_device(const struct setup_dom0 *ctxt, struct pci_dev *pdev) { u8 devfn = pdev->devfn; do { int err = ctxt->handler(devfn, pdev); if ( err ) { printk(XENLOG_ERR "setup %04x:%02x:%02x.%u for d%d failed (%d)\n", pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), ctxt->d->domain_id, err); if ( devfn == pdev->devfn ) return; } devfn += pdev->phantom_stride; } while ( devfn != pdev->devfn && PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) ); } static int __init _setup_dom0_pci_devices(struct pci_seg *pseg, void *arg) { struct setup_dom0 *ctxt = arg; int bus, devfn; for ( bus = 0; bus < 256; bus++ ) { for ( devfn = 0; devfn < 256; devfn++ ) { struct pci_dev *pdev = pci_get_pdev(pseg->nr, bus, devfn); if ( !pdev ) continue; if ( !pdev->domain ) { pdev->domain = ctxt->d; list_add(&pdev->domain_list, &ctxt->d->arch.pdev_list); setup_one_dom0_device(ctxt, pdev); } else if ( pdev->domain == dom_xen ) { pdev->domain = ctxt->d; setup_one_dom0_device(ctxt, pdev); pdev->domain = dom_xen; } else if ( pdev->domain != ctxt->d ) printk(XENLOG_WARNING "Dom%d owning %04x:%02x:%02x.%u?\n", pdev->domain->domain_id, pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); } } return 0; } void __init setup_dom0_pci_devices( struct domain *d, int (*handler)(u8 devfn, struct pci_dev *)) { struct setup_dom0 ctxt = { .d = d, .handler = handler }; spin_lock(&pcidevs_lock); pci_segments_iterate(_setup_dom0_pci_devices, &ctxt); spin_unlock(&pcidevs_lock); } static int _dump_pci_devices(struct pci_seg *pseg, void *arg) { struct pci_dev *pdev; struct msi_desc *msi; printk("==== segment %04x ====\n", pseg->nr); list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) { printk("%04x:%02x:%02x.%u - dom %-3d - MSIs < ", pseg->nr, pdev->bus, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), pdev->domain ? 
pdev->domain->domain_id : -1); list_for_each_entry ( msi, &pdev->msi_list, list ) printk("%d ", msi->irq); printk(">\n"); } return 0; } static void dump_pci_devices(unsigned char ch) { printk("==== PCI devices ====\n"); spin_lock(&pcidevs_lock); pci_segments_iterate(_dump_pci_devices, NULL); spin_unlock(&pcidevs_lock); } struct keyhandler dump_pci_devices_keyhandler = { .diagnostic = 1, .u.fn = dump_pci_devices, .desc = "dump PCI devices" }; static int __init setup_dump_pcidevs(void) { register_keyhandler('Q', &dump_pci_devices_keyhandler); return 0; } __initcall(setup_dump_pcidevs); /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/passthrough/amd/0000775000175000017500000000000012307313555016023 5ustar smbsmbxen-4.4.0/xen/drivers/passthrough/amd/iommu_intr.c0000664000175000017500000005061712307313555020362 0ustar smbsmb/* * Copyright (C) 2007 Advanced Micro Devices, Inc. * Author: Wei Wang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #define INTREMAP_TABLE_ORDER 1 #define INTREMAP_LENGTH 0xB #define INTREMAP_ENTRIES (1 << INTREMAP_LENGTH) struct ioapic_sbdf ioapic_sbdf[MAX_IO_APICS]; struct hpet_sbdf hpet_sbdf; void *shared_intremap_table; unsigned long *shared_intremap_inuse; static DEFINE_SPINLOCK(shared_intremap_lock); static void dump_intremap_tables(unsigned char key); static struct keyhandler dump_intremap = { .diagnostic = 0, .u.fn = dump_intremap_tables, .desc = "dump IOMMU intremap tables" }; static spinlock_t* get_intremap_lock(int seg, int req_id) { return (amd_iommu_perdev_intremap ? 
&get_ivrs_mappings(seg)[req_id].intremap_lock: &shared_intremap_lock); } static int get_intremap_requestor_id(int seg, int bdf) { ASSERT( bdf < ivrs_bdf_entries ); return get_ivrs_mappings(seg)[bdf].dte_requestor_id; } static unsigned int alloc_intremap_entry(int seg, int bdf, unsigned int nr) { unsigned long *inuse = get_ivrs_mappings(seg)[bdf].intremap_inuse; unsigned int slot = find_first_zero_bit(inuse, INTREMAP_ENTRIES); for ( ; ; ) { unsigned int end; if ( slot >= INTREMAP_ENTRIES ) break; end = find_next_bit(inuse, INTREMAP_ENTRIES, slot + 1); if ( end > INTREMAP_ENTRIES ) end = INTREMAP_ENTRIES; slot = (slot + nr - 1) & ~(nr - 1); if ( slot + nr <= end ) { while ( nr-- ) __set_bit(slot + nr, inuse); break; } slot = (end + nr) & ~(nr - 1); if ( slot >= INTREMAP_ENTRIES ) break; slot = find_next_zero_bit(inuse, INTREMAP_ENTRIES, slot); } return slot; } static u32 *get_intremap_entry(int seg, int bdf, int offset) { u32 *table = get_ivrs_mappings(seg)[bdf].intremap_table; ASSERT( (table != NULL) && (offset < INTREMAP_ENTRIES) ); return table + offset; } static void free_intremap_entry(int seg, int bdf, int offset) { u32 *entry = get_intremap_entry(seg, bdf, offset); memset(entry, 0, sizeof(u32)); __clear_bit(offset, get_ivrs_mappings(seg)[bdf].intremap_inuse); } static void update_intremap_entry(u32* entry, u8 vector, u8 int_type, u8 dest_mode, u8 dest) { set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0, INT_REMAP_ENTRY_REMAPEN_MASK, INT_REMAP_ENTRY_REMAPEN_SHIFT, entry); set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, *entry, INT_REMAP_ENTRY_SUPIOPF_MASK, INT_REMAP_ENTRY_SUPIOPF_SHIFT, entry); set_field_in_reg_u32(int_type, *entry, INT_REMAP_ENTRY_INTTYPE_MASK, INT_REMAP_ENTRY_INTTYPE_SHIFT, entry); set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, *entry, INT_REMAP_ENTRY_REQEOI_MASK, INT_REMAP_ENTRY_REQEOI_SHIFT, entry); set_field_in_reg_u32((u32)dest_mode, *entry, INT_REMAP_ENTRY_DM_MASK, INT_REMAP_ENTRY_DM_SHIFT, entry); set_field_in_reg_u32((u32)dest, *entry, INT_REMAP_ENTRY_DEST_MAST, INT_REMAP_ENTRY_DEST_SHIFT, entry); set_field_in_reg_u32((u32)vector, *entry, INT_REMAP_ENTRY_VECTOR_MASK, INT_REMAP_ENTRY_VECTOR_SHIFT, entry); } static inline int get_rte_index(const struct IO_APIC_route_entry *rte) { return rte->vector | (rte->delivery_mode << 8); } static inline void set_rte_index(struct IO_APIC_route_entry *rte, int offset) { rte->vector = (u8)offset; rte->delivery_mode = offset >> 8; } static int update_intremap_entry_from_ioapic( int bdf, struct amd_iommu *iommu, struct IO_APIC_route_entry *rte, bool_t lo_update, u16 *index) { unsigned long flags; u32* entry; u8 delivery_mode, dest, vector, dest_mode; int req_id; spinlock_t *lock; unsigned int offset; req_id = get_intremap_requestor_id(iommu->seg, bdf); lock = get_intremap_lock(iommu->seg, req_id); delivery_mode = rte->delivery_mode; vector = rte->vector; dest_mode = rte->dest_mode; dest = rte->dest.logical.logical_dest; spin_lock_irqsave(lock, flags); offset = *index; if ( offset >= INTREMAP_ENTRIES ) { offset = alloc_intremap_entry(iommu->seg, req_id, 1); if ( offset >= INTREMAP_ENTRIES ) { spin_unlock_irqrestore(lock, flags); rte->mask = 1; return -ENOSPC; } *index = offset; lo_update = 1; } entry = get_intremap_entry(iommu->seg, req_id, offset); if ( !lo_update ) { /* * Low half of incoming RTE is already in remapped format, * so need to recover vector and delivery mode from IRTE. 
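 * (In remapped format the low half carries the IRTE index, split across the
 * vector and delivery mode fields; see get_rte_index()/set_rte_index().)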
*/ ASSERT(get_rte_index(rte) == offset); vector = get_field_from_reg_u32(*entry, INT_REMAP_ENTRY_VECTOR_MASK, INT_REMAP_ENTRY_VECTOR_SHIFT); delivery_mode = get_field_from_reg_u32(*entry, INT_REMAP_ENTRY_INTTYPE_MASK, INT_REMAP_ENTRY_INTTYPE_SHIFT); } update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); spin_unlock_irqrestore(lock, flags); if ( iommu->enabled ) { spin_lock_irqsave(&iommu->lock, flags); amd_iommu_flush_intremap(iommu, req_id); spin_unlock_irqrestore(&iommu->lock, flags); } set_rte_index(rte, offset); return 0; } int __init amd_iommu_setup_ioapic_remapping(void) { struct IO_APIC_route_entry rte; unsigned long flags; u32* entry; int apic, pin; u8 delivery_mode, dest, vector, dest_mode; u16 seg, bdf, req_id; struct amd_iommu *iommu; spinlock_t *lock; unsigned int offset; /* Read ioapic entries and update interrupt remapping table accordingly */ for ( apic = 0; apic < nr_ioapics; apic++ ) { for ( pin = 0; pin < nr_ioapic_entries[apic]; pin++ ) { rte = __ioapic_read_entry(apic, pin, 1); if ( rte.mask == 1 ) continue; /* get device id of ioapic devices */ bdf = ioapic_sbdf[IO_APIC_ID(apic)].bdf; seg = ioapic_sbdf[IO_APIC_ID(apic)].seg; iommu = find_iommu_for_device(seg, bdf); if ( !iommu ) { AMD_IOMMU_DEBUG("Fail to find iommu for ioapic " "device id = %04x:%04x\n", seg, bdf); continue; } req_id = get_intremap_requestor_id(iommu->seg, bdf); lock = get_intremap_lock(iommu->seg, req_id); delivery_mode = rte.delivery_mode; vector = rte.vector; dest_mode = rte.dest_mode; dest = rte.dest.logical.logical_dest; spin_lock_irqsave(lock, flags); offset = alloc_intremap_entry(seg, req_id, 1); BUG_ON(offset >= INTREMAP_ENTRIES); entry = get_intremap_entry(iommu->seg, req_id, offset); update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); spin_unlock_irqrestore(lock, flags); set_rte_index(&rte, offset); ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx[pin] = offset; __ioapic_write_entry(apic, pin, 1, rte); if ( iommu->enabled ) { spin_lock_irqsave(&iommu->lock, flags); amd_iommu_flush_intremap(iommu, req_id); spin_unlock_irqrestore(&iommu->lock, flags); } } } register_keyhandler('V', &dump_intremap); return 0; } void amd_iommu_ioapic_update_ire( unsigned int apic, unsigned int reg, unsigned int value) { struct IO_APIC_route_entry old_rte = { 0 }; struct IO_APIC_route_entry new_rte = { 0 }; unsigned int rte_lo = (reg & 1) ? 
reg - 1 : reg; unsigned int pin = (reg - 0x10) / 2; int saved_mask, seg, bdf, rc; struct amd_iommu *iommu; if ( !iommu_intremap ) { __io_apic_write(apic, reg, value); return; } /* get device id of ioapic devices */ bdf = ioapic_sbdf[IO_APIC_ID(apic)].bdf; seg = ioapic_sbdf[IO_APIC_ID(apic)].seg; iommu = find_iommu_for_device(seg, bdf); if ( !iommu ) { AMD_IOMMU_DEBUG("Fail to find iommu for ioapic device id =" " %04x:%04x\n", seg, bdf); __io_apic_write(apic, reg, value); return; } /* save io-apic rte lower 32 bits */ *((u32 *)&old_rte) = __io_apic_read(apic, rte_lo); saved_mask = old_rte.mask; if ( reg == rte_lo ) { *((u32 *)&new_rte) = value; /* read upper 32 bits from io-apic rte */ *(((u32 *)&new_rte) + 1) = __io_apic_read(apic, reg + 1); } else { *((u32 *)&new_rte) = *((u32 *)&old_rte); *(((u32 *)&new_rte) + 1) = value; } if ( new_rte.mask && ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx[pin] >= INTREMAP_ENTRIES ) { ASSERT(saved_mask); __io_apic_write(apic, reg, value); return; } /* mask the interrupt while we change the intremap table */ if ( !saved_mask ) { old_rte.mask = 1; __io_apic_write(apic, rte_lo, *((u32 *)&old_rte)); } /* Update interrupt remapping entry */ rc = update_intremap_entry_from_ioapic( bdf, iommu, &new_rte, reg == rte_lo, &ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx[pin]); __io_apic_write(apic, reg, ((u32 *)&new_rte)[reg != rte_lo]); if ( rc ) { /* Keep the entry masked. */ printk(XENLOG_ERR "Remapping IO-APIC %#x pin %u failed (%d)\n", IO_APIC_ID(apic), pin, rc); return; } /* For lower bits access, return directly to avoid double writes */ if ( reg == rte_lo ) return; /* unmask the interrupt after we have updated the intremap table */ if ( !saved_mask ) { old_rte.mask = saved_mask; __io_apic_write(apic, rte_lo, *((u32 *)&old_rte)); } } unsigned int amd_iommu_read_ioapic_from_ire( unsigned int apic, unsigned int reg) { unsigned int val = __io_apic_read(apic, reg); if ( !(reg & 1) ) { unsigned int offset = val & (INTREMAP_ENTRIES - 1); u16 bdf = ioapic_sbdf[IO_APIC_ID(apic)].bdf; u16 seg = ioapic_sbdf[IO_APIC_ID(apic)].seg; u16 req_id = get_intremap_requestor_id(seg, bdf); const u32 *entry = get_intremap_entry(seg, req_id, offset); val &= ~(INTREMAP_ENTRIES - 1); val |= get_field_from_reg_u32(*entry, INT_REMAP_ENTRY_INTTYPE_MASK, INT_REMAP_ENTRY_INTTYPE_SHIFT) << 8; val |= get_field_from_reg_u32(*entry, INT_REMAP_ENTRY_VECTOR_MASK, INT_REMAP_ENTRY_VECTOR_SHIFT); } return val; } static int update_intremap_entry_from_msi_msg( struct amd_iommu *iommu, u16 bdf, unsigned int nr, int *remap_index, const struct msi_msg *msg, u32 *data) { unsigned long flags; u32* entry; u16 req_id, alias_id; u8 delivery_mode, dest, vector, dest_mode; spinlock_t *lock; unsigned int offset, i; req_id = get_dma_requestor_id(iommu->seg, bdf); alias_id = get_intremap_requestor_id(iommu->seg, bdf); if ( msg == NULL ) { lock = get_intremap_lock(iommu->seg, req_id); spin_lock_irqsave(lock, flags); for ( i = 0; i < nr; ++i ) free_intremap_entry(iommu->seg, req_id, *remap_index + i); spin_unlock_irqrestore(lock, flags); goto done; } lock = get_intremap_lock(iommu->seg, req_id); spin_lock_irqsave(lock, flags); dest_mode = (msg->address_lo >> MSI_ADDR_DESTMODE_SHIFT) & 0x1; delivery_mode = (msg->data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x1; vector = (msg->data >> MSI_DATA_VECTOR_SHIFT) & MSI_DATA_VECTOR_MASK; dest = (msg->address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff; offset = *remap_index; if ( offset >= INTREMAP_ENTRIES ) { ASSERT(nr); offset = alloc_intremap_entry(iommu->seg, bdf, nr); if ( offset >= 
INTREMAP_ENTRIES ) { spin_unlock_irqrestore(lock, flags); return -ENOSPC; } *remap_index = offset; } entry = get_intremap_entry(iommu->seg, req_id, offset); update_intremap_entry(entry, vector, delivery_mode, dest_mode, dest); spin_unlock_irqrestore(lock, flags); *data = (msg->data & ~(INTREMAP_ENTRIES - 1)) | offset; /* * In some special cases, a pci-e device(e.g SATA controller in IDE mode) * will use alias id to index interrupt remapping table. * We have to setup a secondary interrupt remapping entry to satisfy those * devices. */ if ( ( req_id != alias_id ) && get_ivrs_mappings(iommu->seg)[alias_id].intremap_table != NULL ) { BUG_ON(get_ivrs_mappings(iommu->seg)[req_id].intremap_table != get_ivrs_mappings(iommu->seg)[alias_id].intremap_table); } done: if ( iommu->enabled ) { spin_lock_irqsave(&iommu->lock, flags); amd_iommu_flush_intremap(iommu, req_id); if ( alias_id != req_id ) amd_iommu_flush_intremap(iommu, alias_id); spin_unlock_irqrestore(&iommu->lock, flags); } return 0; } static struct amd_iommu *_find_iommu_for_device(int seg, int bdf) { struct amd_iommu *iommu; list_for_each_entry ( iommu, &amd_iommu_head, list ) if ( iommu->seg == seg && iommu->bdf == bdf ) return NULL; iommu = find_iommu_for_device(seg, bdf); if ( iommu ) return iommu; AMD_IOMMU_DEBUG("No IOMMU for MSI dev = %04x:%02x:%02x.%u\n", seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf)); return ERR_PTR(-EINVAL); } int amd_iommu_msi_msg_update_ire( struct msi_desc *msi_desc, struct msi_msg *msg) { struct pci_dev *pdev = msi_desc->dev; int bdf, seg, rc; struct amd_iommu *iommu; unsigned int i, nr = 1; u32 data; bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf; seg = pdev ? pdev->seg : hpet_sbdf.seg; iommu = _find_iommu_for_device(seg, bdf); if ( IS_ERR_OR_NULL(iommu) ) return PTR_ERR(iommu); if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) nr = msi_desc->msi.nvec; if ( msi_desc->remap_index >= 0 && !msg ) { do { update_intremap_entry_from_msi_msg(iommu, bdf, nr, &msi_desc->remap_index, NULL, NULL); if ( !pdev || !pdev->phantom_stride ) break; bdf += pdev->phantom_stride; } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) ); for ( i = 0; i < nr; ++i ) msi_desc[i].remap_index = -1; if ( pdev ) bdf = PCI_BDF2(pdev->bus, pdev->devfn); } if ( !msg ) return 0; do { rc = update_intremap_entry_from_msi_msg(iommu, bdf, nr, &msi_desc->remap_index, msg, &data); if ( rc || !pdev || !pdev->phantom_stride ) break; bdf += pdev->phantom_stride; } while ( PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) ); if ( !rc ) for ( i = 1; i < nr; ++i ) msi_desc[i].remap_index = msi_desc->remap_index + i; msg->data = data; return rc; } void amd_iommu_read_msi_from_ire( struct msi_desc *msi_desc, struct msi_msg *msg) { unsigned int offset = msg->data & (INTREMAP_ENTRIES - 1); const struct pci_dev *pdev = msi_desc->dev; u16 bdf = pdev ? PCI_BDF2(pdev->bus, pdev->devfn) : hpet_sbdf.bdf; u16 seg = pdev ? 
pdev->seg : hpet_sbdf.seg; const u32 *entry; if ( IS_ERR_OR_NULL(_find_iommu_for_device(seg, bdf)) ) return; entry = get_intremap_entry(seg, get_dma_requestor_id(seg, bdf), offset); if ( msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) { int nr = msi_desc->msi_attrib.entry_nr; ASSERT(!(offset & (msi_desc[-nr].msi.nvec - 1))); offset |= nr; } msg->data &= ~(INTREMAP_ENTRIES - 1); msg->data |= get_field_from_reg_u32(*entry, INT_REMAP_ENTRY_INTTYPE_MASK, INT_REMAP_ENTRY_INTTYPE_SHIFT) << 8; msg->data |= get_field_from_reg_u32(*entry, INT_REMAP_ENTRY_VECTOR_MASK, INT_REMAP_ENTRY_VECTOR_SHIFT); } int __init amd_iommu_free_intremap_table( u16 seg, struct ivrs_mappings *ivrs_mapping) { void *tb = ivrs_mapping->intremap_table; if ( tb ) { __free_amd_iommu_tables(tb, INTREMAP_TABLE_ORDER); ivrs_mapping->intremap_table = NULL; } return 0; } void* __init amd_iommu_alloc_intremap_table(unsigned long **inuse_map) { void *tb; tb = __alloc_amd_iommu_tables(INTREMAP_TABLE_ORDER); BUG_ON(tb == NULL); memset(tb, 0, PAGE_SIZE * (1UL << INTREMAP_TABLE_ORDER)); *inuse_map = xzalloc_array(unsigned long, BITS_TO_LONGS(INTREMAP_ENTRIES)); BUG_ON(*inuse_map == NULL); return tb; } int __init amd_setup_hpet_msi(struct msi_desc *msi_desc) { spinlock_t *lock; unsigned long flags; int rc = 0; if ( hpet_sbdf.init == HPET_NONE ) { AMD_IOMMU_DEBUG("Failed to setup HPET MSI remapping." " Missing IVRS HPET info.\n"); return -ENODEV; } if ( msi_desc->hpet_id != hpet_sbdf.id ) { AMD_IOMMU_DEBUG("Failed to setup HPET MSI remapping." " Wrong HPET.\n"); return -ENODEV; } lock = get_intremap_lock(hpet_sbdf.seg, hpet_sbdf.bdf); spin_lock_irqsave(lock, flags); msi_desc->remap_index = alloc_intremap_entry(hpet_sbdf.seg, hpet_sbdf.bdf, 1); if ( msi_desc->remap_index >= INTREMAP_ENTRIES ) { msi_desc->remap_index = -1; rc = -ENXIO; } spin_unlock_irqrestore(lock, flags); return rc; } static void dump_intremap_table(const u32 *table) { u32 count; if ( !table ) return; for ( count = 0; count < INTREMAP_ENTRIES; count++ ) { if ( !table[count] ) continue; printk(" IRTE[%03x] %08x\n", count, table[count]); } } static int dump_intremap_mapping(u16 seg, struct ivrs_mappings *ivrs_mapping) { unsigned long flags; if ( !ivrs_mapping ) return 0; printk(" %04x:%02x:%02x:%u:\n", seg, PCI_BUS(ivrs_mapping->dte_requestor_id), PCI_SLOT(ivrs_mapping->dte_requestor_id), PCI_FUNC(ivrs_mapping->dte_requestor_id)); spin_lock_irqsave(&(ivrs_mapping->intremap_lock), flags); dump_intremap_table(ivrs_mapping->intremap_table); spin_unlock_irqrestore(&(ivrs_mapping->intremap_lock), flags); return 0; } static void dump_intremap_tables(unsigned char key) { unsigned long flags; printk("--- Dumping Per-dev IOMMU Interrupt Remapping Table ---\n"); iterate_ivrs_entries(dump_intremap_mapping); printk("--- Dumping Shared IOMMU Interrupt Remapping Table ---\n"); spin_lock_irqsave(&shared_intremap_lock, flags); dump_intremap_table(shared_intremap_table); spin_unlock_irqrestore(&shared_intremap_lock, flags); } xen-4.4.0/xen/drivers/passthrough/amd/Makefile0000664000175000017500000000030612307313555017462 0ustar smbsmbobj-bin-y += iommu_detect.init.o obj-y += iommu_init.o obj-y += iommu_map.o obj-y += pci_amd_iommu.o obj-bin-y += iommu_acpi.init.o obj-y += iommu_intr.o obj-y += iommu_cmd.o obj-y += iommu_guest.o xen-4.4.0/xen/drivers/passthrough/amd/pci_amd_iommu.c0000664000175000017500000004510612307313555020777 0ustar smbsmb/* * Copyright (C) 2007 Advanced Micro Devices, Inc. 
* Author: Leo Duran * Author: Wei Wang - adapted to xen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include "../ats.h" static bool_t __read_mostly init_done; struct amd_iommu *find_iommu_for_device(int seg, int bdf) { struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg); if ( !ivrs_mappings || bdf >= ivrs_bdf_entries ) return NULL; if ( unlikely(!ivrs_mappings[bdf].iommu) && likely(init_done) ) { unsigned int bd0 = bdf & ~PCI_FUNC(~0); if ( ivrs_mappings[bd0].iommu ) { struct ivrs_mappings tmp = ivrs_mappings[bd0]; tmp.iommu = NULL; if ( tmp.dte_requestor_id == bd0 ) tmp.dte_requestor_id = bdf; ivrs_mappings[bdf] = tmp; printk(XENLOG_WARNING "%04x:%02x:%02x.%u not found in ACPI tables;" " using same IOMMU as function 0\n", seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf)); /* write iommu field last */ ivrs_mappings[bdf].iommu = ivrs_mappings[bd0].iommu; } } return ivrs_mappings[bdf].iommu; } /* * Some devices will use alias id and original device id to index interrupt * table and I/O page table respectively. Such devices will have * both alias entry and select entry in IVRS structure. * * Return original device id, if device has valid interrupt remapping * table setup for both select entry and alias entry. 
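 * Otherwise the alias (requestor) id recorded in the IVRS table is returned.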
*/ int get_dma_requestor_id(u16 seg, u16 bdf) { struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg); int req_id; BUG_ON ( bdf >= ivrs_bdf_entries ); req_id = ivrs_mappings[bdf].dte_requestor_id; if ( (ivrs_mappings[bdf].intremap_table != NULL) && (ivrs_mappings[req_id].intremap_table != NULL) ) req_id = bdf; return req_id; } static int is_translation_valid(u32 *entry) { return (get_field_from_reg_u32(entry[0], IOMMU_DEV_TABLE_VALID_MASK, IOMMU_DEV_TABLE_VALID_SHIFT) && get_field_from_reg_u32(entry[0], IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK, IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT)); } static void disable_translation(u32 *dte) { u32 entry; entry = dte[0]; set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry, IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK, IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT, &entry); set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry, IOMMU_DEV_TABLE_VALID_MASK, IOMMU_DEV_TABLE_VALID_SHIFT, &entry); dte[0] = entry; } static void amd_iommu_setup_domain_device( struct domain *domain, struct amd_iommu *iommu, u8 devfn, struct pci_dev *pdev) { void *dte; unsigned long flags; int req_id, valid = 1; int dte_i = 0; u8 bus = pdev->bus; struct hvm_iommu *hd = domain_hvm_iommu(domain); BUG_ON( !hd->root_table || !hd->paging_mode || !iommu->dev_table.buffer ); if ( iommu_passthrough && (domain->domain_id == 0) ) valid = 0; if ( ats_enabled ) dte_i = 1; /* get device-table entry */ req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn)); dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE); spin_lock_irqsave(&iommu->lock, flags); if ( !is_translation_valid((u32 *)dte) ) { /* bind DTE to domain page-tables */ amd_iommu_set_root_page_table( (u32 *)dte, page_to_maddr(hd->root_table), hd->domain_id, hd->paging_mode, valid); if ( pci_ats_device(iommu->seg, bus, pdev->devfn) && iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) ) iommu_dte_set_iotlb((u32 *)dte, dte_i); amd_iommu_flush_device(iommu, req_id); AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, " "root table = %#"PRIx64", " "domain = %d, paging mode = %d\n", req_id, pdev->type, page_to_maddr(hd->root_table), hd->domain_id, hd->paging_mode); } spin_unlock_irqrestore(&iommu->lock, flags); ASSERT(spin_is_locked(&pcidevs_lock)); if ( pci_ats_device(iommu->seg, bus, pdev->devfn) && !pci_ats_enabled(iommu->seg, bus, pdev->devfn) ) { if ( devfn == pdev->devfn ) enable_ats_device(iommu->seg, bus, devfn); amd_iommu_flush_iotlb(devfn, pdev, INV_IOMMU_ALL_PAGES_ADDRESS, 0); } } static int __init amd_iommu_setup_dom0_device(u8 devfn, struct pci_dev *pdev) { int bdf = PCI_BDF2(pdev->bus, pdev->devfn); struct amd_iommu *iommu = find_iommu_for_device(pdev->seg, bdf); if ( unlikely(!iommu) ) { /* Filter the bridge devices */ if ( pdev->type == DEV_TYPE_PCI_HOST_BRIDGE ) { AMD_IOMMU_DEBUG("Skipping host bridge %04x:%02x:%02x.%u\n", pdev->seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf)); return 0; } AMD_IOMMU_DEBUG("No iommu for device %04x:%02x:%02x.%u\n", pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); return -ENODEV; } amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); return 0; } int __init amd_iov_detect(void) { INIT_LIST_HEAD(&amd_iommu_head); if ( !iommu_enable && !iommu_intremap ) return 0; if ( (amd_iommu_detect_acpi() !=0) || (iommu_found() == 0) ) { printk("AMD-Vi: IOMMU not found!\n"); iommu_intremap = 0; return -ENODEV; } if ( amd_iommu_init() != 0 ) { printk("AMD-Vi: Error initialization\n"); return -ENODEV; } init_done = 1; if ( !amd_iommu_perdev_intremap ) 
printk(XENLOG_WARNING "AMD-Vi: Using global interrupt remap table is not recommended (see XSA-36)!\n"); return scan_pci_devices(); } static int allocate_domain_resources(struct hvm_iommu *hd) { /* allocate root table */ spin_lock(&hd->mapping_lock); if ( !hd->root_table ) { hd->root_table = alloc_amd_iommu_pgtable(); if ( !hd->root_table ) { spin_unlock(&hd->mapping_lock); return -ENOMEM; } } spin_unlock(&hd->mapping_lock); return 0; } static int get_paging_mode(unsigned long entries) { int level = 1; BUG_ON( !entries ); while ( entries > PTE_PER_TABLE_SIZE ) { entries = PTE_PER_TABLE_ALIGN(entries) >> PTE_PER_TABLE_SHIFT; if ( ++level > 6 ) return -ENOMEM; } return level; } static int amd_iommu_domain_init(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); /* allocate page directroy */ if ( allocate_domain_resources(hd) != 0 ) { if ( hd->root_table ) free_domheap_page(hd->root_table); return -ENOMEM; } /* For pv and dom0, stick with get_paging_mode(max_page) * For HVM dom0, use 2 level page table at first */ hd->paging_mode = is_hvm_domain(d) ? IOMMU_PAGING_MODE_LEVEL_2 : get_paging_mode(max_page); hd->domain_id = d->domain_id; guest_iommu_init(d); return 0; } static void __init amd_iommu_dom0_init(struct domain *d) { unsigned long i; if ( !iommu_passthrough && !need_iommu(d) ) { /* Set up 1:1 page table for dom0 */ for ( i = 0; i < max_pdx; i++ ) { unsigned long pfn = pdx_to_pfn(i); /* * XXX Should we really map all non-RAM (above 4G)? Minimally * a pfn_valid() check would seem desirable here. */ if ( mfn_valid(pfn) ) amd_iommu_map_page(d, pfn, pfn, IOMMUF_readable|IOMMUF_writable); if ( !(i & 0xfffff) ) process_pending_softirqs(); } } setup_dom0_pci_devices(d, amd_iommu_setup_dom0_device); } void amd_iommu_disable_domain_device(struct domain *domain, struct amd_iommu *iommu, u8 devfn, struct pci_dev *pdev) { void *dte; unsigned long flags; int req_id; u8 bus = pdev->bus; BUG_ON ( iommu->dev_table.buffer == NULL ); req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn)); dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE); spin_lock_irqsave(&iommu->lock, flags); if ( is_translation_valid((u32 *)dte) ) { disable_translation((u32 *)dte); if ( pci_ats_device(iommu->seg, bus, pdev->devfn) && iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) ) iommu_dte_set_iotlb((u32 *)dte, 0); amd_iommu_flush_device(iommu, req_id); AMD_IOMMU_DEBUG("Disable: device id = %#x, " "domain = %d, paging mode = %d\n", req_id, domain_hvm_iommu(domain)->domain_id, domain_hvm_iommu(domain)->paging_mode); } spin_unlock_irqrestore(&iommu->lock, flags); ASSERT(spin_is_locked(&pcidevs_lock)); if ( devfn == pdev->devfn && pci_ats_device(iommu->seg, bus, devfn) && pci_ats_enabled(iommu->seg, bus, devfn) ) disable_ats_device(iommu->seg, bus, devfn); } static int reassign_device(struct domain *source, struct domain *target, u8 devfn, struct pci_dev *pdev) { struct amd_iommu *iommu; int bdf; struct hvm_iommu *t = domain_hvm_iommu(target); bdf = PCI_BDF2(pdev->bus, pdev->devfn); iommu = find_iommu_for_device(pdev->seg, bdf); if ( !iommu ) { AMD_IOMMU_DEBUG("Fail to find iommu." 
" %04x:%02x:%x02.%x cannot be assigned to dom%d\n", pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), target->domain_id); return -ENODEV; } amd_iommu_disable_domain_device(source, iommu, devfn, pdev); if ( devfn == pdev->devfn ) { list_move(&pdev->domain_list, &target->arch.pdev_list); pdev->domain = target; } /* IO page tables might be destroyed after pci-detach the last device * In this case, we have to re-allocate root table for next pci-attach.*/ if ( t->root_table == NULL ) allocate_domain_resources(t); amd_iommu_setup_domain_device(target, iommu, devfn, pdev); AMD_IOMMU_DEBUG("Re-assign %04x:%02x:%02x.%u from dom%d to dom%d\n", pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), source->domain_id, target->domain_id); return 0; } static int amd_iommu_assign_device(struct domain *d, u8 devfn, struct pci_dev *pdev) { struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg); int bdf = PCI_BDF2(pdev->bus, devfn); int req_id = get_dma_requestor_id(pdev->seg, bdf); if ( ivrs_mappings[req_id].unity_map_enable ) { amd_iommu_reserve_domain_unity_map( d, ivrs_mappings[req_id].addr_range_start, ivrs_mappings[req_id].addr_range_length, ivrs_mappings[req_id].write_permission, ivrs_mappings[req_id].read_permission); } return reassign_device(dom0, d, devfn, pdev); } static void deallocate_next_page_table(struct page_info *pg, int level) { PFN_ORDER(pg) = level; spin_lock(&iommu_pt_cleanup_lock); page_list_add_tail(pg, &iommu_pt_cleanup_list); spin_unlock(&iommu_pt_cleanup_lock); } static void deallocate_page_table(struct page_info *pg) { void *table_vaddr, *pde; u64 next_table_maddr; unsigned int index, level = PFN_ORDER(pg), next_level; PFN_ORDER(pg) = 0; if ( level <= 1 ) { free_amd_iommu_pgtable(pg); return; } table_vaddr = __map_domain_page(pg); for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) { pde = table_vaddr + (index * IOMMU_PAGE_TABLE_ENTRY_SIZE); next_table_maddr = amd_iommu_get_next_table_from_pte(pde); next_level = iommu_next_level((u32*)pde); if ( (next_table_maddr != 0) && (next_level != 0) && iommu_is_pte_present((u32*)pde) ) { /* We do not support skip levels yet */ ASSERT(next_level == level - 1); deallocate_next_page_table(maddr_to_page(next_table_maddr), next_level); } } unmap_domain_page(table_vaddr); free_amd_iommu_pgtable(pg); } static void deallocate_iommu_page_tables(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); if ( iommu_use_hap_pt(d) ) return; spin_lock(&hd->mapping_lock); if ( hd->root_table ) { deallocate_next_page_table(hd->root_table, hd->paging_mode); hd->root_table = NULL; } spin_unlock(&hd->mapping_lock); } static void amd_iommu_domain_destroy(struct domain *d) { guest_iommu_destroy(d); deallocate_iommu_page_tables(d); amd_iommu_flush_all_pages(d); } static int amd_iommu_add_device(u8 devfn, struct pci_dev *pdev) { struct amd_iommu *iommu; u16 bdf; if ( !pdev->domain ) return -EINVAL; bdf = PCI_BDF2(pdev->bus, pdev->devfn); iommu = find_iommu_for_device(pdev->seg, bdf); if ( !iommu ) { AMD_IOMMU_DEBUG("Fail to find iommu." 
" %04x:%02x:%02x.%u cannot be assigned to dom%d\n", pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pdev->domain->domain_id); return -ENODEV; } amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev); return 0; } static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev) { struct amd_iommu *iommu; u16 bdf; if ( !pdev->domain ) return -EINVAL; bdf = PCI_BDF2(pdev->bus, pdev->devfn); iommu = find_iommu_for_device(pdev->seg, bdf); if ( !iommu ) { AMD_IOMMU_DEBUG("Fail to find iommu." " %04x:%02x:%02x.%u cannot be removed from dom%d\n", pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), pdev->domain->domain_id); return -ENODEV; } amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev); return 0; } static int amd_iommu_group_id(u16 seg, u8 bus, u8 devfn) { int bdf = PCI_BDF2(bus, devfn); return (bdf < ivrs_bdf_entries) ? get_dma_requestor_id(seg, bdf) : bdf; } #include static void amd_dump_p2m_table_level(struct page_info* pg, int level, paddr_t gpa, int indent) { paddr_t address; void *table_vaddr, *pde; paddr_t next_table_maddr; int index, next_level, present; u32 *entry; if ( level < 1 ) return; table_vaddr = __map_domain_page(pg); if ( table_vaddr == NULL ) { printk("Failed to map IOMMU domain page %"PRIpaddr"\n", page_to_maddr(pg)); return; } for ( index = 0; index < PTE_PER_TABLE_SIZE; index++ ) { if ( !(index % 2) ) process_pending_softirqs(); pde = table_vaddr + (index * IOMMU_PAGE_TABLE_ENTRY_SIZE); next_table_maddr = amd_iommu_get_next_table_from_pte(pde); entry = (u32*)pde; present = get_field_from_reg_u32(entry[0], IOMMU_PDE_PRESENT_MASK, IOMMU_PDE_PRESENT_SHIFT); if ( !present ) continue; next_level = get_field_from_reg_u32(entry[0], IOMMU_PDE_NEXT_LEVEL_MASK, IOMMU_PDE_NEXT_LEVEL_SHIFT); if ( next_level && (next_level != (level - 1)) ) { printk("IOMMU p2m table error. next_level = %d, expected %d\n", next_level, level - 1); continue; } address = gpa + amd_offset_level_address(index, level); if ( next_level >= 1 ) amd_dump_p2m_table_level( maddr_to_page(next_table_maddr), next_level, address, indent + 1); else printk("%*sgfn: %08lx mfn: %08lx\n", indent, "", (unsigned long)PFN_DOWN(address), (unsigned long)PFN_DOWN(next_table_maddr)); } unmap_domain_page(table_vaddr); } static void amd_dump_p2m_table(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); if ( !hd->root_table ) return; printk("p2m table has %d levels\n", hd->paging_mode); amd_dump_p2m_table_level(hd->root_table, hd->paging_mode, 0, 0); } const struct iommu_ops amd_iommu_ops = { .init = amd_iommu_domain_init, .dom0_init = amd_iommu_dom0_init, .add_device = amd_iommu_add_device, .remove_device = amd_iommu_remove_device, .assign_device = amd_iommu_assign_device, .teardown = amd_iommu_domain_destroy, .map_page = amd_iommu_map_page, .unmap_page = amd_iommu_unmap_page, .free_page_table = deallocate_page_table, .reassign_device = reassign_device, .get_device_group_id = amd_iommu_group_id, .update_ire_from_apic = amd_iommu_ioapic_update_ire, .update_ire_from_msi = amd_iommu_msi_msg_update_ire, .read_apic_from_ire = amd_iommu_read_ioapic_from_ire, .read_msi_from_ire = amd_iommu_read_msi_from_ire, .setup_hpet_msi = amd_setup_hpet_msi, .suspend = amd_iommu_suspend, .resume = amd_iommu_resume, .share_p2m = amd_iommu_share_p2m, .crash_shutdown = amd_iommu_suspend, .dump_p2m_table = amd_dump_p2m_table, }; xen-4.4.0/xen/drivers/passthrough/amd/iommu_acpi.c0000664000175000017500000011356212307313555020321 0ustar smbsmb/* * Copyright (C) 2007 Advanced Micro Devices, Inc. 
* Author: Leo Duran * Author: Wei Wang - adapted to xen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include /* Some helper structures, particularly to deal with ranges. */ struct acpi_ivhd_device_range { struct acpi_ivrs_device4 start; struct acpi_ivrs_device4 end; }; struct acpi_ivhd_device_alias_range { struct acpi_ivrs_device8a alias; struct acpi_ivrs_device4 end; }; struct acpi_ivhd_device_extended_range { struct acpi_ivrs_device8b extended; struct acpi_ivrs_device4 end; }; union acpi_ivhd_device { struct acpi_ivrs_de_header header; struct acpi_ivrs_device4 select; struct acpi_ivhd_device_range range; struct acpi_ivrs_device8a alias; struct acpi_ivhd_device_alias_range alias_range; struct acpi_ivrs_device8b extended; struct acpi_ivhd_device_extended_range extended_range; struct acpi_ivrs_device8c special; }; static void __init add_ivrs_mapping_entry( u16 bdf, u16 alias_id, u8 flags, struct amd_iommu *iommu) { struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(iommu->seg); ASSERT( ivrs_mappings != NULL ); /* setup requestor id */ ivrs_mappings[bdf].dte_requestor_id = alias_id; /* override flags for range of devices */ ivrs_mappings[bdf].device_flags = flags; if (ivrs_mappings[alias_id].intremap_table == NULL ) { /* allocate per-device interrupt remapping table */ if ( amd_iommu_perdev_intremap ) ivrs_mappings[alias_id].intremap_table = amd_iommu_alloc_intremap_table( &ivrs_mappings[alias_id].intremap_inuse); else { if ( shared_intremap_table == NULL ) shared_intremap_table = amd_iommu_alloc_intremap_table( &shared_intremap_inuse); ivrs_mappings[alias_id].intremap_table = shared_intremap_table; ivrs_mappings[alias_id].intremap_inuse = shared_intremap_inuse; } } /* assign iommu hardware */ ivrs_mappings[bdf].iommu = iommu; } static struct amd_iommu * __init find_iommu_from_bdf_cap( u16 seg, u16 bdf, u16 cap_offset) { struct amd_iommu *iommu; for_each_amd_iommu ( iommu ) if ( (iommu->seg == seg) && (iommu->bdf == bdf) && (iommu->cap_offset == cap_offset) ) return iommu; return NULL; } static void __init reserve_iommu_exclusion_range( struct amd_iommu *iommu, uint64_t base, uint64_t limit) { /* need to extend exclusion range?
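 * (an exclusion range already recorded for this IOMMU is merged with the
 * new one by taking the union of both ranges)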
*/ if ( iommu->exclusion_enable ) { if ( iommu->exclusion_base < base ) base = iommu->exclusion_base; if ( iommu->exclusion_limit > limit ) limit = iommu->exclusion_limit; } iommu->exclusion_enable = IOMMU_CONTROL_ENABLED; iommu->exclusion_base = base; iommu->exclusion_limit = limit; } static void __init reserve_iommu_exclusion_range_all( struct amd_iommu *iommu, unsigned long base, unsigned long limit) { reserve_iommu_exclusion_range(iommu, base, limit); iommu->exclusion_allow_all = IOMMU_CONTROL_ENABLED; } static void __init reserve_unity_map_for_device( u16 seg, u16 bdf, unsigned long base, unsigned long length, u8 iw, u8 ir) { struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg); unsigned long old_top, new_top; /* need to extend unity-mapped range? */ if ( ivrs_mappings[bdf].unity_map_enable ) { old_top = ivrs_mappings[bdf].addr_range_start + ivrs_mappings[bdf].addr_range_length; new_top = base + length; if ( old_top > new_top ) new_top = old_top; if ( ivrs_mappings[bdf].addr_range_start < base ) base = ivrs_mappings[bdf].addr_range_start; length = new_top - base; } /* extend r/w permissions and keep aggregate */ ivrs_mappings[bdf].write_permission = iw; ivrs_mappings[bdf].read_permission = ir; ivrs_mappings[bdf].unity_map_enable = IOMMU_CONTROL_ENABLED; ivrs_mappings[bdf].addr_range_start = base; ivrs_mappings[bdf].addr_range_length = length; } static int __init register_exclusion_range_for_all_devices( unsigned long base, unsigned long limit, u8 iw, u8 ir) { int seg = 0; /* XXX */ unsigned long range_top, iommu_top, length; struct amd_iommu *iommu; unsigned int bdf; /* is part of exclusion range inside of IOMMU virtual address space? */ /* note: 'limit' parameter is assumed to be page-aligned */ range_top = limit + PAGE_SIZE; iommu_top = max_page * PAGE_SIZE; if ( base < iommu_top ) { if ( range_top > iommu_top ) range_top = iommu_top; length = range_top - base; /* reserve r/w unity-mapped page entries for devices */ /* note: these entries are part of the exclusion range */ for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); /* push 'base' just outside of virtual address space */ base = iommu_top; } /* register IOMMU exclusion range settings */ if ( limit >= iommu_top ) { for_each_amd_iommu( iommu ) reserve_iommu_exclusion_range_all(iommu, base, limit); } return 0; } static int __init register_exclusion_range_for_device( u16 bdf, unsigned long base, unsigned long limit, u8 iw, u8 ir) { int seg = 0; /* XXX */ struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg); unsigned long range_top, iommu_top, length; struct amd_iommu *iommu; u16 req; iommu = find_iommu_for_device(seg, bdf); if ( !iommu ) { AMD_IOMMU_DEBUG("IVMD Error: No IOMMU for Dev_Id %#x!\n", bdf); return -ENODEV; } req = ivrs_mappings[bdf].dte_requestor_id; /* note: 'limit' parameter is assumed to be page-aligned */ range_top = limit + PAGE_SIZE; iommu_top = max_page * PAGE_SIZE; if ( base < iommu_top ) { if ( range_top > iommu_top ) range_top = iommu_top; length = range_top - base; /* reserve unity-mapped page entries for device */ /* note: these entries are part of the exclusion range */ reserve_unity_map_for_device(seg, bdf, base, length, iw, ir); reserve_unity_map_for_device(seg, req, base, length, iw, ir); /* push 'base' just outside of virtual address space */ base = iommu_top; } /* register IOMMU exclusion range settings for device */ if ( limit >= iommu_top ) { reserve_iommu_exclusion_range(iommu, base, limit);
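/* also mark the device's IVRS mapping (and that of its requestor alias)
 * as allowed to use the exclusion range */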
ivrs_mappings[bdf].dte_allow_exclusion = IOMMU_CONTROL_ENABLED; ivrs_mappings[req].dte_allow_exclusion = IOMMU_CONTROL_ENABLED; } return 0; } static int __init register_exclusion_range_for_iommu_devices( struct amd_iommu *iommu, unsigned long base, unsigned long limit, u8 iw, u8 ir) { unsigned long range_top, iommu_top, length; unsigned int bdf; u16 req; /* is part of exclusion range inside of IOMMU virtual address space? */ /* note: 'limit' parameter is assumed to be page-aligned */ range_top = limit + PAGE_SIZE; iommu_top = max_page * PAGE_SIZE; if ( base < iommu_top ) { if ( range_top > iommu_top ) range_top = iommu_top; length = range_top - base; /* reserve r/w unity-mapped page entries for devices */ /* note: these entries are part of the exclusion range */ for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) { if ( iommu == find_iommu_for_device(iommu->seg, bdf) ) { reserve_unity_map_for_device(iommu->seg, bdf, base, length, iw, ir); req = get_ivrs_mappings(iommu->seg)[bdf].dte_requestor_id; reserve_unity_map_for_device(iommu->seg, req, base, length, iw, ir); } } /* push 'base' just outside of virtual address space */ base = iommu_top; } /* register IOMMU exclusion range settings */ if ( limit >= iommu_top ) reserve_iommu_exclusion_range_all(iommu, base, limit); return 0; } static int __init parse_ivmd_device_select( const struct acpi_ivrs_memory *ivmd_block, unsigned long base, unsigned long limit, u8 iw, u8 ir) { u16 bdf; bdf = ivmd_block->header.device_id; if ( bdf >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVMD Error: Invalid Dev_Id %#x\n", bdf); return -ENODEV; } return register_exclusion_range_for_device(bdf, base, limit, iw, ir); } static int __init parse_ivmd_device_range( const struct acpi_ivrs_memory *ivmd_block, unsigned long base, unsigned long limit, u8 iw, u8 ir) { unsigned int first_bdf, last_bdf, bdf; int error; first_bdf = ivmd_block->header.device_id; if ( first_bdf >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVMD Error: " "Invalid Range_First Dev_Id %#x\n", first_bdf); return -ENODEV; } last_bdf = ivmd_block->aux_data; if ( (last_bdf >= ivrs_bdf_entries) || (last_bdf <= first_bdf) ) { AMD_IOMMU_DEBUG("IVMD Error: " "Invalid Range_Last Dev_Id %#x\n", last_bdf); return -ENODEV; } for ( bdf = first_bdf, error = 0; (bdf <= last_bdf) && !error; bdf++ ) error = register_exclusion_range_for_device( bdf, base, limit, iw, ir); return error; } static int __init parse_ivmd_device_iommu( const struct acpi_ivrs_memory *ivmd_block, unsigned long base, unsigned long limit, u8 iw, u8 ir) { int seg = 0; /* XXX */ struct amd_iommu *iommu; /* find target IOMMU */ iommu = find_iommu_from_bdf_cap(seg, ivmd_block->header.device_id, ivmd_block->aux_data); if ( !iommu ) { AMD_IOMMU_DEBUG("IVMD Error: No IOMMU for Dev_Id %#x Cap %#x\n", ivmd_block->header.device_id, ivmd_block->aux_data); return -ENODEV; } return register_exclusion_range_for_iommu_devices( iommu, base, limit, iw, ir); } static int __init parse_ivmd_block(const struct acpi_ivrs_memory *ivmd_block) { unsigned long start_addr, mem_length, base, limit; u8 iw, ir; if ( ivmd_block->header.length < sizeof(*ivmd_block) ) { AMD_IOMMU_DEBUG("IVMD Error: Invalid Block Length!\n"); return -ENODEV; } start_addr = (unsigned long)ivmd_block->start_address; mem_length = (unsigned long)ivmd_block->memory_length; base = start_addr & PAGE_MASK; limit = (start_addr + mem_length - 1) & PAGE_MASK; AMD_IOMMU_DEBUG("IVMD Block: type %#x phys %#lx len %#lx\n", ivmd_block->header.type, start_addr, mem_length); if ( ivmd_block->header.flags & 
ACPI_IVMD_EXCLUSION_RANGE ) iw = ir = IOMMU_CONTROL_ENABLED; else if ( ivmd_block->header.flags & ACPI_IVMD_UNITY ) { iw = ivmd_block->header.flags & ACPI_IVMD_READ ? IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED; ir = ivmd_block->header.flags & ACPI_IVMD_WRITE ? IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED; } else { AMD_IOMMU_DEBUG("IVMD Error: Invalid Flag Field!\n"); return -ENODEV; } switch( ivmd_block->header.type ) { case ACPI_IVRS_TYPE_MEMORY_ALL: return register_exclusion_range_for_all_devices( base, limit, iw, ir); case ACPI_IVRS_TYPE_MEMORY_ONE: return parse_ivmd_device_select(ivmd_block, base, limit, iw, ir); case ACPI_IVRS_TYPE_MEMORY_RANGE: return parse_ivmd_device_range(ivmd_block, base, limit, iw, ir); case ACPI_IVRS_TYPE_MEMORY_IOMMU: return parse_ivmd_device_iommu(ivmd_block, base, limit, iw, ir); default: AMD_IOMMU_DEBUG("IVMD Error: Invalid Block Type!\n"); return -ENODEV; } } static u16 __init parse_ivhd_device_padding( u16 pad_length, u16 header_length, u16 block_length) { if ( header_length < (block_length + pad_length) ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); return 0; } return pad_length; } static u16 __init parse_ivhd_device_select( const struct acpi_ivrs_device4 *select, struct amd_iommu *iommu) { u16 bdf; bdf = select->header.id; if ( bdf >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id %#x\n", bdf); return 0; } add_ivrs_mapping_entry(bdf, bdf, select->header.data_setting, iommu); return sizeof(*select); } static u16 __init parse_ivhd_device_range( const struct acpi_ivhd_device_range *range, u16 header_length, u16 block_length, struct amd_iommu *iommu) { unsigned int dev_length, first_bdf, last_bdf, bdf; dev_length = sizeof(*range); if ( header_length < (block_length + dev_length) ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); return 0; } if ( range->end.header.type != ACPI_IVRS_TYPE_END ) { AMD_IOMMU_DEBUG("IVHD Error: " "Invalid Range: End_Type %#x\n", range->end.header.type); return 0; } first_bdf = range->start.header.id; if ( first_bdf >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVHD Error: " "Invalid Range: First Dev_Id %#x\n", first_bdf); return 0; } last_bdf = range->end.header.id; if ( (last_bdf >= ivrs_bdf_entries) || (last_bdf <= first_bdf) ) { AMD_IOMMU_DEBUG("IVHD Error: " "Invalid Range: Last Dev_Id %#x\n", last_bdf); return 0; } AMD_IOMMU_DEBUG(" Dev_Id Range: %#x -> %#x\n", first_bdf, last_bdf); for ( bdf = first_bdf; bdf <= last_bdf; bdf++ ) add_ivrs_mapping_entry(bdf, bdf, range->start.header.data_setting, iommu); return dev_length; } static u16 __init parse_ivhd_device_alias( const struct acpi_ivrs_device8a *alias, u16 header_length, u16 block_length, struct amd_iommu *iommu) { u16 dev_length, alias_id, bdf; dev_length = sizeof(*alias); if ( header_length < (block_length + dev_length) ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); return 0; } bdf = alias->header.id; if ( bdf >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id %#x\n", bdf); return 0; } alias_id = alias->used_id; if ( alias_id >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Alias Dev_Id %#x\n", alias_id); return 0; } AMD_IOMMU_DEBUG(" Dev_Id Alias: %#x\n", alias_id); add_ivrs_mapping_entry(bdf, alias_id, alias->header.data_setting, iommu); return dev_length; } static u16 __init parse_ivhd_device_alias_range( const struct acpi_ivhd_device_alias_range *range, u16 header_length, u16 block_length, struct amd_iommu *iommu) { unsigned int 
dev_length, first_bdf, last_bdf, alias_id, bdf; dev_length = sizeof(*range); if ( header_length < (block_length + dev_length) ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); return 0; } if ( range->end.header.type != ACPI_IVRS_TYPE_END ) { AMD_IOMMU_DEBUG("IVHD Error: " "Invalid Range: End_Type %#x\n", range->end.header.type); return 0; } first_bdf = range->alias.header.id; if ( first_bdf >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVHD Error: " "Invalid Range: First Dev_Id %#x\n", first_bdf); return 0; } last_bdf = range->end.header.id; if ( last_bdf >= ivrs_bdf_entries || last_bdf <= first_bdf ) { AMD_IOMMU_DEBUG( "IVHD Error: Invalid Range: Last Dev_Id %#x\n", last_bdf); return 0; } alias_id = range->alias.used_id; if ( alias_id >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Alias Dev_Id %#x\n", alias_id); return 0; } AMD_IOMMU_DEBUG(" Dev_Id Range: %#x -> %#x alias %#x\n", first_bdf, last_bdf, alias_id); for ( bdf = first_bdf; bdf <= last_bdf; bdf++ ) add_ivrs_mapping_entry(bdf, alias_id, range->alias.header.data_setting, iommu); return dev_length; } static u16 __init parse_ivhd_device_extended( const struct acpi_ivrs_device8b *ext, u16 header_length, u16 block_length, struct amd_iommu *iommu) { u16 dev_length, bdf; dev_length = sizeof(*ext); if ( header_length < (block_length + dev_length) ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); return 0; } bdf = ext->header.id; if ( bdf >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id %#x\n", bdf); return 0; } add_ivrs_mapping_entry(bdf, bdf, ext->header.data_setting, iommu); return dev_length; } static u16 __init parse_ivhd_device_extended_range( const struct acpi_ivhd_device_extended_range *range, u16 header_length, u16 block_length, struct amd_iommu *iommu) { unsigned int dev_length, first_bdf, last_bdf, bdf; dev_length = sizeof(*range); if ( header_length < (block_length + dev_length) ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); return 0; } if ( range->end.header.type != ACPI_IVRS_TYPE_END ) { AMD_IOMMU_DEBUG("IVHD Error: " "Invalid Range: End_Type %#x\n", range->end.header.type); return 0; } first_bdf = range->extended.header.id; if ( first_bdf >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVHD Error: " "Invalid Range: First Dev_Id %#x\n", first_bdf); return 0; } last_bdf = range->end.header.id; if ( (last_bdf >= ivrs_bdf_entries) || (last_bdf <= first_bdf) ) { AMD_IOMMU_DEBUG("IVHD Error: " "Invalid Range: Last Dev_Id %#x\n", last_bdf); return 0; } AMD_IOMMU_DEBUG(" Dev_Id Range: %#x -> %#x\n", first_bdf, last_bdf); for ( bdf = first_bdf; bdf <= last_bdf; bdf++ ) add_ivrs_mapping_entry(bdf, bdf, range->extended.header.data_setting, iommu); return dev_length; } static DECLARE_BITMAP(ioapic_cmdline, ARRAY_SIZE(ioapic_sbdf)) __initdata; static void __init parse_ivrs_ioapic(char *str) { const char *s = str; unsigned long id; unsigned int seg, bus, dev, func; ASSERT(*s == '['); id = simple_strtoul(s + 1, &s, 0); if ( id >= ARRAY_SIZE(ioapic_sbdf) || *s != ']' || *++s != '=' ) return; s = parse_pci(s + 1, &seg, &bus, &dev, &func); if ( !s || *s ) return; ioapic_sbdf[id].bdf = PCI_BDF(bus, dev, func); ioapic_sbdf[id].seg = seg; __set_bit(id, ioapic_cmdline); } custom_param("ivrs_ioapic[", parse_ivrs_ioapic); static void __init parse_ivrs_hpet(char *str) { const char *s = str; unsigned long id; unsigned int seg, bus, dev, func; ASSERT(*s == '['); id = simple_strtoul(s + 1, &s, 0); if ( id != (typeof(hpet_sbdf.id))id || *s != ']' || *++s != '=' ) 
return; s = parse_pci(s + 1, &seg, &bus, &dev, &func); if ( !s || *s ) return; hpet_sbdf.id = id; hpet_sbdf.bdf = PCI_BDF(bus, dev, func); hpet_sbdf.seg = seg; hpet_sbdf.init = HPET_CMDL; } custom_param("ivrs_hpet[", parse_ivrs_hpet); static u16 __init parse_ivhd_device_special( const struct acpi_ivrs_device8c *special, u16 seg, u16 header_length, u16 block_length, struct amd_iommu *iommu) { u16 dev_length, bdf; int apic; dev_length = sizeof(*special); if ( header_length < (block_length + dev_length) ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Length!\n"); return 0; } bdf = special->used_id; if ( bdf >= ivrs_bdf_entries ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Device_Entry Dev_Id %#x\n", bdf); return 0; } AMD_IOMMU_DEBUG("IVHD Special: %04x:%02x:%02x.%u variety %#x handle %#x\n", seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf), special->variety, special->handle); add_ivrs_mapping_entry(bdf, bdf, special->header.data_setting, iommu); switch ( special->variety ) { case ACPI_IVHD_IOAPIC: if ( !iommu_intremap ) break; /* * Some BIOSes have IOAPIC broken entries so we check for IVRS * consistency here --- whether entry's IOAPIC ID is valid and * whether there are conflicting/duplicated entries. */ apic = find_first_bit(ioapic_cmdline, ARRAY_SIZE(ioapic_sbdf)); while ( apic < ARRAY_SIZE(ioapic_sbdf) ) { if ( ioapic_sbdf[apic].bdf == bdf && ioapic_sbdf[apic].seg == seg ) break; apic = find_next_bit(ioapic_cmdline, ARRAY_SIZE(ioapic_sbdf), apic + 1); } if ( apic < ARRAY_SIZE(ioapic_sbdf) ) { AMD_IOMMU_DEBUG("IVHD: Command line override present for IO-APIC %#x" "(IVRS: %#x devID %04x:%02x:%02x.%u)\n", apic, special->handle, seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf)); break; } for ( apic = 0; apic < nr_ioapics; apic++ ) { if ( IO_APIC_ID(apic) != special->handle ) continue; if ( special->handle >= ARRAY_SIZE(ioapic_sbdf) ) { printk(XENLOG_ERR "IVHD Error: IO-APIC %#x entry beyond bounds\n", special->handle); return 0; } if ( test_bit(special->handle, ioapic_cmdline) ) AMD_IOMMU_DEBUG("IVHD: Command line override present for IO-APIC %#x\n", special->handle); else if ( ioapic_sbdf[special->handle].pin_2_idx ) { if ( ioapic_sbdf[special->handle].bdf == bdf && ioapic_sbdf[special->handle].seg == seg ) AMD_IOMMU_DEBUG("IVHD Warning: Duplicate IO-APIC %#x entries\n", special->handle); else { printk(XENLOG_ERR "IVHD Error: Conflicting IO-APIC %#x entries\n", special->handle); if ( amd_iommu_perdev_intremap ) return 0; } } else { /* set device id of ioapic */ ioapic_sbdf[special->handle].bdf = bdf; ioapic_sbdf[special->handle].seg = seg; ioapic_sbdf[special->handle].pin_2_idx = xmalloc_array( u16, nr_ioapic_entries[apic]); if ( nr_ioapic_entries[apic] && !ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx ) { printk(XENLOG_ERR "IVHD Error: Out of memory\n"); return 0; } memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1, nr_ioapic_entries[apic] * sizeof(*ioapic_sbdf->pin_2_idx)); } break; } if ( apic == nr_ioapics ) { printk(XENLOG_ERR "IVHD Error: Invalid IO-APIC %#x\n", special->handle); return 0; } break; case ACPI_IVHD_HPET: switch (hpet_sbdf.init) { case HPET_IVHD: printk(XENLOG_WARNING "Only one IVHD HPET entry is supported.\n"); break; case HPET_CMDL: AMD_IOMMU_DEBUG("IVHD: Command line override present for HPET %#x " "(IVRS: %#x devID %04x:%02x:%02x.%u)\n", hpet_sbdf.id, special->handle, seg, PCI_BUS(bdf), PCI_SLOT(bdf), PCI_FUNC(bdf)); break; case HPET_NONE: /* set device id of hpet */ hpet_sbdf.id = special->handle; hpet_sbdf.bdf = bdf; hpet_sbdf.seg = seg; hpet_sbdf.init = HPET_IVHD; break; 
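/* hpet_sbdf.init is only ever set to HPET_NONE, HPET_CMDL or HPET_IVHD,
 * so the default case should be unreachable. */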
default: ASSERT(0); break; } break; default: printk(XENLOG_ERR "Unrecognized IVHD special variety %#x\n", special->variety); return 0; } return dev_length; } static int __init parse_ivhd_block(const struct acpi_ivrs_hardware *ivhd_block) { const union acpi_ivhd_device *ivhd_device; u16 block_length, dev_length; struct amd_iommu *iommu; if ( ivhd_block->header.length < sizeof(*ivhd_block) ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Block Length!\n"); return -ENODEV; } iommu = find_iommu_from_bdf_cap(ivhd_block->pci_segment_group, ivhd_block->header.device_id, ivhd_block->capability_offset); if ( !iommu ) { AMD_IOMMU_DEBUG("IVHD Error: No IOMMU for Dev_Id %#x Cap %#x\n", ivhd_block->header.device_id, ivhd_block->capability_offset); return -ENODEV; } /* parse Device Entries */ block_length = sizeof(*ivhd_block); while ( ivhd_block->header.length >= (block_length + sizeof(struct acpi_ivrs_de_header)) ) { ivhd_device = (const void *)((const u8 *)ivhd_block + block_length); AMD_IOMMU_DEBUG("IVHD Device Entry: type %#x id %#x flags %#x\n", ivhd_device->header.type, ivhd_device->header.id, ivhd_device->header.data_setting); switch ( ivhd_device->header.type ) { case ACPI_IVRS_TYPE_PAD4: dev_length = parse_ivhd_device_padding( sizeof(u32), ivhd_block->header.length, block_length); break; case ACPI_IVRS_TYPE_PAD8: dev_length = parse_ivhd_device_padding( sizeof(u64), ivhd_block->header.length, block_length); break; case ACPI_IVRS_TYPE_SELECT: dev_length = parse_ivhd_device_select(&ivhd_device->select, iommu); break; case ACPI_IVRS_TYPE_START: dev_length = parse_ivhd_device_range( &ivhd_device->range, ivhd_block->header.length, block_length, iommu); break; case ACPI_IVRS_TYPE_ALIAS_SELECT: dev_length = parse_ivhd_device_alias( &ivhd_device->alias, ivhd_block->header.length, block_length, iommu); break; case ACPI_IVRS_TYPE_ALIAS_START: dev_length = parse_ivhd_device_alias_range( &ivhd_device->alias_range, ivhd_block->header.length, block_length, iommu); break; case ACPI_IVRS_TYPE_EXT_SELECT: dev_length = parse_ivhd_device_extended( &ivhd_device->extended, ivhd_block->header.length, block_length, iommu); break; case ACPI_IVRS_TYPE_EXT_START: dev_length = parse_ivhd_device_extended_range( &ivhd_device->extended_range, ivhd_block->header.length, block_length, iommu); break; case ACPI_IVRS_TYPE_SPECIAL: dev_length = parse_ivhd_device_special( &ivhd_device->special, ivhd_block->pci_segment_group, ivhd_block->header.length, block_length, iommu); break; default: AMD_IOMMU_DEBUG("IVHD Error: Invalid Device Type!\n"); dev_length = 0; break; } block_length += dev_length; if ( !dev_length ) return -ENODEV; } return 0; } static int __init parse_ivrs_block(const struct acpi_ivrs_header *ivrs_block) { const struct acpi_ivrs_hardware *ivhd_block; const struct acpi_ivrs_memory *ivmd_block; switch ( ivrs_block->type ) { case ACPI_IVRS_TYPE_HARDWARE: ivhd_block = container_of(ivrs_block, const struct acpi_ivrs_hardware, header); return parse_ivhd_block(ivhd_block); case ACPI_IVRS_TYPE_MEMORY_ALL: case ACPI_IVRS_TYPE_MEMORY_ONE: case ACPI_IVRS_TYPE_MEMORY_RANGE: case ACPI_IVRS_TYPE_MEMORY_IOMMU: ivmd_block = container_of(ivrs_block, const struct acpi_ivrs_memory, header); return parse_ivmd_block(ivmd_block); default: AMD_IOMMU_DEBUG("IVRS Error: Invalid Block Type!\n"); return -ENODEV; } return 0; } static void __init dump_acpi_table_header(struct acpi_table_header *table) { int i; AMD_IOMMU_DEBUG("ACPI Table:\n"); AMD_IOMMU_DEBUG(" Signature "); for ( i = 0; i < ACPI_NAME_SIZE; i++ ) printk("%c", table->signature[i]); 
printk("\n"); AMD_IOMMU_DEBUG(" Length %#x\n", table->length); AMD_IOMMU_DEBUG(" Revision %#x\n", table->revision); AMD_IOMMU_DEBUG(" CheckSum %#x\n", table->checksum); AMD_IOMMU_DEBUG(" OEM_Id "); for ( i = 0; i < ACPI_OEM_ID_SIZE; i++ ) printk("%c", table->oem_id[i]); printk("\n"); AMD_IOMMU_DEBUG(" OEM_Table_Id "); for ( i = 0; i < ACPI_OEM_TABLE_ID_SIZE; i++ ) printk("%c", table->oem_table_id[i]); printk("\n"); AMD_IOMMU_DEBUG(" OEM_Revision %#x\n", table->oem_revision); AMD_IOMMU_DEBUG(" Creator_Id "); for ( i = 0; i < ACPI_NAME_SIZE; i++ ) printk("%c", table->asl_compiler_id[i]); printk("\n"); AMD_IOMMU_DEBUG(" Creator_Revision %#x\n", table->asl_compiler_revision); } static int __init parse_ivrs_table(struct acpi_table_header *table) { const struct acpi_ivrs_header *ivrs_block; unsigned long length; unsigned int apic; bool_t sb_ioapic = !iommu_intremap; int error = 0; BUG_ON(!table); if ( iommu_debug ) dump_acpi_table_header(table); /* parse IVRS blocks */ length = sizeof(struct acpi_table_ivrs); while ( (error == 0) && (table->length > (length + sizeof(*ivrs_block))) ) { ivrs_block = (struct acpi_ivrs_header *)((u8 *)table + length); AMD_IOMMU_DEBUG("IVRS Block: type %#x flags %#x len %#x id %#x\n", ivrs_block->type, ivrs_block->flags, ivrs_block->length, ivrs_block->device_id); if ( table->length < (length + ivrs_block->length) ) { AMD_IOMMU_DEBUG("IVRS Error: " "Table Length Exceeded: %#x -> %#lx\n", table->length, (length + ivrs_block->length)); return -ENODEV; } error = parse_ivrs_block(ivrs_block); length += ivrs_block->length; } /* Each IO-APIC must have been mentioned in the table. */ for ( apic = 0; !error && iommu_intremap && apic < nr_ioapics; ++apic ) { if ( !nr_ioapic_entries[apic] ) continue; if ( !ioapic_sbdf[IO_APIC_ID(apic)].seg && /* SB IO-APIC is always on this device in AMD systems. */ ioapic_sbdf[IO_APIC_ID(apic)].bdf == PCI_BDF(0, 0x14, 0) ) sb_ioapic = 1; if ( ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx ) continue; if ( !test_bit(IO_APIC_ID(apic), ioapic_cmdline) ) { printk(XENLOG_ERR "IVHD Error: no information for IO-APIC %#x\n", IO_APIC_ID(apic)); if ( amd_iommu_perdev_intremap ) return -ENXIO; } ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx = xmalloc_array( u16, nr_ioapic_entries[apic]); if ( ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx ) memset(ioapic_sbdf[IO_APIC_ID(apic)].pin_2_idx, -1, nr_ioapic_entries[apic] * sizeof(*ioapic_sbdf->pin_2_idx)); else { printk(XENLOG_ERR "IVHD Error: Out of memory\n"); error = -ENOMEM; } } if ( !error && !sb_ioapic ) { if ( amd_iommu_perdev_intremap ) error = -ENXIO; printk("%sNo southbridge IO-APIC found in IVRS table\n", amd_iommu_perdev_intremap ? 
XENLOG_ERR : XENLOG_WARNING); } return error; } static int __init detect_iommu_acpi(struct acpi_table_header *table) { const struct acpi_ivrs_header *ivrs_block; unsigned long i; unsigned long length = sizeof(struct acpi_table_ivrs); u8 checksum, *raw_table; /* validate checksum: sum of entire table == 0 */ checksum = 0; raw_table = (u8 *)table; for ( i = 0; i < table->length; i++ ) checksum += raw_table[i]; if ( checksum ) { AMD_IOMMU_DEBUG("IVRS Error: Invalid Checksum %#x\n", checksum); return -ENODEV; } while ( table->length > (length + sizeof(*ivrs_block)) ) { ivrs_block = (struct acpi_ivrs_header *)((u8 *)table + length); if ( table->length < (length + ivrs_block->length) ) return -ENODEV; if ( ivrs_block->type == ACPI_IVRS_TYPE_HARDWARE && amd_iommu_detect_one_acpi( container_of(ivrs_block, const struct acpi_ivrs_hardware, header)) != 0 ) return -ENODEV; length += ivrs_block->length; } return 0; } #define UPDATE_LAST_BDF(x) do {\ if ((x) > last_bdf) \ last_bdf = (x); \ } while(0); static int __init get_last_bdf_ivhd( const struct acpi_ivrs_hardware *ivhd_block) { const union acpi_ivhd_device *ivhd_device; u16 block_length, dev_length; int last_bdf = 0; if ( ivhd_block->header.length < sizeof(*ivhd_block) ) { AMD_IOMMU_DEBUG("IVHD Error: Invalid Block Length!\n"); return -ENODEV; } block_length = sizeof(*ivhd_block); while ( ivhd_block->header.length >= (block_length + sizeof(struct acpi_ivrs_de_header)) ) { ivhd_device = (const void *)((u8 *)ivhd_block + block_length); switch ( ivhd_device->header.type ) { case ACPI_IVRS_TYPE_PAD4: dev_length = sizeof(u32); break; case ACPI_IVRS_TYPE_PAD8: dev_length = sizeof(u64); break; case ACPI_IVRS_TYPE_SELECT: UPDATE_LAST_BDF(ivhd_device->select.header.id); dev_length = sizeof(ivhd_device->header); break; case ACPI_IVRS_TYPE_ALIAS_SELECT: UPDATE_LAST_BDF(ivhd_device->alias.header.id); dev_length = sizeof(ivhd_device->alias); break; case ACPI_IVRS_TYPE_EXT_SELECT: UPDATE_LAST_BDF(ivhd_device->extended.header.id); dev_length = sizeof(ivhd_device->extended); break; case ACPI_IVRS_TYPE_START: UPDATE_LAST_BDF(ivhd_device->range.end.header.id); dev_length = sizeof(ivhd_device->range); break; case ACPI_IVRS_TYPE_ALIAS_START: UPDATE_LAST_BDF(ivhd_device->alias_range.end.header.id) dev_length = sizeof(ivhd_device->alias_range); break; case ACPI_IVRS_TYPE_EXT_START: UPDATE_LAST_BDF(ivhd_device->extended_range.end.header.id) dev_length = sizeof(ivhd_device->extended_range); break; case ACPI_IVRS_TYPE_SPECIAL: UPDATE_LAST_BDF(ivhd_device->special.used_id) dev_length = sizeof(ivhd_device->special); break; default: AMD_IOMMU_DEBUG("IVHD Error: Invalid Device Type!\n"); dev_length = 0; break; } block_length += dev_length; if ( !dev_length ) return -ENODEV; } return last_bdf; } static int __init get_last_bdf_acpi(struct acpi_table_header *table) { const struct acpi_ivrs_header *ivrs_block; unsigned long length = sizeof(struct acpi_table_ivrs); int last_bdf = 0; while ( table->length > (length + sizeof(*ivrs_block)) ) { ivrs_block = (struct acpi_ivrs_header *)((u8 *)table + length); if ( table->length < (length + ivrs_block->length) ) return -ENODEV; if ( ivrs_block->type == ACPI_IVRS_TYPE_HARDWARE ) { int ret = get_last_bdf_ivhd( container_of(ivrs_block, const struct acpi_ivrs_hardware, header)); if ( ret < 0 ) return ret; UPDATE_LAST_BDF(ret); } length += ivrs_block->length; } return last_bdf; } int __init amd_iommu_detect_acpi(void) { return acpi_table_parse(ACPI_SIG_IVRS, detect_iommu_acpi); } int __init amd_iommu_get_ivrs_dev_entries(void) { int ret = 
acpi_table_parse(ACPI_SIG_IVRS, get_last_bdf_acpi); return ret < 0 ? ret : (ret | PCI_FUNC(~0)) + 1; } int __init amd_iommu_update_ivrs_mapping_acpi(void) { if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) ) return -EPERM; return acpi_table_parse(ACPI_SIG_IVRS, parse_ivrs_table); } xen-4.4.0/xen/drivers/passthrough/amd/iommu_map.c0000664000175000017500000006362212307313555020163 0ustar smbsmb/* * Copyright (C) 2007 Advanced Micro Devices, Inc. * Author: Leo Duran * Author: Wei Wang - adapted to xen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include "../ats.h" #include /* Given pfn and page table level, return pde index */ static unsigned int pfn_to_pde_idx(unsigned long pfn, unsigned int level) { unsigned int idx; idx = pfn >> (PTE_PER_TABLE_SHIFT * (--level)); idx &= ~PTE_PER_TABLE_MASK; return idx; } void clear_iommu_pte_present(unsigned long l1_mfn, unsigned long gfn) { u64 *table, *pte; table = map_domain_page(l1_mfn); pte = table + pfn_to_pde_idx(gfn, IOMMU_PAGING_MODE_LEVEL_1); *pte = 0; unmap_domain_page(table); } static bool_t set_iommu_pde_present(u32 *pde, unsigned long next_mfn, unsigned int next_level, bool_t iw, bool_t ir) { u64 addr_lo, addr_hi, maddr_old, maddr_next; u32 entry; bool_t need_flush = 0; maddr_next = (u64)next_mfn << PAGE_SHIFT; addr_hi = get_field_from_reg_u32(pde[1], IOMMU_PTE_ADDR_HIGH_MASK, IOMMU_PTE_ADDR_HIGH_SHIFT); addr_lo = get_field_from_reg_u32(pde[0], IOMMU_PTE_ADDR_LOW_MASK, IOMMU_PTE_ADDR_LOW_SHIFT); maddr_old = (addr_hi << 32) | (addr_lo << PAGE_SHIFT); if ( maddr_old != maddr_next ) need_flush = 1; addr_lo = maddr_next & DMA_32BIT_MASK; addr_hi = maddr_next >> 32; /* enable read/write permissions,which will be enforced at the PTE */ set_field_in_reg_u32((u32)addr_hi, 0, IOMMU_PDE_ADDR_HIGH_MASK, IOMMU_PDE_ADDR_HIGH_SHIFT, &entry); set_field_in_reg_u32(iw, entry, IOMMU_PDE_IO_WRITE_PERMISSION_MASK, IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT, &entry); set_field_in_reg_u32(ir, entry, IOMMU_PDE_IO_READ_PERMISSION_MASK, IOMMU_PDE_IO_READ_PERMISSION_SHIFT, &entry); /* FC bit should be enabled in PTE, this helps to solve potential * issues with ATS devices */ if ( next_level == IOMMU_PAGING_MODE_LEVEL_0 ) set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, IOMMU_PTE_FC_MASK, IOMMU_PTE_FC_SHIFT, &entry); pde[1] = entry; /* mark next level as 'present' */ set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, IOMMU_PDE_ADDR_LOW_MASK, IOMMU_PDE_ADDR_LOW_SHIFT, &entry); set_field_in_reg_u32(next_level, entry, IOMMU_PDE_NEXT_LEVEL_MASK, IOMMU_PDE_NEXT_LEVEL_SHIFT, &entry); set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, IOMMU_PDE_PRESENT_MASK, IOMMU_PDE_PRESENT_SHIFT, &entry); pde[0] = entry; return need_flush; } static bool_t set_iommu_pte_present(unsigned long pt_mfn, unsigned long gfn, unsigned long next_mfn, int pde_level, bool_t iw, bool_t ir) 
{ u64 *table; u32 *pde; bool_t need_flush = 0; table = map_domain_page(pt_mfn); pde = (u32*)(table + pfn_to_pde_idx(gfn, pde_level)); need_flush = set_iommu_pde_present(pde, next_mfn, IOMMU_PAGING_MODE_LEVEL_0, iw, ir); unmap_domain_page(table); return need_flush; } void amd_iommu_set_root_page_table( u32 *dte, u64 root_ptr, u16 domain_id, u8 paging_mode, u8 valid) { u64 addr_hi, addr_lo; u32 entry; set_field_in_reg_u32(domain_id, 0, IOMMU_DEV_TABLE_DOMAIN_ID_MASK, IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT, &entry); dte[2] = entry; addr_lo = root_ptr & DMA_32BIT_MASK; addr_hi = root_ptr >> 32; set_field_in_reg_u32((u32)addr_hi, 0, IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK, IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT, &entry); set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_MASK, IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_SHIFT, &entry); set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK, IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT, &entry); dte[1] = entry; set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK, IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT, &entry); set_field_in_reg_u32(paging_mode, entry, IOMMU_DEV_TABLE_PAGING_MODE_MASK, IOMMU_DEV_TABLE_PAGING_MODE_SHIFT, &entry); set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK, IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT, &entry); set_field_in_reg_u32(valid ? IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED, entry, IOMMU_DEV_TABLE_VALID_MASK, IOMMU_DEV_TABLE_VALID_SHIFT, &entry); dte[0] = entry; } void iommu_dte_set_iotlb(u32 *dte, u8 i) { u32 entry; entry = dte[3]; set_field_in_reg_u32(!!i, entry, IOMMU_DEV_TABLE_IOTLB_SUPPORT_MASK, IOMMU_DEV_TABLE_IOTLB_SUPPORT_SHIFT, &entry); dte[3] = entry; } void __init amd_iommu_set_intremap_table( u32 *dte, u64 intremap_ptr, u8 int_valid) { u64 addr_hi, addr_lo; u32 entry; addr_lo = intremap_ptr & DMA_32BIT_MASK; addr_hi = intremap_ptr >> 32; entry = dte[5]; set_field_in_reg_u32((u32)addr_hi, entry, IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_MASK, IOMMU_DEV_TABLE_INT_TABLE_PTR_HIGH_SHIFT, &entry); /* Fixed and arbitrated interrupts remapepd */ set_field_in_reg_u32(2, entry, IOMMU_DEV_TABLE_INT_CONTROL_MASK, IOMMU_DEV_TABLE_INT_CONTROL_SHIFT, &entry); dte[5] = entry; set_field_in_reg_u32((u32)addr_lo >> 6, 0, IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_MASK, IOMMU_DEV_TABLE_INT_TABLE_PTR_LOW_SHIFT, &entry); /* 2048 entries */ set_field_in_reg_u32(0xB, entry, IOMMU_DEV_TABLE_INT_TABLE_LENGTH_MASK, IOMMU_DEV_TABLE_INT_TABLE_LENGTH_SHIFT, &entry); /* unmapped interrupt results io page faults*/ set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry, IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_MASK, IOMMU_DEV_TABLE_INT_TABLE_IGN_UNMAPPED_SHIFT, &entry); set_field_in_reg_u32(int_valid ? 
IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED, entry, IOMMU_DEV_TABLE_INT_VALID_MASK, IOMMU_DEV_TABLE_INT_VALID_SHIFT, &entry); dte[4] = entry; } void __init iommu_dte_add_device_entry(u32 *dte, struct ivrs_mappings *ivrs_dev) { u32 entry; u8 sys_mgt, dev_ex, flags; u8 mask = ~(0x7 << 3); dte[7] = dte[6] = dte[4] = dte[2] = dte[1] = dte[0] = 0; flags = ivrs_dev->device_flags; sys_mgt = get_field_from_byte(flags, ACPI_IVHD_SYSTEM_MGMT); dev_ex = ivrs_dev->dte_allow_exclusion; flags &= mask; set_field_in_reg_u32(flags, 0, IOMMU_DEV_TABLE_IVHD_FLAGS_MASK, IOMMU_DEV_TABLE_IVHD_FLAGS_SHIFT, &entry); dte[5] = entry; set_field_in_reg_u32(sys_mgt, 0, IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_MASK, IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_SHIFT, &entry); set_field_in_reg_u32(dev_ex, entry, IOMMU_DEV_TABLE_ALLOW_EXCLUSION_MASK, IOMMU_DEV_TABLE_ALLOW_EXCLUSION_SHIFT, &entry); dte[3] = entry; } void iommu_dte_set_guest_cr3(u32 *dte, u16 dom_id, u64 gcr3, int gv, unsigned int glx) { u32 entry, gcr3_1, gcr3_2, gcr3_3; gcr3_3 = gcr3 >> 31; gcr3_2 = (gcr3 >> 15) & 0xFFFF; gcr3_1 = (gcr3 >> PAGE_SHIFT) & 0x7; /* I bit must be set when gcr3 is enabled */ entry = dte[3]; set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, IOMMU_DEV_TABLE_IOTLB_SUPPORT_MASK, IOMMU_DEV_TABLE_IOTLB_SUPPORT_SHIFT, &entry); /* update gcr3 */ set_field_in_reg_u32(gcr3_3, entry, IOMMU_DEV_TABLE_GCR3_3_MASK, IOMMU_DEV_TABLE_GCR3_3_SHIFT, &entry); dte[3] = entry; set_field_in_reg_u32(dom_id, entry, IOMMU_DEV_TABLE_DOMAIN_ID_MASK, IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT, &entry); /* update gcr3 */ entry = dte[2]; set_field_in_reg_u32(gcr3_2, entry, IOMMU_DEV_TABLE_GCR3_2_MASK, IOMMU_DEV_TABLE_GCR3_2_SHIFT, &entry); dte[2] = entry; entry = dte[1]; /* Enable GV bit */ set_field_in_reg_u32(!!gv, entry, IOMMU_DEV_TABLE_GV_MASK, IOMMU_DEV_TABLE_GV_SHIFT, &entry); /* 1 level guest cr3 table */ set_field_in_reg_u32(glx, entry, IOMMU_DEV_TABLE_GLX_MASK, IOMMU_DEV_TABLE_GLX_SHIFT, &entry); /* update gcr3 */ set_field_in_reg_u32(gcr3_1, entry, IOMMU_DEV_TABLE_GCR3_1_MASK, IOMMU_DEV_TABLE_GCR3_1_SHIFT, &entry); dte[1] = entry; } u64 amd_iommu_get_next_table_from_pte(u32 *entry) { u64 addr_lo, addr_hi, ptr; addr_lo = get_field_from_reg_u32( entry[0], IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK, IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT); addr_hi = get_field_from_reg_u32( entry[1], IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK, IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT); ptr = (addr_hi << 32) | (addr_lo << PAGE_SHIFT); return ptr; } /* For each pde, We use ignored bits (bit 1 - bit 8 and bit 63) * to save pde count, pde count = 511 is a candidate of page coalescing. */ static unsigned int get_pde_count(u64 pde) { unsigned int count; u64 upper_mask = 1ULL << 63 ; u64 lower_mask = 0xFF << 1; count = ((pde & upper_mask) >> 55) | ((pde & lower_mask) >> 1); return count; } /* Convert pde count into iommu pte ignored bits */ static void set_pde_count(u64 *pde, unsigned int count) { u64 upper_mask = 1ULL << 8 ; u64 lower_mask = 0xFF; u64 pte_mask = (~(1ULL << 63)) & (~(0xFF << 1)); *pde &= pte_mask; *pde |= ((count & upper_mask ) << 55) | ((count & lower_mask ) << 1); } /* Return 1, if pages are suitable for merging at merge_level. 
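 * (The count lives in the PDE's ignored bits -- bits 1 to 8 and bit 63,
 * see get_pde_count()/set_pde_count() above.  Once it reaches
 * PTE_PER_TABLE_SIZE - 1, and the alignment/contiguity checks in
 * iommu_update_pde_count() pass, the whole next-level table maps one
 * physically contiguous run and can be collapsed into a superpage by
 * iommu_merge_pages().)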
* otherwise increase pde count if mfn is contigous with mfn - 1 */ static int iommu_update_pde_count(struct domain *d, unsigned long pt_mfn, unsigned long gfn, unsigned long mfn, unsigned int merge_level) { unsigned int pde_count, next_level; unsigned long first_mfn; u64 *table, *pde, *ntable; u64 ntable_maddr, mask; struct hvm_iommu *hd = domain_hvm_iommu(d); bool_t ok = 0; ASSERT( spin_is_locked(&hd->mapping_lock) && pt_mfn ); next_level = merge_level - 1; /* get pde at merge level */ table = map_domain_page(pt_mfn); pde = table + pfn_to_pde_idx(gfn, merge_level); /* get page table of next level */ ntable_maddr = amd_iommu_get_next_table_from_pte((u32*)pde); ntable = map_domain_page(ntable_maddr >> PAGE_SHIFT); /* get the first mfn of next level */ first_mfn = amd_iommu_get_next_table_from_pte((u32*)ntable) >> PAGE_SHIFT; if ( first_mfn == 0 ) goto out; mask = (1ULL<< (PTE_PER_TABLE_SHIFT * next_level)) - 1; if ( ((first_mfn & mask) == 0) && (((gfn & mask) | first_mfn) == mfn) ) { pde_count = get_pde_count(*pde); if ( pde_count == (PTE_PER_TABLE_SIZE - 1) ) ok = 1; else if ( pde_count < (PTE_PER_TABLE_SIZE - 1)) { pde_count++; set_pde_count(pde, pde_count); } } else /* non-contiguous mapping */ set_pde_count(pde, 0); out: unmap_domain_page(ntable); unmap_domain_page(table); return ok; } static int iommu_merge_pages(struct domain *d, unsigned long pt_mfn, unsigned long gfn, unsigned int flags, unsigned int merge_level) { u64 *table, *pde, *ntable; u64 ntable_mfn; unsigned long first_mfn; struct hvm_iommu *hd = domain_hvm_iommu(d); ASSERT( spin_is_locked(&hd->mapping_lock) && pt_mfn ); table = map_domain_page(pt_mfn); pde = table + pfn_to_pde_idx(gfn, merge_level); /* get first mfn */ ntable_mfn = amd_iommu_get_next_table_from_pte((u32*)pde) >> PAGE_SHIFT; if ( ntable_mfn == 0 ) { unmap_domain_page(table); return 1; } ntable = map_domain_page(ntable_mfn); first_mfn = amd_iommu_get_next_table_from_pte((u32*)ntable) >> PAGE_SHIFT; if ( first_mfn == 0 ) { unmap_domain_page(ntable); unmap_domain_page(table); return 1; } /* setup super page mapping, next level = 0 */ set_iommu_pde_present((u32*)pde, first_mfn, IOMMU_PAGING_MODE_LEVEL_0, !!(flags & IOMMUF_writable), !!(flags & IOMMUF_readable)); amd_iommu_flush_all_pages(d); unmap_domain_page(ntable); unmap_domain_page(table); return 0; } /* Walk io page tables and build level page tables if necessary * {Re, un}mapping super page frames causes re-allocation of io * page tables. 
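 *
 * On success (return value 0) pt_mfn[] holds, per level, the MFN of the
 * page table traversed for this pfn; pt_mfn[1] is the level 1 table into
 * which the final 4K PTE is written by the callers.  A non-zero return
 * means a lower level table could not be allocated (or an inconsistent
 * entry was found).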
*/ static int iommu_pde_from_gfn(struct domain *d, unsigned long pfn, unsigned long pt_mfn[]) { u64 *pde, *next_table_vaddr; unsigned long next_table_mfn; unsigned int level; struct page_info *table; struct hvm_iommu *hd = domain_hvm_iommu(d); table = hd->root_table; level = hd->paging_mode; BUG_ON( table == NULL || level < IOMMU_PAGING_MODE_LEVEL_1 || level > IOMMU_PAGING_MODE_LEVEL_6 ); next_table_mfn = page_to_mfn(table); if ( level == IOMMU_PAGING_MODE_LEVEL_1 ) { pt_mfn[level] = next_table_mfn; return 0; } while ( level > IOMMU_PAGING_MODE_LEVEL_1 ) { unsigned int next_level = level - 1; pt_mfn[level] = next_table_mfn; next_table_vaddr = map_domain_page(next_table_mfn); pde = next_table_vaddr + pfn_to_pde_idx(pfn, level); /* Here might be a super page frame */ next_table_mfn = amd_iommu_get_next_table_from_pte((uint32_t*)pde) >> PAGE_SHIFT; /* Split super page frame into smaller pieces.*/ if ( iommu_is_pte_present((u32*)pde) && (iommu_next_level((u32*)pde) == 0) && next_table_mfn != 0 ) { int i; unsigned long mfn, gfn; unsigned int page_sz; page_sz = 1 << (PTE_PER_TABLE_SHIFT * (next_level - 1)); gfn = pfn & ~((1 << (PTE_PER_TABLE_SHIFT * next_level)) - 1); mfn = next_table_mfn; /* allocate lower level page table */ table = alloc_amd_iommu_pgtable(); if ( table == NULL ) { AMD_IOMMU_DEBUG("Cannot allocate I/O page table\n"); unmap_domain_page(next_table_vaddr); return 1; } next_table_mfn = page_to_mfn(table); set_iommu_pde_present((u32*)pde, next_table_mfn, next_level, !!IOMMUF_writable, !!IOMMUF_readable); for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ ) { set_iommu_pte_present(next_table_mfn, gfn, mfn, next_level, !!IOMMUF_writable, !!IOMMUF_readable); mfn += page_sz; gfn += page_sz; } amd_iommu_flush_all_pages(d); } /* Install lower level page table for non-present entries */ else if ( !iommu_is_pte_present((u32*)pde) ) { if ( next_table_mfn == 0 ) { table = alloc_amd_iommu_pgtable(); if ( table == NULL ) { AMD_IOMMU_DEBUG("Cannot allocate I/O page table\n"); unmap_domain_page(next_table_vaddr); return 1; } next_table_mfn = page_to_mfn(table); set_iommu_pde_present((u32*)pde, next_table_mfn, next_level, !!IOMMUF_writable, !!IOMMUF_readable); } else /* should never reach here */ { unmap_domain_page(next_table_vaddr); return 1; } } unmap_domain_page(next_table_vaddr); level--; } /* mfn of level 1 page table */ pt_mfn[level] = next_table_mfn; return 0; } static int update_paging_mode(struct domain *d, unsigned long gfn) { u16 bdf; void *device_entry; unsigned int req_id, level, offset; unsigned long flags; struct pci_dev *pdev; struct amd_iommu *iommu = NULL; struct page_info *new_root = NULL; struct page_info *old_root = NULL; void *new_root_vaddr; unsigned long old_root_mfn; struct hvm_iommu *hd = domain_hvm_iommu(d); level = hd->paging_mode; old_root = hd->root_table; offset = gfn >> (PTE_PER_TABLE_SHIFT * (level - 1)); ASSERT(spin_is_locked(&hd->mapping_lock) && is_hvm_domain(d)); while ( offset >= PTE_PER_TABLE_SIZE ) { /* Allocate and install a new root table. 
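 * Each additional level widens the GFN range covered by the table tree
 * by PTE_PER_TABLE_SHIFT bits, so new root tables are stacked until the
 * top-level offset for the requested gfn fits within a single table.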
* Only upper I/O page table grows, no need to fix next level bits */ new_root = alloc_amd_iommu_pgtable(); if ( new_root == NULL ) { AMD_IOMMU_DEBUG("%s Cannot allocate I/O page table\n", __func__); return -ENOMEM; } new_root_vaddr = __map_domain_page(new_root); old_root_mfn = page_to_mfn(old_root); set_iommu_pde_present(new_root_vaddr, old_root_mfn, level, !!IOMMUF_writable, !!IOMMUF_readable); level++; old_root = new_root; offset >>= PTE_PER_TABLE_SHIFT; unmap_domain_page(new_root_vaddr); } if ( new_root != NULL ) { hd->paging_mode = level; hd->root_table = new_root; if ( !spin_is_locked(&pcidevs_lock) ) AMD_IOMMU_DEBUG("%s Try to access pdev_list " "without aquiring pcidevs_lock.\n", __func__); /* Update device table entries using new root table and paging mode */ for_each_pdev( d, pdev ) { bdf = PCI_BDF2(pdev->bus, pdev->devfn); iommu = find_iommu_for_device(pdev->seg, bdf); if ( !iommu ) { AMD_IOMMU_DEBUG("%s Fail to find iommu.\n", __func__); return -ENODEV; } spin_lock_irqsave(&iommu->lock, flags); do { req_id = get_dma_requestor_id(pdev->seg, bdf); device_entry = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE); /* valid = 0 only works for dom0 passthrough mode */ amd_iommu_set_root_page_table((u32 *)device_entry, page_to_maddr(hd->root_table), hd->domain_id, hd->paging_mode, 1); amd_iommu_flush_device(iommu, req_id); bdf += pdev->phantom_stride; } while ( PCI_DEVFN2(bdf) != pdev->devfn && PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) ); spin_unlock_irqrestore(&iommu->lock, flags); } /* For safety, invalidate all entries */ amd_iommu_flush_all_pages(d); } return 0; } int amd_iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn, unsigned int flags) { bool_t need_flush = 0; struct hvm_iommu *hd = domain_hvm_iommu(d); unsigned long pt_mfn[7]; unsigned int merge_level; BUG_ON( !hd->root_table ); if ( iommu_use_hap_pt(d) ) return 0; memset(pt_mfn, 0, sizeof(pt_mfn)); spin_lock(&hd->mapping_lock); /* Since HVM domain is initialized with 2 level IO page table, * we might need a deeper page table for lager gfn now */ if ( is_hvm_domain(d) ) { if ( update_paging_mode(d, gfn) ) { spin_unlock(&hd->mapping_lock); AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn); domain_crash(d); return -EFAULT; } } if ( iommu_pde_from_gfn(d, gfn, pt_mfn) || (pt_mfn[1] == 0) ) { spin_unlock(&hd->mapping_lock); AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn); domain_crash(d); return -EFAULT; } /* Install 4k mapping first */ need_flush = set_iommu_pte_present(pt_mfn[1], gfn, mfn, IOMMU_PAGING_MODE_LEVEL_1, !!(flags & IOMMUF_writable), !!(flags & IOMMUF_readable)); /* Do not increase pde count if io mapping has not been changed */ if ( !need_flush ) goto out; /* 4K mapping for PV guests never changes, * no need to flush if we trust non-present bits */ if ( is_hvm_domain(d) ) amd_iommu_flush_pages(d, gfn, 0); for ( merge_level = IOMMU_PAGING_MODE_LEVEL_2; merge_level <= hd->paging_mode; merge_level++ ) { if ( pt_mfn[merge_level] == 0 ) break; if ( !iommu_update_pde_count(d, pt_mfn[merge_level], gfn, mfn, merge_level) ) break; /* Deallocate lower level page table */ free_amd_iommu_pgtable(mfn_to_page(pt_mfn[merge_level - 1])); if ( iommu_merge_pages(d, pt_mfn[merge_level], gfn, flags, merge_level) ) { spin_unlock(&hd->mapping_lock); AMD_IOMMU_DEBUG("Merge iommu page failed at level %d, " "gfn = %lx mfn = %lx\n", merge_level, gfn, mfn); domain_crash(d); return -EFAULT; } } out: spin_unlock(&hd->mapping_lock); return 0; } int amd_iommu_unmap_page(struct domain *d, 
unsigned long gfn) { unsigned long pt_mfn[7]; struct hvm_iommu *hd = domain_hvm_iommu(d); BUG_ON( !hd->root_table ); if ( iommu_use_hap_pt(d) ) return 0; memset(pt_mfn, 0, sizeof(pt_mfn)); spin_lock(&hd->mapping_lock); /* Since HVM domain is initialized with 2 level IO page table, * we might need a deeper page table for lager gfn now */ if ( is_hvm_domain(d) ) { if ( update_paging_mode(d, gfn) ) { spin_unlock(&hd->mapping_lock); AMD_IOMMU_DEBUG("Update page mode failed gfn = %lx\n", gfn); domain_crash(d); return -EFAULT; } } if ( iommu_pde_from_gfn(d, gfn, pt_mfn) || (pt_mfn[1] == 0) ) { spin_unlock(&hd->mapping_lock); AMD_IOMMU_DEBUG("Invalid IO pagetable entry gfn = %lx\n", gfn); domain_crash(d); return -EFAULT; } /* mark PTE as 'page not present' */ clear_iommu_pte_present(pt_mfn[1], gfn); spin_unlock(&hd->mapping_lock); amd_iommu_flush_pages(d, gfn, 0); return 0; } int amd_iommu_reserve_domain_unity_map(struct domain *domain, u64 phys_addr, unsigned long size, int iw, int ir) { unsigned long npages, i; unsigned long gfn; unsigned int flags = !!ir; int rt = 0; if ( iw ) flags |= IOMMUF_writable; npages = region_to_pages(phys_addr, size); gfn = phys_addr >> PAGE_SHIFT; for ( i = 0; i < npages; i++ ) { rt = amd_iommu_map_page(domain, gfn +i, gfn +i, flags); if ( rt != 0 ) return rt; } return 0; } /* Share p2m table with iommu. */ void amd_iommu_share_p2m(struct domain *d) { struct hvm_iommu *hd = domain_hvm_iommu(d); struct page_info *p2m_table; mfn_t pgd_mfn; ASSERT( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled ); if ( !iommu_use_hap_pt(d) ) return; pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d))); p2m_table = mfn_to_page(mfn_x(pgd_mfn)); if ( hd->root_table != p2m_table ) { free_amd_iommu_pgtable(hd->root_table); hd->root_table = p2m_table; /* When sharing p2m with iommu, paging mode = 4 */ hd->paging_mode = IOMMU_PAGING_MODE_LEVEL_4; AMD_IOMMU_DEBUG("Share p2m table with iommu: p2m table = %#lx\n", mfn_x(pgd_mfn)); } } xen-4.4.0/xen/drivers/passthrough/amd/iommu_guest.c0000664000175000017500000006775312307313555020546 0ustar smbsmb/* * Copyright (C) 2011 Advanced Micro Devices, Inc. * Author: Wei Wang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #define IOMMU_MMIO_SIZE 0x8000 #define IOMMU_MMIO_PAGE_NR 0x8 #define RING_BF_LENGTH_MASK 0x0F000000 #define RING_BF_LENGTH_SHIFT 24 #define PASMAX_9_bit 0x8 #define GUEST_CR3_1_LEVEL 0x0 #define GUEST_ADDRESS_SIZE_6_LEVEL 0x2 #define HOST_ADDRESS_SIZE_6_LEVEL 0x2 #define guest_iommu_set_status(iommu, bit) \ iommu_set_bit(&((iommu)->reg_status.lo), bit) #define guest_iommu_clear_status(iommu, bit) \ iommu_clear_bit(&((iommu)->reg_status.lo), bit) #define reg_to_u64(reg) (((uint64_t)reg.hi << 32) | reg.lo ) #define u64_to_reg(reg, val) \ do \ { \ (reg)->lo = (u32)(val); \ (reg)->hi = (val) >> 32; \ } while (0) static unsigned int machine_bdf(struct domain *d, uint16_t guest_bdf) { return guest_bdf; } static uint16_t guest_bdf(struct domain *d, uint16_t machine_bdf) { return machine_bdf; } static inline struct guest_iommu *domain_iommu(struct domain *d) { return domain_hvm_iommu(d)->g_iommu; } static inline struct guest_iommu *vcpu_iommu(struct vcpu *v) { return domain_hvm_iommu(v->domain)->g_iommu; } static void guest_iommu_enable(struct guest_iommu *iommu) { iommu->enabled = 1; } static void guest_iommu_disable(struct guest_iommu *iommu) { iommu->enabled = 0; } static uint64_t get_guest_cr3_from_dte(dev_entry_t *dte) { uint64_t gcr3_1, gcr3_2, gcr3_3; gcr3_1 = get_field_from_reg_u32(dte->data[1], IOMMU_DEV_TABLE_GCR3_1_MASK, IOMMU_DEV_TABLE_GCR3_1_SHIFT); gcr3_2 = get_field_from_reg_u32(dte->data[2], IOMMU_DEV_TABLE_GCR3_2_MASK, IOMMU_DEV_TABLE_GCR3_2_SHIFT); gcr3_3 = get_field_from_reg_u32(dte->data[3], IOMMU_DEV_TABLE_GCR3_3_MASK, IOMMU_DEV_TABLE_GCR3_3_SHIFT); return ((gcr3_3 << 31) | (gcr3_2 << 15 ) | (gcr3_1 << 12)) >> PAGE_SHIFT; } static uint16_t get_domid_from_dte(dev_entry_t *dte) { return get_field_from_reg_u32(dte->data[2], IOMMU_DEV_TABLE_DOMAIN_ID_MASK, IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT); } static uint16_t get_glx_from_dte(dev_entry_t *dte) { return get_field_from_reg_u32(dte->data[1], IOMMU_DEV_TABLE_GLX_MASK, IOMMU_DEV_TABLE_GLX_SHIFT); } static uint16_t get_gv_from_dte(dev_entry_t *dte) { return get_field_from_reg_u32(dte->data[1],IOMMU_DEV_TABLE_GV_MASK, IOMMU_DEV_TABLE_GV_SHIFT); } static unsigned int host_domid(struct domain *d, uint64_t g_domid) { /* Only support one PPR device in guest for now */ return d->domain_id; } static unsigned long get_gfn_from_base_reg(uint64_t base_raw) { base_raw &= PADDR_MASK; ASSERT ( base_raw != 0 ); return base_raw >> PAGE_SHIFT; } static void guest_iommu_deliver_msi(struct domain *d) { uint8_t vector, dest, dest_mode, delivery_mode, trig_mode; struct guest_iommu *iommu = domain_iommu(d); vector = iommu->msi.vector; dest = iommu->msi.dest; dest_mode = iommu->msi.dest_mode; delivery_mode = iommu->msi.delivery_mode; trig_mode = iommu->msi.trig_mode; vmsi_deliver(d, vector, dest, dest_mode, delivery_mode, trig_mode); } static unsigned long guest_iommu_get_table_mfn(struct domain *d, uint64_t base_raw, unsigned int entry_size, unsigned int pos) { unsigned long idx, gfn, mfn; p2m_type_t p2mt; gfn = get_gfn_from_base_reg(base_raw); idx = (pos * entry_size) >> PAGE_SHIFT; mfn = mfn_x(get_gfn(d, gfn + idx, &p2mt)); put_gfn(d, gfn); return mfn; } static void guest_iommu_enable_dev_table(struct guest_iommu *iommu) { uint32_t length_raw = get_field_from_reg_u32(iommu->dev_table.reg_base.lo, 
IOMMU_DEV_TABLE_SIZE_MASK, IOMMU_DEV_TABLE_SIZE_SHIFT); iommu->dev_table.size = (length_raw + 1) * PAGE_SIZE; } static void guest_iommu_enable_ring_buffer(struct guest_iommu *iommu, struct guest_buffer *buffer, uint32_t entry_size) { uint32_t length_raw = get_field_from_reg_u32(buffer->reg_base.hi, RING_BF_LENGTH_MASK, RING_BF_LENGTH_SHIFT); buffer->entries = 1 << length_raw; } void guest_iommu_add_ppr_log(struct domain *d, u32 entry[]) { uint16_t gdev_id; unsigned long mfn, tail, head; ppr_entry_t *log, *log_base; struct guest_iommu *iommu; if ( !is_hvm_domain(d) ) return; iommu = domain_iommu(d); if ( !iommu ) return; tail = iommu_get_rb_pointer(iommu->ppr_log.reg_tail.lo); head = iommu_get_rb_pointer(iommu->ppr_log.reg_head.lo); if ( tail >= iommu->ppr_log.entries || head >= iommu->ppr_log.entries ) { AMD_IOMMU_DEBUG("Error: guest iommu ppr log overflows\n"); guest_iommu_disable(iommu); return; } mfn = guest_iommu_get_table_mfn(d, reg_to_u64(iommu->ppr_log.reg_base), sizeof(ppr_entry_t), tail); ASSERT(mfn_valid(mfn)); log_base = map_domain_page(mfn); log = log_base + tail % (PAGE_SIZE / sizeof(ppr_entry_t)); /* Convert physical device id back into virtual device id */ gdev_id = guest_bdf(d, iommu_get_devid_from_cmd(entry[0])); iommu_set_devid_to_cmd(&entry[0], gdev_id); memcpy(log, entry, sizeof(ppr_entry_t)); /* Now shift ppr log tail pointer */ if ( ++tail >= iommu->ppr_log.entries ) { tail = 0; guest_iommu_set_status(iommu, IOMMU_STATUS_PPR_LOG_OVERFLOW_SHIFT); } iommu_set_rb_pointer(&iommu->ppr_log.reg_tail.lo, tail); unmap_domain_page(log_base); guest_iommu_deliver_msi(d); } void guest_iommu_add_event_log(struct domain *d, u32 entry[]) { uint16_t dev_id; unsigned long mfn, tail, head; event_entry_t *log, *log_base; struct guest_iommu *iommu; if ( !is_hvm_domain(d) ) return; iommu = domain_iommu(d); if ( !iommu ) return; tail = iommu_get_rb_pointer(iommu->event_log.reg_tail.lo); head = iommu_get_rb_pointer(iommu->event_log.reg_head.lo); if ( tail >= iommu->event_log.entries || head >= iommu->event_log.entries ) { AMD_IOMMU_DEBUG("Error: guest iommu event overflows\n"); guest_iommu_disable(iommu); return; } mfn = guest_iommu_get_table_mfn(d, reg_to_u64(iommu->event_log.reg_base), sizeof(event_entry_t), tail); ASSERT(mfn_valid(mfn)); log_base = map_domain_page(mfn); log = log_base + tail % (PAGE_SIZE / sizeof(event_entry_t)); /* re-write physical device id into virtual device id */ dev_id = guest_bdf(d, iommu_get_devid_from_cmd(entry[0])); iommu_set_devid_to_cmd(&entry[0], dev_id); memcpy(log, entry, sizeof(event_entry_t)); /* Now shift event log tail pointer */ if ( ++tail >= iommu->event_log.entries ) { tail = 0; guest_iommu_set_status(iommu, IOMMU_STATUS_EVENT_OVERFLOW_SHIFT); } iommu_set_rb_pointer(&iommu->event_log.reg_tail.lo, tail); unmap_domain_page(log_base); guest_iommu_deliver_msi(d); } static int do_complete_ppr_request(struct domain *d, cmd_entry_t *cmd) { uint16_t dev_id; struct amd_iommu *iommu; dev_id = machine_bdf(d, iommu_get_devid_from_cmd(cmd->data[0])); iommu = find_iommu_for_device(0, dev_id); if ( !iommu ) { AMD_IOMMU_DEBUG("%s: Fail to find iommu for bdf %x\n", __func__, dev_id); return -ENODEV; } /* replace virtual device id into physical */ iommu_set_devid_to_cmd(&cmd->data[0], dev_id); amd_iommu_send_guest_cmd(iommu, cmd->data); return 0; } static int do_invalidate_pages(struct domain *d, cmd_entry_t *cmd) { uint16_t gdom_id, hdom_id; struct amd_iommu *iommu = NULL; gdom_id = get_field_from_reg_u32(cmd->data[1], IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK, 
IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT); hdom_id = host_domid(d, gdom_id); set_field_in_reg_u32(hdom_id, cmd->data[1], IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK, IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT, &cmd->data[1]); for_each_amd_iommu ( iommu ) amd_iommu_send_guest_cmd(iommu, cmd->data); return 0; } static int do_invalidate_all(struct domain *d, cmd_entry_t *cmd) { struct amd_iommu *iommu = NULL; for_each_amd_iommu ( iommu ) amd_iommu_flush_all_pages(d); return 0; } static int do_invalidate_iotlb_pages(struct domain *d, cmd_entry_t *cmd) { struct amd_iommu *iommu; uint16_t dev_id; dev_id = machine_bdf(d, iommu_get_devid_from_cmd(cmd->data[0])); iommu = find_iommu_for_device(0, dev_id); if ( !iommu ) { AMD_IOMMU_DEBUG("%s: Fail to find iommu for bdf %x\n", __func__, dev_id); return -ENODEV; } iommu_set_devid_to_cmd(&cmd->data[0], dev_id); amd_iommu_send_guest_cmd(iommu, cmd->data); return 0; } static int do_completion_wait(struct domain *d, cmd_entry_t *cmd) { bool_t com_wait_int_en, com_wait_int, i, s; struct guest_iommu *iommu; unsigned long gfn; p2m_type_t p2mt; iommu = domain_iommu(d); i = iommu_get_bit(cmd->data[0], IOMMU_COMP_WAIT_I_FLAG_SHIFT); s = iommu_get_bit(cmd->data[0], IOMMU_COMP_WAIT_S_FLAG_SHIFT); if ( i ) guest_iommu_set_status(iommu, IOMMU_STATUS_COMP_WAIT_INT_SHIFT); if ( s ) { uint64_t gaddr_lo, gaddr_hi, gaddr_64, data; void *vaddr; data = (uint64_t)cmd->data[3] << 32 | cmd->data[2]; gaddr_lo = get_field_from_reg_u32(cmd->data[0], IOMMU_COMP_WAIT_ADDR_LOW_MASK, IOMMU_COMP_WAIT_ADDR_LOW_SHIFT); gaddr_hi = get_field_from_reg_u32(cmd->data[1], IOMMU_COMP_WAIT_ADDR_HIGH_MASK, IOMMU_COMP_WAIT_ADDR_HIGH_SHIFT); gaddr_64 = (gaddr_hi << 32) | (gaddr_lo << 3); gfn = gaddr_64 >> PAGE_SHIFT; vaddr = map_domain_page(mfn_x(get_gfn(d, gfn ,&p2mt))); put_gfn(d, gfn); write_u64_atomic((uint64_t *)(vaddr + (gaddr_64 & (PAGE_SIZE-1))), data); unmap_domain_page(vaddr); } com_wait_int_en = iommu_get_bit(iommu->reg_ctrl.lo, IOMMU_CONTROL_COMP_WAIT_INT_SHIFT); com_wait_int = iommu_get_bit(iommu->reg_status.lo, IOMMU_STATUS_COMP_WAIT_INT_SHIFT); if ( com_wait_int_en && com_wait_int ) guest_iommu_deliver_msi(d); return 0; } static int do_invalidate_dte(struct domain *d, cmd_entry_t *cmd) { uint16_t gbdf, mbdf, req_id, gdom_id, hdom_id; dev_entry_t *gdte, *mdte, *dte_base; struct amd_iommu *iommu = NULL; struct guest_iommu *g_iommu; uint64_t gcr3_gfn, gcr3_mfn; uint8_t glx, gv; unsigned long dte_mfn, flags; p2m_type_t p2mt; g_iommu = domain_iommu(d); gbdf = iommu_get_devid_from_cmd(cmd->data[0]); mbdf = machine_bdf(d, gbdf); /* Guest can only update DTEs for its passthru devices */ if ( mbdf == 0 || gbdf == 0 ) return 0; /* Sometimes guest invalidates devices from non-exists dtes */ if ( (gbdf * sizeof(dev_entry_t)) > g_iommu->dev_table.size ) return 0; dte_mfn = guest_iommu_get_table_mfn(d, reg_to_u64(g_iommu->dev_table.reg_base), sizeof(dev_entry_t), gbdf); ASSERT(mfn_valid(dte_mfn)); /* Read guest dte information */ dte_base = map_domain_page(dte_mfn); gdte = dte_base + gbdf % (PAGE_SIZE / sizeof(dev_entry_t)); gdom_id = get_domid_from_dte(gdte); gcr3_gfn = get_guest_cr3_from_dte(gdte); glx = get_glx_from_dte(gdte); gv = get_gv_from_dte(gdte); unmap_domain_page(dte_base); /* Do not update host dte before gcr3 has been set */ if ( gcr3_gfn == 0 ) return 0; gcr3_mfn = mfn_x(get_gfn(d, gcr3_gfn, &p2mt)); put_gfn(d, gcr3_gfn); ASSERT(mfn_valid(gcr3_mfn)); iommu = find_iommu_for_device(0, mbdf); if ( !iommu ) { AMD_IOMMU_DEBUG("%s: Fail to find iommu for bdf %x!\n", __func__, mbdf); return -ENODEV; } 
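    /*
     * At this point gcr3_mfn is the machine frame backing the guest's own
     * CR3 table and iommu is the physical IOMMU owning the device.  Below,
     * the guest-visible domain id is mapped to a host domain id and the
     * GV/GLX/GCR3 fields of the real DTE are updated so the hardware can
     * perform the guest-level translation, after which the DTE is flushed.
     */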
/* Setup host device entry */ hdom_id = host_domid(d, gdom_id); req_id = get_dma_requestor_id(iommu->seg, mbdf); mdte = iommu->dev_table.buffer + (req_id * sizeof(dev_entry_t)); spin_lock_irqsave(&iommu->lock, flags); iommu_dte_set_guest_cr3((u32 *)mdte, hdom_id, gcr3_mfn << PAGE_SHIFT, gv, glx); amd_iommu_flush_device(iommu, req_id); spin_unlock_irqrestore(&iommu->lock, flags); return 0; } static void guest_iommu_process_command(unsigned long _d) { unsigned long opcode, tail, head, entries_per_page, cmd_mfn; cmd_entry_t *cmd, *cmd_base; struct domain *d = (struct domain *)_d; struct guest_iommu *iommu; iommu = domain_iommu(d); if ( !iommu->enabled ) return; head = iommu_get_rb_pointer(iommu->cmd_buffer.reg_head.lo); tail = iommu_get_rb_pointer(iommu->cmd_buffer.reg_tail.lo); /* Tail pointer is rolled over by guest driver, value outside * cmd_buffer_entries cause iommu disabled */ if ( tail >= iommu->cmd_buffer.entries || head >= iommu->cmd_buffer.entries ) { AMD_IOMMU_DEBUG("Error: guest iommu cmd buffer overflows\n"); guest_iommu_disable(iommu); return; } entries_per_page = PAGE_SIZE / sizeof(cmd_entry_t); while ( head != tail ) { int ret = 0; cmd_mfn = guest_iommu_get_table_mfn(d, reg_to_u64(iommu->cmd_buffer.reg_base), sizeof(cmd_entry_t), head); ASSERT(mfn_valid(cmd_mfn)); cmd_base = map_domain_page(cmd_mfn); cmd = cmd_base + head % entries_per_page; opcode = get_field_from_reg_u32(cmd->data[1], IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT); switch ( opcode ) { case IOMMU_CMD_COMPLETION_WAIT: ret = do_completion_wait(d, cmd); break; case IOMMU_CMD_INVALIDATE_DEVTAB_ENTRY: ret = do_invalidate_dte(d, cmd); break; case IOMMU_CMD_INVALIDATE_IOMMU_PAGES: ret = do_invalidate_pages(d, cmd); break; case IOMMU_CMD_INVALIDATE_IOTLB_PAGES: ret = do_invalidate_iotlb_pages(d, cmd); break; case IOMMU_CMD_INVALIDATE_INT_TABLE: break; case IOMMU_CMD_COMPLETE_PPR_REQUEST: ret = do_complete_ppr_request(d, cmd); break; case IOMMU_CMD_INVALIDATE_IOMMU_ALL: ret = do_invalidate_all(d, cmd); break; default: AMD_IOMMU_DEBUG("CMD: Unknown command cmd_type = %lx " "head = %ld\n", opcode, head); break; } unmap_domain_page(cmd_base); if ( ++head >= iommu->cmd_buffer.entries ) head = 0; if ( ret ) guest_iommu_disable(iommu); } /* Now shift cmd buffer head pointer */ iommu_set_rb_pointer(&iommu->cmd_buffer.reg_head.lo, head); return; } static int guest_iommu_write_ctrl(struct guest_iommu *iommu, uint64_t newctrl) { bool_t cmd_en, event_en, iommu_en, ppr_en, ppr_log_en; bool_t cmd_en_old, event_en_old, iommu_en_old; bool_t cmd_run; iommu_en = iommu_get_bit(newctrl, IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT); iommu_en_old = iommu_get_bit(iommu->reg_ctrl.lo, IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT); cmd_en = iommu_get_bit(newctrl, IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT); cmd_en_old = iommu_get_bit(iommu->reg_ctrl.lo, IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT); cmd_run = iommu_get_bit(iommu->reg_status.lo, IOMMU_STATUS_CMD_BUFFER_RUN_SHIFT); event_en = iommu_get_bit(newctrl, IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT); event_en_old = iommu_get_bit(iommu->reg_ctrl.lo, IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT); ppr_en = iommu_get_bit(newctrl, IOMMU_CONTROL_PPR_ENABLE_SHIFT); ppr_log_en = iommu_get_bit(newctrl, IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT); if ( iommu_en ) { guest_iommu_enable(iommu); guest_iommu_enable_dev_table(iommu); } if ( iommu_en && cmd_en ) { guest_iommu_enable_ring_buffer(iommu, &iommu->cmd_buffer, sizeof(cmd_entry_t)); /* Enable iommu command processing */ tasklet_schedule(&iommu->cmd_buffer_tasklet); } if ( 
iommu_en && event_en ) { guest_iommu_enable_ring_buffer(iommu, &iommu->event_log, sizeof(event_entry_t)); guest_iommu_set_status(iommu, IOMMU_STATUS_EVENT_LOG_RUN_SHIFT); guest_iommu_clear_status(iommu, IOMMU_STATUS_EVENT_OVERFLOW_SHIFT); } if ( iommu_en && ppr_en && ppr_log_en ) { guest_iommu_enable_ring_buffer(iommu, &iommu->ppr_log, sizeof(ppr_entry_t)); guest_iommu_set_status(iommu, IOMMU_STATUS_PPR_LOG_RUN_SHIFT); guest_iommu_clear_status(iommu, IOMMU_STATUS_PPR_LOG_OVERFLOW_SHIFT); } if ( iommu_en && cmd_en_old && !cmd_en ) { /* Disable iommu command processing */ tasklet_kill(&iommu->cmd_buffer_tasklet); } if ( event_en_old && !event_en ) guest_iommu_clear_status(iommu, IOMMU_STATUS_EVENT_LOG_RUN_SHIFT); if ( iommu_en_old && !iommu_en ) guest_iommu_disable(iommu); u64_to_reg(&iommu->reg_ctrl, newctrl); return 0; } static uint64_t iommu_mmio_read64(struct guest_iommu *iommu, unsigned long offset) { uint64_t val; switch ( offset ) { case IOMMU_DEV_TABLE_BASE_LOW_OFFSET: val = reg_to_u64(iommu->dev_table.reg_base); break; case IOMMU_CMD_BUFFER_BASE_LOW_OFFSET: val = reg_to_u64(iommu->cmd_buffer.reg_base); break; case IOMMU_EVENT_LOG_BASE_LOW_OFFSET: val = reg_to_u64(iommu->event_log.reg_base); break; case IOMMU_PPR_LOG_BASE_LOW_OFFSET: val = reg_to_u64(iommu->ppr_log.reg_base); break; case IOMMU_CMD_BUFFER_HEAD_OFFSET: val = reg_to_u64(iommu->cmd_buffer.reg_head); break; case IOMMU_CMD_BUFFER_TAIL_OFFSET: val = reg_to_u64(iommu->cmd_buffer.reg_tail); break; case IOMMU_EVENT_LOG_HEAD_OFFSET: val = reg_to_u64(iommu->event_log.reg_head); break; case IOMMU_EVENT_LOG_TAIL_OFFSET: val = reg_to_u64(iommu->event_log.reg_tail); break; case IOMMU_PPR_LOG_HEAD_OFFSET: val = reg_to_u64(iommu->ppr_log.reg_head); break; case IOMMU_PPR_LOG_TAIL_OFFSET: val = reg_to_u64(iommu->ppr_log.reg_tail); break; case IOMMU_CONTROL_MMIO_OFFSET: val = reg_to_u64(iommu->reg_ctrl); break; case IOMMU_STATUS_MMIO_OFFSET: val = reg_to_u64(iommu->reg_status); break; case IOMMU_EXT_FEATURE_MMIO_OFFSET: val = reg_to_u64(iommu->reg_ext_feature); break; default: AMD_IOMMU_DEBUG("Guest reads unknown mmio offset = %lx\n", offset); val = 0; break; } return val; } static int guest_iommu_mmio_read(struct vcpu *v, unsigned long addr, unsigned long len, unsigned long *pval) { struct guest_iommu *iommu = vcpu_iommu(v); unsigned long offset; uint64_t val; uint32_t mmio, shift; uint64_t mask = 0; offset = addr - iommu->mmio_base; if ( unlikely((offset & (len - 1 )) || (len > 8)) ) { AMD_IOMMU_DEBUG("iommu mmio read access is not aligned:" " offset = %lx, len = %lx\n", offset, len); return X86EMUL_UNHANDLEABLE; } mask = (len == 8) ? 
~0ULL : (1ULL << (len * 8)) - 1; shift = (offset & 7u) * 8; /* mmio access is always aligned on 8-byte boundary */ mmio = offset & (~7u); spin_lock(&iommu->lock); val = iommu_mmio_read64(iommu, mmio); spin_unlock(&iommu->lock); *pval = (val >> shift ) & mask; return X86EMUL_OKAY; } static void guest_iommu_mmio_write64(struct guest_iommu *iommu, unsigned long offset, uint64_t val) { switch ( offset ) { case IOMMU_DEV_TABLE_BASE_LOW_OFFSET: u64_to_reg(&iommu->dev_table.reg_base, val); break; case IOMMU_CMD_BUFFER_BASE_LOW_OFFSET: u64_to_reg(&iommu->cmd_buffer.reg_base, val); break; case IOMMU_EVENT_LOG_BASE_LOW_OFFSET: u64_to_reg(&iommu->event_log.reg_base, val); break; case IOMMU_PPR_LOG_BASE_LOW_OFFSET: u64_to_reg(&iommu->ppr_log.reg_base, val); break; case IOMMU_CONTROL_MMIO_OFFSET: guest_iommu_write_ctrl(iommu, val); break; case IOMMU_CMD_BUFFER_HEAD_OFFSET: u64_to_reg(&iommu->cmd_buffer.reg_head, val); break; case IOMMU_CMD_BUFFER_TAIL_OFFSET: u64_to_reg(&iommu->cmd_buffer.reg_tail, val); tasklet_schedule(&iommu->cmd_buffer_tasklet); break; case IOMMU_EVENT_LOG_HEAD_OFFSET: u64_to_reg(&iommu->event_log.reg_head, val); break; case IOMMU_EVENT_LOG_TAIL_OFFSET: u64_to_reg(&iommu->event_log.reg_tail, val); break; case IOMMU_PPR_LOG_HEAD_OFFSET: u64_to_reg(&iommu->ppr_log.reg_head, val); break; case IOMMU_PPR_LOG_TAIL_OFFSET: u64_to_reg(&iommu->ppr_log.reg_tail, val); break; case IOMMU_STATUS_MMIO_OFFSET: val &= IOMMU_STATUS_EVENT_OVERFLOW_MASK | IOMMU_STATUS_EVENT_LOG_INT_MASK | IOMMU_STATUS_COMP_WAIT_INT_MASK | IOMMU_STATUS_PPR_LOG_OVERFLOW_MASK | IOMMU_STATUS_PPR_LOG_INT_MASK | IOMMU_STATUS_GAPIC_LOG_OVERFLOW_MASK | IOMMU_STATUS_GAPIC_LOG_INT_MASK; u64_to_reg(&iommu->reg_status, reg_to_u64(iommu->reg_status) & ~val); break; default: AMD_IOMMU_DEBUG("guest writes unknown mmio offset = %lx," " val = %" PRIx64 "\n", offset, val); break; } } static int guest_iommu_mmio_write(struct vcpu *v, unsigned long addr, unsigned long len, unsigned long val) { struct guest_iommu *iommu = vcpu_iommu(v); unsigned long offset; uint64_t reg_old, mmio; uint32_t shift; uint64_t mask = 0; offset = addr - iommu->mmio_base; if ( unlikely((offset & (len - 1)) || (len > 8)) ) { AMD_IOMMU_DEBUG("iommu mmio write access is not aligned:" " offset = %lx, len = %lx\n", offset, len); return X86EMUL_UNHANDLEABLE; } mask = (len == 8) ? 
~0ULL : (1ULL << (len * 8)) - 1; shift = (offset & 7) * 8; /* mmio access is always aligned on 8-byte boundary */ mmio = offset & ~7; spin_lock(&iommu->lock); reg_old = iommu_mmio_read64(iommu, mmio); reg_old &= ~(mask << shift); val = reg_old | ((val & mask) << shift); guest_iommu_mmio_write64(iommu, mmio, val); spin_unlock(&iommu->lock); return X86EMUL_OKAY; } int guest_iommu_set_base(struct domain *d, uint64_t base) { p2m_type_t t; struct guest_iommu *iommu = domain_iommu(d); if ( !iommu ) return -EACCES; iommu->mmio_base = base; base >>= PAGE_SHIFT; for ( int i = 0; i < IOMMU_MMIO_PAGE_NR; i++ ) { unsigned long gfn = base + i; get_gfn_query(d, gfn, &t); p2m_change_type(d, gfn, t, p2m_mmio_dm); put_gfn(d, gfn); } return 0; } /* Initialize mmio read only bits */ static void guest_iommu_reg_init(struct guest_iommu *iommu) { uint32_t lower, upper; lower = upper = 0; /* Support prefetch */ iommu_set_bit(&lower,IOMMU_EXT_FEATURE_PREFSUP_SHIFT); /* Support PPR log */ iommu_set_bit(&lower,IOMMU_EXT_FEATURE_PPRSUP_SHIFT); /* Support guest translation */ iommu_set_bit(&lower,IOMMU_EXT_FEATURE_GTSUP_SHIFT); /* Support invalidate all command */ iommu_set_bit(&lower,IOMMU_EXT_FEATURE_IASUP_SHIFT); /* Host translation size has 6 levels */ set_field_in_reg_u32(HOST_ADDRESS_SIZE_6_LEVEL, lower, IOMMU_EXT_FEATURE_HATS_MASK, IOMMU_EXT_FEATURE_HATS_SHIFT, &lower); /* Guest translation size has 6 levels */ set_field_in_reg_u32(GUEST_ADDRESS_SIZE_6_LEVEL, lower, IOMMU_EXT_FEATURE_GATS_MASK, IOMMU_EXT_FEATURE_GATS_SHIFT, &lower); /* Single level gCR3 */ set_field_in_reg_u32(GUEST_CR3_1_LEVEL, lower, IOMMU_EXT_FEATURE_GLXSUP_MASK, IOMMU_EXT_FEATURE_GLXSUP_SHIFT, &lower); /* 9 bit PASID */ set_field_in_reg_u32(PASMAX_9_bit, upper, IOMMU_EXT_FEATURE_PASMAX_MASK, IOMMU_EXT_FEATURE_PASMAX_SHIFT, &upper); iommu->reg_ext_feature.lo = lower; iommu->reg_ext_feature.hi = upper; } /* Domain specific initialization */ int guest_iommu_init(struct domain* d) { struct guest_iommu *iommu; struct hvm_iommu *hd = domain_hvm_iommu(d); if ( !is_hvm_domain(d) || !iommu_enabled || !iommuv2_enabled ) return 0; iommu = xzalloc(struct guest_iommu); if ( !iommu ) { AMD_IOMMU_DEBUG("Error allocating guest iommu structure.\n"); return 1; } guest_iommu_reg_init(iommu); iommu->domain = d; hd->g_iommu = iommu; tasklet_init(&iommu->cmd_buffer_tasklet, guest_iommu_process_command, (unsigned long)d); spin_lock_init(&iommu->lock); return 0; } void guest_iommu_destroy(struct domain *d) { struct guest_iommu *iommu; iommu = domain_iommu(d); if ( !iommu ) return; tasklet_kill(&iommu->cmd_buffer_tasklet); xfree(iommu); domain_hvm_iommu(d)->g_iommu = NULL; } static int guest_iommu_mmio_range(struct vcpu *v, unsigned long addr) { struct guest_iommu *iommu = vcpu_iommu(v); return iommu && addr >= iommu->mmio_base && addr < iommu->mmio_base + IOMMU_MMIO_SIZE; } const struct hvm_mmio_handler iommu_mmio_handler = { .check_handler = guest_iommu_mmio_range, .read_handler = guest_iommu_mmio_read, .write_handler = guest_iommu_mmio_write }; xen-4.4.0/xen/drivers/passthrough/amd/iommu_cmd.c0000664000175000017500000003173612307313555020152 0ustar smbsmb/* * Copyright (C) 2011 Advanced Micro Devices, Inc. * Author: Leo Duran * Author: Wei Wang - adapted to xen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include "../ats.h" static int queue_iommu_command(struct amd_iommu *iommu, u32 cmd[]) { u32 tail, head, *cmd_buffer; int i; tail = iommu->cmd_buffer.tail; if ( ++tail == iommu->cmd_buffer.entries ) tail = 0; head = iommu_get_rb_pointer(readl(iommu->mmio_base + IOMMU_CMD_BUFFER_HEAD_OFFSET)); if ( head != tail ) { cmd_buffer = (u32 *)(iommu->cmd_buffer.buffer + (iommu->cmd_buffer.tail * IOMMU_CMD_BUFFER_ENTRY_SIZE)); for ( i = 0; i < IOMMU_CMD_BUFFER_U32_PER_ENTRY; i++ ) cmd_buffer[i] = cmd[i]; iommu->cmd_buffer.tail = tail; return 1; } return 0; } static void commit_iommu_command_buffer(struct amd_iommu *iommu) { u32 tail = 0; iommu_set_rb_pointer(&tail, iommu->cmd_buffer.tail); writel(tail, iommu->mmio_base+IOMMU_CMD_BUFFER_TAIL_OFFSET); } int send_iommu_command(struct amd_iommu *iommu, u32 cmd[]) { if ( queue_iommu_command(iommu, cmd) ) { commit_iommu_command_buffer(iommu); return 1; } return 0; } static void flush_command_buffer(struct amd_iommu *iommu) { u32 cmd[4], status; int loop_count, comp_wait; /* RW1C 'ComWaitInt' in status register */ writel(IOMMU_STATUS_COMP_WAIT_INT_MASK, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); /* send an empty COMPLETION_WAIT command to flush command buffer */ cmd[3] = cmd[2] = 0; set_field_in_reg_u32(IOMMU_CMD_COMPLETION_WAIT, 0, IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT, &cmd[1]); set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0, IOMMU_COMP_WAIT_I_FLAG_MASK, IOMMU_COMP_WAIT_I_FLAG_SHIFT, &cmd[0]); send_iommu_command(iommu, cmd); /* Make loop_count long enough for polling completion wait bit */ loop_count = 1000; do { status = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); comp_wait = get_field_from_reg_u32(status, IOMMU_STATUS_COMP_WAIT_INT_MASK, IOMMU_STATUS_COMP_WAIT_INT_SHIFT); --loop_count; } while ( !comp_wait && loop_count ); if ( comp_wait ) { /* RW1C 'ComWaitInt' in status register */ writel(IOMMU_STATUS_COMP_WAIT_INT_MASK, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); return; } AMD_IOMMU_DEBUG("Warning: ComWaitInt bit did not assert!\n"); } /* Build low level iommu command messages */ static void invalidate_iommu_pages(struct amd_iommu *iommu, u64 io_addr, u16 domain_id, u16 order) { u64 addr_lo, addr_hi; u32 cmd[4], entry; int sflag = 0, pde = 0; ASSERT ( order == 0 || order == 9 || order == 18 ); /* All pages associated with the domainID are invalidated */ if ( order || (io_addr == INV_IOMMU_ALL_PAGES_ADDRESS ) ) { sflag = 1; pde = 1; } /* If sflag == 1, the size of the invalidate command is determined by the first zero bit in the address starting from Address[12] */ if ( order ) { u64 mask = 1ULL << (order - 1 + PAGE_SHIFT); io_addr &= ~mask; io_addr |= mask - 1; } addr_lo = io_addr & DMA_32BIT_MASK; addr_hi = io_addr >> 32; set_field_in_reg_u32(domain_id, 0, IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK, IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT, &entry); set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_IOMMU_PAGES, entry, IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT, &entry); cmd[1] = entry; set_field_in_reg_u32(sflag, 0, IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK, 
IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT, &entry); set_field_in_reg_u32(pde, entry, IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK, IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT, &entry); set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, entry, IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK, IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT, &entry); cmd[2] = entry; set_field_in_reg_u32((u32)addr_hi, 0, IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK, IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT, &entry); cmd[3] = entry; cmd[0] = 0; send_iommu_command(iommu, cmd); } static void invalidate_iotlb_pages(struct amd_iommu *iommu, u16 maxpend, u32 pasid, u16 queueid, u64 io_addr, u16 dev_id, u16 order) { u64 addr_lo, addr_hi; u32 cmd[4], entry; int sflag = 0; ASSERT ( order == 0 || order == 9 || order == 18 ); if ( order || (io_addr == INV_IOMMU_ALL_PAGES_ADDRESS ) ) sflag = 1; /* If sflag == 1, the size of the invalidate command is determined by the first zero bit in the address starting from Address[12] */ if ( order ) { u64 mask = 1ULL << (order - 1 + PAGE_SHIFT); io_addr &= ~mask; io_addr |= mask - 1; } addr_lo = io_addr & DMA_32BIT_MASK; addr_hi = io_addr >> 32; set_field_in_reg_u32(dev_id, 0, IOMMU_INV_IOTLB_PAGES_DEVICE_ID_MASK, IOMMU_INV_IOTLB_PAGES_DEVICE_ID_SHIFT, &entry); set_field_in_reg_u32(maxpend, entry, IOMMU_INV_IOTLB_PAGES_MAXPEND_MASK, IOMMU_INV_IOTLB_PAGES_MAXPEND_SHIFT, &entry); set_field_in_reg_u32(pasid & 0xff, entry, IOMMU_INV_IOTLB_PAGES_PASID1_MASK, IOMMU_INV_IOTLB_PAGES_PASID1_SHIFT, &entry); cmd[0] = entry; set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_IOTLB_PAGES, 0, IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT, &entry); set_field_in_reg_u32(pasid >> 8, entry, IOMMU_INV_IOTLB_PAGES_PASID2_MASK, IOMMU_INV_IOTLB_PAGES_PASID2_SHIFT, &entry); set_field_in_reg_u32(queueid, entry, IOMMU_INV_IOTLB_PAGES_QUEUEID_MASK, IOMMU_INV_IOTLB_PAGES_QUEUEID_SHIFT, &entry); cmd[1] = entry; set_field_in_reg_u32(sflag, 0, IOMMU_INV_IOTLB_PAGES_S_FLAG_MASK, IOMMU_INV_IOTLB_PAGES_S_FLAG_MASK, &entry); set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, entry, IOMMU_INV_IOTLB_PAGES_ADDR_LOW_MASK, IOMMU_INV_IOTLB_PAGES_ADDR_LOW_SHIFT, &entry); cmd[2] = entry; set_field_in_reg_u32((u32)addr_hi, 0, IOMMU_INV_IOTLB_PAGES_ADDR_HIGH_MASK, IOMMU_INV_IOTLB_PAGES_ADDR_HIGH_SHIFT, &entry); cmd[3] = entry; send_iommu_command(iommu, cmd); } static void invalidate_dev_table_entry(struct amd_iommu *iommu, u16 device_id) { u32 cmd[4], entry; cmd[3] = cmd[2] = 0; set_field_in_reg_u32(device_id, 0, IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_MASK, IOMMU_INV_DEVTAB_ENTRY_DEVICE_ID_SHIFT, &entry); cmd[0] = entry; set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_DEVTAB_ENTRY, 0, IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT, &entry); cmd[1] = entry; send_iommu_command(iommu, cmd); } static void invalidate_interrupt_table(struct amd_iommu *iommu, u16 device_id) { u32 cmd[4], entry; cmd[3] = cmd[2] = 0; set_field_in_reg_u32(device_id, 0, IOMMU_INV_INT_TABLE_DEVICE_ID_MASK, IOMMU_INV_INT_TABLE_DEVICE_ID_SHIFT, &entry); cmd[0] = entry; set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_INT_TABLE, 0, IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT, &entry); cmd[1] = entry; send_iommu_command(iommu, cmd); } void invalidate_iommu_all(struct amd_iommu *iommu) { u32 cmd[4], entry; cmd[3] = cmd[2] = cmd[0] = 0; set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_IOMMU_ALL, 0, IOMMU_CMD_OPCODE_MASK, IOMMU_CMD_OPCODE_SHIFT, &entry); cmd[1] = entry; send_iommu_command(iommu, cmd); } void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev *pdev, uint64_t gaddr, unsigned int order) { unsigned long flags; struct amd_iommu *iommu; 
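    /*
     * Device IOTLB (ATS) flush: bail out unless ATS is enabled for this
     * device, look up the IOMMU serving its requestor ID, check that the
     * IOMMU advertises IOTLB support, then queue an INVALIDATE_IOTLB_PAGES
     * command under iommu->lock and wait for it via flush_command_buffer().
     */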
unsigned int req_id, queueid, maxpend; struct pci_ats_dev *ats_pdev; if ( !ats_enabled ) return; ats_pdev = get_ats_device(pdev->seg, pdev->bus, pdev->devfn); if ( ats_pdev == NULL ) return; if ( !pci_ats_enabled(ats_pdev->seg, ats_pdev->bus, ats_pdev->devfn) ) return; iommu = find_iommu_for_device(ats_pdev->seg, PCI_BDF2(ats_pdev->bus, ats_pdev->devfn)); if ( !iommu ) { AMD_IOMMU_DEBUG("%s: Can't find iommu for %04x:%02x:%02x.%u\n", __func__, ats_pdev->seg, ats_pdev->bus, PCI_SLOT(ats_pdev->devfn), PCI_FUNC(ats_pdev->devfn)); return; } if ( !iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) ) return; req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(ats_pdev->bus, devfn)); queueid = req_id; maxpend = ats_pdev->ats_queue_depth & 0xff; /* send INVALIDATE_IOTLB_PAGES command */ spin_lock_irqsave(&iommu->lock, flags); invalidate_iotlb_pages(iommu, maxpend, 0, queueid, gaddr, req_id, order); flush_command_buffer(iommu); spin_unlock_irqrestore(&iommu->lock, flags); } static void amd_iommu_flush_all_iotlbs(struct domain *d, uint64_t gaddr, unsigned int order) { struct pci_dev *pdev; if ( !ats_enabled ) return; for_each_pdev( d, pdev ) { u8 devfn = pdev->devfn; do { amd_iommu_flush_iotlb(devfn, pdev, gaddr, order); devfn += pdev->phantom_stride; } while ( devfn != pdev->devfn && PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) ); } } /* Flush iommu cache after p2m changes. */ static void _amd_iommu_flush_pages(struct domain *d, uint64_t gaddr, unsigned int order) { unsigned long flags; struct amd_iommu *iommu; struct hvm_iommu *hd = domain_hvm_iommu(d); unsigned int dom_id = hd->domain_id; /* send INVALIDATE_IOMMU_PAGES command */ for_each_amd_iommu ( iommu ) { spin_lock_irqsave(&iommu->lock, flags); invalidate_iommu_pages(iommu, gaddr, dom_id, order); flush_command_buffer(iommu); spin_unlock_irqrestore(&iommu->lock, flags); } if ( ats_enabled ) amd_iommu_flush_all_iotlbs(d, gaddr, order); } void amd_iommu_flush_all_pages(struct domain *d) { _amd_iommu_flush_pages(d, INV_IOMMU_ALL_PAGES_ADDRESS, 0); } void amd_iommu_flush_pages(struct domain *d, unsigned long gfn, unsigned int order) { _amd_iommu_flush_pages(d, (uint64_t) gfn << PAGE_SHIFT, order); } void amd_iommu_flush_device(struct amd_iommu *iommu, uint16_t bdf) { ASSERT( spin_is_locked(&iommu->lock) ); invalidate_dev_table_entry(iommu, bdf); flush_command_buffer(iommu); } void amd_iommu_flush_intremap(struct amd_iommu *iommu, uint16_t bdf) { ASSERT( spin_is_locked(&iommu->lock) ); invalidate_interrupt_table(iommu, bdf); flush_command_buffer(iommu); } void amd_iommu_flush_all_caches(struct amd_iommu *iommu) { ASSERT( spin_is_locked(&iommu->lock) ); invalidate_iommu_all(iommu); flush_command_buffer(iommu); } void amd_iommu_send_guest_cmd(struct amd_iommu *iommu, u32 cmd[]) { unsigned long flags; spin_lock_irqsave(&iommu->lock, flags); send_iommu_command(iommu, cmd); flush_command_buffer(iommu); spin_unlock_irqrestore(&iommu->lock, flags); } xen-4.4.0/xen/drivers/passthrough/amd/iommu_init.c0000664000175000017500000012065212307313555020346 0ustar smbsmb/* * Copyright (C) 2007 Advanced Micro Devices, Inc. * Author: Leo Duran * Author: Wei Wang - adapted to xen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include static int __initdata nr_amd_iommus; static struct tasklet amd_iommu_irq_tasklet; unsigned int __read_mostly ivrs_bdf_entries; static struct radix_tree_root ivrs_maps; struct list_head amd_iommu_head; struct table_struct device_table; bool_t iommuv2_enabled; static int iommu_has_ht_flag(struct amd_iommu *iommu, u8 mask) { return iommu->ht_flags & mask; } static int __init map_iommu_mmio_region(struct amd_iommu *iommu) { iommu->mmio_base = ioremap(iommu->mmio_base_phys, IOMMU_MMIO_REGION_LENGTH); if ( !iommu->mmio_base ) return -ENOMEM; memset(iommu->mmio_base, 0, IOMMU_MMIO_REGION_LENGTH); return 0; } static void __init unmap_iommu_mmio_region(struct amd_iommu *iommu) { if ( iommu->mmio_base ) { iounmap(iommu->mmio_base); iommu->mmio_base = NULL; } } static void set_iommu_ht_flags(struct amd_iommu *iommu) { u32 entry; entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); /* Setup HT flags */ if ( iommu_has_cap(iommu, PCI_CAP_HT_TUNNEL_SHIFT) ) iommu_has_ht_flag(iommu, ACPI_IVHD_TT_ENABLE) ? iommu_set_bit(&entry, IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT) : iommu_clear_bit(&entry, IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT); iommu_has_ht_flag(iommu, ACPI_IVHD_RES_PASS_PW) ? iommu_set_bit(&entry, IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT): iommu_clear_bit(&entry, IOMMU_CONTROL_RESP_PASS_POSTED_WRITE_SHIFT); iommu_has_ht_flag(iommu, ACPI_IVHD_ISOC) ? iommu_set_bit(&entry, IOMMU_CONTROL_ISOCHRONOUS_SHIFT): iommu_clear_bit(&entry, IOMMU_CONTROL_ISOCHRONOUS_SHIFT); iommu_has_ht_flag(iommu, ACPI_IVHD_PASS_PW) ? 
iommu_set_bit(&entry, IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT): iommu_clear_bit(&entry, IOMMU_CONTROL_PASS_POSTED_WRITE_SHIFT); /* Force coherent */ iommu_set_bit(&entry, IOMMU_CONTROL_COHERENT_SHIFT); writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); } static void register_iommu_dev_table_in_mmio_space(struct amd_iommu *iommu) { u64 addr_64, addr_lo, addr_hi; u32 entry; ASSERT( iommu->dev_table.buffer ); addr_64 = (u64)virt_to_maddr(iommu->dev_table.buffer); addr_lo = addr_64 & DMA_32BIT_MASK; addr_hi = addr_64 >> 32; entry = 0; iommu_set_addr_lo_to_reg(&entry, addr_lo >> PAGE_SHIFT); set_field_in_reg_u32((iommu->dev_table.alloc_size / PAGE_SIZE) - 1, entry, IOMMU_DEV_TABLE_SIZE_MASK, IOMMU_DEV_TABLE_SIZE_SHIFT, &entry); writel(entry, iommu->mmio_base + IOMMU_DEV_TABLE_BASE_LOW_OFFSET); entry = 0; iommu_set_addr_hi_to_reg(&entry, addr_hi); writel(entry, iommu->mmio_base + IOMMU_DEV_TABLE_BASE_HIGH_OFFSET); } static void register_iommu_cmd_buffer_in_mmio_space(struct amd_iommu *iommu) { u64 addr_64; u32 addr_lo, addr_hi; u32 power_of2_entries; u32 entry; ASSERT( iommu->cmd_buffer.buffer ); addr_64 = virt_to_maddr(iommu->cmd_buffer.buffer); addr_lo = addr_64; addr_hi = addr_64 >> 32; entry = 0; iommu_set_addr_lo_to_reg(&entry, addr_lo >> PAGE_SHIFT); writel(entry, iommu->mmio_base + IOMMU_CMD_BUFFER_BASE_LOW_OFFSET); power_of2_entries = get_order_from_bytes(iommu->cmd_buffer.alloc_size) + IOMMU_CMD_BUFFER_POWER_OF2_ENTRIES_PER_PAGE; entry = 0; iommu_set_addr_hi_to_reg(&entry, addr_hi); set_field_in_reg_u32(power_of2_entries, entry, IOMMU_CMD_BUFFER_LENGTH_MASK, IOMMU_CMD_BUFFER_LENGTH_SHIFT, &entry); writel(entry, iommu->mmio_base+IOMMU_CMD_BUFFER_BASE_HIGH_OFFSET); } static void register_iommu_event_log_in_mmio_space(struct amd_iommu *iommu) { u64 addr_64; u32 addr_lo, addr_hi; u32 power_of2_entries; u32 entry; ASSERT( iommu->event_log.buffer ); addr_64 = virt_to_maddr(iommu->event_log.buffer); addr_lo = addr_64; addr_hi = addr_64 >> 32; entry = 0; iommu_set_addr_lo_to_reg(&entry, addr_lo >> PAGE_SHIFT); writel(entry, iommu->mmio_base + IOMMU_EVENT_LOG_BASE_LOW_OFFSET); power_of2_entries = get_order_from_bytes(iommu->event_log.alloc_size) + IOMMU_EVENT_LOG_POWER_OF2_ENTRIES_PER_PAGE; entry = 0; iommu_set_addr_hi_to_reg(&entry, addr_hi); set_field_in_reg_u32(power_of2_entries, entry, IOMMU_EVENT_LOG_LENGTH_MASK, IOMMU_EVENT_LOG_LENGTH_SHIFT, &entry); writel(entry, iommu->mmio_base+IOMMU_EVENT_LOG_BASE_HIGH_OFFSET); } static void register_iommu_ppr_log_in_mmio_space(struct amd_iommu *iommu) { u64 addr_64; u32 addr_lo, addr_hi; u32 power_of2_entries; u32 entry; ASSERT ( iommu->ppr_log.buffer ); addr_64 = virt_to_maddr(iommu->ppr_log.buffer); addr_lo = addr_64; addr_hi = addr_64 >> 32; entry = 0; iommu_set_addr_lo_to_reg(&entry, addr_lo >> PAGE_SHIFT); writel(entry, iommu->mmio_base + IOMMU_PPR_LOG_BASE_LOW_OFFSET); power_of2_entries = get_order_from_bytes(iommu->ppr_log.alloc_size) + IOMMU_PPR_LOG_POWER_OF2_ENTRIES_PER_PAGE; entry = 0; iommu_set_addr_hi_to_reg(&entry, addr_hi); set_field_in_reg_u32(power_of2_entries, entry, IOMMU_PPR_LOG_LENGTH_MASK, IOMMU_PPR_LOG_LENGTH_SHIFT, &entry); writel(entry, iommu->mmio_base + IOMMU_PPR_LOG_BASE_HIGH_OFFSET); } static void set_iommu_translation_control(struct amd_iommu *iommu, int enable) { u32 entry; entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); enable ? 
iommu_set_bit(&entry, IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT) : iommu_clear_bit(&entry, IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT); writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); } static void set_iommu_guest_translation_control(struct amd_iommu *iommu, int enable) { u32 entry; entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); enable ? iommu_set_bit(&entry, IOMMU_CONTROL_GT_ENABLE_SHIFT) : iommu_clear_bit(&entry, IOMMU_CONTROL_GT_ENABLE_SHIFT); writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); if ( enable ) AMD_IOMMU_DEBUG("Guest Translation Enabled.\n"); } static void set_iommu_command_buffer_control(struct amd_iommu *iommu, int enable) { u32 entry; entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); /*reset head and tail pointer manually before enablement */ if ( enable ) { writeq(0, iommu->mmio_base + IOMMU_CMD_BUFFER_HEAD_OFFSET); writeq(0, iommu->mmio_base + IOMMU_CMD_BUFFER_TAIL_OFFSET); iommu_set_bit(&entry, IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT); } else iommu_clear_bit(&entry, IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT); writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); } static void register_iommu_exclusion_range(struct amd_iommu *iommu) { u32 addr_lo, addr_hi; u32 entry; addr_lo = iommu->exclusion_limit; addr_hi = iommu->exclusion_limit >> 32; set_field_in_reg_u32((u32)addr_hi, 0, IOMMU_EXCLUSION_LIMIT_HIGH_MASK, IOMMU_EXCLUSION_LIMIT_HIGH_SHIFT, &entry); writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_LIMIT_HIGH_OFFSET); set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, IOMMU_EXCLUSION_LIMIT_LOW_MASK, IOMMU_EXCLUSION_LIMIT_LOW_SHIFT, &entry); writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_LIMIT_LOW_OFFSET); addr_lo = iommu->exclusion_base & DMA_32BIT_MASK; addr_hi = iommu->exclusion_base >> 32; entry = 0; iommu_set_addr_hi_to_reg(&entry, addr_hi); writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_BASE_HIGH_OFFSET); entry = 0; iommu_set_addr_lo_to_reg(&entry, addr_lo >> PAGE_SHIFT); set_field_in_reg_u32(iommu->exclusion_allow_all, entry, IOMMU_EXCLUSION_ALLOW_ALL_MASK, IOMMU_EXCLUSION_ALLOW_ALL_SHIFT, &entry); set_field_in_reg_u32(iommu->exclusion_enable, entry, IOMMU_EXCLUSION_RANGE_ENABLE_MASK, IOMMU_EXCLUSION_RANGE_ENABLE_SHIFT, &entry); writel(entry, iommu->mmio_base+IOMMU_EXCLUSION_BASE_LOW_OFFSET); } static void set_iommu_event_log_control(struct amd_iommu *iommu, int enable) { u32 entry; entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); /*reset head and tail pointer manually before enablement */ if ( enable ) { writeq(0, iommu->mmio_base + IOMMU_EVENT_LOG_HEAD_OFFSET); writeq(0, iommu->mmio_base + IOMMU_EVENT_LOG_TAIL_OFFSET); iommu_set_bit(&entry, IOMMU_CONTROL_EVENT_LOG_INT_SHIFT); iommu_set_bit(&entry, IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT); } else { iommu_clear_bit(&entry, IOMMU_CONTROL_EVENT_LOG_INT_SHIFT); iommu_clear_bit(&entry, IOMMU_CONTROL_EVENT_LOG_ENABLE_SHIFT); } iommu_clear_bit(&entry, IOMMU_CONTROL_COMP_WAIT_INT_SHIFT); writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); } static void set_iommu_ppr_log_control(struct amd_iommu *iommu, int enable) { u32 entry; entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); /*reset head and tail pointer manually before enablement */ if ( enable ) { writeq(0, iommu->mmio_base + IOMMU_PPR_LOG_HEAD_OFFSET); writeq(0, iommu->mmio_base + IOMMU_PPR_LOG_TAIL_OFFSET); iommu_set_bit(&entry, IOMMU_CONTROL_PPR_ENABLE_SHIFT); iommu_set_bit(&entry, IOMMU_CONTROL_PPR_LOG_INT_SHIFT); iommu_set_bit(&entry, IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT); } else { 
iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_ENABLE_SHIFT); iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_LOG_INT_SHIFT); iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_LOG_ENABLE_SHIFT); } writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); if ( enable ) AMD_IOMMU_DEBUG("PPR Log Enabled.\n"); } /* read event log or ppr log from iommu ring buffer */ static int iommu_read_log(struct amd_iommu *iommu, struct ring_buffer *log, unsigned int entry_size, void (*parse_func)(struct amd_iommu *, u32 *)) { u32 tail, head, *entry, tail_offest, head_offset; BUG_ON(!iommu || ((log != &iommu->event_log) && (log != &iommu->ppr_log))); spin_lock(&log->lock); /* make sure there's an entry in the log */ tail_offest = ( log == &iommu->event_log ) ? IOMMU_EVENT_LOG_TAIL_OFFSET : IOMMU_PPR_LOG_TAIL_OFFSET; head_offset = ( log == &iommu->event_log ) ? IOMMU_EVENT_LOG_HEAD_OFFSET : IOMMU_PPR_LOG_HEAD_OFFSET; tail = readl(iommu->mmio_base + tail_offest); tail = iommu_get_rb_pointer(tail); while ( tail != log->head ) { /* read event log entry */ entry = (u32 *)(log->buffer + log->head * entry_size); parse_func(iommu, entry); if ( ++log->head == log->entries ) log->head = 0; /* update head pointer */ head = 0; iommu_set_rb_pointer(&head, log->head); writel(head, iommu->mmio_base + head_offset); } spin_unlock(&log->lock); return 0; } /* reset event log or ppr log when overflow */ static void iommu_reset_log(struct amd_iommu *iommu, struct ring_buffer *log, void (*ctrl_func)(struct amd_iommu *iommu, int)) { u32 entry; int log_run, run_bit; int loop_count = 1000; BUG_ON(!iommu || ((log != &iommu->event_log) && (log != &iommu->ppr_log))); run_bit = ( log == &iommu->event_log ) ? IOMMU_STATUS_EVENT_LOG_RUN_SHIFT : IOMMU_STATUS_PPR_LOG_RUN_SHIFT; /* wait until EventLogRun bit = 0 */ do { entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); log_run = iommu_get_bit(entry, run_bit); loop_count--; } while ( log_run && loop_count ); if ( log_run ) { AMD_IOMMU_DEBUG("Warning: Log Run bit %d is not cleared" "before reset!\n", run_bit); return; } ctrl_func(iommu, IOMMU_CONTROL_DISABLED); /* RW1C overflow bit */ writel(log == &iommu->event_log ? 
IOMMU_STATUS_EVENT_OVERFLOW_MASK : IOMMU_STATUS_PPR_LOG_OVERFLOW_MASK, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); /*reset event log base address */ log->head = 0; ctrl_func(iommu, IOMMU_CONTROL_ENABLED); } static void amd_iommu_msi_enable(struct amd_iommu *iommu, int flag) { __msi_set_enable(iommu->seg, PCI_BUS(iommu->bdf), PCI_SLOT(iommu->bdf), PCI_FUNC(iommu->bdf), iommu->msi.msi_attrib.pos, flag); } static void iommu_msi_unmask(struct irq_desc *desc) { unsigned long flags; struct amd_iommu *iommu = desc->action->dev_id; spin_lock_irqsave(&iommu->lock, flags); amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED); spin_unlock_irqrestore(&iommu->lock, flags); iommu->msi.msi_attrib.masked = 0; } static void iommu_msi_mask(struct irq_desc *desc) { unsigned long flags; struct amd_iommu *iommu = desc->action->dev_id; irq_complete_move(desc); spin_lock_irqsave(&iommu->lock, flags); amd_iommu_msi_enable(iommu, IOMMU_CONTROL_DISABLED); spin_unlock_irqrestore(&iommu->lock, flags); iommu->msi.msi_attrib.masked = 1; } static unsigned int iommu_msi_startup(struct irq_desc *desc) { iommu_msi_unmask(desc); return 0; } static void iommu_msi_end(struct irq_desc *desc, u8 vector) { iommu_msi_unmask(desc); ack_APIC_irq(); } static hw_irq_controller iommu_msi_type = { .typename = "AMD-IOMMU-MSI", .startup = iommu_msi_startup, .shutdown = iommu_msi_mask, .enable = iommu_msi_unmask, .disable = iommu_msi_mask, .ack = iommu_msi_mask, .end = iommu_msi_end, .set_affinity = set_msi_affinity, }; static unsigned int iommu_maskable_msi_startup(struct irq_desc *desc) { iommu_msi_unmask(desc); unmask_msi_irq(desc); return 0; } static void iommu_maskable_msi_shutdown(struct irq_desc *desc) { mask_msi_irq(desc); iommu_msi_mask(desc); } /* * While the names may appear mismatched, we indeed want to use the non- * maskable flavors here, as we want the ACK to be issued in ->end(). */ #define iommu_maskable_msi_ack ack_nonmaskable_msi_irq #define iommu_maskable_msi_end end_nonmaskable_msi_irq static hw_irq_controller iommu_maskable_msi_type = { .typename = "IOMMU-M-MSI", .startup = iommu_maskable_msi_startup, .shutdown = iommu_maskable_msi_shutdown, .enable = unmask_msi_irq, .disable = mask_msi_irq, .ack = iommu_maskable_msi_ack, .end = iommu_maskable_msi_end, .set_affinity = set_msi_affinity, }; static void parse_event_log_entry(struct amd_iommu *iommu, u32 entry[]) { u16 domain_id, device_id, flags; unsigned int bdf; u32 code; u64 *addr; int count = 0; static const char *const event_str[] = { #define EVENT_STR(name) [IOMMU_EVENT_##name - 1] = #name EVENT_STR(ILLEGAL_DEV_TABLE_ENTRY), EVENT_STR(IO_PAGE_FAULT), EVENT_STR(DEV_TABLE_HW_ERROR), EVENT_STR(PAGE_TABLE_HW_ERROR), EVENT_STR(ILLEGAL_COMMAND_ERROR), EVENT_STR(COMMAND_HW_ERROR), EVENT_STR(IOTLB_INV_TIMEOUT), EVENT_STR(INVALID_DEV_REQUEST) #undef EVENT_STR }; code = get_field_from_reg_u32(entry[1], IOMMU_EVENT_CODE_MASK, IOMMU_EVENT_CODE_SHIFT); /* * Workaround for erratum 732: * It can happen that the tail pointer is updated before the actual entry * got written. As suggested by RevGuide, we initialize the event log * buffer to all zeros and clear event log entries after processing them. 
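 * The loop below therefore treats a zero code field as "entry not yet
 * visible" and re-reads it, bounded by IOMMU_LOG_ENTRY_TIMEOUT retries with
 * a short delay each, before giving up.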
*/ while ( code == 0 ) { if ( unlikely(++count == IOMMU_LOG_ENTRY_TIMEOUT) ) { AMD_IOMMU_DEBUG("AMD-Vi: No event written to log\n"); return; } udelay(1); rmb(); code = get_field_from_reg_u32(entry[1], IOMMU_EVENT_CODE_MASK, IOMMU_EVENT_CODE_SHIFT); } if ( code == IOMMU_EVENT_IO_PAGE_FAULT ) { device_id = iommu_get_devid_from_event(entry[0]); domain_id = get_field_from_reg_u32(entry[1], IOMMU_EVENT_DOMAIN_ID_MASK, IOMMU_EVENT_DOMAIN_ID_SHIFT); flags = get_field_from_reg_u32(entry[1], IOMMU_EVENT_FLAGS_MASK, IOMMU_EVENT_FLAGS_SHIFT); addr= (u64*) (entry + 2); printk(XENLOG_ERR "AMD-Vi: " "%s: domain = %d, device id = %#x, " "fault address = %#"PRIx64", flags = %#x\n", event_str[code-1], domain_id, device_id, *addr, flags); for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) if ( get_dma_requestor_id(iommu->seg, bdf) == device_id ) pci_check_disable_device(iommu->seg, PCI_BUS(bdf), PCI_DEVFN2(bdf)); } else { AMD_IOMMU_DEBUG("%s %08x %08x %08x %08x\n", code <= ARRAY_SIZE(event_str) ? event_str[code - 1] : "event", entry[0], entry[1], entry[2], entry[3]); } memset(entry, 0, IOMMU_EVENT_LOG_ENTRY_SIZE); } static void iommu_check_event_log(struct amd_iommu *iommu) { u32 entry; unsigned long flags; /* RW1C interrupt status bit */ writel(IOMMU_STATUS_EVENT_LOG_INT_MASK, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); iommu_read_log(iommu, &iommu->event_log, sizeof(event_entry_t), parse_event_log_entry); spin_lock_irqsave(&iommu->lock, flags); /* Check event overflow. */ entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); if ( iommu_get_bit(entry, IOMMU_STATUS_EVENT_OVERFLOW_SHIFT) ) iommu_reset_log(iommu, &iommu->event_log, set_iommu_event_log_control); else { entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); if ( !(entry & IOMMU_CONTROL_EVENT_LOG_INT_MASK) ) { entry |= IOMMU_CONTROL_EVENT_LOG_INT_MASK; writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); /* * Re-schedule the tasklet to handle eventual log entries added * between reading the log above and re-enabling the interrupt. */ tasklet_schedule(&amd_iommu_irq_tasklet); } } /* * Workaround for erratum787: * Re-check to make sure the bit has been cleared. */ entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); if ( entry & IOMMU_STATUS_EVENT_LOG_INT_MASK ) tasklet_schedule(&amd_iommu_irq_tasklet); spin_unlock_irqrestore(&iommu->lock, flags); } void parse_ppr_log_entry(struct amd_iommu *iommu, u32 entry[]) { u16 device_id; u8 bus, devfn, code; struct pci_dev *pdev; int count = 0; code = get_field_from_reg_u32(entry[1], IOMMU_PPR_LOG_CODE_MASK, IOMMU_PPR_LOG_CODE_SHIFT); /* * Workaround for erratum 733: * It can happen that the tail pointer is updated before the actual entry * got written. As suggested by RevGuide, we initialize the event log * buffer to all zeros and clear ppr log entries after processing them. 
*/ while ( code == 0 ) { if ( unlikely(++count == IOMMU_LOG_ENTRY_TIMEOUT) ) { AMD_IOMMU_DEBUG("AMD-Vi: No ppr written to log\n"); return; } udelay(1); rmb(); code = get_field_from_reg_u32(entry[1], IOMMU_PPR_LOG_CODE_MASK, IOMMU_PPR_LOG_CODE_SHIFT); } /* here device_id is physical value */ device_id = iommu_get_devid_from_cmd(entry[0]); bus = PCI_BUS(device_id); devfn = PCI_DEVFN2(device_id); spin_lock(&pcidevs_lock); pdev = pci_get_real_pdev(iommu->seg, bus, devfn); spin_unlock(&pcidevs_lock); if ( pdev ) guest_iommu_add_ppr_log(pdev->domain, entry); memset(entry, 0, IOMMU_PPR_LOG_ENTRY_SIZE); } static void iommu_check_ppr_log(struct amd_iommu *iommu) { u32 entry; unsigned long flags; /* RW1C interrupt status bit */ writel(IOMMU_STATUS_PPR_LOG_INT_MASK, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); iommu_read_log(iommu, &iommu->ppr_log, sizeof(ppr_entry_t), parse_ppr_log_entry); spin_lock_irqsave(&iommu->lock, flags); /* Check event overflow. */ entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); if ( iommu_get_bit(entry, IOMMU_STATUS_PPR_LOG_OVERFLOW_SHIFT) ) iommu_reset_log(iommu, &iommu->ppr_log, set_iommu_ppr_log_control); else { entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); if ( !(entry & IOMMU_CONTROL_PPR_LOG_INT_MASK) ) { entry |= IOMMU_CONTROL_PPR_LOG_INT_MASK; writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); /* * Re-schedule the tasklet to handle eventual log entries added * between reading the log above and re-enabling the interrupt. */ tasklet_schedule(&amd_iommu_irq_tasklet); } } /* * Workaround for erratum787: * Re-check to make sure the bit has been cleared. */ entry = readl(iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); if ( entry & IOMMU_STATUS_PPR_LOG_INT_MASK ) tasklet_schedule(&amd_iommu_irq_tasklet); spin_unlock_irqrestore(&iommu->lock, flags); } static void do_amd_iommu_irq(unsigned long data) { struct amd_iommu *iommu; if ( !iommu_found() ) { AMD_IOMMU_DEBUG("no device found, something must be very wrong!\n"); return; } /* * No matter from where the interrupt came from, check all the * IOMMUs present in the system. This allows for having just one * tasklet (instead of one per each IOMMUs). 
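 * By the time this tasklet runs, iommu_interrupt_handler() has already
 * cleared the event/PPR log interrupt enable bits in the control register;
 * iommu_check_event_log() and iommu_check_ppr_log() re-enable them once
 * their ring buffers have been drained.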
*/ for_each_amd_iommu ( iommu ) { iommu_check_event_log(iommu); if ( iommu->ppr_log.buffer != NULL ) iommu_check_ppr_log(iommu); } } static void iommu_interrupt_handler(int irq, void *dev_id, struct cpu_user_regs *regs) { u32 entry; unsigned long flags; struct amd_iommu *iommu = dev_id; spin_lock_irqsave(&iommu->lock, flags); /* * Silence interrupts from both event and PPR by clearing the * enable logging bits in the control register */ entry = readl(iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); iommu_clear_bit(&entry, IOMMU_CONTROL_EVENT_LOG_INT_SHIFT); iommu_clear_bit(&entry, IOMMU_CONTROL_PPR_LOG_INT_SHIFT); writel(entry, iommu->mmio_base + IOMMU_CONTROL_MMIO_OFFSET); spin_unlock_irqrestore(&iommu->lock, flags); /* It is the tasklet that will clear the logs and re-enable interrupts */ tasklet_schedule(&amd_iommu_irq_tasklet); } static bool_t __init set_iommu_interrupt_handler(struct amd_iommu *iommu) { int irq, ret; hw_irq_controller *handler; unsigned long flags; u16 control; irq = create_irq(NUMA_NO_NODE); if ( irq <= 0 ) { dprintk(XENLOG_ERR, "IOMMU: no irqs\n"); return 0; } spin_lock_irqsave(&pcidevs_lock, flags); iommu->msi.dev = pci_get_pdev(iommu->seg, PCI_BUS(iommu->bdf), PCI_DEVFN2(iommu->bdf)); spin_unlock_irqrestore(&pcidevs_lock, flags); if ( !iommu->msi.dev ) { AMD_IOMMU_DEBUG("IOMMU: no pdev for %04x:%02x:%02x.%u\n", iommu->seg, PCI_BUS(iommu->bdf), PCI_SLOT(iommu->bdf), PCI_FUNC(iommu->bdf)); return 0; } control = pci_conf_read16(iommu->seg, PCI_BUS(iommu->bdf), PCI_SLOT(iommu->bdf), PCI_FUNC(iommu->bdf), iommu->msi.msi_attrib.pos + PCI_MSI_FLAGS); iommu->msi.msi.nvec = 1; if ( is_mask_bit_support(control) ) { iommu->msi.msi_attrib.maskbit = 1; iommu->msi.msi.mpos = msi_mask_bits_reg(iommu->msi.msi_attrib.pos, is_64bit_address(control)); handler = &iommu_maskable_msi_type; } else handler = &iommu_msi_type; ret = __setup_msi_irq(irq_to_desc(irq), &iommu->msi, handler); if ( !ret ) ret = request_irq(irq, iommu_interrupt_handler, "amd_iommu", iommu); if ( ret ) { destroy_irq(irq); AMD_IOMMU_DEBUG("can't request irq\n"); return 0; } iommu->msi.irq = irq; return 1; } /* * Family15h Model 10h-1fh erratum 746 (IOMMU Logging May Stall Translations) * Workaround: * BIOS should disable L2B micellaneous clock gating by setting * L2_L2B_CK_GATE_CONTROL[CKGateL2BMiscDisable](D0F2xF4_x90[2]) = 1b */ static void amd_iommu_erratum_746_workaround(struct amd_iommu *iommu) { u32 value; u8 bus = PCI_BUS(iommu->bdf); u8 dev = PCI_SLOT(iommu->bdf); u8 func = PCI_FUNC(iommu->bdf); if ( (boot_cpu_data.x86 != 0x15) || (boot_cpu_data.x86_model < 0x10) || (boot_cpu_data.x86_model > 0x1f) ) return; pci_conf_write32(iommu->seg, bus, dev, func, 0xf0, 0x90); value = pci_conf_read32(iommu->seg, bus, dev, func, 0xf4); if ( value & (1 << 2) ) return; /* Select NB indirect register 0x90 and enable writing */ pci_conf_write32(iommu->seg, bus, dev, func, 0xf0, 0x90 | (1 << 8)); pci_conf_write32(iommu->seg, bus, dev, func, 0xf4, value | (1 << 2)); printk(XENLOG_INFO "AMD-Vi: Applying erratum 746 workaround for IOMMU at %04x:%02x:%02x.%u\n", iommu->seg, bus, dev, func); /* Clear the enable writing bit */ pci_conf_write32(iommu->seg, bus, dev, func, 0xf0, 0x90); } static void enable_iommu(struct amd_iommu *iommu) { unsigned long flags; struct irq_desc *desc; spin_lock_irqsave(&iommu->lock, flags); if ( iommu->enabled ) { spin_unlock_irqrestore(&iommu->lock, flags); return; } amd_iommu_erratum_746_workaround(iommu); register_iommu_dev_table_in_mmio_space(iommu); register_iommu_cmd_buffer_in_mmio_space(iommu); 
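    /*
     * The remaining log and exclusion-range base/size registers are
     * programmed next, while the IOMMU is still disabled; the individual
     * enable bits are only set further down, once the MSI vector and HT
     * flags have been configured.
     */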
register_iommu_event_log_in_mmio_space(iommu); register_iommu_exclusion_range(iommu); if ( iommu_has_feature(iommu, IOMMU_EXT_FEATURE_PPRSUP_SHIFT) ) register_iommu_ppr_log_in_mmio_space(iommu); desc = irq_to_desc(iommu->msi.irq); spin_lock(&desc->lock); set_msi_affinity(desc, &cpu_online_map); spin_unlock(&desc->lock); amd_iommu_msi_enable(iommu, IOMMU_CONTROL_ENABLED); set_iommu_ht_flags(iommu); set_iommu_command_buffer_control(iommu, IOMMU_CONTROL_ENABLED); set_iommu_event_log_control(iommu, IOMMU_CONTROL_ENABLED); if ( iommu_has_feature(iommu, IOMMU_EXT_FEATURE_PPRSUP_SHIFT) ) set_iommu_ppr_log_control(iommu, IOMMU_CONTROL_ENABLED); if ( iommu_has_feature(iommu, IOMMU_EXT_FEATURE_GTSUP_SHIFT) ) set_iommu_guest_translation_control(iommu, IOMMU_CONTROL_ENABLED); set_iommu_translation_control(iommu, IOMMU_CONTROL_ENABLED); if ( iommu_has_feature(iommu, IOMMU_EXT_FEATURE_IASUP_SHIFT) ) amd_iommu_flush_all_caches(iommu); iommu->enabled = 1; spin_unlock_irqrestore(&iommu->lock, flags); } static void __init deallocate_buffer(void *buf, uint32_t sz) { int order = 0; if ( buf ) { order = get_order_from_bytes(sz); __free_amd_iommu_tables(buf, order); } } static void __init deallocate_device_table(struct table_struct *table) { deallocate_buffer(table->buffer, table->alloc_size); table->buffer = NULL; } static void __init deallocate_ring_buffer(struct ring_buffer *ring_buf) { deallocate_buffer(ring_buf->buffer, ring_buf->alloc_size); ring_buf->buffer = NULL; ring_buf->head = 0; ring_buf->tail = 0; } static void * __init allocate_buffer(uint32_t alloc_size, const char *name) { void * buffer; int order = get_order_from_bytes(alloc_size); buffer = __alloc_amd_iommu_tables(order); if ( buffer == NULL ) { AMD_IOMMU_DEBUG("Error allocating %s\n", name); return NULL; } memset(buffer, 0, PAGE_SIZE * (1UL << order)); return buffer; } static void * __init allocate_ring_buffer(struct ring_buffer *ring_buf, uint32_t entry_size, uint64_t entries, const char *name) { ring_buf->head = 0; ring_buf->tail = 0; spin_lock_init(&ring_buf->lock); ring_buf->alloc_size = PAGE_SIZE << get_order_from_bytes(entries * entry_size); ring_buf->entries = ring_buf->alloc_size / entry_size; ring_buf->buffer = allocate_buffer(ring_buf->alloc_size, name); return ring_buf->buffer; } static void * __init allocate_cmd_buffer(struct amd_iommu *iommu) { /* allocate 'command buffer' in power of 2 increments of 4K */ return allocate_ring_buffer(&iommu->cmd_buffer, sizeof(cmd_entry_t), IOMMU_CMD_BUFFER_DEFAULT_ENTRIES, "Command Buffer"); } static void * __init allocate_event_log(struct amd_iommu *iommu) { /* allocate 'event log' in power of 2 increments of 4K */ return allocate_ring_buffer(&iommu->event_log, sizeof(event_entry_t), IOMMU_EVENT_LOG_DEFAULT_ENTRIES, "Event Log"); } static void * __init allocate_ppr_log(struct amd_iommu *iommu) { /* allocate 'ppr log' in power of 2 increments of 4K */ return allocate_ring_buffer(&iommu->ppr_log, sizeof(ppr_entry_t), IOMMU_PPR_LOG_DEFAULT_ENTRIES, "PPR Log"); } static int __init amd_iommu_init_one(struct amd_iommu *iommu) { if ( map_iommu_mmio_region(iommu) != 0 ) goto error_out; get_iommu_features(iommu); if ( iommu->features ) iommuv2_enabled = 1; if ( allocate_cmd_buffer(iommu) == NULL ) goto error_out; if ( allocate_event_log(iommu) == NULL ) goto error_out; if ( iommu_has_feature(iommu, IOMMU_EXT_FEATURE_PPRSUP_SHIFT) ) if ( allocate_ppr_log(iommu) == NULL ) goto error_out; if ( !set_iommu_interrupt_handler(iommu) ) goto error_out; /* To make sure that device_table.buffer has been 
successfully allocated */ if ( device_table.buffer == NULL ) goto error_out; iommu->dev_table.alloc_size = device_table.alloc_size; iommu->dev_table.entries = device_table.entries; iommu->dev_table.buffer = device_table.buffer; enable_iommu(iommu); printk("AMD-Vi: IOMMU %d Enabled.\n", nr_amd_iommus ); nr_amd_iommus++; softirq_tasklet_init(&amd_iommu_irq_tasklet, do_amd_iommu_irq, 0); return 0; error_out: return -ENODEV; } static void __init amd_iommu_init_cleanup(void) { struct amd_iommu *iommu, *next; /* free amd iommu list */ list_for_each_entry_safe ( iommu, next, &amd_iommu_head, list ) { list_del(&iommu->list); if ( iommu->enabled ) { deallocate_ring_buffer(&iommu->cmd_buffer); deallocate_ring_buffer(&iommu->event_log); deallocate_ring_buffer(&iommu->ppr_log); unmap_iommu_mmio_region(iommu); } xfree(iommu); } /* free interrupt remapping table */ iterate_ivrs_entries(amd_iommu_free_intremap_table); /* free device table */ deallocate_device_table(&device_table); /* free ivrs_mappings[] */ radix_tree_destroy(&ivrs_maps, xfree); iommu_enabled = 0; iommu_passthrough = 0; iommu_intremap = 0; iommuv2_enabled = 0; } /* * We allocate an extra array element to store the segment number * (and in the future perhaps other global information). */ #define IVRS_MAPPINGS_SEG(m) m[ivrs_bdf_entries].dte_requestor_id struct ivrs_mappings *get_ivrs_mappings(u16 seg) { return radix_tree_lookup(&ivrs_maps, seg); } int iterate_ivrs_mappings(int (*handler)(u16 seg, struct ivrs_mappings *)) { u16 seg = 0; int rc = 0; do { struct ivrs_mappings *map; if ( !radix_tree_gang_lookup(&ivrs_maps, (void **)&map, seg, 1) ) break; seg = IVRS_MAPPINGS_SEG(map); rc = handler(seg, map); } while ( !rc && ++seg ); return rc; } int iterate_ivrs_entries(int (*handler)(u16 seg, struct ivrs_mappings *)) { u16 seg = 0; int rc = 0; do { struct ivrs_mappings *map; unsigned int bdf; if ( !radix_tree_gang_lookup(&ivrs_maps, (void **)&map, seg, 1) ) break; seg = IVRS_MAPPINGS_SEG(map); for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; ++bdf ) rc = handler(seg, map + bdf); } while ( !rc && ++seg ); return rc; } static int __init alloc_ivrs_mappings(u16 seg) { struct ivrs_mappings *ivrs_mappings; unsigned int bdf; BUG_ON( !ivrs_bdf_entries ); if ( get_ivrs_mappings(seg) ) return 0; ivrs_mappings = xzalloc_array(struct ivrs_mappings, ivrs_bdf_entries + 1); if ( ivrs_mappings == NULL ) { AMD_IOMMU_DEBUG("Error allocating IVRS Mappings table\n"); return -ENOMEM; } IVRS_MAPPINGS_SEG(ivrs_mappings) = seg; /* assign default values for device entries */ for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) { ivrs_mappings[bdf].dte_requestor_id = bdf; ivrs_mappings[bdf].dte_allow_exclusion = IOMMU_CONTROL_DISABLED; ivrs_mappings[bdf].unity_map_enable = IOMMU_CONTROL_DISABLED; ivrs_mappings[bdf].iommu = NULL; ivrs_mappings[bdf].intremap_table = NULL; ivrs_mappings[bdf].device_flags = 0; if ( amd_iommu_perdev_intremap ) spin_lock_init(&ivrs_mappings[bdf].intremap_lock); } radix_tree_insert(&ivrs_maps, seg, ivrs_mappings); return 0; } static int __init amd_iommu_setup_device_table( u16 seg, struct ivrs_mappings *ivrs_mappings) { unsigned int bdf; void *intr_tb, *dte; BUG_ON( (ivrs_bdf_entries == 0) ); /* allocate 'device table' on a 4K boundary */ device_table.alloc_size = PAGE_SIZE << get_order_from_bytes( PAGE_ALIGN(ivrs_bdf_entries * IOMMU_DEV_TABLE_ENTRY_SIZE)); device_table.entries = device_table.alloc_size / IOMMU_DEV_TABLE_ENTRY_SIZE; device_table.buffer = allocate_buffer(device_table.alloc_size, "Device Table"); if ( device_table.buffer == NULL ) 
return -ENOMEM; /* Add device table entries */ for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) { intr_tb = ivrs_mappings[bdf].intremap_table; if ( intr_tb ) { /* add device table entry */ dte = device_table.buffer + (bdf * IOMMU_DEV_TABLE_ENTRY_SIZE); iommu_dte_add_device_entry(dte, &ivrs_mappings[bdf]); amd_iommu_set_intremap_table( dte, (u64)virt_to_maddr(intr_tb), iommu_intremap); } } return 0; } /* Check whether SP5100 SATA Combined mode is on */ static bool_t __init amd_sp5100_erratum28(void) { u32 bus, id; u16 vendor_id, dev_id; u8 byte; for (bus = 0; bus < 256; bus++) { id = pci_conf_read32(0, bus, 0x14, 0, PCI_VENDOR_ID); vendor_id = id & 0xffff; dev_id = (id >> 16) & 0xffff; /* SP5100 SMBus module sets Combined mode on */ if (vendor_id != 0x1002 || dev_id != 0x4385) continue; byte = pci_conf_read8(0, bus, 0x14, 0, 0xad); if ( (byte >> 3) & 1 ) { printk(XENLOG_WARNING "AMD-Vi: SP5100 erratum 28 detected, disabling IOMMU.\n" "If possible, disable SATA Combined mode in BIOS or contact your vendor for BIOS update.\n"); return 1; } } return 0; } int __init amd_iommu_init(void) { struct amd_iommu *iommu; BUG_ON( !iommu_found() ); if ( iommu_intremap && amd_iommu_perdev_intremap && amd_sp5100_erratum28() ) goto error_out; ivrs_bdf_entries = amd_iommu_get_ivrs_dev_entries(); if ( !ivrs_bdf_entries ) goto error_out; radix_tree_init(&ivrs_maps); for_each_amd_iommu ( iommu ) if ( alloc_ivrs_mappings(iommu->seg) != 0 ) goto error_out; if ( amd_iommu_update_ivrs_mapping_acpi() != 0 ) goto error_out; /* initialize io-apic interrupt remapping entries */ if ( iommu_intremap && amd_iommu_setup_ioapic_remapping() != 0 ) goto error_out; /* allocate and initialize a global device table shared by all iommus */ if ( iterate_ivrs_mappings(amd_iommu_setup_device_table) != 0 ) goto error_out; /* per iommu initialization */ for_each_amd_iommu ( iommu ) if ( amd_iommu_init_one(iommu) != 0 ) goto error_out; return 0; error_out: amd_iommu_init_cleanup(); return -ENODEV; } static void disable_iommu(struct amd_iommu *iommu) { unsigned long flags; spin_lock_irqsave(&iommu->lock, flags); if ( !iommu->enabled ) { spin_unlock_irqrestore(&iommu->lock, flags); return; } amd_iommu_msi_enable(iommu, IOMMU_CONTROL_DISABLED); set_iommu_command_buffer_control(iommu, IOMMU_CONTROL_DISABLED); set_iommu_event_log_control(iommu, IOMMU_CONTROL_DISABLED); if ( iommu_has_feature(iommu, IOMMU_EXT_FEATURE_PPRSUP_SHIFT) ) set_iommu_ppr_log_control(iommu, IOMMU_CONTROL_DISABLED); if ( iommu_has_feature(iommu, IOMMU_EXT_FEATURE_GTSUP_SHIFT) ) set_iommu_guest_translation_control(iommu, IOMMU_CONTROL_DISABLED); set_iommu_translation_control(iommu, IOMMU_CONTROL_DISABLED); iommu->enabled = 0; spin_unlock_irqrestore(&iommu->lock, flags); } static void invalidate_all_domain_pages(void) { struct domain *d; for_each_domain( d ) amd_iommu_flush_all_pages(d); } static int _invalidate_all_devices( u16 seg, struct ivrs_mappings *ivrs_mappings) { unsigned int bdf; u16 req_id; unsigned long flags; struct amd_iommu *iommu; for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) { iommu = find_iommu_for_device(seg, bdf); req_id = ivrs_mappings[bdf].dte_requestor_id; if ( iommu ) { spin_lock_irqsave(&iommu->lock, flags); amd_iommu_flush_device(iommu, req_id); amd_iommu_flush_intremap(iommu, req_id); spin_unlock_irqrestore(&iommu->lock, flags); } } return 0; } static void invalidate_all_devices(void) { iterate_ivrs_mappings(_invalidate_all_devices); } void amd_iommu_suspend(void) { struct amd_iommu *iommu; for_each_amd_iommu ( iommu ) disable_iommu(iommu); } 
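/*
 * Illustrative sketch (not part of the driver): the register accessors used
 * throughout this file, set_field_in_reg_u32() and get_field_from_reg_u32(),
 * follow a plain mask-and-shift, read-modify-write convention.  The
 * standalone demo below re-implements that assumed convention under
 * hypothetical demo_* names so it can be built and run on its own; it is a
 * sketch of the pattern, not the hypervisor's implementation.
 */
#if 0 /* example only */
#include <assert.h>
#include <stdint.h>

static void demo_set_field_u32(uint32_t field, uint32_t reg_value,
                               uint32_t mask, unsigned int shift,
                               uint32_t *reg)
{
    /* Clear the field in the old value, then merge in the new one. */
    reg_value &= ~mask;
    reg_value |= (field << shift) & mask;
    *reg = reg_value;
}

static uint32_t demo_get_field_u32(uint32_t reg_value, uint32_t mask,
                                   unsigned int shift)
{
    return (reg_value & mask) >> shift;
}

int main(void)
{
    uint32_t entry = 0;

    /* Pack a 4-bit field at bits 7:4, then read it back. */
    demo_set_field_u32(0xA, entry, 0x000000F0, 4, &entry);
    assert(entry == 0x000000A0);
    assert(demo_get_field_u32(entry, 0x000000F0, 4) == 0xA);

    return 0;
}
#endif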
void amd_iommu_resume(void) { struct amd_iommu *iommu; for_each_amd_iommu ( iommu ) { /* * To make sure that iommus have not been touched * before re-enablement */ disable_iommu(iommu); enable_iommu(iommu); } /* flush all cache entries after iommu re-enabled */ if ( !iommu_has_feature(iommu, IOMMU_EXT_FEATURE_IASUP_SHIFT) ) { invalidate_all_devices(); invalidate_all_domain_pages(); } } xen-4.4.0/xen/drivers/passthrough/amd/iommu_detect.c0000664000175000017500000001111012307313555020637 0ustar smbsmb/* * Copyright (C) 2007 Advanced Micro Devices, Inc. * Author: Leo Duran * Author: Wei Wang - adapted to xen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include static int __init get_iommu_msi_capabilities( u16 seg, u8 bus, u8 dev, u8 func, struct amd_iommu *iommu) { int pos; pos = pci_find_cap_offset(seg, bus, dev, func, PCI_CAP_ID_MSI); if ( !pos ) return -ENODEV; AMD_IOMMU_DEBUG("Found MSI capability block at %#x\n", pos); iommu->msi.msi_attrib.type = PCI_CAP_ID_MSI; iommu->msi.msi_attrib.pos = pos; iommu->msi.msi_attrib.is_64 = 1; return 0; } static int __init get_iommu_capabilities( u16 seg, u8 bus, u8 dev, u8 func, u16 cap_ptr, struct amd_iommu *iommu) { u8 type; iommu->cap.header = pci_conf_read32(seg, bus, dev, func, cap_ptr); type = get_field_from_reg_u32(iommu->cap.header, PCI_CAP_TYPE_MASK, PCI_CAP_TYPE_SHIFT); if ( type != PCI_CAP_TYPE_IOMMU ) return -ENODEV; return 0; } void __init get_iommu_features(struct amd_iommu *iommu) { u32 low, high; int i = 0 ; static const char *__initdata feature_str[] = { "- Prefetch Pages Command", "- Peripheral Page Service Request", "- X2APIC Supported", "- NX bit Supported", "- Guest Translation", "- Reserved bit [5]", "- Invalidate All Command", "- Guest APIC supported", "- Hardware Error Registers", "- Performance Counters", NULL }; ASSERT( iommu->mmio_base ); if ( !iommu_has_cap(iommu, PCI_CAP_EFRSUP_SHIFT) ) { iommu->features = 0; return; } low = readl(iommu->mmio_base + IOMMU_EXT_FEATURE_MMIO_OFFSET); high = readl(iommu->mmio_base + IOMMU_EXT_FEATURE_MMIO_OFFSET + 4); iommu->features = ((u64)high << 32) | low; printk("AMD-Vi: IOMMU Extended Features:\n"); while ( feature_str[i] ) { if ( iommu_has_feature(iommu, i) ) printk( " %s\n", feature_str[i]); i++; } } int __init amd_iommu_detect_one_acpi( const struct acpi_ivrs_hardware *ivhd_block) { struct amd_iommu *iommu; u8 bus, dev, func; int rt = 0; if ( ivhd_block->header.length < sizeof(*ivhd_block) ) { AMD_IOMMU_DEBUG("Invalid IVHD Block Length!\n"); return -ENODEV; } if ( !ivhd_block->header.device_id || !ivhd_block->capability_offset || !ivhd_block->base_address) { AMD_IOMMU_DEBUG("Invalid IVHD Block!\n"); return -ENODEV; } iommu = xzalloc(struct amd_iommu); if ( !iommu ) { AMD_IOMMU_DEBUG("Error allocating amd_iommu\n"); return -ENOMEM; } spin_lock_init(&iommu->lock); iommu->seg = 
ivhd_block->pci_segment_group; iommu->bdf = ivhd_block->header.device_id; iommu->cap_offset = ivhd_block->capability_offset; iommu->mmio_base_phys = ivhd_block->base_address; /* override IOMMU HT flags */ iommu->ht_flags = ivhd_block->header.flags; bus = PCI_BUS(iommu->bdf); dev = PCI_SLOT(iommu->bdf); func = PCI_FUNC(iommu->bdf); rt = get_iommu_capabilities(iommu->seg, bus, dev, func, iommu->cap_offset, iommu); if ( rt ) goto out; rt = get_iommu_msi_capabilities(iommu->seg, bus, dev, func, iommu); if ( rt ) goto out; rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func)); if ( rt ) printk(XENLOG_ERR "Could not mark config space of %04x:%02x:%02x.%u read-only (%d)\n", iommu->seg, bus, dev, func, rt); list_add_tail(&iommu->list, &amd_iommu_head); rt = 0; out: if ( rt ) xfree(iommu); return rt; } xen-4.4.0/xen/drivers/acpi/0000775000175000017500000000000012307313555013627 5ustar smbsmbxen-4.4.0/xen/drivers/acpi/osl.c0000664000175000017500000001213612307313555014573 0ustar smbsmb/* * acpi_osl.c - OS-dependent functions ($Revision: 83 $) * * Copyright (C) 2000 Andrew Henroid * Copyright (C) 2001, 2002 Andy Grover * Copyright (C) 2001, 2002 Paul Diefenbaugh * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define _COMPONENT ACPI_OS_SERVICES ACPI_MODULE_NAME("osl") #ifdef CONFIG_ACPI_CUSTOM_DSDT #include CONFIG_ACPI_CUSTOM_DSDT_FILE #endif void __init acpi_os_printf(const char *fmt, ...) { va_list args; va_start(args, fmt); acpi_os_vprintf(fmt, args); va_end(args); } void __init acpi_os_vprintf(const char *fmt, va_list args) { static char buffer[512]; vsnprintf(buffer, sizeof(buffer), fmt, args); printk("%s", buffer); } acpi_physical_address __init acpi_os_get_root_pointer(void) { if (efi_enabled) { if (efi.acpi20 != EFI_INVALID_TABLE_ADDR) return efi.acpi20; else if (efi.acpi != EFI_INVALID_TABLE_ADDR) return efi.acpi; else { printk(KERN_ERR PREFIX "System description tables not found\n"); return 0; } } else { acpi_physical_address pa = 0; acpi_find_root_pointer(&pa); return pa; } } void __iomem * acpi_os_map_memory(acpi_physical_address phys, acpi_size size) { if (system_state >= SYS_STATE_active) { unsigned long pfn = PFN_DOWN(phys); unsigned int offs = phys & (PAGE_SIZE - 1); /* The low first Mb is always mapped. 
*/ if ( !((phys + size - 1) >> 20) ) return __va(phys); return __vmap(&pfn, PFN_UP(offs + size), 1, 1, PAGE_HYPERVISOR_NOCACHE) + offs; } return __acpi_map_table(phys, size); } void acpi_os_unmap_memory(void __iomem * virt, acpi_size size) { if (system_state >= SYS_STATE_active) vunmap((void *)((unsigned long)virt & PAGE_MASK)); } acpi_status acpi_os_read_port(acpi_io_address port, u32 * value, u32 width) { u32 dummy; if (!value) value = &dummy; *value = 0; if (width <= 8) { *(u8 *) value = inb(port); } else if (width <= 16) { *(u16 *) value = inw(port); } else if (width <= 32) { *(u32 *) value = inl(port); } else { BUG(); } return AE_OK; } acpi_status acpi_os_write_port(acpi_io_address port, u32 value, u32 width) { if (width <= 8) { outb(value, port); } else if (width <= 16) { outw(value, port); } else if (width <= 32) { outl(value, port); } else { BUG(); } return AE_OK; } acpi_status acpi_os_read_memory(acpi_physical_address phys_addr, u32 * value, u32 width) { u32 dummy; void __iomem *virt_addr = acpi_os_map_memory(phys_addr, width >> 3); if (!value) value = &dummy; switch (width) { case 8: *(u8 *) value = readb(virt_addr); break; case 16: *(u16 *) value = readw(virt_addr); break; case 32: *(u32 *) value = readl(virt_addr); break; default: BUG(); } acpi_os_unmap_memory(virt_addr, width >> 3); return AE_OK; } acpi_status acpi_os_write_memory(acpi_physical_address phys_addr, u32 value, u32 width) { void __iomem *virt_addr = acpi_os_map_memory(phys_addr, width >> 3); switch (width) { case 8: writeb(value, virt_addr); break; case 16: writew(value, virt_addr); break; case 32: writel(value, virt_addr); break; default: BUG(); } acpi_os_unmap_memory(virt_addr, width >> 3); return AE_OK; } #define is_xmalloc_memory(ptr) ((unsigned long)(ptr) & (PAGE_SIZE - 1)) void *__init acpi_os_alloc_memory(size_t sz) { void *ptr; if (system_state == SYS_STATE_early_boot) return mfn_to_virt(alloc_boot_pages(PFN_UP(sz), 1)); ptr = xmalloc_bytes(sz); ASSERT(!ptr || is_xmalloc_memory(ptr)); return ptr; } void *__init acpi_os_zalloc_memory(size_t sz) { void *ptr; if (system_state != SYS_STATE_early_boot) { ptr = xzalloc_bytes(sz); ASSERT(!ptr || is_xmalloc_memory(ptr)); return ptr; } ptr = acpi_os_alloc_memory(sz); return ptr ? memset(ptr, 0, sz) : NULL; } void __init acpi_os_free_memory(void *ptr) { if (is_xmalloc_memory(ptr)) xfree(ptr); else if (ptr && system_state == SYS_STATE_early_boot) init_boot_pages(__pa(ptr), __pa(ptr) + PAGE_SIZE); } xen-4.4.0/xen/drivers/acpi/Makefile0000664000175000017500000000027312307313555015271 0ustar smbsmbsubdir-y += tables subdir-y += utilities subdir-$(x86) += apei obj-bin-y += tables.init.o obj-y += numa.o obj-y += osl.o obj-y += pmstat.o obj-$(x86) += hwregs.o obj-$(x86) += reboot.o xen-4.4.0/xen/drivers/acpi/tables.c0000664000175000017500000002246112307313555015252 0ustar smbsmb/* * acpi_tables.c - ACPI Boot-Time Table Parsing * * Copyright (C) 2001 Paul Diefenbaugh * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * */ #include #include #include #include #include #include #include #include #define PREFIX "ACPI: " #define ACPI_MAX_TABLES 128 static const char *__initdata mps_inti_flags_polarity[] = { "dfl", "high", "res", "low" }; static const char *__initdata mps_inti_flags_trigger[] = { "dfl", "edge", "res", "level" }; static int acpi_apic_instance __initdata; void __init acpi_table_print_madt_entry(struct acpi_subtable_header *header) { if (!header) return; switch (header->type) { case ACPI_MADT_TYPE_LOCAL_APIC: { struct acpi_madt_local_apic *p = (struct acpi_madt_local_apic *)header; printk(KERN_INFO PREFIX "LAPIC (acpi_id[0x%02x] lapic_id[0x%02x] %s)\n", p->processor_id, p->id, (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); } break; case ACPI_MADT_TYPE_LOCAL_X2APIC: { struct acpi_madt_local_x2apic *p = (struct acpi_madt_local_x2apic *)header; printk(KERN_INFO PREFIX "X2APIC (apic_id[0x%02x] uid[0x%02x] %s)\n", p->local_apic_id, p->uid, (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); } break; case ACPI_MADT_TYPE_IO_APIC: { struct acpi_madt_io_apic *p = (struct acpi_madt_io_apic *)header; printk(KERN_INFO PREFIX "IOAPIC (id[0x%02x] address[0x%08x] gsi_base[%d])\n", p->id, p->address, p->global_irq_base); } break; case ACPI_MADT_TYPE_INTERRUPT_OVERRIDE: { struct acpi_madt_interrupt_override *p = (struct acpi_madt_interrupt_override *)header; printk(KERN_INFO PREFIX "INT_SRC_OVR (bus %d bus_irq %d global_irq %d %s %s)\n", p->bus, p->source_irq, p->global_irq, mps_inti_flags_polarity[p->inti_flags & ACPI_MADT_POLARITY_MASK], mps_inti_flags_trigger[(p->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2]); if (p->inti_flags & ~(ACPI_MADT_POLARITY_MASK | ACPI_MADT_TRIGGER_MASK)) printk(KERN_INFO PREFIX "INT_SRC_OVR unexpected reserved flags: %#x\n", p->inti_flags & ~(ACPI_MADT_POLARITY_MASK | ACPI_MADT_TRIGGER_MASK)); } break; case ACPI_MADT_TYPE_NMI_SOURCE: { struct acpi_madt_nmi_source *p = (struct acpi_madt_nmi_source *)header; printk(KERN_INFO PREFIX "NMI_SRC (%s %s global_irq %d)\n", mps_inti_flags_polarity[p->inti_flags & ACPI_MADT_POLARITY_MASK], mps_inti_flags_trigger[(p->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2], p->global_irq); } break; case ACPI_MADT_TYPE_LOCAL_APIC_NMI: { struct acpi_madt_local_apic_nmi *p = (struct acpi_madt_local_apic_nmi *)header; printk(KERN_INFO PREFIX "LAPIC_NMI (acpi_id[0x%02x] %s %s lint[%#x])\n", p->processor_id, mps_inti_flags_polarity[p->inti_flags & ACPI_MADT_POLARITY_MASK ], mps_inti_flags_trigger[(p->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2], p->lint); } break; case ACPI_MADT_TYPE_LOCAL_X2APIC_NMI: { u16 polarity, trigger; struct acpi_madt_local_x2apic_nmi *p = (struct acpi_madt_local_x2apic_nmi *)header; polarity = p->inti_flags & ACPI_MADT_POLARITY_MASK; trigger = (p->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2; printk(KERN_INFO PREFIX "X2APIC_NMI (uid[0x%02x] %s %s lint[%#x])\n", p->uid, mps_inti_flags_polarity[polarity], mps_inti_flags_trigger[trigger], p->lint); } break; case ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE: { struct acpi_madt_local_apic_override *p = (struct acpi_madt_local_apic_override *)header; printk(KERN_INFO PREFIX "LAPIC_ADDR_OVR (address[%p])\n", (void *)(unsigned long)p->address); } break; case ACPI_MADT_TYPE_IO_SAPIC: { struct 
acpi_madt_io_sapic *p = (struct acpi_madt_io_sapic *)header; printk(KERN_INFO PREFIX "IOSAPIC (id[%#x] address[%p] gsi_base[%d])\n", p->id, (void *)(unsigned long)p->address, p->global_irq_base); } break; case ACPI_MADT_TYPE_LOCAL_SAPIC: { struct acpi_madt_local_sapic *p = (struct acpi_madt_local_sapic *)header; printk(KERN_INFO PREFIX "LSAPIC (acpi_id[0x%02x] lsapic_id[0x%02x] lsapic_eid[0x%02x] %s)\n", p->processor_id, p->id, p->eid, (p->lapic_flags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); } break; case ACPI_MADT_TYPE_INTERRUPT_SOURCE: { struct acpi_madt_interrupt_source *p = (struct acpi_madt_interrupt_source *)header; printk(KERN_INFO PREFIX "PLAT_INT_SRC (%s %s type[%#x] id[0x%04x] eid[%#x] iosapic_vector[%#x] global_irq[%#x]\n", mps_inti_flags_polarity[p->inti_flags & ACPI_MADT_POLARITY_MASK], mps_inti_flags_trigger[(p->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2], p->type, p->id, p->eid, p->io_sapic_vector, p->global_irq); } break; default: printk(KERN_WARNING PREFIX "Found unsupported MADT entry (type = %#x)\n", header->type); break; } } int __init acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, acpi_table_entry_handler handler, unsigned int max_entries) { struct acpi_table_header *table_header = NULL; struct acpi_subtable_header *entry; unsigned int count = 0; unsigned long table_end; if (!handler) return -EINVAL; if (strncmp(id, ACPI_SIG_MADT, 4) == 0) acpi_get_table(id, acpi_apic_instance, &table_header); else acpi_get_table(id, 0, &table_header); if (!table_header) { printk(KERN_WARNING PREFIX "%4.4s not present\n", id); return -ENODEV; } table_end = (unsigned long)table_header + table_header->length; /* Parse all entries looking for a match. */ entry = (struct acpi_subtable_header *) ((unsigned long)table_header + table_size); while (((unsigned long)entry) + sizeof(struct acpi_subtable_header) < table_end) { if (entry->type == entry_id && (!max_entries || count++ < max_entries)) if (handler(entry, table_end)) return -EINVAL; entry = (struct acpi_subtable_header *) ((unsigned long)entry + entry->length); } if (max_entries && count > max_entries) { printk(KERN_WARNING PREFIX "[%4.4s:%#x] ignored %i entries of " "%i found\n", id, entry_id, count - max_entries, count); } return count; } int __init acpi_table_parse_madt(enum acpi_madt_type id, acpi_table_entry_handler handler, unsigned int max_entries) { return acpi_table_parse_entries(ACPI_SIG_MADT, sizeof(struct acpi_table_madt), id, handler, max_entries); } /** * acpi_table_parse - find table with @id, run @handler on it * * @id: table id to find * @handler: handler to run * * Scan the ACPI System Descriptor Table (STD) for a table matching @id, * run @handler on it. */ int __init acpi_table_parse(char *id, acpi_table_handler handler) { struct acpi_table_header *table = NULL; if (!handler) return -EINVAL; if (strncmp(id, ACPI_SIG_MADT, 4) == 0) acpi_get_table(id, acpi_apic_instance, &table); else acpi_get_table(id, 0, &table); if (table) { return handler(table); } else return 1; } /* * The BIOS is supposed to supply a single APIC/MADT, * but some report two. Provide a knob to use either. * (don't you wish instance 0 and 1 were not the same?) 
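 * The knob is the "acpi_apic_instance" command line option, registered via
 * custom_param() at the bottom of this file; it selects which MADT instance
 * acpi_get_table() fetches.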
*/ static void __init check_multiple_madt(void) { struct acpi_table_header *table = NULL; acpi_get_table(ACPI_SIG_MADT, 2, &table); if (table) { printk(KERN_WARNING PREFIX "BIOS bug: multiple APIC/MADT found," " using %d\n", acpi_apic_instance); printk(KERN_WARNING PREFIX "If \"acpi_apic_instance=%d\" works better, " "notify linux-acpi@vger.kernel.org\n", acpi_apic_instance ? 0 : 2); } else acpi_apic_instance = 0; return; } /* * acpi_table_init() * * find RSDP, find and checksum SDT/XSDT. * checksum all tables, print SDT/XSDT * * result: sdt_entry[] is initialized */ int __init acpi_table_init(void) { acpi_initialize_tables(NULL, ACPI_MAX_TABLES, 0); check_multiple_madt(); return 0; } static int __init acpi_parse_apic_instance(char *str) { acpi_apic_instance = simple_strtoul(str, NULL, 0); printk(KERN_NOTICE PREFIX "Shall use APIC/MADT table %d\n", acpi_apic_instance); return 0; } custom_param("acpi_apic_instance", acpi_parse_apic_instance); xen-4.4.0/xen/drivers/acpi/numa.c0000664000175000017500000001417312307313555014741 0ustar smbsmb/* * acpi_numa.c - ACPI NUMA support * * Copyright (C) 2002 Takayoshi Kochi * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * */ #include #include #include #include #include #include #include #define ACPI_NUMA 0x80000000 #define _COMPONENT ACPI_NUMA ACPI_MODULE_NAME("numa") int __initdata srat_rev; void __init acpi_table_print_srat_entry(struct acpi_subtable_header * header) { ACPI_FUNCTION_NAME("acpi_table_print_srat_entry"); if (!header) return; switch (header->type) { case ACPI_SRAT_TYPE_CPU_AFFINITY: #ifdef ACPI_DEBUG_OUTPUT { struct acpi_srat_cpu_affinity *p = container_of(header, struct acpi_srat_cpu_affinity, header); u32 proximity_domain = p->proximity_domain_lo; if (srat_rev >= 2) { proximity_domain |= p->proximity_domain_hi[0] << 8; proximity_domain |= p->proximity_domain_hi[1] << 16; proximity_domain |= p->proximity_domain_hi[2] << 24; } ACPI_DEBUG_PRINT((ACPI_DB_INFO, "SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n", p->apic_id, p->local_sapic_eid, proximity_domain, p->flags & ACPI_SRAT_CPU_ENABLED ? "enabled" : "disabled")); } #endif /* ACPI_DEBUG_OUTPUT */ break; case ACPI_SRAT_TYPE_MEMORY_AFFINITY: #ifdef ACPI_DEBUG_OUTPUT { struct acpi_srat_mem_affinity *p = container_of(header, struct acpi_srat_mem_affinity, header); u32 proximity_domain = p->proximity_domain; if (srat_rev < 2) proximity_domain &= 0xff; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "SRAT Memory (%#"PRIx64 " length %#"PRIx64")" " in proximity domain %d %s%s\n", p->base_address, p->length, proximity_domain, p->flags & ACPI_SRAT_MEM_ENABLED ? "enabled" : "disabled", p->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE ? 
" hot-pluggable" : "")); } #endif /* ACPI_DEBUG_OUTPUT */ break; case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: #ifdef ACPI_DEBUG_OUTPUT { struct acpi_srat_x2apic_cpu_affinity *p = (struct acpi_srat_x2apic_cpu_affinity *)header; ACPI_DEBUG_PRINT((ACPI_DB_INFO, "SRAT Processor (x2apicid[0x%08x]) in" " proximity domain %d %s\n", p->apic_id, p->proximity_domain, (p->flags & ACPI_SRAT_CPU_ENABLED) ? "enabled" : "disabled")); } #endif /* ACPI_DEBUG_OUTPUT */ break; default: printk(KERN_WARNING PREFIX "Found unsupported SRAT entry (type = %#x)\n", header->type); break; } } static int __init acpi_parse_slit(struct acpi_table_header *table) { acpi_numa_slit_init((struct acpi_table_slit *)table); return 0; } static int __init acpi_parse_x2apic_affinity(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_srat_x2apic_cpu_affinity *processor_affinity; processor_affinity = (struct acpi_srat_x2apic_cpu_affinity *)header; if (!processor_affinity) return -EINVAL; acpi_table_print_srat_entry(header); /* let architecture-dependent part to do it */ acpi_numa_x2apic_affinity_init(processor_affinity); return 0; } static int __init acpi_parse_processor_affinity(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_srat_cpu_affinity *processor_affinity = container_of(header, struct acpi_srat_cpu_affinity, header); if (!processor_affinity) return -EINVAL; acpi_table_print_srat_entry(header); /* let architecture-dependent part to do it */ acpi_numa_processor_affinity_init(processor_affinity); return 0; } static int __init acpi_parse_memory_affinity(struct acpi_subtable_header * header, const unsigned long end) { struct acpi_srat_mem_affinity *memory_affinity = container_of(header, struct acpi_srat_mem_affinity, header); if (!memory_affinity) return -EINVAL; acpi_table_print_srat_entry(header); /* let architecture-dependent part to do it */ acpi_numa_memory_affinity_init(memory_affinity); return 0; } int __init acpi_parse_srat(struct acpi_table_header *table) { if (!table) return -EINVAL; srat_rev = table->revision; return 0; } int __init acpi_table_parse_srat(int id, acpi_madt_entry_handler handler, unsigned int max_entries) { return acpi_table_parse_entries(ACPI_SIG_SRAT, sizeof(struct acpi_table_srat), id, handler, max_entries); } int __init acpi_numa_init(void) { /* SRAT: Static Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, acpi_parse_x2apic_affinity, NR_CPUS); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, acpi_parse_processor_affinity, NR_CPUS); acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); } /* SLIT: System Locality Information Table */ acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); acpi_numa_arch_fixup(); return 0; } #if 0 int acpi_get_pxm(acpi_handle h) { unsigned long pxm; acpi_status status; acpi_handle handle; acpi_handle phandle = h; do { handle = phandle; status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); if (ACPI_SUCCESS(status)) return (int)pxm; status = acpi_get_parent(handle, &phandle); } while (ACPI_SUCCESS(status)); return -1; } EXPORT_SYMBOL(acpi_get_pxm); #endif xen-4.4.0/xen/drivers/acpi/utilities/0000775000175000017500000000000012307313555015642 5ustar smbsmbxen-4.4.0/xen/drivers/acpi/utilities/Makefile0000664000175000017500000000005712307313555017304 0ustar smbsmbobj-y += utglobal.o obj-bin-y += utmisc.init.o 
xen-4.4.0/xen/drivers/acpi/utilities/utglobal.c0000664000175000017500000001711012307313555017617 0ustar smbsmb/****************************************************************************** * * Module Name: utglobal - Global variables for the ACPI subsystem * *****************************************************************************/ /* * Copyright (C) 2000 - 2008, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #define DEFINE_ACPI_GLOBALS #include #include #include #include #include ACPI_EXPORT_SYMBOL(acpi_gbl_FADT) #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utglobal") /******************************************************************************* * * FUNCTION: acpi_format_exception * * PARAMETERS: Status - The acpi_status code to be formatted * * RETURN: A string containing the exception text. A valid pointer is * always returned. * * DESCRIPTION: This function translates an ACPI exception into an ASCII string * It is here instead of utxface.c so it is always present. 
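 *              A typical caller simply feeds the returned string to the log,
 *              as the ERST initialization code later in this tree does:
 *
 *                  const char *msg = acpi_format_exception(status);
 *                  printk(KERN_WARNING "Failed to get ERST table: %s\n", msg);
 *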
* ******************************************************************************/ const char *__init acpi_format_exception(acpi_status status) { const char *exception = NULL; ACPI_FUNCTION_ENTRY(); exception = acpi_ut_validate_exception(status); if (!exception) { /* Exception code was not recognized */ ACPI_ERROR((AE_INFO, "Unknown exception code: %#X", status)); exception = "UNKNOWN_STATUS_CODE"; dump_execution_state(); } return (ACPI_CAST_PTR(const char, exception)); } /****************************************************************************** * * Event and Hardware globals * ******************************************************************************/ struct acpi_bit_register_info acpi_gbl_bit_register_info[ACPI_NUM_BITREG] = { /* Name Parent Register Register Bit Position Register Bit Mask */ /* ACPI_BITREG_TIMER_STATUS */ {ACPI_REGISTER_PM1_STATUS, ACPI_BITPOSITION_TIMER_STATUS, ACPI_BITMASK_TIMER_STATUS}, /* ACPI_BITREG_BUS_MASTER_STATUS */ {ACPI_REGISTER_PM1_STATUS, ACPI_BITPOSITION_BUS_MASTER_STATUS, ACPI_BITMASK_BUS_MASTER_STATUS}, /* ACPI_BITREG_GLOBAL_LOCK_STATUS */ {ACPI_REGISTER_PM1_STATUS, ACPI_BITPOSITION_GLOBAL_LOCK_STATUS, ACPI_BITMASK_GLOBAL_LOCK_STATUS}, /* ACPI_BITREG_POWER_BUTTON_STATUS */ {ACPI_REGISTER_PM1_STATUS, ACPI_BITPOSITION_POWER_BUTTON_STATUS, ACPI_BITMASK_POWER_BUTTON_STATUS}, /* ACPI_BITREG_SLEEP_BUTTON_STATUS */ {ACPI_REGISTER_PM1_STATUS, ACPI_BITPOSITION_SLEEP_BUTTON_STATUS, ACPI_BITMASK_SLEEP_BUTTON_STATUS}, /* ACPI_BITREG_RT_CLOCK_STATUS */ {ACPI_REGISTER_PM1_STATUS, ACPI_BITPOSITION_RT_CLOCK_STATUS, ACPI_BITMASK_RT_CLOCK_STATUS}, /* ACPI_BITREG_WAKE_STATUS */ {ACPI_REGISTER_PM1_STATUS, ACPI_BITPOSITION_WAKE_STATUS, ACPI_BITMASK_WAKE_STATUS}, /* ACPI_BITREG_PCIEXP_WAKE_STATUS */ {ACPI_REGISTER_PM1_STATUS, ACPI_BITPOSITION_PCIEXP_WAKE_STATUS, ACPI_BITMASK_PCIEXP_WAKE_STATUS}, /* ACPI_BITREG_TIMER_ENABLE */ {ACPI_REGISTER_PM1_ENABLE, ACPI_BITPOSITION_TIMER_ENABLE, ACPI_BITMASK_TIMER_ENABLE}, /* ACPI_BITREG_GLOBAL_LOCK_ENABLE */ {ACPI_REGISTER_PM1_ENABLE, ACPI_BITPOSITION_GLOBAL_LOCK_ENABLE, ACPI_BITMASK_GLOBAL_LOCK_ENABLE}, /* ACPI_BITREG_POWER_BUTTON_ENABLE */ {ACPI_REGISTER_PM1_ENABLE, ACPI_BITPOSITION_POWER_BUTTON_ENABLE, ACPI_BITMASK_POWER_BUTTON_ENABLE}, /* ACPI_BITREG_SLEEP_BUTTON_ENABLE */ {ACPI_REGISTER_PM1_ENABLE, ACPI_BITPOSITION_SLEEP_BUTTON_ENABLE, ACPI_BITMASK_SLEEP_BUTTON_ENABLE}, /* ACPI_BITREG_RT_CLOCK_ENABLE */ {ACPI_REGISTER_PM1_ENABLE, ACPI_BITPOSITION_RT_CLOCK_ENABLE, ACPI_BITMASK_RT_CLOCK_ENABLE}, /* ACPI_BITREG_WAKE_ENABLE */ {ACPI_REGISTER_PM1_ENABLE, 0, 0}, /* ACPI_BITREG_PCIEXP_WAKE_DISABLE */ {ACPI_REGISTER_PM1_ENABLE, ACPI_BITPOSITION_PCIEXP_WAKE_DISABLE, ACPI_BITMASK_PCIEXP_WAKE_DISABLE}, /* ACPI_BITREG_SCI_ENABLE */ {ACPI_REGISTER_PM1_CONTROL, ACPI_BITPOSITION_SCI_ENABLE, ACPI_BITMASK_SCI_ENABLE}, /* ACPI_BITREG_BUS_MASTER_RLD */ {ACPI_REGISTER_PM1_CONTROL, ACPI_BITPOSITION_BUS_MASTER_RLD, ACPI_BITMASK_BUS_MASTER_RLD}, /* ACPI_BITREG_GLOBAL_LOCK_RELEASE */ {ACPI_REGISTER_PM1_CONTROL, ACPI_BITPOSITION_GLOBAL_LOCK_RELEASE, ACPI_BITMASK_GLOBAL_LOCK_RELEASE}, /* ACPI_BITREG_SLEEP_TYPE_A */ {ACPI_REGISTER_PM1_CONTROL, ACPI_BITPOSITION_SLEEP_TYPE_X, ACPI_BITMASK_SLEEP_TYPE_X}, /* ACPI_BITREG_SLEEP_TYPE_B */ {ACPI_REGISTER_PM1_CONTROL, ACPI_BITPOSITION_SLEEP_TYPE_X, ACPI_BITMASK_SLEEP_TYPE_X}, /* ACPI_BITREG_SLEEP_ENABLE */ {ACPI_REGISTER_PM1_CONTROL, ACPI_BITPOSITION_SLEEP_ENABLE, ACPI_BITMASK_SLEEP_ENABLE}, /* ACPI_BITREG_ARB_DIS */ {ACPI_REGISTER_PM2_CONTROL, ACPI_BITPOSITION_ARB_DISABLE, ACPI_BITMASK_ARB_DISABLE} }; #ifdef 
ACPI_DEBUG_OUTPUT /******************************************************************************* * * FUNCTION: acpi_ut_get_region_name * * PARAMETERS: None. * * RETURN: Status * * DESCRIPTION: Translate a Space ID into a name string (Debug only) * ******************************************************************************/ /* Region type decoding */ static const char *const acpi_gbl_region_types[ACPI_NUM_PREDEFINED_REGIONS] = { "SystemMemory", "SystemIO", "PCI_Config", "EmbeddedControl", "SMBus", "CMOS", "PCIBARTarget", "DataTable" }; const char *acpi_ut_get_region_name(u8 space_id) { if (space_id >= ACPI_USER_REGION_BEGIN) { return ("UserDefinedRegion"); } else if (space_id >= ACPI_NUM_PREDEFINED_REGIONS) { return ("InvalidSpaceId"); } return (ACPI_CAST_PTR(char, acpi_gbl_region_types[space_id])); } #endif xen-4.4.0/xen/drivers/acpi/utilities/utmisc.c0000664000175000017500000001243312307313555017315 0ustar smbsmb/******************************************************************************* * * Module Name: utmisc - common utility procedures * ******************************************************************************/ /* * Copyright (C) 2000 - 2008, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #include #include #include #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utmisc") /******************************************************************************* * * FUNCTION: acpi_ut_validate_exception * * PARAMETERS: Status - The acpi_status code to be formatted * * RETURN: A string containing the exception text. NULL if exception is * not valid. * * DESCRIPTION: This function validates and translates an ACPI exception into * an ASCII string. 
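 *              As the body shows, an acpi_status carries a type in its
 *              AE_CODE_MASK bits and a per-type code in the remaining bits;
 *              the code (minus one for the non-environmental types) indexes
 *              one of the acpi_gbl_exception_names_* tables.  Roughly:
 *
 *                  type = status & AE_CODE_MASK;    (AE_CODE_ENVIRONMENTAL, AE_CODE_AML, ...)
 *                  code = status & ~AE_CODE_MASK;   (index into the name table for that type)
 *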
* ******************************************************************************/ const char *__init acpi_ut_validate_exception(acpi_status status) { acpi_status sub_status; const char *exception = NULL; ACPI_FUNCTION_ENTRY(); /* * Status is composed of two parts, a "type" and an actual code */ sub_status = (status & ~AE_CODE_MASK); switch (status & AE_CODE_MASK) { case AE_CODE_ENVIRONMENTAL: if (sub_status <= AE_CODE_ENV_MAX) { exception = acpi_gbl_exception_names_env[sub_status]; } break; case AE_CODE_PROGRAMMER: if (sub_status <= AE_CODE_PGM_MAX) { exception = acpi_gbl_exception_names_pgm[sub_status - 1]; } break; case AE_CODE_ACPI_TABLES: if (sub_status <= AE_CODE_TBL_MAX) { exception = acpi_gbl_exception_names_tbl[sub_status - 1]; } break; case AE_CODE_AML: if (sub_status <= AE_CODE_AML_MAX) { exception = acpi_gbl_exception_names_aml[sub_status - 1]; } break; case AE_CODE_CONTROL: if (sub_status <= AE_CODE_CTRL_MAX) { exception = acpi_gbl_exception_names_ctrl[sub_status - 1]; } break; default: break; } return (ACPI_CAST_PTR(const char, exception)); } /******************************************************************************* * * FUNCTION: acpi_ut_error, acpi_ut_warning, acpi_ut_info * * PARAMETERS: module_name - Caller's module name (for error output) * line_number - Caller's line number (for error output) * Format - Printf format string + additional args * * RETURN: None * * DESCRIPTION: Print message with module/line/version info * ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE __init acpi_ut_error(const char *module_name, u32 line_number, char *format, ...) { va_list args; acpi_os_printf("ACPI Error (%s-%04d): ", module_name, line_number); va_start(args, format); acpi_os_vprintf(format, args); acpi_os_printf(" [%X]\n", ACPI_CA_VERSION); va_end(args); } void ACPI_INTERNAL_VAR_XFACE __init acpi_ut_warning(const char *module_name, u32 line_number, char *format, ...) { va_list args; acpi_os_printf("ACPI Warning (%s-%04d): ", module_name, line_number); va_start(args, format); acpi_os_vprintf(format, args); acpi_os_printf(" [%X]\n", ACPI_CA_VERSION); va_end(args); va_end(args); } void ACPI_INTERNAL_VAR_XFACE __init acpi_ut_info(const char *module_name, u32 line_number, char *format, ...) { va_list args; /* * Removed module_name, line_number, and acpica version, not needed * for info output */ acpi_os_printf("ACPI: "); va_start(args, format); acpi_os_vprintf(format, args); acpi_os_printf("\n"); va_end(args); } xen-4.4.0/xen/drivers/acpi/apei/0000775000175000017500000000000012307313555014545 5ustar smbsmbxen-4.4.0/xen/drivers/acpi/apei/Makefile0000664000175000017500000000007012307313555016202 0ustar smbsmbobj-y += erst.o obj-y += apei-base.o obj-y += apei-io.o xen-4.4.0/xen/drivers/acpi/apei/apei-base.c0000664000175000017500000001534412307313555016546 0ustar smbsmb/* * apei-base.c - ACPI Platform Error Interface (APEI) supporting * infrastructure * * APEI allows to report errors (for example from the chipset) to the * the operating system. This improves NMI handling especially. In * addition it supports error serialization and error injection. * * For more information about APEI, please refer to ACPI Specification * version 4.0, chapter 17. * * This file has Common functions used by more than one APEI table, * including framework of interpreter for ERST and EINJ; resource * management for APEI registers. * * This feature is ported from linux acpi tree * Copyright (C) 2009, Intel Corp. 
* Author: Huang Ying * Ported by: Liu, Jinsong * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include "apei-internal.h" /* * APEI ERST (Error Record Serialization Table) and EINJ (Error * INJection) interpreter framework. */ #define APEI_EXEC_PRESERVE_REGISTER 0x1 int apei_exec_ctx_init(struct apei_exec_context *ctx, struct apei_exec_ins_type *ins_table, u32 instructions, struct acpi_whea_header *action_table, u32 entries) { if (!ctx) return -EINVAL; ctx->ins_table = ins_table; ctx->instructions = instructions; ctx->action_table = action_table; ctx->entries = entries; return 0; } int __apei_exec_read_register(struct acpi_whea_header *entry, u64 *val) { int rc; rc = apei_read(val, &entry->register_region); if (rc) return rc; *val >>= entry->register_region.bit_offset; *val &= entry->mask; return 0; } int apei_exec_read_register(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { int rc; u64 val = 0; rc = __apei_exec_read_register(entry, &val); if (rc) return rc; ctx->value = val; return 0; } int apei_exec_read_register_value(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { int rc; rc = apei_exec_read_register(ctx, entry); if (rc) return rc; ctx->value = (ctx->value == entry->value); return 0; } int __apei_exec_write_register(struct acpi_whea_header *entry, u64 val) { int rc; val &= entry->mask; val <<= entry->register_region.bit_offset; if (entry->flags & APEI_EXEC_PRESERVE_REGISTER) { u64 valr = 0; rc = apei_read(&valr, &entry->register_region); if (rc) return rc; valr &= ~(entry->mask << entry->register_region.bit_offset); val |= valr; } rc = apei_write(val, &entry->register_region); return rc; } int apei_exec_write_register(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { return __apei_exec_write_register(entry, ctx->value); } int apei_exec_write_register_value(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { int rc; ctx->value = entry->value; rc = apei_exec_write_register(ctx, entry); return rc; } int apei_exec_noop(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { return 0; } /* * Interpret the specified action. Go through whole action table, * execute all instructions belong to the action. */ int __apei_exec_run(struct apei_exec_context *ctx, u8 action, bool_t optional) { int rc = -ENOENT; u32 i, ip; struct acpi_whea_header *entry; apei_exec_ins_func_t run; ctx->ip = 0; /* * "ip" is the instruction pointer of current instruction, * "ctx->ip" specifies the next instruction to executed, * instruction "run" function may change the "ctx->ip" to * implement "goto" semantics. 
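	 * A "run" handler implements a branch by updating "ctx->ip" itself
	 * and returning APEI_EXEC_SET_IP, in which case the loop below skips
	 * the ctx->ip++ step.  The ERST GOTO instruction in erst.c is the
	 * minimal example of such a handler:
	 *
	 *     ctx->ip = ctx->value;
	 *     return APEI_EXEC_SET_IP;
	 *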
*/ rewind: ip = 0; for (i = 0; i < ctx->entries; i++) { entry = &ctx->action_table[i]; if (entry->action != action) continue; if (ip == ctx->ip) { if (entry->instruction >= ctx->instructions || !ctx->ins_table[entry->instruction].run) { printk(KERN_WARNING "Invalid action table, unknown instruction " "type: %d\n", entry->instruction); return -EINVAL; } run = ctx->ins_table[entry->instruction].run; rc = run(ctx, entry); if (rc < 0) return rc; else if (rc != APEI_EXEC_SET_IP) ctx->ip++; } ip++; if (ctx->ip < ip) goto rewind; } return !optional && rc < 0 ? rc : 0; } typedef int (*apei_exec_entry_func_t)(struct apei_exec_context *ctx, struct acpi_whea_header *entry, void *data); static int __init apei_exec_for_each_entry(struct apei_exec_context *ctx, apei_exec_entry_func_t func, void *data, int *end) { u8 ins; int i, rc; struct acpi_whea_header *entry; struct apei_exec_ins_type *ins_table = ctx->ins_table; for (i = 0; i < ctx->entries; i++) { entry = ctx->action_table + i; ins = entry->instruction; if (end) *end = i; if (ins >= ctx->instructions || !ins_table[ins].run) { printk(KERN_WARNING "Invalid action table, " "unknown instruction type: %d\n", ins); return -EINVAL; } rc = func(ctx, entry, data); if (rc) return rc; } return 0; } static int __init pre_map_gar_callback(struct apei_exec_context *ctx, struct acpi_whea_header *entry, void *data) { u8 ins = entry->instruction; if (ctx->ins_table[ins].flags & APEI_EXEC_INS_ACCESS_REGISTER) return apei_pre_map_gar(&entry->register_region); return 0; } /* Pre-map all GARs in action table. */ int __init apei_exec_pre_map_gars(struct apei_exec_context *ctx) { int rc, end; rc = apei_exec_for_each_entry(ctx, pre_map_gar_callback, NULL, &end); if (rc) { struct apei_exec_context ctx_unmap; memcpy(&ctx_unmap, ctx, sizeof(*ctx)); ctx_unmap.entries = end; apei_exec_post_unmap_gars(&ctx_unmap); } return rc; } static int __init post_unmap_gar_callback(struct apei_exec_context *ctx, struct acpi_whea_header *entry, void *data) { u8 ins = entry->instruction; if (ctx->ins_table[ins].flags & APEI_EXEC_INS_ACCESS_REGISTER) apei_post_unmap_gar(&entry->register_region); return 0; } /* Post-unmap all GAR in action table. */ int __init apei_exec_post_unmap_gars(struct apei_exec_context *ctx) { return apei_exec_for_each_entry(ctx, post_unmap_gar_callback, NULL, NULL); } xen-4.4.0/xen/drivers/acpi/apei/erst.c0000664000175000017500000004465012307313555015677 0ustar smbsmb/* * APEI Error Record Serialization Table support * * ERST is a way provided by APEI to save and retrieve hardware error * infomation to and from a persistent store. * * For more information about ERST, please refer to ACPI Specification * version 4.0, section 17.4. * * This feature is ported from linux acpi tree * Copyright 2010 Intel Corp. * Author: Huang Ying * Ported by: Liu, Jinsong * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include "apei-internal.h" /* ERST command status */ #define ERST_STATUS_SUCCESS 0x0 #define ERST_STATUS_NOT_ENOUGH_SPACE 0x1 #define ERST_STATUS_HARDWARE_NOT_AVAILABLE 0x2 #define ERST_STATUS_FAILED 0x3 #define ERST_STATUS_RECORD_STORE_EMPTY 0x4 #define ERST_STATUS_RECORD_NOT_FOUND 0x5 #define ERST_TAB_ENTRY(tab) \ ((struct acpi_whea_header *)((char *)(tab) + \ sizeof(struct acpi_table_erst))) #define SPIN_UNIT 1 /* 1us */ /* Firmware should respond within 1 miliseconds */ #define FIRMWARE_TIMEOUT (1 * 1000) #define FIRMWARE_MAX_STALL 50 /* 50us */ static struct acpi_table_erst *__read_mostly erst_tab; static bool_t __read_mostly erst_enabled; /* ERST Error Log Address Range atrributes */ #define ERST_RANGE_RESERVED 0x0001 #define ERST_RANGE_NVRAM 0x0002 #define ERST_RANGE_SLOW 0x0004 /* * ERST Error Log Address Range, used as buffer for reading/writing * error records. */ static struct erst_erange { u64 base; u64 size; void __iomem *vaddr; u32 attr; } erst_erange; /* * Prevent ERST interpreter to run simultaneously, because the * corresponding firmware implementation may not work properly when * invoked simultaneously. * * It is used to provide exclusive accessing for ERST Error Log * Address Range too. */ static DEFINE_SPINLOCK(erst_lock); static inline int erst_errno(int command_status) { switch (command_status) { case ERST_STATUS_SUCCESS: return 0; case ERST_STATUS_HARDWARE_NOT_AVAILABLE: return -ENODEV; case ERST_STATUS_NOT_ENOUGH_SPACE: return -ENOSPC; case ERST_STATUS_RECORD_STORE_EMPTY: case ERST_STATUS_RECORD_NOT_FOUND: return -ENOENT; default: return -EINVAL; } } static int erst_timedout(u64 *t, u64 spin_unit) { if ((s64)*t < spin_unit) { printk(XENLOG_WARNING "Firmware does not respond in time\n"); return 1; } *t -= spin_unit; udelay(spin_unit); return 0; } static int erst_exec_load_var1(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { return __apei_exec_read_register(entry, &ctx->var1); } static int erst_exec_load_var2(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { return __apei_exec_read_register(entry, &ctx->var2); } static int erst_exec_store_var1(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { return __apei_exec_write_register(entry, ctx->var1); } static int erst_exec_add(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { ctx->var1 += ctx->var2; return 0; } static int erst_exec_subtract(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { ctx->var1 -= ctx->var2; return 0; } static int erst_exec_add_value(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { int rc; u64 val; rc = __apei_exec_read_register(entry, &val); if (rc) return rc; val += ctx->value; rc = __apei_exec_write_register(entry, val); return rc; } static int erst_exec_subtract_value(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { int rc; u64 val; rc = __apei_exec_read_register(entry, &val); if (rc) return rc; val -= ctx->value; rc = __apei_exec_write_register(entry, val); return rc; } static int erst_exec_stall(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { udelay((ctx->var1 > FIRMWARE_MAX_STALL) ? 
FIRMWARE_MAX_STALL : ctx->var1); return 0; } static int erst_exec_stall_while_true(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { int rc; u64 val; u64 timeout = FIRMWARE_TIMEOUT; u64 stall_time = (ctx->var1 > FIRMWARE_MAX_STALL) ? FIRMWARE_MAX_STALL : ctx->var1; for (;;) { rc = __apei_exec_read_register(entry, &val); if (rc) return rc; if (val != ctx->value) break; if (erst_timedout(&timeout, stall_time)) return -EIO; } return 0; } static int erst_exec_skip_next_instruction_if_true( struct apei_exec_context *ctx, struct acpi_whea_header *entry) { int rc; u64 val; rc = __apei_exec_read_register(entry, &val); if (rc) return rc; if (val == ctx->value) { ctx->ip += 2; return APEI_EXEC_SET_IP; } return 0; } static int erst_exec_goto(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { ctx->ip = ctx->value; return APEI_EXEC_SET_IP; } static int erst_exec_set_src_address_base(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { return __apei_exec_read_register(entry, &ctx->src_base); } static int erst_exec_set_dst_address_base(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { return __apei_exec_read_register(entry, &ctx->dst_base); } static int erst_exec_move_data(struct apei_exec_context *ctx, struct acpi_whea_header *entry) { int rc; u64 offset; void *src, *dst; /* ioremap does not work in interrupt context */ if (in_irq()) { printk(KERN_WARNING "MOVE_DATA cannot be used in interrupt context\n"); return -EBUSY; } rc = __apei_exec_read_register(entry, &offset); if (rc) return rc; src = ioremap(ctx->src_base + offset, ctx->var2); if (!src) return -ENOMEM; dst = ioremap(ctx->dst_base + offset, ctx->var2); if (dst) { memmove(dst, src, ctx->var2); iounmap(dst); } else rc = -ENOMEM; iounmap(src); return rc; } static struct apei_exec_ins_type erst_ins_type[] = { [ACPI_ERST_READ_REGISTER] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = apei_exec_read_register, }, [ACPI_ERST_READ_REGISTER_VALUE] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = apei_exec_read_register_value, }, [ACPI_ERST_WRITE_REGISTER] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = apei_exec_write_register, }, [ACPI_ERST_WRITE_REGISTER_VALUE] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = apei_exec_write_register_value, }, [ACPI_ERST_NOOP] = { .flags = 0, .run = apei_exec_noop, }, [ACPI_ERST_LOAD_VAR1] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = erst_exec_load_var1, }, [ACPI_ERST_LOAD_VAR2] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = erst_exec_load_var2, }, [ACPI_ERST_STORE_VAR1] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = erst_exec_store_var1, }, [ACPI_ERST_ADD] = { .flags = 0, .run = erst_exec_add, }, [ACPI_ERST_SUBTRACT] = { .flags = 0, .run = erst_exec_subtract, }, [ACPI_ERST_ADD_VALUE] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = erst_exec_add_value, }, [ACPI_ERST_SUBTRACT_VALUE] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = erst_exec_subtract_value, }, [ACPI_ERST_STALL] = { .flags = 0, .run = erst_exec_stall, }, [ACPI_ERST_STALL_WHILE_TRUE] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = erst_exec_stall_while_true, }, [ACPI_ERST_SKIP_NEXT_IF_TRUE] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = erst_exec_skip_next_instruction_if_true, }, [ACPI_ERST_GOTO] = { .flags = 0, .run = erst_exec_goto, }, [ACPI_ERST_SET_SRC_ADDRESS_BASE] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = erst_exec_set_src_address_base, }, [ACPI_ERST_SET_DST_ADDRESS_BASE] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = 
erst_exec_set_dst_address_base, }, [ACPI_ERST_MOVE_DATA] = { .flags = APEI_EXEC_INS_ACCESS_REGISTER, .run = erst_exec_move_data, }, }; static inline void erst_exec_ctx_init(struct apei_exec_context *ctx) { apei_exec_ctx_init(ctx, erst_ins_type, ARRAY_SIZE(erst_ins_type), ERST_TAB_ENTRY(erst_tab), erst_tab->entries); } static int erst_get_erange(struct erst_erange *range) { struct apei_exec_context ctx; int rc; erst_exec_ctx_init(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_GET_ERROR_RANGE); if (rc) return rc; range->base = apei_exec_ctx_get_output(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_GET_ERROR_LENGTH); if (rc) return rc; range->size = apei_exec_ctx_get_output(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_GET_ERROR_ATTRIBUTES); if (rc) return rc; range->attr = apei_exec_ctx_get_output(&ctx); return 0; } static size_t __erst_get_record_count(void) { struct apei_exec_context ctx; int rc; erst_exec_ctx_init(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_GET_RECORD_COUNT); if (rc) return rc; return apei_exec_ctx_get_output(&ctx); } size_t erst_get_record_count(void) { size_t count; unsigned long flags; if (!erst_enabled) return -ENODEV; spin_lock_irqsave(&erst_lock, flags); count = __erst_get_record_count(); spin_unlock_irqrestore(&erst_lock, flags); return count; } static int __erst_get_next_record_id(u64 *record_id) { struct apei_exec_context ctx; int rc; erst_exec_ctx_init(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_GET_RECORD_ID); if (rc) return rc; *record_id = apei_exec_ctx_get_output(&ctx); return 0; } /* * Get the record ID of an existing error record on the persistent * storage. If there is no error record on the persistent storage, the * returned record_id is APEI_ERST_INVALID_RECORD_ID. */ int erst_get_next_record_id(u64 *record_id) { int rc; unsigned long flags; if (!erst_enabled) return -ENODEV; spin_lock_irqsave(&erst_lock, flags); rc = __erst_get_next_record_id(record_id); spin_unlock_irqrestore(&erst_lock, flags); return rc; } static int __erst_write_to_storage(u64 offset) { struct apei_exec_context ctx; u64 timeout = FIRMWARE_TIMEOUT; u64 val; int rc; erst_exec_ctx_init(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_WRITE); if (rc) return rc; apei_exec_ctx_set_input(&ctx, offset); rc = apei_exec_run(&ctx, ACPI_ERST_SET_RECORD_OFFSET); if (rc) return rc; rc = apei_exec_run(&ctx, ACPI_ERST_EXECUTE_OPERATION); if (rc) return rc; for (;;) { rc = apei_exec_run(&ctx, ACPI_ERST_CHECK_BUSY_STATUS); if (rc) return rc; val = apei_exec_ctx_get_output(&ctx); if (!val) break; if (erst_timedout(&timeout, SPIN_UNIT)) return -EIO; } rc = apei_exec_run(&ctx, ACPI_ERST_GET_COMMAND_STATUS); if (rc) return rc; val = apei_exec_ctx_get_output(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_END); if (rc) return rc; return erst_errno(val); } static int __erst_read_from_storage(u64 record_id, u64 offset) { struct apei_exec_context ctx; u64 timeout = FIRMWARE_TIMEOUT; u64 val; int rc; erst_exec_ctx_init(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_READ); if (rc) return rc; apei_exec_ctx_set_input(&ctx, offset); rc = apei_exec_run(&ctx, ACPI_ERST_SET_RECORD_OFFSET); if (rc) return rc; apei_exec_ctx_set_input(&ctx, record_id); rc = apei_exec_run(&ctx, ACPI_ERST_SET_RECORD_ID); if (rc) return rc; rc = apei_exec_run(&ctx, ACPI_ERST_EXECUTE_OPERATION); if (rc) return rc; for (;;) { rc = apei_exec_run(&ctx, ACPI_ERST_CHECK_BUSY_STATUS); if (rc) return rc; val = apei_exec_ctx_get_output(&ctx); if (!val) break; if (erst_timedout(&timeout, SPIN_UNIT)) return -EIO; }; rc = apei_exec_run(&ctx, ACPI_ERST_GET_COMMAND_STATUS); 
if (rc) return rc; val = apei_exec_ctx_get_output(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_END); if (rc) return rc; return erst_errno(val); } static int __erst_clear_from_storage(u64 record_id) { struct apei_exec_context ctx; u64 timeout = FIRMWARE_TIMEOUT; u64 val; int rc; erst_exec_ctx_init(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_BEGIN_CLEAR); if (rc) return rc; apei_exec_ctx_set_input(&ctx, record_id); rc = apei_exec_run(&ctx, ACPI_ERST_SET_RECORD_ID); if (rc) return rc; rc = apei_exec_run(&ctx, ACPI_ERST_EXECUTE_OPERATION); if (rc) return rc; for (;;) { rc = apei_exec_run(&ctx, ACPI_ERST_CHECK_BUSY_STATUS); if (rc) return rc; val = apei_exec_ctx_get_output(&ctx); if (!val) break; if (erst_timedout(&timeout, SPIN_UNIT)) return -EIO; } rc = apei_exec_run(&ctx, ACPI_ERST_GET_COMMAND_STATUS); if (rc) return rc; val = apei_exec_ctx_get_output(&ctx); rc = apei_exec_run(&ctx, ACPI_ERST_END); if (rc) return rc; return erst_errno(val); } /* NVRAM ERST Error Log Address Range is not supported yet */ static int __erst_write_to_nvram(const struct cper_record_header *record) { /* do not print message, because printk is not safe for NMI */ return -ENOSYS; } static int __erst_read_to_erange_from_nvram(u64 record_id, u64 *offset) { printk(KERN_WARNING "NVRAM ERST Log Address Range is not implemented yet\n"); return -ENOSYS; } static int __erst_clear_from_nvram(u64 record_id) { printk(KERN_WARNING "NVRAM ERST Log Address Range is not implemented yet\n"); return -ENOSYS; } int erst_write(const struct cper_record_header *record) { int rc; unsigned long flags; struct cper_record_header *rcd_erange; if (!record) return -EINVAL; if (!erst_enabled) return -ENODEV; if (memcmp(record->signature, CPER_SIG_RECORD, CPER_SIG_SIZE)) return -EINVAL; if (erst_erange.attr & ERST_RANGE_NVRAM) { if (!spin_trylock_irqsave(&erst_lock, flags)) return -EBUSY; rc = __erst_write_to_nvram(record); spin_unlock_irqrestore(&erst_lock, flags); return rc; } if (record->record_length > erst_erange.size) return -EINVAL; if (!spin_trylock_irqsave(&erst_lock, flags)) return -EBUSY; memcpy(erst_erange.vaddr, record, record->record_length); rcd_erange = erst_erange.vaddr; /* signature for serialization system */ memcpy(&rcd_erange->persistence_information, "ER", 2); rc = __erst_write_to_storage(0); spin_unlock_irqrestore(&erst_lock, flags); return rc; } static int __erst_read_to_erange(u64 record_id, u64 *offset) { int rc; if (erst_erange.attr & ERST_RANGE_NVRAM) return __erst_read_to_erange_from_nvram( record_id, offset); rc = __erst_read_from_storage(record_id, 0); if (rc) return rc; *offset = 0; return 0; } static size_t __erst_read(u64 record_id, struct cper_record_header *record, size_t buflen) { int rc; u64 offset, len = 0; struct cper_record_header *rcd_tmp; rc = __erst_read_to_erange(record_id, &offset); if (rc) return rc; rcd_tmp = erst_erange.vaddr + offset; len = rcd_tmp->record_length; if (len <= buflen) memcpy(record, rcd_tmp, len); return len; } /* * If return value > buflen, the buffer size is not big enough, * else if return value < 0, something goes wrong, * else everything is OK, and return value is record length */ size_t erst_read(u64 record_id, struct cper_record_header *record, size_t buflen) { size_t len; unsigned long flags; if (!erst_enabled) return -ENODEV; spin_lock_irqsave(&erst_lock, flags); len = __erst_read(record_id, record, buflen); spin_unlock_irqrestore(&erst_lock, flags); return len; } /* * If return value > buflen, the buffer size is not big enough, * else if return value = 0, there is no more 
record to read, * else if return value < 0, something goes wrong, * else everything is OK, and return value is record length */ size_t erst_read_next(struct cper_record_header *record, size_t buflen) { int rc; size_t len; unsigned long flags; u64 record_id; if (!erst_enabled) return -ENODEV; spin_lock_irqsave(&erst_lock, flags); rc = __erst_get_next_record_id(&record_id); if (rc) { spin_unlock_irqrestore(&erst_lock, flags); return rc; } /* no more record */ if (record_id == APEI_ERST_INVALID_RECORD_ID) { spin_unlock_irqrestore(&erst_lock, flags); return 0; } len = __erst_read(record_id, record, buflen); spin_unlock_irqrestore(&erst_lock, flags); return len; } int erst_clear(u64 record_id) { int rc; unsigned long flags; if (!erst_enabled) return -ENODEV; spin_lock_irqsave(&erst_lock, flags); if (erst_erange.attr & ERST_RANGE_NVRAM) rc = __erst_clear_from_nvram(record_id); else rc = __erst_clear_from_storage(record_id); spin_unlock_irqrestore(&erst_lock, flags); return rc; } static int __init erst_check_table(struct acpi_table_erst *erst_tab) { if (erst_tab->header.length < sizeof(*erst_tab)) return -EINVAL; switch (erst_tab->header_length) { case sizeof(*erst_tab) - sizeof(erst_tab->header): /* * While invalid per specification, there are (early?) systems * indicating the full header size here, so accept that value too. */ case sizeof(*erst_tab): break; default: return -EINVAL; } if (erst_tab->entries != (erst_tab->header.length - sizeof(*erst_tab)) / sizeof(struct acpi_erst_entry)) return -EINVAL; return 0; } int __init erst_init(void) { int rc = 0; acpi_status status; struct apei_exec_context ctx; if (acpi_disabled) return -ENODEV; status = acpi_get_table(ACPI_SIG_ERST, 0, (struct acpi_table_header **)&erst_tab); if (status == AE_NOT_FOUND) { printk(KERN_INFO "ERST table was not found\n"); return -ENODEV; } else if (ACPI_FAILURE(status)) { const char *msg = acpi_format_exception(status); printk(KERN_WARNING "Failed to get ERST table: %s\n", msg); return -EINVAL; } rc = erst_check_table(erst_tab); if (rc) { printk(KERN_ERR "ERST table is invalid\n"); return rc; } erst_exec_ctx_init(&ctx); rc = apei_exec_pre_map_gars(&ctx); if (rc) return rc; rc = erst_get_erange(&erst_erange); if (rc) { if (rc == -ENODEV) printk(KERN_INFO "The corresponding hardware device or firmware " "implementation is not available.\n"); else printk(KERN_ERR "Failed to get Error Log Address Range.\n"); goto err_unmap_reg; } erst_erange.vaddr = apei_pre_map(erst_erange.base, erst_erange.size); if (!erst_erange.vaddr) { rc = -ENOMEM; goto err_unmap_reg; } printk(KERN_INFO "Xen ERST support is initialized.\n"); erst_enabled = 1; return 0; err_unmap_reg: apei_exec_post_unmap_gars(&ctx); return rc; } xen-4.4.0/xen/drivers/acpi/apei/apei-internal.h0000664000175000017500000000434412307313555017453 0ustar smbsmb/* * apei-internal.h - ACPI Platform Error Interface internal * definations. 
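 * The central object defined here is struct apei_exec_context: a table of
 * interpreter instructions (struct apei_exec_ins_type) bound to the
 * firmware-supplied action table of struct acpi_whea_header entries, plus
 * the state (ip, value, var1/var2, src/dst base addresses) that
 * __apei_exec_run() and the per-instruction helpers operate on.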
*/ #ifndef APEI_INTERNAL_H #define APEI_INTERNAL_H struct apei_exec_context; typedef int (*apei_exec_ins_func_t)(struct apei_exec_context *ctx, struct acpi_whea_header *entry); #define APEI_EXEC_INS_ACCESS_REGISTER 0x0001 struct apei_exec_ins_type { u32 flags; apei_exec_ins_func_t run; }; struct apei_exec_context { u32 ip; u64 value; u64 var1; u64 var2; u64 src_base; u64 dst_base; struct apei_exec_ins_type *ins_table; u32 instructions; struct acpi_whea_header *action_table; u32 entries; }; int apei_exec_ctx_init(struct apei_exec_context *ctx, struct apei_exec_ins_type *ins_table, u32 instructions, struct acpi_whea_header *action_table, u32 entries); static inline void apei_exec_ctx_set_input(struct apei_exec_context *ctx, u64 input) { ctx->value = input; } static inline u64 apei_exec_ctx_get_output(struct apei_exec_context *ctx) { return ctx->value; } int __apei_exec_run(struct apei_exec_context *ctx, u8 action, bool_t optional); static inline int apei_exec_run(struct apei_exec_context *ctx, u8 action) { return __apei_exec_run(ctx, action, 0); } /* It is optional whether the firmware provides the action */ static inline int apei_exec_run_optional(struct apei_exec_context *ctx, u8 action) { return __apei_exec_run(ctx, action, 1); } /* Common instruction implementation */ /* IP has been set in instruction function */ #define APEI_EXEC_SET_IP 1 int __apei_exec_read_register(struct acpi_whea_header *entry, u64 *val); int __apei_exec_write_register(struct acpi_whea_header *entry, u64 val); int apei_exec_read_register(struct apei_exec_context *ctx, struct acpi_whea_header *entry); int apei_exec_read_register_value(struct apei_exec_context *ctx, struct acpi_whea_header *entry); int apei_exec_write_register(struct apei_exec_context *ctx, struct acpi_whea_header *entry); int apei_exec_write_register_value(struct apei_exec_context *ctx, struct acpi_whea_header *entry); int apei_exec_noop(struct apei_exec_context *ctx, struct acpi_whea_header *entry); int apei_exec_pre_map_gars(struct apei_exec_context *ctx); int apei_exec_post_unmap_gars(struct apei_exec_context *ctx); #endif xen-4.4.0/xen/drivers/acpi/apei/apei-io.c0000664000175000017500000001572112307313555016242 0ustar smbsmb/* * apei-io.c - APEI IO memory pre-mapping/post-unmapping and access * * Copyright (C) 2009-2010, Intel Corp. * Author: Huang Ying * Ported by: Liu, Jinsong * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include static LIST_HEAD(apei_iomaps); /* * Used for mutual exclusion between writers of apei_iomaps list, for * synchronization between readers and writer. 
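 * Concretely, apei_pre_map() holds the lock both while probing for an
 * existing mapping and while appending a new struct apei_iomap to the
 * list, and apei_post_unmap() holds it while unlinking the entry it is
 * about to free.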
*/ static DEFINE_SPINLOCK(apei_iomaps_lock); struct apei_iomap { struct list_head list; void __iomem *vaddr; unsigned long size; paddr_t paddr; }; static struct apei_iomap *__apei_find_iomap(paddr_t paddr, unsigned long size) { struct apei_iomap *map; list_for_each_entry(map, &apei_iomaps, list) { if (map->paddr + map->size >= paddr + size && map->paddr <= paddr) return map; } return NULL; } static void __iomem *__apei_ioremap_fast(paddr_t paddr, unsigned long size) { struct apei_iomap *map; map = __apei_find_iomap(paddr, size); if (map) return map->vaddr + (paddr - map->paddr); else return NULL; } static int apei_range_nr; static void __iomem *__init apei_range_map(paddr_t paddr, unsigned long size) { int i, pg; int start_nr, cur_nr; pg = ((((paddr + size -1) & PAGE_MASK) - (paddr & PAGE_MASK)) >> PAGE_SHIFT) + 1; if (apei_range_nr + pg > FIX_APEI_RANGE_MAX) return NULL; start_nr = apei_range_nr + pg -1; for (i = 0; i < pg; i++) { cur_nr = start_nr - i; set_fixmap_nocache(FIX_APEI_RANGE_BASE + cur_nr, paddr + (i << PAGE_SHIFT)); apei_range_nr++; } return (void __iomem *)fix_to_virt(FIX_APEI_RANGE_BASE + start_nr); } /* * Used to pre-map the specified IO memory area. First try to find * whether the area is already pre-mapped, if it is, return; otherwise, * do the real map, and add the mapping into apei_iomaps list. */ void __iomem *__init apei_pre_map(paddr_t paddr, unsigned long size) { void __iomem *vaddr; struct apei_iomap *map; unsigned long flags; spin_lock_irqsave(&apei_iomaps_lock, flags); vaddr = __apei_ioremap_fast(paddr, size); spin_unlock_irqrestore(&apei_iomaps_lock, flags); if (vaddr) return vaddr; map = xmalloc(struct apei_iomap); if (!map) return NULL; vaddr = apei_range_map(paddr, size); if (!vaddr) { xfree(map); return NULL; } INIT_LIST_HEAD(&map->list); map->paddr = paddr & PAGE_MASK; map->size = (((paddr + size + PAGE_SIZE -1) & PAGE_MASK) - (paddr & PAGE_MASK)); map->vaddr = vaddr; spin_lock_irqsave(&apei_iomaps_lock, flags); list_add_tail(&map->list, &apei_iomaps); spin_unlock_irqrestore(&apei_iomaps_lock, flags); return map->vaddr + (paddr - map->paddr); } /* * Used to post-unmap the specified IO memory area. 
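 * Only the bookkeeping entry added by apei_pre_map() is unlinked and
 * freed here; the fixmap slots handed out by apei_range_map() are not
 * reclaimed (apei_range_nr only ever grows).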
*/ static void __init apei_post_unmap(paddr_t paddr, unsigned long size) { struct apei_iomap *map; unsigned long flags; spin_lock_irqsave(&apei_iomaps_lock, flags); map = __apei_find_iomap(paddr, size); if (map) list_del(&map->list); spin_unlock_irqrestore(&apei_iomaps_lock, flags); xfree(map); } /* In NMI handler, should set silent = 1 */ static int apei_check_gar(struct acpi_generic_address *reg, u64 *paddr, int silent) { u32 width, space_id; width = reg->bit_width; space_id = reg->space_id; /* Handle possible alignment issues */ memcpy(paddr, ®->address, sizeof(*paddr)); if (!*paddr) { if (!silent) printk(KERN_WARNING "Invalid physical address in GAR\n"); return -EINVAL; } if ((width != 8) && (width != 16) && (width != 32) && (width != 64)) { if (!silent) printk(KERN_WARNING "Invalid bit width in GAR\n"); return -EINVAL; } if (space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY && space_id != ACPI_ADR_SPACE_SYSTEM_IO) { if (!silent) printk(KERN_WARNING "Invalid address space type in GAR\n"); return -EINVAL; } return 0; } /* Pre-map, working on GAR */ int __init apei_pre_map_gar(struct acpi_generic_address *reg) { u64 paddr; void __iomem *vaddr; int rc; if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) return 0; rc = apei_check_gar(reg, &paddr, 0); if (rc) return rc; vaddr = apei_pre_map(paddr, reg->bit_width / 8); if (!vaddr) return -EIO; return 0; } /* Post-unmap, working on GAR */ int __init apei_post_unmap_gar(struct acpi_generic_address *reg) { u64 paddr; int rc; if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) return 0; rc = apei_check_gar(reg, &paddr, 0); if (rc) return rc; apei_post_unmap(paddr, reg->bit_width / 8); return 0; } static int apei_read_mem(u64 paddr, u64 *val, u32 width) { void __iomem *addr; u64 tmpval; addr = __apei_ioremap_fast(paddr, width); switch (width) { case 8: *val = readb(addr); break; case 16: *val = readw(addr); break; case 32: *val = readl(addr); break; case 64: tmpval = (u64)readl(addr); tmpval |= ((u64)readl(addr+4)) << 32; *val = tmpval; break; default: return -EINVAL; } return 0; } static int apei_write_mem(u64 paddr, u64 val, u32 width) { void __iomem *addr; u32 tmpval; addr = __apei_ioremap_fast(paddr, width); switch (width) { case 8: writeb(val, addr); break; case 16: writew(val, addr); break; case 32: writel(val, addr); break; case 64: tmpval = (u32)val; writel(tmpval, addr); tmpval = (u32)(val >> 32); writel(tmpval, addr+4); break; default: return -EINVAL; } return 0; } int apei_read(u64 *val, struct acpi_generic_address *reg) { u64 paddr; int rc; rc = apei_check_gar(reg, &paddr, 1); if (rc) return rc; *val = 0; /* currently all erst implementation take bit_width as real range */ switch (reg->space_id) { case ACPI_ADR_SPACE_SYSTEM_MEMORY: return apei_read_mem(paddr, val, reg->bit_width); case ACPI_ADR_SPACE_SYSTEM_IO: return acpi_os_read_port(paddr, (u32 *)val, reg->bit_width); default: return -EINVAL; } } int apei_write(u64 val, struct acpi_generic_address *reg) { u64 paddr; int rc; rc = apei_check_gar(reg, &paddr, 1); if (rc) return rc; switch (reg->space_id) { case ACPI_ADR_SPACE_SYSTEM_MEMORY: return apei_write_mem(paddr, val, reg->bit_width); case ACPI_ADR_SPACE_SYSTEM_IO: return acpi_os_write_port(paddr, val, reg->bit_width); default: return -EINVAL; } } xen-4.4.0/xen/drivers/acpi/pmstat.c0000664000175000017500000003425012307313555015307 0ustar smbsmb/***************************************************************************** # pmstat.c - Power Management statistic information (Px/Cx/Tx, etc.) 
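#
# This file backs the power-management sysctl paths: do_get_pm_info() returns
# Px/Cx statistics, do_pm_op() implements the cpufreq get/set operations and
# related knobs, and acpi_set_pdc_bits() sanitises caller-supplied _PDC bits
# against xen_processor_pmbits.  Note the GET_CPUFREQ_PARA handshake below:
# if the caller's cpu_num/freq_num/gov_num do not match the hypervisor's
# counts, the real counts are written back and the call fails with -EAGAIN,
# so the caller is expected to resize its buffers and retry.
#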
# # Copyright (c) 2008, Liu Jinsong # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the Free # Software Foundation; either version 2 of the License, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # more details. # # You should have received a copy of the GNU General Public License along with # this program; if not, write to the Free Software Foundation, Inc., 59 # Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # The full GNU General Public License is included in this distribution in the # file called LICENSE. # *****************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include DEFINE_PER_CPU_READ_MOSTLY(struct pm_px *, cpufreq_statistic_data); /* * Get PM statistic info */ int do_get_pm_info(struct xen_sysctl_get_pmstat *op) { int ret = 0; const struct processor_pminfo *pmpt; if ( !op || (op->cpuid >= nr_cpu_ids) || !cpu_online(op->cpuid) ) return -EINVAL; pmpt = processor_pminfo[op->cpuid]; switch ( op->type & PMSTAT_CATEGORY_MASK ) { case PMSTAT_CX: if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_CX) ) return -ENODEV; break; case PMSTAT_PX: if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_PX) ) return -ENODEV; if ( !cpufreq_driver ) return -ENODEV; if ( !pmpt || !(pmpt->perf.init & XEN_PX_INIT) ) return -EINVAL; break; default: return -ENODEV; } switch ( op->type ) { case PMSTAT_get_max_px: { op->u.getpx.total = pmpt->perf.state_count; break; } case PMSTAT_get_pxstat: { uint32_t ct; struct pm_px *pxpt; spinlock_t *cpufreq_statistic_lock = &per_cpu(cpufreq_statistic_lock, op->cpuid); spin_lock(cpufreq_statistic_lock); pxpt = per_cpu(cpufreq_statistic_data, op->cpuid); if ( !pxpt || !pxpt->u.pt || !pxpt->u.trans_pt ) { spin_unlock(cpufreq_statistic_lock); return -ENODATA; } pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.platform_limit; cpufreq_residency_update(op->cpuid, pxpt->u.cur); ct = pmpt->perf.state_count; if ( copy_to_guest(op->u.getpx.trans_pt, pxpt->u.trans_pt, ct*ct) ) { spin_unlock(cpufreq_statistic_lock); ret = -EFAULT; break; } if ( copy_to_guest(op->u.getpx.pt, pxpt->u.pt, ct) ) { spin_unlock(cpufreq_statistic_lock); ret = -EFAULT; break; } op->u.getpx.total = pxpt->u.total; op->u.getpx.usable = pxpt->u.usable; op->u.getpx.last = pxpt->u.last; op->u.getpx.cur = pxpt->u.cur; spin_unlock(cpufreq_statistic_lock); break; } case PMSTAT_reset_pxstat: { cpufreq_statistic_reset(op->cpuid); break; } case PMSTAT_get_max_cx: { op->u.getcx.nr = pmstat_get_cx_nr(op->cpuid); ret = 0; break; } case PMSTAT_get_cxstat: { ret = pmstat_get_cx_stat(op->cpuid, &op->u.getcx); break; } case PMSTAT_reset_cxstat: { ret = pmstat_reset_cx_stat(op->cpuid); break; } default: printk("not defined sub-hypercall @ do_get_pm_info\n"); ret = -ENOSYS; break; } return ret; } /* * 1. Get PM parameter * 2. 
Provide user PM control */ static int read_scaling_available_governors(char *scaling_available_governors, unsigned int size) { unsigned int i = 0; struct cpufreq_governor *t; if ( !scaling_available_governors ) return -EINVAL; list_for_each_entry(t, &cpufreq_governor_list, governor_list) { i += scnprintf(&scaling_available_governors[i], CPUFREQ_NAME_LEN, "%s ", t->name); if ( i > size ) return -EINVAL; } scaling_available_governors[i-1] = '\0'; return 0; } static int get_cpufreq_para(struct xen_sysctl_pm_op *op) { uint32_t ret = 0; const struct processor_pminfo *pmpt; struct cpufreq_policy *policy; uint32_t gov_num = 0; uint32_t *affected_cpus; uint32_t *scaling_available_frequencies; char *scaling_available_governors; struct list_head *pos; uint32_t cpu, i, j = 0; pmpt = processor_pminfo[op->cpuid]; policy = per_cpu(cpufreq_cpu_policy, op->cpuid); if ( !pmpt || !pmpt->perf.states || !policy || !policy->governor ) return -EINVAL; list_for_each(pos, &cpufreq_governor_list) gov_num++; if ( (op->u.get_para.cpu_num != cpumask_weight(policy->cpus)) || (op->u.get_para.freq_num != pmpt->perf.state_count) || (op->u.get_para.gov_num != gov_num) ) { op->u.get_para.cpu_num = cpumask_weight(policy->cpus); op->u.get_para.freq_num = pmpt->perf.state_count; op->u.get_para.gov_num = gov_num; return -EAGAIN; } if ( !(affected_cpus = xzalloc_array(uint32_t, op->u.get_para.cpu_num)) ) return -ENOMEM; for_each_cpu(cpu, policy->cpus) affected_cpus[j++] = cpu; ret = copy_to_guest(op->u.get_para.affected_cpus, affected_cpus, op->u.get_para.cpu_num); xfree(affected_cpus); if ( ret ) return ret; if ( !(scaling_available_frequencies = xzalloc_array(uint32_t, op->u.get_para.freq_num)) ) return -ENOMEM; for ( i = 0; i < op->u.get_para.freq_num; i++ ) scaling_available_frequencies[i] = pmpt->perf.states[i].core_frequency * 1000; ret = copy_to_guest(op->u.get_para.scaling_available_frequencies, scaling_available_frequencies, op->u.get_para.freq_num); xfree(scaling_available_frequencies); if ( ret ) return ret; if ( !(scaling_available_governors = xzalloc_array(char, gov_num * CPUFREQ_NAME_LEN)) ) return -ENOMEM; if ( (ret = read_scaling_available_governors(scaling_available_governors, gov_num * CPUFREQ_NAME_LEN * sizeof(char))) ) { xfree(scaling_available_governors); return ret; } ret = copy_to_guest(op->u.get_para.scaling_available_governors, scaling_available_governors, gov_num * CPUFREQ_NAME_LEN); xfree(scaling_available_governors); if ( ret ) return ret; op->u.get_para.cpuinfo_cur_freq = cpufreq_driver->get ? 
cpufreq_driver->get(op->cpuid) : policy->cur; op->u.get_para.cpuinfo_max_freq = policy->cpuinfo.max_freq; op->u.get_para.cpuinfo_min_freq = policy->cpuinfo.min_freq; op->u.get_para.scaling_cur_freq = policy->cur; op->u.get_para.scaling_max_freq = policy->max; op->u.get_para.scaling_min_freq = policy->min; if ( cpufreq_driver->name[0] ) strlcpy(op->u.get_para.scaling_driver, cpufreq_driver->name, CPUFREQ_NAME_LEN); else strlcpy(op->u.get_para.scaling_driver, "Unknown", CPUFREQ_NAME_LEN); if ( policy->governor->name[0] ) strlcpy(op->u.get_para.scaling_governor, policy->governor->name, CPUFREQ_NAME_LEN); else strlcpy(op->u.get_para.scaling_governor, "Unknown", CPUFREQ_NAME_LEN); /* governor specific para */ if ( !strnicmp(op->u.get_para.scaling_governor, "userspace", CPUFREQ_NAME_LEN) ) { op->u.get_para.u.userspace.scaling_setspeed = policy->cur; } if ( !strnicmp(op->u.get_para.scaling_governor, "ondemand", CPUFREQ_NAME_LEN) ) { ret = get_cpufreq_ondemand_para( &op->u.get_para.u.ondemand.sampling_rate_max, &op->u.get_para.u.ondemand.sampling_rate_min, &op->u.get_para.u.ondemand.sampling_rate, &op->u.get_para.u.ondemand.up_threshold); } op->u.get_para.turbo_enabled = cpufreq_get_turbo_status(op->cpuid); return ret; } static int set_cpufreq_gov(struct xen_sysctl_pm_op *op) { struct cpufreq_policy new_policy, *old_policy; old_policy = per_cpu(cpufreq_cpu_policy, op->cpuid); if ( !old_policy ) return -EINVAL; memcpy(&new_policy, old_policy, sizeof(struct cpufreq_policy)); new_policy.governor = __find_governor(op->u.set_gov.scaling_governor); if (new_policy.governor == NULL) return -EINVAL; return __cpufreq_set_policy(old_policy, &new_policy); } static int set_cpufreq_para(struct xen_sysctl_pm_op *op) { int ret = 0; struct cpufreq_policy *policy; policy = per_cpu(cpufreq_cpu_policy, op->cpuid); if ( !policy || !policy->governor ) return -EINVAL; switch(op->u.set_para.ctrl_type) { case SCALING_MAX_FREQ: { struct cpufreq_policy new_policy; memcpy(&new_policy, policy, sizeof(struct cpufreq_policy)); new_policy.max = op->u.set_para.ctrl_value; ret = __cpufreq_set_policy(policy, &new_policy); break; } case SCALING_MIN_FREQ: { struct cpufreq_policy new_policy; memcpy(&new_policy, policy, sizeof(struct cpufreq_policy)); new_policy.min = op->u.set_para.ctrl_value; ret = __cpufreq_set_policy(policy, &new_policy); break; } case SCALING_SETSPEED: { unsigned int freq =op->u.set_para.ctrl_value; if ( !strnicmp(policy->governor->name, "userspace", CPUFREQ_NAME_LEN) ) ret = write_userspace_scaling_setspeed(op->cpuid, freq); else ret = -EINVAL; break; } case SAMPLING_RATE: { unsigned int sampling_rate = op->u.set_para.ctrl_value; if ( !strnicmp(policy->governor->name, "ondemand", CPUFREQ_NAME_LEN) ) ret = write_ondemand_sampling_rate(sampling_rate); else ret = -EINVAL; break; } case UP_THRESHOLD: { unsigned int up_threshold = op->u.set_para.ctrl_value; if ( !strnicmp(policy->governor->name, "ondemand", CPUFREQ_NAME_LEN) ) ret = write_ondemand_up_threshold(up_threshold); else ret = -EINVAL; break; } default: ret = -EINVAL; break; } return ret; } int do_pm_op(struct xen_sysctl_pm_op *op) { int ret = 0; const struct processor_pminfo *pmpt; if ( !op || op->cpuid >= nr_cpu_ids || !cpu_online(op->cpuid) ) return -EINVAL; pmpt = processor_pminfo[op->cpuid]; switch ( op->cmd & PM_PARA_CATEGORY_MASK ) { case CPUFREQ_PARA: if ( !(xen_processor_pmbits & XEN_PROCESSOR_PM_PX) ) return -ENODEV; if ( !pmpt || !(pmpt->perf.init & XEN_PX_INIT) ) return -EINVAL; break; } switch ( op->cmd ) { case GET_CPUFREQ_PARA: { ret = 
get_cpufreq_para(op); break; } case SET_CPUFREQ_GOV: { ret = set_cpufreq_gov(op); break; } case SET_CPUFREQ_PARA: { ret = set_cpufreq_para(op); break; } case GET_CPUFREQ_AVGFREQ: { op->u.get_avgfreq = cpufreq_driver_getavg(op->cpuid, USR_GETAVG); break; } case XEN_SYSCTL_pm_op_set_sched_opt_smt: { uint32_t saved_value; saved_value = sched_smt_power_savings; sched_smt_power_savings = !!op->u.set_sched_opt_smt; op->u.set_sched_opt_smt = saved_value; break; } case XEN_SYSCTL_pm_op_set_vcpu_migration_delay: { set_vcpu_migration_delay(op->u.set_vcpu_migration_delay); break; } case XEN_SYSCTL_pm_op_get_vcpu_migration_delay: { op->u.get_vcpu_migration_delay = get_vcpu_migration_delay(); break; } case XEN_SYSCTL_pm_op_get_max_cstate: { op->u.get_max_cstate = acpi_get_cstate_limit(); break; } case XEN_SYSCTL_pm_op_set_max_cstate: { acpi_set_cstate_limit(op->u.set_max_cstate); break; } case XEN_SYSCTL_pm_op_enable_turbo: { ret = cpufreq_update_turbo(op->cpuid, CPUFREQ_TURBO_ENABLED); break; } case XEN_SYSCTL_pm_op_disable_turbo: { ret = cpufreq_update_turbo(op->cpuid, CPUFREQ_TURBO_DISABLED); break; } default: printk("not defined sub-hypercall @ do_pm_op\n"); ret = -ENOSYS; break; } return ret; } int acpi_set_pdc_bits(u32 acpi_id, XEN_GUEST_HANDLE_PARAM(uint32) pdc) { u32 bits[3]; int ret; if ( copy_from_guest(bits, pdc, 2) ) ret = -EFAULT; else if ( bits[0] != ACPI_PDC_REVISION_ID || !bits[1] ) ret = -EINVAL; else if ( copy_from_guest_offset(bits + 2, pdc, 2, 1) ) ret = -EFAULT; else { u32 mask = 0; if ( xen_processor_pmbits & XEN_PROCESSOR_PM_CX ) mask |= ACPI_PDC_C_MASK | ACPI_PDC_SMP_C1PT; if ( xen_processor_pmbits & XEN_PROCESSOR_PM_PX ) mask |= ACPI_PDC_P_MASK | ACPI_PDC_SMP_C1PT; if ( xen_processor_pmbits & XEN_PROCESSOR_PM_TX ) mask |= ACPI_PDC_T_MASK | ACPI_PDC_SMP_C1PT; bits[2] &= (ACPI_PDC_C_MASK | ACPI_PDC_P_MASK | ACPI_PDC_T_MASK | ACPI_PDC_SMP_C1PT) & ~mask; ret = arch_acpi_set_pdc_bits(acpi_id, bits, mask); } if ( !ret && __copy_to_guest_offset(pdc, 2, bits + 2, 1) ) ret = -EFAULT; return ret; } xen-4.4.0/xen/drivers/acpi/tables/0000775000175000017500000000000012307313555015101 5ustar smbsmbxen-4.4.0/xen/drivers/acpi/tables/Makefile0000664000175000017500000000020412307313555016535 0ustar smbsmbobj-bin-y += tbfadt.init.o obj-bin-y += tbinstal.init.o obj-y += tbutils.o obj-bin-y += tbxface.init.o obj-bin-y += tbxfroot.init.o xen-4.4.0/xen/drivers/acpi/tables/tbxface.c0000664000175000017500000002054112307313555016663 0ustar smbsmb/****************************************************************************** * * Module Name: tbxface - Public interfaces to the ACPI subsystem * ACPI table oriented interfaces * *****************************************************************************/ /* * Copyright (C) 2000 - 2008, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. 
Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #include #include #include #include #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbxface") /******************************************************************************* * * FUNCTION: acpi_allocate_root_table * * PARAMETERS: initial_table_count - Size of initial_table_array, in number of * struct acpi_table_desc structures * * RETURN: Status * * DESCRIPTION: Allocate a root table array. Used by i_aSL compiler and * acpi_initialize_tables. * ******************************************************************************/ acpi_status __init acpi_allocate_root_table(u32 initial_table_count) { acpi_gbl_root_table_list.size = initial_table_count - ACPI_ROOT_TABLE_SIZE_INCREMENT; acpi_gbl_root_table_list.flags = ACPI_ROOT_ALLOW_RESIZE; return (acpi_tb_resize_root_table_list()); } /******************************************************************************* * * FUNCTION: acpi_initialize_tables * * PARAMETERS: initial_table_array - Pointer to an array of pre-allocated * struct acpi_table_desc structures. If NULL, the * array is dynamically allocated. * initial_table_count - Size of initial_table_array, in number of * struct acpi_table_desc structures * allow_realloc - Flag to tell Table Manager if resize of * pre-allocated array is allowed. Ignored * if initial_table_array is NULL. * * RETURN: Status * * DESCRIPTION: Initialize the table manager, get the RSDP and RSDT/XSDT. * * NOTE: Allows static allocation of the initial table array in order * to avoid the use of dynamic memory in confined environments * such as the kernel boot sequence where it may not be available. * * If the host OS memory managers are initialized, use NULL for * initial_table_array, and the table will be dynamically allocated. 
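 *
 *              Illustrative usage (a hypothetical early-boot caller; the
 *              array name, the size of 32 entries and the panic() call are
 *              assumptions, not taken from this source):
 *
 *                  static struct acpi_table_desc boot_tables[32] __initdata;
 *
 *                  if (ACPI_FAILURE(acpi_initialize_tables(boot_tables, 32, 1)))
 *                      panic("ACPI: cannot initialize table manager\n");
 *
 *              Passing NULL for initial_table_array instead makes the code
 *              below call acpi_allocate_root_table() and size the root
 *              table array dynamically.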
* ******************************************************************************/ acpi_status __init acpi_initialize_tables(struct acpi_table_desc * initial_table_array, u32 initial_table_count, u8 allow_resize) { acpi_physical_address rsdp_address; acpi_status status; ACPI_FUNCTION_TRACE(acpi_initialize_tables); /* * Set up the Root Table Array * Allocate the table array if requested */ if (!initial_table_array) { status = acpi_allocate_root_table(initial_table_count); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } } else { /* Root Table Array has been statically allocated by the host */ ACPI_MEMSET(initial_table_array, 0, initial_table_count * sizeof(struct acpi_table_desc)); acpi_gbl_root_table_list.tables = initial_table_array; acpi_gbl_root_table_list.size = initial_table_count; acpi_gbl_root_table_list.flags = ACPI_ROOT_ORIGIN_UNKNOWN; if (allow_resize) { acpi_gbl_root_table_list.flags |= ACPI_ROOT_ALLOW_RESIZE; } } /* Get the address of the RSDP */ rsdp_address = acpi_os_get_root_pointer(); if (!rsdp_address) { return_ACPI_STATUS(AE_NOT_FOUND); } /* * Get the root table (RSDT or XSDT) and extract all entries to the local * Root Table Array. This array contains the information of the RSDT/XSDT * in a common, more useable format. */ status = acpi_tb_parse_root_table(rsdp_address, ACPI_TABLE_ORIGIN_MAPPED); return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_get_table * * PARAMETERS: Signature - ACPI signature of needed table * Instance - Which instance (for SSDTs) * out_table - Where the pointer to the table is returned * * RETURN: Status and pointer to table * * DESCRIPTION: Finds and verifies an ACPI table. * *****************************************************************************/ acpi_status __init acpi_get_table(char *signature, acpi_native_uint instance, struct acpi_table_header **out_table) { acpi_native_uint i; acpi_native_uint j; acpi_status status; /* Parameter validation */ if (!signature || !out_table) { return (AE_BAD_PARAMETER); } /* * Walk the root table list */ for (i = 0, j = 0; i < acpi_gbl_root_table_list.count; i++) { if (!ACPI_COMPARE_NAME (&(acpi_gbl_root_table_list.tables[i].signature), signature)) { continue; } if (++j < instance) { continue; } status = acpi_tb_verify_table(&acpi_gbl_root_table_list.tables[i]); if (ACPI_SUCCESS(status)) { *out_table = acpi_gbl_root_table_list.tables[i].pointer; } acpi_gbl_root_table_list.tables[i].pointer = NULL; return (status); } return (AE_NOT_FOUND); } /****************************************************************************** * * FUNCTION: acpi_get_table_phys * * PARAMETERS: signature - ACPI signature of needed table * instance - Which instance (for SSDTs) * addr - Where the table's physical address is returned * len - Where the length of table is returned * * RETURN: Status, pointer and length of table * * DESCRIPTION: Finds physical address and length of ACPI table * *****************************************************************************/ acpi_status __init acpi_get_table_phys(acpi_string signature, acpi_native_uint instance, acpi_physical_address *addr, acpi_native_uint *len) { acpi_native_uint i, j; acpi_status status; if (!signature || !addr || !len) return AE_BAD_PARAMETER; for (i = j = 0; i < acpi_gbl_root_table_list.count; i++) { if (!ACPI_COMPARE_NAME( &acpi_gbl_root_table_list.tables[i].signature, signature)) continue; if (++j < instance) continue; status = 
acpi_tb_verify_table(&acpi_gbl_root_table_list.tables[i]); if (ACPI_SUCCESS(status)) { *addr = acpi_gbl_root_table_list.tables[i].address; *len = acpi_gbl_root_table_list.tables[i].length; } acpi_gbl_root_table_list.tables[i].pointer = NULL; return status; } return AE_NOT_FOUND; } xen-4.4.0/xen/drivers/acpi/tables/tbfadt.c0000664000175000017500000003653312307313555016523 0ustar smbsmb/****************************************************************************** * * Module Name: tbfadt - FADT table utilities * *****************************************************************************/ /* * Copyright (C) 2000 - 2008, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. 
*/ #include #include #include #include #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbfadt") /* Local prototypes */ static void inline acpi_tb_init_generic_address(struct acpi_generic_address *generic_address, u8 bit_width, u64 address); static void acpi_tb_convert_fadt(void); static void acpi_tb_validate_fadt(void); /* Table for conversion of FADT to common internal format and FADT validation */ typedef struct acpi_fadt_info { char *name; u16 target; u16 source; u16 length; u8 type; } acpi_fadt_info; #define ACPI_FADT_OPTIONAL 0 #define ACPI_FADT_REQUIRED 1 #define ACPI_FADT_SEPARATE_LENGTH 2 static struct acpi_fadt_info __initdata fadt_info_table[] = { {"Pm1aEventBlock", ACPI_FADT_OFFSET(xpm1a_event_block), ACPI_FADT_OFFSET(pm1a_event_block), ACPI_FADT_OFFSET(pm1_event_length), ACPI_FADT_REQUIRED}, {"Pm1bEventBlock", ACPI_FADT_OFFSET(xpm1b_event_block), ACPI_FADT_OFFSET(pm1b_event_block), ACPI_FADT_OFFSET(pm1_event_length), ACPI_FADT_OPTIONAL}, {"Pm1aControlBlock", ACPI_FADT_OFFSET(xpm1a_control_block), ACPI_FADT_OFFSET(pm1a_control_block), ACPI_FADT_OFFSET(pm1_control_length), ACPI_FADT_REQUIRED}, {"Pm1bControlBlock", ACPI_FADT_OFFSET(xpm1b_control_block), ACPI_FADT_OFFSET(pm1b_control_block), ACPI_FADT_OFFSET(pm1_control_length), ACPI_FADT_OPTIONAL}, {"Pm2ControlBlock", ACPI_FADT_OFFSET(xpm2_control_block), ACPI_FADT_OFFSET(pm2_control_block), ACPI_FADT_OFFSET(pm2_control_length), ACPI_FADT_SEPARATE_LENGTH}, {"PmTimerBlock", ACPI_FADT_OFFSET(xpm_timer_block), ACPI_FADT_OFFSET(pm_timer_block), ACPI_FADT_OFFSET(pm_timer_length), ACPI_FADT_REQUIRED}, {"Gpe0Block", ACPI_FADT_OFFSET(xgpe0_block), ACPI_FADT_OFFSET(gpe0_block), ACPI_FADT_OFFSET(gpe0_block_length), ACPI_FADT_SEPARATE_LENGTH}, {"Gpe1Block", ACPI_FADT_OFFSET(xgpe1_block), ACPI_FADT_OFFSET(gpe1_block), ACPI_FADT_OFFSET(gpe1_block_length), ACPI_FADT_SEPARATE_LENGTH} }; #define ACPI_FADT_INFO_ENTRIES (sizeof (fadt_info_table) / sizeof (struct acpi_fadt_info)) /******************************************************************************* * * FUNCTION: acpi_tb_init_generic_address * * PARAMETERS: generic_address - GAS struct to be initialized * bit_width - Width of this register * Address - Address of the register * * RETURN: None * * DESCRIPTION: Initialize a Generic Address Structure (GAS) * See the ACPI specification for a full description and * definition of this structure. * ******************************************************************************/ static void inline acpi_tb_init_generic_address(struct acpi_generic_address *generic_address, u8 bit_width, u64 address) { /* * The 64-bit Address field is non-aligned in the byte packed * GAS struct. */ ACPI_MOVE_64_TO_64(&generic_address->address, &address); /* All other fields are byte-wide */ generic_address->space_id = ACPI_ADR_SPACE_SYSTEM_IO; generic_address->bit_width = bit_width; generic_address->bit_offset = 0; generic_address->access_width = 0; } /******************************************************************************* * * FUNCTION: acpi_tb_parse_fadt * * PARAMETERS: table_index - Index for the FADT * Flags - Flags * * RETURN: None * * DESCRIPTION: Initialize the FADT, DSDT and FACS tables * (FADT contains the addresses of the DSDT and FACS) * ******************************************************************************/ void __init acpi_tb_parse_fadt(acpi_native_uint table_index, u8 flags) { u32 length; struct acpi_table_header *table; /* * The FADT has multiple versions with different lengths, * and it contains pointers to both the DSDT and FACS tables. 
* * Get a local copy of the FADT and convert it to a common format * Map entire FADT, assumed to be smaller than one page. */ length = acpi_gbl_root_table_list.tables[table_index].length; table = acpi_os_map_memory(acpi_gbl_root_table_list.tables[table_index]. address, length); if (!table) { return; } /* * Validate the FADT checksum before we copy the table. Ignore * checksum error as we want to try to get the DSDT and FACS. */ (void)acpi_tb_verify_checksum(table, length); /* Obtain a local copy of the FADT in common ACPI 2.0+ format */ acpi_tb_create_local_fadt(table, length); /* All done with the real FADT, unmap it */ acpi_os_unmap_memory(table, length); /* Obtain the DSDT and FACS tables via their addresses within the FADT */ acpi_tb_install_table((acpi_physical_address) acpi_gbl_FADT.Xdsdt, flags, ACPI_SIG_DSDT, ACPI_TABLE_INDEX_DSDT); /* If Hardware Reduced flag is set, there is no FACS */ if (!acpi_gbl_reduced_hardware) { acpi_tb_install_table((acpi_physical_address) acpi_gbl_FADT. Xfacs, flags, ACPI_SIG_FACS, ACPI_TABLE_INDEX_FACS); } } /******************************************************************************* * * FUNCTION: acpi_tb_create_local_fadt * * PARAMETERS: Table - Pointer to BIOS FADT * Length - Length of the table * * RETURN: None * * DESCRIPTION: Get a local copy of the FADT and convert it to a common format. * Performs validation on some important FADT fields. * * NOTE: We create a local copy of the FADT regardless of the version. * ******************************************************************************/ void __init acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length) { /* * Check if the FADT is larger than the largest table that we expect * (the ACPI 5.0 version). If so, truncate the table, and issue * a warning. */ if (length > sizeof(struct acpi_table_fadt)) { ACPI_WARNING((AE_INFO, "FADT (revision %u) is longer than ACPI 5.0 version," " truncating length %u to %zu", table->revision, (unsigned)length, sizeof(struct acpi_table_fadt))); } /* Clear the entire local FADT */ ACPI_MEMSET(&acpi_gbl_FADT, 0, sizeof(struct acpi_table_fadt)); /* Copy the original FADT, up to sizeof (struct acpi_table_fadt) */ ACPI_MEMCPY(&acpi_gbl_FADT, table, ACPI_MIN(length, sizeof(struct acpi_table_fadt))); /* Take a copy of the Hardware Reduced flag */ acpi_gbl_reduced_hardware = FALSE; if (acpi_gbl_FADT.flags & ACPI_FADT_HW_REDUCED) { acpi_gbl_reduced_hardware = TRUE; } /* * 1) Convert the local copy of the FADT to the common internal format * 2) Validate some of the important values within the FADT */ acpi_tb_convert_fadt(); acpi_tb_validate_fadt(); } /******************************************************************************* * * FUNCTION: acpi_tb_convert_fadt * * PARAMETERS: None, uses acpi_gbl_FADT * * RETURN: None * * DESCRIPTION: Converts all versions of the FADT to a common internal format. * Expand all 32-bit addresses to 64-bit. * * NOTE: acpi_gbl_FADT must be of size (struct acpi_table_fadt), * and must contain a copy of the actual FADT. * * ACPICA will use the "X" fields of the FADT for all addresses. * * "X" fields are optional extensions to the original V1.0 fields. Even if * they are present in the structure, they can be optionally not used by * setting them to zero. Therefore, we must selectively expand V1.0 fields * if the corresponding X field is zero. * * For ACPI 1.0 FADTs, all address fields are expanded to the corresponding * "X" fields. 
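 *
 * Worked example (register values are hypothetical): an ACPI 1.0 FADT that
 * reports pm1a_event_block = 0x400 while xpm1a_event_block.address == 0
 * gets a generated xpm1a_event_block GAS whose address is 0x400, whose
 * space_id is SYSTEM_IO, and whose width is taken from pm1_event_length.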
* * For ACPI 2.0 FADTs, any "X" fields that are NULL are filled in by * expanding the corresponding ACPI 1.0 field. * ******************************************************************************/ static void __init acpi_tb_convert_fadt(void) { u8 pm1_register_length; struct acpi_generic_address *target; acpi_native_uint i; /* Update the local FADT table header length */ acpi_gbl_FADT.header.length = sizeof(struct acpi_table_fadt); /* Expand the 32-bit FACS and DSDT addresses to 64-bit as necessary */ if (!acpi_gbl_FADT.Xfacs) { acpi_gbl_FADT.Xfacs = (u64) acpi_gbl_FADT.facs; } if (!acpi_gbl_FADT.Xdsdt) { acpi_gbl_FADT.Xdsdt = (u64) acpi_gbl_FADT.dsdt; } /* * For ACPI 1.0 FADTs (revision 1 or 2), ensure that reserved fields which * should be zero are indeed zero. This will workaround BIOSs that * inadvertently place values in these fields. * * The ACPI 1.0 reserved fields that will be zeroed are the bytes located at * offset 45, 55, 95, and the word located at offset 109, 110. */ if (acpi_gbl_FADT.header.revision < 3) { acpi_gbl_FADT.preferred_profile = 0; acpi_gbl_FADT.pstate_control = 0; acpi_gbl_FADT.cst_control = 0; acpi_gbl_FADT.boot_flags = 0; } /* * Expand the ACPI 1.0 32-bit V1.0 addresses to the ACPI 2.0 64-bit "X" * generic address structures as necessary. */ for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { target = ACPI_ADD_PTR(struct acpi_generic_address, &acpi_gbl_FADT, fadt_info_table[i].target); /* Expand only if the X target is null */ if (!target->address) { acpi_tb_init_generic_address(target, *ACPI_ADD_PTR(u8, &acpi_gbl_FADT, fadt_info_table [i].length), (u64) * ACPI_ADD_PTR(u32, &acpi_gbl_FADT, fadt_info_table [i]. source)); } } /* * Calculate separate GAS structs for the PM1 Enable registers. * These addresses do not appear (directly) in the FADT, so it is * useful to calculate them once, here. * * The PM event blocks are split into two register blocks, first is the * PM Status Register block, followed immediately by the PM Enable Register * block. Each is of length (pm1_event_length/2) */ pm1_register_length = (u8) ACPI_DIV_2(acpi_gbl_FADT.pm1_event_length); /* The PM1A register block is required */ acpi_tb_init_generic_address(&acpi_gbl_xpm1a_enable, pm1_register_length, (acpi_gbl_FADT.xpm1a_event_block.address + pm1_register_length)); /* Don't forget to copy space_id of the GAS */ acpi_gbl_xpm1a_enable.space_id = acpi_gbl_FADT.xpm1a_event_block.space_id; /* The PM1B register block is optional, ignore if not present */ if (acpi_gbl_FADT.xpm1b_event_block.address) { acpi_tb_init_generic_address(&acpi_gbl_xpm1b_enable, pm1_register_length, (acpi_gbl_FADT.xpm1b_event_block. address + pm1_register_length)); /* Don't forget to copy space_id of the GAS */ acpi_gbl_xpm1b_enable.space_id = acpi_gbl_FADT.xpm1a_event_block.space_id; } } /****************************************************************************** * * FUNCTION: acpi_tb_validate_fadt * * PARAMETERS: Table - Pointer to the FADT to be validated * * RETURN: None * * DESCRIPTION: Validate various important fields within the FADT. If a problem * is found, issue a message, but no status is returned. * Used by both the table manager and the disassembler. 
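 *
 * For example (addresses are hypothetical): a FADT whose xpm_timer_block
 * address ends up zero triggers the "Required field" error below, whereas
 * a non-zero gpe1_block_length paired with a zero xgpe1_block address only
 * produces the "Optional field" warning.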
* * Possible additional checks: * (acpi_gbl_FADT.pm1_event_length >= 4) * (acpi_gbl_FADT.pm1_control_length >= 2) * (acpi_gbl_FADT.pm_timer_length >= 4) * Gpe block lengths must be multiple of 2 * ******************************************************************************/ static void __init acpi_tb_validate_fadt(void) { u32 *address32; struct acpi_generic_address *address64; u8 length; acpi_native_uint i; /* If Hardware Reduced flag is set, we are all done */ if (acpi_gbl_reduced_hardware) { return; } /* Examine all of the 64-bit extended address fields (X fields) */ for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { /* Generate pointers to the 32-bit and 64-bit addresses and get the length */ address64 = ACPI_ADD_PTR(struct acpi_generic_address, &acpi_gbl_FADT, fadt_info_table[i].target); address32 = ACPI_ADD_PTR(u32, &acpi_gbl_FADT, fadt_info_table[i].source); length = *ACPI_ADD_PTR(u8, &acpi_gbl_FADT, fadt_info_table[i].length); if (fadt_info_table[i].type & ACPI_FADT_REQUIRED) { /* * Field is required (Pm1a_event, Pm1a_control, pm_timer). * Both the address and length must be non-zero. */ if (!address64->address || !length) { ACPI_ERROR((AE_INFO, "Required field \"%s\" has zero address and/or length: %8.8X%8.8X/%X", fadt_info_table[i].name, ACPI_FORMAT_UINT64(address64-> address), length)); } } else if (fadt_info_table[i].type & ACPI_FADT_SEPARATE_LENGTH) { /* * Field is optional (PM2Control, GPE0, GPE1) AND has its own * length field. If present, both the address and length must be valid. */ if ((address64->address && !length) || (!address64->address && length)) { ACPI_WARNING((AE_INFO, "Optional field \"%s\" has zero address or length: %8.8X%8.8X/%X", fadt_info_table[i].name, ACPI_FORMAT_UINT64(address64-> address), length)); } } /* If both 32- and 64-bit addresses are valid (non-zero), they must match */ if (address64->address && *address32 && (address64->address != (u64) * address32)) { ACPI_ERROR((AE_INFO, "32/64X address mismatch in \"%s\": [%8.8X] [%8.8X%8.8X], using 64X", fadt_info_table[i].name, *address32, ACPI_FORMAT_UINT64(address64->address))); } } } xen-4.4.0/xen/drivers/acpi/tables/tbinstal.c0000664000175000017500000001162012307313555017065 0ustar smbsmb/****************************************************************************** * * Module Name: tbinstal - ACPI table installation and removal * *****************************************************************************/ /* * Copyright (C) 2000 - 2008, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. 
* * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #include #include #include #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbinstal") /****************************************************************************** * * FUNCTION: acpi_tb_verify_table * * PARAMETERS: table_desc - table * * RETURN: Status * * DESCRIPTION: this function is called to verify and map table * *****************************************************************************/ acpi_status __init acpi_tb_verify_table(struct acpi_table_desc *table_desc) { acpi_status status = AE_OK; ACPI_FUNCTION_TRACE(tb_verify_table); /* Map the table if necessary */ if (!table_desc->pointer) { if ((table_desc->flags & ACPI_TABLE_ORIGIN_MASK) == ACPI_TABLE_ORIGIN_MAPPED) { table_desc->pointer = acpi_os_map_memory(table_desc->address, table_desc->length); } if (!table_desc->pointer) { return_ACPI_STATUS(AE_NO_MEMORY); } } /* FACS is the odd table, has no standard ACPI header and no checksum */ if (!ACPI_COMPARE_NAME(&table_desc->signature, ACPI_SIG_FACS)) { /* Always calculate checksum, ignore bad checksum if requested */ status = acpi_tb_verify_checksum(table_desc->pointer, table_desc->length); } return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_tb_resize_root_table_list * * PARAMETERS: None * * RETURN: Status * * DESCRIPTION: Expand the size of global table array * ******************************************************************************/ acpi_status __init acpi_tb_resize_root_table_list(void) { struct acpi_table_desc *tables; ACPI_FUNCTION_TRACE(tb_resize_root_table_list); /* allow_resize flag is a parameter to acpi_initialize_tables */ if (!(acpi_gbl_root_table_list.flags & ACPI_ROOT_ALLOW_RESIZE)) { ACPI_ERROR((AE_INFO, "Resize of Root Table Array is not allowed")); return_ACPI_STATUS(AE_SUPPORT); } /* Increase the Table Array size */ tables = ACPI_ALLOCATE_ZEROED((acpi_gbl_root_table_list.size + ACPI_ROOT_TABLE_SIZE_INCREMENT) * sizeof(struct acpi_table_desc)); if (!tables) { ACPI_ERROR((AE_INFO, "Could not allocate new root table array")); return_ACPI_STATUS(AE_NO_MEMORY); } /* Copy and free the previous table array */ if (acpi_gbl_root_table_list.tables) { ACPI_MEMCPY(tables, acpi_gbl_root_table_list.tables, acpi_gbl_root_table_list.size * sizeof(struct acpi_table_desc)); if (acpi_gbl_root_table_list.flags & ACPI_ROOT_ORIGIN_ALLOCATED) { ACPI_FREE(acpi_gbl_root_table_list.tables); } } acpi_gbl_root_table_list.tables = tables; acpi_gbl_root_table_list.size += ACPI_ROOT_TABLE_SIZE_INCREMENT; acpi_gbl_root_table_list.flags |= (u8) ACPI_ROOT_ORIGIN_ALLOCATED; return_ACPI_STATUS(AE_OK); } xen-4.4.0/xen/drivers/acpi/tables/tbxfroot.c0000664000175000017500000002024112307313555017113 0ustar 
smbsmb/****************************************************************************** * * Module Name: tbxfroot - Find the root ACPI table (RSDT) * *****************************************************************************/ /* * Copyright (C) 2000 - 2008, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #include #include #include #include #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbxfroot") /* Local prototypes */ static u8 *acpi_tb_scan_memory_for_rsdp(u8 * start_address, u32 length); /******************************************************************************* * * FUNCTION: acpi_tb_validate_rsdp * * PARAMETERS: Rsdp - Pointer to unvalidated RSDP * * RETURN: Status * * DESCRIPTION: Validate the RSDP (ptr) * ******************************************************************************/ static acpi_status __init acpi_tb_validate_rsdp(struct acpi_table_rsdp *rsdp) { ACPI_FUNCTION_ENTRY(); /* * The signature and checksum must both be correct * * Note: Sometimes there exists more than one RSDP in memory; the valid * RSDP has a valid checksum, all others have an invalid checksum. 
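 *
 * The standard checksum below covers the 20-byte ACPI 1.0 portion of the
 * RSDP; for revision >= 2 the extended checksum additionally covers the
 * full 36-byte structure.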
*/ if (ACPI_STRNCMP((char *)rsdp, ACPI_SIG_RSDP, sizeof(ACPI_SIG_RSDP) - 1) != 0) { /* Nope, BAD Signature */ return (AE_BAD_SIGNATURE); } /* Check the standard checksum */ if (acpi_tb_checksum((u8 *) rsdp, ACPI_RSDP_CHECKSUM_LENGTH) != 0) { return (AE_BAD_CHECKSUM); } /* Check extended checksum if table version >= 2 */ if ((rsdp->revision >= 2) && (acpi_tb_checksum((u8 *) rsdp, ACPI_RSDP_XCHECKSUM_LENGTH) != 0)) { return (AE_BAD_CHECKSUM); } return (AE_OK); } /******************************************************************************* * * FUNCTION: acpi_find_root_pointer * * PARAMETERS: table_address - Where the table pointer is returned * * RETURN: Status, RSDP physical address * * DESCRIPTION: Search lower 1_mbyte of memory for the root system descriptor * pointer structure. If it is found, set *RSDP to point to it. * * NOTE1: The RSDP must be either in the first 1_k of the Extended * BIOS Data Area or between E0000 and FFFFF (From ACPI Spec.) * Only a 32-bit physical address is necessary. * * NOTE2: This function is always available, regardless of the * initialization state of the rest of ACPI. * ******************************************************************************/ acpi_status __init acpi_find_root_pointer(acpi_native_uint * table_address) { u8 *table_ptr; u8 *mem_rover; u32 physical_address; ACPI_FUNCTION_TRACE(acpi_find_root_pointer); /* 1a) Get the location of the Extended BIOS Data Area (EBDA) */ table_ptr = acpi_os_map_memory((acpi_physical_address) ACPI_EBDA_PTR_LOCATION, ACPI_EBDA_PTR_LENGTH); if (!table_ptr) { ACPI_ERROR((AE_INFO, "Could not map memory at %8.8X for length %X", ACPI_EBDA_PTR_LOCATION, ACPI_EBDA_PTR_LENGTH)); return_ACPI_STATUS(AE_NO_MEMORY); } ACPI_MOVE_16_TO_32(&physical_address, table_ptr); /* Convert segment part to physical address */ physical_address <<= 4; acpi_os_unmap_memory(table_ptr, ACPI_EBDA_PTR_LENGTH); /* EBDA present? 
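	 * (The 16-bit value read above comes from the BIOS data area at
	 * physical address 0x40E and is a real-mode segment; e.g. a segment
	 * of 0x9FC0, shifted left by 4, yields an EBDA base of 0x9FC00,
	 * which is why any result at or below 0x400 is treated as "no EBDA".)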
*/ if (physical_address > 0x400) { /* * 1b) Search EBDA paragraphs (EBDA is required to be a * minimum of 1_k length) */ table_ptr = acpi_os_map_memory((acpi_native_uint) physical_address, ACPI_EBDA_WINDOW_SIZE); if (!table_ptr) { ACPI_ERROR((AE_INFO, "Could not map memory at %8.8X for length %X", physical_address, ACPI_EBDA_WINDOW_SIZE)); return_ACPI_STATUS(AE_NO_MEMORY); } mem_rover = acpi_tb_scan_memory_for_rsdp(table_ptr, ACPI_EBDA_WINDOW_SIZE); acpi_os_unmap_memory(table_ptr, ACPI_EBDA_WINDOW_SIZE); if (mem_rover) { /* Return the physical address */ physical_address += (u32) ACPI_PTR_DIFF(mem_rover, table_ptr); *table_address = physical_address; return_ACPI_STATUS(AE_OK); } } /* * 2) Search upper memory: 16-byte boundaries in E0000h-FFFFFh */ table_ptr = acpi_os_map_memory((acpi_physical_address) ACPI_HI_RSDP_WINDOW_BASE, ACPI_HI_RSDP_WINDOW_SIZE); if (!table_ptr) { ACPI_ERROR((AE_INFO, "Could not map memory at %8.8X for length %X", ACPI_HI_RSDP_WINDOW_BASE, ACPI_HI_RSDP_WINDOW_SIZE)); return_ACPI_STATUS(AE_NO_MEMORY); } mem_rover = acpi_tb_scan_memory_for_rsdp(table_ptr, ACPI_HI_RSDP_WINDOW_SIZE); acpi_os_unmap_memory(table_ptr, ACPI_HI_RSDP_WINDOW_SIZE); if (mem_rover) { /* Return the physical address */ physical_address = (u32) (ACPI_HI_RSDP_WINDOW_BASE + ACPI_PTR_DIFF(mem_rover, table_ptr)); *table_address = physical_address; return_ACPI_STATUS(AE_OK); } /* A valid RSDP was not found */ ACPI_ERROR((AE_INFO, "A valid RSDP was not found")); return_ACPI_STATUS(AE_NOT_FOUND); } /******************************************************************************* * * FUNCTION: acpi_tb_scan_memory_for_rsdp * * PARAMETERS: start_address - Starting pointer for search * Length - Maximum length to search * * RETURN: Pointer to the RSDP if found, otherwise NULL. * * DESCRIPTION: Search a block of memory for the RSDP signature * ******************************************************************************/ static u8 *__init acpi_tb_scan_memory_for_rsdp(u8 * start_address, u32 length) { acpi_status status; u8 *mem_rover; u8 *end_address; ACPI_FUNCTION_TRACE(tb_scan_memory_for_rsdp); end_address = start_address + length; /* Search from given start address for the requested length */ for (mem_rover = start_address; mem_rover < end_address; mem_rover += ACPI_RSDP_SCAN_STEP) { /* The RSDP signature and checksum must both be correct */ status = acpi_tb_validate_rsdp(ACPI_CAST_PTR (struct acpi_table_rsdp, mem_rover)); if (ACPI_SUCCESS(status)) { /* Sig and checksum valid, we have found a real RSDP */ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "RSDP located at physical address %p\n", mem_rover)); return_PTR(mem_rover); } /* No sig match or bad checksum, keep searching */ } /* Searched entire block, no RSDP was found */ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Searched entire block from %p, valid RSDP was not found\n", start_address)); return_PTR(NULL); } xen-4.4.0/xen/drivers/acpi/tables/tbutils.c0000664000175000017500000003734312307313555016745 0ustar smbsmb/****************************************************************************** * * Module Name: tbutils - table utilities * *****************************************************************************/ /* * Copyright (C) 2000 - 2008, Intel Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #include #include #include #include #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbutils") /******************************************************************************* * * FUNCTION: acpi_tb_check_xsdt * * PARAMETERS: address - Pointer to the XSDT * * RETURN: status * AE_OK - XSDT is okay * AE_NO_MEMORY - can't map XSDT * AE_INVALID_TABLE_LENGTH - invalid table length * AE_NULL_ENTRY - XSDT has NULL entry * * DESCRIPTION: validate XSDT ******************************************************************************/ static acpi_status __init acpi_tb_check_xsdt(acpi_physical_address address) { struct acpi_table_header *table; u32 length; u64 xsdt_entry_address; u8 *table_entry; u32 table_count; int i; table = acpi_os_map_memory(address, sizeof(struct acpi_table_header)); if (!table) return AE_NO_MEMORY; length = table->length; acpi_os_unmap_memory(table, sizeof(struct acpi_table_header)); if (length < sizeof(struct acpi_table_header)) return AE_INVALID_TABLE_LENGTH; table = acpi_os_map_memory(address, length); if (!table) return AE_NO_MEMORY; /* Calculate the number of tables described in XSDT */ table_count = (u32) ((table->length - sizeof(struct acpi_table_header)) / sizeof(u64)); table_entry = ACPI_CAST_PTR(u8, table) + sizeof(struct acpi_table_header); for (i = 0; i < table_count; i++) { ACPI_MOVE_64_TO_64(&xsdt_entry_address, table_entry); if (!xsdt_entry_address) { /* XSDT has NULL entry */ break; } table_entry += sizeof(u64); } acpi_os_unmap_memory(table, length); if (i < table_count) return AE_NULL_ENTRY; else return AE_OK; } /******************************************************************************* * * FUNCTION: acpi_tb_print_table_header * * PARAMETERS: Address - Table physical address * Header - Table header * * RETURN: None * * DESCRIPTION: Print an ACPI table header. Special cases for FACS and RSDP. 
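 *
 * For a table with a full common header, the line printed below looks
 * like (values are hypothetical):
 *
 *     DSDT 7FEE3040, 4D72 (r2 INTEL  TEMPLATE  6222004 MSFT  3000001)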
* ******************************************************************************/ void __init acpi_tb_print_table_header(acpi_physical_address address, struct acpi_table_header *header) { if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_FACS)) { /* FACS only has signature and length fields of common table header */ ACPI_INFO((AE_INFO, "%4.4s %08lX, %04X", header->signature, (unsigned long)address, header->length)); } else if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_RSDP)) { /* RSDP has no common fields */ ACPI_INFO((AE_INFO, "RSDP %08lX, %04X (r%d %6.6s)", (unsigned long)address, (ACPI_CAST_PTR(struct acpi_table_rsdp, header)-> revision > 0) ? ACPI_CAST_PTR(struct acpi_table_rsdp, header)->length : 20, ACPI_CAST_PTR(struct acpi_table_rsdp, header)->revision, ACPI_CAST_PTR(struct acpi_table_rsdp, header)->oem_id)); } else { /* Standard ACPI table with full common header */ ACPI_INFO((AE_INFO, "%4.4s %08lX, %04X (r%d %6.6s %8.8s %8X %4.4s %8X)", header->signature, (unsigned long)address, header->length, header->revision, header->oem_id, header->oem_table_id, header->oem_revision, header->asl_compiler_id, header->asl_compiler_revision)); } } /******************************************************************************* * * FUNCTION: acpi_tb_validate_checksum * * PARAMETERS: Table - ACPI table to verify * Length - Length of entire table * * RETURN: Status * * DESCRIPTION: Verifies that the table checksums to zero. Optionally returns * exception on bad checksum. * ******************************************************************************/ acpi_status __init acpi_tb_verify_checksum(struct acpi_table_header *table, u32 length) { u8 checksum; /* Compute the checksum on the table */ checksum = acpi_tb_checksum(ACPI_CAST_PTR(u8, table), length); /* Checksum ok? (should be zero) */ if (checksum) { ACPI_WARNING((AE_INFO, "Incorrect checksum in table [%4.4s] - %2.2X, should be %2.2X", table->signature, table->checksum, (u8) (table->checksum - checksum))); #if (ACPI_CHECKSUM_ABORT) return (AE_BAD_CHECKSUM); #endif } return (AE_OK); } /******************************************************************************* * * FUNCTION: acpi_tb_checksum * * PARAMETERS: Buffer - Pointer to memory region to be checked * Length - Length of this memory region * * RETURN: Checksum (u8) * * DESCRIPTION: Calculates circular checksum of memory region. * ******************************************************************************/ u8 acpi_tb_checksum(u8 * buffer, acpi_native_uint length) { u8 sum = 0; u8 *end = buffer + length; while (buffer < end) { sum = (u8) (sum + *(buffer++)); } return sum; } /******************************************************************************* * * FUNCTION: acpi_tb_install_table * * PARAMETERS: Address - Physical address of DSDT or FACS * Flags - Flags * Signature - Table signature, NULL if no need to * match * table_index - Index into root table array * * RETURN: None * * DESCRIPTION: Install an ACPI table into the global data structure. 
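 *
 * For example, acpi_tb_parse_fadt() installs the DSDT located via the
 * FADT with
 *
 *     acpi_tb_install_table((acpi_physical_address) acpi_gbl_FADT.Xdsdt,
 *                           flags, ACPI_SIG_DSDT, ACPI_TABLE_INDEX_DSDT);
 *
 * while acpi_tb_parse_root_table() passes signature == NULL for the
 * entries taken from the RSDT/XSDT walk.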
* ******************************************************************************/ void __init acpi_tb_install_table(acpi_physical_address address, u8 flags, char *signature, acpi_native_uint table_index) { struct acpi_table_header *table; if (!address) { ACPI_ERROR((AE_INFO, "Null physical address for ACPI table [%s]", signature)); return; } /* Map just the table header */ table = acpi_os_map_memory(address, sizeof(struct acpi_table_header)); if (!table) { return; } /* If a particular signature is expected, signature must match */ if (signature && !ACPI_COMPARE_NAME(table->signature, signature)) { ACPI_ERROR((AE_INFO, "Invalid signature 0x%X for ACPI table [%s]", *ACPI_CAST_PTR(u32, table->signature), signature)); goto unmap_and_exit; } /* Initialize the table entry */ acpi_gbl_root_table_list.tables[table_index].address = address; acpi_gbl_root_table_list.tables[table_index].length = table->length; acpi_gbl_root_table_list.tables[table_index].flags = flags; ACPI_MOVE_32_TO_32(& (acpi_gbl_root_table_list.tables[table_index]. signature), table->signature); acpi_tb_print_table_header(address, table); unmap_and_exit: acpi_os_unmap_memory(table, sizeof(struct acpi_table_header)); } /******************************************************************************* * * FUNCTION: acpi_tb_get_root_table_entry * * PARAMETERS: table_entry - Pointer to the RSDT/XSDT table entry * table_entry_size - sizeof 32 or 64 (RSDT or XSDT) * * RETURN: Physical address extracted from the root table * * DESCRIPTION: Get one root table entry. Handles 32-bit and 64-bit cases on * both 32-bit and 64-bit platforms * * NOTE: acpi_physical_address is 32-bit on 32-bit platforms, 64-bit on * 64-bit platforms. * ******************************************************************************/ static acpi_physical_address __init acpi_tb_get_root_table_entry(u8 * table_entry, acpi_native_uint table_entry_size) { u64 address64; /* * Get the table physical address (32-bit for RSDT, 64-bit for XSDT): * Note: Addresses are 32-bit aligned (not 64) in both RSDT and XSDT */ if (table_entry_size == sizeof(u32)) { /* * 32-bit platform, RSDT: Return 32-bit table entry * 64-bit platform, RSDT: Expand 32-bit to 64-bit and return */ return ((acpi_physical_address) (*ACPI_CAST_PTR(u32, table_entry))); } else { /* * 32-bit platform, XSDT: Truncate 64-bit to 32-bit and return * 64-bit platform, XSDT: Move (unaligned) 64-bit to local, return 64-bit */ ACPI_MOVE_64_TO_64(&address64, table_entry); #if ACPI_MACHINE_WIDTH == 32 if (address64 > ACPI_UINT32_MAX) { /* Will truncate 64-bit address to 32 bits, issue warning */ ACPI_WARNING((AE_INFO, "64-bit Physical Address in XSDT is too large (%8.8X%8.8X), truncating", ACPI_FORMAT_UINT64(address64))); } #endif return ((acpi_physical_address) (address64)); } } /******************************************************************************* * * FUNCTION: acpi_tb_parse_root_table * * PARAMETERS: Rsdp - Pointer to the RSDP * Flags - Flags * * RETURN: Status * * DESCRIPTION: This function is called to parse the Root System Description * Table (RSDT or XSDT) * * NOTE: Tables are mapped (not copied) for efficiency. The FACS must * be mapped and cannot be copied because it contains the actual * memory location of the ACPI Global Lock. 
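 *
 * In outline, the function below:
 * 1) Maps the RSDP and selects the XSDT when revision > 1 and the XSDT
 *    pointer is set, falling back to the RSDT if the XSDT has a NULL entry
 * 2) Reserves root table array slots 0 and 1 for the DSDT and FACS
 * 3) Records the physical address of every RSDT/XSDT entry, resizing the
 *    array on demand
 * 4) Maps each remaining header via acpi_tb_install_table(), handling the
 *    FADT specially so that the DSDT and FACS entries get filled in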
* ******************************************************************************/ acpi_status __init acpi_tb_parse_root_table(acpi_physical_address rsdp_address, u8 flags) { struct acpi_table_rsdp *rsdp; acpi_native_uint table_entry_size; acpi_native_uint i; u32 table_count; struct acpi_table_header *table; acpi_physical_address address; acpi_physical_address rsdt_address = 0; u32 length; u8 *table_entry; acpi_status status; ACPI_FUNCTION_TRACE(tb_parse_root_table); /* * Map the entire RSDP and extract the address of the RSDT or XSDT */ rsdp = acpi_os_map_memory(rsdp_address, sizeof(struct acpi_table_rsdp)); if (!rsdp) { return_ACPI_STATUS(AE_NO_MEMORY); } acpi_tb_print_table_header(rsdp_address, ACPI_CAST_PTR(struct acpi_table_header, rsdp)); /* Differentiate between RSDT and XSDT root tables */ if (rsdp->revision > 1 && rsdp->xsdt_physical_address) { /* * Root table is an XSDT (64-bit physical addresses). We must use the * XSDT if the revision is > 1 and the XSDT pointer is present, as per * the ACPI specification. */ address = (acpi_physical_address) rsdp->xsdt_physical_address; table_entry_size = sizeof(u64); rsdt_address = (acpi_physical_address) rsdp->rsdt_physical_address; } else { /* Root table is an RSDT (32-bit physical addresses) */ address = (acpi_physical_address) rsdp->rsdt_physical_address; table_entry_size = sizeof(u32); } /* * It is not possible to map more than one entry in some environments, * so unmap the RSDP here before mapping other tables */ acpi_os_unmap_memory(rsdp, sizeof(struct acpi_table_rsdp)); if (table_entry_size == sizeof(u64)) { if (acpi_tb_check_xsdt(address) == AE_NULL_ENTRY) { /* XSDT has NULL entry, RSDT is used */ address = rsdt_address; table_entry_size = sizeof(u32); ACPI_WARNING((AE_INFO, "BIOS XSDT has NULL entry, " "using RSDT")); } } /* Map the RSDT/XSDT table header to get the full table length */ table = acpi_os_map_memory(address, sizeof(struct acpi_table_header)); if (!table) { return_ACPI_STATUS(AE_NO_MEMORY); } acpi_tb_print_table_header(address, table); /* Get the length of the full table, verify length and map entire table */ length = table->length; acpi_os_unmap_memory(table, sizeof(struct acpi_table_header)); if (length < sizeof(struct acpi_table_header)) { ACPI_ERROR((AE_INFO, "Invalid length 0x%X in RSDT/XSDT", length)); return_ACPI_STATUS(AE_INVALID_TABLE_LENGTH); } table = acpi_os_map_memory(address, length); if (!table) { return_ACPI_STATUS(AE_NO_MEMORY); } /* Validate the root table checksum */ status = acpi_tb_verify_checksum(table, length); if (ACPI_FAILURE(status)) { acpi_os_unmap_memory(table, length); return_ACPI_STATUS(status); } /* Calculate the number of tables described in the root table */ table_count = (u32) ((table->length - sizeof(struct acpi_table_header)) / table_entry_size); /* * First two entries in the table array are reserved for the DSDT and FACS, * which are not actually present in the RSDT/XSDT - they come from the FADT */ table_entry = ACPI_CAST_PTR(u8, table) + sizeof(struct acpi_table_header); acpi_gbl_root_table_list.count = 2; /* * Initialize the root table array from the RSDT/XSDT */ for (i = 0; i < table_count; i++) { if (acpi_gbl_root_table_list.count >= acpi_gbl_root_table_list.size) { /* There is no more room in the root table array, attempt resize */ status = acpi_tb_resize_root_table_list(); if (ACPI_FAILURE(status)) { ACPI_WARNING((AE_INFO, "Truncating %u table entries!", (unsigned) (acpi_gbl_root_table_list.size - acpi_gbl_root_table_list. 
count))); break; } } /* Get the table physical address (32-bit for RSDT, 64-bit for XSDT) */ acpi_gbl_root_table_list.tables[acpi_gbl_root_table_list.count]. address = acpi_tb_get_root_table_entry(table_entry, table_entry_size); table_entry += table_entry_size; acpi_gbl_root_table_list.count++; } /* * It is not possible to map more than one entry in some environments, * so unmap the root table here before mapping other tables */ acpi_os_unmap_memory(table, length); /* * Complete the initialization of the root table array by examining * the header of each table */ for (i = 2; i < acpi_gbl_root_table_list.count; i++) { acpi_tb_install_table(acpi_gbl_root_table_list.tables[i]. address, flags, NULL, i); /* Special case for FADT - get the DSDT and FACS */ if (ACPI_COMPARE_NAME (&acpi_gbl_root_table_list.tables[i].signature, ACPI_SIG_FADT)) { acpi_tb_parse_fadt(i, flags); } } return_ACPI_STATUS(AE_OK); } xen-4.4.0/xen/drivers/acpi/reboot.c0000664000175000017500000000201512307313555015263 0ustar smbsmb#include #include #include #include void acpi_reboot(void) { struct acpi_generic_address *rr; u8 reset_value; rr = &acpi_gbl_FADT.reset_register; /* Is the reset register supported? The spec says we should be * checking the bit width and bit offset, but Windows ignores * these fields */ if (!(acpi_gbl_FADT.flags & ACPI_FADT_RESET_REGISTER)) return; reset_value = acpi_gbl_FADT.reset_value; /* The reset register can only exist in I/O, Memory or PCI config space * on a device on bus 0. */ switch (rr->space_id) { case ACPI_ADR_SPACE_PCI_CONFIG: printk("Resetting with ACPI PCI RESET_REG.\n"); /* Write the value that resets us. */ pci_conf_write8(0, 0, (rr->address >> 32) & 31, (rr->address >> 16) & 7, (rr->address & 255), reset_value); break; case ACPI_ADR_SPACE_SYSTEM_MEMORY: case ACPI_ADR_SPACE_SYSTEM_IO: printk("Resetting with ACPI MEMORY or I/O RESET_REG.\n"); acpi_hw_low_level_write(8, reset_value, rr); break; } } xen-4.4.0/xen/drivers/acpi/hwregs.c0000664000175000017500000004371412307313555015303 0ustar smbsmb /******************************************************************************* * * Module Name: hwregs - Read/write access functions for the various ACPI * control and status registers. * ******************************************************************************/ /* * Copyright (C) 2000 - 2006, R. Byron Moore * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. 
* * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ #include #include #include #include #include #include #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwregs") /******************************************************************************* * * FUNCTION: acpi_hw_get_register_bit_mask * * PARAMETERS: register_id - Index of ACPI Register to access * * RETURN: The bitmask to be used when accessing the register * * DESCRIPTION: Map register_id into a register bitmask. * ******************************************************************************/ static struct acpi_bit_register_info * acpi_hw_get_bit_register_info(u32 register_id) { ACPI_FUNCTION_ENTRY(); if (register_id > ACPI_BITREG_MAX) { ACPI_DEBUG_PRINT((AE_INFO, "Invalid BitRegister ID: %X", register_id)); return (NULL); } return (&acpi_gbl_bit_register_info[register_id]); } /******************************************************************************* * * FUNCTION: acpi_get_register * * PARAMETERS: register_id - ID of ACPI bit_register to access * return_value - Value that was read from the register * * RETURN: Status and the value read from specified Register. Value * returned is normalized to bit0 (is shifted all the way right) * * DESCRIPTION: ACPI bit_register read function. * ******************************************************************************/ acpi_status acpi_get_register(u32 register_id, u32 * return_value) { u32 register_value = 0; struct acpi_bit_register_info *bit_reg_info; acpi_status status; ACPI_FUNCTION_TRACE(acpi_get_register); /* Get the info structure corresponding to the requested ACPI Register */ bit_reg_info = acpi_hw_get_bit_register_info(register_id); if (!bit_reg_info) { return_ACPI_STATUS(AE_BAD_PARAMETER); } /* Read from the register */ status = acpi_hw_register_read(bit_reg_info->parent_register, ®ister_value); if (ACPI_SUCCESS(status)) { /* Normalize the value that was read */ register_value = ((register_value & bit_reg_info->access_bit_mask) >> bit_reg_info->bit_position); *return_value = register_value; ACPI_DEBUG_PRINT((ACPI_DB_IO, "Read value %8.8X register %X\n", register_value, bit_reg_info->parent_register)); } return_ACPI_STATUS(status); } /******************************************************************************* * * FUNCTION: acpi_set_register * * PARAMETERS: register_id - ID of ACPI bit_register to access * Value - (only used on write) value to write to the * Register, NOT pre-normalized to the bit pos * * RETURN: Status * * DESCRIPTION: ACPI Bit Register write function. 
* ******************************************************************************/ acpi_status acpi_set_register(u32 register_id, u32 value) { u32 register_value = 0; struct acpi_bit_register_info *bit_reg_info; acpi_status status; ACPI_FUNCTION_TRACE_U32(acpi_set_register, register_id); /* Get the info structure corresponding to the requested ACPI Register */ bit_reg_info = acpi_hw_get_bit_register_info(register_id); if (!bit_reg_info) { ACPI_DEBUG_PRINT((AE_INFO, "Bad ACPI HW RegisterId: %X", register_id)); return_ACPI_STATUS(AE_BAD_PARAMETER); } /* Always do a register read first so we can insert the new bits */ status = acpi_hw_register_read(bit_reg_info->parent_register, ®ister_value); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } /* * Decode the Register ID * Register ID = [Register block ID] | [bit ID] * * Check bit ID to fine locate Register offset. * Check Mask to determine Register offset, and then read-write. */ switch (bit_reg_info->parent_register) { case ACPI_REGISTER_PM1_STATUS: /* * Status Registers are different from the rest. Clear by * writing 1, and writing 0 has no effect. So, the only relevant * information is the single bit we're interested in, all others should * be written as 0 so they will be left unchanged. */ value = ACPI_REGISTER_PREPARE_BITS(value, bit_reg_info->bit_position, bit_reg_info-> access_bit_mask); if (value) { status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, (u16) value); register_value = 0; } break; case ACPI_REGISTER_PM1_ENABLE: ACPI_REGISTER_INSERT_VALUE(register_value, bit_reg_info->bit_position, bit_reg_info->access_bit_mask, value); status = acpi_hw_register_write(ACPI_REGISTER_PM1_ENABLE, (u16) register_value); break; case ACPI_REGISTER_PM1_CONTROL: /* * Write the PM1 Control register. * Note that at this level, the fact that there are actually TWO * registers (A and B - and B may not exist) is abstracted. */ ACPI_DEBUG_PRINT((ACPI_DB_IO, "PM1 control: Read %X\n", register_value)); ACPI_REGISTER_INSERT_VALUE(register_value, bit_reg_info->bit_position, bit_reg_info->access_bit_mask, value); status = acpi_hw_register_write(ACPI_REGISTER_PM1_CONTROL, (u16) register_value); break; case ACPI_REGISTER_PM2_CONTROL: #if 0 /* Redundant read in original Linux code. */ status = acpi_hw_register_read(ACPI_REGISTER_PM2_CONTROL, ®ister_value); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } #endif ACPI_DEBUG_PRINT((ACPI_DB_IO, "PM2 control: Read %X from %8.8X%8.8X\n", register_value, ACPI_FORMAT_UINT64(acpi_gbl_FADT. xpm2_control_block. address))); ACPI_REGISTER_INSERT_VALUE(register_value, bit_reg_info->bit_position, bit_reg_info->access_bit_mask, value); ACPI_DEBUG_PRINT((ACPI_DB_IO, "About to write %4.4X to %8.8X%8.8X\n", register_value, ACPI_FORMAT_UINT64(acpi_gbl_FADT. xpm2_control_block. address))); status = acpi_hw_register_write(ACPI_REGISTER_PM2_CONTROL, (u8) (register_value)); break; default: break; } unlock_and_exit: /* Normalize the value that was read */ ACPI_DEBUG_EXEC(register_value = ((register_value & bit_reg_info->access_bit_mask) >> bit_reg_info->bit_position)); ACPI_DEBUG_PRINT((ACPI_DB_IO, "Set bits: %8.8X actual %8.8X register %X\n", value, register_value, bit_reg_info->parent_register)); return_ACPI_STATUS(status); } /****************************************************************************** * * FUNCTION: acpi_hw_register_read * * PARAMETERS: register_id - ACPI Register ID * return_value - Where the register value is returned * * RETURN: Status and the value read. 
* * DESCRIPTION: Read from the specified ACPI register * ******************************************************************************/ acpi_status acpi_hw_register_read(u32 register_id, u32 * return_value) { u32 value1 = 0; u32 value2 = 0; acpi_status status; ACPI_FUNCTION_TRACE(hw_register_read); switch (register_id) { case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ status = acpi_hw_low_level_read(16, &value1, &acpi_gbl_FADT.xpm1a_event_block); if (ACPI_FAILURE(status)) { goto exit; } /* PM1B is optional */ status = acpi_hw_low_level_read(16, &value2, &acpi_gbl_FADT.xpm1b_event_block); value1 |= value2; break; case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ status = acpi_hw_low_level_read(16, &value1, &acpi_gbl_xpm1a_enable); if (ACPI_FAILURE(status)) { goto exit; } /* PM1B is optional */ status = acpi_hw_low_level_read(16, &value2, &acpi_gbl_xpm1b_enable); value1 |= value2; break; case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ status = acpi_hw_low_level_read(16, &value1, &acpi_gbl_FADT.xpm1a_control_block); if (ACPI_FAILURE(status)) { goto exit; } status = acpi_hw_low_level_read(16, &value2, &acpi_gbl_FADT.xpm1b_control_block); value1 |= value2; break; case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ status = acpi_hw_low_level_read(8, &value1, &acpi_gbl_FADT.xpm2_control_block); break; case ACPI_REGISTER_PM_TIMER: /* 32-bit access */ status = acpi_hw_low_level_read(32, &value1, &acpi_gbl_FADT.xpm_timer_block); break; case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ status = acpi_os_read_port(acpi_gbl_FADT.smi_command, &value1, 8); break; case ACPI_REGISTER_SLEEP_STATUS: status = acpi_hw_low_level_read(acpi_gbl_FADT.sleep_status.bit_width, &value1, &acpi_gbl_FADT.sleep_status); break; default: ACPI_DEBUG_PRINT((AE_INFO, "Unknown Register ID: %X", register_id)); status = AE_BAD_PARAMETER; break; } exit: if (ACPI_SUCCESS(status)) { *return_value = value1; } return_ACPI_STATUS(status); } /****************************************************************************** * * FUNCTION: acpi_hw_register_write * * PARAMETERS: register_id - ACPI Register ID * Value - The value to write * * RETURN: Status * * DESCRIPTION: Write to the specified ACPI register * * NOTE: In accordance with the ACPI specification, this function automatically * preserves the value of the following bits, meaning that these bits cannot be * changed via this interface: * * PM1_CONTROL[0] = SCI_EN * PM1_CONTROL[9] * PM1_STATUS[11] * * ACPI References: * 1) Hardware Ignored Bits: When software writes to a register with ignored * bit fields, it preserves the ignored bit fields * 2) SCI_EN: OSPM always preserves this bit position * ******************************************************************************/ acpi_status acpi_hw_register_write(u32 register_id, u32 value) { acpi_status status; u32 read_value; ACPI_FUNCTION_TRACE(hw_register_write); switch (register_id) { case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ /* Perform a read first to preserve certain bits (per ACPI spec) */ status = acpi_hw_register_read(ACPI_REGISTER_PM1_STATUS, &read_value); if (ACPI_FAILURE(status)) { goto exit; } /* Insert the bits to be preserved */ ACPI_INSERT_BITS(value, ACPI_PM1_STATUS_PRESERVED_BITS, read_value); /* Now we can write the data */ status = acpi_hw_low_level_write(16, value, &acpi_gbl_FADT.xpm1a_event_block); if (ACPI_FAILURE(status)) { goto exit; } /* PM1B is optional */ status = acpi_hw_low_level_write(16, value, &acpi_gbl_FADT.xpm1b_event_block); break; case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access 
*/ status = acpi_hw_low_level_write(16, value, &acpi_gbl_xpm1a_enable); if (ACPI_FAILURE(status)) { goto exit; } /* PM1B is optional */ status = acpi_hw_low_level_write(16, value, &acpi_gbl_xpm1b_enable); break; case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ /* * Perform a read first to preserve certain bits (per ACPI spec) */ status = acpi_hw_register_read(ACPI_REGISTER_PM1_CONTROL, &read_value); if (ACPI_FAILURE(status)) { goto exit; } /* Insert the bits to be preserved */ ACPI_INSERT_BITS(value, ACPI_PM1_CONTROL_PRESERVED_BITS, read_value); /* Now we can write the data */ status = acpi_hw_low_level_write(16, value, &acpi_gbl_FADT.xpm1a_control_block); if (ACPI_FAILURE(status)) { goto exit; } status = acpi_hw_low_level_write(16, value, &acpi_gbl_FADT.xpm1b_control_block); break; case ACPI_REGISTER_PM1A_CONTROL: /* 16-bit access */ status = acpi_hw_low_level_write(16, value, &acpi_gbl_FADT.xpm1a_control_block); break; case ACPI_REGISTER_PM1B_CONTROL: /* 16-bit access */ status = acpi_hw_low_level_write(16, value, &acpi_gbl_FADT.xpm1b_control_block); break; case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ status = acpi_hw_low_level_write(8, value, &acpi_gbl_FADT.xpm2_control_block); break; case ACPI_REGISTER_PM_TIMER: /* 32-bit access */ status = acpi_hw_low_level_write(32, value, &acpi_gbl_FADT.xpm_timer_block); break; case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ /* SMI_CMD is currently always in IO space */ status = acpi_os_write_port(acpi_gbl_FADT.smi_command, value, 8); break; case ACPI_REGISTER_SLEEP_CONTROL: status = acpi_hw_low_level_write(acpi_gbl_FADT.sleep_control.bit_width, value, &acpi_gbl_FADT.sleep_control); break; default: status = AE_BAD_PARAMETER; break; } exit: return_ACPI_STATUS(status); } /****************************************************************************** * * FUNCTION: acpi_hw_low_level_read * * PARAMETERS: Width - 8, 16, or 32 * Value - Where the value is returned * Reg - GAS register structure * * RETURN: Status * * DESCRIPTION: Read from either memory or IO space. * ******************************************************************************/ acpi_status acpi_hw_low_level_read(u32 width, u32 * value, struct acpi_generic_address *reg) { u64 address; acpi_status status; ACPI_FUNCTION_NAME(hw_low_level_read); /* * Must have a valid pointer to a GAS structure, and * a non-zero address within. However, don't return an error * because the PM1A/B code must not fail if B isn't present. */ if (!reg) { return (AE_OK); } /* Get a local copy of the address. Handles possible alignment issues */ ACPI_MOVE_64_TO_64(&address, ®->address); if (!address) { return (AE_OK); } *value = 0; /* * Two address spaces supported: Memory or IO. 
* PCI_Config is not supported here because the GAS struct is insufficient */ switch (reg->space_id) { case ACPI_ADR_SPACE_SYSTEM_MEMORY: status = acpi_os_read_memory((acpi_physical_address) address, value, width); break; case ACPI_ADR_SPACE_SYSTEM_IO: status = acpi_os_read_port((acpi_io_address) address, value, width); break; default: return (AE_BAD_PARAMETER); } ACPI_DEBUG_PRINT((ACPI_DB_IO, "Read: %8.8X width %2d from %8.8X%8.8X (%s)\n", *value, width, ACPI_FORMAT_UINT64(address), acpi_ut_get_region_name(reg->address_space_id))); return (status); } /****************************************************************************** * * FUNCTION: acpi_hw_low_level_write * * PARAMETERS: Width - 8, 16, or 32 * Value - To be written * Reg - GAS register structure * * RETURN: Status * * DESCRIPTION: Write to either memory or IO space. * ******************************************************************************/ acpi_status acpi_hw_low_level_write(u32 width, u32 value, struct acpi_generic_address * reg) { u64 address; acpi_status status; ACPI_FUNCTION_NAME(hw_low_level_write); /* * Must have a valid pointer to a GAS structure, and * a non-zero address within. However, don't return an error * because the PM1A/B code must not fail if B isn't present. */ if (!reg) { return (AE_OK); } /* Get a local copy of the address. Handles possible alignment issues */ ACPI_MOVE_64_TO_64(&address, ®->address); if (!address) { return (AE_OK); } /* * Two address spaces supported: Memory or IO. * PCI_Config is not supported here because the GAS struct is insufficient */ switch (reg->space_id) { case ACPI_ADR_SPACE_SYSTEM_MEMORY: status = acpi_os_write_memory((acpi_physical_address) address, value, width); break; case ACPI_ADR_SPACE_SYSTEM_IO: status = acpi_os_write_port((acpi_io_address) address, value, width); break; default: return (AE_BAD_PARAMETER); } ACPI_DEBUG_PRINT((ACPI_DB_IO, "Wrote: %8.8X width %2d to %8.8X%8.8X (%s)\n", value, width, ACPI_FORMAT_UINT64(address), acpi_ut_get_region_name(reg->address_space_id))); return (status); } xen-4.4.0/xen/drivers/char/0000775000175000017500000000000012307313555013630 5ustar smbsmbxen-4.4.0/xen/drivers/char/omap-uart.c0000664000175000017500000002331512307313555015705 0ustar smbsmb/* * omap-uart.c * Based on drivers/char/ns16550.c * * Driver for OMAP-UART controller * * Copyright (C) 2013, Chen Baozi * * Note: This driver is made separate from 16550-series UART driver as * omap platform has some specific configurations */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define REG_SHIFT 2 #define omap_read(uart, off) readl((uart)->regs + (off<regs + (off<uart; u32 lsr; uint32_t reg; while ( !(omap_read(uart, UART_IIR) & UART_IIR_NOINT) ) { lsr = omap_read(uart, UART_LSR) & 0xff; if ( lsr & UART_LSR_THRE ) serial_tx_interrupt(port, regs); if ( lsr & UART_LSR_DR ) serial_rx_interrupt(port, regs); if ( port->txbufc == port->txbufp ) { reg = omap_read(uart, UART_IER); omap_write(uart, UART_IER, reg & (~UART_IER_ETHREI)); } }; } static void baud_protocol_setup(struct omap_uart *uart) { u32 dll, dlh, efr; unsigned int divisor; divisor = uart->clock_hz / (uart->baud << 4); dll = divisor & 0xff; dlh = divisor >> 8; /* * Switch to register configuration mode B to access the UART_OMAP_EFR * register. */ omap_write(uart, UART_LCR, UART_LCR_CONF_MODE_B); /* * Enable access to the UART_IER[7:4] bit field. 
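 * (Illustrative numbers, assuming the device tree reports a 48 MHz UART
 * functional clock, which is typical for OMAP4 but not guaranteed: at
 * 115200 baud the divisor computed above is 48000000 / (115200 * 16) = 26,
 * so UART_DLL is loaded with 26 and UART_DLM with 0.)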
*/ efr = omap_read(uart, UART_OMAP_EFR); omap_write(uart, UART_OMAP_EFR, efr|UART_OMAP_EFR_ECB); /* * Switch to register operation mode to access the UART_IER register. */ omap_write(uart, UART_LCR, 0); /* * Clear the UART_IER register (set the UART_IER[4] SLEEP_MODE bit * to 0 to change the UART_DLL and UART_DLM register). Set the * UART_IER register value to 0x0000. */ omap_write(uart, UART_IER, 0); /* * Switch to register configuartion mode B to access the UART_DLL and * UART_DLM registers. */ omap_write(uart, UART_LCR, UART_LCR_CONF_MODE_B); /* * Load divisor value. */ omap_write(uart, UART_DLL, dll); omap_write(uart, UART_DLM, dlh); /* * Restore the UART_OMAP_EFR */ omap_write(uart, UART_OMAP_EFR, efr); /* * Load the new protocol formatting (parity, stop-bit, character length) * and switch to register operational mode. */ omap_write(uart, UART_LCR, (uart->data_bits - 5) | ((uart->stop_bits - 1) << 2) | uart->parity); } static void fifo_setup(struct omap_uart *uart) { u32 lcr, efr, mcr; /* * Switch to register configuration mode B to access the UART_OMAP_EFR * register. */ lcr = omap_read(uart, UART_LCR); omap_write(uart, UART_LCR, UART_LCR_CONF_MODE_B); /* * Enable register submode TCR_TLR to access the UART_OMAP_TLR register. */ efr = omap_read(uart, UART_OMAP_EFR); omap_write(uart, UART_OMAP_EFR, efr|UART_OMAP_EFR_ECB); /* * Switch to register configuration mode A to access the UART_MCR * register. */ omap_write(uart, UART_LCR, UART_LCR_CONF_MODE_A); /* * Enable register submode TCR_TLR to access the UART_OMAP_TLR register */ mcr = omap_read(uart, UART_MCR); omap_write(uart, UART_MCR, mcr|UART_MCR_TCRTLR); /* * Enable the FIFO; load the new FIFO trigger and the new DMA mode. */ omap_write(uart, UART_FCR, UART_FCR_R_TRIG_01| UART_FCR_T_TRIG_10|UART_FCR_ENABLE); /* * Switch to register configuration mode B to access the UART_EFR * register. */ omap_write(uart, UART_LCR, UART_LCR_CONF_MODE_B); /* * Load the new FIFO triggers and the new DMA mode bit. */ omap_write(uart, UART_OMAP_SCR, OMAP_UART_SCR_RX_TRIG_GRANU1_MASK); /* * Restore the UART_OMAP_EFR[4] value. */ omap_write(uart, UART_OMAP_EFR, efr); /* * Switch to register configuration mode A to access the UART_MCR * register. */ omap_write(uart, UART_LCR, UART_LCR_CONF_MODE_A); /* * Restore UART_MCR[6] value. */ omap_write(uart, UART_MCR, mcr); /* * Restore UART_LCR value. */ omap_write(uart, UART_LCR, lcr); uart->fifo_size = 64; } static void __init omap_uart_init_preirq(struct serial_port *port) { struct omap_uart *uart = port->uart; /* * Clear the FIFO buffers. */ omap_write(uart, UART_FCR, UART_FCR_ENABLE); omap_write(uart, UART_FCR, UART_FCR_ENABLE|UART_FCR_CLRX|UART_FCR_CLTX); omap_write(uart, UART_FCR, 0); /* * The TRM says the mode should be disabled while UART_DLL and UART_DHL * are being changed so we disable before setup, then enable. 
*/ omap_write(uart, UART_OMAP_MDR1, UART_OMAP_MDR1_DISABLE); /* Baud rate & protocol format setup */ baud_protocol_setup(uart); /* FIFO setup */ fifo_setup(uart); /* No flow control */ omap_write(uart, UART_MCR, UART_MCR_DTR|UART_MCR_RTS); omap_write(uart, UART_OMAP_MDR1, UART_OMAP_MDR1_16X_MODE); } static void __init omap_uart_init_postirq(struct serial_port *port) { struct omap_uart *uart = port->uart; uart->irqaction.handler = omap_uart_interrupt; uart->irqaction.name = "omap_uart"; uart->irqaction.dev_id = port; if ( setup_dt_irq(&uart->irq, &uart->irqaction) != 0 ) { dprintk(XENLOG_ERR, "Failed to allocated omap_uart IRQ %d\n", uart->irq.irq); return; } /* Enable interrupts */ omap_write(uart, UART_IER, UART_IER_ERDAI|UART_IER_ETHREI|UART_IER_ELSI); } static void omap_uart_suspend(struct serial_port *port) { BUG(); } static void omap_uart_resume(struct serial_port *port) { BUG(); } static int omap_uart_tx_ready(struct serial_port *port) { struct omap_uart *uart = port->uart; uint32_t reg; reg = omap_read(uart, UART_IER); omap_write(uart, UART_IER, reg | UART_IER_ETHREI); return omap_read(uart, UART_LSR) & UART_LSR_THRE ? uart->fifo_size : 0; } static void omap_uart_putc(struct serial_port *port, char c) { struct omap_uart *uart = port->uart; omap_write(uart, UART_THR, (uint32_t)(unsigned char)c); } static int omap_uart_getc(struct serial_port *port, char *pc) { struct omap_uart *uart = port->uart; if ( !(omap_read(uart, UART_LSR) & UART_LSR_DR) ) return 0; *pc = omap_read(uart, UART_RBR) & 0xff; return 1; } static int __init omap_uart_irq(struct serial_port *port) { struct omap_uart *uart = port->uart; return ((uart->irq.irq > 0) ? uart->irq.irq : -1); } static const struct dt_irq __init *omap_uart_dt_irq(struct serial_port *port) { struct omap_uart *uart = port->uart; return &uart->irq; } static const struct vuart_info *omap_vuart_info(struct serial_port *port) { struct omap_uart *uart = port->uart; return &uart->vuart; } static struct uart_driver __read_mostly omap_uart_driver = { .init_preirq = omap_uart_init_preirq, .init_postirq = omap_uart_init_postirq, .endboot = NULL, .suspend = omap_uart_suspend, .resume = omap_uart_resume, .tx_ready = omap_uart_tx_ready, .putc = omap_uart_putc, .getc = omap_uart_getc, .irq = omap_uart_irq, .dt_irq_get = omap_uart_dt_irq, .vuart_info = omap_vuart_info, }; static int __init omap_uart_init(struct dt_device_node *dev, const void *data) { const char *config = data; struct omap_uart *uart; u32 clkspec; int res; u64 addr, size; if ( strcmp(config, "") ) early_printk("WARNING: UART configuration is not supported\n"); uart = &omap_com; res = dt_property_read_u32(dev, "clock-frequency", &clkspec); if ( !res ) { early_printk("omap-uart: Unable to retrieve the clock frequency\n"); return -EINVAL; } uart->clock_hz = clkspec; uart->baud = 115200; uart->data_bits = 8; uart->parity = UART_PARITY_NONE; uart->stop_bits = 1; res = dt_device_get_address(dev, 0, &addr, &size); if ( res ) { early_printk("omap-uart: Unable to retrieve the base" " address of the UART\n"); return res; } uart->regs = ioremap_attr(addr, size, PAGE_HYPERVISOR_NOCACHE); if ( !uart->regs ) { early_printk("omap-uart: Unable to map the UART memory\n"); return -ENOMEM; } res = dt_device_get_irq(dev, 0, &uart->irq); if ( res ) { early_printk("omap-uart: Unable to retrieve the IRQ\n"); return res; } uart->vuart.base_addr = addr; uart->vuart.size = size; uart->vuart.data_off = UART_THR; uart->vuart.status_off = UART_LSR << REG_SHIFT; uart->vuart.status = UART_LSR_THRE; /* Register with generic 
serial driver */ serial_register_uart(SERHND_DTUART, &omap_uart_driver, uart); dt_device_set_used_by(dev, DOMID_XEN); return 0; } static const char * const omap_uart_dt_compat[] __initconst = { "ti,omap4-uart", NULL }; DT_DEVICE_START(omap_uart, "OMAP UART", DEVICE_SERIAL) .compatible = omap_uart_dt_compat, .init = omap_uart_init, DT_DEVICE_END /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/char/Makefile0000664000175000017500000000035112307313555015267 0ustar smbsmbobj-y += console.o obj-$(HAS_NS16550) += ns16550.o obj-$(HAS_PL011) += pl011.o obj-$(HAS_EXYNOS4210) += exynos4210-uart.o obj-$(HAS_OMAP) += omap-uart.o obj-$(HAS_EHCI) += ehci-dbgp.o obj-$(CONFIG_ARM) += dt-uart.o obj-y += serial.o xen-4.4.0/xen/drivers/char/serial.c0000664000175000017500000003250612307313555015261 0ustar smbsmb/****************************************************************************** * serial.c * * Framework for serial device drivers. * * Copyright (c) 2003-2008, K A Fraser */ #include #include #include #include #include #include /* Never drop characters, even if the async transmit buffer fills. */ /* #define SERIAL_NEVER_DROP_CHARS 1 */ unsigned int __read_mostly serial_txbufsz = 16384; size_param("serial_tx_buffer", serial_txbufsz); #define mask_serial_rxbuf_idx(_i) ((_i)&(serial_rxbufsz-1)) #define mask_serial_txbuf_idx(_i) ((_i)&(serial_txbufsz-1)) static struct serial_port com[SERHND_IDX + 1] = { [0 ... SERHND_IDX] = { .rx_lock = SPIN_LOCK_UNLOCKED, .tx_lock = SPIN_LOCK_UNLOCKED } }; static bool_t __read_mostly post_irq; void serial_rx_interrupt(struct serial_port *port, struct cpu_user_regs *regs) { char c; serial_rx_fn fn = NULL; unsigned long flags; spin_lock_irqsave(&port->rx_lock, flags); if ( port->driver->getc(port, &c) ) { if ( port->rx != NULL ) fn = port->rx; else if ( (c & 0x80) && (port->rx_hi != NULL) ) fn = port->rx_hi; else if ( !(c & 0x80) && (port->rx_lo != NULL) ) fn = port->rx_lo; else if ( (port->rxbufp - port->rxbufc) != serial_rxbufsz ) port->rxbuf[mask_serial_rxbuf_idx(port->rxbufp++)] = c; } spin_unlock_irqrestore(&port->rx_lock, flags); if ( fn != NULL ) (*fn)(c & 0x7f, regs); } void serial_tx_interrupt(struct serial_port *port, struct cpu_user_regs *regs) { int i, n; unsigned long flags; local_irq_save(flags); /* * Avoid spinning for a long time: if there is a long-term lock holder * then we know that they'll be stuffing bytes into the transmitter which * will therefore not be empty for long. */ while ( !spin_trylock(&port->tx_lock) ) { if ( port->driver->tx_ready(port) <= 0 ) goto out; cpu_relax(); } for ( i = 0, n = port->driver->tx_ready(port); i < n; i++ ) { if ( port->txbufc == port->txbufp ) break; port->driver->putc( port, port->txbuf[mask_serial_txbuf_idx(port->txbufc++)]); } if ( i && port->driver->flush ) port->driver->flush(port); spin_unlock(&port->tx_lock); out: local_irq_restore(flags); } static void __serial_putc(struct serial_port *port, char c) { if ( (port->txbuf != NULL) && !port->sync ) { /* Interrupt-driven (asynchronous) transmitter. */ if ( port->tx_quench ) { /* Buffer filled and we are dropping characters. */ if ( (port->txbufp - port->txbufc) > (serial_txbufsz / 2) ) return; port->tx_quench = 0; } if ( (port->txbufp - port->txbufc) == serial_txbufsz ) { if ( port->tx_log_everything ) { /* Buffer is full: we spin waiting for space to appear. 
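 * (In other words, tx_log_everything trades throughput for completeness:
 * rather than dropping characters, the writer busy-waits here until the
 * UART accepts more data.)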
*/ int n; while ( (n = port->driver->tx_ready(port)) == 0 ) cpu_relax(); if ( n > 0 ) { while ( n-- ) port->driver->putc( port, port->txbuf[mask_serial_txbuf_idx(port->txbufc++)]); port->txbuf[mask_serial_txbuf_idx(port->txbufp++)] = c; } } else { /* Buffer is full: drop chars until buffer is half empty. */ port->tx_quench = 1; } return; } if ( ((port->txbufp - port->txbufc) == 0) && port->driver->tx_ready(port) > 0 ) { /* Buffer and UART FIFO are both empty, and port is available. */ port->driver->putc(port, c); } else { /* Normal case: buffer the character. */ port->txbuf[mask_serial_txbuf_idx(port->txbufp++)] = c; } } else if ( port->driver->tx_ready ) { int n; /* Synchronous finite-capacity transmitter. */ while ( !(n = port->driver->tx_ready(port)) ) cpu_relax(); if ( n > 0 ) port->driver->putc(port, c); } else { /* Simple synchronous transmitter. */ port->driver->putc(port, c); } } void serial_putc(int handle, char c) { struct serial_port *port; unsigned long flags; if ( handle == -1 ) return; port = &com[handle & SERHND_IDX]; if ( !port->driver || !port->driver->putc ) return; spin_lock_irqsave(&port->tx_lock, flags); if ( (c == '\n') && (handle & SERHND_COOKED) ) __serial_putc(port, '\r' | ((handle & SERHND_HI) ? 0x80 : 0x00)); if ( handle & SERHND_HI ) c |= 0x80; else if ( handle & SERHND_LO ) c &= 0x7f; __serial_putc(port, c); if ( port->driver->flush ) port->driver->flush(port); spin_unlock_irqrestore(&port->tx_lock, flags); } void serial_puts(int handle, const char *s) { struct serial_port *port; unsigned long flags; char c; if ( handle == -1 ) return; port = &com[handle & SERHND_IDX]; if ( !port->driver || !port->driver->putc ) return; spin_lock_irqsave(&port->tx_lock, flags); while ( (c = *s++) != '\0' ) { if ( (c == '\n') && (handle & SERHND_COOKED) ) __serial_putc(port, '\r' | ((handle & SERHND_HI) ? 
0x80 : 0x00)); if ( handle & SERHND_HI ) c |= 0x80; else if ( handle & SERHND_LO ) c &= 0x7f; __serial_putc(port, c); } if ( port->driver->flush ) port->driver->flush(port); spin_unlock_irqrestore(&port->tx_lock, flags); } char serial_getc(int handle) { struct serial_port *port; char c; unsigned long flags; if ( handle == -1 ) return '\0'; port = &com[handle & SERHND_IDX]; if ( !port->driver || !port->driver->getc ) return '\0'; do { for ( ; ; ) { spin_lock_irqsave(&port->rx_lock, flags); if ( port->rxbufp != port->rxbufc ) { c = port->rxbuf[mask_serial_rxbuf_idx(port->rxbufc++)]; spin_unlock_irqrestore(&port->rx_lock, flags); break; } if ( port->driver->getc(port, &c) ) { spin_unlock_irqrestore(&port->rx_lock, flags); break; } spin_unlock_irqrestore(&port->rx_lock, flags); cpu_relax(); udelay(100); } } while ( ((handle & SERHND_LO) && (c & 0x80)) || ((handle & SERHND_HI) && !(c & 0x80)) ); return c & 0x7f; } int __init serial_parse_handle(char *conf) { int handle, flags = 0; if ( !strncmp(conf, "dbgp", 4) && (!conf[4] || conf[4] == ',') ) { handle = SERHND_DBGP; goto common; } if ( !strncmp(conf, "dtuart", 5) ) { handle = SERHND_DTUART; goto common; } if ( strncmp(conf, "com", 3) ) goto fail; switch ( conf[3] ) { case '1': handle = SERHND_COM1; break; case '2': handle = SERHND_COM2; break; default: goto fail; } if ( conf[4] == 'H' ) flags |= SERHND_HI; else if ( conf[4] == 'L' ) flags |= SERHND_LO; common: if ( !com[handle].driver ) goto fail; if ( !post_irq ) com[handle].state = serial_parsed; else if ( com[handle].state != serial_initialized ) { if ( com[handle].driver->init_postirq ) com[handle].driver->init_postirq(&com[handle]); com[handle].state = serial_initialized; } return handle | flags | SERHND_COOKED; fail: return -1; } void __init serial_set_rx_handler(int handle, serial_rx_fn fn) { struct serial_port *port; unsigned long flags; if ( handle == -1 ) return; port = &com[handle & SERHND_IDX]; spin_lock_irqsave(&port->rx_lock, flags); if ( port->rx != NULL ) goto fail; if ( handle & SERHND_LO ) { if ( port->rx_lo != NULL ) goto fail; port->rx_lo = fn; } else if ( handle & SERHND_HI ) { if ( port->rx_hi != NULL ) goto fail; port->rx_hi = fn; } else { if ( (port->rx_hi != NULL) || (port->rx_lo != NULL) ) goto fail; port->rx = fn; } spin_unlock_irqrestore(&port->rx_lock, flags); return; fail: spin_unlock_irqrestore(&port->rx_lock, flags); printk("ERROR: Conflicting receive handlers for COM%d\n", handle & SERHND_IDX); } void serial_force_unlock(int handle) { struct serial_port *port; if ( handle == -1 ) return; port = &com[handle & SERHND_IDX]; spin_lock_init(&port->rx_lock); spin_lock_init(&port->tx_lock); serial_start_sync(handle); } void serial_start_sync(int handle) { struct serial_port *port; unsigned long flags; if ( handle == -1 ) return; port = &com[handle & SERHND_IDX]; spin_lock_irqsave(&port->tx_lock, flags); if ( port->sync++ == 0 ) { while ( (port->txbufp - port->txbufc) != 0 ) { int n; while ( !(n = port->driver->tx_ready(port)) ) cpu_relax(); if ( n < 0 ) /* port is unavailable and might not come up until reenabled by dom0, we can't really do proper sync */ break; port->driver->putc( port, port->txbuf[mask_serial_txbuf_idx(port->txbufc++)]); } if ( port->driver->flush ) port->driver->flush(port); } spin_unlock_irqrestore(&port->tx_lock, flags); } void serial_end_sync(int handle) { struct serial_port *port; unsigned long flags; if ( handle == -1 ) return; port = &com[handle & SERHND_IDX]; spin_lock_irqsave(&port->tx_lock, flags); port->sync--; 
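    /*
     * Hypothetical usage sketch (the functions are the ones defined in this
     * file; "handle" is whatever serial_parse_handle() returned): sync mode
     * nests, so a caller that needs its output on the wire immediately can
     * bracket the critical region:
     *
     *     serial_start_sync(handle);           // drain txbuf, go synchronous
     *     serial_puts(handle, "state dump\n"); // bypasses the async buffer
     *     serial_end_sync(handle);             // restore interrupt-driven TX
     */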
spin_unlock_irqrestore(&port->tx_lock, flags); } void serial_start_log_everything(int handle) { struct serial_port *port; unsigned long flags; if ( handle == -1 ) return; port = &com[handle & SERHND_IDX]; spin_lock_irqsave(&port->tx_lock, flags); port->tx_log_everything++; port->tx_quench = 0; spin_unlock_irqrestore(&port->tx_lock, flags); } void serial_end_log_everything(int handle) { struct serial_port *port; unsigned long flags; if ( handle == -1 ) return; port = &com[handle & SERHND_IDX]; spin_lock_irqsave(&port->tx_lock, flags); port->tx_log_everything--; spin_unlock_irqrestore(&port->tx_lock, flags); } void __init serial_init_preirq(void) { int i; for ( i = 0; i < ARRAY_SIZE(com); i++ ) if ( com[i].driver && com[i].driver->init_preirq ) com[i].driver->init_preirq(&com[i]); } void __init serial_init_postirq(void) { int i; for ( i = 0; i < ARRAY_SIZE(com); i++ ) if ( com[i].state == serial_parsed ) { if ( com[i].driver->init_postirq ) com[i].driver->init_postirq(&com[i]); com[i].state = serial_initialized; } post_irq = 1; } void __init serial_endboot(void) { int i; for ( i = 0; i < ARRAY_SIZE(com); i++ ) if ( com[i].driver && com[i].driver->endboot ) com[i].driver->endboot(&com[i]); } int __init serial_irq(int idx) { if ( (idx >= 0) && (idx < ARRAY_SIZE(com)) && com[idx].driver && com[idx].driver->irq ) return com[idx].driver->irq(&com[idx]); return -1; } const struct dt_irq __init *serial_dt_irq(int idx) { if ( (idx >= 0) && (idx < ARRAY_SIZE(com)) && com[idx].driver && com[idx].driver->dt_irq_get ) return com[idx].driver->dt_irq_get(&com[idx]); return NULL; } const struct vuart_info *serial_vuart_info(int idx) { if ( (idx >= 0) && (idx < ARRAY_SIZE(com)) && com[idx].driver && com[idx].driver->vuart_info ) return com[idx].driver->vuart_info(&com[idx]); return NULL; } void serial_suspend(void) { int i; for ( i = 0; i < ARRAY_SIZE(com); i++ ) if ( com[i].state == serial_initialized && com[i].driver->suspend ) com[i].driver->suspend(&com[i]); } void serial_resume(void) { int i; for ( i = 0; i < ARRAY_SIZE(com); i++ ) if ( com[i].state == serial_initialized && com[i].driver->resume ) com[i].driver->resume(&com[i]); } void __init serial_register_uart(int idx, struct uart_driver *driver, void *uart) { /* Store UART-specific info. */ com[idx].driver = driver; com[idx].uart = uart; } void __init serial_async_transmit(struct serial_port *port) { BUG_ON(!port->driver->tx_ready); if ( port->txbuf != NULL ) return; if ( serial_txbufsz < PAGE_SIZE ) serial_txbufsz = PAGE_SIZE; while ( serial_txbufsz & (serial_txbufsz - 1) ) serial_txbufsz &= serial_txbufsz - 1; port->txbuf = alloc_xenheap_pages( get_order_from_bytes(serial_txbufsz), 0); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/char/exynos4210-uart.c0000664000175000017500000002512212307313555016603 0ustar smbsmb/* * xen/drivers/char/exynos4210-uart.c * * Driver for Exynos 4210 UART. * * Anthony PERARD * Copyright (c) 2012 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ #include #include #include #include #include #include #include #include #include #include #include static struct exynos4210_uart { unsigned int baud, clock_hz, data_bits, parity, stop_bits; struct dt_irq irq; void *regs; struct irqaction irqaction; struct vuart_info vuart; } exynos4210_com = {0}; /* These parity settings can be ORed directly into the ULCON. */ #define PARITY_NONE (0) #define PARITY_ODD (0x4) #define PARITY_EVEN (0x5) #define FORCED_CHECKED_AS_ONE (0x6) #define FORCED_CHECKED_AS_ZERO (0x7) #define exynos4210_read(uart, off) readl((uart)->regs + off) #define exynos4210_write(uart, off, val) writel(val, (uart->regs) + off) static void exynos4210_uart_interrupt(int irq, void *data, struct cpu_user_regs *regs) { struct serial_port *port = data; struct exynos4210_uart *uart = port->uart; unsigned int status; status = exynos4210_read(uart, UINTP); while ( status != 0 ) { /* Clear all pending interrupts * but should take care of ERROR and MODEM */ if ( status & UINTM_ERROR ) { uint32_t error_bit; error_bit = exynos4210_read(uart, UERSTAT); if ( error_bit & UERSTAT_OVERRUN ) dprintk(XENLOG_ERR, "uart: overrun error\n"); if ( error_bit & UERSTAT_PARITY ) dprintk(XENLOG_ERR, "uart: parity error\n"); if ( error_bit & UERSTAT_FRAME ) dprintk(XENLOG_ERR, "uart: frame error\n"); if ( error_bit & UERSTAT_BREAK ) dprintk(XENLOG_ERR, "uart: break detected\n"); /* Clear error pending interrupt */ exynos4210_write(uart, UINTP, UINTM_ERROR); } if ( status & (UINTM_RXD | UINTM_ERROR) ) { /* uart->regs[UINTM] |= RXD|ERROR; */ serial_rx_interrupt(port, regs); /* uart->regs[UINTM] &= ~(RXD|ERROR); */ exynos4210_write(uart, UINTP, UINTM_RXD | UINTM_ERROR); } if ( status & (UINTM_TXD | UINTM_MODEM) ) { /* uart->regs[UINTM] |= TXD|MODEM; */ serial_tx_interrupt(port, regs); /* uart->regs[UINTM] &= ~(TXD|MODEM); */ exynos4210_write(uart, UINTP, UINTM_TXD | UINTM_MODEM); } status = exynos4210_read(uart, UINTP); } } static void __init exynos4210_uart_init_preirq(struct serial_port *port) { struct exynos4210_uart *uart = port->uart; unsigned int divisor; uint32_t ulcon; /* reset, TX/RX disables */ exynos4210_write(uart, UCON, 0); /* No Interrupt, auto flow control */ exynos4210_write(uart, UMCON, 0); /* Line control and baud-rate generator. */ if ( uart->baud != BAUD_AUTO ) { /* Baud rate specified: program it into the divisor latch. */ divisor = ((uart->clock_hz) / (uart->baud)) - 1; /* FIXME: will use a hacked divisor, assuming the src clock and bauds */ exynos4210_write(uart, UFRACVAL, 53); exynos4210_write(uart, UBRDIV, 4); } else { /* * TODO: should be updated * Baud rate already set: read it out from the divisor latch. 
* divisor = (uart->regs[IBRD] << 6) | uart->regs[FBRD]; * uart->baud = (uart->clock_hz << 2) / divisor; */ } /* * Number of bits per character * 0 => 5 bits * 1 => 6 bits * 2 => 7 bits * 3 => 8 bits */ ASSERT(uart->data_bits >= 5 && uart->data_bits <= 8); ulcon = (uart->data_bits - 5); /* * Stop bits * 0 => 1 stop bit per frame * 1 => 2 stop bit per frame */ ASSERT(uart->stop_bits >= 1 && uart->stop_bits <= 2); ulcon |= (uart->stop_bits - 1) << ULCON_STOPB_SHIFT; /* Parity */ ulcon |= uart->parity << ULCON_PARITY_SHIFT; exynos4210_write(uart, ULCON, ulcon); /* Mask and clear the interrupts */ exynos4210_write(uart, UINTM, UINTM_ALLI); exynos4210_write(uart, UINTP, UINTM_ALLI); /* reset FIFO */ exynos4210_write(uart, UFCON, UFCON_FIFO_RESET); /* TODO: Add timeout to avoid infinite loop */ while ( exynos4210_read(uart, UFCON) & UFCON_FIFO_RESET ) ; /* * Enable FIFO and set the trigger level of Tx FIFO * The trigger level is always set to b101, an interrupt will be * generated when data count of Tx FIFO is less than or equal to the * following value: * UART0 => 160 bytes * UART1 => 40 bytes * UART2 => 10 bytes * UART3 => 10 bytes */ exynos4210_write(uart, UFCON, UFCON_FIFO_TX_TRIGGER | UFCON_FIFO_EN); /* * Enable the UART for Rx and Tx * - Use only interrupt request * - Interrupts are level trigger * - Enable Rx timeout */ exynos4210_write(uart, UCON, UCON_RX_IRQ_LEVEL | UCON_TX_IRQ_LEVEL | UCON_RX_IRQ | UCON_TX_IRQ | UCON_RX_TIMEOUT); } static void __init exynos4210_uart_init_postirq(struct serial_port *port) { struct exynos4210_uart *uart = port->uart; int rc; uart->irqaction.handler = exynos4210_uart_interrupt; uart->irqaction.name = "exynos4210_uart"; uart->irqaction.dev_id = port; if ( (rc = setup_dt_irq(&uart->irq, &uart->irqaction)) != 0 ) dprintk(XENLOG_ERR, "Failed to allocated exynos4210_uart IRQ %d\n", uart->irq.irq); /* Unmask interrupts */ exynos4210_write(uart, UINTM, ~UINTM_ALLI); /* Clear pending interrupts */ exynos4210_write(uart, UINTP, UINTM_ALLI); /* Enable interrupts */ exynos4210_write(uart, UMCON, exynos4210_read(uart, UMCON) | UMCON_INT_EN); } static void exynos4210_uart_suspend(struct serial_port *port) { BUG(); // XXX } static void exynos4210_uart_resume(struct serial_port *port) { BUG(); // XXX } static int exynos4210_uart_tx_ready(struct serial_port *port) { struct exynos4210_uart *uart = port->uart; /* Tx fifo full ? 
*/ if ( exynos4210_read(uart, UFSTAT) & UFSTAT_TX_FULL ) return 0; else { uint32_t val = exynos4210_read(uart, UFSTAT); val = (val & UFSTAT_TX_COUNT_MASK) >> UFSTAT_TX_COUNT_SHIFT; /* XXX: Here we assume that we use UART 2/3, on the others * UART the buffer is bigger */ ASSERT(val >= 0 && val <= FIFO_MAX_SIZE); return (FIFO_MAX_SIZE - val); } } static void exynos4210_uart_putc(struct serial_port *port, char c) { struct exynos4210_uart *uart = port->uart; exynos4210_write(uart, UTXH, (uint32_t)(unsigned char)c); } static int exynos4210_uart_getc(struct serial_port *port, char *pc) { struct exynos4210_uart *uart = port->uart; uint32_t ufstat = exynos4210_read(uart, UFSTAT); uint32_t count; count = (ufstat & UFSTAT_RX_COUNT_MASK) >> UFSTAT_RX_COUNT_SHIFT; /* Check if Rx fifo is full or if the is something in it */ if ( ufstat & UFSTAT_RX_FULL || count ) { *pc = exynos4210_read(uart, URXH) & URXH_DATA_MASK; return 1; } else return 0; } static int __init exynos4210_uart_irq(struct serial_port *port) { struct exynos4210_uart *uart = port->uart; return uart->irq.irq; } static const struct dt_irq __init *exynos4210_uart_dt_irq(struct serial_port *port) { struct exynos4210_uart *uart = port->uart; return &uart->irq; } static const struct vuart_info *exynos4210_vuart_info(struct serial_port *port) { struct exynos4210_uart *uart = port->uart; return &uart->vuart; } static struct uart_driver __read_mostly exynos4210_uart_driver = { .init_preirq = exynos4210_uart_init_preirq, .init_postirq = exynos4210_uart_init_postirq, .endboot = NULL, .suspend = exynos4210_uart_suspend, .resume = exynos4210_uart_resume, .tx_ready = exynos4210_uart_tx_ready, .putc = exynos4210_uart_putc, .getc = exynos4210_uart_getc, .irq = exynos4210_uart_irq, .dt_irq_get = exynos4210_uart_dt_irq, .vuart_info = exynos4210_vuart_info, }; /* TODO: Parse UART config from the command line */ static int __init exynos4210_uart_init(struct dt_device_node *dev, const void *data) { const char *config = data; struct exynos4210_uart *uart; int res; u64 addr, size; if ( strcmp(config, "") ) { early_printk("WARNING: UART configuration is not supported\n"); } uart = &exynos4210_com; /* uart->clock_hz = 0x16e3600; */ uart->baud = BAUD_AUTO; uart->data_bits = 8; uart->parity = PARITY_NONE; uart->stop_bits = 1; res = dt_device_get_address(dev, 0, &addr, &size); if ( res ) { early_printk("exynos4210: Unable to retrieve the base" " address of the UART\n"); return res; } uart->regs = ioremap_nocache(addr, size); if ( !uart->regs ) { early_printk("exynos4210: Unable to map the UART memory\n"); return -ENOMEM; } res = dt_device_get_irq(dev, 0, &uart->irq); if ( res ) { early_printk("exynos4210: Unable to retrieve the IRQ\n"); return res; } uart->vuart.base_addr = addr; uart->vuart.size = size; uart->vuart.data_off = UTXH; uart->vuart.status_off = UTRSTAT; uart->vuart.status = UTRSTAT_TXE | UTRSTAT_TXFE; /* Register with generic serial driver. 
*/ serial_register_uart(SERHND_DTUART, &exynos4210_uart_driver, uart); dt_device_set_used_by(dev, DOMID_XEN); return 0; } static const char * const exynos4210_dt_compat[] __initconst = { "samsung,exynos4210-uart", NULL }; DT_DEVICE_START(exynos4210, "Exynos 4210 UART", DEVICE_SERIAL) .compatible = exynos4210_dt_compat, .init = exynos4210_uart_init, DT_DEVICE_END /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/char/console.c0000664000175000017500000007426012307313555015447 0ustar smbsmb/****************************************************************************** * console.c * * Emergency console I/O for Xen and the domain-0 guest OS. * * Copyright (c) 2002-2004, K A Fraser. * * Added printf_ratelimit * Taken from Linux - Author: Andi Kleen (net_ratelimit) * Ported to Xen - Steven Rostedt - Red Hat */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for do_console_io */ /* console: comma-separated list of console outputs. */ static char __initdata opt_console[30] = OPT_CONSOLE_STR; string_param("console", opt_console); /* conswitch: a character pair controlling console switching. */ /* Char 1: CTRL+ is used to switch console input between Xen and DOM0 */ /* Char 2: If this character is 'x', then do not auto-switch to DOM0 when it */ /* boots. Any other value, or omitting the char, enables auto-switch */ static unsigned char __read_mostly opt_conswitch[3] = "a"; string_param("conswitch", opt_conswitch); /* sync_console: force synchronous console output (useful for debugging). */ static bool_t __initdata opt_sync_console; boolean_param("sync_console", opt_sync_console); /* console_to_ring: send guest (incl. dom 0) console data to console ring. */ static bool_t __read_mostly opt_console_to_ring; boolean_param("console_to_ring", opt_console_to_ring); /* console_timestamps: include a timestamp prefix on every Xen console line. */ static bool_t __read_mostly opt_console_timestamps; boolean_param("console_timestamps", opt_console_timestamps); /* conring_size: allows a large console ring than default (16kB). */ static uint32_t __initdata opt_conring_size; size_param("conring_size", opt_conring_size); #define _CONRING_SIZE 16384 #define CONRING_IDX_MASK(i) ((i)&(conring_size-1)) static char __initdata _conring[_CONRING_SIZE]; static char *__read_mostly conring = _conring; static uint32_t __read_mostly conring_size = _CONRING_SIZE; static uint32_t conringc, conringp; static int __read_mostly sercon_handle = -1; static DEFINE_SPINLOCK(console_lock); /* * To control the amount of printing, thresholds are added. * These thresholds correspond to the XENLOG logging levels. * There's an upper and lower threshold for non-guest messages and for * guest-provoked messages. This works as follows, for a given log level L: * * L < lower_threshold : always logged * lower_threshold <= L < upper_threshold : rate-limited logging * upper_threshold <= L : never logged * * Note, in the above algorithm, to disable rate limiting simply make * the lower threshold equal to the upper. 
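 *
 * Worked example with the release (NDEBUG) defaults defined below: host
 * messages use lower = upper = 2, so ERR (0) and WARNING (1) always print
 * while INFO (2) and DEBUG (3) are never printed; guest messages use
 * lower = 0, upper = 2, so ERR and WARNING are rate-limited and INFO and
 * DEBUG are discarded.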
*/ #ifdef NDEBUG #define XENLOG_UPPER_THRESHOLD 2 /* Do not print INFO and DEBUG */ #define XENLOG_LOWER_THRESHOLD 2 /* Always print ERR and WARNING */ #define XENLOG_GUEST_UPPER_THRESHOLD 2 /* Do not print INFO and DEBUG */ #define XENLOG_GUEST_LOWER_THRESHOLD 0 /* Rate-limit ERR and WARNING */ #else #define XENLOG_UPPER_THRESHOLD 4 /* Do not discard anything */ #define XENLOG_LOWER_THRESHOLD 4 /* Print everything */ #define XENLOG_GUEST_UPPER_THRESHOLD 4 /* Do not discard anything */ #define XENLOG_GUEST_LOWER_THRESHOLD 4 /* Print everything */ #endif /* * The XENLOG_DEFAULT is the default given to printks that * do not have any print level associated with them. */ #define XENLOG_DEFAULT 1 /* XENLOG_WARNING */ #define XENLOG_GUEST_DEFAULT 1 /* XENLOG_WARNING */ static int __read_mostly xenlog_upper_thresh = XENLOG_UPPER_THRESHOLD; static int __read_mostly xenlog_lower_thresh = XENLOG_LOWER_THRESHOLD; static int __read_mostly xenlog_guest_upper_thresh = XENLOG_GUEST_UPPER_THRESHOLD; static int __read_mostly xenlog_guest_lower_thresh = XENLOG_GUEST_LOWER_THRESHOLD; static void parse_loglvl(char *s); static void parse_guest_loglvl(char *s); /* * := none|error|warning|info|debug|all * loglvl=[/] * : log level which is always printed * : log level which is rate-limit printed * Similar definitions for guest_loglvl, but applies to guest tracing. * Defaults: loglvl=warning ; guest_loglvl=none/warning */ custom_param("loglvl", parse_loglvl); custom_param("guest_loglvl", parse_guest_loglvl); static atomic_t print_everything = ATOMIC_INIT(0); #define ___parse_loglvl(s, ps, lvlstr, lvlnum) \ if ( !strncmp((s), (lvlstr), strlen(lvlstr)) ) { \ *(ps) = (s) + strlen(lvlstr); \ return (lvlnum); \ } static int __init __parse_loglvl(char *s, char **ps) { ___parse_loglvl(s, ps, "none", 0); ___parse_loglvl(s, ps, "error", 1); ___parse_loglvl(s, ps, "warning", 2); ___parse_loglvl(s, ps, "info", 3); ___parse_loglvl(s, ps, "debug", 4); ___parse_loglvl(s, ps, "all", 4); return 2; /* sane fallback */ } static void __init _parse_loglvl(char *s, int *lower, int *upper) { *lower = *upper = __parse_loglvl(s, &s); if ( *s == '/' ) *upper = __parse_loglvl(s+1, &s); if ( *upper < *lower ) *upper = *lower; } static void __init parse_loglvl(char *s) { _parse_loglvl(s, &xenlog_lower_thresh, &xenlog_upper_thresh); } static void __init parse_guest_loglvl(char *s) { _parse_loglvl(s, &xenlog_guest_lower_thresh, &xenlog_guest_upper_thresh); } static char * __init loglvl_str(int lvl) { switch ( lvl ) { case 0: return "Nothing"; case 1: return "Errors"; case 2: return "Errors and warnings"; case 3: return "Errors, warnings and info"; case 4: return "All"; } return "???"; } /* * ******************************************************** * *************** ACCESS TO CONSOLE RING ***************** * ******************************************************** */ static void conring_puts(const char *str) { char c; ASSERT(spin_is_locked(&console_lock)); while ( (c = *str++) != '\0' ) conring[CONRING_IDX_MASK(conringp++)] = c; if ( (uint32_t)(conringp - conringc) > conring_size ) conringc = conringp - conring_size; } long read_console_ring(struct xen_sysctl_readconsole *op) { XEN_GUEST_HANDLE_PARAM(char) str; uint32_t idx, len, max, sofar, c; str = guest_handle_cast(op->buffer, char), max = op->count; sofar = 0; c = conringc; if ( op->incremental && ((int32_t)(op->index - c) > 0) ) c = op->index; while ( (c != conringp) && (sofar < max) ) { idx = CONRING_IDX_MASK(c); len = conringp - c; if ( (idx + len) > conring_size ) len = conring_size - 
idx; if ( (sofar + len) > max ) len = max - sofar; if ( copy_to_guest_offset(str, sofar, &conring[idx], len) ) return -EFAULT; sofar += len; c += len; } if ( op->clear ) { spin_lock_irq(&console_lock); if ( (uint32_t)(conringp - c) > conring_size ) conringc = conringp - conring_size; else conringc = c; spin_unlock_irq(&console_lock); } op->count = sofar; op->index = c; return 0; } /* * ******************************************************* * *************** ACCESS TO SERIAL LINE ***************** * ******************************************************* */ /* Characters received over the serial line are buffered for domain 0. */ #define SERIAL_RX_SIZE 128 #define SERIAL_RX_MASK(_i) ((_i)&(SERIAL_RX_SIZE-1)) static char serial_rx_ring[SERIAL_RX_SIZE]; static unsigned int serial_rx_cons, serial_rx_prod; static void (*serial_steal_fn)(const char *); int console_steal(int handle, void (*fn)(const char *)) { if ( (handle == -1) || (handle != sercon_handle) ) return 0; if ( serial_steal_fn != NULL ) return -EBUSY; serial_steal_fn = fn; return 1; } void console_giveback(int id) { if ( id == 1 ) serial_steal_fn = NULL; } static void sercon_puts(const char *s) { if ( serial_steal_fn != NULL ) (*serial_steal_fn)(s); else serial_puts(sercon_handle, s); } static void dump_console_ring_key(unsigned char key) { uint32_t idx, len, sofar, c; unsigned int order; char *buf; printk("'%c' pressed -> dumping console ring buffer (dmesg)\n", key); /* create a buffer in which we'll copy the ring in the correct order and NUL terminate */ order = get_order_from_bytes(conring_size + 1); buf = alloc_xenheap_pages(order, 0); if ( buf == NULL ) { printk("unable to allocate memory!\n"); return; } c = conringc; sofar = 0; while ( (c != conringp) ) { idx = CONRING_IDX_MASK(c); len = conringp - c; if ( (idx + len) > conring_size ) len = conring_size - idx; memcpy(buf + sofar, &conring[idx], len); sofar += len; c += len; } buf[sofar] = '\0'; sercon_puts(buf); video_puts(buf); free_xenheap_pages(buf, order); } static struct keyhandler dump_console_ring_keyhandler = { .u.fn = dump_console_ring_key, .desc = "synchronously dump console ring buffer (dmesg)" }; /* CTRL- switches input direction between Xen and DOM0. */ #define switch_code (opt_conswitch[0]-'a'+1) static int __read_mostly xen_rx = 1; /* FALSE => input passed to domain 0. */ static void switch_serial_input(void) { static char *input_str[2] = { "DOM0", "Xen" }; xen_rx = !xen_rx; printk("*** Serial input -> %s", input_str[xen_rx]); if ( switch_code ) printk(" (type 'CTRL-%c' three times to switch input to %s)", opt_conswitch[0], input_str[!xen_rx]); printk("\n"); } static void __serial_rx(char c, struct cpu_user_regs *regs) { if ( xen_rx ) return handle_keypress(c, regs); /* Deliver input to guest buffer, unless it is already full. */ if ( (serial_rx_prod-serial_rx_cons) != SERIAL_RX_SIZE ) serial_rx_ring[SERIAL_RX_MASK(serial_rx_prod++)] = c; /* Always notify the guest: prevents receive path from getting stuck. */ send_global_virq(VIRQ_CONSOLE); } static void serial_rx(char c, struct cpu_user_regs *regs) { static int switch_code_count = 0; if ( switch_code && (c == switch_code) ) { /* We eat CTRL- in groups of 3 to switch console input. */ if ( ++switch_code_count == 3 ) { switch_serial_input(); switch_code_count = 0; } return; } for ( ; switch_code_count != 0; switch_code_count-- ) __serial_rx(switch_code, regs); /* Finally process the just-received character. 
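 * (Note that a partially typed switch sequence is not lost: the loop above
 * replays any swallowed switch characters before this non-switch character,
 * so they are still delivered to the current input owner, Xen or the guest.)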
*/ __serial_rx(c, regs); } static void notify_dom0_con_ring(unsigned long unused) { send_global_virq(VIRQ_CON_RING); } static DECLARE_SOFTIRQ_TASKLET(notify_dom0_con_ring_tasklet, notify_dom0_con_ring, 0); static long guest_console_write(XEN_GUEST_HANDLE_PARAM(char) buffer, int count) { char kbuf[128]; int kcount; struct domain *cd = current->domain; while ( count > 0 ) { if ( hypercall_preempt_check() ) return hypercall_create_continuation( __HYPERVISOR_console_io, "iih", CONSOLEIO_write, count, buffer); kcount = min_t(int, count, sizeof(kbuf)-1); if ( copy_from_guest(kbuf, buffer, kcount) ) return -EFAULT; kbuf[kcount] = '\0'; if ( is_hardware_domain(cd) ) { /* Use direct console output as it could be interactive */ spin_lock_irq(&console_lock); sercon_puts(kbuf); video_puts(kbuf); if ( opt_console_to_ring ) { conring_puts(kbuf); tasklet_schedule(¬ify_dom0_con_ring_tasklet); } spin_unlock_irq(&console_lock); } else { char *kin = kbuf, *kout = kbuf, c; /* Strip non-printable characters */ for ( ; ; ) { c = *kin++; if ( c == '\0' || c == '\n' ) break; if ( isprint(c) || c == '\t' ) *kout++ = c; } *kout = '\0'; spin_lock(&cd->pbuf_lock); if ( c == '\n' ) { kcount = kin - kbuf; cd->pbuf[cd->pbuf_idx] = '\0'; guest_printk(cd, XENLOG_G_DEBUG "%s%s\n", cd->pbuf, kbuf); cd->pbuf_idx = 0; } else if ( cd->pbuf_idx + kcount < (DOMAIN_PBUF_SIZE - 1) ) { /* buffer the output until a newline */ memcpy(cd->pbuf + cd->pbuf_idx, kbuf, kcount); cd->pbuf_idx += kcount; } else { cd->pbuf[cd->pbuf_idx] = '\0'; guest_printk(cd, XENLOG_G_DEBUG "%s%s\n", cd->pbuf, kbuf); cd->pbuf_idx = 0; } spin_unlock(&cd->pbuf_lock); } guest_handle_add_offset(buffer, kcount); count -= kcount; } return 0; } long do_console_io(int cmd, int count, XEN_GUEST_HANDLE_PARAM(char) buffer) { long rc; unsigned int idx, len; rc = xsm_console_io(XSM_OTHER, current->domain, cmd); if ( rc ) return rc; switch ( cmd ) { case CONSOLEIO_write: rc = guest_console_write(buffer, count); break; case CONSOLEIO_read: rc = 0; while ( (serial_rx_cons != serial_rx_prod) && (rc < count) ) { idx = SERIAL_RX_MASK(serial_rx_cons); len = serial_rx_prod - serial_rx_cons; if ( (idx + len) > SERIAL_RX_SIZE ) len = SERIAL_RX_SIZE - idx; if ( (rc + len) > count ) len = count - rc; if ( copy_to_guest_offset(buffer, rc, &serial_rx_ring[idx], len) ) { rc = -EFAULT; break; } rc += len; serial_rx_cons += len; } break; default: rc = -ENOSYS; break; } return rc; } /* * ***************************************************** * *************** GENERIC CONSOLE I/O ***************** * ***************************************************** */ static bool_t console_locks_busted; static void __putstr(const char *str) { ASSERT(spin_is_locked(&console_lock)); sercon_puts(str); video_puts(str); conring_puts(str); if ( !console_locks_busted ) tasklet_schedule(¬ify_dom0_con_ring_tasklet); } static int printk_prefix_check(char *p, char **pp) { int loglvl = -1; int upper_thresh = xenlog_upper_thresh; int lower_thresh = xenlog_lower_thresh; while ( (p[0] == '<') && (p[1] != '\0') && (p[2] == '>') ) { switch ( p[1] ) { case 'G': upper_thresh = xenlog_guest_upper_thresh; lower_thresh = xenlog_guest_lower_thresh; if ( loglvl == -1 ) loglvl = XENLOG_GUEST_DEFAULT; break; case '0' ... 
'3': loglvl = p[1] - '0'; break; } p += 3; } if ( loglvl == -1 ) loglvl = XENLOG_DEFAULT; *pp = p; return ((atomic_read(&print_everything) != 0) || (loglvl < lower_thresh) || ((loglvl < upper_thresh) && printk_ratelimit())); } static void printk_start_of_line(const char *prefix) { struct tm tm; char tstr[32]; __putstr(prefix); if ( !opt_console_timestamps ) return; tm = wallclock_time(); if ( tm.tm_mday == 0 ) return; snprintf(tstr, sizeof(tstr), "[%04u-%02u-%02u %02u:%02u:%02u] ", 1900 + tm.tm_year, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); __putstr(tstr); } static void vprintk_common(const char *prefix, const char *fmt, va_list args) { static char buf[1024]; static int start_of_line = 1, do_print; char *p, *q; unsigned long flags; /* console_lock can be acquired recursively from __printk_ratelimit(). */ local_irq_save(flags); spin_lock_recursive(&console_lock); (void)vsnprintf(buf, sizeof(buf), fmt, args); p = buf; while ( (q = strchr(p, '\n')) != NULL ) { *q = '\0'; if ( start_of_line ) do_print = printk_prefix_check(p, &p); if ( do_print ) { if ( start_of_line ) printk_start_of_line(prefix); __putstr(p); __putstr("\n"); } start_of_line = 1; p = q + 1; } if ( *p != '\0' ) { if ( start_of_line ) do_print = printk_prefix_check(p, &p); if ( do_print ) { if ( start_of_line ) printk_start_of_line(prefix); __putstr(p); } start_of_line = 0; } spin_unlock_recursive(&console_lock); local_irq_restore(flags); } void printk(const char *fmt, ...) { va_list args; va_start(args, fmt); vprintk_common("(XEN) ", fmt, args); va_end(args); } void guest_printk(const struct domain *d, const char *fmt, ...) { va_list args; char prefix[16]; snprintf(prefix, sizeof(prefix), "(d%d) ", d->domain_id); va_start(args, fmt); vprintk_common(prefix, fmt, args); va_end(args); } void __init console_init_preirq(void) { char *p; int sh; serial_init_preirq(); /* Where should console output go? */ for ( p = opt_console; p != NULL; p = strchr(p, ',') ) { if ( *p == ',' ) p++; if ( !strncmp(p, "vga", 3) ) video_init(); else if ( !strncmp(p, "none", 4) ) continue; else if ( (sh = serial_parse_handle(p)) >= 0 ) sercon_handle = sh; else { char *q = strchr(p, ','); if ( q != NULL ) *q = '\0'; printk("Bad console= option '%s'\n", p); if ( q != NULL ) *q = ','; } } serial_set_rx_handler(sercon_handle, serial_rx); /* HELLO WORLD --- start-of-day banner text. */ spin_lock(&console_lock); __putstr(xen_banner()); spin_unlock(&console_lock); printk("Xen version %d.%d%s (%s@%s) (%s) debug=%c %s\n", xen_major_version(), xen_minor_version(), xen_extra_version(), xen_compile_by(), xen_compile_domain(), xen_compiler(), debug_build() ? 'y' : 'n', xen_compile_date()); printk("Latest ChangeSet: %s\n", xen_changeset()); if ( opt_sync_console ) { serial_start_sync(sercon_handle); add_taint(TAINT_SYNC_CONSOLE); printk("Console output is synchronous.\n"); } } void __init console_init_postirq(void) { char *ring; unsigned int i, order, memflags; serial_init_postirq(); if ( !opt_conring_size ) opt_conring_size = num_present_cpus() << (9 + xenlog_lower_thresh); order = get_order_from_bytes(max(opt_conring_size, conring_size)); memflags = MEMF_bits(crashinfo_maxaddr_bits); while ( (ring = alloc_xenheap_pages(order, memflags)) == NULL ) { BUG_ON(order == 0); order--; } opt_conring_size = PAGE_SIZE << order; spin_lock_irq(&console_lock); for ( i = conringc ; i != conringp; i++ ) ring[i & (opt_conring_size - 1)] = conring[i & (conring_size - 1)]; conring = ring; smp_wmb(); /* Allow users of console_force_unlock() to see larger buffer. 
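 * (The barrier orders the two stores: a reader that observes the enlarged
 * conring_size should then also observe the new conring pointer, so that
 * the larger index mask is never applied to the old 16 KiB static ring.)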
*/ conring_size = opt_conring_size; spin_unlock_irq(&console_lock); printk("Allocated console ring of %u KiB.\n", opt_conring_size >> 10); } void __init console_endboot(void) { int i, j; printk("Std. Loglevel: %s", loglvl_str(xenlog_lower_thresh)); if ( xenlog_upper_thresh != xenlog_lower_thresh ) printk(" (Rate-limited: %s)", loglvl_str(xenlog_upper_thresh)); printk("\nGuest Loglevel: %s", loglvl_str(xenlog_guest_lower_thresh)); if ( xenlog_guest_upper_thresh != xenlog_guest_lower_thresh ) printk(" (Rate-limited: %s)", loglvl_str(xenlog_guest_upper_thresh)); printk("\n"); if ( opt_sync_console ) { printk("**********************************************\n"); printk("******* WARNING: CONSOLE OUTPUT IS SYNCHRONOUS\n"); printk("******* This option is intended to aid debugging " "of Xen by ensuring\n"); printk("******* that all output is synchronously delivered " "on the serial line.\n"); printk("******* However it can introduce SIGNIFICANT latencies " "and affect\n"); printk("******* timekeeping. It is NOT recommended for " "production use!\n"); printk("**********************************************\n"); for ( i = 0; i < 3; i++ ) { printk("%d... ", 3-i); for ( j = 0; j < 100; j++ ) { process_pending_softirqs(); mdelay(10); } } printk("\n"); } video_endboot(); /* * If user specifies so, we fool the switch routine to redirect input * straight back to Xen. I use this convoluted method so we still print * a useful 'how to switch' message. */ if ( opt_conswitch[1] == 'x' ) xen_rx = !xen_rx; register_keyhandler('w', &dump_console_ring_keyhandler); /* Serial input is directed to DOM0 by default. */ switch_serial_input(); } int __init console_has(const char *device) { char *p; for ( p = opt_console; p != NULL; p = strchr(p, ',') ) { if ( *p == ',' ) p++; if ( strncmp(p, device, strlen(device)) == 0 ) return 1; } return 0; } void console_start_log_everything(void) { serial_start_log_everything(sercon_handle); atomic_inc(&print_everything); } void console_end_log_everything(void) { serial_end_log_everything(sercon_handle); atomic_dec(&print_everything); } void console_force_unlock(void) { watchdog_disable(); spin_lock_init(&console_lock); serial_force_unlock(sercon_handle); console_locks_busted = 1; console_start_sync(); } void console_start_sync(void) { atomic_inc(&print_everything); serial_start_sync(sercon_handle); } void console_end_sync(void) { serial_end_sync(sercon_handle); atomic_dec(&print_everything); } /* * printk rate limiting, lifted from Linux. * * This enforces a rate limit: not more than one kernel message * every printk_ratelimit_ms (millisecs). */ int __printk_ratelimit(int ratelimit_ms, int ratelimit_burst) { static DEFINE_SPINLOCK(ratelimit_lock); static unsigned long toks = 10 * 5 * 1000; static unsigned long last_msg; static int missed; unsigned long flags; unsigned long long now = NOW(); /* ns */ unsigned long ms; do_div(now, 1000000); ms = (unsigned long)now; spin_lock_irqsave(&ratelimit_lock, flags); toks += ms - last_msg; last_msg = ms; if ( toks > (ratelimit_burst * ratelimit_ms)) toks = ratelimit_burst * ratelimit_ms; if ( toks >= ratelimit_ms ) { int lost = missed; missed = 0; toks -= ratelimit_ms; spin_unlock(&ratelimit_lock); if ( lost ) { char lost_str[8]; snprintf(lost_str, sizeof(lost_str), "%d", lost); /* console_lock may already be acquired by printk(). 
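* This function is reached from printk() itself via printk_prefix_check()
* -> printk_ratelimit(), hence the recursive lock below.  With the default
* settings further down (5000 ms interval, burst of 10) up to ten messages
* pass unthrottled, after which output drops to roughly one message every
* five seconds.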
*/ spin_lock_recursive(&console_lock); printk_start_of_line("(XEN) "); __putstr("printk: "); __putstr(lost_str); __putstr(" messages suppressed.\n"); spin_unlock_recursive(&console_lock); } local_irq_restore(flags); return 1; } missed++; spin_unlock_irqrestore(&ratelimit_lock, flags); return 0; } /* minimum time in ms between messages */ static int __read_mostly printk_ratelimit_ms = 5 * 1000; /* number of messages we send before ratelimiting */ static int __read_mostly printk_ratelimit_burst = 10; int printk_ratelimit(void) { return __printk_ratelimit(printk_ratelimit_ms, printk_ratelimit_burst); } /* * ************************************************************** * *************** Serial console ring buffer ******************* * ************************************************************** */ #ifdef DEBUG_TRACE_DUMP /* Send output direct to console, or buffer it? */ static volatile int debugtrace_send_to_console; static char *debugtrace_buf; /* Debug-trace buffer */ static unsigned int debugtrace_prd; /* Producer index */ static unsigned int debugtrace_kilobytes = 128, debugtrace_bytes; static unsigned int debugtrace_used; static DEFINE_SPINLOCK(debugtrace_lock); integer_param("debugtrace", debugtrace_kilobytes); static void debugtrace_dump_worker(void) { if ( (debugtrace_bytes == 0) || !debugtrace_used ) return; printk("debugtrace_dump() starting\n"); /* Print oldest portion of the ring. */ ASSERT(debugtrace_buf[debugtrace_bytes - 1] == 0); sercon_puts(&debugtrace_buf[debugtrace_prd]); /* Print youngest portion of the ring. */ debugtrace_buf[debugtrace_prd] = '\0'; sercon_puts(&debugtrace_buf[0]); memset(debugtrace_buf, '\0', debugtrace_bytes); printk("debugtrace_dump() finished\n"); } static void debugtrace_toggle(void) { unsigned long flags; watchdog_disable(); spin_lock_irqsave(&debugtrace_lock, flags); /* * Dump the buffer *before* toggling, in case the act of dumping the * buffer itself causes more printk() invocations. */ printk("debugtrace_printk now writing to %s.\n", !debugtrace_send_to_console ? "console": "buffer"); if ( !debugtrace_send_to_console ) debugtrace_dump_worker(); debugtrace_send_to_console = !debugtrace_send_to_console; spin_unlock_irqrestore(&debugtrace_lock, flags); watchdog_enable(); } void debugtrace_dump(void) { unsigned long flags; watchdog_disable(); spin_lock_irqsave(&debugtrace_lock, flags); debugtrace_dump_worker(); spin_unlock_irqrestore(&debugtrace_lock, flags); watchdog_enable(); } void debugtrace_printk(const char *fmt, ...) { static char buf[1024]; static u32 count; va_list args; char *p; unsigned long flags; if ( debugtrace_bytes == 0 ) return; debugtrace_used = 1; spin_lock_irqsave(&debugtrace_lock, flags); ASSERT(debugtrace_buf[debugtrace_bytes - 1] == 0); snprintf(buf, sizeof(buf), "%u ", ++count); va_start(args, fmt); (void)vsnprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), fmt, args); va_end(args); if ( debugtrace_send_to_console ) { serial_puts(sercon_handle, buf); } else { for ( p = buf; *p != '\0'; p++ ) { debugtrace_buf[debugtrace_prd++] = *p; /* Always leave a nul byte at the end of the buffer. */ if ( debugtrace_prd == (debugtrace_bytes - 1) ) debugtrace_prd = 0; } } spin_unlock_irqrestore(&debugtrace_lock, flags); } static void debugtrace_key(unsigned char key) { debugtrace_toggle(); } static struct keyhandler debugtrace_keyhandler = { .u.fn = debugtrace_key, .desc = "toggle debugtrace to console/buffer" }; static int __init debugtrace_init(void) { int order; unsigned int kbytes, bytes; /* Round size down to next power of two. 
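* For example "debugtrace=200" ends up as a 128 KiB buffer: each pass of
* x & (x - 1) clears the lowest set bit (200 -> 192 -> 128) until only a
* single bit, i.e. a power of two, remains.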
*/ while ( (kbytes = (debugtrace_kilobytes & (debugtrace_kilobytes-1))) != 0 ) debugtrace_kilobytes = kbytes; bytes = debugtrace_kilobytes << 10; if ( bytes == 0 ) return 0; order = get_order_from_bytes(bytes); debugtrace_buf = alloc_xenheap_pages(order, 0); ASSERT(debugtrace_buf != NULL); memset(debugtrace_buf, '\0', bytes); debugtrace_bytes = bytes; register_keyhandler('T', &debugtrace_keyhandler); return 0; } __initcall(debugtrace_init); #endif /* !NDEBUG */ /* * ************************************************************** * *************** Debugging/tracing/error-report *************** * ************************************************************** */ void panic(const char *fmt, ...) { va_list args; unsigned long flags; static DEFINE_SPINLOCK(lock); static char buf[128]; debugtrace_dump(); /* Protects buf[] and ensure multi-line message prints atomically. */ spin_lock_irqsave(&lock, flags); va_start(args, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); console_start_sync(); printk("\n****************************************\n"); printk("Panic on CPU %d:\n", smp_processor_id()); printk("%s\n", buf); printk("****************************************\n\n"); if ( opt_noreboot ) printk("Manual reset required ('noreboot' specified)\n"); else printk("Reboot in five seconds...\n"); spin_unlock_irqrestore(&lock, flags); debugger_trap_immediate(); #ifdef CONFIG_KEXEC kexec_crash(); #endif if ( opt_noreboot ) { machine_halt(); } else { watchdog_disable(); machine_restart(5000); } } void __bug(char *file, int line) { console_start_sync(); printk("Xen BUG at %s:%d\n", file, line); dump_execution_state(); panic("Xen BUG at %s:%d", file, line); for ( ; ; ) ; } void __warn(char *file, int line) { printk("Xen WARN at %s:%d\n", file, line); dump_execution_state(); } /* * ************************************************************** * ****************** Console suspend/resume ******************** * ************************************************************** */ static void suspend_steal_fn(const char *str) { } static int suspend_steal_id; int console_suspend(void) { suspend_steal_id = console_steal(sercon_handle, suspend_steal_fn); serial_suspend(); return 0; } int console_resume(void) { serial_resume(); console_giveback(suspend_steal_id); return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/char/dt-uart.c0000664000175000017500000000370412307313555015360 0ustar smbsmb/* * xen/drivers/char/dt-uart.c * * Generic uart retrieved via the device tree * * Julien Grall * Copyright (c) 2013 Linaro Limited. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include /* * Configure UART port with a string: * path,options * * @path: full path used in the device tree for the UART. If the path * doesn't start with '/', we assuming that it's an alias. 
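* For example, something like "dtuart=serial0" would pick the UART behind
* the "serial0" alias, while a full path such as "dtuart=/smb/uart@1e000000"
* names the node directly (illustrative values only).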
* @options: UART speficic options (see in each UART driver) */ static char __initdata opt_dtuart[30] = ""; string_param("dtuart", opt_dtuart); void __init dt_uart_init(void) { struct dt_device_node *dev; int ret; const char *devpath = opt_dtuart; char *options; if ( !console_has("dtuart") || !strcmp(opt_dtuart, "") ) { early_printk("No console\n"); return; } options = strchr(opt_dtuart, ','); if ( options != NULL ) *(options++) = '\0'; else options = ""; early_printk("Looking for UART console %s\n", devpath); if ( *devpath == '/' ) dev = dt_find_node_by_path(devpath); else dev = dt_find_node_by_alias(devpath); if ( !dev ) { early_printk("Unable to find device \"%s\"\n", devpath); return; } ret = device_init(dev, DEVICE_SERIAL, options); if ( ret ) early_printk("Unable to initialize serial: %d\n", ret); } xen-4.4.0/xen/drivers/char/pl011.c0000664000175000017500000001726512307313555014644 0ustar smbsmb/* * xen/drivers/char/pl011.c * * Driver for ARM PrimeCell PL011 UART. * * Tim Deegan * Copyright (c) 2011 Citrix Systems. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include #include #include #include #include static struct pl011 { unsigned int baud, clock_hz, data_bits, parity, stop_bits; struct dt_irq irq; void __iomem *regs; /* UART with IRQ line: interrupt-driven I/O. */ struct irqaction irqaction; struct vuart_info vuart; /* /\* UART with no IRQ line: periodically-polled I/O. *\/ */ /* struct timer timer; */ /* unsigned int timeout_ms; */ /* bool_t probing, intr_works; */ } pl011_com = {0}; /* These parity settings can be ORed directly into the LCR. */ #define PARITY_NONE (0) #define PARITY_ODD (PEN) #define PARITY_EVEN (PEN|EPS) #define PARITY_MARK (PEN|SPS) #define PARITY_SPACE (PEN|EPS|SPS) #define pl011_read(uart, off) readl((uart)->regs + (off)) #define pl011_write(uart, off,val) writel((val), (uart)->regs + (off)) static void pl011_interrupt(int irq, void *data, struct cpu_user_regs *regs) { struct serial_port *port = data; struct pl011 *uart = port->uart; unsigned int status = pl011_read(uart, MIS); if ( status ) { do { pl011_write(uart, ICR, status & ~(TXI|RTI|RXI)); if ( status & (RTI|RXI) ) serial_rx_interrupt(port, regs); /* TODO if ( status & (DSRMI|DCDMI|CTSMI|RIMI) ) ... */ if ( status & (TXI) ) serial_tx_interrupt(port, regs); status = pl011_read(uart, MIS); } while (status != 0); } } static void __init pl011_init_preirq(struct serial_port *port) { struct pl011 *uart = port->uart; unsigned int divisor; unsigned int cr; /* No interrupts, please. */ pl011_write(uart, IMSC, 0); /* Definitely no DMA */ pl011_write(uart, DMACR, 0x0); /* Line control and baud-rate generator. */ if ( uart->baud != BAUD_AUTO ) { /* Baud rate specified: program it into the divisor latch. */ divisor = (uart->clock_hz << 2) / uart->baud; /* clk << 6 / bd << 4 */ pl011_write(uart, FBRD, divisor & 0x3f); pl011_write(uart, IBRD, divisor >> 6); } else { /* Baud rate already set: read it out from the divisor latch. 
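* The divisor is a 16.6 fixed-point value: IBRD holds the integer part and
* FBRD the fraction in 1/64ths, so baud = 4 * clock_hz / divisor.  With the
* 24 MHz (0x16e3600) clock assumed later in this file, 115200 baud would
* correspond to a divisor of 833, i.e. IBRD 13 and FBRD 1.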
*/ divisor = (pl011_read(uart, IBRD) << 6) | (pl011_read(uart, FBRD)); if (!divisor) early_panic("pl011: No Baud rate configured\n"); uart->baud = (uart->clock_hz << 2) / divisor; } /* This write must follow FBRD and IBRD writes. */ pl011_write(uart, LCR_H, (uart->data_bits - 5) << 5 | FEN | ((uart->stop_bits - 1) << 3) | uart->parity); /* Clear errors */ pl011_write(uart, RSR, 0); /* Mask and clear the interrupts */ pl011_write(uart, IMSC, 0); pl011_write(uart, ICR, ALLI); /* Enable the UART for RX and TX; keep RTS and DTR */ cr = pl011_read(uart, CR); cr &= RTS | DTR; pl011_write(uart, CR, cr | RXE | TXE | UARTEN); } static void __init pl011_init_postirq(struct serial_port *port) { struct pl011 *uart = port->uart; int rc; if ( uart->irq.irq > 0 ) { uart->irqaction.handler = pl011_interrupt; uart->irqaction.name = "pl011"; uart->irqaction.dev_id = port; if ( (rc = setup_dt_irq(&uart->irq, &uart->irqaction)) != 0 ) printk("ERROR: Failed to allocate pl011 IRQ %d\n", uart->irq.irq); } /* Clear pending error interrupts */ pl011_write(uart, ICR, OEI|BEI|PEI|FEI); /* Unmask interrupts */ pl011_write(uart, IMSC, RTI|OEI|BEI|PEI|FEI|TXI|RXI); } static void pl011_suspend(struct serial_port *port) { BUG(); // XXX } static void pl011_resume(struct serial_port *port) { BUG(); // XXX } static int pl011_tx_ready(struct serial_port *port) { struct pl011 *uart = port->uart; return ((pl011_read(uart, FR) & TXFE) ? 16 : 0); } static void pl011_putc(struct serial_port *port, char c) { struct pl011 *uart = port->uart; pl011_write(uart, DR, (uint32_t)(unsigned char)c); } static int pl011_getc(struct serial_port *port, char *pc) { struct pl011 *uart = port->uart; if ( pl011_read(uart, FR) & RXFE ) return 0; *pc = pl011_read(uart, DR) & 0xff; return 1; } static int __init pl011_irq(struct serial_port *port) { struct pl011 *uart = port->uart; return ((uart->irq.irq > 0) ? 
uart->irq.irq : -1); } static const struct dt_irq __init *pl011_dt_irq(struct serial_port *port) { struct pl011 *uart = port->uart; return &uart->irq; } static const struct vuart_info *pl011_vuart(struct serial_port *port) { struct pl011 *uart = port->uart; return &uart->vuart; } static struct uart_driver __read_mostly pl011_driver = { .init_preirq = pl011_init_preirq, .init_postirq = pl011_init_postirq, .endboot = NULL, .suspend = pl011_suspend, .resume = pl011_resume, .tx_ready = pl011_tx_ready, .putc = pl011_putc, .getc = pl011_getc, .irq = pl011_irq, .dt_irq_get = pl011_dt_irq, .vuart_info = pl011_vuart, }; /* TODO: Parse UART config from the command line */ static int __init pl011_uart_init(struct dt_device_node *dev, const void *data) { const char *config = data; struct pl011 *uart; int res; u64 addr, size; if ( strcmp(config, "") ) { early_printk("WARNING: UART configuration is not supported\n"); } uart = &pl011_com; uart->clock_hz = 0x16e3600; uart->baud = BAUD_AUTO; uart->data_bits = 8; uart->parity = PARITY_NONE; uart->stop_bits = 1; res = dt_device_get_address(dev, 0, &addr, &size); if ( res ) { early_printk("pl011: Unable to retrieve the base" " address of the UART\n"); return res; } uart->regs = ioremap_attr(addr, size, PAGE_HYPERVISOR_NOCACHE); if ( !uart->regs ) { early_printk("pl011: Unable to map the UART memory\n"); return -ENOMEM; } res = dt_device_get_irq(dev, 0, &uart->irq); if ( res ) { early_printk("pl011: Unable to retrieve the IRQ\n"); return res; } uart->vuart.base_addr = addr; uart->vuart.size = size; uart->vuart.data_off = DR; uart->vuart.status_off = FR; uart->vuart.status = 0; /* Register with generic serial driver. */ serial_register_uart(SERHND_DTUART, &pl011_driver, uart); dt_device_set_used_by(dev, DOMID_XEN); return 0; } static const char * const pl011_dt_compat[] __initconst = { "arm,pl011", NULL }; DT_DEVICE_START(pl011, "PL011 UART", DEVICE_SERIAL) .compatible = pl011_dt_compat, .init = pl011_uart_init, DT_DEVICE_END /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/char/ns16550.c0000664000175000017500000005654612307313555015035 0ustar smbsmb/****************************************************************************** * ns16550.c * * Driver for 16550-series UARTs. This driver is to be kept within Xen as * it permits debugging of seriously-toasted machines (e.g., in situations * where a device driver within a guest OS would be inaccessible). * * Copyright (c) 2003-2005, K A Fraser */ #include #include #include #include #include #include #include #include #ifdef HAS_PCI #include #include #endif #include #include #include #ifdef HAS_DEVICE_TREE #include #endif #ifdef CONFIG_X86 #include #endif /* * Configure serial port with a string: * [/][,DPS[,[,[,[,]]]]]. * The tail of the string can be omitted if platform defaults are sufficient. * If the baud rate is pre-configured, perhaps by a bootloader, then 'auto' * can be specified in place of a numeric baud rate. Polled mode is specified * by requesting irq 0. */ static char __initdata opt_com1[30] = ""; static char __initdata opt_com2[30] = ""; string_param("com1", opt_com1); string_param("com2", opt_com2); static struct ns16550 { int baud, clock_hz, data_bits, parity, stop_bits, fifo_size, irq; u64 io_base; /* I/O port or memory-mapped I/O address. 
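* Values below 0x10000 are treated as legacy I/O ports, anything larger as
* a physical MMIO address that gets remapped before use (see
* ns16550_init_preirq() and check_existence() below).  E.g. a typical
* "com1=115200,8n1,0x3f8,4" yields an io_base of 0x3f8 and irq 4.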
*/ u32 io_size; int reg_shift; /* Bits to shift register offset by */ int reg_width; /* Size of access to use, the registers * themselves are still bytes */ char __iomem *remapped_io_base; /* Remapped virtual address of MMIO. */ /* UART with IRQ line: interrupt-driven I/O. */ struct irqaction irqaction; #ifdef CONFIG_ARM struct vuart_info vuart; #endif /* UART with no IRQ line: periodically-polled I/O. */ struct timer timer; struct timer resume_timer; unsigned int timeout_ms; bool_t intr_works; bool_t dw_usr_bsy; #ifdef HAS_PCI /* PCI card parameters. */ unsigned int pb_bdf[3]; /* pci bridge BDF */ unsigned int ps_bdf[3]; /* pci serial port BDF */ bool_t pb_bdf_enable; /* if =1, pb-bdf effective, port behind bridge */ bool_t ps_bdf_enable; /* if =1, ps_bdf effective, port on pci card */ u32 bar; u16 cr; u8 bar_idx; #endif #ifdef HAS_DEVICE_TREE struct dt_irq dt_irq; #endif } ns16550_com[2] = { { 0 } }; static void ns16550_delayed_resume(void *data); static char ns_read_reg(struct ns16550 *uart, int reg) { void __iomem *addr = uart->remapped_io_base + (reg << uart->reg_shift); #ifdef HAS_IOPORTS if ( uart->remapped_io_base == NULL ) return inb(uart->io_base + reg); #endif switch ( uart->reg_width ) { case 1: return readb(addr); case 4: return readl(addr); default: return 0xff; } } static void ns_write_reg(struct ns16550 *uart, int reg, char c) { void __iomem *addr = uart->remapped_io_base + (reg << uart->reg_shift); #ifdef HAS_IOPORTS if ( uart->remapped_io_base == NULL ) return outb(c, uart->io_base + reg); #endif switch ( uart->reg_width ) { case 1: writeb(c, addr); break; case 4: writel(c, addr); break; default: /* Ignored */ break; } } static int ns16550_ioport_invalid(struct ns16550 *uart) { return (unsigned char)ns_read_reg(uart, UART_IER) == 0xff; } static void ns16550_interrupt( int irq, void *dev_id, struct cpu_user_regs *regs) { struct serial_port *port = dev_id; struct ns16550 *uart = port->uart; uart->intr_works = 1; while ( !(ns_read_reg(uart, UART_IIR) & UART_IIR_NOINT) ) { char lsr = ns_read_reg(uart, UART_LSR); if ( lsr & UART_LSR_THRE ) serial_tx_interrupt(port, regs); if ( lsr & UART_LSR_DR ) serial_rx_interrupt(port, regs); } } /* Safe: ns16550_poll() runs as softirq so not reentrant on a given CPU. */ static DEFINE_PER_CPU(struct serial_port *, poll_port); static void __ns16550_poll(struct cpu_user_regs *regs) { struct serial_port *port = this_cpu(poll_port); struct ns16550 *uart = port->uart; if ( uart->intr_works ) return; /* Interrupts work - no more polling */ while ( ns_read_reg(uart, UART_LSR) & UART_LSR_DR ) { if ( ns16550_ioport_invalid(uart) ) goto out; serial_rx_interrupt(port, regs); } if ( ns_read_reg(uart, UART_LSR) & UART_LSR_THRE ) serial_tx_interrupt(port, regs); out: set_timer(&uart->timer, NOW() + MILLISECS(uart->timeout_ms)); } static void ns16550_poll(void *data) { this_cpu(poll_port) = data; #ifdef run_in_exception_handler run_in_exception_handler(__ns16550_poll); #else __ns16550_poll(guest_cpu_user_regs()); #endif } static int ns16550_tx_ready(struct serial_port *port) { struct ns16550 *uart = port->uart; if ( ns16550_ioport_invalid(uart) ) return -EIO; return ns_read_reg(uart, UART_LSR) & UART_LSR_THRE ? 
uart->fifo_size : 0; } static void ns16550_putc(struct serial_port *port, char c) { struct ns16550 *uart = port->uart; ns_write_reg(uart, UART_THR, c); } static int ns16550_getc(struct serial_port *port, char *pc) { struct ns16550 *uart = port->uart; if ( ns16550_ioport_invalid(uart) || !(ns_read_reg(uart, UART_LSR) & UART_LSR_DR) ) return 0; *pc = ns_read_reg(uart, UART_RBR); return 1; } static void pci_serial_early_init(struct ns16550 *uart) { #ifdef HAS_PCI if ( !uart->ps_bdf_enable || uart->io_base >= 0x10000 ) return; if ( uart->pb_bdf_enable ) pci_conf_write16(0, uart->pb_bdf[0], uart->pb_bdf[1], uart->pb_bdf[2], PCI_IO_BASE, (uart->io_base & 0xF000) | ((uart->io_base & 0xF000) >> 8)); pci_conf_write32(0, uart->ps_bdf[0], uart->ps_bdf[1], uart->ps_bdf[2], PCI_BASE_ADDRESS_0, uart->io_base | PCI_BASE_ADDRESS_SPACE_IO); pci_conf_write16(0, uart->ps_bdf[0], uart->ps_bdf[1], uart->ps_bdf[2], PCI_COMMAND, PCI_COMMAND_IO); #endif } static void ns16550_setup_preirq(struct ns16550 *uart) { unsigned char lcr; unsigned int divisor; uart->intr_works = 0; pci_serial_early_init(uart); lcr = (uart->data_bits - 5) | ((uart->stop_bits - 1) << 2) | uart->parity; /* No interrupts. */ ns_write_reg(uart, UART_IER, 0); if ( uart->dw_usr_bsy && (ns_read_reg(uart, UART_IIR) & UART_IIR_BSY) == UART_IIR_BSY ) { /* DesignWare 8250 detects if LCR is written while the UART is * busy and raises a "busy detect" interrupt. Read the UART * Status Register to clear this state. */ ns_read_reg(uart, UART_USR); } /* Line control and baud-rate generator. */ ns_write_reg(uart, UART_LCR, lcr | UART_LCR_DLAB); if ( uart->baud != BAUD_AUTO ) { /* Baud rate specified: program it into the divisor latch. */ divisor = uart->clock_hz / (uart->baud << 4); ns_write_reg(uart, UART_DLL, (char)divisor); ns_write_reg(uart, UART_DLM, (char)(divisor >> 8)); } else { /* Baud rate already set: read it out from the divisor latch. */ divisor = ns_read_reg(uart, UART_DLL); divisor |= ns_read_reg(uart, UART_DLM) << 8; uart->baud = uart->clock_hz / (divisor << 4); } ns_write_reg(uart, UART_LCR, lcr); /* No flow ctrl: DTR and RTS are both wedged high to keep remote happy. */ ns_write_reg(uart, UART_MCR, UART_MCR_DTR | UART_MCR_RTS); /* Enable and clear the FIFOs. Set a large trigger threshold. */ ns_write_reg(uart, UART_FCR, UART_FCR_ENABLE | UART_FCR_CLRX | UART_FCR_CLTX | UART_FCR_TRG14); } static void __init ns16550_init_preirq(struct serial_port *port) { struct ns16550 *uart = port->uart; #ifdef HAS_IOPORTS /* I/O ports are distinguished by their size (16 bits). */ if ( uart->io_base >= 0x10000 ) #endif { #ifdef CONFIG_X86 enum fixed_addresses idx = FIX_COM_BEGIN + (uart - ns16550_com); set_fixmap_nocache(idx, uart->io_base); uart->remapped_io_base = (void __iomem *)fix_to_virt(idx); uart->remapped_io_base += uart->io_base & ~PAGE_MASK; #else uart->remapped_io_base = (char *)ioremap(uart->io_base, uart->io_size); #endif } ns16550_setup_preirq(uart); /* Check this really is a 16550+. Otherwise we have no FIFOs. */ if ( ((ns_read_reg(uart, UART_IIR) & 0xc0) == 0xc0) && ((ns_read_reg(uart, UART_FCR) & UART_FCR_TRG14) == UART_FCR_TRG14) ) uart->fifo_size = 16; } static void ns16550_setup_postirq(struct ns16550 *uart) { if ( uart->irq > 0 ) { /* Master interrupt enable; also keep DTR/RTS asserted. */ ns_write_reg(uart, UART_MCR, UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS); /* Enable receive and transmit interrupts. 
*/ ns_write_reg(uart, UART_IER, UART_IER_ERDAI | UART_IER_ETHREI); } if ( uart->irq >= 0 ) set_timer(&uart->timer, NOW() + MILLISECS(uart->timeout_ms)); } static void __init ns16550_init_postirq(struct serial_port *port) { struct ns16550 *uart = port->uart; int rc, bits; if ( uart->irq < 0 ) return; serial_async_transmit(port); init_timer(&uart->timer, ns16550_poll, port, 0); init_timer(&uart->resume_timer, ns16550_delayed_resume, port, 0); /* Calculate time to fill RX FIFO and/or empty TX FIFO for polling. */ bits = uart->data_bits + uart->stop_bits + !!uart->parity; uart->timeout_ms = max_t( unsigned int, 1, (bits * uart->fifo_size * 1000) / uart->baud); if ( uart->irq > 0 ) { uart->irqaction.handler = ns16550_interrupt; uart->irqaction.name = "ns16550"; uart->irqaction.dev_id = port; #ifdef HAS_DEVICE_TREE if ( (rc = setup_dt_irq(&uart->dt_irq, &uart->irqaction)) != 0 ) printk("ERROR: Failed to allocate ns16550 DT IRQ.\n"); #else if ( (rc = setup_irq(uart->irq, &uart->irqaction)) != 0 ) printk("ERROR: Failed to allocate ns16550 IRQ %d\n", uart->irq); #endif } ns16550_setup_postirq(uart); #ifdef HAS_PCI if ( uart->bar || uart->ps_bdf_enable ) pci_hide_device(uart->ps_bdf[0], PCI_DEVFN(uart->ps_bdf[1], uart->ps_bdf[2])); #endif } static void ns16550_suspend(struct serial_port *port) { struct ns16550 *uart = port->uart; stop_timer(&uart->timer); #ifdef HAS_PCI if ( uart->bar ) uart->cr = pci_conf_read16(0, uart->ps_bdf[0], uart->ps_bdf[1], uart->ps_bdf[2], PCI_COMMAND); #endif } static void _ns16550_resume(struct serial_port *port) { #ifdef HAS_PCI struct ns16550 *uart = port->uart; if ( uart->bar ) { pci_conf_write32(0, uart->ps_bdf[0], uart->ps_bdf[1], uart->ps_bdf[2], PCI_BASE_ADDRESS_0 + uart->bar_idx*4, uart->bar); pci_conf_write16(0, uart->ps_bdf[0], uart->ps_bdf[1], uart->ps_bdf[2], PCI_COMMAND, uart->cr); } #endif ns16550_setup_preirq(port->uart); ns16550_setup_postirq(port->uart); } static int delayed_resume_tries; static void ns16550_delayed_resume(void *data) { struct serial_port *port = data; struct ns16550 *uart = port->uart; if ( ns16550_ioport_invalid(port->uart) && delayed_resume_tries-- ) set_timer(&uart->resume_timer, NOW() + RESUME_DELAY); else _ns16550_resume(port); } static void ns16550_resume(struct serial_port *port) { struct ns16550 *uart = port->uart; /* * Check for ioport access, before fully resuming operation. * On some systems, there is a SuperIO card that provides * this legacy ioport on the LPC bus. * * We need to wait for dom0's ACPI processing to run the proper * AML to re-initialize the chip, before we can use the card again. * * This may cause a small amount of garbage to be written * to the serial log while we wait patiently for that AML to * be executed. However, this is preferable to spinning in an * infinite loop, as seen on a Lenovo T430, when serial was enabled. */ if ( ns16550_ioport_invalid(uart) ) { delayed_resume_tries = RESUME_RETRIES; set_timer(&uart->resume_timer, NOW() + RESUME_DELAY); } else _ns16550_resume(port); } static void __init ns16550_endboot(struct serial_port *port) { #ifdef HAS_IOPORTS struct ns16550 *uart = port->uart; if ( uart->remapped_io_base ) return; if ( ioports_deny_access(dom0, uart->io_base, uart->io_base + 7) != 0 ) BUG(); #endif } static int __init ns16550_irq(struct serial_port *port) { struct ns16550 *uart = port->uart; return ((uart->irq > 0) ? 
uart->irq : -1); } #ifdef HAS_DEVICE_TREE static const struct dt_irq __init *ns16550_dt_irq(struct serial_port *port) { struct ns16550 *uart = port->uart; return &uart->dt_irq; } #endif #ifdef CONFIG_ARM static const struct vuart_info *ns16550_vuart_info(struct serial_port *port) { struct ns16550 *uart = port->uart; return &uart->vuart; } #endif static struct uart_driver __read_mostly ns16550_driver = { .init_preirq = ns16550_init_preirq, .init_postirq = ns16550_init_postirq, .endboot = ns16550_endboot, .suspend = ns16550_suspend, .resume = ns16550_resume, .tx_ready = ns16550_tx_ready, .putc = ns16550_putc, .getc = ns16550_getc, .irq = ns16550_irq, #ifdef HAS_DEVICE_TREE .dt_irq_get = ns16550_dt_irq, #endif #ifdef CONFIG_ARM .vuart_info = ns16550_vuart_info, #endif }; static int __init parse_parity_char(int c) { switch ( c ) { case 'n': return UART_PARITY_NONE; case 'o': return UART_PARITY_ODD; case 'e': return UART_PARITY_EVEN; case 'm': return UART_PARITY_MARK; case 's': return UART_PARITY_SPACE; } return 0; } static int __init check_existence(struct ns16550 *uart) { unsigned char status, scratch, scratch2, scratch3; #ifdef HAS_IOPORTS /* * We can't poke MMIO UARTs until they get I/O remapped later. Assume that * if we're getting MMIO UARTs, the arch code knows what it's doing. */ if ( uart->io_base >= 0x10000 ) return 1; #else return 1; /* Everything is MMIO */ #endif #ifdef HAS_PCI pci_serial_early_init(uart); #endif /* * Do a simple existence test first; if we fail this, * there's no point trying anything else. */ scratch = ns_read_reg(uart, UART_IER); ns_write_reg(uart, UART_IER, 0); /* * Mask out IER[7:4] bits for test as some UARTs (e.g. TL * 16C754B) allow only to modify them if an EFR bit is set. */ scratch2 = ns_read_reg(uart, UART_IER) & 0x0f; ns_write_reg(uart,UART_IER, 0x0F); scratch3 = ns_read_reg(uart, UART_IER) & 0x0f; ns_write_reg(uart, UART_IER, scratch); if ( (scratch2 != 0) || (scratch3 != 0x0F) ) return 0; /* * Check to see if a UART is really there. * Use loopback test mode. */ ns_write_reg(uart, UART_MCR, UART_MCR_LOOP | 0x0A); status = ns_read_reg(uart, UART_MSR) & 0xF0; return (status == 0x90); } #ifdef HAS_PCI static int pci_uart_config (struct ns16550 *uart, int skip_amt, int bar_idx) { uint32_t bar, len; int b, d, f, nextf; /* NB. Start at bus 1 to avoid AMT: a plug-in card cannot be on bus 0. */ for ( b = skip_amt ? 1 : 0; b < 0x100; b++ ) { for ( d = 0; d < 0x20; d++ ) { for ( f = 0; f < 8; f = nextf ) { nextf = (f || (pci_conf_read16(0, b, d, f, PCI_HEADER_TYPE) & 0x80)) ? f + 1 : 8; switch ( pci_conf_read16(0, b, d, f, PCI_CLASS_DEVICE) ) { case 0x0700: /* single port serial */ case 0x0702: /* multi port serial */ case 0x0780: /* other (e.g serial+parallel) */ break; case 0xffff: if ( !f ) nextf = 8; /* fall through */ default: continue; } bar = pci_conf_read32(0, b, d, f, PCI_BASE_ADDRESS_0 + bar_idx*4); /* Not IO */ if ( !(bar & PCI_BASE_ADDRESS_SPACE_IO) ) continue; pci_conf_write32(0, b, d, f, PCI_BASE_ADDRESS_0, ~0u); len = pci_conf_read32(0, b, d, f, PCI_BASE_ADDRESS_0); pci_conf_write32(0, b, d, f, PCI_BASE_ADDRESS_0 + bar_idx*4, bar); /* Not 8 bytes */ if ( (len & 0xffff) != 0xfff9 ) continue; uart->ps_bdf[0] = b; uart->ps_bdf[1] = d; uart->ps_bdf[2] = f; uart->bar = bar; uart->bar_idx = bar_idx; uart->io_base = bar & ~PCI_BASE_ADDRESS_SPACE_IO; uart->irq = pci_conf_read8(0, b, d, f, PCI_INTERRUPT_PIN) ? 
pci_conf_read8(0, b, d, f, PCI_INTERRUPT_LINE) : 0; return 0; } } } if ( !skip_amt ) return -1; uart->io_base = 0x3f8; uart->irq = 0; uart->clock_hz = UART_CLOCK_HZ; return 0; } #endif #define PARSE_ERR(_f, _a...) \ do { \ printk( "ERROR: " _f "\n" , ## _a ); \ return; \ } while ( 0 ) static void __init ns16550_parse_port_config( struct ns16550 *uart, const char *conf) { int baud; /* No user-specified configuration? */ if ( (conf == NULL) || (*conf == '\0') ) { /* Some platforms may automatically probe the UART configuartion. */ if ( uart->baud != 0 ) goto config_parsed; return; } if ( strncmp(conf, "auto", 4) == 0 ) { uart->baud = BAUD_AUTO; conf += 4; } else if ( (baud = simple_strtoul(conf, &conf, 10)) != 0 ) uart->baud = baud; if ( *conf == '/' ) { conf++; uart->clock_hz = simple_strtoul(conf, &conf, 0) << 4; } if ( *conf == ',' && *++conf != ',' ) { uart->data_bits = simple_strtoul(conf, &conf, 10); uart->parity = parse_parity_char(*conf); uart->stop_bits = simple_strtoul(conf + 1, &conf, 10); } if ( *conf == ',' && *++conf != ',' ) { #ifdef HAS_PCI if ( strncmp(conf, "pci", 3) == 0 ) { if ( pci_uart_config(uart, 1/* skip AMT */, uart - ns16550_com) ) return; conf += 3; } else if ( strncmp(conf, "amt", 3) == 0 ) { if ( pci_uart_config(uart, 0, uart - ns16550_com) ) return; conf += 3; } else #endif { uart->io_base = simple_strtoul(conf, &conf, 0); } } if ( *conf == ',' && *++conf != ',' ) uart->irq = simple_strtol(conf, &conf, 10); #ifdef HAS_PCI if ( *conf == ',' && *++conf != ',' ) { conf = parse_pci(conf, NULL, &uart->ps_bdf[0], &uart->ps_bdf[1], &uart->ps_bdf[2]); if ( !conf ) PARSE_ERR("Bad port PCI coordinates"); uart->ps_bdf_enable = 1; } if ( *conf == ',' && *++conf != ',' ) { if ( !parse_pci(conf, NULL, &uart->pb_bdf[0], &uart->pb_bdf[1], &uart->pb_bdf[2]) ) PARSE_ERR("Bad bridge PCI coordinates"); uart->pb_bdf_enable = 1; } #endif config_parsed: /* Sanity checks. */ if ( (uart->baud != BAUD_AUTO) && ((uart->baud < 1200) || (uart->baud > 115200)) ) PARSE_ERR("Baud rate %d outside supported range.", uart->baud); if ( (uart->data_bits < 5) || (uart->data_bits > 8) ) PARSE_ERR("%d data bits are unsupported.", uart->data_bits); if ( (uart->stop_bits < 1) || (uart->stop_bits > 2) ) PARSE_ERR("%d stop bits are unsupported.", uart->stop_bits); if ( uart->io_base == 0 ) PARSE_ERR("I/O base address must be specified."); if ( !check_existence(uart) ) PARSE_ERR("16550-compatible serial UART not present"); /* Register with generic serial driver. */ serial_register_uart(uart - ns16550_com, &ns16550_driver, uart); } void __init ns16550_init(int index, struct ns16550_defaults *defaults) { struct ns16550 *uart; if ( (index < 0) || (index > 1) ) return; uart = &ns16550_com[index]; uart->baud = (defaults->baud ? : console_has((index == 0) ? "com1" : "com2") ? BAUD_AUTO : 0); uart->clock_hz = UART_CLOCK_HZ; uart->data_bits = defaults->data_bits; uart->parity = parse_parity_char(defaults->parity); uart->stop_bits = defaults->stop_bits; uart->irq = defaults->irq; uart->io_base = defaults->io_base; uart->io_size = 8; uart->reg_width = 1; uart->reg_shift = 0; /* Default is no transmit FIFO. */ uart->fifo_size = 1; ns16550_parse_port_config(uart, (index == 0) ? 
opt_com1 : opt_com2); } #ifdef HAS_DEVICE_TREE static int __init ns16550_uart_dt_init(struct dt_device_node *dev, const void *data) { struct ns16550 *uart; int res; u32 reg_shift, reg_width; u64 io_size; uart = &ns16550_com[0]; uart->baud = BAUD_AUTO; uart->clock_hz = UART_CLOCK_HZ; uart->data_bits = 8; uart->parity = UART_PARITY_NONE; uart->stop_bits = 1; /* Default is no transmit FIFO. */ uart->fifo_size = 1; res = dt_device_get_address(dev, 0, &uart->io_base, &io_size); if ( res ) return res; uart->io_size = io_size; ASSERT(uart->io_size == io_size); /* Detect truncation */ res = dt_property_read_u32(dev, "reg-shift", ®_shift); if ( !res ) uart->reg_shift = 0; else uart->reg_shift = reg_shift; res = dt_property_read_u32(dev, "reg-io-width", ®_width); if ( !res ) uart->reg_width = 1; else uart->reg_width = reg_width; if ( uart->reg_width != 1 && uart->reg_width != 4 ) return -EINVAL; res = dt_device_get_irq(dev, 0, &uart->dt_irq); if ( res ) return res; /* The common bit of the driver mostly deals with irq not dt_irq. */ uart->irq = uart->dt_irq.irq; uart->dw_usr_bsy = dt_device_is_compatible(dev, "snps,dw-apb-uart"); uart->vuart.base_addr = uart->io_base; uart->vuart.size = uart->io_size; uart->vuart.data_off = UART_THR <reg_shift; uart->vuart.status_off = UART_LSR<reg_shift; uart->vuart.status = UART_LSR_THRE|UART_LSR_TEMT; /* Register with generic serial driver. */ serial_register_uart(uart - ns16550_com, &ns16550_driver, uart); dt_device_set_used_by(dev, DOMID_XEN); return 0; } static const char * const ns16550_dt_compat[] __initconst = { "ns16550", "ns16550a", "snps,dw-apb-uart", NULL }; DT_DEVICE_START(ns16550, "NS16550 UART", DEVICE_SERIAL) .compatible = ns16550_dt_compat, .init = ns16550_uart_dt_init, DT_DEVICE_END #endif /* HAS_DEVICE_TREE */ /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/xen/drivers/char/ehci-dbgp.c0000664000175000017500000013443612307313555015631 0ustar smbsmb/* * Standalone EHCI USB debug driver * * Hardware interface code based on the respective early console driver in * Linux; see the Linux source for authorship and copyrights. */ #include #include #include #include #include #include #include #include #include #include /* #define DBGP_DEBUG */ /* EHCI register interface, corresponds to EHCI Revision 0.95 specification */ /* Section 2.2 Host Controller Capability Registers */ struct ehci_caps { /* * These fields are specified as 8 and 16 bit registers, * but some hosts can't perform 8 or 16 bit PCI accesses. * some hosts treat caplength and hciversion as parts of a 32-bit * register, others treat them as two separate registers, this * affects the memory map for big endian controllers. */ u32 hc_capbase; #define HC_LENGTH(p) (0x00ff & (p)) /* bits 7:0 / offset 0x00 */ #define HC_VERSION(p) (0xffff & ((p) >> 16)) /* bits 31:16 / offset 0x02 */ u32 hcs_params; /* HCSPARAMS - offset 0x04 */ #define HCS_DEBUG_PORT(p) (((p) >> 20) & 0xf) /* bits 23:20, debug port? 
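* Per the EHCI spec this field holds the one-based number of the root port
* wired to the debug capability; a value of zero means the controller has
* no debug port.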
*/ #define HCS_INDICATOR(p) ((p) & (1 << 16)) /* true: has port indicators */ #define HCS_N_CC(p) (((p) >> 12) & 0xf) /* bits 15:12, #companion HCs */ #define HCS_N_PCC(p) (((p) >> 8) & 0xf) /* bits 11:8, ports per CC */ #define HCS_PORTROUTED(p) ((p) & (1 << 7)) /* true: port routing */ #define HCS_PPC(p) ((p) & (1 << 4)) /* true: port power control */ #define HCS_N_PORTS(p) (((p) >> 0) & 0xf) /* bits 3:0, ports on HC */ u32 hcc_params; /* HCCPARAMS - offset 0x08 */ /* EHCI 1.1 addendum */ #define HCC_32FRAME_PERIODIC_LIST(p) ((p) & (1 << 19)) #define HCC_PER_PORT_CHANGE_EVENT(p) ((p) & (1 << 18)) #define HCC_LPM(p) ((p) & (1 << 17)) #define HCC_HW_PREFETCH(p) ((p) & (1 << 16)) #define HCC_EXT_CAPS(p) (((p) >> 8) & 0xff) /* for pci extended caps */ #define HCC_ISOC_CACHE(p) ((p) & (1 << 7)) /* true: can cache isoc frame */ #define HCC_ISOC_THRES(p) (((p) >> 4) & 0x7) /* bits 6:4, uframes cached */ #define HCC_CANPARK(p) ((p) & (1 << 2)) /* true: can park on async qh */ #define HCC_PGM_FRAMELISTLEN(p) ((p) & (1 << 1)) /* true: periodic_size changes */ #define HCC_64BIT_ADDR(p) ((p) & 1) /* true: can use 64-bit addr */ u8 portroute[8]; /* nibbles for routing - offset 0x0C */ }; /* Section 2.3 Host Controller Operational Registers */ struct ehci_regs { /* USBCMD: offset 0x00 */ u32 command; /* EHCI 1.1 addendum */ #define CMD_HIRD (0xf << 24) /* host initiated resume duration */ #define CMD_PPCEE (1 << 15) /* per port change event enable */ #define CMD_FSP (1 << 14) /* fully synchronized prefetch */ #define CMD_ASPE (1 << 13) /* async schedule prefetch enable */ #define CMD_PSPE (1 << 12) /* periodic schedule prefetch enable */ /* 23:16 is r/w intr rate, in microframes; default "8" == 1/msec */ #define CMD_PARK (1 << 11) /* enable "park" on async qh */ #define CMD_PARK_CNT(c) (((c) >> 8) & 3) /* how many transfers to park for */ #define CMD_LRESET (1 << 7) /* partial reset (no ports, etc) */ #define CMD_IAAD (1 << 6) /* "doorbell" interrupt async advance */ #define CMD_ASE (1 << 5) /* async schedule enable */ #define CMD_PSE (1 << 4) /* periodic schedule enable */ /* 3:2 is periodic frame list size */ #define CMD_RESET (1 << 1) /* reset HC not bus */ #define CMD_RUN (1 << 0) /* start/stop HC */ /* USBSTS: offset 0x04 */ u32 status; #define STS_PPCE_MASK (0xff << 16) /* Per-Port change event 1-16 */ #define STS_ASS (1 << 15) /* Async Schedule Status */ #define STS_PSS (1 << 14) /* Periodic Schedule Status */ #define STS_RECL (1 << 13) /* Reclamation */ #define STS_HALT (1 << 12) /* Not running (any reason) */ /* some bits reserved */ /* these STS_* flags are also intr_enable bits (USBINTR) */ #define STS_IAA (1 << 5) /* Interrupted on async advance */ #define STS_FATAL (1 << 4) /* such as some PCI access errors */ #define STS_FLR (1 << 3) /* frame list rolled over */ #define STS_PCD (1 << 2) /* port change detect */ #define STS_ERR (1 << 1) /* "error" completion (overflow, ...) */ #define STS_INT (1 << 0) /* "normal" completion (short, ...) 
*/ /* USBINTR: offset 0x08 */ u32 intr_enable; /* FRINDEX: offset 0x0C */ u32 frame_index; /* current microframe number */ /* CTRLDSSEGMENT: offset 0x10 */ u32 segment; /* address bits 63:32 if needed */ /* PERIODICLISTBASE: offset 0x14 */ u32 frame_list; /* points to periodic list */ /* ASYNCLISTADDR: offset 0x18 */ u32 async_next; /* address of next async queue head */ u32 reserved[9]; /* CONFIGFLAG: offset 0x40 */ u32 configured_flag; #define FLAG_CF (1 << 0) /* true: we'll support "high speed" */ /* PORTSC: offset 0x44 */ u32 port_status[0]; /* up to N_PORTS */ /* EHCI 1.1 addendum */ #define PORTSC_SUSPEND_STS_ACK 0 #define PORTSC_SUSPEND_STS_NYET 1 #define PORTSC_SUSPEND_STS_STALL 2 #define PORTSC_SUSPEND_STS_ERR 3 #define PORT_DEV_ADDR (0x7f << 25) /* device address */ #define PORT_SSTS (0x3 << 23) /* suspend status */ /* 31:23 reserved */ #define PORT_WKOC_E (1 << 22) /* wake on overcurrent (enable) */ #define PORT_WKDISC_E (1 << 21) /* wake on disconnect (enable) */ #define PORT_WKCONN_E (1 << 20) /* wake on connect (enable) */ /* 19:16 for port testing */ #define PORT_TEST(x) (((x) & 0xf) << 16) /* Port Test Control */ #define PORT_TEST_PKT PORT_TEST(0x4) /* Port Test Control - packet test */ #define PORT_TEST_FORCE PORT_TEST(0x5) /* Port Test Control - force enable */ #define PORT_LED_OFF (0 << 14) #define PORT_LED_AMBER (1 << 14) #define PORT_LED_GREEN (2 << 14) #define PORT_LED_MASK (3 << 14) #define PORT_OWNER (1 << 13) /* true: companion hc owns this port */ #define PORT_POWER (1 << 12) /* true: has power (see PPC) */ #define PORT_USB11(x) (((x) & (3 << 10)) == (1 << 10)) /* USB 1.1 device */ /* 11:10 for detecting lowspeed devices (reset vs release ownership) */ /* 9 reserved */ #define PORT_LPM (1 << 9) /* LPM transaction */ #define PORT_RESET (1 << 8) /* reset port */ #define PORT_SUSPEND (1 << 7) /* suspend port */ #define PORT_RESUME (1 << 6) /* resume it */ #define PORT_OCC (1 << 5) /* over current change */ #define PORT_OC (1 << 4) /* over current active */ #define PORT_PEC (1 << 3) /* port enable change */ #define PORT_PE (1 << 2) /* port enable */ #define PORT_CSC (1 << 1) /* connect status change */ #define PORT_CONNECT (1 << 0) /* device connected */ #define PORT_RWC_BITS (PORT_CSC | PORT_PEC | PORT_OCC) }; /* * Appendix C, Debug port ... intended for use with special "debug devices" * that can help if there's no serial console. (nonstandard enumeration.) */ struct ehci_dbg_port { u32 control; #define DBGP_OWNER (1 << 30) #define DBGP_ENABLED (1 << 28) #define DBGP_DONE (1 << 16) #define DBGP_INUSE (1 << 10) #define DBGP_ERRCODE(x) (((x) >> 7) & 0x07) # define DBGP_ERR_BAD 1 # define DBGP_ERR_SIGNAL 2 #define DBGP_ERROR (1 << 6) #define DBGP_GO (1 << 5) #define DBGP_OUT (1 << 4) #define DBGP_LEN (0xf << 0) #define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) u32 pids; #define DBGP_PID_GET(x) (((x) >> 16) & 0xff) #define DBGP_PID_SET(data, tok) (((data) << 8) | (tok)) u32 data03; u32 data47; u32 address; #define DBGP_EPADDR(dev, ep) (((dev) << 8) | (ep)) }; /* CONTROL REQUEST SUPPORT */ /* * USB directions * * This bit flag is used in endpoint descriptors' bEndpointAddress field. * It's also one of three fields in control requests bRequestType. 
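* A bmRequestType is formed by ORing one value from each of the three
* groups; e.g. the GET_DESCRIPTOR request sent via dbgp_control_msg()
* later in this file uses USB_DIR_IN | USB_TYPE_STANDARD |
* USB_RECIP_DEVICE, i.e. 0x80.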
*/ #define USB_DIR_OUT 0 /* to device */ #define USB_DIR_IN 0x80 /* to host */ /* * USB types, the second of three bRequestType fields */ #define USB_TYPE_MASK (0x03 << 5) #define USB_TYPE_STANDARD (0x00 << 5) #define USB_TYPE_CLASS (0x01 << 5) #define USB_TYPE_VENDOR (0x02 << 5) #define USB_TYPE_RESERVED (0x03 << 5) /* * USB recipients, the third of three bRequestType fields */ #define USB_RECIP_MASK 0x1f #define USB_RECIP_DEVICE 0x00 #define USB_RECIP_INTERFACE 0x01 #define USB_RECIP_ENDPOINT 0x02 #define USB_RECIP_OTHER 0x03 /* From Wireless USB 1.0 */ #define USB_RECIP_PORT 0x04 #define USB_RECIP_RPIPE 0x05 /* * Standard requests, for the bRequest field of a SETUP packet. * * These are qualified by the bRequestType field, so that for example * TYPE_CLASS or TYPE_VENDOR specific feature flags could be retrieved * by a GET_STATUS request. */ #define USB_REQ_GET_STATUS 0x00 #define USB_REQ_CLEAR_FEATURE 0x01 #define USB_REQ_SET_FEATURE 0x03 #define USB_REQ_SET_ADDRESS 0x05 #define USB_REQ_GET_DESCRIPTOR 0x06 #define USB_REQ_SET_DESCRIPTOR 0x07 #define USB_REQ_GET_CONFIGURATION 0x08 #define USB_REQ_SET_CONFIGURATION 0x09 #define USB_REQ_GET_INTERFACE 0x0A #define USB_REQ_SET_INTERFACE 0x0B #define USB_REQ_SYNCH_FRAME 0x0C #define USB_DEVICE_DEBUG_MODE 6 /* (special devices only) */ /** * struct usb_ctrlrequest - SETUP data for a USB device control request * @bRequestType: matches the USB bmRequestType field * @bRequest: matches the USB bRequest field * @wValue: matches the USB wValue field (le16 byte order) * @wIndex: matches the USB wIndex field (le16 byte order) * @wLength: matches the USB wLength field (le16 byte order) * * This structure is used to send control requests to a USB device. It matches * the different fields of the USB 2.0 Spec section 9.3, table 9-2. See the * USB spec for a fuller description of the different fields, and what they are * used for. * * Note that the driver for any interface can issue control requests. * For most devices, interfaces don't coordinate with each other, so * such requests may be made at any time. 
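* For GET_DESCRIPTOR the descriptor type goes in the high byte of wValue
* and the descriptor index in the low byte, which is why the code below
* passes USB_DT_DEBUG << 8 when fetching the debug descriptor.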
*/ struct usb_ctrlrequest { u8 bRequestType; u8 bRequest; __le16 wValue; __le16 wIndex; __le16 wLength; } __attribute__ ((packed)); /* USB_DT_DEBUG: for special highspeed devices, replacing serial console */ #define USB_DT_DEBUG 0x0a struct usb_debug_descriptor { u8 bLength; u8 bDescriptorType; /* bulk endpoints with 8 byte maxpacket */ u8 bDebugInEndpoint; u8 bDebugOutEndpoint; } __attribute__((packed)); #define USB_DEBUG_DEVNUM 127 /* * USB Packet IDs (PIDs) */ /* token */ #define USB_PID_OUT 0xe1 #define USB_PID_IN 0x69 #define USB_PID_SOF 0xa5 #define USB_PID_SETUP 0x2d /* handshake */ #define USB_PID_ACK 0xd2 #define USB_PID_NAK 0x5a #define USB_PID_STALL 0x1e #define USB_PID_NYET 0x96 /* data */ #define USB_PID_DATA0 0xc3 #define USB_PID_DATA1 0x4b #define USB_PID_DATA2 0x87 #define USB_PID_MDATA 0x0f /* Special */ #define USB_PID_PREAMBLE 0x3c #define USB_PID_ERR 0x3c #define USB_PID_SPLIT 0x78 #define USB_PID_PING 0xb4 #define USB_PID_UNDEF_0 0xf0 #define PCI_CLASS_SERIAL_USB_EHCI 0x0c0320 #define PCI_CAP_ID_EHCI_DEBUG 0x0a #define HUB_ROOT_RESET_TIME 50 /* times are in msec */ #define HUB_SHORT_RESET_TIME 10 #define HUB_LONG_RESET_TIME 200 #define HUB_RESET_TIMEOUT 500 #define DBGP_MAX_PACKET 8 #define DBGP_LOOPS 1000 #define DBGP_TIMEOUT (250 * 1000) /* us */ #define DBGP_CHECK_INTERVAL 100 /* us */ /* This one can be set arbitrarily - only affects input responsiveness: */ #define DBGP_IDLE_INTERVAL 100 /* ms */ struct ehci_dbgp { struct ehci_dbg_port __iomem *ehci_debug; enum dbgp_state { dbgp_idle, dbgp_out, dbgp_in, dbgp_ctrl, dbgp_unsafe /* cannot use debug device during EHCI reset */ } state; unsigned int phys_port; struct { unsigned int endpoint; unsigned int chunk; char buf[DBGP_MAX_PACKET]; } out, in; unsigned long timeout; struct timer timer; spinlock_t *lock; bool_t reset_run; u8 bus, slot, func, bar; u16 pci_cr; u32 bar_val; unsigned int cap; struct ehci_regs __iomem *ehci_regs; struct ehci_caps __iomem *ehci_caps; }; static int ehci_dbgp_external_startup(struct ehci_dbgp *); static void ehci_dbgp_status(struct ehci_dbgp *dbgp, const char *str) { #ifdef DBGP_DEBUG #define dbgp_printk printk if ( !dbgp->ehci_debug ) return; dbgp_printk("dbgp: %s\n", str); dbgp_printk(" debug control: %08x\n", readl(&dbgp->ehci_debug->control)); dbgp_printk(" EHCI cmd : %08x\n", readl(&dbgp->ehci_regs->command)); dbgp_printk(" EHCI conf flg: %08x\n", readl(&dbgp->ehci_regs->configured_flag)); dbgp_printk(" EHCI status : %08x\n", readl(&dbgp->ehci_regs->status)); dbgp_printk(" EHCI portsc : %08x\n", readl(&dbgp->ehci_regs->port_status[dbgp->phys_port - 1])); #endif } #ifndef DBGP_DEBUG static inline __attribute__ ((format (printf, 1, 2))) void dbgp_printk(const char *fmt, ...) 
{ } #endif static inline u32 dbgp_len_update(u32 x, u32 len) { return (x & ~DBGP_LEN) | (len & DBGP_LEN) | DBGP_OUT; } static inline u32 dbgp_pid_write_update(u32 x, u32 tok) { static u8 data0 = USB_PID_DATA1; data0 ^= USB_PID_DATA0 ^ USB_PID_DATA1; return (x & 0xffff0000) | (data0 << 8) | (tok & 0xff); } static inline u32 dbgp_pid_read_update(u32 x, u32 tok) { return (x & 0xffffff00) | (tok & 0xff); } static inline void dbgp_set_data(struct ehci_dbg_port __iomem *ehci_debug, const void *buf, unsigned int size) { const unsigned char *bytes = buf; u32 lo = 0, hi = 0; unsigned int i; for ( i = 0; i < 4 && i < size; i++ ) lo |= bytes[i] << (8 * i); for ( ; i < 8 && i < size; i++ ) hi |= bytes[i] << (8 * (i - 4)); writel(lo, &ehci_debug->data03); writel(hi, &ehci_debug->data47); } static inline void dbgp_get_data(struct ehci_dbg_port __iomem *ehci_debug, void *buf, int size) { unsigned char *bytes = buf; u32 lo = readl(&ehci_debug->data03); u32 hi = readl(&ehci_debug->data47); unsigned int i; for ( i = 0; i < 4 && i < size; i++ ) bytes[i] = (lo >> (8 * i)) & 0xff; for ( ; i < 8 && i < size; i++ ) bytes[i] = (hi >> (8 * (i - 4))) & 0xff; } static void dbgp_issue_command(struct ehci_dbgp *dbgp, u32 ctrl, enum dbgp_state state) { u32 cmd = readl(&dbgp->ehci_regs->command); if ( unlikely(!(cmd & CMD_RUN)) ) { /* * If the EHCI controller is not in the run state do extended * checks to see if ACPI or some other initialization also * reset the EHCI debug port. */ u32 ctrl = readl(&dbgp->ehci_debug->control); if ( ctrl & DBGP_ENABLED ) { cmd |= CMD_RUN; writel(cmd, &dbgp->ehci_regs->command); dbgp->reset_run = 1; } else if ( dbgp->state != dbgp_unsafe ) { dbgp->state = dbgp_unsafe; ehci_dbgp_external_startup(dbgp); } } writel(ctrl | DBGP_GO, &dbgp->ehci_debug->control); dbgp->timeout = DBGP_TIMEOUT; if ( dbgp->state != dbgp_unsafe ) dbgp->state = state; } static int dbgp_check_for_completion(struct ehci_dbgp *dbgp, unsigned int interval, u8 *ppid) { u32 ctrl; int ret; if ( dbgp->state == dbgp_idle ) return 0; ctrl = readl(&dbgp->ehci_debug->control) & ~DBGP_GO; if ( !(ctrl & DBGP_DONE) ) { if ( dbgp->timeout > interval ) dbgp->timeout -= interval; else if ( interval ) { /* See the timeout related comment in dbgp_wait_until_done(). 
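* When called with a zero interval (as dbgp_wait_until_complete() does
* while busy-waiting) the timeout bookkeeping is left to that caller;
* presumably only the periodic polling path passes a non-zero interval
* and thus marks the port unsafe here.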
*/ dbgp->state = dbgp_unsafe; dbgp->timeout = 0; } return -DBGP_TIMEOUT; } if ( ctrl & DBGP_ERROR ) { ret = -DBGP_ERRCODE(ctrl); if ( ret == -DBGP_ERR_BAD && dbgp->timeout > interval ) ctrl |= DBGP_GO; } else { u8 pid = DBGP_PID_GET(readl(&dbgp->ehci_debug->pids)); ret = ctrl & DBGP_LEN; if ( ppid ) *ppid = pid; else if ( dbgp->state == dbgp_in ) { dbgp_get_data(dbgp->ehci_debug, dbgp->in.buf, ret); dbgp->in.chunk = ret; } else if ( pid == USB_PID_NAK && dbgp->timeout > interval ) ctrl |= DBGP_GO; } writel(ctrl, &dbgp->ehci_debug->control); if ( ctrl & DBGP_GO ) { dbgp->timeout -= interval; return -DBGP_TIMEOUT; } if ( unlikely(dbgp->reset_run) ) { writel(readl(&dbgp->ehci_regs->command) & ~CMD_RUN, &dbgp->ehci_regs->command); dbgp->reset_run = 0; } if ( dbgp->state != dbgp_unsafe ) dbgp->state = dbgp_idle; return ret; } static int dbgp_wait_until_complete(struct ehci_dbgp *dbgp, u8 *ppid) { unsigned int loop = DBGP_TIMEOUT; int ret; do { ret = dbgp_check_for_completion(dbgp, 0, ppid); if ( ret != -DBGP_TIMEOUT ) break; udelay(1); } while ( --loop ); if ( !ppid && !loop ) dbgp->state = dbgp_unsafe; return ret; } static inline void dbgp_mdelay(unsigned int ms) { while ( ms-- ) { unsigned int i; for ( i = 0; i < 1000; i++ ) outb(0x1, 0x80); } } static void dbgp_breathe(void) { /* Sleep to give the debug port a chance to breathe. */ dbgp_mdelay(1); } static int dbgp_wait_until_done(struct ehci_dbgp *dbgp, u32 ctrl, unsigned int loop) { int ret; dbgp->timeout = 0; for ( ; ; writel(ctrl | DBGP_GO, &dbgp->ehci_debug->control) ) { u8 pid; ret = dbgp_wait_until_complete(dbgp, &pid); if ( ret < 0 ) { /* * A -DBGP_TIMEOUT failure here means the device has failed, * perhaps because it was unplugged, in which case we do not * want to hang the system so the dbgp will be marked as unsafe * to use. EHCI reset is the only way to recover if you unplug * the dbgp device. */ if ( ret == -DBGP_TIMEOUT ) dbgp->state = dbgp_unsafe; if ( ret != -DBGP_ERR_BAD || !--loop ) break; } else { /* * If the port is getting full or it has dropped data * start pacing ourselves, not necessary but it's friendly. */ if ( pid == USB_PID_NAK || pid == USB_PID_NYET ) dbgp_breathe(); /* If we got a NACK, reissue the transmission. 
*/ if ( pid != USB_PID_NAK || !--loop ) break; } } return ret; } static int dbgp_bulk_write(struct ehci_dbgp *dbgp, unsigned int devnum, unsigned int endpoint, const void *bytes, unsigned int size, u32 *pctrl) { u32 addr, pids, ctrl; if ( size > DBGP_MAX_PACKET ) return -EINVAL; addr = DBGP_EPADDR(devnum, endpoint); pids = dbgp_pid_write_update(readl(&dbgp->ehci_debug->pids), USB_PID_OUT); ctrl = dbgp_len_update(readl(&dbgp->ehci_debug->control), size); if ( pctrl ) *pctrl = ctrl; dbgp_set_data(dbgp->ehci_debug, bytes, size); writel(addr, &dbgp->ehci_debug->address); writel(pids, &dbgp->ehci_debug->pids); dbgp_issue_command(dbgp, ctrl, dbgp_out); return 0; } static int dbgp_bulk_read(struct ehci_dbgp *dbgp, unsigned int devnum, unsigned int endpoint, unsigned int size, u32 *pctrl) { u32 addr, pids, ctrl; if ( size > DBGP_MAX_PACKET ) return -EINVAL; addr = DBGP_EPADDR(devnum, endpoint); pids = dbgp_pid_read_update(readl(&dbgp->ehci_debug->pids), USB_PID_IN); ctrl = readl(&dbgp->ehci_debug->control) & ~DBGP_OUT; writel(addr, &dbgp->ehci_debug->address); writel(pids, &dbgp->ehci_debug->pids); if ( likely(!pctrl) ) dbgp_issue_command(dbgp, ctrl, dbgp_in); else dbgp_issue_command(dbgp, *pctrl = ctrl, dbgp_ctrl); return 0; } static int dbgp_control_msg(struct ehci_dbgp *dbgp, unsigned int devnum, int requesttype, int request, int value, int index, void *data, unsigned int size) { u32 addr, pids, ctrl; struct usb_ctrlrequest req; bool_t read = (requesttype & USB_DIR_IN) != 0; int ret; if ( size > (read ? DBGP_MAX_PACKET : 0) ) return -EINVAL; /* Compute the control message */ req.bRequestType = requesttype; req.bRequest = request; req.wValue = cpu_to_le16(value); req.wIndex = cpu_to_le16(index); req.wLength = cpu_to_le16(size); pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); addr = DBGP_EPADDR(devnum, 0); ctrl = dbgp_len_update(readl(&dbgp->ehci_debug->control), sizeof(req)); /* Send the setup message */ dbgp_set_data(dbgp->ehci_debug, &req, sizeof(req)); writel(addr, &dbgp->ehci_debug->address); writel(pids, &dbgp->ehci_debug->pids); dbgp_issue_command(dbgp, ctrl, dbgp_ctrl); ret = dbgp_wait_until_done(dbgp, ctrl, DBGP_LOOPS); if ( ret < 0 ) return ret; /* Read the result */ ret = dbgp_bulk_read(dbgp, devnum, 0, size, &ctrl); if ( !ret ) ret = dbgp_wait_until_done(dbgp, ctrl, DBGP_LOOPS); if ( ret > 0 ) { if ( size > ret ) size = ret; dbgp_get_data(dbgp->ehci_debug, data, size); } return ret; } static unsigned int __init __find_dbgp(u8 bus, u8 slot, u8 func) { u32 class = pci_conf_read32(0, bus, slot, func, PCI_CLASS_REVISION); if ( (class >> 8) != PCI_CLASS_SERIAL_USB_EHCI ) return 0; return pci_find_cap_offset(0, bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); } static unsigned int __init find_dbgp(struct ehci_dbgp *dbgp, unsigned int ehci_num) { unsigned int bus, slot, func; for ( bus = 0; bus < 256; bus++ ) { for ( slot = 0; slot < 32; slot++ ) { for ( func = 0; func < 8; func++ ) { unsigned int cap; if ( !pci_device_detect(0, bus, slot, func) ) { if ( !func ) break; continue; } cap = __find_dbgp(bus, slot, func); if ( !cap || ehci_num-- ) { if ( !func && !(pci_conf_read8(0, bus, slot, func, PCI_HEADER_TYPE) & 0x80) ) break; continue; } dbgp->bus = bus; dbgp->slot = slot; dbgp->func = func; return cap; } } } return 0; } static int ehci_dbgp_startup(struct ehci_dbgp *dbgp) { u32 ctrl, cmd, status; unsigned int loop; /* Claim ownership, but do not enable yet */ ctrl = readl(&dbgp->ehci_debug->control); ctrl |= DBGP_OWNER; ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); writel(ctrl, 
&dbgp->ehci_debug->control); udelay(1); ehci_dbgp_status(dbgp, "EHCI startup"); /* Start the EHCI. */ cmd = readl(&dbgp->ehci_regs->command); cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); cmd |= CMD_RUN; writel(cmd, &dbgp->ehci_regs->command); /* Ensure everything is routed to the EHCI */ writel(FLAG_CF, &dbgp->ehci_regs->configured_flag); /* Wait until the controller is no longer halted. */ loop = 1000; do { status = readl(&dbgp->ehci_regs->status); if ( !(status & STS_HALT) ) break; udelay(1); } while ( --loop ); if ( !loop ) { dbgp_printk("EHCI cannot be started\n"); return -ENODEV; } dbgp_printk("EHCI started\n"); return 0; } static int ehci_dbgp_controller_reset(struct ehci_dbgp *dbgp) { unsigned int loop = 250 * 1000; u32 cmd; /* Reset the EHCI controller */ cmd = readl(&dbgp->ehci_regs->command); cmd |= CMD_RESET; writel(cmd, &dbgp->ehci_regs->command); do { cmd = readl(&dbgp->ehci_regs->command); } while ( (cmd & CMD_RESET) && --loop ); if ( !loop ) { dbgp_printk("cannot reset EHCI\n"); return -1; } ehci_dbgp_status(dbgp, "ehci reset done"); return 0; } static int ehci_reset_port(struct ehci_dbgp *dbgp, unsigned int port) { u32 portsc, delay_time, delay; ehci_dbgp_status(dbgp, "reset port"); /* Reset the USB debug port. */ portsc = readl(&dbgp->ehci_regs->port_status[port - 1]); portsc &= ~PORT_PE; portsc |= PORT_RESET; writel(portsc, &dbgp->ehci_regs->port_status[port - 1]); delay = HUB_ROOT_RESET_TIME; for ( delay_time = 0; delay_time < HUB_RESET_TIMEOUT; delay_time += delay ) { dbgp_mdelay(delay); portsc = readl(&dbgp->ehci_regs->port_status[port - 1]); if (!(portsc & PORT_RESET)) break; } if ( portsc & PORT_RESET ) { /* force reset to complete */ unsigned int loop = 100 * 1000; writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), &dbgp->ehci_regs->port_status[port - 1]); do { udelay(1); portsc = readl(&dbgp->ehci_regs->port_status[port-1]); } while ( (portsc & PORT_RESET) && --loop ); } /* Device went away? */ if ( !(portsc & PORT_CONNECT) ) return -ENOTCONN; /* bomb out completely if something weird happened */ if ( portsc & PORT_CSC ) return -EINVAL; /* If we've finished resetting, then break out of the loop */ if ( !(portsc & PORT_RESET) && (portsc & PORT_PE) ) return 0; return -EBUSY; } static int ehci_wait_for_port(struct ehci_dbgp *dbgp, unsigned int port) { u32 status; unsigned int reps; for ( reps = 0; reps < 300; reps++ ) { status = readl(&dbgp->ehci_regs->status); if ( status & STS_PCD ) break; dbgp_mdelay(1); } return ehci_reset_port(dbgp, port) == 0 ? 0 : -ENOTCONN; } /* Return 0 on success * Return -ENODEV for any general failure * Return -EIO if wait for port fails */ static int ehci_dbgp_external_startup(struct ehci_dbgp *dbgp) { unsigned int devnum; struct usb_debug_descriptor dbgp_desc; int ret; u32 ctrl, portsc, cmd; unsigned int dbg_port = dbgp->phys_port; unsigned int tries = 3; unsigned int reset_port_tries = 1; bool_t try_hard_once = 1; try_port_reset_again: ret = ehci_dbgp_startup(dbgp); if ( ret ) return ret; /* Wait for a device to show up in the debug port */ ret = ehci_wait_for_port(dbgp, dbg_port); if ( ret < 0 ) { portsc = readl(&dbgp->ehci_regs->port_status[dbg_port - 1]); if ( !(portsc & PORT_CONNECT) && try_hard_once ) { /* * Last ditch effort to try to force enable the debug device by * using the packet test EHCI command to try and wake it up. 
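* The controller is stopped (CMD_RUN cleared), the port is switched into
* PORT_TEST_PKT mode for ~50ms, and the controller is then reset before
* the whole startup/port-reset sequence is retried once more.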
*/ try_hard_once = 0; cmd = readl(&dbgp->ehci_regs->command); cmd &= ~CMD_RUN; writel(cmd, &dbgp->ehci_regs->command); portsc = readl(&dbgp->ehci_regs->port_status[dbg_port - 1]); portsc |= PORT_TEST_PKT; writel(portsc, &dbgp->ehci_regs->port_status[dbg_port - 1]); ehci_dbgp_status(dbgp, "Trying to force debug port online"); mdelay(50); ehci_dbgp_controller_reset(dbgp); goto try_port_reset_again; } else if ( reset_port_tries-- ) goto try_port_reset_again; dbgp_printk("no device found in debug port\n"); return -EIO; } ehci_dbgp_status(dbgp, "wait for port done"); /* Enable the debug port */ ctrl = readl(&dbgp->ehci_debug->control); ctrl |= DBGP_CLAIM; writel(ctrl, &dbgp->ehci_debug->control); ctrl = readl(&dbgp->ehci_debug->control); if ( (ctrl & DBGP_CLAIM) != DBGP_CLAIM ) { dbgp_printk("no device in debug port\n"); writel(ctrl & ~DBGP_CLAIM, &dbgp->ehci_debug->control); return -ENODEV; } ehci_dbgp_status(dbgp, "debug port enabled"); /* Completely transfer the debug device to the debug controller */ portsc = readl(&dbgp->ehci_regs->port_status[dbg_port - 1]); portsc &= ~PORT_PE; writel(portsc, &dbgp->ehci_regs->port_status[dbg_port - 1]); dbgp_mdelay(100); try_again: /* Find the debug device and make it device number 127 */ for ( devnum = 0; devnum <= 127; devnum++ ) { ret = dbgp_control_msg(dbgp, devnum, USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, &dbgp_desc, sizeof(dbgp_desc)); if ( ret > 0 ) break; } if ( devnum > 127 ) { dbgp_printk("could not find attached debug device\n"); goto err; } dbgp->out.endpoint = dbgp_desc.bDebugOutEndpoint; dbgp->in.endpoint = dbgp_desc.bDebugInEndpoint; /* Move the device to 127 if it isn't already there. */ if ( devnum != USB_DEBUG_DEVNUM ) { ret = dbgp_control_msg(dbgp, devnum, USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); if ( ret < 0 ) { dbgp_printk("could not move attached device to %d\n", USB_DEBUG_DEVNUM); goto err; } devnum = USB_DEBUG_DEVNUM; dbgp_printk("debug device renamed to 127\n"); } /* Enable the debug interface */ ret = dbgp_control_msg(dbgp, USB_DEBUG_DEVNUM, USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); if ( ret < 0 ) { dbgp_printk("could not enable the debug device\n"); goto err; } dbgp_printk("debug interface enabled\n"); /* Perform a small write to get the even/odd data state in sync. 
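* (Bulk data packets alternate DATA0/DATA1 PIDs, so a single throw-away
* byte lets controller and device agree on the toggle before real
* console output begins.)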
*/ ret = dbgp_bulk_write(dbgp, USB_DEBUG_DEVNUM, dbgp->out.endpoint, "\n", 1, &ctrl); if ( !ret ) ret = dbgp_wait_until_done(dbgp, ctrl, DBGP_LOOPS); if ( ret < 0 ) { dbgp_printk("dbgp_bulk_write failed: %d\n", ret); goto err; } dbgp_printk("small write done\n"); dbgp->state = dbgp_idle; return 0; err: if ( tries-- ) goto try_again; return -ENODEV; } typedef void (*set_debug_port_t)(struct ehci_dbgp *, unsigned int); static void default_set_debug_port(struct ehci_dbgp *dbgp, unsigned int port) { } static set_debug_port_t __read_mostly set_debug_port = default_set_debug_port; static void nvidia_set_debug_port(struct ehci_dbgp *dbgp, unsigned int port) { u32 dword = pci_conf_read32(0, dbgp->bus, dbgp->slot, dbgp->func, 0x74); dword &= ~(0x0f << 12); dword |= (port & 0x0f) << 12; pci_conf_write32(0, dbgp->bus, dbgp->slot, dbgp->func, 0x74, dword); dbgp_printk("set debug port to %u\n", port); } static void __init detect_set_debug_port(struct ehci_dbgp *dbgp) { if ( pci_conf_read16(0, dbgp->bus, dbgp->slot, dbgp->func, PCI_VENDOR_ID) == 0x10de ) { dbgp_printk("using nvidia set_debug_port\n"); set_debug_port = nvidia_set_debug_port; } } /* * The code in ehci_dbgp_bios_handoff() is derived from the USB PCI * quirk initialization in Linux. */ #define EHCI_USBLEGSUP_BIOS (1 << 16) /* BIOS semaphore */ #define EHCI_USBLEGCTLSTS 4 /* legacy control/status */ static void ehci_dbgp_bios_handoff(struct ehci_dbgp *dbgp, u32 hcc_params) { u32 cap; unsigned int offset = HCC_EXT_CAPS(hcc_params); int msec; if ( !offset ) return; cap = pci_conf_read32(0, dbgp->bus, dbgp->slot, dbgp->func, offset); dbgp_printk("dbgp: EHCI BIOS state %08x\n", cap); if ( (cap & 0xff) == 1 && (cap & EHCI_USBLEGSUP_BIOS) ) { dbgp_printk("dbgp: BIOS handoff\n"); pci_conf_write8(0, dbgp->bus, dbgp->slot, dbgp->func, offset + 3, 1); } /* if boot firmware now owns EHCI, spin till it hands it over. */ msec = 1000; while ( (cap & EHCI_USBLEGSUP_BIOS) && (msec > 0) ) { mdelay(10); msec -= 10; cap = pci_conf_read32(0, dbgp->bus, dbgp->slot, dbgp->func, offset); } if ( cap & EHCI_USBLEGSUP_BIOS ) { /* well, possibly buggy BIOS... 
try to shut it down, * and hope nothing goes too wrong */ dbgp_printk("dbgp: BIOS handoff failed: %08x\n", cap); pci_conf_write8(0, dbgp->bus, dbgp->slot, dbgp->func, offset + 2, 0); } /* just in case, always disable EHCI SMIs */ pci_conf_write8(0, dbgp->bus, dbgp->slot, dbgp->func, offset + EHCI_USBLEGCTLSTS, 0); } static int ehci_dbgp_setup(struct ehci_dbgp *dbgp) { u32 ctrl, portsc, hcs_params; unsigned int i, debug_port, new_debug_port = 0, n_ports; unsigned int port_map_tried, playtimes = 3; int ret; ehci_dbgp_bios_handoff(dbgp, readl(&dbgp->ehci_caps->hcc_params)); try_next_time: port_map_tried = 0; try_next_port: hcs_params = readl(&dbgp->ehci_caps->hcs_params); debug_port = HCS_DEBUG_PORT(hcs_params); dbgp->phys_port = debug_port; n_ports = HCS_N_PORTS(hcs_params); dbgp_printk("debug_port: %u\n", debug_port); dbgp_printk("n_ports: %u\n", n_ports); ehci_dbgp_status(dbgp, ""); if ( n_ports == 0 ) return -1; for ( i = 1; i <= n_ports; i++ ) { portsc = readl(&dbgp->ehci_regs->port_status[i-1]); dbgp_printk("portstatus%d: %08x\n", i, portsc); } if ( port_map_tried && (new_debug_port != debug_port) ) { if ( --playtimes ) { set_debug_port(dbgp, new_debug_port); goto try_next_time; } return -1; } /* Only reset the controller if it is not already in the * configured state */ if ( readl(&dbgp->ehci_regs->configured_flag) & FLAG_CF ) ehci_dbgp_status(dbgp, "ehci skip - already configured"); else if ( ehci_dbgp_controller_reset(dbgp) != 0 ) return -1; ret = ehci_dbgp_external_startup(dbgp); if (ret == -EIO) goto next_debug_port; if ( ret < 0 ) { /* Things didn't work so remove my claim */ ctrl = readl(&dbgp->ehci_debug->control); ctrl &= ~(DBGP_CLAIM | DBGP_OUT); writel(ctrl, &dbgp->ehci_debug->control); return -1; } return 0; next_debug_port: port_map_tried |= 1 << (debug_port - 1); new_debug_port = (debug_port % n_ports) + 1; if ( port_map_tried != ((1 << n_ports) - 1) ) { set_debug_port(dbgp, new_debug_port); goto try_next_port; } if ( --playtimes ) { set_debug_port(dbgp, new_debug_port); goto try_next_time; } return -1; } static inline void _ehci_dbgp_flush(struct ehci_dbgp *dbgp) { if ( dbgp_bulk_write(dbgp, USB_DEBUG_DEVNUM, dbgp->out.endpoint, dbgp->out.buf, dbgp->out.chunk, NULL) ) BUG(); dbgp->out.chunk = 0; } static void ehci_dbgp_flush(struct serial_port *port) { struct ehci_dbgp *dbgp = port->uart; s_time_t goal; if ( !dbgp->out.chunk || !dbgp->ehci_debug || dbgp->state == dbgp_unsafe ) return; if ( dbgp->state == dbgp_idle || !port->sync ) dbgp_check_for_completion(dbgp, 1, NULL); else dbgp_wait_until_complete(dbgp, NULL); if ( dbgp->state == dbgp_idle ) { _ehci_dbgp_flush(dbgp); if ( port->sync ) { dbgp_wait_until_complete(dbgp, NULL); return; } } goal = NOW() + MICROSECS(DBGP_CHECK_INTERVAL); if ( dbgp->timer.expires > goal ) set_timer(&dbgp->timer, goal); } static void ehci_dbgp_putc(struct serial_port *port, char c) { struct ehci_dbgp *dbgp = port->uart; if ( unlikely(dbgp->out.chunk >= DBGP_MAX_PACKET) ) return; dbgp->out.buf[dbgp->out.chunk++] = c; if ( dbgp->out.chunk == DBGP_MAX_PACKET ) ehci_dbgp_flush(port); } static int ehci_dbgp_tx_ready(struct serial_port *port) { struct ehci_dbgp *dbgp = port->uart; if ( unlikely(!dbgp->ehci_debug) || unlikely(dbgp->state == dbgp_unsafe) ) return port->sync || port->tx_log_everything || !port->txbuf; if ( dbgp->out.chunk == DBGP_MAX_PACKET ) ehci_dbgp_flush(port); else dbgp_check_for_completion(dbgp, 1, NULL); if ( dbgp->state != dbgp_idle && dbgp->out.chunk >= DBGP_MAX_PACKET ) return 0; return DBGP_MAX_PACKET - dbgp->out.chunk + 
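/* plus one extra packet's worth of headroom when the port is idle */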
(dbgp->state == dbgp_idle) * DBGP_MAX_PACKET; } static int ehci_dbgp_getc(struct serial_port *port, char *pc) { struct ehci_dbgp *dbgp = port->uart; if ( !dbgp->in.chunk ) return 0; *pc = *dbgp->in.buf; if ( --dbgp->in.chunk ) memmove(dbgp->in.buf, dbgp->in.buf + 1, dbgp->in.chunk); return 1; } /* Safe: ehci_dbgp_poll() runs as timer handler, so not reentrant. */ static struct serial_port *poll_port; static void _ehci_dbgp_poll(struct cpu_user_regs *regs) { struct serial_port *port = poll_port; struct ehci_dbgp *dbgp = port->uart; unsigned long flags; unsigned int timeout = MICROSECS(DBGP_CHECK_INTERVAL); bool_t empty = 0; if ( !dbgp->ehci_debug ) return; if ( spin_trylock_irqsave(&port->tx_lock, flags) ) { if ( dbgp->state != dbgp_unsafe ) dbgp_check_for_completion(dbgp, DBGP_CHECK_INTERVAL, NULL); if ( dbgp->state == dbgp_idle && dbgp->out.chunk ) _ehci_dbgp_flush(dbgp); if ( dbgp->state == dbgp_idle || dbgp->out.chunk < DBGP_MAX_PACKET ) empty = 1; spin_unlock_irqrestore(&port->tx_lock, flags); } if ( dbgp->in.chunk ) serial_rx_interrupt(port, regs); if ( empty ) serial_tx_interrupt(port, regs); if ( spin_trylock_irqsave(&port->tx_lock, flags) ) { if ( dbgp->state == dbgp_idle && !dbgp->in.chunk && !dbgp->out.chunk && port->txbufp == port->txbufc ) { if ( dbgp_bulk_read(dbgp, USB_DEBUG_DEVNUM, dbgp->in.endpoint, DBGP_MAX_PACKET, NULL) ) BUG(); timeout = MILLISECS(DBGP_IDLE_INTERVAL); } spin_unlock_irqrestore(&port->tx_lock, flags); } set_timer(&dbgp->timer, NOW() + timeout); } static void ehci_dbgp_poll(void *data) { poll_port = data; #ifdef run_in_exception_handler run_in_exception_handler(_ehci_dbgp_poll); #else _ehci_dbgp_poll(guest_cpu_user_regs()); #endif } static bool_t ehci_dbgp_setup_preirq(struct ehci_dbgp *dbgp) { if ( !ehci_dbgp_setup(dbgp) ) return 1; dbgp_printk("ehci_dbgp_setup failed\n"); dbgp->ehci_debug = NULL; return 0; } static void __init ehci_dbgp_init_preirq(struct serial_port *port) { struct ehci_dbgp *dbgp = port->uart; u32 debug_port, offset; void __iomem *ehci_bar; debug_port = pci_conf_read32(0, dbgp->bus, dbgp->slot, dbgp->func, dbgp->cap); offset = (debug_port >> 16) & 0xfff; /* double check if the mem space is enabled */ dbgp->pci_cr = pci_conf_read8(0, dbgp->bus, dbgp->slot, dbgp->func, PCI_COMMAND); if ( !(dbgp->pci_cr & PCI_COMMAND_MEMORY) ) { dbgp->pci_cr |= PCI_COMMAND_MEMORY; pci_conf_write16(0, dbgp->bus, dbgp->slot, dbgp->func, PCI_COMMAND, dbgp->pci_cr); dbgp_printk("MMIO for EHCI enabled\n"); } /* * FIXME I don't have the bar size so just guess PAGE_SIZE is more * than enough. 1k is the biggest that was seen. 
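* The page is mapped uncached through the FIX_EHCI_DBGP fixmap slot
* below, and the sub-page offset of bar_val is added back to reach the
* EHCI capability registers.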
*/ set_fixmap_nocache(FIX_EHCI_DBGP, dbgp->bar_val); ehci_bar = (void __iomem *)fix_to_virt(FIX_EHCI_DBGP); ehci_bar += dbgp->bar_val & ~PAGE_MASK; dbgp_printk("ehci_bar: %p\n", ehci_bar); dbgp->ehci_caps = ehci_bar; dbgp->ehci_regs = ehci_bar + HC_LENGTH(readl(&dbgp->ehci_caps->hc_capbase)); dbgp->ehci_debug = ehci_bar + offset; detect_set_debug_port(dbgp); if ( ehci_dbgp_setup_preirq(dbgp) ) ehci_dbgp_status(dbgp, "ehci_dbgp_init_preirq complete"); dbgp->lock = &port->tx_lock; } static void ehci_dbgp_setup_postirq(struct ehci_dbgp *dbgp) { set_timer(&dbgp->timer, NOW() + MILLISECS(1)); } static void __init ehci_dbgp_init_postirq(struct serial_port *port) { struct ehci_dbgp *dbgp = port->uart; if ( !dbgp->ehci_debug ) return; serial_async_transmit(port); init_timer(&dbgp->timer, ehci_dbgp_poll, port, 0); ehci_dbgp_setup_postirq(dbgp); pci_hide_device(dbgp->bus, PCI_DEVFN(dbgp->slot, dbgp->func)); } static int ehci_dbgp_check_release(struct ehci_dbgp *dbgp) { struct ehci_dbg_port __iomem *ehci_debug = dbgp->ehci_debug; u32 ctrl; unsigned int i; if ( !ehci_debug ) return 0; for ( i = 0; i < DBGP_MAX_PACKET; ++i ) if ( dbgp->out.buf[i] ) return 1; /* * This means the console is not initialized, or should get shutdown * so as to allow for reuse of the USB device, which means it is time * to shutdown the USB debug port. */ printk(XENLOG_INFO "Releasing EHCI debug port at %02x:%02x.%u\n", dbgp->bus, dbgp->slot, dbgp->func); if ( dbgp->timer.function ) kill_timer(&dbgp->timer); dbgp->ehci_debug = NULL; ctrl = readl(&ehci_debug->control); if ( ctrl & DBGP_ENABLED ) { ctrl &= ~DBGP_CLAIM; writel(ctrl, &ehci_debug->control); } return 0; } static void __init ehci_dbgp_endboot(struct serial_port *port) { ehci_dbgp_check_release(port->uart); } static void ehci_dbgp_suspend(struct serial_port *port) { struct ehci_dbgp *dbgp = port->uart; if ( !dbgp->ehci_debug ) return; stop_timer(&dbgp->timer); dbgp->timer.expires = 0; dbgp->pci_cr = pci_conf_read16(0, dbgp->bus, dbgp->slot, dbgp->func, PCI_COMMAND); dbgp->state = dbgp_unsafe; } static void ehci_dbgp_resume(struct serial_port *port) { struct ehci_dbgp *dbgp = port->uart; if ( !dbgp->ehci_debug ) return; pci_conf_write32(0, dbgp->bus, dbgp->slot, dbgp->func, dbgp->bar, dbgp->bar_val); pci_conf_write16(0, dbgp->bus, dbgp->slot, dbgp->func, PCI_COMMAND, dbgp->pci_cr); ehci_dbgp_setup_preirq(dbgp); ehci_dbgp_setup_postirq(dbgp); } static struct uart_driver __read_mostly ehci_dbgp_driver = { .init_preirq = ehci_dbgp_init_preirq, .init_postirq = ehci_dbgp_init_postirq, .endboot = ehci_dbgp_endboot, .suspend = ehci_dbgp_suspend, .resume = ehci_dbgp_resume, .tx_ready = ehci_dbgp_tx_ready, .putc = ehci_dbgp_putc, .flush = ehci_dbgp_flush, .getc = ehci_dbgp_getc }; static struct ehci_dbgp ehci_dbgp = { .state = dbgp_unsafe, .phys_port = 1 }; static char __initdata opt_dbgp[30]; string_param("dbgp", opt_dbgp); void __init ehci_dbgp_init(void) { struct ehci_dbgp *dbgp = &ehci_dbgp; u32 debug_port, offset, bar_val; const char *e; if ( strncmp(opt_dbgp, "ehci", 4) ) return; if ( isdigit(opt_dbgp[4]) || !opt_dbgp[4] ) { unsigned int num = 0; if ( opt_dbgp[4] ) simple_strtoul(opt_dbgp + 4, &e, 10); dbgp->cap = find_dbgp(dbgp, num); if ( !dbgp->cap ) return; dbgp_printk("Found EHCI debug port on %02x:%02x.%u\n", dbgp->bus, dbgp->slot, dbgp->func); } else if ( strncmp(opt_dbgp + 4, "@pci", 4) == 0 ) { unsigned int bus, slot, func; e = parse_pci(opt_dbgp + 8, NULL, &bus, &slot, &func); if ( !e || *e ) return; dbgp->bus = bus; dbgp->slot = slot; dbgp->func = func; if ( 
!pci_device_detect(0, bus, slot, func) ) return; dbgp->cap = __find_dbgp(bus, slot, func); if ( !dbgp->cap ) return; dbgp_printk("Using EHCI debug port on %02x:%02x.%u\n", bus, slot, func); } else return; debug_port = pci_conf_read32(0, dbgp->bus, dbgp->slot, dbgp->func, dbgp->cap); dbgp->bar = (debug_port >> 29) & 0x7; dbgp->bar = ((dbgp->bar - 1) * 4) + PCI_BASE_ADDRESS_0; offset = (debug_port >> 16) & 0xfff; dbgp_printk("bar: %02x offset: %03x\n", dbgp->bar, offset); if ( dbgp->bar < PCI_BASE_ADDRESS_0 || dbgp->bar > PCI_BASE_ADDRESS_5 ) { dbgp_printk("unsupported/invalid bar\n"); return; } dbgp->bar_val = bar_val = pci_conf_read32(0, dbgp->bus, dbgp->slot, dbgp->func, dbgp->bar); dbgp_printk("bar_val: %08x\n", bar_val); if ( bar_val & ~PCI_BASE_ADDRESS_MEM_MASK ) { dbgp_printk("only simple 32-bit MMIO BARs supported\n"); return; } bar_val &= PCI_BASE_ADDRESS_MEM_MASK; if ( !bar_val || !(bar_val + (bar_val & -bar_val)) ) { dbgp_printk("firmware initialization of MMIO BAR required\n"); return; } serial_register_uart(SERHND_DBGP, &ehci_dbgp_driver, dbgp); } int dbgp_op(const struct physdev_dbgp_op *op) { if ( !ehci_dbgp.ehci_debug ) return 0; switch ( op->bus ) { case PHYSDEVOP_DBGP_BUS_UNKNOWN: break; case PHYSDEVOP_DBGP_BUS_PCI: if ( op->u.pci.seg || ehci_dbgp.bus != op->u.pci.bus || PCI_DEVFN(ehci_dbgp.slot, ehci_dbgp.func) != op->u.pci.devfn ) default: return 0; break; } switch ( op->op ) { case PHYSDEVOP_DBGP_RESET_PREPARE: spin_lock_irq(ehci_dbgp.lock); ehci_dbgp.state = dbgp_unsafe; dbgp_wait_until_complete(&ehci_dbgp, NULL); spin_unlock_irq(ehci_dbgp.lock); return ehci_dbgp_check_release(&ehci_dbgp); case PHYSDEVOP_DBGP_RESET_DONE: return ehci_dbgp_external_startup(&ehci_dbgp) ?: 1; } return -ENOSYS; } xen-4.4.0/xen/drivers/cpufreq/0000775000175000017500000000000012307313555014360 5ustar smbsmbxen-4.4.0/xen/drivers/cpufreq/Makefile0000664000175000017500000000014412307313555016017 0ustar smbsmbobj-y += cpufreq.o obj-y += cpufreq_ondemand.o obj-y += cpufreq_misc_governors.o obj-y += utility.o xen-4.4.0/xen/drivers/cpufreq/cpufreq.c0000664000175000017500000004356112307313555016202 0ustar smbsmb/* * Copyright (C) 2001, 2002 Andy Grover * Copyright (C) 2001, 2002 Paul Diefenbaugh * Copyright (C) 2002 - 2004 Dominik Brodowski * Copyright (C) 2006 Denis Sadykov * * Feb 2008 - Liu Jinsong * Add cpufreq limit change handle and per-cpu cpufreq add/del * to cope with cpu hotplug * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
* * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static unsigned int __read_mostly usr_min_freq; static unsigned int __read_mostly usr_max_freq; static void cpufreq_cmdline_common_para(struct cpufreq_policy *new_policy); struct cpufreq_dom { unsigned int dom; cpumask_var_t map; struct list_head node; }; static LIST_HEAD_READ_MOSTLY(cpufreq_dom_list_head); struct cpufreq_governor *__read_mostly cpufreq_opt_governor; LIST_HEAD_READ_MOSTLY(cpufreq_governor_list); /* set xen as default cpufreq */ enum cpufreq_controller cpufreq_controller = FREQCTL_xen; static void __init setup_cpufreq_option(char *str) { char *arg; if ( !strcmp(str, "dom0-kernel") ) { xen_processor_pmbits &= ~XEN_PROCESSOR_PM_PX; cpufreq_controller = FREQCTL_dom0_kernel; opt_dom0_vcpus_pin = 1; return; } if ( !strcmp(str, "none") ) { xen_processor_pmbits &= ~XEN_PROCESSOR_PM_PX; cpufreq_controller = FREQCTL_none; return; } if ( (arg = strpbrk(str, ",:")) != NULL ) *arg++ = '\0'; if ( !strcmp(str, "xen") ) if ( arg && *arg ) cpufreq_cmdline_parse(arg); } custom_param("cpufreq", setup_cpufreq_option); bool_t __read_mostly cpufreq_verbose; struct cpufreq_governor *__find_governor(const char *governor) { struct cpufreq_governor *t; if (!governor) return NULL; list_for_each_entry(t, &cpufreq_governor_list, governor_list) if (!strnicmp(governor, t->name, CPUFREQ_NAME_LEN)) return t; return NULL; } int __init cpufreq_register_governor(struct cpufreq_governor *governor) { if (!governor) return -EINVAL; if (__find_governor(governor->name) != NULL) return -EEXIST; list_add(&governor->governor_list, &cpufreq_governor_list); return 0; } int cpufreq_limit_change(unsigned int cpu) { struct processor_performance *perf = &processor_pminfo[cpu]->perf; struct cpufreq_policy *data; struct cpufreq_policy policy; if (!cpu_online(cpu) || !(data = per_cpu(cpufreq_cpu_policy, cpu)) || !processor_pminfo[cpu]) return -ENODEV; if (perf->platform_limit >= perf->state_count) return -EINVAL; memcpy(&policy, data, sizeof(struct cpufreq_policy)); policy.max = perf->states[perf->platform_limit].core_frequency * 1000; return __cpufreq_set_policy(data, &policy); } int cpufreq_add_cpu(unsigned int cpu) { int ret = 0; unsigned int firstcpu; unsigned int dom, domexist = 0; unsigned int hw_all = 0; struct list_head *pos; struct cpufreq_dom *cpufreq_dom = NULL; struct cpufreq_policy new_policy; struct cpufreq_policy *policy; struct processor_performance *perf = &processor_pminfo[cpu]->perf; /* to protect the case when Px was not controlled by xen */ if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT) || !cpu_online(cpu)) return -EINVAL; if (!cpufreq_driver) return 0; if (per_cpu(cpufreq_cpu_policy, cpu)) return 0; if (perf->shared_type == CPUFREQ_SHARED_TYPE_HW) hw_all = 1; dom = perf->domain_info.domain; list_for_each(pos, &cpufreq_dom_list_head) { cpufreq_dom = list_entry(pos, struct cpufreq_dom, node); if (dom == cpufreq_dom->dom) { domexist = 1; break; } } if (!domexist) { cpufreq_dom = xzalloc(struct cpufreq_dom); if (!cpufreq_dom) return -ENOMEM; if (!zalloc_cpumask_var(&cpufreq_dom->map)) { xfree(cpufreq_dom); return -ENOMEM; } cpufreq_dom->dom = dom; list_add(&cpufreq_dom->node, &cpufreq_dom_list_head); } else { /* domain sanity check under whatever coordination type */ firstcpu = cpumask_first(cpufreq_dom->map); if ((perf->domain_info.coord_type != 
processor_pminfo[firstcpu]->perf.domain_info.coord_type) || (perf->domain_info.num_processors != processor_pminfo[firstcpu]->perf.domain_info.num_processors)) { printk(KERN_WARNING "cpufreq fail to add CPU%d:" "incorrect _PSD(%"PRIu64":%"PRIu64"), " "expect(%"PRIu64"/%"PRIu64")\n", cpu, perf->domain_info.coord_type, perf->domain_info.num_processors, processor_pminfo[firstcpu]->perf.domain_info.coord_type, processor_pminfo[firstcpu]->perf.domain_info.num_processors ); return -EINVAL; } } if (!domexist || hw_all) { policy = xzalloc(struct cpufreq_policy); if (!policy) { ret = -ENOMEM; goto err0; } if (!zalloc_cpumask_var(&policy->cpus)) { xfree(policy); ret = -ENOMEM; goto err0; } policy->cpu = cpu; per_cpu(cpufreq_cpu_policy, cpu) = policy; ret = cpufreq_driver->init(policy); if (ret) { free_cpumask_var(policy->cpus); xfree(policy); per_cpu(cpufreq_cpu_policy, cpu) = NULL; goto err0; } if (cpufreq_verbose) printk("CPU %u initialization completed\n", cpu); } else { firstcpu = cpumask_first(cpufreq_dom->map); policy = per_cpu(cpufreq_cpu_policy, firstcpu); per_cpu(cpufreq_cpu_policy, cpu) = policy; if (cpufreq_verbose) printk("adding CPU %u\n", cpu); } cpumask_set_cpu(cpu, policy->cpus); cpumask_set_cpu(cpu, cpufreq_dom->map); ret = cpufreq_statistic_init(cpu); if (ret) goto err1; if (hw_all || (cpumask_weight(cpufreq_dom->map) == perf->domain_info.num_processors)) { memcpy(&new_policy, policy, sizeof(struct cpufreq_policy)); policy->governor = NULL; cpufreq_cmdline_common_para(&new_policy); ret = __cpufreq_set_policy(policy, &new_policy); if (ret) { if (new_policy.governor == CPUFREQ_DEFAULT_GOVERNOR) /* if default governor fail, cpufreq really meet troubles */ goto err2; else { /* grub option governor fail */ /* give one more chance to default gov */ memcpy(&new_policy, policy, sizeof(struct cpufreq_policy)); new_policy.governor = CPUFREQ_DEFAULT_GOVERNOR; ret = __cpufreq_set_policy(policy, &new_policy); if (ret) goto err2; } } } return 0; err2: cpufreq_statistic_exit(cpu); err1: per_cpu(cpufreq_cpu_policy, cpu) = NULL; cpumask_clear_cpu(cpu, policy->cpus); cpumask_clear_cpu(cpu, cpufreq_dom->map); if (cpumask_empty(policy->cpus)) { cpufreq_driver->exit(policy); free_cpumask_var(policy->cpus); xfree(policy); } err0: if (cpumask_empty(cpufreq_dom->map)) { list_del(&cpufreq_dom->node); free_cpumask_var(cpufreq_dom->map); xfree(cpufreq_dom); } return ret; } int cpufreq_del_cpu(unsigned int cpu) { unsigned int dom, domexist = 0; unsigned int hw_all = 0; struct list_head *pos; struct cpufreq_dom *cpufreq_dom = NULL; struct cpufreq_policy *policy; struct processor_performance *perf = &processor_pminfo[cpu]->perf; /* to protect the case when Px was not controlled by xen */ if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT) || !cpu_online(cpu)) return -EINVAL; if (!per_cpu(cpufreq_cpu_policy, cpu)) return 0; if (perf->shared_type == CPUFREQ_SHARED_TYPE_HW) hw_all = 1; dom = perf->domain_info.domain; policy = per_cpu(cpufreq_cpu_policy, cpu); list_for_each(pos, &cpufreq_dom_list_head) { cpufreq_dom = list_entry(pos, struct cpufreq_dom, node); if (dom == cpufreq_dom->dom) { domexist = 1; break; } } if (!domexist) return -EINVAL; /* for HW_ALL, stop gov for each core of the _PSD domain */ /* for SW_ALL & SW_ANY, stop gov for the 1st core of the _PSD domain */ if (hw_all || (cpumask_weight(cpufreq_dom->map) == perf->domain_info.num_processors)) __cpufreq_governor(policy, CPUFREQ_GOV_STOP); cpufreq_statistic_exit(cpu); per_cpu(cpufreq_cpu_policy, cpu) = NULL; cpumask_clear_cpu(cpu, policy->cpus); 
cpumask_clear_cpu(cpu, cpufreq_dom->map); if (cpumask_empty(policy->cpus)) { cpufreq_driver->exit(policy); free_cpumask_var(policy->cpus); xfree(policy); } /* for the last cpu of the domain, clean room */ /* It's safe here to free freq_table, drv_data and policy */ if (cpumask_empty(cpufreq_dom->map)) { list_del(&cpufreq_dom->node); free_cpumask_var(cpufreq_dom->map); xfree(cpufreq_dom); } if (cpufreq_verbose) printk("deleting CPU %u\n", cpu); return 0; } static void print_PCT(struct xen_pct_register *ptr) { printk("\t_PCT: descriptor=%d, length=%d, space_id=%d, " "bit_width=%d, bit_offset=%d, reserved=%d, address=%"PRId64"\n", ptr->descriptor, ptr->length, ptr->space_id, ptr->bit_width, ptr->bit_offset, ptr->reserved, ptr->address); } static void print_PSS(struct xen_processor_px *ptr, int count) { int i; printk("\t_PSS: state_count=%d\n", count); for (i=0; inum_entries, ptr->revision, ptr->domain, ptr->coord_type, ptr->num_processors); } static void print_PPC(unsigned int platform_limit) { printk("\t_PPC: %d\n", platform_limit); } int set_px_pminfo(uint32_t acpi_id, struct xen_processor_performance *dom0_px_info) { int ret=0, cpuid; struct processor_pminfo *pmpt; struct processor_performance *pxpt; cpuid = get_cpu_id(acpi_id); if ( cpuid < 0 || !dom0_px_info) { ret = -EINVAL; goto out; } if ( cpufreq_verbose ) printk("Set CPU acpi_id(%d) cpuid(%d) Px State info:\n", acpi_id, cpuid); pmpt = processor_pminfo[cpuid]; if ( !pmpt ) { pmpt = xzalloc(struct processor_pminfo); if ( !pmpt ) { ret = -ENOMEM; goto out; } processor_pminfo[cpuid] = pmpt; } pxpt = &pmpt->perf; pmpt->acpi_id = acpi_id; pmpt->id = cpuid; if ( dom0_px_info->flags & XEN_PX_PCT ) { /* space_id check */ if (dom0_px_info->control_register.space_id != dom0_px_info->status_register.space_id) { ret = -EINVAL; goto out; } memcpy ((void *)&pxpt->control_register, (void *)&dom0_px_info->control_register, sizeof(struct xen_pct_register)); memcpy ((void *)&pxpt->status_register, (void *)&dom0_px_info->status_register, sizeof(struct xen_pct_register)); if ( cpufreq_verbose ) { print_PCT(&pxpt->control_register); print_PCT(&pxpt->status_register); } } if ( dom0_px_info->flags & XEN_PX_PSS ) { /* capability check */ if (dom0_px_info->state_count <= 1) { ret = -EINVAL; goto out; } if ( !(pxpt->states = xmalloc_array(struct xen_processor_px, dom0_px_info->state_count)) ) { ret = -ENOMEM; goto out; } if ( copy_from_guest(pxpt->states, dom0_px_info->states, dom0_px_info->state_count) ) { ret = -EFAULT; goto out; } pxpt->state_count = dom0_px_info->state_count; if ( cpufreq_verbose ) print_PSS(pxpt->states,pxpt->state_count); } if ( dom0_px_info->flags & XEN_PX_PSD ) { /* check domain coordination */ if (dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ALL && dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ANY && dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_HW) { ret = -EINVAL; goto out; } pxpt->shared_type = dom0_px_info->shared_type; memcpy ((void *)&pxpt->domain_info, (void *)&dom0_px_info->domain_info, sizeof(struct xen_psd_package)); if ( cpufreq_verbose ) print_PSD(&pxpt->domain_info); } if ( dom0_px_info->flags & XEN_PX_PPC ) { pxpt->platform_limit = dom0_px_info->platform_limit; if ( cpufreq_verbose ) print_PPC(pxpt->platform_limit); if ( pxpt->init == XEN_PX_INIT ) { ret = cpufreq_limit_change(cpuid); goto out; } } if ( dom0_px_info->flags == ( XEN_PX_PCT | XEN_PX_PSS | XEN_PX_PSD | XEN_PX_PPC ) ) { pxpt->init = XEN_PX_INIT; ret = cpufreq_cpu_init(cpuid); goto out; } out: return ret; } static void 
cpufreq_cmdline_common_para(struct cpufreq_policy *new_policy) { if (usr_max_freq) new_policy->max = usr_max_freq; if (usr_min_freq) new_policy->min = usr_min_freq; } static int __init cpufreq_handle_common_option(const char *name, const char *val) { if (!strcmp(name, "maxfreq") && val) { usr_max_freq = simple_strtoul(val, NULL, 0); return 1; } if (!strcmp(name, "minfreq") && val) { usr_min_freq = simple_strtoul(val, NULL, 0); return 1; } if (!strcmp(name, "verbose")) { cpufreq_verbose = !val || !!simple_strtoul(val, NULL, 0); return 1; } return 0; } void __init cpufreq_cmdline_parse(char *str) { static struct cpufreq_governor *__initdata cpufreq_governors[] = { CPUFREQ_DEFAULT_GOVERNOR, &cpufreq_gov_userspace, &cpufreq_gov_dbs, &cpufreq_gov_performance, &cpufreq_gov_powersave }; unsigned int gov_index = 0; do { char *val, *end = strchr(str, ','); unsigned int i; if (end) *end++ = '\0'; val = strchr(str, '='); if (val) *val++ = '\0'; if (!cpufreq_opt_governor) { if (!val) { for (i = 0; i < ARRAY_SIZE(cpufreq_governors); ++i) { if (!strcmp(str, cpufreq_governors[i]->name)) { cpufreq_opt_governor = cpufreq_governors[i]; gov_index = i; str = NULL; break; } } } else { cpufreq_opt_governor = CPUFREQ_DEFAULT_GOVERNOR; } } if (str && !cpufreq_handle_common_option(str, val) && (!cpufreq_governors[gov_index]->handle_option || !cpufreq_governors[gov_index]->handle_option(str, val))) printk(XENLOG_WARNING "cpufreq/%s: option '%s' not recognized\n", cpufreq_governors[gov_index]->name, str); str = end; } while (str); } static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch ( action ) { case CPU_DOWN_FAILED: case CPU_ONLINE: (void)cpufreq_add_cpu(cpu); break; case CPU_DOWN_PREPARE: (void)cpufreq_del_cpu(cpu); break; default: break; } return NOTIFY_DONE; } static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; static int __init cpufreq_presmp_init(void) { void *cpu = (void *)(long)smp_processor_id(); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; } presmp_initcall(cpufreq_presmp_init); xen-4.4.0/xen/drivers/cpufreq/cpufreq_ondemand.c0000664000175000017500000002677012307313555020052 0ustar smbsmb/* * xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c * * Copyright (C) 2001 Russell King * (C) 2003 Venkatesh Pallipadi . * Jun Nakajima * Feb 2008 Liu Jinsong * Porting cpufreq_ondemand.c from Liunx 2.6.23 to Xen hypervisor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ #include #include #include #include #include #include #include #define DEF_FREQUENCY_UP_THRESHOLD (80) #define MIN_FREQUENCY_UP_THRESHOLD (11) #define MAX_FREQUENCY_UP_THRESHOLD (100) #define MIN_DBS_INTERVAL (MICROSECS(100)) #define MIN_SAMPLING_RATE_RATIO (2) #define MIN_SAMPLING_MILLISECS (MIN_SAMPLING_RATE_RATIO * 10) #define MIN_STAT_SAMPLING_RATE \ (MIN_SAMPLING_MILLISECS * MILLISECS(1)) #define MIN_SAMPLING_RATE \ (def_sampling_rate / MIN_SAMPLING_RATE_RATIO) #define MAX_SAMPLING_RATE (500 * def_sampling_rate) #define DEF_SAMPLING_RATE_LATENCY_MULTIPLIER (1000) #define TRANSITION_LATENCY_LIMIT (10 * 1000 ) static uint64_t def_sampling_rate; static uint64_t usr_sampling_rate; /* Sampling types */ enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE}; static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ static struct dbs_tuners { uint64_t sampling_rate; unsigned int up_threshold; unsigned int powersave_bias; } dbs_tuners_ins = { .sampling_rate = 0, .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, .powersave_bias = 0, }; static DEFINE_PER_CPU(struct timer, dbs_timer); int write_ondemand_sampling_rate(unsigned int sampling_rate) { if ( (sampling_rate > MAX_SAMPLING_RATE / MICROSECS(1)) || (sampling_rate < MIN_SAMPLING_RATE / MICROSECS(1)) ) return -EINVAL; dbs_tuners_ins.sampling_rate = sampling_rate * MICROSECS(1); return 0; } int write_ondemand_up_threshold(unsigned int up_threshold) { if ( (up_threshold > MAX_FREQUENCY_UP_THRESHOLD) || (up_threshold < MIN_FREQUENCY_UP_THRESHOLD) ) return -EINVAL; dbs_tuners_ins.up_threshold = up_threshold; return 0; } int get_cpufreq_ondemand_para(uint32_t *sampling_rate_max, uint32_t *sampling_rate_min, uint32_t *sampling_rate, uint32_t *up_threshold) { if (!sampling_rate_max || !sampling_rate_min || !sampling_rate || !up_threshold) return -EINVAL; *sampling_rate_max = MAX_SAMPLING_RATE/MICROSECS(1); *sampling_rate_min = MIN_SAMPLING_RATE/MICROSECS(1); *sampling_rate = dbs_tuners_ins.sampling_rate / MICROSECS(1); *up_threshold = dbs_tuners_ins.up_threshold; return 0; } static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) { uint64_t cur_ns, total_ns; uint64_t max_load_freq = 0; struct cpufreq_policy *policy; unsigned int max; unsigned int j; if (!this_dbs_info->enable) return; policy = this_dbs_info->cur_policy; max = policy->max; if (unlikely(policy->resume)) { __cpufreq_driver_target(policy, max,CPUFREQ_RELATION_H); return; } cur_ns = NOW(); total_ns = cur_ns - this_dbs_info->prev_cpu_wall; this_dbs_info->prev_cpu_wall = NOW(); if (total_ns < MIN_DBS_INTERVAL) return; /* Get Idle Time */ for_each_cpu(j, policy->cpus) { uint64_t idle_ns, total_idle_ns; uint64_t load, load_freq, freq_avg; struct cpu_dbs_info_s *j_dbs_info; j_dbs_info = &per_cpu(cpu_dbs_info, j); total_idle_ns = get_cpu_idle_time(j); idle_ns = total_idle_ns - j_dbs_info->prev_cpu_idle; j_dbs_info->prev_cpu_idle = total_idle_ns; if (unlikely(total_ns < idle_ns)) continue; load = 100 * (total_ns - idle_ns) / total_ns; freq_avg = cpufreq_driver_getavg(j, GOV_GETAVG); load_freq = load * freq_avg; if (load_freq > max_load_freq) max_load_freq = load_freq; } /* Check for frequency increase */ if (max_load_freq > (uint64_t) dbs_tuners_ins.up_threshold * policy->cur) { /* if we are already at full speed then break out early */ if (policy->cur == max) return; __cpufreq_driver_target(policy, max, CPUFREQ_RELATION_H); return; } /* Check for frequency decrease */ /* if we cannot reduce the frequency anymore, break out early 
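* (i.e. the current frequency already equals policy->min)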
*/ if (policy->cur == policy->min) return; /* * The optimal frequency is the frequency that is the lowest that * can support the current CPU usage without triggering the up * policy. To be safe, we focus 10 points under the threshold. */ if (max_load_freq < (uint64_t) (dbs_tuners_ins.up_threshold - 10) * policy->cur) { uint64_t freq_next; freq_next = max_load_freq / (dbs_tuners_ins.up_threshold - 10); __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_L); } } static void do_dbs_timer(void *dbs) { struct cpu_dbs_info_s *dbs_info = (struct cpu_dbs_info_s *)dbs; if (!dbs_info->enable) return; dbs_check_cpu(dbs_info); set_timer(&per_cpu(dbs_timer, dbs_info->cpu), align_timer(NOW() , dbs_tuners_ins.sampling_rate)); } static void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) { dbs_info->enable = 1; init_timer(&per_cpu(dbs_timer, dbs_info->cpu), do_dbs_timer, (void *)dbs_info, dbs_info->cpu); set_timer(&per_cpu(dbs_timer, dbs_info->cpu), NOW()+dbs_tuners_ins.sampling_rate); if ( processor_pminfo[dbs_info->cpu]->perf.shared_type == CPUFREQ_SHARED_TYPE_HW ) { dbs_info->stoppable = 1; } } static void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) { dbs_info->enable = 0; dbs_info->stoppable = 0; kill_timer(&per_cpu(dbs_timer, dbs_info->cpu)); } int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event) { unsigned int cpu = policy->cpu; struct cpu_dbs_info_s *this_dbs_info; unsigned int j; this_dbs_info = &per_cpu(cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: if ((!cpu_online(cpu)) || (!policy->cur)) return -EINVAL; if (policy->cpuinfo.transition_latency > (TRANSITION_LATENCY_LIMIT * 1000)) { printk(KERN_WARNING "ondemand governor failed to load " "due to too long transition latency\n"); return -EINVAL; } if (this_dbs_info->enable) /* Already enabled */ break; dbs_enable++; for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; j_dbs_info = &per_cpu(cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j); j_dbs_info->prev_cpu_wall = NOW(); } this_dbs_info->cpu = cpu; /* * Start the timerschedule work, when this governor * is used for first time */ if ((dbs_enable == 1) && !dbs_tuners_ins.sampling_rate) { def_sampling_rate = (uint64_t) policy->cpuinfo.transition_latency * DEF_SAMPLING_RATE_LATENCY_MULTIPLIER; if (def_sampling_rate < MIN_STAT_SAMPLING_RATE) def_sampling_rate = MIN_STAT_SAMPLING_RATE; if (!usr_sampling_rate) dbs_tuners_ins.sampling_rate = def_sampling_rate; else if (usr_sampling_rate < MIN_SAMPLING_RATE) { printk(KERN_WARNING "cpufreq/ondemand: " "specified sampling rate too low, using %"PRIu64"\n", MIN_SAMPLING_RATE); dbs_tuners_ins.sampling_rate = MIN_SAMPLING_RATE; } else if (usr_sampling_rate > MAX_SAMPLING_RATE) { printk(KERN_WARNING "cpufreq/ondemand: " "specified sampling rate too high, using %"PRIu64"\n", MAX_SAMPLING_RATE); dbs_tuners_ins.sampling_rate = MAX_SAMPLING_RATE; } else dbs_tuners_ins.sampling_rate = usr_sampling_rate; } dbs_timer_init(this_dbs_info); break; case CPUFREQ_GOV_STOP: dbs_timer_exit(this_dbs_info); dbs_enable--; break; case CPUFREQ_GOV_LIMITS: if ( this_dbs_info->cur_policy == NULL ) { printk(KERN_WARNING "CPU%d ondemand governor not started yet," "unable to GOV_LIMIT\n", cpu); return -EINVAL; } if (policy->max < this_dbs_info->cur_policy->cur) __cpufreq_driver_target(this_dbs_info->cur_policy, policy->max, CPUFREQ_RELATION_H); else if (policy->min > this_dbs_info->cur_policy->cur) __cpufreq_driver_target(this_dbs_info->cur_policy, policy->min, 
CPUFREQ_RELATION_L); break; } return 0; } static bool_t __init cpufreq_dbs_handle_option(const char *name, const char *val) { if ( !strcmp(name, "rate") && val ) { usr_sampling_rate = simple_strtoull(val, NULL, 0) * MICROSECS(1); } else if ( !strcmp(name, "up_threshold") && val ) { unsigned long tmp = simple_strtoul(val, NULL, 0); if ( tmp < MIN_FREQUENCY_UP_THRESHOLD ) { printk(XENLOG_WARNING "cpufreq/ondemand: " "specified threshold too low, using %d\n", MIN_FREQUENCY_UP_THRESHOLD); tmp = MIN_FREQUENCY_UP_THRESHOLD; } else if ( tmp > MAX_FREQUENCY_UP_THRESHOLD ) { printk(XENLOG_WARNING "cpufreq/ondemand: " "specified threshold too high, using %d\n", MAX_FREQUENCY_UP_THRESHOLD); tmp = MAX_FREQUENCY_UP_THRESHOLD; } dbs_tuners_ins.up_threshold = tmp; } else if ( !strcmp(name, "bias") && val ) { unsigned long tmp = simple_strtoul(val, NULL, 0); if ( tmp > 1000 ) { printk(XENLOG_WARNING "cpufreq/ondemand: " "specified bias too high, using 1000\n"); tmp = 1000; } dbs_tuners_ins.powersave_bias = tmp; } else return 0; return 1; } struct cpufreq_governor cpufreq_gov_dbs = { .name = "ondemand", .governor = cpufreq_governor_dbs, .handle_option = cpufreq_dbs_handle_option }; static int __init cpufreq_gov_dbs_init(void) { return cpufreq_register_governor(&cpufreq_gov_dbs); } __initcall(cpufreq_gov_dbs_init); void cpufreq_dbs_timer_suspend(void) { int cpu; cpu = smp_processor_id(); if ( per_cpu(cpu_dbs_info,cpu).stoppable ) { stop_timer( &per_cpu(dbs_timer, cpu) ); } } void cpufreq_dbs_timer_resume(void) { int cpu; struct timer* t; s_time_t now; cpu = smp_processor_id(); if ( per_cpu(cpu_dbs_info,cpu).stoppable ) { now = NOW(); t = &per_cpu(dbs_timer, cpu); if (t->expires <= now) { t->function(t->data); } else { set_timer(t, align_timer(now , dbs_tuners_ins.sampling_rate)); } } } xen-4.4.0/xen/drivers/cpufreq/utility.c0000664000175000017500000003320412307313555016231 0ustar smbsmb/* * utility.c - misc functions for cpufreq driver and Px statistic * * Copyright (C) 2001 Russell King * (C) 2002 - 2003 Dominik Brodowski * * Oct 2005 - Ashok Raj * Added handling for CPU hotplug * Feb 2006 - Jacob Shin * Fix handling for CPU hotplug -- affected CPUs * Feb 2008 - Liu Jinsong * 1. Merge cpufreq.c and freq_table.c of linux 2.6.23 * And poring to Xen hypervisor * 2. some Px statistic interface funcdtions * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
* */ #include #include #include #include #include #include #include #include #include #include #include struct cpufreq_driver *cpufreq_driver; struct processor_pminfo *__read_mostly processor_pminfo[NR_CPUS]; DEFINE_PER_CPU_READ_MOSTLY(struct cpufreq_policy *, cpufreq_cpu_policy); DEFINE_PER_CPU(spinlock_t, cpufreq_statistic_lock); /********************************************************************* * Px STATISTIC INFO * *********************************************************************/ void cpufreq_residency_update(unsigned int cpu, uint8_t state) { uint64_t now, total_idle_ns; int64_t delta; struct pm_px *pxpt = per_cpu(cpufreq_statistic_data, cpu); total_idle_ns = get_cpu_idle_time(cpu); now = NOW(); delta = (now - pxpt->prev_state_wall) - (total_idle_ns - pxpt->prev_idle_wall); if ( likely(delta >= 0) ) pxpt->u.pt[state].residency += delta; pxpt->prev_state_wall = now; pxpt->prev_idle_wall = total_idle_ns; } void cpufreq_statistic_update(unsigned int cpu, uint8_t from, uint8_t to) { struct pm_px *pxpt; struct processor_pminfo *pmpt = processor_pminfo[cpu]; spinlock_t *cpufreq_statistic_lock = &per_cpu(cpufreq_statistic_lock, cpu); spin_lock(cpufreq_statistic_lock); pxpt = per_cpu(cpufreq_statistic_data, cpu); if ( !pxpt || !pmpt ) { spin_unlock(cpufreq_statistic_lock); return; } pxpt->u.last = from; pxpt->u.cur = to; pxpt->u.pt[to].count++; cpufreq_residency_update(cpu, from); (*(pxpt->u.trans_pt + from * pmpt->perf.state_count + to))++; spin_unlock(cpufreq_statistic_lock); } int cpufreq_statistic_init(unsigned int cpuid) { uint32_t i, count; struct pm_px *pxpt; const struct processor_pminfo *pmpt = processor_pminfo[cpuid]; spinlock_t *cpufreq_statistic_lock = &per_cpu(cpufreq_statistic_lock, cpuid); spin_lock_init(cpufreq_statistic_lock); if ( !pmpt ) return -EINVAL; spin_lock(cpufreq_statistic_lock); pxpt = per_cpu(cpufreq_statistic_data, cpuid); if ( pxpt ) { spin_unlock(cpufreq_statistic_lock); return 0; } count = pmpt->perf.state_count; pxpt = xzalloc(struct pm_px); if ( !pxpt ) { spin_unlock(cpufreq_statistic_lock); return -ENOMEM; } per_cpu(cpufreq_statistic_data, cpuid) = pxpt; pxpt->u.trans_pt = xzalloc_array(uint64_t, count * count); if (!pxpt->u.trans_pt) { xfree(pxpt); spin_unlock(cpufreq_statistic_lock); return -ENOMEM; } pxpt->u.pt = xzalloc_array(struct pm_px_val, count); if (!pxpt->u.pt) { xfree(pxpt->u.trans_pt); xfree(pxpt); spin_unlock(cpufreq_statistic_lock); return -ENOMEM; } pxpt->u.total = pmpt->perf.state_count; pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.platform_limit; for (i=0; i < pmpt->perf.state_count; i++) pxpt->u.pt[i].freq = pmpt->perf.states[i].core_frequency; pxpt->prev_state_wall = NOW(); pxpt->prev_idle_wall = get_cpu_idle_time(cpuid); spin_unlock(cpufreq_statistic_lock); return 0; } void cpufreq_statistic_exit(unsigned int cpuid) { struct pm_px *pxpt; spinlock_t *cpufreq_statistic_lock = &per_cpu(cpufreq_statistic_lock, cpuid); spin_lock(cpufreq_statistic_lock); pxpt = per_cpu(cpufreq_statistic_data, cpuid); if (!pxpt) { spin_unlock(cpufreq_statistic_lock); return; } xfree(pxpt->u.trans_pt); xfree(pxpt->u.pt); xfree(pxpt); per_cpu(cpufreq_statistic_data, cpuid) = NULL; spin_unlock(cpufreq_statistic_lock); } void cpufreq_statistic_reset(unsigned int cpuid) { uint32_t i, j, count; struct pm_px *pxpt; const struct processor_pminfo *pmpt = processor_pminfo[cpuid]; spinlock_t *cpufreq_statistic_lock = &per_cpu(cpufreq_statistic_lock, cpuid); spin_lock(cpufreq_statistic_lock); pxpt = per_cpu(cpufreq_statistic_data, cpuid); if ( !pmpt || 
!pxpt || !pxpt->u.pt || !pxpt->u.trans_pt ) { spin_unlock(cpufreq_statistic_lock); return; } count = pmpt->perf.state_count; for (i=0; i < count; i++) { pxpt->u.pt[i].residency = 0; pxpt->u.pt[i].count = 0; for (j=0; j < count; j++) *(pxpt->u.trans_pt + i*count + j) = 0; } pxpt->prev_state_wall = NOW(); pxpt->prev_idle_wall = get_cpu_idle_time(cpuid); spin_unlock(cpufreq_statistic_lock); } /********************************************************************* * FREQUENCY TABLE HELPERS * *********************************************************************/ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table) { unsigned int min_freq = ~0; unsigned int max_freq = 0; unsigned int second_max_freq = 0; unsigned int i; for (i=0; (table[i].frequency != CPUFREQ_TABLE_END); i++) { unsigned int freq = table[i].frequency; if (freq == CPUFREQ_ENTRY_INVALID) continue; if (freq < min_freq) min_freq = freq; if (freq > max_freq) max_freq = freq; } for (i=0; (table[i].frequency != CPUFREQ_TABLE_END); i++) { unsigned int freq = table[i].frequency; if (freq == CPUFREQ_ENTRY_INVALID || freq == max_freq) continue; if (freq > second_max_freq) second_max_freq = freq; } if (second_max_freq == 0) second_max_freq = max_freq; if (cpufreq_verbose) printk("max_freq: %u second_max_freq: %u\n", max_freq, second_max_freq); policy->min = policy->cpuinfo.min_freq = min_freq; policy->max = policy->cpuinfo.max_freq = max_freq; policy->cpuinfo.second_max_freq = second_max_freq; if (policy->min == ~0) return -EINVAL; else return 0; } int cpufreq_frequency_table_verify(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table) { unsigned int next_larger = ~0; unsigned int i; unsigned int count = 0; if (!cpu_online(policy->cpu)) return -EINVAL; cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, policy->cpuinfo.max_freq); for (i=0; (table[i].frequency != CPUFREQ_TABLE_END); i++) { unsigned int freq = table[i].frequency; if (freq == CPUFREQ_ENTRY_INVALID) continue; if ((freq >= policy->min) && (freq <= policy->max)) count++; else if ((next_larger > freq) && (freq > policy->max)) next_larger = freq; } if (!count) policy->max = next_larger; cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, policy->cpuinfo.max_freq); return 0; } int cpufreq_frequency_table_target(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int target_freq, unsigned int relation, unsigned int *index) { struct cpufreq_frequency_table optimal = { .index = ~0, .frequency = 0, }; struct cpufreq_frequency_table suboptimal = { .index = ~0, .frequency = 0, }; unsigned int i; switch (relation) { case CPUFREQ_RELATION_H: suboptimal.frequency = ~0; break; case CPUFREQ_RELATION_L: optimal.frequency = ~0; break; } if (!cpu_online(policy->cpu)) return -EINVAL; for (i=0; (table[i].frequency != CPUFREQ_TABLE_END); i++) { unsigned int freq = table[i].frequency; if (freq == CPUFREQ_ENTRY_INVALID) continue; if ((freq < policy->min) || (freq > policy->max)) continue; switch(relation) { case CPUFREQ_RELATION_H: if (freq <= target_freq) { if (freq >= optimal.frequency) { optimal.frequency = freq; optimal.index = i; } } else { if (freq <= suboptimal.frequency) { suboptimal.frequency = freq; suboptimal.index = i; } } break; case CPUFREQ_RELATION_L: if (freq >= target_freq) { if (freq <= optimal.frequency) { optimal.frequency = freq; optimal.index = i; } } else { if (freq >= suboptimal.frequency) { suboptimal.frequency = freq; suboptimal.index = i; } } break; } } if 
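/* optimal.index is still ~0 (hence > i, the number of table entries) when
 * no entry matched on the preferred side of target_freq; in that case fall
 * back to the suboptimal choice. */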
(optimal.index > i) { if (suboptimal.index > i) return -EINVAL; *index = suboptimal.index; } else *index = optimal.index; return 0; } /********************************************************************* * GOVERNORS * *********************************************************************/ int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { int retval = -EINVAL; if (cpu_online(policy->cpu) && cpufreq_driver->target) { unsigned int prev_freq = policy->cur; retval = cpufreq_driver->target(policy, target_freq, relation); if ( retval == 0 ) TRACE_2D(TRC_PM_FREQ_CHANGE, prev_freq/1000, policy->cur/1000); } return retval; } int cpufreq_driver_getavg(unsigned int cpu, unsigned int flag) { struct cpufreq_policy *policy; int freq_avg; if (!cpu_online(cpu) || !(policy = per_cpu(cpufreq_cpu_policy, cpu))) return 0; if (cpufreq_driver->getavg) { freq_avg = cpufreq_driver->getavg(cpu, flag); if (freq_avg > 0) return freq_avg; } return policy->cur; } int cpufreq_update_turbo(int cpuid, int new_state) { struct cpufreq_policy *policy; int curr_state; int ret = 0; if (new_state != CPUFREQ_TURBO_ENABLED && new_state != CPUFREQ_TURBO_DISABLED) return -EINVAL; policy = per_cpu(cpufreq_cpu_policy, cpuid); if (!policy) return -EACCES; if (policy->turbo == CPUFREQ_TURBO_UNSUPPORTED) return -EOPNOTSUPP; curr_state = policy->turbo; if (curr_state == new_state) return 0; policy->turbo = new_state; if (cpufreq_driver->update) { ret = cpufreq_driver->update(cpuid, policy); if (ret) policy->turbo = curr_state; } return ret; } int cpufreq_get_turbo_status(int cpuid) { struct cpufreq_policy *policy; policy = per_cpu(cpufreq_cpu_policy, cpuid); return policy && policy->turbo == CPUFREQ_TURBO_ENABLED; } /********************************************************************* * POLICY * *********************************************************************/ /* * data : current policy. * policy : policy to be set. 
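* Returns 0 on success. The verified limits are copied into data; if the
* governor changes, the old one is stopped and the new one started
* (restarting the old governor if the new one fails to start), and
* CPUFREQ_GOV_LIMITS is applied last.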
*/ int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_policy *policy) { int ret = 0; memcpy(&policy->cpuinfo, &data->cpuinfo, sizeof(struct cpufreq_cpuinfo)); if (policy->min > data->min && policy->min > policy->max) return -EINVAL; /* verify the cpu speed can be set within this limit */ ret = cpufreq_driver->verify(policy); if (ret) return ret; data->min = policy->min; data->max = policy->max; if (policy->governor != data->governor) { /* save old, working values */ struct cpufreq_governor *old_gov = data->governor; /* end old governor */ if (data->governor) __cpufreq_governor(data, CPUFREQ_GOV_STOP); /* start new governor */ data->governor = policy->governor; if (__cpufreq_governor(data, CPUFREQ_GOV_START)) { printk(KERN_WARNING "Fail change to %s governor\n", data->governor->name); /* new governor failed, so re-start old one */ data->governor = old_gov; if (old_gov) { __cpufreq_governor(data, CPUFREQ_GOV_START); printk(KERN_WARNING "Still stay at %s governor\n", data->governor->name); } return -EINVAL; } /* might be a policy change, too, so fall through */ } return __cpufreq_governor(data, CPUFREQ_GOV_LIMITS); } xen-4.4.0/xen/drivers/cpufreq/cpufreq_misc_governors.c0000664000175000017500000001206712307313555021316 0ustar smbsmb/* * xen/drivers/cpufreq/cpufreq_misc_gov.c * * Copyright (C) 2001 Russell King * (C) 2002 - 2004 Dominik Brodowski * * Nov 2008 Liu Jinsong * Porting cpufreq_userspace.c, cpufreq_performance.c, and * cpufreq_powersave.c from Liunx 2.6.23 to Xen hypervisor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * */ #include #include #include #include #include /* * cpufreq userspace governor */ static unsigned int __read_mostly userspace_cmdline_freq; static DEFINE_PER_CPU(unsigned int, cpu_set_freq); static int cpufreq_governor_userspace(struct cpufreq_policy *policy, unsigned int event) { int ret = 0; unsigned int cpu; if (unlikely(!policy) || unlikely(!cpu_online(cpu = policy->cpu))) return -EINVAL; switch (event) { case CPUFREQ_GOV_START: if (!per_cpu(cpu_set_freq, cpu)) per_cpu(cpu_set_freq, cpu) = policy->cur; break; case CPUFREQ_GOV_STOP: per_cpu(cpu_set_freq, cpu) = 0; break; case CPUFREQ_GOV_LIMITS: if (policy->max < per_cpu(cpu_set_freq, cpu)) ret = __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); else if (policy->min > per_cpu(cpu_set_freq, cpu)) ret = __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); else ret = __cpufreq_driver_target(policy, per_cpu(cpu_set_freq, cpu), CPUFREQ_RELATION_L); break; default: ret = -EINVAL; break; } return ret; } int write_userspace_scaling_setspeed(unsigned int cpu, unsigned int freq) { struct cpufreq_policy *policy; if (!cpu_online(cpu) || !(policy = per_cpu(cpufreq_cpu_policy, cpu))) return -EINVAL; per_cpu(cpu_set_freq, cpu) = freq; if (freq < policy->min) freq = policy->min; if (freq > policy->max) freq = policy->max; return __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); } static bool_t __init cpufreq_userspace_handle_option(const char *name, const char *val) { if (!strcmp(name, "speed") && val) { userspace_cmdline_freq = simple_strtoul(val, NULL, 0); return 1; } return 0; } static int cpufreq_userspace_cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch (action) { case CPU_UP_PREPARE: per_cpu(cpu_set_freq, cpu) = userspace_cmdline_freq; break; } return 
NOTIFY_DONE; } static struct notifier_block cpufreq_userspace_cpu_nfb = { .notifier_call = cpufreq_userspace_cpu_callback }; struct cpufreq_governor cpufreq_gov_userspace = { .name = "userspace", .governor = cpufreq_governor_userspace, .handle_option = cpufreq_userspace_handle_option }; static int __init cpufreq_gov_userspace_init(void) { unsigned int cpu; for_each_online_cpu(cpu) per_cpu(cpu_set_freq, cpu) = userspace_cmdline_freq; register_cpu_notifier(&cpufreq_userspace_cpu_nfb); return cpufreq_register_governor(&cpufreq_gov_userspace); } __initcall(cpufreq_gov_userspace_init); /* * cpufreq performance governor */ static int cpufreq_governor_performance(struct cpufreq_policy *policy, unsigned int event) { int ret = 0; if (!policy) return -EINVAL; switch (event) { case CPUFREQ_GOV_START: case CPUFREQ_GOV_STOP: break; case CPUFREQ_GOV_LIMITS: ret = __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); break; default: ret = -EINVAL; break; } return ret; } struct cpufreq_governor cpufreq_gov_performance = { .name = "performance", .governor = cpufreq_governor_performance, }; static int __init cpufreq_gov_performance_init(void) { return cpufreq_register_governor(&cpufreq_gov_performance); } __initcall(cpufreq_gov_performance_init); /* * cpufreq powersave governor */ static int cpufreq_governor_powersave(struct cpufreq_policy *policy, unsigned int event) { int ret = 0; if (!policy) return -EINVAL; switch (event) { case CPUFREQ_GOV_START: case CPUFREQ_GOV_STOP: break; case CPUFREQ_GOV_LIMITS: ret = __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); break; default: ret = -EINVAL; break; } return ret; } struct cpufreq_governor cpufreq_gov_powersave = { .name = "powersave", .governor = cpufreq_governor_powersave, }; static int __init cpufreq_gov_powersave_init(void) { return cpufreq_register_governor(&cpufreq_gov_powersave); } __initcall(cpufreq_gov_powersave_init); xen-4.4.0/xen/Rules.mk0000664000175000017500000001332312307313555012662 0ustar smbsmb # # If you change any of these configuration options then you must # 'make clean' before rebuilding. # verbose ?= n perfc ?= n perfc_arrays ?= n lock_profile ?= n crash_debug ?= n frame_pointer ?= n lto ?= n include $(XEN_ROOT)/Config.mk # Hardcoded configuration implications and dependencies. # Do this is a neater way if it becomes unwieldy. ifeq ($(debug),y) verbose := y frame_pointer := y else CFLAGS += -DNDEBUG endif ifeq ($(perfc_arrays),y) perfc := y endif # Set ARCH/SUBARCH appropriately. override TARGET_SUBARCH := $(XEN_TARGET_ARCH) override TARGET_ARCH := $(shell echo $(XEN_TARGET_ARCH) | \ sed -e 's/x86.*/x86/' -e s'/arm\(32\|64\)/arm/g') TARGET := $(BASEDIR)/xen include $(BASEDIR)/arch/$(TARGET_ARCH)/Rules.mk # Note that link order matters! 
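# (presumably because initcall/setup ordering follows the link order of
# these built_in.o objects)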
ALL_OBJS-y += $(BASEDIR)/common/built_in.o ALL_OBJS-y += $(BASEDIR)/drivers/built_in.o ALL_OBJS-y += $(BASEDIR)/xsm/built_in.o ALL_OBJS-y += $(BASEDIR)/arch/$(TARGET_ARCH)/built_in.o ALL_OBJS-$(x86) += $(BASEDIR)/crypto/built_in.o CFLAGS += -fno-builtin -fno-common CFLAGS += -Werror -Wredundant-decls -Wno-pointer-arith CFLAGS += -pipe -g -D__XEN__ -include $(BASEDIR)/include/xen/config.h CFLAGS += -nostdinc CFLAGS-$(XSM_ENABLE) += -DXSM_ENABLE CFLAGS-$(FLASK_ENABLE) += -DFLASK_ENABLE -DXSM_MAGIC=0xf97cff8c CFLAGS-$(FLASK_ENABLE) += -DFLASK_DEVELOP -DFLASK_BOOTPARAM -DFLASK_AVC_STATS CFLAGS-$(verbose) += -DVERBOSE CFLAGS-$(crash_debug) += -DCRASH_DEBUG CFLAGS-$(perfc) += -DPERF_COUNTERS CFLAGS-$(perfc_arrays) += -DPERF_ARRAYS CFLAGS-$(lock_profile) += -DLOCK_PROFILE CFLAGS-$(HAS_ACPI) += -DHAS_ACPI CFLAGS-$(HAS_GDBSX) += -DHAS_GDBSX CFLAGS-$(HAS_PASSTHROUGH) += -DHAS_PASSTHROUGH CFLAGS-$(HAS_DEVICE_TREE) += -DHAS_DEVICE_TREE CFLAGS-$(HAS_PCI) += -DHAS_PCI CFLAGS-$(HAS_IOPORTS) += -DHAS_IOPORTS CFLAGS-$(frame_pointer) += -fno-omit-frame-pointer -DCONFIG_FRAME_POINTER ifneq ($(max_phys_cpus),) CFLAGS-y += -DMAX_PHYS_CPUS=$(max_phys_cpus) endif ifneq ($(max_phys_irqs),) CFLAGS-y += -DMAX_PHYS_IRQS=$(max_phys_irqs) endif AFLAGS-y += -D__ASSEMBLY__ -include $(BASEDIR)/include/xen/config.h # Clang's built-in assembler can't handle .code16/.code32/.code64 yet AFLAGS-$(clang) += -no-integrated-as ALL_OBJS := $(ALL_OBJS-y) # Get gcc to generate the dependencies for us. CFLAGS-y += -MMD -MF .$(@F).d DEPS = .*.d CFLAGS += $(CFLAGS-y) # Most CFLAGS are safe for assembly files: # -std=gnu{89,99} gets confused by #-prefixed end-of-line comments # -flto makes no sense and annoys clang AFLAGS += $(AFLAGS-y) $(filter-out -std=gnu%,$(filter-out -flto,$(CFLAGS))) # LDFLAGS are only passed directly to $(LD) LDFLAGS += $(LDFLAGS_DIRECT) LDFLAGS += $(LDFLAGS-y) include Makefile # Ensure each subdirectory has exactly one trailing slash. subdir-n := $(patsubst %,%/,$(patsubst %/,%,$(subdir-n) $(subdir-))) subdir-y := $(patsubst %,%/,$(patsubst %/,%,$(subdir-y))) # Add explicitly declared subdirectories to the object lists. obj-y += $(patsubst %/,%/built_in.o,$(subdir-y)) # Add implicitly declared subdirectories (in the object lists) to the # subdirectory list, and rewrite the object-list entry. subdir-y += $(filter %/,$(obj-y)) obj-y := $(patsubst %/,%/built-in.o,$(obj-y)) subdir-all := $(subdir-y) $(subdir-n) $(filter %.init.o,$(obj-y) $(obj-bin-y) $(extra-y)): CFLAGS += -DINIT_SECTIONS_ONLY $(obj-$(coverage)): CFLAGS += -fprofile-arcs -ftest-coverage -DTEST_COVERAGE ifeq ($(lto),y) # Would like to handle all object files as bitcode, but objects made from # pure asm are in a different format and have to be collected separately. # Mirror the directory tree, collecting them as built_in_bin.o. # If there are no binary objects in a given directory, make a dummy .o obj-bin-y += $(patsubst %/built_in.o,%/built_in_bin.o,$(filter %/built_in.o,$(obj-y))) else # For a non-LTO build, bundle obj-bin targets in with the normal objs. obj-y += $(obj-bin-y) obj-bin-y := endif # Always build obj-bin files as binary even if they come from C source. 
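# (Editorial illustration, not part of the upstream Rules.mk: a per-directory
# Makefile opts an object into this binary-only path simply by listing it as
#     obj-bin-y += head.o
# where head.o stands for a hypothetical object that must remain plain ELF
# rather than LTO bitcode; as the logic above shows, with lto=n such objects
# are simply folded back into obj-y.)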
$(obj-bin-y): CFLAGS := $(filter-out -flto,$(CFLAGS)) built_in.o: $(obj-y) ifeq ($(obj-y),) $(CC) $(CFLAGS) -c -x c /dev/null -o $@ else ifeq ($(lto),y) $(LD_LTO) -r -o $@ $^ else $(LD) $(LDFLAGS) -r -o $@ $^ endif endif built_in_bin.o: $(obj-bin-y) ifeq ($(obj-bin-y),) $(CC) $(AFLAGS) -c -x assembler /dev/null -o $@ else $(LD) $(LDFLAGS) -r -o $@ $^ endif # Force execution of pattern rules (for which PHONY cannot be directly used). .PHONY: FORCE FORCE: %/built_in.o: FORCE $(MAKE) -f $(BASEDIR)/Rules.mk -C $* built_in.o %/built_in_bin.o: FORCE $(MAKE) -f $(BASEDIR)/Rules.mk -C $* built_in_bin.o .PHONY: clean clean:: $(addprefix _clean_, $(subdir-all)) rm -f *.o *~ core $(DEPS) _clean_%/: FORCE $(MAKE) -f $(BASEDIR)/Rules.mk -C $* clean %.o: %.c Makefile $(CC) $(CFLAGS) -c $< -o $@ %.o: %.S Makefile $(CC) $(AFLAGS) -c $< -o $@ SPECIAL_DATA_SECTIONS := rodata $(foreach n,1 2 4 8,rodata.str1.$(n)) \ $(foreach r,rel rel.ro,data.$(r) data.$(r).local) $(filter %.init.o,$(obj-y) $(obj-bin-y) $(extra-y)): %.init.o: %.o Makefile $(OBJDUMP) -h $< | sed -n '/[0-9]/{s,00*,0,g;p}' | while read idx name sz rest; do \ case "$$name" in \ .text|.text.*|.data|.data.*|.bss) \ test $$sz != 0 || continue; \ echo "Error: size of $<:$$name is 0x$$sz" >&2; \ exit $$(expr $$idx + 1);; \ esac; \ done $(OBJCOPY) $(foreach s,$(SPECIAL_DATA_SECTIONS),--rename-section .$(s)=.init.$(s)) $< $@ %.i: %.c Makefile $(CPP) $(CFLAGS) $< -o $@ %.s: %.c Makefile $(CC) $(CFLAGS) -S $< -o $@ # -std=gnu{89,99} gets confused by # as an end-of-line comment marker %.s: %.S Makefile $(CPP) $(AFLAGS) $< -o $@ -include $(DEPS) xen-4.4.0/Makefile0000664000175000017500000002071412307313555012107 0ustar smbsmb# # Grand Unified Makefile for Xen. # # Default target must appear before any include lines .PHONY: all all: dist -include config/Toplevel.mk SUBSYSTEMS?=xen kernels tools stubdom docs TARGS_DIST=$(patsubst %, dist-%, $(SUBSYSTEMS)) TARGS_INSTALL=$(patsubst %, install-%, $(SUBSYSTEMS)) export XEN_ROOT=$(CURDIR) include Config.mk SUBARCH := $(subst x86_32,i386,$(XEN_TARGET_ARCH)) export XEN_TARGET_ARCH SUBARCH XEN_SYSTYPE include buildconfigs/Rules.mk # build and install everything into the standard system directories .PHONY: install install: $(TARGS_INSTALL) .PHONY: build build: kernels $(MAKE) -C xen build $(MAKE) -C tools build $(MAKE) -C stubdom build ifeq (x86_64,$(XEN_TARGET_ARCH)) XEN_TARGET_ARCH=x86_32 $(MAKE) -C stubdom pv-grub endif $(MAKE) -C docs build # The test target is for unit tests that can run without an installation. Of # course, many tests require a machine running Xen itself, and these are # handled elsewhere. 
.PHONY: test test: $(MAKE) -C tools/python test # build and install everything into local dist directory .PHONY: dist dist: DESTDIR=$(DISTDIR)/install dist: $(TARGS_DIST) dist-misc dist-misc: $(INSTALL_DIR) $(DISTDIR)/ $(INSTALL_DATA) ./COPYING $(DISTDIR) $(INSTALL_DATA) ./README $(DISTDIR) $(INSTALL_PROG) ./install.sh $(DISTDIR) dist-%: DESTDIR=$(DISTDIR)/install dist-%: install-% @: # do nothing # Legacy dist targets .PHONY: xen tools stubdom kernels docs xen: dist-xen tools: dist-tools kernels: dist-kernels stubdom: dist-stubdom docs: dist-docs .PHONY: prep-kernels prep-kernels: for i in $(XKERNELS) ; do $(MAKE) $$i-prep || exit 1; done .PHONY: install-xen install-xen: $(MAKE) -C xen install ifeq ($(CONFIG_QEMU_TRAD),y) QEMU_TRAD_DIR_TGT := tools/qemu-xen-traditional-dir tools/qemu-xen-traditional-dir: $(MAKE) -C tools qemu-xen-traditional-dir-find .PHONY: tools/qemu-xen-traditional-dir-force-update tools/qemu-xen-traditional-dir-force-update: $(MAKE) -C tools qemu-xen-traditional-dir-force-update endif ifeq ($(CONFIG_QEMU_XEN),y) QEMU_XEN_DIR_TGT := tools/qemu-xen-dir tools/qemu-xen-dir: $(MAKE) -C tools qemu-xen-dir-find .PHONY: tools/qemu-xen-dir-force-update tools/qemu-xen-dir-force-update: $(MAKE) -C tools qemu-xen-dir-force-update endif .PHONY: install-tools install-tools: $(QEMU_TRAD_DIR_TARGET) $(QEMU_XEN_DIR_TARGET) $(MAKE) -C tools install .PHONY: install-kernels install-kernels: for i in $(XKERNELS) ; do $(MAKE) $$i-install || exit 1; done .PHONY: install-stubdom install-stubdom: $(QEMU_TRAD_DIR_TARGET) install-tools $(MAKE) -C stubdom install ifeq (x86_64,$(XEN_TARGET_ARCH)) XEN_TARGET_ARCH=x86_32 $(MAKE) -C stubdom install-grub endif .PHONY: tools/firmware/seabios-dir-force-update tools/firmware/seabios-dir-force-update: $(MAKE) -C tools/firmware seabios-dir-force-update .PHONY: tools/firmware/ovmf-dir-force-update tools/firmware/ovmf-dir-force-update: $(MAKE) -C tools/firmware ovmf-dir-force-update .PHONY: install-docs install-docs: $(MAKE) -C docs install .PHONY: dev-docs dev-docs: $(MAKE) -C docs dev-docs # Build all the various kernels and modules .PHONY: kbuild kbuild: kernels # Delete the kernel build trees entirely .PHONY: kdelete kdelete: for i in $(XKERNELS) ; do $(MAKE) $$i-delete ; done # Clean the kernel build trees .PHONY: kclean kclean: for i in $(XKERNELS) ; do $(MAKE) $$i-clean ; done # build xen, the tools, and a domain 0 plus unprivileged linux-xen images, # and place them in the install directory. 'make install' should then # copy them to the normal system directories .PHONY: world world: $(MAKE) clean $(MAKE) kdelete $(MAKE) dist # Package a build in a debball file, that is inside a .deb format # container to allow for easy and clean removal. This is not intended # to be a full featured policy compliant .deb package. 
.PHONY: debball debball: dist fakeroot sh ./tools/misc/mkdeb $(XEN_ROOT) $$($(MAKE) -C xen xenversion | grep -v :) # clean doesn't do a kclean .PHONY: clean clean:: $(MAKE) -C xen clean $(MAKE) -C tools clean $(MAKE) -C stubdom crossclean ifeq (x86_64,$(XEN_TARGET_ARCH)) XEN_TARGET_ARCH=x86_32 $(MAKE) -C stubdom crossclean endif $(MAKE) -C docs clean # clean, but blow away kernel build tree plus tarballs .PHONY: distclean distclean: rm -f config/Toplevel.mk $(MAKE) -C xen distclean $(MAKE) -C tools distclean $(MAKE) -C stubdom distclean ifeq (x86_64,$(XEN_TARGET_ARCH)) XEN_TARGET_ARCH=x86_32 $(MAKE) -C stubdom distclean endif $(MAKE) -C docs distclean rm -rf dist patches/tmp for i in $(ALLKERNELS) ; do $(MAKE) $$i-delete ; done rm -rf patches/*/.makedep rm -rf config.log config.status config.cache autom4te.cache # Linux name for GNU distclean .PHONY: mrproper mrproper: distclean # Prepare for source tarball .PHONY: src-tarball src-tarball: distclean $(MAKE) -C xen .banner rm -rf xen/tools/figlet .[a-z]* $(MAKE) -C xen distclean .PHONY: help help: @echo 'Installation targets:' @echo ' install - build and install everything' @echo ' install-xen - build and install the Xen hypervisor' @echo ' install-tools - build and install the control tools' @echo ' install-kernels - build and install guest kernels' @echo ' install-stubdom - build and install the stubdomain images' @echo ' install-docs - build and install user documentation' @echo '' @echo 'Building targets:' @echo ' dist - build and install everything into local dist directory' @echo ' world - clean everything, delete guest kernel build' @echo ' trees then make dist' @echo ' xen - build and install Xen hypervisor' @echo ' tools - build and install tools' @echo ' stubdom - build and install the stubdomain images' @echo ' kernels - build and install guest kernels' @echo ' kbuild - synonym for make kernels' @echo ' docs - build and install user documentation' @echo ' dev-docs - build developer-only documentation' @echo '' @echo 'Cleaning targets:' @echo ' clean - clean the Xen, tools and docs (but not guest kernel trees)' @echo ' distclean - clean plus delete kernel build trees and' @echo ' local downloaded files' @echo ' kdelete - delete guest kernel build trees' @echo ' kclean - clean guest kernel build trees' @echo '' @echo 'Miscellaneous targets:' @echo ' prep-kernels - prepares kernel directories, does not build' @echo ' uninstall - attempt to remove installed Xen tools' @echo ' (use with extreme care!)' @echo @echo 'Trusted Boot (tboot) targets:' @echo ' build-tboot - download and build the tboot module' @echo ' install-tboot - download, build, and install the tboot module' @echo ' clean-tboot - clean the tboot module if it exists' @echo @echo 'Environment:' @echo ' [ this documentation is sadly not complete ]' # Use this target with extreme care! 
.PHONY: uninstall uninstall: D=$(DESTDIR) uninstall: [ -d $(D)$(XEN_CONFIG_DIR) ] && mv -f $(D)$(XEN_CONFIG_DIR) $(D)$(XEN_CONFIG_DIR).old-`date +%s` || true $(MAKE) -C xen uninstall rm -rf $(D)$(CONFIG_DIR)/init.d/xendomains $(D)$(CONFIG_DIR)/init.d/xend rm -rf $(D)$(CONFIG_DIR)/init.d/xencommons $(D)$(CONFIG_DIR)/init.d/xen-watchdog rm -f $(D)$(CONFIG_DIR)/udev/rules.d/xen-backend.rules rm -f $(D)$(CONFIG_DIR)/udev/rules.d/xend.rules rm -f $(D)$(SYSCONFIG_DIR)/xendomains rm -f $(D)$(SYSCONFIG_DIR)/xencommons rm -rf $(D)/var/run/xen* $(D)/var/lib/xen* make -C tools uninstall rm -rf $(D)/boot/tboot* # Legacy targets for compatibility .PHONY: linux26 linux26: $(MAKE) 'KERNELS=linux-2.6*' kernels .PHONY: xenversion xenversion: @$(MAKE) --no-print-directory -C xen xenversion # # tboot targets # TBOOT_TARFILE = tboot-20090330.tar.gz #TBOOT_BASE_URL = http://downloads.sourceforge.net/tboot TBOOT_BASE_URL = $(XEN_EXTFILES_URL) .PHONY: build-tboot build-tboot: download_tboot $(MAKE) -C tboot build .PHONY: install-tboot install-tboot: download_tboot $(MAKE) -C tboot install .PHONY: dist-tboot dist-tboot: download_tboot $(MAKE) DESTDIR=$(DISTDIR)/install -C tboot dist .PHONY: clean-tboot clean-tboot: [ ! -d tboot ] || $(MAKE) -C tboot clean .PHONY: distclean-tboot distclean-tboot: [ ! -d tboot ] || $(MAKE) -C tboot distclean .PHONY: download_tboot download_tboot: tboot/Makefile tboot/Makefile: tboot/$(TBOOT_TARFILE) [ -e tboot/Makefile ] || tar -xzf tboot/$(TBOOT_TARFILE) -C tboot/ --strip-components 1 tboot/$(TBOOT_TARFILE): mkdir -p tboot wget -O tboot/$(TBOOT_TARFILE) $(TBOOT_BASE_URL)/$(TBOOT_TARFILE) xen-4.4.0/m4/0000775000175000017500000000000012307313555010763 5ustar smbsmbxen-4.4.0/m4/ptyfuncs.m40000664000175000017500000000216512307313555013104 0ustar smbsmbAC_DEFUN([AX_CHECK_PTYFUNCS], [ dnl This is a workaround for a bug in Debian package dnl libbsd-dev-0.3.0-1. Once we no longer support that dnl package we can remove the addition of -Werror to dnl CPPFLAGS. AX_SAVEVAR_SAVE(CPPFLAGS) CPPFLAGS="$CPPFLAGS -Werror" AC_CHECK_HEADER([libutil.h],[ AC_DEFINE([INCLUDE_LIBUTIL_H],[],[libutil header file name]) ]) AX_SAVEVAR_RESTORE(CPPFLAGS) AC_CACHE_CHECK([for openpty et al], [ax_cv_ptyfuncs_libs], [ for ax_cv_ptyfuncs_libs in -lutil "" NOT_FOUND; do if test "x$ax_cv_ptyfuncs_libs" = "xNOT_FOUND"; then AC_MSG_FAILURE([Unable to find library for openpty and login_tty]) fi AX_SAVEVAR_SAVE(LIBS) LIBS="$LIBS $ax_cv_ptyfuncs_libs" AC_LINK_IFELSE([AC_LANG_SOURCE([ #ifdef INCLUDE_LIBUTIL_H #include INCLUDE_LIBUTIL_H #endif int main(void) { openpty(0,0,0,0,0); login_tty(0); } ])],[ break ],[]) AX_SAVEVAR_RESTORE(LIBS) done ]) PTYFUNCS_LIBS="$ax_cv_ptyfuncs_libs" AC_SUBST(PTYFUNCS_LIBS) ]) xen-4.4.0/m4/python_version.m40000664000175000017500000000056212307313555014316 0ustar smbsmbAC_DEFUN([AX_CHECK_PYTHON_VERSION], [AC_MSG_CHECKING([for python version >= $1.$2 ]) `$PYTHON -c 'import sys; sys.exit(eval("sys.version_info < ($1, $2)"))'` if test "$?" 
!= "0" then python_version=`$PYTHON -V 2>&1` AC_MSG_RESULT([no]) AC_MSG_ERROR( [$python_version is too old, minimum required version is $1.$2]) else AC_MSG_RESULT([yes]) fi]) xen-4.4.0/m4/set_cflags_ldflags.m40000664000175000017500000000074512307313555015041 0ustar smbsmbAC_DEFUN([AX_SET_FLAGS], [for cppflag in $PREPEND_INCLUDES do PREPEND_CPPFLAGS="$PREPEND_CPPFLAGS -I$cppflag" done for ldflag in $PREPEND_LIB do PREPEND_LDFLAGS="$PREPEND_LDFLAGS -L$ldflag" done for cppflag in $APPEND_INCLUDES do APPEND_CPPFLAGS="$APPEND_CPPFLAGS -I$cppflag" done for ldflag in $APPEND_LIB do APPEND_LDFLAGS="$APPEND_LDFLAGS -L$ldflag" done CPPFLAGS="$PREPEND_CPPFLAGS $CPPFLAGS $APPEND_CPPFLAGS" LDFLAGS="$PREPEND_LDFLAGS $LDFLAGS $APPEND_LDFLAGS"]) xen-4.4.0/m4/fetcher.m40000664000175000017500000000047512307313555012653 0ustar smbsmbAC_DEFUN([AX_CHECK_FETCHER], [ AC_PATH_PROG([WGET],[wget], [no]) AS_IF([test x"$WGET" != x"no"], [ FETCHER="$WGET -c -O" ], [ AC_PATH_PROG([FTP],[ftp], [no]) AS_IF([test x"$FTP" != x"no"], [ FETCHER="$FTP -o" ], [ AC_MSG_ERROR([cannot find wget or ftp]) ]) ]) AC_SUBST(FETCHER) ]) xen-4.4.0/m4/extfs.m40000664000175000017500000000120412307313555012353 0ustar smbsmbAC_DEFUN([AX_CHECK_EXTFS], [ AC_CHECK_HEADER([ext2fs/ext2fs.h], [ AC_CHECK_LIB([ext2fs], [ext2fs_open2], [ AC_DEFINE([INCLUDE_EXTFS_H], [], [Define extfs header to use]) EXTFS_LIBS="-lext2fs" ]) ]) dnl This is a temporary hack for CentOS 5.x, which split the ext4 support dnl of ext2fs in a different package. Once CentOS 5.x is no longer supported dnl we can remove this. AC_CHECK_HEADER([ext4fs/ext2fs.h], [ AC_CHECK_LIB([ext4fs], [ext2fs_open2], [ AC_DEFINE([INCLUDE_EXTFS_H], [], [Define extfs header to use]) EXTFS_LIBS="-lext4fs" ]) ]) AC_SUBST(EXTFS_LIBS) ]) xen-4.4.0/m4/path_or_fail.m40000664000175000017500000000023512307313555013654 0ustar smbsmbAC_DEFUN([AX_PATH_PROG_OR_FAIL], [AC_PATH_PROG([$1], [$2], [no]) if test x"${$1}" = x"no" then AC_MSG_ERROR([Unable to find $2, please install $2]) fi]) xen-4.4.0/m4/pthread.m40000664000175000017500000000243712307313555012662 0ustar smbsmb# We define, separately, PTHREAD_CFLAGS, _LDFLAGS and _LIBS # even though currently we don't set them very separately. # This means that the makefiles will not need to change in # the future if we make the test more sophisticated. AC_DEFUN([AX_PTHREAD_CV2VARS],[ PTHREAD_CFLAGS="$ax_cv_pthread_flags" PTHREAD_LDFLAGS="$ax_cv_pthread_flags" PTHREAD_LIBS="" ]) # We invoke AX_PTHREAD_VARS with the name of another macro # which is then expanded once for each variable. AC_DEFUN([AX_PTHREAD_VARS],[$1(CFLAGS) $1(LDFLAGS) $1(LIBS)]) AC_DEFUN([AX_PTHREAD_VAR_APPLY],[ $1="$$1 $PTHREAD_$1" ]) AC_DEFUN([AX_PTHREAD_VAR_SUBST],[AC_SUBST(PTHREAD_$1)]) AC_DEFUN([AX_CHECK_PTHREAD],[ AC_CACHE_CHECK([for pthread flag], [ax_cv_pthread_flags], [ ax_cv_pthread_flags=-pthread AX_PTHREAD_CV2VARS AX_PTHREAD_VARS([AX_SAVEVAR_SAVE]) AX_PTHREAD_VARS([AX_PTHREAD_VAR_APPLY]) AC_LINK_IFELSE([AC_LANG_SOURCE([ #include int main(void) { pthread_atfork(0,0,0); pthread_create(0,0,0,0); } ])],[],[ax_cv_pthread_flags=failed]) AX_PTHREAD_VARS([AX_SAVEVAR_RESTORE]) ]) if test "x$ax_cv_pthread_flags" = xfailed; then AC_MSG_ERROR([-pthread does not work]) fi AX_PTHREAD_CV2VARS AX_PTHREAD_VARS([AX_PTHREAD_VAR_SUBST]) ]) xen-4.4.0/m4/pkg.m40000664000175000017500000001250112307313555012005 0ustar smbsmb# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- # serial 1 (pkg-config-0.24) # # Copyright © 2004 Scott James Remnant . 
# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # PKG_PROG_PKG_CONFIG([MIN-VERSION]) # ---------------------------------- AC_DEFUN([PKG_PROG_PKG_CONFIG], [m4_pattern_forbid([^_?PKG_[A-Z_]+$]) m4_pattern_allow([^PKG_CONFIG(_PATH)?$]) AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility]) AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path]) AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path]) if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) fi if test -n "$PKG_CONFIG"; then _pkg_min_version=m4_default([$1], [0.9.0]) AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then AC_MSG_RESULT([yes]) else AC_MSG_RESULT([no]) PKG_CONFIG="" fi fi[]dnl ])# PKG_PROG_PKG_CONFIG # PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) # # Check to see whether a particular set of modules exists. Similar # to PKG_CHECK_MODULES(), but does not set variables or print errors. 
# # Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) # only at the first occurence in configure.ac, so if the first place # it's called might be skipped (such as if it is within an "if", you # have to call PKG_CHECK_EXISTS manually # -------------------------------------------------------------- AC_DEFUN([PKG_CHECK_EXISTS], [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl if test -n "$PKG_CONFIG" && \ AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then m4_default([$2], [:]) m4_ifvaln([$3], [else $3])dnl fi]) # _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) # --------------------------------------------- m4_define([_PKG_CONFIG], [if test -n "$$1"; then pkg_cv_[]$1="$$1" elif test -n "$PKG_CONFIG"; then PKG_CHECK_EXISTS([$3], [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null`], [pkg_failed=yes]) else pkg_failed=untried fi[]dnl ])# _PKG_CONFIG # _PKG_SHORT_ERRORS_SUPPORTED # ----------------------------- AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], [AC_REQUIRE([PKG_PROG_PKG_CONFIG]) if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then _pkg_short_errors_supported=yes else _pkg_short_errors_supported=no fi[]dnl ])# _PKG_SHORT_ERRORS_SUPPORTED # PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], # [ACTION-IF-NOT-FOUND]) # # # Note that if there is a possibility the first call to # PKG_CHECK_MODULES might not happen, you should be sure to include an # explicit call to PKG_PROG_PKG_CONFIG in your configure.ac # # # -------------------------------------------------------------- AC_DEFUN([PKG_CHECK_MODULES], [AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl pkg_failed=no AC_MSG_CHECKING([for $1]) _PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) _PKG_CONFIG([$1][_LIBS], [libs], [$2]) m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS and $1[]_LIBS to avoid the need to call pkg-config. See the pkg-config man page for more details.]) if test $pkg_failed = yes; then AC_MSG_RESULT([no]) _PKG_SHORT_ERRORS_SUPPORTED if test $_pkg_short_errors_supported = yes; then $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors "$2" 2>&1` else $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors "$2" 2>&1` fi # Put the nasty error message in config.log where it belongs echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD m4_default([$4], [AC_MSG_ERROR( [Package requirements ($2) were not met: $$1_PKG_ERRORS Consider adjusting the PKG_CONFIG_PATH environment variable if you installed software in a non-standard prefix. _PKG_TEXT])dnl ]) elif test $pkg_failed = untried; then AC_MSG_RESULT([no]) m4_default([$4], [AC_MSG_FAILURE( [The pkg-config script could not be found or is too old. Make sure it is in your PATH or set the PKG_CONFIG environment variable to the full path to pkg-config. 
_PKG_TEXT To get pkg-config, see .])dnl ]) else $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS $1[]_LIBS=$pkg_cv_[]$1[]_LIBS AC_MSG_RESULT([yes]) $3 fi[]dnl ])# PKG_CHECK_MODULES xen-4.4.0/m4/savevar.m40000664000175000017500000000015112307313555012671 0ustar smbsmbAC_DEFUN([AX_SAVEVAR_SAVE],[ saved_$1="$$1" ]) AC_DEFUN([AX_SAVEVAR_RESTORE],[ $1="$saved_$1" ]) xen-4.4.0/m4/subsystem.m40000664000175000017500000000253212307313555013265 0ustar smbsmbAC_DEFUN([AX_ENABLE_SUBSYSTEM], [ $1=y SUBSYSTEMS="$SUBSYSTEMS $1" ]) AC_DEFUN([AX_DISABLE_SUBSYSTEM], [ $1=n ]) AC_DEFUN([AX_SUBSYSTEM_DEFAULT_ENABLE], [ AC_ARG_ENABLE([$1], AS_HELP_STRING([--disable-$1], [Disable build and install of $1]),[ AX_SUBSYSTEM_INTERNAL([$1]) ],[ AX_ENABLE_SUBSYSTEM([$1]) ]) AX_SUBSYSTEM_CONFIGURE([$1]) AC_SUBST([$1]) ]) AC_DEFUN([AX_SUBSYSTEM_DEFAULT_DISABLE], [ AC_ARG_ENABLE([$1], AS_HELP_STRING([--enable-$1], [Enable build and install of $1]),[ AX_SUBSYSTEM_INTERNAL([$1]) ],[ AX_DISABLE_SUBSYSTEM([$1]) ]) AX_SUBSYSTEM_CONFIGURE([$1]) AC_SUBST([$1]) ]) AC_DEFUN([AX_SUBSYSTEM_CONDITIONAL], [ AC_ARG_ENABLE([$1], AS_HELP_STRING([--enable-$1], [Enable build and install of $1]),[ AX_SUBSYSTEM_INTERNAL([$1]) ],[ AS_IF([test "x$2" = "xy"],[ AX_ENABLE_SUBSYSTEM([$1]) ],[ AX_DISABLE_SUBSYSTEM([$1]) ]) ]) AX_SUBSYSTEM_CONFIGURE([$1]) AC_SUBST($1) ]) AC_DEFUN([AX_SUBSYSTEM_FINISH], [ AC_SUBST(SUBSYSTEMS) echo "Will build the following subsystems:" for x in $SUBSYSTEMS; do echo " $x" done ]) AC_DEFUN([AX_SUBSYSTEM_INTERNAL], [ AS_IF([test "x$enableval" = "xyes"], [ AX_ENABLE_SUBSYSTEM([$1]) ],[ AS_IF([test "x$enableval" = "xno"],[ AX_DISABLE_SUBSYSTEM([$1]) ]) ]) ]) AC_DEFUN([AX_SUBSYSTEM_CONFIGURE], [ AS_IF([test -e "$1/configure"], [ if test "x$$1" = "xy" || test "x$$1" = "x" ; then AC_CONFIG_SUBDIRS([$1]) fi ]) ]) xen-4.4.0/m4/depends.m40000664000175000017500000000046312307313555012652 0ustar smbsmb AC_DEFUN([AX_DEPENDS_PATH_PROG], [ AS_IF([test "x$$1" = "xy"], [AX_PATH_PROG_OR_FAIL([$2], [$3])], [ AS_IF([test "x$$1" = "xn"], [ $2="/$3-disabled-in-configure-script" ], [ AC_PATH_PROG([$2], [$3], [no]) AS_IF([test x"${$2}" = "xno"], [ $1=n $2="/$3-disabled-in-configure-script" ]) ]) ]) AC_SUBST($2) ]) xen-4.4.0/m4/features.m40000664000175000017500000000106312307313555013043 0ustar smbsmbAC_DEFUN([AX_ARG_DEFAULT_ENABLE], [ AC_ARG_ENABLE([$1], AS_HELP_STRING([--disable-$1], [$2 (default is ENABLED)])) AX_PARSE_VALUE([$1], [y]) ]) AC_DEFUN([AX_ARG_DEFAULT_DISABLE], [ AC_ARG_ENABLE([$1], AS_HELP_STRING([--enable-$1], [$2 (default is DISABLED)])) AX_PARSE_VALUE([$1], [n]) ]) dnl This function should not be called outside of this file AC_DEFUN([AX_PARSE_VALUE], [ AS_IF([test "x$enable_$1" = "xno"], [ ax_cv_$1="n" ], [test "x$enable_$1" = "xyes"], [ ax_cv_$1="y" ], [test -z $ax_cv_$1], [ ax_cv_$1="$2" ]) $1=$ax_cv_$1 AC_SUBST($1)]) xen-4.4.0/m4/ocaml.m40000664000175000017500000001351512307313555012325 0ustar smbsmbdnl autoconf macros for OCaml dnl from http://forge.ocamlcore.org/ dnl dnl Copyright © 2009 Richard W.M. Jones dnl Copyright © 2009 Stefano Zacchiroli dnl Copyright © 2000-2005 Olivier Andrieu dnl Copyright © 2000-2005 Jean-Christophe Filliâtre dnl Copyright © 2000-2005 Georges Mariano dnl dnl For documentation, please read the ocaml.m4 man page. 
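dnl (Editorial usage sketch, not part of the upstream macro file: a project's
dnl configure.ac would typically pull these checks in with something like
dnl     AC_PROG_OCAML
dnl     AS_IF([test "x$OCAMLC" = "xno"],
dnl         [AC_MSG_ERROR([OCaml compiler not found])])
dnl     AC_PROG_FINDLIB
dnl The error message text is an assumption made for this example; the macros
dnl themselves only report what they find and AC_SUBST the results.)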
AC_DEFUN([AC_PROG_OCAML], [dnl # checking for ocamlc AC_CHECK_TOOL([OCAMLC],[ocamlc],[no]) if test "$OCAMLC" != "no"; then OCAMLVERSION=`$OCAMLC -v | sed -n -e 's|.*version* *\(.*\)$|\1|p'` AC_MSG_RESULT([OCaml version is $OCAMLVERSION]) # If OCAMLLIB is set, use it if test "$OCAMLLIB" = ""; then OCAMLLIB=`$OCAMLC -where 2>/dev/null || $OCAMLC -v|tail -1|cut -d ' ' -f 4` else AC_MSG_RESULT([OCAMLLIB previously set; preserving it.]) fi AC_MSG_RESULT([OCaml library path is $OCAMLLIB]) AC_SUBST([OCAMLVERSION]) AC_SUBST([OCAMLLIB]) # checking for ocamlopt AC_CHECK_TOOL([OCAMLOPT],[ocamlopt],[no]) OCAMLBEST=byte if test "$OCAMLOPT" = "no"; then AC_MSG_WARN([Cannot find ocamlopt; bytecode compilation only.]) else TMPVERSION=`$OCAMLOPT -v | sed -n -e 's|.*version* *\(.*\)$|\1|p' ` if test "$TMPVERSION" != "$OCAMLVERSION" ; then AC_MSG_RESULT([versions differs from ocamlc; ocamlopt discarded.]) OCAMLOPT=no else OCAMLBEST=opt fi fi AC_SUBST([OCAMLBEST]) # checking for ocamlc.opt AC_CHECK_TOOL([OCAMLCDOTOPT],[ocamlc.opt],[no]) if test "$OCAMLCDOTOPT" != "no"; then TMPVERSION=`$OCAMLCDOTOPT -v | sed -n -e 's|.*version* *\(.*\)$|\1|p' ` if test "$TMPVERSION" != "$OCAMLVERSION" ; then AC_MSG_RESULT([versions differs from ocamlc; ocamlc.opt discarded.]) else OCAMLC=$OCAMLCDOTOPT fi fi # checking for ocamlopt.opt if test "$OCAMLOPT" != "no" ; then AC_CHECK_TOOL([OCAMLOPTDOTOPT],[ocamlopt.opt],[no]) if test "$OCAMLOPTDOTOPT" != "no"; then TMPVERSION=`$OCAMLOPTDOTOPT -v | sed -n -e 's|.*version* *\(.*\)$|\1|p' ` if test "$TMPVERSION" != "$OCAMLVERSION" ; then AC_MSG_RESULT([version differs from ocamlc; ocamlopt.opt discarded.]) else OCAMLOPT=$OCAMLOPTDOTOPT fi fi fi AC_SUBST([OCAMLOPT]) fi AC_SUBST([OCAMLC]) # checking for ocaml toplevel AC_CHECK_TOOL([OCAML],[ocaml],[no]) # checking for ocamldep AC_CHECK_TOOL([OCAMLDEP],[ocamldep],[no]) # checking for ocamlmktop AC_CHECK_TOOL([OCAMLMKTOP],[ocamlmktop],[no]) # checking for ocamlmklib AC_CHECK_TOOL([OCAMLMKLIB],[ocamlmklib],[no]) # checking for ocamldoc AC_CHECK_TOOL([OCAMLDOC],[ocamldoc],[no]) # checking for ocamlbuild AC_CHECK_TOOL([OCAMLBUILD],[ocamlbuild],[no]) ]) AC_DEFUN([AC_PROG_OCAMLLEX], [dnl # checking for ocamllex AC_CHECK_TOOL([OCAMLLEX],[ocamllex],[no]) if test "$OCAMLLEX" != "no"; then AC_CHECK_TOOL([OCAMLLEXDOTOPT],[ocamllex.opt],[no]) if test "$OCAMLLEXDOTOPT" != "no"; then OCAMLLEX=$OCAMLLEXDOTOPT fi fi AC_SUBST([OCAMLLEX]) ]) AC_DEFUN([AC_PROG_OCAMLYACC], [dnl AC_CHECK_TOOL([OCAMLYACC],[ocamlyacc],[no]) AC_SUBST([OCAMLYACC]) ]) AC_DEFUN([AC_PROG_CAMLP4], [dnl AC_REQUIRE([AC_PROG_OCAML])dnl # checking for camlp4 AC_CHECK_TOOL([CAMLP4],[camlp4],[no]) if test "$CAMLP4" != "no"; then TMPVERSION=`$CAMLP4 -v 2>&1| sed -n -e 's|.*version *\(.*\)$|\1|p'` if test "$TMPVERSION" != "$OCAMLVERSION" ; then AC_MSG_RESULT([versions differs from ocamlc]) CAMLP4=no fi fi AC_SUBST([CAMLP4]) # checking for companion tools AC_CHECK_TOOL([CAMLP4BOOT],[camlp4boot],[no]) AC_CHECK_TOOL([CAMLP4O],[camlp4o],[no]) AC_CHECK_TOOL([CAMLP4OF],[camlp4of],[no]) AC_CHECK_TOOL([CAMLP4OOF],[camlp4oof],[no]) AC_CHECK_TOOL([CAMLP4ORF],[camlp4orf],[no]) AC_CHECK_TOOL([CAMLP4PROF],[camlp4prof],[no]) AC_CHECK_TOOL([CAMLP4R],[camlp4r],[no]) AC_CHECK_TOOL([CAMLP4RF],[camlp4rf],[no]) AC_SUBST([CAMLP4BOOT]) AC_SUBST([CAMLP4O]) AC_SUBST([CAMLP4OF]) AC_SUBST([CAMLP4OOF]) AC_SUBST([CAMLP4ORF]) AC_SUBST([CAMLP4PROF]) AC_SUBST([CAMLP4R]) AC_SUBST([CAMLP4RF]) ]) AC_DEFUN([AC_PROG_FINDLIB], [dnl AC_REQUIRE([AC_PROG_OCAML])dnl # checking for ocamlfind 
AC_CHECK_TOOL([OCAMLFIND],[ocamlfind],[no]) AC_SUBST([OCAMLFIND]) ]) dnl Thanks to Jim Meyering for working this next bit out for us. dnl XXX We should define AS_TR_SH if it's not defined already dnl (eg. for old autoconf). AC_DEFUN([AC_CHECK_OCAML_PKG], [dnl AC_REQUIRE([AC_PROG_FINDLIB])dnl AC_MSG_CHECKING([for OCaml findlib package $1]) unset found unset pkg found=no for pkg in $1 $2 ; do if $OCAMLFIND query $pkg >/dev/null 2>/dev/null; then AC_MSG_RESULT([found]) AS_TR_SH([OCAML_PKG_$1])=$pkg found=yes break fi done if test "$found" = "no" ; then AC_MSG_RESULT([not found]) AS_TR_SH([OCAML_PKG_$1])=no fi AC_SUBST(AS_TR_SH([OCAML_PKG_$1])) ]) AC_DEFUN([AC_CHECK_OCAML_MODULE], [dnl AC_MSG_CHECKING([for OCaml module $2]) cat > conftest.ml <&5 2>&5 ; then found=yes break fi done if test "$found" ; then AC_MSG_RESULT([$$1]) else AC_MSG_RESULT([not found]) $1=no fi AC_SUBST([$1]) ]) dnl XXX Cross-compiling AC_DEFUN([AC_CHECK_OCAML_WORD_SIZE], [dnl AC_REQUIRE([AC_PROG_OCAML])dnl AC_MSG_CHECKING([for OCaml compiler word size]) cat > conftest.ml < conftest.ml <], [Define curses header to use]) ], [ CURSES_LIBS="-lcurses" AC_DEFINE([INCLUDE_CURSES_H], [], [Define curses header to use]) ]) AC_SUBST(CURSES_LIBS) ]) xen-4.4.0/m4/checkpolicy.m40000664000175000017500000000047612307313555013531 0ustar smbsmbAC_DEFUN([AC_PROG_CHECKPOLICY], [dnl # check for a checkpolicy binary with support for -t xen AC_CHECK_TOOL([CHECKPOLICY],[checkpolicy],[no]) if test "$CHECKPOLICY" != "no"; then CHECKPOLICYHELP=`$CHECKPOLICY -h | grep xen` if test "$CHECKPOLICYHELP" = ""; then CHECKPOLICY=no fi fi ]) xen-4.4.0/m4/stubdom.m40000664000175000017500000000366312307313555012712 0ustar smbsmbAC_DEFUN([AX_STUBDOM_DEFAULT_ENABLE], [ AC_ARG_ENABLE([$1], AS_HELP_STRING([--disable-$1], [Build and install $1 (default is ENABLED)]),[ AX_STUBDOM_INTERNAL([$1], [$2]) ],[ AX_ENABLE_STUBDOM([$1], [$2]) ]) AC_SUBST([$2]) ]) AC_DEFUN([AX_STUBDOM_DEFAULT_DISABLE], [ AC_ARG_ENABLE([$1], AS_HELP_STRING([--enable-$1], [Build and install $1 (default is DISABLED)]),[ AX_STUBDOM_INTERNAL([$1], [$2]) ],[ AX_DISABLE_STUBDOM([$1], [$2]) ]) AC_SUBST([$2]) ]) AC_DEFUN([AX_STUBDOM_CONDITIONAL], [ AC_ARG_ENABLE([$1], AS_HELP_STRING([--enable-$1], [Build and install $1]),[ AX_STUBDOM_INTERNAL([$1], [$2]) ]) ]) AC_DEFUN([AX_STUBDOM_CONDITIONAL_FINISH], [ AS_IF([test "x$$2" = "xy" || test "x$$2" = "x"], [ AX_ENABLE_STUBDOM([$1],[$2]) ],[ AX_DISABLE_STUBDOM([$1],[$2]) ]) AC_SUBST([$2]) ]) AC_DEFUN([AX_STUBDOM_AUTO_DEPENDS], [ AS_IF([test "x$$1" = "x" && test "x$$2" = "xn"], [ $1="n" ]) ]) AC_DEFUN([AX_ENABLE_STUBDOM], [ $2=y STUBDOM_TARGETS="$STUBDOM_TARGETS $2" STUBDOM_BUILD="$STUBDOM_BUILD $1" STUBDOM_INSTALL="$STUBDOM_INSTALL install-$2" ]) AC_DEFUN([AX_DISABLE_STUBDOM], [ $2=n ]) dnl Don't call this outside of this file AC_DEFUN([AX_STUBDOM_INTERNAL], [ AS_IF([test "x$enableval" = "xyes"], [ AX_ENABLE_STUBDOM([$1], [$2]) ],[ AS_IF([test "x$enableval" = "xno"],[ AX_DISABLE_STUBDOM([$1], [$2]) ]) ]) ]) AC_DEFUN([AX_STUBDOM_FINISH], [ AC_SUBST(STUBDOM_TARGETS) AC_SUBST(STUBDOM_BUILD) AC_SUBST(STUBDOM_INSTALL) echo "Will build the following stub domains:" for x in $STUBDOM_BUILD; do echo " $x" done ]) AC_DEFUN([AX_STUBDOM_LIB], [ AC_ARG_VAR([$1_URL], [Download url for $2]) AS_IF([test "x$$1_URL" = "x"], [ AS_IF([test "x$extfiles" = "xy"], [$1_URL=\@S|@\@{:@XEN_EXTFILES_URL\@:}@], [$1_URL="$4"]) ]) $1_VERSION="$3" AC_SUBST($1_URL) AC_SUBST($1_VERSION) ]) AC_DEFUN([AX_STUBDOM_LIB_NOEXT], [ AC_ARG_VAR([$1_URL], [Download url for $2]) AS_IF([test "x$$1_URL" 
= "x"], [ $1_URL="$4" ]) $1_VERSION="$3" AC_SUBST($1_URL) AC_SUBST($1_VERSION) ]) xen-4.4.0/m4/uuid.m40000664000175000017500000000037212307313555012175 0ustar smbsmbAC_DEFUN([AX_CHECK_UUID], [ AC_CHECK_HEADER([uuid/uuid.h],[ AC_CHECK_LIB([uuid], [uuid_clear], [libuuid="y"]) ]) AC_CHECK_HEADER([uuid.h],[libuuid="y"]) AS_IF([test "$libuuid" != "y"], [ AC_MSG_ERROR([cannot find a valid uuid library]) ]) ]) xen-4.4.0/CODING_STYLE0000664000175000017500000000577612307313555012330 0ustar smbsmbCoding Style for the Xen Hypervisor =================================== The Xen coding style described below is the coding style used by the Xen hypervisor itself (xen/*) as well as various associated low-level libraries (e.g. tools/libxc/*). An exception is made for files which are imported from an external source. In these cases the prevailing coding style of the upstream source is generally used (commonly the Linux coding style). Other parts of the code base may use other coding styles, sometimes explicitly (e.g. tools/libxl/CODING_STYLE) but often implicitly (Linux coding style is fairly common). In general you should copy the style of the surrounding code. If you are unsure please ask. Indentation ----------- Indenting uses spaces, not tabs - in contrast to Linux. An indent level consists of four spaces. Code within blocks is indented by one extra indent level. The enclosing braces of a block are indented the same as the code _outside_ the block. e.g. void fun(void) { /* One level of indent. */ { /* A second level of indent. */ } } White space ----------- Space characters are used to spread out logical statements, such as in the condition of an if or while. Spaces are placed between the keyword and the brackets surrounding the condition, between the brackets and the condition itself, and around binary operators (except the structure access operators, '.' and '->'). e.g. if ( (wibble & wombat) == 42 ) { ... There should be no trailing white space at the end of lines (including after the opening /* of a comment block). Line Length ----------- Lines should be less than 80 characters in length. Long lines should be split at sensible places and the trailing portions indented. User visible strings (e.g., printk() messages) should not be split so they can searched for more easily. Bracing ------- Braces ('{' and '}') are usually placed on a line of their own, except for the do/while loop. This is unlike the Linux coding style and unlike K&R. do/while loops are an exception. e.g.: if ( condition ) { /* Do stuff. */ } else { /* Other stuff. */ } while ( condition ) { /* Do stuff. */ } do { /* Do stuff. */ } while ( condition ); etc. Braces should be omitted for blocks with a single statement. e.g., if ( condition ) single_statement(); Comments -------- Only C style /* ... */ comments are to be used. C++ style // comments should not be used. Multi-word comments should begin with a capital letter and end with a full stop. Multi-line comment blocks should start and end with comment markers on separate lines and each line should begin with a leading '*'. /* * Example, multi-line comment block. * * Note beginning and end markers on separate lines and leading '*'. */ Emacs local variables --------------------- A comment block containing local variables for emacs is permitted at the end of files. 
It should be: /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * indent-tabs-mode: nil * End: */ xen-4.4.0/.hgignore0000664000175000017500000002263612307313555012257 0ustar smbsmb.*\.a$ .*\.cmi$ .*\.cmo$ .*\.cmx$ \..*\.d$ .*\.o$ .*\.opic$ .*\.pyc$ .*\.so$ .*\.so\..*$ .*\.tar\.bz2$ .*\.tar\.gz$ .*~$ .*\.swp$ .*\.tmp$ .*\.flc$ .*\.orig$ .*\.rej$ .*\.spot$ .*\.spit$ .*\.gcno$ .*\.gcda$ .*/a\.out$ .*/Modules\.symvers$ .*/cscope\..*$ ^cscope.*$ ^[^/]*\.bz2$ ^\.config$ ^\.pc (^|/)(tags|TAGS)$ (^|/)(GTAGS|GPATH|GSYMS|GRTAGS)$ ^autom4te\.cache$ ^config\.log$ ^config\.status$ ^config\.cache$ ^config/Toplevel\.mk$ ^build-.*$ ^dist/.*$ ^docs/autom4te\.cache$ ^docs/config\.log$ ^docs/config\.status ^docs/config/Toplevel\.mk ^docs/.*\.aux$ ^docs/.*\.dvi$ ^docs/.*\.log$ ^docs/.*\.pdf$ ^docs/.*\.ps$ ^docs/.*\.toc$ ^docs/api/.*$ ^docs/figs/xenserver\.eps$ ^docs/html/.*$ ^docs/interface/WARNINGS$ ^docs/interface/images\.pl$ ^docs/interface/images\.tex$ ^docs/interface/img1\.png$ ^docs/interface/index\.html$ ^docs/interface/interface\.css$ ^docs/interface/interface\.html$ ^docs/interface/labels\.pl$ ^docs/figs/.*\.png ^docs/man1/ ^docs/man5/ ^docs/pdf/.*$ ^docs/ps/.*$ ^docs/user/WARNINGS$ ^docs/user/images\.pl$ ^docs/user/images\.tex$ ^docs/user/img1\.png$ ^docs/user/img2\.png$ ^docs/user/img3\.png$ ^docs/user/index\.html$ ^docs/user/internals\.pl$ ^docs/user/labels\.pl$ ^docs/user/user\.css$ ^docs/user/user\.html$ ^docs/txt/.*$ ^docs/xen-api/vm_lifecycle.eps$ ^docs/xen-api/xenapi-datamodel-graph.eps$ ^docs/xen-api/xenapi.out$ ^extras/mini-os/include/list\.h$ ^extras/mini-os/include/mini-os$ ^extras/mini-os/include/x86/mini-os$ ^extras/mini-os/include/xen$ ^extras/mini-os/mini-os.*$ ^install/.*$ ^linux-[^/]*-paravirt/.*$ ^linux-2.6[^/]*/.*$ ^linux-[^/]*-rc/.*$ ^linux-[^/]*-tip/.*$ ^linux-[^/]*-git/.*$ ^linux-[^/]*\.patch$ ^mkddbxen$ ^netbsd-[^/]*-tools/.*$ ^netbsd-[^/]*-xen0/.*$ ^netbsd-[^/]*-xenU/.*$ ^netbsd-[^/]*\.patch$ ^patches/.*/\.makedep$ ^patches/ebtables-brnf-5_vs_2\.4\.25\.diff$ ^patches/ebtables\.diff$ ^patches/tmp/.*$ ^pristine-.*$ ^ref-.*$ ^tmp-.*$ ^stubdom/autom4te\.cache$ ^stubdom/binutils-.*$ ^stubdom/config\.log$ ^stubdom/config\.status$ ^stubdom/config\.cache$ ^stubdom/cross-root-.*$ ^stubdom/gcc-.*$ ^stubdom/include$ ^stubdom/ioemu$ ^stubdom/xenstore$ ^stubdom/libxc-.*$ ^stubdom/lwip-.*$ ^stubdom/mini-os-.*$ ^stubdom/mk-headers-.*$ ^stubdom/newlib-.*$ ^stubdom/pciutils-.*$ ^stubdom/zlib-.*$ ^stubdom/grub-.*$ ^stubdom/polarssl-.*$ ^stubdom/gmp-.*$ ^stubdom/tpm_emulator-.*$ ^stubdom/ocaml-.*$ ^stubdom/lwip/ ^stubdom/ioemu/ ^stubdom/stubdompath\.sh$ ^stubdom/vtpm/vtpm_manager\.h$ ^tools/.*/build/lib.*/.*\.py$ ^tools/blktap2/control/tap-ctl$ ^tools/blktap2/drivers/img2qcow$ ^tools/blktap2/drivers/lock-util$ ^tools/blktap2/drivers/qcow-create$ ^tools/blktap2/drivers/qcow2raw$ ^tools/blktap2/drivers/tapdisk-client$ ^tools/blktap2/drivers/tapdisk-diff$ ^tools/blktap2/drivers/tapdisk-stream$ ^tools/blktap2/drivers/tapdisk2$ ^tools/blktap2/drivers/td-util$ ^tools/blktap2/vhd/vhd-update$ ^tools/blktap2/vhd/vhd-util$ ^tools/blktap/drivers/blktapctrl$ ^tools/blktap/drivers/img2qcow$ ^tools/blktap/drivers/qcow-create$ ^tools/blktap/drivers/qcow2raw$ ^tools/blktap/drivers/tapdisk$ ^tools/check/\..*$ ^tools/console/xenconsole$ ^tools/console/xenconsoled$ ^tools/debugger/gdb/gdb-6\.2\.1-linux-i386-xen/.*$ ^tools/debugger/gdb/gdb-6\.2\.1/.*$ ^tools/debugger/gdb/gdb-6\.2\.1\.tar\.bz2$ ^tools/debugger/gdbsx/gdbsx$ ^tools/debugger/kdd/kdd$ ^tools/debugger/xenitp/xenitp$ 
^tools/firmware/.*/biossums$ ^tools/firmware/.*\.bin$ ^tools/firmware/.*\.sym$ ^tools/firmware/.*bios/.*bios.*\.txt$ ^tools/firmware/etherboot/eb-roms\.h$ ^tools/firmware/etherboot/ipxe/.*$ ^tools/firmware/etherboot/ipxe\.git/.*$ ^tools/firmware/extboot/extboot.img$ ^tools/firmware/extboot/signrom$ ^tools/firmware/hvmloader/acpi/mk_dsdt$ ^tools/firmware/hvmloader/acpi/dsdt.*\.c$ ^tools/firmware/hvmloader/acpi/dsdt_.*\.asl$ ^tools/firmware/hvmloader/acpi/ssdt_.*\.h$ ^tools/firmware/hvmloader/hvmloader$ ^tools/firmware/hvmloader/roms\.inc$ ^tools/firmware/rombios/BIOS-bochs-[^/]*$ ^tools/firmware/rombios/_rombios[^/]*_\.c$ ^tools/firmware/rombios/rombios[^/]*\.s$ ^tools/firmware/rombios/32bit/32bitbios_flat\.h$ ^tools/firmware/vgabios/vbetables-gen$ ^tools/firmware/vgabios/vbetables\.h$ ^tools/flask/utils/flask-getenforce$ ^tools/flask/utils/flask-get-bool$ ^tools/flask/utils/flask-loadpolicy$ ^tools/flask/utils/flask-setenforce$ ^tools/flask/utils/flask-set-bool$ ^tools/flask/utils/flask-label-pci$ ^tools/fs-back/fs-backend$ ^tools/hotplug/common/hotplugpath\.sh$ ^tools/include/xen/.*$ ^tools/include/xen-foreign/.*\.(c|h|size)$ ^tools/include/xen-foreign/checker$ ^tools/libxen/libxenapi- ^tools/libxen/test/test_bindings$ ^tools/libxen/test/test_event_handling$ ^tools/libxl/_.*\.h$ ^tools/libxl/_.*\.c$ ^tools/libxl/libxlu_cfg_y\.output$ ^tools/libxl/xl$ ^tools/libxl/libxl-save-helper$ ^tools/libxl/testidl$ ^tools/libxl/testidl\.c$ ^tools/libxl/tmp\..*$ ^tools/libxl/.*\.new$ ^tools/libxl/_libxl\.api-for-check ^tools/libxl/libxl\.api-ok ^tools/libvchan/vchan-node[12]$ ^tools/misc/cpuperf/cpuperf-perfcntr$ ^tools/misc/cpuperf/cpuperf-xen$ ^tools/misc/xc_shadow$ ^tools/misc/xen_cpuperf$ ^tools/misc/xen-detect$ ^tools/misc/xen-hptool$ ^tools/misc/xen-hvmcrash$ ^tools/misc/xen-tmem-list-parse$ ^tools/misc/xenperf$ ^tools/misc/xenpm$ ^tools/misc/xen-hvmctx$ ^tools/misc/xen-lowmemd$ ^tools/misc/gtraceview$ ^tools/misc/gtracestat$ ^tools/misc/xenlockprof$ ^tools/misc/xencov$ ^tools/pygrub/build/.*$ ^tools/python/build/.*$ ^tools/python/xen/util/path\.py$ ^tools/python/xen/lowlevel/xl/_pyxl_types.c ^tools/python/xen/lowlevel/xl/_pyxl_types.h ^tools/remus/imqebt/imqebt$ ^tools/remus/kmod/.*(\.cmd|\.mod|\.ko|\.mod\.c|\.symvers|\.xen)$ ^tools/security/secpol_tool$ ^tools/security/xen/.*$ ^tools/security/xensec_tool$ ^tools/tests/x86_emulator/blowfish\.bin$ ^tools/tests/x86_emulator/blowfish\.h$ ^tools/tests/x86_emulator/test_x86_emulator$ ^tools/tests/x86_emulator/x86_emulate$ ^tools/tests/regression/installed/.*$ ^tools/tests/regression/build/.*$ ^tools/tests/regression/downloads/.*$ ^tools/tests/xen-access/xen-access$ ^tools/tests/mem-sharing/memshrtool$ ^tools/tests/mce-test/tools/xen-mceinj$ ^tools/vtpm/tpm_emulator-.*\.tar\.gz$ ^tools/vtpm/tpm_emulator/.*$ ^tools/vtpm/vtpm/.*$ ^tools/vtpm_manager/manager/vtpm_managerd$ ^tools/xcutils/lsevtchn$ ^tools/xcutils/xc_restore$ ^tools/xcutils/xc_save$ ^tools/xcutils/readnotes$ ^tools/misc/xenwatchdogd$ ^tools/xenfb/sdlfb$ ^tools/xenfb/vncfb$ ^tools/xenmon/xentrace_setmask$ ^tools/xenmon/xenbaked$ ^tools/xenpaging/xenpaging$ ^tools/xenpmd/xenpmd$ ^tools/xenstat/xentop/xentop$ ^tools/xenstore/testsuite/tmp/.*$ ^tools/xenstore/init-xenstore-domain$ ^tools/xenstore/xen$ ^tools/xenstore/xenstore$ ^tools/xenstore/xenstore-chmod$ ^tools/xenstore/xenstore-exists$ ^tools/xenstore/xenstore-list$ ^tools/xenstore/xenstore-read$ ^tools/xenstore/xenstore-rm$ ^tools/xenstore/xenstore-write$ ^tools/xenstore/xenstore-control$ ^tools/xenstore/xenstore-ls$ 
^tools/xenstore/xenstore-watch$ ^tools/xenstore/xenstored$ ^tools/xenstore/xenstored_test$ ^tools/xenstore/xs_crashme$ ^tools/xenstore/xs_random$ ^tools/xenstore/xs_stress$ ^tools/xenstore/xs_tdb_dump$ ^tools/xenstore/xs_test$ ^tools/xenstore/xs_watch_stress$ ^tools/xentrace/xentrace_setsize$ ^tools/xentrace/tbctl$ ^tools/xentrace/xenctx$ ^tools/xentrace/xentrace$ ^tools/xm-test/ramdisk/buildroot ^tools/xm-test/aclocal.m4$ ^tools/xm-test/autom4te ^tools/xm-test/install-sh$ ^tools/xm-test/mkinstalldirs$ ^tools/xm-test/missing$ ^tools/xm-test/config(ure|.log|.status|.guess|.sub)$ ^tools/xm-test/Makefile(.in)*$ ^tools/xm-test/.*/Makefile(.in)*$ ^tools/xm-test/lib/XmTestLib/config.py$ ^tools/xm-test/lib/XmTestReport/xmtest.py$ ^tools/xm-test/tests/.*\.test$ ^tools/firmware/ovmf-remote ^tools/firmware/ovmf$ ^tools/qemu-xen-traditional-dir-remote ^tools/qemu-xen-traditional-dir$ ^tools/qemu-xen-dir-remote ^tools/qemu-xen-dir$ ^tools/firmware/seabios-dir-remote ^tools/firmware/seabios-dir$ ^tools/ocaml/.*/.*\.annot$ ^tools/ocaml/.*/.*\.cmx?a$ ^tools/ocaml/.*/META$ ^tools/ocaml/.*/\.ocamldep\.make$ ^tools/ocaml/libs/xl/_libxl_types\.ml\.in$ ^tools/ocaml/libs/xl/_libxl_types\.mli\.in$ ^tools/ocaml/libs/xl/_libxl_types\.inc$ ^tools/ocaml/libs/xl/xenlight\.ml$ ^tools/ocaml/libs/xl/xenlight\.mli$ ^tools/ocaml/xenstored/oxenstored$ ^tools/ocaml/test/xtl$ ^tools/ocaml/test/send_debug_keys$ ^tools/ocaml/test/list_domains$ ^tools/autom4te\.cache$ ^tools/config\.h$ ^tools/config\.log$ ^tools/config\.status$ ^tools/config\.cache$ ^config/Tools\.mk$ ^config/Stubdom\.mk$ ^config/Docs\.mk$ ^xen/\.banner.*$ ^xen/System.map$ ^xen/arch/arm/asm-offsets\.s$ ^xen/arch/arm/xen\.lds$ ^xen/arch/x86/asm-offsets\.s$ ^xen/arch/x86/boot/mkelf32$ ^xen/arch/x86/xen\.lds$ ^xen/arch/x86/boot/reloc\.S$ ^xen/arch/x86/boot/reloc\.bin$ ^xen/arch/x86/boot/reloc\.lnk$ ^xen/arch/x86/efi\.lds$ ^xen/arch/x86/efi/check\.efi$ ^xen/arch/x86/efi/disabled$ ^xen/arch/x86/efi/mkreloc$ ^xen/ddb/.*$ ^xen/include/headers\.chk$ ^xen/include/asm$ ^xen/include/asm-.*/asm-offsets\.h$ ^xen/include/compat/.*$ ^xen/include/hypervisor-ifs/arch$ ^xen/include/linux$ ^xen/include/public/public$ ^xen/include/xen/.*\.new$ ^xen/include/xen/acm_policy\.h$ ^xen/include/xen/banner\.h$ ^xen/include/xen/compile\.h$ ^xen/tools/figlet/figlet$ ^xen/tools/symbols$ ^xen/xsm/flask/include/av_perm_to_string\.h$ ^xen/xsm/flask/include/av_permissions\.h$ ^xen/xsm/flask/include/class_to_string\.h$ ^xen/xsm/flask/include/flask\.h$ ^xen/xsm/flask/include/initial_sid_to_string\.h$ ^xen/xen$ ^xen/xen-syms$ ^xen/xen\..*$ ^unmodified_drivers/linux-2.6/\.tmp_versions ^unmodified_drivers/linux-2.6/.*\.cmd$ ^unmodified_drivers/linux-2.6/.*\.ko$ ^unmodified_drivers/linux-2.6/.*\.mod\.c$ ^LibVNCServer.* xen-4.4.0/autogen.sh0000775000175000017500000000015412307313555012444 0ustar smbsmb#!/bin/sh -e autoconf ( cd tools autoconf autoheader ) ( cd stubdom autoconf ) ( cd docs autoconf ) xen-4.4.0/MAINTAINERS0000664000175000017500000002274212307313555012147 0ustar smbsmb List of maintainers and how to submit changes ============================================= Please try to follow the guidelines below. This will make things easier on the maintainers. Not all of these guidelines matter for every trivial patch so apply some common sense. 1. Always _test_ your changes, however small, on at least 4 or 5 people, preferably many more. 2. Make sure your changes compile correctly in multiple configurations. For example, both 32- and 64-bit x86. 3. 
Make a patch available to the relevant maintainer in the list. Use 'diff -u' to make the patch easy to merge. Be prepared to get your changes sent back with seemingly silly requests about formatting and variable names. These aren't as silly as they seem. One job the maintainers do is to keep things looking the same. PLEASE see http://wiki.xen.org/wiki/Submitting_Xen_Patches for hints on how to submit a patch to xen-unstable in a suitable form. PLEASE try to include any credit lines you want added with the patch. It avoids people being missed off by mistake and makes it easier to know who wants adding and who doesn't. PLEASE document known bugs. If it doesn't work for everything or does something very odd once a month document it. PLEASE remember that submissions must be made under the terms of the "Developer's Certificate of Origin" (DCO) and should include a Signed-off-by: line. 4. Make sure you have the right to send any changes you make. If you do changes at work you may find your employer owns the patch not you. 5. Happy hacking. Stable Release Maintenance ========================== The policy for inclusion in a Xen stable release is different to that for inclusion in xen-unstable. Please see http://wiki.xen.org/wiki/Xen_Maintenance_Releases for more information. Remember to copy the appropriate stable branch maintainer who will be listed in this section of the MAINTAINERS file in the appropriate branch. Unstable Subsystem Maintainers ============================== Descriptions of section entries: M: Mail patches to: FullName L: Mailing list that is relevant to this area W: Web-page with status/info T: SCM tree type and location. Type is one of: git, hg, quilt, stgit. S: Status, one of the following: Supported: Someone is actually paid to look after this. Maintained: Someone actually looks after it. Odd Fixes: It has a maintainer but they don't have time to do much other than throw the odd patch in. See below.. Orphan: No current maintainer [but maybe you could take the role as you write your new code]. Obsolete: Old code. Something tagged obsolete generally means it has been replaced by a better system and you should be using that. F: Files and directories with wildcard patterns. A trailing slash includes all files and subdirectory files. F: drivers/net/ all files in and below drivers/net F: drivers/net/* all files in drivers/net, but not below F: */net/* all files in "any top level directory"/net One pattern per line. Multiple F: lines acceptable. X: Files and directories that are NOT maintained, same rules as F: Files exclusions are tested before file matches. Can be useful for excluding a specific subdirectory, for instance: F: net/ X: net/ipv6/ matches all files in and below net excluding net/ipv6/ K: Keyword perl extended regex pattern to match content in a patch or file. For instance: K: of_get_profile matches patches or files that contain "of_get_profile" K: \b(printk|pr_(info|err))\b matches patches or files that contain one or more of the words printk, pr_info or pr_err One regex pattern per line. Multiple K: lines acceptable. 
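As a purely hypothetical illustration of the fields described above (the
subsystem name, maintainer and file pattern are invented for the example),
an entry might read:

EXAMPLE SUBSYSTEM
M:	A. N. Maintainer
L:	xen-devel@lists.xen.org
S:	Maintained
F:	xen/common/example.c
K:	\bexample_

The real entries follow.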
Maintainers List (try to look for most precise areas first) ----------------------------------- ACPI M: Jan Beulich S: Supported F: xen/arch/x86/acpi/ F: xen/drivers/acpi/ F: xen/include/acpi/ AMD IOMMU M: Suravee Suthikulpanit S: Maintained F: xen/drivers/passthrough/amd/ AMD SVM M: Boris Ostrovsky M: Suravee Suthikulpanit S: Supported F: xen/arch/x86/hvm/svm/ ARINC653 SCHEDULER M: Nathan Studer M: Robert VanVossen S: Supported F: xen/common/sched_arinc653.c F: tools/libxc/xc_arinc653.c ARM (W/ VIRTUALISATION EXTENSIONS) ARCHITECTURE M: Ian Campbell M: Stefano Stabellini M: Tim Deegan S: Supported L: xen-devel@lists.xen.org F: xen/arch/arm/ F: xen/include/asm-arm/ CPU POOLS M: Juergen Gross S: Supported F: xen/common/cpupool.c DEVICE TREE M: Ian Campbell M: Stefano Stabellini M: Tim Deegan S: Supported F: xen/common/libfdt/ F: xen/common/device_tree.c F: xen/include/xen/libfdt/ F: xen/include/xen/device_tree.h EFI M: Jan Beulich S: Supported F: xen/arch/x86/efi/ F: xen/include/efi/ F: xen/include/asm-x86/efi*.h F: xen/include/asm-x86/x86_*/efi*.h EVENT CHANNELS (FIFO-BASED ABI) M: David Vrabel S: Supported F: xen/common/event_fifo.c F: xen/include/xen/event_fifo.h GDBSX DEBUGGER M: Mukesh Rathor S: Supported F: xen/arch/x86/debug.c F: tools/debugger/gdbsx/ KDD DEBUGGER M: Tim Deegan S: Supported F: tools/debugger/kdd/ INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT) M: Joseph Cihula M: Gang Wei M: Shane Wang S: Supported F: xen/arch/x86/tboot.c F: xen/include/asm-x86/tboot.h INTEL(R) VT FOR DIRECTED I/O (VT-D) M: Xiantao Zhang S: Supported F: xen/drivers/passthrough/vtd/ INTEL(R) VT FOR X86 (VT-X) M: Jun Nakajima M: Eddie Dong S: Supported F: xen/arch/x86/hvm/vmx/ F: xen/arch/x86/mm/hap/p2m-ept.c F: xen/include/asm-x86/hvm/vmx/ IOMMU VENDOR INDEPENDENT CODE M: Xiantao Zhang M: Jan Beulich S: Supported F: xen/drivers/passthrough/ X: xen/drivers/passthrough/amd/ X: xen/drivers/passthrough/vtd/ F: xen/include/xen/iommu.h KEXEC M: David Vrabel S: Supported F: xen/common/{kexec,kimage}.c F: xen/include/{kexec,kimage}.h F: xen/arch/x86/machine_kexec.c F: xen/arch/x86/x86_64/kexec_reloc.S MACHINE CHECK (MCA) & RAS M: Christoph Egger M: Liu Jinsong S: Supported F: xen/arch/x86/cpu/mcheck/ MINI-OS M: Stefano Stabellini M: Samuel Thibault S: Supported F: extras/mini-os/ OCAML TOOLS M: David Scott S: Supported F: tools/ocaml/ POWER MANAGEMENT M: Jan Beulich M: Liu Jinsong S: Supported F: xen/arch/x86/acpi/ X: xen/arch/x86/acpi/boot.c X: xen/arch/x86/acpi/lib.c F: xen/drivers/cpufreq/ F: xen/include/acpi/cpufreq/ QEMU-DM M: Ian Jackson S: Supported T: git git://xenbits.xen.org/qemu-xen-*.git QEMU UPSTREAM M: Stefano Stabellini S: Supported T: git git://xenbits.xen.org/qemu-upstream-*.git REMUS M: Shriram Rajagopalan S: Maintained F: tools/remus/ F: tools/python/xen/remus/ F: tools/python/xen/lowlevel/checkpoint/ F: tools/blktap2/drivers/block-remus.c F: tools/blktap2/drivers/hashtable* SCHEDULING M: George Dunlap S: Supported F: xen/common/sched* SEABIOS UPSTREAM M: Ian Campbell S: Supported T: git git://xenbits.xen.org/seabios.git STUB DOMAINS M: Stefano Stabellini M: Samuel Thibault S: Supported F: stubdom/ TOOLSTACK M: Ian Jackson M: Stefano Stabellini M: Ian Campbell S: Supported F: tools/ TRANSCENDENT MEMORY (TMEM) M: Konrad Rzeszutek Wilk W: http://oss.oracle.com/projects/tmem S: Supported F: xen/common/tmem* F: xen/include/xen/tmem* F: docs/misc/tmem* UNMODIFIED LINUX PV DRIVERS M: Keir Fraser M: Jan Beulich S: Obsolete L: xen-devel@lists.xen.org F: unmodified_drivers/linux-2.6/ USB PV DRIVERS M: 
Noboru Iwamatsu S: Supported T: hg http://xenbits.xen.org/linux-2.6.18-xen.hg F: drivers/xen/usb*/ VTPM M: Daniel De Graaf S: Supported F: extras/mini-os/tpm* F: extras/mini-os/include/tpm* F: stubdom/vtpm/ F: stubdom/vtpmmgr/ F: docs/misc/vtpm.txt X86 ARCHITECTURE M: Keir Fraser M: Jan Beulich S: Supported L: xen-devel@lists.xen.org F: xen/arch/x86/ F: xen/include/asm-x86/ X86 MEMORY MANAGEMENT M: Tim Deegan S: Supported F: xen/arch/x86/mm/ X86 MEMORY SHARING AND PAGING M: Andres Lagar-Cavilla M: Tim Deegan S: Supported F: xen/arch/x86/mm/mem_sharing.c F: xen/arch/x86/mm/mem_paging.c F: tools/memshr XENTRACE M: George Dunlap S: Supported F: tools/xentrace/ F: xen/common/trace.c XSM/FLASK M: Daniel De Graaf S: Supported F: tools/flask/ F: xen/include/xsm/ F: xen/xsm/ F: docs/misc/xsm-flask.txt THE REST M: Keir Fraser L: xen-devel@lists.xen.org S: Supported F: * F: */ xen-4.4.0/install.sh0000664000175000017500000000131012307313555012440 0ustar smbsmb#!/bin/sh src='./install' if [ -d ./dist ]; then src='./dist/install' fi if ! [ -d $src ]; then echo "ERROR: Could not find a valid distribution directory." echo " If this is a source-only release, try 'make dist'." exit 1 fi dst='/' if [ $# -ne 0 ]; then dst=$1 fi if ! [ -d $dst ]; then echo "ERROR: You must specify a valid install directory." echo " The specified directory '$dst' is not valid." exit 1 fi tmp="`mktemp -d`" echo "Installing Xen from '$src' to '$dst'..." (cd $src; tar -cf - * ) | tar -C "$tmp" -xf - echo " - modifying permissions" chmod -R a+rX "$tmp" (cd $tmp; tar -cf - *) | tar --no-same-owner -C "$dst" -xf - rm -rf "$tmp" echo "All done." exit 0 xen-4.4.0/.hg-to-bk0000775000175000017500000000002112307313555012050 0ustar smbsmb#!/bin/sh exit 0 xen-4.4.0/.hgsigs0000664000175000017500000001114412307313555011731 0ustar smbsmb2a60be40e2cb2a3c62836e690927588c3e31d01c 0 iQEcBAABAgAGBQJNLxyGAAoJEIP+FMlX6CvZUjIH/1R0TcK4zBLl1FdXWF/LnOW+HgezYywSRHuJqNur6TweqVqKxnM6gCqjRvlGOrtWOhtEVnb3FmR7pRX4dEiQnBdO//b30ZbU8zOKvDlB7Pa0U81pcD19tloycV+LbHgulVxXnZXeQRTP/eLsUKTM4GKiqvDNN3EPdZADJyz+0MLf/BCJipXVkgmzbFefFlBDRtw05BGE899kiObqoA6LnPPVvZxBqW4nNxeYR6nK7mgps9RRrgMnt9fyPmSN0+ME1nxhfl6Gd9p9109S5ujHwnzwgO/5X+OQF/F0i1SQVvXsGQyM5eE/ZxHEpEKqtFyWPEIk4/eFLsqmdPAAr5XERaw= 9dca60d88c630fcc7a07ad68fd3a0dce02aee646 0 iQEcBAABAgAGBQJNPtkLAAoJEIP+FMlX6CvZzPMH/Rwsp62u3DXfzune2FBh6WjJerPusvkkhfogDc21Mfy7ed/rJfr2ovLYdiJhc7Ml5F51rLc/1d2FshiCDB9x5nVGyKSx9VfWew6fU/YI5wc2BiQg47d9WM6WI8kP07E51c+ML9nQu9QOE4xI0JpFy2u1jByJtG/Wy9qgLwvhK1OoCjs0Z9/R9h5rvWhi61GpQ1z04/FdMRn0JqSOy7nLb4qWmXrNg5LfddJl63yNcK08HfHyp+KtnrLMNei8T8TKEAv3amC29h2t5xMmVsGMkfakjLuC3k9Jirn29YP25QHNqz405rH3aimFbVnIfwelNW7y+/5jfJ4r8Fnt8Oq0+Y4= e7b31cc0093c6002effce7d038e929d936ed023f 0 iQEcBAABAgAGBQJNRvWqAAoJEIP+FMlX6CvZwYMH/2TbgdFOUQbzBCp1WgKHA2foq+KFBAl72uyOm3SPeaZID8mGe+jySnZj1zTgdZkuZv3sC510Hsl0EbJ+0cXtdvMemLuZHusRGzvP1cY3fWW6Ltme1EkFlsqEI3opO+mourSo8GVC/BnMqOl7fV8vpGv72BDqXQ/bBM8qgLnYeWN0yale7soR8Akpp27gNcivUUqKSerusHvIYV1mBl9Xg3mfQTCxR+bmM6lp4SXje38wWnqzAEWd5v9YpGKNacc0ca5mxJGz6t+dmr7PSdR5puYsSVU0CK3aWoL8jxGzT/caw03l5dboMaRbBWSD2R0UeZEG4B1C3aXFE9aiBV2I/No= 8df0c7f9cda6cb271a1414114e26f72e1d93a69a 0 iQEcBAABAgAGBQJNVB85AAoJEIP+FMlX6CvZ+aAIAL/+olR1XkxMyQyQlhV6pc9IPONHsUF1+xGcl1p3MRZCk32rNhdCM1NTMEW695Q9CRZ1/aZpCVJKSMNuuA3k6KEFEpyOMmAgWqRUgo7mMS/Jrp2qS0JCkKshQ3SoCJLsa6ENIJLKI/T4ZlUHTAK7MMfg7XjPLgHZU/l84LnpRDSfr8TeoQq4lcWYkAV+ra0VW0KXl1QrV5hMEmZAKqLhKHRYyRKgg3ZM81OHthe+zE8OUOd/yADxvwaovfai6c0wKoyHuRCi9Rzbp/aJbVFBkdDvZqhSWxq+IykC6MI/Yu3BlCTvwKVRgoYOtHJ4twjxln5qw+BExFZo3JfL16jQ6dE= 
d2d27d989ebce4325575f871415337cc54891dde 0 iQEcBAABAgAGBQJNXBYbAAoJEIP+FMlX6CvZTGIH/2bxz3UrQNNwsUMmx5tNKwoAzmTjX5IjpsibwZgIlWFTpZhmbdAK5jvUVdPgmAURgPUHIkcic9MRfXZepzqxEyE/zz2itCcDE6oo1ibb2cwn54c2RlHhcweaDzEDroRfcGZ2tEWXIjTEAyNwlhy9IQZ9wC3XAA0wTwXoHx7o9fUs3WSFahr+Cz93w/sc7uQ6LK52WvVvUd7UflSEx2X4uACDknxbKucG6POCxnQ3A+eVXJmdnHak5c+ySsjwu+fHiQ3/ca0JY1RlEF+8NT+aFobZ51kr7acCmFDvOsGd7h6hhgNVe0mujq5pyooklA4qW83/VudS/jOVPnSt+cGhBpE= 7b6791dbfecdb2eba6acd2c8bf0ced17d06746b2 0 iQEcBAABAgAGBQJNa9NOAAoJEIP+FMlX6CvZK+sH/REZ7jV0iWnDbvfPgdl/KlxEVvj4LsmLFAVBa0sEHa8ih+THp3C8Bso7izIs424y667QH6Pq+wAiB5TELOOHRsp6VV5AxYXuNF1Vw9xD9YGUQ5ECAIfc9gldfZZAv4QZayHOgdz4zRnMOrO1D8rqNDVO7BcgNutgf25WtrLsg+mxxyLjcA9Q5Q7a6dY1WSzzSf6beOW9MLh6Hy2pGTzO3CcS6M0Aa6sXZKisy7CWMOueAbwSmRBmw3il1oM62I44ovAJUxBvLTQZm0mNPzeq0IP59rLrGzdhZ76+iE47GK/zGjcC7o/jSQUy6xhuUwgcx3PdB8vzJzNsz6p8mIR+UKo= 6241fa0ad1a939d2eba11bdec929564276f56a9e 0 iQEcBAABAgAGBQJNcSJqAAoJEIP+FMlX6CvZbxEH/206722t0t46MfPq0/buPpQDNiB0HtFZY79u1Yc8xGTo23rQ2NTG10JdCurN2pyxwFesiSIZ4Ua3zqpZ0EA2EN6xAFY9Vua4WEPNJwK8pxxWABcHI017QTidQs41bEaUn92yyfmixq6iYCGvR+b46KWbip/i9Z6vZpw0g3JyOaxgAhkA+p4w90yUIKUkjTrtiuqnI31MUQE0urwWAXQUj78knrqEu3wb91+me1KLH9WzReBB7tt3/GmaK4qSdnHNpIkulVK0XJYzFkJzFr5eHVeYTGFb25Sgir2sIN2gxaCPI3/p90akFMJKmeNV5XCmOQ5g2GWUZTff8LGF6Fk0Me8= 8e620dc570ce42564c1630de5798e23909ee809c 0 iQEcBAABAgAGBQJQFqUYAAoJEIP+FMlX6CvZDGEH/jxUFG29okRASHY6XwFrJ4AU7tLyC8Bt2vAH3ZaA/h1Tn2GWkQXf4tzBAvvwWNBrm1hdj73OaB7dD2l2kvL551AhoSd4/5z+pPtpC72ItwFFTr3Ep9GRZEvEHculuei5IUzw1lxZuNCPCScBxQl+KI+dZHIeP0YO46VZTtG+EeBlj0SJ7JojKN4REd382eIgSLvpEFHCM6vqeCWM9FtDyomiMXdSosDLcWdFauFmV7cdWybA5WPj6bkJenABwX0A1V9uf225S0DTdO0t/7ncZ8h+wXTzqiixywA+lqza+u+HzNyopWmwiFcseFZjuHbKx6M2v0EvKPBcyTgdRZF/+Kg= f4c47bcc01e13cb2e3b4b71fd588d86278875442 0 iQEcBAABAgAGBQJQIpupAAoJEIP+FMlX6CvZnBwH/3qlibrquTmD2p3rcdVi/JkWixaKK3KH4Ows/76sAJYCdx9YkmtkgC4zyz5aph4Rfy7dYnQIUhbwNNdZNBDd0XOVkjQm0ZX1wQjv3yA3Eq9NOUDmfFWldF+oosOG3h/+MLm/36CkB7gFku1ENQ8kagytXe4rN3QdOiJ9Rz84BAv6nJoT/1OX+m+MlBs948orC23aNEfWxrsrv2s92v5fJcAfSLW/AtER2JnASIxWECJ/TOkuHtrO2QPoBymSnifM2R624L+ySwRxSxo+Owfpooje5eekk0ekifvdYtaJ3pOfoOCv8yGEtb7mXv62WowVkasUZa/q+rGdVpHuK0UlBjI= d44f290e81df5f78f0e3545742e3c4ff5415b0a7 0 iQEcBAABAgAGBQJQNji7AAoJEIP+FMlX6CvZvBcIAJPiA6ydvolYCvZsfGgafqCpcpG3f+PbUOwQRWfUcaKHPWFKnwRkmx9MAP9tEjXFNZcbimktDrofmee6XfLb8NLnCcHWZT8trRXYTNw2lM6DpK+AROUpAcShF3gCHQEtCJC6+NArD75aUkUji3awbGlhplHbyWvgvSSULhga+2xfYszRC7eI/g+7/f4JNkvqeWllAtd9fwftXaGhQi2V4LT5IimpJI7PivlDi0wi+BtqXHcmZhlnw3zpR51NRujsMVTNZMqe733kGdt8Y8j3vvsfXBye4+YAkEKFrzW3jTYlA9ws71oNVvWmwmOjOHpx6ASfquP3hhAoiHvb2c6Wims= 68640a3c99cefa86ae70fc49871025864e5671f2 0 iQEcBAABAgAGBQJQSdgUAAoJEIP+FMlX6CvZeWUH/1SeYnKXtkt7n+WuMyF2LvCzZQApmOo+hqVB7m0Pe4Yh9bEq1PAE+sJOAoQIBklCpY3WFsNGQN8yIbYyWRzbIM7oavknwoUoSn50zMk12HczqHigXvhL/eOghRKNsK9BrQnRV+byNHRjzSHPReNxpzueKBOTrCMlXtYMDb94swc+hatEDItT4TwZcaNPri3kiWND0DPmOk58Ke96ml9rpomS1ff976hYAZl+N1Q3Pt998SXiaglZ9EeL0Y8cY759XqZII4D4o6/oQW8MzWrQZr4woHHU7MfK3OWslelzP486esHyyjotEWeelhgl/RjIl0crNBbmCWU15BOAw7wSZh4= xen-4.4.0/unmodified_drivers/0000775000175000017500000000000012307313555014324 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/0000775000175000017500000000000012307313555015766 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/Makefile0000664000175000017500000000022112307313555017421 0ustar smbsmbinclude $(M)/overrides.mk obj-m += platform-pci/ obj-m += balloon/ obj-m += blkfront/ obj-m += netfront/ obj-m += scsifront/ obj-m += usbfront/ xen-4.4.0/unmodified_drivers/linux-2.6/balloon/0000775000175000017500000000000012307313555017414 5ustar 
smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/balloon/Makefile0000664000175000017500000000006612307313555021056 0ustar smbsmbifneq ($(KERNELRELEASE),) include $(src)/Kbuild endif xen-4.4.0/unmodified_drivers/linux-2.6/balloon/Kbuild0000664000175000017500000000025412307313555020552 0ustar smbsmbinclude $(M)/overrides.mk obj-m = xen-balloon.o EXTRA_CFLAGS += -I$(M)/platform-pci xen-balloon-y := balloon.o sysfs.o xen-balloon-$(CONFIG_XEN_SCRUB_PAGES) += scrub.o xen-4.4.0/unmodified_drivers/linux-2.6/netfront/0000775000175000017500000000000012307313555017625 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/netfront/Makefile0000664000175000017500000000006612307313555021267 0ustar smbsmbifneq ($(KERNELRELEASE),) include $(src)/Kbuild endif xen-4.4.0/unmodified_drivers/linux-2.6/netfront/Kbuild0000664000175000017500000000014412307313555020761 0ustar smbsmbinclude $(M)/overrides.mk obj-m = xen-vnif.o xen-vnif-objs := netfront.o xen-vnif-objs += accel.o xen-4.4.0/unmodified_drivers/linux-2.6/overrides.mk0000664000175000017500000000131712307313555020323 0ustar smbsmb# Hack: we need to use the config which was used to build the kernel, # except that that won't have the right headers etc., so duplicate # some of the mach-xen infrastructure in here. # # (i.e. we need the native config for things like -mregparm, but # a Xen kernel to find the right headers) _XEN_CPPFLAGS += -D__XEN_INTERFACE_VERSION__=0x00030205 _XEN_CPPFLAGS += -DCONFIG_XEN_COMPAT=0xffffff _XEN_CPPFLAGS += -I$(M)/include -I$(M)/compat-include -DHAVE_XEN_PLATFORM_COMPAT_H ifeq ($(ARCH),ia64) _XEN_CPPFLAGS += -DCONFIG_VMX_GUEST endif _XEN_CPPFLAGS += -include $(wildcard $(objtree)/include/*/autoconf.h) EXTRA_CFLAGS += $(_XEN_CPPFLAGS) EXTRA_AFLAGS += $(_XEN_CPPFLAGS) CPPFLAGS := -I$(M)/include $(CPPFLAGS) xen-4.4.0/unmodified_drivers/linux-2.6/usbfront/0000775000175000017500000000000012307313555017630 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/usbfront/Makefile0000664000175000017500000000006612307313555021272 0ustar smbsmbifneq ($(KERNELRELEASE),) include $(src)/Kbuild endif xen-4.4.0/unmodified_drivers/linux-2.6/usbfront/Kbuild0000664000175000017500000000031612307313555020765 0ustar smbsmbinclude $(M)/overrides.mk obj-m += $(if $(shell grep '^\#include "\.\./\.\./' $(obj)/usbfront.h), \ $(warning usbfront cannot be built), \ xen-usb.o) xen-usb-objs := usbfront-hcd.o xenbus.o xen-4.4.0/unmodified_drivers/linux-2.6/README0000664000175000017500000000174312307313555016653 0ustar smbsmbThese drivers provide paravirtualised drivers for pre-PVops Linux kernels which have the classic-Xen patches applied. They do not work with kernels which are not patched with the classic-Xen patches. In particular this includes all PVops kernels. For 2.6.36 and newer you should use the "PVHVM" functionality which is available in the mainline kernel. To build do one of: ./mkbuildtree [] NB. You can override paths to Xen sources and a (stub) XenLinux build tree via the XEN and XL environment variable. - or - make -C /path/to/kernel/build M=$PWD modules NB. This is your native kernel build tree (or a distro provided stub), not the XenLinux sources referred to in step 1. NB. If you are cross compiling, you need to set ARCH and CROSS_COMPILE too. You get four modules, xen-platform-pci.ko, xen-vbd.ko, xen-vnif.ko and xen-balloon.ko. Load xen-platform-pci first, and then whichever of xen-vbd, xen-vnif and xen-balloon.ko you happen to need. 
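As a concrete illustration only (the kernel build directory below is hypothetical
and will differ per distribution), an out-of-tree build and load might look like:

  make -C /lib/modules/2.6.18.8-xen/build M=$PWD modules
  insmod platform-pci/xen-platform-pci.ko
  insmod blkfront/xen-vbd.ko
  insmod netfront/xen-vnif.ko

NB. The modules are left in the per-driver subdirectories shown above; as noted
earlier, only kernels carrying the classic-Xen patches are supported.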
xen-4.4.0/unmodified_drivers/linux-2.6/xenbus/0000775000175000017500000000000012307313555017272 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/xenbus/empty_directory0000664000175000017500000000000012307313555022425 0ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/platform-pci/0000775000175000017500000000000012307313555020363 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/platform-pci/Makefile0000664000175000017500000000006612307313555022025 0ustar smbsmbifneq ($(KERNELRELEASE),) include $(src)/Kbuild endif xen-4.4.0/unmodified_drivers/linux-2.6/platform-pci/Kbuild0000664000175000017500000000133312307313555021520 0ustar smbsmbinclude $(M)/overrides.mk obj-m := xen-platform-pci.o EXTRA_CFLAGS += -I$(M)/platform-pci xen-platform-pci-objs := evtchn.o platform-pci.o gnttab.o xen_support.o xen-platform-pci-objs += features.o platform-compat.o xen-platform-pci-objs += reboot.o machine_reboot.o xen-platform-pci-objs += panic-handler.o xen-platform-pci-objs += ../xenbus/xenbus_comms.o xen-platform-pci-objs += ../xenbus/xenbus_xs.o xen-platform-pci-objs += ../xenbus/xenbus_probe.o xen-platform-pci-objs += ../xenbus/xenbus_dev.o xen-platform-pci-objs += ../xenbus/xenbus_client.o xen-platform-pci-objs += ../xenbus/xen_proc.o # Can we do better ? ifeq ($(ARCH),ia64) xen-platform-pci-objs += xencomm.o xencomm_arch.o xcom_hcall.o xcom_asm.o endif xen-4.4.0/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c0000664000175000017500000003001412307313555023122 0ustar smbsmb/****************************************************************************** * platform-pci.c * * Xen platform PCI device driver * Copyright (c) 2005, Intel Corporation. * Copyright (c) 2007, XenSource Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __ia64__ #include #endif #include "platform-pci.h" #ifdef HAVE_XEN_PLATFORM_COMPAT_H #include #endif #define DRV_NAME "xen-platform-pci" #define DRV_VERSION "0.10" #define DRV_RELDATE "03/03/2005" static int max_hypercall_stub_pages, nr_hypercall_stub_pages; char *hypercall_stubs; EXPORT_SYMBOL(hypercall_stubs); MODULE_AUTHOR("ssmith@xensource.com"); MODULE_DESCRIPTION("Xen platform PCI device"); MODULE_LICENSE("GPL"); /* NB. [aux-]ide-disks options do not unplug IDE CD-ROM drives. */ /* NB. aux-ide-disks is equiv to ide-disks except ignores primary master. 
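   E.g. loading with dev_unplug=aux-ide-disks,nics (an illustrative value, not a
   default) unplugs every emulated NIC and every emulated IDE disk other than the
   primary master.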
*/ static char *dev_unplug; module_param(dev_unplug, charp, 0644); MODULE_PARM_DESC(dev_unplug, "Emulated devices to unplug: " "[all,][ide-disks,][aux-ide-disks,][nics,][never] (default is 'all')\n"); struct pci_dev *xen_platform_pdev; static unsigned long shared_info_frame; static uint64_t callback_via; static int __devinit init_xen_info(void) { struct xen_add_to_physmap xatp; extern void *shared_info_area; #ifdef __ia64__ xencomm_initialize(); #endif setup_xen_features(); shared_info_frame = alloc_xen_mmio(PAGE_SIZE) >> PAGE_SHIFT; xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; xatp.gpfn = shared_info_frame; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); shared_info_area = ioremap(shared_info_frame << PAGE_SHIFT, PAGE_SIZE); if (shared_info_area == NULL) panic("can't map shared info\n"); return 0; } static unsigned long platform_mmio; static unsigned long platform_mmio_alloc; static unsigned long platform_mmiolen; unsigned long alloc_xen_mmio(unsigned long len) { unsigned long addr; addr = platform_mmio + platform_mmio_alloc; platform_mmio_alloc += len; BUG_ON(platform_mmio_alloc > platform_mmiolen); return addr; } #ifndef __ia64__ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38) static uint32_t xen_cpuid_base(void) { uint32_t base, eax, ebx, ecx, edx; char signature[13]; for (base = 0x40000000; base < 0x40010000; base += 0x100) { cpuid(base, &eax, &ebx, &ecx, &edx); *(uint32_t*)(signature + 0) = ebx; *(uint32_t*)(signature + 4) = ecx; *(uint32_t*)(signature + 8) = edx; signature[12] = 0; if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) return base; } return 0; } #endif static int init_hypercall_stubs(void) { uint32_t eax, ebx, ecx, edx, pages, msr, i, base; base = xen_cpuid_base(); if (base == 0) { printk(KERN_WARNING "Detected Xen platform device but not Xen VMM?\n"); return -EINVAL; } cpuid(base + 1, &eax, &ebx, &ecx, &edx); printk(KERN_INFO "Xen version %d.%d.\n", eax >> 16, eax & 0xffff); /* * Find largest supported number of hypercall pages. * We'll create as many as possible up to this number. */ cpuid(base + 2, &pages, &msr, &ecx, &edx); /* * Use __vmalloc() because vmalloc_exec() is not an exported symbol. * PAGE_KERNEL_EXEC also is not exported, hence we use PAGE_KERNEL. 
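 * (Clearing _PAGE_NX from __PAGE_KERNEL below keeps the stub pages executable;
 * each page is then registered with the hypervisor through the MSR reported by
 * the CPUID hypercall-page leaf.)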
* hypercall_stubs = vmalloc_exec(pages * PAGE_SIZE); */ while (pages > 0) { hypercall_stubs = __vmalloc( pages * PAGE_SIZE, GFP_KERNEL | __GFP_HIGHMEM, __pgprot(__PAGE_KERNEL & ~_PAGE_NX)); if (hypercall_stubs != NULL) break; pages--; /* vmalloc failed: try one fewer pages */ } if (hypercall_stubs == NULL) return -ENOMEM; for (i = 0; i < pages; i++) { unsigned long pfn; pfn = vmalloc_to_pfn((char *)hypercall_stubs + i*PAGE_SIZE); wrmsrl(msr, ((u64)pfn << PAGE_SHIFT) + i); } nr_hypercall_stub_pages = pages; max_hypercall_stub_pages = pages; printk(KERN_INFO "Hypercall area is %u pages.\n", pages); return 0; } static void resume_hypercall_stubs(void) { uint32_t base, ecx, edx, pages, msr, i; base = xen_cpuid_base(); BUG_ON(base == 0); cpuid(base + 2, &pages, &msr, &ecx, &edx); if (pages > max_hypercall_stub_pages) pages = max_hypercall_stub_pages; for (i = 0; i < pages; i++) { unsigned long pfn; pfn = vmalloc_to_pfn((char *)hypercall_stubs + i*PAGE_SIZE); wrmsrl(msr, ((u64)pfn << PAGE_SHIFT) + i); } nr_hypercall_stub_pages = pages; } #else /* __ia64__ */ #define init_hypercall_stubs() (0) #define resume_hypercall_stubs() ((void)0) #endif static uint64_t get_callback_via(struct pci_dev *pdev) { u8 pin; int irq; #ifdef __ia64__ for (irq = 0; irq < 16; irq++) { if (isa_irq_to_vector(irq) == pdev->irq) return irq; /* ISA IRQ */ } #else /* !__ia64__ */ irq = pdev->irq; if (irq < 16) return irq; /* ISA IRQ */ #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) pin = pdev->pin; #else pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin); #endif /* We don't know the GSI. Specify the PCI INTx line instead. */ return (((uint64_t)0x01 << 56) | /* PCI INTx identifier */ ((uint64_t)pci_domain_nr(pdev->bus) << 32) | ((uint64_t)pdev->bus->number << 16) | ((uint64_t)(pdev->devfn & 0xff) << 8) | ((uint64_t)(pin - 1) & 3)); } static int set_callback_via(uint64_t via) { struct xen_hvm_param a; a.domid = DOMID_SELF; a.index = HVM_PARAM_CALLBACK_IRQ; a.value = via; return HYPERVISOR_hvm_op(HVMOP_set_param, &a); } int xen_irq_init(struct pci_dev *pdev); int xenbus_init(void); int xen_reboot_init(void); int xen_panic_handler_init(void); int gnttab_init(void); #define XEN_IOPORT_BASE 0x10 #define XEN_IOPORT_PLATFLAGS (XEN_IOPORT_BASE + 0) /* 1 byte access (R/W) */ #define XEN_IOPORT_MAGIC (XEN_IOPORT_BASE + 0) /* 2 byte access (R) */ #define XEN_IOPORT_UNPLUG (XEN_IOPORT_BASE + 0) /* 2 byte access (W) */ #define XEN_IOPORT_DRVVER (XEN_IOPORT_BASE + 0) /* 4 byte access (W) */ #define XEN_IOPORT_SYSLOG (XEN_IOPORT_BASE + 2) /* 1 byte access (W) */ #define XEN_IOPORT_PROTOVER (XEN_IOPORT_BASE + 2) /* 1 byte access (R) */ #define XEN_IOPORT_PRODNUM (XEN_IOPORT_BASE + 2) /* 2 byte access (W) */ #define XEN_IOPORT_MAGIC_VAL 0x49d2 #define XEN_IOPORT_LINUX_PRODNUM 0xffff /* NB: register a proper one */ #define XEN_IOPORT_LINUX_DRVVER ((LINUX_VERSION_CODE << 8) + 0x0) #define UNPLUG_ALL_IDE_DISKS 1 #define UNPLUG_ALL_NICS 2 #define UNPLUG_AUX_IDE_DISKS 4 #define UNPLUG_ALL 7 static int check_platform_magic(struct device *dev, long ioaddr, long iolen) { short magic, unplug = 0; char protocol, *p, *q, *err; /* Unconditionally unplug everything */ if (!dev_unplug) unplug = UNPLUG_ALL; for (p = dev_unplug; p; p = q) { q = strchr(dev_unplug, ','); if (q) *q++ = '\0'; if (!strcmp(p, "all")) unplug |= UNPLUG_ALL; else if (!strcmp(p, "ide-disks")) unplug |= UNPLUG_ALL_IDE_DISKS; else if (!strcmp(p, "aux-ide-disks")) unplug |= UNPLUG_AUX_IDE_DISKS; else if (!strcmp(p, "nics")) unplug |= UNPLUG_ALL_NICS; else if (!strcmp(p, 
"never")) unplug = 0; else dev_warn(dev, "unrecognised option '%s' " "in module parameter 'dev_unplug'\n", p); } if (iolen < 0x16) { err = "backend too old"; goto no_dev; } magic = inw(XEN_IOPORT_MAGIC); if (magic != XEN_IOPORT_MAGIC_VAL) { err = "unrecognised magic value"; goto no_dev; } protocol = inb(XEN_IOPORT_PROTOVER); dev_info(dev, "I/O protocol version %d\n", protocol); switch (protocol) { case 1: outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM); outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER); if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) { dev_err(dev, "blacklisted by host\n"); return -ENODEV; } /* Fall through */ case 0: outw(unplug, XEN_IOPORT_UNPLUG); break; default: err = "unknown I/O protocol version"; goto no_dev; } return 0; no_dev: dev_warn(dev, "failed backend handshake: %s\n", err); if (!unplug) return 0; dev_err(dev, "failed to execute specified dev_unplug options!\n"); return -ENODEV; } #ifdef HAVE_OLDMEM_PFN_IS_RAM static int xen_oldmem_pfn_is_ram(unsigned long pfn) { struct xen_hvm_get_mem_type a; int ret; a.domid = DOMID_SELF; a.pfn = pfn; if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) return -ENXIO; switch (a.mem_type) { case HVMMEM_mmio_dm: ret = 0; break; case HVMMEM_ram_rw: case HVMMEM_ram_ro: default: ret = 1; break; } return ret; } #endif static int __devinit platform_pci_init(struct pci_dev *pdev, const struct pci_device_id *ent) { int i, ret; long ioaddr, iolen; long mmio_addr, mmio_len; if (xen_platform_pdev) return -EBUSY; xen_platform_pdev = pdev; i = pci_enable_device(pdev); if (i) return i; ioaddr = pci_resource_start(pdev, 0); iolen = pci_resource_len(pdev, 0); mmio_addr = pci_resource_start(pdev, 1); mmio_len = pci_resource_len(pdev, 1); callback_via = get_callback_via(pdev); if (mmio_addr == 0 || ioaddr == 0 || callback_via == 0) { printk(KERN_WARNING DRV_NAME ":no resources found\n"); return -ENOENT; } ret = pci_request_region(pdev, 1, DRV_NAME); if (ret < 0) return ret; ret = pci_request_region(pdev, 0, DRV_NAME); if (ret < 0) goto mem_out; platform_mmio = mmio_addr; platform_mmiolen = mmio_len; ret = init_hypercall_stubs(); if (ret < 0) goto out; ret = check_platform_magic(&pdev->dev, ioaddr, iolen); if (ret < 0) goto out; if ((ret = init_xen_info())) goto out; if ((ret = gnttab_init())) goto out; if ((ret = xen_irq_init(pdev))) goto out; if ((ret = set_callback_via(callback_via))) goto out; if ((ret = xenbus_init())) goto out; if ((ret = xen_reboot_init())) goto out; if ((ret = xen_panic_handler_init())) goto out; #ifdef HAVE_OLDMEM_PFN_IS_RAM register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); #endif out: if (ret) { pci_release_region(pdev, 0); mem_out: pci_release_region(pdev, 1); } return ret; } #define XEN_PLATFORM_VENDOR_ID 0x5853 #define XEN_PLATFORM_DEVICE_ID 0x0001 static struct pci_device_id platform_pci_tbl[] __devinitdata = { {XEN_PLATFORM_VENDOR_ID, XEN_PLATFORM_DEVICE_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, /* Continue to recognise the old ID for now */ {0xfffd, 0x0101, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {0,} }; MODULE_DEVICE_TABLE(pci, platform_pci_tbl); static struct pci_driver platform_driver = { name: DRV_NAME, probe: platform_pci_init, id_table: platform_pci_tbl, }; static int pci_device_registered; void platform_pci_resume(void) { struct xen_add_to_physmap xatp; resume_hypercall_stubs(); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; xatp.gpfn = shared_info_frame; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); if (set_callback_via(callback_via)) printk("platform_pci_resume 
failure!\n"); } static int __init platform_pci_module_init(void) { int rc; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) rc = pci_module_init(&platform_driver); #else rc = pci_register_driver(&platform_driver); #endif if (rc) { printk(KERN_INFO DRV_NAME ": No platform pci device model found\n"); return rc; } pci_device_registered = 1; return 0; } module_init(platform_pci_module_init); xen-4.4.0/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c0000664000175000017500000000410012307313555023500 0ustar smbsmb#include #include #include #include #include #include "platform-pci.h" #include struct ap_suspend_info { int do_spin; atomic_t nr_spinning; }; #ifdef CONFIG_SMP /* * Spinning prevents, for example, APs touching grant table entries while * the shared grant table is not mapped into the address space imemdiately * after resume. */ static void ap_suspend(void *_info) { struct ap_suspend_info *info = _info; BUG_ON(!irqs_disabled()); atomic_inc(&info->nr_spinning); mb(); while (info->do_spin) cpu_relax(); mb(); atomic_dec(&info->nr_spinning); } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) #define initiate_ap_suspend(i) smp_call_function(ap_suspend, i, 0, 0) #else #define initiate_ap_suspend(i) smp_call_function(ap_suspend, i, 0) #endif #else /* !defined(CONFIG_SMP) */ #define initiate_ap_suspend(i) 0 #endif static int bp_suspend(void) { int suspend_cancelled; BUG_ON(!irqs_disabled()); suspend_cancelled = HYPERVISOR_suspend(0); if (!suspend_cancelled) { platform_pci_resume(); gnttab_resume(); irq_resume(); } return suspend_cancelled; } int __xen_suspend(int fast_suspend, void (*resume_notifier)(int)) { int err, suspend_cancelled, nr_cpus; struct ap_suspend_info info; xenbus_suspend(); preempt_disable(); /* Prevent any races with evtchn_interrupt() handler. */ disable_irq(xen_platform_pdev->irq); info.do_spin = 1; atomic_set(&info.nr_spinning, 0); smp_mb(); nr_cpus = num_online_cpus() - 1; err = initiate_ap_suspend(&info); if (err < 0) { preempt_enable(); xenbus_suspend_cancel(); return err; } while (atomic_read(&info.nr_spinning) != nr_cpus) cpu_relax(); local_irq_disable(); suspend_cancelled = bp_suspend(); resume_notifier(suspend_cancelled); local_irq_enable(); smp_mb(); info.do_spin = 0; while (atomic_read(&info.nr_spinning) != 0) cpu_relax(); enable_irq(xen_platform_pdev->irq); preempt_enable(); if (!suspend_cancelled) xenbus_resume(); else xenbus_suspend_cancel(); return 0; } xen-4.4.0/unmodified_drivers/linux-2.6/platform-pci/panic-handler.c0000664000175000017500000000165212307313555023240 0ustar smbsmb#include #include #include #include #ifdef HAVE_XEN_PLATFORM_COMPAT_H #include #endif MODULE_LICENSE("GPL"); #ifdef __ia64__ static void xen_panic_hypercall(struct unw_frame_info *info, void *arg) { current->thread.ksp = (__u64)info->sw - 16; HYPERVISOR_shutdown(SHUTDOWN_crash); /* we're never actually going to get here... */ } #endif static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { #ifdef __ia64__ unw_init_running(xen_panic_hypercall, NULL); #else /* !__ia64__ */ HYPERVISOR_shutdown(SHUTDOWN_crash); #endif /* we're never actually going to get here... 
*/ return NOTIFY_DONE; } static struct notifier_block xen_panic_block = { .notifier_call = xen_panic_event }; int xen_panic_handler_init(void) { atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); return 0; } xen-4.4.0/unmodified_drivers/linux-2.6/platform-pci/xen_support.c0000664000175000017500000000405312307313555023117 0ustar smbsmb/****************************************************************************** * support.c * Xen module support functions. * Copyright (C) 2004, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. * */ #include #include #include #include #include #include #include "platform-pci.h" #ifdef HAVE_XEN_PLATFORM_COMPAT_H #include #endif #if defined (__ia64__) unsigned long __hypercall(unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4, unsigned long a5, unsigned long cmd) { unsigned long __res; __asm__ __volatile__ (";;\n" "mov r2=%1\n" "break 0x1000 ;;\n" "mov %0=r8 ;;\n" : "=r"(__res) : "r"(cmd) : "r2", "r8", "memory"); return __res; } EXPORT_SYMBOL(__hypercall); int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count) { return xencomm_hypercall_grant_table_op(cmd, uop, count); } EXPORT_SYMBOL(HYPERVISOR_grant_table_op); /* without using balloon driver on PV-on-HVM for ia64 */ void balloon_update_driver_allowance(long delta) { /* nothing */ } EXPORT_SYMBOL_GPL(balloon_update_driver_allowance); void balloon_release_driver_page(struct page *page) { /* nothing */ } EXPORT_SYMBOL_GPL(balloon_release_driver_page); #endif /* __ia64__ */ void xen_machphys_update(unsigned long mfn, unsigned long pfn) { BUG(); } EXPORT_SYMBOL(xen_machphys_update); xen-4.4.0/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h0000664000175000017500000000217512307313555023136 0ustar smbsmb/****************************************************************************** * platform-pci.h * * Xen platform PCI device driver * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2007, XenSource Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. 
*/ #ifndef _XEN_PLATFORM_PCI_H #define _XEN_PLATFORM_PCI_H #include unsigned long alloc_xen_mmio(unsigned long len); void platform_pci_resume(void); extern struct pci_dev *xen_platform_pdev; #endif /* _XEN_PLATFORM_PCI_H */ xen-4.4.0/unmodified_drivers/linux-2.6/platform-pci/evtchn.c0000664000175000017500000002134612307313555022024 0ustar smbsmb/****************************************************************************** * evtchn.c * * A simplified event channel for para-drivers in unmodified linux * * Copyright (c) 2002-2005, K A Fraser * Copyright (c) 2005, Intel Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include #include #include #include #include #include #include "platform-pci.h" #ifdef HAVE_XEN_PLATFORM_COMPAT_H #include #endif void *shared_info_area; #define is_valid_evtchn(x) ((x) != 0) #define evtchn_from_irq(x) (irq_evtchn[irq].evtchn) static struct { spinlock_t lock; irq_handler_t handler; void *dev_id; int evtchn; int close:1; /* close on unbind_from_irqhandler()? */ int inuse:1; int in_handler:1; } irq_evtchn[256]; static int evtchn_to_irq[NR_EVENT_CHANNELS] = { [0 ... 
NR_EVENT_CHANNELS-1] = -1 }; static DEFINE_SPINLOCK(irq_alloc_lock); static int alloc_xen_irq(void) { static int warned; int irq; spin_lock(&irq_alloc_lock); for (irq = 1; irq < ARRAY_SIZE(irq_evtchn); irq++) { if (irq_evtchn[irq].inuse) continue; irq_evtchn[irq].inuse = 1; spin_unlock(&irq_alloc_lock); return irq; } if (!warned) { warned = 1; printk(KERN_WARNING "No available IRQ to bind to: " "increase irq_evtchn[] size in evtchn.c.\n"); } spin_unlock(&irq_alloc_lock); return -ENOSPC; } static void free_xen_irq(int irq) { spin_lock(&irq_alloc_lock); irq_evtchn[irq].inuse = 0; spin_unlock(&irq_alloc_lock); } int irq_to_evtchn_port(int irq) { return irq_evtchn[irq].evtchn; } EXPORT_SYMBOL(irq_to_evtchn_port); void mask_evtchn(int port) { shared_info_t *s = shared_info_area; synch_set_bit(port, &s->evtchn_mask[0]); } EXPORT_SYMBOL(mask_evtchn); void unmask_evtchn(int port) { evtchn_unmask_t op = { .port = port }; VOID(HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &op)); } EXPORT_SYMBOL(unmask_evtchn); int bind_listening_port_to_irqhandler( unsigned int remote_domain, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { struct evtchn_alloc_unbound alloc_unbound; int err, irq; irq = alloc_xen_irq(); if (irq < 0) return irq; spin_lock_irq(&irq_evtchn[irq].lock); alloc_unbound.dom = DOMID_SELF; alloc_unbound.remote_dom = remote_domain; err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc_unbound); if (err) { spin_unlock_irq(&irq_evtchn[irq].lock); free_xen_irq(irq); return err; } irq_evtchn[irq].handler = handler; irq_evtchn[irq].dev_id = dev_id; irq_evtchn[irq].evtchn = alloc_unbound.port; irq_evtchn[irq].close = 1; evtchn_to_irq[alloc_unbound.port] = irq; unmask_evtchn(alloc_unbound.port); spin_unlock_irq(&irq_evtchn[irq].lock); return irq; } EXPORT_SYMBOL(bind_listening_port_to_irqhandler); int bind_caller_port_to_irqhandler( unsigned int caller_port, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { int irq; irq = alloc_xen_irq(); if (irq < 0) return irq; spin_lock_irq(&irq_evtchn[irq].lock); irq_evtchn[irq].handler = handler; irq_evtchn[irq].dev_id = dev_id; irq_evtchn[irq].evtchn = caller_port; irq_evtchn[irq].close = 0; evtchn_to_irq[caller_port] = irq; unmask_evtchn(caller_port); spin_unlock_irq(&irq_evtchn[irq].lock); return irq; } EXPORT_SYMBOL(bind_caller_port_to_irqhandler); void unbind_from_irqhandler(unsigned int irq, void *dev_id) { int evtchn; spin_lock_irq(&irq_evtchn[irq].lock); evtchn = evtchn_from_irq(irq); if (is_valid_evtchn(evtchn)) { evtchn_to_irq[evtchn] = -1; mask_evtchn(evtchn); if (irq_evtchn[irq].close) { struct evtchn_close close = { .port = evtchn }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) BUG(); } } irq_evtchn[irq].handler = NULL; irq_evtchn[irq].evtchn = 0; spin_unlock_irq(&irq_evtchn[irq].lock); while (irq_evtchn[irq].in_handler) cpu_relax(); free_xen_irq(irq); } EXPORT_SYMBOL(unbind_from_irqhandler); void notify_remote_via_irq(int irq) { int evtchn; evtchn = evtchn_from_irq(irq); if (is_valid_evtchn(evtchn)) notify_remote_via_evtchn(evtchn); } EXPORT_SYMBOL(notify_remote_via_irq); static DEFINE_PER_CPU(unsigned int, last_processed_l1i) = { BITS_PER_LONG - 1 }; static DEFINE_PER_CPU(unsigned int, last_processed_l2i) = { BITS_PER_LONG - 1 }; static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh, unsigned int idx) { return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]); } static irqreturn_t evtchn_interrupt(int irq, void *dev_id #if 
LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) , struct pt_regs *regs #else # define handler(irq, dev_id, regs) handler(irq, dev_id) #endif ) { unsigned int l1i, l2i, port; unsigned long masked_l1, masked_l2; /* XXX: All events are bound to vcpu0 but irq may be redirected. */ int cpu = 0; /*smp_processor_id();*/ irq_handler_t handler; shared_info_t *s = shared_info_area; vcpu_info_t *v = &s->vcpu_info[cpu]; unsigned long l1, l2; v->evtchn_upcall_pending = 0; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ /* Clear master flag /before/ clearing selector flag. */ wmb(); #endif l1 = xchg(&v->evtchn_pending_sel, 0); l1i = per_cpu(last_processed_l1i, cpu); l2i = per_cpu(last_processed_l2i, cpu); while (l1 != 0) { l1i = (l1i + 1) % BITS_PER_LONG; masked_l1 = l1 & ((~0UL) << l1i); if (masked_l1 == 0) { /* if we masked out all events, wrap around to the beginning */ l1i = BITS_PER_LONG - 1; l2i = BITS_PER_LONG - 1; continue; } l1i = __ffs(masked_l1); do { l2 = active_evtchns(cpu, s, l1i); l2i = (l2i + 1) % BITS_PER_LONG; masked_l2 = l2 & ((~0UL) << l2i); if (masked_l2 == 0) { /* if we masked out all events, move on */ l2i = BITS_PER_LONG - 1; break; } l2i = __ffs(masked_l2); /* process port */ port = (l1i * BITS_PER_LONG) + l2i; synch_clear_bit(port, &s->evtchn_pending[0]); irq = evtchn_to_irq[port]; if (irq < 0) continue; spin_lock(&irq_evtchn[irq].lock); handler = irq_evtchn[irq].handler; dev_id = irq_evtchn[irq].dev_id; if (unlikely(handler == NULL)) { printk("Xen IRQ%d (port %d) has no handler!\n", irq, port); spin_unlock(&irq_evtchn[irq].lock); continue; } irq_evtchn[irq].in_handler = 1; spin_unlock(&irq_evtchn[irq].lock); local_irq_enable(); handler(irq, irq_evtchn[irq].dev_id, regs); local_irq_disable(); spin_lock(&irq_evtchn[irq].lock); irq_evtchn[irq].in_handler = 0; spin_unlock(&irq_evtchn[irq].lock); /* if this is the final port processed, we'll pick up here+1 next time */ per_cpu(last_processed_l1i, cpu) = l1i; per_cpu(last_processed_l2i, cpu) = l2i; } while (l2i != BITS_PER_LONG - 1); l2 = active_evtchns(cpu, s, l1i); if (l2 == 0) /* we handled all ports, so we can clear the selector bit */ l1 &= ~(1UL << l1i); } return IRQ_HANDLED; } void irq_resume(void) { int evtchn, irq; for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) { mask_evtchn(evtchn); evtchn_to_irq[evtchn] = -1; } for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++) irq_evtchn[irq].evtchn = 0; } int xen_irq_init(struct pci_dev *pdev) { int irq; for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++) spin_lock_init(&irq_evtchn[irq].lock); return request_irq(pdev->irq, evtchn_interrupt, #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) SA_SHIRQ | SA_SAMPLE_RANDOM | SA_INTERRUPT, #else IRQF_SHARED | #ifdef IRQF_SAMPLE_RANDOM IRQF_SAMPLE_RANDOM | #endif IRQF_DISABLED, #endif "xen-platform-pci", pdev); } xen-4.4.0/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c0000664000175000017500000000676112307313555023646 0ustar smbsmb#include #include #include #include #include #include #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7) static int system_state = 1; EXPORT_SYMBOL(system_state); #endif void ctrl_alt_del(void) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) kill_proc(1, SIGINT, 1); /* interrupt init */ #else kill_cad_pid(SIGINT, 1); #endif } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) size_t strcspn(const char *s, const char *reject) { const char *p; const char *r; size_t count = 0; for (p = s; *p != '\0'; ++p) { for (r = reject; *r != '\0'; ++r) { if (*p == *r) return count; } ++count; } 
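	/* No byte of s occurred in reject, so the initial segment is the whole string. */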
return count; } EXPORT_SYMBOL(strcspn); #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) /* * Map a vmalloc()-space virtual address to the physical page frame number. */ unsigned long vmalloc_to_pfn(void * vmalloc_addr) { return page_to_pfn(vmalloc_to_page(vmalloc_addr)); } EXPORT_SYMBOL(vmalloc_to_pfn); #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout) { might_sleep(); spin_lock_irq(&x->wait.lock); if (!x->done) { DECLARE_WAITQUEUE(wait, current); wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); if (!timeout) { __remove_wait_queue(&x->wait, &wait); goto out; } } while (!x->done); __remove_wait_queue(&x->wait, &wait); } x->done--; out: spin_unlock_irq(&x->wait.lock); return timeout; } EXPORT_SYMBOL(wait_for_completion_timeout); #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12) /* fake do_exit using complete_and_exit */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) asmlinkage NORET_TYPE void do_exit(long code) #else fastcall NORET_TYPE void do_exit(long code) #endif { complete_and_exit(NULL, code); } EXPORT_SYMBOL_GPL(do_exit); #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) signed long schedule_timeout_interruptible(signed long timeout) { __set_current_state(TASK_INTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) /** * kzalloc - allocate memory. The memory is set to zero. * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. */ void *kzalloc(size_t size, int flags) { void *ret = kmalloc(size, flags); if (ret) memset(ret, 0, size); return ret; } EXPORT_SYMBOL(kzalloc); #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) /* Simplified asprintf. */ char *kasprintf(gfp_t gfp, const char *fmt, ...) 
{ va_list ap; unsigned int len; char *p, dummy[1]; va_start(ap, fmt); len = vsnprintf(dummy, 0, fmt, ap); va_end(ap); p = kmalloc(len + 1, gfp); if (!p) return NULL; va_start(ap, fmt); vsprintf(p, fmt, ap); va_end(ap); return p; } EXPORT_SYMBOL(kasprintf); #endif xen-4.4.0/unmodified_drivers/linux-2.6/compat-include/0000775000175000017500000000000012307313555020672 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/compat-include/xen/0000775000175000017500000000000012307313555021464 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/compat-include/xen/platform-compat.h0000664000175000017500000001354012307313555024745 0ustar smbsmb#ifndef COMPAT_INCLUDE_XEN_PLATFORM_COMPAT_H #define COMPAT_INCLUDE_XEN_PLATFORM_COMPAT_H #include #include #include #if defined(__LINUX_COMPILER_H) && !defined(__always_inline) #define __always_inline inline #endif #if defined(__LINUX_SPINLOCK_H) && !defined(DEFINE_SPINLOCK) #define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED #endif #ifdef _LINUX_INIT_H #ifndef __init #define __init #endif #ifndef __devinit #define __devinit #define __devinitdata #endif #endif /* _LINUX_INIT_H */ #if defined(__LINUX_CACHE_H) && !defined(__read_mostly) #define __read_mostly #endif #if defined(_LINUX_SKBUFF_H) && !defined(NET_IP_ALIGN) #define NET_IP_ALIGN 0 #endif #if defined(_LINUX_SKBUFF_H) && !defined(CHECKSUM_HW) #define CHECKSUM_HW CHECKSUM_PARTIAL #endif #if defined(_LINUX_ERR_H) && !defined(IS_ERR_VALUE) #define IS_ERR_VALUE(x) unlikely((x) > (unsigned long)-1000L) #endif #if defined(_ASM_IA64_PGTABLE_H) && !defined(_PGTABLE_NOPUD_H) #include #endif /* Some kernels have this typedef backported so we cannot reliably * detect based on version number, hence we forcibly #define it. */ #if defined(__LINUX_TYPES_H) || defined(__LINUX_GFP_H) || defined(_LINUX_KERNEL_H) #define gfp_t unsigned #endif #if defined(_LINUX_NOTIFIER_H) && !defined(ATOMIC_NOTIFIER_HEAD) #define ATOMIC_NOTIFIER_HEAD(name) struct notifier_block *name #define atomic_notifier_chain_register(chain,nb) notifier_chain_register(chain,nb) #define atomic_notifier_chain_unregister(chain,nb) notifier_chain_unregister(chain,nb) #define atomic_notifier_call_chain(chain,val,v) notifier_call_chain(chain,val,v) #endif #if defined(_LINUX_NOTIFIER_H) && !defined(BLOCKING_NOTIFIER_HEAD) #define BLOCKING_NOTIFIER_HEAD(name) struct notifier_block *name #define blocking_notifier_chain_register(chain,nb) notifier_chain_register(chain,nb) #define blocking_notifier_chain_unregister(chain,nb) notifier_chain_unregister(chain,nb) #define blocking_notifier_call_chain(chain,val,v) notifier_call_chain(chain,val,v) #endif #if defined(_LINUX_MM_H) && defined set_page_count #define init_page_count(page) set_page_count(page, 1) #endif #if defined(__LINUX_GFP_H) && !defined __GFP_NOMEMALLOC #define __GFP_NOMEMALLOC 0 #endif #if defined(_LINUX_FS_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) #define nonseekable_open(inode, filp) /* Nothing to do */ #endif #if defined(_LINUX_MM_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) unsigned long vmalloc_to_pfn(void *addr); #endif #if defined(__LINUX_COMPLETION_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); #endif #if defined(_LINUX_SCHED_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) signed long schedule_timeout_interruptible(signed long timeout); #endif #if defined(_LINUX_SLAB_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) void *kzalloc(size_t size, int flags); #endif #if 
defined(_LINUX_BLKDEV_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) #define end_that_request_last(req, uptodate) end_that_request_last(req) #endif #if defined(_LINUX_CAPABILITY_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) #define capable(cap) (1) #endif #if defined(_LINUX_KERNEL_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) extern char *kasprintf(gfp_t gfp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); #endif #if defined(_LINUX_SYSRQ_H) && LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18) #define handle_sysrq(x,y,z) handle_sysrq(x,y) #endif #if defined(_PAGE_PRESENT) && !defined(_PAGE_NX) #define _PAGE_NX 0 /* * This variable at present is referenced by netfront, but only in code that * is dead when running in hvm guests. To detect potential active uses of it * in the future, don't try to supply a 'valid' value here, so that any * mappings created with it will fault when accessed. */ #define __supported_pte_mask ((maddr_t)0) #endif /* This code duplication is not ideal, but || does not seem to properly * short circuit in a #if condition. **/ #if defined(_LINUX_NETDEVICE_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) #if !defined(SLE_VERSION) #define netif_tx_lock_bh(dev) spin_lock_bh(&(dev)->xmit_lock) #define netif_tx_unlock_bh(dev) spin_unlock_bh(&(dev)->xmit_lock) #elif SLE_VERSION_CODE < SLE_VERSION(10,1,0) #define netif_tx_lock_bh(dev) spin_lock_bh(&(dev)->xmit_lock) #define netif_tx_unlock_bh(dev) spin_unlock_bh(&(dev)->xmit_lock) #endif #endif #if defined(__LINUX_SEQLOCK_H) && !defined(DEFINE_SEQLOCK) #define DEFINE_SEQLOCK(x) seqlock_t x = SEQLOCK_UNLOCKED #endif /* Bug in RHEL4-U3: rw_lock_t is mistakenly defined in DEFINE_RWLOCK() macro */ #if defined(__LINUX_SPINLOCK_H) && defined(DEFINE_RWLOCK) #define rw_lock_t rwlock_t #endif #if defined(__LINUX_SPINLOCK_H) && !defined(DEFINE_RWLOCK) #define DEFINE_RWLOCK(x) rwlock_t x = RW_LOCK_UNLOCKED #endif #if defined(_LINUX_INTERRUPT_H) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) /** * RHEL4-U5 pulled back this feature into the older kernel * Since it is a typedef, and not a macro - detect this kernel via * RHEL_VERSION */ #if !defined(RHEL_VERSION) || (RHEL_VERSION == 4 && RHEL_UPDATE < 5) #if !defined(RHEL_MAJOR) || (RHEL_MAJOR == 4 && RHEL_MINOR < 5) typedef irqreturn_t (*irq_handler_t)(int, void *, struct pt_regs *); #endif #endif #endif #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23) #define setup_xen_features xen_setup_features #endif #ifndef atomic_cmpxchg #define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) #endif #ifdef sync_test_bit #define synch_change_bit sync_change_bit #define synch_clear_bit sync_clear_bit #define synch_set_bit sync_set_bit #define synch_test_and_change_bit sync_test_and_change_bit #define synch_test_and_clear_bit sync_test_and_clear_bit #define synch_test_and_set_bit sync_test_and_set_bit #define synch_test_bit sync_test_bit #endif #endif xen-4.4.0/unmodified_drivers/linux-2.6/compat-include/asm-generic/0000775000175000017500000000000012307313555023064 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/compat-include/asm-generic/pgtable-nopud.h0000664000175000017500000000062112307313555025775 0ustar smbsmb#ifndef _PGTABLE_NOPUD_H #define _PGTABLE_NOPUD_H #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) #error "This version of Linux should not need compat pgtable-nopud.h" #endif #define pud_t pgd_t #define pud_offset(d, va) d #define pud_none(pud) 0 #define pud_present(pud) 1 #define pud_bad(pud) 0 #define PTRS_PER_PUD 1 #endif /* _PGTABLE_NOPUD_H */ 
xen-4.4.0/unmodified_drivers/linux-2.6/compat-include/asm-generic/pgtable-nopmd.h0000664000175000017500000000056112307313555025770 0ustar smbsmb#ifndef _PGTABLE_NOPMD_H #define _PGTABLE_NOPMD_H #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) #error "This version of Linux should not need compat pgtable-nopmd.h" #endif #define pud_t pgd_t #define pud_offset(d, va) d #define pud_none(pud) 0 #define pud_present(pud) 1 #define PTRS_PER_PUD 1 #endif /* _PGTABLE_NOPMD_H */ xen-4.4.0/unmodified_drivers/linux-2.6/compat-include/linux/0000775000175000017500000000000012307313555022031 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/compat-include/linux/mutex.h0000664000175000017500000000154112307313555023345 0ustar smbsmb/* * Copyright (c) 2006 Cisco Systems. All rights reserved. * * This file is released under the GPLv2. */ /* mutex compatibility for pre-2.6.16 kernels */ #ifndef __LINUX_MUTEX_H #define __LINUX_MUTEX_H #include #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) #error "This version of Linux should not need compat mutex.h" #endif #include #include #define mutex semaphore #define DEFINE_MUTEX(foo) DECLARE_MUTEX(foo) #define mutex_init(foo) init_MUTEX(foo) #define mutex_lock(foo) down(foo) #define mutex_lock_interruptible(foo) down_interruptible(foo) /* this function follows the spin_trylock() convention, so * * it is negated to the down_trylock() return values! Be careful */ #define mutex_trylock(foo) !down_trylock(foo) #define mutex_unlock(foo) up(foo) #endif /* __LINUX_MUTEX_H */ xen-4.4.0/unmodified_drivers/linux-2.6/compat-include/linux/io.h0000664000175000017500000000027712307313555022617 0ustar smbsmb#ifndef _LINUX_IO_H #define _LINUX_IO_H #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) #error "This version of Linux should not need compat linux/io.h" #endif #include #endif xen-4.4.0/unmodified_drivers/linux-2.6/compat-include/linux/scatterlist.h0000664000175000017500000000037612307313555024551 0ustar smbsmb#ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12) #error "This version of Linux should not need compat linux/scatterlist.h" #endif #include #endif /* _LINUX_SCATTERLIST_H */ xen-4.4.0/unmodified_drivers/linux-2.6/blkfront/0000775000175000017500000000000012307313555017607 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/blkfront/Makefile0000664000175000017500000000006612307313555021251 0ustar smbsmbifneq ($(KERNELRELEASE),) include $(src)/Kbuild endif xen-4.4.0/unmodified_drivers/linux-2.6/blkfront/Kbuild0000664000175000017500000000012012307313555020735 0ustar smbsmbinclude $(M)/overrides.mk obj-m += xen-vbd.o xen-vbd-objs := blkfront.o vbd.o xen-4.4.0/unmodified_drivers/linux-2.6/mkbuildtree0000775000175000017500000000705012307313555020225 0ustar smbsmb#! /bin/sh if [ $1 ]; then uname="$1" else uname=`uname -m` echo "Defaulting to this machine's architecture, $uname, for linking." echo "This may be overridden on the command line (i386,x86_64,ia64)." 
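  # e.g. "./mkbuildtree x86_64" (an illustrative invocation) selects the x86_64
  # header links regardless of what this build host reports.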
fi C=$(cd $(dirname $0) && pwd) R=${C%/*/*} if [ -n "$XEN" -a -d "$XEN" ]; then XEN=$(cd $XEN && pwd) else XEN=$R/xen fi echo "Xen tree: $XEN" if [ -n "$XL" -a -d "$XL" ]; then XL=$(cd $XL && pwd) else XL=$R/linux-2.6.18-xen.hg fi echo "Linux tree: $XL" cd $C for d in $(find ${XL}/drivers/xen/ -mindepth 1 -maxdepth 1 -type d); do test -d $(basename $d) || continue lndir $d $(basename $d) > /dev/null 2>&1 done ln -sf ${XL}/drivers/xen/core/gnttab.c platform-pci if [ -f ${XL}/drivers/xen/core/features.c ]; then ln -sf ${XL}/drivers/xen/core/features.c platform-pci else ln -sf ${XL}/drivers/xen/features.c platform-pci fi ln -sf ${XL}/drivers/xen/core/xen_proc.c xenbus ln -sf ${XL}/drivers/xen/core/reboot.c platform-pci mkdir -p include/asm include/xen lndir -silent ${XL}/include/xen include/xen ln -nsf ${XEN}/include/public include/xen/interface # Need to be quite careful here: we don't want the files we link in to # risk overriding the native Linux ones (in particular, system.h must # be native and not xenolinux). case "$uname" in i[34567]86|x86_64) if [ -d ${XL}/arch/x86/include/mach-xen ]; then ln -sf ${XL}/arch/x86/include/mach-xen/asm/hypervisor.h include/asm ln -sf ${XL}/arch/x86/include/mach-xen/asm/hypercall*.h include/asm ln -sf ${XL}/arch/x86/include/mach-xen/asm/synch_bitops*.h include/asm ln -sf ${XL}/arch/x86/include/mach-xen/asm/maddr*.h include/asm ln -sf ${XL}/arch/x86/include/mach-xen/asm/gnttab_dma.h include/asm ln -sf ${XL}/arch/x86/lib/scrub.c balloon elif [ -d ${XL}/include/asm-x86 ]; then ln -sf ${XL}/include/asm-x86/mach-xen/asm/hypervisor.h include/asm ln -sf ${XL}/include/asm-x86/mach-xen/asm/hypercall*.h include/asm ln -sf ${XL}/include/asm-x86/mach-xen/asm/synch_bitops*.h include/asm ln -sf ${XL}/include/asm-x86/mach-xen/asm/maddr*.h include/asm ln -sf ${XL}/include/asm-x86/mach-xen/asm/gnttab_dma.h include/asm ln -sf ${XL}/arch/x86/lib/scrub.c balloon else if [ $uname = x86_64 ]; then mkdir -p include/asm-i386 lndir -silent ${XL}/include/asm-i386 include/asm-i386 else uname=i386 fi ln -sf ${XL}/include/asm-$uname/mach-xen/asm/hypervisor.h include/asm ln -sf ${XL}/include/asm-$uname/mach-xen/asm/hypercall.h include/asm ln -sf ${XL}/include/asm-$uname/mach-xen/asm/synch_bitops.h include/asm ln -sf ${XL}/include/asm-$uname/mach-xen/asm/maddr.h include/asm ln -sf ${XL}/include/asm-$uname/mach-xen/asm/gnttab_dma.h include/asm fi ;; "ia64") ln -sf ${XL}/include/asm-ia64/hypervisor.h include/asm ln -sf ${XL}/include/asm-ia64/hypercall.h include/asm ln -sf ${XL}/include/asm-ia64/synch_bitops.h include/asm ln -sf ${XL}/include/asm-ia64/maddr.h include/asm ln -sf ${XL}/include/asm-ia64/gnttab_dma.h include/asm mkdir -p include/asm/xen ln -sf ${XL}/include/asm-ia64/xen/xcom_hcall.h include/asm/xen ln -sf ${XL}/include/asm-ia64/xen/xencomm.h include/asm/xen ln -sf ${XL}/arch/ia64/xen/xcom_hcall.c platform-pci/ ln -sf ${XL}/arch/ia64/xen/xcom_asm.S platform-pci/ ln -sf ${XL}/arch/ia64/xen/xencomm.c platform-pci/xencomm_arch.c ln -sf ${XL}/drivers/xen/core/xencomm.c platform-pci ;; *) echo unknown architecture $uname exit 1 ;; esac xen-4.4.0/unmodified_drivers/linux-2.6/scsifront/0000775000175000017500000000000012307313555020000 5ustar smbsmbxen-4.4.0/unmodified_drivers/linux-2.6/scsifront/Makefile0000664000175000017500000000006612307313555021442 0ustar smbsmbifneq ($(KERNELRELEASE),) include $(src)/Kbuild endif xen-4.4.0/unmodified_drivers/linux-2.6/scsifront/Kbuild0000664000175000017500000000012712307313555021135 0ustar smbsmbinclude $(M)/overrides.mk obj-m += xen-scsi.o 
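# xen-scsi.ko is linked from the SCSI frontend proper plus its xenbus glue, listed below.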
xen-scsi-objs := scsifront.o xenbus.o xen-4.4.0/docs/0000775000175000017500000000000012307313555011373 5ustar smbsmbxen-4.4.0/docs/Makefile0000664000175000017500000001070012307313555013031 0ustar smbsmbXEN_ROOT=$(CURDIR)/.. include $(XEN_ROOT)/Config.mk -include $(XEN_ROOT)/config/Docs.mk VERSION := $(shell $(MAKE) -C $(XEN_ROOT)/xen --no-print-directory xenversion) DOC_ARCHES := arm x86_32 x86_64 DOC_MAN5SRC := $(wildcard man/*.pod.5) DOC_MAN1SRC := $(wildcard man/*.pod.1) DOC_MAN1 := $(patsubst man/%.pod.1,man1/%.1,$(DOC_MAN1SRC)) DOC_MAN5 := $(patsubst man/%.pod.5,man5/%.5,$(DOC_MAN5SRC)) DOC_MARKDOWN := $(wildcard misc/*.markdown) DOC_HTML := $(patsubst %.markdown,html/%.html,$(DOC_MARKDOWN)) \ $(patsubst man/%.pod.1,html/man/%.1.html,$(DOC_MAN1SRC)) \ $(patsubst man/%.pod.5,html/man/%.5.html,$(DOC_MAN5SRC)) \ $(patsubst %.txt,html/%.txt,$(wildcard misc/*.txt)) \ $(patsubst %,html/hypercall/%/index.html,$(DOC_ARCHES)) DOC_TXT := $(patsubst %.txt,txt/%.txt,$(wildcard misc/*.txt)) \ $(patsubst %.markdown,txt/%.txt,$(DOC_MARKDOWN)) \ $(patsubst man/%.pod.1,txt/man/%.1.txt,$(DOC_MAN1SRC)) \ $(patsubst man/%.pod.5,txt/man/%.5.txt,$(DOC_MAN5SRC)) .PHONY: all all: build .PHONY: build build: html txt man-pages figs .PHONY: html html: $(DOC_HTML) html/index.html .PHONY: txt txt: ifdef POD2TEXT $(MAKE) $(DOC_TXT) else @echo "pod2text not installed; skipping text outputs." endif .PHONY: figs figs: ifdef FIG2DEV set -x; $(MAKE) -C figs else @echo "fig2dev (transfig) not installed; skipping figs." endif .PHONY: man-pages man-pages: ifdef POD2MAN $(MAKE) $(DOC_MAN1) $(DOC_MAN5) else @echo "pod2man not installed; skipping man-pages." endif man1/%.1: man/%.pod.1 Makefile $(INSTALL_DIR) $(@D) $(POD2MAN) --release=$(VERSION) --name=`echo $@ | sed 's/^man1.//'| \ sed 's/.1//'` -s 1 -c "Xen" $< $@ man5/%.5: man/%.pod.5 Makefile $(INSTALL_DIR) $(@D) $(POD2MAN) --release=$(VERSION) --name=`echo $@ | sed 's/^man5.//'| \ sed 's/.5//'` -s 5 -c "Xen" $< $@ .PHONY: clean clean: $(MAKE) -C figs clean rm -rf .word_count *.aux *.dvi *.bbl *.blg *.glo *.idx *~ rm -rf *.ilg *.log *.ind *.toc *.bak *.tmp core rm -rf html txt rm -rf man5 rm -rf man1 .PHONY: distclean distclean: clean rm -rf $(XEN_ROOT)/config/Docs.mk config.log config.status config.cache \ autom4te.cache .PHONY: install install: all rm -rf $(DESTDIR)$(DOCDIR) $(INSTALL_DIR) $(DESTDIR)$(DOCDIR) $(INSTALL_DIR) $(DESTDIR)$(MANDIR) cp -R man1 $(DESTDIR)$(MANDIR) cp -R man5 $(DESTDIR)$(MANDIR) [ ! -d html ] || cp -R html $(DESTDIR)$(DOCDIR) html/index.html: $(DOC_HTML) $(CURDIR)/gen-html-index INDEX $(PERL) -w -- $(CURDIR)/gen-html-index -i INDEX html $(DOC_HTML) html/%.html: %.markdown $(INSTALL_DIR) $(@D) ifdef MARKDOWN @echo "Running markdown to generate $*.html ... " $(MARKDOWN) $< > $@.tmp ; \ $(call move-if-changed,$@.tmp,$@) else @echo "markdown not installed; skipping $*.html." endif html/%.txt: %.txt $(INSTALL_DIR) $(@D) cp $< $@ html/man/%.1.html: man/%.pod.1 Makefile $(INSTALL_DIR) $(@D) ifdef POD2HTML $(POD2HTML) --infile=$< --outfile=$@.tmp $(call move-if-changed,$@.tmp,$@) else @echo "pod2html not installed; skipping $<." endif html/man/%.5.html: man/%.pod.5 Makefile $(INSTALL_DIR) $(@D) ifdef POD2HTML $(POD2HTML) --infile=$< --outfile=$@.tmp $(call move-if-changed,$@.tmp,$@) else @echo "pod2html not installed; skipping $<." endif # For non-x86 arches exclude the subarch whole x86 arch. 
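# (i.e. the arm hypercall index below is generated with -X arch-x86 so that
# x86-only public headers are left out.)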
$(foreach i,$(filter-out x86_32 x86_64,$(DOC_ARCHES)),html/hypercall/$(i)/index.html): EXTRA_EXCLUDE := -X arch-x86 html/hypercall/%/index.html: $(CURDIR)/xen-headers Makefile rm -rf $(@D) $(INSTALL_DIR) $(@D) $(PERL) -w $(CURDIR)/xen-headers -O $(@D) \ -T 'arch-$* - Xen public headers' \ $(patsubst %,-X arch-%,$(filter-out $*,$(DOC_ARCHES))) \ $(patsubst %,-X xen-%,$(filter-out $*,$(DOC_ARCHES))) \ $(EXTRA_EXCLUDE) \ $(XEN_ROOT)/xen include/public include/xen/errno.h -include $(wildcard html/hypercall/*/.deps) txt/%.txt: %.txt $(INSTALL_DIR) $(@D) cp $< $@.tmp $(call move-if-changed,$@.tmp,$@) txt/%.txt: %.markdown $(INSTALL_DIR) $(@D) cp $< $@.tmp $(call move-if-changed,$@.tmp,$@) txt/man/%.1.txt: man/%.pod.1 Makefile $(INSTALL_DIR) $(@D) ifdef POD2TEXT $(POD2TEXT) $< $@.tmp $(call move-if-changed,$@.tmp,$@) else @echo "pod2text not installed; skipping $<." endif txt/man/%.5.txt: man/%.pod.5 Makefile $(INSTALL_DIR) $(@D) ifdef POD2TEXT $(POD2TEXT) $< $@.tmp $(call move-if-changed,$@.tmp,$@) else @echo "pod2text not installed; skipping $<." endif ifeq (,$(findstring clean,$(MAKECMDGOALS))) $(XEN_ROOT)/config/Docs.mk: $(error You have to run ./configure before building docs) endif xen-4.4.0/docs/gen-html-index0000664000175000017500000000567112307313555014147 0ustar smbsmb#!/usr/bin/env perl # # Generate indexes for html documentation # use strict; use warnings; use Getopt::Long; use IO::File; use File::Basename; Getopt::Long::Configure('bundling'); @ARGV >= 2 or die; our @docs; our @dirs; our %index; our $outdir; GetOptions("i=s" => sub { read_index(@_);} ) or die; ($outdir,@docs) = @ARGV; sub write_file ($$) { my ($opath, $odata) = @_; print STDOUT "Writing: $opath\n"; my $out = new IO::File "$opath.new", '>' or die "$opath $!"; print $out $odata or die $!; rename "$opath.new", "$opath" or die "$opath $!"; } sub make_page ($$$) { my ($file,$title,$content) = @_; my $o = ''; my $h1; if ( $title eq "" ) { $title = $h1 = "Xen Documentation"; } else { $h1 = "Xen Documentation - $title"; $title = "Xen Documentation - $title"; } $o .= <$title

$h1

    $content
END write_file($file, $o); } sub make_linktext ($) { my ($l) = @_; return "$1($2)" if $l =~ m,^man/(.*)\.([0-9].*)\.html,; $l =~ s/.(?:html|txt)$//g; return $index{$l} if exists $index{$l}; return basename($l); } sub make_link ($$) { my ($ref,$base) = @_; my $txt = make_linktext($ref); $ref =~ s,^$base/,, if $base; #/ return "
  • $txt
  • \n"; } sub make_links ($@) { my ($dir,@docs) = @_; my $idx = ''; foreach my $of (sort { make_linktext($a) cmp make_linktext($b) } @docs) { $idx .= make_link($of,$dir); } return $idx; } sub read_index ($$) { my ($opt, $val) = @_; my $idx = new IO::File "$val", '<' or die "$val $!"; while ($_ = $idx->getline()) { s/^\s+//; s/\s+$//; next if m/^\#/; next unless m/\S/; m/^(\S+)\s+(\S.*)$/ or die; $index{$1} = $2; } } sub uniq (@) { my %h; foreach (@_) { $h{$_} = 1; } return keys %h; } for (@docs) { s,^\Q$outdir\E/,, } @docs = grep { -e "$outdir/$_" && (make_linktext($_) ne "NO-INDEX") } @docs; my $top = ''; # Return a list of all directories leading to $path sub dirs($) { my ($path) = @_; my @dirs; while ( $path =~ m,/, ) { $path =~ m,/([^/]+)$,; push @dirs, $`;#` $path = $`;#` } return @dirs; } foreach my $od (sort { $a cmp $b } uniq map { dirs($_) } @docs) { my @d = (grep /^\Q$od\E/, @docs); if ( @d == 1 and $d[0] eq "$od/index.html" ) { next if $d[0] =~ m,/,;#/ linked to from the subdirectory entry. $top .= make_link("$od/index.html", 0); } else { my $links = make_links(undef,@d); my $secttitle = make_linktext($od); $top .= <$secttitle
      $links
    END $links = make_links($od,@d); my $idx = ''; $idx .= <$secttitle
      $links
    END make_page("$outdir/$od/index.html", $secttitle, $idx); } } make_page("$outdir/index.html", "", $top); xen-4.4.0/docs/misc/0000775000175000017500000000000012307313555012326 5ustar smbsmbxen-4.4.0/docs/misc/printk-formats.txt0000664000175000017500000000122312307313555016045 0ustar smbsmbXen custom %p format options. A subset, borrowed from Linux. All parameters to a %p option should be compatible with void*. Regular pointers are fine. Numbers should make use of the _p() macro. Symbol/Function pointers: %ps Symbol name with condition offset and size (iff offset != 0) e.g. printk default_idle+0x78/0x7d %pS Symbol name with unconditional offset and size e.g. printk+0/0x48 default_idle+0x78/0x7d In the case that an appropriate symbol name can't be found, %p[sS] will fall back to '%p' and print the address in hex. xen-4.4.0/docs/misc/xen-command-line.markdown0000664000175000017500000006503612307313555017237 0ustar smbsmb# Xen Hypervisor Command Line Options This document covers the command line options which the Xen Hypervisor. ## Types of parameter Most parameters take the form `option=value`. Different options on the command line should be space delimited. All options are case sensitive, as are all values unless explicitly noted. ### Boolean (``) All boolean option may be explicitly enabled using a `value` of > `yes`, `on`, `true`, `enable` or `1` They may be explicitly disabled using a `value` of > `no`, `off`, `false`, `disable` or `0` In addition, a boolean option may be enabled by simply stating its name, and may be disabled by prefixing its name with `no-`. ####Examples Enable noreboot mode > `noreboot=true` Disable x2apic support (if present) > `x2apic=off` Enable synchronous console mode > `sync_console` Explicitly specifying any value other than those listed above is undefined, as is stacking a `no-` prefix with an explicit value. ### Integer (``) An integer parameter will default to decimal and may be prefixed with a `-` for negative numbers. Alternatively, a hexadecimal number may be used by prefixing the number with `0x`, or an octal number may be used if a leading `0` is present. Providing a string which does not validly convert to an integer is undefined. ### Size (``) A size parameter may be any integer, with a size suffix * `G` or `g`: GiB (2^30) * `M` or `m`: MiB (2^20) * `K` or `k`: KiB (2^10) * `B` or `b`: Bytes Without a size suffix, the default will be kilo. Providing a suffix other than those listed above is undefined. ### String Many parameters are more complicated and require more intricate configuration. The detailed description of each individual parameter specify which values are valid. ### List Some options take a comma separated list of values. ### Combination Some parameters act as combinations of the above, most commonly a mix of Boolean and String. These are noted in the relevant sections. ## Parameter details ### acpi > `= force | ht | noirq | ` **String**, or **Boolean** to disable. The **acpi** option is used to control a set of four related boolean flags; `acpi_force`, `acpi_ht`, `acpi_noirq` and `acpi_disabled`. By default, Xen will scan the DMI data and blacklist certain systems which are known to have broken ACPI setups. Providing `acpi=force` will cause Xen to ignore the blacklist and attempt to use all ACPI features. Using `acpi=ht` causes Xen to parse the ACPI tables enough to enumerate all CPUs, but will not use other ACPI features. This is not common, and only has an effect if your system is blacklisted. 
The `acpi=noirq` option causes Xen to not parse the ACPI MADT table looking for IO-APIC entries. This is also not common, and any system which requires this option to function should be blacklisted. Additionally, this will not prevent Xen from finding IO-APIC entries from the MP tables. Finally, any of the boolean false options can be used to disable ACPI usage entirely. Because responsibility for ACPI processing is shared between Xen and the domain 0 kernel this option is automatically propagated to the domain 0 command line ### acpi\_apic\_instance > `= ` Specify which ACPI MADT table to parse for APIC information, if more than one is present. ### acpi\_pstate\_strict > `= ` ### acpi\_skip\_timer\_override > `= ` Instruct Xen to ignore timer-interrupt override. Because responsibility for ACPI processing is shared between Xen and the domain 0 kernel this option is automatically propagated to the domain 0 command line ### acpi\_sleep > `= s3_bios | s3_mode` ### allowsuperpage > `= ` > Default: `true` Permit Xen to use superpages when performing memory management. ### apic > `= bigsmp | default` Override Xen's logic for choosing the APIC driver. By default, if there are more than 8 CPUs, Xen will switch to `bigsmp` over `default`. ### allow\_unsafe > `= ` > Default: `false` Force boot on potentially unsafe systems. By default Xen will refuse to boot on systems with the following errata: * AMD Erratum 121. Processors with this erratum are subject to a guest triggerable Denial of Service. Override only if you trust all of your PV guests. ### apic\_verbosity > `= verbose | debug` Increase the verbosity of the APIC code from the default value. ### arat > `= ` > Default: `true` Permit Xen to use "Always Running APIC Timer" support on compatible hardware in combination with cpuidle. This option is only expected to be useful for developers wishing Xen to fall back to older timing methods on newer hardware. ### ats > `= ` > Default: `true` Permits Xen to set up and use PCI Address Translation Services, which is required for PCI Passthrough. ### availmem > `= ` > Default: `0` (no limit) Specify a maximum amount of available memory, to which Xen will clamp the e820 table. ### badpage > `= List of [ | - ]` Specify that certain pages, or certain ranges of pages contain bad bytes and should not be used. For example, if your memory tester says that byte `0x12345678` is bad, you would place `badpage=0x12345` on Xen's command line. ### bootscrub > `= ` > Default: `true` Scrub free RAM during boot. This is a safety feature to prevent accidentally leaking sensitive VM data into other VMs if Xen crashes and reboots. ### cachesize > `= ` If set, override Xen's calculation of the level 2 cache line size. ### clocksource > `= pit | hpet | acpi` If set, override Xen's default choice for the platform timer. ### com1,com2 > `= [/][,[DPS][,[|pci|amt][,[][,[][,[]]]]]]` Both option `com1` and `com2` follow the same format. * `` may be either an integer baud rate, or the string `auto` if the bootloader or other earlier firmware has already set it up. * Optionally, a clock speed measured in hz can be specified. * `DPS` represents the number of data bits, the parity, and the number of stop bits. `D` is an integer between 5 and 8 for the number of data bits. `P` is a single character representing the type of parity: * `n` No * `o` Odd * `e` Even * `m` Mark * `s` Space `S` is an integer 1 or 2 for the number of stop bits. * `` is an integer which specifies the IO base port for UART registers. 
* `` is the IRQ number to use, or `0` to use the UART in poll mode only. * `` is the PCI location of the UART, in `:.` notation. * `` is the PCI bridge behind which is the UART, in `:.` notation. * `pci` indicates that Xen should scan the PCI bus for the UART, avoiding Intel AMT devices. * `amt` indicated that Xen should scan the PCI bus for the UART, including Intel AMT devices if present. A typical setup for most situations might be `com1=115200,8n1` ### conring\_size > `= ` > Default: `conring_size=16k` Specify the size of the console ring buffer. ### console > `= List of [ vga | com1[H,L] | com2[H,L] | dbgp | none ]` > Default: `console=com1,vga` Specify which console(s) Xen should use. `vga` indicates that Xen should try and use the vga graphics adapter. `com1` and `com2` indicates that Xen should use serial ports 1 and 2 respectively. Optionally, these arguments may be followed by an `H` or `L`. `H` indicates that transmitted characters will have their MSB set, while received characters must have their MSB set. `L` indicates the converse; transmitted and received characters will have their MSB cleared. This allows a single port to be shared by two subsystems (e.g. console and debugger). `dbgp` indicates that Xen should use a USB debug port. `none` indicates that Xen should not use a console. This option only makes sense on its own. ### console\_timestamps > `= ` > Default: `false` Flag to indicate whether include a timestamp with each console line. ### console\_to\_ring > `= ` > Default: `false` Flag to indicate whether all guest console output should be copied into the console ring buffer. ### conswitch > `= [x]` > Default `conswitch=a` Specify which character should be used to switch serial input between Xen and dom0. The required sequence is CTRL-<switch char> three times. The optional trailing `x` indicates that Xen should not automatically switch the console input to dom0 during boot. Any other value, including omission, causes Xen to automatically switch to the dom0 console during dom0 boot. Use `conswitch=ax` to keep the default switch character, but for xen to keep the console. ### cpu\_type > `= arch_perfmon` If set, force use of the performance counters for oprofile, rather than detecting available support. ### cpufreq > `= dom0-kernel | none | xen` > Default: `xen` Indicate where the responsibility for driving power states lies. ### cpuid\_mask\_cpu (AMD only) > `= fam_0f_rev_c | fam_0f_rev_d | fam_0f_rev_e | fam_0f_rev_f | fam_0f_rev_g | fam_10_rev_b | fam_10_rev_c | fam_11_rev_b` If the other **cpuid\_mask\_{,ext\_}e{c,d}x** options are fully set (unspecified on the command line), specify a pre-canned cpuid mask to mask the current processor down to appear as the specified processor. It is important to ensure that all hosts in a pool appear the same to guests to allow successful live migration. ### cpuid\_mask\_ ecx,edx,ext\_ecx,ext\_edx,xsave_eax > `= ` > Default: `~0` (all bits set) These five command line parameters are used to specify cpuid masks to help with cpuid levelling across a pool of hosts. Setting a bit in the mask indicates that the feature should be enabled, while clearing a bit in the mask indicates that the feature should be disabled. It is important to ensure that all hosts in a pool appear the same to guests to allow successful live migration. ### cpuidle > `= ` ### cpuinfo > `= ` ### crashinfo_maxaddr > `= ` > Default: `4G` Specify the maximum address to allocate certain structures, if used in combination with the `low_crashinfo` command line option. 
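For example, a host running a 32bit dom0 kernel with kexec configured might combine the two related options as `low_crashinfo=min crashinfo_maxaddr=2G` (the values shown here are purely illustrative).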
### crashkernel > `= :[,...][@]` ### credit2\_balance\_over > `= ` ### credit2\_balance\_under > `= ` ### credit2\_load\_window\_shift > `= ` ### dbgp > `= ehci[ | @pci:. ]` Specify the USB controller to use, either by instance number (when going over the PCI busses sequentially) or by PCI device (must be on segment 0). ### debug\_stack\_lines > `= ` > Default: `20` Limits the number lines printed in Xen stack traces. ### debugtrace > `= ` > Default: `128` Specify the size of the console debug trace buffer in KiB. The debug trace feature is only enabled in debugging builds of Xen. ### dma\_bits > `= ` Specify the bit width of the DMA heap. ### dom0\_ioports\_disable > `= List of -` Specify a list of IO ports to be excluded from dom0 access. ### dom0\_max\_vcpus Either: > `= `. The number of VCPUs to give to dom0. This number of VCPUs can be more than the number of PCPUs on the host. The default is the number of PCPUs. Or: > `= -` where `` and `` are integers. Gives dom0 a number of VCPUs equal to the number of PCPUs, but always at least `` and no more than ``. Using `` may give more VCPUs than PCPUs. `` or `` may be omitted and the defaults of 1 and unlimited respectively are used instead. For example, with `dom0_max_vcpus=4-8`: Number of PCPUs | Dom0 VCPUs 2 | 4 4 | 4 6 | 6 8 | 8 10 | 8 ### dom0\_mem > `= List of ( min: | max: | )` Set the amount of memory for the initial domain (dom0). If a size is positive, it represents an absolute value. If a size is negative, it is subtracted from the total available memory. * `` specifies the exact amount of memory. * `min:` specifies the minimum amount of memory. * `max:` specifies the maximum amount of memory. If `` is not specified, the default is all the available memory minus some reserve. The reserve is 1/16 of the available memory or 128 MB (whichever is smaller). The amount of memory will be at least the minimum but never more than the maximum (i.e., `max` overrides the `min` option). If there isn't enough memory then as much as possible is allocated. `max:` also sets the maximum reservation (the maximum amount of memory dom0 can balloon up to). If this is omitted then the maximum reservation is unlimited. For example, to set dom0's initial memory allocation to 512MB but allow it to balloon up as far as 1GB use `dom0_mem=512M,max:1G` If you use this option then it is highly recommended that you disable any dom0 autoballooning feature present in your toolstack. See the _xl.conf(5)_ man page or [Xen Best Practices](http://wiki.xen.org/wiki/Xen_Best_Practices#Xen_dom0_dedicated_memory_and_preventing_dom0_memory_ballooning). ### dom0\_shadow > `= ` ### dom0\_vcpus\_pin > `= ` > Default: `false` Pin dom0 vcpus to their respective pcpus ### e820-mtrr-clip > `= ` Flag that specifies if RAM should be clipped to the highest cacheable MTRR. > Default: `true` on Intel CPUs, otherwise `false` ### e820-verbose > `= ` > Default: `false` Flag that enables verbose output when processing e820 information and applying clipping. ### edd (x86) > `= off | on | skipmbr` Control retrieval of Extended Disc Data (EDD) from the BIOS during boot. ### edid (x86) > `= no | force` Either force retrieval of monitor EDID information via VESA DDC, or disable it (edid=no). This option should not normally be required except for debugging purposes. ### extra\_guest\_irqs > `= [][,]` > Default: `32,256` Change the number of PIRQs available for guests. The optional first number is common for all domUs, while the optional second number (preceded by a comma) is for dom0. 
Changing the setting for domU has no impact on dom0 and vice versa. For example to change dom0 without changing domU, use `extra_guest_irqs=,512` ### flask\_enabled > `= ` ### flask\_enforcing > `= ` ### font > `= ` where height is `8x8 | 8x14 | 8x16 '` Specify the font size when using the VESA console driver. ### gdb > `= [/][,DPS[,[,[,[,]]]] | pci | amt ] ` Specify the serial parameters for the GDB stub. ### gnttab\_max\_nr\_frames > `= ` Specify the maximum number of frames per grant table operation. ### guest\_loglvl > `= [/]` where level is `none | error | warning | info | debug | all` > Default: `guest_loglvl=none/warning` Set the logging level for Xen guests. Any log message with equal more more importance will be printed. The optional `` option instructs which severities should be rate limited. ### hap > `= ` > Default: `true` Flag to globally enable or disable support for Hardware Assisted Paging (HAP) ### hap\_1gb > `= ` > Default: `true` Flag to enable 1 GB host page table support for Hardware Assisted Paging (HAP). ### hap\_2mb > `= ` > Default: `true` Flag to enable 2 MB host page table support for Hardware Assisted Paging (HAP). ### hpetbroadcast > `= ` ### hvm\_debug > `= ` ### hvm\_port80 > `= ` ### highmem-start > `= ` Specify the memory boundary past which memory will be treated as highmem (x86 debug hypervisor only). ### idle\_latency\_factor > `= ` ### ioapic\_ack ### iommu ### iommu\_inclusive\_mapping > `= ` ### irq\_ratelimit > `= ` ### irq\_vector\_map ### ivrs_hpet[``] > `=[:]:.` Force the use of `[:]:.` as device ID of HPET `` instead of the one specified by the IVHD sub-tables of the IVRS ACPI table. ### ivrs_ioapic[``] > `=[:]:.` Force the use of `[:]:.` as device ID of IO-APIC `` instead of the one specified by the IVHD sub-tables of the IVRS ACPI table. ### lapic Force the use of use of the local APIC on a uniprocessor system, even if left disabled by the BIOS. This option will accept any value at all. ### lapic\_timer\_c2\_ok > `= ` ### ler > `= ` ### loglvl > `= [/]` where level is `none | error | warning | info | debug | all` > Default: `loglvl=warning` Set the logging level for Xen. Any log message with equal more more importance will be printed. The optional `` option instructs which severities should be rate limited. ### low\_crashinfo > `= none | min | all` > Default: `none` if not specified at all, or to `min` if **low_crashinfo** is present without qualification. This option is only useful for hosts with a 32bit dom0 kernel, wishing to use kexec functionality in the case of a crash. It represents which data structures should be deliberately allocated in low memory, so the crash kernel may find find them. Should be used in combination with **crashinfo_maxaddr**. ### max\_cstate > `= ` ### max\_gsi\_irqs > `= ` ### maxcpus > `= ` ### mce > `= ` ### mce\_fb > `= ` ### mce\_verbosity > `= verbose` Specify verbose machine check output. ### mem > `= ` Specify the maximum address of physical RAM. Any RAM beyond this limit is ignored by Xen. ### mmcfg > `= [,amd-fam10]` > Default: `1` Specify if the MMConfig space should be enabled. ### msi > `= ` > Default: `true` Force Xen to (not) use PCI-MSI, even if ACPI FADT says otherwise. ### mwait-idle > `= ` > Default: `true` Use the MWAIT idle driver (with model specific C-state knowledge) instead of the ACPI based one. ### nmi > `= ignore | dom0 | fatal` > Default: `nmi=fatal` Specify what Xen should do in the event of an NMI parity or I/O error. 
`ignore` discards the error; `dom0` causes Xen to report the error to dom0, while 'fatal' causes Xen to print diagnostics and then hang. ### noapic Instruct Xen to ignore any IOAPICs that are present in the system, and instead continue to use the legacy PIC. This is _not_ recommended with pvops type kernels. Because responsibility for APIC setup is shared between Xen and the domain 0 kernel this option is automatically propagated to the domain 0 command line. ### nofxsr > `= ` ### noirqbalance > `= ` Disable software IRQ balancing and affinity. This can be used on systems such as Dell 1850/2850 that have workarounds in hardware for IRQ routing issues. ### nolapic > `= ` > Default: `false` Ignore the local APIC on a uniprocessor system, even if enabled by the BIOS. This option will accept value. ### no-real-mode (x86) > `= ` Do not execute real-mode bootstrap code when booting Xen. This option should not be used except for debugging. It will effectively disable the **vga** option, which relies on real mode to set the video mode. ### noreboot > `= ` Do not automatically reboot after an error. This is useful for catching debug output. Defaults to automatically reboot after 5 seconds. ### noserialnumber > `= ` Disable CPU serial number reporting. ### nosmp > `= ` Disable SMP support. No secondary processors will be booted. Defaults to booting secondary processors. ### nr\_irqs > `= ` ### numa > `= on | off | fake= | noacpi` Default: `on` ### pci-phantom > `=[:]:,` Mark a group of PCI devices as using phantom functions without actually advertising so, so the IOMMU can create translation contexts for them. All numbers specified must be hexadecimal ones. This option can be specified more than once (up to 8 times at present). ### ple\_gap > `= ` ### ple\_window > `= ` ### reboot > `= t[riple] | k[bd] | n[o] [, [w]arm | [c]old]` Default: `0` Specify the host reboot method. `warm` instructs Xen to not set the cold reboot flag. `cold` instructs Xen to set the cold reboot flag. `triple` instructs Xen to reboot the host by causing a triple fault. `kbd` instructs Xen to reboot the host via the keyboard controller. `acpi` instructs Xen to reboot the host using RESET_REG in the ACPI FADT. ### sched > `= credit | credit2 | sedf | arinc653` > Default: `sched=credit` Choose the default scheduler. ### sched\_credit2\_migrate\_resist > `= ` ### sched\_credit\_tslice\_ms > `= ` Set the timeslice of the credit1 scheduler, in milliseconds. The default is 30ms. Reasonable values may include 10, 5, or even 1 for very latency-sensitive workloads. ### sched\_ratelimit\_us > `= ` In order to limit the rate of context switching, set the minimum amount of time that a vcpu can be scheduled for before preempting it, in microseconds. The default is 1000us (1ms). Setting this to 0 disables it altogether. ### sched\_smt\_power\_savings > `= ` Normally Xen will try to maximize performance and cache utilization by spreading out vcpus across as many different divisions as possible (i.e, numa nodes, sockets, cores threads, &c). This often maximizes throughput, but also maximizes energy usage, since it reduces the depth to which a processor can sleep. This option inverts the logic, so that the scheduler in effect tries to keep the vcpus on the smallest amount of silicon possible; i.e., first fill up sibling threads, then sibling cores, then sibling sockets, &c. 
This will reduce performance somewhat, particularly on systems with hyperthreading enabled, but should reduce power by enabling more sockets and cores to go into deeper sleep states. ### serial\_tx\_buffer > `= ` > Default: `16kB` Set the serial transmit buffer size. ### smep > `= ` > Default: `true` Flag to enable Supervisor Mode Execution Protection ### snb\_igd\_quirk > `= ` ### sync\_console > `= ` > Default: `false` Flag to force synchronous console output. Useful for debugging, but not suitable for production environments due to incurred overhead. ### tboot > `= 0x` Specify the physical address of the trusted boot shared page. ### tbuf\_size > `= ` Specify the per-cpu trace buffer size in pages. ### tdt > `= ` > Default: `true` Flag to enable TSC deadline as the APIC timer mode. ### tevt\_mask > `= ` Specify a mask for Xen event tracing. This allows Xen tracing to be enabled at boot. Refer to the xentrace(8) documentation for a list of valid event mask values. In order to enable tracing, a buffer size (in pages) must also be specified via the tbuf\_size parameter. ### tickle\_one\_idle\_cpu > `= ` ### timer\_slop > `= ` ### tmem > `= ` ### tmem\_compress > `= ` ### tmem\_dedup > `= ` ### tmem\_lock > `= ` ### tmem\_shared\_auth > `= ` ### tmem\_tze > `= ` ### tsc > `= unstable | skewed` ### ucode > `= [ | scan]` Specify how and where to find CPU microcode update blob. 'integer' specifies the CPU microcode update blob module index. When positive, this specifies the n-th module (in the GrUB entry, zero based) to be used for updating CPU micrcode. When negative, counting starts at the end of the modules in the GrUB entry (so with the blob commonly being last, one could specify `ucode=-1`). Note that the value of zero is not valid here (entry zero, i.e. the first module, is always the Dom0 kernel image). Note further that use of this option has an unspecified effect when used with xen.efi (there the concept of modules doesn't exist, and the blob gets specified via the `ucode=` config file/section entry; see [EFI configuration file description](efi.html)). 'scan' instructs the hypervisor to scan the multiboot images for an cpio image that contains microcode. Depending on the platform the blob with the microcode in the cpio name space must be: - on Intel: kernel/x86/microcode/GenuineIntel.bin - on AMD : kernel/x86/microcode/AuthenticAMD.bin ### unrestricted\_guest > `= ` ### vcpu\_migration\_delay > `= ` > Default: `0` Specify a delay, in microseconds, between migrations of a VCPU between PCPUs when using the credit1 scheduler. This prevents rapid fluttering of a VCPU between CPUs, and reduces the implicit overheads such as cache-warming. 1ms (1000) has been measured as a good value. ### vesa-map > `= ` ### vesa-mtrr > `= ` ### vesa-ram > `= ` ### vga > `= ( ask | current | text-80x | gfx-xx | mode- )[,keep]` `ask` causes Xen to display a menu of available modes and request the user to choose one of them. `current` causes Xen to use the graphics adapter in its current state, without further setup. `text-80x` instructs Xen to set up text mode. Valid values for `` are `25, 28, 30, 34, 43, 50, 80` `gfx-xx` instructs Xen to set up graphics mode with the specified width, height and depth. `mode-` instructs Xen to use a specific mode, as shown with the `ask` option. (N.B menu modes are displayed in hex, so `` should be a hexadecimal number) The optional `keep` parameter causes Xen to continue using the vga console even after dom0 has been started. 
The default behaviour is to relinquish control to dom0. ### vpid (Intel) > `= ` > Default: `true` Use Virtual Processor ID support if available. This prevents the need for TLB flushes on VM entry and exit, increasing performance. ### vpmu > `= ( bts )` > Default: `off` Switch on the virtualized performance monitoring unit for HVM guests. If the current cpu isn't supported a message like 'VPMU: Initialization failed. ...' is printed on the hypervisor serial log. For some Intel Nehalem processors a quirk handling exist for an unknown wrong behaviour (see handle\_pmc\_quirk()). If 'vpmu=bts' is specified the virtualisation of the Branch Trace Store (BTS) feature is switched on on Intel processors supporting this feature. *Warning:* As the BTS virtualisation is not 100% safe and because of the nehalem quirk don't use the vpmu flag on production systems with Intel cpus! ### watchdog > `= ` > Default: `false` Run an NMI watchdog on each processor. If a processor is stuck for longer than the **watchdog\_timeout**, a panic occurs. ### watchdog\_timeout > `= ` > Default: `5` Set the NMI watchdog timeout in seconds. Specifying `0` will turn off the watchdog. ### x2apic > `= ` > Default: `true` Permit use of x2apic setup for SMP environments. ### x2apic\_phys > `= ` > Default: `true` Use the x2apic physical apic driver. The alternative is the x2apic cluster driver. ### xsave > `= ` > Default: `true` Permit use of the `xsave/xrstor` instructions. xen-4.4.0/docs/misc/pvrdtscp.c0000664000175000017500000002164612307313555014350 0ustar smbsmb/* pvrdtscp algorithm * * This sample code demonstrates the use of the paravirtualized rdtscp * algorithm. Using this algorithm, an application may communicate with * the Xen hypervisor (version 4.0+) to obtain timestamp information which * is both monotonically increasing and has a fixed 1 GHz rate, even across * migrations between machines with different TSC rates and offsets. * Further,the algorithm provides performance near the performance of a * native rdtsc/rdtscp instruction -- much faster than emulation PROVIDED * the application is running on a machine on which the rdtscp instruction * is supported and TSC is "safe". The application must also be running in a * PV domain. (HVM domains may be supported at a later time.) On machines * where TSC is unsafe or the rdtscp instruction is not supported, Xen * (v4.0+) provides emulation which is slower but consistent with the pvrdtscp * algorithm, thus providing support for the algorithm for live migration * across all machines. * * More information can be found within the Xen (4.0+) source tree at * docs/misc/tscmode.txt * * Copyright (c) 2009 Oracle Corporation and/or its affiliates. * All rights reserved * Written by: Dan Magenheimer * * This code is derived from code licensed under the GNU * General Public License ("GPL") version 2 and is therefore itself * also licensed under the GPL version 2. 
* * This code is known to compile and run on Oracle Enterprise Linux 5 Update 2 * using gcc version 4.1.2, but its purpose is to describe the pvrdtscp * algorithm and its ABI to Xen version 4.0+ */ #include #include #include #include #ifdef __LP64__ #define __X86_64__ typedef unsigned short u16; typedef unsigned int u32; typedef unsigned long u64; typedef int i32; typedef long i64; #define NSEC_PER_SEC 1000000000 #else #define __X86_32__ typedef unsigned int u16; typedef unsigned long u32; typedef unsigned long long u64; typedef long i32; typedef long long i64; #define NSEC_PER_SEC 1000000000L #endif static inline void hvm_cpuid(u32 idx, u32 sub, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { *eax = idx, *ecx = sub; asm("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (*eax), "2" (*ecx)); } static inline void pv_cpuid(u32 idx, u32 sub, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { *eax = idx, *ecx = sub; asm volatile ( "ud2a ; .ascii \"xen\"; cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (*eax), "2" (*ecx)); } static inline u64 do_rdtscp(u32 *aux) { static u64 last = 0; u32 lo32, hi32; u64 val; asm volatile(".byte 0x0f,0x01,0xf9":"=a"(lo32),"=d"(hi32),"=c" (*aux)); val = lo32 | ((u64)hi32 << 32); return val; } static inline int get_xen_tsc_mode(void) { u32 val, dummy1, dummy2, dummy3; pv_cpuid(0x40000003,0,&dummy1,&val,&dummy2,&dummy3); return val; } static inline int get_xen_vtsc(void) { u32 val, dummy1, dummy2, dummy3; pv_cpuid(0x40000003,0,&val,&dummy1,&dummy2,&dummy3); return val & 1; } static inline int get_xen_vtsc_khz(void) { u32 val, dummy1, dummy2, dummy3; pv_cpuid(0x40000003,0,&dummy1,&dummy2,&val,&dummy3); return val; } static inline u32 get_xen_cpu_khz(void) { u32 cpu_khz, dummy1, dummy2, dummy3; pv_cpuid(0x40000003,2,&cpu_khz,&dummy1,&dummy2,&dummy3); return cpu_khz; } static inline u32 get_xen_incarnation(void) { u32 incarn, dummy1, dummy2, dummy3; pv_cpuid(0x40000003,0,&dummy1,&dummy2,&dummy3,&incarn); return incarn; } static inline void get_xen_time_values(u64 *offset, u32 *mul_frac, u32 *shift) { u32 off_lo, off_hi, sys_lo, sys_hi, dummy; pv_cpuid(0x40000003,1,&off_lo,&off_hi,mul_frac,shift); *offset = off_lo | ((u64)off_hi << 32); } static inline u64 scale_delta(u64 delta, u32 tsc_mul_frac, i32 tsc_shift) { u64 product; #ifdef __X86_32__ u32 tmp1, tmp2; #endif if ( tsc_shift < 0 ) delta >>= -tsc_shift; else delta <<= tsc_shift; #ifdef __X86_32__ asm ( "mul %5 ; " "mov %4,%%eax ; " "mov %%edx,%4 ; " "mul %5 ; " "xor %5,%5 ; " "add %4,%%eax ; " "adc %5,%%edx ; " : "=A" (product), "=r" (tmp1), "=r" (tmp2) : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (tsc_mul_frac) ); #else asm ( "mul %%rdx ; shrd $32,%%rdx,%%rax" : "=a" (product) : "0" (delta), "d" ((u64)tsc_mul_frac) ); #endif return product; } static inline u64 get_pvrdtscp_timestamp(int *discontinuity) { static int firsttime = 1; static u64 last_pvrdtscp_timestamp = 0; static u32 last_tsc_aux; static u64 xen_ns_offset; static u32 xen_tsc_to_ns_mul_frac, xen_tsc_to_ns_shift; u32 this_tsc_aux; u64 timestamp, cur_tsc, cur_ns; if (firsttime) { cur_tsc = do_rdtscp(&last_tsc_aux); get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac, &xen_tsc_to_ns_shift); cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac, xen_tsc_to_ns_shift); timestamp = cur_ns - xen_ns_offset; last_pvrdtscp_timestamp = timestamp; firsttime = 0; } cur_tsc = do_rdtscp(&this_tsc_aux); *discontinuity = 0; while (this_tsc_aux != last_tsc_aux) { /* if tsc_aux changed, try again */ last_tsc_aux = this_tsc_aux; 
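        /*
         * TSC_AUX changed between the two reads: the domain is in a new
         * "incarnation" (e.g. it was saved/restored or live migrated), so
         * the offset and scaling values previously obtained from Xen may be
         * stale.  Refresh them and take a fresh TSC sample before computing
         * the timestamp.
         */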
get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac, &xen_tsc_to_ns_shift); cur_tsc = do_rdtscp(&this_tsc_aux); *discontinuity = 1; } /* compute nsec from TSC and Xen time values */ cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac, xen_tsc_to_ns_shift); timestamp = cur_ns - xen_ns_offset; /* enforce monotonicity just in case */ if ((i64)(timestamp - last_pvrdtscp_timestamp) > 0) last_pvrdtscp_timestamp = timestamp; else { /* this should never happen but we'll check it anyway in * case of some strange combination of scaling errors * occurs across a very fast migration */ printf("Time went backwards by %lluns\n", (unsigned long long)(last_pvrdtscp_timestamp-timestamp)); timestamp = ++last_pvrdtscp_timestamp; } return timestamp; } #define HVM 1 #define PVM 0 static int running_on_xen(int hvm, u16 *version_major, u16 *version_minor) { u32 eax, ebx, ecx, edx, base; union { char csig[16]; u32 u[4]; } sig; for (base=0x40000000; base < 0x40010000; base += 0x100) { if (hvm==HVM) hvm_cpuid(base,0,&eax,&ebx,&ecx,&edx); else pv_cpuid(base,0,&eax,&ebx,&ecx,&edx); sig.u[0] = ebx; sig.u[1] = ecx; sig.u[2] = edx; sig.csig[12] = '\0'; if (!strcmp("XenVMMXenVMM",&sig.csig[0]) && (eax >= (base+2))) { if (hvm==HVM) hvm_cpuid(base+1,0,&eax,&ebx,&ecx,&edx); else pv_cpuid(base+1,0,&eax,&ebx,&ecx,&edx); *version_major = (eax >> 16) & 0xffff; *version_minor = eax & 0xffff; return 1; } } return 0; } main(int ac, char **av) { u32 dummy; u16 version_hi, version_lo; u64 ts, last_ts; int status, discontinuity = 0; pid_t pid; if (running_on_xen(HVM,&version_hi,&version_lo)) { printf("running on Xen v%d.%d as an HVM domain, " "pvrdtsc not supported, exiting\n", (int)version_hi, (int)version_lo); exit(0); } pid = fork(); if (pid == -1) { fprintf(stderr,"Huh? Fork failed\n"); return 0; } else if (pid == 0) { /* child */ pv_cpuid(0x40000000,0,&dummy,&dummy,&dummy,&dummy); exit(0); } waitpid(pid,&status,0); if (!WIFEXITED(status)) exit(0); if (!running_on_xen(PVM,&version_hi,&version_lo)) { printf("not running on Xen, exiting\n"); exit(0); } printf("running on Xen v%d.%d as a PV domain\n", (int)version_hi, (int)version_lo); if ( version_hi <= 3 ) { printf("pvrdtscp requires Xen version 4.0 or greater\n"); /* exit(0); FIXME after xen-unstable is officially v4.0 */ } if ( get_xen_tsc_mode() != 3 ) printf("tsc_mode not pvrdtscp, set tsc_mode=3, exiting\n"); /* OK, we are on Xen, now loop forever checking timestamps */ ts = get_pvrdtscp_timestamp(&discontinuity); printf("Starting with ts=%lluns 0x%llx (%llusec)\n",ts,ts,ts/NSEC_PER_SEC); printf("incarn=%d: vtsc=%d, vtsc_khz=%lu, phys cpu_khz=%lu\n", (unsigned long)get_xen_incarnation(), (unsigned long)get_xen_vtsc(), (unsigned long)get_xen_vtsc_khz(), (unsigned long)get_xen_cpu_khz()); ts = get_pvrdtscp_timestamp(&discontinuity); last_ts = ts; while (1) { ts = get_pvrdtscp_timestamp(&discontinuity); if (discontinuity) printf("migrated/restored, incarn=%d: " "vtsc now %d, vtsc_khz=%lu, phys cpu_khz=%lu\n", (unsigned long)get_xen_incarnation(), (unsigned long)get_xen_vtsc(), (unsigned long)get_xen_vtsc_khz(), (unsigned long)get_xen_cpu_khz()); if (ts < last_ts) /* this should NEVER happen, especially since there * is a check for it in get_pvrdtscp_timestamp() */ printf("Time went backwards: %lluns (%llusec)\n", last_ts-ts,(last_ts-ts)/NSEC_PER_SEC); if (ts > last_ts + 200000000LL) /* this is OK, usually about 2sec for save/restore * and a fraction of a second for live migrate */ printf("Time jumped forward %lluns (%llusec)\n", ts-last_ts,(ts-last_ts)/NSEC_PER_SEC); 
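        /*
         * Record this sample so that the next iteration can detect time
         * apparently going backwards, or jumping forward across a
         * save/restore or live migration.
         */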
last_ts = ts; } } xen-4.4.0/docs/misc/grant-tables.txt0000664000175000017500000002642212307313555015460 0ustar smbsmb******************************************************************************** A Rough Introduction to Using Grant Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Christopher Clark, March, 2005. Grant tables are a mechanism for sharing and transferring frames between domains, without requiring the participating domains to be privileged. The first mode of use allows domA to grant domB access to a specific frame, whilst retaining ownership. The block front driver uses this to grant memory access to the block back driver, so that it may read or write as requested. 1. domA creates a grant access reference, and transmits the ref id to domB. 2. domB uses the reference to map the granted frame. 3. domB performs the memory access. 4. domB unmaps the granted frame. 5. domA removes its grant. The second mode allows domA to accept a transfer of ownership of a frame from domB. The net front and back driver will use this for packet tx/rx. This mechanism is still being implemented, though the xen<->guest interface design is complete. 1. domA creates an accept transfer grant reference, and transmits it to domB. 2. domB uses the ref to hand over a frame it owns. 3. domA accepts the transfer 4. domA clears the used reference. ******************************************************************************** Data structures ~~~~~~~~~~~~~~~ The following data structures are used by Xen and the guests to implement grant tables: 1. Shared grant entries 2. Active grant entries 3. Map tracking These are not the users primary interface to grant tables, but are discussed because an understanding of how they work may be useful. Each of these is a finite resource. Shared grant entries ~~~~~~~~~~~~~~~~~~~~ A set of pages are shared between Xen and a guest, holding the shared grant entries. The guest writes into these entries to create grant references. The index of the entry is transmitted to the remote domain: this is the reference used to activate an entry. Xen will write into a shared entry to indicate to a guest that its grant is in use. sha->domid : remote domain being granted rights sha->frame : machine frame being granted sha->flags : allow access, allow transfer, remote is reading/writing, etc. Active grant entries ~~~~~~~~~~~~~~~~~~~~ Xen maintains a set of private frames per domain, holding the active grant entries for safety, and to reference count mappings. act->domid : remote domain being granted rights act->frame : machine frame being granted act->pin : used to hold reference counts Map tracking ~~~~~~~~~~~~ Every time a frame is mapped, a map track entry is stored in the metadata of the mapping domain. The index of this entry is returned from the map call, and is used to unmap the frame. Map track entries are also searched whenever a page table entry containing a foreign frame number is overwritten: the first matching map track entry is then removed, as if unmap had been invoked. These are not used by the transfer mechanism. map->domid : owner of the mapped frame map->ref_and_flags : grant reference, ro/rw, mapped for host or device access ******************************************************************************** Granting a foreign domain access to frames ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ domA [frame]--> domB domA: #include grant_ref_t gref[BATCH_SIZE]; for ( i = 0; i < BATCH_SIZE; i++ ) gref[i] = gnttab_grant_foreign_access( domBid, mfn, (readonly ? 1 : 0) ); .. 
gref is then somehow transmitted to domB for use. Mapping foreign frames ~~~~~~~~~~~~~~~~~~~~~~ domB: #include unsigned long mmap_vstart; gnttab_op_t aop[BATCH_SIZE]; grant_ref_t mapped_handle[BATCH_SIZE]; if ( (mmap_vstart = allocate_empty_lowmem_region(BATCH_SIZE)) == 0 ) BUG(); for ( i = 0; i < BATCH_SIZE; i++ ) { aop[i].u.map_grant_ref.host_virt_addr = mmap_vstart + (i * PAGE_SIZE); aop[i].u.map_grant_ref.dom = domAid; aop[i].u.map_grant_ref.ref = gref[i]; aop[i].u.map_grant_ref.flags = ( GNTMAP_host_map | GNTMAP_readonly ); } if ( unlikely(HYPERVISOR_grant_table_op( GNTTABOP_map_grant_ref, aop, BATCH_SIZE))) BUG(); for ( i = 0; i < BATCH_SIZE; i++ ) { if ( unlikely(aop[i].u.map_grant_ref.handle < 0) ) { tidyup_all(aop, i); goto panic; } phys_to_machine_mapping[__pa(mmap_vstart + (i * PAGE_SIZE))>>PAGE_SHIFT] = FOREIGN_FRAME(aop[i].u.map_grant_ref.dev_bus_addr); mapped_handle[i] = aop[i].u.map_grant_ref.handle; } Unmapping foreign frames ~~~~~~~~~~~~~~~~~~~~~~~~ domB: for ( i = 0; i < BATCH_SIZE; i++ ) { aop[i].u.unmap_grant_ref.host_virt_addr = mmap_vstart + (i * PAGE_SIZE); aop[i].u.unmap_grant_ref.dev_bus_addr = 0; aop[i].u.unmap_grant_ref.handle = mapped_handle[i]; } if ( unlikely(HYPERVISOR_grant_table_op( GNTTABOP_unmap_grant_ref, aop, BATCH_SIZE))) BUG(); Ending foreign access ~~~~~~~~~~~~~~~~~~~~~ Note that this only prevents further mappings; it does _not_ revoke access. Should _only_ be used when the remote domain has unmapped the frame. gnttab_query_foreign_access( gref ) will indicate the state of any mapping. domA: if ( gnttab_query_foreign_access( gref[i] ) == 0 ) gnttab_end_foreign_access( gref[i], readonly ); TODO: readonly yet to be implemented. ******************************************************************************** Transferring ownership of a frame to another domain ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [ XXX: Transfer mechanism is alpha-calibre code, untested, use at own risk XXX ] [ XXX: show use of batch operations below, rather than single frame XXX ] [ XXX: linux internal interface could/should be wrapped to be tidier XXX ] Prepare to accept a frame from a foreign domain ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ domA: if ( (p = alloc_page(GFP_HIGHUSER)) == NULL ) { printk("Cannot alloc a frame to surrender\n"); break; } pfn = p - mem_map; mfn = phys_to_machine_mapping[pfn]; if ( !PageHighMem(p) ) { v = phys_to_virt(pfn << PAGE_SHIFT); scrub_pages(v, 1); queue_l1_entry_update(get_ptep((unsigned long)v), 0); } /* Ensure that ballooned highmem pages don't have cached mappings. */ kmap_flush_unused(); /* Flush updates through and flush the TLB. */ xen_tlb_flush(); phys_to_machine_mapping[pfn] = INVALID_P2M_ENTRY; if ( HYPERVISOR_dom_mem_op( MEMOP_decrease_reservation, &mfn, 1, 0) != 1 ) { printk("MEMOP_decrease_reservation failed\n"); /* er... ok. free the page then */ __free_page(p); break; } accepting_pfn = pfn; ref = gnttab_grant_foreign_transfer( (domid_t) args.arg[0], pfn ); printk("Accepting dom %lu frame at ref (%d)\n", args.arg[0], ref); Transfer a frame to a foreign domain ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ domB: mmu_update_t update; domid_t domid; grant_ref_t gref; unsigned long pfn, mfn, *v; struct page *transfer_page = 0; /* alloc a page and grant access. * alloc page returns a page struct. 
*/ if ( (transfer_page = alloc_page(GFP_HIGHUSER)) == NULL ) return -ENOMEM; pfn = transfer_page - mem_map; mfn = phys_to_machine_mapping[pfn]; /* need to remove all references to this page */ if ( !PageHighMem(transfer_page) ) { v = phys_to_virt(pfn << PAGE_SHIFT); scrub_pages(v, 1); sprintf((char *)v, "This page (%lx) was transferred.\n", mfn); queue_l1_entry_update(get_ptep((unsigned long)v), 0); } #ifdef CONFIG_XEN_SCRUB_PAGES else { v = kmap(transfer_page); scrub_pages(v, 1); sprintf((char *)v, "This page (%lx) was transferred.\n", mfn); kunmap(transfer_page); } #endif /* Delete any cached kmappings */ kmap_flush_unused(); /* Flush updates through and flush the TLB */ xen_tlb_flush(); /* invalidate in P2M */ phys_to_machine_mapping[pfn] = INVALID_P2M_ENTRY; domid = (domid_t)args.arg[0]; gref = (grant_ref_t)args.arg[1]; update.ptr = MMU_EXTENDED_COMMAND; update.ptr |= ((gref & 0x00FF) << 2); update.ptr |= mfn << PAGE_SHIFT; update.val = MMUEXT_TRANSFER_PAGE; update.val |= (domid << 16); update.val |= (gref & 0xFF00); ret = HYPERVISOR_mmu_update(&update, 1, NULL); Map a transferred frame ~~~~~~~~~~~~~~~~~~~~~~~ TODO: Clear the used transfer reference ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TODO: ******************************************************************************** Using a private reserve of grant references ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Where it is known in advance how many grant references are required, and failure to allocate them on demand would cause difficulty, a batch can be allocated and held in a private reserve. To reserve a private batch: /* housekeeping data - treat as opaque: */ grant_ref_t gref_head, gref_terminal; if ( 0 > gnttab_alloc_grant_references( number_to_reserve, &gref_head, &gref_terminal )) return -ENOSPC; To release a batch back to the shared pool: gnttab_free_grant_references( number_reserved, gref_head ); To claim a reserved reference: ref = gnttab_claim_grant_reference( &gref_head, gref_terminal ); To release a claimed reference back to the reserve pool: gnttab_release_grant_reference( &gref_head, gref ); To use a claimed reference to grant access, use these alternative functions that take an additional parameter of the grant reference to use: gnttab_grant_foreign_access_ref gnttab_grant_foreign_transfer_ref xen-4.4.0/docs/misc/xenstore-paths.markdown0000664000175000017500000003213512307313555017062 0ustar smbsmb# XenStore Paths This document attempts to defines all the paths which are in common use by either guests, front-/back-end drivers, toolstacks etc. The XenStore wire protocol itself is described in [xenstore.txt](xenstore.txt). ## Notation This document is intended to be partially machine readable, such that test system etc can use it to validate whether the xenstore paths used by a test are allowable etc. Therefore the following notation conventions apply: A xenstore path is generically defined as: PATH = VALUES [TAGS] PATH/* [TAGS] The first syntax defines a simple path with a single value. The second syntax defines an aggregated set of paths which are usually described externally to this document. The text will give a pointer to the appropriate external documentation. PATH can contain simple regex constructs following the Perl compatible regexp syntax described in pcre(3) or perlre(1). In addition the following additional wild card names are defined and are evaluated before regexp expansion: * ~ -- expands to an arbitrary a domain's home path (described below). Only valid at the begining of a path. 
* $DEVID -- a per-device type device identifier. Typically an integer. * $DOMID -- a domain identifier, an integer. Typically this refers to the "other" domain. i.e. ~ refers to the domain providing a service while $DOMID is the consumer of that service. * $UUID -- a UUID in the form xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx VALUES are strings and can take the following forms: * PATH -- a XenStore path. * STRING -- an arbitrary string. * INTEGER -- An integer, in decimal representation unless otherwise noted. * MEMKB -- the decimal representation of a number of kilobytes. * EVTCHN -- the decimal representation of an event channel. * GNTREF -- the decimal representation of a grant reference. * "a literal string" -- literal strings are contained within quotes. * (VALUE | VALUE | ... ) -- a set of alternatives. Alternatives are separated by a "|" and all the alternatives are enclosed in "(" and ")". Additional TAGS may follow as a comma separated set of the following tags enclosed in square brackets. * w -- Path is writable by the containing domain, that is the domain whose home path ~ this key is under or which /vm/$UUID refers to. By default paths under both of these locations are read only for the domain. * n -- Path is neither readable nor writeable for guest domains. * HVM -- Path is valid for HVM domains only * PV -- Path is valid for PV domains only * BACKEND -- Path is valid for a backend domain (AKA driver domain) * INTERNAL -- Although a path is visible to the domain its use is reserved for the virtual firmware or Xen platform code. Guest Operating Systems must not read this key or otherwise rely on its presence or contents. * DEPRECATED -- This path is deprecated and may be removed in its current form in the future. Guests should not add new dependencies on such paths. Owning domain means the domain whose home path this tag is found under. Lack of either a __HVM__ or __PV__ tag indicates that the path is valid for either type of domain (including PVonHVM and similar mixed domain types). ## Domain Home Path Every domain has a home path within the xenstore hierarchy. This is the path where the majority of the domain-visible information about each domain is stored. This path is: /local/domain/$DOMID All non-absolute paths are relative to this path. Although this path could be considered a "Home Directory" for the domain it would not usually be writable by the domain. The tools will create writable subdirectories as necessary. ## Per Domain Paths ## General Paths #### ~/vm = PATH [] A pointer back to the domain's /vm/$UUID path (described below). #### ~/name = STRING [] The guests name. #### ~/domid = INTEGER [] The domain's own ID. #### ~/image/device-model-pid = INTEGER [INTERNAL] The process ID of the device model associated with this domain, if it has one. #### ~/cpu/[0-9]+/availability = ("online"|"offline") [PV] One node for each virtual CPU up to the guest's configured maximum. Valid values are "online" and "offline". The guest is expected to react to changes in this path by bringing the appropriate VCPU on or offline using the VCPUOP interface described in [xen/include/public/vcpu.h][VCPU] This protocol is not currently well documented. #### ~/memory/static-max = MEMKB [] Specifies a static maximum amount memory which this domain should expect to be given. In the absence of in-guest memory hotplug support this set on domain boot and is usually the maximum amount of RAM which a guest can make use of. 
See [docs/misc/libxl_memory.txt][LIBXLMEM] for a description of how memory is accounted for in toolstacks using the libxl library. #### ~/memory/target = MEMKB [] The current balloon target for the domain. The balloon driver within the guest is expected to make every effort to every effort use no more than this amount of RAM. #### ~/memory/videoram = MEMKB [HVM,INTERNAL] The size of the video RAM this domain is configured with. #### ~/device/suspend/event-channel = ""|EVTCHN [w] The domain's suspend event channel. The toolstack will create this path with an empty value which the guest may choose to overwrite. If the guest overwrites this, it will be with the number of an unbound event channel port it has acquired. The toolstack is expected to use an interdomain bind, and then, when it wishes to ask the guest to suspend, to signal the event channel. The guest does not need to explicitly acknowledge the request; indeed, there is no explicit signalling by the guest in the reverse direction. The guest, when it is ready, simply shuts down (`SCHEDOP_shutdown`) with reason code `SHUTDOWN_suspend`. The toolstack is expected to use `XEN_DOMCTL_subscribe` to be alerted to guest state changes, and `XEN_SYSCTL_getdomaininfolist` to verify that the domain has suspended. Note that the use of this event channel suspend protocol is optional for both sides. By writing a non-empty string to the node, the guest is advertising its support. However, the toolstack is at liberty to use the xenstore-based protocol instead (see ~/control/shutdown, below) even if the guest has advertised support for the event channel protocol. #### ~/hvmloader/generation-id-address = ADDRESS [r,HVM,INTERNAL] The hexadecimal representation of the address of the domain's "generation id". #### ~/hvmloader/allow-memory-relocate = ("1"|"0") [HVM,INTERNAL] If the default low MMIO hole (below 4GiB) is not big enough for all the devices, this indicates if hvmloader should relocate guest memory into the high memory region (above 4GiB). If "1", hvmloader will relocate memory as needed, until 2GiB is reached; if "0", hvmloader will not relocate guest memory. #### ~/hvmloader/bios = ("rombios"|"seabios"|"OVMF") [HVM,INTERNAL] The BIOS used by this domain. #### ~/platform/* [HVM,INTERNAL] Various platform properties. * acpi -- is ACPI enabled for this domain * acpi_s3 -- is ACPI S3 support enabled for this domain * acpi_s4 -- is ACPI S4 support enabled for this domain #### ~/platform/generation-id = INTEGER ":" INTEGER [HVM,INTERNAL] Two 64 bit values that represent the Windows Generation ID. Is used by the BIOS initializer to get this value. If not present or "0:0" (all zeroes) device will not be present to the machine. ### Frontend device paths Paravirtual device frontends are generally specified by their own directory within the XenStore hierarchy. Usually this is under ~/device/$TYPE/$DEVID although there are exceptions, e.g. ~/console for the first PV console. #### ~/device/vbd/$DEVID/* [] A virtual block device frontend. Described by [xen/include/public/io/blkif.h][BLKIF] #### ~/device/vfb/$DEVID/* [] A virtual framebuffer frontend. Described by [xen/include/public/io/fbif.h][FBIF] #### ~/device/vkbd/$DEVID/* [] A virtual keyboard device frontend. Described by [xen/include/public/io/kbdif.h][KBDIF] #### ~/device/vif/$DEVID/* [] A virtual network device frontend. Described by [xen/include/public/io/netif.h][NETIF] #### ~/console/* [] The primary PV console device. 
Described in [console.txt](console.txt) #### ~/device/console/$DEVID/* [] A secondary PV console device. Described in [console.txt](console.txt) #### ~/device/serial/$DEVID/* [HVM] An emulated serial device. Described in [console.txt](console.txt) #### ~/store/port = EVTCHN [DEPRECATED] The event channel used by the domain's connection to XenStore. This path is deprecated since the same information is provided via the [start_info][SI] for PV guests and as an [HVM param][HVMPARAMS] for HVM guests. There is an obvious chicken and egg problem with extracting this value from xenstore in order to setup the xenstore communication ring. #### ~/store/ring-ref = GNTREF [DEPRECATED] The grant reference of the domain's XenStore ring. As with ~/store/port this path is deprecated. ### Backend Device Paths Paravirtual device backends are generally specified by their own directory within the XenStore hierarchy. Usually this is under ~/backend/$TYPE/$DOMID/$DEVID. #### ~/backend/vbd/$DOMID/$DEVID/* [] A virtual block device backend. Described by [xen/include/public/io/blkif.h][BLKIF] Uses the in-kernel blkback driver. #### ~/backend/qdisk/$DOMID/$DEVID/* [] A virtual block device backend. Described by [xen/include/public/io/blkif.h][BLKIF] Uses the qemu based disk backend. #### ~/backend/tap/$DOMID/$DEVID/* [] A virtual block device backend. Described by [xen/include/public/io/blkif.h][BLKIF] Uses the in-kernel blktap (v1) disk backend (deprecated). #### ~/backend/vfb/$DOMID/$DEVID/* [] A virtual framebuffer backend. Described by [xen/include/public/io/fbif.h][FBIF] #### ~/backend/vkbd/$DOMID/$DEVID/* [] A virtual keyboard device backend. Described by [xen/include/public/io/kbdif.h][KBDIF] #### ~/backend/vif/$DOMID/$DEVID/* [] A virtual network device backend. Described by [xen/include/public/io/netif.h][NETIF] #### ~/backend/console/$DOMID/$DEVID/* [] A PV console backend. Described in [console.txt](console.txt) #### ~/device-model/$DOMID/* [INTERNAL] Information relating to device models running in the domain. $DOMID is the target domain of the device model. #### ~/libxl/disable_udev = ("1"|"0") [] Indicates whether device hotplug scripts in this domain should be run by udev ("0") or will be run by the toolstack directly ("1"). ### Platform Feature and Control Paths #### ~/control/shutdown = (""|COMMAND) [w] This is the PV shutdown control node. A toolstack can write various commands here to cause various guest shutdown, reboot or suspend activities. The guest acknowledges a request by writing the empty string back to the command node. The precise protocol is not yet documented. #### ~/control/platform-feature-multiprocessor-suspend = (0|1) [] Indicates to the guest that this platform supports the multiprocessor suspend feature. #### ~/control/platform-feature-xs\_reset\_watches = (0|1) [] Indicates to the guest that this platform supports the XS_RESET_WATCHES xenstore message. See [xen/include/public/io/xs\_wire.h][XSWIRE] for the XenStore wire protocol definition. ### Domain Controlled Paths #### ~/data/* [w] A domain writable path. Available for arbitrary domain use. ### Paths private to the toolstack #### ~/device-model/$DOMID/state [w] Contains the status of the device models running on the domain. #### ~/libxl/$DOMID/qdisk-backend-pid [w] Contains the PIDs of the device models running on the domain. 
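As a rough illustration of how a toolstack or management daemon consumes the per-domain paths above, the sketch below uses the libxenstore C API (assuming the conventional `xs_open`/`xs_read`/`xs_write` calls exported via `xenstore.h`) to read a domain's balloon target and to request a PV shutdown through `~/control/shutdown`. The domain ID, the `poweroff` command string and the minimal error handling are illustrative only.

    /* Minimal sketch, not a complete toolstack.  Assumes the conventional
     * libxenstore interface; adjust the header name and domain ID to match
     * the local installation. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <xenstore.h>

    int main(void)
    {
        struct xs_handle *xsh = xs_open(0);   /* connect to xenstored */
        unsigned int len;
        char *target;

        if (!xsh)
            return 1;

        /* "~" for domain 5 expands to /local/domain/5 */
        target = xs_read(xsh, XBT_NULL, "/local/domain/5/memory/target", &len);
        if (target) {
            printf("balloon target: %s KiB\n", target);
            free(target);
        }

        /* Ask the guest to shut down via the PV shutdown control node.
         * "poweroff" is one commonly used command; the exact command set
         * is defined by the toolstack, not by this document. */
        if (!xs_write(xsh, XBT_NULL, "/local/domain/5/control/shutdown",
                      "poweroff", strlen("poweroff")))
            fprintf(stderr, "failed to write control/shutdown\n");

        xs_close(xsh);
        return 0;
    }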
## Virtual Machine Paths The /vm/$UUID namespace is used by toolstacks to store various information relating to the domain which is not intended to be guest visible (hence they are all tagged [n,INTERNAL]). Several of the keys here are not well defined and/or not well located and are liable to be replaced with more fully defined paths in the future. ### /vm/$UUID/uuid = UUID [n,INTERNAL] Value is the same UUID as the path. ### /vm/$UUID/name = STRING [n,INTERNAL] The domain's name. ### /vm/$UUID/image/* [n,INTERNAL] Various information relating to the domain builder used for this guest. ### /vm/$UUID/start_time = INTEGER "." INTEGER [n,INTERNAL] The time which the guest was started in SECONDS.MICROSECONDS format ### /vm/$UUID/rtc/timeoffset = ""|INTEGER [n,HVM,INTERNAL] The guest's virtual time offset from UTC in seconds. ## Platform-Level paths ### libxl Specific Paths #### /libxl/$DOMID/dm-version ("qemu\_xen"|"qemu\_xen\_traditional") = [n,INTERNAL] The device model version for a domain. [BLKIF]: http://xenbits.xen.org/docs/unstable/hypercall/include,public,io,blkif.h.html [FBIF]: http://xenbits.xen.org/docs/unstable/hypercall/include,public,io,fbif.h.html [HVMPARAMS]: http://xenbits.xen.org/docs/unstable/hypercall/include,public,hvm,params.h.html [KBDIF]: http://xenbits.xen.org/docs/unstable/hypercall/include,public,io,kbdif.h.html [LIBXLMEM]: http://xenbits.xen.org/docs/unstable/misc/libxl_memory.txt [NETIF]: http://xenbits.xen.org/docs/unstable/hypercall/include,public,io,netif.h.html [SI]: http://xenbits.xen.org/docs/unstable/hypercall/include,public,xen.h.html#Struct_start_info [VCPU]: http://xenbits.xen.org/docs/unstable/hypercall/include,public,vcpu.h.html [XSWIRE]: http://xenbits.xen.org/docs/unstable/hypercall/include,public,io,xs_wire.h.html xen-4.4.0/docs/misc/tscmode.txt0000664000175000017500000004014512307313555014531 0ustar smbsmbTSC_MODE HOW-TO by: Dan Magenheimer OVERVIEW As of Xen 4.0, a new config option called tsc_mode may be specified for each domain. The default for tsc_mode handles the vast majority of hardware and software environments. This document is targeted for Xen users and administrators that may need to select a non-default tsc_mode. Proper selection of tsc_mode depends on an understanding not only of the guest operating system (OS), but also of the application set that will ever run on this guest OS. This is because tsc_mode applies equally to both the OS and ALL apps that are running on this domain, now or in the future. Key questions to be answered for the OS and/or each application are: - Does the OS/app use the rdtsc instruction at all? (We will explain below how to determine this.) - At what frequency is the rdtsc instruction executed by either the OS or any running apps? If the sum exceeds about 10,000 rdtsc instructions per second per processor, we call this a "high-TSC-frequency" OS/app/environment. (This is relatively rare, and developers of OS's and apps that are high-TSC-frequency are usually aware of it.) - If the OS/app does use rdtsc, will it behave incorrectly if "time goes backwards" or if the frequency of the TSC suddenly changes? If so, we call this a "TSC-sensitive" app or OS; otherwise it is "TSC-resilient". This last is the US$64,000 question as it may be very difficult (or, for legacy apps, even impossible) to predict all possible failure cases. As a result, unless proven otherwise, any app that uses rdtsc must be assumed to be TSC-sensitive and, as we will see, this is the default starting in Xen 4.0. 
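As a concrete (and deliberately simplified) illustration, the fragment below shows the kind of pattern that makes an application TSC-sensitive: it reads the TSC directly and converts the delta to wall-clock time using a hard-coded frequency. The helper names and the assumed 3 GHz rate are invented for this sketch; real applications differ, but the failure mode is the same if the TSC rate changes or the counter moves backwards after a migration.

/* Illustrative sketch only -- not part of Xen.  This is the sort of code
 * that is "TSC-sensitive": it assumes the TSC is monotonic and ticks at a
 * fixed, compile-time-known rate. */
#include <stdint.h>

#define ASSUMED_TSC_HZ 3000000000ULL        /* invented for this example */

static inline uint64_t rdtsc(void)
{
    uint32_t lo, hi;
    asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}

/* Elapsed microseconds since start_tsc -- wrong if the TSC frequency is
 * not ASSUMED_TSC_HZ, and may appear to go "backwards" on TSC-unsafe
 * hardware or after migration to a different machine, unless Xen
 * emulates or scales the TSC. */
uint64_t elapsed_us(uint64_t start_tsc)
{
    return (rdtsc() - start_tsc) / (ASSUMED_TSC_HZ / 1000000ULL);
}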
Xen's new tsc_mode parameter determines the circumstances under which the family of rdtsc instructions are executed "natively" vs emulated. Roughly speaking, native means rdtsc is fast but TSC-sensitive apps may, under unpredictable circumstances, run incorrectly; emulated means there is some performance degradation (unobservable in most cases), but TSC-sensitive apps will always run correctly. Prior to Xen 4.0, all rdtsc instructions were native: "fast but potentially incorrect." Starting at Xen 4.0, the default is that all rdtsc instructions are "correct but potentially slow". The tsc_mode parameter in 4.0 provides an intelligent default but allows system administrator's to adjust how rdtsc instructions are executed differently for different domains. The non-default choices for tsc_mode are: - tsc_mode=1 (always emulate). All rdtsc instructions are emulated; this is the best choice when TSC-sensitive apps are running and it is necessary to understand worst-case performance degradation for a specific hardware environment. - tsc_mode=2 (never emulate). This is the same as prior to Xen 4.0 and is the best choice if it is certain that all apps running in this VM are TSC-resilient and highest performance is required. - tsc_mode=3 (PVRDTSCP). High-TSC-frequency apps may be paravirtualized (modified) to obtain both correctness and highest performance; any unmodified apps must be TSC-resilient. If tsc_mode is left unspecified (or set to tsc_mode=0), a hybrid algorithm is utilized to ensure correctness while providing the best performance possible given: - the requirement of correctness, - the underlying hardware, and - whether or not the VM has been saved/restored/migrated To understand this in more detail, the rest of this document must be read. DETERMINING RDTSC FREQUENCY To determine the frequency of rdtsc instructions that are emulated, an "xm" command can be used by a privileged user of domain0. The command: # xm debug-key s; xm dmesg | tail provides information about TSC usage in each domain where TSC emulation is currently enabled. TSC HISTORY To understand tsc_mode completely, some background on TSC is required: The x86 "timestamp counter", or TSC, is a 64-bit register on each processor that increases monotonically. Historically, TSC incremented every processor cycle, but on recent processors, it increases at a constant rate even if the processor changes frequency (for example, to reduce processor power usage). TSC is known by x86 programmers as the fastest, highest-precision measurement of the passage of time so it is often used as a foundation for performance monitoring. And since it is guaranteed to be monotonically increasing and, at 64 bits, is guaranteed to not wraparound within 10 years, it is sometimes used as a random number or a unique sequence identifier, such as to stamp transactions so they can be replayed in a specific order. On most older SMP and early multi-core machines, TSC was not synchronized between processors. Thus if an application were to read the TSC on one processor, then was moved by the OS to another processor, then read TSC again, it might appear that "time went backwards". This loss of monotonicity resulted in many obscure application bugs when TSC-sensitive apps were ported from a uniprocessor to an SMP environment; as a result, many applications -- especially in the Windows world -- removed their dependency on TSC and replaced their timestamp needs with OS-specific functions, losing both performance and precision. 
On some more recent generations of multi-core machines, especially multi-socket multi-core machines, the TSC was synchronized but if one processor were to enter certain low-power states, its TSC would stop, destroying the synchrony and again causing obscure bugs. This reinforced decisions to avoid use of TSC altogether. On the most recent generations of multi-core machines, however, synchronization is provided across all processors in all power states, even on multi-socket machines, and provide a flag that indicates that TSC is synchronized and "invariant". Thus TSC is once again useful for applications, and even newer operating systems are using and depending upon TSC for critical timekeeping tasks when running on these recent machines. We will refer to hardware that ensures TSC is both synchronized and invariant as "TSC-safe" and any hardware on which TSC is not (or may not remain) synchronized as "TSC-unsafe". As a result of TSC's sordid history, two classes of applications use TSC: old applications designed for single processors, and the most recent enterprise applications which require high-frequency high-precision timestamping. We will refer to apps that might break if running on a TSC-unsafe machine as "TSC-sensitive"; apps that don't use TSC, or do use TSC but use it in a way that monotonicity and frequency invariance are unimportant as "TSC-resilient". The emergence of virtualization once again complicates the usage of TSC. When features such as save/restore or live migration are employed, a guest OS and all its currently running applications may be invisibly transported to an entirely different physical machine. While TSC may be "safe" on one machine, it is essentially impossible to precisely synchronize TSC across a data center or even a pool of machines. As a result, when run in a virtualized environment, rare and obscure "time going backwards" problems might once again occur for those TSC-sensitive applications. Worse, if a guest OS moves from, for example, a 3GHz machine to a 1.5GHz machine, attempts by an OS/app to measure time intervals with TSC may without notice be incorrect by a factor of two. The rdtsc (read timestamp counter) instruction is used to read the TSC register. The rdtscp instruction is a variant of rdtsc on recent processors. We refer to these together as the rdtsc family of instructions, or just "rdtsc". Instructions in the rdtsc family are non-privileged, but privileged software may set a cpuid bit to cause all rdtsc family instructions to trap. This trap can be detected by Xen, which can then transparently "emulate" the results of the rdtsc instruction and return control to the code following the rdtsc instruction. To provide a "safe" TSC, i.e. to ensure both TSC monotonicity and a fixed rate, Xen provides rdtsc emulation whenever necessary or when explicitly specified by a per-VM configuration option. TSC emulation is relatively slow -- roughly 15-20 times slower than the rdtsc instruction when executed natively. However, except when an OS or application uses the rdtsc instruction at a high frequency (e.g. more than about 10,000 times per second per processor), this performance degradation is not noticeable (i.e. <0.3%). And, TSC emulation is nearly always faster than OS-provided alternatives (e.g. Linux's gettimeofday). For environments where it is certain that all apps are TSC-resilient (e.g. "TSC-safeness" is not necessary) and highest performance is a requirement, TSC emulation may be entirely disabled (tsc_mode==2). 
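The mode is set per-domain in the guest configuration file. An illustrative fragment is shown below; the numeric values are those described in this document, and the xl toolstack also accepts descriptive names such as "native" and "always_emulate" (see xl.cfg(5) for the exact spellings supported by your version):

    tsc_mode=2        # never emulate: every app in this VM is TSC-resilient
    # or
    tsc_mode=1        # always emulate, e.g. to measure worst-case overhead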
The default mode (tsc_mode==0) checks TSC-safeness of the underlying hardware on which the virtual machine is launched. If it is TSC-safe, rdtsc will execute at hardware speed; if it is not, rdtsc will be emulated. Once a virtual machine is save/restored or migrated, however, there are two possibilities: For a paravirtualized (PV) domain, TSC will always be emulated. For a fully-virtualized (HVM) domain, TSC remains native IF the source physical machine and target physical machine have the same TSC frequency; else TSC is emulated. Note that, though emulated, the "apparent" TSC frequency will be the TSC frequency of the initial physical machine, even after migration. For environments where both TSC-safeness AND highest performance even across migration is a requirement, application code can be specially modified to use an algorithm explicitly designed into Xen for this purpose. This mode (tsc_mode==3) is called PVRDTSCP, because it requires app paravirtualization (awareness by the app that it may be running on top of Xen), and utilizes a variation of the rdtsc instruction called rdtscp that is available on most recent generation processors. (The rdtscp instruction differs from the rdtsc instruction in that it reads not only the TSC but an additional register set by system software.) When a pvrdtscp-modified app is running on a processor that is both TSC-safe and supports the rdtscp instruction, information can be obtained about migration and TSC frequency/offset adjustment to allow the vast majority of timestamps to be obtained at top performance; when running on a TSC-unsafe processor or a processor that doesn't support the rdtscp instruction, rdtscp is emulated. PVRDTSCP (tsc_mode==3) has two limitations. First, it applies to all apps running in this virtual machine. This means that all apps must either be TSC-resilient or pvrdtscp-modified. Second, highest performance is only obtained on TSC-safe machines that support the rdtscp instruction; when running on older machines, rdtscp is emulated and thus slower. For more information on PVRDTSCP, see below. Finally, tsc_mode==1 always enables TSC emulation, regardless of the underlying physical hardware. The "apparent" TSC frequency will be the TSC frequency of the initial physical machine, even after migration. This mode is useful to measure any performance degradation that might be encountered by a tsc_mode==0 domain after migration occurs, or a tsc_mode==3 domain when it is running on TSC-unsafe hardware. Note that while Xen ensures that an emulated TSC is "safe" across migration, it does not ensure that it continues to tick at the same rate during the actual migration. As an oversimplified example, if TSC is ticking once per second in a guest, and the guest is saved when the TSC is 1000, then restored 30 seconds later, TSC is only guaranteed to be greater than or equal to 1001, not precisely 1030. This has some OS implications as will be seen in the next section. TSC INVARIANT BIT and NO_MIGRATE Related to TSC emulation, the "TSC Invariant" bit is architecturally defined in a cpuid bit on the most recent x86 processors. If set, TSC invariance ensures that the TSC is "safe", that is it will increment at a constant rate regardless of power events, will be synchronized across all processors, and was properly initialized to zero on all processors at boot-time by system hardware/BIOS. As long as system software never writes to TSC, TSC will be safe and continuously incremented at a fixed rate and thus can be used as a system "clocksource". 
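On a Linux system the presence of a constant-rate, invariant TSC and the clocksource the kernel actually selected can be checked as follows (a sketch assuming the usual Linux cpuinfo flag names and sysfs layout):

    # grep -o 'constant_tsc\|nonstop_tsc' /proc/cpuinfo | sort -u
    constant_tsc
    nonstop_tsc
    # cat /sys/devices/system/clocksource/clocksource0/current_clocksource
    tsc

Both flags being present indicates an invariant TSC; the second command shows whether the kernel has in fact selected TSC as its clocksource.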
This bit is used by some OS's, and specifically by Linux starting with version 2.6.30(?), to select TSC as a system clocksource. Once selected, TSC remains the Linux system clocksource unless manually overridden. In a virtualized environment, since it is not possible to synchronize TSC across all the machines in a pool or data center, a migration may "break" TSC as a usable clocksource; while time will not go backwards, it may not track wallclock time well enough to avoid certain time-sensitive consequences. As a result, Xen can only expose the TSC Invariant bit to a guest OS if it is certain that the domain will never migrate. As of Xen 4.0, the "no_migrate=1" VM configuration option may be specified to disable migration. If no_migrate is selected and the VM is running on a physical machine with "TSC Invariant", Linux 2.6.30+ will safely use TSC as the system clocksource. But, attempts to migrate or, once saved, restore this domain will fail. There is another cpuid-related complication: The x86 cpuid instruction is non-privileged. HVM domains are configured to always trap this instruction to Xen, where Xen can "filter" the result. In a PV OS, all cpuid instructions have been replaced by a paravirtualized equivalent of the cpuid instruction ("pvcpuid") and also trap to Xen. But apps in a PV guest that use a cpuid instruction execute it directly, without a trap to Xen. As a result, an app may directly examine the physical TSC Invariant cpuid bit and make decisions based on that bit. This is still an unsolved problem, though a workaround exists as part of the PVRDTSCP tsc_mode for apps that can be modified. MORE ON PVRDTSCP Paravirtualized OS's use the "pvclock" algorithm to manage the passing of time. This sophisticated algorithm obtains information from a memory page shared between Xen and the OS and selects information from this page based on the current virtual CPU (vcpu) in order to properly adapt to TSC-unsafe systems and changes that occur across migration. Neither this shared page nor the vcpu information is available to a userland app so the pvclock algorithm cannot be directly used by an app, at least without performance degradation roughly equal to the cost of just emulating an rdtsc. As a result, as of 4.0, Xen provides capabilities for a userland app to obtain key time values similar to the information accessible to the PV OS pvclock algorithm. The app uses the rdtscp instruction which is defined in recent processors to obtain both the TSC and an auxiliary value called TSC_AUX. Xen is responsible for setting TSC_AUX to the same value on all vcpus running any domain with tsc_mode==3; further, Xen tools are responsible for monotonically incrementing TSC_AUX anytime the domain is restored/migrated (thus changing key time values); and, when the domain is running on a physical machine that either is not TSC-safe or does not support the rdtscp instruction, Xen is responsible for emulating the rdtscp instruction and for setting TSC_AUX to zero on all processors. Xen also provides pvclock information via a "pvcpuid" instruction. While this results in a slow trap, the information changes (and thus must be reobtained via pvcpuid) ONLY when TSC_AUX has changed, which should be very rare relative to a high frequency of rdtscp instructions. Finally, Xen provides additional time-related information via other pvcpuid instructions. 
First, an app is capable of determining if it is currently running on Xen, next whether the tsc_mode setting of the domain in which it is running, and finally whether the underlying hardware is TSC-safe and supports the rdtscp instruction. As a result, a pvrdtscp-modified app has sufficient information to compute the pvclock "elapsed nanoseconds" which can be used as a timestamp. And this can be done nearly as fast as a native rdtsc instruction, much faster than emulation, and also much faster than nearly all OS-provided time mechanisms. While pvrtscp is too complex for most apps, certain enterprise TSC-sensitive high-TSC-frequency apps may find it useful to obtain a significant performance gain. xen-4.4.0/docs/misc/pvh-readme.txt0000664000175000017500000000451312307313555015122 0ustar smbsmb PVH : an x86 PV guest running in an HVM container. See: http://blog.xen.org/index.php/2012/10/23/the-paravirtualization-spectrum-part-1-the-ends-of-the-spectrum/ At the moment HAP is required for PVH. At present the only PVH guest is an x86 64bit PV linux. Patches are at: git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git A PVH guest kernel must support following features, as defined for linux in arch/x86/xen/xen-head.S: #define FEATURES_PVH "|writable_descriptor_tables" \ "|auto_translated_physmap" \ "|supervisor_mode_kernel" \ "|hvm_callback_vector" In a nutshell: * the guest uses auto translate: - p2m is managed by xen - pagetables are owned by the guest - mmu_update hypercall not available * it uses event callback and not vlapic emulation, * IDT is native, so set_trap_table hcall is also N/A for a PVH guest. For a full list of hcalls supported for PVH, see pvh_hypercall64_table in arch/x86/hvm/hvm.c in xen. From the ABI prespective, it's mostly a PV guest with auto translate, although it does use hvm_op for setting callback vector, and has a special version of arch_set_guest_info for bringing up secondary cpus. The initial phase targets the booting of a 64bit UP/SMP linux guest in PVH mode. This is done by adding: pvh=1 in the config file. xl, and not xm, is supported. Phase I patches are broken into three parts: - xen changes for booting of 64bit PVH guest - tools changes for creating a PVH guest - boot of 64bit dom0 in PVH mode. Following fixme's exist in the code: - arch/x86/time.c: support more tsc modes. Following remain to be done for PVH: - Get rid of PVH mode, make it just HVM with some flags set - implement arch_get_info_guest() for pvh. - Investigate what else needs to be done for VMI support. - AMD port. - 32bit PVH guest support in both linux and xen. Xen changes are tagged "32bitfixme". - Add support for monitoring guest behavior. See hvm_memory_event* functions in hvm.c - vcpu hotplug support - Live migration of PVH guests. - Avail PVH dom0 of posted interrupts. (This will be a big win). Note, any emails to me must be cc'd to xen devel mailing list. OTOH, please cc me on PVH emails to the xen devel mailing list. Mukesh Rathor mukesh.rathor [at] oracle [dot] com xen-4.4.0/docs/misc/vbd-interface.txt0000664000175000017500000001265512307313555015611 0ustar smbsmbXen guest interface ------------------- A Xen guest can be provided with block devices. These are always provided as Xen VBDs; for HVM guests they may also be provided as emulated IDE or SCSI disks. The abstract interface involves specifying, for each block device: * Nominal disk type: Xen virtual disk (aka xvd*, the default); SCSI (sd*); IDE (hd*). 
For HVM guests, each whole-disk hd* and and sd* device is made available _both_ via emulated IDE resp. SCSI controller, _and_ as a Xen VBD. The HVM guest is entitled to assume that the IDE or SCSI disks available via the emulated IDE controller target the same underlying devices as the corresponding Xen VBD (ie, multipath). For PV guests every device is made available to the guest only as a Xen VBD. For these domains the type is advisory, for use by the guest's device naming scheme. The Xen interface does not specify what name a device should have in the guest (nor what major/minor device number it should have in the guest, if the guest has such a concept). * Disk number, which is a nonnegative integer, conventionally starting at 0 for the first disk. * Partition number, which is a nonnegative integer where by convention partition 0 indicates the "whole disk". Normally for any disk _either_ partition 0 should be supplied in which case the guest is expected to treat it as they would a native whole disk (for example by putting or expecting a partition table or disk label on it); _Or_ only non-0 partitions should be supplied in which case the guest should expect storage management to be done by the host and treat each vbd as it would a partition or slice or LVM volume (for example by putting or expecting a filesystem on it). Non-whole disk devices cannot be passed through to HVM guests via the emulated IDE or SCSI controllers. Configuration file syntax ------------------------- The config file syntaxes are, for example d0 d0p0 xvda Xen virtual disk 0 partition 0 (whole disk) d1p2 xvdb2 Xen virtual disk 1 partition 2 d536p37 xvdtq37 Xen virtual disk 536 partition 37 sdb3 SCSI disk 1 partition 3 hdc2 IDE disk 2 partition 2 The d*p* syntax is not supported by xm/xend. To cope with guests which predate this specification we preserve the existing facility to specify the xenstore numerical value directly by putting a single number (hex, decimal or octal) in the domain config file instead of the disk identifier; this number is written directly to xenstore (after conversion to the canonical decimal format). Concrete encoding in the VBD interface (in xenstore) ---------------------------------------------------- The information above is encoded in the concrete interface as an integer (in a canonical decimal format in xenstore), whose value encodes the information above as follows: 1 << 28 | disk << 8 | partition xvd, disks or partitions 16 onwards 202 << 8 | disk << 4 | partition xvd, disks and partitions up to 15 8 << 8 | disk << 4 | partition sd, disks and partitions up to 15 3 << 8 | disk << 6 | partition hd, disks 0..1, partitions 0..63 22 << 8 | (disk-2) << 6 | partition hd, disks 2..3, partitions 0..63 2 << 28 onwards reserved for future use other values less than 1 << 28 deprecated / reserved The 1<<28 format handles disks up to (1<<20)-1 and partitions up to 255. It will be used only where the 202<<8 format does not have enough bits. Guests MAY support any subset of the formats above except that if they support 1<<28 they MUST also support 202<<8. PV-on-HVM drivers MUST support at least one of 3<<8 or 8<<8; 3<<8 is recommended. Some software has used or understood Linux-specific encodings for SCSI disks beyond disk 15 partition 15, and IDE disks beyond disk 3 partition 63. These vbds, and the corresponding encoded integers, are deprecated. Guests SHOULD ignore numbers that they do not understand or recognise. They SHOULD check supplied numbers for validity. 
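As a worked example of the encodings above, the disk identifiers from the configuration-file table earlier translate to the following xenstore integers (shell arithmetic is used here purely for illustration):

    $ echo $(( (202 << 8) | (1 << 4) | 2 ))          # xvdb2: xvd disk 1, partition 2
    51730
    $ echo $(( (8 << 8) | (1 << 4) | 3 ))            # sdb3: SCSI disk 1, partition 3
    2067
    $ echo $(( (22 << 8) | ((2 - 2) << 6) | 2 ))     # hdc2: IDE disk 2, partition 2
    5634
    $ echo $(( (1 << 28) | (536 << 8) | 37 ))        # d536p37: needs the 1<<28 format
    268572709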
Notes on Linux as a guest ------------------------- Very old Linux guests (PV and PV-on-HVM) are able to "steal" the device numbers and names normally used by the IDE and SCSI controllers, so that writing "hda1" in the config file results in /dev/hda1 in the guest. These systems interpret the xenstore integer as major << 8 | minor where major and minor are the Linux-specific device numbers. Some old configurations may depend on deprecated high-numbered SCSI and IDE disks. This does not work in recent versions of Linux. So for Linux PV guests, users are recommended to supply xvd* devices only. Modern PV drivers will map these to identically-named devices in the guest. For Linux HVM guests using PV-on-HVM drivers, users are recommended to supply as few hd* devices as possible, and for the rest of the disks, to use pure xvd* devices starting at xvde. Modern PV-on-HVM drivers will map provided hd* devices to the corresponding /dev/xvd* (for example, hda is presented also as /dev/xvda). Some Linux HVM guests with broken PV-on-HVM drivers do not cope properly if both hda and hdc are supplied, nor with both hda and xvda, because they directly map the bottom 8 bits of the xenstore integer directly to the Linux guest's device number and throw away the rest; they can crash due to minor number clashes. With these guests, the workaround is not to supply problematic combinations of devices. xen-4.4.0/docs/misc/arm/0000775000175000017500000000000012307313555013105 5ustar smbsmbxen-4.4.0/docs/misc/arm/device-tree/0000775000175000017500000000000012307313555015301 5ustar smbsmbxen-4.4.0/docs/misc/arm/device-tree/booting.txt0000664000175000017500000000322412307313555017504 0ustar smbsmbDom0 kernel and ramdisk modules ================================ Xen is passed the dom0 kernel and initrd via a reference in the /chosen node of the device tree. Each node contains the following properties: - compatible Must be: "xen,", "xen,multiboot-module" where must be one of: - "linux-zimage" -- the dom0 kernel - "linux-initrd" -- the dom0 ramdisk - reg Specifies the physical address of the module in RAM and the length of the module. - bootargs (optional) Command line associated with this module. This is deprecated and should be replaced by the bootargs variations described below. Command lines ============= Xen also checks for properties directly under /chosen to find suitable command lines for Xen and Dom0. The logic is the following: - If xen,xen-bootargs is present, it will be used for Xen. - If xen,dom0-bootargs is present, it will be used for Dom0. - If xen,xen-bootargs is _not_ present, but xen,dom0-bootargs is, bootargs will be used for Xen. - If a kernel boot module is present and has a bootargs property then the top-level bootargs will used for Xen. - If no Xen specific properties are present, bootargs is for Dom0. - If xen,xen-bootargs is present, but xen,dom0-bootargs is missing, bootargs will be used for Dom0. Most of these cases is to make booting with Xen-unaware bootloaders easier. For those you would hardcode the Xen commandline in the DTB under /chosen/xen,xen-bootargs and would let the bootloader set the Dom0 command line by writing bootargs (as for native Linux). A Xen-aware bootloader would set xen,xen-bootargs for Xen, xen,dom0-bootargs for Dom0 and bootargs for native Linux. 
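Putting the above together, a minimal /chosen fragment prepared by a Xen-aware bootloader might look like the following; the addresses, sizes and command lines are placeholders rather than values required by any particular platform:

    chosen {
        xen,xen-bootargs = "console=dtuart dtuart=serial0";
        xen,dom0-bootargs = "console=hvc0 earlyprintk=xen root=/dev/ram0";

        module@80080000 {
            compatible = "xen,linux-zimage", "xen,multiboot-module";
            reg = <0x80080000 0x01000000>;
        };

        module@82000000 {
            compatible = "xen,linux-initrd", "xen,multiboot-module";
            reg = <0x82000000 0x00400000>;
        };
    };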
xen-4.4.0/docs/misc/arm/booting.txt0000664000175000017500000000222112307313555015304 0ustar smbsmbBooting Xen =========== Xen follows the zImage protocol defined for 32-bit ARM Linux[1] and the Image protocol defined for ARM64 Linux[2]. In both cases the recommendation to boot in HYP/EL2 mode is a strict requirement for Xen. The exceptions to this on 32-bit ARM are as follows: Xen does not require the machine type to be passed in r1. This register is ignored (so may be invalid or the actual machine type). Xen does not support the ATAG list and requires Device Tree. Therefore r2 must point to the physical address of device tree block (dtb) in system RAM. NOTE: although Xen uses the zImage protocol there is no compression actually used. This should be transparent to the bootloader. The zImage protocol should still be used and not the stricter "raw (non-zImage)" protocol described in arm/Booting. There are no exception on 64-bit ARM. [1] linux/Documentation/arm/Booting Latest version: http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/arm/Booting [2] linux/Documentation/arm64/booting.txt Latest version: http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/arm64/booting.txt xen-4.4.0/docs/misc/arm/early-printk.txt0000664000175000017500000000235512307313555016274 0ustar smbsmbHow to enable early printk Early printk can only be enabled if debug=y. You may want to enable it if you are debbuging code that executes before the console is initialized. Note that selecting this option will limit Xen to a single UART definition. Attempting to boot Xen image on a different platform *will not work*, so this option should not be enable for Xens that are intended to be portable. CONFIG_EARLY_PRINTK=mach where mach is the name of the machine: - vexpress: printk with pl011 for versatile express - exynos5250: printk with the second UART - midway: printk with the pl011 on Calxeda Midway processors - fastmodel: printk on ARM Fastmodel software emulators - omap5432: printk with UART3 on TI OMAP5432 processors - sun6i: printk with 8250 on Allwinner A31 processors - sun7i: printk with 8250 on Allwinner A20 processors The base address and baud rate is hardcoded in xen/arch/arm/Rules.mk, see there when adding support for new machines. If not explicitly requested with "EARLY_PRINTK_INIT_UART := y" in Rules.mk, the code will not try to initialize the UART, so that bootloader or firmware settings can be used for maximum compatibility. The baud rate parameter is ignored in this case. By default early printk is disabled. xen-4.4.0/docs/misc/vtpm.txt0000664000175000017500000002574612307313555014073 0ustar smbsmbCopyright (c) 2010-2012 United States Government, as represented by the Secretary of Defense. All rights reserved. November 12 2012 Authors: Matthew Fioravante (JHUAPL), Daniel De Graaf (NSA) This document describes the virtual Trusted Platform Module (vTPM) subsystem for Xen. The reader is assumed to have familiarity with building and installing Xen, Linux, and a basic understanding of the TPM and vTPM concepts. ------------------------------ INTRODUCTION ------------------------------ The goal of this work is to provide a TPM functionality to a virtual guest operating system (a DomU). This allows programs to interact with a TPM in a virtual system the same way they interact with a TPM on the physical system. Each guest gets its own unique, emulated, software TPM. 
However, each of the vTPM's secrets (Keys, NVRAM, etc) are managed by a vTPM Manager domain, which seals the secrets to the Physical TPM. If the process of creating each of these domains (manager, vTPM, and guest) is trusted, the vTPM subsystem extends the chain of trust rooted in the hardware TPM to virtual machines in Xen. Each major component of vTPM is implemented as a separate domain, providing secure separation guaranteed by the hypervisor. The vTPM domains are implemented in mini-os to reduce memory and processor overhead. This mini-os vTPM subsystem was built on top of the previous vTPM work done by IBM and Intel corporation. ------------------------------ DESIGN OVERVIEW ------------------------------ The architecture of vTPM is described below: +------------------+ | Linux DomU | ... | | ^ | | v | | | xen-tpmfront | +------------------+ | ^ v | +------------------+ | mini-os/tpmback | | | ^ | | v | | | vtpm-stubdom | ... | | ^ | | v | | | mini-os/tpmfront | +------------------+ | ^ v | +------------------+ | mini-os/tpmback | | | ^ | | v | | | vtpmmgr-stubdom | | | ^ | | v | | | mini-os/tpm_tis | +------------------+ | ^ v | +------------------+ | Hardware TPM | +------------------+ * Linux DomU: The Linux based guest that wants to use a vTPM. There many be more than one of these. * xen-tpmfront.ko: Linux kernel virtual TPM frontend driver. This driver provides vTPM access to a para-virtualized Linux based DomU. * mini-os/tpmback: Mini-os TPM backend driver. The Linux frontend driver connects to this backend driver to facilitate communications between the Linux DomU and its vTPM. This driver is also used by vtpmmgr-stubdom to communicate with vtpm-stubdom. * vtpm-stubdom: A mini-os stub domain that implements a vTPM. There is a one to one mapping between running vtpm-stubdom instances and logical vtpms on the system. The vTPM Platform Configuration Registers (PCRs) are all initialized to zero. * mini-os/tpmfront: Mini-os TPM frontend driver. The vTPM mini-os domain vtpm-stubdom uses this driver to communicate with vtpmmgr-stubdom. This driver could also be used separately to implement a mini-os domain that wishes to use a vTPM of its own. * vtpmmgr-stubdom: A mini-os domain that implements the vTPM manager. There is only one vTPM manager and it should be running during the entire lifetime of the machine. This domain regulates access to the physical TPM on the system and secures the persistent state of each vTPM. * mini-os/tpm_tis: Mini-os TPM version 1.2 TPM Interface Specification (TIS) driver. This driver used by vtpmmgr-stubdom to talk directly to the hardware TPM. Communication is facilitated by mapping hardware memory pages into vtpmmgr-stubdom. * Hardware TPM: The physical TPM that is soldered onto the motherboard. ------------------------------ INSTALLATION ------------------------------ Prerequisites: -------------- You must have an x86 machine with a TPM on the motherboard. The only software requirement to compiling vTPM is cmake. You must use libxl to manage domains with vTPMs. 'xm' is deprecated and does not support vTPM. Compiling the XEN tree: ----------------------- Compile and install the XEN tree as usual. Be sure to build and install the stubdom tree. 
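For example, the uuid can be generated with the standard uuidgen utility; the output shown here is simply the example uuid used in the configuration fragment above:

    # uuidgen
    ac0a5b9e-cbe2-4c07-b43b-1d69e46fb839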
Compiling the LINUX dom0 kernel: -------------------------------- The Linux dom0 kernel should not try accessing the TPM while the vTPM Manager domain is accessing it; the simplest way to accomplish this is to ensure the kernel is compiled without a driver for the TPM, or avoid loading the driver by blacklisting the module. Compiling the LINUX domU kernel: -------------------------------- The domU kernel used by domains with vtpms must include the xen-tpmfront.ko driver. It can be built directly into the kernel or as a module; however, some features such as IMA require the TPM to be built in to the kernel. CONFIG_TCG_TPM=y CONFIG_TCG_XEN=y ------------------------------ VTPM MANAGER SETUP ------------------------------ Manager disk image setup: ------------------------- The vTPM Manager requires a disk image to store its encrypted data. The image does not require a filesystem and can live anywhere on the host disk. The image does not need to be large. 8 to 16 Mb should be sufficient. # dd if=/dev/zero of=/var/vtpmmgr-stubdom.img bs=16M count=1 Manager config file: -------------------- The vTPM Manager domain (vtpmmgr-stubdom) must be started like any other Xen virtual machine and requires a config file. The manager requires a disk image for storage and permission to access the hardware memory pages for the TPM. An example configuration looks like the following. kernel="/usr/lib/xen/boot/vtpmmgr-stubdom.gz" memory=16 disk=["file:/var/vtpmmgr-stubdom.img,hda,w"] name="vtpmmgr" iomem=["fed40,5"] The iomem line tells xl to allow access to all of the TPM IO memory pages, which are 5 pages (one per locality) that start at 0xfed40000. By default, the TPM manager uses locality 0 (so only the page at 0xfed40 is needed); this can be changed on the domain's command line. Starting and stopping the manager: ---------------------------------- The vTPM manager should be started at boot, you may wish to create an init script to do this. # xl create -c vtpmmgr-stubdom.cfg Once initialization is complete you should see the following: INFO[VTPM]: Waiting for commands from vTPM's: To shutdown the manager you must destroy it. To avoid data corruption, only destroy the manager when you see the above "Waiting for commands" message. This ensures the disk is in a consistent state. # xl destroy vtpmmgr-stubdom ------------------------------ VTPM AND LINUX PVM SETUP ------------------------------ In the following examples we will assume we have Linux guest named "domu" with its associated configuration located at /home/user/domu. It's vtpm will be named domu-vtpm. vTPM disk image setup: ---------------------- The vTPM requires a disk image to store its persistent data. The image does not require a filesystem. The image does not need to be large. 8 Mb should be sufficient. # dd if=/dev/zero of=/home/user/domu/vtpm.img bs=8M count=1 vTPM config file: ----------------- The vTPM domain requires a configuration file like any other domain. The vTPM requires a disk image for storage and a TPM frontend driver to communicate with the manager. An example configuration is given: kernel="/usr/lib/xen/boot/vtpm-stubdom.gz" memory=8 disk=["file:/home/user/domu/vtpm.img,hda,w"] name="domu-vtpm" vtpm=["backend=vtpmmgr,uuid=ac0a5b9e-cbe2-4c07-b43b-1d69e46fb839"] The vtpm= line sets up the tpm frontend driver. The backend must set to vtpmmgr. You are required to generate a uuid for this vtpm. You can use the uuidgen unix program or some other method to create a uuid. The uuid uniquely identifies this vtpm to manager. 
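A minimal sketch of such a wrapper script, assuming the example names and paths used above and that "xl list <name>" exits non-zero once the guest no longer exists:

    #!/bin/sh
    # Start the vTPM first, then the guest that uses it.
    xl create /home/user/domu/vtpm.cfg
    xl create /home/user/domu/domu.cfg

    # Wait for the guest to shut down, then destroy its vTPM.
    while xl list domu >/dev/null 2>&1; do
        sleep 5
    done
    xl destroy domu-vtpm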
If you wish to clear the vTPM data you can either recreate the disk image or change the uuid. Linux Guest config file: ------------------------ The Linux guest config file needs to be modified to include the Linux tpmfront driver. Add the following line: vtpm=["backend=domu-vtpm"] Currently only Linux guests are supported (PV or HVM with PV drivers). Launching and shut down: ------------------------ To launch a Linux guest with a vTPM we first have to start the vTPM domain. # xl create -c /home/user/domu/vtpm.cfg After initialization is complete, you should see the following: Info: Waiting for frontend domain to connect.. Next, launch the Linux guest # xl create -c /home/user/domu/domu.cfg If xen-tpmfront was compiled as a module, be sure to load it in the guest. # modprobe xen-tpmfront After the Linux domain boots and the xen-tpmfront driver is loaded, you should see the following on the vtpm console: Info: VTPM attached to Frontend X/Y If you have trousers and tpm_tools installed on the guest, you can test the vtpm. On guest: # tcsd (if tcsd is not running already) # tpm_version The version command should return the following: TPM 1.2 Version Info: Chip Version: 1.2.0.7 Spec Level: 2 Errata Revision: 1 TPM Vendor ID: ETHZ TPM Version: 01010000 Manufacturer Info: 4554485a You should also see the command being sent to the vtpm console as well as the vtpm saving its state. You should see the vtpm key being encrypted and stored on the vtpmmgr console. You may wish to write a script to start your vtpm and guest together and to destroy the vtpm when the guest shuts down. ------------------------------ INTEGRATION WITH PV-GRUB ------------------------------ The vTPM currently starts up with all PCRs set to their default values (all zeros for the lower 16). This means that any decisions about the trustworthiness of the created domain must be made based on the environment that created the vTPM and the domU; for example, a system that only constructs images using a trusted configuration and guest kernel be able to provide guarantees about the guests and any measurements done that kernel (such as the IMA TCB log). Guests wishing to use a custom kernel in such a secure environment are often started using the pv-grub bootloader as the kernel, which then can load the untrusted kernel without needing to parse an untrusted filesystem and kernel in dom0. If the pv-grub stub domain succeeds in connecting to a vTPM, it will extend the hash of the kernel that it boots into PCR #4, and will extend the command line and initrd into PCR #5 before booting so that a domU booted in this way can attest to its early boot state. ------------------------------ MORE INFORMATION ------------------------------ See stubdom/vtpmmgr/README for more details about how the manager domain works, how to use it, and its command line parameters. See stubdom/vtpm/README for more specifics about how vtpm-stubdom operates and the command line options it accepts. xen-4.4.0/docs/misc/xl-network-configuration.markdown0000664000175000017500000001353312307313555021056 0ustar smbsmb# XL Network Configuration ## Syntax Overview This document specifies the xl config file format vif configuration option. It has the following form: vif = [ '', '', ... 
] where each vifspec is in this form: [=|,] For example: 'mac=00:16:3E:74:3d:76,model=rtl8139,bridge=xenbr0' 'mac=00:16:3E:74:34:32' '' # The empty string These might be specified in the domain config file like this: vif = [ 'mac=00:16:3E:74:34:32', 'mac=00:16:3e:5f:48:e4,bridge=xenbr1' ] More formally, the string is a series of comma-separated keyword/value pairs. All keywords are optional. Each device has a `DEVID` which is its index within the vif list, starting from 0. ## Keywords ### mac If specified then this option specifies the MAC address inside the guest of this VIF device. The value is a 48-bit number represented as six groups of two hexadecimal digits, separated by colons (:). The default if this keyword is not specified is to be automatically generate a MAC address inside the space assigned to Xen's [Organizationally Unique Identifier][oui] (00:16:3e). If you are choosing a MAC address then it is strongly recommend to follow one of the following strategies: * Generate a random sequence of 6 byte, set the locally administered bit (bit 2 of the first byte) and clear the multicast bit (bit 1 of the first byte). In other words the first byte should have the bit pattern xxxxxx10 (where x is a randomly generated bit) and the remaining 5 bytes are randomly generated See [http://en.wikipedia.org/wiki/MAC_address] for more details the structure of a MAC address. * Allocate an address from within the space defined by your organization's OUI (if you have one) following your organization's procedures for doing so. * Allocate an address from within the space defined by Xen's OUI (00:16:3e). Taking care not to clash with other users of the physical network segment where this VIF will reside. If you have an OUI for your own use then that is the preferred strategy. Otherwise in general you should prefer to generate a random MAC and set the locally administered bit since this allows for more bits of randomness than using the Xen OUI. ### bridge Specifies the name of the network bridge which this VIF should be added to. The default is `xenbr0`. The bridge must be configured using your distribution's network configuration tools. See the [wiki][net] for guidance and examples. ### gatewaydev Specifies the name of the network interface which has an IP and which is in the network the VIF should communicate with. This is used in the host by the vif-route hotplug script. See [wiki][vifroute] for guidance and examples. NOTE: netdev is a deprecated alias of this option. ### type This keyword is valid for HVM guests only. Specifies the type of device to valid values are: * `ioemu` (default) -- this device will be provided as an emulate device to the guest and also as a paravirtualised device which the guest may choose to use instead if it has suitable drivers available. * `vif` -- this device will be provided as a paravirtualised device only. ### model This keyword is valid for HVM guest devices with `type=ioemu` only. Specifies the type device to emulated for this guest. Valid values are: * `rtl8139` (default) -- Realtek RTL8139 * `e1000` -- Intel E1000 * in principle any device supported by your device model ### vifname Specifies the backend device name for the virtual device. If the domain is an HVM domain then the associated emulated (tap) device will have a "-emu" suffice added. The default name for the virtual device is `vifDOMID.DEVID` where `DOMID` is the guest domain ID and `DEVID` is the device number. Likewise the default tap name is `vifDOMID.DEVID-emu`. 
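For example (an illustrative fragment, assuming the guest has domain ID 5), given

    vif = [ 'mac=00:16:3e:12:34:56,bridge=xenbr0,vifname=web0' ]

the backend device in the host will be named `web0` (and `web0-emu` for the associated tap device of an HVM guest) instead of the defaults `vif5.0` and `vif5.0-emu`.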
### script Specifies the hotplug script to run to configure this device (e.g. to add it to the relevant bridge). Defaults to `XEN_SCRIPT_DIR/vif-bridge` but can be set to any script. Some example scripts are installed in `XEN_SCRIPT_DIR`. ### ip Specifies the IP address for the device, the default is not to specify an IP address. What, if any, effect this has depends on the hotplug script which is configured. A typically behaviour (exhibited by the example hotplug scripts) if set might be to configure firewall rules to allow only the specified IP address to be used by the guest (blocking all others). ### backend Specifies the backend domain which this device should attach to. This defaults to domain 0. Specifying another domain requires setting up a driver domain which is outside the scope of this document. ### rate Specifies the rate at which the outgoing traffic will be limited to. The default if this keyword is not specified is unlimited. The rate may be specified as "/s" or optionally "/s@". * `RATE` is in bytes and can accept suffixes: * GB, MB, KB, B for bytes. * Gb, Mb, Kb, b for bits. * `INTERVAL` is in microseconds and can accept suffixes: ms, us, s. It determines the frequency at which the vif transmission credit is replenished. The default is 50ms. Vif rate limiting is credit-based. It means that for "1MB/s@20ms", the available credit will be equivalent of the traffic you would have done at "1MB/s" during 20ms. This will results in a credit of 20,000 bytes replenished every 20,000 us. For example: 'rate=10Mb/s' -- meaning up to 10 megabits every second 'rate=250KB/s' -- meaning up to 250 kilobytes every second 'rate=1MB/s@20ms' -- meaning 20,000 bytes in every 20 millisecond period NOTE: The actual underlying limits of rate limiting are dependent on the underlying netback implementation. [oui]: http://en.wikipedia.org/wiki/Organizationally_Unique_Identifier [net]: http://wiki.xen.org/wiki/HostConfiguration/Networking [vifroute]: http://wiki.xen.org/wiki/Vif-route xen-4.4.0/docs/misc/vtd.txt0000664000175000017500000002772512307313555013701 0ustar smbsmbTitle : How to do PCI Passthrough with VT-d Authors : Allen Kay Weidong Han Yuji Shimada Created : October-24-2007 Updated : July-07-2009 How to turn on VT-d in Xen -------------------------- Xen with 2.6.18 dom0: 1 ) cd xen-unstable.hg 2 ) make install 3 ) make linux-2.6-xen-config CONFIGMODE=menuconfig 4 ) change XEN->"PCI-device backend driver" from "M" to "*". 5 ) make linux-2.6-xen-build 6 ) make linux-2.6-xen-install 7 ) depmod 2.6.18.8-xen 8 ) mkinitrd -v -f --with=ahci --with=aacraid --with=sd_mod --with=scsi_mod initrd-2.6.18-xen.img 2.6.18.8-xen 9 ) cp initrd-2.6.18-xen.img /boot 10) lspci - select the PCI BDF you want to assign to guest OS 11) "hide" pci device from dom0 as following sample grub entry: title Xen-Fedora Core (2.6.18-xen) root (hd0,0) kernel /boot/xen.gz com1=115200,8n1 console=com1 iommu=1 module /boot/vmlinuz-2.6.18.8-xen root=LABEL=/ ro xencons=ttyS console=tty0 console=ttyS0, pciback.hide=(01:00.0)(03:00.0) module /boot/initrd-2.6.18-xen.img or use dynamic hiding via PCI backend sysfs interface: a) check if the driver has binded to the device ls -l /sys/bus/pci/devices/0000:01:00.0/driver ... 
/sys/bus/pci/devices/0000:01:00.0/driver -> ../../../../bus/pci/drivers/igb b) if yes, then unload the driver first echo -n 0000:01:00.0 >/sys/bus/pci/drivers/igb/unbind c) add the device to the PCI backend echo -n 0000:01:00.0 >/sys/bus/pci/drivers/pciback/new_slot d) let the PCI backend bind to the device echo -n 0000:01:00.0 >/sys/bus/pci/drivers/pciback/bind 12) reboot system (not requires if you use the dynamic hiding method) 13) add "pci" line in /etc/xen/hvm.conf for to assigned devices pci = [ '01:00.0', '03:00.0' ] 15) start hvm guest and use "lspci" to see the passthru device and "ifconfig" to see if IP address has been assigned to NIC devices. Xen with pv-ops dom0: 1 ) cd xen-unstable.hg 2 ) make install 3 ) make linux-2.6-pvops-config CONFIGMODE=menuconfig 4 ) change Bus options (PCI etc.)->"PCI Stub driver" to "*". 5 ) make linux-2.6-pvops-build 6 ) make linux-2.6-pvops-install 7 ) mkinitrd -v -f --with=ahci --with=aacraid --with=sd_mod --with=scsi_mod initrd-2.6.30-rc3-tip.img 2.6.30-rc3-tip (change 2.6.30-rc3-tip to pv-ops dom0 version when it's updated in future) 8 ) cp initrd-2.6.30-rc3-tip.img /boot 9 ) edit grub: title Xen-Fedora Core (pv-ops) root (hd0,0) kernel /boot/xen.gz console=com1,vga console=com1 com1=115200,8n1 iommu=1 module /boot/vmlinuz-2.6.30-rc3-tip root=LABEL=/ ro console=hvc0 earlyprintk=xen module /boot/initrd-2.6.30-rc3-tip.img 10) reboot system 11) hide device using pci-stub (example PCI device 01:00.0): - lspci -n - locate the entry for device 01:00.0 and note down the vendor & device ID 8086:10b9 ... 01:00.0 0200: 8086:10b9 (rev 06) ... - then use following commands to hide it: echo "8086 10b9" > /sys/bus/pci/drivers/pci-stub/new_id echo "0000:01:00.0" > /sys/bus/pci/devices/0000:01:00.0/driver/unbind echo "0000:01:00.0" > /sys/bus/pci/drivers/pci-stub/bind 12) add "pci" line in /etc/xen/hvm.conf for to assigned devices pci = [ '01:00.0' ] 13) start hvm guest and use "lspci" to see the passthru device and "ifconfig" to see if IP address has been assigned to NIC devices. Enable MSI/MSI-x for assigned devices ------------------------------------- Add "msi=1" option in kernel line of host grub. MSI-INTx translation for passthrough devices in HVM --------------------------------------------------- If the assigned device uses a physical IRQ that is shared by more than one device among multiple domains, there may be significant impact on device performance. Unfortunately, this is quite a common case if the IO-APIC (INTx) IRQ is used. MSI can avoid this issue, but was only available if the guest enables it. With MSI-INTx translation turned on, Xen enables device MSI if it's available, regardless of whether the guest uses INTx or MSI. If the guest uses INTx IRQ, Xen will inject a translated INTx IRQ to guest's virtual ioapic whenever an MSI message is received. This reduces the interrupt sharing of the system. If the guest OS enables MSI or MSI-X, the translation is automatically turned off. To enable or disable MSI-INTx translation globally, add "pci_msitranslate" in the config file: pci_msitranslate = 1 (default is 1) To override for a specific device: pci = [ '01:00.0,msitranslate=0', '03:00.0' ] Caveat on Conventional PCI Device Passthrough --------------------------------------------- VT-d spec specifies that all conventional PCI devices behind a PCIe-to-PCI bridge have to be assigned to the same domain. PCIe devices do not have this restriction. 
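To see which conventional PCI devices sit behind the same PCIe-to-PCI bridge, and therefore must be assigned together, the bus topology can be inspected with lspci's tree view. The output below is illustrative only:

    # lspci -tv
    -[0000:00]-+-00.0  Host bridge
               +-1e.0-[05]--+-00.0  Ethernet controller (conventional PCI)
               |            \-01.0  Ethernet controller (conventional PCI)
               ...

Here both devices on bus 05 are behind the bridge at 00:1e.0 and would have to be assigned to the same domain.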
VT-d Works on OS: ----------------- 1) Host OS: PAE, 64-bit 2) Guest OS: 32-bit, PAE, 64-bit Combinations Tested: -------------------- 1) 64-bit host: 32/PAE/64 Linux/XP/Win2003/Vista guests 2) PAE host: 32/PAE Linux/XP/Win2003/Vista guests VTd device hotplug: ------------------- 2 virtual PCI slots (6~7) are reserved in HVM guest to support VTd hotplug. If you have more VTd devices, only 2 of them can support hotplug. Usage is simple: 1. List the VTd device by dom. You can see a VTd device 0:2:0.0 is inserted in the HVM domain's PCI slot 6. '''lspci''' inside the guest should see the same. [root@vt-vtd ~]# xm pci-list HVMDomainVtd VSlt domain bus slot func 0x6 0x0 0x02 0x00 0x0 2. Detach the device from the guest by the physical BDF. Then HVM guest will receive a virtual PCI hot removal event to detach the physical device [root@vt-vtd ~]# xm pci-detach HVMDomainVtd 0:2:0.0 3. Attach a PCI device to the guest by the physical BDF and desired virtual slot(optional). Following command would insert the physical device into guest's virtual slot 7 [root@vt-vtd ~]# xm pci-attach HVMDomainVtd 0:2:0.0 7 To specify options for the device, use -o or --options=. Following command would disable MSI-INTx translation for the device [root@vt-vtd ~]# xm pci-attach -o msitranslate=0 0:2:0.0 7 VTd hotplug usage model: ------------------------ * For live migration: As you know, VTd device would break the live migration as physical device can't be save/restored like virtual device. With hotplug, live migration is back again. Just hot remove all the VTd devices before live migration and hot add new VTd devices on target machine after live migration. * VTd hotplug for device switch: VTd hotplug can be used to dynamically switch physical device between different HVM guest without shutdown. VT-d Enabled Systems -------------------- 1) For VT-d enabling work on Xen, we have been using development systems using following Intel motherboards: - DQ35MP - DQ35JO 2) As far as we know, following OEM systems also has vt-d enabled. Feel free to add others as they become available. - Dell: Optiplex 755 http://www.dell.com/content/products/category.aspx/optix?c=us&cs=555&l=en&s=biz - HP Compaq: DC7800 http://h10010.www1.hp.com/wwpc/us/en/en/WF04a/12454-12454-64287-321860-3328898.html For more information, pls refer to http://wiki.xen.org/wiki/VTdHowTo. Assigning devices to HVM domains -------------------------------- Most device types such as NIC, HBA, EHCI and UHCI can be assigned to an HVM domain. But some devices have design features which make them unsuitable for assignment to an HVM domain. Examples include: * Device has an internal resource, such as private memory, which is mapped to memory address space with BAR (Base Address Register). * Driver submits command with a pointer to a buffer within internal resource. Device decodes the pointer (address), and accesses to the buffer. In an HVM domain, the BAR is virtualized, and host-BAR value and guest-BAR value are different. The addresses of internal resource from device's view and driver's view are different. Similarly, the addresses of buffer within internal resource from device's view and driver's view are different. As a result, device can't access to the buffer specified by driver. Such devices assigned to HVM domain currently do not work. 
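The BAR virtualization mentioned above can be observed directly: running lspci against the same assigned device in dom0 and inside the HVM guest will typically show different "Memory at" addresses (the device numbers and addresses below are illustrative only):

    dom0#  lspci -s 01:00.0 -v | grep 'Memory at'
           Memory at febe0000 (32-bit, non-prefetchable) [size=128K]
    guest# lspci -s 00:05.0 -v | grep 'Memory at'
           Memory at f3000000 (32-bit, non-prefetchable) [size=128K]

A driver that passes such guest-visible addresses to the device hits exactly the mismatch described above.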
Using SR-IOV with VT-d -------------------------------- The Single Root I/O Virtualization is a PCI Express feature supported by some devices such as Intel 82576 which allows you to create virtual PCI devices (Virtual Function) and assign them to the HVM guest. You can use latest lspci (v3.1 and above) to check if your PCIe device supports the SR-IOV capability or not. $ lspci -s 01:00.0 -vvv 01:00.0 Ethernet controller: Intel Corporation 82576 Gigabit Network Connection (rev 01) Subsystem: Intel Corporation Gigabit ET Dual Port Server Adapter ... Capabilities: [160] Single Root I/O Virtualization (SR-IOV) IOVCap: Migration-, Interrupt Message Number: 000 IOVCtl: Enable+ Migration- Interrupt- MSE+ ARIHierarchy+ IOVSta: Migration- Initial VFs: 8, Total VFs: 8, Number of VFs: 7, Function Dependency Link: 00 VF offset: 128, stride: 2, Device ID: 10ca Supported Page Size: 00000553, System Page Size: 00000001 VF Migration: offset: 00000000, BIR: 0 Kernel driver in use: igb The function that has the SR-IOV capability is also known as Physical Function. You need the Physical Function driver (runs in the Dom0 and controls the physical resources allocation) to enable the Virtual Function. Following is the Virtual Functions associated with above Physical Function. $ lspci | grep -e 01:1[01].[0246] 01:10.0 Ethernet controller: Intel Corporation Device 10ca (rev 01) 01:10.2 Ethernet controller: Intel Corporation Device 10ca (rev 01) 01:10.4 Ethernet controller: Intel Corporation Device 10ca (rev 01) 01:10.6 Ethernet controller: Intel Corporation Device 10ca (rev 01) 01:11.0 Ethernet controller: Intel Corporation Device 10ca (rev 01) 01:11.2 Ethernet controller: Intel Corporation Device 10ca (rev 01) 01:11.4 Ethernet controller: Intel Corporation Device 10ca (rev 01) We can tell that Physical Function 01:00.0 has 7 Virtual Functions (01:10.0, 01:10.2, 01:10.4, 01:10.6, 01:11.0, 01:11.2, 01:11.4). And the Virtual Function PCI Configuration Space looks just like normal PCI device. $ lspci -s 01:10.0 -vvv 01:10.0 Ethernet controller: Intel Corporation 82576 Gigabit Virtual Function Subsystem: Intel Corporation Gigabit Virtual Function Control: I/O- Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx- Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- SERR- = 2.0.20 and libsepol >=2.0.39 are required. The policy Makefile (tools/flask/policy/Makefile) must also be changed as follows: ######################################## # # Build a binary policy locally # $(POLVER): policy.conf @echo "Compiling $(NAME) $(POLVER)" $(QUIET) $(CHECKPOLICY) $^ -o $@ (Comment out this line) # Uncomment line below to enable policies for devices # $(QUIET) $(CHECKPOLICY) -t Xen $^ -o $@ (Uncomment this line) ######################################## # # Install a binary policy # $(LOADPATH): policy.conf @echo "Compiling and installing $(NAME) $(LOADPATH)" $(QUIET) $(CHECKPOLICY) $^ -o $@ (Comment out this line) # Uncomment line below to enable policies for devices # $(QUIET) $(CHECKPOLICY) -t Xen $^ -o $@ (Uncomment this line) IRQs, PCI devices, I/O memory and ports can all be labeled. There are commented out lines in xen.te policy for examples on how to label devices. Device Labeling --------------- The "lspci -vvn" command can be used to output all the devices and identifiers associated with them. For example, to label an Intel e1000e ethernet card the lspci output is.. 
00:19.0 0200: 8086:10de (rev 02) Subsystem: 1028:0276 Interrupt: pin A routed to IRQ 33 Region 0: Memory at febe0000 (32-bit, non-prefetchable) [size=128K] Region 1: Memory at febd9000 (32-bit, non-prefetchable) [size=4K] Region 2: I/O ports at ecc0 [size=32] Kernel modules: e1000e The labeling can be done with these commands pirqcon 33 system_u:object_r:nicP_t iomemcon 0xfebe0-0xfebff system_u:object_r:nicP_t iomemcon 0xfebd9 system_u:object_r:nicP_t ioportcon 0xecc0-0xecdf system_u:object_r:nicP_t pcidevicecon 0xc800 system_u:object_r:nicP_t The PCI device label must be computed as the 32-bit SBDF number for the PCI device. It the PCI device is aaaa:bb:cc.d or bb:cc.d, then the SBDF can be calculated using: SBDF = (a << 16) | (b << 8) | (c << 3) | d The AVC denials for IRQs, memory, ports, and PCI devices will normally contain the ranges being denied to more easily determine what resources are required. When running in permissive mode, only the first denial of a given source/destination is printed to the log, so labeling devices using this method may require multiple passes to find all required ranges. Additional notes on XSM:FLASK ----------------------------- 1) xen command line parameters a) flask_enforcing The default value for flask_enforcing is '0'. This parameter causes the platform to boot in permissive mode which means that the policy is loaded but not enforced. This mode is often helpful for developing new systems and policies as the policy violations are reported on the xen console and may be viewed in dom0 through 'xl dmesg'. To boot the platform into enforcing mode, which means that the policy is loaded and enforced, append 'flask_enforcing=1' on the grub line. This parameter may also be changed through the flask hypercall. b) flask_enabled The default value for flask_enabled is '1'. This parameter causes the platform to enable the FLASK security module under the XSM framework. The parameter may be enabled/disabled only once per boot. If the parameter is set to '0', only a reboot can re-enable flask. When flask_enabled is '0' the DUMMY module is enforced. This parameter may also be changed through the flask hypercall. But may only be performed once per boot. xen-4.4.0/docs/misc/console.txt0000664000175000017500000001010412307313555014525 0ustar smbsmbXen PV Console notes ------------------------------------------------------------------------ Stefano Stabellini stefano.stabellini@eu.citrix.com Xen traditionally provided a single pv console to pv guests, storing the relevant information in xenstore under /local/domain/$DOMID/console. Now many years after the introduction of the pv console we have multiple pv consoles support for pv and hvm guests; multiple pv console backends (qemu and xenconsoled) and emulated serial cards too. This document tries to describe how the whole system works and how the different components interact with each others. The first PV console path in xenstore remains: /local/domain/$DOMID/console the other PV consoles follow the conventional xenstore device path and live in: /local/domain/$DOMID/device/console/$DEVID. The output of a PV console, whether it should be a file, a pty, a socket, or something else, is specified by the toolstack in the xenstore node "output", under the relevant console section. For example: # xenstore-read /local/domain/26/device/console/1/output pty The backend chosen for a particular console is specified by the toolstack in the "type" node on xenstore, under the relevant console section. 
For example: # xenstore-read /local/domain/26/console/1/type xenconsoled The supported values are only xenconsoled or ioemu; xenconsoled has several limitations: it can only be used for the first PV console and it can only have a pty as output. If the output is a pty, backends write the device name to the "tty" node in xenstore under the relevant console path. Emulated serials are provided by qemu-dm only to hvm guests; the number of emulated serials depends on how many "-serial" command line options are given to qemu. The output of a serial is specified as argument to the -serial command line option to qemu. Qemu writes the tty name to xenstore in the following path: /local/domain/$DOMID/serial/$SERIAL_NUM/tty xenconsole is the tool to connect to a PV console or an emulated serial that has a pty as output. Xenconsole takes a domid as parameter plus an optional console type (pv for PV consoles or serial for emulated serials) and console number. Depending on the type and console number, xenconsole will look for the tty node in different xenstore paths, as described above. If the user doesn't specify the console type xenconsole will try to guess: if the guest is a pv guest it defaults to PV console, if the guest is an hvm guest it defaults to emulated serial. By default xl creates a pv console for hvm guests, plus an emulated serial if the user specified 'serial = "pty"' in the VM config file. Considering that xenconsole defaults to emulated serials for hvm guests, executing xl create -c "domain" causes xenconsole to attach to the emulated serial tty. This is most probably what the user wanted because currently no bootloaders support xen pv consoles so the only way to interact with a bootloader like grub over a console is to use the emulated serial. However the pv console is still easy to use with Linux PV on HVM guests: the user just need to pass "console=hvc0" to the kernel command line and then execute "xl console -t pv " to connect to it. When using stubdoms the serial cards are still emulated by qemu (this time running in the stubdom), the number of serial cards and where the output goes is still specified using qemu command line options. The difference is that for each emulated serial card there must be a pv console connection between the stubdom and dom0 to export the serial output from the stubdom to dom0. The pv console backend for stubdom's pv consoles is always ioemu because multiple pv consoles support is a requirement in this case, considering that minios has its own pv console too. In order to simplify the setup when using stubdoms the hvm guest can only have one pv console with xenstored as backend (the stubdom could provide pv console backends to the hvm guest but then it would need another pv console connection for each console backend to export the pty to dom0). xen-4.4.0/docs/misc/pci-device-reservations.txt0000664000175000017500000000534012307313555017623 0ustar smbsmbPCI vendor ID 0x5853 has been reserved for use by Xen systems in order to advertise certain virtual hardware to guest virtual machines. The primary use of this is with device ID 0x0001 to advertise the Xen Platform PCI device - the presence of this virtual device enables a guest Operating System (subject to the availability of suitable drivers) to make use of paravirtualisation features such as disk and network devices etc. Some Xen vendors wish to provide alternative and/or additional guest drivers that can bind to virtual devices[1]. 
This may be done using the Xen PCI vendor ID of 0x5853 and Xen-vendor/device specific PCI device IDs. This file records reservations made within the device ID range in order to avoid multiple Xen vendors using conflicting IDs. Guidelines 1. A vendor may request a range of device IDs by submitting a patch to this file. 2. Vendor allocations should be in the range 0xc000-0xfffe to reduce the possibility of clashes with community IDs assigned from the bottom up. 3. The vendor is responsible for allocations within the range and should try to record specific device IDs in PCI ID databases such as http://pciids.sourceforge.net and http//www.pcidatabase.com Reservations ============ range | vendor/product --------------+-------------------------------------------------------------- 0x0001 | (Xen Platform PCI device) 0x0002 | Citrix XenServer (grandfathered allocation for XenServer 6.1) 0xc000-0xc0ff | Citrix XenServer 0xc100-0xc1ff | Citrix XenClient [1] Upstream QEMU provides a parameterized device called xen-pvdevice that can be used to host guest drivers. Execute: qemu-system-i386 -device xen-pvdevice,help for a list of all parameters. The following parameters are relevant to driver binding: vendor-id (default 0x5853): The PCI vendor ID and subsystem vendor ID of the device. device-id (must be specified): The PCI device ID and subsystem device ID of the device. revision (default 0x01): The PCI revision of the device Also the size parameter (default 0x400000) can be used to specify the size of the single MMIO BAR that the device exposes. This area may be used by drivers for mapping grant tables, etc. Note that the presence of the Xen Platform PCI device is generally a pre-requisite for an additional xen-pvdevice as it is the platform device that provides that IO ports necessary for unplugging emulated devices. See hvm-emulated-unplug.markdown for details of the IO ports and unplug protocol. libxl provides support for creation of a single additional xen-pvdevice. See the vendor_device parameter in xl.cfg(5). xen-4.4.0/docs/misc/dump-core-format.txt0000664000175000017500000002621712307313555016260 0ustar smbsmbxen dump-core format Written by Isaku Yamahata Feb. 2007 Introduction ------------ With xm dump-core command, the guest domain's core can be created as a file. Its format was changed to be based on ELF format because elf format is easily extensible and handy. This document describes the new format. In this document the new format is called new xen dump-core format, xen dump-core format or simply dump-core format. The file of xen dump-core format is called xen dump-core file or dump-core file. The usual process core file includes program headers and no section header. On the other hand the xen dump-core file includes no program headers and some sections because of its peculiar requirements. Reference --------- For ELF format itself, see Tool Interface Standard(TIS) Executable and Linking Format(ELF) Specification version 1.2. For xen related structure, please see the xen header files. Elf header ---------- The elf header members are set as follows e_ident[EI_CLASS] = ELFCLASS64 = 2 e_ident[EI_OSABI] = ELFOSABI_SYSV = 0 e_type = ET_CORE = 4 ELFCLASS64 is always used independent of architecture. e_ident[EI_DATA] is set as follows For x86 PV domain case, it is set according to the guest configuration (i.e. if guest is 32bit it is set to EM_386 even when the dom0 is 64 bit.) 
For other domain case (x86 HVM domain case and ia64 domain case), it is set according to the dumping system's architecture. e_flags is set according to the dumping system's architecture. Other members are set as usual. Sections -------- Currently the following sections are defined. Some sections are optional. ".note.Xen" section name ".note.Xen" type SHT_NOTE description This section is used as note section to store xen dump-core file specific informations. The each detailed informations are described in note section. This section must exist. ".xen_prstatus" section name ".xen_prstatus" type SHT_PROGBITS structure array of vcpu_guest_context_t description This section stores the array of vcpu_guest_context_t which is obtained by XEN_DOMCTL_getvcpucontext hypercall when the xen dump-core file is created. The size of array is stored in xch_nr_vcpus member of header note descriptor in .note.Xen note section. This section must exist. ".xen_shared_info" section name ".xen_shared_info" type SHT_PROGBITS structure shared_info_t description This section stores the contents of shared info page of a domain. This section is optional. ".xen_p2m" section name ".xen_p2m" type SHT_PROGBITS structure array of struct xen_dumpcore_p2m struct xen_dumpcore_p2m { uint64_t pfn; uint64_t gmfn; }; description This elements represents the frame number of the page in .xen_pages section. pfn: guest-specific pseudo-physical frame number gmfn: machine physical frame number The size of arrays is stored in xch_nr_pages member of header note descriptor in .note.Xen note section. The entries are stored in pfn-ascending order. The value, {~(uint64_t)0, ~(uint64_t)0}, means invalid (pfn, gmfn) and the corresponding page has zero. There might exist invalid (pfn, gmfn)'s at the end part of this array. This section must exist when the domain is non auto translated physmap mode. Currently x86 paravirtualized domain. ".xen_pfn" section name ".xen_pfn" type SHT_PROGBITS structure array of uint64_t description This elements represents the frame number of the page in .xen_pages section. The size of arrays is stored in xch_nr_pages member of header note descriptor in .note.Xen note section. The entries are stored in ascending order. The value, ~(uint64_t)0, means invalid pfn and the corresponding page has zero. There might exist invalid pfn's at the end part of this array. This section must exist when the domain is auto translated physmap mode. Currently x86 full virtualized domain and ia64 domain. ".xen_pages" section name ".xen_pages" type SHT_PROGBITS structure array of page where page is page size byte array description This section includes the contents of pages. The corresponding address is described in .xen_p2m section or .xen_pfn section. The page size is stored in xch_page_size member of header note descriptor in .note.Xen section. The array size is stored in xch_nr_pages member of header note descriptor in .note.Xen section. This section must exist. ".xen_ia64_mapped_regs" section name ".xen_ia64_mapped_regs" type SHT_PROGBITS structure array of mapped_regs_t description This section stores the array of mapped_regs_t. The size of array is stored in xch_nr_vcpus member of header note descriptor in .note.Xen note section. This section is ia64 specific and must exist for ia64 PV domain. This section must not exist for non-ia64 domain or ia64 HVM domain. note section ------------ The note types are defined in xen/include/public/elfnote.h. 
The note descriptors are defined in tools/libxc/xc_core.h Currently the following note informations are defined. elf note section "Xen" is used as elf note name in elf note info namesz 4 name "Xen" (null-terminated) Descriptors none note descriptor type XEN_ELFNOTE_DUMPCORE_NONE = 0x2000000 structure struct xen_dumpcore_elfnote_none_desc { /* nothing is defined */ }; description This note descriptor is defined to just indicate that this file is xen dump-core format without any specific information. This note information must exist. header note descriptor type XEN_ELFNOTE_DUMPCORE_HEADER = 0x2000001 structure struct xen_dumpcore_elfnote_header_desc { uint64_t xch_magic; uint64_t xch_nr_vcpus; uint64_t xch_nr_pages; uint64_t xch_page_size; }; description This note descriptor stores basic information of the domain. xch_magic magic number XC_CORE_MAGIC = 0xF00FEBED for paravirtualized domain XC_CORE_MAGIC_HVM = 0xF00FEBEE for full virtualized domain xch_nr_vcpus the number of vcpus xch_nr_pages the number of pages xch_page_size guest OS's page size This note information must exist. xen_version descriptor type XEN_ELFNOTE_DUMPCORE_XEN_VERSION = 0x2000002 structure struct xen_dumpcore_elfnote_xen_version_desc { uint64_t major_version; uint64_t minor_version; xen_extraversion_t extra_version; xen_compile_info_t compile_info; xen_capabilities_info_t capabilities; xen_changeset_info_t changeset; xen_platform_parameters_t platform_parameters; uint64_t pagesize; }; description This note descriptor stores basic information about xen hypervisor. The each members store the result of __HYPERVISOR_xen_version hypercall. major_version 16msb bit of the result of XENVER_version minor_version 16lsb bit of the result of XENVER_version uint64_t is used to make struct xen_dumpcore_elfnote_xen_version_desc independent on 32bit/64bit instead of uint32_t. extra_version the result of XENVER_extraversion compile_info the result of XENVER_compile_info capabilities the result of XENVER_capabilities changeset the result of XENVER_changeset platform_parameters the result of XENVER_platform_parameters pagesize the result of XENVER_pagesize This note information must exist. format_version descriptor type XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION = 0x2000003 structure struct xen_dumpcore_elfnote_format_version_desc { uint64_t version; }; description This note descriptor stores xen dump-core format version. The 32msb bit is major version and the 32lsb bit is minor version. The minor version will be incremented when the format is changed in compatible way. e.g. new sections, new note descriptors are added. the major version will be incremented when the format is changed in incompatible way. This note information must exit. Analysis tools should check this format version. This note information must exist. Format version history ---------------------- Currently only (major, minor) = (0, 1) is used. [When the format is changed, it would be described here.] (0, 1) update - .xen_p2m, .xen_pfn section Invalid pfn/gmfn. - .xen_p2m, .xen_pfn section Arrays must be in pfn ascending order for efficient looking up. - EI_CLASS member of elf header was changed to ELFCLASS64 independent of architecture. This is mainly for x86_32pae. The format version isn't bumped because analysis tools can distinguish it. - .xen_ia64_mapped_regs section was made only for ia64 PV domain. In case of IA64 HVM domain, this section doesn't exist. - elf header e_ident[EI_DATA] On x86 PV domain case, it is set according to the guest configuration. I.e. 
32-on-64 case, the file will be set EM_386 instead of EM_X86_64. This is the
same as the 32-on-32 case, so there is no impact on analysis tools.
xen-4.4.0/docs/misc/distro_mapping.txt0000664000175000017500000000250012307313555016103 0ustar smbsmbWith directory layout differences between Red Hat, Debian, Suse and
other distros one needs to set the variables for the elements below

-----------------+------------------+---------------+----------------+
                 | Red Hat          | Debian        | Suse           |
-----------------+------------------+---------------+----------------+
CONFIG_LEAF_DIR  | sysconfig        | default       | sysconfig      |
SUBSYS_DIR       | /var/run/subsys  | /var/run      | /var/run       |
INITD_DIR        | /etc/rc.d/init.d | /etc/init.d   | /etc/init.d    |
-----------------+------------------+---------------+----------------+

The existence of these directories is tested at build-time (on the
build host, via the "setvar_dir" macro in Config.mk) and for some
scripts at run-time. If the Red Hat directory exists, it is used;
otherwise the Debian one is used. You can override this by setting the
variables in the environment or your ".config" (which is included by
.config).

To add support for new distributions that don't use the above
locations, one must grep for the above elements and add appropriate
checks. For example, if a new distro uses /etc/bork as its config dir,
it's not sufficient to set CONFIG_LEAF_DIR=bork; one must also add
tests for the existence of the bork dir in every context where config
files are read.
xen-4.4.0/docs/misc/coverage.markdown0000664000175000017500000000551712307313555015675 0ustar smbsmb# Coverage support for Xen

Coverage support allows you to get coverage information from Xen
execution. You can see how many times a line is executed.

Some compilers have specific options that enable the collection of this
information. Every basic block in the code will be instrumented by the
compiler to compute these statistics. It should not be used in production
as it slows down your hypervisor.

## Enable coverage

Test coverage support can be turned on by compiling Xen with the
`coverage` option set to `y`. Something like:

    cd xen
    make coverage=y

(or change your `.config` file).

## Extract coverage data

The way GCC and other tools deal with coverage information is to use some
files created during the build phase (.gcno) and some files produced by
executing the *program* (.gcda). The program in this case is Xen, but Xen
cannot write files, so the way to use coverage with Xen is to extract the
coverage data from Xen and then split this information into files.

To extract data you use a simple utility called `xencov`. Mainly `xencov`
allows you to do 3 operations:

* `xencov read` extract data
* `xencov reset` reset all coverage counters
* `xencov read-reset` extract data and reset counters at the same time.

Another utility (`xencov_split`) is used to split the extracted data file
into the files needed by userspace tools.

## Split coverage data

Once you have extracted data from Xen, it is time to create files which
the coverage tools can understand. To do it you need to run the
`xencov_split` utility.

The utility just takes an input file and splits the blob into gcc .gcda
files in the same directory in which you execute the script. As file names
are generated relative to the current directory, it could be a good idea
to run the script from `/` on your build machine.
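As a quick sanity check (the build path below is only an example and assumes
the tree was built in `~/xen-unstable`), the split should leave the generated
counter files next to the corresponding `.gcno` files in the build tree:

    find ~/xen-unstable/xen -name '*.gcda' | head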
Code for splitting the blob is put in another utility for some reason: * It is simpler to maintain a high level script than a C program; * You don't need to execute on the Xen host so you just need to copy the file to your development box (you usually need development files anyway). ## Possible use **This section is just an example on how to use these tools!** This example assumes you compiled Xen from `~/xen-unstable` and installed into the host. **Consider that if you even recompile Xen you are not able to use blob extracted from xencov!** * Ensure the `lcov` package is installed * From the Xen host machine extract the coverage blob cd /root xencov read coverage.dat * Copy the extracted blob to your dev machine cd ~ scp root@myhost:coverage.dat * Extract the coverage information (cd / && xencov_split ~/coverage.dat) * Produce coverage html output cd ~/xen-unstable rm -rf cov.info cov geninfo -o cov.info xen mkdir cov genhtml -o cov cov.info * See output in a browser firefox cov/index.html xen-4.4.0/docs/misc/xl-numa-placement.markdown0000664000175000017500000002147612307313555017433 0ustar smbsmb# Guest Automatic NUMA Placement in libxl and xl # ## Rationale ## NUMA (which stands for Non-Uniform Memory Access) means that the memory accessing times of a program running on a CPU depends on the relative distance between that CPU and that memory. In fact, most of the NUMA systems are built in such a way that each processor has its local memory, on which it can operate very fast. On the other hand, getting and storing data from and on remote memory (that is, memory local to some other processor) is quite more complex and slow. On these machines, a NUMA node is usually defined as a set of processor cores (typically a physical CPU package) and the memory directly attached to the set of cores. The Xen hypervisor deals with NUMA machines by assigning to each domain a "node affinity", i.e., a set of NUMA nodes of the host from which they get their memory allocated. Also, even if the node affinity of a domain is allowed to change on-line, it is very important to "place" the domain correctly when it is fist created, as the most of its memory is allocated at that time and can not (for now) be moved easily. NUMA awareness becomes very important as soon as many domains start running memory-intensive workloads on a shared host. In fact, the cost of accessing non node-local memory locations is very high, and the performance degradation is likely to be noticeable. For more information, have a look at the [Xen NUMA Introduction][numa_intro] page on the Wiki. ### Placing via pinning and cpupools ### The simplest way of placing a domain on a NUMA node is statically pinning the domain's vCPUs to the pCPUs of the node. This goes under the name of CPU affinity and can be set through the "cpus=" option in the config file (more about this below). Another option is to pool together the pCPUs spanning the node and put the domain in such a cpupool with the "pool=" config option (as documented in our [Wiki][cpupools_howto]). In both the above cases, the domain will not be able to execute outside the specified set of pCPUs for any reasons, even if all those pCPUs are busy doing something else while there are others, idle, pCPUs. So, when doing this, local memory accesses are 100% guaranteed, but that may come at he cost of some load imbalances. ### NUMA aware scheduling ### If the credit scheduler is in use, the concept of node affinity defined above does not only apply to memory. 
In fact, starting from Xen 4.3, the scheduler always tries to run the domain's vCPUs on one of the nodes in its node affinity. Only if that turns out to be impossible, it will just pick any free pCPU. This is, therefore, something more flexible than CPU affinity, as a domain can still run everywhere, it just prefers some nodes rather than others. Locality of access is less guaranteed than in the pinning case, but that comes along with better chances to exploit all the host resources (e.g., the pCPUs). In fact, if all the pCPUs in a domain's node affinity are busy, it is possible for the domain to run outside of there, but it is very likely that slower execution (due to remote memory accesses) is still better than no execution at all, as it would happen with pinning. For this reason, NUMA aware scheduling has the potential of bringing substantial performances benefits, although this will depend on the workload. ## Guest placement in xl ## If using xl for creating and managing guests, it is very easy to ask for both manual or automatic placement of them across the host's NUMA nodes. Note that xm/xend does a very similar thing, the only differences being the details of the heuristics adopted for automatic placement (see below), and the lack of support (in both xm/xend and the Xen versions where that\ was the default toolstack) for NUMA aware scheduling. ### Placing the guest manually ### Thanks to the "cpus=" option, it is possible to specify where a domain should be created and scheduled on, directly in its config file. This affects NUMA placement and memory accesses as the hypervisor constructs the node affinity of a VM basing right on its CPU affinity when it is created. This is very simple and effective, but requires the user/system administrator to explicitly specify affinities for each and every domain, or Xen won't be able to guarantee the locality for their memory accesses. Notice that this also pins the domain's vCPUs to the specified set of pCPUs, so it not only sets the domain's node affinity (its memory will come from the nodes to which the pCPUs belong), but at the same time forces the vCPUs of the domain to be scheduled on those same pCPUs. ### Placing the guest automatically ### If no "cpus=" option is specified in the config file, libxl tries to figure out on its own on which node(s) the domain could fit best. If it finds one (some), the domain's node affinity get set to there, and both memory allocations and NUMA aware scheduling (for the credit scheduler and starting from Xen 4.3) will comply with it. It is worthwhile noting that optimally fitting a set of VMs on the NUMA nodes of an host is an incarnation of the Bin Packing Problem. In fact, the various VMs with different memory sizes are the items to be packed, and the host nodes are the bins. As such problem is known to be NP-hard, we will be using some heuristics. The first thing to do is find the nodes or the sets of nodes (from now on referred to as 'candidates') that have enough free memory and enough physical CPUs for accommodating the new domain. The idea is to find a spot for the domain with at least as much free memory as it has configured to have, and as much pCPUs as it has vCPUs. After that, the actual decision on which candidate to pick happens accordingly to the following heuristics: * candidates involving fewer nodes are considered better. 
In case two (or more) candidates span the same number of nodes, * candidates with a smaller number of vCPUs runnable on them (due to previous placement and/or plain vCPU pinning) are considered better. In case the same number of vCPUs can run on two (or more) candidates, * the candidate with with the greatest amount of free memory is considered to be the best one. Giving preference to candidates with fewer nodes ensures better performance for the guest, as it avoid spreading its memory among different nodes. Favoring candidates with fewer vCPUs already runnable there ensures a good balance of the overall host load. Finally, if more candidates fulfil these criteria, prioritizing the nodes that have the largest amounts of free memory helps keeping the memory fragmentation small, and maximizes the probability of being able to put more domains there. ## Guest placement in libxl ## xl achieves automatic NUMA placement because that is what libxl does by default. No API is provided (yet) for modifying the behaviour of the placement algorithm. However, if your program is calling libxl, it is possible to set the `numa_placement` build info key to `false` (it is `true` by default) with something like the below, to prevent any placement from happening: libxl_defbool_set(&domain_build_info->numa_placement, false); Also, if `numa_placement` is set to `true`, the domain must not have any CPU affinity (i.e., `domain_build_info->cpumap` must have all its bits set, as it is by default), or domain creation will fail returning `ERROR_INVAL`. Starting from Xen 4.3, in case automatic placement happens (and is successful), it will affect the domain's node affinity and _not_ its CPU affinity. Namely, the domain's vCPUs will not be pinned to any pCPU on the host, but the memory from the domain will come from the selected node(s) and the NUMA aware scheduling (if the credit scheduler is in use) will try to keep the domain there as much as possible. Besides than that, looking and/or tweaking the placement algorithm search "Automatic NUMA placement" in libxl\_internal.h. Note this may change in future versions of Xen/libxl. ## Xen < 4.3 ## As NUMA aware scheduling is a new feature of Xen 4.3, things are a little bit different for earlier version of Xen. If no "cpus=" option is specified and Xen 4.2 is in use, the automatic placement algorithm still runs, but the results is used to _pin_ the vCPUs of the domain to the output node(s). This is consistent with what was happening with xm/xend, which were also affecting the domain's CPU affinity. On a version of Xen earlier than 4.2, there is not automatic placement at all in xl or libxl, and hence no node or CPU affinity being affected. ## Limitations ## Analyzing various possible placement solutions is what makes the algorithm flexible and quite effective. However, that also means it won't scale well to systems with arbitrary number of nodes. For this reason, automatic placement is disabled (with a warning) if it is requested on a host with more than 16 NUMA nodes. 
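As a concrete illustration of the manual placement described above (the host
layout is hypothetical: two nodes, with pCPUs 0-3 on node 0 and pCPUs 4-7 on
node 1), a domain config fragment such as the following pins the guest's vCPUs
to node 0, and therefore also sets its node affinity to node 0:

    vcpus  = 2
    memory = 1024
    cpus   = "0-3"

If the `cpus=` line is omitted, the automatic placement described in this
document is used instead.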
[numa_intro]: http://wiki.xen.org/wiki/Xen_NUMA_Introduction [cpupools_howto]: http://wiki.xen.org/wiki/Cpupools_Howto xen-4.4.0/docs/misc/libxl_memory.txt0000664000175000017500000001040112307313555015565 0ustar smbsmb/* === Domain memory breakdown: HVM guests ================================== + +----------+ + | | shadow | | | +----------+ | overhead | | extra | | | | external | | | +----------+ + | | | extra | | | | | internal | | | + +----------+ + | | footprint | | video | | | | | +----------+ + + | | xen | | | | | | | actual | maximum | | | | | | | target | | | | guest | | | build | | | | | | | | start | | | static | | | | | | | | maximum | +----------+ | + + + + | | | | | | | | | | balloon | | build | | | | maximum | | | | + +----------+ + extra internal = LIBXL_MAXMEM_CONSTANT extra external = LIBXL_HVM_EXTRA_MEMORY shadow = libxl_domain_build_info.shadow_memkb static maximum = libxl_domain_build_info.max_memkb video = libxl_domain_build_info.video_memkb build start = libxl_domain_build_info.target_memkb libxl_domain_setmaxmem -> xen maximum libxl_set_memory_target -> actual target === Domain memory breakdown: PV guests ================================== + +----------+ + overhead | | extra | | | | external | | | +----------+ + | | | extra | | | | | internal | | | + +----------+ + + + | | footprint | | | | | | | xen | | | | | | | actual | maximum | | | guest | | | build | target | | | | | | | start | | | static | | | | | | | | maximum | +----------+ | + + + + | | | | | | | | | | balloon | | build | | | | maximum | | | | + +----------+ + extra internal = LIBXL_MAXMEM_CONSTANT extra external = LIBXL_PV_EXTRA_MEMORY static maximum = libxl_domain_build_info.max_memkb build start = libxl_domain_build_info.target_memkb libxl_domain_setmaxmem -> xen maximum libxl_set_memory_target -> actual target ========================================================================= */ xen-4.4.0/docs/misc/hvm-emulated-unplug.markdown0000664000175000017500000000607012307313555017775 0ustar smbsmb#Xen HVM emulated device unplug protocol The protocol covers three basic things: * Disconnecting emulated devices. * Getting log messages out of the drivers and into dom0. * Allowing dom0 to block the loading of specific drivers. This is intended as a backwards-compatibility thing: if we discover a bug in some old version of the drivers, then rather than working around it in Xen, we have the option of just making those drivers fall back to emulated mode. The current protocol works like this (from the point of view of drivers): 1. When the drivers first come up, they check whether the unplug logic is available by reading a two-byte magic number from IO port `0x10`. These should be `0x49d2`. If the magic number doesn't match, the drivers don't do anything. 2. The drivers read a one-byte protocol version from IO port `0x12`. If this is 0, skip to 6. 3. The drivers write a two-byte product number to IO port `0x12`. At the moment, the only drivers using this protocol are our closed-source ones, which use product number 1. 4. The drivers write a four-byte build number to IO port `0x10`. 5. The drivers check the magic number by reading two bytes from `0x10` again. If it's changed from `0x49d2` to `0xd249`, the drivers are blacklisted and should not load. 6. The drivers write a two-byte bitmask of devices to unplug to IO port `0x10`. 
The defined fields are: * `1` -- All IDE disks (not including CD drives) * `2` -- All emulated NICs * `4` -- All IDE disks except for the primary master (not including CD drives) The relevant emulated devices then disappear from the relevant buses. For most guest operating systems, you want to do this before device enumeration happens. Once the drivers have checked the magic number, they can send log messages to qemu which will be logged to wherever qemu's logs go (`/var/log/xen/qemu-dm.log` on normal Xen, dom0 syslog on XenServer). These messages are written to IO port `0x12` a byte at a time, and are terminated by newlines. There's a fairly aggressive rate limiter on these messages, so they shouldn't be used for anything even vaguely high-volume, but they're rather useful for debugging and support. It is still permitted for a driver to use this logging feature if it is blacklisted, but *ONLY* if it has checked the magic number and found it to be `0x49d2` or `0xd249`. This isn't exactly a pretty protocol, but it does solve the problem. The blacklist is, from qemu's point of view, handled mostly through xenstore. A driver version is considered to be blacklisted if `/mh/driver-blacklist/{product_name}/{build_number}` exists and is readable, where `{build_number}` is the build number from step 4 as a decimal number. `{product_name}` is a string corresponding to the product number in step 3. The master registry of product names and numbers is in xen/include/public/hvm/pvdrivers.h. NOTE: The IO ports implementing the unplug protocol are implemented as part of the Xen Platform PCI Device, so if that device is not present in the system then this protocol will not work. xen-4.4.0/docs/misc/xenpaging.txt0000664000175000017500000000273312307313555015054 0ustar smbsmbWarning: The xenpaging code is new and not fully debugged. Usage of xenpaging can crash Xen or cause severe data corruption in the guest memory and its filesystems! Description: xenpaging writes memory pages of a given guest to a file and moves the pages back to the pool of available memory. Once the guests wants to access the paged-out memory, the page is read from disk and placed into memory. This allows the sum of all running guests to use more memory than physically available on the host. Requirements: xenpaging relies on Intel EPT or AMD RVI, other hardware is not supported. Only HVM guests are supported. The dom0 kernel needs paging-aware backend drivers to handle paged granttable entries. Currently only dom0 kernels based on classic Xen Linux support this functionality. Usage: Up to now xenpaging is not integrated into libxl/xend, so it has to be started manually for each guest. Once the guest is running, run xenpaging with the guest_id and the path to the pagefile: /usr/lib/xen/bin/xenpaging -f /path/to/page_file -d dom_id & Once xenpaging runs it needs a memory target, which is the memory footprint of the guest. This value (in KiB) must be written manually to xenstore. The following example sets the target to 512MB: xenstore-write /local/domain//memory/target-tot_pages $((1024*512)) Now xenpaging tries to page-out as many pages to keep the overall memory footprint of the guest at 512MB. 
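As a consolidated example (the domain id 3, the pagefile path and the 256MB
target are all hypothetical), the whole sequence for one guest would look
like:

  /usr/lib/xen/bin/xenpaging -f /var/lib/xen/xenpaging/domain-3-pagefile -d 3 &
  xenstore-write /local/domain/3/memory/target-tot_pages $((1024*256))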
Todo: - integrate xenpaging into libxl # vim: tw=72 xen-4.4.0/docs/misc/sedf_scheduler_mini-HOWTO.txt0000664000175000017500000000271412307313555017764 0ustar smbsmbsEDF scheduler -------------- Author: Stephan.Diestelhorst@{cl.cam.ac.uk, inf.tu-dresden.de} Overview: This scheduler provides weighted CPU sharing in an intuitive way and uses realtime-algorithms to ensure time guarantees. Usage: -add "sched=sedf" on Xen's boot command-line -create domains as usual -use "xm sched-sedf " Where: -period/slice are the normal EDF scheduling parameters in nanosecs -latency-hint is the scaled period in case the domain is doing heavy I/O (unused by the currently compiled version) -extra is a flag (0/1), which controls whether the domain can run in extra-time -weight is mutually exclusive with period/slice and specifies another way of setting a domains cpu slice Examples: normal EDF (20ms/5ms): xm sched-sedf 20000000 5000000 0 0 0 best-effort domains (i.e. non-realtime): xm sched-sedf 20000000 0 0 1 0 normal EDF (20ms/5ms) + share of extra-time: xm sched-sedf 20000000 5000000 0 1 0 4 domains with weights 2:3:4:2 xm sched-sedf 0 0 0 0 2 xm sched-sedf 0 0 0 0 3 xm sched-sedf 0 0 0 0 4 xm sched-sedf 0 0 0 0 2 1 fully-specified (10ms/3ms) domain, 3 other domains share available rest in 2:7:3 ratio: xm sched-sedf 10000000 3000000 0 0 0 xm sched-sedf 0 0 0 0 2 xm sched-sedf 0 0 0 0 7 xm sched-sedf 0 0 0 0 3 xen-4.4.0/docs/misc/xend.tex0000664000175000017500000003775712307313555014031 0ustar smbsmb% -*- mode: LaTeX -*- \def\seca{\chapter} \def\secb{\section} \def\secc{\subsection} \def\secd{\subsubsection} \def\refa{chapter} \def\refb{section} \def\refc{section} \def\refd{section} %\def\seca{\section} %\def\secb{\subsection} %\def\secc{\subsubsection} %\def\refa{section} %\def\refb{section} %\def\refc{section} \documentclass[11pt,twoside,final,openright]{report} \usepackage{a4,graphicx,setspace} \setstretch{1.15} \begin{document} % TITLE PAGE \pagestyle{empty} \begin{center} \vspace*{\fill} \includegraphics{figs/xenlogo.eps} \vfill \vfill \vfill \begin{tabular}{l} {\Huge \bf Xend} \\[4mm] {\huge Xen v2.0 for x86} \\[80mm] {\Large Xen is Copyright (c) 2004, The Xen Team} \\[3mm] {\Large University of Cambridge, UK} \\[20mm] {\large Last updated 30 August 2004} \end{tabular} \vfill \end{center} \cleardoublepage % TABLE OF CONTENTS \pagestyle{plain} \pagenumbering{roman} { \parskip 0pt plus 1pt \tableofcontents } \cleardoublepage % PREPARE FOR MAIN TEXT \pagenumbering{arabic} \raggedbottom \widowpenalty=10000 \clubpenalty=10000 \parindent=0pt \renewcommand{\topfraction}{.8} \renewcommand{\bottomfraction}{.8} \renewcommand{\textfraction}{.2} \renewcommand{\floatpagefraction}{.8} \setstretch{1.15} \seca{Introduction} Xend is the control daemon used to manage a machine running the Xen hypervisor. Xend is responsible for creating and destroying domains and managing their resources, such as virtual block devices and virtual network interfaces. Xend exists because the Xen hypervisor itself only manages the memory image of a domain and its scheduling. Xen provides the event channels that connect a domain to its devices, but is intentionally not involved in setting them up. Xend runs as a daemon in the privileged domain 0 and uses a low-level api to communicate with Xen via the domain 0 kernel. Xend exports its control interface to its clients using HTTP. Most programming languages have HTTP client libraries, so this interface can be used from most popular languages, for example Python, Perl, C, Java. 
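As a minimal illustration (assuming xend is listening on its default port
8000 on the local machine), the list of domains can be fetched with any HTTP
client, for example:
\begin{verbatim}
curl -H 'Accept: application/sxp' http://localhost:8000/xend/domain
\end{verbatim}
The full set of requests and their parameters is described later in this
document.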
Xend itself is written in Python, as are most of the Xen tools. The xend interface is intended to be a complete interface for the creation and management of domains. It supports domain creation, shutdown, reboot, destruction, save, restore and migration. When xend creates a domain it creates the domain memory image and communicates with the device driver domain(s) to configure the devices for the domain. This sets up connections between the domain and backend device controllers in the driver domain. When a domain shuts down its memory image cannot be fully released unless its backend devices are released and disconnected. This is done by xend. In order to protect against loss of this information when xend is restarted, xend maintains a persistent database of domain configurations. This allows xend to be stopped and restarted without loss of configuration information. For example, in order to upgrade the xend software. \seca{Domain lifecycle} \secb{Domain creation} Xend is instructed to create a domain by posting a domain\_create message to it, containing the domain configuration to be instantiated. The domain configuration is in sxp format and is as far as possible {\em fully-bound}, that is, all parameters are fully-specified. The domain configuration is saved in the filesystem so that it can be reused later if necessary. The domain configuration specifies the domain name, memory size, kernel image and parameters, and all the domain devices. Xend uses the Xen api to create the domain memory image, and then {\em builds} the memory image for the domain using the kernel image. At this point the domain exists, but it cannot be run because it has no devices. Xend then communicates with the device driver domain to create the configured devices. Once the devices are created it sets up event channels for them between the driver domain and the new domain, and notifies the new domain that its devices are connected. At this point the domain can be started. Xend is also responsible for managing domain consoles. When a domain is created, xend sets up a console event channel to the domain, and creates a TCP listening port for the domain console. When a connection is accepted to the port, xend connects input and output for the port to the domain console channel. \secb{Domain destruction} When a domain terminates, because it has been shutdown or it has crashed, the domain resources must be released so that the domain memory image can be finally removed from xen. Xend monitors the domains, and is also signaled by xen (using a VIRQ) when a domain exits. Xend examines the domain states and determines which domains have exited. It then communicates with the driver domain to release the devices for exited domains. Xend also closes any open console connections and removes the TCP listeners for exited domains. Once all devices have been released it instructs xen to destroy the memory image. \secb{Domain restart} Domain restart is the xen equivalent of a machine reboot. When a domain exits because it has been shutdown in reboot mode, its exit code is reboot. When examining domains to find those that have exited and destroy them, xend detects those that have exited for reboot and does not completely destroy them. It disconnects all their devices, and detaches the console listener from its channel to the domain, but does not close it. Instead it schedules a call to rebuild the domain from its configuration. 
This proceeds almost identically to creating the domain, except that the console listener is reused and connected to the new domain. This allows existing console connections to remain connected across a domain restart. The restarted domain keeps the same name and domain id. The determination of when to restart a domain is in fact slightly more complex than described above. A domain is configured with a {\em restart mode}. If the restart mode is {\em onreboot}, the default, restart happens when the domain is shutdown normally and exits with code reboot. If the restart mode is {\em never} the domain is not restarted. If the restart mode is {\em always} the domain is always restarted, regardless of how it exited. In order to prevent continual domain crash causing restart loops, xend has a {\em minimum restart time}. Xend remembers when a domain was last restarted and will fail a restart that happens inside the minimum restart time. \seca{Devices} \secb{Virtual network interfaces} Each virtual network interface (vif) has 2 parts: the font-end device in its domain, and the back-end device in the driver domain. Usually the driver domain is domain 0, and there is a linux network device corresponding to the vif. The linux device for interface N on domain D is called vifD.N. When a packet is sent on the vif in the domain the packet is received from the linux device. The linux devices are connected to network interfaces using ethernet bridging. The default setup is a bridge xen-br0, with eth0 connected to it, and the routes for eth0 directed at xen-br0. This is controlled by the xend network setup script, default {\tt /etc/xen/network}, which is run when xend starts. When the vifs for a domain are created, a vif control script, default {\tt /etc/xen/vif-bridge}, is run to connect the vif to its bridge. The default script connects the vif to xen-br0 and optionally sets up iptables rules to prevent IP address spoofing. The bridge a vif is connected to can be defined in its configuration, and this is useful for setting up virtual networks using several bridges. \secb{Virtual block devices} Virtual block devices in a domain are interfaces onto back-end device drivers that export physical devices to domains. In the default configuration the back-end driver is in domain 0 and can export any linux block device to a domain. This includes physical disk partitions, LVM volumes and loopback mounts of files. In fact anything that linux can represent as a block device can be exported to a domain as virtual block device. \seca{Xend invocation} Xend is started (by root) using the command \begin{verbatim} xend start \end{verbatim} Xend can be stopped using \begin{verbatim} xend stop \end{verbatim} Xend must be started before any domains (apart from domain 0) can be created. If you try to use the {\tt xm} tool when xend is not running you will get a 'connection refused' message. \secb{Xend configuration} Xend reads its own configuration from {\tt /etc/xen/xend-config.sxp}, which is a sequence of s-expressions. The configuration parameters are: \begin{itemize} \item xend-port: Port xend should use for the HTTP interface (default 8000). \item xend-address: Address xend should listen on. Specifying 'localhost' prevents remote connections. Specifying the empty string '' allows all connections, and is the default. \item network-script: The script used to start/stop networking for xend (default network). \item vif-bridge: The default bridge that virtual interfaces should be connected to (default xen-br0). 
\item vif-script: The default script used to control virtual interfaces (default vif-bridge). \item vif-antispoof: Whether iptables should be set up to prevent IP spoofing for virtual interfaces (default yes). \end{itemize} Configuration scripts ({\it e.g.} for network-script) are looked for in {\tt /etc/xen} unless their name begins with '/'. Xend sends its log output to {\tt /var/log/xen/xend.log}. This is a rotating logfile, and logs are moved onto {\tt xend.log.1} {\it etc.} as they get large. Old logs may be deleted. \secb{Xend database} Xend needs to make some data persistent, and it uses files under {\tt /var/xen/xend-db} for this. The persistent data is stored in files in SXP format. Domain information is saved when domains are created. When xend starts it reads the file {\tt /var/xen/lastboot} (if it exists) to determine the last time the system was rebooted. It compares this time with the last reboot time in {\tt wtmp} to determine if the system has been rebooted since xend last ran. If the system has been rebooted xend removes all its saved data that is not persistent across reboots (for example domain data). \seca{Xend HTTP Interface} The xend interface uses HTTP 1.1 \cite{http} as its transport. Simple PUT and GET calls can encode parameters using the standard url-encoding for parameters: MIME type {\tt application/x-www-form-urlencoded}. When file upload is required, the {\tt multipart/form-data} encoding is used. See the HTML 4.1 specification for details \cite{html}. Xend functions as a webserver and supports two interfaces: one for web-browsers and one for programs. The web-browser interface returns replies in HTML and includes forms for interactive operations such as stopping domains and creating domains from an uploaded configuration. The programmatic interface usually returns replies in s-expression format. Both interfaces are accessed in exactly the same way over HTTP - the only difference is the data returned. The webserver distinguishes browsers from programs using the {\tt User-Agent} and {\tt Accept} headers in the HTTP request. If there is no {\tt User-Agent} or no {\tt Acccept} header, or {\tt Accept} includes the type {\tt application/sxp}, the webserver assumes the client is a program and returns SXP. Otherwise it assumes the client is a webserver and returns HTML. In some cases the return value is essentially a string, so {\tt Content-Type} {\tt text/plain} is returned. The HTTP api supported is listed below. All paths in it are relative to the server root, for example {\tt http://localhost:8000/xend}. As defined in the HTTP specification, we use GET for side-effect free operations that may safely be repeated, and POST for operations with side-effects. For each request we list the HTTP method (GET or POST), the url relative to the server root, the operation name and arguments (if any). The operation name is passed as request parameter 'op', and the arguments are passed by name. Operation name and parameters can be encoded using either encoding described above. We also list the corresponding api function from the Python client interface in {\tt xen.xend.XendClient}. \begin{itemize} \item {\tt GET /},\\ {\tt xend()}:\\ Get list of urls under xend root. \item {\tt GET /node},\\ {\tt xend\_node()}:\\ Get node information. 
\item {\tt POST /node shutdown()},\\ {\tt xend\_node\_shutdown()}:\\ Shutdown the node \item {\tt POST /node reboot()},\\ {\tt xend\_node\_reboot()}:\\ Reboot the node \item {\tt POST /node notify()}:\\ Set node notification url \item {\tt GET /node/dmesg},\\ {\tt xend\_node\_dmesg()}:\\ Get xen boot message. \item {\tt GET /node/log},\\ {\tt xend\_node\_log()}:\\ Get xend log. \item {\tt GET /domain}\\ {\tt xend\_domains()}:\\ Get list of domains. \item {\tt POST /domain create(config)},\\ {\tt xend\_domain\_create(config)}:\\ Create a domain. \item {\tt POST /domain restore(file)},\\ {\tt xend\_domain\_restore(filename)}:\\ Restore a saved domain. \item {\tt GET /domain/},\\ {\tt xend\_domain(dom)}:\\ Get domain information. \item {\tt POST /domain/[dom] configure(config)},\\ {\tt xend\_domain\_configure(dom, conf)}:\\ Configure an existing domain (for internal use by restore and migrate). \item {\tt POST /domain/[dom] unpause()},\\ {\tt xend\_domain\_unpause(dom)}:\\ Start domain running \item {\tt POST /domain/[dom] pause()},\\ {\tt xend\_domain\_pause(dom)}:\\ Stop domain running. \item {\tt POST /domain/[dom] shutdown(reason)},\\ {\tt xend\_domain\_shutdown(dom, reason)}:\\ Shutdown domain, reason can be reboot, poweroff, halt. \item {\tt POST /domain/[dom] destroy(reason)},\\ {\tt xend\_domain\_destroy(dom, reason)}:\\ Destroy domain, reason can be reboot, halt. \item {\tt POST /domain/[dom] save(file)},\\ {\tt xend\_domain\_save(dom, filename)}:\\ Save a domain to a file. \item {\tt POST /domain/[dom] migrate(dst)},\\ {\tt xend\_domain\_migrate(dom, dst)}:\\ Migrate a domain. \item {\tt POST /domain/[dom] pincpu(cpu)},\\ {\tt xend\_domain\_pincpu(self, id, cpu)}\\: Pin a domain to a cpu. \item {\tt POST /domain/[dom] maxmem\_set(memory)},\\ {\tt xend\_domain\_maxmem\_set(dom, memory)}:\\ Set domain maximum memory limit. \item {\tt POST /domain/[dom] device\_create(config)}\\ {\tt xend\_domain\_device\_create(dom, config)}:\\ Add a device to a domain. \item {\tt POST /domain/[dom] device\_destroy(type, index)},\\ {\tt xend\_domain\_device\_destroy(dom, type, index)}:\\ Delete a device from a domain \item {\tt GET /domain/[dom] vifs()},\\ {\tt xend\_domain\_vifs(dom)}:\\ Get virtual network interfaces. \item {\tt GET /domain/[dom] vif(vif)},\\ {\tt xend\_domain\_vif(dom, vif)}:\\ Get virtual network interface. \item {\tt GET /domain/[dom] vbds()},\\ {\tt xend\_domain\_vbds(dom)}:\\ Get virtual block devices. \item {\tt GET /domain/[dom] vbd(vbd)},\\ {\tt xend\_domain\_vbd(dom, vbd)}:\\ Get virtual block device. \item {\tt GET /console},\\ {\tt xend\_consoles()}:\\ Get list of consoles. \item {\tt GET /console/[id]}\\ {\tt xend\_console(id)}:\\ Get information about a console. \item {\tt GET /console/[id] disconnect()}\\ {\tt xend\_console\_disconnect(self, id)}:\\ Disconnect any console TCP connection. \item {\tt POST /event inject(event)}\\ {\tt xend\_event\_inject(sxpr)}:\\ Inject an event. \end{itemize} \secb{Xend debugging interface} Xend also listens on port 8001. Connecting to this port (for example via telnet) allows access to some debugging functions: \begin{itemize} \item help: list functions \item traceon: turn xend tracing on \item traceoff: turn xend tracing off \item quit: disconnect. \item info: list console listeners, block and network device controllers. \end{itemize} When tracing is on xend logs all functions calls and exceptions to {\tt /var/log/xen/xend.trace}. 
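As an illustration (the session below is an example rather than a verbatim
transcript), the debugging interface can be exercised with a plain TCP client
such as telnet:
\begin{verbatim}
$ telnet localhost 8001
info
traceon
quit
\end{verbatim}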
\begin{thebibliography}{99} \bibitem{html} HTML 4.01 Specification,\\ http://www.w3.org/TR/html4/,\\ W3C Recommendation, 24 December 1999. \bibitem{http} Hypertext Transfer Protocol -- HTTP/1.1,\\ http://www.ietf.org/rfc/rfc2616.txt,\\ RFC 2616, IETF 1999. \bibitem{ssh} http://www.openssh.org. \bibitem{stunnel} http://www.stunnel.org. \end{thebibliography} \end{document} xen-4.4.0/docs/misc/xen-error-handling.txt0000664000175000017500000000670112307313555016576 0ustar smbsmbError handling in Xen --------------------- 1. domain_crash() ----------------- Crash the specified domain due to buggy or unsupported behaviour of the guest. This should not be used where the hypervisor itself is in error, even if the scope of that error affects only a single domain. BUG() is a more appropriate failure method for hypervisor bugs. To repeat: domain_crash() is the correct response for erroneous or unsupported *guest* behaviour! Note that this should be used in most cases in preference to domain_crash_synchronous(): domain_crash() returns to the caller, allowing the crash to be deferred for the currently executing VCPU until certain resources (notably, spinlocks) have been released. Example usages: * Unrecoverable guest kernel stack overflows * Unsupported corners of HVM device models 2. BUG() -------- Crashes the host system with an informative file/line error message and a backtrace. Use this to check consistency assumptions within the hypervisor. Be careful not to use BUG() (or BUG_ON(), or ASSERT()) for failures *outside* the hypervisor software -- in particular, guest bugs (where domain_crash() is more appropriate) or non-critical BIOS or hardware errors (where retry or feature disable are more appropriate). Example usage: In arch/x86/hvm/i8254.c an I/O port handler includes the check BUG_ON(bytes != 1). We choose this extreme reaction to the unexpected error case because, although it could be handled by failing the I/O access or crashing the domain, it is indicative of an unexpected inconsistency in the hypervisor itself (since the I/O handler was only registered for single-byte accesses). 3. BUG_ON() ----------- BUG_ON(...) is merely a convenient short form for "if (...) BUG()". It is most commonly used as an 'always on' alternative to ASSERT(). 4. ASSERT() ----------- Similar to BUG_ON(), except that it is only enabled for debug builds of the hypervisor. Typically ASSERT() is used only where the (usually small) overheads of an always-on debug check might be considered excessive. A good example might be within inner loops of time-critical functions, or where an assertion is extreme paranoia (considered *particularly* unlikely ever to fail). In general, if in doubt, use BUG_ON() in preference to ASSERT(). 5. panic() ---------- Like BUG() and ASSERT() this will crash and reboot the host system. However it does this after printing only an error message with no extra diagnostic information such as a backtrace. panic() is generally used where an unsupported system configuration is detected, particularly during boot, and where extra diagnostic information about CPU context would not be useful. It may also be used before exception handling is enabled during Xen bootstrap (on x86, BUG() and ASSERT() depend on Xen's exception-handling capabilities). Example usage: Most commonly for out-of-memory errors during bootstrap. 
The failure is unexpected since a host should always have enough memory to
boot Xen, but if the failure does occur then the context of the failed
memory allocation itself is not very interesting.

6. Feature disable
------------------
A possible approach to dealing with boot-time errors, rather than
crashing the hypervisor. It's particularly appropriate when parsing
non-critical BIOS tables and detecting extended hardware features.

7. BUILD_BUG_ON()
-----------------
Useful for assertions which can be evaluated at compile time. For example,
making explicit assumptions about size and alignment of C structures.
xen-4.4.0/docs/misc/kexec_and_kdump.txt0000664000175000017500000001574112307313555016220 0ustar smbsmb
=======================
Kexec and Kdump for Xen
=======================

This is a brief guide to using Kexec and Kdump in conjunction with Xen.
This functionality works at the level of the hypervisor and dom0 kernel
and will thus affect all guests running on a machine. At this stage it
does not work in conjunction with domU kernels.

This document should be read in conjunction with
Documentation/kdump/kdump.txt from the Linux kernel source. Some of the
information in this document has been sourced from that document.


Kexec
=====

It is possible to kexec from Xen or Linux to either Xen or Linux.

Pattern        | Before Kexec       | After Kexec
---------------+--------------------+--------------------
Xen -> Xen     | first hypervisor & | second hypervisor &
               | dom0 kernel        | dom0 kernel
---------------+--------------------+--------------------
Xen -> Linux   | first hypervisor & | second kernel
               | dom0 kernel        |
---------------+--------------------+--------------------
Linux -> Xen   | first kernel       | second hypervisor &
               |                    | dom0 kernel
---------------+--------------------+--------------------
Linux -> Linux | first kernel       | second kernel

If you are kexecing to Xen then you will also need to prepare the second
hypervisor and dom0 kernel that will run after kexec. These may be the
same as the first hypervisor and dom0 kernel that are used before kexec
if you are kexecing from Xen to Xen.

If you are kexecing to Linux then you will need to prepare the second
Linux kernel that will run after kexec. In the case that you are kexecing
from Linux, it may be the same as the first kernel image that runs before
kexec.

Regardless of which kexec pattern you wish to run, you will need to have
kexec-tools installed. This provides the kexec command.

1. Load
-------

Before kexecing, the second kernel or hypervisor & dom0 kernel need to be
loaded into the running hypervisor or kernel using the kexec command.

a. To kexec to Xen (Xen->Xen or Linux->Xen)

   kexec -l --append="XEN_ARGS -- DOM0_ARGS" \
         --vmm="XEN_IMAGE" "DOM0_IMAGE" KEXEC_ARGS

   where:
     XEN_ARGS: command line arguments to the xen hypervisor
               On x86 the no-real-mode argument should be included
     DOM0_ARGS: command line arguments to the dom0 kernel
     XEN_IMAGE: xen hypervisor image
     DOM0_IMAGE: dom0 kernel image
     KEXEC_ARGS: additional kexec-tools command line arguments

   e.g. kexec -l --append "no-real-mode" --vmm="/boot/xen.gz" /boot/vmlinuz.gz

OR

b. To kexec to Linux (Xen->Linux or Linux->Linux)

   kexec -l LINUX_IMAGE --append "$LINUX_ARGS" KEXEC_ARGS

   where:
     LINUX_IMAGE: the second linux kernel image
     LINUX_ARGS: command line arguments to the second linux kernel
     KEXEC_ARGS: additional kexec-tools command line arguments

   e.g. kexec -l /boot/second-vmlinuz.gz

2. Execute
----------

Once the second kernel is loaded, it can be executed at any time.
If you don't see the second kernel booting within a second or so, you are in trouble :( kexec -e Kdump ===== It is possible to kdump from Xen or Linux to a Linux crash kernel. It is not possible to use xen as a crash kernel. Pattern | Before Kexec | After Kexec ---------------+--------------------+-------------------- Xen -> Linux | first hypervisor & | crash kernel | dom0 kernel | ---------------+--------------------+-------------------- Linux -> Linux | first kernel | crash kernel Regardless of if you are kdumping from Xen or Linux you will need to prepare a linux crash kernel. You will also need to have kexec-tools installed. This provides the kexec command. 0. Set-Up The Crash Kernel Region --------------------------------- In order to use kdump an area of memory has to be reserved at boot time. This is the area of memory that the crash kernel will use, thus allowing it to run without disrupting the memory used by the first kernel. This area is called the crash kernel region and is reserved using the crashkernel command line parameter to the Xen hypervisor. It has two forms: i) crashkernel=size This is the simplest and recommended way to reserve the crash kernel region. Just specify how large the region should be and the hypervisor will find a good location for it. A good size to start with is 128Mb e.g. crashkernel=128M ii) crashkernel=size@base In this form the base address is provided in addition to the size. Use this if auto-placement doesn't work for some reason. It is strongly recommended that the base address be aligned to 64Mb, else memory below the alignment point will not be usable. e.g. crashkernel=128M@256M Regardless of which of the two forms of the crashkernel command line you use, the crash kernel region should appear in /proc/iomem on x86 or /proc/iomem_machine on ia64. If it doesn't then either the crashkernel parameter is missing, or for some reason the region couldn't be placed - for instance because it is too large. # cat /proc/iomem ... 00100000-07feffff : System RAM 00100000-00bfffff : Hypervisor code and data 0533f000-0733efff : Crash kernel ... 1. Load ------- Once you are running in a kexec-enabled hypervisor and dom0, you can prepare to kdump by loading the crash kernel into the running kernel. kexec -p CRASH_KERNEL_IMAGE --append "$CRASH_KERNEL_ARGS" KEXEC_ARGS where: CRASH_KERNEL_IMAGE: the crash kernel image CRASH_KERNEL_ARGS: command line arguments to the crash kernel init 1 is strongly recommended irqpoll is strongly recommended maxcpus=1 is required if the crash kernel is SMP reset_devices is strongly recommended KEXEC_ARGS: additional kexec-tools command line arguments On x86 --args-linux should be supplied if an uncompressed vmlinux image is used as the crash kernel e.g. kexec -p /boot/crash-vmlinuz \ --append "init 1 irqpoll maxcpus=1 reset_devices" --args-linux On x86 systems the crash kernel may be either - A uncompressed vmlinux image if the kernel is not relocatable - A compressed bzImage or vmlinuz image if the kernel is relocatable - Relocatability is controlled by the CONFIG_RELOCATABLE kernel compile configuration parameter. This option may not be available depending on the kernel version On ia64 Either a vmlinuz or vmlinux.gz image may be used 2. Execute ---------- Once the second kernel is loaded, the crash kernel will be executed if the hypervisor panics. 
It will also be executed if dom0 panics, or if dom0 oopses and
/proc/sys/kernel/panic_on_oops is set to a non-zero value

   echo 1 > /proc/sys/kernel/panic_on_oops

Kdump may also be triggered (for testing):

a. From Domain 0

   echo c > /proc/sysrq-trigger

b. From Xen

   Enter the xen console

     ctrl^a ctrl^a (may be bound to a different key, this is the default)

   Select C for "trigger a crashdump"

     C

If you don't see the crash kernel booting within a second or so,
you are in trouble :(

xen-4.4.0/docs/misc/efi.markdown0000664000175000017500000000702512307313555014641 0ustar smbsmb
Building xen.efi requires gcc 4.5.x or above (4.6.x or newer recommended,
as 4.5.x was probably never really tested for this purpose) and binutils
2.22 or newer. Additionally, the binutils build must be configured to
include support for the x86_64-pep emulation (i.e.
`--enable-targets=x86_64-pep` or an option of equivalent effect should be
passed to the configure script).

Once built, `make install-xen` will place the resulting binary directly
into the EFI boot partition, provided `EFI_VENDOR` is set in the
environment (and `EFI_MOUNTPOINT` is overridden as needed, should the
default of `/boot/efi` not match your system). The xen.efi binary will also
be installed in `/usr/lib64/efi/`, unless `EFI_DIR` is set in the
environment to override this default.

The binary itself requires a configuration file to be present in the same
directory as the binary. Its name is the binary's name with the `.efi`
extension replaced by `.cfg`; until an existing file is found, trailing
name components are successively dropped at `.`, `-`, and `_` separators.
(To illustrate the name handling, a binary named `xen-4.2-unstable.efi`
would try `xen-4.2-unstable.cfg`, `xen-4.2.cfg`, `xen-4.cfg`, and `xen.cfg`
in order.) One can override this with a command line option
(`-cfg=<filename>`).

The configuration file consists of one or more sections headed by a section
name enclosed in square brackets, with individual values specified in each
section. A section named `[global]` is treated specially to allow certain
settings to apply to all other sections (or to provide defaults for certain
settings in case individual sections don't specify them). A typical file
would thus look like this (`#` serving as comment character):

**************************example begin******************************

    [global]
    default=sle11sp2

    [sle11sp2]
    options=console=vga,com1 com1=57600 loglvl=all noreboot
    kernel=vmlinuz-3.0.31-0.4-xen ignore_loglevel #earlyprintk=xen
    ramdisk=initrd-3.0.31-0.4-xen

**************************example end********************************

The individual values used here are:

###`default=<name>`

Specifies the section to use for booting, if none was specified on the
command line; only meaningful in the `[global]` section. This isn't
required; if it is absent, section headers are ignored and, for each value
looked for, the first instance within the file is used.

###`options=<options>`

Specifies the options passed to the hypervisor, see [Xen Hypervisor Command
Line Options](xen-command-line.html).

###`kernel=<filename>[ <options>]`

Specifies the Dom0 kernel binary and the options to pass to it.

###`ramdisk=<filename>`

Specifies a Linux-style initial RAM disk image to load.

Other values to specify are:

###`video=gfx-<xres>[x<yres>[x<depth>]]`

Specifies a video mode to select if available. In case of problems, the
`-basevideo` command line option can be used to skip altering video modes.

###`xsm=<filename>`

Specifies an XSM module to load.

###`ucode=<filename>`

Specifies a CPU microcode blob to load.

###`chain=<filename>`

Specifies an alternate configuration file to use in case the specified
section (and in particular its `kernel=` setting) can't be found in the
default (or specified) configuration file. This is only meaningful in the
[global] section and really not meant to be used together with the `-cfg=`
command line option. Filenames must be specified relative to the location
of the EFI binary.
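To make the configuration-file name fallback described earlier concrete
(a binary named `xen-4.2-unstable.efi` tries `xen-4.2-unstable.cfg`,
`xen-4.2.cfg`, `xen-4.cfg`, then `xen.cfg`), here is a small C model of
that trimming rule. It is only an illustration of the documented
behaviour, not the loader's actual code; `try_load` is a hypothetical
stand-in for "does this file exist and load".

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical hook standing in for the real "load this .cfg" step. */
    static bool try_load(const char *name)
    {
        printf("trying %s\n", name);
        return false;          /* pretend nothing is found, to show all candidates */
    }

    /* Model of the documented fallback: replace ".efi" with ".cfg", then keep
     * dropping trailing components at '.', '-' or '_' until a file is found.
     * Assumes the input name ends in ".efi". */
    static void find_cfg(const char *efi_name)
    {
        char name[256];

        snprintf(name, sizeof(name), "%s", efi_name);
        char *dot = strrchr(name, '.');
        if (dot && !strcmp(dot, ".efi"))
            strcpy(dot, ".cfg");

        for (;;) {
            if (try_load(name))
                return;
            char *end = name + strlen(name) - 4;   /* points at ".cfg" */
            *end = '\0';
            char *cut = end - 1;
            while (cut > name && *cut != '.' && *cut != '-' && *cut != '_')
                cut--;
            if (cut == name)
                return;                            /* nothing left to strip */
            strcpy(cut, ".cfg");
        }
    }

    int main(void)
    {
        find_cfg("xen-4.2-unstable.efi");  /* prints the four candidates in order */
        return 0;
    }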
Extra options to be passed to Xen can also be specified on the command
line, following a `--` separator option.

xen-4.4.0/docs/misc/xenstore.txt0000664000175000017500000003155012307313555014742 0ustar smbsmb
Xenstore protocol specification
-------------------------------

Xenstore implements a database which maps filename-like pathnames
(also known as `keys') to values.  Clients may read and write values,
watch for changes, and set permissions to allow or deny access.  There
is a rudimentary transaction system.

While xenstore and most tools and APIs are capable of dealing with
arbitrary binary data as values, this should generally be avoided.
Data should generally be human-readable for ease of management and
debugging; xenstore is not a high-performance facility and should be
used only for small amounts of control plane data.  Therefore xenstore
values should normally be 7-bit ASCII text strings containing bytes
0x20..0x7f only, and should not contain a trailing nul byte.  (The
APIs used for accessing xenstore generally add a nul when reading, for
the caller's convenience.)

A separate specification will detail the keys and values which are
used in the Xen system and what their meanings are.  (Sadly that
specification currently exists only in multiple out-of-date versions.)

Paths are /-separated and start with a /, just as Unix filenames.

We can speak of one path <child> being a child of another path
<parent>: this is the case if they're identical, or if <parent> is /,
or if <parent>/ is an initial substring of <child>.  (This includes a
path being a child of itself.)

If a particular path exists, all of its parents do too.  Every
existing path maps to a possibly empty value, and may also have zero
or more immediate children.  There is thus no particular distinction
between directories and leaf nodes.  However, it is conventional not
to store nonempty values at nodes which also have children.

The permitted character set for paths is ASCII alphanumerics plus the
four punctuation characters -/_@ (hyphen slash underscore atsign).  @
should be avoided except to specify special watches (see below).
Doubled slashes and trailing slashes (except to specify the root) are
forbidden.  The empty path is also forbidden.  Paths longer than 3072
bytes are forbidden; clients specifying relative paths should keep
them to within 2048 bytes.  (See XENSTORE_*_PATH_MAX in xs_wire.h.)

Communication with xenstore is via either sockets, or event channel
and shared memory, as specified in io/xs_wire.h: each message in
either direction is a header formatted as a struct xsd_sockmsg
followed by xsd_sockmsg.len bytes of payload.

The payload syntax varies according to the type field.  Generally
requests each generate a reply with an identical type, req_id and
tx_id.  However, if an error occurs, a reply will be returned with
type ERROR, and only req_id and tx_id copied from the request.

A caller who sends several requests may receive the replies in any
order and must use req_id (and tx_id, if applicable) to match up
replies to requests.  (The current implementation always replies to
requests in the order received but this should not be relied on.)
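As a hedged illustration of the framing described above, the following C
sketch lays out the four-field header and serialises a request whose
payload is a path followed by a nul (the shape used by READ, RM, MKDIR,
DIRECTORY and GET_PERMS below).  The struct mirrors struct xsd_sockmsg
from io/xs_wire.h; the helper, its names and the stand-in type value are
illustrative only and not part of any Xen library.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Header layout as described above (mirrors struct xsd_sockmsg in
     * io/xs_wire.h): four 32-bit fields followed by 'len' payload bytes. */
    struct xsd_sockmsg {
        uint32_t type;    /* request/reply type, e.g. READ or WRITE */
        uint32_t req_id;  /* chosen by the caller, echoed in the reply */
        uint32_t tx_id;   /* transaction id, or 0 if not in a transaction */
        uint32_t len;     /* number of payload bytes following the header */
    };

    /* Illustrative helper: serialise one request whose payload is
     * "<path>|" (the path followed by a single nul byte).  A WRITE would
     * instead append the value after that nul.  Returns the total frame
     * size, or 0 if the frame would not fit or exceeds the payload limit. */
    static size_t build_path_request(void *buf, size_t bufsz,
                                     uint32_t type, uint32_t req_id,
                                     uint32_t tx_id, const char *path)
    {
        struct xsd_sockmsg hdr;
        size_t payload_len = strlen(path) + 1;   /* include the trailing nul */

        if (sizeof(hdr) + payload_len > bufsz ||
            payload_len > 4096 /* XENSTORE_PAYLOAD_MAX, see below */)
            return 0;

        hdr.type = type;
        hdr.req_id = req_id;
        hdr.tx_id = tx_id;
        hdr.len = (uint32_t)payload_len;

        memcpy(buf, &hdr, sizeof(hdr));
        memcpy((char *)buf + sizeof(hdr), path, payload_len);
        return sizeof(hdr) + payload_len;
    }

    int main(void)
    {
        unsigned char frame[4096 + sizeof(struct xsd_sockmsg)];
        /* The real type code for READ comes from io/xs_wire.h; 0 here is
         * only a stand-in so the demo is self-contained. */
        size_t n = build_path_request(frame, sizeof(frame), 0, 1, 0,
                                      "/local/domain/0/name");
        printf("frame size: %zu bytes\n", n);
        return 0;
    }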
The payload length (len field of the header) is limited to 4096
(XENSTORE_PAYLOAD_MAX) in both directions.  If a client exceeds the
limit, its xenstored connection will be immediately killed by
xenstored, which is usually catastrophic from the client's point of
view.  Clients (particularly domains, which cannot just reconnect)
should avoid this.

Existing clients do not always contain defences against overly long
payloads.  Increasing xenstored's limit is therefore difficult; it
would require negotiation with the client, and obviously would make
parts of xenstore inaccessible to some clients.  In any case passing
bulk data through xenstore is not recommended as the performance
properties are poor.

---------- Xenstore protocol details - introduction ----------

The payload syntax and semantics of the requests and replies are
described below.  In the payload syntax specifications we use the
following notations:

 |              A nul (zero) byte.
 <foo>          A string guaranteed not to contain any nul bytes.
 <foo|>         Binary data (which may contain zero or more nul bytes)
 <foo>|*        Zero or more strings each followed by a trailing nul
 <foo>|+        One or more strings each followed by a trailing nul
 <foo>?         Reserved value (may not contain nuls)
 <foo>??        Reserved value (may contain nuls)

Except as otherwise noted, reserved values are believed to be sent as
empty strings by all current clients.  Clients should not send
nonempty strings for reserved values; those parts of the protocol may
be used for extension in the future.

Error replies are as follows:

ERROR                                   E<something>|

Where E<something> is the name of an errno value listed in
io/xs_wire.h.  Note that the string name is transmitted, not a numeric
value.

Where no reply payload format is specified below, success responses
have the following payload:

OK|

Values commonly included in payloads include:

<path>
        Specifies a path in the hierarchical key structure.  If <path>
        starts with a / it simply represents that path.  <path> is
        allowed not to start with /, in which case the caller must be
        a domain (rather than connected via a socket) and the path is
        taken to be relative to /local/domain/<domid> (eg, `x/y' sent
        by domain 3 would mean `/local/domain/3/x/y').

<domid>
        Integer domid, represented as decimal number 0..65535.
        Parsing errors and values out of range generally go
        undetected.  The special DOMID_... values (see xen.h) are
        represented as integers; unless otherwise specified it is an
        error not to specify a real domain id.

The following are the actual type values, including the request and
reply payloads as applicable:

---------- Database read, write and permissions operations ----------

READ                    <path>|                 <value|>
WRITE                   <path>|<value|>
        Store and read the octet string <value> at <path>.
        WRITE creates any missing parent paths, with empty values.

MKDIR                   <path>|
        Ensures that <path> exists, if necessary by creating it and
        any missing parents with empty values.  If <path> or any
        parent already exists, its value is left unchanged.

RM                      <path>|
        Ensures that <path> does not exist, by deleting it and all of
        its children.  It is not an error if <path> does not exist,
        but it _is_ an error if <path>'s immediate parent does not
        exist either.

DIRECTORY               <path>|                 <child-leaf-name>|*
        Gives a list of the immediate children of <path>, as only the
        leafnames.  The resulting children are each named
        <path>/<child-leaf-name>.

GET_PERMS               <path>|                 <perm-as-string>|+
SET_PERMS               <path>|<perm-as-string>|+?
        <perm-as-string> is one of the following
                w<domid>        write only
                r<domid>        read only
                b<domid>        both read and write
                n<domid>        no access
        See http://wiki.xen.org/wiki/XenBus section
        `Permissions' for details of the permissions system.
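The <perm-as-string> encoding above is just a class letter followed by
a decimal domid.  The following C helpers, a sketch rather than code
taken from xenstored or libxenstore, show how such entries could be
formatted and parsed.

    #include <ctype.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Format one permission entry, e.g. perm_format('r', 7, ...) -> "r7".
     * 'kind' must be one of w/r/b/n as documented above. */
    static int perm_format(char kind, unsigned domid, char *buf, size_t bufsz)
    {
        return snprintf(buf, bufsz, "%c%u", kind, domid);
    }

    /* Parse an entry such as "b0" back into its class letter and domid.
     * Returns false if the string does not match the documented form. */
    static bool perm_parse(const char *s, char *kind, unsigned *domid)
    {
        if (*s != 'w' && *s != 'r' && *s != 'b' && *s != 'n')
            return false;
        if (!isdigit((unsigned char)s[1]))
            return false;
        *kind = *s;
        *domid = (unsigned)strtoul(s + 1, NULL, 10);
        return *domid <= 65535;     /* domids are 0..65535 per this spec */
    }

    int main(void)
    {
        char buf[16], kind;
        unsigned domid;
        perm_format('b', 0, buf, sizeof(buf));          /* "b0" */
        if (perm_parse(buf, &kind, &domid))
            printf("class=%c domid=%u\n", kind, domid);
        return 0;
    }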
---------- Watches ----------

WATCH                   <wpath>|<token>|?
        Adds a watch.

        When a <path> is modified (including path creation, removal,
        contents change or permissions change) this generates an event
        on the changed <path>.  Changes made in transactions cause an
        event only if and when committed.  Each occurring event is
        matched against all the watches currently set up, and each
        matching watch results in a WATCH_EVENT message (see below).

        The event's path matches the watch's <wpath> if it is a child
        of <wpath>.

        <wpath> can be a <path> to watch or @<wspecial>.  In the
        latter case <wspecial> may have any syntax but it matches
        (according to the rules above) only the following special
        events which are invented by xenstored:
            @introduceDomain    occurs on INTRODUCE
            @releaseDomain      occurs on any domain crash or
                                shutdown, and also on RELEASE
                                and domain destruction

        When a watch is first set up it is triggered once straight
        away, with <path> equal to <wpath>.  Watches may be triggered
        spuriously.  The tx_id in a WATCH request is ignored.

        Watches are supposed to be restricted by the permissions
        system but in practice the implementation is imperfect.
        Applications should not rely on being sent a notification for
        paths that they cannot read; however, an application may rely
        on being sent a watch when a path which it _is_ able to read
        is deleted even if that leaves only a nonexistent unreadable
        parent.  A notification may be omitted if a node's permissions
        are changed so as to make it unreadable, in which case future
        notifications may be suppressed (and if the node is later made
        readable, some notifications may have been lost).

WATCH_EVENT                             <epath>|<token>|
        Unsolicited `reply' generated for matching modification events
        as described above.  req_id and tx_id are both 0.

        <epath> is the event's path, ie the actual path that was
        modified; however if the event was the recursive removal of a
        parent of <wpath>, <epath> is just <wpath> (rather than the
        actual path which was removed).  So <epath> is a child of
        <wpath>, regardless.

        Iff <wpath> for the watch was specified as a relative
        pathname, the <epath> path will also be relative (with the
        same base, obviously).

UNWATCH                 <wpath>|<token>|?

RESET_WATCHES           |
        Reset all watches and transactions of the caller.

---------- Transactions ----------

TRANSACTION_START       |                       <transid>|
        <transid> is an opaque uint32_t allocated by xenstored
        represented as unsigned decimal.  After this, the transaction
        may be referenced by using <transid> (as 32-bit binary) in the
        tx_id request header field.  When a transaction is started the
        whole db is copied; reads and writes happen on the copy.
        It is not legal to send a non-0 tx_id in TRANSACTION_START.
        Currently xenstored has the bug that after 2^32 transactions
        it will allocate the transid 0 for an actual transaction.

TRANSACTION_END         T|
TRANSACTION_END         F|
        tx_id must refer to an existing transaction.  After this
        request the tx_id is no longer valid and may be reused by
        xenstore.  If F, the transaction is discarded.  If T, it is
        committed: if there were any other intervening writes then our
        END gets EAGAIN.

        The plan is that in the future only intervening `conflicting'
        writes cause EAGAIN, meaning only writes or other commits
        which changed paths which were read or written in the
        transaction at hand.
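Because a commit can fail with EAGAIN as described above, clients
normally wrap transactional updates in a retry loop.  The sketch below
shows that pattern using the libxenstore client API; xs_open, xs_read,
xs_write, xs_transaction_start and xs_transaction_end from <xenstore.h>
are assumed to be available as in recent Xen trees, and the path used
is just an example.

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <xenstore.h>          /* assumed libxenstore header */

    /* Append " updated" to the node at 'path', retrying on EAGAIN. */
    static bool update_node(struct xs_handle *xsh, const char *path)
    {
        for (;;) {
            xs_transaction_t t = xs_transaction_start(xsh);
            if (t == XBT_NULL)
                return false;

            unsigned int len;
            char *old = xs_read(xsh, t, path, &len);   /* NULL if absent */
            char buf[256];
            snprintf(buf, sizeof(buf), "%s updated", old ? old : "");
            free(old);

            if (!xs_write(xsh, t, path, buf, strlen(buf))) {
                xs_transaction_end(xsh, t, true);      /* abort */
                return false;
            }

            if (xs_transaction_end(xsh, t, false))     /* try to commit */
                return true;                           /* committed */
            if (errno != EAGAIN)
                return false;                          /* real error */
            /* EAGAIN: an intervening write occurred; retry from scratch. */
        }
    }

    int main(void)
    {
        struct xs_handle *xsh = xs_open(0);
        if (!xsh)
            return 1;
        bool ok = update_node(xsh, "/local/domain/0/example"); /* example path */
        xs_close(xsh);
        return ok ? 0 : 1;
    }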
---------- Domain management and xenstored communications ----------

INTRODUCE               <domid>|<mfn>|<evtchn>|?
        Notifies xenstored to communicate with this domain.

        INTRODUCE is currently only used by xend (during domain
        startup and various forms of restore and resume), and
        xenstored prevents its use other than by dom0.

        <domid> must be a real domain id (not 0 and not a special
        DOMID_... value).  <mfn> must be a machine page in that
        domain, represented in signed decimal (!).  <evtchn> is an
        unbound event channel in <domid> (likewise in decimal), on
        which xenstored will call bind_interdomain.

        Violations of these rules may result in undefined behaviour;
        for example passing a high-bit-set 32-bit mfn as an unsigned
        decimal will attempt to use 0x7fffffff instead (!).

RELEASE                 <domid>|
        Manually requests that xenstored disconnect from the domain.
        The event channel is unbound at the xenstored end and the page
        unmapped.  If the domain is still running it won't be able to
        communicate with xenstored.  NB that xenstored will in any
        case detect domain destruction and disconnect by itself.
        xenstored prevents the use of RELEASE other than by dom0.

GET_DOMAIN_PATH         <domid>|                <path>|
        Returns the domain's base path, as is used for relative
        transactions: ie, /local/domain/<domid> (with <domid>
        normalised).  The answer will be useless unless <domid> is a
        real domain id.

IS_DOMAIN_INTRODUCED    <domid>|                T| or F|
        Returns T if xenstored is in communication with the domain:
        ie, if INTRODUCE for the domain has not yet been followed by
        domain destruction or explicit RELEASE.

RESUME                  <domid>|
        Arranges that @releaseDomain events will once more be
        generated when the domain becomes shut down.  This might have
        to be used if a domain were to be shut down (generating one
        @releaseDomain) and then subsequently restarted, since the
        state-sensitive algorithm in xenstored will not otherwise send
        further watch event notifications if the domain were to be
        shut down again.

        It is not clear whether this is possible since one would
        normally expect a domain not to be restarted after being shut
        down without being destroyed in the meantime.  There are
        currently no users of this request in xen-unstable.

        xenstored prevents the use of RESUME other than by dom0.

SET_TARGET              <domid>|<tdomid>|
        Notifies xenstored that domain <domid> is targeting domain
        <tdomid>.  This grants domain <domid> full access to paths
        owned by <tdomid>.  Domain <domid> also inherits all
        permissions granted to <tdomid> on all other paths.  This
        allows <domid> to behave as if it were dom0 when modifying
        paths related to <tdomid>.

        xenstored prevents the use of SET_TARGET other than by dom0.

---------- Miscellaneous ----------

DEBUG                   print|<string>|??       sends <string> to debug log
DEBUG                   print|<thing-with-no-nul>       EINVAL
DEBUG                   check|??                checks xenstored innards
DEBUG                   <anything-else|>        no-op (future extension)

        These requests should not generally be used and may be
        withdrawn in the future.

xen-4.4.0/docs/misc/xl-disk-configuration.txt0000664000175000017500000001757212307313555017313 0ustar smbsmb
                      ---------------------
                      XL DISK CONFIGURATION
                      ---------------------

This document specifies the xl config file format disk configuration
option.  It has the following form:

   disk = [ '<diskspec>', '<diskspec>', ... ]

where each diskspec is in this form:

   [<key>=<value>|<flag>,]*,
     [<target>, [<format>, [<vdev>, [<access>]]]],
     [<key>=<value>|<flag>,]*
     [target=<target>]

For example, these strings are equivalent:

    /dev/vg/guest-volume,,hda
    /dev/vg/guest-volume,raw,hda,rw
    format=raw, vdev=hda, access=rw, target=/dev/vg/guest-volume
    raw:/dev/vg/guest-volume,hda,w   (deprecated, see below)

As are these:

    /root/image.iso,,hdc,cdrom
    /root/image.iso,,hdc,,cdrom
    /root/image.iso,raw,hdc,devtype=cdrom
    format=raw, vdev=hdc, access=ro, devtype=cdrom, target=/root/image.iso
    raw:/root/image.iso,hdc:cdrom,ro   (deprecated, see below)

These might be specified in the domain config file like this:

    disk = [ '/dev/vg/guest-volume,,hda', '/root/image.iso,,hdc,cdrom' ]

More formally, the string is a series of comma-separated keyword/value
pairs, flags and positional parameters.  Parameters which are not bare
keywords and which do not contain "=" symbols are assigned to the
so-far-unspecified positional parameters, in the order below.  The
positional parameters may also be specified explicitly by name.

Each parameter may be specified at most once, either as a positional
parameter or a named parameter.  Default values apply if the parameter
is not specified, or if it is specified with an empty value (whether
positionally or explicitly).

Whitespace may appear before each parameter and will be ignored.
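To make the positional-assignment rule above concrete, the following C
sketch splits a diskspec on commas and fills the target, format, vdev
and access slots in order.  It is only a model of the documented rule,
not libxl's actual parser (which also handles named parameters,
quoting, the trailing target= form and the deprecated prefix syntax);
the defaults set in main() reflect the documented format and access
defaults, and strsep() is assumed to be available (glibc/BSD).

    #define _GNU_SOURCE            /* for strsep() on glibc */
    #include <stdio.h>
    #include <string.h>

    /* Positional slots, in the documented order. */
    struct diskspec {
        const char *target, *format, *vdev, *access;
    };

    /* Simplified model of positional assignment: tokens that are not
     * "key=value" pairs and not recognised flags fill the next unfilled
     * positional parameter; an empty token skips a slot, keeping its
     * default. */
    static void parse_diskspec(char *spec, struct diskspec *d)
    {
        const char **slots[] = { &d->target, &d->format, &d->vdev, &d->access };
        unsigned next = 0;
        char *tok;

        while ((tok = strsep(&spec, ",")) != NULL) {
            while (*tok == ' ')            /* leading whitespace is ignored */
                tok++;
            if (strchr(tok, '='))          /* named parameter, e.g. vdev=hda */
                continue;                  /* (name handling omitted here) */
            if (!strcmp(tok, "cdrom"))     /* a flag, not a positional value */
                continue;
            if (next < 4) {
                if (*tok != '\0')
                    *slots[next] = tok;    /* empty value keeps the default */
                next++;
            }
        }
    }

    int main(void)
    {
        char spec[] = "/dev/vg/guest-volume,,hda";
        struct diskspec d = { .format = "raw", .access = "rw" };  /* defaults */
        parse_diskspec(spec, &d);
        printf("target=%s format=%s vdev=%s access=%s\n",
               d.target, d.format, d.vdev, d.access);
        return 0;
    }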
=====================
POSITIONAL PARAMETERS
=====================

target
------

Description:           Block device or image file path.  When this is
                       used as a path, /dev will be prepended if the
                       path doesn't start with a '/'.
Supported values:      N/A
Deprecated values:     N/A
Default value:         None.  While a path is provided in most cases
                       there is an exception: for a cdrom device, lack
                       of this attribute would imply an empty cdrom
                       drive.

Special syntax:

   When this parameter is specified by name, ie with the "target="
   syntax in the configuration file, it consumes the whole rest of the
   <diskspec> including trailing whitespaces.  Therefore in that case
   it must come last.  This is permissible even if an empty value for
   the target was already specified as a positional parameter.

   This is the only way to specify a target string containing
   metacharacters such as commas and (in some cases) colons, which
   would otherwise be misinterpreted.

   Future parameter and flag names will start with an ascii letter and
   contain only ascii alphanumerics, hyphens and underscores, and will
   not be legal as vdevs.  Targets which might match that syntax
   should not be specified as positional parameters.

format
------

Description:           Specifies the format of the image file.
Supported values:      raw, qcow, qcow2, vhd
Deprecated values:     None
Default value:         raw

vdev
----

Description:           Virtual device as seen by the guest (also
                       referred to as guest drive designation in some
                       specifications).  See docs/misc/vbd-interface.txt.
Supported values:      hd[x], xvd[x], sd[x] etc.  Please refer to the
                       above specification for further details.
Deprecated values:     None
Default value:         None, this parameter is mandatory.

access
------

Description:           Specifies access control information.  Whether
                       the block device is provided to the guest in
                       read-only or read-write mode depends on this
                       attribute.
Supported values:      ro, r   (specifies read-only)
                       rw, w   (specifies read/write)
Deprecated values:     None
Default value:         rw unless devtype=cdrom, in which case r

==========================
OTHER PARAMETERS AND FLAGS
==========================

devtype=<devtype>
-----------------

Description:           Qualifies virtual device type.
Supported values:      cdrom
Deprecated values:     None
Mandatory:             No

cdrom
-----

Convenience alias for "devtype=cdrom".

backend=<domain-name>
---------------------

Description:           Designates a backend domain for the device
Supported values:      Valid domain names
Mandatory:             No

Specifies the backend domain which this device should attach to.  This
defaults to domain 0.  Specifying another domain requires setting up a
driver domain, which is outside the scope of this document.

backendtype=<backend-type>
--------------------------

Description:           Specifies the backend implementation to use
Supported values:      phy, tap, qdisk
Mandatory:             No
Default value:         Automatically determine which backend to use.

This does not affect the guest's view of the device.  It controls
which software implementation of the Xen backend driver is used.

Not all backend drivers support all combinations of other options.
For example, "phy" does not support formats other than "raw".
Normally this option should not be specified, in which case libxl will
automatically determine the most suitable backend.

script=